diff --git a/Analysis/CCC_analysis.ipynb b/Analysis/CCC_analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8b5c6715be57e6fec5e87a1b40c7e8f73f3c15ae --- /dev/null +++ b/Analysis/CCC_analysis.ipynb @@ -0,0 +1,1679 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "b07b3d8e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/kprasath/anaconda3/lib/python3.10/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n", + " warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n", + "[nltk_data] Downloading package vader_lexicon to\n", + "[nltk_data] /home/kprasath/nltk_data...\n", + "[nltk_data] Package vader_lexicon is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "import pandas as pd\n", + "import numpy as np\n", + "import re\n", + "import nltk\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", + "import emoji\n", + "from textblob import TextBlob\n", + "from fuzzywuzzy import process\n", + "import matplotlib.pyplot as plt\n", + "from wordcloud import WordCloud\n", + "\n", + "\n", + "nltk.download('vader_lexicon')" + ] + }, + { + "cell_type": "markdown", + "id": "a5829c2a", + "metadata": {}, + "source": [ + "## Getting the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c36304cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + 
"</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>a40_abduction_and_related_offences</th>\n", + " <th>b30_burglary_break_and_enter</th>\n", + " <th>a10_homicide_and_related_offences</th>\n", + " <th>f30_other_government_regulatory_offences</th>\n", + " <th>lga_code11</th>\n", + " <th>total_division_c_offences</th>\n", + " <th>a30_sexual_offences</th>\n", + " <th>a70_stalking_harassment_and_threatening_behaviour</th>\n", + " <th>e10_justice_procedures</th>\n", + " <th>total_division_f_offences</th>\n", + " <th>...</th>\n", + " <th>total_division_e_offences</th>\n", + " <th>e20_breaches_of_orders</th>\n", + " <th>f10_regulatory_driving_offences</th>\n", + " <th>a60_blackmail_and_extortion</th>\n", + " <th>b40_theft</th>\n", + " <th>c90_other_drug_offences</th>\n", + " <th>f20_transport_regulation_offences</th>\n", + " <th>a20_assault_and_related_offences</th>\n", + " <th>b60_bribery</th>\n", + " <th>c10_drug_dealing_and_trafficking</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1.0</td>\n", + " <td>27</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>20110</td>\n", + " <td>53</td>\n", + " <td>95</td>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " <td>...</td>\n", + " <td>68</td>\n", + " <td>57</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>90</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>81</td>\n", + " <td>0.0</td>\n", + " <td>9.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.0</td>\n", + " <td>69</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>20260</td>\n", + " <td>137</td>\n", + " <td>58</td>\n", + " <td>38</td>\n", + " <td>84</td>\n", + " <td>2</td>\n", + " <td>...</td>\n", + " <td>411</td>\n", + " <td>327</td>\n", + " <td>0.0</td>\n", + " <td>5.0</td>\n", + " <td>222</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " 
<td>176</td>\n", + " <td>0.0</td>\n", + " <td>24.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21.0</td>\n", + " <td>983</td>\n", + " <td>4.0</td>\n", + " <td>1.0</td>\n", + " <td>20570</td>\n", + " <td>414</td>\n", + " <td>345</td>\n", + " <td>236</td>\n", + " <td>376</td>\n", + " <td>23</td>\n", + " <td>...</td>\n", + " <td>1884</td>\n", + " <td>1508</td>\n", + " <td>0.0</td>\n", + " <td>7.0</td>\n", + " <td>3501</td>\n", + " <td>5.0</td>\n", + " <td>4.0</td>\n", + " <td>1007</td>\n", + " <td>0.0</td>\n", + " <td>68.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9.0</td>\n", + " <td>832</td>\n", + " <td>4.0</td>\n", + " <td>1.0</td>\n", + " <td>20660</td>\n", + " <td>513</td>\n", + " <td>185</td>\n", + " <td>266</td>\n", + " <td>192</td>\n", + " <td>13</td>\n", + " <td>...</td>\n", + " <td>2038</td>\n", + " <td>1846</td>\n", + " <td>0.0</td>\n", + " <td>4.0</td>\n", + " <td>2931</td>\n", + " <td>1.0</td>\n", + " <td>3.0</td>\n", + " <td>611</td>\n", + " <td>0.0</td>\n", + " <td>47.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.0</td>\n", + " <td>205</td>\n", + " <td>2.0</td>\n", + " <td>2.0</td>\n", + " <td>20740</td>\n", + " <td>154</td>\n", + " <td>74</td>\n", + " <td>119</td>\n", + " <td>48</td>\n", + " <td>5</td>\n", + " <td>...</td>\n", + " <td>378</td>\n", + " <td>330</td>\n", + " <td>0.0</td>\n", + " <td>2.0</td>\n", + " <td>640</td>\n", + " <td>1.0</td>\n", + " <td>2.0</td>\n", + " <td>277</td>\n", + " <td>0.0</td>\n", + " <td>29.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 37 columns</p>\n", + "</div>" + ], + "text/plain": [ + " a40_abduction_and_related_offences b30_burglary_break_and_enter \\\n", + "0 1.0 27 \n", + "1 0.0 69 \n", + "2 21.0 983 \n", + "3 9.0 832 \n", + "4 0.0 205 \n", + "\n", + " a10_homicide_and_related_offences \\\n", + "0 0.0 \n", + "1 0.0 \n", + "2 4.0 \n", + "3 4.0 \n", + "4 2.0 \n", + "\n", + " f30_other_government_regulatory_offences lga_code11 
\\\n", + "0 1.0 20110 \n", + "1 1.0 20260 \n", + "2 1.0 20570 \n", + "3 1.0 20660 \n", + "4 2.0 20740 \n", + "\n", + " total_division_c_offences a30_sexual_offences \\\n", + "0 53 95 \n", + "1 137 58 \n", + "2 414 345 \n", + "3 513 185 \n", + "4 154 74 \n", + "\n", + " a70_stalking_harassment_and_threatening_behaviour \\\n", + "0 11 \n", + "1 38 \n", + "2 236 \n", + "3 266 \n", + "4 119 \n", + "\n", + " e10_justice_procedures total_division_f_offences ... \\\n", + "0 11 1 ... \n", + "1 84 2 ... \n", + "2 376 23 ... \n", + "3 192 13 ... \n", + "4 48 5 ... \n", + "\n", + " total_division_e_offences e20_breaches_of_orders \\\n", + "0 68 57 \n", + "1 411 327 \n", + "2 1884 1508 \n", + "3 2038 1846 \n", + "4 378 330 \n", + "\n", + " f10_regulatory_driving_offences a60_blackmail_and_extortion b40_theft \\\n", + "0 0.0 0.0 90 \n", + "1 0.0 5.0 222 \n", + "2 0.0 7.0 3501 \n", + "3 0.0 4.0 2931 \n", + "4 0.0 2.0 640 \n", + "\n", + " c90_other_drug_offences f20_transport_regulation_offences \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 5.0 4.0 \n", + "3 1.0 3.0 \n", + "4 1.0 2.0 \n", + "\n", + " a20_assault_and_related_offences b60_bribery \\\n", + "0 81 0.0 \n", + "1 176 0.0 \n", + "2 1007 0.0 \n", + "3 611 0.0 \n", + "4 277 0.0 \n", + "\n", + " c10_drug_dealing_and_trafficking \n", + "0 9.0 \n", + "1 24.0 \n", + "2 68.0 \n", + "3 47.0 \n", + "4 29.0 \n", + "\n", + "[5 rows x 37 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#crime data\n", + "crime_data = pd.read_csv('crime_data.csv')\n", + "crime_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "63621eda", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'pd' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 
+ "Cell \u001b[0;32mIn[4], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#state data\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m state_data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstate_data.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 3\u001b[0m state_data\u001b[38;5;241m.\u001b[39mhead()\n", + "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined" + ] + } + ], + "source": [ + "#state data\n", + "state_data = pd.read_csv('state_data.csv')\n", + "state_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "005002d0", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ae7cac44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.27 s, sys: 1.02 s, total: 5.29 s\n", + "Wall time: 5.28 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "#using a bigger data \n", + "with open('smallTwitter.json', 'r') as f:\n", + " data = json.load(f)\n", + "\n", + "filtered_data = []\n", + "\n", + "for tweet in data:\n", + " if 'places' in tweet['includes']:\n", + " for place in tweet['includes']['places']:\n", + " if place['full_name'].endswith(', Victoria'):\n", + " area_name, _, _ = place['full_name'].partition(', ')\n", + " if '-' in area_name:\n", + " area_name = area_name.split('-')[0].strip()\n", + "\n", + " filtered_data.append({\n", + " '_id': tweet['_id'],\n", + " #'author_id': tweet['data']['author_id'],\n", + " 'text': tweet['data']['text'],\n", + " 'sentiment': tweet['data'].get('sentiment', None), # returns None if 'sentiment' key doesn't exist\n", + " 'language': tweet['data']['lang'],\n", + " 'area_name': area_name\n", + " })\n", + "\n", + "main_data = pd.DataFrame(filtered_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa806077", + "metadata": {}, + "outputs": 
[], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f5483bb", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "main_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "067228db", + "metadata": {}, + "source": [ + "## Data cleaning and transformation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c046c5de", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "print(crime_data.isna().sum())\n", + "print(state_data.isna().sum())\n", + "print(main_data.isna().sum())\n", + "\n", + "# Only sentiment has missing values,\n", + "# but we can ignore them for now because this variable is engineered later" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e615112", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Same missing-value check, this time using isnull()\n", + "print(crime_data.isnull().sum())\n", + "print(state_data.isnull().sum())\n", + "print(main_data.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0f64e183", + "metadata": {}, + "outputs": [], + "source": [ + "# Variable Type Adjustments\n", + "\n", + "# Strip surrounding whitespace from column names for consistency\n", + "crime_data.columns = crime_data.columns.str.strip()\n", + "state_data.columns = state_data.columns.str.strip()\n", + "main_data.columns = main_data.columns.str.strip()\n", + "\n", + "crime_data['lga_code11'] = crime_data['lga_code11'].astype(int)\n", + "state_data['lga_code'] = state_data['lga_code'].astype(int)\n", + "main_data['_id'] = main_data['_id'].astype(str)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b1117bb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e98d301", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "3598d554", + "metadata": {}, + "source": [ + "### Text Cleaning" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "ba4a5991", + "metadata": {}, + "outputs": [], + "source": [ + "main_data[['text']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "62a7dc99", + "metadata": {}, + "source": [ + "Cleaning the data:\n", + "\n", + "- Remove URLs: Since URLs generally don't contribute to sentiment, we can remove them.\n", + "- Remove mentions: Mentions (@username) may or may not contribute to sentiment, but they can be problematic for analysis due to their variability.\n", + "- Remove special characters: These generally don't contribute to sentiment and can make analysis more difficult.\n", + "- Remove numbers: Numbers don't contribute to sentiment and can be safely removed.\n", + "- Convert to lowercase: Case is treated as irrelevant to sentiment here, and converting to lowercase simplifies subsequent analysis.\n", + "- Remove extra spaces: Cleaning up extra spaces will make the text more consistent for analysis.\n", + "\n", + "Feature extraction:\n", + "\n", + "- Number of hashtags and emojis: Hashtags and emojis can be indicators of the topic or intensity of a tweet, so we'll count them.\n", + "- Number of words: The length of a tweet can be a useful feature.\n", + "- Number of characters: The character count of a tweet can also be a useful feature.\n", + "- Sentiment analysis: We will use NLTK's VADER sentiment analysis tool to compute a compound sentiment score, and TextBlob to compute a subjectivity score, for each cleaned tweet." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c2098a0", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "main_data.language.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "aa9b575d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7.76 s, sys: 3.76 ms, total: 7.76 s\n", + "Wall time: 7.76 s\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>_id</th>\n", + " <th>text</th>\n", + " <th>sentiment</th>\n", + " <th>language</th>\n", + " <th>area_name</th>\n", + " <th>cleaned_text</th>\n", + " <th>num_hashtags</th>\n", + " <th>num_words</th>\n", + " <th>num_chars</th>\n", + " <th>num_emojis</th>\n", + " <th>subjectivity</th>\n", + " <th>sentiment_score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1414031972041588744</td>\n", + " <td>Stella is spending the day with her cousin Mon...</td>\n", + " <td>0.000000</td>\n", + " <td>en</td>\n", + " <td>Ocean Grove</td>\n", + " <td>stella is spending the day with her cousin monty</td>\n", + " <td>0</td>\n", + " <td>9</td>\n", + " <td>48</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1414032158197354497</td>\n", + " <td>@eleaud Oh no! 
I hope they don’t go up any fur...</td>\n", + " <td>0.090909</td>\n", + " <td>en</td>\n", + " <td>Ocean Grove</td>\n", + " <td>oh no i hope they dont go up any further</td>\n", + " <td>0</td>\n", + " <td>10</td>\n", + " <td>40</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.1779</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1414036352308445186</td>\n", + " <td>Sunday @afl R17: #aflgiantssuns #afldogsswans ...</td>\n", + " <td>0.000000</td>\n", + " <td>en</td>\n", + " <td>Drysdale</td>\n", + " <td>sunday r aflgiantssuns afldogsswans afltigerspies</td>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " <td>49</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1414044890040733699</td>\n", + " <td>@BlancheVerlie Sorry to watch the declining si...</td>\n", + " <td>0.058824</td>\n", + " <td>en</td>\n", + " <td>Ocean Grove</td>\n", + " <td>sorry to watch the declining situation up ther...</td>\n", + " <td>0</td>\n", + " <td>33</td>\n", + " <td>166</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.7717</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1414057294950649861</td>\n", + " <td>@VicGovAu One lady had over 40 saved photos of...</td>\n", + " <td>0.090909</td>\n", + " <td>en</td>\n", + " <td>Drysdale</td>\n", + " <td>one lady had over saved photos of login cube s...</td>\n", + " <td>0</td>\n", + " <td>20</td>\n", + " <td>99</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.4215</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " _id text \\\n", + "0 1414031972041588744 Stella is spending the day with her cousin Mon... \n", + "1 1414032158197354497 @eleaud Oh no! I hope they don’t go up any fur... \n", + "2 1414036352308445186 Sunday @afl R17: #aflgiantssuns #afldogsswans ... \n", + "3 1414044890040733699 @BlancheVerlie Sorry to watch the declining si... 
\n", + "4 1414057294950649861 @VicGovAu One lady had over 40 saved photos of... \n", + "\n", + " sentiment language area_name \\\n", + "0 0.000000 en Ocean Grove \n", + "1 0.090909 en Ocean Grove \n", + "2 0.000000 en Drysdale \n", + "3 0.058824 en Ocean Grove \n", + "4 0.090909 en Drysdale \n", + "\n", + " cleaned_text num_hashtags num_words \\\n", + "0 stella is spending the day with her cousin monty 0 9 \n", + "1 oh no i hope they dont go up any further 0 10 \n", + "2 sunday r aflgiantssuns afldogsswans afltigerspies 0 5 \n", + "3 sorry to watch the declining situation up ther... 0 33 \n", + "4 one lady had over saved photos of login cube s... 0 20 \n", + "\n", + " num_chars num_emojis subjectivity sentiment_score \n", + "0 48 0 0.0 0.0000 \n", + "1 40 0 0.5 0.1779 \n", + "2 49 0 0.0 0.0000 \n", + "3 166 0 0.5 0.7717 \n", + "4 99 0 0.5 0.4215 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time \n", + "\n", + "# Initialize the sentiment analyzer\n", + "sia = SentimentIntensityAnalyzer()\n", + "\n", + "def clean_text(text):\n", + " text = re.sub(r'http\\S+|www.\\S+', '', text) \n", + " text = re.sub(r'@\\w+', '', text) \n", + " text = re.sub(r'[^A-Za-z\\s]', '', text) \n", + " text = re.sub(r'\\d+', '', text) \n", + " text = re.sub(r'\\s{2,}', ' ', text) \n", + " text = text.lower() \n", + " return text.strip() \n", + "\n", + "def extract_features(text):\n", + " num_hashtags = text.count('#')\n", + " num_words = len(text.split())\n", + " num_chars = len(text)\n", + " num_emojis = emoji.emoji_count(text)\n", + " return num_hashtags, num_words, num_chars, num_emojis\n", + "\n", + "def extract_subjectivity(text):\n", + " return TextBlob(text).sentiment.subjectivity\n", + "\n", + "def get_sentiment_score(text):\n", + " return sia.polarity_scores(text)['compound']\n", + "\n", + "# Filter out the non-English tweets from the main data\n", + "main_data = main_data[main_data['language'] == 'en'].copy()\n", 
+ "\n", + "# Apply the cleaning function to the 'text' column\n", + "main_data.loc[:, 'cleaned_text'] = main_data['text'].apply(clean_text)\n", + "\n", + "# Apply the feature extraction function to the cleaned text and create new columns (NOTE(review): clean_text has already removed '#' and emoji, so num_hashtags and num_emojis will always be 0 here)\n", + "main_data['num_hashtags'], main_data['num_words'], main_data['num_chars'], main_data['num_emojis'] = zip(*main_data['cleaned_text'].map(extract_features))\n", + "\n", + "# Compute the TextBlob subjectivity of each cleaned tweet and store it in a new column\n", + "main_data['subjectivity'] = main_data['cleaned_text'].apply(extract_subjectivity)\n", + "\n", + "# Add the VADER compound sentiment score\n", + "main_data['sentiment_score'] = main_data['cleaned_text'].apply(get_sentiment_score)\n", + "\n", + "\n", + "main_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a4d5c90b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>area_name</th>\n", + " <th>cleaned_text</th>\n", + " <th>num_hashtags</th>\n", + " <th>num_words</th>\n", + " <th>num_chars</th>\n", + " <th>num_emojis</th>\n", + " <th>subjectivity</th>\n", + " <th>sentiment_score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Ocean Grove</td>\n", + " <td>stella is spending the day with her cousin monty</td>\n", + " <td>0</td>\n", + " <td>9</td>\n", + " <td>48</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Ocean Grove</td>\n", + " <td>oh no i hope they dont go up any 
further</td>\n", + " <td>0</td>\n", + " <td>10</td>\n", + " <td>40</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.1779</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Drysdale</td>\n", + " <td>sunday r aflgiantssuns afldogsswans afltigerspies</td>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " <td>49</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Ocean Grove</td>\n", + " <td>sorry to watch the declining situation up ther...</td>\n", + " <td>0</td>\n", + " <td>33</td>\n", + " <td>166</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.7717</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Drysdale</td>\n", + " <td>one lady had over saved photos of login cube s...</td>\n", + " <td>0</td>\n", + " <td>20</td>\n", + " <td>99</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.4215</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " area_name cleaned_text \\\n", + "0 Ocean Grove stella is spending the day with her cousin monty \n", + "1 Ocean Grove oh no i hope they dont go up any further \n", + "2 Drysdale sunday r aflgiantssuns afldogsswans afltigerspies \n", + "3 Ocean Grove sorry to watch the declining situation up ther... \n", + "4 Drysdale one lady had over saved photos of login cube s... 
\n", + "\n", + " num_hashtags num_words num_chars num_emojis subjectivity \\\n", + "0 0 9 48 0 0.0 \n", + "1 0 10 40 0 0.5 \n", + "2 0 5 49 0 0.0 \n", + "3 0 33 166 0 0.5 \n", + "4 0 20 99 0 0.5 \n", + "\n", + " sentiment_score \n", + "0 0.0000 \n", + "1 0.1779 \n", + "2 0.0000 \n", + "3 0.7717 \n", + "4 0.4215 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_data = main_data.drop(['_id', 'text', 'sentiment', 'language'], axis=1)\n", + "text_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "af84dddc", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'state_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mstate_data\u001b[49m\u001b[38;5;241m.\u001b[39mdtypes\n", + "\u001b[0;31mNameError\u001b[0m: name 'state_data' is not defined" + ] + } + ], + "source": [ + "state_data.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0ef3468a", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'state_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mstate_data\u001b[49m\u001b[38;5;241m.\u001b[39mhead()\n", + "\u001b[0;31mNameError\u001b[0m: name 'state_data' is not defined" + ] + } + ], + "source": [ + "state_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d970d153", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ 
+ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>a40_abduction_and_related_offences</th>\n", + " <th>b30_burglary_break_and_enter</th>\n", + " <th>a10_homicide_and_related_offences</th>\n", + " <th>f30_other_government_regulatory_offences</th>\n", + " <th>lga_code11</th>\n", + " <th>total_division_c_offences</th>\n", + " <th>a30_sexual_offences</th>\n", + " <th>a70_stalking_harassment_and_threatening_behaviour</th>\n", + " <th>e10_justice_procedures</th>\n", + " <th>total_division_f_offences</th>\n", + " <th>...</th>\n", + " <th>total_division_e_offences</th>\n", + " <th>e20_breaches_of_orders</th>\n", + " <th>f10_regulatory_driving_offences</th>\n", + " <th>a60_blackmail_and_extortion</th>\n", + " <th>b40_theft</th>\n", + " <th>c90_other_drug_offences</th>\n", + " <th>f20_transport_regulation_offences</th>\n", + " <th>a20_assault_and_related_offences</th>\n", + " <th>b60_bribery</th>\n", + " <th>c10_drug_dealing_and_trafficking</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1.0</td>\n", + " <td>27</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>20110</td>\n", + " <td>53</td>\n", + " <td>95</td>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " <td>...</td>\n", + " <td>68</td>\n", + " <td>57</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>90</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>81</td>\n", + " <td>0.0</td>\n", + " <td>9.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.0</td>\n", + " <td>69</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>20260</td>\n", + " 
<td>137</td>\n", + " <td>58</td>\n", + " <td>38</td>\n", + " <td>84</td>\n", + " <td>2</td>\n", + " <td>...</td>\n", + " <td>411</td>\n", + " <td>327</td>\n", + " <td>0.0</td>\n", + " <td>5.0</td>\n", + " <td>222</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>176</td>\n", + " <td>0.0</td>\n", + " <td>24.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21.0</td>\n", + " <td>983</td>\n", + " <td>4.0</td>\n", + " <td>1.0</td>\n", + " <td>20570</td>\n", + " <td>414</td>\n", + " <td>345</td>\n", + " <td>236</td>\n", + " <td>376</td>\n", + " <td>23</td>\n", + " <td>...</td>\n", + " <td>1884</td>\n", + " <td>1508</td>\n", + " <td>0.0</td>\n", + " <td>7.0</td>\n", + " <td>3501</td>\n", + " <td>5.0</td>\n", + " <td>4.0</td>\n", + " <td>1007</td>\n", + " <td>0.0</td>\n", + " <td>68.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9.0</td>\n", + " <td>832</td>\n", + " <td>4.0</td>\n", + " <td>1.0</td>\n", + " <td>20660</td>\n", + " <td>513</td>\n", + " <td>185</td>\n", + " <td>266</td>\n", + " <td>192</td>\n", + " <td>13</td>\n", + " <td>...</td>\n", + " <td>2038</td>\n", + " <td>1846</td>\n", + " <td>0.0</td>\n", + " <td>4.0</td>\n", + " <td>2931</td>\n", + " <td>1.0</td>\n", + " <td>3.0</td>\n", + " <td>611</td>\n", + " <td>0.0</td>\n", + " <td>47.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.0</td>\n", + " <td>205</td>\n", + " <td>2.0</td>\n", + " <td>2.0</td>\n", + " <td>20740</td>\n", + " <td>154</td>\n", + " <td>74</td>\n", + " <td>119</td>\n", + " <td>48</td>\n", + " <td>5</td>\n", + " <td>...</td>\n", + " <td>378</td>\n", + " <td>330</td>\n", + " <td>0.0</td>\n", + " <td>2.0</td>\n", + " <td>640</td>\n", + " <td>1.0</td>\n", + " <td>2.0</td>\n", + " <td>277</td>\n", + " <td>0.0</td>\n", + " <td>29.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 37 columns</p>\n", + "</div>" + ], + "text/plain": [ + " a40_abduction_and_related_offences b30_burglary_break_and_enter \\\n", + "0 1.0 
27 \n", + "1 0.0 69 \n", + "2 21.0 983 \n", + "3 9.0 832 \n", + "4 0.0 205 \n", + "\n", + " a10_homicide_and_related_offences \\\n", + "0 0.0 \n", + "1 0.0 \n", + "2 4.0 \n", + "3 4.0 \n", + "4 2.0 \n", + "\n", + " f30_other_government_regulatory_offences lga_code11 \\\n", + "0 1.0 20110 \n", + "1 1.0 20260 \n", + "2 1.0 20570 \n", + "3 1.0 20660 \n", + "4 2.0 20740 \n", + "\n", + " total_division_c_offences a30_sexual_offences \\\n", + "0 53 95 \n", + "1 137 58 \n", + "2 414 345 \n", + "3 513 185 \n", + "4 154 74 \n", + "\n", + " a70_stalking_harassment_and_threatening_behaviour e10_justice_procedures \\\n", + "0 11 11 \n", + "1 38 84 \n", + "2 236 376 \n", + "3 266 192 \n", + "4 119 48 \n", + "\n", + " total_division_f_offences ... total_division_e_offences \\\n", + "0 1 ... 68 \n", + "1 2 ... 411 \n", + "2 23 ... 1884 \n", + "3 13 ... 2038 \n", + "4 5 ... 378 \n", + "\n", + " e20_breaches_of_orders f10_regulatory_driving_offences \\\n", + "0 57 0.0 \n", + "1 327 0.0 \n", + "2 1508 0.0 \n", + "3 1846 0.0 \n", + "4 330 0.0 \n", + "\n", + " a60_blackmail_and_extortion b40_theft c90_other_drug_offences \\\n", + "0 0.0 90 0.0 \n", + "1 5.0 222 0.0 \n", + "2 7.0 3501 5.0 \n", + "3 4.0 2931 1.0 \n", + "4 2.0 640 1.0 \n", + "\n", + " f20_transport_regulation_offences a20_assault_and_related_offences \\\n", + "0 0.0 81 \n", + "1 0.0 176 \n", + "2 4.0 1007 \n", + "3 3.0 611 \n", + "4 2.0 277 \n", + "\n", + " b60_bribery c10_drug_dealing_and_trafficking \n", + "0 0.0 9.0 \n", + "1 0.0 24.0 \n", + "2 0.0 68.0 \n", + "3 0.0 47.0 \n", + "4 0.0 29.0 \n", + "\n", + "[5 rows x 37 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crime_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "bb62f0d1", + "metadata": {}, + "source": [ + "## Data integration & Mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0960e5a5", + "metadata": {}, + "outputs": [], + "source": [ + 
"state_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "076dd0c3", + "metadata": {}, + "outputs": [], + "source": [ + "lga_code_mapping = state_data.set_index('scc_name')['lga_code']\n", + "\n", + "text_data['lga_code'] = text_data['area_name'].map(lga_code_mapping)\n", + "\n", + "state_data_unique = state_data.groupby('lga_code').agg({\n", + " 'lga_name': 'first',\n", + " 'lat': 'mean',\n", + " 'lon': 'mean'\n", + "}).reset_index()\n", + "\n", + "text_data = text_data.merge(state_data_unique, on='lga_code', how='left')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "eb8d7900", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>area_name</th>\n", + " <th>cleaned_text</th>\n", + " <th>num_hashtags</th>\n", + " <th>num_words</th>\n", + " <th>num_chars</th>\n", + " <th>num_emojis</th>\n", + " <th>subjectivity</th>\n", + " <th>sentiment_score</th>\n", + " <th>lga_code</th>\n", + " <th>lga_name</th>\n", + " <th>lat</th>\n", + " <th>lon</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Ocean Grove</td>\n", + " <td>stella is spending the day with her cousin monty</td>\n", + " <td>0</td>\n", + " <td>9</td>\n", + " <td>48</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0000</td>\n", + " <td>22750.0</td>\n", + " <td>Greater Geelong</td>\n", + " <td>-38.145829</td>\n", + " <td>144.416254</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Ocean Grove</td>\n", + " <td>oh no i hope they dont go up any further</td>\n", + " 
<td>0</td>\n", + " <td>10</td>\n", + " <td>40</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.1779</td>\n", + " <td>22750.0</td>\n", + " <td>Greater Geelong</td>\n", + " <td>-38.145829</td>\n", + " <td>144.416254</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Drysdale</td>\n", + " <td>sunday r aflgiantssuns afldogsswans afltigerspies</td>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " <td>49</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0000</td>\n", + " <td>22750.0</td>\n", + " <td>Greater Geelong</td>\n", + " <td>-38.145829</td>\n", + " <td>144.416254</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Ocean Grove</td>\n", + " <td>sorry to watch the declining situation up ther...</td>\n", + " <td>0</td>\n", + " <td>33</td>\n", + " <td>166</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.7717</td>\n", + " <td>22750.0</td>\n", + " <td>Greater Geelong</td>\n", + " <td>-38.145829</td>\n", + " <td>144.416254</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Drysdale</td>\n", + " <td>one lady had over saved photos of login cube s...</td>\n", + " <td>0</td>\n", + " <td>20</td>\n", + " <td>99</td>\n", + " <td>0</td>\n", + " <td>0.5</td>\n", + " <td>0.4215</td>\n", + " <td>22750.0</td>\n", + " <td>Greater Geelong</td>\n", + " <td>-38.145829</td>\n", + " <td>144.416254</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " area_name cleaned_text \\\n", + "0 Ocean Grove stella is spending the day with her cousin monty \n", + "1 Ocean Grove oh no i hope they dont go up any further \n", + "2 Drysdale sunday r aflgiantssuns afldogsswans afltigerspies \n", + "3 Ocean Grove sorry to watch the declining situation up ther... \n", + "4 Drysdale one lady had over saved photos of login cube s... 
\n", + "\n", + " num_hashtags num_words num_chars num_emojis subjectivity \\\n", + "0 0 9 48 0 0.0 \n", + "1 0 10 40 0 0.5 \n", + "2 0 5 49 0 0.0 \n", + "3 0 33 166 0 0.5 \n", + "4 0 20 99 0 0.5 \n", + "\n", + " sentiment_score lga_code lga_name lat lon \n", + "0 0.0000 22750.0 Greater Geelong -38.145829 144.416254 \n", + "1 0.1779 22750.0 Greater Geelong -38.145829 144.416254 \n", + "2 0.0000 22750.0 Greater Geelong -38.145829 144.416254 \n", + "3 0.7717 22750.0 Greater Geelong -38.145829 144.416254 \n", + "4 0.4215 22750.0 Greater Geelong -38.145829 144.416254 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "91fcfdbb", + "metadata": {}, + "outputs": [], + "source": [ + "# Crime data merging\n", + "\n", + "# crime_data and state_data are the two dataframes being joined.\n", + "# The LGA code is stored in 'lga_code11' in crime_data\n", + "# and in 'lga_code' in state_data.\n", + "\n", + "# First, cast both LGA code columns to str so the join keys have the same type\n", + "crime_data['lga_code11'] = crime_data['lga_code11'].astype(str)\n", + "state_data['lga_code'] = state_data['lga_code'].astype(str)\n", + "\n", + "# Then, merge the dataframes\n", + "merged_data = pd.merge(crime_data, state_data, left_on='lga_code11', right_on='lga_code', how='left')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a31d6efe", + "metadata": {}, + "outputs": [], + "source": [ + "#merged_data.to_csv('crime_spatial.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3e96a60f", + "metadata": {}, + "outputs": [], + "source": [ + "#text_data.to_csv('textdatafor_spatial.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "e3c87512", + "metadata": {}, + "source": [ + "# Analysis" + ] + }, + { + "cell_type": "code", +
"execution_count": 12, + "id": "68f5899b", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['new_sentiment', 'compound'] not in index\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtext_data\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnum_hashtags\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnum_words\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnum_chars\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnew_sentiment\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnum_emojis\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcompound\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msubjectivity\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mdescribe()\n", + "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/pandas/core/frame.py:3813\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 3812\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 3813\u001b[0m indexer 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", + "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/pandas/core/indexes/base.py:6070\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6067\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6068\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6070\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6072\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 6073\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6074\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n", + "File 
\u001b[0;32m~/anaconda3/lib/python3.10/site-packages/pandas/core/indexes/base.py:6133\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6130\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6132\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m-> 6133\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mKeyError\u001b[0m: \"['new_sentiment', 'compound'] not in index\"" + ] + } + ], + "source": [ + "#text_data[['num_hashtags', 'num_words', 'num_chars', 'new_sentiment', 'num_emojis', 'compound', 'subjectivity']].describe()\n" + ] + }, + { + "cell_type": "markdown", + "id": "5db957df", + "metadata": {}, + "source": [ + "plt.hist(text_data['new_sentiment'], bins=20, alpha=0.5)\n", + "plt.xlabel('Sentiment Score')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Sentiment Distribution')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4b8425bb", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 1200x600 with 2 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, axes = 
plt.subplots(1, 2, figsize=(12, 6))\n", + "\n", + "# Plot 1: Sentiment Distribution\n", + "axes[0].hist(text_data['sentiment_score'], bins=20, alpha=0.5)\n", + "axes[0].set_xlabel('Sentiment Score')\n", + "axes[0].set_ylabel('Frequency')\n", + "axes[0].set_title('Sentiment Distribution')\n", + "\n", + "# Plot 2: Subjectivity Distribution\n", + "axes[1].hist(text_data['subjectivity'], bins=20, alpha=0.5)\n", + "axes[1].set_xlabel('Subjectivity Score')\n", + "axes[1].set_ylabel('Frequency')\n", + "axes[1].set_title('Subjectivity Distribution')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6385dcdb", + "metadata": {}, + "outputs": [], + "source": [ + "# Disabled: takes far too long to run and won't scale to large datasets\n", + "'''from wordcloud import WordCloud\n", + "\n", + "wordcloud = WordCloud(width = 800, height = 800, \n", + " background_color ='white', \n", + " stopwords = None, \n", + " min_font_size = 10).generate(' '.join(text_data['cleaned_text']))\n", + "\n", + "plt.figure(figsize = (8, 8), facecolor = None) \n", + "plt.imshow(wordcloud) \n", + "plt.axis(\"off\") \n", + "plt.tight_layout(pad = 0) \n", + "plt.show()'''\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2b43468", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# NOTE: this cell takes a very long time to run\n", + "\n", + "import geopandas as gpd\n", + "\n", + "gdf = gpd.GeoDataFrame(text_data, geometry=gpd.points_from_xy(text_data.lon_x, text_data.lat_x))\n", + "\n", + "fig, ax = plt.subplots()\n", + "gdf.plot(ax=ax, color='blue')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "697b8a18", + "metadata": {}, + "outputs": [], + "source": [ + "text_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80f4bd71", + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install geoplot" + ] + }, + { + "cell_type": "code", +
"execution_count": null, + "id": "6814b63c", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import geopandas as gpd\n", + "import geoplot as gplt\n", + "import geoplot.crs as gcrs\n", + "import pandas as pd\n", + "from shapely import wkt\n", + "\n", + "def load_wkt(value):\n", + " if isinstance(value, str):\n", + " return wkt.loads(value)\n", + " return None\n", + "\n", + "text_data['geometry_y'] = text_data['geometry_y'].apply(load_wkt) # convert POLYGON string to geometry\n", + "gdf = gpd.GeoDataFrame(text_data, geometry='geometry_y') # convert DataFrame to GeoDataFrame\n", + "\n", + "# Group by 'scc_name_x' and calculate mean sentiment\n", + "gdf['new_sentiment'] = pd.to_numeric(gdf['new_sentiment'], errors='coerce')\n", + "grouped_gdf = gdf.groupby('scc_name_x').agg({'new_sentiment': 'mean'})\n", + "\n", + "# Join the original gdf with grouped_gdf to get the mean sentiment for each row\n", + "gdf = gdf.merge(grouped_gdf, on='scc_name_x')\n", + "\n", + "# Create a KDE plot and overlay it on a polygon plot\n", + "ax = gplt.kdeplot(\n", + " gdf,\n", + " clip=gdf.geometry,\n", + " shade=True,\n", + " cmap='Reds',\n", + " projection=gcrs.AlbersEqualArea(),\n", + ")\n", + "gplt.polyplot(gdf, ax=ax, zorder=1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac8ec1f6", + "metadata": {}, + "outputs": [], + "source": [ + "text_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586cf916", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "import geopandas as gpd\n", + "import geoplot as gplt\n", + "import geoplot.crs as gcrs\n", + "from shapely.geometry import Point\n", + "\n", + "# Convert lat/lon to a shapely Point object\n", + "text_data['geometry'] = text_data.apply(lambda row: Point(row.lon, row.lat), axis=1)\n", + "\n", + "# Convert DataFrame to GeoDataFrame\n", + "gdf = gpd.GeoDataFrame(text_data, geometry='geometry')\n", + "\n", + "# Group by 'lga_name' and 
calculate mean sentiment_score\n", + "gdf['sentiment_score'] = pd.to_numeric(gdf['sentiment_score'], errors='coerce')\n", + "grouped_gdf = gdf.groupby('lga_name').agg({'sentiment_score': 'mean'})\n", + "\n", + "# Join the original gdf with grouped_gdf to get the mean sentiment for each row\n", + "gdf = gdf.merge(grouped_gdf, on='lga_name', suffixes=('', '_mean'))\n", + "\n", + "# Create a KDE plot and overlay it on a polygon plot\n", + "ax = gplt.kdeplot(\n", + " gdf,\n", + " hue='sentiment_score_mean',\n", + " clip=gdf.geometry,\n", + " shade=True,\n", + " cmap='coolwarm',\n", + " projection=gcrs.AlbersEqualArea(),\n", + ")\n", + "gplt.polyplot(gdf, ax=ax, zorder=1)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}