From 800b38efb50d7789b1a05d902535df7750b78092 Mon Sep 17 00:00:00 2001 From: Keshav Prasath <kprasath@student.unimelb.edu.au> Date: Thu, 25 May 2023 11:22:33 +0000 Subject: [PATCH] Upload New File --- Analysis/CCC_data_wrangling.ipynb | 675 ++++++++++++++++++++++++++++++ 1 file changed, 675 insertions(+) create mode 100644 Analysis/CCC_data_wrangling.ipynb diff --git a/Analysis/CCC_data_wrangling.ipynb b/Analysis/CCC_data_wrangling.ipynb new file mode 100644 index 0000000..dcc2406 --- /dev/null +++ b/Analysis/CCC_data_wrangling.ipynb @@ -0,0 +1,675 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "144ae6e4", + "metadata": {}, + "source": [ + "## Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b755e6df", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "import geopandas as gpd\n", + "\n", + "#twitter-data-small.json" + ] + }, + { + "cell_type": "markdown", + "id": "30a0611e", + "metadata": {}, + "source": [ + "# Different Data's" + ] + }, + { + "cell_type": "markdown", + "id": "14a18af1", + "metadata": {}, + "source": [ + "## 1) Data from Sudo (Crime)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "021d9519", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>a40_abduction_and_related_offences</th>\n", + " <th>b30_burglary_break_and_enter</th>\n", + " <th>a10_homicide_and_related_offences</th>\n", + " <th>f30_other_government_regulatory_offences</th>\n", + " <th>lga_code11</th>\n", + " 
<th>total_division_c_offences</th>\n", + " <th>a30_sexual_offences</th>\n", + " <th>a70_stalking_harassment_and_threatening_behaviour</th>\n", + " <th>e10_justice_procedures</th>\n", + " <th>total_division_f_offences</th>\n", + " <th>...</th>\n", + " <th>total_division_e_offences</th>\n", + " <th>e20_breaches_of_orders</th>\n", + " <th>f10_regulatory_driving_offences</th>\n", + " <th>a60_blackmail_and_extortion</th>\n", + " <th>b40_theft</th>\n", + " <th>c90_other_drug_offences</th>\n", + " <th>f20_transport_regulation_offences</th>\n", + " <th>a20_assault_and_related_offences</th>\n", + " <th>b60_bribery</th>\n", + " <th>c10_drug_dealing_and_trafficking</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1.0</td>\n", + " <td>27</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>20110</td>\n", + " <td>53</td>\n", + " <td>95</td>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " <td>...</td>\n", + " <td>68</td>\n", + " <td>57</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>90</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>81</td>\n", + " <td>0.0</td>\n", + " <td>9.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.0</td>\n", + " <td>69</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>20260</td>\n", + " <td>137</td>\n", + " <td>58</td>\n", + " <td>38</td>\n", + " <td>84</td>\n", + " <td>2</td>\n", + " <td>...</td>\n", + " <td>411</td>\n", + " <td>327</td>\n", + " <td>0.0</td>\n", + " <td>5.0</td>\n", + " <td>222</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>176</td>\n", + " <td>0.0</td>\n", + " <td>24.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21.0</td>\n", + " <td>983</td>\n", + " <td>4.0</td>\n", + " <td>1.0</td>\n", + " <td>20570</td>\n", + " <td>414</td>\n", + " <td>345</td>\n", + " <td>236</td>\n", + " <td>376</td>\n", + " <td>23</td>\n", + " <td>...</td>\n", + " <td>1884</td>\n", + " <td>1508</td>\n", + " 
<td>0.0</td>\n", + " <td>7.0</td>\n", + " <td>3501</td>\n", + " <td>5.0</td>\n", + " <td>4.0</td>\n", + " <td>1007</td>\n", + " <td>0.0</td>\n", + " <td>68.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9.0</td>\n", + " <td>832</td>\n", + " <td>4.0</td>\n", + " <td>1.0</td>\n", + " <td>20660</td>\n", + " <td>513</td>\n", + " <td>185</td>\n", + " <td>266</td>\n", + " <td>192</td>\n", + " <td>13</td>\n", + " <td>...</td>\n", + " <td>2038</td>\n", + " <td>1846</td>\n", + " <td>0.0</td>\n", + " <td>4.0</td>\n", + " <td>2931</td>\n", + " <td>1.0</td>\n", + " <td>3.0</td>\n", + " <td>611</td>\n", + " <td>0.0</td>\n", + " <td>47.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.0</td>\n", + " <td>205</td>\n", + " <td>2.0</td>\n", + " <td>2.0</td>\n", + " <td>20740</td>\n", + " <td>154</td>\n", + " <td>74</td>\n", + " <td>119</td>\n", + " <td>48</td>\n", + " <td>5</td>\n", + " <td>...</td>\n", + " <td>378</td>\n", + " <td>330</td>\n", + " <td>0.0</td>\n", + " <td>2.0</td>\n", + " <td>640</td>\n", + " <td>1.0</td>\n", + " <td>2.0</td>\n", + " <td>277</td>\n", + " <td>0.0</td>\n", + " <td>29.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 37 columns</p>\n", + "</div>" + ], + "text/plain": [ + " a40_abduction_and_related_offences b30_burglary_break_and_enter \\\n", + "0 1.0 27 \n", + "1 0.0 69 \n", + "2 21.0 983 \n", + "3 9.0 832 \n", + "4 0.0 205 \n", + "\n", + " a10_homicide_and_related_offences \\\n", + "0 0.0 \n", + "1 0.0 \n", + "2 4.0 \n", + "3 4.0 \n", + "4 2.0 \n", + "\n", + " f30_other_government_regulatory_offences lga_code11 \\\n", + "0 1.0 20110 \n", + "1 1.0 20260 \n", + "2 1.0 20570 \n", + "3 1.0 20660 \n", + "4 2.0 20740 \n", + "\n", + " total_division_c_offences a30_sexual_offences \\\n", + "0 53 95 \n", + "1 137 58 \n", + "2 414 345 \n", + "3 513 185 \n", + "4 154 74 \n", + "\n", + " a70_stalking_harassment_and_threatening_behaviour \\\n", + "0 11 \n", + "1 38 \n", + "2 236 \n", + "3 266 \n", + "4 
# Crime statistics downloaded from SUDO (section "1) Data from Sudo (Crime)").
# Every missing cell in the CSV is replaced with 0 in the same expression that
# loads it, so `crime_data` is never observable in a half-cleaned state and
# re-running the cell always yields the same frame.
crime_data = (
    pd.read_csv('csa_crime_stats_offences_recorded_offence_type_lga_2010_2019-8538075499928749341.csv')
    .fillna(0)
)

# The `lga_code11` column will be used for matching against the other datasets.
crime_data.head()
((143.18182 -36.89766, 143.17798 -36.9...</td>\n", + " <td>143.217873</td>\n", + " <td>-36.944447</td>\n", + " <td>25990</td>\n", + " <td>Pyrenees</td>\n", + " <td>20142</td>\n", + " <td>Barkly (Vic.)</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>POLYGON ((145.15929 -38.18173, 145.15756 -38.1...</td>\n", + " <td>145.156779</td>\n", + " <td>-38.200888</td>\n", + " <td>25340</td>\n", + " <td>Mornington Peninsula</td>\n", + " <td>20169</td>\n", + " <td>Baxter</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " geometry lon lat \\\n", + "0 POLYGON ((146.00107 -38.32214, 146.00770 -38.3... 146.021697 -38.292841 \n", + "1 POLYGON ((145.27748 -36.41085, 145.29542 -36.4... 145.301089 -36.380130 \n", + "2 POLYGON ((143.86374 -36.70341, 143.86334 -36.7... 143.859519 -36.675198 \n", + "3 POLYGON ((143.18182 -36.89766, 143.17798 -36.9... 143.217873 -36.944447 \n", + "4 POLYGON ((145.15929 -38.18173, 145.15756 -38.1... 145.156779 -38.200888 \n", + "\n", + " lga_code lga_name scc_code scc_name \n", + "0 20830 Baw Baw 20024 Allambee \n", + "1 22830 Greater Shepparton 20063 Ardmona \n", + "2 23940 Loddon 20069 Arnold (Vic.) \n", + "3 25990 Pyrenees 20142 Barkly (Vic.) 
# ---------------------------------------------------------------------------
# Cell: suburb / LGA reference data (used for matching)
# ---------------------------------------------------------------------------

def _first_or_none(values):
    """Return the first element of ``values``, or None when it is empty or None."""
    return values[0] if values else None


# Load the GeoJSON file
gdf = gpd.read_file('georef-australia-state-suburb.geojson')

# Flatten the nested centroid into plain numeric columns.
# NOTE(review): this relies on `geo_point_2d` holding dict-like values with
# 'lon' / 'lat' keys, exactly as the original lambdas did -- confirm this holds
# for the geopandas / GDAL version in use.
gdf['lon'] = gdf['geo_point_2d'].apply(lambda point: point.get('lon'))
gdf['lat'] = gdf['geo_point_2d'].apply(lambda point: point.get('lat'))

# These columns hold sequences; keep only the first entry of each (None when
# the sequence is empty or missing). One loop replaces four copy-pasted lambdas.
for column in ('lga_code', 'lga_name', 'scc_code', 'scc_name'):
    gdf[column] = gdf[column].apply(_first_or_none)

# Keep only the required columns
gdf = gdf[['geometry', 'lon', 'lat', 'lga_code', 'lga_name', 'scc_code', 'scc_name']]
state_Data = pd.DataFrame(gdf)
state_Data.head()

# Optional export (disabled in the original notebook):
# state_Data.to_csv('state_data.csv', index=False)


# ---------------------------------------------------------------------------
# Cell: main data from the Twitter JSON file
# ---------------------------------------------------------------------------
# NOTE: in the notebook this cell begins with the `%%time` cell magic. It is
# shown here as a comment because a cell magic is only valid as the very first
# line of its own cell:
# %%time

# Column order of `main_data`; passed explicitly so the frame keeps its schema
# even when no tweet matches.
_MAIN_DATA_COLUMNS = ['_id', 'text', 'sentiment', 'language', 'area_name']


def _area_name(full_name):
    """Derive the area name from a place ``full_name`` such as 'Ocean Grove, Victoria'.

    Takes the text before the first ', '. If that text contains a '-', only the
    part before the first '-' is kept, with surrounding whitespace stripped.
    """
    area, _, _ = full_name.partition(', ')
    if '-' in area:
        area = area.split('-')[0].strip()
    return area


def extract_victorian_tweets(tweets):
    """Build one record per (tweet, place) pair whose place is in Victoria.

    Parameters
    ----------
    tweets : iterable of dict
        Each tweet must provide '_id' and 'data' (with 'text' and 'lang');
        'includes' / 'includes.places' are optional.

    Returns
    -------
    list of dict
        Records with keys '_id', 'text', 'sentiment', 'language', 'area_name'.
        'sentiment' is None when the tweet carries no 'sentiment' value.
        A tweet that lists several Victorian places yields several records.

    Raises
    ------
    KeyError
        If a matching tweet lacks '_id', 'data', 'text' or 'lang', or a place
        lacks 'full_name'.
    """
    records = []
    for tweet in tweets:
        # Tweets without an 'includes' section (or without places) are skipped
        # instead of aborting the whole run with a KeyError.
        places = (tweet.get('includes') or {}).get('places') or ()
        for place in places:
            full_name = place['full_name']
            if not full_name.endswith(', Victoria'):
                continue
            payload = tweet['data']
            records.append({
                '_id': tweet['_id'],
                #'author_id': payload['author_id'],
                'text': payload['text'],
                'sentiment': payload.get('sentiment', None),
                'language': payload['lang'],
                'area_name': _area_name(full_name),
            })
    return records


# JSON text is UTF-8; state the encoding explicitly so the non-ASCII tweet text
# is decoded the same way regardless of the platform's locale default.
# NOTE(review): the original comment here said "using a bigger data (produces
# 37k rows of data roughly)", yet the file opened is 'smallTwitter.json' --
# confirm which input is intended.
with open('smallTwitter.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

filtered_data = extract_victorian_tweets(data)

main_data = pd.DataFrame(filtered_data, columns=_MAIN_DATA_COLUMNS)
I hope they don’t go up any fur...</td>\n", + " <td>0.090909</td>\n", + " <td>en</td>\n", + " <td>Ocean Grove</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1414036352308445186</td>\n", + " <td>Sunday @afl R17: #aflgiantssuns #afldogsswans ...</td>\n", + " <td>0.000000</td>\n", + " <td>en</td>\n", + " <td>Drysdale</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1414044890040733699</td>\n", + " <td>@BlancheVerlie Sorry to watch the declining si...</td>\n", + " <td>0.058824</td>\n", + " <td>en</td>\n", + " <td>Ocean Grove</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1414057294950649861</td>\n", + " <td>@VicGovAu One lady had over 40 saved photos of...</td>\n", + " <td>0.090909</td>\n", + " <td>en</td>\n", + " <td>Drysdale</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " _id text \\\n", + "0 1414031972041588744 Stella is spending the day with her cousin Mon... \n", + "1 1414032158197354497 @eleaud Oh no! I hope they don’t go up any fur... \n", + "2 1414036352308445186 Sunday @afl R17: #aflgiantssuns #afldogsswans ... \n", + "3 1414044890040733699 @BlancheVerlie Sorry to watch the declining si... \n", + "4 1414057294950649861 @VicGovAu One lady had over 40 saved photos of... 
# Preview the first rows of the tweet table built in the previous cell
# (columns shown in the saved output: _id, text, sentiment, language, area_name).
main_data.head()