From 69eb3c2501673b13e20a87bb13d202e47d8646ad Mon Sep 17 00:00:00 2001
From: Felipe Ramos <framosmorale@student.unimelb.edu.au>
Date: Wed, 26 May 2021 20:19:43 +1000
Subject: [PATCH] cleaned scripts data collect

---
 Data collection/big_to_couch.py               | 43 +---------------
 Data collection/filterbigupdate.py            | 21 +-------
 .../{to_couch.py => preprocess.py}            | 33 ++----------
 Data collection/searchFile_to_couch.py        | 45 +---------------
 Data collection/streamFile_to_couch.py        | 29 +++++++++++
 Data collection/tweet_gatherer_app.py         | 51 +------------------
 6 files changed, 36 insertions(+), 186 deletions(-)
 rename Data collection/{to_couch.py => preprocess.py} (57%)
 create mode 100644 Data collection/streamFile_to_couch.py

diff --git a/Data collection/big_to_couch.py b/Data collection/big_to_couch.py
index 54f8b60..6c6774f 100644
--- a/Data collection/big_to_couch.py	
+++ b/Data collection/big_to_couch.py	
@@ -1,48 +1,7 @@
 import json
 import couchdb
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
-
-
-def classifyTwt(twt):
-    tags = []
-    sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby']
-
-    if 'text' in twt:
-        text =  twt['text']
-    else:
-        text = twt['extended_tweet']['full_text']
-    for s in sports:
-        s = s.lower()
-        if s in text.lower():
-            if s in ['afl','footie','#afl','footy', '#afl']:
-                tag = 'Footie'
-            elif s in ['tennis', 'australianopen']:
-                tag = 'Tennis'
-            elif s in ['nrl', '#nrl', 'rugby']:
-                tag = 'Rugby'
-            else:
-                tag = s
-            tags.append(tag)
-    return tags
 
 coords = []
 areas = {}
diff --git a/Data collection/filterbigupdate.py b/Data collection/filterbigupdate.py
index 0083cfe..57c40a4 100644
--- a/Data collection/filterbigupdate.py	
+++ b/Data collection/filterbigupdate.py	
@@ -1,5 +1,6 @@
 import json
 import couchdb
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
 couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
 db = couchserver.create("live_demo2")
@@ -7,26 +8,6 @@ db = couchserver.create("live_demo2")
 with open('./'+'big_filtered.json', 'r', encoding = 'utf-8') as file:
     biga = json.load(file)
 
-
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
     
 for k,v in biga.items():
     loc = v['location']
diff --git a/Data collection/to_couch.py b/Data collection/preprocess.py
similarity index 57%
rename from Data collection/to_couch.py
rename to Data collection/preprocess.py
index 494eaa3..ce0d908 100644
--- a/Data collection/to_couch.py	
+++ b/Data collection/preprocess.py	
@@ -1,25 +1,3 @@
-import json
-import codecs
-import ast
-import couchdb
-import os
-
-
-data = []
-log = []
-couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
-db = couchserver['live_demo2']
-
-
-for filename in os.listdir('./UPLOAD/'):
-    if filename.startswith('twt_stream'):
-        with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file:
-            for line in file:
-                try:
-                    line = ast.literal_eval(line)
-                    data.append(line)
-                except:
-                    log.append(filename)
 
 def save2Couch(twt, db):
     doc = twt
@@ -37,6 +15,7 @@ def updateCDB(twt, db):
     db[twt['_id']] = doc
     print('update:')
 
+
 def replaceCDB(twt,db):
     del db[twt['id_str']]
     db.save(twt)
@@ -49,7 +28,7 @@ def classifyTwt(twt):
     if 'text' in twt:
         text =  twt['text']
     else:
-        text = twt['extended_tweet']['full_text']
+        text = twt['full_text']
     for s in sports:
         s = s.lower()
         if s in text.lower():
@@ -63,10 +42,4 @@ def classifyTwt(twt):
                 tag = s
             tags.append(tag)
     tags = list(set(tags))
-    return tags
-
-for i in data:
-    twt = next(iter(i.values()))
-    twt['tags'] = classifyTwt(twt)
-    if len(twt['tags'])> 0:
-        save2Couch(twt, db)
\ No newline at end of file
+    return tags
\ No newline at end of file
diff --git a/Data collection/searchFile_to_couch.py b/Data collection/searchFile_to_couch.py
index f9bda05..67e942a 100644
--- a/Data collection/searchFile_to_couch.py	
+++ b/Data collection/searchFile_to_couch.py	
@@ -2,55 +2,12 @@
 import json
 import couchdb
 import os
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
 data = {}
 couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
 db = couchserver['live_demo2']
 
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
-
-
-def classifyTwt(twt):
-    tags = []
-    sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby']
-
-    if 'text' in twt:
-        text =  twt['text']
-    else:
-        text = twt['full_text']
-    for s in sports:
-        s = s.lower()
-        if s in text.lower():
-            if s in ['afl','footie','#afl','footy', '#afl']:
-                tag = 'Footie'
-            elif s in ['tennis', 'australianopen']:
-                tag = 'Tennis'
-            elif s in ['nrl', '#nrl', 'rugby']:
-                tag = 'Rugby'
-            else:
-                tag = s
-            tags.append(tag)
-    tags = list(set(tags))
-    return tags
 
 for filename in os.listdir('./UPLOAD'):
     if filename.startswith('twt_search'):
diff --git a/Data collection/streamFile_to_couch.py b/Data collection/streamFile_to_couch.py
new file mode 100644
index 0000000..cb489df
--- /dev/null
+++ b/Data collection/streamFile_to_couch.py	
@@ -0,0 +1,29 @@
+import json
+import codecs
+import ast
+import couchdb
+import os
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
+
+
+data = []  # records parsed from the twt_stream* files, one per readable line
+log = []  # filename appended once for every line that failed to parse
+couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
+db = couchserver['live_demo2']
+
+
+for filename in os.listdir('./UPLOAD/'):
+    if filename.startswith('twt_stream'):
+        with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file:
+            for line in file:
+                try:
+                    line = ast.literal_eval(line)
+                    data.append(line)
+                except Exception:  # skip unparsable lines without swallowing Ctrl-C
+                    log.append(filename)
+
+for i in data:
+    twt = next(iter(i.values()))  # the tweet is the first value of each record
+    twt['tags'] = classifyTwt(twt)
+    if twt['tags']:  # only store tweets that received at least one tag
+        save2Couch(twt, db)
\ No newline at end of file
diff --git a/Data collection/tweet_gatherer_app.py b/Data collection/tweet_gatherer_app.py
index 98e33a7..72f97c9 100644
--- a/Data collection/tweet_gatherer_app.py	
+++ b/Data collection/tweet_gatherer_app.py	
@@ -8,6 +8,7 @@ import os.path
 import datetime 
 import re
 from urllib3.exceptions import ProtocolError
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
 collector_type = sys.argv[1]
 region = sys.argv[2]
@@ -15,60 +16,12 @@ region = sys.argv[2]
 #change code to your current Twitter account
 
 
-
 if os.path.isfile('twt_stream{}.json'.format(region)):
     outfile = open('twt_stream{}.json'.format(region), 'a+', encoding ='utf-8')
 else:
     outfile = open('twt_stream{}.json'.format(region), 'w+', encoding ='utf-8')
 
 
-
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        print(doc['_id'] +" already exist in the database")
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
-
-
-def classifyTwt(twt):
-    tags = []
-    sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby']
-
-    if 'text' in twt:
-        text =  twt['text']
-    else:
-        text = twt['extended_tweet']['full_text']
-    for s in sports:
-        s = s.lower()
-        if s in text.lower():
-            if s in ['afl','footie','#afl','footy', '#afl']:
-                tag = 'Footie'
-            elif s in ['tennis', 'australianopen']:
-                tag = 'Tennis'
-            elif s in ['nrl', '#nrl', 'rugby']:
-                tag = 'Rugby'
-            else:
-                tag = s
-            tags.append(tag)
-    tags = list(set(tags))
-    return tags
-
-
 class StreamListener(tweepy.StreamListener):
     
     def __init__(self, config, db):
@@ -177,8 +130,6 @@ if __name__ == "__main__":
                     tmp.append(tweet)
 
 
-
-
         for v in tmp:
             final[v.id_str] = v
 
-- 
GitLab