diff --git a/Data collection/big_to_couch.py b/Data collection/big_to_couch.py index 54f8b6085b43f58fbe923cfed4a7fdc00e84c58f..6c6774f251ba0abad4caf8612d51c9ea6d3e735e 100644 --- a/Data collection/big_to_couch.py +++ b/Data collection/big_to_couch.py @@ -1,48 +1,7 @@ import json import couchdb +from preprocess import save2couch, updateCBD, replaceCDB, classifyTwt -def save2Couch(twt, db): - doc = twt - doc['_id'] = doc['id_str'] - try: - db.save(doc) - except: - replaceCDB(doc, db) - - -def updateCDB(twt, db): - doc = db.get(twt['id_str']) - up = twt['tags'] - doc['tags'] = up - db[twt['_id']] = doc - print('update:') - -def replaceCDB(twt,db): - del db[twt['id_str']] - db.save(twt) - - -def classifyTwt(twt): - tags = [] - sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby'] - - if 'text' in twt: - text = twt['text'] - else: - text = twt['extended_tweet']['full_text'] - for s in sports: - s = s.lower() - if s in text.lower(): - if s in ['afl','footie','#afl','footy', '#afl']: - tag = 'Footie' - elif s in ['tennis', 'australianopen']: - tag = 'Tennis' - elif s in ['nrl', '#nrl', 'rugby']: - tag = 'Rugby' - else: - tag = s - tags.append(tag) - return tags coords = [] areas = {} diff --git a/Data collection/filterbigupdate.py b/Data collection/filterbigupdate.py index 0083cfea8478a90a82a9d3f2588beae84d392aee..57c40a49e55fbf93063d53ee32d01b8411c66708 100644 --- a/Data collection/filterbigupdate.py +++ b/Data collection/filterbigupdate.py @@ -1,5 +1,6 @@ import json import couchdb +from preprocess import save2couch, updateCBD, replaceCDB, classifyTwt couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/") db = couchserver.create("live_demo2") @@ -7,26 +8,6 @@ db = couchserver.create("live_demo2") with open('./'+'big_filtered.json', 'r', encoding = 'utf-8') as file: biga = json.load(file) - -def save2Couch(twt, db): - doc = twt - doc['_id'] = doc['id_str'] - try: - db.save(doc) - except: - replaceCDB(doc, db) - - -def updateCDB(twt, db): - doc = db.get(twt['id_str']) - up = twt['tags'] - doc['tags'] = up - db[twt['_id']] = doc - print('update:') - -def replaceCDB(twt,db): - del db[twt['id_str']] - db.save(twt) for k,v in biga.items(): loc = v['location'] diff --git a/Data collection/to_couch.py b/Data collection/preprocess.py similarity index 57% rename from Data collection/to_couch.py rename to Data collection/preprocess.py index 494eaa3191934dfa85de40853af7d02f8be8f9f3..ce0d9082a02083c2c6e119188c5d20656b7ee425 100644 --- a/Data collection/to_couch.py +++ b/Data collection/preprocess.py @@ -1,25 +1,3 @@ -import json -import codecs -import ast -import couchdb -import os - - -data = [] -log = [] -couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/") -db = couchserver['live_demo2'] - - -for filename in os.listdir('./UPLOAD/'): - if filename.startswith('twt_stream'): - with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file: - for line in file: - try: - line = ast.literal_eval(line) - data.append(line) - except: - log.append(filename) def save2Couch(twt, db): doc = twt @@ -37,6 +15,7 @@ def updateCDB(twt, db): db[twt['_id']] = doc print('update:') + def replaceCDB(twt,db): del db[twt['id_str']] db.save(twt) @@ -49,7 +28,7 @@ def classifyTwt(twt): if 'text' in twt: text = twt['text'] else: - text = twt['extended_tweet']['full_text'] + text = twt['full_text'] for s in sports: s = s.lower() if s in text.lower(): @@ -63,10 +42,4 @@ def classifyTwt(twt): tag = s tags.append(tag) tags = list(set(tags)) - return tags - -for i in data: - twt = next(iter(i.values())) - twt['tags'] = classifyTwt(twt) - if len(twt['tags'])> 0: - save2Couch(twt, db) \ No newline at end of file + return tags \ No newline at end of file diff --git a/Data collection/searchFile_to_couch.py b/Data collection/searchFile_to_couch.py index f9bda056e9f52b987e3f2caa45d2b80163097dfd..67e942a591478678703a8571c2f986e0e1212a48 100644 --- a/Data collection/searchFile_to_couch.py +++ b/Data collection/searchFile_to_couch.py @@ -2,55 +2,12 @@ import json import couchdb import os +from preprocess import save2couch, updateCBD, replaceCDB, classifyTwt data = {} couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/") db = couchserver['live_demo2'] -def save2Couch(twt, db): - doc = twt - doc['_id'] = doc['id_str'] - try: - db.save(doc) - except: - replaceCDB(doc, db) - - -def updateCDB(twt, db): - doc = db.get(twt['id_str']) - up = twt['tags'] - doc['tags'] = up - db[twt['_id']] = doc - print('update:') - - -def replaceCDB(twt,db): - del db[twt['id_str']] - db.save(twt) - - -def classifyTwt(twt): - tags = [] - sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby'] - - if 'text' in twt: - text = twt['text'] - else: - text = twt['full_text'] - for s in sports: - s = s.lower() - if s in text.lower(): - if s in ['afl','footie','#afl','footy', '#afl']: - tag = 'Footie' - elif s in ['tennis', 'australianopen']: - tag = 'Tennis' - elif s in ['nrl', '#nrl', 'rugby']: - tag = 'Rugby' - else: - tag = s - tags.append(tag) - tags = list(set(tags)) - return tags for filename in os.listdir('./UPLOAD'): if filename.startswith('twt_search'): diff --git a/Data collection/streamFile_to_couch.py b/Data collection/streamFile_to_couch.py new file mode 100644 index 0000000000000000000000000000000000000000..cb489df3029bf4d81f3ee9f82253eb7fc07733ba --- /dev/null +++ b/Data collection/streamFile_to_couch.py @@ -0,0 +1,29 @@ +import json +import codecs +import ast +import couchdb +import os +from preprocess import save2couch, updateCBD, replaceCDB, classifyTwt + + +data = [] +log = [] +couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/") +db = couchserver['live_demo2'] + + +for filename in os.listdir('./UPLOAD/'): + if filename.startswith('twt_stream'): + with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file: + for line in file: + try: + line = ast.literal_eval(line) + data.append(line) + except: + log.append(filename) + +for i in data: + twt = next(iter(i.values())) + twt['tags'] = classifyTwt(twt) + if len(twt['tags'])> 0: + save2Couch(twt, db) \ No newline at end of file diff --git a/Data collection/tweet_gatherer_app.py b/Data collection/tweet_gatherer_app.py index 98e33a73560125409aa453337136855dba073d30..72f97c9977c56a04c36227c8bad78695a1751909 100644 --- a/Data collection/tweet_gatherer_app.py +++ b/Data collection/tweet_gatherer_app.py @@ -8,6 +8,7 @@ import os.path import datetime import re from urllib3.exceptions import ProtocolError +from preprocess import save2couch, updateCBD, replaceCDB, classifyTwt collector_type = sys.argv[1] region = sys.argv[2] @@ -15,60 +16,12 @@ region = sys.argv[2] #change code to your current Twitter account - if os.path.isfile('twt_stream{}.json'.format(region)): outfile = open('twt_stream{}.json'.format(region), 'a+', encoding ='utf-8') else: outfile = open('twt_stream{}.json'.format(region), 'w+', encoding ='utf-8') - -def save2Couch(twt, db): - doc = twt - doc['_id'] = doc['id_str'] - try: - db.save(doc) - except: - print(doc['_id'] +" already exist in the database") - replaceCDB(doc, db) - - -def updateCDB(twt, db): - doc = db.get(twt['id_str']) - up = twt['tags'] - doc['tags'] = up - db[twt['_id']] = doc - print('update:') - -def replaceCDB(twt,db): - del db[twt['id_str']] - db.save(twt) - - -def classifyTwt(twt): - tags = [] - sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby'] - - if 'text' in twt: - text = twt['text'] - else: - text = twt['extended_tweet']['full_text'] - for s in sports: - s = s.lower() - if s in text.lower(): - if s in ['afl','footie','#afl','footy', '#afl']: - tag = 'Footie' - elif s in ['tennis', 'australianopen']: - tag = 'Tennis' - elif s in ['nrl', '#nrl', 'rugby']: - tag = 'Rugby' - else: - tag = s - tags.append(tag) - tags = list(set(tags)) - return tags - - class StreamListener(tweepy.StreamListener): def __init__(self, config, db): @@ -177,8 +130,6 @@ if __name__ == "__main__": tmp.append(tweet) - - for v in tmp: final[v.id_str] = v