Skip to content
Snippets Groups Projects
Commit 69eb3c25 authored by Felipe Ramos's avatar Felipe Ramos
Browse files

cleaned scripts data collect

parent 26579962
No related branches found
No related tags found
No related merge requests found
import json
import couchdb
# The helpers are defined and called as save2Couch / updateCDB; the previous
# spelling (save2couch / updateCBD) could never match those names.
# NOTE(review): assumes preprocess exports exactly these names — confirm.
from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
def save2Couch(twt, db):
    """Store a tweet in the database, keyed by the tweet's own id.

    The tweet dict is modified in place: its ``id_str`` is copied to ``_id``
    before saving. When the save is rejected because of a document conflict,
    the stored document is replaced via ``replaceCDB``.

    Args:
        twt: Tweet dict; must contain ``id_str``.
        db: Database object exposing ``save(doc)`` (presumably a
            ``couchdb.Database`` — confirm against the caller).
    """
    doc = twt
    doc['_id'] = doc['id_str']
    try:
        db.save(doc)
    except couchdb.http.ResourceConflict:
        # Only a conflict means "already stored". Any other failure (network
        # error, Ctrl-C, ...) must propagate rather than trigger a delete of
        # the existing document.
        replaceCDB(doc, db)
def updateCDB(twt, db):
    """Overwrite the ``tags`` of the stored document belonging to a tweet.

    Args:
        twt: Tweet dict with ``id_str`` and ``tags``. ``_id`` is used as the
            storage key when present, otherwise ``id_str``.
        db: Mapping-like database supporting ``get(key)`` and item assignment.

    Raises:
        KeyError: If no document is stored under the tweet's ``id_str``.
    """
    doc_id = twt['id_str']
    doc = db.get(doc_id)
    if doc is None:
        # get() yields None for an unknown id (dict / couchdb Database
        # semantics); fail clearly instead of with a TypeError below.
        raise KeyError(doc_id)
    doc['tags'] = twt['tags']
    # save2Couch sets _id to id_str, but a tweet that never went through it
    # has no _id, so fall back to the id used for the lookup.
    db[twt.get('_id', doc_id)] = doc
    print('update:')
def replaceCDB(twt,db):
    """Drop the stored document for a tweet, then save the tweet again.

    twt -- tweet dict; its ``id_str`` is the key of the document to drop
    db  -- database object supporting ``del db[key]`` and ``save(doc)``
    """
    stale_key = twt['id_str']
    # Remove the old document before writing the new one.
    del db[stale_key]
    db.save(twt)
def classifyTwt(twt):
    """Return the sport tags whose keywords appear in a tweet's text.

    Keywords are matched as case-insensitive substrings. Football, tennis and
    rugby keywords collapse to 'Footie', 'Tennis' and 'Rugby'; every other
    keyword is its own lower-case tag.

    Args:
        twt: Tweet dict. The text is read from ``text``; failing that from
            ``extended_tweet['full_text']``, then from ``full_text``.

    Returns:
        List of distinct tags in keyword order; empty when nothing matches
        or the tweet carries no text.
    """
    # (keyword, tag) pairs, kept in the order the keywords are checked.
    keyword_tags = (
        ('afl', 'Footie'),
        ('tennis', 'Tennis'),
        ('footie', 'Footie'),
        ('swimming', 'swimming'),
        ('australianopen', 'Tennis'),
        ('footy', 'Footie'),
        ('soccer', 'soccer'),
        ('cricket', 'cricket'),
        ('#afl', 'Footie'),
        ('netball', 'netball'),
        ('basketball', 'basketball'),
        ('nrl', 'Rugby'),
        ('#nrl', 'Rugby'),
        ('rugby', 'Rugby'),
    )
    if 'text' in twt:
        text = twt['text']
    else:
        # Accept both tweet shapes used by the collectors instead of raising
        # KeyError when one of them is absent.
        extended = twt.get('extended_tweet') or {}
        text = extended.get('full_text') or twt.get('full_text', '')
    text = text.lower()  # lower-case once, not once per keyword
    tags = []
    for keyword, tag in keyword_tags:
        # Several keywords share a tag; report each tag only once.
        if keyword in text and tag not in tags:
            tags.append(tag)
    return tags
coords = []
areas = {}
......
import json
import couchdb
# The helpers are defined as save2Couch / updateCDB; the previous spelling
# (save2couch / updateCBD) could never match those names.
# NOTE(review): assumes preprocess exports exactly these names — confirm.
from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
# NOTE(review): admin credentials are hard-coded in this URL; consider moving
# them out of source control.
couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
# create() makes a new database — presumably fails when "live_demo2" already
# exists; confirm before re-running this script.
db = couchserver.create("live_demo2")
......@@ -8,26 +9,6 @@ with open('./'+'big_filtered.json', 'r', encoding = 'utf-8') as file:
biga = json.load(file)
def save2Couch(twt, db):
    """Store a tweet in the database, keyed by the tweet's own id.

    The tweet dict is modified in place: its ``id_str`` is copied to ``_id``
    before saving. When the save is rejected because of a document conflict,
    the stored document is replaced via ``replaceCDB``.

    Args:
        twt: Tweet dict; must contain ``id_str``.
        db: Database object exposing ``save(doc)`` (presumably a
            ``couchdb.Database`` — confirm against the caller).
    """
    doc = twt
    doc['_id'] = doc['id_str']
    try:
        db.save(doc)
    except couchdb.http.ResourceConflict:
        # Only a conflict means "already stored". Any other failure (network
        # error, Ctrl-C, ...) must propagate rather than trigger a delete of
        # the existing document.
        replaceCDB(doc, db)
def updateCDB(twt, db):
    """Overwrite the ``tags`` of the stored document belonging to a tweet.

    Args:
        twt: Tweet dict with ``id_str`` and ``tags``. ``_id`` is used as the
            storage key when present, otherwise ``id_str``.
        db: Mapping-like database supporting ``get(key)`` and item assignment.

    Raises:
        KeyError: If no document is stored under the tweet's ``id_str``.
    """
    doc_id = twt['id_str']
    doc = db.get(doc_id)
    if doc is None:
        # get() yields None for an unknown id (dict / couchdb Database
        # semantics); fail clearly instead of with a TypeError below.
        raise KeyError(doc_id)
    doc['tags'] = twt['tags']
    # save2Couch sets _id to id_str, but a tweet that never went through it
    # has no _id, so fall back to the id used for the lookup.
    db[twt.get('_id', doc_id)] = doc
    print('update:')
def replaceCDB(twt,db):
    """Drop the stored document for a tweet, then save the tweet again.

    twt -- tweet dict; its ``id_str`` is the key of the document to drop
    db  -- database object supporting ``del db[key]`` and ``save(doc)``
    """
    stale_key = twt['id_str']
    # Remove the old document before writing the new one.
    del db[stale_key]
    db.save(twt)
for k,v in biga.items():
loc = v['location']
v['region'] = loc.capitalize()
......
import json
import codecs
import ast
import couchdb
import os
data = []  # one parsed record per successfully parsed line
log = []   # file name appended once for every line that failed to parse
# NOTE(review): admin credentials are hard-coded in this URL; consider moving
# them out of source control.
couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
db = couchserver['live_demo2']
# Read every 'twt_stream*' dump in ./UPLOAD/; each line is parsed as one
# Python literal.
for filename in os.listdir('./UPLOAD/'):
    if filename.startswith('twt_stream'):
        with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file:
            for line in file:
                try:
                    line = ast.literal_eval(line)
                    data.append(line)
                except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                    # The errors literal_eval documents for malformed input.
                    # Unlike a bare except, this lets Ctrl-C / SystemExit stop
                    # the load.
                    log.append(filename)
def save2Couch(twt, db):
doc = twt
......@@ -37,6 +15,7 @@ def updateCDB(twt, db):
db[twt['_id']] = doc
print('update:')
def replaceCDB(twt,db):
    """Drop the stored document for a tweet, then save the tweet again.

    twt -- tweet dict; its ``id_str`` is the key of the document to drop
    db  -- database object supporting ``del db[key]`` and ``save(doc)``
    """
    stale_key = twt['id_str']
    # Remove the old document before writing the new one.
    del db[stale_key]
    db.save(twt)
......@@ -49,7 +28,7 @@ def classifyTwt(twt):
if 'text' in twt:
text = twt['text']
else:
text = twt['extended_tweet']['full_text']
text = twt['full_text']
for s in sports:
s = s.lower()
if s in text.lower():
......@@ -64,9 +43,3 @@ def classifyTwt(twt):
tags.append(tag)
tags = list(set(tags))
return tags
\ No newline at end of file
# Tag every loaded record and store only those that matched a sport keyword.
for record in data:
    # Each record is a dict; the tweet is its first value.
    twt = next(iter(record.values()))
    tags = classifyTwt(twt)
    twt['tags'] = tags
    if not tags:
        continue
    save2Couch(twt, db)
\ No newline at end of file
......@@ -2,55 +2,12 @@
import json
import couchdb
import os
# The helpers are defined as save2Couch / updateCDB; the previous spelling
# (save2couch / updateCBD) could never match those names.
# NOTE(review): assumes preprocess exports exactly these names — confirm.
from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
data = {}
# NOTE(review): admin credentials are hard-coded in this URL; consider moving
# them out of source control.
couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
db = couchserver['live_demo2']
def save2Couch(twt, db):
    """Store a tweet in the database, keyed by the tweet's own id.

    The tweet dict is modified in place: its ``id_str`` is copied to ``_id``
    before saving. When the save is rejected because of a document conflict,
    the stored document is replaced via ``replaceCDB``.

    Args:
        twt: Tweet dict; must contain ``id_str``.
        db: Database object exposing ``save(doc)`` (presumably a
            ``couchdb.Database`` — confirm against the caller).
    """
    doc = twt
    doc['_id'] = doc['id_str']
    try:
        db.save(doc)
    except couchdb.http.ResourceConflict:
        # Only a conflict means "already stored". Any other failure (network
        # error, Ctrl-C, ...) must propagate rather than trigger a delete of
        # the existing document.
        replaceCDB(doc, db)
def updateCDB(twt, db):
    """Overwrite the ``tags`` of the stored document belonging to a tweet.

    Args:
        twt: Tweet dict with ``id_str`` and ``tags``. ``_id`` is used as the
            storage key when present, otherwise ``id_str``.
        db: Mapping-like database supporting ``get(key)`` and item assignment.

    Raises:
        KeyError: If no document is stored under the tweet's ``id_str``.
    """
    doc_id = twt['id_str']
    doc = db.get(doc_id)
    if doc is None:
        # get() yields None for an unknown id (dict / couchdb Database
        # semantics); fail clearly instead of with a TypeError below.
        raise KeyError(doc_id)
    doc['tags'] = twt['tags']
    # save2Couch sets _id to id_str, but a tweet that never went through it
    # has no _id, so fall back to the id used for the lookup.
    db[twt.get('_id', doc_id)] = doc
    print('update:')
def replaceCDB(twt,db):
    """Drop the stored document for a tweet, then save the tweet again.

    twt -- tweet dict; its ``id_str`` is the key of the document to drop
    db  -- database object supporting ``del db[key]`` and ``save(doc)``
    """
    stale_key = twt['id_str']
    # Remove the old document before writing the new one.
    del db[stale_key]
    db.save(twt)
def classifyTwt(twt):
    """Return the sport tags whose keywords appear in a tweet's text.

    Keywords are matched as case-insensitive substrings. Football, tennis and
    rugby keywords collapse to 'Footie', 'Tennis' and 'Rugby'; every other
    keyword is its own lower-case tag.

    Args:
        twt: Tweet dict. The text is read from ``text``; failing that from
            ``full_text``, then from ``extended_tweet['full_text']``.

    Returns:
        List of distinct tags in keyword order (deterministic, unlike the
        former ``list(set(...))``); empty when nothing matches or the tweet
        carries no text.
    """
    # (keyword, tag) pairs, kept in the order the keywords are checked.
    keyword_tags = (
        ('afl', 'Footie'),
        ('tennis', 'Tennis'),
        ('footie', 'Footie'),
        ('swimming', 'swimming'),
        ('australianopen', 'Tennis'),
        ('footy', 'Footie'),
        ('soccer', 'soccer'),
        ('cricket', 'cricket'),
        ('#afl', 'Footie'),
        ('netball', 'netball'),
        ('basketball', 'basketball'),
        ('nrl', 'Rugby'),
        ('#nrl', 'Rugby'),
        ('rugby', 'Rugby'),
    )
    if 'text' in twt:
        text = twt['text']
    else:
        # Accept both tweet shapes used by the collectors instead of raising
        # KeyError when one of them is absent.
        text = twt.get('full_text')
        if text is None:
            text = (twt.get('extended_tweet') or {}).get('full_text', '')
    text = text.lower()  # lower-case once, not once per keyword
    tags = []
    for keyword, tag in keyword_tags:
        # Several keywords share a tag; report each tag only once.
        if keyword in text and tag not in tags:
            tags.append(tag)
    return tags
for filename in os.listdir('./UPLOAD'):
if filename.startswith('twt_search'):
......
import json
import codecs
import ast
import couchdb
import os
# The helper is called below as save2Couch; the previous spelling of the
# import (save2couch / updateCBD) could never match the names in use.
# NOTE(review): assumes preprocess exports exactly these names — confirm.
from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
data = []  # one parsed record per successfully parsed line
log = []   # file name appended once for every line that failed to parse
# NOTE(review): admin credentials are hard-coded in this URL; consider moving
# them out of source control.
couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
db = couchserver['live_demo2']
# Read every 'twt_stream*' dump in ./UPLOAD/; each line is parsed as one
# Python literal.
for filename in os.listdir('./UPLOAD/'):
    if filename.startswith('twt_stream'):
        with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file:
            for line in file:
                try:
                    line = ast.literal_eval(line)
                    data.append(line)
                except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                    # The errors literal_eval documents for malformed input.
                    # Unlike a bare except, this lets Ctrl-C / SystemExit stop
                    # the load.
                    log.append(filename)
# Tag every loaded record and store only those that matched a sport keyword.
for record in data:
    # Each record is a dict; the tweet is its first value.
    twt = next(iter(record.values()))
    tags = classifyTwt(twt)
    twt['tags'] = tags
    if not tags:
        continue
    save2Couch(twt, db)
\ No newline at end of file
......@@ -8,6 +8,7 @@ import os.path
import datetime
import re
from urllib3.exceptions import ProtocolError
# The helpers are defined as save2Couch / updateCDB; the previous spelling
# (save2couch / updateCBD) could never match those names.
# NOTE(review): assumes preprocess exports exactly these names — confirm.
from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
# Positional command-line arguments: collector type first, region second.
collector_type = sys.argv[1]
region = sys.argv[2]
......@@ -15,60 +16,12 @@ region = sys.argv[2]
#change code to your current Twitter account
# Open the per-region stream dump: append when it already exists, otherwise
# create it.
stream_path = 'twt_stream{}.json'.format(region)
outfile = open(stream_path, 'a+' if os.path.isfile(stream_path) else 'w+', encoding ='utf-8')
def save2Couch(twt, db):
    """Store a tweet in the database, keyed by the tweet's own id.

    The tweet dict is modified in place: its ``id_str`` is copied to ``_id``
    before saving. When the save fails, the tweet is reported as already
    present and the stored document is replaced via ``replaceCDB``.

    Args:
        twt: Tweet dict; must contain ``id_str``.
        db: Database object exposing ``save(doc)``.
    """
    doc = twt
    doc['_id'] = doc['id_str']
    try:
        db.save(doc)
    except Exception:
        # A bare except would also catch KeyboardInterrupt / SystemExit and
        # answer Ctrl-C by deleting the stored document.
        # NOTE(review): narrow further to the database's conflict error once
        # the client library imported by this file is confirmed.
        print(doc['_id'] +" already exist in the database")
        replaceCDB(doc, db)
def updateCDB(twt, db):
    """Overwrite the ``tags`` of the stored document belonging to a tweet.

    Args:
        twt: Tweet dict with ``id_str`` and ``tags``. ``_id`` is used as the
            storage key when present, otherwise ``id_str``.
        db: Mapping-like database supporting ``get(key)`` and item assignment.

    Raises:
        KeyError: If no document is stored under the tweet's ``id_str``.
    """
    doc_id = twt['id_str']
    doc = db.get(doc_id)
    if doc is None:
        # get() yields None for an unknown id (dict / couchdb Database
        # semantics); fail clearly instead of with a TypeError below.
        raise KeyError(doc_id)
    doc['tags'] = twt['tags']
    # save2Couch sets _id to id_str, but a tweet that never went through it
    # has no _id, so fall back to the id used for the lookup.
    db[twt.get('_id', doc_id)] = doc
    print('update:')
def replaceCDB(twt,db):
    """Drop the stored document for a tweet, then save the tweet again.

    twt -- tweet dict; its ``id_str`` is the key of the document to drop
    db  -- database object supporting ``del db[key]`` and ``save(doc)``
    """
    stale_key = twt['id_str']
    # Remove the old document before writing the new one.
    del db[stale_key]
    db.save(twt)
def classifyTwt(twt):
    """Return the sport tags whose keywords appear in a tweet's text.

    Keywords are matched as case-insensitive substrings. Football, tennis and
    rugby keywords collapse to 'Footie', 'Tennis' and 'Rugby'; every other
    keyword is its own lower-case tag.

    Args:
        twt: Tweet dict. The text is read from ``text``; failing that from
            ``extended_tweet['full_text']``, then from ``full_text``.

    Returns:
        List of distinct tags in keyword order (deterministic, unlike the
        former ``list(set(...))``); empty when nothing matches or the tweet
        carries no text.
    """
    # (keyword, tag) pairs, kept in the order the keywords are checked.
    keyword_tags = (
        ('afl', 'Footie'),
        ('tennis', 'Tennis'),
        ('footie', 'Footie'),
        ('swimming', 'swimming'),
        ('australianopen', 'Tennis'),
        ('footy', 'Footie'),
        ('soccer', 'soccer'),
        ('cricket', 'cricket'),
        ('#afl', 'Footie'),
        ('netball', 'netball'),
        ('basketball', 'basketball'),
        ('nrl', 'Rugby'),
        ('#nrl', 'Rugby'),
        ('rugby', 'Rugby'),
    )
    if 'text' in twt:
        text = twt['text']
    else:
        # Accept both tweet shapes used by the collectors instead of raising
        # KeyError when one of them is absent.
        extended = twt.get('extended_tweet') or {}
        text = extended.get('full_text') or twt.get('full_text', '')
    text = text.lower()  # lower-case once, not once per keyword
    tags = []
    for keyword, tag in keyword_tags:
        # Several keywords share a tag; report each tag only once.
        if keyword in text and tag not in tags:
            tags.append(tag)
    return tags
class StreamListener(tweepy.StreamListener):
def __init__(self, config, db):
......@@ -177,8 +130,6 @@ if __name__ == "__main__":
tmp.append(tweet)
for v in tmp:
final[v.id_str] = v
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment