Commit 73e560f4 authored by Terry Liao

update

parent 61b3e743
from sklearn.externals import joblib
import json
import couchdb

# load the pre-trained offensive-language classifier
model = joblib.load("train_model.m")

def data_process(tweet, model):
    # target cities to filter on
    cities = ['melbourne', 'sydney', 'adelaide', 'perth', 'brisbane']
    dataset = ['0']  # single-slot list reused as the classifier input
    tweet_id = tweet['_id']
    text = tweet['text']
    lang = tweet['lang']
    # we only care about English tweets
    if lang != 'en':
        return None
    location = tweet['location']
    create_time = tweet['create_time']
    # keep only tweets whose location contains a target city name
    matched = False
    for city in cities:
        if city in location.lower():
            # generalize the location to the bare city name
            matched = True
            location = city
    if not matched:
        return None
    p_tweet = {
        '_id': tweet_id,
        'create_time': create_time,
        'location': location,
        'lang': lang,
        'text': text,
        'if_offensive': "false"
    }
    dataset[0] = text
    predicts = model.predict(dataset)
    if predicts[0] == 1:
        p_tweet['if_offensive'] = "true"
    return p_tweet
# tweet1 = {
#     "_id": "1000029867289690113",
#     "_rev": "1-6fec69500a19192444c2de7e13c37a08",
#     "create_time": "2018-05-25 15:04:34",
#     "user_id": 847624802,
#     "text": "@maiiron_ Teve loco que até Jesus Cristo citou. Buguei meu",
#     "lang": "pt",
#     "location": "Sydney"
# }
# tweet2 = {
#     "_id": "1000108067982200832",
#     "_rev": "1-d23d7bd9bf292aee6423360ac45abb76",
#     "create_time": "2018-05-25 20:15:19",
#     "user_id": 3104553206,
#     "text": "Finished #plant18 https://t.co/GOWl6bsQUO",
#     "lang": "en",
#     "location": "Kimba"
# }
# tweet3 = {
#     "_id": "1000641327279824896",
#     "_rev": "1-bafd33be6cd735694ddcf9557c34656e",
#     "create_time": "2018-05-27 07:34:18",
#     "user_id": 110366337,
#     "text": "Veer's arrival & Rivi's 5th bday Party https://t.co/xZmhoMgV76",
#     "lang": "en",
#     "location": "Melbourne, Australia"
# }
# p = data_process(tweet3, model)
# print(p)
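For context, data_process passes raw tweet strings straight to model.predict, which implies the object saved in train_model.m bundles its own text vectorizer, most likely a scikit-learn Pipeline. A minimal sketch of producing a compatible train_model.m, assuming a CountVectorizer plus LogisticRegression pipeline and a placeholder corpus (the real training data and estimator are not part of this commit):

# Minimal sketch; the corpus and estimator choice are assumptions.
# The commit only shows that model.predict() takes a list of raw
# tweet strings and returns 1 for offensive text.
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

texts = ["example offensive tweet", "example harmless tweet"]   # placeholder corpus
labels = [1, 0]                                                 # 1 = offensive

pipeline = Pipeline([
    ("vectorizer", CountVectorizer()),   # turns raw strings into token counts
    ("classifier", LogisticRegression())
])
pipeline.fit(texts, labels)
joblib.dump(pipeline, "train_model.m")   # loadable by joblib.load above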
from sklearn.externals import joblib
import json
import couchdb

# load the same pre-trained classifier for the batch run
model = joblib.load("train_model.m")

user = "admin"
password = "password"
dbserver = couchdb.Server("http://%s:%s@172.26.38.157:5984/" % (user, password))
db = dbserver["raw_tweets"]
r_db = dbserver["tweet_results"]

dataset = ["1"]  # single-slot list reused as the classifier input
# target cities to filter on
cities = ['melbourne', 'sydney', 'adelaide', 'perth', 'brisbane']

for id in db:
    tweet = db.get(id)
    text = tweet['text']
    lang = tweet['lang']
    location = tweet['location']
    create_time = tweet['create_time']
    user_id = tweet['user_id']
    # find the target city
    flag = False
    for city in cities:
        # the location contains a target city name
        if city in location.lower():
            # generalize the location to the bare city name
            flag = True
            location = city
    # not in the target cities, skip
    if not flag:
        continue
    p_tweet = {
        '_id': id,
        'user_id': user_id,
        'create_time': create_time,
        'location': location,
        'lang': lang,
        'text': text
    }
    # classify English tweets and keep only the offensive ones
    if lang == 'en':
        dataset[0] = text
        predicts = model.predict(dataset)
        if predicts[0] == 1:
            r_db.save(p_tweet)
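Iterating `for id in db` and then calling db.get(id) issues one HTTP round trip per document. A possible optimization sketch, assuming the couchdb-python client used above: CouchDB's built-in _all_docs view can stream every document in a single request via include_docs=True.

import couchdb

dbserver = couchdb.Server("http://admin:password@172.26.38.157:5984/")
db = dbserver["raw_tweets"]

# one request with include_docs=True streams the full documents,
# instead of one GET per document id
for row in db.view('_all_docs', include_docs=True):
    tweet = row.doc
    print(tweet['_id'], tweet.get('lang'))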
@@ -43,6 +43,7 @@ class TweetSearchHavester():
         process_db = self.couch['tweet_results']
         for tweet in tweepy.Cursor(api.user_timeline, id=user_id).items(50):
             # save most recent tweets
+            gp = GP()
             dic = {}
             dic['_id'] = tweet.id_str
             dic['create_time'] = str(tweet.created_at)
@@ -55,7 +56,7 @@ class TweetSearchHavester():
             dic['location'] = tweet.user.location
             # print(dic)
             try:
-                p_dic = date_process(dic,self.model)
+                p_dic = gp.data_process(dic,self.model)
                 if p_dic != None:
                     process_db.save(p_dic)
                 db.save(dic)
@@ -8,6 +8,7 @@ from tweepy import OAuthHandler
 from tweepy import Stream
 from tweepy.streaming import StreamListener
 from sklearn.externals import joblib
+import general_process as GP
 class listener(StreamListener):
     def __init__(self,path):
@@ -27,6 +28,7 @@ class listener(StreamListener):
         return dic
     def on_data(self,data):
         try:
+            gp = GP()
             db = self.couch['raw_tweets']
             id_db = self.couch['user_id']
             pc_db = self.couch['tweet_results']
@@ -34,8 +36,9 @@ class listener(StreamListener):
             dic = self.convertValue(content)
             id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False}
             # print(id_doc)
-            p_dic = date_process(dic,self.model)
+            p_dic = gp.data_process(dic,self.model)
             if p_dic != None:
                 process_db.save(p_dic)
                 id_db.save(id_doc)
             db.save(dic)
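Two details in these hunks are worth flagging. The first file in this commit exposes data_process as a plain module-level function, so `import general_process as GP` binds a module, and the added `gp = GP()` would raise `TypeError: 'module' object is not callable`; the call can go through the module directly. Also, on_data binds the results database as pc_db but then saves through process_db, which is not defined in that scope. A minimal sketch of the intended call path, assuming the first file is saved as general_process.py (the `handle` wrapper below is hypothetical, added only for illustration):

import general_process as GP

def handle(dic, model, results_db):
    # data_process is a module-level function; no instantiation is needed
    p_dic = GP.data_process(dic, model)
    if p_dic is not None:
        # save through the handle actually bound in on_data (pc_db there)
        results_db.save(p_dic)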