From 73e560f43ffaa3b61645cdd67012b896854dc822 Mon Sep 17 00:00:00 2001
From: Terry Liao <jinliang@student.unimelb.edu.au>
Date: Sun, 12 May 2019 20:10:07 +1000
Subject: [PATCH] update

---
 tweet_havester/general_process.py | 84 +++++++++++++++++++++++++++++++
 tweet_havester/process.py         | 58 --------------------------
 tweet_havester/tweepy_search.py   |  3 +-
 tweet_havester/tweepy_stream.py   |  5 ++-
 4 files changed, 90 insertions(+), 60 deletions(-)
 create mode 100644 tweet_havester/general_process.py
 delete mode 100644 tweet_havester/process.py

diff --git a/tweet_havester/general_process.py b/tweet_havester/general_process.py
new file mode 100644
index 0000000..b8f95b0
--- /dev/null
+++ b/tweet_havester/general_process.py
@@ -0,0 +1,84 @@
+from sklearn.externals import joblib
+import json
+import couchdb
+model = joblib.load("train_model.m")
+
+def data_process(tweet,model):
+    """Build the processed record for one harvested tweet.
+
+    Only English tweets are classified. A location that mentions one of
+    the target cities is replaced by the bare lower-case city name; any
+    other location, including a missing one, is passed through unchanged.
+
+    :param tweet: dict with the keys '_id', 'text', 'lang', 'location'
+        and 'create_time'.
+    :param model: classifier whose predict() is called with a list of
+        texts; a prediction of 1 marks the text as offensive.
+    :return: the processed dict, or None when the tweet is not English.
+    """
+    ####filter cities
+    cities =['melbourne','sydney','adelaide','perth','brisbane']
+    tweet_id = tweet['_id']
+    text = tweet['text']
+    lang = tweet['lang']
+    ##we only care English
+    if lang != 'en':
+        return None
+    location = tweet['location']
+    create_time = tweet['create_time']
+
+    #location may be None; match against '' in that case instead of
+    #failing on None.lower()
+    lowered = (location or '').lower()
+    for city in cities:
+        #the location contains target city names
+        if city in lowered:
+            #generalize city name
+            location=city
+            break
+
+    p_tweet={
+        '_id':tweet_id,
+        "create_time":create_time,
+        "location":location,
+        "lang":lang,
+        'text':text,
+        'if_offensive':"false"
+    }
+    #the model is called with a one-element list holding the text
+    predicts = model.predict([text])
+    if predicts[0]==1:
+        p_tweet['if_offensive']="true"
+    return p_tweet
+
+
+# tweet1={
+#     "_id": "1000029867289690113",
+#     "_rev": "1-6fec69500a19192444c2de7e13c37a08",
+#     "create_time": "2018-05-25 15:04:34",
+#     "user_id": 847624802,
+#     "text": "@maiiron_ Teve loco que até Jesus Cristo citou. 
Buguei meu", +# "lang": "pt", +# "location": "Sydney" +# } +# tweet2={ +# "_id": "1000108067982200832", +# "_rev": "1-d23d7bd9bf292aee6423360ac45abb76", +# "create_time": "2018-05-25 20:15:19", +# "user_id": 3104553206, +# "text": "Finished #plant18 https://t.co/GOWl6bsQUO", +# "lang": "en", +# "location": "Kimba" +# } +# tweet3={ +# "_id": "1000641327279824896", +# "_rev": "1-bafd33be6cd735694ddcf9557c34656e", +# "create_time": "2018-05-27 07:34:18", +# "user_id": 110366337, +# "text": "Veer's arrival & Rivi's 5th bday Party https://t.co/xZmhoMgV76", +# "lang": "en", +# "location": "Melbourne, Australia" +# } +# p=data_process(tweet3) +# print(p) + diff --git a/tweet_havester/process.py b/tweet_havester/process.py deleted file mode 100644 index fa33429..0000000 --- a/tweet_havester/process.py +++ /dev/null @@ -1,58 +0,0 @@ -from sklearn.externals import joblib -import json -import couchdb -model = joblib.load("train_model.m") - -dataset=[] - -user = "admin" -password = "password" -dbserver = couchdb.Server("http://admin:password@172.26.38.157:5984/") -db=dbserver["raw_tweets"] -r_db = dbserver["tweet_results"] -count=0 -dataset=["1"] - -####filter cities -cities =['melbourne','sydney','adelaide','perth','brisbane'] - -for id in db: - tweet=db.get(id) - text = tweet['text'] - lang = tweet['lang'] - location = tweet['location'] - create_time = tweet['create_time'] - user_id = tweet['user_id'] - - - #find the target city - flag = False - for city in cities: - #the location contains target city names - if city in location.lower(): - #generalize city name - flag = True - location=city - #not in target cities,continue - if flag == False: - continue - - p_tweet = { - '_id':id, - 'user_id':user_id, - "create_time":create_time, - "location":location, - "lang":lang, - 'text':text - } - if lang =='en': - dataset[0]=text - predicts = model.predict(dataset) - if predicts[0]==1: - r_db.save(p_tweet) - - - - - - diff --git a/tweet_havester/tweepy_search.py 
b/tweet_havester/tweepy_search.py index eaedcb2..7ebdb8a 100644 --- a/tweet_havester/tweepy_search.py +++ b/tweet_havester/tweepy_search.py @@ -43,6 +43,7 @@ class TweetSearchHavester(): process_db = self.couch['tweet_results'] for tweet in tweepy.Cursor(api.user_timeline,id = user_id ).items(50): # save most recent tweets + import general_process as gp dic = {} dic['_id'] = tweet.id_str dic['create_time'] = str(tweet.created_at) @@ -55,7 +56,7 @@ class TweetSearchHavester(): dic['location'] = tweet.user.location # print(dic) try: - p_dic = date_process(dic,self.model) + p_dic = gp.data_process(dic,self.model) if p_dic != None: process_db.save(p_dic) db.save(dic) diff --git a/tweet_havester/tweepy_stream.py b/tweet_havester/tweepy_stream.py index 9cb4884..7f3b438 100644 --- a/tweet_havester/tweepy_stream.py +++ b/tweet_havester/tweepy_stream.py @@ -8,6 +8,7 @@ from tweepy import OAuthHandler from tweepy import Stream from tweepy.streaming import StreamListener from sklearn.externals import joblib +import general_process as GP class listener(StreamListener): def __init__(self,path): @@ -27,6 +28,7 @@ class listener(StreamListener): return dic def on_data(self,data): try: + gp = GP db = self.couch['raw_tweets'] id_db = self.couch['user_id'] pc_db = self.couch['tweet_results'] @@ -34,8 +36,9 @@ class listener(StreamListener): dic = self.convertValue(content) id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False} # print(id_doc) - p_dic = date_process(dic,self.model) + p_dic = gp.data_process(dic,self.model) if p_dic != None: + pc_db.save(p_dic) id_db.save(id_doc) db.save(dic) -- GitLab