From 73e560f43ffaa3b61645cdd67012b896854dc822 Mon Sep 17 00:00:00 2001
From: Terry Liao <jinliang@student.unimelb.edu.au>
Date: Sun, 12 May 2019 20:10:07 +1000
Subject: [PATCH] Extract tweet processing into a reusable general_process module

---
 tweet_havester/general_process.py | 69 +++++++++++++++++++++++++++++++
 tweet_havester/process.py         | 58 --------------------------
 tweet_havester/tweepy_search.py   |  3 +-
 tweet_havester/tweepy_stream.py   |  5 ++-
 4 files changed, 75 insertions(+), 60 deletions(-)
 create mode 100644 tweet_havester/general_process.py
 delete mode 100644 tweet_havester/process.py

diff --git a/tweet_havester/general_process.py b/tweet_havester/general_process.py
new file mode 100644
index 0000000..b8f95b0
--- /dev/null
+++ b/tweet_havester/general_process.py
@@ -0,0 +1,69 @@
+from sklearn.externals import joblib
+import json
+import couchdb
+model = joblib.load("train_model.m")
+
+def data_process(tweet,model):
+    ####filter cities
+    cities =['melbourne','sydney','adelaide','perth','brisbane']
+    dataset=['0']
+    id = tweet['_id']
+    text = tweet['text']
+    lang = tweet['lang']
+    ## we only process English-language tweets
+    if lang != 'en':
+        return None 
+    location = tweet['location']
+    create_time = tweet['create_time']
+
+    for city in cities:
+        #the location contains target city names
+        if city in location.lower():
+            #generalize city name
+            location=city
+    
+    p_tweet={
+    '_id':id,
+    "create_time":create_time,
+    "location":location,
+    "lang":lang,
+    'text':text,
+    'if_offensive':"false"
+    }
+    dataset[0]=text
+    predicts = model.predict(dataset)
+    if predicts[0]==1:
+        p_tweet['if_offensive']="true"
+    return p_tweet
+
+
+# tweet1={
+#   "_id": "1000029867289690113",
+#   "_rev": "1-6fec69500a19192444c2de7e13c37a08",
+#   "create_time": "2018-05-25 15:04:34",
+#   "user_id": 847624802,
+#   "text": "@maiiron_ Teve loco que até Jesus Cristo citou. Buguei meu",
+#   "lang": "pt",
+#   "location": "Sydney"
+#     }
+# tweet2={
+#   "_id": "1000108067982200832",
+#   "_rev": "1-d23d7bd9bf292aee6423360ac45abb76",
+#   "create_time": "2018-05-25 20:15:19",
+#   "user_id": 3104553206,
+#   "text": "Finished #plant18 https://t.co/GOWl6bsQUO",
+#   "lang": "en",
+#   "location": "Kimba"
+# }
+# tweet3={
+#   "_id": "1000641327279824896",
+#   "_rev": "1-bafd33be6cd735694ddcf9557c34656e",
+#   "create_time": "2018-05-27 07:34:18",
+#   "user_id": 110366337,
+#   "text": "Veer's arrival &amp; Rivi's 5th bday Party https://t.co/xZmhoMgV76",
+#   "lang": "en",
+#   "location": "Melbourne, Australia"
+# }
+# p=data_process(tweet3)
+# print(p)
+
diff --git a/tweet_havester/process.py b/tweet_havester/process.py
deleted file mode 100644
index fa33429..0000000
--- a/tweet_havester/process.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from sklearn.externals import joblib
-import json
-import couchdb
-model = joblib.load("train_model.m")
-
-dataset=[]
-
-user = "admin"
-password = "password"
-dbserver = couchdb.Server("http://admin:password@172.26.38.157:5984/")
-db=dbserver["raw_tweets"]
-r_db =  dbserver["tweet_results"]
-count=0
-dataset=["1"]
-
-####filter cities
-cities =['melbourne','sydney','adelaide','perth','brisbane']
-
-for id in db:
-    tweet=db.get(id)
-    text = tweet['text']
-    lang = tweet['lang']
-    location = tweet['location']
-    create_time = tweet['create_time']
-    user_id = tweet['user_id']
-
-    
-    #find the target city
-    flag = False
-    for city in cities:
-        #the location contains target city names
-        if city in location.lower():
-            #generalize city name
-            flag = True
-            location=city
-    #not in target cities,continue
-    if flag == False:
-         continue
-
-    p_tweet = {
-    '_id':id,
-    'user_id':user_id,
-    "create_time":create_time,
-    "location":location,
-    "lang":lang,
-    'text':text
-    }
-    if lang =='en':
-        dataset[0]=text
-        predicts = model.predict(dataset)
-        if predicts[0]==1:
-            r_db.save(p_tweet) 
-
-
-
-
-
-
diff --git a/tweet_havester/tweepy_search.py b/tweet_havester/tweepy_search.py
index eaedcb2..7ebdb8a 100644
--- a/tweet_havester/tweepy_search.py
+++ b/tweet_havester/tweepy_search.py
@@ -43,6 +43,7 @@ class TweetSearchHavester():
         process_db = self.couch['tweet_results']
         for tweet in tweepy.Cursor(api.user_timeline,id = user_id ).items(50):
             # save most recent tweets
+            # NOTE(review): general_process is a module, not a class — assumes this file imports it as GP
             dic = {}
             dic['_id'] = tweet.id_str
             dic['create_time'] = str(tweet.created_at)
@@ -55,7 +56,7 @@ class TweetSearchHavester():
                 dic['location'] = tweet.user.location
             # print(dic)
             try:
-                p_dic = date_process(dic,self.model)
+                p_dic = GP.data_process(dic,self.model)
                 if p_dic != None:
                     process_db.save(p_dic)
                 db.save(dic)
diff --git a/tweet_havester/tweepy_stream.py b/tweet_havester/tweepy_stream.py
index 9cb4884..7f3b438 100644
--- a/tweet_havester/tweepy_stream.py
+++ b/tweet_havester/tweepy_stream.py
@@ -8,6 +8,7 @@ from tweepy import OAuthHandler
 from tweepy import Stream
 from tweepy.streaming import StreamListener
 from sklearn.externals import joblib
+import general_process as GP
 
 class listener(StreamListener):
     def __init__(self,path):
@@ -27,6 +28,7 @@ class listener(StreamListener):
         return dic
     def on_data(self,data):
         try:
+            # general_process is a module, not a class: call its function directly below
             db = self.couch['raw_tweets']
             id_db = self.couch['user_id']
             pc_db = self.couch['tweet_results']
@@ -34,8 +36,9 @@
             dic = self.convertValue(content)
             id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False}
             # print(id_doc)
-            p_dic = date_process(dic,self.model)
+            p_dic = GP.data_process(dic,self.model)
             if p_dic != None:
+                
                 process_db.save(p_dic)
             id_db.save(id_doc)
             db.save(dic)
-- 
GitLab