融合了文件分析处理

61b3e743 · Terry Liao · a4ee7ebd · 61b3e743 · 61b3e743 · 61b3e743
Commit 61b3e743 authored 6 years ago by Terry Liao
--- a/tweet_havester/process.py
+++ b/tweet_havester/process.py
+"""Filter harvested tweets by target city and store model-positive ones.
+
+Walks every document in the ``raw_tweets`` CouchDB database, keeps the
+tweets whose location mentions one of the target cities, runs the saved
+classifier on the English ones and saves those predicted as class ``1``
+to the ``tweet_results`` database with the city name normalised.
+"""
+import json
+
+import couchdb
+
+try:
+    from sklearn.externals import joblib
+except ImportError:
+    # Newer scikit-learn releases no longer ship sklearn.externals.joblib;
+    # the standalone joblib package exposes the same load() call.
+    import joblib
+
+model = joblib.load("train_model.m")
+# NOTE(review): credentials are hard-coded here and in the server URL below;
+# consider moving them to configuration.
+user = "admin"
+password = "password"
+dbserver = couchdb.Server("http://admin:password@172.26.38.157:5984/")
+db = dbserver["raw_tweets"]
+r_db = dbserver["tweet_results"]
+# Lower-case city names used both to filter tweets and to normalise the
+# stored location.
+cities = ['melbourne', 'sydney', 'adelaide', 'perth', 'brisbane']
+
+for doc_id in db:
+    tweet = db.get(doc_id)
+    if tweet is None:
+        # The document disappeared between listing and fetching.
+        continue
+
+    # The location may be absent or None; treat that as "no target city".
+    raw_location = (tweet.get('location') or '').lower()
+    # First target city named in the location, or None when there is none.
+    location = next((city for city in cities if city in raw_location), None)
+    if location is None:
+        continue
+
+    # Only English tweets are classified and stored.
+    lang = tweet.get('lang')
+    if lang != 'en':
+        continue
+
+    text = tweet.get('text')
+    if text is None:
+        continue
+
+    # The model takes a list of documents; classify this single tweet.
+    if model.predict([text])[0] != 1:
+        continue
+
+    p_tweet = {
+        '_id': doc_id,
+        'user_id': tweet.get('user_id'),
+        'create_time': tweet.get('create_time'),
+        'location': location,
+        'lang': lang,
+        'text': text,
+    }
+    try:
+        r_db.save(p_tweet)
+    except couchdb.ResourceConflict:
+        # Already stored by an earlier run; keep the existing result and
+        # carry on with the remaining tweets.
+        continue
--- a/tweet_havester/requirement.txt
+++ b/tweet_havester/requirement.txt
+CouchDB
+scikit-learn
+tweepy
\ No newline at end of file
--- a/tweet_havester/train_model.m
+++ b/tweet_havester/train_model.m
--- a/tweet_havester/tweepy_search.py
+++ b/tweet_havester/tweepy_search.py
 #  -*- coding: utf-8 -*-
 import json
-import csv
 import os
 import couchdb
 import tweepy
 from tweepy import OAuthHandler
+from sklearn.externals import joblib
 class TweetSearchHavester():
    def __init__(self,couch):
        self.couch = couch
+        self.model = joblib.load("./train_model.m")
    def run(self, ids , city):
        dict = {}
@@ -39,6 +40,7 @@ class TweetSearchHavester():
    def get_all_tweets(self, user_id, api):
        new_tweets = api.user_timeline(user_id=user_id, count=50)
        db = self.couch['raw_tweets']
+        process_db = self.couch['tweet_results']
        for tweet in tweepy.Cursor(api.user_timeline,id = user_id ).items(50):
            # save most recent tweets
            dic = {}
@@ -53,6 +55,9 @@ class TweetSearchHavester():
                dic['location'] = tweet.user.location
            # print(dic)
            try:
+                p_dic = date_process(dic,self.model)
+                if p_dic != None:
+                    process_db.save(p_dic)
                db.save(dic)
            except:
                pass

--- a/tweet_havester/tweepy_stream.py
+++ b/tweet_havester/tweepy_stream.py
@@ -7,10 +7,12 @@ import threading
 from tweepy import OAuthHandler
 from tweepy import Stream
 from tweepy.streaming import StreamListener
+from sklearn.externals import joblib
 class listener(StreamListener):
    def __init__(self,path):
        self.couch = couchdb.Server(path)
+        self.model = joblib.load("./train_model.m")
    def convertValue(self,origin):
        dic = {}
        dic['_id'] = origin["id_str"]
@@ -27,10 +29,14 @@ class listener(StreamListener):
        try:
            db = self.couch['raw_tweets']
            id_db = self.couch['user_id']
+            pc_db = self.couch['tweet_results']
            content = json.loads(data)
            dic = self.convertValue(content)
            id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False}
            # print(id_doc)
+            p_dic = date_process(dic,self.model)
+            if p_dic != None:
+                process_db.save(p_dic)
            id_db.save(id_doc)
            db.save(dic)
            # print("success")