"""Filter harvested tweets by city and classifier verdict.

Reconstructed from the patch entry for ``tweet_havester/process.py``.

Run as a script, this walks every document in the ``raw_tweets`` CouchDB
database and copies a trimmed version of each qualifying tweet into
``tweet_results``.  A tweet qualifies when its location names one of the
target cities, its language is ``en`` and the trained model labels its
text ``1``.
"""
import json
import os

# Target cities; a location is generalized to whichever name it contains.
cities = ['melbourne', 'sydney', 'adelaide', 'perth', 'brisbane']

user = "admin"
password = "password"
# NOTE(review): the default still embeds credentials in source; set the
# COUCHDB_URL environment variable to override it.
DEFAULT_COUCH_URL = "http://{}:{}@172.26.38.157:5984/".format(user, password)
MODEL_PATH = "train_model.m"

# Fields copied from a raw tweet document into the stored result.
_TWEET_FIELDS = ('_id', 'user_id', 'create_time', 'location', 'lang', 'text')


def _match_city(location):
    """Return the first target city named in *location*, or None."""
    if not location:
        # The location may be missing or empty; nothing to match.
        return None
    lowered = location.lower()
    for city in cities:
        if city in lowered:
            return city
    return None


def date_process(dic, model):
    """Return the trimmed tweet to store, or None if it is filtered out.

    The name matches the ``date_process(dic, self.model)`` calls made by the
    harvesters in the same patch.

    Args:
        dic: mapping with the keys in ``_TWEET_FIELDS``; extra keys (such as
            CouchDB's ``_rev``) are ignored and *dic* is not modified.
        model: object whose ``predict`` accepts a list of texts and returns
            one label per text.

    Returns:
        A new dict holding the ``_TWEET_FIELDS`` with ``location`` replaced
        by the matched city name, or None when a field is missing, the
        location names no target city, the language is not ``en`` or the
        model's label is not ``1``.
    """
    # Documents lacking a tweet field (e.g. design documents) are skipped
    # instead of raising KeyError.
    if any(key not in dic for key in _TWEET_FIELDS):
        return None

    city = _match_city(dic['location'])
    if city is None:
        return None

    # Cheap language check first so the model only sees candidate tweets.
    if dic['lang'] != 'en':
        return None
    if model.predict([dic['text']])[0] != 1:
        return None

    result = {key: dic[key] for key in _TWEET_FIELDS}
    result['location'] = city
    return result


def _load_model(path):
    """Load the persisted classifier at *path* with joblib."""
    try:
        import joblib
    except ImportError:
        # Older scikit-learn installs may only have the vendored copy.
        from sklearn.externals import joblib
    # NOTE: joblib.load unpickles the file, which can execute arbitrary
    # code; only load model files from a trusted source.
    return joblib.load(path)


def main():
    """Classify every document in ``raw_tweets`` and store the matches."""
    # Imported here so that importing date_process does not need couchdb.
    import couchdb

    model = _load_model(MODEL_PATH)
    dbserver = couchdb.Server(os.environ.get("COUCHDB_URL", DEFAULT_COUCH_URL))
    db = dbserver["raw_tweets"]
    r_db = dbserver["tweet_results"]

    for doc_id in db:
        tweet = db.get(doc_id)
        if tweet is None:
            # Document disappeared between listing and fetching.
            continue
        p_tweet = date_process(tweet, model)
        if p_tweet is None:
            continue
        try:
            r_db.save(p_tweet)
        except couchdb.ResourceConflict:
            # The same _id is already in tweet_results (e.g. from an
            # earlier run); keep going instead of aborting the whole pass.
            continue


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Rest of the patch text that shared this line with process.py:
#   * new file tweet_havester/requirement.txt listing CouchDB, sklearn and
#     tweepy (no newline at end of file)
#   * new binary file tweet_havester/train_model.m (mode 100755)
#   * the start of the diff header for tweet_havester/tweepy_search.py
# NOTE(review): the PyPI distribution is named `scikit-learn`; confirm that
# the `sklearn` entry in requirement.txt still installs.
94de0799feab5232cb550377a82726d3c85f0c09..eaedcb2688f26588f123067354da96cbaab0181f 100644 --- a/tweet_havester/tweepy_search.py +++ b/tweet_havester/tweepy_search.py @@ -1,16 +1,17 @@ # -*- coding: utf-8 -*- import json -import csv import os import couchdb import tweepy from tweepy import OAuthHandler +from sklearn.externals import joblib class TweetSearchHavester(): def __init__(self,couch): self.couch = couch + self.model = joblib.load("./train_model.m") def run(self, ids , city): dict = {} @@ -39,6 +40,7 @@ class TweetSearchHavester(): def get_all_tweets(self, user_id, api): new_tweets = api.user_timeline(user_id=user_id, count=50) db = self.couch['raw_tweets'] + process_db = self.couch['tweet_results'] for tweet in tweepy.Cursor(api.user_timeline,id = user_id ).items(50): # save most recent tweets dic = {} @@ -53,6 +55,9 @@ class TweetSearchHavester(): dic['location'] = tweet.user.location # print(dic) try: + p_dic = date_process(dic,self.model) + if p_dic != None: + process_db.save(p_dic) db.save(dic) except: pass diff --git a/tweet_havester/tweepy_stream.py b/tweet_havester/tweepy_stream.py index 8d0d330aa1e33532eb315299a2176e42f6e9dbbf..9cb48840a371f2bf2433b2b11758209324968f6c 100644 --- a/tweet_havester/tweepy_stream.py +++ b/tweet_havester/tweepy_stream.py @@ -7,10 +7,12 @@ import threading from tweepy import OAuthHandler from tweepy import Stream from tweepy.streaming import StreamListener +from sklearn.externals import joblib class listener(StreamListener): def __init__(self,path): self.couch = couchdb.Server(path) + self.model = joblib.load("./train_model.m") def convertValue(self,origin): dic = {} dic['_id'] = origin["id_str"] @@ -27,10 +29,14 @@ class listener(StreamListener): try: db = self.couch['raw_tweets'] id_db = self.couch['user_id'] + pc_db = self.couch['tweet_results'] content = json.loads(data) dic = self.convertValue(content) id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False} # print(id_doc) + p_dic 
= date_process(dic,self.model) + if p_dic != None: + process_db.save(p_dic) id_db.save(id_doc) db.save(dic) # print("success")