diff --git a/tweet_havester/__pycache__/general_process.cpython-37.pyc b/tweet_havester/__pycache__/general_process.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..499ebe437a399bd4deb4396de8a70396355733ba Binary files /dev/null and b/tweet_havester/__pycache__/general_process.cpython-37.pyc differ diff --git a/tweet_havester/__pycache__/tweepy_search.cpython-37.pyc b/tweet_havester/__pycache__/tweepy_search.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..477cbf31596d50ad27e7ef9bd5185cabc4d57899 Binary files /dev/null and b/tweet_havester/__pycache__/tweepy_search.cpython-37.pyc differ diff --git a/tweet_havester/__pycache__/tweepy_stream.cpython-37.pyc b/tweet_havester/__pycache__/tweepy_stream.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c2944ac09c80c5722949a52ed3eb6e27d4e1422 Binary files /dev/null and b/tweet_havester/__pycache__/tweepy_stream.cpython-37.pyc differ diff --git a/tweet_havester/init_db.py b/tweet_havester/init_db.py new file mode 100644 index 0000000000000000000000000000000000000000..5edc9b130eda671f290ca66b98281ec3e2cb9b13 --- /dev/null +++ b/tweet_havester/init_db.py @@ -0,0 +1,24 @@ +import couchdb +import sys + +def run(server_path): + couch = couchdb.Server(server_path) + couch.create('tweet_2014_raw') + couch.create('raw_tweets') + couch.create('tweet_2014_results') + couch.create('tweet_results') + couch.create('user_id') + print("create all db successful") + + +if __name__ == "__main__": + a=sys.argv + if(len(a) == 4): + ip = a[1] + username = a[2] + password = a[3] + path = 'http://' + username +':' + password +'@'+ip+':5984/' + else: + path = 'http://admin:password@127.0.0.1:5984/' + run(path) + pass diff --git a/tweet_havester/tweepy_search.py b/tweet_havester/tweepy_search.py index fe18c6bc3648654673b92e870080c4ecf504944d..b89fdd7f4bcdb4fe3a9c6c56439fd18d5f089336 100644 --- a/tweet_havester/tweepy_search.py +++ b/tweet_havester/tweepy_search.py @@ -67,11 +67,16 @@ class TweetSearchHavester(): -if __name__ == '__main__': - couch = couchdb.Server('http://admin:password@127.0.0.1:5984/') +def run(server_path): + couch = couchdb.Server(server_path) db = couch['user_id'] # couch.create('test_db') - city = ["melbourne","sydney","perth","adelaide","brisbane"] + dict = {} + with open('./tweet_havester_config.json','r') as f: + dict = json.load(f) + cities = [] + for city in dict: + cities.append(city) switch = 0 count = 0 ids = list() @@ -88,7 +93,7 @@ if __name__ == '__main__': if(count > 20): switch = (switch+1)%5 count = 0 - a.run(ids,city[switch]) + a.run(ids,cities[switch]) for id in ids: data = db[id] data['isSearched'] = True diff --git a/tweet_havester/tweepy_stream.py b/tweet_havester/tweepy_stream.py index c2abc3b21cf69a61b7ca01100fcefcc2cc376927..45fe45c153c1d2e0da175e9156d4eec62161dbc0 100644 --- a/tweet_havester/tweepy_stream.py +++ b/tweet_havester/tweepy_stream.py @@ -12,6 +12,7 @@ import general_process as gp class listener(StreamListener): def __init__(self,path): + StreamListener.__init__(self) self.couch = couchdb.Server(path) self.model = joblib.load("./train_model.m") def convertValue(self,origin): @@ -34,7 +35,7 @@ class listener(StreamListener): content = json.loads(data) dic = self.convertValue(content) id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False} - # print(id_doc) + print(id_doc) p_dic = gp.data_process(dic,self.model) if p_dic != None: process_db.save(p_dic) @@ -52,44 +53,38 @@ class listener(StreamListener): class TweetStreamHavester(): def __init__(self,server_path): self.server_path = server_path - def process(self,city): + def process(self,city,dict): #args是关键字参数,需要加上名字,写成args=(self,) - print("start streaming"+city) - th = threading.Thread(target=TweetStreamHavester.run, args=(self,city)) + print("start streaming city: "+city) + th = threading.Thread(target=TweetStreamHavester.run, args=(self,city,dict)) th.start() th.join() - def run(self, city): - dict = {} - with open('./tweet_havester_config.json','r') as f: - dict = json.load(f) - api_token = dict[city]["API"]["stream"] - stream_area = dict[city]["bound"] - consumer_key = api_token["consumer_key"] - consumer_secret = api_token["consumer_secret"] - access_token = api_token["access_token"] - access_token_secret = api_token["access_token_secret"] - auth = OAuthHandler(consumer_key,consumer_secret) - auth.set_access_token(access_token,access_token_secret) - twitterStream = Stream(auth,listener(self.server_path)) - twitterStream.filter(locations=stream_area,is_async = True) + def run(self, city, dict): + api_token = dict[city]["API"]["stream"] + stream_area = dict[city]["bound"] + consumer_key = api_token["consumer_key"] + consumer_secret = api_token["consumer_secret"] + access_token = api_token["access_token"] + access_token_secret = api_token["access_token_secret"] + auth = OAuthHandler(consumer_key,consumer_secret) + auth.set_access_token(access_token,access_token_secret) + twitterStream = Stream(auth,listener(self.server_path)) + twitterStream.filter(locations=stream_area,is_async = True) - f.close() -if __name__ == "__main__": - couch = couchdb.Server('http://admin:password@127.0.0.1:5984/') - # couch.create('raw_tweets') - # couch.create('new_stream_tweet') - server_path = 'http://127.0.0.1:5984/' +def run(server_path): + couch = couchdb.Server(server_path) + # server_path = 'http://127.0.0.1:5984/' a = TweetStreamHavester(server_path) - try: - a.process("melbourne") - a.process("sydney") - a.process("adelaide") - a.process("brisbane") - a.process("perth") - except Exception as e: - print(e) - pass + with open('./tweet_havester_config.json','r') as f: + dict = json.load(f) + for city in dict: + try: + a.process(city,dict) + except Exception as e: + print(e) + pass + f.close() diff --git a/tweet_havester/tweet_havester.py b/tweet_havester/tweet_havester.py new file mode 100644 index 0000000000000000000000000000000000000000..91d971d224ff92f7c9f414abb88189ce2fa756df --- /dev/null +++ b/tweet_havester/tweet_havester.py @@ -0,0 +1,11 @@ +import tweepy_search as tSearch +import tweepy_stream as tStream +import time + +if __name__ == "__main__": + server_path = 'http://admin:password@127.0.0.1:5984/' + + tStream.run(server_path) + # wait for streamming for a while to start searching + time.sleep(200) + tSearch.run(server_path)