From a4ee7ebde3eef5046a2b6744c83528b4414310cb Mon Sep 17 00:00:00 2001
From: Terry Liao <jinliang@student.unimelb.edu.au>
Date: Sun, 12 May 2019 18:40:36 +1000
Subject: [PATCH] add tweet havester

---
 .gitignore                                |   2 +
 tweet_havester/tweepy_search.py           | 119 +++++++++++++++++++
 tweet_havester/tweepy_stream.py           | 120 +++++++++++++++++++
 tweet_havester/tweet_havester_config.json | 137 ++++++++++++++++++++++
 4 files changed, 378 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 tweet_havester/tweepy_search.py
 create mode 100644 tweet_havester/tweepy_stream.py
 create mode 100644 tweet_havester/tweet_havester_config.json

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0485ba8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+
+\.DS_Store
diff --git a/tweet_havester/tweepy_search.py b/tweet_havester/tweepy_search.py
new file mode 100644
index 0000000..94de079
--- /dev/null
+++ b/tweet_havester/tweepy_search.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+"""Harvest recent timeline tweets of known users into CouchDB.
+
+User ids are read from the ``user_id`` database and the tweets fetched for
+them are stored in the ``raw_tweets`` database.
+"""
+import json
+import csv
+import os
+import time
+import couchdb
+import tweepy
+from tweepy import OAuthHandler
+
+CONFIG_PATH = './tweet_havester_config.json'
+# A batch of users is searched as soon as it holds more than this many ids.
+BATCH_SIZE = 20
+# Maximum number of timeline tweets fetched per user.
+TWEETS_PER_USER = 50
+# Pause between two scans of the user database, to avoid a busy loop.
+ROUND_PAUSE_SECONDS = 5
+
+
+class TweetSearchHavester():
+    """Fetch user timelines with the search credentials of a city."""
+
+    def __init__(self, couch):
+        """Keep the CouchDB server handle used to store tweets."""
+        self.couch = couch
+
+    def run(self, ids, city):
+        """Fetch and store the recent tweets of every user in ``ids``.
+
+        ``city`` selects the credentials read from the JSON config file
+        (``config[city]["API"]["search"]``). A Twitter failure on one user is
+        reported and the remaining users are still processed.
+        """
+        with open(CONFIG_PATH, 'r') as f:
+            config = json.load(f)
+        api_token = config[city]["API"]["search"]
+
+        auth = OAuthHandler(api_token["consumer_key"],
+                            api_token["consumer_secret"])
+        auth.set_access_token(api_token["access_token"],
+                              api_token["access_token_secret"])
+        api = tweepy.API(auth, wait_on_rate_limit=True,
+                         wait_on_rate_limit_notify=True)
+        for user_id in ids:
+            try:
+                self.get_all_tweets(user_id, api)
+            except tweepy.TweepError:
+                print ('Failed to run the command on that user, Skipping...')
+            except IndexError:
+                print ('List index out of range, Skipping...')
+
+    def get_all_tweets(self, user_id, api):
+        """Store up to ``TWEETS_PER_USER`` recent tweets of ``user_id``.
+
+        Tweets whose id is already present in ``raw_tweets`` are skipped.
+        """
+        db = self.couch['raw_tweets']
+        timeline = tweepy.Cursor(api.user_timeline, id=user_id)
+        for tweet in timeline.items(TWEETS_PER_USER):
+            doc = {
+                '_id': tweet.id_str,
+                'create_time': str(tweet.created_at),
+                'user_id': tweet.user.id,
+                'text': tweet.text,
+                'lang': tweet.lang,
+            }
+            # Prefer the tweet's own place; fall back to the profile location.
+            if tweet.place is not None:
+                doc['location'] = tweet.place.name
+            else:
+                doc['location'] = tweet.user.location
+            try:
+                db.save(doc)
+            except couchdb.http.ResourceConflict:
+                # This tweet is already stored; nothing to do.
+                pass
+            except Exception as err:
+                # Best effort: one failed write must not abort the harvest.
+                print('Failed to save tweet ' + doc['_id'] + ': ' + str(err))
+
+
+def _search_batch(havester, db, ids, city):
+    """Search ``ids`` with the credentials of ``city`` and mark them done."""
+    havester.run(ids, city)
+    for doc_id in ids:
+        data = db[doc_id]
+        data['isSearched'] = True
+        db.save(data)
+
+
+if __name__ == '__main__':
+    couch = couchdb.Server('http://admin:password@127.0.0.1:5984/')
+    # The stream harvester fills this database with one document per user.
+    db = couch['user_id']
+    city = ["melbourne", "sydney", "perth", "adelaide", "brisbane"]
+    switch = 0
+    a = TweetSearchHavester(couch)
+    while True:
+        print("start a new round")
+        ids = []
+        for doc_id in db:
+            if db[doc_id]['isSearched']:
+                continue
+            ids.append(doc_id)
+            if len(ids) > BATCH_SIZE:
+                # Rotate the credentials so no single key serves every batch.
+                switch = (switch + 1) % len(city)
+                _search_batch(a, db, ids, city[switch])
+                ids = []
+        if ids:
+            # Flush the last, partial batch instead of carrying it over.
+            switch = (switch + 1) % len(city)
+            _search_batch(a, db, ids, city[switch])
+        print("finsh a round")
+        time.sleep(ROUND_PAUSE_SECONDS)
diff --git a/tweet_havester/tweepy_stream.py b/tweet_havester/tweepy_stream.py
new file mode 100644
index 0000000..8d0d330
--- /dev/null
+++ b/tweet_havester/tweepy_stream.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+"""Stream tweets located in the configured cities into CouchDB."""
+import json
+import os
+import tweepy
+import couchdb
+import threading
+from tweepy import OAuthHandler
+from tweepy import Stream
+from tweepy.streaming import StreamListener
+
+CONFIG_PATH = './tweet_havester_config.json'
+
+
+class listener(StreamListener):
+    """Stream listener that stores each tweet and its author in CouchDB."""
+
+    def __init__(self, path):
+        """Create a handle on the CouchDB server at URL ``path``."""
+        # Let the base listener initialise its own state as well.
+        super(listener, self).__init__()
+        self.couch = couchdb.Server(path)
+
+    def convertValue(self, origin):
+        """Reduce a decoded status payload to the fields that are stored.
+
+        ``location`` is the name of the tweet's place, or the string "None"
+        when the tweet carries no place.
+        """
+        place = origin["place"]
+        return {
+            '_id': origin["id_str"],
+            'create_time': origin["created_at"],
+            'user_id': origin['user']['id'],
+            'text': origin["text"],
+            'lang': origin["lang"],
+            'location': place["name"] if place is not None else "None",
+        }
+
+    def _save(self, db_name, doc):
+        """Save ``doc`` in database ``db_name`` without ever raising."""
+        try:
+            self.couch[db_name].save(doc)
+        except couchdb.http.ResourceConflict:
+            # A document with this _id is already stored.
+            pass
+        except Exception as err:
+            # Best effort: a storage failure must not stop the stream.
+            print('failed to save into ' + db_name + ': ' + str(err))
+
+    def on_data(self, data):
+        """Store one raw stream message and keep the stream running.
+
+        The author is recorded in ``user_id`` (flagged as not yet searched)
+        and the tweet in ``raw_tweets``. The two writes are independent, so
+        a tweet from an already known user is still stored.
+        """
+        try:
+            content = json.loads(data)
+            dic = self.convertValue(content)
+            id_doc = {"_id":str(dic["user_id"]),
+                      "user_name":content['user']['name'],
+                      "isSearched":False}
+        except (ValueError, KeyError, TypeError):
+            # Messages that are not complete tweets are ignored.
+            return True
+        self._save('user_id', id_doc)
+        self._save('raw_tweets', dic)
+        return True
+
+    def on_error(self, status):
+        """Print the error status reported for the stream."""
+        print(status)
+
+
+class TweetStreamHavester():
+    """Open a location-filtered Twitter stream for a configured city."""
+
+    def __init__(self, server_path):
+        """Remember the CouchDB URL handed to every stream listener."""
+        self.server_path = server_path
+
+    def process(self, city):
+        """Start the stream of ``city`` from a helper thread and wait for it.
+
+        ``run`` returns once the asynchronous stream has been started, so the
+        join only waits for the set-up, not for the stream itself.
+        """
+        # ``args`` must be passed by keyword and as a tuple, e.g. args=(self,)
+        print("start streaming"+city)
+        th = threading.Thread(target=TweetStreamHavester.run, args=(self,city))
+        th.start()
+        th.join()
+
+    def run(self, city):
+        """Authenticate with the stream credentials of ``city`` and start
+        an asynchronous stream filtered on the city's bounding box.
+        """
+        with open(CONFIG_PATH, 'r') as f:
+            config = json.load(f)
+        api_token = config[city]["API"]["stream"]
+        stream_area = config[city]["bound"]
+
+        auth = OAuthHandler(api_token["consumer_key"],
+                            api_token["consumer_secret"])
+        auth.set_access_token(api_token["access_token"],
+                              api_token["access_token_secret"])
+        twitterStream = Stream(auth, listener(self.server_path))
+        twitterStream.filter(locations=stream_area, is_async=True)
+
+
+if __name__ == "__main__":
+    # The 'raw_tweets' and 'user_id' databases must already exist.
+    server_path = 'http://127.0.0.1:5984/'
+    a = TweetStreamHavester(server_path)
+    try:
+        for city in ("melbourne", "sydney", "adelaide", "brisbane", "perth"):
+            a.process(city)
+    except Exception as e:
+        print(e)
diff --git a/tweet_havester/tweet_havester_config.json b/tweet_havester/tweet_havester_config.json
new file mode 100644
index 0000000..df13e25
--- /dev/null
+++ b/tweet_havester/tweet_havester_config.json @@ -0,0 +1,137 @@ +{ + "adelaide":{ + "API":{ + "search":{ + "access_token":"501570381-A1xtlFokmlQaSNpMFcZRaeto6Rw3lSkZ4sBCo4Dn", + "access_token_secret":"tYH17Bo84dzhabjCb9e24CgKbzlFC0p07M2ewQ0wQIQXz", + "consumer_key":"c8i3xHwPKd5pbVgmSEK9rSlCH", + "consumer_secret":"dETpVfCIVxpMdOIoPNr0c9eDPNSkLIsUlPirEQ2ewrUpiTVm2v" + }, + "stream":{ + "access_token":"501570381-U7Ki9dhndzfTaVBvWtRNNHPufBALu9SjKWyeDB85", + "access_token_secret":"aXRiTiDtSDQPM7sUVkRi6UClORCFY26Ty2Mn55vKDPqtT", + "consumer_key":"Osvnejara4fXjZSlv2vn3cfsr", + "consumer_secret":"klO1gm2VoJdUyv8x6oEuB5KU64eokOgR1pbHObrMbrwGqH24hW" + } + }, + "bound":[ + 138.4421, + -35.3489, + 138.7801, + -34.6525 + ], + "coordinate":[ + 138.571912, + -35.000767 + ], + "place_id":"01e8a1a140ccdc5c" + }, + "brisbane":{ + "API":{ + "search":{ + "access_token":"501570381-Bo8XIs0CM1bzLge5tsbuAqZsm0NqRBkFb4lzufPG", + "access_token_secret":"fiHJB6KU6UPrDJSnmlEbmMpiMpCpWqkGLF7z0tQWwaAi6", + "consumer_key":"feZrQEQoS3a1PBbdib8ltkRSX", + "consumer_secret":"NcjYRW4FjIjXjiAxJxF17RRyICybviNilBCkCnbV3PViThJKBK" + }, + "stream":{ + "access_token":"501570381-hBxiLFmZMxxqexmM6tiNmPIZchlK3Ne93ds4VKie", + "access_token_secret":"DjGtsoN9s3ppTKs63c9ysOLl1hFvQXpgBtyesudiA5ozo", + "consumer_key":"aYjy8AaMl9GeHX8xHiJ9ciXdP", + "consumer_secret":"xA0GoXl4OqDRiwBkPvSwQEipE2hBXqGhGqyoPZ0BcmIWy4tGI6" + } + }, + "bound":[ + 152.6685, + -27.7674, + 153.3178, + -26.9968 + ], + "coordinate":[ + 153.030209, + -27.382142 + ], + "place_id":"004ec16c62325149" + }, + "melbourne":{ + "API":{ + "search":{ + "access_token":"501570381-U7Ki9dhndzfTaVBvWtRNNHPufBALu9SjKWyeDB85", + "access_token_secret":"aXRiTiDtSDQPM7sUVkRi6UClORCFY26Ty2Mn55vKDPqtT", + "consumer_key":"Osvnejara4fXjZSlv2vn3cfsr", + "consumer_secret":"klO1gm2VoJdUyv8x6oEuB5KU64eokOgR1pbHObrMbrwGqH24hW" + }, + "stream":{ + "access_token":"501570381-A1xtlFokmlQaSNpMFcZRaeto6Rw3lSkZ4sBCo4Dn", + 
"access_token_secret":"tYH17Bo84dzhabjCb9e24CgKbzlFC0p07M2ewQ0wQIQXz", + "consumer_key":"c8i3xHwPKd5pbVgmSEK9rSlCH", + "consumer_secret":"dETpVfCIVxpMdOIoPNr0c9eDPNSkLIsUlPirEQ2ewrUpiTVm2v" + } + }, + "bound":[ + 144.5937, + -38.4338, + 145.5125, + -37.5112 + ], + "coordinate":[ + 145.152529, + -37.972566 + ], + "place_id":"01864a8a64df9dc4" + }, + "perth":{ + "API":{ + "search":{ + "access_token":"501570381-hBxiLFmZMxxqexmM6tiNmPIZchlK3Ne93ds4VKie", + "access_token_secret":"DjGtsoN9s3ppTKs63c9ysOLl1hFvQXpgBtyesudiA5ozo", + "consumer_key":"aYjy8AaMl9GeHX8xHiJ9ciXdP", + "consumer_secret":"xA0GoXl4OqDRiwBkPvSwQEipE2hBXqGhGqyoPZ0BcmIWy4tGI6" + }, + "stream":{ + "access_token":"501570381-Bo8XIs0CM1bzLge5tsbuAqZsm0NqRBkFb4lzufPG", + "access_token_secret":"fiHJB6KU6UPrDJSnmlEbmMpiMpCpWqkGLF7z0tQWwaAi6", + "consumer_key":"feZrQEQoS3a1PBbdib8ltkRSX", + "consumer_secret":"NcjYRW4FjIjXjiAxJxF17RRyICybviNilBCkCnbV3PViThJKBK" + } + }, + "bound":[ + 138.44212, + -35.348970061, + 138.780189824, + -34.652564053 + ], + "coordinate":[ + 138.571912, + -35.000767 + ], + "place_id":"0118c71c0ed41109" + }, + "sydney":{ + "API":{ + "search":{ + "access_token":"501570381-hBxiLFmZMxxqexmM6tiNmPIZchlK3Ne93ds4VKie", + "access_token_secret":"DjGtsoN9s3ppTKs63c9ysOLl1hFvQXpgBtyesudiA5ozo", + "consumer_key":"aYjy8AaMl9GeHX8xHiJ9ciXdP", + "consumer_secret":"xA0GoXl4OqDRiwBkPvSwQEipE2hBXqGhGqyoPZ0BcmIWy4tGI6" + }, + "stream":{ + "access_token":"501570381-Bo8XIs0CM1bzLge5tsbuAqZsm0NqRBkFb4lzufPG", + "access_token_secret":"fiHJB6KU6UPrDJSnmlEbmMpiMpCpWqkGLF7z0tQWwaAi6", + "consumer_key":"feZrQEQoS3a1PBbdib8ltkRSX", + "consumer_secret":"NcjYRW4FjIjXjiAxJxF17RRyICybviNilBCkCnbV3PViThJKBK" + } + }, + "bound":[ + 150.5209, + -34.1183, + 151.343, + -33.5781 + ], + "coordinate":[ + 150.96870653200818, + -33.84824400225 + ], + "place_id":"0073b76548e5984f" + } +} \ No newline at end of file -- GitLab