# -*- coding: utf-8 -*-
"""Fetch recent tweets of users listed in CouchDB and store them back.

The module-level ``run`` function polls the ``user_id`` database for users
whose ``isSearched`` flag is false, hands them in batches to
``TweetSearchHavester``, and marks them as searched afterwards.
"""
import json
import os
import time

import couchdb
import tweepy
from tweepy import OAuthHandler
from sklearn.externals import joblib

import general_process as gp

# Per-city configuration; the search credentials are read from
# config[city]["API"]["search"].
_CONFIG_PATH = './tweet_havester_config.json'
# Maximum number of timeline tweets fetched per user.
_TWEETS_PER_USER = 50
# The search loop idles until the user database holds at least this many docs.
_MIN_USER_POOL = 40
# Seconds to sleep while the user database is still too small.
_POOL_WAIT_SECONDS = 100
# A batch is flushed once the pending counter exceeds this value.
_BATCH_THRESHOLD = 20


class TweetSearchHavester():
    """Download user timelines and save raw and processed tweets to CouchDB."""

    def __init__(self, couch):
        """Keep the CouchDB server handle and load the model file.

        Args:
            couch: Server object that is indexed by database name
                ('raw_tweets', 'tweet_results') when tweets are saved.
        """
        self.couch = couch
        self.model = joblib.load("./train_model.m")

    def run(self, ids, city):
        """Fetch and store the recent tweets of every user in ``ids``.

        Args:
            ids: Iterable of user ids passed to ``get_all_tweets``.
            city: Key into the config file selecting which search API
                credentials to use.

        A ``tweepy.TweepError`` or ``IndexError`` raised for one user is
        reported on stdout and that user is skipped.
        """
        # The context manager closes the file; no explicit close() is needed.
        with open(_CONFIG_PATH, 'r') as f:
            config = json.load(f)
        api_token = config[city]["API"]["search"]

        auth = OAuthHandler(api_token["consumer_key"],
                            api_token["consumer_secret"])
        auth.set_access_token(api_token["access_token"],
                              api_token["access_token_secret"])
        api = tweepy.API(auth, wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True)

        for user_id in ids:
            try:
                self.get_all_tweets(user_id, api)
            except tweepy.TweepError:
                print('Failed to run the command on that user, Skipping...')
            except IndexError:
                print('List index out of range, Skipping...')

    def get_all_tweets(self, user_id, api):
        """Save up to 50 recent tweets of ``user_id`` to CouchDB.

        Each tweet is stored in 'raw_tweets'; the result of
        ``gp.data_process`` is stored in 'tweet_results' when it is not None.
        A failure while processing or saving one tweet is reported on stdout
        and does not stop the remaining tweets.

        Args:
            user_id: Twitter user id whose timeline is fetched.
            api: Authenticated ``tweepy.API`` instance.

        Raises:
            tweepy.TweepError: If the timeline request fails.
        """
        db = self.couch['raw_tweets']
        process_db = self.couch['tweet_results']

        # The former extra user_timeline() call was dropped: its result was
        # never used and it cost one rate-limited request per user. It passed
        # ``user_id=``, so the cursor does the same to keep the id interpreted
        # as a numeric user id rather than a screen name.
        timeline = tweepy.Cursor(api.user_timeline, user_id=user_id)
        for tweet in timeline.items(_TWEETS_PER_USER):
            dic = {
                '_id': tweet.id_str,
                'create_time': str(tweet.created_at),
                'user_id': tweet.user.id,
                'text': tweet.text,
                'lang': tweet.lang,
            }
            # Prefer the tweet's own place; fall back to the profile location.
            if tweet.place is not None:
                dic['location'] = tweet.place.name
            else:
                dic['location'] = tweet.user.location

            try:
                p_dic = gp.data_process(dic, self.model)
                print(p_dic)
                if p_dic is not None:
                    process_db.save(p_dic)
                # NOTE(review): the original indentation was lost; the raw
                # tweet is assumed to be saved regardless of p_dic - confirm.
                db.save(dic)
            except Exception as exc:
                # Best effort per tweet (e.g. a document that already exists),
                # but report the failure instead of hiding it.
                print('Failed to process or save tweet %s: %r'
                      % (dic['_id'], exc))


def run(server_path):
    """Poll the 'user_id' database forever and harvest unsearched users.

    Users whose document has a false ``isSearched`` flag are collected and
    handed to ``TweetSearchHavester.run`` in batches, moving to the next
    configured city for every batch; afterwards their flag is set to True.

    Args:
        server_path: URL of the CouchDB server.
    """
    couch = couchdb.Server(server_path)
    db = couch['user_id']

    with open(_CONFIG_PATH, 'r') as f:
        config = json.load(f)
    cities = list(config)

    switch = 0
    # NOTE(review): count is intentionally left as in the original - it is
    # not reset between rounds although the id list is, so a batch that
    # follows a partial round can hold fewer ids than the threshold.
    count = 0
    harvester = TweetSearchHavester(couch)

    while True:
        ids = []
        # Do not start searching until the user id pool is large enough.
        if len(db) < _MIN_USER_POOL:
            time.sleep(_POOL_WAIT_SECONDS)
            continue

        print("start a new round on search")
        for doc_id in db:
            data = db[doc_id]
            if data['isSearched']:
                continue
            ids.append(doc_id)
            count += 1

            # NOTE(review): the original indentation was lost; the batch
            # flush is assumed to sit inside this loop - confirm.
            if count > _BATCH_THRESHOLD:
                # Rotate over however many cities are configured rather than
                # a hard-coded five.
                switch = (switch + 1) % len(cities)
                count = 0
                harvester.run(ids, cities[switch])
                for searched_id in ids:
                    data = db[searched_id]
                    data['isSearched'] = True
                    db.save(data)
                ids = []
        print("finsh a round")