# Repository-viewer header text captured together with this file (not part of the program):
#   Select Git revision / run_dsa_database.py / harvester.py 3.18 KiB
import logging
import threading
import couchdb
import twitter_harvest.util.utilities as utils
import twitter_harvest.util.config as config
from twitter_harvest.auth import TwitterCredentials
def _open_or_create_db(server, name):
    """Return the database called *name* on *server*, creating it when it is absent."""
    if name in server:
        return server[name]
    return server.create(name)


def main():
    """Configure logging, open the CouchDB databases and start the harvester threads.

    One thread is started per entry in ``TwitterCredentials.cred``. The first
    credential (rank 0) runs ``searchThread``; every other credential runs
    ``searchTimelinesThread``. All threads receive the same two database
    handles and ``config.Keywords``. The threads are started but not joined
    in this function.
    """
    # Logger format; named ``log_format`` so the ``format`` builtin is not shadowed.
    log_format = "%(asctime)s: %(message)s"
    logging.basicConfig(filename='harvester.log', filemode='w', format=log_format,
                        level=logging.INFO, datefmt="%H:%M:%S")

    # Connection to CouchDB: each database is opened through its own Server object.
    db_conn = config.DB_Config
    couchserver = couchdb.Server(db_conn)
    couchserver2 = couchdb.Server(db_conn)
    db_twitter = _open_or_create_db(couchserver, 'twitter')
    db_users = _open_or_create_db(couchserver2, 'users')

    # Prepare a list of threads covering all available credentials.
    threads = []
    for i in range(len(TwitterCredentials.cred)):
        if i == 0:
            # Search states
            message, target = "Main : create and start Search thread %d.", searchThread
        else:
            # Search user timelines
            message, target = ("Main : create and start Search Timeline thread %d.",
                               searchTimelinesThread)
        logging.info(message, i)
        api = utils.TwitterAPI(cred_rank=i)
        worker = threading.Thread(target=target,
                                  args=(api, db_twitter, db_users, config.Keywords))
        threads.append(worker)
        worker.start()
def searchThread(api, dbtwitter, dbusers, filter_list):
    """Run the state-by-state tweet search in an endless loop.

    Every pass walks ``config.States`` in order. For each entry the state name
    is logged and ``api.search_tweets`` is called with the state name, both
    database handles, the entry's ``'geocode'`` value and ``filter_list``.
    The function never returns.
    """
    while True:
        # One full sweep over the configured states, then start over.
        for region in config.States:
            region_name = region['state_name']
            logging.info("Starting search for {} :".format(region_name))
            api.search_tweets(region_name, dbtwitter, dbusers, region['geocode'], filter_list)
def searchTimelinesThread(api, db_twitter, db_users, filter_list=None):
    """Harvest the timelines of users not yet marked as searched, in an endless loop.

    Each pass iterates over the ids in ``db_users``. A user whose document has
    a truthy ``'searched'`` field is skipped. Otherwise the document is saved
    back with ``'searched'`` set to ``True`` and
    ``api.retrieve_timeline_tweets`` is called with the document's ``'state'``
    value, ``db_twitter``, the user id and ``filter_list``. The function never
    returns.

    Args:
        api: object exposing ``retrieve_timeline_tweets``.
        db_twitter: handle passed through to ``api.retrieve_timeline_tweets``.
        db_users: mapping-like store of user documents that also provides ``save``.
        filter_list: filter values forwarded to the API call; ``None`` (the
            default) is replaced by a new empty list.
    """
    # A fresh list per call replaces the former shared mutable ``[]`` default.
    if filter_list is None:
        filter_list = []

    while True:
        # NOTE(review): a pass that finds no unsearched user restarts at once,
        # with no pause between scans -- consider a back-off if that is a concern.
        for userid in db_users:
            # Fetch the document once and reuse it for the check, the update
            # and the state lookup instead of re-reading it each time.
            user = db_users[userid]
            if user['searched']:
                continue
            user['searched'] = True
            user['_id'] = userid
            try:
                db_users.save(user)
            except couchdb.http.ResourceConflict:
                # The save was rejected with a conflict; presumably another
                # writer updated this document first, so leave the user alone.
                continue
            api.retrieve_timeline_tweets(user['state'], db_twitter, search_user=userid,
                                         filter_list=filter_list)
# Script entry point: start the harvester only when this file is executed directly.
if __name__ == "__main__":
    main()