Skip to content
Snippets Groups Projects
Select Git revision
  • 040388038ff6e9ec7377ffe7de80b8004e0f433c
  • master default protected
2 results

run_dsa_database.py

Blame
  • harvester.py 3.18 KiB
    import logging
    import threading
    import couchdb
    import twitter_harvest.util.utilities as utils
    import twitter_harvest.util.config as config
    from twitter_harvest.auth import TwitterCredentials
    
    
    def main():
        """Configure logging, open the CouchDB databases and start the harvest threads.

        One thread is started per entry in ``TwitterCredentials.cred``: the
        credential at rank 0 runs ``searchThread`` and every other rank runs
        ``searchTimelinesThread``. The threads are started but not joined here.
        """
        # Log to harvester.log; filemode 'w' truncates the file on each start.
        log_format = "%(asctime)s: %(message)s"
        logging.basicConfig(filename='harvester.log', filemode='w', format=log_format,
                            level=logging.INFO, datefmt="%H:%M:%S")

        def open_or_create(server, name):
            # Reuse the database if the server already has it, else create it.
            return server[name] if name in server else server.create(name)

        # Two server handles built from the same configuration value:
        # one is used for the tweets database, the other for the users database.
        connection = config.DB_Config
        tweet_server = couchdb.Server(connection)
        user_server = couchdb.Server(connection)

        tweets_db = open_or_create(tweet_server, 'twitter')
        users_db = open_or_create(user_server, 'users')

        # Prepare a list of threads covering all available credentials.
        workers = []
        for rank in range(len(TwitterCredentials.cred)):
            if rank == 0:
                # Search states
                message = "Main    : create and start Search thread %d."
                target = searchThread
            else:
                # Search user timelines
                message = "Main    : create and start Search Timeline thread %d."
                target = searchTimelinesThread

            logging.info(message, rank)
            api = utils.TwitterAPI(cred_rank=rank)
            worker = threading.Thread(target=target,
                                      args=(api, tweets_db, users_db, config.Keywords))
            workers.append(worker)
            worker.start()
    
    
    def searchThread(api, dbtwitter, dbusers, filter_list):
        """Loop forever over ``config.States``, running a tweet search for each one.

        Args:
            api: object exposing ``search_tweets``.
            dbtwitter: tweets database handle, passed through to ``search_tweets``.
            dbusers: users database handle, passed through to ``search_tweets``.
            filter_list: filter terms, passed through to ``search_tweets``.

        This function never returns.
        """
        while True:
            # One full pass over the configured states, then start over.
            for entry in config.States:
                state_name = entry['state_name']
                logging.info("Starting search for {} :".format(state_name))
                api.search_tweets(state_name, dbtwitter, dbusers,
                                  entry['geocode'], filter_list)
    
    
    def searchTimelinesThread(api, db_twitter, db_users, filter_list=None):
        """Loop forever over the users database, harvesting unsearched timelines.

        For every user document whose ``'searched'`` field is falsy, the
        document is marked as searched and saved, and then the user's timeline
        is retrieved through ``api.retrieve_timeline_tweets``.

        Args:
            api: object exposing ``retrieve_timeline_tweets``.
            db_twitter: tweets database handle, passed through to the API call.
            db_users: users database; iterated for user ids, indexed by id to
                fetch a document, and written with ``save``.
            filter_list: filter terms passed through to the API call; an empty
                list is used when omitted.

        This function never returns.
        """
        # Avoid a mutable default argument shared between calls.
        if filter_list is None:
            filter_list = []

        while True:
            # Continuously iterate through the users and search their timelines.
            for userid in db_users:
                # Fetch the document once and use this same copy for the check,
                # the save and the state lookup, so the save is made against
                # the revision that was actually inspected.
                user_doc = db_users[userid]
                if user_doc['searched']:
                    continue

                user_doc['searched'] = True
                try:
                    db_users.save(user_doc)
                except couchdb.http.ResourceConflict:
                    # The document changed since it was fetched (presumably
                    # another thread claimed this user); skip it.
                    continue

                api.retrieve_timeline_tweets(user_doc['state'], db_twitter,
                                             search_user=userid,
                                             filter_list=filter_list)
    
    
    # Start the harvester only when this file is run as a script, not on import.
    if __name__ == "__main__":
        main()