fix to disk storage data for both search and stream

150f1d54 · Felipe Ramos · 83d2620b · 150f1d54 · 150f1d54 · 150f1d54
Commit 150f1d54 authored 4 years ago by Felipe Ramos
--- a/data.json
+++ b/data.json
--- a/tweet_results-melb.json
+++ b/tweet_results-melb.json
--- a/tweet_stream.py
+++ b/tweet_stream.py
@@ -15,7 +15,7 @@ access_token = "1386612665682853892-Tp6J5KfT4Wr8gsIPYSr2G5W15axUlj"
 access_secret = "hN59hNhidGKLJpsJ8nX8Dr2EEB4m9ny0w70K2Fd0dpY8Q"
-outfile = open('data.json', 'r+', encoding ='utf-8')
+outfile = open('twt_stream.json', 'r+', encoding ='utf-8')
 class StreamListener(tweepy.StreamListener):
@@ -48,8 +48,9 @@ class StreamListener(tweepy.StreamListener):
            #db.save(doc)
            #self.file.append(json.load(doc))
+            serial = {doc['_id']:doc}
            print(doc)
-            outfile.write(str(doc))
+            outfile.write(str(serial) + '\n')
@@ -65,7 +66,8 @@ if __name__ == "__main__":
    auth = tweepy.OAuthHandler(API_key, API_secret)
    auth.set_access_token(access_token, 
        access_secret)
-    api = tweepy.API(auth)
+    api = tweepy.API(auth,wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
+    #outfile = open('twt_stream.json', 'r+', encoding ='utf-8')
    if collector_type == 'stream':
@@ -79,7 +81,7 @@ if __name__ == "__main__":
        box = [144.5,-38.2,145.49,-37.41] #Melb box
-        stream.filter(track=tags, locations= box)
+        stream.filter(track=tags, locations= box, languages=["en"])
@@ -89,19 +91,19 @@ if __name__ == "__main__":
        twts = {}
        # capure the most data
-        for tweet in api.search(q="AFL tennis footie swimming AustraliaOpen soccer", geocode='-37.7980882,144.9334346,60km', count=100, tweet_mode="extended"):
+        for tweet in api.search(q="AFL OR tennis OR footie OR swimming OR AustraliaOpen OR soccer", geocode='-37.7980882,144.9334346,60km', count=100, tweet_mode="extended"):
-            tweet = tweet._json
+            #tweet = tweet._json
            print("")
            print(tweet.user.screen_name)
            print("")
-            print(tweet.text)
+            print(tweet.full_text )
            tmp.append(tweet)
        replies = []
        for v in tmp:
-            twts[v.id] = v
+            twts[v.id_str] = v
            # Get retweets
            #for tweet in api.retweets(v.id, count=100):
            #   twts[tweet.id] = tweet
@@ -127,11 +129,15 @@ if __name__ == "__main__":
        #db.save(status._json)
-        save results to disk
+        #save results to disk
        serializable = {k: v._json for k,v in twts.items()}
-        with open("tweet_results-melb.json", "w") as outfile: 
+        for k, v in serializable:
-            json.dump(twts, outfile)
+            v['region'] = 'Melbourne'
+        with open("twt_search.json", "w") as outfile: 
+            json.dump(serializable, outfile)