diff --git a/Data collection/big_to_couch.py b/Data collection/big_to_couch.py
index 54f8b6085b43f58fbe923cfed4a7fdc00e84c58f..6c6774f251ba0abad4caf8612d51c9ea6d3e735e 100644
--- a/Data collection/big_to_couch.py	
+++ b/Data collection/big_to_couch.py	
@@ -1,48 +1,7 @@
 import json
 import couchdb
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
-
-
-def classifyTwt(twt):
-    tags = []
-    sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby']
-
-    if 'text' in twt:
-        text =  twt['text']
-    else:
-        text = twt['extended_tweet']['full_text']
-    for s in sports:
-        s = s.lower()
-        if s in text.lower():
-            if s in ['afl','footie','#afl','footy', '#afl']:
-                tag = 'Footie'
-            elif s in ['tennis', 'australianopen']:
-                tag = 'Tennis'
-            elif s in ['nrl', '#nrl', 'rugby']:
-                tag = 'Rugby'
-            else:
-                tag = s
-            tags.append(tag)
-    return tags
 
 coords = []
 areas = {}
diff --git a/Data collection/filterbigupdate.py b/Data collection/filterbigupdate.py
index 0083cfea8478a90a82a9d3f2588beae84d392aee..57c40a49e55fbf93063d53ee32d01b8411c66708 100644
--- a/Data collection/filterbigupdate.py	
+++ b/Data collection/filterbigupdate.py	
@@ -1,5 +1,6 @@
 import json
 import couchdb
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
 couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
 db = couchserver.create("live_demo2")
@@ -7,26 +8,6 @@ db = couchserver.create("live_demo2")
 with open('./'+'big_filtered.json', 'r', encoding = 'utf-8') as file:
     biga = json.load(file)
 
-
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
     
 for k,v in biga.items():
     loc = v['location']
diff --git a/Data collection/to_couch.py b/Data collection/preprocess.py
similarity index 57%
rename from Data collection/to_couch.py
rename to Data collection/preprocess.py
index 494eaa3191934dfa85de40853af7d02f8be8f9f3..ce0d9082a02083c2c6e119188c5d20656b7ee425 100644
--- a/Data collection/to_couch.py	
+++ b/Data collection/preprocess.py	
@@ -1,25 +1,3 @@
-import json
-import codecs
-import ast
-import couchdb
-import os
-
-
-data = []
-log = []
-couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
-db = couchserver['live_demo2']
-
-
-for filename in os.listdir('./UPLOAD/'):
-    if filename.startswith('twt_stream'):
-        with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file:
-            for line in file:
-                try:
-                    line = ast.literal_eval(line)
-                    data.append(line)
-                except:
-                    log.append(filename)
 
 def save2Couch(twt, db):
     doc = twt
@@ -37,6 +15,7 @@ def updateCDB(twt, db):
     db[twt['_id']] = doc
     print('update:')
 
+
 def replaceCDB(twt,db):
     del db[twt['id_str']]
     db.save(twt)
@@ -49,7 +28,7 @@ def classifyTwt(twt):
     if 'text' in twt:
         text =  twt['text']
     else:
-        text = twt['extended_tweet']['full_text']
+        text = twt['full_text'] if 'full_text' in twt else twt['extended_tweet']['full_text']
     for s in sports:
         s = s.lower()
         if s in text.lower():
@@ -63,10 +42,4 @@ def classifyTwt(twt):
                 tag = s
             tags.append(tag)
     tags = list(set(tags))
-    return tags
-
-for i in data:
-    twt = next(iter(i.values()))
-    twt['tags'] = classifyTwt(twt)
-    if len(twt['tags'])> 0:
-        save2Couch(twt, db)
\ No newline at end of file
+    return tags
\ No newline at end of file
diff --git a/Data collection/searchFile_to_couch.py b/Data collection/searchFile_to_couch.py
index f9bda056e9f52b987e3f2caa45d2b80163097dfd..67e942a591478678703a8571c2f986e0e1212a48 100644
--- a/Data collection/searchFile_to_couch.py	
+++ b/Data collection/searchFile_to_couch.py	
@@ -2,55 +2,12 @@
 import json
 import couchdb
 import os
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
 data = {}
 couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
 db = couchserver['live_demo2']
 
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
-
-
-def classifyTwt(twt):
-    tags = []
-    sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby']
-
-    if 'text' in twt:
-        text =  twt['text']
-    else:
-        text = twt['full_text']
-    for s in sports:
-        s = s.lower()
-        if s in text.lower():
-            if s in ['afl','footie','#afl','footy', '#afl']:
-                tag = 'Footie'
-            elif s in ['tennis', 'australianopen']:
-                tag = 'Tennis'
-            elif s in ['nrl', '#nrl', 'rugby']:
-                tag = 'Rugby'
-            else:
-                tag = s
-            tags.append(tag)
-    tags = list(set(tags))
-    return tags
 
 for filename in os.listdir('./UPLOAD'):
     if filename.startswith('twt_search'):
diff --git a/Data collection/streamFile_to_couch.py b/Data collection/streamFile_to_couch.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb489df3029bf4d81f3ee9f82253eb7fc07733ba
--- /dev/null
+++ b/Data collection/streamFile_to_couch.py	
@@ -0,0 +1,33 @@
+"""Load streamed tweets from ./UPLOAD/ and save the ones classifyTwt tags to CouchDB."""
+import json
+import codecs
+import ast
+import couchdb
+import os
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
+
+
+data = []
+log = []
+couchserver = couchdb.Server("http://admin:admin@172.26.130.79:5984/")
+db = couchserver['live_demo2']
+
+
+# Every line of each twt_stream* file is parsed as a single Python literal.
+for filename in os.listdir('./UPLOAD/'):
+    if filename.startswith('twt_stream'):
+        with open('./UPLOAD/'+filename, 'r', encoding = 'utf-8') as file:
+            for line in file:
+                try:
+                    line = ast.literal_eval(line)
+                    data.append(line)
+                except Exception:
+                    # Best effort: note which file held the unparsable line and keep going.
+                    log.append(filename)
+
+# Each parsed record is a dict; its first value is the tweet. Untagged tweets are skipped.
+for i in data:
+    twt = next(iter(i.values()))
+    twt['tags'] = classifyTwt(twt)
+    if len(twt['tags'])> 0:
+        save2Couch(twt, db)
\ No newline at end of file
diff --git a/Data collection/tweet_gatherer_app.py b/Data collection/tweet_gatherer_app.py
index 98e33a73560125409aa453337136855dba073d30..72f97c9977c56a04c36227c8bad78695a1751909 100644
--- a/Data collection/tweet_gatherer_app.py	
+++ b/Data collection/tweet_gatherer_app.py	
@@ -8,6 +8,7 @@ import os.path
 import datetime 
 import re
 from urllib3.exceptions import ProtocolError
+from preprocess import save2Couch, updateCDB, replaceCDB, classifyTwt
 
 collector_type = sys.argv[1]
 region = sys.argv[2]
@@ -15,60 +16,12 @@ region = sys.argv[2]
 #change code to your current Twitter account
 
 
-
 if os.path.isfile('twt_stream{}.json'.format(region)):
     outfile = open('twt_stream{}.json'.format(region), 'a+', encoding ='utf-8')
 else:
     outfile = open('twt_stream{}.json'.format(region), 'w+', encoding ='utf-8')
 
 
-
-def save2Couch(twt, db):
-    doc = twt
-    doc['_id'] = doc['id_str']
-    try:
-        db.save(doc)
-    except:
-        print(doc['_id'] +" already exist in the database")
-        replaceCDB(doc, db)
-
-
-def updateCDB(twt, db):
-    doc = db.get(twt['id_str'])
-    up = twt['tags']
-    doc['tags'] = up
-    db[twt['_id']] = doc
-    print('update:')
-
-def replaceCDB(twt,db):
-    del db[twt['id_str']]
-    db.save(twt)
-
-
-def classifyTwt(twt):
-    tags = []
-    sports = ['AFL', 'tennis', 'footie','swimming','AustralianOpen', 'footy' ,'soccer', 'cricket', '#AFL', 'netball', 'basketball', 'NRL', '#NRL', 'rugby']
-
-    if 'text' in twt:
-        text =  twt['text']
-    else:
-        text = twt['extended_tweet']['full_text']
-    for s in sports:
-        s = s.lower()
-        if s in text.lower():
-            if s in ['afl','footie','#afl','footy', '#afl']:
-                tag = 'Footie'
-            elif s in ['tennis', 'australianopen']:
-                tag = 'Tennis'
-            elif s in ['nrl', '#nrl', 'rugby']:
-                tag = 'Rugby'
-            else:
-                tag = s
-            tags.append(tag)
-    tags = list(set(tags))
-    return tags
-
-
 class StreamListener(tweepy.StreamListener):
     
     def __init__(self, config, db):
@@ -177,8 +130,6 @@ if __name__ == "__main__":
                     tmp.append(tweet)
 
 
-
-
         for v in tmp:
             final[v.id_str] = v