Skip to content
Snippets Groups Projects
Commit 61b3e743 authored by Terry Liao's avatar Terry Liao
Browse files

融合了文件分析处理

parent a4ee7ebd
No related branches found
No related tags found
No related merge requests found
from sklearn.externals import joblib
import json
import couchdb
# Batch job: walk every document in the "raw_tweets" CouchDB database, keep the
# tweets whose location mentions one of the target cities, run the English
# ones through the pre-trained model, and copy those predicted as class 1 into
# the "tweet_results" database.

# NOTE(review): joblib.load unpickles the file, so only load a model file that
# comes from a trusted source.
model = joblib.load("train_model.m")

# NOTE(review): credentials and host are hard-coded; consider reading them
# from the environment or a config file instead of committing them.
user = "admin"
password = "password"
dbserver = couchdb.Server(
    "http://{}:{}@172.26.38.157:5984/".format(user, password)
)
db = dbserver["raw_tweets"]
r_db = dbserver["tweet_results"]

# Number of tweets copied into "tweet_results" during this run.
count = 0

# One-element batch reused for every prediction; slot 0 is overwritten with
# the current tweet text before each model.predict call.
dataset = ["1"]

# Target cities; a tweet is kept only if its location contains one of these.
cities = ['melbourne', 'sydney', 'adelaide', 'perth', 'brisbane']

for doc_id in db:
    # Iterating the database also yields design documents, which are not
    # tweets and carry none of the fields read below.
    if doc_id.startswith('_design/'):
        continue

    tweet = db.get(doc_id)
    if tweet is None:
        # The document disappeared between listing and fetching.
        continue

    # The stored location may be null or empty; such tweets cannot match
    # any target city.
    raw_location = tweet.get('location')
    if not raw_location:
        continue

    # Generalize the free-text location to the first target city it mentions.
    lowered = raw_location.lower()
    location = next((city for city in cities if city in lowered), None)
    if location is None:
        # Not in the target cities.
        continue

    # Only English tweets are classified and stored.
    lang = tweet['lang']
    if lang != 'en':
        continue

    text = tweet['text']
    dataset[0] = text
    predicts = model.predict(dataset)
    if predicts[0] != 1:
        continue

    p_tweet = {
        '_id': doc_id,
        'user_id': tweet['user_id'],
        "create_time": tweet['create_time'],
        "location": location,
        "lang": lang,
        'text': text,
    }
    try:
        r_db.save(p_tweet)
    except couchdb.ResourceConflict:
        # A result with this _id already exists (earlier run, or stored by
        # the harvester); skip it rather than abort the whole batch.
        continue
    count += 1
CouchDB
sklearn
tweepy
\ No newline at end of file
File added
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import json
import csv
import os import os
import couchdb import couchdb
import tweepy import tweepy
from tweepy import OAuthHandler from tweepy import OAuthHandler
from sklearn.externals import joblib
class TweetSearchHavester(): class TweetSearchHavester():
def __init__(self,couch): def __init__(self,couch):
self.couch = couch self.couch = couch
self.model = joblib.load("./train_model.m")
def run(self, ids , city): def run(self, ids , city):
dict = {} dict = {}
...@@ -39,6 +40,7 @@ class TweetSearchHavester(): ...@@ -39,6 +40,7 @@ class TweetSearchHavester():
def get_all_tweets(self, user_id, api): def get_all_tweets(self, user_id, api):
new_tweets = api.user_timeline(user_id=user_id, count=50) new_tweets = api.user_timeline(user_id=user_id, count=50)
db = self.couch['raw_tweets'] db = self.couch['raw_tweets']
process_db = self.couch['tweet_results']
for tweet in tweepy.Cursor(api.user_timeline,id = user_id ).items(50): for tweet in tweepy.Cursor(api.user_timeline,id = user_id ).items(50):
# save most recent tweets # save most recent tweets
dic = {} dic = {}
...@@ -53,6 +55,9 @@ class TweetSearchHavester(): ...@@ -53,6 +55,9 @@ class TweetSearchHavester():
dic['location'] = tweet.user.location dic['location'] = tweet.user.location
# print(dic) # print(dic)
try: try:
p_dic = date_process(dic,self.model)
if p_dic != None:
process_db.save(p_dic)
db.save(dic) db.save(dic)
except: except:
pass pass
......
...@@ -7,10 +7,12 @@ import threading ...@@ -7,10 +7,12 @@ import threading
from tweepy import OAuthHandler from tweepy import OAuthHandler
from tweepy import Stream from tweepy import Stream
from tweepy.streaming import StreamListener from tweepy.streaming import StreamListener
from sklearn.externals import joblib
class listener(StreamListener): class listener(StreamListener):
def __init__(self,path): def __init__(self,path):
self.couch = couchdb.Server(path) self.couch = couchdb.Server(path)
self.model = joblib.load("./train_model.m")
def convertValue(self,origin): def convertValue(self,origin):
dic = {} dic = {}
dic['_id'] = origin["id_str"] dic['_id'] = origin["id_str"]
...@@ -27,10 +29,14 @@ class listener(StreamListener): ...@@ -27,10 +29,14 @@ class listener(StreamListener):
try: try:
db = self.couch['raw_tweets'] db = self.couch['raw_tweets']
id_db = self.couch['user_id'] id_db = self.couch['user_id']
pc_db = self.couch['tweet_results']
content = json.loads(data) content = json.loads(data)
dic = self.convertValue(content) dic = self.convertValue(content)
id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False} id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False}
# print(id_doc) # print(id_doc)
p_dic = date_process(dic,self.model)
if p_dic != None:
process_db.save(p_dic)
id_db.save(id_doc) id_db.save(id_doc)
db.save(dic) db.save(dic)
# print("success") # print("success")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment