From 6196ce44050e20907fcba5bd79b06713d08d393b Mon Sep 17 00:00:00 2001 From: Terry Liao <jinliang@student.unimelb.edu.au> Date: Mon, 13 May 2019 14:16:46 +1000 Subject: [PATCH] =?UTF-8?q?=E6=8A=8A=E4=B8=A4=E4=B8=AAharvester=E7=94=A8?= =?UTF-8?q?=E4=B8=80=E4=B8=AApy=E6=96=87=E4=BB=B6=E8=B0=83=E8=B5=B7?= =?UTF-8?q?=EF=BC=8C=E5=A2=9E=E5=8A=A0=E4=BA=86init=5Fdb.py=E5=8E=BB?= =?UTF-8?q?=E5=88=9B=E5=BB=BA=E6=8C=87=E5=AE=9A=E7=9A=84db?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../general_process.cpython-37.pyc | Bin 0 -> 883 bytes .../__pycache__/tweepy_search.cpython-37.pyc | Bin 0 -> 2899 bytes .../__pycache__/tweepy_stream.cpython-37.pyc | Bin 0 -> 3236 bytes tweet_havester/init_db.py | 24 +++++++ tweet_havester/tweepy_search.py | 13 ++-- tweet_havester/tweepy_stream.py | 61 ++++++++---------- tweet_havester/tweet_havester.py | 11 ++++ 7 files changed, 72 insertions(+), 37 deletions(-) create mode 100644 tweet_havester/__pycache__/general_process.cpython-37.pyc create mode 100644 tweet_havester/__pycache__/tweepy_search.cpython-37.pyc create mode 100644 tweet_havester/__pycache__/tweepy_stream.cpython-37.pyc create mode 100644 tweet_havester/init_db.py create mode 100644 tweet_havester/tweet_havester.py diff --git a/tweet_havester/__pycache__/general_process.cpython-37.pyc b/tweet_havester/__pycache__/general_process.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..499ebe437a399bd4deb4396de8a70396355733ba GIT binary patch literal 883 zcmZ?b<>g{vU|^8(z7gZV&cN^(#DQUE1_lNP1_p*=BL)VB6owSW9EM!RC`LvQn<<AW ziYbL5g(-(QiaC{W0ZR(=LdGc86qXd$7KSLc6sBMXO}3XHlQbD`v1R2a<zy!LRq>V- zC1&Qu=jNxR=IG@nGlAq87#MgM7#P?Y7#N&EmaPC;#+bsC!VEGjowb%Jg)N=2mbr!@ zg<X=NmZgRvg+r2|mbHc<g;SEDmaT>@g)5u6$RdRyn4yLtg<F!Lmc51{g-4R1mZL(c zhRKB?R;re>hO>r!0mDLuY__758qQkwLO+oDLcef^8kQQ48ul8t8rBrv6uvoZwOlo< zHOwjeHVie43m9vd7c$mz*KnusOERPgWHS{lNfE5!N)fVQsNo7`&=mH&#hII$layat zl$Uyot++BJFSYU(M`B89PGV+C>MhoS)S{A%TO3J6nZ-$od8xM;4Q?^VXQtd@DM_s; zxy6!`n3sNwDK+mFM^1inVo7Fx-YxFrqSVBa)cBIj+|*mFX^A<-shVs>oD2*MMSLKF z7esJ_2vE%3;>k>l&reHB%`46<OD$q#U|@I&B5tvi6qTkH2{14)XfoYm&B-rMExN^C zP?VaInOvgDbBnd4JT<lC7ArU^qS%r%OEOc7qu5gtOA?DyOKve`rWApUF9KTxwi0Cd zEtcfWlFBIdf_Sj@C=RHB#VZ+#BpDbOe&y+h7N-^!>*r)9=4WN*<zy!2rR%$-7H5~_ z7wG4tCYO{JrRpc=<d>%CCl(iHrsw6R=9L)fCnqQCBJp)gQWJB{EcL;b#AhUyr52Z@ z7U`#_=A{-T=EN5i<tL{W7wZ*N-r@l{7$(35N<IdlG{wZn$i~P4f=nEYEMS_8iHC`e ziG!Jm5e7M!I2gGYc^Ekui=;up#C}V#I6Egbu_#Y3wW1`oC@(RmxCoR)Z?R+*=jYvG zPtGq*&PYkR#gdbsm{KGLiUN>Eu+NbQDFz0HTO2mI`6;D2sdk{qDCPhqa*#6^xtIVO C1Lphy literal 0 HcmV?d00001 diff --git a/tweet_havester/__pycache__/tweepy_search.cpython-37.pyc b/tweet_havester/__pycache__/tweepy_search.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..477cbf31596d50ad27e7ef9bd5185cabc4d57899 GIT binary patch literal 2899 zcmZ?b<>g{vU|>*qc_Su{mx19ihy%mS3=9ko3=9m#Q49<WDGVu$ISf${nlXwI%x8*X z0@KV<%qa{hOgYTCEKw|sAT`W6thsDaY+yEH4to@bJ3|Uf3Tq2P3Trb{6lV%s3VRDf z6juskFoPz?OOTCznvA!2{2fb6GCUIVQgTv@G#PKPW#uR3WF{qp<YAZz<|-8i28L7y zkV~SNQW#U1S{S03Q<zg&S{R~OQdm>iS{R~OQ`kY~-x3ZfPfaZePE9OI&hSVqOD!%* zElLI%1TqvvgS_et@~RjE149jC4MRLb3F88$6vl-hHggI?FoPzOUlqTeeo0YcW?p=5 zeoAVNUals~E!O1x(&UUHkO8+?a`F>XZn1(THJNU)6sP8-6|pifFsx)K;$&c8_?4p{ zTAW%`te=yan4guImy?;8m#*)UTAW>yU!b3pnp{#^l&YVclV6&mpIBU+nVy%Mnpa|^ zpPZbmi^SJ0NlnZ(v(zsEc`7~w>K`z-pfbJ~>`T3Z%3B=q@tJv<CGqh*AfK^-{L0A2 zSS5ocF!W$blR+K<spMl|U;rfzaHzauU|`5(sAVo;00ne2Q~#-27Et)quq<S(Wv*ea zVUlD>VU}b_VUc90WrgxtB^hejYFJa)BpGViYgkj*B^hctYFJY^BpGTsYgkh_B^heD zN|<ZdYdD&jYPoB;7qHZD)^IIkVq_>`&0?$JPT@-7O5yHhuH~r+t6_9ui1n-GtrVzX zSioMxTf+lV7tT=F!~#;yk<C@Kg^>XSOE^<_ni&}xA|Y~I*<3~EK;n!HC7da|FqsI3 z!lrPBaE26yK!zZO2!<M#1>6f6K*=?jL6gs~N=6SHo+a?$jZe<cOUq2x%PP*#yTw>^ zi`g;2^A;N<u%lR$@=NnlZt*1N=M|UcrWVC#r&iwLhx3Y4lZ#SIZt)~0C#Mz{$Cu=1 zr{>)fM&`vsl!Rz9MG2KBW|qX~=fxK#mZZk#Waeg;L`h<b#pmUhWTsVC8M`HB=A@=5 zl;kTEmF6jwWTYx2=jY}o=A|g)gSd$$3Z=!VMLG(>*_j0enR)4YdU{nVKAFWO3YmE+ zsTB(Or6mgaX$nP&dFiPrsx*0SvE&z|=G|fe`MHRVfq|h2lvknzic?GCkzId_4U`lM zDvNj-7#MExrGun%a^gXPCB;!(p!8PYT2z!@bc?m1C^N6*7MCZ;YN$|hPJVH!CSMT; zNHcS0O7Sh0<jj)FTP!J=$tAZK)1o*N3o;>r7R6m$Qk0sQ8=qK|npngSa*+Ut5CjoI z3=9lWEQzHh8Ml}d3o>spWu}009fSa-<y*`}rFkF?#TuX_&d9>Z!^p)bz{tbM0f9_h zj6942j9iRNi~?YBK1La)Dshkp&`J-zqS8EgUg2b5U|?rpV1Vb98m26U8b(lF>1C>A zu3^Yx1m%ue7BCB(M=B1LFl8}A)v@+6EnoqaH7*RXD{9#?LHUI(i?xP5h0TVcgsp}x zi@lk#hCPMdhM|TnizAD(hCPJ?%;u_LPvHc!xNF!`xWFu)6ozc(qDeJuSv*<1HS8(e zHVk<@V10ZLUZq(H{{n#;_8NvP!G%n<93?_E9H4wr!vWTLqK0*Wa1F;oMn+Js5Uybd zvBDV&tsuE5lA+Klf}zkVoB>=a^7y?pVqjp<WV*!;N@wwzDc~Y8uOy1AD6t%t26(|~ zA-*WJxHPAvSd+0x9u!&3@tG-6+{s0$pp;&cnVVXq1WE%eC8-r9w^(u#^U`l|<m4wO zmSpDVX^KSgf-L~4%gM}31sAMEq9AkFoJ)&}L8&XVBsI4<iY+rGzPP057ISe)Q4|-% z_>}m>l3Oew1He`nfhvz8Rgh-Zf}F(U)LSfhiMgpoY9RZW(hH(^QW8rN;|q%NK{>Vv z6uCv>AUT%e#IjUPPH<ucyS7LUWC&MYD$K99m{O9WxS%@YQ<9=s!G4coPRUG;Vl9YI z$xMc%NKmRN0wv2LP^AM(k;PizWXZ?K#mL1d!^FbK#mEOvq98sW6Au#;BOf@Sg2dSv ztCX-MOw`=xr^$AUBR(GFx%l{7T=Afqt28GyKK>R@e0*VPVh%*6h#QoH6hH*XF-Q(X zxDr%>fRq)3N&*f>4i*lHB9K@zBgoYZ3=E*$4^j@!20V-m3?&Q;7;6|7GS)JJGJ_;T zEfcsBVM<|2VQyyX7pP?^VP3!jsYF1@2~rfNGt{zHh}AH=FvRkKDv$-NHEf_HSm+SW zkit^Sp2AYgQNq^DP|I1uUc=bTSj$y0uY@Co6_kW4I%=3)7-Ac0xoenexFs2Cd1`n- z6;N?S4d(*Z8g7s|H5@7I<_xtQc}#^BH5@4%*(^mBpv)!Ckisd(05QLYYXN5sXANr& zyCegsqNz};;dEh$l|z`D!UZzfhM|UO0apzV$lSsU;S3OW6;^~ZlyIbQgIrXY1@c)i zgC>t3IKAMnS&BeabrC2)s|1Tn5{pU{5*6}N%N2@1^-C02W-+7<lbZ4pq~@gv0|UcL zPy-{RiZ?AYuQ)>?5u#L+`xZO6VUm&*#TJ}eRF+y)Bn*mOkPdKBev2)!pdd9b<rYg$ zW^oBPBEc!V2%HFXKv|mi7I!gNYkWasNd_dp>4FTA0THsGbkCNYS(2GrTqF(V6qjd~ zBxe*UgQTTExrH$ioZD`(fN~R3q5`LSkVS5wmWd50{V;Mdaxj4kWD!P4x@TczVPs?E zU}R$C0Lg(#E+!C<hmi$bNP`M(P+kE$S(Cd6<hNT)`Nc&hAP0a-21tD+Se%`cnpl*l zms(MhT9lWVQw(mdMe(Pn=A{-T=D@SLHb@^RNrO#9axW-VgR|-_4x8Nkl+v73J5Wtn S45~{&t^i>kP&LBLXAJ<xwbwuZ literal 0 HcmV?d00001 diff --git a/tweet_havester/__pycache__/tweepy_stream.cpython-37.pyc b/tweet_havester/__pycache__/tweepy_stream.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c2944ac09c80c5722949a52ed3eb6e27d4e1422 GIT binary patch literal 3236 zcmZ?b<>g{vU|?{6b0db6hk@ZShy%kc3=9ko3=9m#T?`BiDGVu$ISf${nlXwI%x8*X z0@KV<%wU=&iY0|1g(-(Amo<u&5u}zmhbfmWiVe(W$zjjsh~faVS#vmZxuUqhY{ne! zC?0o)6t)!h7KRk|RE}omC|-Al6pj?m7KRkgW~L~<6s{ER7KSMP6vki%O`ew^7y4;3 z-s16hEG^0KNX$#gNiEW3yu}t=Qk0sQtI2qa55n@vEG|jSgUDs&C*@=&C4=<BFcZv& zIt&a9sSF@rMKPr?rZBZIL@}o@r?9jzM6sl>rm(dzM6sr@r*O0|M6soC1~X`K-QviB znw<<X1!N+KW@cbua0Z2$8Uq7E3Bv-$8is|8j0`1A3z%yd7c$l`#IuyJE?`T6u-Q`> zf*CZK{Hpl%^h=5oGxOqe^HWlD^l~*hia?=%iz7ZhGcU6wKK>Saa(-!YMoQ8xw&2vF zvecqmtYCp6Rt5%!TP!*Gi7B^O!J0LhZm|@n=A_+XDM&2ISjkYt&%nU&D@Q-HIJKx) zKPNLWKPxjYCo?fGUEd|OIJ+djKtCrnxumoxRX;f=zcfWZvA8%hJuf#kuf#|{IXPJu ziLYCdnwV>5sb5l_npzT{kyw^mT#{M@<`z`O7lQ*|ub{Gsoq>UY4-^V)pm1U2V&q}0 z5`=}99?VbhfCB{@2!jJIkAZ<9lcAQWhB1XvlA(qvg~^7YhB1X1%w_?zStS`#*uZ>t zFrNd=<^Z!f!E8=2n@f@*g&~`{D5-`q1;pkC%kkJS<Z-0%)-a{;*f7*E1v6;!`Q2j6 zOaZz77IS=N$}O(sqSVBa)Rg$dl3Uymc6>=@Zt5+T(&E&jTTGcLx7a}}kXn|K)QXZ@ zEIEmJ>9<%5auSnMZ?WVh=BD1_$jMJmEXmBzyT#&{pO*@b22JK7UQm3o<rih9XXf2v zPRUFLOF#$#kQ`5PeqLE>QAt>0PHCzTC=f-!fylzh!^p!Z048}Dg_t-Pt3=^JimV47 ztegxC44?=A2kQa`28PPM8ip(eP_WiAfmuwF47JQ)7PBNnElUaG0wz!@t!1rYSioGv zx{$G!t%Rk9Erm&vp_#FUHHAfzAq5=xDeQBYYuQU!YuKAXshe#9dktF+Ll(zErdo~~ z4sf7XlyENKs^I{o{u<^5Ts7=q7SjT*8a5CsoFR{gkpToF8S<DS7*aTb88kWlZgCYQ zmdAtBS8<UzC_H$<ocN;D;?kUw;v!K728JSO1_lN<P39s{@I-NfqXHb}T$#nesfk6& z8L25Dn%qU8qU{z-R&jpbEmlwtDK1g~naEsRQgn;8peQr1<Q7wU!7ZMY#FE7Lf};H7 z)Z*eI9*`0)C@(%G=@v_IVp*ytCnV-rKx%I>r6fhMW~Ri0sDfmWBKG9`ypq(sk|KEq z28Jk5YD&pZzQtM)k0plL^Yh|C#>jypR|y=sOpIJ`D8tCX$j8V5;WM!?vN5tTRSCeO z8mbu{%b-#Rgu$^4a(oFx4I?O4z-0@gpC)6GIs*fPCKJR1Y{eysC8foX1P3Y_iomIk zBR?-bwWuh+$Oz;pka-NqSr#Vir^yZ~lR$wTAAgH09#meH=A_2Q-{OgnFDy;WfyjUh z0B~Rz8GzyoWGIr2aBGV|$rF?#ia8h<7&sU?SU7k%M8Klh%V`y&%IRPRO^#c_A)rDE zQVn=OODgPv21+J1prV-p9%Q$;i@~LAJg5wX23?Uk$SYvCg9%W$fjw%56l5}3Y(*4! zph6B5-=MM`RB(gi45Yn;A%(GqQJeu3ZzYUbOeM@&EHw-@j5SPY%qdL0Ots84%nMi- zGB7gKFfU*O(O{=C`&Ee*mn0UIC_u{I%)E4k<jj&vD+NuaTWlqXMd_&}w^$O3(u+0O z!KvUDXGum;YGO)eUivMz5HPa{WdAMZqSCxutYEWlv1H|E=4rA((hCd7yjv_OnaL%$ zm`XB`ViDy3TkOz`VgU*RP$V)i@i2-paxhj&V+#qW>SS0za5FG4fC2*?5G4!@3^hzO zjFR9Y3RLLSGDG>`;;DuiQ~=eo)-Zz#p<1>YW>76q%U;8r!Y;{B%TdBm!&<}E%v8%+ z!?}R5hP{SkArq+Tt>G*I6>==ijLl57Ts2%-tTim4#zij^sQCe^%>8aLI|g{(VuOU@ zE!L#`(!7*gppv+_G&i*<K0CGY7JqVn9)wq%np~7xa*HQ18B}1zm*i)s=G_uT=EXyl zglIC|;)GQL#kV*zi{lfEEAx^y*@{3p?3O@rYDqk@z9LY#fRjHsvE5=z%giZBEz;x$ z$3>AnC;@_6WVbjI3o@Y=b3+^*pIDTdSmXv$;SM4^K!hi#EMrM5Ey;-DEh*0|DM>AY zWC*lqEV5x>V6X#4A*8MW)iNB65D2Qg1sJQuu|;Q5X`Y`ZTahuyE)x)83L-$Ye~}Z2 z1u_=NDnzycr8$t=Vo(9V!N|eH!3Iu;$&8@v!N9-(G6PhYffGgtQd_E)v4p9Hp_#Fk zse~ESQfp@FkE&%ZVOhXh!@Q8OmZid@hQ);;)}fZQQlN%u0b31g4GSn4gfkTCu`rad zXLA+hFf!D#F)~!~lyKCrfpT0psHMtK%T~u0!4Sz%s29Ods29!v5)EVsVu%3cIZos} zC!+^$@*rA3@yYplX_@JI;HrkPNRz7wlog6V6)@Oaw^;HEQuB&HrB@Ls$%8}Z7N=`P za%usni2%u)tjRg~#i^R?;JAlWUW|#i7}JVCQ4Ws$Ta1u)1SCvB84w(E-k_Kh1BDzT z7ZV2~54h@JViaH$V3cCyV+5<w6aZU)iz&bO78|HaEvSSz(h3w&pyUWne76K3a(a+d zm6?}b1j?ULg2mZ6sfk5-dZ`s9sYQ8-ImM8cCx1Gqd7YR8uRb+E#(`oL>~XNw5Z8hu mA54JUeT%~;H$SB`C)EyQb+H7b_=hGrP(ufVIhX`A_{0F@Cjnak literal 0 HcmV?d00001 diff --git a/tweet_havester/init_db.py b/tweet_havester/init_db.py new file mode 100644 index 0000000..5edc9b1 --- /dev/null +++ b/tweet_havester/init_db.py @@ -0,0 +1,24 @@ +import couchdb +import sys + +def run(server_path): + couch = couchdb.Server(server_path) + couch.create('tweet_2014_raw') + couch.create('raw_tweets') + couch.create('tweet_2014_results') + couch.create('tweet_results') + couch.create('user_id') + print("create all db successful") + + +if __name__ == "__main__": + a=sys.argv + if(len(a) == 4): + ip = a[1] + username = a[2] + password = a[3] + path = 'http://' + username +':' + password +'@'+ip+':5984/' + else: + path = 'http://admin:password@127.0.0.1:5984/' + run(path) + pass diff --git a/tweet_havester/tweepy_search.py b/tweet_havester/tweepy_search.py index fe18c6b..b89fdd7 100644 --- a/tweet_havester/tweepy_search.py +++ b/tweet_havester/tweepy_search.py @@ -67,11 +67,16 @@ class TweetSearchHavester(): -if __name__ == '__main__': - couch = couchdb.Server('http://admin:password@127.0.0.1:5984/') +def run(server_path): + couch = couchdb.Server(server_path) db = couch['user_id'] # couch.create('test_db') - city = ["melbourne","sydney","perth","adelaide","brisbane"] + dict = {} + with open('./tweet_havester_config.json','r') as f: + dict = json.load(f) + cities = [] + for city in dict: + cities.append(city) switch = 0 count = 0 ids = list() @@ -88,7 +93,7 @@ if __name__ == '__main__': if(count > 20): switch = (switch+1)%5 count = 0 - a.run(ids,city[switch]) + a.run(ids,cities[switch]) for id in ids: data = db[id] data['isSearched'] = True diff --git a/tweet_havester/tweepy_stream.py b/tweet_havester/tweepy_stream.py index c2abc3b..45fe45c 100644 --- a/tweet_havester/tweepy_stream.py +++ b/tweet_havester/tweepy_stream.py @@ -12,6 +12,7 @@ import general_process as gp class listener(StreamListener): def __init__(self,path): + StreamListener.__init__(self) self.couch = couchdb.Server(path) self.model = joblib.load("./train_model.m") def convertValue(self,origin): @@ -34,7 +35,7 @@ class listener(StreamListener): content = json.loads(data) dic = self.convertValue(content) id_doc = {"_id":str(dic["user_id"]),"user_name":content['user']['name'],"isSearched":False} - # print(id_doc) + print(id_doc) p_dic = gp.data_process(dic,self.model) if p_dic != None: process_db.save(p_dic) @@ -52,44 +53,38 @@ class listener(StreamListener): class TweetStreamHavester(): def __init__(self,server_path): self.server_path = server_path - def process(self,city): + def process(self,city,dict): #args是关键字参数,需要加上名字,写成args=(self,) - print("start streaming"+city) - th = threading.Thread(target=TweetStreamHavester.run, args=(self,city)) + print("start streaming city: "+city) + th = threading.Thread(target=TweetStreamHavester.run, args=(self,city,dict)) th.start() th.join() - def run(self, city): - dict = {} - with open('./tweet_havester_config.json','r') as f: - dict = json.load(f) - api_token = dict[city]["API"]["stream"] - stream_area = dict[city]["bound"] - consumer_key = api_token["consumer_key"] - consumer_secret = api_token["consumer_secret"] - access_token = api_token["access_token"] - access_token_secret = api_token["access_token_secret"] - auth = OAuthHandler(consumer_key,consumer_secret) - auth.set_access_token(access_token,access_token_secret) - twitterStream = Stream(auth,listener(self.server_path)) - twitterStream.filter(locations=stream_area,is_async = True) + def run(self, city, dict): + api_token = dict[city]["API"]["stream"] + stream_area = dict[city]["bound"] + consumer_key = api_token["consumer_key"] + consumer_secret = api_token["consumer_secret"] + access_token = api_token["access_token"] + access_token_secret = api_token["access_token_secret"] + auth = OAuthHandler(consumer_key,consumer_secret) + auth.set_access_token(access_token,access_token_secret) + twitterStream = Stream(auth,listener(self.server_path)) + twitterStream.filter(locations=stream_area,is_async = True) - f.close() -if __name__ == "__main__": - couch = couchdb.Server('http://admin:password@127.0.0.1:5984/') - # couch.create('raw_tweets') - # couch.create('new_stream_tweet') - server_path = 'http://127.0.0.1:5984/' +def run(server_path): + couch = couchdb.Server(server_path) + # server_path = 'http://127.0.0.1:5984/' a = TweetStreamHavester(server_path) - try: - a.process("melbourne") - a.process("sydney") - a.process("adelaide") - a.process("brisbane") - a.process("perth") - except Exception as e: - print(e) - pass + with open('./tweet_havester_config.json','r') as f: + dict = json.load(f) + for city in dict: + try: + a.process(city,dict) + except Exception as e: + print(e) + pass + f.close() diff --git a/tweet_havester/tweet_havester.py b/tweet_havester/tweet_havester.py new file mode 100644 index 0000000..91d971d --- /dev/null +++ b/tweet_havester/tweet_havester.py @@ -0,0 +1,11 @@ +import tweepy_search as tSearch +import tweepy_stream as tStream +import time + +if __name__ == "__main__": + server_path = 'http://admin:password@127.0.0.1:5984/' + + tStream.run(server_path) + # wait for streamming for a while to start searching + time.sleep(200) + tSearch.run(server_path) -- GitLab