① Read the two input .txt files, use regex matching on the placeholders, and splice the pieces together (all combinations).
② Assemble each spliced text (e.g. "找一下恒洁") into a complete URL request and run the crawler search.
③ Save every response's content, match the information we need in it, and keep a count.
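Before the full script, here is a minimal sketch of steps ① and ②, assuming the semantic file holds template lines with a {placeholder} (e.g. "找一下{XXX}") and the first line of the merchant file is a sample request URL with its query string. The URL, parameter names, and sample strings below are made up for illustration, not taken from the real service.

# -*- coding: utf-8 -*-
# Sketch of steps 1-2: template splicing plus query-string rebuilding.
import re, urllib, urlparse

template = '找一下{XXX}'   # hypothetical line from 位置语义.txt
title = '恒洁'             # hypothetical merchant name
# Step 1: replace the {placeholder} with the merchant name.
text = re.compile('{.*}').sub(title, template)   # -> '找一下恒洁'

# Step 2: parse the sample URL's query string, swap in the new text,
# and re-encode it into a complete request URL.
sample_url = 'http://example.com/search?city=beijing&text=placeholder'  # made-up URL
base, query = sample_url.split('?')
values = dict((k, v[0]) for k, v in urlparse.parse_qs(query).items())
values['text'] = text
print base + '?' + urllib.urlencode(values)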
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib, urllib2, re, urlparse

def readsrcinfo(filepath):
    # Merchant file: line 1 is a sample request URL, the remaining lines are merchant names.
    filepathu = unicode(filepath, 'utf-8')
    srcfile = open(filepathu)
    srcinfo = srcfile.readlines()
    srcfile.close()
    return srcinfo

def readseminfo(filepath):
    # Semantic file: one query template per line, each containing a {placeholder}.
    filepathu = unicode(filepath, 'utf-8')
    semfile = open(filepathu)
    seminfo = semfile.readlines()
    semfile.close()
    return seminfo

def downtext(srcinfo, seminfo):
    urlinfo = srcinfo[0]     # sample request URL with its query string
    maininfo = srcinfo[1:]   # merchant names
    urla = urlinfo.split('?')
    url = urla[0] + '?'
    print url
    # Parameter dictionary: parse the sample query string into single-valued pairs.
    valueso = urlparse.parse_qs(urla[1])
    values = {}
    for key, value in valueso.items():
        values[key] = value[0].strip()
    # A {placeholder} in each template is replaced by the merchant name.
    resub = re.compile('{.*}')
    datafile = open('Totalldata.txt', 'w')
    for title in maininfo:
        filename = title.strip().decode('utf-8') + '.txt'
        errorfile = open(filename, 'w')
        statuscount = 0
        for semi in seminfo:
            # Step 1: splice the merchant name into the template.
            text = resub.sub(title.strip(), semi)
            # Step 2: rebuild the query with the new text and send the request.
            values['text'] = str(text.strip())
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            try:
                response = urllib2.urlopen(req, timeout=5)
                content = response.read()
                # Step 3: count responses whose status field is not 1 and log them.
                b = re.compile('"status":(\d)')
                if re.findall(b, content)[0] == '1':
                    pass
                else:
                    statuscount = statuscount + 1
                    errorfile.write(text.strip() + '\t' + content)
                    errorfile.write('\n')
            except urllib2.HTTPError, e:
                print "The server couldn't fulfill the request"
                print "Error code:", e.code
                if e.code == 404:
                    print "Page not found!"
                elif e.code == 403:
                    print "Access denied!"
                else:
                    print "Something happened! Error code", e.code
                    print "Return content:", e.read()
            except urllib2.URLError, err1:
                print "Failed to reach the server"
                print "The reason:", err1.reason   # was e.reason: e is unbound here
        print statuscount,
        # One record per merchant; newline added so records don't run together.
        datafile.write(title.strip() + '\t' + 'statusFail:' + str(statuscount) + '\n')
        statuscount = 0
        errorfile.close()
    datafile.close()

if __name__ == '__main__':
    srcfile = '商家信息-大兴店.txt'
    srcinfo = readsrcinfo(srcfile)
    semfilepath = '位置语义.txt'
    seminfo = readseminfo(semfilepath)
    downtext(srcinfo, seminfo)
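For step ③, the script decides success by pulling the status field out of the raw response with the regex "status":(\d). A quick check of that pattern against hypothetical response bodies (the JSON shape here is an assumption, not taken from the real service):

# -*- coding: utf-8 -*-
import re

b = re.compile('"status":(\d)')
ok = '{"status":1,"result":[]}'              # hypothetical success response
bad = '{"status":2,"message":"no result"}'   # hypothetical failure response
print re.findall(b, ok)[0]    # '1' -> treated as success, nothing logged
print re.findall(b, bad)[0]   # '2' -> statuscount goes up, line written to the error file

One caveat: if a response carries no status field at all, re.findall(...)[0] raises IndexError; checking the list length before indexing would make the counter more robust.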