Python regex matching + crawler + response comparison: a worked example (personally tested)

① Read the two input txt files and use a regex to splice every merchant title into every query template (all combinations); see the sketch after this list.




② Assemble each text string (e.g. "找一下恒洁", roughly "find Hengjie") into a complete URL request and send it to the search service.

③ Save every response body, match the status field of interest, and count the failures; steps ② and ③ are sketched in isolation after the full script.
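
Before the full script, here is the core of step ① on its own: each line of the semantic-template file carries a {...} placeholder, and a regex substitution splices the merchant title into it. The template and title strings below are made-up examples, not the actual file contents.

# -*- coding: utf-8 -*-
import re

resub = re.compile('{.*}')          # matches the {...} placeholder slot
template = '找一下{商家}'           # assumed template line: "find {merchant}"
title = '恒洁'                      # assumed merchant title from the other file

text = resub.sub(title, template)
print text                          # -> 找一下恒洁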

#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib, urllib2, re, urlparse



def readsrcinfo(filepath):
    """Read the merchant file: line 0 is the request URL, the rest are merchant titles."""
    filepathu = unicode(filepath, 'utf-8')
    srcfile = open(filepathu)
    srcinfo = srcfile.readlines()
    srcfile.close()
    return srcinfo


def readseminfo(filepath):
    """Read the semantic-template file: one query template per line."""
    filepathu = unicode(filepath, 'utf-8')
    srcfile = open(filepathu)
    seminfo = srcfile.readlines()
    srcfile.close()
    return seminfo

    
def downtext(srcinfo, seminfo):
    urlinfo = srcinfo[0]    # first line of the merchant file: the full request URL
    maininfo = srcinfo[1:]  # remaining lines: merchant titles

    # Split the URL into its base and query string
    urla = urlinfo.split('?')
    url = urla[0] + '?'
    print url

    # Parameter dict: parse_qs returns lists, keep the first value of each key
    valueso = urlparse.parse_qs(urla[1])
    values = {}
    for key, value in valueso.items():
        values[key] = value[0].strip()

    # Each semantic template contains a {...} placeholder for the title
    resub = re.compile('{.*}')
    


    datafile = open('Totalldata.txt', 'w')
    for title in maininfo:
        # One error file per merchant title
        filename = title.strip().decode('utf-8') + '.txt'
        errorfile = open(filename, 'w')
        statuscount = 0
        for semi in seminfo:
            # Splice the title into the template's placeholder
            text = resub.sub(title.strip(), semi)

            # Set the 'text' parameter and POST the form-encoded data
            values['text'] = str(text.strip())
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)

            try:
                response = urllib2.urlopen(req, timeout=5)
                content = response.read()
                # The body is JSON-like; "status":1 means the query succeeded
                b = re.compile('"status":(\d)')
                status = re.findall(b, content)
                if status and status[0] == '1':
                    pass
                else:
                    statuscount = statuscount + 1
                    errorfile.write(text.strip() + '\t' + content)
                    errorfile.write('\n')

            except urllib2.HTTPError, e:
                print "The server couldn't fulfill the request"
                print "Error code:", e.code
                if e.code == 404:
                    print "Page not found!"
                elif e.code == 403:
                    print "Access denied!"
                else:
                    print "Something happened! Error code", e.code
                print "Return content:", e.read()
            except urllib2.URLError, err1:
                print "Failed to reach the server"
                print "The reason:", err1.reason

            print statuscount,
        # One summary line per title: how many templates failed for it
        datafile.write(title.strip() + '\t' + 'statusFail:' + str(statuscount) + '\n')
        errorfile.close()
    datafile.close()
            
    

if __name__ == '__main__':
    srcfile = '商家信息-大兴店.txt'   # merchant info: URL on line 1, titles after
    srcinfo = readsrcinfo(srcfile)

    semfile = '位置语义.txt'          # location semantic templates
    seminfo = readseminfo(semfile)

    downtext(srcinfo, seminfo)
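
For reference, step ② in isolation: the first line of the merchant file is a full URL whose query string carries the fixed parameters; the script re-encodes them with the spliced text before POSTing. The URL below is a made-up placeholder, not the real service endpoint.

# -*- coding: utf-8 -*-
import urllib, urllib2, urlparse

# Assumed URL shape; the real endpoint is the first line of 商家信息-大兴店.txt
urlinfo = 'http://example.com/search?city=beijing&text=placeholder'

base, query = urlinfo.split('?')
values = {}
for key, value in urlparse.parse_qs(query).items():
    values[key] = value[0].strip()

values['text'] = '找一下恒洁'             # the text produced in step ①
data = urllib.urlencode(values)           # form-encode the parameters
print base + '?' + data                   # equivalent GET URL, for inspection
req = urllib2.Request(base + '?', data)   # the POST request the script builds
# response = urllib2.urlopen(req, timeout=5)   # only works against a live service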
        
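And step ③ in isolation: a regex pulls the numeric status out of each response body, and anything other than status 1 counts as a failure. The two response bodies are assumed examples of the service's JSON shape.

import re

status_re = re.compile('"status":(\d)')
responses = ['{"status":1,"data":"ok"}',       # assumed success body
             '{"status":0,"error":"no hit"}']  # assumed failure body

statuscount = 0
for content in responses:
    status = status_re.findall(content)
    if not (status and status[0] == '1'):
        statuscount += 1
print 'statusFail:' + str(statuscount)         # -> statusFail:1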
