Python regex matching + crawler + response comparison: a worked example (personally tested)

① Read the two input txt files and use a regex to splice every merchant title into every query template (all combinations); see the sketch after this list.




② Assemble each text string (e.g. "找一下恒洁", roughly "find Hengjie") into a complete URL request and send it to the search service.

③ Save every response body, match the status field of interest, and count the failures; steps ② and ③ are sketched in isolation after the full script.
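
Before the full script, here is the core of step ① on its own: each line of the semantic-template file carries a {...} placeholder, and a regex substitution splices the merchant title into it. The template and title strings below are made-up examples, not the actual file contents.

# -*- coding: utf-8 -*-
import re

resub = re.compile('{.*}')          # matches the {...} placeholder slot
template = '找一下{商家}'           # assumed template line: "find {merchant}"
title = '恒洁'                      # assumed merchant title from the other file

text = resub.sub(title, template)
print text                          # -> 找一下恒洁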

#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib, urllib2, re, urlparse



def readsrcinfo(filepath):
    """Read the merchant file: line 0 is the request URL, the rest are merchant titles."""
    filepathu = unicode(filepath, 'utf-8')
    srcfile = open(filepathu)
    srcinfo = srcfile.readlines()
    srcfile.close()
    return srcinfo


def readseminfo(filepath):
    """Read the semantic-template file: one query template per line."""
    filepathu = unicode(filepath, 'utf-8')
    srcfile = open(filepathu)
    seminfo = srcfile.readlines()
    srcfile.close()
    return seminfo

    
def downtext(srcinfo, seminfo):
    urlinfo = srcinfo[0]    # first line of the merchant file: the full request URL
    maininfo = srcinfo[1:]  # remaining lines: merchant titles

    # Split the URL into its base and query string
    urla = urlinfo.split('?')
    url = urla[0] + '?'
    print url

    # Parameter dict: parse_qs returns lists, keep the first value of each key
    valueso = urlparse.parse_qs(urla[1])
    values = {}
    for key, value in valueso.items():
        values[key] = value[0].strip()

    # Each semantic template contains a {...} placeholder for the title
    resub = re.compile('{.*}')
    


    datafile = open('Totalldata.txt', 'w')
    for title in maininfo:
        # One error file per merchant title
        filename = title.strip().decode('utf-8') + '.txt'
        errorfile = open(filename, 'w')
        statuscount = 0
        for semi in seminfo:
            # Splice the title into the template's placeholder
            text = resub.sub(title.strip(), semi)

            # Set the 'text' parameter and POST the form-encoded data
            values['text'] = str(text.strip())
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)

            try:
                response = urllib2.urlopen(req, timeout=5)
                content = response.read()
                # The body is JSON-like; "status":1 means the query succeeded
                b = re.compile('"status":(\d)')
                status = re.findall(b, content)
                if status and status[0] == '1':
                    pass
                else:
                    statuscount = statuscount + 1
                    errorfile.write(text.strip() + '\t' + content)
                    errorfile.write('\n')

            except urllib2.HTTPError, e:
                print "The server couldn't fulfill the request"
                print "Error code:", e.code
                if e.code == 404:
                    print "Page not found!"
                elif e.code == 403:
                    print "Access denied!"
                else:
                    print "Something happened! Error code", e.code
                print "Return content:", e.read()
            except urllib2.URLError, err1:
                print "Failed to reach the server"
                print "The reason:", err1.reason

            print statuscount,
        # One summary line per title: how many templates failed for it
        datafile.write(title.strip() + '\t' + 'statusFail:' + str(statuscount) + '\n')
        errorfile.close()
    datafile.close()
            
    

if __name__ == '__main__':
    srcfile = '商家信息-大兴店.txt'   # merchant info: URL on line 1, titles after
    srcinfo = readsrcinfo(srcfile)

    semfile = '位置语义.txt'          # location semantic templates
    seminfo = readseminfo(semfile)

    downtext(srcinfo, seminfo)
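
For reference, step ② in isolation: the first line of the merchant file is a full URL whose query string carries the fixed parameters; the script re-encodes them with the spliced text before POSTing. The URL below is a made-up placeholder, not the real service endpoint.

# -*- coding: utf-8 -*-
import urllib, urllib2, urlparse

# Assumed URL shape; the real endpoint is the first line of 商家信息-大兴店.txt
urlinfo = 'http://example.com/search?city=beijing&text=placeholder'

base, query = urlinfo.split('?')
values = {}
for key, value in urlparse.parse_qs(query).items():
    values[key] = value[0].strip()

values['text'] = '找一下恒洁'             # the text produced in step ①
data = urllib.urlencode(values)           # form-encode the parameters
print base + '?' + data                   # equivalent GET URL, for inspection
req = urllib2.Request(base + '?', data)   # the POST request the script builds
# response = urllib2.urlopen(req, timeout=5)   # only works against a live service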
        
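And step ③ in isolation: a regex pulls the numeric status out of each response body, and anything other than status 1 counts as a failure. The two response bodies are assumed examples of the service's JSON shape.

import re

status_re = re.compile('"status":(\d)')
responses = ['{"status":1,"data":"ok"}',       # assumed success body
             '{"status":0,"error":"no hit"}']  # assumed failure body

statuscount = 0
for content in responses:
    status = status_re.findall(content)
    if not (status and status[0] == '1'):
        statuscount += 1
print 'statusFail:' + str(statuscount)         # -> statusFail:1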
