Python 多线程采集网页（完善版）

最新推荐文章于 2023-03-16 10:05:05 发布

lnz1989

最新推荐文章于 2023-03-16 10:05:05 发布

阅读量1.2k

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/rrr4578/article/details/8756951

版权

python 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

import threading,time,random,htmllib,urllib,formatter,string,re

def getPageNum(data):
    """Extract the page count announced in a listing page.

    Args:
        data: Page source to search.

    Returns:
        The first run of digits captured by the ``pageNum">共(\\d+)`` pattern,
        as a string, or the integer ``0`` when the pattern does not occur.
    """
    result = re.findall(r'pageNum">共(\d+)', data)
    # An explicit emptiness check replaces the former bare ``except:``, which
    # would also have swallowed KeyboardInterrupt/SystemExit.
    if not result:
        return 0
    return result[0]

def geturls(cururl):
    """Download *cururl*, log its page count and queue newly seen supplier links.

    The page is parsed with ``GetLinks``. Links that contain ``/suppliers/``
    past their first character and do not start with *cururl* are appended to
    the shared list ``l`` when not already present, and each newly added link
    is also appended to ``datadown/historyUrl.txt``. One line of the form
    ``<cururl>@@@@<page count>`` (count capped at 100) is appended to
    ``datadown/listUrl.txt``.

    Args:
        cururl: URL of the listing page to fetch.
    """
    global l
    # Close the response explicitly, even when read() fails.
    response = urllib.urlopen(cururl)
    try:
        data = response.read()
    finally:
        response.close()

    linkdemo = GetLinks()
    linkdemo.feed(data)
    linkdemo.close()

    # find() > 0 keeps the original filter exactly: links where the marker is
    # missing (-1) or sits at position 0 are rejected.
    urls = [
        item for item in linkdemo.links
        if item.find('/suppliers/') > 0 and not item.startswith(cururl)
    ]

    cursize = getPageNum(data)
    if int(cursize) > 100:
        cursize = 100

    # Several threads call this function concurrently. mylock is re-entrant,
    # so taking it here is safe even when the caller already holds it; it
    # keeps the check-then-append on ``l`` and the file appends atomic.
    with mylock:
        with open('datadown/listUrl.txt', 'a+') as output:
            output.write(cururl + "@@@@" + str(cursize) + "\n")

        added = []
        for ni in urls:
            if ni not in l:
                l.append(ni)
                added.append(ni)

        # Only touch the history file when there is something new to record.
        if added:
            with open('datadown/historyUrl.txt', 'a+') as output:
                output.writelines(ni + "\n" for ni in added)

class GetLinks(htmllib.HTMLParser):
    """HTML parser that collects the href of every anchor with visible text.

    After ``feed()``/``close()``, ``links`` holds the href values, in document
    order, of anchors whose href is non-empty and whose enclosed text is
    non-blank.
    """

    def __init__(self):
        self.links = []
        # href of the anchor currently open, or None when no anchor is open.
        self.link = None
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f)

    def anchor_bgn(self, href, name, type):
        """Start capturing the anchor's text and remember its href."""
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        """Record the remembered href if the anchor enclosed non-blank text."""
        # A closing tag with no open anchor (stray or nested </a>) has no
        # matching save_bgn(); ignore it instead of calling save_end().
        if self.link is None:
            return
        text = self.save_end().strip()
        link, self.link = self.link, None
        if link and text:
            self.links.append(link)


# Re-entrant lock that myThread.run acquires around its updates to num and l.
mylock = threading.RLock()
# Shared cursor: incremented once per loop iteration in myThread.run, which
# then reads l[num-1] as the next URL to crawl.
num=0
# Shared list of discovered URLs; geturls() appends links not already present.
l=[]

class myThread(threading.Thread):
    """Worker thread that crawls the URLs in the shared list ``l`` one by one.

    Args:
        name: Label stored as ``t_name`` and shown in progress messages.
    """

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        """Claim the next URL under ``mylock`` and crawl it, until none remain.

        The first thread to find ``l`` empty crawls the seed listing page to
        populate it. The loop ends when the shared cursor ``num`` has reached
        the end of ``l``.
        """
        global num, l
        while True:
            # ``with`` releases the lock even if geturls() raises, so a failed
            # fetch cannot leave the other workers blocked forever.
            with mylock:
                num += 1
                if len(l) == 0:
                    curl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
                    # Parenthesised single-argument print produces the same
                    # output as the print statement.
                    print('当前线程：%s，当前连接：%s，当前数量：%d\n' % (self.t_name, curl, num))
                    geturls(curl)

                if num > len(l):
                    # Nothing left to claim (the seed page gave no links, or
                    # other workers already took the tail). Give the slot
                    # back and stop instead of raising IndexError.
                    num -= 1
                    break

                curl = l[num - 1]
                print('当前线程：%s，当前连接：%s，当前数量：%d\n' % (self.t_name, curl, num + 1))

            # The fetch itself runs outside the lock so workers overlap.
            geturls(curl)

            if num >= len(l):
                print("\n\nl:\n")
                break


def test():
    """Create and start five ``myThread`` workers named 'A1' through 'A5'.

    The threads are started one after another and are not joined.
    """
    worker_names = ['A' + str(index) for index in range(1, 6)]
    for worker_name in worker_names:
        myThread(worker_name).start()

# Start the crawler threads only when this file is executed as a script.
if __name__== '__main__':
    test()