Collecting web pages with Python multithreading

# -*- coding: gbk -*-
# Python 2 only: the htmllib, formatter and urllib modules used below were
# removed or reorganized in Python 3. The coding declaration assumes the file
# is saved in GBK, the encoding the target pages appear to use, so the Chinese
# text in the regular expression below matches the raw page bytes.
import threading, time, htmllib, urllib, formatter, re


def getPageNum(data):
    # The listing page shows its total page count as '共N' right after the
    # pageNum element; pull that number out of the raw HTML.
    result = re.findall(r'pageNum">共(\d+)', data)
    return result[0] if result else '0'   # fall back to '0' if the counter is missing

def geturls(data, cururl):
    # Parse the fetched page and keep absolute links to other supplier listing
    # pages, skipping the page we just came from.
    linkdemo = GetLinks()
    linkdemo.feed(data)
    linkdemo.close()
    urls = []
    for item in linkdemo.links:
        if '/suppliers/' in item and item.startswith('http') and not item.startswith(cururl):
            urls.append(item)
    return urls

class GetLinks(htmllib.HTMLParser):
    """Collect the href of every anchor tag that wraps some visible text."""

    def __init__(self):
        self.links = []
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f)

    def anchor_bgn(self, href, name, type):
        # Called at each <a>: remember its href and start buffering the anchor text.
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        # Called at each </a>: keep the link only if the anchor had visible text.
        text = self.save_end().strip()
        if self.link and text:
            self.links.append(self.link)

class weburl:
    def __init__(self):
        self.l = []        # every unique listing URL discovered so far
        self.count = 0     # number of URLs fetched so far
        self.cururl = ""
        self.data = ""

    def getUrl(self):
        # Fetch the next page: the seed URL on the first call, afterwards the
        # next not-yet-visited entry of self.l.
        if len(self.l) == 0:
            self.cururl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
        else:
            self.cururl = self.l[self.count - 1]
        self.count += 1
        self.data = urllib.urlopen(self.cururl).read()
        urls = geturls(self.data, self.cururl)
        cursize = getPageNum(self.data)
        if int(cursize) > 100:
            cursize = 100          # cap the recorded page count at 100
        print "cururl:" + str(self.cururl) + " size:" + str(cursize)
        output = open('datadown/listUrl.txt', 'a+')   # the datadown/ directory must already exist
        output.write(self.cururl + "@@@@" + str(cursize) + "\n")
        output.close()
        return urls

    def getUniqueUrl(self):
        # Fetch the next page and remember only URLs we have not seen before,
        # logging each new one to historyUrl.txt.
        nl = self.getUrl()
        for ni in nl:
            if ni not in self.l:
                self.l.append(ni)
                output = open('datadown/historyUrl.txt', 'a+')
                output.write(ni + "\n")
                output.close()
        
                
        
# A single weburl instance is shared by all worker threads. The lock is
# created here but never actually taken in run() below, so self.count and
# self.l are updated without synchronization.
mylock = threading.RLock()
web = weburl()
   
class myThread(threading.Thread):
    def __init__(self, name, web):
        threading.Thread.__init__(self)
        self.t_name = name
        self.web = web

    def run(self):
        # Keep pulling pages until every URL discovered so far has been fetched.
        while True:
            #mylock.acquire()
            self.web.getUniqueUrl()
            print '\nThread(%s), Number: %d' % (self.t_name, self.web.count)
            if self.web.count >= len(self.web.l):
                #mylock.release()
                print 'len(l):' + str(len(self.web.l))
                break
            #mylock.release()
            time.sleep(0)   # yield so the other threads get a turn
                          
def test():
    # Start seven worker threads that all share the same weburl instance.
    for i in range(1, 8):
        threadi = myThread('A' + str(i), web)
        threadi.start()

if __name__ == '__main__':
    test()
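
The script above only runs on Python 2: htmllib, formatter and the old urllib interface no longer exist in Python 3, and with the lock commented out the threads update the shared list without any synchronization. Below is a minimal Python 3 sketch of the same idea, assuming the site and its /suppliers/ URL pattern are unchanged and that the pages are GBK-encoded; the names START_URL, LinkCollector and worker are illustrative, and queue.Queue plus a lock-protected set stand in for the unsynchronized weburl.l list.

# Hypothetical Python 3 sketch, not the original script.
import threading
import queue
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin

START_URL = 'http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/'


class LinkCollector(HTMLParser):
    """Collect the href of every <a> tag, like anchor_bgn/anchor_end above."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


todo = queue.Queue()            # URLs waiting to be fetched
seen = set()                    # URLs already queued, guarded by seen_lock
seen_lock = threading.Lock()


def worker():
    while True:
        try:
            url = todo.get(timeout=5)   # give up once no new work shows up
        except queue.Empty:
            return
        try:
            data = urlopen(url).read().decode('gbk', 'ignore')  # assumed page encoding
        except Exception:
            continue                    # skip pages that fail to download
        parser = LinkCollector()
        parser.feed(data)
        parser.close()
        for link in parser.links:
            link = urljoin(url, link)   # make relative links absolute
            if '/suppliers/' in link and not link.startswith(url):
                with seen_lock:
                    if link in seen:
                        continue
                    seen.add(link)
                todo.put(link)


if __name__ == '__main__':
    seen.add(START_URL)
    todo.put(START_URL)
    threads = [threading.Thread(target=worker) for _ in range(7)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

With a queue there is no need for the count >= len(l) termination check: each worker simply exits after the queue has stayed empty for a few seconds.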
    
    

