# python 多线程采集网页完善版 (multi-threaded web page collector in Python, improved version)

import threading,time,random,htmllib,urllib,formatter,string,re

def getPageNum(data):
    """Return the first page count found in the page text ``data``.

    The count is the run of digits that directly follows the marker
    ``pageNum">共``.

    Returns:
        The digits as a string when the marker is present, otherwise the
        integer 0. Callers convert the value with ``int()`` / ``str()``.
    """
    result = re.findall(r'pageNum">共(\d+)', data)
    # An empty match list is the only failure mode of the lookup, so test for
    # it directly instead of hiding every possible error behind a bare except.
    if not result:
        return 0
    return result[0]

def _append_line(path, text):
    """Append ``text`` plus a newline to ``path``, closing the file even on error."""
    output = open(path, 'a+')
    try:
        output.write(text + "\n")
    finally:
        output.close()


def geturls(cururl):
    """Download ``cururl``, log its page count and queue its new supplier links.

    Steps:
      1. Read the page and collect anchor hrefs with ``GetLinks``.
      2. Keep hrefs that contain ``/suppliers/`` (not at position 0) and do
         not start with ``cururl``.
      3. Append ``cururl@@@@<page count>`` to ``datadown/listUrl.txt``; a
         count above 100 is written as 100.
      4. Append every kept href that is not yet in the shared list ``l`` to
         ``l`` and to ``datadown/historyUrl.txt``.

    Steps 3 and 4 run under ``mylock`` so that concurrent worker threads
    cannot queue the same URL twice or interleave their file writes. The lock
    is an ``RLock``, so the seeding call made by a worker that already holds
    it does not deadlock.
    """
    page = urllib.urlopen(cururl)
    try:
        data = page.read()
    finally:
        # Release the connection even if the read fails.
        page.close()

    linkdemo = GetLinks()
    linkdemo.feed(data)
    linkdemo.close()

    # find() > 0 keeps the original acceptance rule: the marker must be
    # present, but a match at position 0 (e.g. a site-relative href) is
    # rejected.
    urls = [item for item in linkdemo.links
            if item.find('/suppliers/') > 0 and not item.startswith(cururl)]

    cursize = getPageNum(data)
    if int(cursize) > 100:
        cursize = 100

    mylock.acquire()
    try:
        _append_line('datadown/listUrl.txt', cururl + "@@@@" + str(cursize))
        for ni in urls:
            if ni not in l:
                l.append(ni)
                _append_line('datadown/historyUrl.txt', ni)
    finally:
        mylock.release()


class GetLinks(htmllib.HTMLParser):
    def __init__(self):
        self.links = []
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f)  
 
    def anchor_bgn(self, href, name, type):
        self.save_bgn()  
        self.link = href  
 
    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.link and text:  
            self.links.append(self.link)    


        
# Re-entrant lock held by myThread.run while it updates `num` and picks the next URL.
mylock = threading.RLock()
# Shared counter incremented by every worker loop iteration; `num - 1` is used as the index into `l`.
num=0
# Shared list of discovered URLs: geturls() appends to it, myThread.run reads from it.
l=[]
   
class myThread(threading.Thread):
    """Worker thread that repeatedly claims the next queued URL and crawls it.

    All workers share the module-level URL list ``l`` and the counter ``num``
    (number of entries of ``l`` already claimed). Both are only read and
    updated while ``mylock`` is held.
    """

    def __init__(self, name):
        threading.Thread.__init__(self)
        # Label used in the progress messages.
        self.t_name = name

    def _claim_next(self):
        """Return the next unclaimed URL from ``l``, or None when none is left.

        The first caller finds ``l`` empty and seeds it from the start page
        while still holding the lock, so the other workers wait for the seed.
        """
        global num
        mylock.acquire()
        try:
            if len(l) == 0:
                curl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
                print('当前线程:%s,当前连接:%s,当前数量:%d\n' % (self.t_name, curl, num + 1))
                geturls(curl)

            # Check before incrementing: indexing past the end would raise
            # while the lock is held and block every other worker forever.
            if num >= len(l):
                return None

            num += 1
            curl = l[num - 1]
            print('当前线程:%s,当前连接:%s,当前数量:%d\n' % (self.t_name, curl, num + 1))
            return curl
        finally:
            # Always release, including when seeding raises.
            mylock.release()

    def run(self):
        """Crawl claimed URLs until the shared queue has no unclaimed entry."""
        while True:
            curl = self._claim_next()
            if curl is None:
                break
            # Fetch outside the lock so downloads can overlap across threads.
            geturls(curl)
        print("\n\nl:\n")
 
            
def test():
    """Create and start five worker threads named A1 through A5."""
    worker_names = ['A%d' % index for index in range(1, 6)]
    for worker_name in worker_names:
        worker = myThread(worker_name)
        worker.start()

# Start the worker threads only when this file is run as a script.
if __name__== '__main__':
    test()
    
    

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值