Collecting web pages with Python multithreading

# -*- coding: gbk -*-
# Python 2 only: the htmllib, formatter and urllib modules used below were
# removed or reorganized in Python 3. The coding declaration assumes the file
# is saved in GBK, the encoding the target pages appear to use, so the Chinese
# text in the regular expression below matches the raw page bytes.
import threading, time, htmllib, urllib, formatter, re


def getPageNum(data):
    # The listing page shows its total page count as '共N' right after the
    # pageNum element; pull that number out of the raw HTML.
    result = re.findall(r'pageNum">共(\d+)', data)
    return result[0] if result else '0'   # fall back to '0' if the counter is missing

def geturls(data, cururl):
    # Parse the fetched page and keep absolute links to other supplier listing
    # pages, skipping the page we just came from.
    linkdemo = GetLinks()
    linkdemo.feed(data)
    linkdemo.close()
    urls = []
    for item in linkdemo.links:
        if '/suppliers/' in item and item.startswith('http') and not item.startswith(cururl):
            urls.append(item)
    return urls

class GetLinks(htmllib.HTMLParser):
    """Collect the href of every anchor tag that wraps some visible text."""

    def __init__(self):
        self.links = []
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f)

    def anchor_bgn(self, href, name, type):
        # Called at each <a>: remember its href and start buffering the anchor text.
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        # Called at each </a>: keep the link only if the anchor had visible text.
        text = self.save_end().strip()
        if self.link and text:
            self.links.append(self.link)

class weburl:
    def __init__(self):
        self.l = []        # every unique listing URL discovered so far
        self.count = 0     # number of URLs fetched so far
        self.cururl = ""
        self.data = ""

    def getUrl(self):
        # Fetch the next page: the seed URL on the first call, afterwards the
        # next not-yet-visited entry of self.l.
        if len(self.l) == 0:
            self.cururl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
        else:
            self.cururl = self.l[self.count - 1]
        self.count += 1
        self.data = urllib.urlopen(self.cururl).read()
        urls = geturls(self.data, self.cururl)
        cursize = getPageNum(self.data)
        if int(cursize) > 100:
            cursize = 100          # cap the recorded page count at 100
        print "cururl:" + str(self.cururl) + " size:" + str(cursize)
        output = open('datadown/listUrl.txt', 'a+')   # the datadown/ directory must already exist
        output.write(self.cururl + "@@@@" + str(cursize) + "\n")
        output.close()
        return urls

    def getUniqueUrl(self):
        # Fetch the next page and remember only URLs we have not seen before,
        # logging each new one to historyUrl.txt.
        nl = self.getUrl()
        for ni in nl:
            if ni not in self.l:
                self.l.append(ni)
                output = open('datadown/historyUrl.txt', 'a+')
                output.write(ni + "\n")
                output.close()
        
                
        
# A single weburl instance is shared by all worker threads. The lock is
# created here but never actually taken in run() below, so self.count and
# self.l are updated without synchronization.
mylock = threading.RLock()
web = weburl()
   
class myThread(threading.Thread):
    def __init__(self, name, web):
        threading.Thread.__init__(self)
        self.t_name = name
        self.web = web

    def run(self):
        # Keep pulling pages until every URL discovered so far has been fetched.
        while True:
            #mylock.acquire()
            self.web.getUniqueUrl()
            print '\nThread(%s), Number: %d' % (self.t_name, self.web.count)
            if self.web.count >= len(self.web.l):
                #mylock.release()
                print 'len(l):' + str(len(self.web.l))
                break
            #mylock.release()
            time.sleep(0)   # yield so the other threads get a turn
                          
def test():
    # Start seven worker threads that all share the same weburl instance.
    for i in range(1, 8):
        threadi = myThread('A' + str(i), web)
        threadi.start()

if __name__ == '__main__':
    test()
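
The script above only runs on Python 2: htmllib, formatter and the old urllib interface no longer exist in Python 3, and with the lock commented out the threads update the shared list without any synchronization. Below is a minimal Python 3 sketch of the same idea, assuming the site and its /suppliers/ URL pattern are unchanged and that the pages are GBK-encoded; the names START_URL, LinkCollector and worker are illustrative, and queue.Queue plus a lock-protected set stand in for the unsynchronized weburl.l list.

# Hypothetical Python 3 sketch, not the original script.
import threading
import queue
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin

START_URL = 'http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/'


class LinkCollector(HTMLParser):
    """Collect the href of every <a> tag, like anchor_bgn/anchor_end above."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


todo = queue.Queue()            # URLs waiting to be fetched
seen = set()                    # URLs already queued, guarded by seen_lock
seen_lock = threading.Lock()


def worker():
    while True:
        try:
            url = todo.get(timeout=5)   # give up once no new work shows up
        except queue.Empty:
            return
        try:
            data = urlopen(url).read().decode('gbk', 'ignore')  # assumed page encoding
        except Exception:
            continue                    # skip pages that fail to download
        parser = LinkCollector()
        parser.feed(data)
        parser.close()
        for link in parser.links:
            link = urljoin(url, link)   # make relative links absolute
            if '/suppliers/' in link and not link.startswith(url):
                with seen_lock:
                    if link in seen:
                        continue
                    seen.add(link)
                todo.put(link)


if __name__ == '__main__':
    seen.add(START_URL)
    todo.put(START_URL)
    threads = [threading.Thread(target=worker) for _ in range(7)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

With a queue there is no need for the count >= len(l) termination check: each worker simply exits after the queue has stayed empty for a few seconds.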
    
    

