python线程池实现网络爬虫

参考:http://blog.csdn.net/sding/article/details/5538089

http://blog.daviesliu.net/2006/10/09/234822/

首先是创建线程池:

线程池主要由一个线程列表和两个队列(任务队列、结果队列)维护:线程列表存放已启动的工作线程;任务队列由用户添加任务,工作线程不断从任务队列中取出任务执行,并把返回值放入结果队列;任务队列持续为空超过超时时间后,工作线程自动退出。

import Queue, threading, sys  
from threading import Thread  
import time  
import urllib  
   
# working thread  
class Worker(Thread):
    """Daemon thread that pulls jobs off a shared queue and runs them.

    A job is a ``(callable, args, kwds)`` tuple.  The return value of every
    job that completes is put on the result queue.  The thread starts itself
    on construction and exits once the work queue has stayed empty for
    ``Worker.timeout`` seconds.
    """

    worker_count = 0  # number of workers created so far; used to assign ids
    timeout = 1       # seconds to wait for a job before the worker exits

    def __init__(self, workQueue, resultQueue, **kwds):
        """Create the worker and start it immediately.

        workQueue   -- queue of ``(callable, args, kwds)`` job tuples
        resultQueue -- queue that receives each job's return value
        kwds        -- extra keyword arguments forwarded to Thread.__init__
        """
        Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        self.setDaemon(True)
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.start()

    def run(self):
        """Fetch and execute jobs until the work queue stays empty."""
        while True:
            # Only the queue read may end the loop: keeping it in its own
            # try block means a job that happens to raise Queue.Empty is
            # treated as a failed job, not as "no more work".
            try:
                func, args, kwds = self.workQueue.get(timeout=Worker.timeout)
            except Queue.Empty:
                break

            try:
                res = func(*args, **kwds)
            except Exception:
                # Report the failure and keep serving the queue.  Re-raising
                # here would kill this thread and could leave queued jobs
                # without any worker to run them.
                print("worker[%2d] %s" % (self.id, sys.exc_info()[:2]))
                continue

            print("worker[%2d]: %s" % (self.id, str(res)))
            self.resultQueue.put(res)
                 
class WorkerManager:
    """Minimal thread pool: owns the job queue, the result queue and the workers."""

    def __init__(self, num_of_workers=10, timeout=2):
        """Create both queues and start ``num_of_workers`` worker threads.

        ``timeout`` is only stored on the instance; nothing in this class
        reads it.
        """
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        """Create ``num_of_workers`` workers sharing this manager's queues."""
        for _ in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue)
            self.workers.append(worker)

    def wait_for_complete(self):
        """Block until every worker thread has terminated.

        The worker list is emptied in the process.
        """
        # join() without a timeout returns only after the thread has ended,
        # so there is never a live worker left to put back on the list.
        while self.workers:
            worker = self.workers.pop()
            worker.join()
        print("All jobs are completed.")

    def add_job(self, callable, *args, **kwds):
        """Queue ``callable(*args, **kwds)`` for execution by a worker."""
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        """Return the next job result; arguments are passed to Queue.get()."""
        return self.resultQueue.get(*args, **kwds)

下面是线程池的使用(假设上面的线程池代码保存为 thread_pool.py):

import urllib2  
import time  
import socket  
from datetime import datetime  
from thread_pool import *  
   
def main():
    """Queue one download job per site on a 50-worker pool and wait for them."""
    sites = {"sina":"http://www.sina.com.cn",
             "sohu":"http://www.sohu.com",
             "yahoo":"http://www.yahoo.com",
             "xiaonei":"http://www.xiaonei.com",
             "qihoo":"http://www.qihoo.com",
             "laohan":"http://www.laohan.org",
             "eyou":"http://www.eyou.com",
             "chinaren":"http://www.chinaren.com",
             "douban":"http://www.douban.com",
             "163":"http://www.163.com",
             "daqi":"http://www.daqi.com",
             "qq":"http://www.qq.com",
             "baidu_1":"http://www.baidu.com/s?wd=asdfasdf",
             "baidu_2":"http://www.baidu.com/s?wd=dddddddf",
             "google_1":"http://www.baidu.com/s?wd=sadfas",
             "google_2":"http://www.baidu.com/s?wd=sadflasd",
             "hainei":"http://www.hainei.com",
             "microsoft":"http://www.microsoft.com",
             "wlzuojia":"http://www.wlzuojia.com"}

    # Default timeout (seconds) for sockets created from here on.
    socket.setdefaulttimeout(10)

    print('start testing')
    pool = WorkerManager(50)
    for site_name, site_link in sites.items():
        pool.add_job(do_get_con, site_name, site_link)
    pool.wait_for_complete()
    print('end testing')
   
def do_get_con(url_name,url_link):
    """Fetch ``url_link`` and save the response body to ``/tmp/ttt/<url_name>``.

    Best effort: any failure is reported on stdout and then swallowed, so a
    single bad URL never raises into the caller.  Always returns None.
    """
    import os  # local import: the file's import block does not provide it

    target = "/tmp/ttt/%s" % url_name
    try:
        fd = urllib2.urlopen(url_link)
        try:
            data = fd.read()
        finally:
            # Release the connection even when the read fails.
            fd.close()

        # Create the output directory on first use.  Several threads may
        # race here, so an "already exists" error is tolerated.
        out_dir = os.path.dirname(target)
        try:
            os.makedirs(out_dir)
        except OSError:
            if not os.path.isdir(out_dir):
                raise

        # The body is raw bytes, hence binary mode; "with" closes the file
        # even when the write fails.
        with open(target, "wb") as f_hand:
            f_hand.write(data)
    except Exception as e:
        print("do_get_con[%s]: %s failed: %r" % (url_name, url_link, e))
   
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":  
    main()  


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值