Here is one approach, using multiprocessing. (Many thanks to @Voo for suggesting many improvements to the code.)
import multiprocessing as mp
import logging
import Queue
import time

logger = mp.log_to_stderr(logging.DEBUG)
# or, uncomment this instead to silence debug and info messages:
# logger = mp.log_to_stderr(logging.WARN)

def worker(url_queue, seen):
    while True:
        url = url_queue.get()
        if url not in seen:
            logger.info('downloading {u}'.format(u=url))
            seen[url] = True
            # Replace this with code to download url
            # urllib2.urlopen(...)
            time.sleep(0.5)
            content = url
            logger.debug('parsing {c}'.format(c=content))
            # Replace this with code that finds interesting links and
            # puts them in url_queue
            for i in range(3):
                if content < 5:
                    u = 2 * content + i - 1
                    logger.debug('adding {u} to url_queue'.format(u=u))
                    time.sleep(0.5)
                    url_queue.put(u)
        else:
            logger.debug('skipping {u}; seen before'.format(u=url))
        url_queue.task_done()

if __name__ == '__main__':
    num_workers = 4
    url_queue = mp.JoinableQueue()
    manager = mp.Manager()
    seen = manager.dict()

    # Prime the url queue with at least one url
    url_queue.put(1)

    downloaders = [mp.Process(target=worker, args=(url_queue, seen))
                   for i in range(num_workers)]
    for p in downloaders:
        p.daemon = True
        p.start()
    url_queue.join()
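
The download placeholder in worker can be filled in with urllib2 (on Python 3, urllib.request). A minimal sketch, assuming the module-level logger defined above; fetch is a hypothetical helper, and retries and rate limiting are left out:

import urllib2  # Python 3: use urllib.request instead

def fetch(url, timeout=10):
    # Return the page body, or None on failure, so the worker loop
    # can log the error, skip the url, and keep running.
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except urllib2.URLError as e:
        logger.warn('failed to fetch {u}: {e}'.format(u=url, e=e))
        return None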
> A pool of (4) worker processes is created.
> There is a JoinableQueue, called url_queue.
> Each worker gets a url from url_queue, finds new urls, and adds them to url_queue.
> Only after adding the new items does it call url_queue.task_done(); the sketch after this list shows why that ordering matters.
> The main process calls url_queue.join(). This blocks the main process until task_done() has been called for every task put on url_queue.
> Since the worker processes have their daemon attribute set to True, they too end when the main process ends.
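
The put-before-task_done ordering in the fourth point is what makes join() reliable. Here is a minimal sketch (not from the original answer) of the failure mode when the two calls are reversed:

import multiprocessing as mp

def bad_worker(q):
    item = q.get()
    q.task_done()    # wrong order: the unfinished-task count hits zero here,
    q.put(item + 1)  # so a concurrent q.join() can return before this
                     # newly added item is ever processed

if __name__ == '__main__':
    q = mp.JoinableQueue()
    q.put(1)
    p = mp.Process(target=bad_worker, args=(q,))
    p.daemon = True
    p.start()
    q.join()  # may unblock while item 2 is still sitting on the queue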