Implementing a Crawler with a Thread Pool
1 How to use a thread pool
Instantiate a thread pool object:
from multiprocessing.dummy import Pool
pool = Pool(processes=5)
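multiprocessing.dummy exposes the same API as multiprocessing.Pool but backs it with threads rather than processes. As a quick orientation, here is a minimal, self-contained sketch of the calls used in this section; the fetch function and the example URL are placeholders, not part of the spider:

from multiprocessing.dummy import Pool   # thread pool with the multiprocessing.Pool interface
import requests

def fetch(url):
    # Runs in a worker thread and returns the HTTP status code.
    return requests.get(url).status_code

if __name__ == "__main__":
    pool = Pool(processes=5)                                   # 5 worker threads
    result = pool.apply_async(fetch, args=("https://example.com",))
    print(result.get(timeout=10))                              # block until this task finishes
    pool.close()                                               # no further tasks may be submitted
    pool.join()                                                # wait for the workers to exit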
Combine sending the request, extracting the data, and saving it into a single function, then hand that function to the thread pool for asynchronous execution with pool.apply_async(func):
def exetute_requests_item_save(self):
    url = self.queue.get()                            # take a URL from the queue
    html_str = self.parse_url(url)                    # send the request
    content_list = self.get_content_list(html_str)    # extract the data
    self.save_content_list(content_list)              # save it
    self.total_response_num += 1

self.pool.apply_async(self.exetute_requests_item_save)
Add a callback. apply_async runs the function asynchronously, but only once. To make it run repeatedly, pass a callback that re-submits the task, so that _callback effectively calls itself recursively; a condition for ending the recursion must also be specified:
def _callback(self, temp):
    if self.is_running:    # keep re-submitting until the exit flag is cleared
        self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)

self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)
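Note that apply_async passes the task's return value to the callback (unused here, hence the temp parameter), and the callback only fires when the task finishes without raising. If a request fails with an exception, the error stays inside the AsyncResult, no new task is submitted, and the response counter is never incremented, so in practice an error_callback or a try/except inside the task is worth adding.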
Decide when the program ends: it can stop once the number of responses received equals the number of URLs requested (a minimal runnable sketch combining the callback and this exit condition follows the snippet below):
while True:
    time.sleep(0.0001)    # avoid a tight busy loop
    if self.total_response_num >= self.total_requests_num:
        self.is_running = False
        break
self.pool.close()
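Before moving to the full spider, the following minimal, self-contained sketch ties the callback trick and the exit condition together; the task (squaring numbers taken from a queue) and all names in it are invented for illustration:

from multiprocessing.dummy import Pool
from queue import Queue
import time

class MiniDemo:
    # Hypothetical stand-alone demo: the "requests" are just numbers to square.
    def __init__(self):
        self.queue = Queue()
        self.pool = Pool(3)
        self.is_running = True
        self.total_requests_num = 0
        self.total_response_num = 0

    def execute_task(self):
        n = self.queue.get()
        print(n, "->", n * n)        # stand-in for request / extract / save
        self.total_response_num += 1

    def _callback(self, temp):
        if self.is_running:          # keep the worker busy until the exit flag is cleared
            self.pool.apply_async(self.execute_task, callback=self._callback)

    def run(self):
        for n in range(10):          # enqueue the "requests" and count them
            self.queue.put(n)
            self.total_requests_num += 1
        for _ in range(3):           # seed one task per worker thread
            self.pool.apply_async(self.execute_task, callback=self._callback)
        while True:                  # exit once every request has produced a response
            time.sleep(0.0001)
            if self.total_response_num >= self.total_requests_num:
                self.is_running = False
                break
        self.pool.close()

if __name__ == "__main__":
    MiniDemo().run()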
2 A complete crawler implementation using a thread pool
import requests
from lxml import etree
from queue import Queue
from multiprocessing.dummy import Pool
import time


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/64.0.3282.186 Safari/537.36"}
        self.queue = Queue()
        self.pool = Pool(5)
        self.is_running = True
        self.total_requests_num = 0
        self.total_response_num = 0

    def get_url_list(self):
        # Put every page URL into the queue and count the requests.
        for i in range(1, 14):
            self.queue.put(self.url_temp.format(i))
            self.total_requests_num += 1

    def parse_url(self, url):
        # Send the request and return the decoded response body.
        return requests.get(url, headers=self.headers).content.decode()

    def get_content_list(self, html_str):
        # Extract the text of each item on the page.
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            content = {}
            content["content"] = div.xpath(".//div[@class='content']/span/text()")
            print(content)
            content_list.append(content)
        return content_list

    def save_content_list(self, content_list):
        pass  # save to a file or database as needed

    def exetute_requests_item_save(self):
        # Request, extract and save one URL, then count the response.
        url = self.queue.get()
        html_str = self.parse_url(url)
        content_list = self.get_content_list(html_str)
        self.save_content_list(content_list)
        self.total_response_num += 1

    def _callback(self, temp):
        # Re-submit the task as long as the spider is still running.
        if self.is_running:
            self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)

    def run(self):
        self.get_url_list()
        for i in range(2):  # seed the pool with two tasks; the callbacks keep it busy
            self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)
        while True:  # wait until every URL has produced a response
            time.sleep(0.0001)
            if self.total_response_num >= self.total_requests_num:
                self.is_running = False
                break
        self.pool.close()


if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()
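One caveat about the code above: total_response_num is incremented from several worker threads at once, and += on an attribute is not atomic, so increments can in principle be lost and the exit condition may never be reached. A common hardening step, not part of the original code, is to guard the counter with a threading.Lock:

import threading

# Hypothetical hardening: create the lock in __init__ ...
self.lock = threading.Lock()

# ... and guard the shared counter at the end of exetute_requests_item_save:
with self.lock:
    self.total_response_num += 1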