爬取糗事百科,正常爬取
import datetime
import requests
from lxml import etree
class Qiushi(object):
    """Sequential crawler for trending entry titles on qiushibaike.com."""

    def __init__(self):
        # Page-number placeholder is filled in by make_url().
        self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

    def make_url(self):
        """Build the full URL list for pages 1..13."""
        return [self.url.format(page) for page in range(1, 14)]

    def get_data(self, url):
        """Fetch one page and return its raw body bytes."""
        resp = requests.get(url, headers=self.headers)
        return resp.content

    def title_list(self, title_html):
        """Extract entry titles from one page's html.

        Returns a list of {'title': <text>} dicts; <li> nodes without a
        title link (ads / placeholders) are skipped.
        """
        doc = etree.HTML(title_html)
        items = []
        for node in doc.xpath('//div[@class="recommend-article"]/ul/li'):
            texts = node.xpath('./div[1]/a/text()')
            if texts:
                items.append({'title': texts[0]})
        return items

    def save_item(self, temp_list):
        """Persist the extracted items (currently just prints them)."""
        for item in temp_list:
            print(item)

    def run(self):
        """Crawl every page in order: fetch -> parse -> save."""
        for url in self.make_url():
            self.save_item(self.title_list(self.get_data(url)))
if __name__ == '__main__':
    # Time the full sequential crawl as a baseline for the later
    # threaded / multiprocess / pooled variants.
    started = datetime.datetime.now()
    spider = Qiushi()
    spider.run()
    finished = datetime.datetime.now()
    print('耗时:{}'.format(finished - started))
# 耗时:0:00:02.556753 正常
python的Queue模块
线程优先级队列
Queue模块提供了同步的,线程安全的队列类,实现了锁原语
先入先出队列:Queue
后入先出队列:LifoQueue
优先级队列:PriorityQueue
Queue模块常用方法
Queue.qsize() 返回队列的大小
Queue.empty() 如果队列为空,返回True,反之False
Queue.full() 如果队列满了,返回True,反之False
Queue.full 与 maxsize 大小对应
Queue.get([block[, timeout]]) 从队列中取出一个元素,timeout为等待超时时间
Queue.get_nowait() 相当于Queue.get(False)
Queue.put(item[, block[, timeout]]) 向队列写入一个元素,timeout为等待超时时间
Queue.put_nowait(item) 相当Queue.put(item, False)
Queue.task_done() 在完成一项工作之后,Queue.task_done()函数向任务已经完成的队列发送一个信号
Queue.join() 实际上意味着等到队列为空,再执行别的操作
q.join()
# join会阻塞当前线程,前提是计数为0
# q队列在实例化时,默认计数是0
# put成功,计数+1
# q.task_done会调用时,计数-1
q.task_done()
补充python多线程thread join() 作用
join方法的阻塞,等待子线程结束
依次检验线程池中的线程是否结束,没有结束就阻塞直到线程结束,如果结束则跳转执行下一个线程的join函数。
阻塞主进程,专注于执行多线程中的程序。
多线程多join的情况下,依次执行各线程的join方法,前头一个结束了才能执行后面一个。
无参数,则等待到该线程结束,才开始执行下一个线程的join。
参数timeout为线程的阻塞时间,如 timeout=2 就是最多等待这个线程2s,之后就不再等待,继续执行下面的代码。
多线程
t = threading.Thread(target=func, args=(arg1,))  # args必须是元组
t.setDaemon(True) # 设置为守护线程:主线程结束,子线程随之结束
t.start()
使用多线程爬取糗事百科
import datetime
from queue import Queue
from threading import Thread
import requests
from lxml import etree
class Qiushi(object):
    """Thread-based crawler: a url -> html -> item pipeline linked by queues."""

    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        # Hand-off queues between the three pipeline stages.
        self.url_q = Queue()
        self.html_q = Queue()
        self.item_q = Queue()

    def make_url(self):
        """Seed the url queue with pages 1..13."""
        for page in range(1, 14):
            self.url_q.put(self.url.format(page))

    def get_data(self):
        """Worker loop: pull a url, fetch it, push the raw html bytes."""
        while True:
            url = self.url_q.get()
            resp = requests.get(url, headers=self.headers)
            self.html_q.put(resp.content)
            self.url_q.task_done()  # balance the matching get()

    def title_list(self):
        """Worker loop: pull html, extract titles, push the item list."""
        while True:
            page = self.html_q.get()
            doc = etree.HTML(page)
            items = []
            for node in doc.xpath('//div[@class="recommend-article"]/ul/li'):
                texts = node.xpath('./div[1]/a/text()')
                if texts:  # skip <li> nodes without a title link
                    items.append({'title': texts[0]})
            self.item_q.put(items)
            self.html_q.task_done()

    def save_item(self):
        """Worker loop: pull item lists and print every item."""
        while True:
            items = self.item_q.get()
            for item in items:
                print(item)
            self.item_q.task_done()

    def run(self):
        """Start daemon worker threads, then wait until every queue drains."""
        self.make_url()
        workers = [Thread(target=self.get_data) for _ in range(8)]
        workers.append(Thread(target=self.title_list))
        workers.append(Thread(target=self.save_item))
        for worker in workers:
            worker.setDaemon(True)  # let the process exit once the queues drain
            worker.start()
        # join() unblocks when every put() has a matching task_done().
        for pending in (self.url_q, self.html_q, self.item_q):
            pending.join()
if __name__ == '__main__':
    # Time the threaded crawl for comparison with the sequential baseline.
    started = datetime.datetime.now()
    spider = Qiushi()
    spider.run()
    finished = datetime.datetime.now()
    print('耗时:{}'.format(finished - started))
# 使用线程的耗时
# 耗时:0:00:00.723405 5
# 耗时:0:00:00.643361 7
# 耗时:0:00:00.493194 8
# 耗时:0:00:00.742133 9
# 耗时:0:00:16.628856 10
多进程
p = multiprocessing.Process(target=func, args=(arg1,))  # args必须是元组
p.daemon = True
p.start()
使用多进程爬取,只需在线程的基础上稍加改进
import datetime
# from queue import Queue
# from threading import Thread
from multiprocessing import JoinableQueue as Queue
from multiprocessing import Process
import requests
from lxml import etree
class Qiushi(object):
    """Process-based crawler: the same queue pipeline, run with processes."""

    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        # JoinableQueue instances so task_done()/join() work across processes.
        self.url_q = Queue()
        self.html_q = Queue()
        self.item_q = Queue()

    def make_url(self):
        """Seed the url queue with pages 1..13."""
        for page in range(1, 14):
            self.url_q.put(self.url.format(page))

    def get_data(self):
        """Worker loop: url -> raw html bytes."""
        while True:
            url = self.url_q.get()
            resp = requests.get(url, headers=self.headers)
            self.html_q.put(resp.content)
            self.url_q.task_done()  # balance the matching get()

    def title_list(self):
        """Worker loop: html -> list of {'title': ...} dicts."""
        while True:
            page = self.html_q.get()
            doc = etree.HTML(page)
            items = []
            for node in doc.xpath('//div[@class="recommend-article"]/ul/li'):
                texts = node.xpath('./div[1]/a/text()')
                if texts:  # skip <li> nodes without a title link
                    items.append({'title': texts[0]})
            self.item_q.put(items)
            self.html_q.task_done()

    def save_item(self):
        """Worker loop: print every extracted item."""
        while True:
            items = self.item_q.get()
            for item in items:
                print(item)
            self.item_q.task_done()

    def run(self):
        """Start daemon worker processes and wait for the queues to drain."""
        self.make_url()
        workers = [Process(target=self.get_data) for _ in range(13)]
        workers.append(Process(target=self.title_list))
        workers.append(Process(target=self.save_item))
        for proc in workers:
            proc.daemon = True  # children die with the parent process
            proc.start()
        # join() unblocks when every put() has a matching task_done().
        for pending in (self.url_q, self.html_q, self.item_q):
            pending.join()
if __name__ == '__main__':
    # Time the multiprocess crawl for comparison with the other variants.
    started = datetime.datetime.now()
    spider = Qiushi()
    spider.run()
    finished = datetime.datetime.now()
    print('耗时:{}'.format(finished - started))
# 线程
# 耗时:0:00:00.723405 5
# 耗时:0:00:00.643361 7
# 耗时:0:00:00.493194 8
# 耗时:0:00:00.742133 9
# 耗时:0:00:16.628856 10
# 进程
# 耗时:0:00:00.522494 8
# 耗时:0:00:00.238884 13
线程池
from multiprocessing.dummy import Pool
pool = Pool() # 默认大小 os.cpu_count() or 1 # cpu_count() 指的cpu逻辑核心数
pool.apply_async(func, callback=callback_func)
#func函数的返回值作为callback指定回调函数参数!
pool.close()
使用线程池爬取
import time
import requests
import datetime
from lxml import etree
from queue import Queue
from multiprocessing.dummy import Pool
# from threading import Thread
class Qiushi(object):
    """Thread-pool crawler driven by multiprocessing.dummy.Pool.

    Each pool task handles exactly one url end-to-end; its callback
    reschedules the next task while the spider is running.  run() exits
    once every queued request has produced a response
    (total_url_nums == total_response_nums).
    """

    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        self.url_q = Queue()
        self.pool = Pool(6)
        self.is_running = True
        # Counters compared in run() to decide when to stop.
        self.total_url_nums = 0
        self.total_response_nums = 0

    def make_url(self):
        """Seed the url queue with pages 1..13 and count the total requests."""
        for i in range(1, 14):
            self.url_q.put(self.url.format(i))
            self.total_url_nums += 1

    def get_data(self, url):
        """Fetch one page and return its raw body bytes."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def title_list(self, title_html):
        """Extract entry titles; returns a list of {'title': ...} dicts."""
        html = etree.HTML(title_html)
        temp_list = []
        for i in html.xpath('//div[@class="recommend-article"]/ul/li'):
            title = i.xpath('./div[1]/a/text()')
            if title:  # skip <li> nodes without a title link
                temp_list.append({'title': title[0]})
        return temp_list

    def save_item(self, temp_list):
        """Persist the extracted items (currently just prints them)."""
        for temp in temp_list:
            print(temp)

    def execute_request_item_save(self):
        """Process exactly one url until done: fetch, parse, save."""
        url = self.url_q.get()
        content = self.get_data(url)
        temp_list = self.title_list(content)
        self.save_item(temp_list)
        # One more response completed.
        self.total_response_nums += 1
        return 'heihei'  # value handed to the callback

    def _callback(self, xxx):
        """Pool callback: reschedule the next unit of work while running."""
        print(xxx)
        if self.is_running:
            # BUGFIX: pass the callable itself.  The original wrote
            # self.execute_request_item_save() — which ran the task
            # synchronously here and handed its return value ('heihei',
            # a string) to apply_async as the function to call.
            self.pool.apply_async(self.execute_request_item_save, callback=self._callback)

    def run(self):
        """Seed urls, launch the initial tasks, poll until all are done."""
        self.make_url()
        for i in range(8):  # initial degree of async concurrency
            # BUGFIX: schedule the method; do not invoke it here (see _callback).
            self.pool.apply_async(self.execute_request_item_save, callback=self._callback)
        # Exit condition: total requests == total responses.
        while True:
            time.sleep(0.01)  # yield the CPU so worker threads can run
            if self.total_url_nums == self.total_response_nums:
                self.is_running = False
                break
        print('程序结束')
if __name__ == '__main__':
    # Time the thread-pool crawl for comparison with the other variants.
    started = datetime.datetime.now()
    spider = Qiushi()
    spider.run()
    finished = datetime.datetime.now()
    print('耗时:{}'.format(finished - started))
协程池
import gevent.monkey
gevent.monkey.patch_all()
from gevent.pool import Pool
pool = Pool() # 默认不限制并发数
pool.apply_async(func, callback=callback_func)
# func函数的返回值作为callback指定回调函数参数!
要在特定的地方 time.sleep 让cpu / gevent 自动进行任务切换
协程使用了 gevent 需要安装
import gevent.monkey
gevent.monkey.patch_all()
from gevent.pool import Pool
import time
import datetime
import requests
from lxml import etree
from queue import Queue
class QiushiSpider():
    """Gevent coroutine-pool crawler for qiushibaike trending titles."""

    def __init__(self):
        # Pages 1..13 are crawled; placeholder filled by make_url_list().
        self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'}
        self.url_q = Queue()
        self.pool = Pool(6)
        self.is_running = True
        # run() exits when these two counters meet.
        self.total_url_nums = 0
        self.total_response_nums = 0

    def make_url_list(self):
        """Seed the url queue and count the total requests."""
        for page in range(1, 14):
            self.url_q.put(self.url.format(page))
            self.total_url_nums += 1

    def get_html(self, url):
        """Fetch one page and return its raw body bytes."""
        return requests.get(url, headers=self.headers).content

    def parse_items(self, html_content):
        """Extract titles from a page; returns a list of {'title': ...} dicts."""
        doc = etree.HTML(html_content)
        found = []
        for node in doc.xpath('//div[@class="recommend-article"]/ul/li'):
            texts = node.xpath('./div[1]/a[1]/text()')
            if texts:  # skip <li> nodes without a title link
                found.append({'title': texts[0]})
        return found

    def save_item(self, item_list):
        """Persist the extracted items (currently just prints them)."""
        for entry in item_list:
            print(entry)

    def execute_request_item_save(self):
        """Handle exactly one url end-to-end: fetch, parse, save."""
        url = self.url_q.get()
        body = self.get_html(url)
        self.save_item(self.parse_items(body))
        # One more response completed.
        self.total_response_nums += 1

    def _callback(self, xxx):
        """Pool callback: reschedule another unit of work while running."""
        if self.is_running:
            self.pool.apply_async(self.execute_request_item_save, callback=self._callback)

    def run(self):
        """Seed urls, launch the initial tasks, poll until all are done."""
        self.make_url_list()
        for _ in range(6):  # initial degree of async concurrency
            self.pool.apply_async(self.execute_request_item_save, callback=self._callback)
        # Exit condition: total requests == total responses.
        while True:
            time.sleep(0.1)  # yield so gevent can switch greenlets
            if self.total_url_nums == self.total_response_nums:
                self.is_running = False
                break
        print('程序结束')
if __name__ == '__main__':
    # Time the coroutine-pool crawl for comparison with the other variants.
    started = datetime.datetime.now()
    crawler = QiushiSpider()
    crawler.run()
    finished = datetime.datetime.now()
    print('耗时:{}'.format(finished - started))
线程,进程,协程,线程池,协程池 的一些基本概念并没有罗列出来,有不懂的可以先自行百度,这个主要是以代码为主先展示出来的,后续会总结理论知识