"""使用requests爬取糗事百科(多线程版)

- 使用多线程可以提高爬虫效率
- 注意在从队列中取数据并完成操作之后要加上task_done()方法
"""
import requests
from lxml import etree
import threading
from queue import Queue
class QiubaiSpider:
def __init__(self):
self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}
self.url_queue = Queue()
self.html_queue = Queue()
self.content_queue = Queue()
def get_url_list(self):
for i in range(1, 14):
self.url_queue.put(self.url.format(i))
def parse_url(self):
while True:
url = self.url_queue.get()
print(url)
response = requests.get(url, headers=self.headers)
self.html_queue.put(respo