# Multi-threaded crawler using queue.Queue and threading.Thread
from queue import Empty, Queue
from threading import Thread

import requests
from lxml import etree
class SpiderUrl(Thread):
    """Downloader thread: pull URLs from url_queue, fetch them, and push
    the raw HTML of successful (HTTP 200) responses onto html_queue."""

    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue   # Queue[str] of URLs still to fetch
        self.html_queue = html_queue # Queue[str] receiving page HTML

    def run(self):
        # Present a regular-browser User-Agent so the server is less likely
        # to reject the request.
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
        while True:
            # get_nowait() + Empty avoids the race where another worker
            # drains the queue between our empty() check and get().
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # BUG FIX: the headers dict was built but never passed to
            # requests.get(), so the crawler identified itself as
            # python-requests.
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)
class Web_Info(Thread):
    """Parser thread: pull raw HTML from html_queue, extract records via
    XPath, and append one line per record to a.txt."""

    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue  # Queue[str] of page HTML to parse

    def run(self):
        # BUG FIX: the original class only defined GetInfo() and never
        # overrode run(), so start() executed Thread's empty run() and the
        # parser threads did nothing.
        self.GetInfo()

    def GetInfo(self):
        # Drain the queue so each parser thread handles as many pages as
        # are available (the original processed exactly one page per thread).
        with open('a.txt', 'a', encoding='utf-8') as f:
            while True:
                try:
                    html = self.html_queue.get_nowait()
                except Empty:
                    break
                tree = etree.HTML(html)
                # TODO: fill in the XPath for the record nodes of the target
                # site ("解析页面 自己动手" = "parse the page yourself").
                records = tree.xpath('')
                for record in records:
                    # TODO: XPath extracting the text of one record.
                    info = record.xpath("")
                    f.write(info + '\n')
if __name__ == '__main__':
    url_queue = Queue()
    html_queue = Queue()

    # Build the 13 page URLs and hand them to the downloader queue.
    # TODO: base_url is a placeholder — set it to the paginated URL template.
    base_url = ""
    page_urls = [base_url.format(page) for page in range(1, 14)]
    for page_url in page_urls:
        url_queue.put(page_url)

    # Launch 20 downloader threads and wait until every page is fetched.
    downloaders = [SpiderUrl(url_queue, html_queue) for _ in range(20)]
    for worker in downloaders:
        worker.start()
    for worker in downloaders:
        worker.join()

    # Launch 3 parser threads and wait for them to finish writing results.
    parsers = [Web_Info(html_queue) for _ in range(3)]
    for worker in parsers:
        worker.start()
    for worker in parsers:
        worker.join()
# If this helped, feel free to follow the author or reach out on QQ
# (2981001569) to study together — happy to help debug errors and improve
# as a group.