# Crawl post information from Qiushibaike (qiushibaike.com)
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
# Maximum number of crawl (producer) threads to run concurrently.
# NOTE(review): the name shadows the stdlib `concurrent` package if that
# package is ever imported in this module.
concurrent = 3
# Number of parse (consumer) threads.
conparse = 3
class Crawl(threading.Thread):
    """Crawler thread.

    Pulls page URLs from the task queue, fetches each one, and pushes the
    raw HTML of successful responses onto the shared data queue for the
    parser threads to consume.
    """

    def __init__(self, i, task_q, data_q):
        # i: thread number, used only in log messages.
        # task_q: queue of URLs to fetch.
        # data_q: queue that receives the raw HTML of each successful fetch.
        self.task_q = task_q
        self.data_q = data_q
        self.i = i
        super(Crawl, self).__init__()

    def run(self):
        print('%d号采集线程启动' % self.i)
        while not self.task_q.empty():
            fullurl = self.task_q.get()
            response = requests.get(fullurl)
            # 2xx only: the original accepted 300 (a redirect-class status)
            # as success, which is incorrect.
            if 200 <= response.status_code < 300:
                # BUG FIX: the original pushed to the module-global `data_q`
                # instead of the queue passed to this thread; use self.data_q
                # so the class is self-contained.
                self.data_q.put(response.text)
            else:
                print('采集异常', response.status_code)
            # Throttle between requests to be polite to the server.
            time.sleep(1)
        print('%d号采集线程结束over' % self.i)
class Parse(threading.Thread):
    """Parser thread.

    Pulls raw HTML pages from the data queue and extracts nicknames.
    Exits once every crawl thread has finished AND the data queue is
    drained.
    """

    def __init__(self, i, data_q, crawl_list):
        # i: thread number, used only in log messages.
        # data_q: queue of raw HTML pages produced by the crawl threads.
        # crawl_list: the Crawl thread objects, polled for liveness.
        self.i = i
        self.data_q = data_q
        self.crawl_list = crawl_list
        # Becomes False once no producer is alive and the queue is empty.
        self.is_parse = True
        super(Parse, self).__init__()

    def run(self):
        print('%d号解析线程开启' % self.i)
        # Termination condition, checked every iteration:
        #   1. no crawl thread is still alive, AND
        #   2. the data queue is empty.
        while True:
            for t in self.crawl_list:
                if t.is_alive():
                    break  # at least one producer is still running
            else:
                # No producer alive; if the queue is drained we are done.
                if self.data_q.empty():
                    self.is_parse = False
            if self.is_parse:
                try:
                    html = self.data_q.get(timeout=3)
                except Empty:
                    # BUG FIX: the original caught bare `Exception` around
                    # both the get() and parse(), silently swallowing real
                    # parse errors. Only the expected queue timeout is
                    # suppressed now; re-check the exit condition.
                    continue
                self.parse(html)
            else:
                break
        print('%d号解析线程结束' % self.i)

    def parse(self, html):
        # Extract all <h2> text nodes from the page (user nicknames).
        root = etree.HTML(html)
        nick = root.xpath('//h2/text()')
        print(nick)
if __name__ == '__main__':
# 任务队列
task_q = Queue()
# 数据队列
data_q =