# Multithreaded scraper: Crawl threads download qiushibaike.com joke pages
# into a queue, and Parse threads pull the HTML back out, extract the text,
# and append it to a file.
from threading import Thread
import threading
import os
import time
import random
import requests
from queue import Queue
from lxml import etree

# Crawler thread: pulls URLs from url_que, downloads each page, and puts
# the HTML into content_que.
class Crawl(Thread):
    def __init__(self, url_que, content_que):
        Thread.__init__(self)
        self.url_que = url_que
        self.content_que = content_que

    def run(self):
        print(threading.current_thread().name)
        print('downloading')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        while not self.url_que.empty():
            response = requests.get(self.url_que.get(), headers=headers)
            if response.status_code == 200:
                self.content_que.put(response.text)
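
# Note: a `while not que.empty()` loop like the one above is racy once
# several threads share one queue: the queue can drain between the empty()
# check and the blocking get(), hanging a worker forever. Below is a
# minimal sketch of a safer drain loop (the `drain` helper is illustrative,
# not part of the original script); get_nowait() raises queue.Empty instead
# of blocking.
import queue

def drain(que):
    # Yield items until the queue is empty, never blocking on get().
    while True:
        try:
            yield que.get_nowait()
        except queue.Empty:
            break
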

# Create the parser class
# Parser thread: pulls downloaded HTML from content_que, extracts the joke
# text with XPath, and appends it to novel/jok.txt.
class Parse(Thread):
    def __init__(self, content_que):
        Thread.__init__(self)
        self.content_que = content_que

    def run(self):
        print(threading.current_thread().name)
        # Make sure the output directory exists before writing into it.
        if not os.path.exists('novel'):
            os.mkdir('novel')
        while not self.content_que.empty():
            e = etree.HTML(self.content_que.get())
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            for span in span_contents:
                with open('novel/jok.txt', 'a', encoding='utf-8') as f:
                    info = span.xpath('string(.)')
                    f.write(info)
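
# Several Parse threads append to novel/jok.txt at the same time, so text
# from different pages can interleave. A minimal sketch of one way to
# serialize the writes (the lock and helper below are additions for
# illustration, not part of the original script):
write_lock = threading.Lock()

def safe_append(path, text):
    # Only one thread at a time may hold the lock and append to the file.
    with write_lock:
        with open(path, 'a', encoding='utf-8') as f:
            f.write(text)
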
if __name__ == '__main__':
    # Create a queue of page URLs to crawl
    url_list = Queue()
    # Create a queue for the crawled page content
    content_list = Queue()
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    for i in range(2, 5):
        url_list.put(base_url.format(i))
        print(base_url.format(i))
    # Create 5 crawler threads to fetch the content
    crawl_list = []
    for i in range(5):
        crawl = Crawl(url_list, content_list)
        crawl_list.append(crawl)
        crawl.start()
    # Wait for every crawler to finish before parsing starts
    for crawl in crawl_list:
        crawl.join()
    # Start 5 parser threads to extract and save the text
    for i in range(5):
        parse = Parse(content_list)
        parse.start()
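
# With this layout the parsers only start after every crawler has been
# joined, and they rely on the same racy empty() check. A common
# alternative (sketched below; this is not the original author's design)
# is the "poison pill" pattern: start the parsers alongside the crawlers,
# then have the main thread push one sentinel per parser once the crawlers
# are joined, so each parser knows exactly when to stop.
SENTINEL = None

def parse_worker(content_que):
    # Block on get(); exit cleanly when the sentinel arrives.
    while True:
        page = content_que.get()
        if page is SENTINEL:
            break
        # ... parse `page` exactly as Parse.run() does ...

# Main thread, after joining the crawlers (num_parsers is hypothetical):
#     for _ in range(num_parsers):
#         content_list.put(SENTINEL)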