多线程版
代码介绍
接上一个爬虫案例
优点:下载,保存速度很快。
缺点:由于部分电子书每一卷大卷的开始,会从第一章开始,导致会有多个第一章出现,这种问题目前暂未解决
已解决的问题:
问题1:由于线程的速度不一样,保存的小说章节内容顺序不对。
解决方式:使用字典配合sorted这种数据类型来解决这个问题
问题2:解决了由于之前保存命令的调用位置不对,导致每一个消费者线程都会执行一遍保存命令的问题。
解决方式:使用join方法阻塞等待消费者线程结束,让保存命令只执行一遍。
思路介绍:
1:如下图
全部代码:
import re
import threading
from queue import Empty, Queue

import cn2an
import requests
from lxml import etree
def get_url(url):
    """Fetch *url* with the module-level headers and return the parsed page.

    Returns an lxml element tree built from the response body.
    Raises requests.RequestException on network failure or timeout.
    """
    # A timeout keeps a stalled server from hanging a worker thread forever;
    # the original call had none and could block indefinitely.
    response = requests.get(url=url, headers=headers, timeout=30)
    html = etree.HTML(response.text)
    return html
def pj_url(url):
    """Prepend the site root to a relative chapter path."""
    return f'https://www.biquge.com{url}'
def data_clean(url):
    """Extract the chapter title and cleaned body text from a parsed page.

    NOTE(review): despite its name, ``url`` is the parsed lxml tree returned
    by get_url, not a URL string.  Returns a (title, text) tuple.
    """
    title_name = url.xpath('//h1/text()')[0]
    raw_lines = url.xpath('//div[@id="content"]/text()')
    # Strip layout whitespace (tabs, newlines, full-width spaces) per fragment.
    lis = [
        fragment.replace('\t', '').replace('\n', '').replace('\r', '').replace('\u3000', '')
        for fragment in raw_lines
    ]
    # str(list)[2:-2] flattens the fragments into one string; the replaces
    # then drop the quote separators and turn 。, into paragraph breaks.
    txt_list = str(lis)[2:-2].replace(r"'", '').replace(r', (=),', '').replace("。,", '。\n')
    return title_name, txt_list
def save(Fiction_data, path=None):
    """Append all chapters to a text file in chapter-number order.

    Fiction_data maps chapter number -> chapter text; sorting by key restores
    reading order even though consumer threads finished out of order.
    ``path`` defaults to the global book title (T_name) plus ``.txt``, which
    preserves the original one-argument call.
    """
    if path is None:
        path = f'{T_name}.txt'
    ordered = dict(sorted(Fiction_data.items()))
    # Open the file once instead of re-opening in append mode per chapter.
    with open(path, 'a', encoding='utf-8') as fp:
        for v in ordered.values():
            fp.write(str(v))
class Productor(threading.Thread):
    """Producer thread: pulls chapter URLs from URL_queue, downloads each
    page, and pushes the parsed HTML tree onto DATA_queue."""

    def __init__(self, URL_queue, DATA_queue):
        threading.Thread.__init__(self)
        self.URL_queue = URL_queue
        self.DATA_queue = DATA_queue

    def run(self):
        # The original `if empty(): break` followed by a blocking get() had a
        # check-then-act race: another producer could drain the queue between
        # the two calls.  A non-blocking get is atomic.
        while True:
            try:
                url = self.URL_queue.get_nowait()
            except Empty:
                break
            self.get_content(url)

    def get_content(self, url):
        # Download and parse the chapter page, then hand it to a consumer.
        html = get_url(url)
        self.DATA_queue.put(html)
class Consumer(threading.Thread):
    """Consumer thread: pulls parsed chapter pages off DATA_queue, cleans
    them, and stores the text in the global Fiction_data dict keyed by
    chapter number so the chapters can be re-sorted into reading order."""

    def __init__(self, DATA_queue):
        threading.Thread.__init__(self)
        self.DATA_queue = DATA_queue

    def run(self):
        while True:
            # Exit once producers have finished (switch == 1, set in main
            # after joining them) and the queue has been drained.
            if self.DATA_queue.empty() and switch == 1:
                break
            try:
                html = self.DATA_queue.get(timeout=10)
            except Empty:
                # Nothing arrived within 10s; re-check the exit condition
                # instead of reporting a spurious error.
                continue
            try:
                tible, txt = data_clean(html)
                txt = tible + "\n" + txt + "\n"
                # Threads finish out of order, so key each chapter by its
                # number: parse "第X章" from the title and convert the
                # Chinese numeral X to an integer with cn2an.  save() sorts
                # the keys later to restore order.
                table_num = re.match(r'第(.*?)章', tible)
                if table_num is None:
                    # Title is not of the form "第X章" -- skip it.  (The
                    # original checked `type(table_num) == int`, which is
                    # never true for a re.match result, so such titles
                    # crashed into the except block instead.)
                    continue
                table_num = cn2an.cn2an(table_num.group(1), "normal")
                print("下载的章节序号>>>>>>>>>>>>>>>>>>", table_num)
                Fiction_data.update({
                    table_num: txt
                })
            except Exception as e:
                # Best-effort: log and keep consuming remaining chapters.
                print(f'{e}出错')

    def __del__(self):
        # NOTE(review): __del__ fires at garbage collection, not at thread
        # exit, so this message's timing is not tied to run() finishing.
        print("全部章节下载完成--程序退出,线程{}结束任务".format(threading.current_thread()))
if __name__ == '__main__':
    # 0 while producers are still downloading; set to 1 after all producers
    # are joined, so consumers may exit once DATA_queue drains.
    switch = 0
    # chapter number -> chapter text, filled by Consumer threads.
    Fiction_data = {}
    url = 'https://www.biquge.com/135_135772/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400',
    }
    URL_queue = Queue(3000)
    DATA_queue = Queue(3000)
    # Fetch the book's index page and queue every chapter link found in the
    # <dd> entries following the second <dt>.
    html_text = get_url(url)
    dt_url = html_text.xpath('//dt[2]/following-sibling::dd/a/@href')
    for url in dt_url:
        url = 'https://www.biquge.com' + url
        URL_queue.put(url)
    # Book title, used by save() as the output .txt filename.
    T_name = html_text.xpath('//h1/text()')[0]
    p_list = []
    c_list = []
    # Start 5 producer and 5 consumer threads.
    for i in range(5):
        p = Productor(URL_queue, DATA_queue)
        c = Consumer(DATA_queue)
        p_list.append(p)
        c_list.append(c)
        p.start()
        c.start()
    for p in p_list:
        p.join()
    # All URLs downloaded; signal consumers to stop once the queue is empty.
    switch = 1
    for c in c_list:
        c.join()
    # Only now -- after every consumer has finished -- write the book once.
    save(Fiction_data)
#如有问题,欢迎留言