import time
from queue import Queue
from threading import Thread

import requests
from lxml import etree
class MyThread(Thread):
    """Worker thread that scrapes novel chapters from a shared task queue.

    Each task is a ``(chapter_no, href)`` tuple.  The thread fetches the
    chapter page, extracts title and body, then waits until all earlier
    chapters have been written so the shared output file ``f`` stays in
    chapter order even though pages are fetched concurrently.

    NOTE(review): relies on module-level globals defined in ``__main__``:
    ``root`` (base URL), ``headers`` (HTTP headers), ``f`` (open output
    file) and ``index`` (number of the last chapter saved).  The unlocked
    ``index += 1`` is safe only because the ordering check guarantees a
    single thread passes it at a time (plus CPython's GIL) — confirm if
    this is ever ported off CPython.
    """

    def __init__(self, q):
        Thread.__init__(self)
        self.q = q  # shared Queue of (chapter_no, href) tasks

    def run(self):
        global index
        while not self.q.empty():
            data = self.q.get()
            url = root + data[1]  # href is already a string; no join needed
            response = requests.get(url, headers=headers)
            page = etree.HTML(response.content)
            chapter = ''.join(page.xpath("//h1/text()"))
            print("爬取 -> %s" % chapter, index)
            content = '\n'.join(page.xpath("//div[@id='content']/text()"))
            # Four non-breaking spaces mark a paragraph indent on the site;
            # normalize them to a tab.
            content = content.replace("\xa0\xa0\xa0\xa0", "\t")
            # Wait until every earlier chapter has been saved.  Sleep
            # instead of a bare `pass` spin so waiting threads don't burn
            # a full CPU core.
            while data[0] > index + 1:
                time.sleep(0.01)
            # Exactly the next chapter in sequence: append it to the file.
            if data[0] == index + 1:
                print("保存 -> %s" % chapter, index)
                f.write('\n' + chapter + '\n')
                f.write(content)
                index += 1
if __name__ == '__main__':
    root = "http://www.booktxt.net/8_8455/"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    index = -1  # number of the last chapter saved; -1 means none yet
    # Fetch the table-of-contents page and extract the novel title.
    response = requests.get(root, headers=headers)
    page = etree.HTML(response.content)
    title = ''.join(page.xpath("//h1/text()"))  # novel title
    print(title)
    with open("%s.txt" % title, 'w', encoding='utf8') as f:
        f.write(title)  # write the title first
        # NOTE(review): this xpath was mangled by the blog's email
        # obfuscation ("...following-sibling::[email protected]").  It most
        # likely read "following-sibling::dd/a/@href" (chapter links after
        # the second <dt>) — confirm against the live page markup.
        hrefs = page.xpath("//div[@id='list']/dl/dt[2]/following-sibling::dd/a/@href")
        # Enqueue every chapter as (chapter_no, href).
        q = Queue()
        for i, href in enumerate(hrefs):
            q.put((i, href))
        # Start 5 workers and wait for all of them while the file is open.
        ts = []
        for _ in range(5):
            t = MyThread(q)
            t.start()
            ts.append(t)
        for t in ts:
            t.join()
# Source (原文): https://www.cnblogs.com/fqqwz/p/11656074.html