import requests
from lxml import etree
# Download consecutive chapter pages of a novel and append them to a single
# UTF-8 text file, following each page's "next chapter" link until the stop
# URL below is reached.

# URL of the first chapter page to fetch.
url = 'https://www.82zg.com/book/4551/3304465.html'
# Request headers: present a desktop-browser User-Agent instead of the
# default one sent by requests.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}
# The loop stops as soon as a page's "next chapter" link equals this URL.
# Note that the page at this URL is itself never fetched or saved.
# NOTE(review): presumably this is the page after the last wanted chapter —
# confirm against the site.
stop_url = 'https://www.82zg.com/book/4551/3305371.html'
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
request_timeout = 30

# The context manager guarantees the output file is flushed and closed even
# if a request or an XPath lookup raises part-way through the download.
with open("斗罗大陆.txt", 'w', encoding='utf-8') as f:
    while True:
        # Fetch the current chapter page; fail fast on HTTP error statuses
        # rather than trying to parse an error page.
        resp = requests.get(url=url, headers=headers, timeout=request_timeout)
        resp.raise_for_status()
        # Force GBK decoding of the response body.
        # NOTE(review): assumes the site serves GBK-encoded pages — confirm.
        resp.encoding = 'gbk'
        # Parse the HTML into an element tree for XPath queries.
        e = etree.HTML(resp.text)
        # Chapter body: every text node directly under div#content, one per
        # line. (The separator must be the newline escape '\n', not '/n'.)
        resp_content = '\n'.join(e.xpath("//div[@id='content']/text()"))
        # Chapter title: first <h1> text inside div.bookname.
        resp_title = e.xpath("//div[@class='bookname']/h1/text()")[0]
        # Next-chapter URL: the href of the fourth link inside div.bottem1
        # ('bottem1' is the class name as spelled in the XPath, not a typo
        # here), made absolute by prefixing the site origin.
        # NOTE(review): relies on the link order in that div — confirm.
        resp_next_url = 'https://www.82zg.com' + e.xpath("//div[@class='bottem1']/a/@href")[3]
        print(resp_title, resp_next_url)
        # Append this chapter (title, blank line, body, blank line).
        f.write(resp_title + '\n\n' + resp_content + '\n\n')
        # Advance to the next chapter for the following iteration.
        url = resp_next_url
        # Stop once the next link points at the stop URL.
        if resp_next_url == stop_url:
            break