用 Python 抓取网页小说文本并生成 txt 文件
代码已亲自运行验证:它会生成一个以小说标题命名的 txt 文件,内容是按章节排列的纯文本,欢迎尝试。
整个代码如下:
import re  # regular-expression module
from urllib.parse import urljoin

import requests
from lxml import etree
# Download a novel from hongxiu.com and save it as a plain-text file.
#
# The script fetches the novel's index page, reads the chapter links from the
# table of contents, then fetches every chapter page and writes the chapter
# name followed by its paragraphs to "<novel title>.txt" in the current
# working directory.

# Index page of the novel to download; replace it with the index page of any
# other novel on the same site.
book_url = "https://www.hongxiu.com/book/27263292302202504"

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# A single session reuses the underlying connection for all chapter requests.
with requests.Session() as session:
    index_resp = session.get(book_url, timeout=REQUEST_TIMEOUT)
    index_resp.raise_for_status()  # do not try to parse an HTTP error page
    index_dom = etree.HTML(index_resp.text)

    # Every <a> inside the table of contents links to one chapter page.
    chapter_links = index_dom.xpath(
        '//div[@class="volume"]/ul[@class="cf"]/li/a'
    )

    # The novel name is taken from between 《 and 》 in the page <title>.
    # If the markers are missing, fall back to the whole title text.
    title_nodes = index_dom.xpath('//title/text()')
    page_title = title_nodes[0] if title_nodes else ''
    title_match = re.search("《(.+?)》", page_title)
    book_title = title_match.group(1) if title_match else page_title.strip()

    # The title comes from a remote page: replace characters that are path
    # separators or are reserved in file names so it cannot escape the
    # working directory or make open() fail.
    book_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', book_title) or 'novel'

    # The context manager guarantees the file is flushed and closed even when
    # a request fails halfway through the book.
    with open(book_title + '.txt', 'w', encoding='utf-8') as out_file:
        for link in chapter_links:
            hrefs = link.xpath('./@href')
            if not hrefs:
                # An anchor without a target cannot be downloaded.
                continue
            # urljoin resolves site-relative ("/chapter/..."),
            # protocol-relative ("//host/...") and absolute hrefs alike.
            chapter_url = urljoin(book_url, hrefs[0])

            # Write the chapter name (if the link has text) on its own line.
            name_nodes = link.xpath('./text()')
            if name_nodes:
                out_file.write(name_nodes[0])
            out_file.write('\n')

            chapter_resp = session.get(chapter_url, timeout=REQUEST_TIMEOUT)
            chapter_resp.raise_for_status()
            chapter_dom = etree.HTML(chapter_resp.text)

            # Each <p> under the content container holds one line of text.
            paragraphs = chapter_dom.xpath(
                '//div[@class="ywskythunderfont"]/p'
            )
            for paragraph in paragraphs:
                text_nodes = paragraph.xpath('./text()')
                if not text_nodes:
                    # Paragraphs with no direct text would otherwise raise
                    # IndexError; skip them.
                    continue
                out_file.write(text_nodes[0])
                out_file.write('\n')