以笔趣阁小说《宋时归》为例:
INPUT 输入第一页网址:https://www.shengxu5w.com/4_4743/2041526.html
import requests, re
from lxml import etree
def get_text(url, novel):
urls = url
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
}
req = requests.get(urls, headers=headers)
req.encoding = req.apparent_encoding #获取源文件编码
html = etree.HTML(req.text)
title = html.xpath("//h1/text()")[0] #获取文章标题
contents = html.xpath('//*[@id="content"]/text()') #获取正文
content = ""
for j in contents:
content += j
content = re.sub("s+", "\r\t\n"