import requests
from lxml import etree
# Download a book page by page into a local text file: fetch a page, save its
# heading and paragraph text, then continue with the page linked from a fixed
# position in the layout (presumably the "next chapter" link -- confirm
# against the live site).
url = 'https://www.idejian.com/book/12385810/1.html'  # first page to fetch
headers = {
    # Browser-like user agent sent with every request.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
base_url = 'https://www.idejian.com'  # prefix prepended to the extracted hrefs
request_timeout = 10  # seconds; without a timeout requests may block forever
visited = set()  # URLs already fetched in this run

with open('斗罗.txt', 'w', encoding='utf-8') as f:
    # Stop as soon as the link leads back to a page that was already saved;
    # otherwise a navigation cycle would make the loop run forever.
    while url not in visited:
        visited.add(url)

        # Network errors (including timeouts) propagate to the caller; the
        # `with` block still closes the output file.
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.encoding = 'utf-8'

        # etree.HTML can return None for an empty document; treat that the
        # same way as a page without a heading.
        e = etree.HTML(response.text)
        title_read = e.xpath('//h1/text()') if e is not None else []
        if not title_read:
            print("获取失败")
            break
        title = title_read[0]
        print(title)

        content = ''.join(e.xpath('//div/p/text()'))

        # Save the page before looking for the following one, so a page
        # without a usable link is still written instead of being lost.
        f.write(title + "\n\n" + content + "\n\n")

        next_href = e.xpath('//div/div/div/div[4]/a[1]/@href')
        if not next_href:
            # No link at the expected position: nothing more to fetch.
            break
        url = base_url + next_href[0]
# Page of the first chapter.
url = "https://www.idejian.com/book/12385810/1.html"

# User agent sent with the request.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.198 Safari/537.36"
    ),
}
使用XPath Helper对目标字段或者属性值进行匹配。