"""Download pages of a novel from ddxs.com and save each one to a local file."""

import os
import re

import requests
from lxml import html

# etree is taken from lxml.html (html.etree) rather than imported as lxml.etree.
etree = html.etree

# URLs of the novel pages to download: pages 1 through 30.
urls = ['https://www.ddxs.com/yuzhouzhiyexuanshou/{}.html'.format(i) for i in range(1, 31)]
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63'
    ),
}

# Directory the novel is saved to.
# NOTE: the previous value ended in "\ " (a trailing space added because a raw
# string cannot end in a backslash), which gave every saved file a leading
# space in its name. The directory and file name are now joined with
# os.path.join instead, so no trailing separator is needed here.
path = r'D:\python爬虫\小说'

# Characters that are not allowed in Windows file names, plus line breaks/tabs.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _safe_filename(title):
    """Return *title* stripped, with characters invalid in file names replaced by '_'."""
    return _INVALID_FILENAME_CHARS.sub('_', title.strip())


def get_text(url, timeout=10):
    """Fetch one page of the novel and save its text under ``path``.

    The file is named after the page title (no extension is added) and is
    written as UTF-8, overwriting any existing file of the same name.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        IndexError: No usable title was found on the page (the XPath did not
            match, e.g. because the page layout changed).
    """
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    r.encoding = 'utf-8'
    selector = etree.HTML(r.text)

    # Extract the article title.
    title = selector.xpath('/html/body/div[5]/div[2]/dl/dd[1]/h1/text()')
    # Extract the body text.
    text = selector.xpath('/html/body/div[5]/div[2]/dl/dd[5]/text()')

    filename = _safe_filename(title[0]) if title else ''
    if not filename:
        # Same exception type the bare title[0] lookup used to raise, but
        # with a message that says which page is affected.
        raise IndexError('no title found on page: {}'.format(url))

    # Make sure the target directory exists before opening the file.
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, filename), 'w', encoding='utf-8') as f:
        f.writelines(text)


if __name__ == '__main__':
    for url in urls:
        get_text(url)
# Note: the lxml version in use is 4.9.1 (the latest release at the time of writing).