"""Scrape the Douban Books "小说" (fiction) tag listing with XPath.

Setup, as described by the original article: install the ``requests`` and
``lxml`` packages (for example from within PyCharm), copy the site's base
URL into ``SERVER``, then run this script to start scraping.

For every book found on the listing pages, the title and the publication
line are printed and appended to ``FILENAME``.
"""

import requests
from lxml import etree

FILENAME = '豆瓣读书文学标签页-小说排行.txt'
SERVER = 'https://book.douban.com/'
TAG = 'tag/小说'
PAGE_COUNT = 376       # number of listing pages to request
PAGE_SIZE = 20         # the ``start`` offset grows by this much per page
REQUEST_TIMEOUT = 10   # seconds; without a timeout a stalled request never returns
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48'
}

# XPath expressions, relative to one <li> of the result list.
ITEMS_XPATH = '//ul[@class="subject-list"]/li'
TITLE_XPATH = './/div[@class="info"]/h2/a/text()'
PUB_XPATH = './/div[@class="info"]/div[@class="pub"]/text()'


def build_page_url(offset):
    """Return the listing URL whose ``start`` query parameter is *offset*."""
    return f'{SERVER}{TAG}?start={offset}&type=T'


def _first_text(node, path):
    """Return the first match of *path* under *node*, stripped, or ''.

    Falling back to an empty string keeps one incomplete listing entry
    from aborting the whole crawl with an IndexError.
    """
    matches = node.xpath(path)
    return matches[0].strip() if matches else ''


def parse_books(html):
    """Parse one listing page and return a list of book dicts.

    Each dict has the keys '书名' (title) and '出版信息' (publication line).
    """
    tree = etree.HTML(html)
    if tree is None:
        # Defensive: nothing to query if parsing produced no document.
        return []
    return [
        {
            '书名': _first_text(item, TITLE_XPATH),
            '出版信息': _first_text(item, PUB_XPATH),
        }
        for item in tree.xpath(ITEMS_XPATH)
    ]


def save_books(books, path=FILENAME):
    """Append *books* to *path*: title, publication line, then a blank line."""
    if not books:
        return
    # Append mode: results from every page (and every run) accumulate.
    with open(path, mode='a', encoding='utf-8') as out:
        for book in books:
            out.write(book['书名'] + '\n')
            out.write(book['出版信息'])
            out.write('\n\n')


def main():
    """Fetch every listing page, then print and save the books it contains."""
    print('开始爬取')
    for offset in range(0, PAGE_COUNT * PAGE_SIZE, PAGE_SIZE):
        url = build_page_url(offset)
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Report the failed page and carry on with the next one
            # instead of parsing an error page or aborting the crawl.
            print(f'请求失败,跳过该页: {url} ({exc})')
            continue
        response.encoding = 'utf-8'
        books = parse_books(response.text)
        for book in books:
            print(book)
        save_books(books)
    print('爬取结束')


if __name__ == '__main__':
    main()