1.导入库
import requests
from lxml import etree
2.文件名
# Output text file that the crawl loop appends each book's title and
# publication info to. The literal must use ASCII quotes: typographic
# quotes (‘ ’) are not valid string delimiters in Python.
filename = '豆瓣读书文学标签页-小说排行.txt'
3.目标网址
# Pieces of the listing URL; the crawl loop joins them as
# server + tag + start + str(start_num) + t.
server = "https://book.douban.com/"  # site root
tag = "tag/小说"  # path of the tag listing being crawled
start = "?start="  # query key that precedes the numeric offset
start_num = 0  # offset for the first request; the loop adds 20 per request
t = "&type=T"  # trailing query parameter appended after the offset
4.请求头信息
这里的 User-Agent 是我自己电脑上浏览器的,你可以换成自己的。
# HTTP request headers sent with every page request: a desktop browser
# User-Agent string (adjacent literals are concatenated into one value).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48"
    ),
}
5.开始爬取
# Crawl the tag listing page by page and append every book's title and
# publication info to the output file named by `filename`.
print('开始爬取')

# Open the output file once for the whole crawl; the `with` block closes it
# (flushing what was written) even if a request or parse step raises.
with open(file=filename, mode='a', encoding="utf-8") as out_file:
    for _ in range(376):
        url = server + tag + start + str(start_num) + t
        start_num += 20  # each request moves the offset forward by 20

        # A timeout keeps a single stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'

        # Parse the page source into an element tree so it can be queried
        # with XPath; each <li> under the "subject-list" <ul> is one entry.
        parsed_html = etree.HTML(response.text)
        entries = parsed_html.xpath('//ul[@class="subject-list"]/li')

        for entry in entries:
            title_nodes = entry.xpath('.//div[@class="info"]/h2/a/text()')
            pub_nodes = entry.xpath(
                './/div[@class="info"]/div[@class="pub"]/text()'
            )

            # Fall back to an empty string when a field is absent instead of
            # letting `[0]` raise IndexError and abort the whole crawl.
            books = {
                '书名': title_nodes[0].strip() if title_nodes else '',
                '出版信息': pub_nodes[0].strip() if pub_nodes else '',
            }
            print(books)

            # Record layout: title line, publication line, blank line.
            out_file.write(books['书名'] + '\n')
            out_file.write(books['出版信息'])
            out_file.write('\n\n')

print('爬取结束')