# 起点中文网月票榜历年数据爬取(24年3月更新)_起点月票红包脚本-CSDN博客
# 学习后改写了下,获取起点热销榜单
import pandas as pd
import requests
from lxml import etree
# Column-oriented accumulator for the scraped ranking data: every key is an
# output column and every value is a list that receives one entry per novel.
_COLUMNS = (
    "排行榜",
    "小说名",
    "作者",
    "类别",
    "细分类别",
    "连载状态",
    "最近更新",
    "更新时间",
    "简介",
)
# A comprehension (not dict.fromkeys) so that each column owns a distinct list.
item = {column: [] for column in _COLUMNS}
# HTTP request headers sent with every page request.
# The cookie is listed one "name=value" pair per line and joined with "; ",
# which reproduces the original single-line Cookie header exactly.
# NOTE(review): these look like values captured from a browser session and
# will presumably expire - confirm and refresh before relying on them.
_COOKIE_PAIRS = [
    "e1=%7B%22l6%22%3A%22%22%2C%22l7%22%3A%22%22%2C%22l1%22%3A5%2C%22l3%22%3A%22%22%2C%22pid%22%3A%22qd_P_rank_02%22%2C%22eid%22%3A%22qd_C44%22%7D",
    "e2=%7B%22l6%22%3A%22%22%2C%22l7%22%3A%22%22%2C%22l1%22%3A5%2C%22l3%22%3A%22%22%2C%22pid%22%3A%22qd_P_rank%22%2C%22eid%22%3A%22qd_C46%22%7D",
    "e2=%7B%22l6%22%3A%22%22%2C%22l7%22%3A%22%22%2C%22l1%22%3A5%2C%22l3%22%3A%22%22%2C%22pid%22%3A%22qd_P_rank%22%2C%22eid%22%3A%22qd_C46%22%7D",
    "e1=%7B%22l6%22%3A%22%22%2C%22l7%22%3A%22%22%2C%22l1%22%3A%22%22%2C%22l3%22%3A%22%22%2C%22pid%22%3A%22qd_P_rank%22%2C%22eid%22%3A%22%22%7D",
    "newstatisticUUID=1726990220_1641440494",
    "_csrfToken=bfrMbDQVqwBCstad9n0BroZbEdAFbgaJJd1ZgZa4",
    "traffic_utm_referer=https%3A//www.baidu.com/link",
    "fu=146923143",
    "_ga_FZMMH98S83=GS1.1.1727001658.3.1.1727002346.0.0.0",
    "_ga=GA1.1.1355327574.1726990226",
    "_ga_PFYW0QLV3P=GS1.1.1727001658.3.1.1727002346.0.0.0",
    "Hm_lvt_f00f67093ce2f38f215010b699629083=1726990226",
    "Hm_lpvt_f00f67093ce2f38f215010b699629083=1727002343",
    "HMACCOUNT=C0B4255249635B75",
    "_gid=GA1.2.1093265246.1726990226",
    "e1=%7B%22l6%22%3A%22%22%2C%22l7%22%3A%22%22%2C%22l1%22%3A3%2C%22l3%22%3A%22%22%2C%22pid%22%3A%22qd_p_qidian%22%2C%22eid%22%3A%22qd_A16%22%7D",
    "e2=",
    "supportwebp=true",
    "w_tsfp=ltvuV0MF2utBvS0Q663plEmtEz4mfDE4h0wpEaR0f5thQLErU5mH1oZ6vMjyNnzX4cxnvd7DsZoyJTLYCJI3dwNHTcvDJYxEiQSUk9B3jY4RUEM1QMncWQEfIbwjuDhFe3hCNxS00jA8eIUd379yilkMsyN1zap3TO14fstJ019E6KDQmI5uDW3HlFWQRzaLbjcMcuqPr6g18L5a5TjUt1L5KQlzCu5ChhfB0HxMW3kjshbtd+taZhz7Isz7SqA=",
    "_gat_gtag_UA_199934072_2=1",
]
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0",
    "Cookie": "; ".join(_COOKIE_PAIRS),
}
# Scrape pages 1-5 of the Qidian hot-sales ranking, append one row per novel
# to the column lists in `item`, then write everything to hotsale.csv.
baseurl = "https://www.qidian.com/rank/hotsales/"  # Base URL of the hot-sales ranking

# Output column -> XPath, relative to a single book's <li>, that selects the
# text for that column. Evaluating per <li> keeps the columns of one novel
# aligned even when a field is missing for some book.
_FIELD_XPATHS = {
    "排行榜": "./div[1]/span/text()",          # rank
    "小说名": "./div[2]/h2/a/text()",          # novel title
    "作者": "./div[2]/p[1]/a[1]/text()",       # author
    "类别": "./div[2]/p[1]/a[2]/text()",       # category
    "细分类别": "./div[2]/p[1]/a[3]/text()",   # sub-category
    "连载状态": "./div[2]/p[1]/span/text()",   # serialization status
    "最近更新": "./div[2]/p[3]/a/text()",      # latest chapter
    "更新时间": "./div[2]/p[3]/span/text()",   # update time
    "简介": "./div[2]/p[2]/text()",            # synopsis
}


def _first_text(node, path):
    """Return the first text node matched by *path* under *node*, or "" if none."""
    matches = node.xpath(path)
    return matches[0] if matches else ""


for p in range(1, 6):  # ranking pages 1 through 5
    url = baseurl + "page" + str(p) + "/"
    print("热销榜相关信息打印中...")

    # A timeout keeps a stalled connection from hanging the script forever;
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    html = etree.HTML(response.text)  # parse the page

    # One <li> per novel inside the ranking container(s).
    books = html.xpath('//div[@class="book-img-text"]/ul/li')
    if not books:
        # The original indexed a fixed 20 entries and crashed on a short or
        # blocked page; report it and carry on with the remaining pages.
        print(f"No books found on {url}; skipping")
        continue

    for book in books:
        for column, path in _FIELD_XPATHS.items():
            item[column].append(_first_text(book, path))

da = pd.DataFrame(item)  # convert the collected columns to a DataFrame
# utf-8-sig writes a BOM so spreadsheet tools detect the Chinese text correctly.
da.to_csv("hotsale.csv", mode='w', encoding="utf-8-sig")