# 【声明:仅供新手学习使用,请勿用作非法用途,如有非法情况发生,与本文无关。】
import csv
import requests
from lxml import etree
#对新手友好,不需要写正则
# Request headers; replace the placeholder with a real browser User-Agent string.
headers = {"User-Agent": "这里在自己的电脑上可以替换上自己的链接"}

# Page-number URL template for the target site; swap in the real site address.
url_template = "https://某某某小说网站{page}.html"
# Scrape pages 1..9 of the listing and append title/description/link rows to a CSV.
# utf-8-sig BOM keeps Excel happy with the Chinese column names.
with open("替换成你要保存的路径\\这里是文件名称.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["标题", "描述", "链接"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Bug fix: emit the header row once — the original defined fieldnames
    # but never called writeheader(), so the CSV had no column labels.
    writer.writeheader()
    for page in range(1, 10):  # pages 1 through 9 (the old comment wrongly said 5)
        url = url_template.format(page=page)
        print(f"正在爬取第{page}页...")
        # timeout prevents an indefinite hang; raise_for_status fails fast on
        # HTTP errors instead of silently parsing an error page.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        res.encoding = "gbk"  # site-specific; may be unnecessary for UTF-8 sites
        tree = etree.fromstring(res.text, parser=etree.HTMLParser())
        # XPath locations are site-specific; adjust selectors for the target page.
        titles = tree.xpath("//h1[@class='fontSize17andHei']/a/text()")
        hrefs = tree.xpath("//h1[@class='fontSize17andHei']/a/@href")
        descs = tree.xpath("//div[@class='TwoBox02_06']/a/text()")
        # NOTE: zip truncates to the shortest list — mismatched selector hits
        # silently drop rows; verify the three XPaths return parallel lists.
        for title, href, desc in zip(titles, hrefs, descs):
            full_url = f"https:{href}"  # hrefs are protocol-relative on this site
            writer.writerow({"标题": title, "描述": desc, "链接": full_url})