'''Scrape the novel's table of contents'''
import requests
url = "https://www.jubiquge.com/14572"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0"}
# Send the request with a spoofed browser User-Agent
response = requests.get(url, headers=headers)
# Set the response encoding explicitly
response.encoding = "UTF-8"
# Read the response body
content = response.text
# Use a regex to extract chapter titles and links
import re
p = r'<a title="(第.*?)"\s+href="(.*?)">'
# Extract every match in one pass
chs = re.findall(p, content, re.DOTALL)
chapter = dict()
for ch in chs:
    chapter[ch[0]] = "https://www.jubiquge.com" + ch[1]
# The final chapter-title-to-URL mapping
print(chapter)
# Persist the table of contents via file IO
import json
with open("chapters.txt", mode="wt", encoding="utf-8") as file:
json.dump(chapter, file)
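
# A quick sanity check (my addition, not part of the original script): reload
# chapters.txt and confirm the JSON round-trips before running the chapter
# scraper below. Only the file name from the script above is assumed.
import json

with open("chapters.txt", encoding="utf-8") as file:
    saved = json.load(file)
print(f"saved {len(saved)} chapters")
for title, link in list(saved.items())[:3]:  # peek at the first few entries
    print(title, "->", link)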
# Scrape the chapter contents (run as a separate script after the one above)
import requests, re
import time, random
import json
# 1. Load the table of contents to scrape
with open("chapters.txt", encoding="UTF-8") as file:
chs = json.load(file)
# 2. Loop over the chapters, sending a spoofed request for each
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0"
}
for title, url in chs.items():
print(f"准备采集: {title}")
# 发起伪造请求
response = requests.get(url, headers=headers)
# 设置编码
response.encoding = "UTF-8"
# 分析数据格式
html = response.text
print(html)
print("----------------------")
# 定义正则,匹配数据
p = r'<div id="content">(.*?)</div>'
content = re.search(p, html, re.DOTALL)
# 数据筛选
content = content.group(1).strip()
# 数据清洗
p2 = r'<p>(.*?)</p>'
content = re.findall(p2, content, re.DOTALL)
content = "\n".join(content)
print(content)
with open("逆天神妃.txt", mode="at", encoding="utf-8") as file:
# 保存到文件
file.write("\n\n-----------------------\n\n")
file.write("\n\n" + title + "\n\n") # 标题
file.write(content) # 内容
# 模拟用户请求,每次请求完成休眠3~5S
time.sleep(random.randint(3, 5))
print(f"{title} 章节采集完成")