Zuiyou (最右) Crawler
Scrapes comments from Zuiyou (izuiyou.com). The Authorization and Cookie values in the request headers expire quickly, so they need to be refreshed from time to time.
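Since those header values go stale, one option is to keep them in a separate file so that refreshing them never requires editing the script itself. A minimal sketch, assuming a hypothetical headers.json next to the script (the file name and helper are illustrative, not part of the original code):

import json

def load_headers(path='headers.json'):
    """Load request headers from a JSON file (hypothetical helper)."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)

# Usage: headers = load_headers(), then pass headers to fetch_data as before.

The full script: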
import requests
import time
import random
import sys
import json
import os
def is_life_file(filename):
    """
    Check whether the output file exists; create it if it does not.

    Args:
        filename: name of the file to check
    """
    if os.path.exists(filename):
        print("File already exists.")
    else:
        # Create an empty file so later appends have a target
        with open(filename, 'w', encoding='utf-8') as file:
            pass
        print("File did not exist; created it.")
def fetch_data(url, headers, data, num_requests, output_file):
    is_life_file(output_file)
    for i in range(num_requests):
        try:
            response = requests.post(url, headers=headers, data=data)
            response.raise_for_status()  # Raise an exception for any non-200 status code
        except requests.exceptions.RequestException as e:
            print("Request failed:", e)
            print("Retrying...")
            continue  # Move on to the next request
        json_data = response.json()
        if 'data' in json_data and 'posts' in json_data['data']:
            posts = json_data['data']['posts']
            with open(output_file, 'a', encoding='utf-8') as file:
                for post in posts:
                    if 'content' in post:
                        file.write("Content: " + post['content'] + "\n")
                    if 'review_data' in post and 'reviews' in post['review_data']:
                        reviews = post['review_data']['reviews']
                        if reviews is not None:  # The reviews field can be null
                            for review in reviews:
                                if 'content' in review:
                                    file.write("Review: " + review['content'] + "\n")
                        else:
                            print("Reviews data is None. Skipping this post.")
                            continue  # Skip to the next post
                    file.write("-------\n")
            print("Processed requests:", i + 1, "/", num_requests)  # Progress counter
            sys.stdout.write("\033[F")  # Move the cursor up one line so the counter updates in place
        else:
            print("No data found in response.")
        sleep_time = random.randint(10, 15)  # Sleep 10 to 15 seconds
        time.sleep(sleep_time)  # Random delay to avoid hitting the server too often
    print("Data fetching completed.")
if __name__ == "__main__":
    # Request URL
    url = 'https://www.izuiyou.com/planck/web/feed/rec'
    # Request headers (the Authorization and Cookie values expire and must be refreshed)
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Authorization': 'sign="dc2891c97831b6e1b4e66b0a63470caea7c234933ff84147c5d1a8432a40347e",timestamp="1712119647",nonce_str="7af0f286aa50ea69a452700c95c66933"',
        'Cache-Control': 'no-cache',
        'Content-Length': '2',
        'Content-Type': 'application/json',
        'Cookie': 'Hm_lvt_414bd23f4090657a5e2034429c836cca=1712107177; is_guest=1; web_token=TbKcNxguVeYEBSaYX8iPM-JqFmALLgiKOyBXqaWbt6fT72TeGL4IlhjWdriyjSkIijqf2IIa9sZpsI1sYoF5vPXuAdkPqCldfte7JHhtLbrBBXTq5cRO0TfAPrDPBpmTXnkV5; Hm_lpvt_414bd23f4090657a5e2034429c836cca=1712111547',
        'Origin': 'https://www.izuiyou.com',
        'Referer': 'https://www.izuiyou.com/',
        'Sec-Ch-Ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
    # Request body: an empty JSON object (matching Content-Length: 2 above)
    data = json.dumps({})
    # Number of requests to make
    num_requests = 10000
    # Output file name
    output_file = '最右评论.txt'
    # Fetch the data and write it to the local file
    fetch_data(url, headers, data, num_requests, output_file)
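One rough edge in fetch_data: a failed request retries immediately via continue, skipping the 10-15 second sleep, so a flaky connection can hammer the endpoint. A gentler variant backs off exponentially on failures; a minimal sketch (the retry count and delays are illustrative assumptions, not from the original script):

import time
import random
import requests

def post_with_backoff(url, headers, data, max_retries=5):
    """POST with exponential backoff on failure (illustrative helper)."""
    delay = 10  # start near the script's normal 10-15 s pacing
    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, data=data)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print("Request failed (attempt", attempt + 1, "):", e)
            time.sleep(delay + random.random())  # wait, with a little jitter
            delay *= 2  # double the wait after each failure
    return None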
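The output file is a flat text dump (Content: and Review: lines separated by -------). If the comments need further processing, they can be parsed back into records; a small sketch, assuming post and review contents fit on one line (multi-line contents would break this simple format):

def parse_dump(path='最右评论.txt'):
    """Parse the flat text dump back into (content, reviews) records."""
    records, content, reviews = [], None, []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if line.startswith('Content: '):
                content = line[len('Content: '):]
            elif line.startswith('Review: '):
                reviews.append(line[len('Review: '):])
            elif line == '-------':
                records.append((content, reviews))
                content, reviews = None, []
    return records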