最右评论爬虫

最新推荐文章于 2025-01-21 17:51:17 发布

Zclins

最新推荐文章于 2025-01-21 17:51:17 发布

阅读量375

点赞数 10

分类专栏：爬虫　文章标签：python、网络爬虫

本文链接：https://blog.csdn.net/weixin_63025372/article/details/138284078

版权

爬虫专栏收录该内容

3 篇文章

订阅专栏

本文介绍了如何使用Python编写一个爬虫，通过requests库抓取最右网站的评论数据。由于请求头（headers）中的认证信息容易过期，需要不时手动更新，以应对可能的反爬机制。函数fetch_data负责发送POST请求、处理响应，并将帖子内容和评论写入本地文件。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

最右爬虫

爬取最右的帖子内容和评论。注意：headers中的信息容易过期，需要时不时更换为最新的值。

import requests
import time
import random
import sys
import json
import os

def is_life_file(filename):
    """
    Ensure that ``filename`` exists, creating an empty file when it does not.

    The file is opened in exclusive-creation mode ('x'), so checking for the
    file and creating it happen in a single step. A file that already exists
    is never truncated or modified.

    Parameters:
    - filename: path of the file to check or create

    Prints a message saying whether the file already existed or was created.
    """
    try:
        # 'x' raises FileExistsError instead of truncating, which closes the
        # gap between a separate exists() check and open(..., 'w').
        with open(filename, 'x', encoding='utf-8'):
            pass
    except FileExistsError:
        print("文件已存在")
    else:
        print("文件不存在，已创建。")


def _extract_posts(json_data):
    """Return the list stored at json_data['data']['posts'], or None if it is absent or malformed."""
    payload = json_data.get('data') if isinstance(json_data, dict) else None
    posts = payload.get('posts') if isinstance(payload, dict) else None
    return posts if isinstance(posts, list) else None


def _write_posts(file, posts):
    """Write each post's content and review contents to ``file``, followed by one separator line per post."""
    for post in posts:
        if not isinstance(post, dict):
            continue
        if 'content' in post:
            file.write("Content: " + post['content'] + "\n")
        review_data = post.get('review_data') or {}
        if 'reviews' in review_data:
            reviews = review_data['reviews']
            if reviews is None:
                print("Reviews data is None. Skipping this request.")
            else:
                for review in reviews:
                    if 'content' in review:
                        file.write("Review: " + review['content'] + "\n")
        # Always terminate the post with a separator, even when it has no
        # reviews, so consecutive posts never run together in the output.
        file.write("-------\n")


def fetch_data(url, headers, data, num_requests, output_file, *,
               timeout=30, delay_range=(10, 15)):
    """
    Repeatedly POST to ``url`` and append the returned posts and reviews to a file.

    Each response is expected to be JSON containing a list at
    ``data.posts``. For every post, its ``content`` and the ``content`` of
    each entry in ``review_data.reviews`` are appended to ``output_file``,
    followed by a ``-------`` separator line.

    Parameters:
    - url: endpoint to send the POST requests to
    - headers: dict of HTTP headers sent with every request
    - data: request body sent with every request
    - num_requests: number of requests to attempt; failed attempts count
      toward this total and are not repeated
    - output_file: path of the text file to append to (created if missing)
    - timeout: seconds to wait for the server before a request is treated
      as failed
    - delay_range: (min, max) whole seconds; a random delay in this
      inclusive range is slept between consecutive requests
    """
    is_life_file(output_file)
    for i in range(num_requests):
        posts = None
        try:
            response = requests.post(url, headers=headers, data=data, timeout=timeout)
            response.raise_for_status()  # raises HTTPError for 4xx/5xx responses
            # Parse inside the try block: a non-JSON body (e.g. an HTML error
            # page) raises ValueError and must not abort the whole run.
            posts = _extract_posts(response.json())
        except (requests.exceptions.RequestException, ValueError) as e:
            print("Request failed:", e)
            print("Retrying...")
        else:
            if posts is not None:
                with open(output_file, 'a', encoding='utf-8') as file:
                    _write_posts(file, posts)
                print("Processed requests:", i + 1, "/", num_requests)  # progress counter
                sys.stdout.write("\033[F")  # move the cursor up one line so the counter is overwritten
            else:
                print("No data found in response.")

        # Pause between requests on every path, including failures, so errors
        # do not turn into rapid-fire requests. No pause after the last one.
        if i < num_requests - 1:
            time.sleep(random.randint(*delay_range))

    print("Data fetching completed.")




if __name__ == "__main__":
    # Endpoint that is polled for posts
    url = 'https://www.izuiyou.com/planck/web/feed/rec'

    # Request headers.
    # NOTE: the Authorization and Cookie values below are hard-coded and
    # expire; replace them with fresh values when requests start failing.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        # Only advertise encodings that requests can always decode. 'br' and
        # 'zstd' need optional packages; without them the body arrives
        # undecoded and JSON parsing fails.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Authorization': 'sign="dc2891c97831b6e1b4e66b0a63470caea7c234933ff84147c5d1a8432a40347e",timestamp="1712119647",nonce_str="7af0f286aa50ea69a452700c95c66933"',
        'Cache-Control': 'no-cache',
        'Content-Length': '2',
        'Content-Type': 'application/json',
        'Cookie': 'Hm_lvt_414bd23f4090657a5e2034429c836cca=1712107177; is_guest=1; web_token=TbKcNxguVeYEBSaYX8iPM-JqFmALLgiKOyBXqaWbt6fT72TeGL4IlhjWdriyjSkIijqf2IIa9sZpsI1sYoF5vPXuAdkPqCldfte7JHhtLbrBBXTq5cRO0TfAPrDPBpmTXnkV5; Hm_lpvt_414bd23f4090657a5e2034429c836cca=1712111547',
        'Origin': 'https://www.izuiyou.com',
        'Referer': 'https://www.izuiyou.com/',
        'Sec-Ch-Ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }

    # Request body: an empty JSON object
    data = json.dumps({})

    # Number of requests to attempt
    num_requests = 10000

    # Output file name
    output_file = '最右评论.txt'

    # Fetch the data and append it to the local file
    fetch_data(url, headers, data, num_requests, output_file)