Scraping Multi-Page Weibo Comments

I have been learning web scraping recently. After scraping the first page I was full of confidence, but I could not get past that page and got stuck. I searched through a lot of material and combed all over Bilibili without finding code that worked. I finally found this on Zhihu: 【2023微博评论爬虫】用python爬上千条微博评论,突破15页限制! - 知乎 (zhihu.com) ("[2023 Weibo comment crawler] Scrape thousands of Weibo comments with Python and break the 15-page limit!").

That article is packed with useful material. I only managed to scrape 15 pages, but it is still progress, and I learned a lot.

Problems I ran into:

1. The code ran but produced nothing. (Indentation error around the main logic: a block of code had accidentally been written inside a for loop; see the sketch after this list.)

2. The JSON page would not come back. (Fixed by re-copying the URL.)

3. The timestamps would not parse. (A letter-case error in the format string.)
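
For problem 1, here is a minimal sketch of the pitfall (hypothetical names, not the article's code): a block meant to run after a for loop gets accidentally indented into its body, so with an empty loop it never executes and no error is raised.

ids = []  # empty, for the sake of the example

# Wrong: the save step sits inside the loop body, so with an empty
# list it never runs; there is no output and no error.
for weibo_id in ids:
    print('crawling', weibo_id)
    print('saving results...')  # accidentally indented one level too deep

# Right: dedent the block so it runs once, after the loop.
for weibo_id in ids:
    print('crawling', weibo_id)
print('saving results...')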

Summary:

1. hotflow serves the mobile-side comment data; buildComments serves the PC-side comment data (see the sketch after this list).

2. Do not change the case of anything in your code just because PyCharm suggests it.

3. When the code produces no output and no error, print step by step; it really matters!
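
On point 1, a minimal sketch of the two URL shapes. The hotflow URL is the one the code below actually uses; the buildComments URL is my assumption about the PC-side endpoint and may need adjusting:

# Mobile side (m.weibo.cn): hotflow, paginated via a max_id cursor.
mobile_url = ('https://m.weibo.cn/comments/hotflow'
              '?id={mid}&mid={mid}&max_id_type=0')

# PC side (weibo.com): buildComments -- assumed shape, verify before use.
pc_url = ('https://weibo.com/ajax/statuses/buildComments'
          '?id={mid}&is_show_bulletin=2&flow=0')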

import os
import re
import random
import datetime
from time import sleep

import pandas as pd
import requests
from fake_useragent import UserAgent


# Convert Weibo's GMT-style timestamp into 'YYYY-MM-DD HH:MM:SS'
def trans_time(v_str):
    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
    timeArray = datetime.datetime.strptime(v_str, GMT_FORMAT)
    ret_time = timeArray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time
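
# e.g. trans_time('Sat Aug 12 18:30:00 +0800 2023') -> '2023-08-12 18:30:00'
# (hypothetical input; Weibo's created_at field comes in this GMT-style format)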


# Convert the gender tag into a readable label
def tran_gender(gender_tag):
    if gender_tag == 'm':
        return '男'
    elif gender_tag == 'f':
        return '女'
    else:
        return None  # unknown / undisclosed


def get_comments(v_weibo_ids, v_comment_file, v_max_page):
    """
    Crawl comments for each Weibo post and append them to a CSV file.

    :param v_weibo_ids: list of Weibo post ids
    :param v_comment_file: output CSV file name
    :param v_max_page: maximum number of pages to crawl per post
    :return: None
    """
    for weibo_id in v_weibo_ids:
        # max_id is the pagination cursor; page 1 is requested without it
        max_id = '0'
        # Crawl up to v_max_page pages (adjust as needed)
        for page in range(1, v_max_page + 1):
            wait_second = random.uniform(0, 1)  # random delay to look less bot-like
            print('Waiting {:.2f} seconds'.format(wait_second))
            sleep(wait_second)
            print('Crawling page {}'.format(page))
            if page == 1:  # page 1 carries no max_id parameter
                url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0'.format(weibo_id, weibo_id)
                # e.g. https://m.weibo.cn/comments/hotflow?id=4934570812901749&mid=4934570812901749&max_id_type=0
            else:
                # a max_id of '0' after page 1 means there are no more pages
                if max_id == '0':
                    print('max_id is 0, stopping')
                    break
                url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0&max_id={}'.format(weibo_id, weibo_id, max_id)
                # e.g. https://m.weibo.cn/comments/hotflow?id=4934570812901749&mid=4934570812901749&max_id=162921594700254&max_id_type=0
            # Send the request with a random User-Agent each time
            ua = UserAgent()

            headers = {
                "user-agent": ua.random,
                # Replace with your own logged-in cookie; note that the value
                # must not include the 'Cookie:' prefix
                'cookie': '_T_WM=53471771365; XSRF-TOKEN=918f2d; WEIBOCN_FROM=1110006030; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWvAHGueA85rUwXGGxqn9hF5JpX5K-hUgL.FoMNSo5Ee0Mf1hn2dJLoI7XLxK.L1-eLB-B0e05c; SSOLoginState=1692011683; ALF=1694603683; SCF=Agcu37bX4pp62Fa20LbsJwL-0-g-AN1C0JSpLa1K9-zGYISA3f2KMzHPsnRcp2-RVm8heQi3oc_6Wauh0_9A0wo.; SUB=_2A25J3nzzDeRhGeFJ7VIT8ynJwzSIHXVrIQS7rDV6PUJbktANLRL_kW1Nf5DMr5x6D1W02kjPd5tQ5dcOAK_tZUZ6; mweibo_short_token=70f94b26bd; M_WEIBOCN_PARAMS=lfid%3D1076035997696411%26luicode%3D20000174%26uicode%3D20000061%26fid%3D4934600524044373%26oid%3D4934600524044373',
                'Mweibo-Pwa': '1',
                "Accept": "application/json, text/plain, */*",
                "Accept-Encoding": "gzip, deflate, br",
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                'referer': 'https://weibo.com/1496814565/Neh6Vk5o7',
            }

            r = requests.get(url=url, headers=headers)
            try:
                # max_id for the next page, plus this page's comment list
                max_id = r.json()['data']['max_id']
                datas = r.json()['data']['data']
            except Exception as e:
                print('excepted: ' + str(e))
                continue
            page_list = []
            id_list = []
            text_list = []
            time_list = []
            like_count_list = []
            source_list = []
            user_name_list = []
            user_id_list = []
            user_gender_list = []
            follow_count_list = []
            follower_count_list = []

            for data in datas:
                page_list.append(page)
                id_list.append(data['id'])  # the comment's own id
                # Keep only the Chinese characters of the comment text
                content = ''.join(re.findall('[\u4e00-\u9fa5]+', data['text']))
                text_list.append(content)
                time_list.append(trans_time(v_str=data['created_at']))
                like_count_list.append(data['like_count'])
                source_list.append(data['source'])
                user_name_list.append(data['user']['screen_name'])
                user_id_list.append(data['user']['id'])
                user_gender_list.append(tran_gender(data['user']['gender']))
                follow_count_list.append(data['user']['follow_count'])
                follower_count_list.append(data['user']['followers_count'])

            df = pd.DataFrame(
                {
                    '微博id': [weibo_id] * len(time_list),
                    '评论页码': page_list,
                    '评论id': id_list,
                    '评论时间': time_list,
                    '评论点赞数': like_count_list,
                    '评论者IP归属地': source_list,
                    '评论者姓名': user_name_list,
                    '评论者id': user_id_list,
                    '评论者性别': user_gender_list,
                    '评论者关注数': follow_count_list,
                    '评论者粉丝数': follower_count_list,
                    '评论内容': text_list,
                 }
            )
            print(df)
            # Write the CSV header only on the first write
            if os.path.exists(v_comment_file):
                header = False
            else:
                header = True
            df.to_csv(v_comment_file, mode='a+', index=False, header=header, encoding='utf-8')
            print('Saved results to {}'.format(v_comment_file))


if __name__ == '__main__':
    # Other ids tried: '139144656495751', '138869780012886'
    weibo_id_list = ['4934570812901749']
    max_page = 20
    comment_file = '四川医疗腐败.csv'
    # Delete any stale CSV so the header is written fresh
    if os.path.exists(comment_file):
        print('CSV file already exists, deleting it first:', comment_file)
        os.remove(comment_file)
    get_comments(v_weibo_ids=weibo_id_list, v_comment_file=comment_file, v_max_page=max_page)
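
When the script runs silently with no output and no error (summary point 3), it helps to fetch a single page by hand and inspect the JSON before running the full loop. A minimal standalone sketch, with a placeholder cookie you must fill in yourself:

import pprint
import requests

# One-off check: fetch page 1 for one post and confirm that 'max_id'
# and the comment list actually come back in the JSON.
headers = {'user-agent': 'Mozilla/5.0', 'cookie': '<your cookie here>'}
url = ('https://m.weibo.cn/comments/hotflow'
       '?id=4934570812901749&mid=4934570812901749&max_id_type=0')
r = requests.get(url, headers=headers)
print(r.status_code)  # 200 means the request itself succeeded
resp = r.json()
pprint.pprint(resp.get('data', {}).get('max_id'))        # next-page cursor
pprint.pprint(resp.get('data', {}).get('data', [])[:1])  # first comment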


