python爬取微博评论

最新推荐文章于 2024-07-22 22:58:19 发布
幽夜莫知途
最新推荐文章于 2024-07-22 22:58:19 发布
阅读量178
点赞数 2
文章标签： python 开发语言
本文链接：https://blog.csdn.net/qq_41901089/article/details/138709878
版权
直接cv可用，具体登录没做，需要自己复制token到代码中。
这里首先搜索关键字，然后获取所有搜索出的用户，并获取所有用户下的评论。
import requests
import pandas as pd
from retrying import retry

df=pd.DataFrame(columns=["评论时间",'用户主页','第一条微博'])

keyword='美妆博主'
# keyword=input('请输入关键词：\n')
# need_page=int(input('请输入需要爬取的页数：\n'))
need_page=2
# cookie=input('请输入cookie：\n')
wait_tiime=3  #等待时间
retry_time=3 #重试次数


totla_count=0
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E7%BE%8E%E5%A6%86%E5%8D%9A%E4%B8%BB",
    "X-Requested-With": "XMLHttpRequest",
    "MWeibo-Pwa": "1",
    "X-XSRF-TOKEN": "5a892a",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Connection": "keep-alive",
    'Cookie': 'SINAGLOBAL=6563893439405.94.1660570802061; ULV=1712841158625:23:1:1:8893962698404.082.1712841158625:1708182769914; UOR=,,cn.bing.com; login_sid_t=6e01672ae1e55c60e10ce087039052bf; cross_origin_proto=SSL; _s_tentry=-; Apache=8893962698404.082.1712841158625; ALF=1715433175; SUB=_2A25LE5GHDeRhGeBJ6lcV8yrIwz-IHXVoUKtPrDV8PUJbkNB-LWHMkW1NRk_GU0_vPwzQDxoFWKtGaoYWksRRvwGT; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFzAPTmlKnvDNKwZEKFzmg75JpX5KzhUgL.FoqNeK-Xe0BX1he2dJLoI7HSSsSEe5tt',
    # 'Cookie': cookie,

}
cookies = {
    # "__bid_n": "18577bed0f17a38cc84207",
    # "FEID": "v10-9c49e416195bef3c56a332640aa1034f0ce925f6",
    # "__xaf_fpstarttimer__": "1672751272214",
    # "__xaf_thstime__": "1672751272292",
    # "FPTOKEN": "TWz+AoUOLbjxVkzAcaXmWeWRjg1Cpj7nMtnqSuL0qvW4sV1SqRpbcpSdpuWOZabQnrBw/S+h/6W97xd9yAPQe8/ntWBXqbjwa5Cf8TbylfxfhiBVwT9PIoRWvJShQshdp/tEQtOGg8/NU0/XOwJLkSXzrllYeJnYZ9Ppk6VAUEjBAXH8C8gPBsZwkvIQBNvftDDOA6Tzj/DnY7XTOq83EK/Gvgp5sYhJ9oFaFbykBL/6Ch0q+yID03T4fInC9tLd1OaYcef0iLZXKY+CQ4xZNNzoSjfXyRo00D/VWVAg19iJs8GXl+LwhU3GGXiZUPC8oCntdrOzsq6DNWgH0shHnMYtWG186JHhH/k+9Iuczh+kCEcurvvQts7qAny8jp1yEN6CWkotYJH1huZxytiL/Q==|8+E3gOZcAmOVpi7ea3QlJRSsJ0ULjhsUFmB065iQigw=|10|c0eed64372a5fbfc54cb9fc06908830f",
    # "__xaf_fptokentimer__": "1672751272345",
    # "SCF": "Aub0wSTSBdAh-qetZ_dbidtgpyDMR-T5-QTqgdtUpTxLs551Br27cdCAjAuQv5_ayfHZFGirz3ZPLm_jy3PjmgA.",
    # "_T_WM": "28503866524",
    # "WEIBOCN_FROM": "1110006030",
    # "MLOGIN": "1",
    # "SUB": "_2A25LE5GHDeRhGeBJ6lcV8yrIwz-IHXVoUKtPrDV6PUJbktANLXf8kW1NRk_GUxCZ9TTX7crWIRvqKmYZTrKtpIe2",
    # "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFzAPTmlKnvDNKwZEKFzmg75JpX5KzhUgL.FoqNeK-Xe0BX1he2dJLoI7HSSsSEe5tt",
    # "SSOLoginState": "1712841175",
    # "ALF": "1715433175",
    # "XSRF-TOKEN": "5a892a",
    # "M_WEIBOCN_PARAMS": "uicode%3D10000011%26fid%3D100103type%253D3%2526q%253D%25E7%25BE%258E%25E5%25A6%2586%25E5%258D%259A%25E4%25B8%25BB%2526t%253D"
}



@retry(stop_max_attempt_number=retry_time)
def main(weibo_id):
    global totla_count
    global df
    url = "https://weibo.com/ajax/statuses/buildComments"
    params = {
        "is_reload": "1",
        "id": weibo_id,
        "is_show_bulletin": "2",
        "is_mix": "0",
        "count": "20",
        "type": "feed",
        "uid": user_id,
        "fetch_level": "0",
        "locale": "zh-CN"
    }
    response = requests.get(url, headers=headers, cookies=cookies, params=params,timeout=wait_tiime)
    comment_data = response.json()
    for comment in comment_data['data']:
        need_user_id = comment['user']['id']
        weibo_user_url = f"https://weibo.com/u/{need_user_id}"
        time = comment['created_at']
        url = "https://m.weibo.cn/api/container/getIndex"
        params = {
            "type": "uid",
            "value": need_user_id,
            # "containerid": "1076035881204733"
            # "since_id":weibo_json['data']['cardlistInfo']['since_id']
        }
        response = requests.get(url, headers=headers, cookies=cookies, params=params,timeout=wait_tiime)
        need_containerid_data = response.json()

        url = "https://m.weibo.cn/api/container/getIndex"
        params = {
            "type": "uid",
            "value": need_user_id,
            "containerid": need_containerid_data['data']['tabsInfo']['tabs'][1]['containerid']
            # "since_id":weibo_json['data']['cardlistInfo']['since_id']
        }
        response = requests.get(url, headers=headers, cookies=cookies, params=params,timeout=wait_tiime)
        need_weibo_json = response.json()

        for wwweibo in need_weibo_json['data']['cards']:
            if need_weibo_json['data']['cards'][0]['card_type'] == 9:
                need_weibo_url = f"https://weibo.com/{need_user_id}/{wwweibo['mblog']['id']}"
                totla_count += 1
                print({"评论时间": time, '用户主页': weibo_user_url, '第一条微博': need_weibo_url},
                      f'目前条数{totla_count}')
                df = df._append({"评论时间": time, '用户主页': weibo_user_url, '第一条微博': need_weibo_url},
                                ignore_index=True)
                break
        # print()
    while True:
        url = "https://weibo.com/ajax/statuses/buildComments"
        params['max_id'] = comment_data['max_id']
        response = requests.get(url, headers=headers, cookies=cookies, params=params,timeout=wait_tiime)
        comment_data = response.json()
        if comment_data['ok']:
            for comment in comment_data['data']:
                need_user_id = comment['user']['id']
                weibo_user_url = f"https://weibo.com/u/{need_user_id}"
                time = comment['created_at']
                url = "https://m.weibo.cn/api/container/getIndex"
                params = {
                    "type": "uid",
                    "value": need_user_id,
                    # "containerid": "1076035881204733"
                    # "since_id":weibo_json['data']['cardlistInfo']['since_id']
                }
                response = requests.get(url, headers=headers, cookies=cookies, params=params,timeout=wait_tiime)
                need_containerid_data = response.json()

                url = "https://m.weibo.cn/api/container/getIndex"
                params = {
                    "type": "uid",
                    "value": need_user_id,
                    "containerid": need_containerid_data['data']['tabsInfo']['tabs'][1]['containerid']
                    # "since_id":weibo_json['data']['cardlistInfo']['since_id']
                }
                response = requests.get(url, headers=headers, cookies=cookies, params=params,timeout=wait_tiime)
                need_weibo_json = response.json()
                for wwweibo in need_weibo_json['data']['cards']:
                    if need_weibo_json['data']['cards'][0]['card_type'] == 9:
                        need_weibo_url = f"https://weibo.com/{need_user_id}/{wwweibo['mblog']['id']}"
                        totla_count += 1
                        print({"评论时间": time, '用户主页': weibo_user_url,
                               '第一条微博': need_weibo_url}, f'目前条数{totla_count}')
                        df = df._append({"评论时间": time, '用户主页': weibo_user_url,
                                         '第一条微博': need_weibo_url}, ignore_index=True)
                        break
        else:
            break
        if len(comment_data['data']) == 0:
            break
        # print(response.text)
    # dataframe去重
    df = df.drop_duplicates()
    df['评论时间'] = pd.to_datetime(df['评论时间']).apply(lambda x: x.replace(tzinfo=None))
    df['评论时间'] -= pd.Timedelta(hours=8)  # .tz_localize(None)
    df.to_excel(f'{keyword}.xlsx', index=False)


for page in range(1,need_page+1):
    url = "https://m.weibo.cn/api/container/getIndex"
    print(f'=================page{page}===============================================')
    params = {
        "containerid": f"100103type=3&q={keyword}&t=",
        "page_type": "searchall",
        "page": page
    }
    response = requests.get(url, headers=headers, cookies=cookies, params=params)
    page_data=response.json()
    for card in page_data['data']['cards']:
        if card['card_type']==11:
            for user in card['card_group']:
                user_id=user['user']['id']
                print(f'============================={user_id}===================================')
                weibos=[]

                url = "https://m.weibo.cn/api/container/getIndex"
                params = {
                    "type": "uid",
                    "value": user_id,
                    # "containerid": "1076035881204733"
                    # "since_id":weibo_json['data']['cardlistInfo']['since_id']
                }
                response = requests.get(url, headers=headers, cookies=cookies, params=params)
                containerid_data = response.json()

                url = "https://m.weibo.cn/api/container/getIndex"
                params = {
                    "type": "uid",
                    "value": user_id,
                    "containerid": containerid_data['data']['tabsInfo']['tabs'][1]['containerid']
                    # "since_id":weibo_json['data']['cardlistInfo']['since_id']
                }
                response = requests.get(url, headers=headers, cookies=cookies, params=params)
                weibo_json=response.json()
                weibos+=weibo_json['data']['cards']
                if 'since_id' in weibo_json['data']['cardlistInfo']:
                    params = {
                        "type": "uid",
                        "value": user_id,
                        "containerid": containerid_data['data']['tabsInfo']['tabs'][1]['containerid'],
                        "since_id": weibo_json['data']['cardlistInfo']['since_id']
                    }
                    response = requests.get(url, headers=headers, cookies=cookies, params=params)
                    weibo_json = response.json()
                    weibos += weibo_json['data']['cards']
                else:
                    pass
                if len(weibos)>=17:
                    all_weibos=weibos[2:18]

                else:
                    all_weibos=weibos[2:]

                for weibo in all_weibos:
                    # try:
                    weibo_id=weibo['mblog']['id']

                    main(weibo_id)
                    # except Exception as e:
                    #     print(e)
            break
            # print(response)