代码可直接复制粘贴(cv)使用;登录流程没有实现,需要自己把 token(Cookie 和 X-XSRF-TOKEN)复制到代码的 headers 中。
程序首先按关键字搜索,然后获取搜索出的所有用户,并抓取这些用户微博下的评论。
import requests
import pandas as pd
from retrying import retry
# ---------------------------------------------------------------------------
# Crawl settings
# ---------------------------------------------------------------------------
# The keyword, page count and cookie were once read interactively via input();
# those prompts are disabled in favour of the hard-coded values below.

# Search term; also used as the base name of the exported .xlsx file.
keyword = '美妆博主'

# Number of search-result pages to walk (pages 1..need_page).
need_page = 2

# Passed as ``timeout=`` to the requests calls made while collecting comments.
wait_tiime = 3

# Passed as ``stop_max_attempt_number`` to the retry decorator.
retry_time = 3

# Running count of rows recorded so far (only used for progress output).
totla_count = 0

# Accumulates one row per commenter: comment time, profile URL, first weibo URL.
df = pd.DataFrame(columns=["评论时间", '用户主页', '第一条微博'])

# ---------------------------------------------------------------------------
# HTTP request settings
# ---------------------------------------------------------------------------
# Browser-like headers sent with every request. The session is carried by the
# raw "Cookie" header, which has to be pasted in by hand (no login flow here).
# NOTE(review): "Referer" embeds a URL-encoded search for one fixed keyword and
# does not follow ``keyword`` — confirm whether that matters to the API.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E7%BE%8E%E5%A6%86%E5%8D%9A%E4%B8%BB",
    "X-Requested-With": "XMLHttpRequest",
    "MWeibo-Pwa": "1",
    "X-XSRF-TOKEN": "5a892a",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Connection": "keep-alive",
    "Cookie": (
        "SINAGLOBAL=6563893439405.94.1660570802061; "
        "ULV=1712841158625:23:1:1:8893962698404.082.1712841158625:1708182769914; "
        "UOR=,,cn.bing.com; "
        "login_sid_t=6e01672ae1e55c60e10ce087039052bf; "
        "cross_origin_proto=SSL; "
        "_s_tentry=-; "
        "Apache=8893962698404.082.1712841158625; "
        "ALF=1715433175; "
        "SUB=_2A25LE5GHDeRhGeBJ6lcV8yrIwz-IHXVoUKtPrDV8PUJbkNB-LWHMkW1NRk_GU0_vPwzQDxoFWKtGaoYWksRRvwGT; "
        "SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFzAPTmlKnvDNKwZEKFzmg75JpX5KzhUgL.FoqNeK-Xe0BX1he2dJLoI7HSSsSEe5tt"
    ),
}

# Intentionally empty: every cookie is sent through headers["Cookie"] instead.
# Entries that used to be listed here (all disabled): __bid_n, FEID,
# __xaf_fpstarttimer__, __xaf_thstime__, FPTOKEN, __xaf_fptokentimer__, SCF,
# _T_WM, WEIBOCN_FROM, MLOGIN, SUB, SUBP, SSOLoginState, ALF, XSRF-TOKEN,
# M_WEIBOCN_PARAMS.
cookies = {}
_COMMENTS_URL = "https://weibo.com/ajax/statuses/buildComments"
_CONTAINER_URL = "https://m.weibo.cn/api/container/getIndex"


def _get_json(url, params):
    """GET *url* with the shared headers, cookies and timeout; return the parsed JSON body."""
    response = requests.get(url, headers=headers, cookies=cookies, params=params,
                            timeout=wait_tiime)
    return response.json()


def _first_weibo_url(commenter_id):
    """Return the weibo.com URL of the commenter's first post card, or None if there is none."""
    # The first call (without a containerid) is only needed to read the
    # containerid of the profile's second tab, whose card list is fetched next.
    profile = _get_json(_CONTAINER_URL, {"type": "uid", "value": commenter_id})
    containerid = profile['data']['tabsInfo']['tabs'][1]['containerid']
    timeline = _get_json(_CONTAINER_URL, {
        "type": "uid",
        "value": commenter_id,
        "containerid": containerid,
    })
    for card in timeline['data']['cards']:
        # Test the card being visited (not cards[0]) so that a leading card of
        # another type does not hide the posts that follow it.
        if card['card_type'] == 9:
            return f"https://weibo.com/{commenter_id}/{card['mblog']['id']}"
    return None


def _record_commenters(comments):
    """Append one row per commenter that has at least one post card to the global ``df``."""
    global totla_count
    global df
    for comment in comments:
        commenter_id = comment['user']['id']
        first_weibo_url = _first_weibo_url(commenter_id)
        if first_weibo_url is None:
            continue
        row = {
            "评论时间": comment['created_at'],
            '用户主页': f"https://weibo.com/u/{commenter_id}",
            '第一条微博': first_weibo_url,
        }
        totla_count += 1
        print(row, f'目前条数{totla_count}')
        # Public replacement for the private DataFrame._append().
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)


def _export_results():
    """De-duplicate the collected rows and write them to '<keyword>.xlsx'."""
    global df
    df = df.drop_duplicates()
    # Convert on a copy: the shared frame keeps the raw 'created_at' strings, so
    # exporting once per main() call never shifts already-converted rows again.
    export = df.copy()
    export['评论时间'] = pd.to_datetime(export['评论时间']).apply(
        lambda stamp: stamp.replace(tzinfo=None))
    # NOTE(review): the UTC offset is dropped above and 8 hours are subtracted
    # here, exactly as before — confirm this is the intended conversion.
    export['评论时间'] -= pd.Timedelta(hours=8)
    export.to_excel(f'{keyword}.xlsx', index=False)


@retry(stop_max_attempt_number=retry_time)
def main(weibo_id):
    """Collect the commenters of one weibo and export everything gathered so far.

    Walks the comment pages of ``weibo_id``. For every comment it looks up the
    commenter's profile and records the comment time, the commenter's profile
    URL and the URL of the commenter's first post card. The rows go into the
    module-level ``df``, which is de-duplicated and written to
    ``'<keyword>.xlsx'`` before returning.

    Besides ``weibo_id`` the function reads module-level state: ``user_id``
    (owner of the weibo, set by the calling loop), ``headers``, ``cookies``,
    ``wait_tiime`` and ``keyword``; it updates ``df`` and ``totla_count``.

    Args:
        weibo_id: id of the weibo whose comments are fetched.

    Raises:
        Whatever the HTTP call, JSON decoding or key lookups raise. The
        ``retry`` decorator re-runs the whole function on an exception, up to
        ``retry_time`` attempts, so rows may be appended more than once (they
        are de-duplicated on export, but ``totla_count`` is not corrected).
    """
    # Kept in its own dict for the whole call: the profile lookups use their
    # own parameter dicts and must not clobber these before the next page.
    comment_params = {
        "is_reload": "1",
        "id": weibo_id,
        "is_show_bulletin": "2",
        "is_mix": "0",
        "count": "20",
        "type": "feed",
        "uid": user_id,
        "fetch_level": "0",
        "locale": "zh-CN",
    }
    comment_page = _get_json(_COMMENTS_URL, comment_params)
    comments = comment_page['data']
    while True:
        _record_commenters(comments)
        next_max_id = comment_page.get('max_id')
        # NOTE(review): a falsy max_id is treated as "no further page" so the
        # loop cannot re-request the first page forever — confirm against the API.
        if not comments or not next_max_id:
            break
        comment_params['max_id'] = next_max_id
        comment_page = _get_json(_COMMENTS_URL, comment_params)
        if not comment_page['ok']:
            break
        comments = comment_page['data']
    _export_results()
# Driver: walk the user-search result pages for `keyword`, and for every user
# found pass that user's weibos to main().
# NOTE(review): the indentation of this script has been lost, so the nesting of
# the statements below (in particular of the final `break`) cannot be read off
# the text — restore it from the original before running.
# NOTE(review): unlike the requests inside main(), the requests in this loop
# pass no timeout.
for page in range(1,need_page+1):
url = "https://m.weibo.cn/api/container/getIndex"
print(f'=================page{page}===============================================')
# Search query: the containerid embeds the search type (3) and the keyword.
params = {
"containerid": f"100103type=3&q={keyword}&t=",
"page_type": "searchall",
"page": page
}
response = requests.get(url, headers=headers, cookies=cookies, params=params)
page_data=response.json()
# Only cards with card_type 11 are used; their card_group entries carry the users.
for card in page_data['data']['cards']:
if card['card_type']==11:
for user in card['card_group']:
# user_id is a module-level name; main() reads it for the "uid" parameter.
user_id=user['user']['id']
print(f'============================={user_id}===================================')
weibos=[]
# First profile request: only used to read the containerid of the second tab.
url = "https://m.weibo.cn/api/container/getIndex"
params = {
"type": "uid",
"value": user_id,
# "containerid": "1076035881204733"
# "since_id":weibo_json['data']['cardlistInfo']['since_id']
}
response = requests.get(url, headers=headers, cookies=cookies, params=params)
containerid_data = response.json()
# Second request: the card list of that tab (first page).
url = "https://m.weibo.cn/api/container/getIndex"
params = {
"type": "uid",
"value": user_id,
"containerid": containerid_data['data']['tabsInfo']['tabs'][1]['containerid']
# "since_id":weibo_json['data']['cardlistInfo']['since_id']
}
response = requests.get(url, headers=headers, cookies=cookies, params=params)
weibo_json=response.json()
weibos+=weibo_json['data']['cards']
# If the response advertises a since_id, fetch exactly one more page of cards.
if 'since_id' in weibo_json['data']['cardlistInfo']:
params = {
"type": "uid",
"value": user_id,
"containerid": containerid_data['data']['tabsInfo']['tabs'][1]['containerid'],
"since_id": weibo_json['data']['cardlistInfo']['since_id']
}
response = requests.get(url, headers=headers, cookies=cookies, params=params)
weibo_json = response.json()
weibos += weibo_json['data']['cards']
else:
pass
# Keep the cards at indices 2..17 (at most 16 cards), skipping the first two.
# NOTE(review): both branches give the same result, since weibos[2:18] already
# handles short lists. Presumably the first two cards are not regular posts — confirm.
if len(weibos)>=17:
all_weibos=weibos[2:18]
else:
all_weibos=weibos[2:]
# NOTE(review): a card without an 'mblog' entry raises KeyError here, and the
# surrounding try/except is disabled, so the error would propagate.
for weibo in all_weibos:
# try:
weibo_id=weibo['mblog']['id']
main(weibo_id)
# except Exception as e:
# print(e)
# NOTE(review): which loop this break leaves depends on the lost indentation.
break
# print(response)