案例需求:
1.爬取该网页下的一级评论和二级评论
https://m.weibo.cn/detail/4813628149072458
2.
3.
分析:
1.找到一级评论请求地址
url请求地址:
二级url地址
分析翻页参数——可知翻页是从第二页开始的
从而得到:
移动端链接:https://m.weibo.cn/detail/4813628149072458
一级评论接口:https://m.weibo.cn/comments/hotflow?id=4813628149072458&mid=4813628149072458&max_id_type=0
--参数:
id: 4813628149072458
mid: 4813628149072458
max_id_type: 0
max_id: 13883307764046392 # 翻页参数——从第二页开始(可在上一页一级评论接口的响应中找到)
二级评论接口:https://m.weibo.cn/comments/hotFlowChild?cid=4813628329693567&max_id=0&max_id_type=0
---参数:
cid: 4813628329693567
max_id: 0 #二级翻页参数(在上一页评论接口中可以找到)
max_id_type: 0
获取一级评论
import requests
from jsonpath import jsonpath
import re
class Weibo():
    """Print one page of first-level comments of a Weibo post.

    The comments are read from the m.weibo.cn ``comments/hotflow`` JSON
    endpoint and written to stdout; nothing is returned or stored.
    """

    # Non-greedy match of a single HTML tag; comment text contains inline markup.
    _TAG_RE = re.compile(r'<.*?>')
    # Seconds to wait for the server; without a timeout requests can block forever.
    _TIMEOUT = 10

    def __init__(self, post_id='4813628149072458'):
        """Prepare the endpoint, query parameters and headers.

        Args:
            post_id: Id of the post, sent as both ``id`` and ``mid``.
                Defaults to the post used throughout this example.
        """
        self.one_url = 'https://m.weibo.cn/comments/hotflow'
        self.one_data = {
            'id': post_id,
            'mid': post_id,
            'max_id_type': '0',
            # requests leaves None-valued params out of the query string,
            # so the first page is requested without max_id.
            'max_id': None,
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36',
        }

    def get_one_data(self):
        """Fetch the first page of first-level comments and print them.

        Request failures and undecodable responses are reported on stdout
        and end the call early instead of raising.
        """
        try:
            response = requests.get(
                url=self.one_url,
                headers=self.headers,
                params=self.one_data,
                timeout=self._TIMEOUT,
            )
        except requests.RequestException as exc:
            print('请求失败', exc)
            return
        try:
            json_data = response.json()
        except ValueError:
            # Every JSON decode error raised by requests derives from ValueError.
            print('解析 JSON 失败')
            return

        # jsonpath() returns False (not an empty list) when nothing matches.
        # Selecting whole comment objects keeps each name paired with its own
        # text even when one of the two fields is missing.
        comments = jsonpath(json_data, '$..data[0:18]') or []
        for comment in comments:
            if not isinstance(comment, dict):
                continue
            user = comment.get('user')
            name = user.get('screen_name', '') if isinstance(user, dict) else ''
            content = self._TAG_RE.sub('', comment.get('text') or '')
            print('-------一级评论-------')
            print(name)
            print(content)

    def man(self):
        """Run the crawl (entry point; presumably meant as ``main``, name kept for callers)."""
        self.get_one_data()
if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script, not on import.
    w = Weibo()
    w.man()
一级翻页
# First-level pagination: the response carries the cursor (max_id) of the next page.
# NOTE(review): the hotflow payload appears to keep max_id inside its "data"
# object rather than at the top level - confirm against a live response.
# Both locations are checked so either shape works.
payload = json_data.get('data')
nested_max_id = payload.get('max_id') if isinstance(payload, dict) else None
max_id = nested_max_id or json_data.get('max_id')
if max_id:
    self.one_data['max_id'] = max_id
    time.sleep(random.uniform(1, 3))  # random pause to lower the risk of being blocked
    self.get_one_data(self.one_url, self.one_data)
else:
    print('该用户的一级评论已经爬完')
获取二级评论
# Fetch second-level comments (replies)
def get_two_data(self):
    """Fetch the replies selected by ``self.two_data`` and print them.

    Reads ``self.two_url``, ``self.headers`` and ``self.two_data``. An
    undecodable response is reported on stdout and ends the call early.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=self.two_url, headers=self.headers, params=self.two_data, timeout=10)
    try:
        json_data = response.json()
    except ValueError:
        # Every JSON decode error raised by requests derives from ValueError.
        print("解析 JSON 失败")
        return
    replies = json_data.get('data') if isinstance(json_data, dict) else None
    if not isinstance(replies, list):
        # Nothing to print when the payload carries no list of replies.
        return
    for reply in replies:
        if not isinstance(reply, dict):
            continue
        user = reply.get('user')
        name = user.get('screen_name', '') if isinstance(user, dict) else ''
        # A missing text would make re.sub raise TypeError; treat it as empty.
        content = re.sub('<.*?>', '', reply.get('text') or '')
        print('-------二级评论-------')
        print(name)
        print(content)
示例代码:
import requests
import json
import re
import sys
import time
import random
from jsonpath import jsonpath
class Weibo:
    """Print the first- and second-level comments of one Weibo post.

    Comments are read from the m.weibo.cn JSON endpoints
    ``comments/hotflow`` (first level) and ``comments/hotFlowChild``
    (replies) and written to stdout; nothing is returned or stored.
    """

    # Non-greedy match of a single HTML tag; comment text contains inline markup.
    _TAG_RE = re.compile(r'<.*?>')
    # Seconds to wait for the server; without a timeout requests can block forever.
    _TIMEOUT = 10

    def __init__(self, post_id='4813628149072458', cookie=None):
        """Prepare endpoints, query parameters and request headers.

        Args:
            post_id: Id of the post, sent as both ``id`` and ``mid``.
                Defaults to the post used throughout this example.
            cookie: Value for the ``Cookie`` header. When omitted, the
                value that was hard-coded in the original example is used.
        """
        self.one_url = 'https://m.weibo.cn/comments/hotflow'
        self.two_url = "https://m.weibo.cn/comments/hotFlowChild"
        self.one_data = {
            'id': post_id,
            'mid': post_id,
            'max_id_type': '0',
            # requests leaves None-valued params out of the query string,
            # so the first page is requested without max_id.
            'max_id': None,
        }
        self.two_data = {
            'cid': '4813628329693567',  # overwritten per comment in get_one_data
            'max_id': '0',
            'max_id_type': '0',
        }
        if cookie is None:
            # NOTE(review): hard-coded session cookie kept only for backward
            # compatibility. The spaces around '=', '/', '+', '-' and '%'
            # look like formatter damage, so it is probably not a valid
            # cookie any more - pass a fresh value via ``cookie`` instead of
            # committing credentials to source.
            cookie = '__bid_n = 188b7ef5179e6369a94207;FPTOKEN = crNJdsvZwwGF7dp2IPNpPQfcb8QXfJcGqmdrORkyZDx1InnMS5HmnAi2IuK / +GpaFRGA9SsxzhVt1I6QlRCtoWJsIFKbbc8 // DeUCm0HH9ux6X85QM + Z3WBbGns26hiiQngHN + M5q + ErW1eifOLk + +KasqWhbWrd12AHMF7vC / 3qXfLfRN60SEVuv1ZGCnrBIc3lN1sba2e0UzVGEzYejWmJ / yzCNkUZ1qZHPSNvEzGfJlYhJGxDiyyBInzi / cTWyk1g988msn9UMRE3GBjWIcZXsqOl0HbbsOz5AYS + n1b86VqgY4eVk3EB / Dr9Fgkl2UBcstP5NcEJ9MXcHyZDfRsXbz / rGPbnYsrT7iZxjwGn4gnTqnzQ / HZsyaVvGf1gxf3oEB3SwBHvlflbg7KxXQ == | G4pgWxSL7Ti7lQoJilvXBEq0Vs37iI4r + CsQg3BKI2U = | 10 | 6f811a8a83e72fbe3ec90c6845e372b0;_T_WM = 40779448236;WEIBOCN_FROM = 1110006030;SUBP = 0033WrSXqPxfM725Ws9jqgMF55529P9D9WF_IbR1VnBGsR3IZdB7J.Ey5JpX5K - hUgL.FoMNShMEeK2E1Kq2dJLoIEnLxK - L1hqL1K.LxKMLB.zL1K.LxKnL1hMLB - 2LxK - LBoMLBo27S05N;MLOGIN = 1;SCF = AkYYFk70crAYROKfk6SUopCK_fVD7Tu5nSTQdkf6622fzN_KAP3J3MPopagzOTf3wUrTpRLoS2QmwjIeNQ2nik4.;SUB = _2A25JxhX3DeRhGeFJ71UT8S_OwjqIHXVrSLu_rDV6PUJbktAGLRehkW1Nf8RTnplqgytu5YRphj7 - Op57y6XIcQDE;SSOLoginState = 1690461607;ALF = 1693053607;XSRF - TOKEN = c16fc6;M_WEIBOCN_PARAMS = oid % 3D4813628149072458 % 26luicode % 3D20000061 % 26lfid % 3D4813628149072458 % 26uicode % 3D20000061 % 26fid % 3D4813628149072458;mweibo_short_token = f67e070b3b'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36',
            'Cookie': cookie,
        }

    def _fetch_json(self, url, params):
        """Return the decoded JSON object of a GET request, or None on failure."""
        try:
            response = requests.get(
                url=url,
                headers=self.headers,
                params=params,
                timeout=self._TIMEOUT,
            )
        except requests.RequestException as exc:
            print('请求失败', exc)
            return None
        try:
            json_data = response.json()
        except ValueError:
            # Every JSON decode error raised by requests derives from ValueError.
            print("解析 JSON 失败")
            return None
        if not isinstance(json_data, dict):
            # The callers look fields up by key, so anything else is unusable.
            print("解析 JSON 失败")
            return None
        return json_data

    def get_one_data(self, url, data):
        """Print every page of first-level comments together with their replies.

        Args:
            url: Endpoint for the first request.
            data: Query parameters for the first request.

        Later pages are always requested from ``self.one_url`` with
        ``self.one_data``, whose ``max_id`` is updated from each response.
        Pages are walked in a loop rather than by recursion, so a long
        thread cannot exhaust the interpreter's recursion limit.
        """
        while True:
            json_data = self._fetch_json(url, data)
            if json_data is None:
                return

            # jsonpath() returns False (not an empty list) when nothing
            # matches. Selecting whole comment objects once keeps name, text
            # and id of a comment together; the former three parallel queries
            # used different slices ([0:18] vs [0:10]), so zip() silently
            # dropped every comment after the tenth.
            comments = jsonpath(json_data, '$..data[0:18]') or []
            for comment in comments:
                if not isinstance(comment, dict):
                    continue
                user = comment.get('user')
                name = user.get('screen_name', '') if isinstance(user, dict) else ''
                content = self._TAG_RE.sub('', comment.get('text') or '')
                print('-------一级评论-------')
                print(name)
                print(content)
                cid = comment.get('rootid')
                if cid is None:
                    # Without an id there is nothing to request replies for.
                    continue
                print('跟评ID', cid)
                self.two_data['cid'] = cid
                self.get_two_data()

            # Pagination cursor.
            # NOTE(review): the hotflow payload appears to keep max_id inside
            # its "data" object rather than at the top level - confirm against
            # a live response. Both locations are checked so either works.
            payload = json_data.get('data')
            nested_max_id = payload.get('max_id') if isinstance(payload, dict) else None
            max_id = nested_max_id or json_data.get('max_id')
            if not max_id:
                print('该用户的一级评论已经爬完')
                return
            self.one_data['max_id'] = max_id
            time.sleep(random.uniform(1, 3))  # random pause to lower the risk of being blocked
            url, data = self.one_url, self.one_data

    # Fetch second-level comments (replies)
    def get_two_data(self):
        """Fetch the replies selected by ``self.two_data`` and print them."""
        json_data = self._fetch_json(self.two_url, self.two_data)
        if json_data is None:
            return
        replies = json_data.get('data')
        if not isinstance(replies, list):
            # Nothing to print when the payload carries no list of replies.
            return
        for reply in replies:
            if not isinstance(reply, dict):
                continue
            user = reply.get('user')
            name = user.get('screen_name', '') if isinstance(user, dict) else ''
            # A missing text would make re.sub raise TypeError; treat it as empty.
            content = self._TAG_RE.sub('', reply.get('text') or '')
            print('-------二级评论-------')
            print(name)
            print(content)

    def man(self):
        """Run the crawl (entry point; presumably meant as ``main``, name kept for callers)."""
        self.get_one_data(self.one_url, self.one_data)
if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script, not on import.
    w = Weibo()
    w.man()