Disclaimer: This code is provided for learning and reference only. Without explicit authorization, it must not be used for any commercial purpose or illegal activity.
Preface
Xiaohongshu is one of the most popular social platforms today, and its user comments carry a great deal of valuable information that can feed public-opinion analysis, business analysis, QA datasets, and more. This article aims to help readers understand the different types of Xiaohongshu comments and how to scrape them with Python.
Background
Xiaohongshu comments come in first-level, second-level, third-level (and deeper) varieties, but they are not all served by a single endpoint: first-level comments come from the comments endpoint, while second-level and deeper comments come from the sub_comments endpoint. The overall approach is therefore:
fetch first-level comments → parse them → fetch second-level comments → parse them → save the comments
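For reference, the two web endpoints involved look roughly like the constants below. The exact host and paths are assumptions based on what the web client sends at the time of writing; confirm them in your browser's network panel before relying on them.

# First-level comments ("comments" endpoint) -- assumed path, verify in your browser
COMMENTS_API = 'https://edith.xiaohongshu.com/api/sns/web/v2/comment/page'
# Second-level and deeper comments ("sub_comments" endpoint) -- assumed path, verify in your browser
SUB_COMMENTS_API = 'https://edith.xiaohongshu.com/api/sns/web/v2/comment/sub/page'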
Fetching first-level comments:
import hashlib
import urllib.parse

import execjs
import requests

# note_id and xsec_token come from the note's URL; api_endpoint is the API path
# used when signing, and url is the full request URL built from the same path.
current_cookies = {}  # your own cookies; must include the 'a1' value used for signing
params = {
    "note_id": note_id,
    "cursor": "",
    "top_comment_id": "",
    "image_formats": "jpg,webp,avif",
    "xsec_token": xsec_token
}
try:
    b3_trace_id = get_b3_trace_id()
    params_encoded = api_endpoint + '?' + urllib.parse.urlencode(params)
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'origin': 'https://www.xiaohongshu.com',
        'priority': 'u=1, i',
        'referer': 'https://www.xiaohongshu.com/',
        'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        "x-b3-traceid": b3_trace_id,
        "x_xray_traceid": hashlib.md5(b3_trace_id.encode('utf-8')).hexdigest()
    }
    # Sign the request with the site's JS signing routine (loaded from 1.js)
    with open('1.js', 'r', encoding='utf-8') as f:
        js_script = f.read()
    context = execjs.compile(js_script)
    sign = context.call('getXs', params_encoded, '', current_cookies['a1'])
    headers['x-s'] = sign['X-s']
    headers['x-t'] = str(sign['X-t'])
    headers['X-s-common'] = sign['X-s-common']
    response = requests.get(url, headers=headers, cookies=current_cookies, params=params)
    data = response.json()
except Exception as e:
    print(f'Failed to fetch first-level comments: {e}')
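get_b3_trace_id is referenced above but not shown in the article. A common approach is to generate a random 16-character hex string for the x-b3-traceid header; the helper below is a minimal sketch under that assumption, not the original implementation.

import random

def get_b3_trace_id():
    # Random 16-character lowercase hex string used as the x-b3-traceid value (assumed format)
    chars = 'abcdef0123456789'
    return ''.join(random.choice(chars) for _ in range(16))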
Inspecting the response shows that each request returns only 10 comments, along with a cursor and a has_more flag. To fetch all of the comments, we simply keep feeding the returned cursor back into params until has_more is False.
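A minimal pagination loop built on that observation might look like the following sketch. get_comments is a hypothetical wrapper around the signed request shown above that returns the decoded JSON; parse_comments is defined in the next section.

import random
import time

cursor = ''
has_more = True
all_comments = []
while has_more:
    data = get_comments(note_id, cursor, xsec_token, current_cookies)  # hypothetical wrapper around the request above
    all_comments.extend(parse_comments(data, current_cookies))
    payload = data.get('data', {})
    has_more = payload.get('has_more', False)
    cursor = payload.get('cursor', '')   # feed the returned cursor into the next request
    time.sleep(random.uniform(1, 3))     # small random delay between pages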
Parsing first-level comments:
Here we flatten the returned JSON into plain Python dictionaries, which can later be loaded into pandas for analysis. The extracted fields include:
note id; comment id, time, content, like count; commenter nickname, avatar, IP location and user id; and the number of sub-comments.
import datetime

def parse_comments(data, sub_cookies):
    parsed_comments = []
    # Validate the response structure before parsing
    if not data or 'data' not in data or 'comments' not in data['data']:
        print("Invalid data structure")
        return parsed_comments
    comments = data['data']['comments']
    for comment in comments:
        # Basic comment fields
        note_id = comment.get('note_id', '')
        comment_id = comment.get('id', '')
        create_time_ms = comment.get('create_time')
        comment_time = (
            datetime.datetime.fromtimestamp(create_time_ms / 1000).strftime('%Y-%m-%d %H:%M:%S')
            if create_time_ms
            else ''
        )
        comment_content = comment.get('content', '')
        # Commenter information
        user_info = comment.get('user_info', {})
        commenter_nickname = user_info.get('nickname', '')
        commenter_avatar = user_info.get('image', '')
        commenter_id = user_info.get('user_id', '')
        commenter_ip = comment.get('ip_location', '')
        # Like count and status
        like_count = comment.get('like_count', '')
        status = comment.get('status', '')
        # Sub-comment information
        sub_comment_count = comment.get('sub_comment_count', 0)
        sub_comment_cursor = comment.get('sub_comment_cursor', '')
        sub_comment_has_more = comment.get('sub_comment_has_more', False)
        # Assemble the parsed record
        parsed_comment = {
            'note_id': note_id,
            'comment_id': comment_id,
            'comment_time': comment_time,
            'comment_content': comment_content,
            'commenter_nickname': commenter_nickname,
            'commenter_avatar': commenter_avatar,
            'commenter_ip': commenter_ip,
            'commenter_id': commenter_id,
            'like_count': like_count,
            'status': status,
            'parent_comment_id': '',  # first-level comments have no parent comment ID
            'comment_level': 1,
            'sub_comment_count': sub_comment_count,
            'sub_comment_cursor': sub_comment_cursor,
            'sub_comment_has_more': sub_comment_has_more
        }
        parsed_comments.append(parsed_comment)
    return parsed_comments
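Since the preface mentions downstream analysis, the parsed list of dictionaries can be turned straight into a pandas DataFrame. A minimal sketch, assuming data holds the JSON response from the request above (sub_cookies is not used during parsing):

import pandas as pd

parsed = parse_comments(data, sub_cookies=None)
df = pd.DataFrame(parsed)
print(df[['comment_id', 'commenter_nickname', 'comment_content', 'like_count']].head())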
Fetching second-level comments
The sub-comments endpoint takes three parameters: the note id, the (first-level) comment id, and a cursor.
if sub_comment_has_more:
    logger.info(f'Fetching sub-comments of comment {comment_id}')
    sub_comment_num = 0
    cursor = ''
    has_more = True
    while has_more:
        sub_data, cursor = get_sub_comment(note_id, comment_id, cursor, sub_cookies)
        parse_sub_data = parse_sub_comments(sub_data)
        save_sub_data_to_csv(parse_sub_data)
        has_more = sub_data['data']['has_more']
        sub_comment_num += len(parse_sub_data)  # count what was actually parsed on this page
        logger.info(f'Fetched {sub_comment_num} sub-comments for comment {comment_id}')
To avoid unnecessary requests, we use the sub_comment_has_more flag returned with each first-level comment to decide whether it has any sub-comments at all; as before, every page is fetched by repeatedly updating the cursor.
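get_sub_comment is not shown in the article. The sketch below is one way it might look, reusing the same signing approach as the first-level request; the endpoint path, the num parameter, and the get_signed_headers helper are assumptions, not the original code.

def get_sub_comment(note_id, root_comment_id, cursor, cookies):
    # Assumed sub-comments endpoint path; verify against your browser's network panel
    api_path = '/api/sns/web/v2/comment/sub/page'
    params = {
        'note_id': note_id,
        'root_comment_id': root_comment_id,  # the first-level comment whose replies we want
        'num': '10',                         # assumed page size parameter
        'cursor': cursor,
        'image_formats': 'jpg,webp,avif',
    }
    # Hypothetical wrapper around the x-s signing shown in the first-level request
    headers = get_signed_headers(api_path, params, cookies)
    response = requests.get('https://edith.xiaohongshu.com' + api_path,
                            headers=headers, cookies=cookies, params=params)
    data = response.json()
    next_cursor = data.get('data', {}).get('cursor', '')
    return data, next_cursor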
Parsing second-level comments
import datetime
from typing import Any, Dict, List

def parse_sub_comments(sub_comment_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    parsed_sub_comments = []
    # Validate the response structure before parsing
    if not sub_comment_data or 'data' not in sub_comment_data or 'comments' not in sub_comment_data['data']:
        print("Invalid data structure")
        return parsed_sub_comments
    comments = sub_comment_data['data']['comments']
    for comment in comments:
        # Basic comment fields
        comment_id = comment.get('id', '')
        note_id = comment.get('note_id', '')
        create_time_ms = comment.get('create_time')
        comment_time = (
            datetime.datetime.fromtimestamp(create_time_ms / 1000).strftime('%Y-%m-%d %H:%M:%S')
            if create_time_ms
            else ''
        )
        comment_content = comment.get('content', '')
        ip_location = comment.get('ip_location', '')
        like_count = comment.get('like_count', '')
        status = comment.get('status', '')
        liked = comment.get('liked', False)
        # Commenter information
        user_info = comment.get('user_info', {})
        commenter_nickname = user_info.get('nickname', '')
        commenter_avatar = user_info.get('image', '')
        commenter_id = user_info.get('user_id', '')
        # The comment this reply is targeting
        target_comment = comment.get('target_comment', {})
        target_comment_id = target_comment.get('id', '')
        target_user_info = target_comment.get('user_info', {})
        target_commenter_nickname = target_user_info.get('nickname', '')
        target_commenter_avatar = target_user_info.get('image', '')
        target_commenter_id = target_user_info.get('user_id', '')
        # Assemble the parsed record
        parsed_sub_comment = {
            'comment_id': comment_id,
            'note_id': note_id,
            'comment_time': comment_time,
            'comment_content': comment_content,
            'ip_location': ip_location,
            'like_count': like_count,
            'status': status,
            'liked': liked,
            'commenter_nickname': commenter_nickname,
            'commenter_avatar': commenter_avatar,
            'commenter_id': commenter_id,
            'target_comment_id': target_comment_id,
            'target_commenter_nickname': target_commenter_nickname,
            'target_commenter_avatar': target_commenter_avatar,
            'target_commenter_id': target_commenter_id
        }
        parsed_sub_comments.append(parsed_sub_comment)
    return parsed_sub_comments
On top of the first-level fields, a second-level comment adds information about the comment it replies to; target_comment_id points to that comment. (A second-level comment points to a first-level comment; a third-level comment points to the second-level comment it replies to; fourth-level comments and deeper follow the same pattern.)
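Those fields are enough to reconstruct the reply structure after the fact. A small sketch, assuming all_sub is a list of records produced by parse_sub_comments:

from collections import defaultdict

replies_by_target = defaultdict(list)
for sub in all_sub:
    # Group each reply under the comment it targets
    replies_by_target[sub['target_comment_id']].append(sub)

# Replies whose target is a first-level comment id are second-level comments;
# replies whose target is another sub-comment id are third-level or deeper.
for target_id, replies in replies_by_target.items():
    print(target_id, len(replies))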
Saving comments
import csv
import os
from typing import Any, Dict, List

def save_sub_data_to_csv(data: List[Dict[str, Any]], filename: str = 'sub_comments_2025_01_22.csv'):
    if not data:
        print("No data to save")
        return
    # CSV header columns (must match the keys produced by parse_sub_comments)
    fieldnames = [
        'comment_id', 'note_id', 'comment_time', 'comment_content', 'ip_location',
        'like_count', 'status', 'liked', 'commenter_nickname', 'commenter_avatar',
        'commenter_id', 'target_comment_id', 'target_commenter_nickname',
        'target_commenter_avatar', 'target_commenter_id'
    ]
    # Check whether the file already exists so the header is only written once
    file_exists = os.path.isfile(filename)
    # Append rows to the CSV file
    with open(filename, mode='a', newline='', encoding='utf-8-sig') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        # Write the header if the file is new
        if not file_exists:
            writer.writeheader()
        # Write the data rows
        writer.writerows(data)
    print(f"Data saved to {filename}")