import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import random
def fetchUrl(pid, uid, max_id):  # request one page of level-1 comments
    time.sleep(random.random() * 3)  # random delay to avoid hammering the server
    url = "https://weibo.com/ajax/statuses/buildComments"
    headers = {
        'Cookie': 'your own cookie',  # fill in your own Cookie
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',  # br/zstd dropped so requests can always decode the response
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
    }
    params = {
        "is_reload": 1,         # fixed
        "id": pid,              # numeric id of the Weibo post
        "is_show_bulletin": 1,  # fixed
        "is_mix": 0,            # 0 for level-1 comments, 1 for level-2 comments
        "max_id": max_id,       # 0 for the first page, afterwards taken from the previous response
        "count": 20,
        "uid": uid,             # numeric id of the post's author, fixed
        "fetch_level": 0,       # fixed
        "locale": "zh-CN"
    }
    r = requests.get(url, headers=headers, params=params)
    return r.json()
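# Usage sketch (assumed response shape, inferred from how parseJson reads it): the JSON
# should contain "data" (the list of comments on this page) and "max_id" (the cursor for
# the next page). Start with max_id=0 and feed the returned max_id back in until it is 0:
#   page = fetchUrl(pid, uid, 0)
#   comments, next_max_id = page["data"], page["max_id"]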
def fetchLevel2(cid, uid, cmax_id):  # request one page of level-2 (reply) comments
    time.sleep(random.random() * 3)  # random delay
    curl = "https://weibo.com/ajax/statuses/buildComments"
    headers = {
        'Cookie': 'your own cookie',  # fill in your own Cookie
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',  # br/zstd dropped so requests can always decode the response
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
    }
    params = {
        "flow": 1,              # fixed
        "is_reload": 1,         # fixed
        "id": cid,              # id of the level-1 comment whose replies are requested
        "is_show_bulletin": 2,  # fixed
        "is_mix": 1,            # 0 for level-1 comments, 1 for level-2 comments
        "fetch_level": 1,       # 0 for level-1 comments, 1 for level-2 comments
        "max_id": cmax_id,      # 0 for the first page, afterwards taken from the previous response
        "count": 20,
        "uid": uid,             # numeric id of the post's author, fixed
    }
    c = requests.get(curl, headers=headers, params=params)
    return c.json()
i = 1          # global counter of parsed level-1 comments
allData = []   # temporary buffer for parsed rows before they are written to csv
def parseJson(jsonObj):  # parse one page of level-1 comments
    data = jsonObj["data"]
    max_id = jsonObj["max_id"]
    global i
    for item in data:
        # print(i)
        i = i + 1
        # replied-to person: for level-1 comments this is the author of the original post
        OneRespondent = Respondent(rid)  # rid is the post's mblogid, set in __main__
        # id of this level-1 comment, needed later to fetch its level-2 replies
        comment_Id = item["id"]
        # comment text (strip the HTML markup)
        content = BeautifulSoup(item["text"], "html.parser").text
        # time the comment was posted
        created_at = item["created_at"]
        # number of likes
        like_counts = item["like_counts"]
        # commenter info
        user = item["user"]
        userID = user["id"]
        userName = user["screen_name"]
        usergender = user["gender"]  # gender
        if usergender == 'm':
            usergender = "男"
        elif usergender == "f":
            usergender = "女"
        userfollowers_count = user["followers_count"]  # follower count
        userfriends_count = user["friends_count"]      # following count
        userlocation = user["location"]
        userSource = item.get("source")  # location the comment was posted from (may be missing)
        dataItem = ["1", created_at, userID, userName,
                    usergender, userfollowers_count, userfriends_count,
                    like_counts, content, userSource, userlocation,
                    OneRespondent]  # replied-to person of the level-1 comment
        # column order: comment level, time posted, user id, user nickname,
        # user gender, follower count, following count,
        # like count, comment text, posted-from location, home location,
        # replied-to person
        # print(dataItem)
        allData.append(dataItem)
        # nested loop: fetch the level-2 replies of this comment
        cid = comment_Id
        cmax_id = 0
        while True:
            chtml = fetchLevel2(cid, uid, cmax_id)  # request one page of level-2 comments
            cmax_id = parseLevel2(chtml)            # parse them and get the next cmax_id
            # a cmax_id of 0 means there are no more replies
            if cmax_id == 0:
                break
    return max_id
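# Note: Respondent(rid) is called once per level-1 comment although rid never changes
# during a run, so every call returns the same post author's name; calling it once in
# __main__ and reusing the result would save one HTTP request per comment.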
def parseLevel2(jsonObj):  # parse one page of level-2 comments
    cdata = jsonObj["data"]
    cmax_id = jsonObj["max_id"]
    for item in cdata:
        # replied-to person: the screen_name inside reply_comment; the field may be absent
        # when the reply is addressed to the root comment directly, so fall back to ""
        reply_comment = item.get("reply_comment")
        TwoRespondent = reply_comment["user"]["screen_name"] if reply_comment else ""
        # level-2 comment text (strip the HTML markup)
        ctext = BeautifulSoup(item["text"], "html.parser").text
        # time the comment was posted
        ccreated_at = item["created_at"]
        # number of likes
        clike_counts = item["like_counts"]
        # commenter info
        cuser = item["user"]
        cuserID = cuser["id"]
        cuserName = cuser["screen_name"]
        cuserSource = item.get("source")  # posted-from location (may be missing)
        cuserlocation = cuser["location"]
        cusergender = cuser["gender"]  # gender
        if cusergender == 'm':
            cusergender = "男"
        elif cusergender == "f":
            cusergender = "女"
        cuserfollowers_count = cuser["followers_count"]  # follower count
        cuserfriends_count = cuser["friends_count"]      # following count
        cdataItem = ['2', ccreated_at, cuserID, cuserName,
                     cusergender, cuserfollowers_count, cuserfriends_count,
                     clike_counts, ctext, cuserSource, cuserlocation,
                     TwoRespondent]
        # same column order as the level-1 rows
        # print(cdataItem)
        allData.append(cdataItem)
    return cmax_id
def Respondent(rid):  # look up the original post's author; used as the replied-to name for level-1 comments
    time.sleep(random.random() * 3)  # random delay
    rurl = 'https://weibo.com/ajax/statuses/show'  # endpoint that returns the post's details
    headers = {
        'Cookie': 'your own cookie',  # fill in your own Cookie
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',  # br/zstd dropped so requests can always decode the response
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
    }
    params = {
        'id': rid,  # mblogid of the post (the string id in its URL)
        "locale": "zh-CN",
    }
    r = requests.get(rurl, headers=headers, params=params).json()
    user = r["user"]
    user_name = user['screen_name']  # the post author's nickname
    return user_name
def save_data(data, path, filename):
    global allData
    # print("start saving")
    if not os.path.exists(path):
        os.makedirs(path)
    dataframe = pd.DataFrame(data)
    # append to the csv; os.path.join supplies the path separator missing from plain path + filename
    dataframe.to_csv(os.path.join(path, filename), encoding='utf_8_sig', mode='a', index=False, sep=',', header=False)
    allData = []  # clear allData after each write
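# Note: save_data appends (mode='a', header=False), so the Chinese header row is written
# once in __main__ and every later call only adds the rows buffered in allData, which is
# cleared after each write.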
if __name__ == "__main__":
    pid = 4722701329501610  # numeric id of the Weibo post, fixed
    uid = 6593199887        # numeric id of the post's author, fixed
    max_id = 0
    rid = 'L9wz2DRNM'       # mblogid of the post (the string id in its URL), used by Respondent
    path = "F:\\shiyan\\weibo"
    # directory to save into
    filename = "微博(神女批观).csv"  # name of the output file
    csvHeader = [["评论级别", "发布时间", "用户id", "用户昵称",
                  '用户性别', '用户粉丝数', '用户关注数', "评论点赞数",
                  "评论内容", "发消息时所在地", "居住地",
                  "回复人"]]  # first row of the csv file
    save_data(csvHeader, path, filename)  # write the header row to the csv file
    while True:
        html = fetchUrl(pid, uid, max_id)   # request one page of level-1 comments
        max_id = parseJson(html)            # parse it and get the next max_id
        save_data(allData, path, filename)  # write this batch (typically after every ~20 level-1 comments)
        # a max_id of 0 means the crawl is finished; saving first keeps the last batch
        if max_id == 0:
            break
Usage guide (for the __main__ block):
1. Change path and filename to where the csv should be saved.
2. For the target Weibo post, find its pid, the author's uid, and the mblogid (rid); one way to look up pid and uid from the mblogid is sketched below.
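A minimal sketch of resolving pid and uid from the post's mblogid (the string id in the post URL, e.g. 'L9wz2DRNM'). It reuses the https://weibo.com/ajax/statuses/show endpoint that Respondent already calls; that its response carries the numeric post id under "id" and the author's id under "user"["id"] is an assumption and should be checked against a real response.

import requests

def resolve_ids(mblogid, cookie):  # sketch only; the response field names are assumptions
    headers = {
        'Cookie': cookie,  # your own cookie
        'user-agent': 'Mozilla/5.0',
    }
    r = requests.get('https://weibo.com/ajax/statuses/show',
                     headers=headers, params={'id': mblogid, 'locale': 'zh-CN'}).json()
    return r.get('id'), r.get('user', {}).get('id')  # (pid, uid), assumed field names

# example: pid, uid = resolve_ids('L9wz2DRNM', 'your own cookie')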