import requests
import time
import re
import json
import math
# Scrape the comments (and the replies to each comment) of one Baidu
# Baijiahao article through the JSONP comment API and save them to data.json.

# Base request headers; used unchanged for the cookie-seeding request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Connection": "close",
}

# Extra headers sent to the comment API.  The Referer is required: without
# it the API does not return comment data.
API_EXTRA_HEADERS = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://baijiahao.baidu.com/s?id=1690446693112251324&wfr=spider&for=pc",
}

# Page requested first, only to obtain cookies for the API calls.
COOKIE_URL = (
    "https://www.baidu.com/s?cl=3&tn=baidutop10&fr=top1000"
    "&wd=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5%3A%"
    "E6%89%AB%E7%A0%81%E7%82%B9%E9%A4%90%E4%B8%8D%E8%AF%A5%E6%98%AF"
    "%E5%94%AF%E4%B8%80%E9%80%89%E6%8B%A9&rsv_idx=2&rsv"
    "_dl=fyb_n_homepage&hisfilter=1"
)

# Values that change per article/URL; adjust these for a different thread.
THREAD_ID = "1004000038741948"
APP_ID = "22862035"
LIST_CALLBACK = "_boxjsonpcd1d7651"
DETAIL_CALLBACK = "_boxjsonp370b5194"

COMMENTS_PER_PAGE = 20
REPLIES_PER_PAGE = 10
REQUEST_DELAY = 3     # seconds slept before every API request
REQUEST_TIMEOUT = 10  # seconds before a request is abandoned instead of hanging
OUTPUT_PATH = "data.json"


def current_milli_time():
    """Return the current Unix time as a 13-digit millisecond timestamp."""
    return int(round(time.time() * 1000))


def comment_list_url(start, ts=None):
    """Build the URL of one page of top-level comments.

    start: offset of the first comment on the page.
    ts: millisecond timestamp to embed; defaults to the current time.
    """
    if ts is None:
        ts = current_milli_time()
    return (
        "https://ext.baidu.com/api/comment/v2/comment/list"
        f"?thread_id={THREAD_ID}&reply_id=&start={start}&num={COMMENTS_PER_PAGE}"
        f"&appid={APP_ID}&order=12&inner_order=9&use_list=1"
        f"&callback={LIST_CALLBACK}&use_uk=1&ts={ts}"
    )


def reply_detail_url(reply_id, start, ts=None):
    """Build the URL of one page of replies to the comment ``reply_id``.

    start: offset of the first reply on the page.
    ts: millisecond timestamp to embed; defaults to the current time.
    """
    if ts is None:
        ts = current_milli_time()
    return (
        "https://ext.baidu.com/api/comment/v2/comment/detail"
        f"?thread_id={THREAD_ID}&reply_id={reply_id}&start={start}&num={REPLIES_PER_PAGE}"
        f"&appid={APP_ID}&order=9&use_list=0"
        f"&callback={DETAIL_CALLBACK}&use_uk=1&ts={ts}"
    )


def parse_jsonp(text):
    """Decode the JSON payload of a JSONP response ``callback(<json>)``.

    The payload is everything between the first "(" and the last ")", so
    comment text that itself contains "})" or newlines cannot truncate it.
    Escaped slashes ("\\/") are left to the JSON decoder, which already
    understands them.

    Raises ValueError when the text is not JSONP or the payload is not
    valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end <= start:
        raise ValueError("response is not a JSONP payload")
    return json.loads(text[start + 1:end])


def reply_page_count(reply_count, page_size=REPLIES_PER_PAGE):
    """Estimate how many reply pages a comment has.

    reply_count may be an int or a numeric string; anything that cannot be
    read as a number counts as "no replies".
    """
    try:
        total = int(reply_count)
    except (TypeError, ValueError):
        return 0
    return max(0, math.ceil(total / page_size))


def fetch_list(url, request_headers, cookies):
    """Request one API page and return its ``ret.list`` entries.

    Returns an empty list when the request fails, the response cannot be
    parsed, or the list is missing or empty.  The error is printed rather
    than raised so the caller can stop paging and still save what it has.
    """
    try:
        response = requests.get(
            url, headers=request_headers, cookies=cookies, timeout=REQUEST_TIMEOUT
        )
        return parse_jsonp(response.text)["ret"]["list"] or []
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        print(exc)
        return []


def collect_replies(reply_id, reply_count, request_headers, cookies):
    """Return every reply to one comment as a list of dicts."""
    replies = []
    for page in range(reply_page_count(reply_count)):
        time.sleep(REQUEST_DELAY)
        batch = fetch_list(
            reply_detail_url(reply_id, page * REPLIES_PER_PAGE),
            request_headers,
            cookies,
        )
        # The page count is only an estimate; stop as soon as a page is empty.
        if not batch:
            break
        replies.extend(
            {
                "replier": rp["uname"],
                "reply_content": rp["content"],
                "reply_likes": rp["like_count"],
            }
            for rp in batch
        )
    return replies


def crawl_comments(request_headers, cookies):
    """Page through all comments and return them keyed by ``floor_<n>``."""
    data_dict = {}
    count = 0  # running "floor" number across all pages
    page = 0
    while True:
        time.sleep(REQUEST_DELAY)
        data = fetch_list(
            comment_list_url(page * COMMENTS_PER_PAGE), request_headers, cookies
        )
        page += 1
        # An empty or missing list means there is nothing left to fetch.
        if not data:
            break
        for da in data:
            count += 1
            # Fields kept per comment; extend here to extract more.
            uname = da["uname"]
            like_count = da["like_count"]
            text = da["content"]
            reply_count = da["reply_count"]
            print(f"评论者:{uname}\n评论的内容:{text}\n喜欢数:{like_count}\n")
            if reply_page_count(reply_count) == 0:
                print("该评论无回复数")
                reply_list = []
            else:
                reply_list = collect_replies(
                    da["reply_id"], reply_count, request_headers, cookies
                )
            data_dict[f"floor_{count}"] = {
                "commenter": uname,
                "content": text,
                "like_count": like_count,
                "reply_list": reply_list,
            }
    return data_dict


def main():
    """Seed cookies, crawl all comments, then print and save the result."""
    seed = requests.get(COOKIE_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    cookie = seed.cookies.get_dict()
    data_dict = crawl_comments({**headers, **API_EXTRA_HEADERS}, cookie)
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether this print ran per page or once; it runs once after the crawl.
    print(data_dict)
    # ensure_ascii=False keeps the Chinese text readable in the UTF-8 file.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(data_dict, f, ensure_ascii=False)


if __name__ == "__main__":
    main()