python爬取文章分析_python分析百家号文章评论并进行爬取

最新推荐文章于 2023-06-20 15:22:46 发布

weixin_39936134

最新推荐文章于 2023-06-20 15:22:46 发布

阅读量572

点赞数 2

文章标签： python爬取文章分析

本文链接：https://blog.csdn.net/weixin_39936134/article/details/113672452

版权

本文演示了如何使用Python爬虫获取并分析百家号文章的评论数据。通过设置请求头、提取cookie，逐页抓取评论及回复内容，包括评论者的用户名、点赞数和评论内容，以及回复的用户名、内容和点赞数。最终，将数据保存到data.json文件中。

摘要由CSDN通过智能技术生成

[Python] 纯文本查看 复制代码

import requests

import time

import re

import json

import math

# Request headers sent with every HTTP call made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Connection": "close",
}

# Fetch a Baidu search page first so its cookies can be reused by the
# later comment-API requests.
# The URL is assembled from adjacent string literals instead of a
# backslash-newline inside one literal: with a backslash continuation any
# indentation on the following lines silently becomes part of the URL.
# A timeout is passed because requests waits indefinitely by default, which
# would hang the whole script on a stalled connection.
res = requests.get(
    "https://www.baidu.com/s?cl=3&tn=baidutop10&fr=top1000&wd=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5%3A%"
    "E6%89%AB%E7%A0%81%E7%82%B9%E9%A4%90%E4%B8%8D%E8%AF%A5%E6%98%AF%E5%94%AF%E4%B8%80%E9%80%89%E6%8B%A9&rsv_idx=2&rsv"
    "_dl=fyb_n_homepage&hisfilter=1",
    headers=headers,
    timeout=10,
)

# Convert the response cookies into a plain dict.
cookie = res.cookies.get_dict()

# NOTE(review): data_dict is not used in the visible part of the script;
# presumably it collects the scraped results for output later - confirm.
data_dict = {}

# Running number of top-level comments processed so far.
count = 0

# Zero-based page index; the comment-list request offset is i * 20.
i = 0

while True:

time.sleep(3)

#生成13位时间戳

current_milli_time = lambda: int(round(time.time() * 1000))

now_time = current_milli_time()

#完善协议头

headers.update({'Accept': '*/*',

'Accept-Encoding': 'gzip, deflate, br',

'Accept-Language': 'zh-CN,zh;q=0.9',

#设置来路的，不设置这个无法返回评论数据

'Referer': 'https://baijiahao.baidu.com/s?id=1690446693112251324&wfr=spider&for=pc'})

#根据自己的url改变其中的变动参数

url = f"https://ext.baidu.com/api/comment/v2/comment/list?thread_id=1004000038741948&reply_id=&start={i * 20}&num=20&appid=22862035&order=12&inner_order=9&use_list=1&callback=_boxjsonpcd1d7651&use_uk=1&ts={now_time}"

i += 1

res = requests.get(url, headers=headers, cookies=cookie)

#替换返回内容包含\/的链接

new = res.text.replace(r"\/", "/")

#使用正则提取出来

full_json = "{" + re.search(r'_boxjsonpcd1d7651\(\{(.*?)\}\)', new).group(1) + "}"

#load已经完整的json

new = json.loads(full_json)

#在无法提取数据的时候报错跳出死循环

try:

data = new['ret']['list']

except Exception as e:

print(e)

break

for da in data:

#count计数几楼

count += 1

#提取具体内容，可自己决定提取什么内容

uname = da['uname']

like_count = da['like_count']

text = da['content']

reply_count = da['reply_count']

print(f"评论者：{uname}\n评论的内容：{text}\n喜欢数:{like_count}\n")

reply_list = []

if reply_count == "0":

print("该评论无回复数")

else:

reply_id = da['reply_id']

#每页10个，用全部回复数除以10，然后进一整数方式估算出页数，当然这里可以使用死循环

page = math.ceil(int(reply_count) / 10)

for n in range(page):

time.sleep(3)

current_milli_time = lambda: int(round(time.time() * 1000))

now_time = current_milli_time()

#提取评论中的回复数据，返回的方式和提取跟上面一样

new_url = f"https://ext.baidu.com/api/comment/v2/comment/detail?thread_id=1004000038741948&reply_id={reply_id}&start={n * 10}&num=10&appid=22862035&order=9&use_list=0&callback=_boxjsonp370b5194&use_uk=1&ts={now_time}"

res1 = requests.get(new_url, headers=headers, cookies=cookie)

new1 = res1.text.replace(r"\/", "/")

full_json = "{" + re.search(r'_boxjsonp370b5194\(\{(.*?)\}\)', new1).group(1) + "}"

new1 = json.loads(full_json)

#以防估算出错，用try来避免

try:

new_data = new1['ret']['list']

except Exception as e:

print(e)

break

for rp in new_data:

f_uname = rp['uname']