python抓取微博评论简单示例
使用python中的requests、re以及pandas库对人民日报的一篇新冠疫苗文章前30页评论进行抓取。抓取微博评论使用的是微博的移动端网页。具体代码如下：
import requests
import re
import time
import pandas as pd
# Scrape the first 30 pages of comments on one Weibo post via the mobile-web
# "hotflow" endpoint and save them to a CSV file.
#
# DataFrame indexed by commenter screen name, with a single comment-text
# column.  NOTE: because the screen name is the index, a user who comments
# more than once keeps only their most recent comment.
data = pd.DataFrame(columns=['用户评论'])

url = 'https://m.weibo.cn/comments/hotflow?'
header = {
    # Referer + cookie copied from a logged-in mobile-web session.  The
    # cookie expires quickly — refresh it before each run or requests will
    # return an error payload instead of comment data.
    'Referer': 'https://m.weibo.cn/status/K9nlrqOa7?from=page_1002062803301701_profile&wvr=6&mod=weibotime',
    'cookie': 'WEIBOCN_FROM=1110006030; SUB=_2A25NaHmIDeRhGeNI7FEZ9ybMzz-IHXVukwfArDV6PUJbkdAfLRTBkW1NSCLugpy2B0l2GHcHM-YPQW7Aaxu7g6jc; _T_WM=62217879831; MLOGIN=1; XSRF-TOKEN=2f83ea; M_WEIBOCN_PARAMS=from%3Dpage_1002062803301701_profile%26oid%3D4622346936389355%26luicode%3D20000061%26lfid%3D4622346936389355%26uicode%3D20000061%26fid%3D4622346936389355',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Mobile Safari/537.36'
}

# Strip emoji/link markup (<span>…</span> and <a>…/a>) from the comment HTML.
# Compiled once here instead of being rebuilt on every page fetch.
regex = re.compile(r'<span.*?</span>|<a.*?/a>', re.S)

# Query parameters for the first request (page 1 carries no max_id cursor).
param = {
    'id': '4622346936389355',
    'mid': '4622346936389355',
    'max_id_type': '0'
}

# Fetch up to the first 30 pages of comments.
for n in range(30):
    resp = requests.get(url, headers=header, params=param)
    try:
        resp.encoding = 'utf-8'
        dic = resp.json()
    finally:
        # Close every response — the original only closed the last one,
        # leaking the connection for the other 29 requests.
        resp.close()

    # Pagination cursor for the next request, returned by the API.
    max_id = dic['data']['max_id']
    max_id_type = dic['data']['max_id_type']
    page = dic['data']['data']

    # One row per commenter: index = screen name, value = cleaned text.
    for item in page:
        data.loc[item['user']['screen_name']] = regex.sub('', item['text'])

    print(f'第{n}页')

    # max_id == 0 signals the final page; re-requesting with it would just
    # repeat/fail, so stop early.
    if max_id == 0:
        break

    param = {
        'id': '4622346936389355',
        'mid': '4622346936389355',
        'max_id': str(max_id),
        'max_id_type': str(max_id_type)
    }
    time.sleep(3)  # be polite: pause between requests

data.to_csv('新冠疫苗接种评论.csv')  # persist the collected comments