Python 爬虫之WB评论爬取
在本篇博客中,我们将分享如何使用 Python 爬取微博评论。
准备爬取的链接:https://m.weibo.cn/detail/4899535271232791
mid:4899535271232791
如果需要爬取多篇,在mid列表中添加这个值即可。
准备工作
在开始之前,我们需要安装以下 Python 库:
- requests
- json
- csv
其中 json 和 csv 是 Python 标准库,随 Python 一起安装,无需额外安装。只需用以下命令安装 requests:
pip install requests
获取网页源码
首先,我们需要获取网页源码。我们使用 requests
库发送请求,然后使用 response.text
获取文本格式的响应。Cookie需要替换成自己的,参考链接。
import requests
def get_html(url):
    """Fetch *url* with a mobile browser User-Agent and return the body as text.

    The Cookie value must be replaced with your own (see the referenced link);
    the response encoding is set from the detected (apparent) encoding before
    decoding so Chinese text is not garbled.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Mobile Safari/537.36 Edg/113.0.1774.35"
    }
    request_cookies = {
        "cookie": "_T_WM=731%26lfid%3D4899535271232791%26uicode%3D20000061%26fid%3D4899535271232791"
    }
    resp = requests.get(url, headers=request_headers, cookies=request_cookies)
    resp.encoding = resp.apparent_encoding
    return resp.text
解析网页源码
接下来,我们需要从网页源码中解析出评论内容。首先,我们需要使用 json
库将响应解析为 JSON 格式。然后,我们可以使用以下代码获取评论内容:
import json
def get_first_comments(mid):
    """Fetch every first-level comment of the Weibo post *mid* and save each one.

    Pages through the hotflow endpoint using the max_id/max_id_type cursor
    returned by each response, until the API reports max_id == 0 (last page).
    """
    max_id = 0
    max_id_type = 0
    # Fixed: the URL literal was wrapped in <...> (a markdown autolink artifact
    # from the original text), which made requests receive an invalid URL.
    url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}'
    while True:
        response = get_html(url.format(mid, mid, max_id, max_id_type))
        content = json.loads(response)
        # Advance the pagination cursor before processing this page.
        max_id = content['data']['max_id']
        max_id_type = content['data']['max_id_type']
        text_list = content['data']['data']
        for text in text_list:
            save_text_data(text['text'])
        if int(max_id) == 0:  # max_id == 0 means all comments have been fetched
            break
保存评论
最后,我们需要将评论保存到 CSV 文件中。我们可以使用 csv
库将评论写入 CSV 文件。
import csv
def save_text_data(text_data):
    """Append one comment as a single-column row to data.csv (UTF-8)."""
    # newline="" lets the csv module control line endings, per the csv docs.
    with open("data.csv", "a", encoding="utf-8", newline="") as out_file:
        csv.writer(out_file).writerow([text_data])
完整代码
import json
import csv
import re
import requests
import time
NeedGetSecond = False  # Set to True to also fetch second-level (reply) comments.
# 获取网页源码的文本文件
def get_html(url):
    """Fetch *url* and return the response body as text.

    Uses a mobile browser User-Agent and a Weibo session Cookie (replace with
    your own). Sleeps 3 seconds after each request to avoid anti-scraping
    blocks, and decodes the body using the detected (apparent) encoding.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Mobile Safari/537.36 Edg/113.0.1774.35"
    }
    request_cookies = {
        "cookie": "_T_WM=73032936361; WEIBOCN_FROM=1110006030; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW2KFjqOW7WkW1NSjONOgWjqQfLNw0ch4HorcwXlmNcOXZAY=|10|b3abadaf12c4b3b187a67d7b2d7d3fc7; XSRF-TOKEN=ca3cdf; mweibo_short_token=8a67338aaf; M_WEIBOCN_PARAMS=oid%3D4899535271232791%26luicode%3D20000061%26lfid%3D4899535271232791%26uicode%3D20000061%26fid%3D4899535271232791"
    }
    resp = requests.get(url, headers=request_headers, cookies=request_cookies)
    resp.encoding = resp.apparent_encoding
    time.sleep(3)  # 3-second delay to reduce the risk of being rate-limited
    return resp.text
def get_string(text):
    """Return *text* with everything between '<' and '>' (HTML tags) removed.

    A simple character scan: while inside a tag (after '<', before the next
    '>') characters are dropped; everything else is kept.
    """
    kept_chars = []
    inside_tag = False
    for ch in text:
        if ch == '<':
            inside_tag = True
        elif ch == '>':
            inside_tag = False
        elif not inside_tag:
            kept_chars.append(ch)
    return ''.join(kept_chars)
# 保存评论
def save_text_data(text_data):
    """Strip HTML tags from *text_data* and append it as one row of data.csv."""
    cleaned = get_string(text_data)
    # newline="" lets the csv module control line endings, per the csv docs.
    with open("data.csv", "a", encoding="utf-8", newline="") as out_file:
        csv.writer(out_file).writerow([cleaned])
# 获取二级评论
def get_second_comments(cid):
    """Fetch every second-level (reply) comment under first-level comment *cid*.

    Pages through the hotFlowChild endpoint using the max_id/max_id_type
    cursor until the API reports max_id == 0 (last page).
    """
    max_id = 0
    max_id_type = 0
    # Fixed: the URL literal was wrapped in <...> (a markdown autolink artifact
    # from the original text), which made requests receive an invalid URL.
    url = 'https://m.weibo.cn/comments/hotFlowChild?cid={}&max_id={}&max_id_type={}'
    while True:
        response = get_html(url.format(cid, max_id, max_id_type))
        content = json.loads(response)
        # NOTE(review): unlike hotflow, max_id here is read from the top level
        # of the payload (content['max_id'], not content['data']['max_id']),
        # matching the hotFlowChild response shape used by the original code.
        comments = content['data']
        for comment in comments:
            save_text_data(comment['text'])
        max_id = content['max_id']
        max_id_type = content['max_id_type']
        # Use int(max_id) == 0 for consistency with get_first_comments.
        if int(max_id) == 0:  # max_id == 0 means all replies have been fetched
            break
# 获取一级评论
def get_first_comments(mid):
    """Fetch every first-level comment of the Weibo post *mid* and save each one.

    Pages through the hotflow endpoint using the max_id/max_id_type cursor
    until the API reports max_id == 0 (last page). When NeedGetSecond is True
    and a comment has replies (total_number != 0), its second-level comments
    are fetched as well.
    """
    max_id = 0
    max_id_type = 0
    # Fixed: the URL literal was wrapped in <...> (a markdown autolink artifact
    # from the original text), which made requests receive an invalid URL.
    url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}'
    while True:
        response = get_html(url.format(mid, mid, max_id, max_id_type))
        content = json.loads(response)
        # Advance the pagination cursor before processing this page.
        max_id = content['data']['max_id']
        max_id_type = content['data']['max_id_type']
        text_list = content['data']['data']
        for text in text_list:
            text_data = text['text']
            total_number = text['total_number']
            # Descend into replies only when there are any and the flag is on.
            if int(total_number) != 0 and NeedGetSecond:
                get_second_comments(text['id'])
            save_text_data(text_data)
        if int(max_id) == 0:  # max_id == 0 means all comments have been fetched
            break
if __name__ == '__main__':
    """
    Example post: https://m.weibo.cn/detail/4899535271232791
    Add more mid values to the list to scrape additional posts.
    """
    mid = ["4899535271232791"]
    # Renamed the loop variable from `id`, which shadowed the builtin id().
    for post_mid in mid:
        get_first_comments(post_mid)  # scrape first-level comments
总结
本篇博客介绍了如何使用 Python 爬取微博评论。我们使用 requests
库发送请求,使用 json
库解析响应,使用 csv
库将评论保存到 CSV 文件中。