爬取某视频网站评论
效果展示
实现细节
0x01 评论来源
B站的评论需要滑动到评论区才会加载,显然是 Ajax 动态加载的,只 request 页面 HTML 是肯定爬不到评论内容的。
F12查看源代码, 点击查看网络
终于找到了!!!
https://api.bilibili.com/x/v2/reply/main?callback=jQuery17208391449976544798_1638881131792&jsonp=jsonp&next=3&type=1&oid=82123220&mode=3&plat=1&_=1638881405984
其实翻一下源码也可以看到
霸特!!!
为什么呢?
看看请求包
会不会是没带cookie
,用 hackbar
试一下
那么我们就知道了B站的评论来源
# Build the comment-API URL for page `i` of video oid=82123220 and fetch it.
# NOTE(review): assumes `i` (page cursor) and `headers` are defined earlier;
# the `callback` and `_` query params are left in, and the JSONP wrapper is
# stripped later before json.loads.
url = f"https://api.bilibili.com/x/v2/reply/main?callback=jQuery1720170271645222964_1638025631980&jsonp=jsonp&next={i}&type=1&oid=82123220&mode=3&plat=1&_=1638025655607"
html = requests.get(url, headers=headers)
其中
?
之后的内容即参数,其中的第一项callback
和最后一项参数下划线是以某种方式随机生成的记录用户行为的(我也不清楚具体原理),我们在爬取的时候需要将这两项参数去除就能正常访问json文件。
0x02 评论内容提取
reply = reply[reply.find('{', 0, len(reply)):-1]  # strip the JSONP "jQuery...(" prefix and the trailing ")"
reply = json.loads(reply)  # parse the JSON payload into a Python dict
分析一下评论json
的格式
可以看出
- 主评论存放在 ["data"]["replies"] 列表中
- 第 i 条主评论的内容存放在 ["data"]["replies"][i]["content"]["message"]
- 该评论的回复存放在 ["data"]["replies"][i]["replies"] 列表中
- 第 j 条回复的内容存放在 ["data"]["replies"][i]["replies"][j]["content"]["message"]
# Top-level comments live in data.replies; each comment may carry its own
# nested `replies` list (replies to that comment).
replies = reply["data"]["replies"]
if not replies:
    break  # an empty page means all comments have been fetched
# Top-level comments
for reply in replies:
    reply_content = reply["content"]["message"]
    reply_file.write(reply_content)
    insert_db(reply_content)
    print(reply_content)
    length += 1
    try:
        # Replies to this top-level comment
        for reply_reply in reply["replies"]:
            reply_reply_content = reply_reply["content"]["message"]
            reply_file.write(reply_reply_content)
            insert_db(reply_reply_content)
            print(" ", reply_reply_content)
            length += 1
    except (TypeError, json.decoder.JSONDecodeError):
        # reply["replies"] is None when the comment has no replies -- TODO confirm
        pass
0x03 插入数据库
数据库结构
def insert_db(reply_content):
    """Insert a single comment string into the `replies` table.

    Swallows ``pymysql.err.DataError`` (e.g. characters the table's charset
    cannot store) so one bad comment does not abort the whole crawl.
    """
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                               database="bilibilireply", charset='utf8')  # connect per call
        cursor = conn.cursor()
        insert_news_sql = "INSERT INTO `replies` (`reply`) VALUES(%s);"
        # Parameterized query: pymysql escapes the value, preventing SQL injection.
        cursor.execute(insert_news_sql, (reply_content,))
        conn.commit()
    except pymysql.err.DataError:  # illegal characters that cannot be inserted
        pass
    finally:
        # Bug fix: if connect() raised, `cursor`/`conn` were previously unbound
        # and the finally block itself raised NameError. Guard before closing.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
0x04 词云
# Build a word cloud image from the crawled comments.
# Bug fix: `with` guarantees the file is closed; and removing items from
# `ls` while iterating it skipped elements (consecutive stop words
# survived the filter) -- build a new filtered list instead.
with open("replies.txt", "r", encoding="utf-8") as reply_file:
    text = reply_file.read()
ls = jieba.lcut(text)  # Chinese word segmentation
excludes = {"的", "我", "了", "你", "是", "呀", "或者", "在"}  # common stop words
ls = [word for word in ls if word not in excludes]
txt = " ".join(ls)
w = wordcloud.WordCloud(font_path="msyh.ttc", width=1000, height=700,
                        background_color="white", max_words=300)
w.generate(txt)
w.to_file("cloud.png")
源码:
import requests
import json
import pymysql
import wordcloud
import jieba
def insert_db(reply_content):
    """Insert a single comment string into the `replies` table.

    Swallows ``pymysql.err.DataError`` (e.g. characters the table's charset
    cannot store) so one bad comment does not abort the whole crawl.
    """
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                               database="bilibilireply", charset='utf8')  # connect per call
        cursor = conn.cursor()
        insert_news_sql = "INSERT INTO `replies` (`reply`) VALUES(%s);"
        # Parameterized query: pymysql escapes the value, preventing SQL injection.
        cursor.execute(insert_news_sql, (reply_content,))
        conn.commit()
    except pymysql.err.DataError:  # illegal characters that cannot be inserted
        pass
    finally:
        # Bug fix: if connect() raised, `cursor`/`conn` were previously unbound
        # and the finally block itself raised NameError. Guard before closing.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
# Request headers for api.bilibili.com.
# NOTE(review): the article suggests the API rejects requests without a
# session Cookie -- paste a real logged-in cookie below (redacted here).
headers = {
    "Host": "api.bilibili.com",
    "Connection": "close",
    "Cache-Control": "max-age=0",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.34",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Referer": "https://api.bilibili.com/x/v2/reply/main?callback=jQuery17209005301603174096_1638023424899&jsonp=jsonp&next=0&type=1&oid=82123220&mode=3&plat=1&_=1638023431977",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cookie": "..."
}
length = 0  # total number of comments (top-level + replies) collected

# Crawl comment pages until the API returns an empty `replies` list.
# Bug fixes: `with` ensures replies.txt is closed even if requests raises
# (the handle previously leaked on any network error), and the parsed
# payload no longer shares the name `reply` with the inner loop variable.
with open("replies.txt", "w", encoding="utf-8") as reply_file:
    try:
        for i in range(1, 10000):  # `next` page cursor; loop breaks on an empty page
            url = f"https://api.bilibili.com/x/v2/reply/main?callback=jQuery1720170271645222964_1638025631980&jsonp=jsonp&next={i}&type=1&oid=82123220&mode=3&plat=1&_=1638025655607"
            html = requests.get(url, headers=headers)
            raw = html.text
            # Strip the JSONP "jQuery...(" prefix and the trailing ")".
            raw = raw[raw.find('{', 0, len(raw)):-1]
            payload = json.loads(raw)
            replies = payload["data"]["replies"]
            if not replies:
                break  # no more comments
            # Top-level comments
            for reply in replies:
                reply_content = reply["content"]["message"]
                reply_file.write(reply_content)
                insert_db(reply_content)
                print(reply_content)
                length += 1
                try:
                    # Replies to this top-level comment
                    for reply_reply in reply["replies"]:
                        reply_reply_content = reply_reply["content"]["message"]
                        reply_file.write(reply_reply_content)
                        insert_db(reply_reply_content)
                        print(" ", reply_reply_content)
                        length += 1
                except (TypeError, json.decoder.JSONDecodeError):
                    # reply["replies"] is None when the comment has no replies
                    pass
    except (TypeError, json.decoder.JSONDecodeError):
        # Best-effort crawl: stop cleanly on an unexpected payload shape.
        pass
print(length)
# Build a word cloud image from the crawled comments.
# Bug fix: `with` guarantees the file is closed; and removing items from
# `ls` while iterating it skipped elements (consecutive stop words
# survived the filter) -- build a new filtered list instead.
with open("replies.txt", "r", encoding="utf-8") as reply_file:
    text = reply_file.read()
ls = jieba.lcut(text)  # Chinese word segmentation
excludes = {"的", "我", "了", "你", "是", "呀", "或者", "在"}  # common stop words
ls = [word for word in ls if word not in excludes]
txt = " ".join(ls)
w = wordcloud.WordCloud(font_path="msyh.ttc", width=1000, height=700,
                        background_color="white", max_words=300)
w.generate(txt)
w.to_file("cloud.png")