爬取某站弹幕和评论
相关参考文章:
以爬该站点为例:
https://www.bilibili.com/bangumi/play/ep403767?from_spmid=666.25.episode.0&from_outer_spmid=333.337.0.0
主要目录:
爬取弹幕
- 分析B站网页的内容—>F12打开开发者工具—>找到network,很多网页的东西都在这里面可以找到。
上面的图片中,展示了弹幕中的所有内容。仔细看保存在seg.so这个文件中了,这对于下一步操作 就方便了许多。
2.要分析你所需要爬取B站地址的aid,cid等内容,参考第一个链接。
3.依靠前面的分析,根据所需要的内容进行变更即可下载。
import re
import requests
def downloadfiles(url, count):
    """Download the resource at *url* and save it locally as "<count>.so".

    :param url: full URL to fetch (here, the danmaku list API endpoint).
    :param count: label used to build the output file name, e.g. 'test2'
        produces 'test2.so'.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # timeout prevents hanging forever; raise_for_status fails loudly
    # instead of silently saving an HTML error page as a ".so" file.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    filename = str(count) + ".so"
    with open(filename, "wb") as code:
        code.write(resp.content)
    print('文件下载成功!')
    print('-----------')
def calculatenum():
    """Count the ">text<" fragments in test2.so and record the end index.

    Sets the module-level ``endnum`` to (number of regex matches in the
    *last* line of the file) - 2; ``convert()`` uses it as the upper index
    bound when extracting danmaku text.  Also returns ``endnum``.

    :returns: the computed ``endnum`` (-2 if the file has no lines).
    """
    global endnum
    endnum = -2  # fallback so convert() extracts nothing on an empty file
    # `with` guarantees the handle is closed (the original leaked it).
    with open('test2.so', "r", encoding='utf-8') as src:
        for line in src:
            # each danmaku field appears as >text< in the XML payload
            total = len(re.findall('>(.*?)<', line))
            print('列表中元素总数为:{}'.format(total))
            print('------------------')
            # only the last line's count survives -- the danmaku file is
            # normally a single long XML line, so this matches the data.
            endnum = total - 2
    return endnum
def convert():
    """Extract danmaku text from test2.so into 峡谷-弹幕文件.txt.

    Relies on ``calculatenum()`` to set the global ``endnum`` (index of the
    last usable field).  Only the first line of the input is processed --
    the original code's ``while…else: break`` exited the outer loop after
    one line -- and only every second field starting at index 16 is kept:
    the leading fields are XML metadata, and each danmaku entry contributes
    two captured fragments (attributes + text).
    """
    global endnum
    calculatenum()
    content = []
    # `with` closes both handles; the original leaked them.
    with open('./test2.so', "r", encoding='utf-8') as src, \
            open('./峡谷-弹幕文件.txt', "w", encoding='utf-8') as dst:
        first_line = src.readline()
        if first_line:
            fields = re.findall('>(.*?)<', first_line)
            # indices 16, 18, 20, ... endnum -- the danmaku text fields
            for idx in range(16, endnum + 1, 2):
                dst.write(fields[idx])
                dst.write('\n')
                content.append(fields[idx])
    print('弹幕整理完成!')
if __name__ == '__main__':
    # NOTE(review): convert() reads test2.so, but only 'test1' is downloaded
    # below -- uncomment the 'test2' line (or change the label) before
    # running, otherwise convert() will fail on a missing file.
    downloadfiles('https://api.bilibili.com/x/v1/dm/list.so?oid=314105897', 'test1')  # episode: 草海
    # downloadfiles('https://api.bilibili.com/x/v1/dm/list.so?oid=314094543', 'test2')  # episode: 冰河
    # downloadfiles('https://api.bilibili.com/x/v1/dm/list.so?oid=314116841', 'test3')  # episode: 峡谷
    convert()
爬取评论
与上方介绍同理,找到需要的内容,进行爬取。
import requests
# Module-level output file shared by get(): a header line, then a blank line.
# NOTE(review): the handle is never closed explicitly -- it relies on
# interpreter exit to flush; consider closing it after get() returns.
f = open('评论.txt', "w", encoding='utf-8')
f.write("评论" + "\n")
f.write("\n")
def _write_comment(item):
    """Write one top-level comment and its nested replies to the global file *f*.

    :param item: one element of the API's ``data.replies`` list.
    """
    f.write(item['member']['uname'])
    f.write(":")
    f.write(item['content']['message'])
    f.write("\n")
    try:
        # 'replies' is None (not a list) when a comment has no sub-replies,
        # which raises TypeError on iteration -- treated as "no replies".
        for reply in item['replies']:
            f.write(" " + reply['member']['uname'])
            f.write(":")
            f.write(reply['content']['message'])
            f.write("\n")
    except TypeError:
        f.write("\n")


def get():
    """Fetch every page of comments for one video and append them to 评论.txt.

    Walks the reply API's ``next`` cursor until ``is_end`` is reported, or
    until the API returns ``replies = None`` (handled via TypeError).
    """
    # url = "https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=0&type=1&oid=379680114&mode=3&plat=1"
    url = "https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=0&type=1&oid=204809650&mode=3&plat=1"
    # Request headers are required; backslash-escape any special characters.
    # 'cookie' carries the logged-in user's session; 'user-agent' identifies
    # the client to the server.
    headers = {
        "cookie": "换成自己的",
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': 'https://www.bilibili.com/bangumi/play/ep403767?from_spmid=666.25.player.continue&from_outer_spmid=333.337.0.0',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'user-agent': '换成自己的'
    }
    response = requests.get(url=url, headers=headers).json()
    try:
        for item in response['data']['replies']:
            _write_comment(item)
        page = 1
        # BUG FIX: the original loop tested the FIRST response's
        # cursor.is_end on every iteration, so it never terminated normally
        # and only stopped by crashing with TypeError when the API ran out
        # of replies.  Test the most recent response instead.
        # NOTE(review): the first paginated request uses next=2 (page goes
        # 1 -> 2 before building the URL), exactly as the original did --
        # confirm page next=1 is not being skipped by the API.
        while not response['data']['cursor']['is_end']:
            page += 1
            url1 = "https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=" + str(page) + "&type=1&oid=204809650&mode=3&plat=1"
            response = requests.get(url=url1, headers=headers).json()
            for item in response['data']['replies']:
                _write_comment(item)
        print('程序停止')
    except TypeError:
        print("程序停止")
if __name__ == '__main__':
    # Entry point: crawl the video's comment section into 评论.txt.
    get()