爬取初一到初三的数学视频
好久没学数学了,为了大专的数学能跟上,所以想把初中数学重新学一遍,早知今日何必当初。
教程开始:
1. 寻找视频的地址,我这边选择了哔哩哔哩的初中数学视频教学url地址传送门
2. 按下F12进入调试工具,发现电脑版的页面的内容太多了,所以果断切换手机版的页面,这次运气很好,一下就找到了url地址:
因为这是html页面的文件,我们再来看看主页的代码是否存在这个url地址,我发现主页html也是存在这个url地址的:
下面我再用xpath尝试提取视频的url地址, 成功
下面就可以写python代码进行提取了:
基本流程样式:
下面开始完成逻辑代码的编写:
下面出现了一个问题, 我请求的是手机版的页面但是一直给我 返回电脑版的页面, 导致我xpath提取不到对应的数据
页面的数据:
程序请求的数据:
这就很纳闷了,请求发送的headers也是手机的headers,也尝试过添加完整的cookies,也都没有用
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
发现火候不够只能暂时放下这个项目了, 不过提取到了关键的信息, 和视频弹幕
具体程序实现如下:
import requests
import json
import time
class Mathematics_api():
    """Query Bilibili's JSON APIs for the episode list of one hard-coded video.

    The target video (bvid ``BV114411Q7Y4`` / aid ``63413537``) is fixed in the
    URLs built by ``__init__``. ``Run`` drives the whole flow: fetch the page
    list, print it, let the user pick an episode, then query two further
    endpoints and print the resulting ids.
    """

    # Upper bound, in seconds, on how long a single HTTP request may block.
    # Without a timeout ``requests`` waits indefinitely on a stalled server.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the endpoint URLs and the request headers used for each one."""
        self.api_url = "https://api.bilibili.com/x/player/pagelist?bvid=BV114411Q7Y4&jsonp=jsonp"
        # NOTE(review): this header embeds a logged-in session cookie
        # (SESSDATA / bili_jct). Consider loading it from the environment
        # rather than keeping it in source.
        self.api_headers = {
            "cookie": "_uuid=B4D50761-F743-3D25-E32C-834CE4F469CC02885infoc; buvid3=67F7DDD3-CD6D-4511-A0CE-19E56C3AFBE0138375infoc; sid=j7u10jnk; DedeUserID=495804684; DedeUserID__ckMd5=f3328acbcf3a9c47; SESSDATA=7f957db1%2C1614083813%2C89040*81; bili_jct=3038ae8434f056c3e83f356a2b3525c4; blackside_state=1; rpdid=|(um|)YkJR~u0J'ulm)YulmY); CURRENT_FNVAL=80; CURRENT_QUALITY=80; bsource=search_baidu; PVID=7",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }
        # The ``{}`` placeholder is filled with the cid of the chosen episode.
        self.id_url_api = "https://bvc.bilivideo.com/pbp/data?r=loader&cid={}&aid=63413537"
        self.id_url_headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }
        self.aid_url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=63413537&jsonp=jsonp"
        self.aid_headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }
        # Populated by Video_content; User_Choice indexes into them.
        self.video_cid = []
        self.video_page = []

    def _fetch_json(self, url, headers):
        """GET ``url`` with ``headers`` and return the decoded JSON body.

        Shared by every public request helper so they all get the same
        timeout and decoding behaviour.
        """
        response = requests.get(url=url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        return json.loads(response.content.decode())

    def Response_get(self, url, headers):
        """Send a GET request and return the parsed JSON response."""
        return self._fetch_json(url, headers)

    def Video_content(self, api_data):
        """Split the page-list entries into parallel lists.

        Args:
            api_data: iterable of dicts, each providing ``part``, ``cid``
                and ``page`` keys.

        Returns:
            ``(names, cids, pages)``. The cid and page lists are also stored
            on ``self.video_cid`` / ``self.video_page`` for ``User_Choice``.
        """
        video_name = []
        self.video_cid = []
        self.video_page = []
        for entry in api_data:
            video_name.append(entry['part'])
            self.video_cid.append(entry['cid'])
            self.video_page.append(entry['page'])
        return video_name, self.video_cid, self.video_page

    def Merge_data(self, name, cid):
        """Print a numbered table pairing each episode title with its cid.

        Args:
            name: sequence of episode titles.
            cid: sequence of cids, in the same order as ``name``.
        """
        print("*" * 50)
        print("初一数学-七年级数学-上册-下册-初中数学:")
        print()
        print("".join(["序号", "\t\t\t\t", "课程"]))
        print("*" * 50)
        # Numbering starts at 1 because User_Choice expects 1-based input.
        for num, (title, episode_cid) in enumerate(zip(name, cid), start=1):
            print('%s\t\t\t\t%s' % (num, title))
            print("\t\t\t\trid:", episode_cid)
            print()

    def _select_episode(self, num):
        """Return ``(cid, page)`` for the 1-based episode number ``num``.

        Raises:
            IndexError: if ``num`` is outside ``1..len(self.video_cid)``.
                Zero and negative values are rejected explicitly because
                negative list indexing would otherwise silently pick an
                episode from the end of the list.
        """
        total = len(self.video_cid)
        if not 1 <= num <= total:
            raise IndexError(
                "episode number %s is out of range 1..%s" % (num, total)
            )
        return self.video_cid[num - 1], self.video_page[num - 1]

    def User_Choice(self):
        """Ask the user for an episode number and return its ``(cid, page)``.

        Raises:
            ValueError: if the input is not an integer.
            IndexError: if the number does not match a listed episode.
        """
        num = int(input("请输入你要下载观看的视频:"))
        return self._select_episode(num)

    def Api_Url_To(self, rid, url, headers):
        """Fill ``rid`` into the ``url`` template and return the parsed JSON response."""
        return self._fetch_json(url.format(rid), headers)

    def Aid_Api(self, url, headers):
        """Request ``url`` and return the ``data.aid`` field of the JSON response."""
        return self._fetch_json(url, headers)['data']['aid']

    ###################################### main workflow
    def Run(self):
        """List the episodes, let the user pick one, and print the related ids."""
        # Fetch the page list for the video.
        api_data = self.Response_get(url=self.api_url, headers=self.api_headers)
        # Extract titles, cids and page numbers.
        name, cid, page = self.Video_content(api_data=api_data['data'])
        print("cid>", cid)
        # Show the numbered episode table.
        self.Merge_data(name=name, cid=cid)
        # Let the user choose; the chosen cid is what the output calls "rid".
        rid, page = self.User_Choice()
        print("rid是:", rid)
        print("page是:", page)
        # Query the second API with the chosen cid. The response is not used
        # yet, but the request is kept so the flow matches the original script.
        self.Api_Url_To(rid=rid, url=self.id_url_api, headers=self.id_url_headers)
        # Look up the aid.
        aid = self.Aid_Api(self.aid_url, self.aid_headers)
        print("aid是:", aid)
#####################################################获取视频弹幕类#######################################################
class Bullet_Chat():
    """Fetch and print one page of replies for a hard-coded video.

    The endpoint queried is ``x/v2/reply`` with ``oid=63413537``; both the
    ``replies`` and the ``hots`` lists of the response are printed.
    """

    # Upper bound, in seconds, on how long the HTTP request may block.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the reply endpoint template and the request headers."""
        # The ``{}`` placeholder is the page number (``pn``).
        self.chats_url = "https://api.bilibili.com/x/v2/reply?pn={}&type=1&oid=63413537"
        # NOTE(review): this header embeds a logged-in session cookie
        # (SESSDATA / bili_jct). Consider loading it from the environment
        # rather than keeping it in source.
        self.chat_headers = {
            "cookie": "_uuid=B4D50761-F743-3D25-E32C-834CE4F469CC02885infoc; buvid3=67F7DDD3-CD6D-4511-A0CE-19E56C3AFBE0138375infoc; sid=j7u10jnk; DedeUserID=495804684; DedeUserID__ckMd5=f3328acbcf3a9c47; SESSDATA=7f957db1%2C1614083813%2C89040*81; bili_jct=3038ae8434f056c3e83f356a2b3525c4; blackside_state=1; rpdid=|(um|)YkJR~u0J'ulm)YulmY); CURRENT_FNVAL=80; CURRENT_QUALITY=80; bsource=search_baidu; PVID=2",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}

    def Request_Chat(self, char_url, headers):
        """Request ``char_url`` and return ``(replies, hots)`` from the JSON body.

        The body is decoded and parsed once, then both fields are read from
        its ``data`` object.
        """
        response = requests.get(url=char_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        payload = json.loads(response.content.decode())['data']
        return payload['replies'], payload['hots']

    @staticmethod
    def _or_none(value):
        """Return ``value`` unless it is empty, in which case return ``None``."""
        return value if value else None

    def Content_Data(self, response):
        """Reduce raw reply entries to the four fields that get printed.

        Args:
            response: list of reply dicts, or ``None``. ``None`` is treated
                as "no entries" so that a JSON ``null`` list does not crash
                the run.

        Returns:
            A list of dicts with keys ``user_name``, ``gender``, ``user_img``
            and ``content``; empty values are replaced by ``None``.
        """
        if not response:
            return []
        content_list = []
        for entry in response:
            member = entry['member']
            content_list.append({
                'user_name': self._or_none(member['uname']),
                'gender': self._or_none(member['sex']),
                'user_img': self._or_none(member['avatar']),
                'content': self._or_none(entry['content']['message']),
            })
        return content_list

    def _print_entries(self, entries):
        """Print each cleaned entry followed by a separator line and a blank line."""
        for data in entries:
            # The third field is the gender, so it is labelled "性别"
            # (the previous label "姓名" means "name").
            print("用户名:{}\n用户头像地址:{}\n性别:{}\n评论内容:{}".format(
                data['user_name'], data['user_img'], data['gender'], data['content']))
            print("#" * 100)
            print()

    def Print_data(self, content, content2):
        """Print a header, then every entry of ``content`` followed by ``content2``."""
        print("=" * 100)
        print("用户的评论:")
        print()
        self._print_entries(content)
        self._print_entries(content2)

    def url_list(self):
        """Ask the user for a page number and return the matching reply URL.

        Raises:
            ValueError: if the input is not an integer.
        """
        page = int(input("请输入你要爬取的页数:"))
        return self.chats_url.format(page)

    ###################################### main workflow
    def run(self):
        """Prompt for a page, fetch its replies, clean them and print them."""
        urls = self.url_list()
        response, response2 = self.Request_Chat(char_url=urls, headers=self.chat_headers)
        # Reduce both lists to the printable fields.
        content = self.Content_Data(response=response)
        content2 = self.Content_Data(response=response2)
        self.Print_data(content, content2)
#####################################################获取视频类#######################################################
class Mathematics_video():
    """Fetch the mobile page of the video and print its HTML."""

    # Upper bound, in seconds, on how long the HTTP request may block.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the mobile page URL and the browser-like request headers."""
        self.video_url = "https://m.bilibili.com/video/BV114411Q7Y4"
        # The HTTP/2 pseudo-headers (":authority", ":method", ":path",
        # ":scheme") are intentionally not included: header names starting
        # with ":" are not valid for the HTTP/1.1 request that requests
        # sends. "accept-encoding" is also left to requests so that only
        # encodings it can actually decode are advertised (the copied value
        # included "br").
        # NOTE(review): this header embeds a logged-in session cookie
        # (SESSDATA / bili_jct). Consider loading it from the environment
        # rather than keeping it in source.
        self.video_headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "finger=1571944565; _uuid=B4D50761-F743-3D25-E32C-834CE4F469CC02885infoc; buvid3=67F7DDD3-CD6D-4511-A0CE-19E56C3AFBE0138375infoc; sid=j7u10jnk; DedeUserID=495804684; DedeUserID__ckMd5=f3328acbcf3a9c47; SESSDATA=7f957db1%2C1614083813%2C89040*81; bili_jct=3038ae8434f056c3e83f356a2b3525c4; blackside_state=1; rpdid=|(um|)YkJR~u0J'ulm)YulmY); CURRENT_FNVAL=80; CURRENT_QUALITY=80; bsource=search_baidu; PVID=2",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
        }

    def Run(self):
        """Request the mobile video page and print the decoded HTML."""
        # The headers must be passed by keyword: the second positional
        # parameter of requests.get is ``params``, so passing the dict
        # positionally sent it as a query string and the mobile user-agent
        # never reached the server.
        response = requests.get(
            self.video_url,
            headers=self.video_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        print(response.content.decode())
def Main():
    """Run the three scrapers one after another and print the elapsed time."""
    started_at = time.time()

    # Episode list and ids first, then the mobile page, then the replies.
    Mathematics_api().Run()
    Mathematics_video().Run()
    Bullet_Chat().run()

    elapsed = time.time() - started_at
    print("用时:", elapsed)


if __name__ == "__main__":
    Main()