python爬虫 爬取初一到初三的数学视频信息

爬取初一到初三的数学视频

好久没学数学了,为了能跟上大专的数学课程,想把初中数学重新学一遍——早知今日,何必当初。
教程开始:

  1. 寻找视频的地址,我这边选择了哔哩哔哩的初中数学视频教学url地址传送门

2. 按下F12进入调试工具,发现电脑版页面的内容太多了,所以果断切换到手机版页面,这次运气很好,一下就找到了url地址:
在这里插入图片描述
因为这是html页面的文件,我们再来看看主页的代码中是否存在这个url地址。我发现主页html中也存在这个url地址:
在这里插入图片描述

下面我再用xpath尝试提取视频的url地址, 成功
在这里插入图片描述

下面就可以写python代码进行提取了:
基本流程样式:
下面开始完成逻辑代码的编写:
在这里插入图片描述
下面出现了一个问题, 我请求的是手机版的页面但是一直给我 返回电脑版的页面, 导致我xpath提取不到对应的数据
页面的数据:
在这里插入图片描述
程序请求的数据:
在这里插入图片描述
这就很纳闷了:请求发送的headers也是手机的headers,也尝试过添加完整的cookies,都没有用。(一个常见原因:requests.get 的第二个位置参数是 params 而不是 headers,请求头必须以 headers=... 的关键字形式传入才会生效。)

        "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"

发现火候不够,只能暂时放下这个项目了,不过还是提取到了关键的信息和视频弹幕。
具体程序实现如下:

import requests
import json
import time


class Mathematics_api():
    """Query Bilibili's JSON APIs for the episodes of one hard-coded video.

    The video is fixed by the URLs built in ``__init__`` (bvid
    ``BV114411Q7Y4`` / aid ``63413537``).
    """

    # Seconds to wait on a single HTTP request, so a stalled connection
    # cannot hang the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up endpoint URLs, request headers and the episode caches."""
        # Page-list endpoint; its 'data' entries are read by Video_content().
        self.api_url = "https://api.bilibili.com/x/player/pagelist?bvid=BV114411Q7Y4&jsonp=jsonp"
        # NOTE(review): this cookie appears to carry a logged-in session
        # (SESSDATA / bili_jct). Confirm it is revoked and consider loading
        # it from the environment instead of committing it to source.
        self.api_headers = {
            "cookie": "_uuid=B4D50761-F743-3D25-E32C-834CE4F469CC02885infoc; buvid3=67F7DDD3-CD6D-4511-A0CE-19E56C3AFBE0138375infoc; sid=j7u10jnk; DedeUserID=495804684; DedeUserID__ckMd5=f3328acbcf3a9c47; SESSDATA=7f957db1%2C1614083813%2C89040*81; bili_jct=3038ae8434f056c3e83f356a2b3525c4; blackside_state=1; rpdid=|(um|)YkJR~u0J'ulm)YulmY); CURRENT_FNVAL=80; CURRENT_QUALITY=80; bsource=search_baidu; PVID=7",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }

        # Second endpoint; the '{}' placeholder is filled with a cid by
        # Api_Url_To().
        self.id_url_api = "https://bvc.bilivideo.com/pbp/data?r=loader&cid={}&aid=63413537"
        self.id_url_headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }

        # Stat endpoint; Aid_Api() reads ['data']['aid'] from its response.
        self.aid_url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=63413537&jsonp=jsonp"
        self.aid_headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }

        # Filled by Video_content() and read by User_Choice(). Created here
        # so that calling User_Choice() first fails with a clear IndexError
        # rather than an AttributeError.
        self.video_cid = []
        self.video_page = []

    def _get_json(self, url, headers):
        """GET *url* with *headers* and return the decoded JSON body.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.Timeout: if no answer arrives within REQUEST_TIMEOUT.
        """
        response = requests.get(url=url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Fail on HTTP errors here instead of on a confusing JSON/KeyError
        # further down.
        response.raise_for_status()
        return json.loads(response.content.decode())

    def Response_get(self, url, headers):
        """Send a GET request and return the parsed JSON response."""
        return self._get_json(url, headers)

    def Video_content(self, api_data):
        """Extract titles, cids and page numbers from the page-list entries.

        Args:
            api_data: iterable of dicts, each with 'part', 'cid' and 'page'
                keys.

        Returns:
            A ``(names, cids, pages)`` tuple of three parallel lists. The
            cid and page lists are also stored on the instance for
            User_Choice().
        """
        entries = list(api_data)
        video_name = [entry['part'] for entry in entries]
        self.video_cid = [entry['cid'] for entry in entries]
        self.video_page = [entry['page'] for entry in entries]
        return video_name, self.video_cid, self.video_page

    def Merge_data(self, name, cid):
        """Print a numbered table pairing each episode title with its cid.

        Args:
            name: sequence of episode titles.
            cid: sequence of cids, parallel to *name*.
        """
        print("*" * 50)
        print("初一数学-七年级数学-上册-下册-初中数学:")
        print()
        print("序号" + "\t\t\t\t" + "课程")
        print("*" * 50)

        # Distinct loop names: the original rebound the `name` parameter.
        for num, (title, video_cid) in enumerate(zip(name, cid), start=1):
            print('%s\t\t\t\t%s' % (num, title))
            print("\t\t\t\trid:", video_cid)
            print()

    def _select_video(self, num):
        """Return ``(cid, page)`` for the 1-based episode number *num*.

        Raises:
            IndexError: if *num* is not within ``1..len(self.video_cid)``.
                Without this check 0 and negative numbers would silently
                wrap around to episodes counted from the end of the list.
        """
        total = len(self.video_cid)
        if not 1 <= num <= total:
            raise IndexError(
                "video number %d is out of range 1..%d" % (num, total))
        return self.video_cid[num - 1], self.video_page[num - 1]

    def User_Choice(self):
        """Ask the user for an episode number and return its ``(cid, page)``.

        Raises:
            ValueError: if the input is not an integer.
            IndexError: if the number does not match a listed episode.
        """
        num = int(input("请输入你要下载观看的视频:"))
        return self._select_video(num)

    def Api_Url_To(self, rid, url, headers):
        """Fill *rid* into the *url* template, request it, return the JSON."""
        return self._get_json(url.format(rid), headers)

    def Aid_Api(self, url, headers):
        """Request *url* and return the ``['data']['aid']`` field."""
        return self._get_json(url, headers)['data']['aid']

    # ---------------------------------------------------------------- main flow
    def Run(self):
        """List the episodes, let the user pick one, and print its ids."""
        # Fetch the page list.
        api_data = self.Response_get(url=self.api_url, headers=self.api_headers)

        # Pull out the per-episode details.
        name, cid, page = self.Video_content(api_data=api_data['data'])
        print("cid>", cid)
        # Print the episode table.
        self.Merge_data(name=name, cid=cid)

        # Ask which episode the user wants.
        rid, page = self.User_Choice()
        print("rid是:", rid)
        print("page是:", page)

        # Query the second API with the chosen cid. The response is not
        # consumed yet; the request is kept so the flow is unchanged.
        self.Api_Url_To(rid=rid, url=self.id_url_api, headers=self.id_url_headers)

        # Look up the aid.
        aid = self.Aid_Api(self.aid_url, self.aid_headers)
        print("aid是:", aid)




#####################################################获取视频弹幕类#######################################################
class Bullet_Chat():
    """Fetch one page of replies for the hard-coded video and print them.

    The endpoint queried is ``/x/v2/reply`` with ``oid=63413537``; the page
    number is supplied interactively by url_list().
    """

    # Seconds to wait on the HTTP request, so a stalled connection cannot
    # hang the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the reply endpoint template and its request headers."""
        # '{}' is the page number, filled in by url_list().
        self.chats_url = "https://api.bilibili.com/x/v2/reply?pn={}&type=1&oid=63413537"
        # NOTE(review): this cookie appears to carry a logged-in session
        # (SESSDATA / bili_jct). Confirm it is revoked and consider loading
        # it from the environment instead of committing it to source.
        self.chat_headers = {
            "cookie": "_uuid=B4D50761-F743-3D25-E32C-834CE4F469CC02885infoc; buvid3=67F7DDD3-CD6D-4511-A0CE-19E56C3AFBE0138375infoc; sid=j7u10jnk; DedeUserID=495804684; DedeUserID__ckMd5=f3328acbcf3a9c47; SESSDATA=7f957db1%2C1614083813%2C89040*81; bili_jct=3038ae8434f056c3e83f356a2b3525c4; blackside_state=1; rpdid=|(um|)YkJR~u0J'ulm)YulmY); CURRENT_FNVAL=80; CURRENT_QUALITY=80; bsource=search_baidu; PVID=2",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }

    def Request_Chat(self, char_url, headers):
        """Request *char_url* and return ``(replies, hots)`` from its JSON.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.Timeout: if no answer arrives within REQUEST_TIMEOUT.
            KeyError: if the body lacks 'data', 'replies' or 'hots'.
        """
        response = requests.get(url=char_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        # Decode the body once; the original parsed the same JSON twice.
        payload = json.loads(response.content.decode())['data']
        return payload['replies'], payload['hots']

    def Content_Data(self, response):
        """Reduce raw reply dicts to the four fields that get printed.

        Args:
            response: iterable of reply dicts, each with
                ``member.uname``, ``member.sex``, ``member.avatar`` and
                ``content.message``. ``None`` is treated as "no replies".

        Returns:
            A list of dicts with keys 'user_name', 'gender', 'user_img'
            and 'content'; empty values are normalised to ``None``.
        """
        content_list = []
        # `or []`: a JSON null for replies/hots would otherwise raise
        # TypeError when iterated.
        for data in response or []:
            member = data['member']
            content_list.append({
                'user_name': member['uname'] or None,
                'gender': member['sex'] or None,
                'user_img': member['avatar'] or None,
                'content': data['content']['message'] or None,
            })
        return content_list

    def Print_data(self, content, content2):
        """Print every cleaned reply in *content*, then in *content2*."""
        print("=" * 100)
        print("用户的评论:")
        print()
        for group in (content, content2):
            for data in group:
                # The third field is the gender, so it is labelled "性别"
                # (the original mislabelled it "姓名", i.e. "name").
                print("用户名:{}\n用户头像地址:{}\n性别:{}\n评论内容:{}".format(
                    data['user_name'], data['user_img'], data['gender'], data['content']))
                print("#" * 100)
                print()

    def url_list(self):
        """Ask the user for a page number and return the request URL.

        Raises:
            ValueError: if the input is not an integer.
        """
        page = int(input("请输入你要爬取的页数:"))
        return self.chats_url.format(page)

    # ---------------------------------------------------------------- main flow
    def run(self):
        """Fetch the requested page, clean both reply lists, print them."""
        urls = self.url_list()
        response, response2 = self.Request_Chat(char_url=urls, headers=self.chat_headers)

        content = self.Content_Data(response=response)
        content2 = self.Content_Data(response=response2)

        self.Print_data(content, content2)




#####################################################获取视频类#######################################################
class Mathematics_video():
    """Download the mobile video page of the hard-coded Bilibili video."""

    # Seconds to wait on the HTTP request, so a stalled connection cannot
    # hang the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the mobile page URL and browser-like request headers."""
        self.video_url = "https://m.bilibili.com/video/BV114411Q7Y4"

        # The HTTP/2 pseudo-headers copied from the browser (":authority",
        # ":method", ":path", ":scheme") are intentionally omitted: they are
        # not legal header names for the HTTP client and are rejected once
        # the dict is really sent as headers.
        # NOTE(review): this cookie appears to carry a logged-in session
        # (SESSDATA / bili_jct). Confirm it is revoked and consider loading
        # it from the environment instead of committing it to source.
        self.video_headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            # "br" dropped: brotli bodies are only decoded when an optional
            # brotli package is installed, otherwise decode() yields garbage.
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "finger=1571944565; _uuid=B4D50761-F743-3D25-E32C-834CE4F469CC02885infoc; buvid3=67F7DDD3-CD6D-4511-A0CE-19E56C3AFBE0138375infoc; sid=j7u10jnk; DedeUserID=495804684; DedeUserID__ckMd5=f3328acbcf3a9c47; SESSDATA=7f957db1%2C1614083813%2C89040*81; bili_jct=3038ae8434f056c3e83f356a2b3525c4; blackside_state=1; rpdid=|(um|)YkJR~u0J'ulm)YulmY); CURRENT_FNVAL=80; CURRENT_QUALITY=80; bsource=search_baidu; PVID=2",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
        }

    def Run(self):
        """Request the video page and print the decoded HTML.

        Raises:
            requests.Timeout: if no answer arrives within REQUEST_TIMEOUT.
        """
        # headers= must be passed by keyword: the second positional argument
        # of requests.get() is `params`, so the original call sent this dict
        # as a query string and the mobile user-agent never reached the
        # server (hence the desktop page coming back).
        response = requests.get(
            self.video_url,
            headers=self.video_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        print(response.content.decode())

def Main():
    """Run the three scrapers in order and print the elapsed wall-clock time."""
    started_at = time.time()

    # Episode list and id lookup.
    Mathematics_api().Run()

    # Raw video page.
    Mathematics_video().Run()

    # Reply listing.
    Bullet_Chat().run()

    elapsed = time.time() - started_at
    print("用时:", elapsed)


# Only start scraping when executed as a script, not when imported.
if __name__ == "__main__":
    Main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值