B站路飞学城 Python 课梨视频项目代码

@B站路飞学城 Python 课梨视频项目代码

# -*- coding:utf-8 -*-
import requests
from lxml import etree
import random
import re
import os
from multiprocessing.dummy import Pool
import time

# Browser-like User-Agent sent with the listing-page request below (and reused
# by later requests in this script).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}

# Listing page whose video entries are scraped.
url = 'https://www.pearvideo.com/category_5'

# Fetch the listing page. The timeout keeps a stalled connection from hanging
# the script forever, and raise_for_status() stops on an HTTP error instead of
# silently parsing an error page.
list_response = requests.get(url=url, headers=headers, timeout=10)
list_response.raise_for_status()
page_text = list_response.text
tree = etree.HTML(page_text)
# Each <li> under the listing <ul> is handled as one video entry further down.
li_list = tree.xpath('.//ul[@id="listvideoListUl"]/li')

# Endpoint that returns JSON describing one video, including its source URL.
video_url = 'https://www.pearvideo.com/videoStatus.jsp?'
# Matches the leading run of digits in the returned file name and captures the
# rest. Written as a raw string so `\d` is a regex escape rather than an
# invalid string escape, and compiled once instead of on every iteration.
fake_prefix_re = re.compile(r'^\d+(.*)')

# Collected download jobs: one {'name': file name, 'url': real video URL} each.
name_urls = []
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div[@class="vervideo-bd"]/a[1]/@href')[0]
    title = li.xpath('./div[@class="vervideo-bd"]/a[1]//div[@class="vervideo-title"]/text()')[0] + '.mp4'

    # The detail link looks like ".../video_<id>"; keep the part after "_".
    video_id = detail_url.split('_')[1]
    params = {
        'contId': video_id,
        'mrd': str(random.random())
    }

    # Referer check: Referer is an HTTP header holding the address of the
    # previous page; browsers add it automatically when a link is clicked, and
    # sites use it to block hotlinking. Sending a matching Referer passes the
    # check. The User-Agent is reused from the shared `headers`.
    new_headers = dict(headers, Referer='https://www.pearvideo.com/video_' + video_id)

    # Time out rather than hang, and fail loudly on an HTTP error instead of
    # trying to decode an error page as JSON.
    status_response = requests.get(url=video_url, headers=new_headers, params=params, timeout=10)
    status_response.raise_for_status()
    data_dic = status_response.json()
    video_src = data_dic['videoInfo']['videos']['srcUrl']

    # The URL returned by the endpoint is a fake link.
    # Fake link:
    # https://video.pearvideo.com/mp4/third/20210512/1621065741471-12785353-102622-hd.mp4
    # Real link:
    # https://video.pearvideo.com/mp4/third/20210512/cont-1729172-12785353-102622-hd.mp4

    # Turn the fake link into the real one: replace the leading digits of the
    # file name with "cont-<video id>".
    first_url = os.path.dirname(video_src)  # https://video.pearvideo.com/mp4/third/20210512
    last_url = os.path.basename(video_src)  # 1621069443338-12785353-102622-hd.mp4
    prefix_match = fake_prefix_re.match(last_url)
    if prefix_match is None:
        # File name does not start with digits, so the real link cannot be
        # derived; skip this entry instead of crashing on `None.group`.
        print(title, '无法解析视频地址,已跳过')
        continue
    true_part = prefix_match.group(1)  # -12785353-102622-hd.mp4
    true_url = first_url + '/cont-' + video_id + true_part
    video_dict = {
        'name': title,
        'url': true_url
    }
    name_urls.append(video_dict)

# Record the start time so the elapsed download time can be computed later.
start_time = time.time()

# Single-threaded, sequential variant (disabled; kept for comparison):
# file_path = './pear_video'
# if not os.path.exists(file_path):
#     os.mkdir(file_path)
# for name_url in name_urls:
#     name = name_url['name']
#     real_url = name_url['url']
#     print(name, '正在下载中...')
#     video_data = requests.get(url=real_url, headers=headers).content
#     # Persist the video to disk
#     with open(file_path + '/' + name, 'wb') as fp:
#         fp.write(video_data)
#         print(name, '下载完成!!!')


# Worker used with the thread pool to request the video data.
def get_video_data(dic):
    """Download one video and save it under ``./pear_video``.

    Args:
        dic: Mapping with ``'name'`` (output file name) and ``'url'``
            (URL of the video file).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the directory or file cannot be created or written.
    """
    name = dic['name']
    real_url = dic['url']
    print(name, '正在下载中...')

    file_path = './pear_video'
    # Several pool threads reach this point at once; exist_ok avoids the
    # check-then-create race that made os.mkdir raise FileExistsError.
    os.makedirs(file_path, exist_ok=True)

    response = requests.get(url=real_url, headers=headers, timeout=30)
    # Do not store an HTTP error page as if it were a video.
    response.raise_for_status()
    video_data = response.content

    # Persist the video to disk.
    with open(file_path + '/' + name, 'wb') as fp:
        fp.write(video_data)
    print(name, '下载完成!!!')


# Run the downloads on a pool of four workers; map() blocks until every
# entry of name_urls has been processed.
pool = Pool(processes=4)
pool.map(get_video_data, name_urls)
# No further work will be submitted; wait for the workers to finish.
pool.close()
pool.join()

end_time = time.time()

# Print the elapsed time in seconds since start_time.
elapsed = end_time - start_time
print(elapsed)

最后爬取到的视频:
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值