# Pear Video (pearvideo.com) crawler — project code from the Bilibili Luffy School Python course
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import random
import re
import os
from multiprocessing.dummy import Pool
import time
# Browser-like User-Agent so the site serves the normal listing page.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
# Fetch the category listing page and extract its video <li> entries.
listing_response = requests.get(url='https://www.pearvideo.com/category_5', headers=headers)
tree = etree.HTML(listing_response.text)
li_list = tree.xpath('.//ul[@id="listvideoListUl"]/li')
# Filled by the loop below: one {'name': ..., 'url': ...} dict per video.
name_urls = []
# Build (file name, real download URL) pairs for every video on the listing page.
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div[@class="vervideo-bd"]/a[1]/@href')[0]
    title = li.xpath('./div[@class="vervideo-bd"]/a[1]//div[@class="vervideo-title"]/text()')[0] + '.mp4'
    # The href looks like "video_<digits>"; the part after '_' is the content id.
    video_id = detail_url.split('_')[1]
    video_url = 'https://www.pearvideo.com/videoStatus.jsp?'
    params = {
        'contId': video_id,
        'mrd': str(random.random())
    }
    new_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        # The videoStatus endpoint checks Referer (anti-hotlinking). Sending the
        # video detail-page URL as Referer passes the check.
        'Referer': 'https://www.pearvideo.com/video_' + video_id,
    }
    data_dic = requests.get(url=video_url, headers=new_headers, params=params).json()
    video_src = data_dic['videoInfo']['videos']['srcUrl']
    # The returned srcUrl is a fake link: its basename starts with a timestamp
    # instead of "cont-<video_id>".
    #   fake: https://video.pearvideo.com/mp4/third/20210512/1621065741471-12785353-102622-hd.mp4
    #   real: https://video.pearvideo.com/mp4/third/20210512/cont-1729172-12785353-102622-hd.mp4
    first_url = os.path.dirname(video_src)    # e.g. https://video.pearvideo.com/mp4/third/20210512
    last_url = os.path.basename(video_src)    # e.g. 1621069443338-12785353-102622-hd.mp4
    # Raw string for the regex: '\d' in a plain string is an invalid escape
    # sequence (DeprecationWarning on recent Pythons).
    true_part = re.match(r'^\d+(.*)', last_url).group(1)  # -> -12785353-102622-hd.mp4
    true_url = first_url + '/cont-' + video_id + true_part
    name_urls.append({
        'name': title,
        'url': true_url
    })
# Record the wall-clock start so total download time can be printed at the end.
start_time = time.time()
# Single-threaded serial version, kept for comparison with the pooled version:
# file_path = './pear_video'
# if not os.path.exists(file_path):
# os.mkdir(file_path)
# for name_url in name_urls:
# name = name_url['name']
# real_url = name_url['url']
# print(name, '正在下载中...')
# video_data = requests.get(url=real_url, headers=headers).content
# # 持久化存储
# with open(file_path + '/' + name, 'wb') as fp:
# fp.write(video_data)
# print(name, '下载完成!!!')
# Use a thread pool to request the video data concurrently.
def get_video_data(dic):
    """Download one video and save it under ./pear_video.

    dic is one entry of name_urls: {'name': <file name>, 'url': <real mp4 url>}.
    Runs inside the thread pool, so all filesystem setup must be race-safe.
    """
    name = dic['name']
    real_url = dic['url']
    print(name, '正在下载中...')
    video_data = requests.get(url=real_url, headers=headers).content
    file_path = './pear_video'
    # makedirs(exist_ok=True) is race-safe: with 4 pool workers, the original
    # exists()/mkdir() pair could raise FileExistsError when two threads both
    # saw the directory as missing and tried to create it.
    os.makedirs(file_path, exist_ok=True)
    # Persist the downloaded bytes to disk.
    with open(os.path.join(file_path, name), 'wb') as fp:
        fp.write(video_data)
    print(name, '下载完成!!!')
# Fan the downloads out across a 4-worker thread pool, then wait for them all.
thread_pool = Pool(4)
thread_pool.map(get_video_data, name_urls)
thread_pool.close()
thread_pool.join()
# Report total elapsed wall-clock time in seconds.
elapsed = time.time() - start_time
print(elapsed)
# 最后爬取到的视频 (the videos that were finally scraped):