import requests
from util import headers_utils as hd, download_util as du
from lxml import html
import random
import json
from multiprocessing.dummy import Pool
def batch_download(dict):
    """Download a single video described by a job mapping.

    Written as a worker for a thread pool: it takes one argument so it can
    be handed directly to ``Pool.map``.

    Args:
        dict: Mapping with the keys ``'file_path'``, ``'file_name'`` and
            ``'new_video_url'``. The three values are forwarded, in that
            order, to ``du.download_file``. (The parameter name shadows
            the builtin; it is kept so existing callers are unaffected.)

    Raises:
        KeyError: If one of the three keys is missing; the lookups happen
            before the ``try`` and are therefore not caught.

    Any ``Exception`` raised by ``du.download_file`` is caught and
    reported with ``print`` instead of being propagated.
    """
    job = dict
    target_dir, target_name, source_url = (
        job['file_path'],
        job['file_name'],
        job['new_video_url'],
    )
    try:
        du.download_file(target_dir, target_name, source_url)
    except Exception as err:
        # Best-effort: report the failure and return normally.
        print('下载失败 ', target_name, source_url, err)
# Path separators plus the characters Windows rejects in file names; each
# one is replaced with '_' before a scraped title is used as a file name.
_UNSAFE_NAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

# Seconds to wait on each HTTP request before giving up, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 10


def sanitize_file_name(title):
    """Return *title* with file-name-unsafe characters replaced by ``'_'``.

    Titles come straight from the scraped page, so a ``/`` or ``\\`` would
    otherwise be interpreted as a directory separator when saving.
    """
    return title.translate(_UNSAFE_NAME_CHARS)


def resolve_real_video_url(src_url, cont_id):
    """Turn the placeholder ``srcUrl`` into the URL that actually serves the video.

    The status endpoint returns a URL whose file name starts with a
    timestamp, e.g.
    ``https://video.pearvideo.com/mp4/third/20210625/1624789819336-12719568-215128-hd.mp4``.
    The playable URL (observed in the ``vu`` field of the player's tracking
    request) has ``cont-<contId>`` in place of that timestamp:
    ``https://video.pearvideo.com/mp4/third/20210625/cont-1733335-12719568-215128-hd.mp4``.

    Only the first ``-``-separated token of the last path segment is
    replaced. A plain ``str.replace`` over the whole URL would also rewrite
    any earlier occurrence of the same token, and would corrupt the URL if
    the token were empty.

    Args:
        src_url: The ``srcUrl`` value returned by the status endpoint.
        cont_id: The numeric content id taken from the video page URL.

    Returns:
        The rewritten URL as a string.
    """
    base, slash, file_part = src_url.rpartition('/')
    _stamp, dash, rest = file_part.partition('-')
    return base + slash + 'cont-' + cont_id + dash + rest


def main():
    """Scrape one pearvideo category page and download its videos with a thread pool.

    Steps:
      1. Fetch the category listing and select the ``<li>`` entries under
         ``#listvideoListUl``.
      2. For each entry, query ``videoStatus.jsp`` (with the video page as
         ``Referer``) to obtain ``srcUrl`` and convert it to the real URL.
      3. Hand the collected jobs to ``batch_download`` via a 4-thread pool.

    An entry that cannot be resolved (network error, unexpected JSON,
    missing node) is reported and skipped instead of aborting the run.
    """
    base_headers = hd.headers()
    etree = html.etree
    list_url = 'https://www.pearvideo.com/category_59'
    index_url = 'https://www.pearvideo.com/'
    file_path = './pear_video'
    get_video_status_url = 'https://www.pearvideo.com/videoStatus.jsp'

    resp = requests.get(url=list_url, headers=base_headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently finding no videos.
    resp.raise_for_status()
    tree = etree.HTML(resp.text)
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

    download_dict_list = []
    for li in li_list:
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div[2]/text()')
        if not hrefs or not titles:
            # List item without the expected link/title nodes: nothing to download.
            continue

        video_page_url = index_url + hrefs[0]
        file_name = sanitize_file_name(titles[0]) + '.mp4'
        try:
            cont_id = video_page_url.split('_')[1]
            # The status endpoint only answers when Referer is the video's
            # own page. Build a per-request copy so the shared headers are
            # not polluted with a previous video's Referer.
            status_headers = {**base_headers, 'Referer': video_page_url}
            params = {'contId': cont_id, 'mrd': random.random()}
            status_resp = requests.get(
                url=get_video_status_url,
                headers=status_headers,
                params=params,
                timeout=REQUEST_TIMEOUT,
            )
            status_resp.raise_for_status()
            video_url = json.loads(status_resp.text)['videoInfo']['videos']['srcUrl']
        except (requests.RequestException, LookupError, ValueError) as e:
            # LookupError covers a missing '_' in the URL or a missing JSON
            # key; ValueError covers a body that is not valid JSON.
            print('解析失败 ', video_page_url, e)
            continue

        download_dict_list.append({
            'file_path': file_path,
            'file_name': file_name,
            'new_video_url': resolve_real_video_url(video_url, cont_id),
        })

    # map() blocks until every job has finished; the context manager then
    # releases the worker threads.
    with Pool(4) as pool:
        pool.map(batch_download, download_dict_list)


if __name__ == '__main__':
    main()
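# Source article: "Python crawler beginner tutorial 8 - XPath - pearvideo - batch video download - thread pool"
# (Web-page footer: latest recommended article published 2022-06-29 11:10:51)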