python爬虫入门学习8-xpath-pearvideo-视频批量下载-线程池

6 篇文章 0 订阅
4 篇文章 0 订阅
import requests
from util import headers_utils as hd, download_util as du
from lxml import html
import random
import json
from multiprocessing.dummy import Pool

# 下载方法 配合线程池
def batch_download(dict):
    """Download a single video described by a task mapping.

    Written as a one-argument worker so it can be handed to a pool's
    ``map``. The mapping must provide the keys ``'file_path'``,
    ``'file_name'`` and ``'new_video_url'``; a missing key raises
    ``KeyError`` before any download is attempted.

    Any exception raised by the download itself is reported on stdout and
    swallowed, so one failing task does not stop the others. Returns
    ``None``.
    """
    target_dir, target_name, source_url = (
        dict['file_path'],
        dict['file_name'],
        dict['new_video_url'],
    )
    try:
        du.download_file(target_dir, target_name, source_url)
    except Exception as err:
        # Best-effort: report the failure and let the remaining tasks run.
        print('下载失败 ', target_name, source_url, err)


if __name__ == '__main__':
    # Script entry point: scrape one pearvideo category page, resolve the
    # real mp4 URL of every listed video, then download them all through a
    # small thread pool.
    headers = hd.headers()
    etree = html.etree
    url = 'https://www.pearvideo.com/category_59'
    # Without a timeout a stalled connection would hang the script forever.
    request_timeout = 10
    resp = requests.get(url=url, headers=headers, timeout=request_timeout)
    page_text = resp.text
    # Parse the category listing page.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
    index_url = 'https://www.pearvideo.com/'
    file_path = './pear_video'
    get_video_status_url = 'https://www.pearvideo.com/videoStatus.jsp'
    download_dict_list = []
    for li in li_list:
        # Resolve each entry independently so that one malformed list item
        # or one error reply from the status endpoint does not abort the
        # whole batch.
        try:
            video_page_url = index_url + li.xpath('./div/a/@href')[0]
            file_name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'

            # The status endpoint only answers when the Referer header
            # carries the video page URL.
            headers['Referer'] = video_page_url
            contId = video_page_url.split('_')[1]
            params = (
                ('contId', contId),
                ('mrd', random.random()),
            )
            get_video_resp = requests.get(
                url=get_video_status_url,
                headers=headers,
                params=params,
                timeout=request_timeout,
            )
            video_url = json.loads(get_video_resp.text)['videoInfo']['videos']['srcUrl']
        except (IndexError, KeyError, ValueError, requests.RequestException) as e:
            print('解析失败, 跳过 ', e)
            continue

        # srcUrl is a decoy, e.g.
        #   https://video.pearvideo.com/mp4/third/20210625/1624789819336-12719568-215128-hd.mp4
        # The URL the player really loads (seen in the `vu` query field of
        # its tracking request) is
        #   https://video.pearvideo.com/mp4/third/20210625/cont-1733335-12719568-215128-hd.mp4
        # i.e. the leading timestamp of the file name is replaced by
        # 'cont-<contId>'.
        # Only the file-name segment is rewritten, and only its first match,
        # so an identical digit run elsewhere in the URL is left untouched.
        base, sep, last_segment = video_url.rpartition('/')
        fake_prefix = last_segment.split('-')[0]
        new_video_url = base + sep + last_segment.replace(fake_prefix, 'cont-' + contId, 1)
        download_dict_list.append({
            'file_path': file_path,
            'file_name': file_name,
            'new_video_url': new_video_url,
        })

    # map() blocks until every task has finished; the context manager then
    # releases the worker threads instead of leaking the pool.
    with Pool(4) as pool:
        pool.map(batch_download, download_dict_list)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值