要用多线程的方法下载梨视频,但是由于梨视频视频数据对应的 url 一直拿不到——开始的时候还直接访问了某个视频的 ajax 返回的 url,结果那是个没用的假地址——后来基本上完全参考了一篇博客,才踉踉跄跄地把这个小视频下载完成。
转载博客地址(在此很感谢该博客的博主):从一个视频对应的 json 解析出数据、再构造出真实 url 的这个办法,自己目前都不知道是怎么发现的。
from lxml import etree
import requests
import os
import random
from multiprocessing.dummy import Pool
# --- Module-level setup: output directory, shared state, default headers ---

# Create the download directory up front. makedirs(exist_ok=True) avoids the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs('./pearvideo/', exist_ok=True)

urls = []           # detail-page URLs collected by home_parse()
download_urls = []  # direct .mp4 URLs built by get_download_urls()
name_list = []      # video titles, parallel to urls (currently unused by main)

# Default request headers: a browser User-Agent to avoid trivial bot blocking.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}
def home_parse(url):
    """Parse a pearvideo category page and collect per-video links/titles.

    Appends each video's detail-page URL to the module-level ``urls`` list
    and its title to ``name_list``.

    :param url: category page URL, e.g. https://www.pearvideo.com/category_5
    """
    home_page = requests.get(url=url, headers=headers).text
    home_tree = etree.HTML(home_page)
    li_list = home_tree.xpath('//*[@id="listvideoListUl"]/li')
    for li in li_list:
        # Relative hrefs like "video_<contId>" must be joined to the site root.
        href = 'https://www.pearvideo.com/' + li.xpath('.//@href')[0]
        urls.append(href)
        # BUG FIX: xpath() returns a list; take the first match (as done for
        # href above) so name_list holds strings, not one-element lists.
        # NOTE(review): attribute name 'vervideo-title' taken verbatim from
        # the original — confirm it still matches the live page markup.
        name = li.xpath('.//@vervideo-title')
        name_list.append(name[0] if name else '')
def get_download_urls(url_list):
    """Resolve each detail-page URL to the real .mp4 URL.

    Queries the videoStatus.jsp ajax endpoint for every detail page. The
    endpoint returns a ``srcUrl`` whose timestamp path segment must be
    replaced with ``cont-<contId>`` to obtain a playable URL. Results are
    appended to the module-level ``download_urls`` list.

    :param url_list: detail-page URLs like https://www.pearvideo.com/video_<id>
    """
    # The endpoint is loop-invariant; hoist it (the original rebound the
    # loop variable `url` here, which was confusing).
    status_url = 'https://www.pearvideo.com/videoStatus.jsp'
    for page_url in url_list:
        cont_id = page_url.split('_')[-1]
        params = {
            'contId': cont_id,
            'mrd': str(random.random())
        }
        # The Referer header is the special part: without a matching
        # video-page Referer the endpoint does not return usable data.
        req_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
            "Referer": "https://www.pearvideo.com/video_" + cont_id
        }
        rsp = requests.get(url=status_url, headers=req_headers, params=params)
        payload = rsp.json()  # renamed from `json` to avoid shadowing the stdlib name
        if 'videoInfo' in payload:
            src_url = payload['videoInfo']['videos']['srcUrl']
            # srcUrl ends in "<timestamp>-<suffix>.mp4"; rebuild it as
            # ".../cont-<contId>-<suffix>.mp4".
            prefix = '/'.join(src_url.split('/')[0:-1])
            suffix = '-'.join(src_url.split('-')[1:])
            download_urls.append(prefix + "/cont-" + cont_id + '-' + suffix)
    print(download_urls)
def get_video(url):
    """Download one video into ./pearvideo/, named after the URL's last segment.

    :param url: direct .mp4 URL produced by get_download_urls().
    """
    name = url.split('/')[-1]
    print('正在下载:{}'.format(name))
    # Stream the body in chunks instead of `.content` so a large video is
    # never held fully in memory.
    rsp = requests.get(url=url, headers=headers, stream=True)
    with open('./pearvideo/' + name, 'wb') as f:
        for chunk in rsp.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    print('成功下载:{}'.format(name))
def main():
    """Crawl the category page, resolve video URLs, download with 4 threads.

    name_list is collected but not used here: pool.map takes a single
    iterable, so only download_urls is mapped and each file is named after
    its URL tail instead of its page title.
    """
    url = 'https://www.pearvideo.com/category_5'
    home_parse(url)
    get_download_urls(urls)
    # multiprocessing.dummy.Pool is a *thread* pool — appropriate for these
    # I/O-bound downloads.
    pool = Pool(4)
    pool.map(get_video, download_urls)
    pool.close()
    pool.join()


if __name__ == '__main__':
    print('start')
    main()
- get_download_urls() 函数中 headers 的构造比较特别(必须带上指向对应视频页的 Referer)。
- home_parse() 中不同视频的 url 的解析方式需要注意(页面中的 href 是相对路径,需拼接站点根地址)。