Python多线程爬虫
需求教程(旧版):bilibili传送门
完整代码
import requests
from lxml import etree
import re
import os
from multiprocessing.dummy import Pool
def get_video(video_all):
    """Download one video and store it as ``pearvideo/<title>.mp4``.

    Args:
        video_all: Mapping with the keys ``'true_url'`` (URL that is fetched)
            and ``'title'`` (used in the log output and as the file name).

    Returns:
        None. The outcome is reported on stdout.

    The file is only written when the server answers with status 200, so the
    body of an error response is never saved as if it were a video.
    """
    title = video_all['title']
    print(title, '正在下载...')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    # A timeout keeps a stalled connection from blocking a pool worker forever.
    response = requests.get(video_all['true_url'], headers=headers, timeout=60)
    code = response.status_code
    saved = code == 200
    if saved:
        # Characters that are not allowed in file names on every platform
        # (path separators included) are replaced, so a title cannot break
        # or escape the target path.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        # os.path.join picks the platform's separator; a hard-coded backslash
        # path only addresses the folder on Windows.
        target = os.path.join('pearvideo', '%s.mp4' % safe_title)
        with open(target, 'wb') as fp:
            fp.write(response.content)
    print(title, '响应码:', code, end=" ")
    if saved:
        print('视频保存成功')
    else:
        print('视频保存失败')
# Create the output folder. exist_ok=True makes this a no-op when the folder
# is already there and removes the gap between checking and creating.
os.makedirs("./pearvideo", exist_ok=True)
# Fetch the category listing page.
url = 'https://www.pearvideo.com/category_5'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
# The timeout stops a stalled connection from hanging the script, and
# raise_for_status() surfaces an HTTP error instead of parsing an error page
# into an empty link list.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
text = response.text
tree = etree.HTML(text)
# href values of the entries in the video list; the code further down expects
# each one to contain "video_<digits>" and appends it to the site root.
li_list = tree.xpath('//*[@id="listvideoListUl"]/li/div/a/@href')
print(li_list)
video_list = []
# The patterns are compiled once instead of on every iteration. Raw strings
# keep "\d" a regex escape rather than an invalid string escape, which newer
# Python versions warn about.
VIDEO_ID_RE = re.compile(r'video_([\d]*)')
SRC_URL_RE = re.compile(
    r'https://video.pearvideo.com/mp4/adshort/([\d]*?)/[\d]*?-([\d]*?)_'
)
TITLE_RE = re.compile(r'(.*?)_')
REQUEST_TIMEOUT = 10  # seconds; one stalled request must not hang the crawl

for li in li_list:
    # The JSON endpoint below was found among the page's XHR requests; its
    # response contains the mp4 URL.
    id_match = VIDEO_ID_RE.search(li)
    if id_match is None:
        # One malformed link should not abort the whole crawl.
        print('skipped, no video id in link:', li)
        continue
    v_li = id_match.group(1)
    v_url = 'https://www.pearvideo.com/videoStatus.jsp'
    v_headers = {
        'Referer': 'https://www.pearvideo.com/' + li,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    v_param = {
        'contId': v_li
    }
    v_text = requests.get(
        v_url, params=v_param, headers=v_headers, timeout=REQUEST_TIMEOUT
    ).json()
    try:
        mp4_url = v_text['videoInfo']['videos']['srcUrl']
    except (KeyError, TypeError):
        # The response did not have the expected videoInfo/videos/srcUrl shape.
        print('skipped, no srcUrl in response for:', li)
        continue
    # The script does not download srcUrl itself; it rebuilds the address by
    # replacing the first number of the file name with "cont-<contId>":
    #   mp4_url : https://video.pearvideo.com/mp4/adshort/20210201/1612171096323-15594744_adpkg-ad_hd.mp4
    #   true_url: https://video.pearvideo.com/mp4/adshort/20210201/cont-1718799-15594744_adpkg-ad_hd.mp4
    src_match = SRC_URL_RE.search(mp4_url)
    if src_match is None:
        print('skipped, unexpected srcUrl format:', mp4_url)
        continue
    mp4_date, mp4_id = src_match.groups()
    true_url = 'https://video.pearvideo.com/mp4/adshort/%s/cont-%s-%s_adpkg-ad_hd.mp4' % (
        mp4_date, v_li, mp4_id
    )
    print(true_url)
    # Fetch the detail page to read the video title.
    t_url = 'https://www.pearvideo.com/' + li
    t_text = requests.get(t_url, headers=headers, timeout=REQUEST_TIMEOUT).text
    t_tree = etree.HTML(t_text)
    page_titles = t_tree.xpath('//title/text()')
    # The title is the part of <title> before the first underscore. Fall back
    # to the whole <title> text, or to the link itself, when that part is
    # missing, so the entry still gets a usable name.
    title_match = TITLE_RE.search(page_titles[0]) if page_titles else None
    if title_match is not None:
        title = title_match.group(1)
    elif page_titles:
        title = page_titles[0].strip()
    else:
        title = li
    print(title)
    video_all = {
        'true_url': true_url,
        'title': title
    }
    video_list.append(video_all)
# Show the collected download jobs, framed by blank lines.
print()
print(video_list)
print()
# Create the pool (multiprocessing.dummy.Pool, i.e. thread-backed) and run the
# download function once per collected entry.
pool = Pool(4)
try:
    pool.map(get_video, video_list)
finally:
    # Shut the pool down even when a download raises, so the worker threads
    # are always closed and joined.
    pool.close()
    pool.join()