Python多线程爬虫

Python多线程爬虫

需求教程(旧版):bilibili传送门
完整代码

import requests
from lxml import etree
import re
import os
from multiprocessing.dummy import Pool
# Download one video and save it to disk
def get_video(video_all):
    """Download a single video and store it as ``pearvideo/<title>.mp4``.

    Args:
        video_all: dict with the keys ``'true_url'`` (direct mp4 URL) and
            ``'title'`` (used for progress output and as the file name).

    Returns:
        None. Progress and the outcome are printed to stdout; a failed
        download is reported and leaves no file behind.
    """
    title = video_all['title']
    print(title,'正在下载...')
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        response = requests.get(video_all['true_url'],headers = headers,timeout = 30)
    except requests.RequestException as err:
        print(title,'视频保存失败',err)
        return
    code = response.status_code
    if code != 200:
        # Do not write an error page to disk under a .mp4 name.
        print(title,'响应码:',code,'视频保存失败')
        return
    # Replace characters that are not allowed in file names (Windows is the
    # strictest), so an unusual title cannot break open().
    safe_title = re.sub(r'[\\/:*?"<>|\r\n]','_',title).strip() or 'untitled'
    # Build the path with os.path.join so it works on every platform; the
    # directory is created on demand so the function also works on its own.
    os.makedirs('pearvideo',exist_ok=True)
    target = os.path.join('pearvideo',safe_title + '.mp4')
    with open(target,'wb') as fp:
        fp.write(response.content)
    # One print call per result line reduces interleaving between threads.
    print(title,'响应码:',code,'视频保存成功')
# Create the output folder; exist_ok avoids the check-then-create race and
# makes re-running the script a no-op here.
os.makedirs("./pearvideo", exist_ok=True)
# Fetch the category listing page and collect the relative links of its videos
url = 'https://www.pearvideo.com/category_5'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
# The timeout prevents an indefinite hang; raise_for_status stops the script
# early instead of silently parsing an error page into an empty list.
response = requests.get(url,headers = headers,timeout = 30)
response.raise_for_status()
text = response.text
tree = etree.HTML(text)
# Each entry is the href of one video link inside the listing <ul>.
li_list = tree.xpath('//*[@id="listvideoListUl"]/li/div/a/@href')
print(li_list)
# Patterns are compiled once, outside the loop. Raw strings keep "\d" from
# being read as an (invalid) string escape, and the dots of the host name are
# escaped so they only match literal dots.
CONT_ID_RE = re.compile(r'video_(\d+)')
SRC_URL_RE = re.compile(r'https://video\.pearvideo\.com/mp4/adshort/(\d+)/\d+-(\d+)_')
TITLE_RE = re.compile(r'(.*?)_')

video_list = []
for li in li_list:
    # The numeric content id is embedded in the link, e.g. "video_1718799".
    id_match = CONT_ID_RE.search(li)
    if id_match is None:
        print('跳过(无法解析视频id):',li)
        continue
    v_li = id_match.group(1)
    page_url = 'https://www.pearvideo.com/'+li
    # The XHR endpoint returns JSON that contains the mp4 URL; the request
    # sends the video page as Referer.
    v_url = 'https://www.pearvideo.com/videoStatus.jsp'
    v_headers = {
        'Referer':page_url,
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    v_param = {
        'contId':v_li
    }
    v_text = requests.get(v_url,params = v_param,headers = v_headers,timeout = 30).json()
    try:
        mp4_url = v_text['videoInfo']['videos']['srcUrl']
    except (KeyError, TypeError):
        # One entry without video info should not abort the whole crawl.
        print('跳过(接口未返回视频地址):',li)
        continue
    # The URL returned by the endpoint carries a timestamp where the real
    # file name carries "cont-<content id>":
    #   mp4_url : https://video.pearvideo.com/mp4/adshort/20210201/1612171096323-15594744_adpkg-ad_hd.mp4
    #   true_url: https://video.pearvideo.com/mp4/adshort/20210201/cont-1718799-15594744_adpkg-ad_hd.mp4
    src_match = SRC_URL_RE.search(mp4_url)
    if src_match is None:
        print('跳过(无法解析视频地址):',mp4_url)
        continue
    mp4_date, mp4_id = src_match.groups()
    true_url = 'https://video.pearvideo.com/mp4/adshort/'+mp4_date+'/cont-'+v_li+'-'+mp4_id+'_adpkg-ad_hd.mp4'
    print(true_url)
    # Fetch the video page to read its title
    t_text = requests.get(page_url,headers = headers,timeout = 30).text
    t_tree = etree.HTML(t_text)
    page_titles = t_tree.xpath('//title/text()') if t_tree is not None else []
    if page_titles:
        # Keep the part of <title> before the first underscore; fall back to
        # the whole text when there is no underscore.
        title_match = TITLE_RE.search(page_titles[0])
        title = title_match.group(1) if title_match else page_titles[0]
    else:
        # No <title> found: use the content id so the video still gets a name.
        title = v_li
    print(title)
    video_all = {
        'true_url':true_url,
        'title':title
    }
    video_list.append(video_all)
print()
print(video_list)
print()
# Create the thread pool and run the download function over every video
pool = Pool(4)
try:
    pool.map(get_video,video_list)
finally:
    # Always shut the pool down, even when a download raised.
    pool.close()
    pool.join()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额 3.43 前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值