p站多线程下载视频

最新推荐文章于 2024-07-17 14:17:17 发布

dkdklam

最新推荐文章于 2024-07-17 14:17:17 发布

阅读量2w

点赞数 3

文章标签： python

本文链接：https://blog.csdn.net/weixin_44879856/article/details/105885950

版权

import re              ######################较稳定，可以爬取多部
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import threading
from tqdm import tqdm_notebook as tqdm
import os
import winsound

def download(res, title, number, save_dir=r'G:\xxxxxx', timeout=30):
    """Stream the video at ``res`` into ``<title>.mp4`` inside ``save_dir``.

    Progress is shown with a tqdm bar labelled with ``number``. The function
    is used as a thread target, so failures are reported on stdout instead of
    being raised.

    Args:
        res: Direct URL of the media file.
        title: File name stem (without extension). The caller is expected to
            have removed characters that are not allowed in file names.
        number: Ordinal of the video; used in messages and as the bar label.
        save_dir: Directory the file is written to. Defaults to the location
            that used to be hard-coded in this function.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the thread forever.

    Returns:
        None.
    """
    # Responses smaller than this many MiB are treated as a failed fetch
    # rather than a real video.
    min_size_mb = 0.001
    try:
        # The context manager releases the streamed connection on every path.
        with req.get(res, stream=True, timeout=timeout) as html:
            # Turn 4xx/5xx answers into an error instead of saving the
            # error page as an .mp4 file.
            html.raise_for_status()
            size = int(html.headers['content-length'])
            if (size / 1024 / 1024) < min_size_mb:
                print('------------------------第{}部影片{}爬取失败,请换取url-----------------------------'.format(str(number), title))
            else:
                print('--正在爬取-第{}部影片--{}'.format(str(number), title))
                target = os.path.join(save_dir, '{}.mp4'.format(title))
                pbar = tqdm(total=size, initial=0, unit='B',
                            unit_scale=True, desc=str(number))
                try:
                    with open(target, 'wb') as mp4:
                        for chunk in html.iter_content(chunk_size=1024):
                            if chunk:
                                mp4.write(chunk)
                                # The last chunk is usually shorter than
                                # chunk_size, so count the real byte length.
                                pbar.update(len(chunk))
                finally:
                    # Close the bar even when writing fails midway.
                    pbar.close()
        # Short low beep once this video has been handled.
        winsound.Beep(220, 500)
    except (req.RequestException, OSError, KeyError, ValueError) as err:
        # KeyError: no content-length header; ValueError: header not numeric.
        print('--{}部影片{}http出现错误'.format(str(number), title), err)
        
# Placeholder User-Agent values, kept exactly as they appear in the script:
# one for the first request of a page, one for the retry requests.
_FIRST_UA = 'xxxxxxxx'
_RETRY_UA = 'xxxxxxxxxxxxxxx'
# Qualities are tried in this order; the first one present in the page wins.
_QUALITIES = ('1080p', '720p')
# Upper bound on page re-fetches while waiting for a URL that contains 'ttl'.
_MAX_TTL_RETRIES = 50
# Characters removed from the page title before it is used as a file name.
_TITLE_FORBIDDEN = str.maketrans('', '', '/*<>:?|\\"')


def _fetch_page(page_url, user_agent, timeout=30):
    """Return the text of ``page_url`` requested with the given User-Agent."""
    return req.get(url=page_url, headers={'User-Agent': user_agent},
                   timeout=timeout).text


def _find_streams(page_html, quality):
    """Return every ``"quality_<quality>":"..."`` value found in the page."""
    return re.findall('"quality_{}":"(.*?)"'.format(quality), page_html, re.S)


def _pick_quality(page_html):
    """Return ``(quality, first_match)`` for the best quality, else ``(None, None)``."""
    for quality in _QUALITIES:
        hits = _find_streams(page_html, quality)
        if hits:
            return quality, hits[0]
    return None, None


def _wait_for_ttl(page_url, quality, candidate):
    """Re-fetch the page until the stream value contains ``'ttl'``.

    Returns the accepted value, or None when ``_MAX_TTL_RETRIES`` re-fetches
    did not produce one (previously this loop was unbounded and raised
    IndexError when a re-fetch contained no match at all).
    """
    attempts = 0
    while 'ttl' not in candidate:
        if attempts >= _MAX_TTL_RETRIES:
            return None
        attempts += 1
        time.sleep(0.2)
        hits = _find_streams(_fetch_page(page_url, _RETRY_UA), quality)
        # An empty result counts as one more failed attempt.
        candidate = hits[0] if hits else ''
    return candidate


def _to_download_url(escaped):
    """Strip backslashes from the page value and rebuild the URL.

    NOTE(review): the second '/'-separated piece is dropped and the pieces
    are joined with '//', so every separator in the result is a double
    slash, not only the one after the scheme. This is kept exactly as the
    script always did it -- confirm before simplifying.
    """
    pieces = escaped.replace('\\', '').split('/')
    del pieces[1]
    return '//'.join(pieces)


def _clean_title(raw_title):
    """Keep the text before the first '-' and drop file-name-hostile characters."""
    return raw_title.split('-')[0].translate(_TITLE_FORBIDDEN)


if __name__=='__main__':
    # Ask how many videos to fetch, then collect that many page URLs.
    wanted = int(input('请输入爬取几部:'))
    page_urls = [input('输入网址:') for _ in range(wanted)]

    media_urls = []
    titles = []
    for position, page_url in enumerate(page_urls, start=1):
        try:
            page_html = _fetch_page(page_url, _FIRST_UA)
            title_tag = bs(page_html, 'lxml').find('title')
            # Fall back to a generated name when the page has no usable title.
            title = (_clean_title(title_tag.text if title_tag else '')
                     or 'video_{}'.format(position))
            quality, candidate = _pick_quality(page_html)
            if quality is None:
                print('--第{}部{}---dont have 720p,已丢取'.format(str(position), title))
                continue
            print('--准备爬取--第{}部影片-{}-{}'.format(str(position), quality, title), '\n')
            candidate = _wait_for_ttl(page_url, quality, candidate)
        except req.RequestException as err:
            # One unreachable page must not abort the whole batch.
            print('--第{}部---page request failed,已丢取'.format(position), err)
            continue
        if candidate is None:
            print('--第{}部{}---no url with ttl after {} retries,已丢取'.format(
                position, title, _MAX_TTL_RETRIES))
            continue
        media_urls.append(_to_download_url(candidate))
        titles.append(title)

    # One thread per resolved video; numbering restarts at 1 over the videos
    # that were actually resolved (skipped pages are not counted).
    workers = []
    for position, (media_url, title) in enumerate(zip(media_urls, titles), start=1):
        print('--开始爬取--第{}部影片--{}'.format(position, title), '\n')
        worker = threading.Thread(target=download, args=(media_url, title, position))
        worker.start()
        time.sleep(0.1)
        workers.append(worker)
    for worker in workers:
        worker.join()
    print('=================================全部视频已爬取完毕=========================================')
    # Higher-pitched beep once every download thread has finished.
    winsound.Beep(1000,500)