# Approach: create a process pool where each worker downloads all videos on one
# forum page.  First parse the forum page and collect each video's URL into a
# list.  Then request each URL in turn and parse the response to find the real
# stream source (.ts files), writing each .ts segment into a per-video folder.
# Once all segments are down, use a system command to merge the scattered .ts
# files into one complete mp4 and delete the original .ts files.
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import os
import json
import re
# Session cookie captured from a logged-in browser session; the forum requires
# it for authenticated page views.  NOTE(review): hard-coded credentials like
# this expire — confirm the cookie is still valid before running.
cookie='__cfduid=dd57287ca0909b82cd4d6a662901033b61580542107; RSWf_2132_saltkey=S1nxAz3e; RSWf_2132_lastvisit=1580538507; RSWf_2132_sendmail=1; __51cke__=; RSWf_2132_it618_loginpreurl=https%3A%2F%2Fmtlluntan13.com%2Fportal.php; RSWf_2132_ulastactivity=1580809941%7C0; RSWf_2132_auth=3dcalCLAV3G3kwsI0WvpVOR8%2BU55VRDTz4N1O1jmxUeDlft%2Fc43tuUY1ybYrySdIUZ1ZZUKto%2FbM%2B0G34Px2kW5%2F%2Bw4; RSWf_2132_noticeTitle=1; RSWf_2132_st_t=341885%7C1580809957%7C7e30799b7609e1bd19d774f757f371d3; RSWf_2132_atarget=1; RSWf_2132_forum_lastvisit=D_86_1580809957; RSWf_2132_lastact=1580809983%09forum.php%09viewthread; RSWf_2132_st_p=341885%7C1580809983%7C926f3f419c7a3777f4901ead6306df9b; RSWf_2132_visitedfid=86D84; RSWf_2132_viewid=tid_1174176; __tins__19395688=%7B%22sid%22%3A%201580809916931%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201580811772713%7D; __51laig__=6'
# Default request headers: desktop Edge user agent plus the session cookie.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363','Cookie':cookie}
# Merge the downloaded .ts segments into one mp4 file, then delete the .ts files.
def convertformattomp4(root):
    """Concatenate every .ts file in *root* into ``a.mp4`` and remove the segments.

    The merge relies on the Windows ``copy /b`` binary-concatenation command,
    so on other platforms the merge step is a silent no-op.  Changes the
    process working directory as a side effect (preserved from the original).
    """
    # Bug fix: resolve to an absolute path BEFORE chdir.  The caller passes a
    # relative path ('video/<title>'); the original chdir'd into it and then
    # called os.listdir() on the same relative path, which no longer resolved.
    root = os.path.abspath(root)
    os.chdir(root)
    # Binary-concatenate all segments; order follows the shell's glob order.
    os.system("copy /b *.ts a.mp4")
    # Delete the merged segments with os.remove instead of shelling out to
    # ``del`` — portable, and immune to shell injection via file names.
    for entry in os.listdir(root):
        if os.path.splitext(entry)[1] == '.ts':
            os.remove(os.path.join(root, entry))
# Page download worker: fetches one forum index page and every video on it.
def download(number):
    """Download every video listed on forum index page *number*.

    Fetches the forum page, extracts each thread link, then for each thread
    resolves the real stream URL (the ``data-high`` attribute), downloads the
    segments via write_to_file() and merges them to mp4 via convertformattomp4().
    Raises RuntimeError when no video links are found on the page.
    """
    # The page index is interpolated into the forum URL to pick the page.
    page_url = 'https://aaaaaa13.com/forum-86-' + str(number) + '.html'
    # ``headers`` is the module-level constant; the original's ``global``
    # statement was unnecessary for read-only access.
    soup = BeautifulSoup(requests.get(page_url, headers=headers).text, 'lxml')
    # Thread links pointing at the individual video pages.
    video_links = soup.select('th a.s.xst')
    if not video_links:
        # Raise instead of assert: asserts are stripped under ``python -O``.
        raise RuntimeError('no video links found on page ' + str(number))
    count = 0
    for link in video_links:
        href = str(link['href'])
        print(href)
        # NOTE(review): the original mixed two domains (aaaaaa13 for the index,
        # aaaaa13 here) — kept as-is; confirm which host is actually correct.
        thread = BeautifulSoup(
            requests.get('https://aaaaa13.com/' + href, headers=headers).text,
            'lxml')
        # The real stream location is carried in the data-high attribute.
        stream_url = thread.select('[data-high]')[0]['data-high']
        print(stream_url)
        # The target folder is named after the first 10 chars of the page title.
        title = thread.select('head title')[0].text[:10]
        print(title)
        if not os.path.exists(f'video/{title}'):
            os.makedirs(f'video/{title}')
        # Download all segments, then merge them into one mp4.
        write_to_file(stream_url, title)
        convertformattomp4(f'video/{title}')
        # Bug fix: the original did ``count = str(count + 1)``, which raised
        # TypeError (str + int) on the second loop iteration.  Keep an int
        # counter and convert only for display.
        count += 1
        print('已写入' + str(count) + '个视频')
# Fetch the playlist at the stream URL and download every .ts segment.
def write_to_file(url, title):
    """Download all .ts segments of the playlist at *url* into ``video/<title>/``.

    Two playlist shapes are handled: either the response already lists the
    full ``https://...ts`` segment URLs directly, or it points at a nested
    ``*.m3u8`` whose relative segment names must be joined onto the base of
    the playlist URL.  Raises RuntimeError when no segments are found.
    """
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'}
    playlist = requests.get(url).text
    if playlist.find('.ts') != -1:
        # Case 1: absolute segment URLs are listed directly in the playlist.
        segments = re.findall(r'(https.*?ts)', playlist, re.S)
    else:
        # Case 2: the playlist points at a nested m3u8 (e.g. "1000k/xxx.m3u8");
        # combine it with the outer URL's base to get the real playlist.
        nested = re.findall(r'(\d*?k.*?m3u8)', playlist, re.S)[0]
        print(nested)
        # Bug fix: the original read an undefined name ``urp`` here, raising
        # NameError on first use — the base must be derived from ``url``.
        nested_url = url[:-10] + nested
        print(nested_url)
        nested_playlist = requests.get(url=nested_url, headers=ua).text
        # Bug fix: escape the dot (r'\.ts') — the original bare ``.`` matched
        # any character before "ts".
        names = re.findall(r'(\w*?\.ts)', nested_playlist, re.S)
        # Join each relative segment name onto the nested playlist's base URL.
        segments = [nested_url[:-10] + n for n in names]
    if not segments:
        # The original asserted a map object against [] — that comparison can
        # never fail; raise a real error instead.
        raise RuntimeError('no .ts segments found')
    # Download every segment into the per-video folder.
    for seg in segments:
        seg = str(seg)
        print(seg)
        # Segment file name: last 9 characters of its URL.
        name = seg[-9:]
        print(name)
        with open(f'video/{title}/{name}', 'wb') as f:
            print('正在写入')
            f.write(requests.get(seg, headers=ua).content)
            print('down')
if __name__ == '__main__':
    # Process pool for parallel downloading: one worker per forum page.
    p = Pool(8)
    # Pages 2..10 (i + 2 for i in 0..8), matching the original range.
    results = [p.apply_async(download, args=(i + 2,)) for i in range(9)]
    p.close()
    p.join()
    # Bug fix: a bare apply_async silently swallows every worker exception;
    # call .get() on each result so failures are surfaced after the join.
    for r in results:
        try:
            r.get()
        except Exception as e:
            print('page download failed:', e)
    print('爬取完毕')