第七章 视频爬取

1. m3u8文件内容解析

#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:6 每个ts文件的最大长度
#EXT-X-PLAYLIST-TYPE:VOD
#EXT-X-MEDIA-SEQUENCE:0
#EXT-X-KEY:METHOD=AES-128,URI="/20231106/rPXApTHz/2000kb/hls/key.key" 切片文件的加密方式以及加密的密钥地址
#EXTINF:3,
/20231106/rPXApTHz/2000kb/hls/45NdQTke.ts 不带#是每个ts文件的地址

2. 爬取视频的步骤

  1. 拿到视频页的页面源代码
  2. 从视频页的页面源代码中找到对应的iframe,提取到iframe里面的src
  3. 请求到src对应的页面源代码。在该页面中解析出真正的M3U8文件地址
  4. 下载第一层M3U8,从第一层M3U8中解析出第二层的地址
  5. 下载第二层M3U8.从第二层M3U8中解析出每一个TS文件的路径,启动协程任务
  6. 对ts文件进行解密操作:先拿key
  7. 对ts文件进行合并.还原回mp4文件

1. 拿到视频页的页面源代码

使用request获取视频页面的源码

def get_page_source(url, timeout=10):
    """Fetch a page with a browser-like User-Agent and return its text as UTF-8.

    :param url: address of the page to request
    :param timeout: seconds before the request is aborted; without it a stalled
        server would hang the whole scraper forever
    :return: decoded response body
    """
    head = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    resp = requests.get(url=url, headers=head, timeout=timeout)
    # Force UTF-8: the target site may omit the charset header.
    resp.encoding = "utf-8"
    return resp.text

2. 提取源码中第一层m3u8的地址

使用正则re模块

def get_first_m3u8_src(url):
    """Extract the first-level m3u8 URL from the video page source.

    The page embeds the playlist URL as a JSON string with escaped slashes
    (e.g. "https:\\/\\/host\\/path\\/"), so backslashes are stripped before use.

    :param url: address of the video page
    :return: absolute URL ending in index.m3u8
    :raises ValueError: when the page does not contain the expected pattern
    """
    page_source = get_page_source(url)
    obj = re.compile(r'"link_pre":"","url":"(?P<m3u8_src>.*?)index.m3u8"', re.S)
    result = obj.search(page_source)
    if result is None:
        raise ValueError("first-level m3u8 url not found in page source")
    # Un-escape the JSON value ("\/" -> "/") and re-attach the file name.
    return result.group("m3u8_src").replace('\\', '') + 'index.m3u8'

3. 获取m3u8文件并下载

下载是因为在后续的步骤中会使用到文件
with open(file = ,mode = ,encoding=) as f:
f.write()
使用with创建文件,并写入

def download_second_m3u8(url):
    """Download the second-level m3u8 and save it to ./m3u8.txt.

    The first-level playlist only points at the real (second-level) playlist;
    its address is the first line that is not a '#' directive, which may be
    relative and is therefore resolved against the first-level URL.

    :param url: absolute URL of the first-level m3u8
    :raises ValueError: when the first-level playlist contains no entry
    """
    first_level = get_page_source(url)
    for line in first_level.split('\n'):
        line = line.strip()
        # The first non-directive, non-blank line is the nested playlist path.
        if line and not line.startswith('#'):
            second_m3u8_src = urljoin(url, line)
            break
    else:
        raise ValueError("no second-level playlist found in first-level m3u8")
    page_source = get_page_source(second_m3u8_src)
    with open(file='./m3u8.txt', mode='w', encoding='utf-8') as f:
        f.write(page_source)

4. 获取m3u8中的ts_src

使用strip()去皮
使用startswith()判断字符串的开头

def get_merge_ts():
    """Read ./m3u8.txt and return the ts segment paths, in playlist order.

    Lines starting with '#' are m3u8 directives and are skipped.  Blank lines
    are skipped too — the original kept them, producing empty segment entries
    that later resolved to the bare base URL.

    :return: list of non-empty, non-directive lines, stripped of whitespace
    """
    with open(file='./m3u8.txt', mode='r', encoding='utf-8') as f:
        return [line.strip() for line in f
                if line.strip() and not line.strip().startswith('#')]

5. 下载单个ts

  • 使用协程快速同时多个任务执行asyncio
  • aiohttp: async with aiohttp.ClientSession() as session 协程中用来请求接口,类似于requests
  • aiofiles: async with aiofiles.open(file=,mode='wb') as f: 协程中用来创建文件,类似于with open
  • resp.content.read() 协程中用来转化页面源内容,类似于resp.content
async def download_one_ts(ts_src):
    """Download a single ts segment into ./movies_yuan/, retrying up to 10 times.

    Fixes over the original: the success message no longer prints on failed
    attempts, the blocking time.sleep() is replaced with asyncio.sleep() so the
    event loop keeps serving other downloads, and exhausting all retries is
    reported instead of silently ignored.

    :param ts_src: absolute URL of the ts segment
    :return: True when the segment was saved, False when every attempt failed
    """
    file_name = ts_src.split('/')[-1]
    for attempt in range(10):
        print(f'开始下载{file_name}......')
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(ts_src) as resp:
                    content = await resp.content.read()
            async with aiofiles.open(file=f"./movies_yuan/{file_name}",mode='wb') as f:
                await f.write(content)
            print(f'{file_name}下载完成.....')
            return True
        except Exception:
            print(f"{file_name}下载失败,重新下载")
            # Back off a little longer after each failure without blocking
            # the event loop (time.sleep would freeze all other tasks).
            await asyncio.sleep(2 * attempt)
    print(f"{file_name}下载失败,已放弃")
    return False

6. 下载所有的ts

  • 协程的执行
    await asyncio.wait(tasks)
    asyncio.run(download_all_ts())
async def download_all_ts():
    """Download every ts segment listed in ./m3u8.txt concurrently.

    Guards against an empty playlist: asyncio.wait() raises ValueError when
    given an empty task collection.
    """
    url = 'https://ukzy.ukubf3.com/'
    ts = get_merge_ts()
    print(f"共计{len(ts)}个电影")
    if not ts:
        return
    tasks = [asyncio.create_task(download_one_ts(urljoin(url, segment)))
             for segment in ts]
    await asyncio.wait(tasks)

7. 获取密钥文件

def get_key():
    """Read the key URI out of ./m3u8.txt and fetch the raw AES-128 key.

    The key is 16 raw bytes, so it must be fetched as binary (resp.content).
    The original routed it through get_page_source(), which decoded it as
    UTF-8 text and re-encoded it — corrupting any key that is not valid UTF-8.

    :return: the key as bytes
    :raises AttributeError: when the playlist has no #EXT-X-KEY line
    """
    obj = re.compile(r'#EXT-X-KEY:METHOD=AES-128,URI="(?P<key_url>.*?)"')
    with open(file='./m3u8.txt', mode='r', encoding='utf-8') as f:
        key_url = obj.search(f.read()).group('key_url')
    url = urljoin("https://ukzy.ukubf3.com/", key_url)
    head = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    resp = requests.get(url=url, headers=head, timeout=10)
    return resp.content

8. 文件解密

  • 下载包pip install pycryptodome
  • 导包 from Crypto.Cipher import AES
  • aes = AES.new(key=key,IV=b"0000000000000000",mode=AES.MODE_CBC)
# 单个文件解密
async def dos_one_file(file_name,key):
    """Decrypt one downloaded ts segment (AES-128-CBC, ASCII-zero IV).

    Reads ./movies_yuan/<file_name> and writes the decrypted bytes to
    ./movies_mudi/<file_name>.

    :param file_name: bare segment file name (no directory part)
    :param key: 16-byte AES key fetched from the playlist's key URI
    """
    print(f"开始解密{file_name}")
    cipher = AES.new(key=key,IV=b"0000000000000000",mode=AES.MODE_CBC)
    src_path = f"./movies_yuan/{file_name}"
    dst_path = f"./movies_mudi/{file_name}"
    async with aiofiles.open(file=src_path,mode='rb') as src, \
            aiofiles.open(dst_path, mode='wb') as dst:
        encrypted = await src.read()
        await dst.write(cipher.decrypt(encrypted))
    print(f"{file_name}解密完毕")

#解密所有文件
async def dos_all_file():
    """Decrypt every downloaded ts segment concurrently with the shared key.

    Guards against an empty playlist: asyncio.wait() raises ValueError when
    given an empty task collection.
    """
    key = get_key()
    file_names = [segment.split('/')[-1] for segment in get_merge_ts()]
    if not file_names:
        return
    tasks = [asyncio.create_task(dos_one_file(name, key)) for name in file_names]
    await asyncio.wait(tasks)

9. 所有ts文件合并

  • python中使用windows命令合并所有的ts文件
  • 使用os模块os.system('copy /b file1+file2 new_file')合并file1 file2 文件为new_file
  • 可能合并文件的数量存在数量限制,所以一次合并ts文件数量不能过多,可以采用分批合并的方法
def merge_all_ts():
    """Merge the decrypted ts segments in ./movies_mudi into movie.mp4.

    Windows `copy /b` concatenates binary files, but the command line has a
    length limit, so segments are merged in batches of 100 into numbered
    intermediate files (1.mp4, 2.mp4, ...) which are then merged into the
    final movie.mp4.

    Fixes over the original: the batch-flush `copy` ran on *every* loop
    iteration (re-copying partial batches O(n) times), and the working
    directory was not restored when an error occurred mid-merge.
    """
    now_dir = os.getcwd()
    file_names = [segment.split('/')[-1] for segment in get_merge_ts()]
    os.chdir('./movies_mudi')
    try:
        batch = []
        n = 0  # number of intermediate files written so far
        for i, name in enumerate(file_names, start=1):
            batch.append(name)
            if i % 100 == 0:
                # Flush a full batch of 100 segments.
                n += 1
                os.system(f'copy /b {"+".join(batch)} {n}.mp4')
                batch = []
        if batch:
            # Flush the final partial batch.
            n += 1
            os.system(f'copy /b {"+".join(batch)} {n}.mp4')
        last_ls = [f"{i}.mp4" for i in range(1, n + 1)]
        print(last_ls)
        os.system(f'copy /b {"+".join(last_ls)} movie.mp4')
    finally:
        # Always restore the caller's working directory.
        os.chdir(now_dir)

10. 整体代码

import asyncio
import os
import time

import aiofiles
import aiohttp
import requests
from lxml import etree
import re
from urllib.parse import urljoin
from Crypto.Cipher import AES

# 获取页面源码
def get_page_source(url, timeout=10):
    """Fetch a page with a browser-like User-Agent and return its text as UTF-8.

    :param url: address of the page to request
    :param timeout: seconds before the request is aborted; without it a stalled
        server would hang the whole scraper forever
    :return: decoded response body
    """
    head = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    resp = requests.get(url=url, headers=head, timeout=timeout)
    # Force UTF-8: the target site may omit the charset header.
    resp.encoding = "utf-8"
    return resp.text

#提取源码中第一层m3u8的地址
def get_first_m3u8_src(url):
    """Extract the first-level m3u8 URL from the video page source.

    The page embeds the playlist URL as a JSON string with escaped slashes
    (e.g. "https:\\/\\/host\\/path\\/"), so backslashes are stripped before use.

    :param url: address of the video page
    :return: absolute URL ending in index.m3u8
    :raises ValueError: when the page does not contain the expected pattern
    """
    page_source = get_page_source(url)
    obj = re.compile(r'"link_pre":"","url":"(?P<m3u8_src>.*?)index.m3u8"', re.S)
    result = obj.search(page_source)
    if result is None:
        raise ValueError("first-level m3u8 url not found in page source")
    # Un-escape the JSON value ("\/" -> "/") and re-attach the file name.
    return result.group("m3u8_src").replace('\\', '') + 'index.m3u8'

#获取m3u8文件并下载
def download_second_m3u8(url):
    """Download the second-level m3u8 and save it to ./m3u8.txt.

    The first-level playlist only points at the real (second-level) playlist;
    its address is the first line that is not a '#' directive, which may be
    relative and is therefore resolved against the first-level URL.

    :param url: absolute URL of the first-level m3u8
    :raises ValueError: when the first-level playlist contains no entry
    """
    first_level = get_page_source(url)
    for line in first_level.split('\n'):
        line = line.strip()
        # The first non-directive, non-blank line is the nested playlist path.
        if line and not line.startswith('#'):
            second_m3u8_src = urljoin(url, line)
            break
    else:
        raise ValueError("no second-level playlist found in first-level m3u8")
    page_source = get_page_source(second_m3u8_src)
    with open(file='./m3u8.txt', mode='w', encoding='utf-8') as f:
        f.write(page_source)

# 获取m3u8中的ts_src
def get_merge_ts():
    """Read ./m3u8.txt and return the ts segment paths, in playlist order.

    Lines starting with '#' are m3u8 directives and are skipped.  Blank lines
    are skipped too — the original kept them, producing empty segment entries
    that later resolved to the bare base URL.

    :return: list of non-empty, non-directive lines, stripped of whitespace
    """
    with open(file='./m3u8.txt', mode='r', encoding='utf-8') as f:
        return [line.strip() for line in f
                if line.strip() and not line.strip().startswith('#')]




#下载单个ts
async def download_one_ts(ts_src):
    """Download a single ts segment into ./movies_yuan/, retrying up to 10 times.

    Fixes over the original: the success message no longer prints on failed
    attempts, the blocking time.sleep() is replaced with asyncio.sleep() so the
    event loop keeps serving other downloads, and exhausting all retries is
    reported instead of silently ignored.

    :param ts_src: absolute URL of the ts segment
    :return: True when the segment was saved, False when every attempt failed
    """
    file_name = ts_src.split('/')[-1]
    for attempt in range(10):
        print(f'开始下载{file_name}......')
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(ts_src) as resp:
                    content = await resp.content.read()
            async with aiofiles.open(file=f"./movies_yuan/{file_name}",mode='wb') as f:
                await f.write(content)
            print(f'{file_name}下载完成.....')
            return True
        except Exception:
            print(f"{file_name}下载失败,重新下载")
            # Back off a little longer after each failure without blocking
            # the event loop (time.sleep would freeze all other tasks).
            await asyncio.sleep(2 * attempt)
    print(f"{file_name}下载失败,已放弃")
    return False


#下载所有的ts
async def download_all_ts():
    """Download every ts segment listed in ./m3u8.txt concurrently.

    Guards against an empty playlist: asyncio.wait() raises ValueError when
    given an empty task collection.
    """
    url = 'https://ukzy.ukubf3.com/'
    ts = get_merge_ts()
    print(f"共计{len(ts)}个电影")
    if not ts:
        return
    tasks = [asyncio.create_task(download_one_ts(urljoin(url, segment)))
             for segment in ts]
    await asyncio.wait(tasks)

def get_key():
    """Read the key URI out of ./m3u8.txt and fetch the raw AES-128 key.

    The key is 16 raw bytes, so it must be fetched as binary (resp.content).
    The original routed it through get_page_source(), which decoded it as
    UTF-8 text and re-encoded it — corrupting any key that is not valid UTF-8.

    :return: the key as bytes
    :raises AttributeError: when the playlist has no #EXT-X-KEY line
    """
    obj = re.compile(r'#EXT-X-KEY:METHOD=AES-128,URI="(?P<key_url>.*?)"')
    with open(file='./m3u8.txt', mode='r', encoding='utf-8') as f:
        key_url = obj.search(f.read()).group('key_url')
    url = urljoin("https://ukzy.ukubf3.com/", key_url)
    head = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    resp = requests.get(url=url, headers=head, timeout=10)
    return resp.content






# 单个文件解密
async def dos_one_file(file_name,key):
    """Decrypt one downloaded ts segment (AES-128-CBC, ASCII-zero IV).

    Reads ./movies_yuan/<file_name> and writes the decrypted bytes to
    ./movies_mudi/<file_name>.

    :param file_name: bare segment file name (no directory part)
    :param key: 16-byte AES key fetched from the playlist's key URI
    """
    print(f"开始解密{file_name}")
    cipher = AES.new(key=key,IV=b"0000000000000000",mode=AES.MODE_CBC)
    src_path = f"./movies_yuan/{file_name}"
    dst_path = f"./movies_mudi/{file_name}"
    async with aiofiles.open(file=src_path,mode='rb') as src, \
            aiofiles.open(dst_path, mode='wb') as dst:
        encrypted = await src.read()
        await dst.write(cipher.decrypt(encrypted))
    print(f"{file_name}解密完毕")

#解密所有文件
async def dos_all_file():
    """Decrypt every downloaded ts segment concurrently with the shared key.

    Guards against an empty playlist: asyncio.wait() raises ValueError when
    given an empty task collection.
    """
    key = get_key()
    file_names = [segment.split('/')[-1] for segment in get_merge_ts()]
    if not file_names:
        return
    tasks = [asyncio.create_task(dos_one_file(name, key)) for name in file_names]
    await asyncio.wait(tasks)

#所有ts文件合并
def merge_all_ts():
    """Merge the decrypted ts segments in ./movies_mudi into movie.mp4.

    Windows `copy /b` concatenates binary files, but the command line has a
    length limit, so segments are merged in batches of 100 into numbered
    intermediate files (1.mp4, 2.mp4, ...) which are then merged into the
    final movie.mp4.

    Fixes over the original: the batch-flush `copy` ran on *every* loop
    iteration (re-copying partial batches O(n) times), and the working
    directory was not restored when an error occurred mid-merge.
    """
    now_dir = os.getcwd()
    file_names = [segment.split('/')[-1] for segment in get_merge_ts()]
    os.chdir('./movies_mudi')
    try:
        batch = []
        n = 0  # number of intermediate files written so far
        for i, name in enumerate(file_names, start=1):
            batch.append(name)
            if i % 100 == 0:
                # Flush a full batch of 100 segments.
                n += 1
                os.system(f'copy /b {"+".join(batch)} {n}.mp4')
                batch = []
        if batch:
            # Flush the final partial batch.
            n += 1
            os.system(f'copy /b {"+".join(batch)} {n}.mp4')
        last_ls = [f"{i}.mp4" for i in range(1, n + 1)]
        print(last_ls)
        os.system(f'copy /b {"+".join(last_ls)} movie.mp4')
    finally:
        # Always restore the caller's working directory.
        os.chdir(now_dir)

def main():
    """Pipeline entry point.

    The earlier pipeline stages (resolve the m3u8, download the ts segments,
    decrypt them) are commented out below; only the final merge step runs.
    Uncomment the stages in order to run the full scrape from scratch.
    """
    # url = "http://www.slxljy.com/tvshow/877737-2-1.html"
    # # first_m3u8_src = get_first_m3u8_src(url)
    # print("开始下载电影......")
    # # download all ts segments
    # asyncio.run(download_all_ts())
    # print('电影下载完成')
    # # decrypt all ts segments
    # asyncio.run(dos_all_file())
    merge_all_ts()


if __name__ == '__main__':
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值