爬虫项目——m3u8后缀的电影

最新推荐文章于 2024-01-12 13:41:11 发布

未来影子

最新推荐文章于 2024-01-12 13:41:11 发布

阅读量926

点赞数

分类专栏： Python 爬虫实战；文章标签： python 爬虫

本文链接：https://blog.csdn.net/mynameisgt/article/details/120623927

版权

Python 同时被 2 个专栏收录

20 篇文章 8 订阅

订阅专栏

爬虫实战

5 篇文章 1 订阅

订阅专栏

该博客介绍了如何使用Python进行网络爬虫，从网页中提取m3u8链接，然后逐层解析下载ts片段，并利用异步协程加速下载。最后，将ts文件合并为完整视频。涉及到的技术包括正则表达式、requests、aiohttp和文件操作。

摘要由CSDN通过智能技术生成

# -*- coding: utf-8 -*-
"""
Created on Wed Oct  6 10:47:47 2021

@author: yingzi

E-mail:guotaomath@163.com
"""
'''
目标：已在目标网页的源代码中找到m3u8地址，并通过抓包工具确认：
第一层m3u8嵌套第二层m3u8（第二层包含真实ts的下载地址），视频无加密
'''

'''
流程：
    1. 拿到网页源代码
    2. 从网页源代码提取第一层m3u8的url
    3. 解析第一层的m3u8，获取第二层m3u8的url
    4. 解析第二层的m3u8,分别下载相应的ts
    5. 合并ts
'''
import requests
import re
import asyncio
import aiohttp
import aiofiles
import os

def get_m3u8_url(url):
    """Download a web page and return the m3u8 address embedded in it.

    The page source is searched for the first ``now="..."`` assignment and
    the text between the quotes is returned unchanged.

    Args:
        url: Address of the web page to download.

    Returns:
        The string captured from the first ``now="..."`` occurrence.

    Raises:
        ValueError: If the page source contains no ``now="..."`` assignment.
    """
    resp = requests.get(url)
    resp.encoding = "utf-8"
    obj = re.compile(r'now="(?P<url>.*?)"', re.S)  # extracts the m3u8 address
    match = obj.search(resp.text)
    if match is None:
        # Without this guard a page that lacks the marker fails with an
        # opaque AttributeError on ``None.group``.
        raise ValueError(f'no now="..." m3u8 address found in page: {url}')
    return match.group("url")
    
def down_first_m3u8(url):
    """Fetch the first-level m3u8 playlist and save it as ``法律之地.txt``.

    The response body is written as raw bytes into the current working
    directory, overwriting any existing file of that name.

    Args:
        url: Address of the first-level m3u8 playlist.
    """
    response = requests.get(url)
    response.encoding = "utf-8"
    with open("法律之地.txt", mode="wb") as playlist_file:
        raw_bytes = response.content
        playlist_file.write(raw_bytes)
        
def get_second_m3u8_url(url):
    """Build the second-level m3u8 URL from the saved first-level playlist.

    Reads ``法律之地.txt`` from the current working directory, ignores
    comment lines (starting with ``#``) and blank lines, and resolves the
    last remaining entry against ``url``.

    Args:
        url: Address the first-level playlist was downloaded from; used as
            the base for resolving the playlist entry.

    Returns:
        The absolute URL of the second-level m3u8 playlist.

    Raises:
        ValueError: If the playlist file contains no entry at all.
    """
    from urllib.parse import urljoin

    entry = None
    with open("法律之地.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            candidate = line.strip()
            # Blank lines must be skipped too, otherwise a trailing empty
            # line would replace the real entry with an empty string.
            if not candidate or candidate.startswith("#"):
                continue
            entry = candidate  # keep the last entry, as before
    if entry is None:
        raise ValueError("法律之地.txt contains no second-level m3u8 entry")
    # urljoin handles root-relative, relative and absolute entries alike,
    # instead of relying on a hard-coded "/20210704" path fragment.
    return urljoin(url, entry)

def down_second_m3u8(url):
    """Fetch the second-level m3u8 playlist and save it as ``法律之地2.txt``.

    The response body is written as raw bytes into the current working
    directory, overwriting any existing file of that name.

    Args:
        url: Address of the second-level m3u8 playlist.
    """
    response = requests.get(url)
    response.encoding = "utf-8"
    with open("法律之地2.txt", mode="wb") as playlist_file:
        raw_bytes = response.content
        playlist_file.write(raw_bytes)

async def download_ts(url, name, session):
    """Download one ts segment and save it under ``video/法律之地/``.

    Args:
        url: Address of the segment to fetch.
        name: File name to store the segment under inside
            ``video/法律之地/``.
        session: Object whose ``get(url)`` result is used as an async
            context manager yielding the response (``aio_download`` passes
            an ``aiohttp.ClientSession``).
    """
    # The output directory is not created anywhere else in this script;
    # without it the file open below fails with FileNotFoundError.
    os.makedirs("video/法律之地", exist_ok=True)
    async with session.get(url) as resp:
        # Read the body before opening the target so that a failed transfer
        # does not leave an empty file behind for the merge step to pick up.
        data = await resp.content.read()
    async with aiofiles.open(f"video/法律之地/{name}", mode="wb") as f:
        await f.write(data)
    print(f"{name}下载完毕!!")


async def aio_download():
    """Download every ts segment listed in ``法律之地2.txt`` concurrently.

    Each non-comment, non-blank line of the playlist is treated as a segment
    URL. One ``download_ts`` task is created per segment, all sharing a
    single ``aiohttp.ClientSession``, and the function waits until every
    task has finished. The text after the last ``/`` of the URL is used as
    the segment's file name.
    """
    tasks = []
    async with aiohttp.ClientSession() as session:    # one shared session
        async with aiofiles.open("法律之地2.txt", mode="r", encoding="utf-8") as f:
            async for line in f:
                ts_url = line.strip()
                # Blank lines previously crashed with IndexError on rsplit.
                if not ts_url or ts_url.startswith("#"):
                    continue
                name = ts_url.rsplit("/", 1)[-1]
                task = asyncio.create_task(download_ts(ts_url, name, session))
                tasks.append(task)
        # asyncio.wait raises ValueError when given an empty collection.
        if tasks:
            await asyncio.wait(tasks)
    
# def merge_ts_1():
#     lst = []
#     with open("法律之地2.txt",mode="r",encoding="utf-8") as f:
#         for line in f:
#             if line.startswith("#"):
#                 continue
#             name = line.strip().rsplit("/",1)[1]
#             lst.append(f"video/法律之地/{name}")
#     s = "+".join(lst)
#     os.system(f"copy /b {s} video.mp4")
#     print("搞定！！")
   
def merge_ts_2():
    """Concatenate the downloaded ts segments into ``video.ts``.

    Walks ``法律之地2.txt`` in order, skipping comment lines (starting with
    ``#``) and blank lines. For each remaining entry the text after the last
    ``/`` is taken as the segment file name, and the matching file under
    ``video/法律之地/`` is appended to ``video.ts``. Segments that are
    missing on disk are skipped.
    """
    with open("法律之地2.txt", mode="r", encoding="utf-8") as playlist, \
            open("video.ts", "wb") as merged:
        for line in playlist:
            entry = line.strip()
            # Blank lines previously crashed with IndexError on rsplit.
            if not entry or entry.startswith("#"):
                continue
            name = entry.rsplit("/", 1)[-1]
            ts_video_path = f"video/法律之地/{name}"
            try:
                # The context manager closes each segment file promptly; the
                # bare open(...).read() left one handle open per segment.
                with open(ts_video_path, "rb") as segment:
                    merged.write(segment.read())
            except FileNotFoundError:
                # Segment was never downloaded; skip it as before.
                continue
    print("搞定！！")

if __name__ == '__main__':
    # Script entry point: run the whole pipeline for one hard-coded page.
    url = "https://www.daquan.cc/play/?15855-1-0.html"
    m3u8_first_url = get_m3u8_url(url)   # 2. extract the first-level m3u8 URL

    down_first_m3u8(m3u8_first_url)      # 3.1 save the first-level m3u8
    m3u8_second_url = get_second_m3u8_url(m3u8_first_url)  # 3.2 second-level URL

    down_second_m3u8(m3u8_second_url)    # 4.1 save the second-level m3u8
    # 4.2 download the ts files concurrently. asyncio.create_task needs an
    # already running event loop and raises RuntimeError here; asyncio.run
    # starts the loop and blocks until every download has finished.
    asyncio.run(aio_download())

    merge_ts_2()  # 5. merge the ts files