基于异步协程的Demo搞个电视剧全集

最新推荐文章于 2024-10-08 12:37:10 发布

TurboJesean

最新推荐文章于 2024-10-08 12:37:10 发布

阅读量312

点赞数

文章标签： python 爬虫

本文链接：https://blog.csdn.net/TurboJesean/article/details/129894909

版权

该文章仅作为学习异步协程时的一个学习笔记

比Demo增加了如下难度：

1.在Windows系统上，异步协程必须限制并发数量（semaphore），因为*.ts文件实在是太多了；

2.获取key值，基于AES-128解密*.ts文件（异步协程方式）；

3.*.ts文件名是随机的，必须按照m3u8文件里的顺序进行合并，然后转为*.mp4。

总之，唯有多写多练，方能真正理解。

"""
1.目的：从网站中基于异步协程爬取整个狂飙电视剧全集
2.目标网站：http://yintaitv.com/v/58362.html
"""

import requests
import re
from lxml import etree  # Xpath
import asyncio
import aiohttp
import aiofiles
import time
import os
from Crypto.Cipher import AES  # AES-128解密
from fake_useragent import UserAgent  # 随机UA


def get_useful_information(information):
    """Cache an m3u8 response in ``temp.txt`` and return its payload lines.

    The raw response body is written to ``temp.txt`` in the current working
    directory because ``get_key``, ``des_all_ts_file`` and ``merge_ts``
    re-read that file later on.

    Args:
        information: Response-like object exposing the raw body as a
            ``bytes`` attribute named ``content``.

    Returns:
        list[str]: The playlist lines that are neither blank nor start with
        ``#``, in file order, with surrounding whitespace removed.
    """
    with open('temp.txt', 'wb') as f:
        f.write(information.content)
    with open('temp.txt', 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Blank lines are dropped as well: an empty entry would otherwise be
        # treated as a segment URL / file name by the callers.
        return [line for line in stripped if line and not line.startswith('#')]


async def download(url, path, semaphore):
    """Download one segment into *path*, bounded by *semaphore*.

    Args:
        url: URL of the segment; the text after its last ``/`` is used as the
            local file name.
        path: Destination directory; created if it does not exist yet.
        semaphore: Async context manager (``asyncio.Semaphore`` in ``main``)
            that limits how many downloads run at the same time.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error
            status, so an error page is never stored as a segment.
    """
    # exist_ok avoids a separate existence check before creating the folder.
    os.makedirs(path, exist_ok=True)

    file_name = url.split("/")[-1]
    file_path = os.path.join(path, file_name)
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            # Send the request with a fresh random User-Agent.
            async with session.get(url, headers=get_random_ua()) as resp_ts:
                # Fail loudly instead of saving an HTML error body as *.ts.
                resp_ts.raise_for_status()
                page_content = await resp_ts.content.read()
        # Persist the response body while still holding the semaphore slot.
        async with aiofiles.open(file_path, mode='wb') as f:
            await f.write(page_content)
    print("下载完成----" + path + '----' + file_name)


def get_key():
    """Fetch the decryption key referenced by the playlist cached in ``temp.txt``.

    The key location is taken from the first ``URI="..."`` attribute found in
    ``temp.txt`` (the file written by ``get_useful_information``).

    Returns:
        bytes: The key exactly as served by the key URL.

    Raises:
        ValueError: If ``temp.txt`` contains no ``URI="..."`` attribute.
    """
    obj = re.compile(r'URI="(?P<key_url>.*?)"')
    with open("temp.txt", mode="r", encoding='utf-8') as f:
        result = obj.search(f.read())
    if result is None:
        raise ValueError('no URI="..." key attribute found in temp.txt')
    key_url = result.group("key_url")
    # NOTE(review): the URI is requested as-is, so it is assumed to be an
    # absolute URL -- confirm for playlists that use relative key paths.
    # Use the raw body: decoding to text and re-encoding as UTF-8 can alter
    # key bytes that are not plain ASCII.
    return requests.get(key_url).content


async def des_ts_file(path, file_name, key):
    """Decrypt a single downloaded segment with AES in CBC mode.

    The encrypted file ``path/file_name`` is read in full, decrypted and
    written under the same name into the directory obtained by replacing
    ``未解密`` with ``已解密`` in *path*.

    Args:
        path: Directory holding the encrypted segment.
        file_name: Name of the segment file inside *path*.
        key: AES key passed straight to ``AES.new``.
    """
    print("即将开始解密----" + path + '----' + file_name)

    # Make sure the output directory for decrypted segments exists.
    target_dir = path.replace('未解密', '已解密')
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # The IV is sixteen ASCII "0" characters (not sixteen zero bytes).
    cipher = AES.new(key=key, IV=b"0000000000000000", mode=AES.MODE_CBC)

    encrypted_path = path + "/" + file_name
    decrypted_path = target_dir + "/" + file_name
    async with aiofiles.open(encrypted_path, mode="rb") as encrypted_file:
        async with aiofiles.open(decrypted_path, mode="wb") as decrypted_file:
            # Read the whole ciphertext, decrypt it, store the plaintext.
            ciphertext = await encrypted_file.read()
            await decrypted_file.write(cipher.decrypt(ciphertext))
    print("解密完毕----" + target_dir + '----' + file_name)


async def des_all_ts_file(path, key):
    """Decrypt every segment listed in ``temp.txt`` concurrently.

    One ``des_ts_file`` task is started per playlist entry; the entry's file
    name is the text after its last ``/``.

    Args:
        path: Directory holding the encrypted segments.
        key: AES key forwarded to ``des_ts_file``.

    Raises:
        Exception: Whatever the first failing ``des_ts_file`` task raised;
            failures are propagated instead of being silently discarded.
    """
    tasks = []
    with open("temp.txt", mode="r", encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip playlist tags/comments and blank lines (a blank line would
            # otherwise yield an empty file name).
            if not line or line.startswith("#"):
                continue
            file_name = line.split("/")[-1]
            tasks.append(asyncio.create_task(des_ts_file(path, file_name, key)))
    # gather() accepts an empty task list (asyncio.wait raises ValueError on
    # one) and re-raises task exceptions to the caller.
    await asyncio.gather(*tasks)


def merge_ts(path):
    """Concatenate an episode's decrypted segments into one ``.mp4`` file.

    Segments are appended in the order they are listed in ``temp.txt`` (in the
    current working directory), since their file names carry no ordering.
    They are read from the directory obtained by replacing ``未解密`` with
    ``已解密`` in *path*. The output file is created in the current working
    directory and named after *path* with the ``./download/`` prefix and the
    ``_未解密`` marker removed.

    Args:
        path: Download directory of the episode, e.g. ``./download/<name>``.
    """
    playlist_path = os.path.join(os.getcwd(), 'temp.txt')
    video_name = path.replace('./download/', '').replace('_未解密', '') + '.mp4'
    decrypted_dir = path.replace('未解密', '已解密')

    # Read the playlist as UTF-8, matching how the other functions read it.
    with open(video_name, "wb") as f, open(playlist_path, "r", encoding='utf-8') as f1:
        for line in f1:
            line = line.strip()
            # Skip playlist tags/comments and blank lines (a blank line would
            # otherwise resolve to the directory itself).
            if not line or line.startswith("#"):
                continue
            ts_name = line.split("/")[-1]
            with open(os.path.join(decrypted_dir, ts_name), "rb") as f2:
                f.write(f2.read())

    print('----' + video_name + ' 下载完毕' + '----')


async def main():
    """Download, decrypt and merge every episode listed on the show page.

    For each episode link found on the landing page this resolves the two
    levels of m3u8 playlists, downloads all listed segments concurrently,
    decrypts them and merges them into one video file, printing the time
    spent per episode.
    """
    # 1. Fetch the HTML source of the show's landing page.
    resp = requests.get('http://yintaitv.com/v/58362.html', headers=get_random_ua())

    # 2. Use XPath to collect each episode's play-page link and title.
    page = etree.HTML(resp.text)
    href_urls = page.xpath('/html/body/div[2]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/div/ul/li[*]/a/@href')
    names = page.xpath('/html/body/div[2]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/div/ul/li[*]/a/text()')

    # Absolute play-page URL and download-folder name for every episode.
    episode_urls = ['http://yintaitv.com' + href for href in href_urls]
    episode_names = ['《狂飙》_未解密_' + title for title in names]

    # Pattern locating the first-level m3u8 URL inside an episode page.
    m3u8_pattern = re.compile(r'https.*?m3u8', re.S)

    # Process the episodes one after another.
    for episode_url, episode_name in zip(episode_urls, episode_names):
        time_begin = time.time()

        # 3. Locate and fetch the first-level m3u8 of this episode.
        episode_page = requests.get(episode_url, headers=get_random_ua())
        # Backslashes are removed from the matched URL before it is requested.
        first_m3u8_url = m3u8_pattern.findall(episode_page.text)[0].replace('\\', '')
        first_playlist = requests.get(first_m3u8_url, headers=get_random_ua())
        # Its non-comment lines hold the path part of the second-level m3u8 URL.
        second_m3u8_paths = get_useful_information(first_playlist)

        # 4. Fetch the second-level m3u8, whose entries are the segment URLs.
        second_playlist = requests.get(
            'https://baidu.sd-play.com' + second_m3u8_paths[0],
            headers=get_random_ua(),
        )
        segment_urls = get_useful_information(second_playlist)

        # 5. Download all segments as coroutines, at most 300 at a time.
        semaphore = asyncio.Semaphore(300)
        path = './download/' + episode_name
        tasks = [
            asyncio.create_task(download(segment_url, path, semaphore))
            for segment_url in segment_urls
        ]
        await asyncio.wait(tasks)

        # 6. Decrypt the downloaded segments.
        key = get_key()
        await des_all_ts_file(path, key)

        # 7. Merge the decrypted segments into a single video file.
        merge_ts(path)

        elapsed = time.time() - time_begin
        print('总共耗时： ' + str(elapsed) + '秒')


def get_random_ua():
    """Return request headers carrying a random Chrome User-Agent.

    Used on every request to make the scraper harder to block.

    Returns:
        dict: A single-entry mapping ``{'User-Agent': <UA string>}``.
    """
    return {'User-Agent': UserAgent().chrome}


if __name__ == '__main__':
    # Run the whole scrape and report the total wall-clock time it took.
    started_at = time.time()
    asyncio.run(main())
    total_elapsed = time.time() - started_at
    print('总共耗时： ' + str(total_elapsed) + '秒')