An asynchronous crawler in Python

The crawler mainly relies on asyncio for the asynchronous work; if it still feels too slow, you can add multithreading or multiprocessing on top (a sketch of the threading idea follows the code). A minimal sketch of the core asyncio pattern comes first, then the full code.
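The pattern is: schedule one task per download with asyncio.create_task, then await all of them at once. This is a self-contained sketch of that idea only; the URLs are placeholders, not from the real site.

import asyncio
import aiohttp

async def fetch_one(session, url):
    # One unit of concurrent work: GET a URL and return its bytes.
    async with session.get(url) as resp:
        return await resp.read()

async def fetch_all(urls):
    # One shared ClientSession for all requests.
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch_one(session, u)) for u in urls]
        return await asyncio.gather(*tasks)  # all downloads run concurrently

# Placeholder URLs for illustration only.
bodies = asyncio.run(fetch_all(["https://example.com/a.ts", "https://example.com/b.ts"]))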

import requests
from lxml import etree
import urllib.parse
import re
import aiofiles
import aiohttp
import asyncio
import os
# Create the output directory; exist_ok avoids a crash when re-running.
os.makedirs("电影", exist_ok=True)


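# Fetch a URL synchronously and return the page HTML.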
def get_page_source(url):
    req = requests.get(url)
    return req.text


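# Collect the relative links to every episode's playback page.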
def get_href(url):
    rep = requests.get(url)
    tree = etree.HTML(rep.text)
    href = tree.xpath('//div[@class="playlist"]/ul/li/a/@href')
    return href


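# The playback page embeds the first m3u8 URL in an inline script; extract it with a regex.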
def get_iframe_src(new_href):
    rep = requests.get(new_href)
    obj = re.compile('var now="(?P<m3u8>.*?)";var pn="wjm3u8";', re.S)
    m3u8 = obj.search(rep.text)
    return m3u8.group("m3u8")


def get_second_m3u8(m3u8):
    # The first playlist usually just points at the real one; its
    # relative path is the last whitespace-separated token.
    req = requests.get(m3u8)
    m3u8_list = req.text.split()[-1]
    # Use the first path segment of the URL as a per-episode name (site-specific).
    name = m3u8.split("/")[3]
    second_m3u8 = urllib.parse.urljoin(m3u8, m3u8_list)
    page = get_page_source(second_m3u8)
    # Save the real playlist locally so the later steps can reuse it.
    with open(f"{name}m3u8.text", mode="w", encoding="utf-8") as f:
        f.write(page)
    return name


async def down_load(name):
    # Create one task per ts segment, then wait for all of them at
    # once; awaiting inside the loop would download them one by one.
    with open(f"{name}m3u8.text", mode="r", encoding="utf-8") as f:
        lines = f.read().split()
    tasks = []
    for line in lines:
        if line.startswith("#"):  # skip m3u8 directives
            continue
        file_name = line.split("/")[-1]
        tasks.append(asyncio.create_task(down_load_one(line, file_name)))
    await asyncio.wait(tasks)


async def down_load_one(line, file_name):
    # Keep retrying until the segment downloads; a stricter version
    # would cap the retry count.
    while 1:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(line) as req:
                    content = await req.content.read()
            async with aiofiles.open(f"./电影/{file_name}", mode="wb") as f1:
                await f1.write(content)
            print("downloaded", line)
            return
        except Exception as e:
            print("download failed, retrying:", e)


def merge_ts(name):
    # Merge the downloaded ts segments with the Windows "copy /b"
    # command. Command lines have a length limit, so merge batches of
    # ~100 segments first, then merge the batch files into the mp4.
    print("merging segments")
    temp = []
    with open(f"{name}m3u8.text", mode="r", encoding="utf-8") as f:
        for item in f.read().split():
            if item.startswith("#"):  # skip m3u8 directives
                continue
            temp.append(item.split("/")[-1])
    now_chr = os.getcwd()
    os.chdir("电影")  # the segments live here and copy /b needs bare names
    name_list = []
    n = 1
    for i, everyone in enumerate(temp):
        name_list.append(everyone)
        if i != 0 and i % 100 == 0:
            os.system(f"copy /b {' + '.join(name_list)} {n}.ts")
            n += 1
            name_list = []  # start the next batch
    if name_list:
        # Merge whatever segments are left over after the full batches.
        os.system(f"copy /b {' + '.join(name_list)} {n}.ts")
        n += 1
    # Merge the numbered batch files 1.ts .. (n-1).ts into the final movie.
    temp_2 = [f"{i}.ts" for i in range(1, n)]
    os.system(f"copy /b {' + '.join(temp_2)} 指环王.mp4")
    os.chdir(now_chr)
    print("merge finished")
    return n


def cut_down(name, n):
    # Delete the individual segments and the numbered batch files,
    # leaving only the final mp4.
    print("removing temporary files")
    with open(f"{name}m3u8.text", mode="r", encoding="utf-8") as f:
        for i in f.read().split():
            if i.startswith("#"):
                continue
            os.remove(f"./电影/{i.split('/')[-1]}")
    # merge_ts returns n one past the last batch file it wrote.
    for num in range(1, n):
        os.remove(f"./电影/{num}.ts")
    # Remove the saved playlist only after its handle is closed.
    os.remove(f"{name}m3u8.text")
    print("cleanup finished")


def main():
    url = "http://www.yaboeye.com/mz/64626.html"
    hrefs = get_href(url)
    for href in hrefs:
        new_href = urllib.parse.urljoin(url, href)
        m3u8 = get_iframe_src(new_href)
        name = get_second_m3u8(m3u8)
        # asyncio.run starts and closes a fresh event loop per episode.
        asyncio.run(down_load(name))
        n = merge_ts(name)
        cut_down(name, n)


if __name__ == '__main__':
    main()
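
Note that merge_ts shells out to the Windows-only copy /b command. On another OS, a minimal pure-Python alternative could look like the sketch below; merge_ts_portable is a hypothetical helper, not part of the original script, and it assumes the segment file names are already collected in playlist order.

import os

def merge_ts_portable(ts_names, out_name="指环王.mp4", folder="电影"):
    # Concatenate the segments in playlist order by streaming bytes;
    # works on any OS, unlike the Windows-only "copy /b".
    with open(os.path.join(folder, out_name), mode="wb") as out:
        for ts in ts_names:
            with open(os.path.join(folder, ts), mode="rb") as part:
                out.write(part.read())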

Once the download finishes, no leftover files remain. The process can take a while, so please be patient.
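
One way to act on the multithreading suggestion from the introduction is to push the blocking requests calls onto worker threads with asyncio.to_thread (available since Python 3.9). This is a minimal sketch with placeholder URLs; fetch_text and fetch_many are hypothetical helpers, not part of the script above.

import asyncio
import requests

def fetch_text(url):
    # A plain blocking fetch, like the synchronous helpers above.
    return requests.get(url).text

async def fetch_many(urls):
    # Each blocking call runs in the default thread pool, so several
    # pages download in parallel without rewriting them with aiohttp.
    return await asyncio.gather(*(asyncio.to_thread(fetch_text, u) for u in urls))

# e.g. pages = asyncio.run(fetch_many(["https://example.com/a", "https://example.com/b"]))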

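Relatedly, down_load_one opens a new ClientSession per segment and retries forever. A sketch of a gentler variant, assuming the same (url, file_name) inputs, shares one session and caps concurrency with an asyncio.Semaphore; down_load_limited and its parameters are hypothetical.

import asyncio
import aiofiles
import aiohttp

async def down_load_one_limited(session, sem, url, file_name, retries=5):
    # The semaphore caps how many downloads run at once, and the
    # retry count is bounded instead of looping forever.
    for attempt in range(retries):
        try:
            async with sem:
                async with session.get(url) as resp:
                    content = await resp.content.read()
            async with aiofiles.open(f"./电影/{file_name}", mode="wb") as f:
                await f.write(content)
            return True
        except Exception as e:
            print("retrying", url, e)
    return False

async def down_load_limited(urls_and_names, limit=20):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(down_load_one_limited(session, sem, u, fn)
                               for u, fn in urls_and_names))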