使用 asyncio 异步 IO 方式批量下载资源

 下载大型资源时支持断点续传和下载进度条。借助强大的 async 异步批量下载资源,可以榨干带宽,而自身的 CPU、内存消耗极小,恐怖如斯。主要使用 aiohttp 和 aiofiles。

import asyncio
import json
from urllib.parse import urljoin
import aiohttp
import aiofiles
from aiofiles import os
from bs4 import BeautifulSoup
from tqdm import tqdm
from rich import print
import random
import misaka
import unicodedata
import os as sync_os
import sys
from functools import wraps
import csv
 
 
def with_retries(max_tries: int = 5, retries_sleep_second: float = 1):
    """Build a decorator that retries a failing coroutine function.

    The decorated coroutine is awaited up to ``max_tries`` times. Any
    ``Exception`` raised by an attempt is printed and, unless it was the last
    attempt, followed by a pause of ``retries_sleep_second`` seconds.

    Args:
        max_tries: Maximum number of attempts. A value below 1 means the
            coroutine is never called.
        retries_sleep_second: Pause between two consecutive attempts.

    Returns:
        A decorator for ``async def`` functions.

    Raises:
        TimeoutError: From the decorated call once every attempt has failed.
            The exception of the final attempt is attached as ``__cause__``.
    """
    def wrapper(async_function):
        @wraps(async_function)
        async def wrapped(*args, **kwargs):
            last_error = None
            for tries in range(1, max_tries + 1):
                try:
                    return await async_function(*args, **kwargs)
                except Exception as e:
                    last_error = e
                    print(f"Aiohttp ClientError: {str(e)}, tries: {tries}")
                    # Sleeping after the final attempt would only delay the
                    # failure report, so pause between attempts only.
                    if tries < max_tries:
                        await asyncio.sleep(retries_sleep_second)
            # Chain the real failure so the traceback shows why we gave up.
            raise TimeoutError("Reached aiohttp max tries") from last_error

        return wrapped

    return wrapper


class SN(asyncio.Future):
    """Client that logs in to the platform, lists questions and saves assets.

    Awaiting a fresh instance (``sn = await SN()``) performs the login and
    evaluates to the instance itself, because ``__await__`` is delegated to
    ``login``. ``get_session`` must be called afterwards and before any method
    that uses ``self.client``, ``self.record`` or ``self.RECORD_TXT``.
    """

    BASE_URL = "http://ip"
    URL = f"{BASE_URL}/api/mp/auth/login"
    USERNAME = "**"
    PASSWORD = "**"

    # Number of bytes requested per read while streaming a body to disk.
    _CHUNK_SIZE = 64 * 1024

    # Text labels for the numeric "difficulty" field of a question.
    _DIFFICULTY_LABELS = {
        0: "简单",
        1: "一般",
        2: "较难",
        3: "困难",
    }

    token = ""
    cookie_value = ""
    # Class-level defaults are kept for backward compatibility; __init__ gives
    # every instance its own lists so instances never share accumulated state.
    participant_list_data = []
    resource_url = []
    questions = []

    def __init__(self, *, loop=None):
        """Initialise the underlying future and the per-instance accumulators."""
        super().__init__(loop=loop)
        self.participant_list_data = []
        self.resource_url = []
        self.questions = []

    def get_session(self):
        """Create the authenticated HTTP session and load the download record.

        The record file is named after the running script (``sys.argv[0]``)
        with its extension replaced by ``.txt``. It holds ``|``-separated keys
        of items that were already saved; they are loaded into ``self.record``.
        """
        headers: dict[str, str] = {"Authorization": self.token}
        timeout = aiohttp.ClientTimeout(total=3600 * 48)
        self.client = aiohttp.ClientSession(headers=headers, timeout=timeout)

        # splitext only touches the extension; a plain replace("py", "txt")
        # would also rewrite "py" inside the script name itself.
        script_name = sync_os.path.basename(sys.argv[0])
        self.RECORD_TXT = f"{sync_os.path.splitext(script_name)[0]}.txt"
        # "a+" creates the file when missing (portable, unlike os.mknod) and
        # still lets us read back whatever is already recorded.
        with open(self.RECORD_TXT, "a+", encoding="utf-8") as f:
            f.seek(0)
            self.record = set(f.read().split("|"))

    async def close_session(self):
        """Close the HTTP session created by ``get_session``."""
        await self.client.close()

    async def login(self):
        """Post the credentials and store the resulting token on the instance.

        A non-200 status is only reported by printing ``no``; the response body
        is parsed either way.

        Returns:
            The instance itself, so ``await SN()`` yields a logged-in client.
        """
        data = {
            "username": self.USERNAME,
            "password": self.PASSWORD,
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(self.URL, json=data) as resp:
                if resp.status != 200:
                    print('no')

                res = await resp.json()

                # NOTE(review): when the body carries no token this becomes
                # the literal "JWT None" and later requests will presumably
                # be rejected by the server.
                self.token = f'JWT {res.get("data", {}).get("token")}'
        return self

    def __await__(self):
        """Make ``await SN()`` run ``login`` and evaluate to the instance."""
        return self.login().__await__()

    async def get(self, url):
        """GET ``url`` with the authenticated session and return the JSON body.

        Raises:
            RuntimeError: If the decoded body has no truthy ``data`` entry.
        """
        async with self.client.get(url=url) as resp:
            if resp.status != 200:
                print('no')
            res = await resp.json()
            if not res.get("data"):
                print(url, res)
                # A regular exception type, so that ordinary
                # `except Exception` handlers and asyncio.gather treat this
                # like any other failure.
                raise RuntimeError(str(res))
            return res

    async def big_list(self, page, size):
        """Fetch one page of the outer question list.

        Args:
            page: Page number sent as the ``page`` query parameter.
            size: Page size sent as the ``per_page`` query parameter.

        Returns:
            A list of ``{"url": ..., "name": ...}`` dicts, one per listed
            item, where ``url`` is the detail endpoint for that item.
        """
        url = f"{self.BASE_URL}/api/ojj/oj/admin/practice/operation/list/?per_page={size}&page={page}"

        res = await self.get(url)
        big_list = []
        for one in res.get("data", {}).get("list", []):
            name: str = one.get("title")
            print(name)
            resource_id = one.get("resource_id")
            detail_url = f'{self.BASE_URL}/api/ojj/oj/admin/practice/operation/?resource_id={resource_id}'
            big_list.append({"url": detail_url, "name": f'{name}-{one.get("practice_set_name")}'})
        return big_list

    async def participant_list(self, url, name):
        """Fetch one question's detail and queue what has to be saved for it.

        Appends one CSV row to ``self.questions`` and one work item to
        ``self.resource_url`` (keys ``question``, ``file``, ``pic``, ``md`` and
        ``scene``).

        Args:
            url: Detail endpoint of the question.
            name: Title to store for the question; it replaces the title
                returned by the server in the saved data.
        """
        res = await self.get(url)
        data = res.get("data", {})
        old_name = data["title"]
        data["title"] = name
        title = data["title"]
        has_scene = bool(data.get("scene_config_data"))

        self.questions.append([
            title.split("-")[0],
            old_name,
            json.loads(data.get("answer"))[0],
            self._DIFFICULTY_LABELS.get(data.get("difficulty")),
            "|".join(json.loads(data.get("knowledge"))),
            str(has_scene),
        ])

        question = {
            "title": title,
            "answer": data.get("answer"),
            "difficulty": data.get("difficulty"),
            "origin_data": data,
        }

        file_data = {}
        if data.get("files"):
            # NOTE(review): every iteration overwrites file_data, so only the
            # last attachment is kept. Callers expect a single dict here;
            # confirm whether questions can carry several files.
            for file in data.get("files"):
                file_data = {
                    "file_dir": title.replace("/", "_"),
                    "file_name": file.get("file_name"),
                    "file_url": urljoin(self.BASE_URL, file.get("file_url")),
                }

        md = {}
        pic_url_list = []
        writeup = data.get("writeup")
        if writeup:
            # Render the Markdown to HTML only to find the images it embeds.
            md_render = misaka.Markdown(misaka.HtmlRenderer())
            soup = BeautifulSoup(md_render(writeup), features="html.parser")
            md = {
                "name": title,
                "content": writeup.replace("/cpms/oj/practice_attachment/", "pic"),
            }
            for raw in soup.find_all("img"):
                pic_url: str = raw.get("src", "")
                if not pic_url:
                    continue
                pic_url_list.append({
                    "pic_url": urljoin(self.BASE_URL, pic_url),
                    "file_name": pic_url.split("/")[-1],
                    "list_name": title,
                })

        self.resource_url.append({
            "question": question,
            "file": file_data,
            "pic": pic_url_list,
            "md": md,
            "scene": has_scene,
        })

    async def _ensure_dir(self, root, middle_dir):
        """Create ``root/middle_dir`` (spaces become ``_``) and return it."""
        target_dir = f"{root}/{middle_dir}".replace(" ", "_")
        await os.makedirs(target_dir, exist_ok=True)
        return target_dir

    async def _remember(self, key):
        """Append ``key`` to the record file and to the in-memory record."""
        async with aiofiles.open(self.RECORD_TXT, mode="a", encoding="utf-8") as f_record:
            await f_record.write(f'{key}|')
        self.record.add(key)

    async def save_json(self, middle_dir, file_name, content):
        """Write ``content`` as JSON to ``resources_qs/<middle_dir>/<file_name>``.

        Items whose ``<middle_dir>/<file_name>`` key is already recorded are
        skipped; a successful save is recorded under that key.

        Args:
            middle_dir: Directory below ``resources_qs``.
            file_name: Name of the file to write.
            content: Data to serialise. Strings are written unchanged.
        """
        key = f'{middle_dir}/{file_name}'
        if key in self.record:
            print(f'{file_name}已经保存')
            return

        if isinstance(content, str):
            text = content
        else:
            try:
                # str(dict) would produce a Python repr, which is not JSON.
                text = json.dumps(content, ensure_ascii=False, indent=2)
            except (TypeError, ValueError):
                # Keep the save best-effort for objects JSON cannot encode.
                text = str(content)

        target_dir = await self._ensure_dir("resources_qs", middle_dir)
        # Mode "w" truncates, so an existing file needs no separate removal.
        async with aiofiles.open(f"{target_dir}/{file_name}", mode="w", encoding="utf-8") as f:
            await f.write(text)
        await self._remember(key)

    @with_retries()
    async def download(self, url, middle_dir, file_name):
        """Download ``url`` to ``resources_qs/<middle_dir>/<file_name>``.

        Only URLs below ``BASE_URL`` are fetched. Items whose
        ``<middle_dir>/<file_name>`` key is already recorded are skipped, and
        a finished download is recorded under that key.
        """
        if not (url and url.startswith(f'{self.BASE_URL}/')):
            return
        key = f'{middle_dir}/{file_name}'
        if key in self.record:
            print(f"{file_name} 已经下载跳过")
            return

        target_dir = await self._ensure_dir("resources_qs", middle_dir)
        async with self.client.get(url=url) as response:
            async with aiofiles.open(f"{target_dir}/{file_name}", mode="wb") as f:
                async for chunk in response.content.iter_chunked(self._CHUNK_SIZE):
                    await f.write(chunk)
        print(f'end {file_name}')
        await self._remember(key)

    @with_retries()
    async def zip_download(self, url, middle_dir, file_name):
        """Download ``url`` to ``resources_sn/<middle_dir>/<file_name>``, resuming.

        If a partial file exists, the remaining bytes are requested with a
        ``Range`` header and appended. Progress is shown with a tqdm bar.
        """
        if not (url and url.startswith("http")):
            return

        target_dir = await self._ensure_dir("resources_sn", middle_dir)
        file = f"{target_dir}/{file_name}"
        if await os.path.exists(file):
            temp_size = await os.path.getsize(file)
        else:
            temp_size = 0

        # First request: only the headers are used, to learn the full size.
        async with self.client.get(url=url) as resp:
            total_size = int(resp.headers.get("content-length", 0))
        # A size of 0 means the server did not report one; that must not be
        # mistaken for "already complete".
        if total_size and temp_size >= total_size:
            print(f"{file_name} 下载完成")
            return

        # Open-ended range: "from the first missing byte to the end".
        headers = {"Range": f'bytes={temp_size}-'} if temp_size else {}
        async with self.client.get(url=url, headers=headers) as response:
            if temp_size and response.status == 416:
                # Range not satisfiable: nothing is left beyond what we hold.
                print(f"{file_name} 下载完成")
                return
            if response.status == 206:
                mode = "ab"
            else:
                # The server sent the whole body, so appending would corrupt
                # the file; start over instead.
                mode = "wb"
                temp_size = 0
            async with aiofiles.open(file, mode=mode) as f:
                with tqdm(initial=temp_size,
                          desc=f"{self.cjkrjust(file_name, 40)}",
                          total=total_size or None,
                          unit='',
                          unit_scale=True,
                          unit_divisor=1024,
                          colour=self.random_color()) as bar:
                    async for chunk in response.content.iter_chunked(self._CHUNK_SIZE):
                        await f.write(chunk)
                        bar.update(len(chunk))

    def random_color(self):
        """Return a random ``#RRGGBB`` colour string for a progress bar.

        Digits are drawn from ``1``-``9`` and ``A``-``F``; ``0`` is not part of
        the palette.
        """
        palette = "123456789ABCDEF"
        return "#" + "".join(random.choice(palette) for _ in range(6))

    def count_cjk_chars(self, string):
        """Count the characters of ``string`` with East Asian width F or W."""
        return sum(unicodedata.east_asian_width(c) in 'FW' for c in string)

    def cjkrjust(self, string, width, fillbyte=' '):
        """Right-justify ``string`` to ``width`` display columns.

        Each full-width or wide character is counted as two columns, so the
        padding is reduced by one per such character.
        """
        return string.rjust(width - self.count_cjk_chars(string), fillbyte)

    async def save_md(self, middle_dir, file_name, content):
        """Write Markdown ``content`` to ``resources_qs/<middle_dir>/<file_name>``.

        Items whose ``file_name`` is already recorded are skipped; a successful
        save is recorded under ``file_name`` (without the directory).
        """
        if file_name in self.record:
            print(f'{file_name} 已经保存跳过')
            return

        target_dir = await self._ensure_dir("resources_qs", middle_dir)
        async with aiofiles.open(f"{target_dir}/{file_name}", mode="w", encoding="utf-8") as f:
            await f.write(content)
        await self._remember(file_name)

    async def save_csv(self, data):
        """Write the rows in ``data`` to ``question.csv``.

        Args:
            data: Iterable of rows, each an iterable of cell values.
        """
        import io

        # csv.writer calls write() synchronously, but an aiofiles handle
        # returns a coroutine from write(); handing it to csv.writer would
        # silently write nothing. Render in memory, then write once.
        buffer = io.StringIO(newline="")
        csv.writer(buffer).writerows(data)
        async with aiofiles.open("question.csv", 'w', encoding='utf-8', newline="") as f:
            await f.write(buffer.getvalue())


async def main():
    """Log in, collect every question, then save and download all its assets.

    Steps: log in, fetch the first 200 entries of the outer list, fetch each
    question's detail concurrently, write the CSV summary, and finally run all
    save/download jobs concurrently. The HTTP session is closed even when one
    of the steps fails.
    """
    sn = await SN()
    sn.get_session()
    try:
        print(sn.token)
        big_list = await sn.big_list(1, 200)
        await asyncio.gather(*[
            sn.participant_list(data.get("url"), f'{index}-{data.get("name")}')
            for index, data in enumerate(big_list)
        ])
        await sn.save_csv(sn.questions)

        tasks = []
        for one in sn.resource_url:
            # Questions with a scene configuration go to their own tree.
            scene_dir = "scene_yes" if one.get("scene") else "scene_no"

            question = one.get("question")
            tasks.append(sn.save_json(
                f'{scene_dir}/{question.get("title")}',
                f'{question.get("title")}.json',
                question.get("origin_data")))

            file_data = one.get("file")
            if file_data:
                tasks.append(sn.zip_download(
                    file_data.get("file_url"),
                    f'{scene_dir}/{file_data.get("file_dir")}',
                    file_data.get("file_name")
                ))

            md = one.get("md")
            if md:
                tasks.append(sn.save_md(
                    f'{scene_dir}/{md.get("name")}',
                    f'{md.get("name")}.md',
                    md.get("content")
                ))

            for two in one.get("pic", []):
                tasks.append(sn.download(
                    two.get("pic_url"),
                    f'{scene_dir}/{two.get("list_name")}/pic',
                    two.get("file_name")
                ))

        await asyncio.gather(*tasks)
    finally:
        # Always release the connection pool, including on failure.
        await sn.close_session()

    print("----------------------end------------------------")


# Start the download run only when executed as a script, so that importing
# this module does not trigger network traffic.
if __name__ == "__main__":
    asyncio.run(main())

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值