asyncio 异步 IO 方式批量下载资源

最新推荐文章于 2024-08-08 14:28:23 发布
weixin_43990194
最新推荐文章于 2024-08-08 14:28:23 发布
阅读量216
点赞数
文章标签： python json javascript
本文链接：https://blog.csdn.net/weixin_43990194/article/details/130594982
版权
下载大型资源时支持断点续传和下载进度条；借助强大的 async 异步批量下载资源，可以榨干带宽，而自身的 CPU 和内存消耗极小，恐怖如斯。主要使用 aiohttp、aiofiles。
import asyncio
import json
from urllib.parse import urljoin
import aiohttp
import aiofiles
from aiofiles import os
from bs4 import BeautifulSoup
from tqdm import tqdm
from rich import print
import random
import misaka
import unicodedata
import os as sync_os
import sys
from functools import wraps
import csv
 
 
def with_retries(max_tries: int = 5, retries_sleep_second: float = 1):
    """Build a decorator that retries a failing coroutine function.

    The decorated coroutine function is awaited up to ``max_tries`` times.
    Every ``Exception`` raised by it counts as one failed attempt and is
    printed; between two attempts the wrapper sleeps for
    ``retries_sleep_second`` seconds.

    Args:
        max_tries: Maximum number of attempts.  With a value below 1 the
            decorated function is never called and ``TimeoutError`` is
            raised immediately.
        retries_sleep_second: Seconds to wait between two attempts.

    Returns:
        A decorator for ``async def`` functions.  The decorated function
        returns whatever the original returns on its first successful
        attempt.

    Raises:
        TimeoutError: Raised by the decorated function when every attempt
            failed.  The exception of the last attempt is attached as
            ``__cause__`` so the real reason is not lost.
    """
    def wrapper(async_function):
        @wraps(async_function)
        async def wrapped(*args, **kwargs):
            last_error = None
            for tries in range(1, max_tries + 1):
                try:
                    return await async_function(*args, **kwargs)
                except Exception as e:
                    last_error = e
                    # Any Exception is retried, so report its real type.
                    print(f"{type(e).__name__}: {str(e)}, tries: {tries}")
                    # No point in sleeping when there is no attempt left.
                    if tries < max_tries:
                        await asyncio.sleep(retries_sleep_second)
            raise TimeoutError("Reached aiohttp max tries") from last_error

        return wrapped

    return wrapper


class SN(asyncio.Future):
    """Client that logs in to the platform and mirrors its questions locally.

    ``await SN()`` performs the login and evaluates to the logged-in
    instance (see ``__await__``).  After that ``get_session()`` must be
    called once before any other request or download method: it creates the
    shared ``aiohttp`` session and loads the record file that lists what has
    already been saved.

    Metadata collected by ``participant_list`` accumulates in
    ``self.questions`` (rows for the CSV export) and ``self.resource_url``
    (per-question download descriptions).
    """

    BASE_URL = "http://ip"
    URL = f"{BASE_URL}/api/mp/auth/login"
    USERNAME = "**"
    PASSWORD = "**"

    # Number of bytes requested per read while streaming a body to disk.
    CHUNK_SIZE = 64 * 1024

    token = ""
    cookie_value = ""
    # Class-level defaults are kept so existing attribute access keeps
    # working; __init__ gives every instance its own lists so that two
    # instances never share collected data.
    participant_list_data = []
    resource_url = []
    questions = []

    def __init__(self, *, loop=None):
        """Initialise the underlying future and the per-instance collections.

        Args:
            loop: Optional event loop, forwarded to ``asyncio.Future``.
        """
        super().__init__(loop=loop)
        self.participant_list_data = []
        self.resource_url = []
        self.questions = []

    def get_session(self):
        """Create the authorised HTTP session and load the record file.

        Sets ``self.client`` (an ``aiohttp.ClientSession`` sending
        ``self.token`` as ``Authorization`` header), ``self.RECORD_TXT``
        (``<script name>.txt`` next to the current working directory) and
        ``self.record`` (the set of ``|``-separated keys stored in it).
        """
        headers: dict[str, str] = {"Authorization": self.token}
        timeout = aiohttp.ClientTimeout(total=3600 * 48)
        self.client = aiohttp.ClientSession(
            headers=headers,
            timeout=timeout
        )
        # Swap only the extension; a plain str.replace("py", "txt") would
        # also rewrite every "py" inside the script name itself.
        script_stem = sync_os.path.splitext(
            sync_os.path.basename(sys.argv[0]))[0]
        self.RECORD_TXT = f"{script_stem}.txt"
        # "a+" creates the file when it is missing (os.mknod is not
        # available on every platform) and never truncates existing data.
        with open(self.RECORD_TXT, mode="a+", encoding='utf-8') as f:
            f.seek(0)
            self.record = set(f.read().split("|"))

    async def close_session(self):
        """Close the HTTP session created by ``get_session``."""
        await self.client.close()

    async def login(self):
        """Post the credentials and store the returned token.

        ``self.token`` is set to ``"JWT <token>"`` built from
        ``data.token`` of the JSON response.  A non-200 status is only
        reported with ``print('no')``; when the response carries no token
        the stored value is ``"JWT None"``.

        Returns:
            This instance, so that ``await SN()`` yields a usable client.
        """
        data = {
            "username": self.USERNAME,
            "password": self.PASSWORD,
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(self.URL, json=data) as resp:
                if resp.status != 200:
                    print('no')

                res = await resp.json()

                # "data" may be present but null; fall back to an empty dict.
                self.token = f'JWT {(res.get("data") or {}).get("token")}'
        return self

    def __await__(self):
        """Make ``await SN()`` run ``login()`` and evaluate to the instance."""
        return self.login().__await__()

    async def get(self, url):
        """GET ``url`` with the shared session and return the decoded JSON.

        Args:
            url: Absolute URL to request.

        Returns:
            The decoded JSON body.

        Raises:
            RuntimeError: If the body has no truthy ``"data"`` entry.  The
                message is the stringified response.
        """
        async with self.client.get(url=url) as resp:
            if resp.status != 200:
                print('no')
            res = await resp.json()
            if not res.get("data"):
                print(url, res)
                # An ordinary Exception subclass so that regular
                # ``except Exception`` handlers can deal with it.
                raise RuntimeError(str(res))
            return res

    async def big_list(self, page, size):
        """Fetch one page of the outer question list.

        Args:
            page: Page number passed as ``page`` query parameter.
            size: Page size passed as ``per_page`` query parameter.

        Returns:
            A list of dicts with ``"url"`` (detail endpoint of the item)
            and ``"name"`` (``"<title>-<practice_set_name>"``).
        """
        url = f"{self.BASE_URL}/api/ojj/oj/admin/practice/operation/list/?per_page={size}&page={page}"

        res = await self.get(url)
        big_list = []
        for one in res.get("data", {}).get("list", []):
            name: str = one.get("title")
            print(name)
            resource_id = one.get("resource_id")
            url = f'{self.BASE_URL}/api/ojj/oj/admin/practice/operation/?resource_id={resource_id}'
            big_list.append({"url": url, "name": f'{name}-{one.get("practice_set_name")}'})
        return big_list

    async def participant_list(self, url, name):
        """Fetch one question and queue everything that has to be saved.

        Appends one row to ``self.questions`` and one entry to
        ``self.resource_url``.  The entry contains ``"question"``,
        ``"files"`` (every attachment), ``"file"`` (the last attachment, or
        ``{}``; kept for existing consumers), ``"pic"`` (images referenced
        by the write-up), ``"md"`` (the write-up, or ``{}``) and ``"scene"``.

        Args:
            url: Detail endpoint of the question.
            name: Title to store instead of the server-side title.  The
                part before the first ``-`` becomes the first CSV column.

        Raises:
            RuntimeError: Propagated from ``get`` when the response has no
                data.
        """
        res = await self.get(url)
        data = res.get("data", {})
        pic_url_list = []
        md = {}
        old_name = data["title"]
        data["title"] = name
        difficulty = {
            0: "简单",
            1: "一般",
            2: "较难",
            3: "困难",
        }
        # NOTE(review): assumes "answer" and "knowledge" are JSON-encoded
        # lists in the response; verify against the API.
        self.questions.append([
            data.get("title", "").split("-")[0],
            old_name,
            json.loads(data.get("answer"))[0],
            difficulty.get(data.get("difficulty")),
            "|".join(json.loads(data.get("knowledge"))),
            str(bool(data.get("scene_config_data")))])

        question = {
            "title": data.get("title"),
            "answer": data.get("answer"),
            "difficulty": data.get("difficulty"),
            "origin_data": data,
        }
        # Keep every attachment; previously each one overwrote the last.
        files = [
            {
                "file_dir": data.get("title").replace("/", "_"),
                "file_name": file.get("file_name"),
                "file_url": urljoin(self.BASE_URL, file.get("file_url")),
            }
            for file in data.get("files") or []
        ]
        file_data = files[-1] if files else {}
        if data.get("writeup"):
            md_render = misaka.Markdown(misaka.HtmlRenderer())
            html = md_render(data.get("writeup"))
            soup = BeautifulSoup(html, features="html.parser")
            md = {
                "name": data.get("title"),
                # Images are stored under "<question dir>/pic/<file name>",
                # so the attachment prefix has to become "pic/" (with the
                # slash) for the links in the saved Markdown to resolve.
                "content": data.get("writeup").replace("/cpms/oj/practice_attachment/", "pic/")
            }
            for raw in soup.find_all("img"):
                pic_url: str = raw.get("src", "")
                if not pic_url:
                    continue
                pic_name = pic_url.split("/")[-1]
                all_pic_url = urljoin(self.BASE_URL, pic_url)
                pic_url_list.append({
                    "pic_url": all_pic_url,
                    "file_name": pic_name,
                    "list_name": data.get("title"),
                })
        self.resource_url.append({
            "question": question,
            "file": file_data,
            "files": files,
            "pic": pic_url_list,
            "md": md,
            "scene": bool(data.get("scene_config_data"))
        })

    async def _prepare_target(self, root, middle_dir, file_name, *, remove_existing=True):
        """Create ``<root>/<middle_dir>`` and return the target file path.

        Spaces in the directory part are replaced by underscores.  With
        ``remove_existing`` a file already present at the target is deleted.
        """
        target_dir = f"{root}/{middle_dir}".replace(" ", "_")
        # exist_ok avoids the check-then-create race between the many
        # coroutines that run concurrently.
        await os.makedirs(target_dir, exist_ok=True)
        target = f"{target_dir}/{file_name}"
        if remove_existing:
            try:
                await os.remove(target)
            except FileNotFoundError:
                pass
        return target

    async def _append_record(self, key):
        """Mark ``key`` as saved, both in the record file and in memory."""
        # The record file is read as UTF-8, so it must be written as UTF-8.
        async with aiofiles.open(self.RECORD_TXT, mode="a", encoding="utf-8") as f_record:
            await f_record.write(f'{key}|')
        self.record.add(key)

    async def save_json(self, middle_dir, file_name, content):
        """Write ``content`` as JSON to ``resources_qs/<middle_dir>/<file_name>``.

        Nothing is written when ``<middle_dir>/<file_name>`` is already
        recorded.  After a successful write the key is recorded.

        Args:
            middle_dir: Directory below ``resources_qs``.
            file_name: Name of the file to create.
            content: Object to serialise.  A ``str`` is written unchanged;
                values the JSON encoder does not know are stringified.
        """
        record_key = f'{middle_dir}/{file_name}'
        if record_key in self.record:
            print(f'{file_name}已经保存')
            return
        target = await self._prepare_target("resources_qs", middle_dir, file_name)
        if isinstance(content, str):
            text = content
        else:
            # str(dict) would produce a Python repr, which is not JSON.
            text = json.dumps(content, ensure_ascii=False, indent=2, default=str)
        async with aiofiles.open(target, mode="w", encoding="utf-8") as f:
            await f.write(text)
        await self._append_record(record_key)

    @with_retries()
    async def download(self, url, middle_dir, file_name):
        """Download ``url`` to ``resources_qs/<middle_dir>/<file_name>``.

        URLs that are empty or not below ``BASE_URL`` are ignored, and so
        are files whose key ``<middle_dir>/<file_name>`` is already
        recorded.  The key is recorded only after the whole body has been
        written.

        Raises:
            TimeoutError: From ``with_retries`` when every attempt failed,
                including attempts that ended with an HTTP error status.
        """
        if not (url and url.startswith(f'{self.BASE_URL}/')):
            return
        record_key = f'{middle_dir}/{file_name}'
        if record_key in self.record:
            print(f"{file_name} 已经下载跳过")
            return
        target = await self._prepare_target("resources_qs", middle_dir, file_name)
        async with self.client.get(url=url) as response:
            # Do not store (and record as done) the body of an error page.
            response.raise_for_status()
            async with aiofiles.open(target, mode="wb") as f:
                async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
                    await f.write(chunk)
        print(f'end {file_name}')
        await self._append_record(record_key)

    @with_retries()
    async def zip_download(self, url, middle_dir, file_name):
        """Download ``url`` with resume support and a progress bar.

        The file is written to ``resources_sn/<middle_dir>/<file_name>``.
        When a partial file exists and the server reports a size, only the
        missing tail is requested with a ``Range`` header.  Without a
        reported size, or when the server answers the range request with
        the complete body, the file is rewritten from the start.

        Raises:
            TimeoutError: From ``with_retries`` when every attempt failed.
        """
        if not (url and url.startswith("http")):
            return
        # NOTE(review): the other savers use "resources_qs"; confirm that a
        # separate "resources_sn" tree is intended for attachments.
        target = await self._prepare_target(
            "resources_sn", middle_dir, file_name, remove_existing=False)
        if await os.path.exists(target):
            temp_size = await os.path.getsize(target)
        else:
            temp_size = 0

        # Only the headers of this first request are used.
        async with self.client.get(url=url) as resp:
            resp.raise_for_status()
            total_size = int(resp.headers.get("content-length", 0))
        # A missing Content-Length (0) means "unknown", not "complete".
        if total_size and temp_size >= total_size:
            print(f"{file_name} 下载完成")
            return
        if not total_size:
            # Without a size the partial file cannot be validated.
            temp_size = 0

        # Open-ended range: the last byte position is inclusive, so naming
        # total_size as the end would point one byte past the resource.
        headers = {"Range": f'bytes={temp_size}-'} if temp_size else {}
        async with self.client.get(url=url, headers=headers) as response:
            response.raise_for_status()
            if response.status != 206:
                # The whole body is being sent; appending would corrupt
                # the partial file.
                temp_size = 0
            mode = "ab" if temp_size else "wb"
            async with aiofiles.open(target, mode=mode) as f:
                with tqdm(initial=temp_size,
                          desc=f"{self.cjkrjust(file_name, 40)}",
                          total=total_size or None,
                          unit='',
                          unit_scale=True,
                          unit_divisor=1024,
                          colour=self.random_color()) as bar:
                    async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
                        await f.write(chunk)
                        bar.update(len(chunk))

    def random_color(self):
        """Return a random colour as ``#`` followed by six hex digits.

        Each digit is drawn from ``1``-``9`` and ``A``-``F``.
        """
        digits = "123456789ABCDEF"
        return "#" + "".join(random.choice(digits) for _ in range(6))

    def count_cjk_chars(self, string):
        """Count the characters of ``string`` with East Asian width F or W."""
        return sum(unicodedata.east_asian_width(c) in 'FW' for c in string)

    def cjkrjust(self, string, width, fillbyte=' '):
        """Right-justify ``string``, counting wide characters as two columns.

        Args:
            string: Text to pad.
            width: Target display width in columns.
            fillbyte: Padding character.

        Returns:
            ``string`` padded on the left.
        """
        return string.rjust(width - self.count_cjk_chars(string), fillbyte)

    async def save_md(self, middle_dir, file_name, content):
        """Write Markdown text to ``resources_qs/<middle_dir>/<file_name>``.

        Nothing is written when ``file_name`` is already recorded.  After a
        successful write ``file_name`` is recorded.

        Args:
            middle_dir: Directory below ``resources_qs``.
            file_name: Name of the file to create; also the record key.
            content: Markdown text to write.
        """
        if file_name in self.record:
            print(f'{file_name} 已经保存跳过')
            return
        target = await self._prepare_target("resources_qs", middle_dir, file_name)
        async with aiofiles.open(target, mode="w", encoding="utf-8") as f:
            await f.write(content)
        await self._append_record(file_name)

    async def save_csv(self, data):
        """Write ``data`` as CSV to ``question.csv``.

        Args:
            data: Iterable of rows, each row an iterable of cell values.
        """
        import io

        # csv.writer calls write() synchronously; on an aiofiles handle
        # that only creates coroutines that are never awaited, so the rows
        # are rendered in memory first and written with a single await.
        buffer = io.StringIO(newline="")
        csv.writer(buffer).writerows(data)
        # newline="" keeps the writer's own line endings untouched.
        async with aiofiles.open("question.csv", 'w', encoding='utf-8', newline="") as f:
            await f.write(buffer.getvalue())


async def main():
    """Log in, collect every question and save all of its resources.

    Steps: log in, fetch the first 200 entries of the question list, load
    the details of each entry, export ``sn.questions`` to CSV, then save
    the JSON, attachments, write-up and pictures of every question
    concurrently.  The HTTP session is closed even when a step fails.

    Raises:
        Exception: The first failure among the save/download tasks is
            re-raised after all tasks have finished.
    """
    sn = await SN()
    sn.get_session()
    try:
        print(sn.token)
        big_list = await sn.big_list(1, 200)
        await asyncio.gather(*[sn.participant_list(
            data.get("url"), f'{index}-{data.get("name")}'
        ) for index, data in enumerate(big_list)])
        await sn.save_csv(sn.questions)
        tasks = []
        for one in sn.resource_url:
            scene_dir = "scene_yes" if one.get("scene") else "scene_no"
            question = one.get("question")
            # A "/" in a title would otherwise be taken as a path
            # separator; attachments already use "_" in their directory.
            title = str(question.get("title")).replace("/", "_")
            tasks.append(sn.save_json(
                f'{scene_dir}/{title}',
                f'{title}.json',
                question.get("origin_data")))
            # Entries may carry a list under "files"; otherwise the single
            # "file" entry is used.
            file_items = one.get("files")
            if file_items is None:
                single_file = one.get("file")
                file_items = [single_file] if single_file else []
            for file_data in file_items:
                tasks.append(sn.zip_download(
                    file_data.get("file_url"),
                    f'{scene_dir}/{file_data.get("file_dir")}',
                    file_data.get("file_name")
                ))
            md = one.get("md")
            if md:
                md_name = str(md.get("name")).replace("/", "_")
                tasks.append(sn.save_md(
                    f'{scene_dir}/{md_name}',
                    f'{md_name}.md',
                    md.get("content")
                ))
            for two in one.get("pic", []):
                list_name = str(two.get("list_name")).replace("/", "_")
                tasks.append(sn.download(
                    two.get("pic_url"),
                    f'{scene_dir}/{list_name}/pic',
                    two.get("file_name")
                ))

        # Collect failures instead of aborting on the first one, so no
        # task is still using the session when it gets closed below.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        failures = [result for result in results
                    if isinstance(result, BaseException)]
        for failure in failures:
            print(f"task failed: {failure!r}")
        if failures:
            raise failures[0]
    finally:
        await sn.close_session()

    print("----------------------end------------------------")

    print("----------------------end------------------------")


# Run the downloader only when executed as a script, so that importing
# this module does not start network activity.
if __name__ == "__main__":
    asyncio.run(main())