[Scraping an Entire Novel in Python with an Async Coroutine Framework]

"""
# 目标网站:https://www.zanghaihua.org/mingchaonaxieshier/
# 最终目的:按每一部的名称保存所有章节内容
"""

import requests
from lxml import etree
import re
import asyncio
import aiohttp
import aiofiles
import os
import time
from fake_useragent import UserAgent  # random User-Agent


def get_all_book_names(url):  # get the names of all the books
    # target pattern in the page source: <span class="v"> .*? </span>
    resp_book_names = requests.get(url, headers=get_random_ua())
    # print(resp_book_names.text)
    obj = re.compile(r'<span class="v"> (?P<book_names>.*?) </span>', re.S)
    # obj = re.compile(r'<span class="v"> .*? </span>', re.S)
    result_book_names = obj.findall(resp_book_names.text)
    # print(result_book_names)
    return result_book_names
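
A quick illustration of how the named group captures each title (the sample string is hypothetical, just shaped like the site's markup):

demo_obj = re.compile(r'<span class="v"> (?P<book_names>.*?) </span>', re.S)
sample = '<span class="v"> Part One </span><span class="v"> Part Two </span>'
print(demo_obj.findall(sample))  # ['Part One', 'Part Two']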


def get_all_chapter_urls(url):  # get the URLs of all chapters
    resp_chapter_urls = requests.get(url, headers=get_random_ua())
    html = etree.HTML(resp_chapter_urls.text)
    # grab every chapter link from the index page
    all_chapter_urls = html.xpath('/html/body/div[6]/span[*]/a/@href')
    # print(all_chapter_urls)
    return all_chapter_urls
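
The absolute XPath /html/body/div[6]/... above breaks the moment the page layout shifts. A relative expression anchored on an attribute is usually sturdier; the class name below is an assumption, not checked against the live page:

# all_chapter_urls = html.xpath('//div[@class="booklist"]//a/@href')  # hypothetical class name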


def get_random_ua():  # build a random UA header to dodge basic anti-scraping
    ua = UserAgent()
    headers = {
        'User-Agent': ua.chrome
    }
    return headers
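
Note that UserAgent() loads its UA data at construction time and, in some versions of fake_useragent, can raise when that data is unavailable; a defensive sketch (the helper name and the fallback UA string are my own):

def get_random_ua_safe():
    try:
        return {'User-Agent': UserAgent().chrome}
    except Exception:
        # fall back to a fixed, plausible UA if fake_useragent cannot load its data
        return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}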


def split_items(items):  # split the flat chapter list into the seven books
    # chapter counts per book on the index page: 33, 22, 20, 22, 20, 20, 22
    bounds = [0, 33, 55, 75, 97, 117, 137, 159]
    return [items[bounds[i]:bounds[i + 1]] for i in range(7)]
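
A quick sanity check of the split against a dummy list of 159 items:

print([len(g) for g in split_items(list(range(159)))])  # [33, 22, 20, 22, 20, 20, 22]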


async def download(url, path):  # fetch one chapter's content asynchronously and save it
    # create the folder if it does not exist yet
    # (exist_ok=True avoids a race when many tasks reach this line at once)
    os.makedirs(path, exist_ok=True)

    # the session plays the role of requests
    # (reuse the random-UA headers here too, for the same anti-scraping reason)
    async with aiohttp.ClientSession(headers=get_random_ua()) as session:
        # send the request
        async with session.get(url) as resp:
            # read the response body
            page_source = await resp.text()
            # parse the page
            tree = etree.HTML(page_source)
            title = tree.xpath("//div[@class='chaptertitle clearfix']/h1/text()")[0].strip()
            content = "\n".join(tree.xpath("//div[@id='BookText']/text()"))
            # write the chapter text to a file
            file_path = os.path.join(path, title + '.txt')  # path down to the chapter level
            async with aiofiles.open(file_path, mode='w', encoding='utf-8') as f:
                await f.write(content)
    print("Downloaded --> " + path.split('/')[-1] + '_' + title)


async def main():
    url = 'https://www.zanghaihua.org/mingchaonaxieshier/'  # target site

    # 1. get the names of all the books
    all_book_names = get_all_book_names(url)

    # 2. get the URLs of all chapters
    all_chapter_urls = get_all_chapter_urls(url)

    # 3. download with async coroutines, one batch of tasks per book
    chapter_groups = split_items(all_chapter_urls)  # split once, outside the loop
    for i in range(7):
        path = './download/' + all_book_names[i]  # path down to the book level
        tasks = []
        for chapter_url in chapter_groups[i]:
            t = asyncio.create_task(download(chapter_url, path))
            tasks.append(t)
        await asyncio.wait(tasks)
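
An equivalent way to await the batch is asyncio.gather, which re-raises the first exception from any task instead of leaving failures to be inspected afterwards:

# drop-in replacement for the await asyncio.wait(tasks) line:
# await asyncio.gather(*tasks)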


if __name__ == '__main__':
    time_begin = time.time()
    asyncio.run(main())
    time_end = time.time()
    print('Total time: ' + str(time_end - time_begin) + ' seconds')

Understand this async coroutine demo, and attain eternal life......
