python asyncio 协程 异步

python asyncio 协程异步

请根据实际需要,适当修改 dowland(url, file_name) 中的内容。

# coding=utf-8
import requests
import asyncio
import aiohttp
from lxml import etree
import aiofiles
import time

def _panel_text(html):
    """Return the text nodes of the ``panel-body`` div, stripped and newline-joined."""
    nodes = html.xpath("//div[@class='panel-body']//text()")
    return '\n'.join([node.strip() for node in nodes])


def _page_range(html):
    """Return ``(current, total)`` read from the first ``<small>`` text, e.g. ``(1/3)``.

    Falls back to ``(1, 1)`` (a single page) when the marker is missing or is
    not of the form ``current/total``.
    NOTE(review): the single-page fallback is an assumption about pages that
    carry no marker -- confirm against the target site.
    """
    markers = html.xpath('//small//text()')
    if not markers:
        return 1, 1
    try:
        current, total = markers[0].strip().strip('()').split('/')
        return int(current), int(total)
    except ValueError:
        return 1, 1


async def dowland(url,file_name):
    """Download one chapter and save its text to ``./baiDuXiaoShuo/<file_name>``.

    The page at ``url`` is fetched and the text inside its ``panel-body`` div
    is written to the output file.  Some chapters are split over several
    pages: the ``(current/total)`` marker on the first page gives the page
    count, and each follow-up page ``<url without ".html">_<n>.html`` is
    fetched in order and appended to the same file.

    Args:
        url: Address of the chapter's first page.
        file_name: Name of the file to create inside ``./baiDuXiaoShuo/``.
    """
    import os  # local import: only needed to create the output directory

    print(f'download {file_name} ing...')
    out_dir = './baiDuXiaoShuo'
    os.makedirs(out_dir, exist_ok=True)

    # Remove only a literal ".html" suffix.  str.rstrip('.html') strips any
    # trailing run of the characters ".", "h", "t", "m", "l", which can eat
    # into the chapter id itself.
    base_url = url[:-len('.html')] if url.endswith('.html') else url

    async with aiohttp.ClientSession() as session:
        # Read the first page fully and release the connection before the
        # follow-up requests are made.
        async with session.get(url) as resp:
            html = etree.HTML(await resp.content.read())
        page_start, page_end = _page_range(html)

        async with aiofiles.open(f'{out_dir}/{file_name}','w',encoding='utf-8') as f:
            await f.write(_panel_text(html))
            for page in range(page_start + 1, page_end + 1):
                addi_url = f'{base_url}_{page}.html'
                async with session.get(addi_url) as resp:
                    add_html = etree.HTML(await resp.content.read())
                await f.write(_panel_text(add_html))

    print(file_name,'done!')

def get_list(url, timeout=30):
    """Collect the chapter links listed on a book's index page.

    Args:
        url: Address of the index page.  Each chapter ``href`` is appended to
            it verbatim to build the chapter address.
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            block the script forever.

    Returns:
        A list of ``(chapter_url, file_name)`` tuples in page order.
        ``file_name`` is the part of the link's ``title`` before the first
        space, with ``.txt`` appended.
    """
    href_name = []
    with requests.get(url, timeout=timeout) as resp:
        html = etree.HTML(resp.text)
    for entry in html.xpath("/html/body/div[2]/div[3]/dl//dd"):
        hrefs = entry.xpath("./a/@href")
        titles = entry.xpath("./a/@title")
        if not hrefs or not titles:
            # A <dd> without a titled link has nothing to download; skip it
            # instead of failing the whole listing with an IndexError.
            continue
        new_href = url + hrefs[0]
        file_name = titles[0].split(' ')[0] + '.txt'
        href_name.append((new_href, file_name))
    return href_name

async def main():
    """Fetch the chapter list and download every chapter concurrently.

    Reads the index address from the module-level ``url``, removes duplicate
    ``(chapter_url, file_name)`` pairs, then runs one ``dowland`` coroutine
    per chapter.  A failed download is reported on stdout and does not stop
    the remaining downloads.
    """
    href_name = get_list(url)
    href_name = list(set(href_name))  # drop duplicate entries
    print(len(href_name),'-- get download list')
    if not href_name:
        return

    # asyncio.wait() no longer accepts bare coroutines (Python 3.11+) and
    # raises ValueError on an empty collection; gather() schedules the
    # coroutines itself.  return_exceptions=True lets every download finish
    # and hands back the errors instead of dropping them.
    results = await asyncio.gather(
        *(dowland(m, n) for m, n in href_name),
        return_exceptions=True,
    )
    for (_, n), result in zip(href_name, results):
        if isinstance(result, BaseException):
            print(n, 'failed:', repr(result))

if __name__ == '__main__':
    # Index page of the book to download; main() reads this module-level name.
    url = "http://www.baiduxs.net/book/47672/"
    s_time = time.time()
    # Create the loop explicitly: calling asyncio.get_event_loop() with no
    # current loop is deprecated in recent Python versions.
    # NOTE(review): the loop is left unclosed, as in the original script --
    # confirm whether an explicit close() is wanted here.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(main())
    print(f'cost time: {time.time() - s_time}')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值