Python asyncio 协程异步
适当修改 dowland(url, file_name) 中的内容
# coding=utf-8
import requests
import asyncio
import aiohttp
from lxml import etree
import aiofiles
import time
def _continuation_url(url, page):
    """Return the URL of page number *page* of the chapter at *url*.

    Continuation pages are addressed as ``<chapter>_<page>.html``.
    """
    suffix = '.html'
    # Slice the literal suffix off. str.rstrip('.html') would instead strip
    # any trailing run of the characters '.', 'h', 't', 'm', 'l' and could
    # eat into the chapter identifier itself.
    stem = url[:-len(suffix)] if url.endswith(suffix) else url
    return f'{stem}_{page}{suffix}'


def _page_range(tree):
    """Return ``(current, last)`` page numbers read from the first <small> text.

    The text is expected to look like ``(current/last)``. When it is missing
    or cannot be parsed, the chapter is treated as a single page.
    """
    labels = tree.xpath('//small//text()')
    if not labels:
        return 1, 1
    current, _, last = labels[0].strip().strip('()').partition('/')
    try:
        return int(current), int(last)
    except ValueError:
        return 1, 1


def _panel_text(tree):
    """Return the text nodes under ``div.panel-body``, stripped, one per line."""
    nodes = tree.xpath("//div[@class='panel-body']//text()")
    return '\n'.join(node.strip() for node in nodes)


async def dowland(url, file_name):
    """Download one chapter and save its text as ``./baiDuXiaoShuo/<file_name>``.

    Args:
        url: URL of the chapter's first page.
        file_name: Name of the UTF-8 text file to create in the output folder.

    Every page of the chapter is fetched before the file is opened, so a
    failed request does not leave an empty or partial file behind. A new
    ``aiohttp.ClientSession`` is opened for each call.
    """
    import os

    print(f'download {file_name} ing...')
    # aiofiles.open does not create missing parent directories.
    os.makedirs('./baiDuXiaoShuo', exist_ok=True)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            tree = etree.HTML(await resp.content.read())
        page, last_page = _page_range(tree)
        parts = [_panel_text(tree)]

        # Some chapters are split across several pages; fetch the rest in order.
        while page < last_page:
            page += 1
            async with session.get(_continuation_url(url, page)) as resp:
                parts.append(_panel_text(etree.HTML(await resp.content.read())))

    async with aiofiles.open(f'./baiDuXiaoShuo/{file_name}', 'w', encoding='utf-8') as f:
        # Pages are concatenated without a separator, one write per chapter.
        await f.write(''.join(parts))
    print(file_name, 'done!')
def get_list(url, timeout=10):
    """Fetch the book index page and return its chapter download list.

    Args:
        url: Index page URL. Each chapter href found on the page is appended
            to it verbatim to build the chapter URL.
        timeout: Seconds to wait for the server; without it ``requests.get``
            can block indefinitely.

    Returns:
        A list of ``(chapter_url, file_name)`` tuples. ``file_name`` is the
        part of the link title before the first space, plus ``.txt``.

    Raises:
        requests.exceptions.RequestException: If the index page cannot be
            fetched (including on timeout).
    """
    href_name = []
    with requests.get(url, timeout=timeout) as resp:
        html = etree.HTML(resp.text)

    for entry in html.xpath("/html/body/div[2]/div[3]/dl//dd"):
        hrefs = entry.xpath("./a/@href")
        titles = entry.xpath("./a/@title")
        # Skip <dd> entries without a titled link instead of raising IndexError.
        if not hrefs or not titles:
            continue
        file_name = titles[0].split(' ')[0] + '.txt'
        href_name.append((url + hrefs[0], file_name))
    return href_name
async def main():
    """Collect the chapter list and download every chapter concurrently.

    Reads the module-level ``url`` (the book index page). Failed downloads
    do not stop the others; each failure is reported after all have finished.
    """
    # Going through a set removes duplicate (url, file_name) entries.
    href_name = list(set(get_list(url)))
    print(len(href_name), '-- get download list')

    # asyncio.wait() no longer accepts bare coroutines (Python 3.11+) and
    # rejects an empty collection; gather() handles both cases.
    # return_exceptions=True keeps one failed chapter from aborting the rest.
    results = await asyncio.gather(
        *(dowland(chapter_url, file_name) for chapter_url, file_name in href_name),
        return_exceptions=True,
    )
    for (chapter_url, file_name), outcome in zip(href_name, results):
        if isinstance(outcome, BaseException):
            print(f'{file_name} failed ({chapter_url}): {outcome!r}')
if __name__ == '__main__':
    # Index page of the book to download; main() reads this module-level name.
    url = "http://www.baiduxs.net/book/47672/"

    # Time the whole run, from fetching the list to the last finished download.
    started = time.time()
    asyncio.get_event_loop().run_until_complete(main())
    elapsed = time.time() - started
    print(f'cost time: {elapsed}')