"""
# 目标网站:https://www.zanghaihua.org/mingchaonaxieshier/
# 最终目的:按每一部的名称保存所有章节内容
"""
import requests
from lxml import etree
import re
import asyncio
import aiohttp
import aiofiles
import os
import time
from fake_useragent import UserAgent  # random User-Agent


def get_all_book_names(url):  # fetch the names of all books
    # target markup: <span class="v"> .*? </span>
    resp_book_names = requests.get(url, headers=get_random_ua())
    # print(resp_book_names.text)
    obj = re.compile(r'<span class="v"> (?P<book_names>.*?) </span>', re.S)
    result_book_names = obj.findall(resp_book_names.text)
    # print(result_book_names)
    return result_book_names
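
# Note: the regex above depends on the exact whitespace in the page markup
# ('<span class="v"> title </span>'); if the site's HTML changes, findall()
# silently returns an empty list rather than raising an error.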


def get_all_chapter_urls(url):  # fetch the URLs of all chapters
    resp_chapter_urls = requests.get(url, headers=get_random_ua())
    html = etree.HTML(resp_chapter_urls.text)
    # grab the link of every chapter from the table of contents
    all_chapter_urls = html.xpath('/html/body/div[6]/span[*]/a/@href')
    # print(all_chapter_urls)
    return all_chapter_urls


def get_random_ua():  # build a random User-Agent header to dodge basic anti-scraping
    ua = UserAgent()
    headers = {
        'User-Agent': ua.chrome
    }
    return headers
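
# Note (behavior of the fake_useragent library, which may vary by installed
# version): UserAgent() may fetch or cache its UA list on first use, so it can
# be slow or fail offline; ua.chrome returns a random Chrome UA string.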


def split_items(items):
    items1 = items[0:33]
    items2 = items[33:55]
    items3 = items[55:75]
    items4 = items[75:97]
    items5 = items[97:117]
    items6 = items[117:137]
    items7 = items[137:159]
    return [items1, items2, items3, items4, items5, items6, items7]
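
# split_items() maps the flat chapter list onto the 7 books; the hard-coded
# boundaries (33 + 22 + 20 + 22 + 20 + 20 + 22 = 159 chapters) presumably
# mirror the site's table of contents and must be updated if it changes.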


async def download(url, path):  # fetch one chapter via an async coroutine and save it
    # create the folder if it does not exist yet
    os.makedirs(path, exist_ok=True)
    # the async counterpart of requests
    async with aiohttp.ClientSession() as session:
        # send the request
        async with session.get(url) as resp:
            # read the response body
            page_source = await resp.text()
            # parse the page
            tree = etree.HTML(page_source)
            title = tree.xpath("//div[@class='chaptertitle clearfix']/h1/text()")[0].strip()
            content = "\n".join(tree.xpath("//div[@id='BookText']/text()"))
            # write the chapter text to a file
            file_path = os.path.join(path, title + '.txt')  # path down to chapter level
            async with aiofiles.open(file_path, mode='w', encoding='utf-8') as f:
                await f.write(content)
            print("download finished --> " + path.split('/')[-1] + '_' + title)


async def main():
    url = 'https://www.zanghaihua.org/mingchaonaxieshier/'  # target site
    # 1. fetch the names of all books
    all_book_names = get_all_book_names(url)
    # 2. fetch the URLs of all chapters
    all_chapter_urls = get_all_chapter_urls(url)
    # 3. download with async coroutines, one book at a time
    chapter_groups = split_items(all_chapter_urls)
    for i in range(7):
        path = './download/' + all_book_names[i]  # path down to book level
        tasks = []
        for chapter_url in chapter_groups[i]:
            t = asyncio.create_task(download(chapter_url, path))
            tasks.append(t)
        await asyncio.wait(tasks)
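
# Note: asyncio.wait() returns (done, pending) sets and does not re-raise task
# exceptions; an equivalent, arguably more idiomatic form that propagates the
# first exception would be:
#     await asyncio.gather(*tasks)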


if __name__ == '__main__':
    time_begin = time.time()
    asyncio.run(main())
    time_end = time.time()
    print('Total time: ' + str(time_end - time_begin) + ' s')

# A demo for understanding async coroutines; grasp it and you shall live forever......