import asyncio
import functools
import aiohttp
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# Fetch one chapter page asynchronously and return its page source
async def get_request(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as response:
            # The site is served as gb18030; ignore undecodable bytes or .text() raises
            page_text = await response.text('gb18030', 'ignore')
            return page_text
# Callback: parse the chapter body out of the fetched page and save it to its own file
def save_chapter(filename, t):
    page_text = t.result()
    tree = etree.HTML(page_text)
    result = tree.xpath("/html/body/div[3]/div[2]/div[1]/text()")
    text = ''.join(result)
    text = text.replace(' ', "\n")  # turn the whitespace between paragraphs into line breaks
    # Round-trip through gbk to drop any characters the target encoding cannot represent
    with open(filename, mode="w") as f:  # start writing the chapter
        f.write(text.encode("gbk", "ignore").decode("gbk", "ignore"))
    print("Finished:", filename)
# Collect the chapter links and chapter names from the book's table of contents
def get_text_html(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gbk'  # the site is gbk-encoded; set it explicitly or the text comes back garbled
    tree = etree.HTML(resp.text)
    result = tree.xpath("/html/body/div[4]/dl/dd/a/@href")
    result_name = tree.xpath("/html/body/div[4]/dl/dd/a/text()")
    for i in range(0, 100):  # only take the first 100 chapters
        html = f"https://www.qb5.tw/book_51585/{result[i]}"  # build the full chapter URL
        html_name = result_name[i] + '.txt'                  # build the output file name
        task_html.append(html)
        task_htmlname.append(html_name)
if __name__ == '__main__':
    url_html = 'https://www.qb5.tw/book_51585/'
    task_html = []      # chapter page URLs
    task_htmlname = []  # chapter names / output file names
    tasks = []          # task list
    get_text_html(url_html)
    for url, filename in zip(task_html, task_htmlname):
        print(url)
        # create the coroutine object
        c = get_request(url)
        # wrap it in a task object
        task = asyncio.ensure_future(c)
        # bind this task's output file name to the callback so each chapter lands in its own file
        task.add_done_callback(functools.partial(save_chapter, filename))
        # add the task to the task list
        tasks.append(task)
    # create the event loop and run all tasks to completion
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
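
The callback-based layout above follows older asyncio tutorials. On Python 3.7+ the same crawl can be written without callbacks or module-level lists by pairing each URL with its output file name and running the downloads through asyncio.gather. The sketch below assumes the same site, XPath expressions, and encodings as the script above; list_chapters, fetch_and_save, and the 100-chapter limit are illustrative choices, not part of the original post.

import asyncio
import aiohttp
import requests
from lxml import etree

BASE = 'https://www.qb5.tw/book_51585/'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def list_chapters(limit=100):
    # Fetch the table of contents synchronously and return (url, filename) pairs
    resp = requests.get(BASE, headers=HEADERS)
    resp.encoding = 'gbk'
    tree = etree.HTML(resp.text)
    hrefs = tree.xpath("/html/body/div[4]/dl/dd/a/@href")
    names = tree.xpath("/html/body/div[4]/dl/dd/a/text()")
    return [(BASE + h, n + '.txt') for h, n in zip(hrefs, names)][:limit]

async def fetch_and_save(session, url, filename):
    # Download one chapter, extract the body text, and write it to its own file
    async with session.get(url, headers=HEADERS) as response:
        page_text = await response.text('gb18030', 'ignore')
    tree = etree.HTML(page_text)
    body = ''.join(tree.xpath("/html/body/div[3]/div[2]/div[1]/text()"))
    with open(filename, mode="w", encoding="gbk", errors="ignore") as f:
        f.write(body)
    print("Finished:", filename)

async def main():
    chapters = list_chapters()
    # One shared session for all requests; gather runs the downloads concurrently
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch_and_save(session, url, name) for url, name in chapters))

if __name__ == '__main__':
    asyncio.run(main())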
【Python】Web scraping: multi-task asynchronous processing, async HTTP fetching, case study II
This post shows how to use Python's aiohttp and lxml libraries for asynchronous HTTP requests and HTML parsing. By creating coroutines and an event loop, it downloads pages in batches, extracts the body text with XPath, and finally writes the data to files. The approach involves asyncio-based I/O and an understanding of the target pages' structure, and it helps speed up a crawler that has to process a large number of pages.
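
For readers new to lxml, the XPath extraction step works on any HTML string; the snippet below is a toy example whose markup and path are made up for illustration, not taken from the target site.

from lxml import etree

html = "<html><body><div id='content'>First line<br/>Second line</div></body></html>"
tree = etree.HTML(html)
# text() returns every text node matched by the path; join them into one string
lines = tree.xpath("//div[@id='content']/text()")
print(''.join(lines))  # -> First lineSecond line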