Python asyncio: an asyncio.Queue usage example
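
This post walks through a small concurrent crawler built on asyncio.Queue: three worker tasks share one queue of URLs, fetch pages with aiohttp, extract the links, and push newly discovered same-site URLs back onto the queue until it runs dry.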

import asyncio
from urllib.parse import urljoin, urldefrag

import aiohttp
import async_timeout

root_url = "http://python.org/"

# Seed URLs: the site root plus its sitemap and robots.txt.
# urljoin keeps the paths clean regardless of the trailing slash on root_url.
crawled_urls = []
url_hub = [root_url, urljoin(root_url, "sitemap.xml"), urljoin(root_url, "robots.txt")]

headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}

async def get_body(url):
    """Fetch a URL and return {'error': ..., 'html': ...}."""
    # A session per request is wasteful but keeps the example self-contained;
    # a real crawler would share one ClientSession across all workers.
    async with aiohttp.ClientSession() as session:
        try:
            async with async_timeout.timeout(10):  # give up after 10 seconds
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        html = await response.text()
                        return {'error': '', 'html': html}
                    else:
                        return {'error': response.status, 'html': ''}
        except Exception as err:
            return {'error': err, 'html': ''}

async def handle_task(task_id, work_queue):
    # Each worker drains the shared queue until it finds it empty.
    # Note: empty() can be True while another worker is still fetching and
    # about to enqueue new links, so a worker may quit early; the
    # join()/task_done() sketch after the listing avoids that race.
    while not work_queue.empty():
        queue_url = await work_queue.get()
        if queue_url not in crawled_urls:
            crawled_urls.append(queue_url)
            body = await get_body(queue_url)
            if not body['error']:
                for new_url in get_urls(body['html']):
                    # Follow only same-site links we have not visited yet.
                    if root_url in new_url and new_url not in crawled_urls:
                        work_queue.put_nowait(new_url)
            else:
                print(f"Error: {body['error']} - {queue_url}")

def remove_fragment(url):
    # Strip the #fragment so the same page is not crawled twice.
    pure_url, frag = urldefrag(url)
    return pure_url

def get_urls(html):
    # Crude href extraction by string splitting; a real crawler would use an
    # HTML parser, but this keeps the example dependency-free.
    new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
    return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]

async def main():
    q = asyncio.Queue()
    for url in url_hub:  # seed the queue with the starting URLs
        q.put_nowait(url)
    # Run three workers concurrently over the same queue.
    await asyncio.gather(*(handle_task(task_id, q) for task_id in range(3)))

if __name__ == "__main__":
    asyncio.run(main())
    for u in crawled_urls:
        print(u)
    print('-' * 30)
    print(len(crawled_urls))
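
One thing to watch: the `while not work_queue.empty()` loop above races against workers that are still fetching and may yet enqueue new links, so a worker can exit while work remains. The standard asyncio.Queue recipe avoids this with queue.join() and task_done(). Here is a minimal, self-contained sketch of that pattern; the worker/main names and the sleep stand-in are illustrative, not part of the crawler above:

import asyncio

async def worker(name, queue):
    # Loop forever; task_done() lets queue.join() track completed items.
    while True:
        item = await queue.get()
        try:
            await asyncio.sleep(0.1)   # stand-in for real work, e.g. an HTTP fetch
            print(f"{name} finished item {item}")
        finally:
            queue.task_done()

async def main():
    queue = asyncio.Queue()
    for i in range(10):
        queue.put_nowait(i)
    # Three workers drain the queue concurrently.
    workers = [asyncio.create_task(worker(f"w{n}", queue)) for n in range(3)]
    await queue.join()                 # blocks until every item is marked done
    for w in workers:                  # workers loop forever, so cancel them
        w.cancel()
    await asyncio.gather(*workers, return_exceptions=True)

if __name__ == "__main__":
    asyncio.run(main())

queue.join() returns only after every item that was put on the queue has been matched by a task_done() call, so main() knows all work is finished before cancelling the now-idle workers.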
