import urllib.request as request
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import re


async def getPage(url, res_list):
    """Fetch *url* and append the decoded response body to *res_list*.

    NOTE: the original code stacked ``@asyncio.coroutine`` on this
    ``async def``; that decorator is only for generator-based coroutines
    (and was removed in Python 3.11), so it is dropped here.

    Args:
        url: page URL to fetch.
        res_list: shared list the response text is appended to.

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            res_list.append(await resp.text())


async def getTitle(url, res_list):
    """Fetch *url* and print its HTML ``<title>`` element (tags included).

    Args:
        url: article URL to fetch.
        res_list: kept for interface parity with getPage; never written
            (the append is commented out in the original).

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            html = await resp.text()
            # The original called .group(0) directly and crashed with
            # AttributeError on pages without a <title>; guard instead.
            match = re.search("<title>(.*?)</title>", html, re.S)
            if match is not None:
                print(match.group(0))
    # with open('title.txt','a+') as f:
    #     f.write(title+","+url+"\n")


class parseListPage():
    """Context manager wrapping one list-page HTML string.

    Entering the context parses the page with BeautifulSoup and returns
    the ``href`` of the first anchor inside each ``.txtList30 li`` entry.
    """

    def __init__(self, page_str):
        self.page_str = page_str

    def __enter__(self):
        # 获取文章链接 (collect article links)
        page = bs(self.page_str, 'lxml')
        return [a.find('a')['href'] for a in page.select('.txtList30 li')]

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


def run_crawler():
    """Crawl 100 list pages concurrently, then fetch every article title."""
    page_num = 100
    page_url_base = 'http://news.artron.net/morenews/list728/p'
    # List pages are 1-based: .../p1 .. .../p100
    page_urls = [page_url_base + str(i + 1) for i in range(page_num)]

    loop = asyncio.get_event_loop()

    # Phase 1: download every list page; bodies accumulate in ret_list.
    ret_list = []
    tasks = [getPage(host, ret_list) for host in page_urls]
    print(tasks)
    loop.run_until_complete(asyncio.wait(tasks))

    # Phase 2: parse each downloaded list page into article URLs.
    articles_url = []
    for ret in ret_list:
        with parseListPage(ret) as tmp:
            articles_url += tmp

    # Phase 3: fetch every article and print its <title>.
    ret_list = []
    tasks = [getTitle(url, ret_list) for url in articles_url]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()


if __name__ == '__main__':
    run_crawler()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
import
urllib
.
request
as
request
from
bs4
import
BeautifulSoup
as
bs
import
asyncio
import
aiohttp
,
re
async def getPage(url, res_list):
    """Fetch *url* and append the decoded response body to *res_list*.

    The original ``@asyncio.coroutine`` decorator has been removed: it
    only applies to generator-based coroutines, not ``async def``
    functions, and was removed from the stdlib in Python 3.11.

    Args:
        url: page URL to fetch.
        res_list: shared list the response text is appended to.

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            res_list.append(await resp.text())
async def getTitle(url, res_list):
    """Fetch *url* and print its HTML ``<title>`` element (tags included).

    Args:
        url: article URL to fetch.
        res_list: kept for interface parity with getPage; never written
            (the append is commented out in the original).

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            html = await resp.text()
            # The original called .group(0) directly and crashed with
            # AttributeError on pages without a <title>; guard instead.
            match = re.search("<title>(.*?)</title>", html, re.S)
            if match is not None:
                print(match.group(0))
    # with open('title.txt','a+') as f:
    #     print(title,url)
    #     f.write(title+","+url+"\n")
    # print(type(await resp.text()))
    # res_list.append(await resp.text())
class parseListPage():
    """Context manager wrapping one list-page HTML string.

    Entering the context parses the stored page with BeautifulSoup and
    returns the ``href`` of the first anchor inside every
    ``.txtList30 li`` entry. Exceptions are not suppressed on exit.
    """

    def __init__(self, page_str):
        self.page_str = page_str

    def __enter__(self):
        # 获取文章链接 (collect article links)
        soup = bs(self.page_str, 'lxml')
        items = soup.select('.txtList30 li')
        return [item.find('a')['href'] for item in items]

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass
def main():
    """Crawl 100 list pages concurrently, then fetch every article title.

    Wrapped in a function with a ``__main__`` guard so that importing
    this module no longer kicks off a full network crawl as a side
    effect; running the file as a script behaves exactly as before.
    """
    page_num = 100
    page_url_base = 'http://news.artron.net/morenews/list728/p'
    # List pages are 1-based: .../p1 .. .../p100
    page_urls = [page_url_base + str(i + 1) for i in range(page_num)]

    loop = asyncio.get_event_loop()

    # Phase 1: download every list page; bodies accumulate in ret_list
    # in completion order.
    ret_list = []
    tasks = [getPage(host, ret_list) for host in page_urls]
    print(tasks)
    loop.run_until_complete(asyncio.wait(tasks))

    # Phase 2: parse each downloaded list page into article URLs.
    articles_url = []
    for ret in ret_list:
        with parseListPage(ret) as tmp:
            articles_url += tmp

    # Phase 3: fetch every article and print its <title>.
    ret_list = []
    tasks = [getTitle(url, ret_list) for url in articles_url]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()


if __name__ == '__main__':
    main()
# 例子 0
import
asyncio
import
aiohttp
,
time
# Demo constants for "Example 0" below: fan out 12 concurrent requests.
NUMBERS = range(12)
# (English translation of the note below: adding `async` turns a function
# into a coroutine. Each thread has one event loop; asyncio.get_event_loop
# creates it on the main thread. Hand tasks to run_until_complete and the
# loop schedules the coroutines; `await asyncio.wait(tasks)` runs them
# cooperatively until all complete.)
'''
1. 当我们给一个函数添加了async关键字,就会把它变成一个异步函数。
每个线程有一个事件循环,主线程调用asyncio.get_event_loop时会创建事件循环,
你需要把异步的任务丢给这个循环的run_until_complete方法,事件循环会安排协同程序的执行。
和方法名字一样,异步的任务完成方法才会就执行完成了。
await asyncio.wait(blocking_tasks)就是协同的执行那些同步的任务,直到完成。
'''
# httpbin echoes query parameters back in the JSON body under "args".
URL = 'http://httpbin.org/get?a={}'
async def fetch_async(a):
    """GET httpbin with query parameter ``a`` and return the echoed value.

    Each ``await`` is a cooperative switch point: while the request or
    the JSON body read is pending, the event loop runs other coroutines
    and resumes here afterwards.
    """
    target = URL.format(a)
    async with aiohttp.ClientSession() as client:
        async with client.get(target) as response:
            payload = await response.json()
    return payload['args']['a']
# Time the demo: 12 concurrent httpbin requests through one event loop.
start = time.time()
event_loop = asyncio.get_event_loop()  # creates the event loop on the main thread
tasks = [fetch_async(num) for num in NUMBERS]
# gather() preserves input order, so results[i] corresponds to NUMBERS[i].
results = event_loop.run_until_complete(asyncio.gather(*tasks))
for num, result in zip(NUMBERS, results):
    print('fetch({}) = {}'.format(num, result))
# Total wall-clock time for all 12 overlapped requests.
print('Use asyncio aiohttp : {}'.format(time.time() - start))
|
Reference: http://blog.csdn.net/u014595019/article/details/52295642