import urllib.request as request
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import re


async def getPage(url, res_list):
    """Fetch *url* and append the decoded response body to *res_list*.

    NOTE: the original code stacked ``@asyncio.coroutine`` on this
    ``async def``; that decorator is only for generator-based coroutines
    (and was removed in Python 3.11), so it is dropped here.

    Args:
        url: page URL to fetch.
        res_list: shared list the response text is appended to.

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            res_list.append(await resp.text())


async def getTitle(url, res_list):
    """Fetch *url* and print its HTML ``<title>`` element (tags included).

    Args:
        url: article URL to fetch.
        res_list: kept for interface parity with getPage; never written
            (the append is commented out in the original).

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            html = await resp.text()
            # The original called .group(0) directly and crashed with
            # AttributeError on pages without a <title>; guard instead.
            match = re.search("<title>(.*?)</title>", html, re.S)
            if match is not None:
                print(match.group(0))
    # with open('title.txt','a+') as f:
    #     f.write(title+","+url+"\n")


class parseListPage():
    """Context manager wrapping one list-page HTML string.

    Entering the context parses the page with BeautifulSoup and returns
    the ``href`` of the first anchor inside each ``.txtList30 li`` entry.
    """

    def __init__(self, page_str):
        self.page_str = page_str

    def __enter__(self):
        # 获取文章链接 (collect article links)
        page = bs(self.page_str, 'lxml')
        return [a.find('a')['href'] for a in page.select('.txtList30 li')]

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


def run_crawler():
    """Crawl 100 list pages concurrently, then fetch every article title."""
    page_num = 100
    page_url_base = 'http://news.artron.net/morenews/list728/p'
    # List pages are 1-based: .../p1 .. .../p100
    page_urls = [page_url_base + str(i + 1) for i in range(page_num)]

    loop = asyncio.get_event_loop()

    # Phase 1: download every list page; bodies accumulate in ret_list.
    ret_list = []
    tasks = [getPage(host, ret_list) for host in page_urls]
    print(tasks)
    loop.run_until_complete(asyncio.wait(tasks))

    # Phase 2: parse each downloaded list page into article URLs.
    articles_url = []
    for ret in ret_list:
        with parseListPage(ret) as tmp:
            articles_url += tmp

    # Phase 3: fetch every article and print its <title>.
    ret_list = []
    tasks = [getTitle(url, ret_list) for url in articles_url]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()


if __name__ == '__main__':
    run_crawler()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
import
urllib
.
request
as
request
from
bs4
import
BeautifulSoup
as
bs
import
asyncio
import
aiohttp
,
re
async def getPage(url, res_list):
    """Fetch *url* and append the decoded response body to *res_list*.

    The original ``@asyncio.coroutine`` decorator has been removed: it
    only applies to generator-based coroutines, not ``async def``
    functions, and was removed from the stdlib in Python 3.11.

    Args:
        url: page URL to fetch.
        res_list: shared list the response text is appended to.

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            res_list.append(await resp.text())
async def getTitle(url, res_list):
    """Fetch *url* and print its HTML ``<title>`` element (tags included).

    Args:
        url: article URL to fetch.
        res_list: kept for interface parity with getPage; never written
            (the append is commented out in the original).

    Raises:
        AssertionError: if the HTTP status is not 200.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # conn = aiohttp.ProxyConnector(proxy="http://127.0.0.1:8087")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            assert resp.status == 200
            html = await resp.text()
            # The original called .group(0) directly and crashed with
            # AttributeError on pages without a <title>; guard instead.
            match = re.search("<title>(.*?)</title>", html, re.S)
            if match is not None:
                print(match.group(0))
    # with open('title.txt','a+') as f:
    #     print(title,url)
    #     f.write(title+","+url+"\n")
    # print(type(await resp.text()))
    # res_list.append(await resp.text())
class parseListPage():
    """Context manager wrapping one list-page HTML string.

    Entering the context parses the stored page with BeautifulSoup and
    returns the ``href`` of the first anchor inside every
    ``.txtList30 li`` entry. Exceptions are not suppressed on exit.
    """

    def __init__(self, page_str):
        self.page_str = page_str

    def __enter__(self):
        # 获取文章链接 (collect article links)
        soup = bs(self.page_str, 'lxml')
        items = soup.select('.txtList30 li')
        return [item.find('a')['href'] for item in items]

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass
def main():
    """Crawl 100 list pages concurrently, then fetch every article title.

    Wrapped in a function with a ``__main__`` guard so that importing
    this module no longer kicks off a full network crawl as a side
    effect; running the file as a script behaves exactly as before.
    """
    page_num = 100
    page_url_base = 'http://news.artron.net/morenews/list728/p'
    # List pages are 1-based: .../p1 .. .../p100
    page_urls = [page_url_base + str(i + 1) for i in range(page_num)]

    loop = asyncio.get_event_loop()

    # Phase 1: download every list page; bodies accumulate in ret_list
    # in completion order.
    ret_list = []
    tasks = [getPage(host, ret_list) for host in page_urls]
    print(tasks)
    loop.run_until_complete(asyncio.wait(tasks))

    # Phase 2: parse each downloaded list page into article URLs.
    articles_url = []
    for ret in ret_list:
        with parseListPage(ret) as tmp:
            articles_url += tmp

    # Phase 3: fetch every article and print its <title>.
    ret_list = []
    tasks = [getTitle(url, ret_list) for url in articles_url]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()


if __name__ == '__main__':
    main()
# 例子 0
import
asyncio
import
aiohttp
,
time
# Demo constants for "Example 0" below: fan out 12 concurrent requests.
NUMBERS = range(12)
# (English translation of the note below: adding `async` turns a function
# into a coroutine. Each thread has one event loop; asyncio.get_event_loop
# creates it on the main thread. Hand tasks to run_until_complete and the
# loop schedules the coroutines; `await asyncio.wait(tasks)` runs them
# cooperatively until all complete.)
'''
1. 当我们给一个函数添加了async关键字,就会把它变成一个异步函数。
每个线程有一个事件循环,主线程调用asyncio.get_event_loop时会创建事件循环,
你需要把异步的任务丢给这个循环的run_until_complete方法,事件循环会安排协同程序的执行。
和方法名字一样,异步的任务完成方法才会就执行完成了。
await asyncio.wait(blocking_tasks)就是协同的执行那些同步的任务,直到完成。
'''
# httpbin echoes query parameters back in the JSON body under "args".
URL = 'http://httpbin.org/get?a={}'
async def fetch_async(a):
    """GET httpbin with query parameter ``a`` and return the echoed value.

    Each ``await`` is a cooperative switch point: while the request or
    the JSON body read is pending, the event loop runs other coroutines
    and resumes here afterwards.
    """
    target = URL.format(a)
    async with aiohttp.ClientSession() as client:
        async with client.get(target) as response:
            payload = await response.json()
    return payload['args']['a']
# Time the demo: 12 concurrent httpbin requests through one event loop.
start = time.time()
event_loop = asyncio.get_event_loop()  # creates the event loop on the main thread
tasks = [fetch_async(num) for num in NUMBERS]
# gather() preserves input order, so results[i] corresponds to NUMBERS[i].
results = event_loop.run_until_complete(asyncio.gather(*tasks))
for num, result in zip(NUMBERS, results):
    print('fetch({}) = {}'.format(num, result))
# Total wall-clock time for all 12 overlapped requests.
print('Use asyncio aiohttp : {}'.format(time.time() - start))
|
Reference: http://blog.csdn.net/u014595019/article/details/52295642