# 摘要
import concurrent.futures
import urllib.request
# Pages to fetch; the last entry is presumably a non-existent domain,
# included to exercise the error path — TODO confirm intent.
URLS = [
    'http://www.foxnews.com/',
    'http://www.cnn.com/',
    'http://europe.wsj.com/',
    'http://www.bbc.co.uk/',
    'http://some-made-up-domain.com/',
]
# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    """Retrieve a single page and return its raw body as bytes.

    Args:
        url: The URL to fetch.
        timeout: Socket timeout in seconds, forwarded to ``urlopen``.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        urllib.error.URLError: On network failure or unresolvable host
            (surfaces via ``Future.result()`` in the caller).
    """
    # NOTE(review): the pasted original had lost its indentation and was
    # not valid Python; structure restored here. The ``with`` block
    # guarantees the HTTP connection is closed even on error.
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()
# We can use a with statement to ensure threads are cleaned up promptly.
# NOTE(review): the pasted original had lost all indentation and was not
# valid Python; structure restored here.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and map each future back to its URL so
    # results can be labelled as they complete (completion order, not
    # submission order).
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            # Broad catch is deliberate here: any per-URL failure is
            # reported without aborting the remaining downloads.
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
-
对于 HTTP 和 HTTPS URL,
urllib.request.urlopen(url, timeout=timeout)
返回一个 http.client.HTTPResponse 对象,该对象可直接用作 with
上下文管理器。方法 .read(num)
返回指定字节数的页面内容,默认返回页面全部内容。返回值是 bytes,如需解码需使用 .decode('utf-8'),
如下所示:with urllib.request.urlopen('http://www.python.org/') as f: ... print(f.read(100).decode('utf-8'))
-
区别于
requests.get()
,其返回一个Response对象,包括返回结果的类型、状态码、编码方式、cookies等。import concurrent.futures import requests URLS = ['http://www.foxnews.com/', 'http://www.cnn.com/', 'http://europe.wsj.com/', 'http://www.bbc.co.uk/', 'http://some-made-up-domain.com/'] # Retrieve a single page and report the URL and contents def load_url(url, timeout): try: req = requests.get(url, timeout=timeout) req.raise_for_status() req.encoding = req.apparent_encoding return req.text except requests.RequestException: return '404' # We can use a with statement to ensure threads are cleaned up promptly with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: # Start the load operations and mark each future with its URL future_to_url = {executor.submit(load_url, url, 60): url for url in URLS} for future in concurrent.futures.as_completed(future_to_url): url = future_to_url[future] try: data = future.result() except Exception as exc: print('%r generated an exception: %s' % (url, exc)) else: print('%r page is %d bytes' % (url, len(data))) # ********************输出 'http://www.foxnews.com/' page is 226081 bytes 'http://www.cnn.com/' page is 991488 bytes 'http://www.bbc.co.uk/' page is 3 bytes 'http://some-made-up-domain.com/' page is 3 bytes 'http://europe.wsj.com/' page is 3 bytes
-
Executor Objects 抽象类(
concurrent.futures.Executor
),不直接使用,通常使用其子类 concurrent.futures.ThreadPoolExecutor
或 concurrent.futures.ProcessPoolExecutor
;而 concurrent.futures.Future 表示异步调用的结果对象,并非 Executor 的子类
. -
concurrent.futures.Executor
有三个基本的方法:.submit(func, *args, **kwargs)
、map(func, *iterables, timeout=None, chunksize=1)
、shutdown(wait=True)
。使用 with 语句,可以避免显式调用 .shutdown()
。 -
concurrent.futures.wait()
等待 Future 运行完成;concurrent.futures.as_completed()
返回一个迭代器,按完成先后顺序逐个产出 Future。