(1) concurrent.futures 线程池(python3)
#!/usr/bin/python3
import socket
import sys
import ssl
import concurrent.futures
from contextlib import closing
import time
import certifi
# Process-wide default: every socket created below gives up after 5 seconds
# instead of blocking a worker thread forever on an unresponsive host.
socket.setdefaulttimeout(5)


def do_https(host, port=443):
    """Fetch ``/`` from *host* over TLS and report whether Akamai served it.

    Sends a bare ``GET / HTTP/1.0`` request, reads the response until the
    peer closes the connection, and looks for the ``AkamaiGHost`` marker
    anywhere in the raw response bytes.

    Args:
        host: Host name or IPv4 address to connect to.
        port: TCP port of the HTTPS service (default 443).

    Returns:
        The string ``"<host>........................<status>"`` where status
        is ``1`` (marker found), ``0`` (response read, marker absent) or
        ``-1`` (empty host, or a socket/TLS error, which is also printed).
    """
    if not host:
        # Explicit check instead of ``assert``: asserts are stripped under
        # ``python -O``, and an AssertionError raised in one worker would
        # abort the whole ``executor.map`` run because of a single bad row.
        print("Error: {} ".format("empty host"))
        return "{}........................{}".format(host, -1)

    try:
        # ssl.wrap_socket() was removed in Python 3.12.  This context carries
        # the same settings it was called with: default protocol negotiation,
        # certificate chain required, CA bundle supplied by certifi.
        # NOTE(review): as before, the certificate's host name is not checked
        # and no SNI is sent; changing that would alter the scan results.
        context = ssl.SSLContext(ssl.PROTOCOL_TLS)
        context.verify_mode = ssl.CERT_REQUIRED
        context.load_verify_locations(certifi.where())

        # Both sockets are context-managed *before* connecting so that a
        # failed connect or handshake cannot leak a file descriptor.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk, \
                context.wrap_socket(sk) as ss:
            ss.connect((host, port))
            ss.sendall(b'GET / HTTP/1.0\r\n\r\n')
            chunks = []
            while True:
                buf = ss.recv(1024)
                if not buf:
                    break
                chunks.append(buf)
    except OSError as e:  # socket.error, socket.timeout and ssl.SSLError
        print("Error: {} ".format(e))
        return "{}........................{}".format(host, -1)

    # Search the joined raw bytes: concatenating str(buf) would insert the
    # "b'...'" repr quoting between chunks and hide a marker that happens to
    # straddle two recv() calls.
    found = b"AkamaiGHost" in b"".join(chunks)
    return "{}........................{}".format(host, 1 if found else 0)
def read_hosts(path):
    """Return the host names listed in the tab-separated file *path*.

    The host is the first tab-separated field of each line.  Surrounding
    whitespace (including the trailing newline that a line without any tab
    would otherwise keep) is stripped, and lines whose first field is empty
    are skipped.

    Args:
        path: Path of a UTF-8 text file with one host per line.

    Returns:
        A list of non-empty host name strings in file order.
    """
    hosts = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            host = line.split('\t')[0].strip()
            if host:
                hosts.append(host)
    return hosts


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} HOSTS_FILE".format(sys.argv[0]))

    hosts = read_hosts(sys.argv[1])
    # Upper bound only: the executor starts worker threads on demand, so a
    # short host list does not spawn 1000 threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1000) as executor:
        # map() yields results in input order, so output lines match the file.
        for ret_info in executor.map(do_https, hosts):
            print(ret_info)
耗时:
real 0m8.750s
user 0m38.899s
sys 0m3.134s
(2)asyncio (>= python3.4.4)
#!/usr/bin/python3
import asyncio
import sys
import ssl
from contextlib import closing
import certifi
async def print_http_headers(host):
    """Fetch ``/`` from *host* over TLS and print whether Akamai served it.

    Opens a TLS connection to port 443, sends a bare ``GET / HTTP/1.0``
    request, reads the response until EOF and prints
    ``"<host>........................<status>"`` with status ``1`` when the
    ``AkamaiGHost`` marker occurs in the response, ``0`` when it does not and
    ``-1`` when anything went wrong.

    Written as a native coroutine (Python 3.5+): the ``@asyncio.coroutine``
    decorator and ``yield from`` coroutines were removed in Python 3.11.

    Args:
        host: Host name or address to connect to.

    Returns:
        ``1`` when a response was read (whether or not the marker was found),
        ``-1`` when connecting, the handshake or reading failed or timed out.
    """
    context = ssl.SSLContext(ssl.PROTOCOL_TLS)
    context.verify_mode = ssl.CERT_REQUIRED
    # NOTE(review): the certificate chain is validated but the host name is
    # deliberately not checked; confirm this is intended for the scan.
    context.check_hostname = False
    context.load_verify_locations(certifi.where())

    try:
        # The connect (including the TLS handshake) and every individual read
        # are each limited to 5 seconds.
        reader, writer = await asyncio.wait_for(
            asyncio.open_connection(host, 443, ssl=context, server_hostname=None),
            timeout=5,
        )
        with closing(writer):
            writer.write(b'GET / HTTP/1.0\r\n\r\n')
            chunks = []
            while True:
                # Fixed-size reads instead of readline(): a body line longer
                # than the stream limit would make readline() raise.
                chunk = await asyncio.wait_for(reader.read(1024), timeout=5)
                if not chunk:
                    break
                chunks.append(chunk)

        # Search the raw bytes; decoding the body as UTF-8 would raise on
        # binary or differently-encoded pages and misreport the host as -1.
        if b"AkamaiGHost" in b"".join(chunks):
            print("{}........................{}".format(host, 1))
        else:
            print("{}........................{}".format(host, 0))
        return 1
    except Exception:
        # Best-effort scan: any failure (DNS, TLS, timeout, reset) is reported
        # as -1 for this host so the remaining tasks keep running.  The error
        # detail is intentionally not printed.
        print("{}........................{}".format(host, -1))
        return -1
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} HOSTS_FILE".format(sys.argv[0]))

    # The host is the first tab-separated field of each line.  strip() drops
    # the trailing newline that a line without a tab would otherwise keep,
    # and lines with an empty first field are skipped.
    with open(sys.argv[1], encoding='utf-8') as f:
        hosts = [line.split('\t')[0].strip() for line in f]
    hosts = [host for host in hosts if host]

    # Create the loop explicitly: calling asyncio.get_event_loop() (directly
    # or through ensure_future) with no running loop is deprecated.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.set_debug(enabled=True)
    try:
        # NOTE: one task per host, all started at once; an asyncio.Semaphore
        # could be used to bound the number of concurrent connections.
        tasks = [loop.create_task(print_http_headers(host)) for host in hosts]
        # asyncio.wait() raises ValueError on an empty collection.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()
耗时:
real 0m10.892s
user 0m7.659s
sys 0m0.558s
结论:
只看时间的话,concurrent.futures 的速度比 asyncio 要快。但是,concurrent.futures 为了达到同等速度开了几百个线程,后台可以看到两者的CPU占用情况:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
26551 root 20 0 13.373g 466684 4628 S 813.3 0.7 0:27.54 python3
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
26521 root 20 0 508240 446864 4400 R 83.7 0.7 0:02.52 python3
前者的CPU占用大概是后者的10倍。所以总的来看,asyncio的性价比更高。
又跑了一遍,这次,asyncio 在时间上超过 concurrent.futures 了,可能是受网络环境影响。比较的环境和条件可能并不严格,但是大体可以说明两者各自的优劣所在:
针对IO密集型操作,协程相对多进程并发的资源消耗更小,而多进程的优势在于可以更有效利用多核并发。
TODO: 协程的异常处理和取消