初步解析 socket 流
import socket
# Minimal blocking HTTP server: connections are accepted and served strictly
# one after another.

# A request header block ends with an empty line. Real HTTP clients send
# CRLF line endings ("\r\n\r\n"), which contains EOL2 but never EOL1, so both
# terminators have to be checked.
EOL1 = b'\n\n'
EOL2 = b'\n\r\n'
response = b'HTTP/1.0 200 OK\r\nDate: Mon, 1 Jan 1996 01:01:01 GMT\r\n'
response += b'Content-Type: text/plain\r\nContent-Length: 13\r\n\r\n'
response += b'Hello, world!'

serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
serversocket.bind(('0.0.0.0', 8080))
serversocket.listen(1)

try:
    while True:
        connectiontoclient, address = serversocket.accept()
        try:
            request = b''
            while EOL1 not in request and EOL2 not in request:
                # recv() blocks here: if the data has not arrived from the
                # network yet, the call waits until it does.
                chunk = connectiontoclient.recv(1024)
                if not chunk:
                    # Empty read means the client closed the connection;
                    # without this check the loop would spin forever.
                    break
                request += chunk
            if EOL1 in request or EOL2 in request:
                print('-' * 40 + '\n' + request.decode()[:-2])
                # sendall() keeps writing until the whole response is sent;
                # a single send() may transmit only part of it.
                connectiontoclient.sendall(response)
        finally:
            # Release the client socket even if reading or writing failed.
            connectiontoclient.close()
finally:
    serversocket.close()
上述是一个阻塞的server,请求是顺序处理的, 其中response字符串是HTTP 响应报文。
request += connectiontoclient.recv(1024)
recv(1024) 会造成阻塞。因为不停的从缓冲区读取数据, 如果网络数据还没有到达就会阻塞住等待数据到来。
当程序使用阻塞 socket 的时候, 它通常为每个 socket 使用一个线程(甚至是专用进程)来执行通信。主程序线程监听服务器 socket, 这个 socket 接受来自客户端的传入连接。服务端每接受一个连接就会创建一个新的 socket,并将新创建的 socket 传递给一个单独的线程,然后该线程将与客户端进行交互。因为每个连接都有一个独立的线程进行通信,所以任何一个连接上的阻塞都不会影响其他线程执行其各自的任务。
这就是最传统的IO模型 PPC(process per connection),TPC(thread per connection), 效率极低。
C10K 问题主要讨论了处理并发 socket 的一些替代方法,比如使用异步套接字。这些 socket 不会一直阻塞到某个事件发生;相反,程序在异步 socket 上执行操作后,会立即得到该操作成功或失败的通知,应用程序再根据这一结果决定下一步如何处理。由于异步 socket 是非阻塞的,因此不需要多个执行线程,所有的工作都可以在一个线程中完成。
linux 有一系列的机制去管理异步 socket, select, poll, epoll 这三种API已经被python实现。
tornado 内部是使用了 epoll 事件通知机制来进行更新的。下面上 epoll 版的 HTTPServer
这里 使用了 epoll 的水平触发模式
import socket, select
# select 模块包括 epoll 方法
# Single-threaded HTTP server built on level-triggered epoll.

# A request header block ends with an empty line. Real HTTP clients send
# CRLF line endings ("\r\n\r\n"), which contains EOL2 but never EOL1, so both
# terminators have to be checked.
EOL1 = b'\n\n'
EOL2 = b'\n\r\n'
response = b'HTTP/1.0 200 OK\r\nDate: Mon, 1 Jan 1996 01:01:01 GMT\r\n'
response += b'Content-Type: text/plain\r\nContent-Length: 13\r\n\r\n'
response += b'Hello, world!'

serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
serversocket.bind(('0.0.0.0', 8080))
serversocket.listen(1)
serversocket.setblocking(0)  # the listening socket must be non-blocking

epoll = select.epoll()  # create the epoll object
# Register read interest on the server socket; a read event on it means a
# client connection is waiting to be accepted.
epoll.register(serversocket.fileno(), select.EPOLLIN)

try:
    # Per-connection state, keyed by file descriptor.
    connections = {}; requests = {}; responses = {}
    while True:
        # Ask epoll which registered events occurred. The argument 1 means we
        # are willing to wait up to one second; if events are already pending
        # the call returns them immediately.
        events = epoll.poll(1)
        for fileno, event in events:
            if fileno == serversocket.fileno():
                # Event on the server socket: accept the new client, which
                # creates a new socket for this connection.
                connection, address = serversocket.accept()
                connection.setblocking(0)  # the new socket is non-blocking too
                epoll.register(connection.fileno(), select.EPOLLIN)
                connections[connection.fileno()] = connection
                requests[connection.fileno()] = b''
                responses[connection.fileno()] = response
            elif event & select.EPOLLIN:
                # Data is available to read on a client socket.
                chunk = connections[fileno].recv(1024)
                requests[fileno] += chunk
                if EOL1 in requests[fileno] or EOL2 in requests[fileno]:
                    # Full request received: switch interest to writing.
                    epoll.modify(fileno, select.EPOLLOUT)
                    print('-'*40 + '\n' + requests[fileno].decode()[:-2])
                elif not chunk:
                    # Empty read: the client closed before sending a complete
                    # request. Drop the connection, otherwise level-triggered
                    # epoll keeps reporting EPOLLIN for it forever.
                    epoll.unregister(fileno)
                    connections[fileno].close()
                    del connections[fileno]
            elif event & select.EPOLLOUT:
                # The socket is writable: send as much of the response as the
                # kernel accepts and keep the remainder for the next event.
                byteswritten = connections[fileno].send(responses[fileno])
                responses[fileno] = responses[fileno][byteswritten:]
                if len(responses[fileno]) == 0:
                    # Response fully sent: stop watching for events and shut
                    # the connection down; the resulting EPOLLHUP triggers the
                    # cleanup branch below.
                    epoll.modify(fileno, 0)
                    connections[fileno].shutdown(socket.SHUT_RDWR)
            elif event & select.EPOLLHUP:
                # Hang-up: unregister and release the client socket.
                epoll.unregister(fileno)
                connections[fileno].close()
                del connections[fileno]
finally:
    epoll.unregister(serversocket.fileno())
    epoll.close()
    serversocket.close()
这里每一个connection 在服务端都对应着一个 文件描述符 fd, 也就是一个新的socket,这里recv(1024)永远能读到, 不会发生等待数据的情况
下面写一个边缘触发的epoll
import socket, select
# Single-threaded HTTP server built on edge-triggered epoll (EPOLLET).
#
# In edge-triggered mode an event is reported once per state change, so every
# handler must keep accepting / reading / writing until the call would block
# (socket.error), otherwise pending work is never reported again.

# A request header block ends with an empty line, with LF or CRLF endings.
EOL1 = b'\n\n'
EOL2 = b'\n\r\n'
response = b'HTTP/1.0 200 OK\r\nDate: Mon, 1 Jan 1996 01:01:01 GMT\r\n'
response += b'Content-Type: text/plain\r\nContent-Length: 13\r\n\r\n'
response += b'Hello, world!'

serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
serversocket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
serversocket.bind(('0.0.0.0', 8080))
serversocket.listen(1)
serversocket.setblocking(0)

epoll = select.epoll()
epoll.register(serversocket.fileno(), select.EPOLLIN | select.EPOLLET)

try:
    # Per-connection state, keyed by file descriptor.
    connections = {}; requests = {}; responses = {}
    while True:
        events = epoll.poll(1)
        for fileno, event in events:
            if fileno == serversocket.fileno():
                # Accept every pending connection until accept() would block.
                try:
                    while True:
                        connection, address = serversocket.accept()
                        connection.setblocking(0)
                        epoll.register(connection.fileno(), select.EPOLLIN | select.EPOLLET)
                        connections[connection.fileno()] = connection
                        requests[connection.fileno()] = b''
                        responses[connection.fileno()] = response
                except socket.error:
                    pass  # nothing left to accept
            elif event & select.EPOLLIN:
                # Drain the socket until recv() would block.
                peer_closed = False
                try:
                    while True:
                        chunk = connections[fileno].recv(1024)
                        if not chunk:
                            # Empty read: the client closed its end. recv()
                            # would keep returning b'' without raising, so
                            # leave the loop explicitly.
                            peer_closed = True
                            break
                        requests[fileno] += chunk
                except socket.error:
                    pass  # no more data available right now
                if EOL1 in requests[fileno] or EOL2 in requests[fileno]:
                    # Full request received: switch interest to writing.
                    epoll.modify(fileno, select.EPOLLOUT | select.EPOLLET)
                    print('-'*40 + '\n' + requests[fileno].decode()[: -2])
                elif peer_closed:
                    # Client went away before sending a complete request.
                    epoll.unregister(fileno)
                    connections[fileno].close()
                    del connections[fileno]
            elif event & select.EPOLLOUT:
                # Write until the response is finished or send() would block.
                try:
                    while len(responses[fileno]) > 0:
                        byteswritten = connections[fileno].send(responses[fileno])
                        responses[fileno] = responses[fileno][byteswritten:]
                except socket.error:
                    pass  # send buffer full; wait for the next EPOLLOUT
                if len(responses[fileno]) == 0:
                    # Response fully sent: drop read/write interest and shut
                    # the connection down; the resulting EPOLLHUP triggers the
                    # cleanup branch below.
                    epoll.modify(fileno, select.EPOLLET)
                    connections[fileno].shutdown(socket.SHUT_RDWR)
            elif event & select.EPOLLHUP:
                # Hang-up: unregister and release the client socket.
                epoll.unregister(fileno)
                connections[fileno].close()
                del connections[fileno]
finally:
    epoll.unregister(serversocket.fileno())
    epoll.close()
    serversocket.close()
由于他们相似, 在移植用 select 或者 poll的应用程序时,大多使用水平触发模式, 当程序员不希望操作系统进行过多介入的时候使用边缘触发模式
这里有一个点:一个 port 可能有多个请求过来,每个请求都是 TCP 连接(有状态的),也就意味着 socket 无法复用(UDP 可以多个请求共用一个 socket,因为 UDP 无状态,无需为每个请求维持连接)。因此每个连接到来时,accept() 都必须在当前端口上创建一个新的 socket 来维持这次连接(无法使用其他端口,因为存在防火墙,无法确定哪个端口是可用的)。这也就意味着会有多个 socket 共用同一个本地端口:只有服务器 socket 在监听,其余已连接的 socket 由(源地址, 源端口, 目的地址, 目的端口)四元组来区分。
official example
import errno
import functools
import socket

import tornado.ioloop
from tornado import gen
from tornado.iostream import IOStream
@gen.coroutine
def handle_connection(connection, address):
    """Print everything a client sends, once the client closes the connection.

    Args:
        connection: The accepted client socket; it is wrapped in an IOStream.
        address: The client address returned by accept(). Not used here.
    """
    stream = IOStream(connection)
    # Yielding suspends this coroutine until the stream has been closed and
    # all of its data has been collected.
    message = yield stream.read_until_close()
    print('message from client: ', message.decode().strip())
def connection_ready(sock, fd, events):
    """Accept every pending connection on a non-blocking listening socket.

    Intended as an IOLoop handler with ``sock`` pre-bound through
    ``functools.partial``. The first parameter is named ``sock`` rather than
    ``socket`` so that it does not shadow the ``socket`` module, which the
    ``except`` clause below needs.

    Args:
        sock: The listening socket to accept from.
        fd: File descriptor the event was reported for. Not used here.
        events: Event mask reported for ``fd``. Not used here.

    Raises:
        socket.error: If accept() fails for any reason other than
            EWOULDBLOCK / EAGAIN.
    """
    while True:
        try:
            connection, address = sock.accept()
        except socket.error as e:
            if e.args[0] not in (errno.EWOULDBLOCK, errno.EAGAIN):
                raise
            # No more pending connections: wait for the next event.
            return
        connection.setblocking(0)
        handle_connection(connection, address)
if __name__ == '__main__':
    # Create a non-blocking listening socket on port 8888 (all interfaces).
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.setblocking(0)
    sock.bind(("", 8888))
    sock.listen(128)

    io_loop = tornado.ioloop.IOLoop.current()
    # Pre-bind the listening socket; the loop supplies the remaining
    # (fd, events) arguments when it invokes the handler.
    callback = functools.partial(connection_ready, sock)
    io_loop.add_handler(sock.fileno(), callback, io_loop.READ)
    io_loop.start()
默认情况下,新构造的 IOLoop 会自动成为当前线程的 IOLoop,除非当前线程已经存在 IOLoop。这个行为可以通过传给 IOLoop 构造器的 make_current 参数来控制:如果 make_current=True,那么新的 IOLoop 总会尝试成为当前的 IOLoop;如果当前线程已经存在 IOLoop,那么这个地方会抛出一个异常。
一般来说,一个IOLoop无法以任何方式在fork函数中存活或者被共享,当使用多进程时,每个进程应该创建自己的IOLoop, 这也意味着任何依赖于 IOLoop的对象都需要在子进程里面被创建
再来一版HTTP server
import sys
import socket
import logging
import StringIO
from datetime import datetime
from ioloop import IOLoop
EOL1 = b'\n\n'
EOL2 = b'\n\r\n'
class WSGIServer(object):
    """Non-blocking WSGI server whose socket callbacks are driven by an IOLoop.

    Each accepted client gets a ``Connection`` record stored in
    ``self.conn_pool`` under its file descriptor. A connection is first
    handled by ``_receive`` until the request header terminator arrives, then
    by ``_send`` until the whole response has been written.

    NOTE(review): request and response data are joined and sent as native
    ``str`` and the module imports ``StringIO``, so this code appears to
    target Python 2 byte strings -- confirm before running on Python 3.
    """

    ADDRESS_FAMILY = socket.AF_INET
    SOCKET_TYPE = socket.SOCK_STREAM
    BACKLOG = 5

    HEADER_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT"
    SERVER_NAME = "zigmo/WSGIServer 0.3"

    def __init__(self, server_address):
        """Create the listening socket bound to ``server_address``."""
        self.ssocket = self.setup_server_socket(server_address)
        host, self.server_port = self.ssocket.getsockname()[:2]
        self.server_name = socket.getfqdn(host)
        self.ioloop = IOLoop.instance()
        # fd -> Connection. Every handler looks connections up under this
        # name, so it must be spelled ``conn_pool``.
        self.conn_pool = {}

    @classmethod
    def setup_server_socket(cls, server_address):
        """Return a bound, listening, non-blocking server socket."""
        ssocket = socket.socket(cls.ADDRESS_FAMILY, cls.SOCKET_TYPE)
        ssocket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
        ssocket.bind(server_address)
        ssocket.listen(cls.BACKLOG)
        ssocket.setblocking(0)
        return ssocket

    def set_app(self, application):
        """Set the WSGI application callable that ``handle`` invokes."""
        self.application = application

    def _accept(self, ssocket, event):
        """Accept one client and register it with the loop for read events."""
        if event & IOLoop.ERROR:
            self._close(ssocket)
            # The socket is closed now; accept() on it would only fail.
            return
        connect, addr = ssocket.accept()
        connect.setblocking(0)
        ioloop = IOLoop.instance()
        ioloop.add_handler(connect, self._receive, IOLoop.READ)
        fd = connect.fileno()
        connection = Connection(fd)
        connection.address = addr
        self.conn_pool[fd] = connection

    def _receive(self, connect, event):
        """Buffer incoming request data; switch to ``_send`` once complete."""
        if event & IOLoop.ERROR:
            self._close(connect)
            return
        fd = connect.fileno()
        connection = self.conn_pool[fd]
        fragment = connect.recv(1024)
        if not fragment:
            # Empty read: the client closed before the request was complete.
            self._close(connect)
            return
        connection.request_buffer.append(fragment)
        # The terminator can straddle a fragment boundary, so examine the
        # LAST two fragments (not the first two).
        last_fragment = ''.join(connection.request_buffer[-2:])
        if EOL1 in last_fragment or EOL2 in last_fragment:
            ioloop = IOLoop.instance()
            ioloop.update_handler(fd, IOLoop.WRITE)
            ioloop.replace_handler(fd, self._send)

    def _send(self, connect, event):
        """Write the pending response; close the connection when finished."""
        if event & IOLoop.ERROR:
            self._close(connect)
            return
        fd = connect.fileno()
        connection = self.conn_pool[fd]
        # NOTE(review): ``handled`` is never assigned in this class;
        # presumably Connection derives it from its own state -- confirm.
        if not connection.handled:
            self.handle(connection)
        byteswritten = connect.send(connection.response)
        if byteswritten:
            # Keep only the part the kernel has not accepted yet.
            connection.response = connection.response[byteswritten:]
        if not connection.response:
            self._close(connect)

    def _close(self, connect, event=None):
        """Shut down ``connect``, remove its handler and forget its state."""
        fd = connect.fileno()
        try:
            connect.shutdown(socket.SHUT_RDWR)
        except socket.error:
            # Best effort: the peer may already have dropped the connection,
            # in which case shutdown() fails but close() is still required.
            pass
        connect.close()
        ioloop = IOLoop.instance()
        ioloop.remove_handler(fd)
        # pop() rather than del: the listening socket is closed through this
        # method as well and has no entry in the pool.
        self.conn_pool.pop(fd, None)

    def handle(self, connection):
        """Run the WSGI application and store the packaged response."""
        def start_response(status, response_headers, exc_info=False):
            # Record the status and headers chosen by the application and
            # append the server-generated Date and Server headers.
            utc_now = datetime.utcnow().strftime(self.HEADER_DATE_FORMAT)
            connection.headers = response_headers + [
                ('Date', utc_now),
                ('Server', self.SERVER_NAME),
            ]
            connection.status = status

        request_text = ''.join(connection.request_buffer)
        environ = self.get_environ(request_text)
        body = self.application(environ, start_response)
        connection.response = self.package_response(body, connection)

    @classmethod
    def parse_request_buffer(cls, text):
        """Parse the HTTP request line found on the first line of ``text``.

        Returns:
            A dict with the keys PATH_INFO, REQUEST_METHOD, SERVER_PROTOCOL
            and QUERY_STRING (empty string when the path has no ``?``).

        Raises:
            IndexError: If ``text`` is empty.
            ValueError: If the request line does not have exactly three
                whitespace-separated parts.
        """
        content_lines = text.splitlines()
        request_line = content_lines[0].rstrip('\r\n')
        request_method, path, request_version = request_line.split()
        # Everything after the first '?' is the query string.
        path, _, query_string = path.partition('?')
        return {
            'PATH_INFO': path,
            'REQUEST_METHOD': request_method,
            'SERVER_PROTOCOL': request_version,
            'QUERY_STRING': query_string,
        }

    def get_environ(self, request_text):
        """Build the WSGI ``environ`` dict for the raw request text."""
        request_data = self.parse_request_buffer(request_text)
        # SERVER_PROTOCOL looks like "HTTP/1.1": the scheme is the part
        # BEFORE the slash ("http"), not the version number after it.
        scheme = request_data['SERVER_PROTOCOL'].split('/')[0].lower()
        environ = {
            'wsgi.version': (1, 0),
            'wsgi.url_scheme': scheme,
            'wsgi.input': StringIO.StringIO(request_text),
            'wsgi.errors': sys.stderr,
            'wsgi.multithread': False,
            'wsgi.multiprocess': False,
            'wsgi.run_once': False,
            'SERVER_NAME': self.server_name,
            'SERVER_PORT': self.server_port,
        }
        environ.update(request_data)
        return environ

    def package_response(self, body, connection):
        """Serialize status line, headers and ``body`` into one response.

        Args:
            body: Iterable of string chunks returned by the application.
            connection: Object carrying the ``status`` and ``headers`` that
                were recorded by ``start_response``.

        Returns:
            The complete HTTP response as a single string.
        """
        parts = ['HTTP/1.1 {status}\r\n'.format(status=connection.status)]
        parts.extend('{0}: {1}\r\n'.format(*header) for header in connection.headers)
        parts.append('\r\n')
        parts.extend(body)
        response = ''.join(parts)
        access_logger.debug('\n' + ''.join('> {line}\n'.format(line=line) for line in response.splitlines()))
        return response