网络服务器传统上对每个连接用一个单独的进程或线程来实现.对于高性能应用需要同时处理大量客户端,这种方法工作显然不好,因为如此的资源使用和上下文切换时间影响了在某一时刻并发处理客户端的能力.一个可选的方法是在一个单线程用非阻塞IO,用一些可读方法来通知你在一个socket上可读或者可写.
这篇文章介绍linux 的epoll,它是linux下最好的可读通知.我们将写一个简单的完整TCP服务器实现.我想你已经有C语言编程经历,知道如何在linux运行C程序,并且能读一下你用到的多个C函数.
epoll在linux2.6被介绍进来,在其他类unix操作系统下不可用.它提供与select和poll类似的功能:
- select能同时监控FD_SETSIZE个描述符,特别的决定与libc编译期的一个确定数目.
- poll没有select这样的限制,但是我们必须对所有传进来的描述符进行现行扫描来检测可读通知,复杂度是O(N),所用低效.
epoll没有这种限制,并不是线性扫描.因此能够表现地更好并且处理大量事件.
一个epoll实例可以被epoll_create创建.epoll_ctl是一个增加或者移除描述符来监视epoll实例.epoll_wait来等待事件,它将阻塞到有事件到来.
当一个描述符被添加到epoll实例,他们能被以边缘触发和水平触发来添加.当你用水平触发模式,数据可读时,epoll_wait将一直返回可读事件.如果你没读完数据,调用epoll_wait来继续监视描述符,它将再次返回一个读事件因为数据已经准备好了.
在边缘触发模式,你将只得到一个可读通知.如果你没有读完数据,可以调用epoll_wait来再次监视描述符,它将阻塞因为可读事件已经通知了.
传给epoll_wait的事件结构在下面.对于每个被监视的描述符,你可以与一个整数或者指针作为用户数据:
typedef union epoll_data { void *ptr; int fd; __uint32_t u32; __uint64_t u64; } epoll_data_t; struct epoll_event { __uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ };
来让我们写代码.我们将会实现一个小型TCP服务器,它会打印传输到这个socket上的数据到标准输出.我们可以开始写一个create_and_bind.
static int create_and_bind (char *port) { struct addrinfo hints; struct addrinfo *result, *rp; int s, sfd; memset (&hints, 0, sizeof (struct addrinfo)); hints.ai_family = AF_UNSPEC; /* Return IPv4 and IPv6 choices */ hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */ hints.ai_flags = AI_PASSIVE; /* All interfaces */ s = getaddrinfo (NULL, port, &hints, &result); if (s != 0) { fprintf (stderr, "getaddrinfo: %s\n", gai_strerror (s)); return -1; } for (rp = result; rp != NULL; rp = rp->ai_next) { sfd = socket (rp->ai_family, rp->ai_socktype, rp->ai_protocol); if (sfd == -1) continue; s = bind (sfd, rp->ai_addr, rp->ai_addrlen); if (s == 0) { /* We managed to bind successfully! */ break; } close (sfd); } if (rp == NULL) { fprintf (stderr, "Could not bind\n"); return -1; } freeaddrinfo (result); return sfd; }
create_and_bind函数包含一个标准IPV4或者IPV6 socket.
struct addrinfo { int ai_flags; int ai_family; int ai_socktype; int ai_protocol; size_t ai_addrlen; struct sockaddr *ai_addr; char *ai_canonname; struct addrinfo *ai_next; };
下面我们将用一个函数来设置非阻塞socket.
static int make_socket_non_blocking (int sfd) { int flags, s; flags = fcntl (sfd, F_GETFL, 0); if (flags == -1) { perror ("fcntl"); return -1; } flags |= O_NONBLOCK; s = fcntl (sfd, F_SETFL, flags); if (s == -1) { perror ("fcntl"); return -1; } return 0; }
现在,main函数包含eventloop.
static int make_socket_non_blocking (int sfd) { int flags, s; flags = fcntl (sfd, F_GETFL, 0); if (flags == -1) { perror ("fcntl"); return -1; } flags |= O_NONBLOCK; s = fcntl (sfd, F_SETFL, flags); if (s == -1) { perror ("fcntl"); return -1; } return 0; }
Now, on to the main() function of the program which contains the event loop. This is the bulk of the program:
#define MAXEVENTS 64 int main (int argc, char *argv[]) { int sfd, s; int efd; struct epoll_event event; struct epoll_event *events; if (argc != 2) { fprintf (stderr, "Usage: %s [port]\n", argv[0]); exit (EXIT_FAILURE); } sfd = create_and_bind (argv[1]); if (sfd == -1) abort (); s = make_socket_non_blocking (sfd); if (s == -1) abort (); s = listen (sfd, SOMAXCONN); if (s == -1) { perror ("listen"); abort (); } efd = epoll_create1 (0); if (efd == -1) { perror ("epoll_create"); abort (); } event.data.fd = sfd; event.events = EPOLLIN | EPOLLET; s = epoll_ctl (efd, EPOLL_CTL_ADD, sfd, &event); if (s == -1) { perror ("epoll_ctl"); abort (); } /* Buffer where events are returned */ events = calloc (MAXEVENTS, sizeof event); /* The event loop */ while (1) { int n, i; n = epoll_wait (efd, events, MAXEVENTS, -1); for (i = 0; i < n; i++) { if ((events[i].events & EPOLLERR) || (events[i].events & EPOLLHUP) || (!(events[i].events & EPOLLIN))) { /* An error has occured on this fd, or the socket is not ready for reading (why were we notified then?) */ fprintf (stderr, "epoll error\n"); close (events[i].data.fd); continue; } else if (sfd == events[i].data.fd) { /* We have a notification on the listening socket, which means one or more incoming connections. */ while (1) { struct sockaddr in_addr; socklen_t in_len; int infd; char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; in_len = sizeof in_addr; infd = accept (sfd, &in_addr, &in_len); if (infd == -1) { if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { /* We have processed all incoming connections. */ break; } else { perror ("accept"); break; } } s = getnameinfo (&in_addr, in_len, hbuf, sizeof hbuf, sbuf, sizeof sbuf, NI_NUMERICHOST | NI_NUMERICSERV); if (s == 0) { printf("Accepted connection on descriptor %d " "(host=%s, port=%s)\n", infd, hbuf, sbuf); } /* Make the incoming socket non-blocking and add it to the list of fds to monitor. */ s = make_socket_non_blocking (infd); if (s == -1) abort (); event.data.fd = infd; event.events = EPOLLIN | EPOLLET; s = epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event); if (s == -1) { perror ("epoll_ctl"); abort (); } } continue; } else { /* We have data on the fd waiting to be read. Read and display it. We must read whatever data is available completely, as we are running in edge-triggered mode and won't get a notification again for the same data. */ int done = 0; while (1) { ssize_t count; char buf[512]; count = read (events[i].data.fd, buf, sizeof buf); if (count == -1) { /* If errno == EAGAIN, that means we have read all data. So go back to the main loop. */ if (errno != EAGAIN) { perror ("read"); done = 1; } break; } else if (count == 0) { /* End of file. The remote has closed the connection. */ done = 1; break; } /* Write the buffer to standard output */ s = write (1, buf, count); if (s == -1) { perror ("write"); abort (); } } if (done) { printf ("Closed connection on descriptor %d\n", events[i].data.fd); /* Closing the descriptor will make epoll remove it from the set of descriptors which are monitored. */ close (events[i].data.fd); } } } } free (events); close (sfd); return EXIT_SUCCESS; }