epoll机制相比select/poll机制能更有效地实现描述符的多路复用(支持更多的描述符,处理效率更高[具体机制这边不展开了]),本文从编程的角度做一个介绍。
epoll接口函数
头文件: #include<sys/epoll.h>
可以通过man epoll查看对应的帮助信息
最大描述符限制:/proc/sys/fs/epoll/max_user_watches
创建epoll实例
int epoll_create(int size);
int epoll_create1(int flag);
epoll_create中,size只是给内核的一个容量提示,并不是队列中的最大数;自Linux 2.6.8起,size参数被忽略,但仍必须大于0
epoll_create1中flag取值如下:
- 0:epoll_create1 == epoll_create (size argument is dropped)
- EPOLL_CLOEXEC:含义同open函数的O_CLOEXEC选项;当执行execve创建新进程时,打开的描述符自动关闭
p.s: 当使用完毕时,需要调用close关闭epoll实例句柄
管理epoll事件
int epoll_ctl (int epfd, int op, int fd, struct epoll_event *event);
参数说明:
- epfd: epoll_create返回的epoll实例
- op: 对应的操作
- fd: 监听的fd
- event: 监听的事件
其中op取值如下:
- EPOLL_CTL_ADD:添加监听的事件
- EPOLL_CTL_DEL:删除监听的事件
- EPOLL_CTL_MOD:修改监听的事件
struct epoll_event定义如下:
/* User payload attached to a monitored descriptor.  This is a union, so
   all members share the same storage and only one of them carries a
   meaningful value at a time. */
typedef union epoll_data
{
void *ptr; /* payload as an opaque pointer */
int fd; /* payload as an int; the server example in this file stores the descriptor here */
uint32_t u32; /* payload as a 32-bit unsigned integer */
uint64_t u64; /* payload as a 64-bit unsigned integer */
} epoll_data_t;
/* One event record: the type pointed to by the `event` argument of
   epoll_ctl() and the element type of the array filled by epoll_wait(). */
struct epoll_event
{
uint32_t events; /* Epoll events: bitmask of EPOLL* flags (EPOLLIN, EPOLLOUT, EPOLLET, ...) */
epoll_data_t data; /* User data variable */
};
其中events可以包含以下事件类型:
- EPOLLIN: 描述符可读
- EPOLLOUT: 描述符可写
- EPOLLRDHUP(since Linux 2.6.17): 流套接字对端关闭连接或者关闭写端
- EPOLLPRI: 紧急数据可读
- EPOLLERR: 描述符发生错误,该事件由内核一直监听(比如connect套接字失败会返回EPOLLERR)
- EPOLLHUP: 文件描述符被挂断,该事件由内核一直监听
- EPOLLET: 开启边缘触发,默认是水平触发
- EPOLLONESHOT: 一个事件发生并读取之后,fd自动不再监控;若要重新监控需要使用EPOLL_CTL_MOD重新设置
返回值: 成功返回0,失败返回-1并设置errno
等待epoll事件
int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
参数说明:
- epfd: epoll_create返回的epoll实例
- events: 存储epoll_event的数组地址
- maxevents: 最大事件的数量,需>0
- timeout: 等待的最长时间,单位为毫秒;-1表示一直阻塞直到有事件发生,0表示立即返回
返回值:
成功时返回就绪的监听文件描述符数;当超出timeout指定的时间后如果无就绪的文件描述符,返回0;发生错误时返回-1并设置errno
另外,Linux kernel 2.6.19 引入了epoll_pwait,可以在等待时设置信号掩码,其使用方式类似pselect
some problems:
- epoll 怎么判断是connect请求还是有数据可读?
ans: 判断events[i].data.fd == listen_fd
- read 返回值说明:
- return -1 and errno == EAGAIN: 数据已经读完,没有可读数据
- return 0: end of file,对端关闭连接
关于水平触发(Level-Triggered)和边缘触发(Edge-Triggered)
当缓冲区有数据可读时,ET会触发一次事件,之后就不会再触发;而LT只要我们没有读完缓冲区的数据,事件就会一直触发。
推荐使用的epoll ET方式如下:
- 设置fd为非阻塞
- 当调用read或write读写时,在其返回-1,且errno == EAGAIN 后再调用epoll_wait等待
tips:
ET模式只能用于设置了O_NONBLOCK的fd,而LT则同时支持阻塞及非阻塞的fd。如果将ET模式应用于阻塞的fd,将出现如下问题:
当对端send 2 byte数据,而服务端只读取了1 byte后再去调用epoll_wait,这时将不产生读事件。直到对端又有数据发送过来,epoll_wait才会再次返回
补充:
Q:当又有事件产生时会怎么样,原来的数据还在吗?
A:原来的数据还在socket缓冲区
epoll实例
epoll使用参考:
- 服务端代码:How to use epoll? A complete example in C, it's a well-written article.
/*
 * Create a TCP socket and bind it to `port` on all local interfaces.
 *
 * port: service name or decimal port number, handed to getaddrinfo().
 *
 * Returns the bound socket descriptor, or -1 on failure after writing a
 * diagnostic to stderr.  The caller owns the returned descriptor and is
 * responsible for close()-ing it.
 */
static int
create_and_bind (char *port)
{
  struct addrinfo hints;
  struct addrinfo *result, *rp;
  int s;
  int sfd = -1;

  memset (&hints, 0, sizeof hints);
  hints.ai_family = AF_UNSPEC;     /* Return IPv4 and IPv6 choices */
  hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */
  hints.ai_flags = AI_PASSIVE;     /* All interfaces */

  s = getaddrinfo (NULL, port, &hints, &result);
  if (s != 0)
    {
      fprintf (stderr, "getaddrinfo: %s\n", gai_strerror (s));
      return -1;
    }

  /* Try each candidate address until one can be bound. */
  for (rp = result; rp != NULL; rp = rp->ai_next)
    {
      sfd = socket (rp->ai_family, rp->ai_socktype, rp->ai_protocol);
      if (sfd == -1)
        continue;

      if (bind (sfd, rp->ai_addr, rp->ai_addrlen) == 0)
        break; /* We managed to bind successfully! */

      close (sfd);
      sfd = -1; /* never hand a closed descriptor back to the caller */
    }

  /* The address list is needed on neither the success nor the failure
     path any more, so release it before deciding the result; freeing it
     only on success would leak it when every bind() fails. */
  freeaddrinfo (result);

  if (sfd == -1)
    {
      fprintf (stderr, "Could not bind\n");
      return -1;
    }

  return sfd;
}
/*
 * Put descriptor `sfd` into non-blocking mode by adding O_NONBLOCK to its
 * file status flags.
 *
 * Returns 0 on success.  If reading or writing the flags fails, reports
 * the error with perror ("fcntl") and returns -1.
 */
static int
make_socket_non_blocking (int sfd)
{
  int current = fcntl (sfd, F_GETFL, 0);

  if (current != -1)
    {
      if (fcntl (sfd, F_SETFL, current | O_NONBLOCK) != -1)
        return 0;
    }

  /* Either fcntl() call failed; errno still belongs to that call. */
  perror ("fcntl");
  return -1;
}
#define MAXEVENTS 64

/*
 * Edge-triggered epoll server.
 *
 * Listens on the port given as argv[1], accepts every incoming connection
 * and copies whatever the peers send to standard output.
 *
 * The event loop only ends on an unrecoverable condition (epoll_wait()
 * failure or an error reported on the listening socket); in that case the
 * resources are released and EXIT_FAILURE is returned.  Setup errors and
 * failures while registering a new connection abort() the process.
 */
int
main (int argc, char *argv[])
{
  int sfd, s;
  int efd;
  struct epoll_event event;
  struct epoll_event *events;

  if (argc != 2)
    {
      fprintf (stderr, "Usage: %s [port]\n", argv[0]);
      exit (EXIT_FAILURE);
    }

  sfd = create_and_bind (argv[1]);
  if (sfd == -1)
    abort ();

  s = make_socket_non_blocking (sfd);
  if (s == -1)
    abort ();

  s = listen (sfd, SOMAXCONN);
  if (s == -1)
    {
      perror ("listen");
      abort ();
    }

  efd = epoll_create1 (0);
  if (efd == -1)
    {
      perror ("epoll_create");
      abort ();
    }

  event.data.fd = sfd;
  event.events = EPOLLIN | EPOLLET;
  s = epoll_ctl (efd, EPOLL_CTL_ADD, sfd, &event);
  if (s == -1)
    {
      perror ("epoll_ctl");
      abort ();
    }

  /* Buffer where events are returned */
  events = calloc (MAXEVENTS, sizeof *events);
  if (events == NULL)
    {
      perror ("calloc");
      abort ();
    }

  /* The event loop */
  while (1)
    {
      int n, i;

      n = epoll_wait (efd, events, MAXEVENTS, -1);
      if (n == -1)
        {
          /* A signal interrupting the wait is not an error: retry.
             Anything else would otherwise make this loop spin forever. */
          if (errno == EINTR)
            continue;
          perror ("epoll_wait");
          goto out;
        }

      for (i = 0; i < n; i++)
        {
          int fd = events[i].data.fd;

          /* EPOLLHUP on its own is covered by the "not readable" test.
             EPOLLHUP together with EPOLLIN is deliberately NOT treated
             as an error: the peer may have sent data right before
             hanging up, and the read loop below drains it and then sees
             end-of-file. */
          if ((events[i].events & EPOLLERR) ||
              (!(events[i].events & EPOLLIN)))
            {
              /* An error has occurred on this fd, or the socket is not
                 ready for reading (why were we notified then?) */
              fprintf (stderr, "epoll error\n");
              if (fd == sfd)
                {
                  /* Without the listening socket no new connection can
                     be accepted; stop instead of looping on a closed
                     descriptor.  sfd is closed in the cleanup below. */
                  goto out;
                }
              close (fd);
              continue;
            }
          else if (sfd == fd)
            {
              /* We have a notification on the listening socket, which
                 means one or more incoming connections. */
              while (1)
                {
                  /* sockaddr_storage is large enough for any address
                     family; a plain struct sockaddr is too small for an
                     IPv6 peer, and accept() would then report a length
                     larger than the buffer that getnameinfo() reads. */
                  struct sockaddr_storage in_addr;
                  socklen_t in_len;
                  int infd;
                  char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];

                  in_len = sizeof in_addr;
                  infd = accept (sfd, (struct sockaddr *) &in_addr, &in_len);
                  if (infd == -1)
                    {
                      if ((errno == EAGAIN) ||
                          (errno == EWOULDBLOCK))
                        {
                          /* We have processed all incoming
                             connections. */
                          break;
                        }
                      else if ((errno == EINTR) ||
                               (errno == ECONNABORTED))
                        {
                          /* Transient: in edge-triggered mode the rest
                             of the backlog must still be drained. */
                          continue;
                        }
                      else
                        {
                          perror ("accept");
                          break;
                        }
                    }

                  s = getnameinfo ((struct sockaddr *) &in_addr, in_len,
                                   hbuf, sizeof hbuf,
                                   sbuf, sizeof sbuf,
                                   NI_NUMERICHOST | NI_NUMERICSERV);
                  if (s == 0)
                    {
                      printf("Accepted connection on descriptor %d "
                             "(host=%s, port=%s)\n", infd, hbuf, sbuf);
                    }

                  /* Make the incoming socket non-blocking and add it to the
                     list of fds to monitor. */
                  s = make_socket_non_blocking (infd);
                  if (s == -1)
                    abort ();

                  event.data.fd = infd;
                  event.events = EPOLLIN | EPOLLET;
                  s = epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event);
                  if (s == -1)
                    {
                      perror ("epoll_ctl");
                      abort ();
                    }
                }
              continue;
            }
          else
            {
              /* We have data on the fd waiting to be read. Read and
                 display it. We must read whatever data is available
                 completely, as we are running in edge-triggered mode
                 and won't get a notification again for the same
                 data. */
              int done = 0;

              while (1)
                {
                  ssize_t count;
                  ssize_t written;
                  char buf[512];

                  count = read (fd, buf, sizeof buf);
                  if (count == -1)
                    {
                      if (errno == EINTR)
                        continue;
                      /* If errno == EAGAIN, that means we have read all
                         data. So go back to the main loop. */
                      if ((errno != EAGAIN) && (errno != EWOULDBLOCK))
                        {
                          perror ("read");
                          done = 1;
                        }
                      break;
                    }
                  else if (count == 0)
                    {
                      /* End of file. The remote has closed the
                         connection. */
                      done = 1;
                      break;
                    }

                  /* Write the buffer to standard output.  write() may
                     accept fewer bytes than requested, so keep going
                     until the whole chunk is out. */
                  written = 0;
                  while (written < count)
                    {
                      ssize_t w = write (STDOUT_FILENO, buf + written,
                                         (size_t) (count - written));
                      if (w == -1)
                        {
                          if (errno == EINTR)
                            continue;
                          perror ("write");
                          abort ();
                        }
                      written += w;
                    }
                }

              if (done)
                {
                  printf ("Closed connection on descriptor %d\n", fd);
                  /* Closing the descriptor will make epoll remove it
                     from the set of descriptors which are monitored. */
                  close (fd);
                }
            }
        }
    }

out:
  /* Only reached when the event loop gave up. */
  free (events);
  close (efd);
  close (sfd);
  return EXIT_FAILURE;
}
- 客户端代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <netdb.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>
/*
 * Open a TCP connection to `port` on the local host.
 *
 * port: service name or decimal port number, handed to getaddrinfo().
 *
 * Returns the connected socket descriptor, or -1 if `port` is NULL, name
 * resolution fails, or no candidate address accepts the connection (a
 * diagnostic is written to stderr for the latter two).  The caller owns
 * the returned descriptor and is responsible for close()-ing it.
 */
int create_and_connect(char * port)
{
    if(NULL == port)
    {
        return -1;
    }

    struct addrinfo hints;
    memset(&hints, 0, sizeof hints);
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    /* AI_PASSIVE is intentionally not set: combined with a NULL node it
       makes getaddrinfo() return wildcard addresses, which are meant for
       bind().  Without it a NULL node resolves to the loopback address,
       which is what connect() needs. */

    struct addrinfo *result;
    int ret = getaddrinfo(NULL, port, &hints, &result);
    if(ret != 0)
    {
        fprintf(stderr, "getaddrinfo error: %s\n", gai_strerror(ret));
        return -1;
    }

    /* Try each candidate address until one connects. */
    int cfd = -1;
    struct addrinfo *rp;
    for(rp = result; rp != NULL; rp = rp->ai_next)
    {
        cfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
        if(-1 == cfd)
        {
            continue;
        }

        //client connect
        if(0 == connect(cfd, rp->ai_addr, rp->ai_addrlen))
        {
            break;
        }

        close(cfd);
        cfd = -1; /* never hand a closed descriptor back to the caller */
    }

    /* Release the address list on both outcomes; freeing it only on
       success would leak it when every connect() fails. */
    freeaddrinfo(result);

    if(-1 == cfd)
    {
        fprintf(stderr, "connect to port failed!\n");
        return -1;
    }

    return cfd;
}
/*
 * Test client: connects to the port given as argv[1] on the local host
 * and sends the greeting twice, one second apart, so that the server
 * receives two separate readiness notifications.
 *
 * Returns 0 on success and -1 if the connection or a send() fails; a
 * wrong argument count terminates with exit(-1).
 */
int main(int argc, char *argv[])
{
    if(argc != 2)
    {
        fprintf(stderr, "Usage: %s [port]\n", argv[0]);
        exit(-1);
    }

    //client send something
    int cfd = create_and_connect(argv[1]);
    if(-1 == cfd)
    {
        fprintf(stderr, "create_and_connect failed\n");
        return -1;
    }

    const char *pData = "Client hello!";
    size_t dataLen = strlen(pData);
    int rc = 0;

    if(send(cfd, pData, dataLen, 0) == -1)
    {
        perror("send");
        rc = -1;
    }
    else
    {
        sleep(1);
        if(send(cfd, pData, dataLen, 0) == -1)
        {
            perror("send");
            rc = -1;
        }
    }

    /* Close explicitly on every path instead of relying on process exit. */
    close(cfd);
    return rc;
}
mac下的epoll
mac os不支持epoll,其使用kqueue实现(类似epoll),头文件 sys/event.h
link:https://zhuanlan.zhihu.com/p/21375144
epoll源码实现
参考博客:Linux epoll 详解