epoll是Linux内核为处理大批量句柄而作了改进的poll,是Linux下多路复用IO接口select/poll的增强版本,它能显著减少程序在大量并发连接中只有少量活跃的情况下的系统CPU利用率。
epoll主要涉及epoll_create、epoll_ctl、epoll_wait三个基本系统调用,以及增加了功能的epoll_create1、epoll_pwait
epoll相对poll/select比较:
1.poll/select每次调用都要将文件描述符及事件由用户空间复制到内核空间;epoll只需要在epoll_ctl中一次复制并保存在内核中
2.poll/select调用被阻塞时都会添加waiter到文件描述符表中所有文件的等待队列中,poll/select退出时再从等待队列中删除;epoll只需在epoll_ctl中将waiter添加到文件的等待队列中,直到被显式删除(close,epoll_ctl)
3.poll/select每次调用都会轮询文件描述符表中所有文件是否有事件发生;而epoll_wait只是从readylist链表中取出文件描述符及发生的事件(有事件发生时内核调用epoll_ctl注册的回调函数将文件描述符及事件添加到readylist中),readylist为空时epoll_wait阻塞
由以上3点可以看出epoll在大量并发连接且只有少量活跃时相对poll/select有相当大的性能优势
下面讨论epoll的实现,内核版本2.6.32.60
I.数据结构
i.eventpoll
epoll接口的主要数据结构
/* fs/eventpoll.c */
165 /*
166 * This structure is stored inside the "private_data" member of the file
167 * structure and rapresent the main data sructure for the eventpoll
168 * interface.
169 */
170 struct eventpoll {
171 /* Protect the this structure access */
172 spinlock_t lock;
173
174 /*
175 * This mutex is used to ensure that files are not removed
176 * while epoll is using them. This is held during the event
177 * collection loop, the file cleanup path, the epoll file exit
178 * code and the ctl operations.
179 */
180 struct mutex mtx;
181
182 /* Wait queue used by sys_epoll_wait() */
183 wait_queue_head_t wq;
184
185 /* Wait queue used by file->poll() */
186 wait_queue_head_t poll_wait;
187
188 /* List of ready file descriptors */
189 struct list_head rdllist;
190
191 /* RB tree root used to store monitored fd structs */
192 struct rb_root rbr;
193
194 /*
195 * This is a single linked list that chains all the "struct epitem" that
196 * happened while transfering ready events to userspace w/out
197 * holding ->lock.
198 */
199 struct epitem *ovflist;
200
201 /* The user that created the eventpoll descriptor */
202 struct user_struct *user;
203
204 struct file *file;
205
206 /* used to optimize loop detection check */
207 int visited;
208 struct list_head visited_list_link;
209 };
wq:用于记录epoll_wait系统调用的waiter
poll_wait:用于记录文件poll操作的waiter(由epoll_create创建的文件)
rdllist:已经ready的文件描述符链表
rbr:红黑树树根,用于文件描述符及事件的存储及快速查找
ovflist:文件描述符及事件由内核空间复制到用户空间时,期间出现的ready文件描述符及事件记录在该链表
ii.epitem
每个添加到eventpoll接口的文件都有对应的epitem,记录相应的事件信息等
129 /*
130 * Each file descriptor added to the eventpoll interface will
131 * have an entry of this type linked to the "rbr" RB tree.
132 */
133 struct epitem {
134 /* RB tree node used to link this structure to the eventpoll RB tree */
135 struct rb_node rbn;
136
137 /* List header used to link this structure to the eventpoll ready list */
138 struct list_head rdllink;
139
140 /*
141 * Works together "struct eventpoll"->ovflist in keeping the
142 * single linked chain of items.
143 */
144 struct epitem *next;
145
146 /* The file descriptor information this item refers to */
147 struct epoll_filefd ffd;
148
149 /* Number of active wait queue attached to poll operations */
150 int nwait;
151
152 /* List containing poll wait queues */
153 struct list_head pwqlist;
154
155 /* The "container" of this item */
156 struct eventpoll *ep;
157
158 /* List header used to link this item to the "struct file" items list */
159 struct list_head fllink;
160
161 /* The structure that describe the interested events and the source fd */
162 struct epoll_event event;
163 };
rbn:将epitem添加到eventpoll红黑树中
rdllink:将epitem添加到eventpoll就绪链表中
next:将epitem添加到ovflist链表中
ffd:表示添加到eventpoll的文件
nwait:挂接到poll操作上的活跃wait queue个数
pwqlist:poll wait queue链表,链表元素为eppoll_entry(通过其llink链入)
ep:epitem所属的eventpoll
iii.eppoll_entry
211 /* Wait structure used by the poll hooks */
212 struct eppoll_entry {
213 /* List header used to link this structure to the "struct epitem" */
214 struct list_head llink;
215
216 /* The "base" pointer is set to the container "struct epitem" */
217 struct epitem *base;
218
219 /*
220 * Wait queue item that will be linked to the target file wait
221 * queue head.
222 */
223 wait_queue_t wait;
224
225 /* The wait queue head that linked the "wait" wait queue item */
226 wait_queue_head_t *whead;
227 };
llink:链入epitem的pwqlist
base:指向epitem
wait:链入目标文件的等待队列
whead:指向目标文件的等待队列头(wait链入的等待队列)
iv.数据结构关系图
II.epoll_create
1456 /*
1457 * Open an eventpoll file descriptor.
1458 */
1459 SYSCALL_DEFINE1(epoll_create1, int, flags)
1460 {
1461 int error, fd;
1462 struct eventpoll *ep = NULL;
1463 struct file *file;
1464
1465 /* Check the EPOLL_* constant for consistency. */
1466 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1467
1468 if (flags & ~EPOLL_CLOEXEC)
1469 return -EINVAL;
1470 /*
1471 * Create the internal data structure ("struct eventpoll").
1472 */
1473 error = ep_alloc(&ep);
1474 if (error < 0)
1475 return error;
1476 /*
1477 * Creates all the items needed to setup an eventpoll file. That is,
1478 * a file structure and a free file descriptor.
1479 */
1480 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1481 if (fd < 0) {
1482 error = fd;
1483 goto out_free_ep;
1484 }
1485 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1486 O_RDWR | (flags & O_CLOEXEC));
1487 if (IS_ERR(file)) {
1488 error = PTR_ERR(file);
1489 goto out_free_fd;
1490 }
1491 fd_install(fd, file);
1492 ep->file = file;
1493 return fd;
1494
1495 out_free_fd:
1496 put_unused_fd(fd);
1497 out_free_ep:
1498 ep_free(ep);
1499 return error;
1500 }
1501
1502 SYSCALL_DEFINE1(epoll_create, int, size)
1503 {
1504 if (size <= 0)
1505 return -EINVAL;
1506
1507 return sys_epoll_create1(0);
1508 }
1.参数检查,size/flags
2.分配eventpoll并初始化
3.分配eventpoll文件描述符
4.创建文件对象,操作是eventpoll_fops
5.文件对象与文件描述符关联
6.返回eventpoll文件描述符
由以上可以看出epoll_create中的size在该版本中除了必须大于0的参数检查外没有其他作用
i.ep_eventpoll_poll
ep_eventpoll_poll是eventpoll文件的poll操作,主要用于将wait添加到文件的等待队列中并返回POLL事件
686 static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
687 void *priv)
688 {
689 struct epitem *epi, *tmp;
690
691 list_for_each_entry_safe(epi, tmp, head, rdllink) {
692 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
693 epi->event.events)
694 return POLLIN | POLLRDNORM;
695 else {
696 /*
697 * Item has been dropped into the ready list by the poll
698 * callback, but it's not actually ready, as far as
699 * caller requested events goes. We can remove it here.
700 */
701 list_del_init(&epi->rdllink);
702 }
703 }
704
705 return 0;
706 }
707
708 static int ep_poll_readyevents_proc(void *pr