本篇主要简述epoll的内核实现,以及结合之前select与poll的实现分析作出它与这两者间的区别。
一、epoll简介
使用过epoll的同学都知道我们要使用epoll主要使用三个函数,epoll_create
,epoll_ctl
与epoll_wait
。在man手册中给出的epoll使用的例子如下。大致可以分为三个部分:
- 使用epoll_create创建epoll的文件描述符;
- 将我们需要监听的文件描述符和事件通过epoll_ctl加入到epoll描述符中;
- 使用epoll_wait函数等待事件的发生。
#define MAX_EVENTS 10
struct epoll_event ev, events[MAX_EVENTS];
int listen_sock, conn_sock, nfds, epollfd;
/* Code to set up listening socket, 'listen_sock',
(socket(), bind(), listen()) omitted */
/*1.创建epoll的文件描述符*/
epollfd = epoll_create1(0);
if (epollfd == -1) {
perror("epoll_create1");
exit(EXIT_FAILURE);
}
/*2.将我们需要监听的文件描述符和事件通过epoll_ctl加入到epoll描述符中*/
ev.events = EPOLLIN;
ev.data.fd = listen_sock;
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, listen_sock, &ev) == -1) {
perror("epoll_ctl: listen_sock");
exit(EXIT_FAILURE);
}
for (;;) {
/*3.等待事件的发生*/
nfds = epoll_wait(epollfd, events, MAX_EVENTS, -1);
if (nfds == -1) {
perror("epoll_wait");
exit(EXIT_FAILURE);
}
for (n = 0; n < nfds; ++n) {
if (events[n].data.fd == listen_sock) {
conn_sock = accept(listen_sock,
(struct sockaddr *) &addr, &addrlen);
if (conn_sock == -1) {
perror("accept");
exit(EXIT_FAILURE);
}
setnonblocking(conn_sock);
ev.events = EPOLLIN | EPOLLET;
ev.data.fd = conn_sock;
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn_sock,
&ev) == -1) {
perror("epoll_ctl: conn_sock");
exit(EXIT_FAILURE);
}
} else {
do_use_fd(events[n].data.fd);
}
}
}
当然以上都是在用户空间的使用,下面将对内核空间的实现进行介绍。
二、epoll在内核中的实现
与用户空间对应,用户空间调用的三个epoll函数在内核中都有对应的实现。
1、epoll_create
首先我们看到epoll_create
与epoll_create1
的实现基本是一样的,两者最后调用的都是epoll_create1
函数。
SYSCALL_DEFINE1(epoll_create, int, size)
{
if (size <= 0)
return -EINVAL;
return sys_epoll_create1(0);
}
而epoll_create1
的函数主要是在内核中申请了一个eventpoll
结构,这个结构承载着epoll的核心数据,包括已经就绪的描述符列表、各种互斥锁等。除此之外该函数还申请了一个文件描述符,并为该描述符增加了文件指针核inode结点,使得返回的epoll描述符可以像普通文件一样进行操作,在最后它将eventpoll
结构放置在了文件的私有数据域。做完上述操作之后,将申请的epoll描述符返回给用户,这样之后用户只需要传入epoll的描述符就可以访问到epoll的的主数据结构eventpoll。
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
int error, fd;
struct eventpoll *ep = NULL;
struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
/*
* Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep);
if (error < 0)
return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
error = fd;
goto out_free_ep;
}
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
ep->file = file;
fd_install(fd, file);
return fd;
out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_free(ep);
return error;
}
2、epoll_ctl
epoll_ctl主要是对event_poll的主数据结构中的描述符集合进行操作,这些描述符集合就是我们等待事件的那些文件描述符。从前一篇文章I/O多路复用(一)select与poll的内核实现简述中我们知道,select使用数组存放描述符集合,poll使用数组+链表存放描述符集合,而在这里epoll使用的则是红黑树来存放描述符集合。
epoll_ctl对描述符集合的操作主要包括增加、修改与删除。增加即在红黑树中增加描述符,修改就是找到目标描述符并修改它的属性,删除就是删除红黑树中的目标描述符。修改与删除操作较为简单,这里主要介绍增加操作。
int epoll_ctl(...)
{
...
/*在红黑树中查找目标描述符*/
epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
/*增加描述符*/
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
clear_tfile_check_list();
break;
/*删除描述符*/
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
/*修改描述符*/
case EPOLL_CTL_MOD:
if (epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}
...
}
增加描述符是调用了ep_insert
函数。ep_insert
函数主要做了四件事情:
- 为增加的描述符申请epoll_item结构体,用来存放等待事件等等;
- 为该描述符对应的设备驱动的poll函数中的
poll_wait
赋值(设备驱动的poll函数调用的其实是poll_wait函数,poll_wait函数的主要功能是将当前进程加入到该设备的等待队列中,当设备状态就绪时将唤醒等待队列上的进程并执行添加的唤醒回调); - 立即检查当前设备是否就绪,如果就绪的话就将当前设备的epoll_item结构体添加至
event_poll
的ready_list
中,并唤醒当前进程; - 将申请的epoll_item添加至红黑树中。
static int ep_insert(...)
{
/*1、为所添加的描述符申请epoll_item结构,并初始化*/
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
epq.epi = epi;
/*2、为poll_wait函数赋值*/
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*3、立即进行对添加的设备进行poll操作获取当前状态*/
revents = ep_item_poll(epi, &epq.pt);
/*4、将当前设备的epoll_item添加到红黑树中*/
ep_rbtree_insert(ep, epi);
/*如果当前设备已经就绪那么久唤醒等待队列*/
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
}
注意上述代码中的ep_ptable_queue_proc
函数,该函数即是设备执行poll函数中的poll_wait
真正执行的函数(可以查看init_poll_funcptr
函数和poll.h中的poll_wait
知晓)。这个函数的功能是将当前进程添加到设备就绪的等待列表中,除此之外它还为设备唤醒添加了一个回调函数ep_poll_callback,这个函数将在队列唤醒时执行。
static void ep_ptable_queue_proc(...)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
/*被唤醒的时候执行的回调*/
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
ep_poll_callback
函数将在队列被唤醒时调用。它的主要功能是将当前设备的epoll_item
添加到event_poll
的ready_list
中,并唤醒当前进程。这里需要注意的是ovflist
列表,ovflist
存放的是epoll正在把就绪结果拷贝到用户空间时,刚好有设备就绪,此时应把该就绪设备加入到ovflist
列表中,之后再进行处理。
static int ep_poll_callback(...)
{
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
if ((unsigned long)key & POLLFREE) {
ep_pwq_from_wait(wait)->whead = NULL;
/*
* whead = NULL above can race with ep_remove_wait_queue()
* which can do another remove_wait_queue() after us, so we
* can't use __remove_wait_queue(). whead->lock is held by
* the caller.
*/
list_del_init(&wait->task_list);
}
spin_lock_irqsave(&ep->lock, flags);
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
/*
* Check the events coming with the callback. At this stage, not
* every device reports the events in the "key" parameter of the
* callback. We need to be able to handle both cases here, hence the
* test for "key" != NULL before the event match test.
*/
if (key && !((unsigned long) key & epi->event.events))
goto out_unlock;
/*
* If we are transferring events to userspace, we can hold no locks
* (because we're accessing user memory, and because of linux f_op->poll()
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
if (epi->next == EP_UNACTIVE_PTR) {
epi->next = ep->ovflist;
ep->ovflist = epi;
if (epi->ws) {
/*
* Activate ep->ws since epi->ws may get
* deactivated at any time.
*/
__pm_stay_awake(ep->ws);
}
}
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake_rcu(epi);
}
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
/*这个是eventpoll作为fd进行poll操作时*/
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
return 1;
}
3、epoll_wait
epoll_wait其实就是一直在等待目标设备的就绪。epoll_wait最终调用的是ep_poll
函数。
int epoll_wait(...)
{
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
ep = f.file->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);
error_fput:
fdput(f);
return error;
}
ep_poll主要是在睡眠,等到超时或者被唤醒的时候才会继续执行。
static int ep_poll(...)
{
fetch_events:
spin_lock_irqsave(&ep->lock, flags);
if (!ep_events_available(ep)) {
/*加入到等待队列*/
init_waitqueue_entry(&wait, current);
__add_wait_queue_exclusive(&ep->wq, &wait);
for (;;) {
/*睡眠,等待唤醒*/
set_current_state(TASK_INTERRUPTIBLE);
if (ep_events_available(ep) || timed_out)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
spin_unlock_irqrestore(&ep->lock, flags);
/*检查是否超时*/
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
__set_current_state(TASK_RUNNING);
}
check_events:
/* Is it worth to try to dig for events ? */
eavail = ep_events_available(ep);
spin_unlock_irqrestore(&ep->lock, flags);
/*将就绪的设备拷贝到用户空间*/
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
return res;
}
ep_send_events函数负责处理已经就绪的设备,其调用的是ep_scan_ready_list
函数,而观察ep_scan_ready_list
函数我们发现,最终调用的是ep_send_events_proc
将就绪的设备信息拷贝到用户空间的。
static int ep_send_events(...)
{
struct ep_send_events_data esed;
esed.maxevents = maxevents;
esed.events = events;
return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
static int ep_scan_ready_list(...)
{
...
/*将就绪的设备状态拷贝到用户空间*/
error = (*sproc)(ep, &txlist, priv);
/*
*下面这些就是处理ovflist列表,即发送时就绪的设备最终还是
*会放到readylist中,等待发送到用户空间
*/
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
for (nepi = ep->ovflist; (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
* During the "sproc" callback execution time, items are
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
ep->ovflist = EP_UNACTIVE_PTR;
/*
* Quickly re-inject items left on "txlist".
*/
list_splice(&txlist, &ep->rdllist);
__pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
/*
* Wake up (if active) both the eventpoll wait list and
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
return error;
}
ep_send_events_proc函数就是将就绪的设备信息拷贝到用户空间的地方。主要就是遍历readylist,然后把readylist中就绪设备的信息拷贝到用户的event内存中。
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
init_poll_funcptr(&pt, NULL);
/*遍历readylist*/
for (eventcnt = 0, uevent = esed->events;
!list_empty(head) && eventcnt < esed->maxevents;) {
epi = list_first_entry(head, struct epitem, rdllink);
/*
* Activate ep->ws before deactivating epi->ws to prevent
* triggering auto-suspend here (in case we reactive epi->ws
* below).
*
* This could be rearranged to delay the deactivation of epi->ws
* instead, but then epi->ws would temporarily be out of sync
* with ep_is_linked().
*/
ws = ep_wakeup_source(epi);
if (ws) {
if (ws->active)
__pm_stay_awake(ep->ws);
__pm_relax(ws);
}
list_del_init(&epi->rdllink);
revents = ep_item_poll(epi, &pt);
/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, ep_scan_ready_list()
* is holding "mtx", so no operations coming from userspace
* can change the item.
*/
if (revents) {
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
ep_pm_stay_awake(epi);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
uevent++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
/*
* If this file has been added with Level
* Trigger mode, we need to insert back inside
* the ready list, so that the next call to
* epoll_wait() will check again the events
* availability. At this point, no one can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
* ep_scan_ready_list() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
}
return eventcnt;
}