linux代码版本:linux4.4
导读:在监控大量 fd 的时候, select 和 poll 有着明显的缺点:1. 每次调用都要把所有的 fd copy 到内核 2. 活跃 fd 不多的时候,轮询全部 fd 的方式效率低 3. 返回后无法直接得知是哪些 fd 产生了事件,还需要再遍历一遍。而 epoll 对这几个缺点有明显的改进:1. 通过 epoll_ctl 事先将 fd 传递给内核,监控的时候不需要再 copy 到内核 2. 采用事件回调(event)的方式,而不是轮询 3. 将产生事件的 fd 放入就绪链表,直接查询该链表就行了。
一、epoll
先看一下 epoll 的几个接口函数:
1. epoll_create(int size);
创建 epoll 句柄。size 原本用于提示要监控的 fd 数量,但从下文 linux 4.4 的实现可以看到,它只需要大于 0,并不会限制可监控 fd 的数量
2. int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
epoll 事件注冊函数。
epfd是 epoll_create 产生的句柄
op表示动作:用三个宏表示:
EPOLL_CTL_ADD:注冊新的fd到epfd中;
EPOLL_CTL_MOD:改动已经注冊的fd的监听事件;
EPOLL_CTL_DEL:从epfd中删除一个fd;
fd 就是监听的fd
events可以是下面几种的组合:
EPOLLIN :相应的 fd 能够读
EPOLLOUT:相应的 fd 能够写
EPOLLPRI:相应的 fd 有紧急的数据可读(这里应该表示有带外数据到来);
EPOLLERR:相应的 fd 发生错误;
EPOLLHUP:相应的 fd 被挂断。
EPOLLET: 将EPOLL设为边缘触发(Edge Triggered)模式。这是相对于水平触发(Level Triggered)来说的。
EPOLLONESHOT:仅仅监听一次事件。事件上报一次之后,这个 fd 并不会从 epoll 中删除,而是被禁用(不再上报事件),需要用 EPOLL_CTL_MOD 重新设置监听事件之后才会继续监听。
3. int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
等待事件的产生,并将产生的事件放到 events ,避免像 select 和 poll 一样还需要遍历查询都哪些 fd 上有事件
二、epoll 函数分析
/*
 * Legacy epoll_create entry point.  The size argument is only checked
 * for being positive; it is not forwarded, and the actual work is done
 * by sys_epoll_create1() called with flags == 0.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	/* A positive size is the only requirement; anything else is -EINVAL. */
	if (size > 0)
		return sys_epoll_create1(0);

	return -EINVAL;
}
可以看到,size 参数只要大于 0(size <= 0 会返回 -EINVAL)就行,除此之外没有任何作用,并没有向下传递,接下来看 sys_epoll_create1
/*
 * epoll_create1 system call: obtains a struct eventpoll from ep_alloc(),
 * reserves an unused fd, gets an "[eventpoll]" file that carries the
 * eventpoll as its private data, binds fd and file together and returns
 * the fd.  On failure a negative error code is returned after undoing
 * the steps already taken.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
int error, fd;
struct eventpoll *ep = NULL;
struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
/* epoll_create() passes flags == 0; any bit other than EPOLL_CLOEXEC is rejected */
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
/*
* Create the internal data structure ("struct eventpoll").
*/
/* ep_alloc() hands the new struct eventpoll back through &ep */
error = ep_alloc(&ep);
if (error < 0)
return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
/* Reserve an unused fd, requesting O_RDWR plus the caller's O_CLOEXEC bit */
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
error = fd;
goto out_free_ep;
}
/* Get a struct file named "[eventpoll]" that uses eventpoll_fops and carries ep as private data */
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
/*
 * NOTE: the braces below are the article's inline expansion of the body of
 * anon_inode_getfile(); they are not part of epoll_create1().  Inside them,
 * name/fops/priv/flags are that function's parameters (the four arguments
 * of the call above) and every "return" returns from anon_inode_getfile().
 */
{
struct qstr this;
struct path path;
struct file *file;
/* Fail with -ENODEV when anon_inode_inode holds an error pointer */
if (IS_ERR(anon_inode_inode))
return ERR_PTR(-ENODEV);
/* Take a reference on the module that owns fops, if any (fops is eventpoll_fops for the call above) */
if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);
/*
* Link the inode to a directory entry by creating a unique name
* using the inode sequence number.
*/
file = ERR_PTR(-ENOMEM);
this.name = name;
this.len = strlen(name);
this.hash = 0;
/* The inode already exists; allocate a dentry for it on the anon_inode mount's superblock */
path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
if (!path.dentry)
goto err_module;
path.mnt = mntget(anon_inode_mnt);
/*
* We know the anon_inode inode count is always greater than zero,
* so ihold() is safe.
*/
ihold(anon_inode_inode);
/* Associate the dentry with the inode */
d_instantiate(path.dentry, anon_inode_inode);
/* Allocate the struct file for this path with the given fops, then fill it in */
file = alloc_file(&path, OPEN_FMODE(flags), fops);
if (IS_ERR(file))
goto err_dput;
file->f_mapping = anon_inode_inode->i_mapping;
file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
file->private_data = priv;
return file;
err_dput:
path_put(&path);
err_module:
module_put(fops->owner);
return file;
}
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
/* Record the file in the eventpoll, then bind the reserved fd to it */
ep->file = file;
fd_install(fd, file);
return fd;
out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_free(ep);
return error;
}
这段代码就非常有意思,
1. 先获取一个进程未使用的 fd ,作为 epoll_create 的返回值
2. 在匿名文件系统中创建一个 eventpoll 的文件(创建 dentry 并链接 inode),再分配 file 结构体并填充相关内容,fops 用的是 eventpoll_fops
3. fd 和 file 关联
4. eventpoll 是一个非常重要的结构体,作为 私有结构赋值给 file
整个过程就类似于 open 函数创建一个 eventpoll 文件并打开,返回 fd ,可以简单认为,epoll_create 在匿名文件系统中创建了一个 eventpoll 文件并打开。
下面看 epoll_ctl :
/*
 * epoll_ctl system call: adds, modifies or removes the watch that the
 * eventpoll behind epfd keeps for the target fd, depending on op.
 * Returns the result of ep_insert()/ep_modify()/ep_remove(), or a
 * negative error code from one of the checks below.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;
error = -EFAULT;
/* Copy the user's epoll_event in when the op carries one (presumably every op except EPOLL_CTL_DEL — confirm in ep_op_has_event()) */
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
error = -EBADF;
/* Look up the struct file behind epfd */
f = fdget(epfd);
if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
/* Then look up the struct file behind the fd being operated on */
tf = fdget(fd);
if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
/* The target file must provide f_op->poll, otherwise -EPERM is returned */
if (!tf.file->f_op->poll)
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
/* NOTE(review): presumably drops EPOLLWAKEUP from epds when it is not allowed — confirm in ep_take_care_of_epollwakeup() */
ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
/* Reject epfd being added to itself, and reject an epfd that is not an epoll file */
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
/* private_data is the eventpoll that was attached when the epoll file was created */
ep = f.file->private_data;
/*
* When we insert an epoll file descriptor, inside another epoll file
* descriptor, there is the change of creating closed loops, which are
* better be handled here, than in more critical paths. While we are
* checking for loops we also determine the list of files reachable
* and hang them on the tfile_check_list, so we can check that we
* haven't created too many possible wakeup paths.
*
* We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
* the epoll file descriptor is attaching directly to a wakeup source,
* unless the epoll file descriptor is nested. The purpose of taking the
* 'epmutex' on add is to prevent complex toplogies such as loops and
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
mutex_lock_nested(&ep->mtx, 0);
/* Extra checks that only apply to EPOLL_CTL_ADD */
if (op == EPOLL_CTL_ADD) {
/* Full check is needed when epfd's own f_ep_links list is non-empty or the target is itself an epoll file */
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
full_check = 1;
mutex_unlock(&ep->mtx);
mutex_lock(&epmutex);
/* Target is an epoll file: run the loop check and fail with -ELOOP if it reports a problem */
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
clear_tfile_check_list();
goto error_tgt_fput;
}
} else
/* Otherwise put the target file on tfile_check_list */
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
mutex_lock_nested(&ep->mtx, 0);
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
mutex_lock_nested(&tep->mtx, 1);
}
}
}
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
/* Find the epitem this eventpoll holds for (tf.file, fd), if any */
epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
/* ADD requires that no epitem exists yet; a duplicate add yields -EEXIST */
if (!epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tf.file, fd, full_check);
/*
 * NOTE: the braces below are the article's inline expansion of the body of
 * ep_insert(); they are not part of epoll_ctl().  Inside them, event, tfile,
 * fd and full_check are ep_insert()'s parameters (&epds, tf.file, fd and
 * full_check in the call above), the locals shadow the outer ones, and
 * every "return" returns from ep_insert().
 */
{
int error, revents, pwake = 0;
unsigned long flags;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
/* The per-user watch count (ep->user->epoll_watches) must stay below max_user_watches */
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
/* Allocate the epitem from epi_cache */
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
/* Item initialization follow here ... */
/* Link used for the eventpoll ready list (ep->rdllist) */
INIT_LIST_HEAD(&epi->rdllink);
/* Link used for the target file's f_ep_links list */
INIT_LIST_HEAD(&epi->fllink);
/* List of the eppoll_entry wait-queue hooks owned by this item */
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
/* EPOLLWAKEUP requested: create a wakeup source for this item */
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
RCU_INIT_POINTER(epi->ws, NULL);
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
/* Register ep_ptable_queue_proc as the poll-table callback; NOTE(review): presumably invoked through poll_wait() from the target's ->poll — confirm */
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
 * NOTE: the braces below are the article's inline expansion of the body of
 * ep_ptable_queue_proc(); whead and pt are that function's parameters.
 */
{
/* Allocate an eppoll_entry, hook it onto the wait queue head and onto epi->pwqlist */
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
}
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function. Note that after
* this operation completes, the poll callback can start hitting
* the new item.
*/
/* Call the target file's ->poll and keep the event bits that are currently set */
revents = ep_item_poll(epi, &epq.pt);
/*
 * NOTE: the braces below are the article's inline expansion of the body of
 * ep_item_poll(); pt is that function's poll-table parameter and the
 * "return" returns from ep_item_poll().
 */
{
pt->_key = epi->event.events;
return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
}
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
error = -ENOMEM;
if (epi->nwait < 0)
goto error_unregister;
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
ep_rbtree_insert(ep, epi);
/* now check if we've created too many backpaths */
error = -EINVAL;
if (full_check && reverse_path_check())
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
/* If the file is already "ready" we drop it inside the ready list */
/* Requested events are already pending: queue the item on the eventpoll ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
spin_unlock_irqrestore(&ep->lock, flags);
atomic_long_inc(&ep->user->epoll_watches);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
return 0;
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
rb_erase(&epi->rbn, &ep->rbr);
error_unregister:
ep_unregister_pollwait(ep, epi);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
spin_lock_irqsave(&ep->lock, flags);
if (ep_is_linked(&epi->rdllink))
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
wakeup_source_unregister(ep_wakeup_source(epi));
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
return error;
}
} else
error = -EEXIST;
if (full_check)
clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD:
if (epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}
if (tep != NULL)
mutex_unlock(&tep->mtx);
mutex_unlock(&ep->mtx);
error_tgt_fput:
if (full_check)
mutex_unlock(&epmutex);
fdput(tf);
error_fput:
fdput(f);
error_return:
return error;
}
简单总结如下:
1. 根据 epfd、fd 获取到 相应的 file
2. 将 epoll_event copy 到内核
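3. epfd 必须是 epoll 的 fd ,且不能把 epfd 自己添加进来(f.file == tf.file 时返回 -EINVAL);被监控 fd 的 file->f_op->poll 不能为 NULL。其它 epoll 的 fd 是可以被嵌套添加的,但要先通过 ep_loop_check 的环路检查,否则返回 -ELOOP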
4. 查询 fd 对应的 epi(每添加一个 fd ,为其分配一个 epi 结构体及 ep_pqueue ) ,
5. EPOLL_CTL_ADD 为 添加的 fd 创建对应的 epi 及 eppoll_entry,调用驱动 poll 时,将 eppoll_entry 添加到文件的等待队列头,其 eppoll_entry 回调函数为 ep_poll_callback ,
6. 接下来调用 驱动对应的 poll 函数,得到返回值,如果有事件,将 epi->rdllink (可通过 epi 获得 fd 及其 event )插入到 ep->rdllist ,同时wakeup 因sys_epoll_wait(ep->wq)和 by file->poll (ep->poll_wait) 而休眠的进程(其实就是调用 epoll_wait 的进程)
下面看一个非常关键的函数 ep_poll_callback ,当监控文件有 事件产生时,会被 wake up,并调用等待队列的回调函数 ep_poll_callback
/*
 * Wait-queue callback for an eppoll_entry (installed with
 * init_waitqueue_func_entry() when the entry is created).  Under ep->lock
 * it queues the owning epitem either on ep->ovflist or on the ready list
 * ep->rdllist and wakes the waiters on ep->wq; ep->poll_wait is woken
 * after the lock has been dropped.  A key carrying POLLFREE additionally
 * detaches the wait entry.  Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait); // recover the epitem from the wait-queue entry
struct eventpoll *ep = epi->ep; // the eventpoll this item belongs to
spin_lock_irqsave(&ep->lock, flags);
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
/*
* Check the events coming with the callback. At this stage, not
* every device reports the events in the "key" parameter of the
* callback. We need to be able to handle both cases here, hence the
* test for "key" != NULL before the event match test.
*/
if (key && !((unsigned long) key & epi->event.events))
goto out_unlock;
/*
* If we are transferring events to userspace, we can hold no locks
* (because we're accessing user memory, and because of linux f_op->poll()
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
/* While ovflist is in use (not EP_UNACTIVE_PTR), chain the epi there instead of on the ready list */
if (ep->ovflist != EP_UNACTIVE_PTR) {
if (epi->next == EP_UNACTIVE_PTR) {
epi->next = ep->ovflist;
ep->ovflist = epi;
if (epi->ws) {
/*
* Activate ep->ws since epi->ws may get
* deactivated at any time.
*/
__pm_stay_awake(ep->ws);
}
}
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
/* Append the epi to the eventpoll ready list unless it is already linked */
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake_rcu(epi);
}
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
/* Wake the waiters on ep->wq; ep->poll_wait is only counted here and woken after unlocking */
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
/* key carries POLLFREE: unlink this wait entry and clear the entry's whead */
if ((unsigned long)key & POLLFREE) {
/*
* If we race with ep_remove_wait_queue() it can miss
* ->whead = NULL and do another remove_wait_queue() after
* us, so we can't use __remove_wait_queue().
*/
list_del_init(&wait->task_list);
/*
* ->whead != NULL protects us from the race with ep_free()
* or ep_remove(), ep_remove_wait_queue() takes whead->lock
* held by the caller. Once we nullify it, nothing protects
* ep/epi or even wait.
*/
smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
}
return 1;
}
从代码中可知,当监控的 fd 有事件发生,就会唤醒(wakeup)其等待队列,然后调用 ep_poll_callback ,而该函数总结下来主要就做了一件事,就是将对应的 epi 插入到 eventpoll 的就绪链表中,并唤醒在 ep->wq 上休眠的进程。
猜测 epoll_wait 的主要任务就是监控 eventpoll 的就绪链表,然后将相应的 epi 转发为 event 输出给用户程序。
下面看 epoll_wait :
/*
 * epoll_wait system call: validates maxevents and the user buffer,
 * resolves epfd to its eventpoll and calls ep_poll() to wait for and
 * deliver events.  Returns the result of ep_poll(), or -EINVAL, -EFAULT
 * or -EBADF from the checks below.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
int error;
struct fd f;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
/* maxevents must be greater than 0 and no larger than EP_MAX_EVENTS */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
/* The user buffer must pass the write-access check for maxevents entries */
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
/* Look up the struct file behind epfd */
f = fdget(epfd);
if (!f.file)
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
/* epfd must refer to an epoll file */
if (!is_file_epoll(f.file))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
/* The eventpoll stored in the file's private_data */
ep = f.file->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);
/*
 * NOTE: the braces below are the article's inline expansion of the body of
 * ep_poll(); they are not part of epoll_wait().  Inside them, ep, events,
 * maxevents and timeout are ep_poll()'s parameters and "return res"
 * returns from ep_poll().
 */
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
u64 slack = 0;
wait_queue_t wait;
ktime_t expires, *to = NULL;
if (timeout > 0) {
struct timespec end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
*to = timespec_to_ktime(end_time);
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation.
*/
/* Non-blocking call: skip the wait queue, check once and return */
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
goto check_events;
}
fetch_events:
spin_lock_irqsave(&ep->lock, flags);
if (!ep_events_available(ep)) {
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
*/
/* Put the current task on ep->wq as an exclusive waiter */
init_waitqueue_entry(&wait, current);
__add_wait_queue_exclusive(&ep->wq, &wait);
/* Loop until events are available, the timeout expires or a signal is pending (ep->wq is woken by both ep_insert() and ep_poll_callback()) */
for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
if (ep_events_available(ep) || timed_out)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
spin_unlock_irqrestore(&ep->lock, flags);
if (!freezable_schedule_hrtimeout_range(to, slack,
HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
__set_current_state(TASK_RUNNING);
}
check_events:
/* Is it worth to try to dig for events ? */
eavail = ep_events_available(ep);
spin_unlock_irqrestore(&ep->lock, flags);
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
/* Events are available: hand them to the user's events buffer via ep_send_events() */
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
return res;
}
error_fput:
fdput(f);
return error;
}
从上面的代码可知,除了一些参数合法性判断,其实就是检查 eventpoll 就绪链表(ep->rdllist) 是否有epi ,然后将 epi 转换成 events 再输出给用户态。
总结:做过底层开发的人都知道,通常对于硬件事件有 查询法 和 中断 两种方式,select和poll 就类似 查询法 ,而 epoll 则类似中断法。而实际底层开发过程中,一般也是用中断的方法处理硬件事件,
避免了 CPU 一次次的白转。