一。模块的加载
linux把EPOLL当做一个模块,模块入口函数的代码如下:
/*
 * eventpoll_init - epoll subsystem entry point.
 *
 * Computes the per-user watch limit from the amount of low memory,
 * initializes the three nested-call tracking structures and creates the
 * two slab caches used for "struct epitem" and "struct eppoll_entry".
 *
 * Returns 0; the slab caches are created with SLAB_PANIC, so a failed
 * cache creation does not return here with an error.
 */
static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 * (totalram - totalhigh) is the low-memory page count; 1/25 of it is
	 * converted to bytes with PAGE_SHIFT and divided by the per-watch cost.
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/* Initialize the structure used to perform safe poll wait head wake ups */
	ep_nested_calls_init(&poll_safewake_ncalls);

	/* Initialize the structure used to perform file's f_op->poll() calls */
	ep_nested_calls_init(&poll_readywalk_ncalls);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
/*
 * fs_initcall() registers eventpoll_init as an initcall that the kernel
 * runs during boot. It plays the same role as module_init(), which for
 * built-in code is also an initcall, only at a different level.
 */
这个函数主要是进行一些初始化配置,同时创建了2个内核cache,分别用于存放struct epitem和struct eppoll_entry。
二 epoll_create函数的实现
epoll_create 创建一个epoll实例,即一个epoll的文件(epfd),同时创建并初始化一个struct eventpoll,其中epfd所对应的file的private_data指针即指向了eventpoll变量,因此,知道epfd就可以拿到file,即拿到了eventpoll变量。
下面我们来看具体实现:
SYSCALL_DEFINE1(epoll_create, int, size)
{
	/*
	 * Legacy entry point taking a single int argument. The value of
	 * size is only validated (must be positive); it is not passed on.
	 */
	if (size > 0)
		return sys_epoll_create1(0);	/* the real work is done by epoll_create1 */

	return -EINVAL;
}
/*
* Open an eventpoll file descriptor.
*/
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error;
	struct eventpoll *ep = NULL;

	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	/* EPOLL_CLOEXEC is the only flag accepted. */
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;

	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;

	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor. ep is handed to
	 * anon_inode_getfd() together with eventpoll_fops, the operations
	 * table for the new file.
	 */
	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	/* On failure nothing else references ep, so release it here. */
	if (error < 0)
		ep_free(ep);

	/* Either the new file descriptor or a negative error code. */
	return error;
}
epoll_create会用到一组关联函数:ep_alloc和ep_free,分别负责eventpoll结构体的内存分配和释放(其中ep_free只在anon_inode_getfd创建文件失败时才会被调用)
函数anon_inode_getfd创建与eventpoll结构体相对应的file结构,ep保存在file->private_data结构中,同时为该新文件定义操作函数
从这几行代码可以看出,epoll_create主要做了两件事:
* 创建并初始化一个eventpoll结构体变量
* 创建epoll的file结构,并指定file的private_data指针指向刚创建的eventpoll变量,这样,只要根据epoll文件描述符epfd就可以拿到file进而就拿到了eventpoll变量了,该eventpoll就是epoll_ctl和epoll_wait工作的场所
对外看来,epoll_create就做了一件事,那就是创建一个epoll文件,事实上,更关键的是,它创建了一个eventpoll结构体变量,该变量为epoll_ctl和epoll_wait的工作打下了基础。
ep_alloc,ep_free以及anon_inode_getfd的具体实现可以查看源代码。
三 epoll_ctl的实现
epoll_ctl系统调用主要是针对epfd所对应的epoll实例进行增、删、改fd的操作,一个新创建的epoll文件带有一个struct eventpoll结构,同时struct eventpoll这个结构上再挂一个红黑树,红黑树上的每个节点挂的都是struct epitem,这个红黑树就是每次epoll_ctl时fd存放的地方!对应该红黑树上节点的操作,有ep_find,ep_insert,ep_remove,ep_modify四个函数,它们都将epoll文件实例的eventpoll结构作为参数传递。
下面来看看该函数的具体实现:
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 *
 * epfd is the descriptor of the epoll instance, op is the requested
 * operation (EPOLL_CTL_ADD / EPOLL_CTL_DEL / EPOLL_CTL_MOD) and fd is the
 * target descriptor the operation applies to. event points to the
 * user-space struct epoll_event; it is only copied in when
 * ep_op_has_event(op) is true.
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or a negative
 * error code: -EFAULT, -EBADF, -EPERM, -EINVAL, -ELOOP, -EEXIST, -ENOENT.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
int did_lock_epmutex = 0;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
error = -EFAULT;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event))) // copy the user-supplied epoll_event into epds for kernel-side use
goto error_return;
/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd); // look up the struct file behind the epoll descriptor
if (!file)
goto error_return;
/* Get the "struct file *" for the target file */
tfile = fget(fd);
if (!tfile)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll)
goto error_tgt_fput;
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
if (file == tfile || !is_file_epoll(file))
goto error_tgt_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data;// the struct eventpoll that belongs to this epoll instance
/*
* When we insert an epoll file descriptor, inside another epoll file
* descriptor, there is the change of creating closed loops, which are
* better be handled here, than in more critical paths.
*
* We hold epmutex across the loop check and the insert in this case, in
* order to prevent two separate inserts from racing and each doing the
* insert "at the same time" such that ep_loop_check passes on both
* before either one does the insert, thereby creating a cycle.
*/
if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
mutex_lock(&epmutex);
did_lock_epmutex = 1;
error = -ELOOP;
if (ep_loop_check(ep, tfile) != 0)
goto error_tgt_fput;
}
mutex_lock(&ep->mtx);
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
* ep_find() looks up the epitem for the (tfile, fd) pair inside ep.
*/
epi = ep_find(ep, tfile, fd);
/* Default result; it is also what an unrecognized op ends up returning. */
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD: // register a new descriptor
if (!epi) { // no item for this (tfile, fd) pair yet
epds.events |= POLLERR | POLLHUP; // force error and hang-up into the interest mask, whether or not the caller asked for them
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
break;
case EPOLL_CTL_DEL: // remove
if (epi) // remove the item if it exists, otherwise report an error
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD: // modify
if (epi) { // update the events for an existing item, otherwise report an error
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}
mutex_unlock(&ep->mtx);
/* Unwind: drop epmutex if it was taken, then both file references. */
error_tgt_fput:
if (unlikely(did_lock_epmutex))
mutex_unlock(&epmutex);
fput(tfile);
error_fput:
fput(file);
error_return:
return error;
}
结合上一篇博文的内容,对于往epoll实例中添加新的套接字,其实现主要通过函数ep_insert来完成,本文先分析epoll_wait再回过头来分析ep_insert
四 epoll_wait的实现
epoll_wait等待epoll文件上的I/O事件发生,其代码如下:
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
int error;
struct file *file;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
error = -EFAULT;
goto error_return;
}
/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd);
if (!file)
goto error_return;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
if (!is_file_epoll(file))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data;//获取struct eventpoll结构
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);//核心代码
error_fput:
fput(file);
error_return:
return error;
}可以看出该函数主要时通过epfd获取对应的struct eventpoll结构,然后调用ep_poll函数,下面来看ep_poll函数的实现:
/**
* ep_poll - Retrieves ready events, and delivers them to the caller supplied
* event buffer.
*
* @ep: Pointer to the eventpoll context.
* @events: Pointer to the userspace buffer where the ready events should be
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
* milliseconds. If the @timeout is zero, the function will not block,
* while if the @timeout is less than zero, the function will block
* until at least one event has been retrieved (or an error
* occurred).
*
* Returns: Returns the number of ready events which have been fetched, or an
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
long slack = 0;
wait_queue_t wait;
ktime_t expires, *to = NULL;
/* The call waits for a maximum time of timeout milliseconds. Specifying a timeout of -1 makes epoll_wait()
wait indefinitely,while specifying a timeout equal to zero makes epoll_wait() to return immediately even if no events
are available (return code equal to zero).*/
if (timeout > 0) {
struct timespec end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
*to = timespec_to_ktime(end_time);
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation.
*/
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
goto check_events;
}
/* A negative timeout leaves "to" NULL and falls through to here. */
fetch_events:
spin_lock_irqsave(&ep->lock, flags);
// If ep_events_available() reports nothing ready, set up to wait
if (!ep_events_available(ep)) {
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
*/
// Initialize the wait queue entry so that it refers to the current task
init_waitqueue_entry(&wait, current);
// Put the entry on ep->wq first; the timed sleep itself only starts
// at the schedule_hrtimeout_range() call inside the loop below
__add_wait_queue_exclusive(&ep->wq, &wait);
for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
// About to block: mark the task as interruptible first
set_current_state(TASK_INTERRUPTIBLE);
// Core of the loop: leave as soon as events are available
// or the wait has timed out
if (ep_events_available(ep) || timed_out)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
// ep->lock is released for the duration of the sleep and retaken afterwards
spin_unlock_irqrestore(&ep->lock, flags);
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
check_events:
/* Is it worth to try to dig for events ? */
eavail = ep_events_available(ep);
spin_unlock_irqrestore(&ep->lock, flags);
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
return res;
}
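29-43行(即 if (timeout > 0) … else if (timeout == 0) 部分):主要是超时时间的处理,若超时时间为0,则直接跳到check_events检查有没有准备好的I/O事件,有则立即发送给用户空间去处理;若超时时间大于0,计算好精确的超时时间后,等待事件的发生;45-86行(fetch_events标号之后的部分)等待指定的时间直到有I/O事件出现;
54-58行(init_waitqueue_entry和__add_wait_queue_exclusive两个调用):如果还没有I/O事件出现,则准备休眠。先初始化一个等待队列项,使其对应当前线程,再把这个等待队列项挂到eventpoll结构的等待队列wq上;
60-82行(for (;;) 循环):在指定的超时时间内循环检测有没有I/O事件发生,有事件发生、超时或者收到信号都会跳出循环;
83行(__remove_wait_queue调用):运行到此处说明有I/O事件发生、已经超时或者收到了信号,不用再等待,于是把等待队列项从wq上移除。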