本文将选择epoll源码中的部分核心函数进行注释和分析,可以帮助更好地学习和理解epoll。
/*
epoll内部有三类核心的数据结构,分别是epoll_entry存储等待队列,当等待队列的头ready后,会依次通知等待队列中的其他成员调用回调函数;epitem表示一个正在监听的epollfd,使用红黑树存储,当有事件发生时,会移到就绪队列中,后续再移回用户程序;eventpoll,与eventfd对应,在内核中存储所有相关的epollfd信息,包含红黑树的fd存储,等待队列和就绪队列的存储以及相关操作的互斥锁等。
*/
/* Wait structure used by the poll hooks */
struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
struct eppoll_entry *next;
/* The "base" pointer is set to the container "struct epitem" */
//每个队列对应的fd
struct epitem *base;
/*
* Wait queue item that will be linked to the target file wait
* queue head.
*/
//等待队列的每一个结点,等待头结点ready
wait_queue_entry_t wait;
/* The wait queue head that linked the "wait" wait queue item */
//等待队列的头结点,当它ready会触发整个队列的回调函数
wait_queue_head_t *whead;
};
//每一个epitem对应一个正在监听的fd
struct epitem {
//红黑树结点,在内核中使用红黑树存储
union {
/* RB tree node links this structure to the eventpoll RB tree */
struct rb_node rbn;
/* Used to free the struct epitem */
struct rcu_head rcu;
};
/* List header used to link this structure to the eventpoll ready list */
//如果该fd已经ready会连接至eventpoll中的就绪队列中
struct list_head rdllink;
/*
* Works together "struct eventpoll"->ovflist in keeping the
* single linked chain of items.
*/
struct epitem *next;
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
/*
* Protected by file->f_lock, true for to-be-released epitem already
* removed from the "struct file" items list; together with
* eventpoll->refcount orchestrates "struct eventpoll" disposal
*/
bool dying;
/* List containing poll wait queues */
struct eppoll_entry *pwqlist;
/* The "container" of this item */
//属于哪一个eventpoll
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct hlist_node fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
/* The structure that describe the interested events and the source fd */
//表示相关的eventpoll,可以从用户态使用epoll_ctl()进行修改
struct epoll_event event;
};
//这是epoll最核心也是最主要的数据结构
struct eventpoll {
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
/*互斥锁:对fd操作和epoll_wait()等都会触发,保证线程安全,所以epoll的操作实际上是线程安全的*/
struct mutex mtx;
/* Wait queue used by sys_epoll_wait() */
//等待队列,当使用epoll_wait()时,调用该队列
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* List of ready file descriptors */
//已经ready的fd
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
//读写操作的锁
rwlock_t lock;
/* RB tree root used to store monitored fd structs */
//存储fd的红黑树结构的根节点,CRUD操作都是从根节点开始的
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
* holding ->lock.
*/
struct epitem *ovflist;
/* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
//用户变量,比如设置的最大fd上限等
struct user_struct *user;
struct file *file;
/* used to optimize loop detection check */
u64 gen;
struct hlist_head refs;
/*
* usage count, used together with epitem->dying to
* orchestrate the disposal of this struct
*/
refcount_t refcount;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* tracks wakeup nests for lockdep validation */
u8 nests;
#endif
};
//下面我们来看epoll最核心的三个函数,epoll_creat(), epoll_ctl(), epoll_wait()
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
return do_epoll_create(flags);
}
SYSCALL_DEFINE1(epoll_create, int, size)
{
//size就是我们epoll_create(1)中的参数,必须大于0,该参数没有实际意义
if (size <= 0)
return -EINVAL;
return do_epoll_create(0);
}
//这个是epoll_create的本体
static int do_epoll_create(int flags)
{
//返回一个fd代表该epoll
int error, fd;
//epoll的主要结构
struct eventpoll *ep = NULL;
//创建一个文件,这是每个epoll会存在一个真实的内核的文件与其对应并存储和操作
struct file *file;
/* Check the EPOLL_* constant for consistency. */
//应该是可持久化检测...
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
//标志位检测
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
/*
* Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep);
if (error < 0)
return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
//获得一个未使用的fd标志
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
error = fd;
goto out_free_ep;
}
//为file实现eventpoll的操作,通过文件指针指向eventpoll的各种操作
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
//eventfd指向改文件,可以通过改文件进行读写操作
ep->file = file;
fd_install(fd, file);
return fd;
out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_clear_and_put(ep);
return error;
}
//以下为epoll_ctl函数的实现
/*epoll的controller操作,向epoll内添加fd也就是epfd,op类型有三种分别是:ADD, MOD, DEL,fd表示需要操作的文件描述符,event表示需要监听的event。*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
struct epoll_event epds;
//将eventpoll从用户态拷贝到内核态,这里可以看出没有使用mmap结构,而是每一次都要copy
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
return -EFAULT;
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
//epoll_ctl的本体
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
//获取epollfd对应的file,在epoll_create()中我们已经创建的那个
error = -EBADF;
f = fdget(epfd);
if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
//获取监听fd的file
tf = fdget(fd);
if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
//判断该文件是否支持poll_operations
error = -EPERM;
if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
//判断监听的fd是否是自己,不能监听自己
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
* epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
//先判断是否是EPOLLEXCLUSIVE独占状态,该状态只能进行ADD,不能进行MOD
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
//从各个的epoll_create()创建中获取file中的数据
ep = f.file->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
* descriptor, there is the chance of creating closed loops, which are
* better be handled here, than in more critical paths. While we are
* checking for loops we also determine the list of files reachable
* and hang them on the tfile_check_list, so we can check that we
* haven't created too many possible wakeup paths.
*
* We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
* the epoll file descriptor is attaching directly to a wakeup source,
* unless the epoll file descriptor is nested. The purpose of taking the
* 'epnested_mutex' on add is to prevent complex toplogies such as loops and
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
//修改数据结构中的内容,需要加锁
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
//这一部分来判断fd是否会发生闭环,即自己监听自己并确定可达的文件列表,同时根据文件是否嵌套来判断使用全局锁还是不用。
if (op == EPOLL_CTL_ADD) {
if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
}
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
}
}
/*
* Try to lookup the file inside our RB tree. Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
//先进行ADD操作判断,首先需要查找是否已经存在相同的ADD
epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
//没有存在相同更多fd就添加至epollevent
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
break;
case EPOLL_CTL_DEL:
if (epi) {
/*
* The eventpoll itself is still alive: the refcount
* can't go to zero here.
*/
//删除操作
ep_remove_safe(ep, epi);
error = 0;
} else {
//没有该节点则不删除返回错误
error = -ENOENT;
}
break;
case EPOLL_CTL_MOD:
if (epi) {
//找到则对应标志进行修改
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
//没有找到同理返回错误
error = -ENOENT;
break;
}
//操作之后进行解锁
mutex_unlock(&ep->mtx);
error_tgt_fput:
if (full_check) {
clear_tfile_check_list();
loop_check_gen++;
mutex_unlock(&epnested_mutex);
}
fdput(tf);
error_fput:
fdput(f);
error_return:
return error;
}
//以下是epoll_wait()函数的实现
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
struct timespec64 to;
return do_epoll_wait(epfd, events, maxevents,
ep_timeout_to_timespec(&to, timeout));
}
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
int error;
struct fd f;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
/*内核防止从用户直接读取数据,所以每次都是检测是否可写,内核不允许用户执行该操作,必须每次通过copy的形式进行数据互传*/
if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
//获取fd的file
f = fdget(epfd);
if (!f.file)
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
//检测是否是epoll的file
if (!is_file_epoll(f.file))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
//同epoll_ctl()获取file中的数据
ep = f.file->private_data;
/* Time to fish for events ... */
//将执行epoll_wait()的进程带入睡眠状态
error = ep_poll(ep, events, maxevents, to);
error_fput:
fdput(f);
return error;
}
//用于获取就绪的事件,即那些符合 epoll 监控条件且发生了相应活动的文件描述符的事件
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout)
{
int res, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled();
//时间转换,实际上就是设置阻塞时间
if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
slack = select_estimate_accuracy(timeout);
to = &expires;
*to = timespec64_to_ktime(*timeout);
} else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation.
*/
timed_out = 1;
}
/*
* This call is racy: We may or may not see events that are being added
* to the ready list under the lock (e.g., in IRQ callbacks). For cases
* with a non-zero timeout, this thread will check the ready list under
* lock and will add to the wait queue. For cases with a zero
* timeout, the user by definition should not care and will have to
* recheck again.
*/
eavail = ep_events_available(ep);
while (1) {
if (eavail) {
/*
* Try to transfer events to user space. In case we get
* 0 events and there's still timeout left over, we go
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
if (res)
return res;
}
if (timed_out)
return 0;
eavail = ep_busy_loop(ep, timed_out);
if (eavail)
continue;
if (signal_pending(current))
return -EINTR;
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
* wakeup. Why it is important? In case of several waiters
* each new wakeup will hit the next waiter, giving it the
* chance to harvest new event. Otherwise wakeup can be
* lost. This is also good performance-wise, because on
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
*
* In fact, we now use an even more aggressive function that
* unconditionally removes, because we don't reuse the wait
* entry between loop iterations. This lets us also avoid the
* performance issue if a process is killed, causing all of its
* threads to wake up without being removed normally.
*/
//如果进行到这一步表示ready list不为空,唤醒,开始回调
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
write_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
* is safe to avoid an explicit barrier.
*/
__set_current_state(TASK_INTERRUPTIBLE);
/*
* Do the final check under the lock. ep_start/done_scan()
* plays with two lists (->rdllist and ->ovflist) and there
* is always a race when both lists are empty for short
* period of time although events are pending, so lock is
* important.
*/
eavail = ep_events_available(ep);
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !schedule_hrtimeout_range(to, slack,
HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
* We were woken up, thus go and try to harvest some events.
* If timed out and still on the wait queue, recheck eavail
* carefully under lock, below.
*/
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
* timeout expired before it could reacquire the lock.
* Thus, when wait.entry is empty, it needs to harvest
* events.
*/
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
}
}