epoll源码解析
/*
 * Pairs a "struct file" pointer with an integer file descriptor.
 * Used as the "ffd" member of "struct epitem" to describe the file an
 * item refers to. Declared __packed.
 */
struct epoll_filefd {
struct file *file;
int fd;
} __packed;
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
union {
/*
 * RB tree node that links this structure to the eventpoll RB tree
 * (the tree root is the "rbr" member of "struct eventpoll").
 */
struct rb_node rbn;
/* Used to free the struct epitem */
struct rcu_head rcu;
};
/*
 * List header used to link this structure to the eventpoll ready list;
 * items that have become ready are chained onto "struct eventpoll"->rdllist.
 */
struct list_head rdllink;
/*
 * Works together "struct eventpoll"->ovflist in keeping the
 * single linked chain of items.
 * ep_poll_callback() pushes the item onto ep->ovflist through this
 * pointer, and only does so while it still equals EP_UNACTIVE_PTR
 * (i.e. the item is not already on that chain).
 */
struct epitem *next;
/* The file descriptor information (file + fd) this item refers to */
struct epoll_filefd ffd;
/* Number of active wait queue attached to poll operations */
int nwait;
/*
 * List containing poll wait queues.
 * NOTE(review): presumably chains "struct eppoll_entry" objects via
 * their "llink" member -- confirm against the code that populates it.
 */
struct list_head pwqlist;
/* The "container" of this item: the eventpoll instance it belongs to */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
/*
 * The structure that describe the interested events and the source fd.
 * ep_poll_callback() tests event.events against the reported poll flags.
 * NOTE(review): presumably filled in from user space via epoll_ctl() --
 * the code doing so is not visible here.
 */
struct epoll_event event;
};
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 *
 * Access to it is protected by the lock inside wq.
 */
struct eventpoll {
/*
 * This mutex is used to ensure that files are not removed
 * while epoll is using them. This is held during the event
 * collection loop, the file cleanup path, the epoll file exit
 * code and the ctl operations.
 */
struct mutex mtx;
/*
 * Wait queue used by sys_epoll_wait(): callers of epoll_wait() sleep
 * on this queue. ep_poll_callback() takes wq.lock and wakes this queue
 * with wake_up_locked() when it has waiters.
 */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/*
 * List of ready file descriptors. A non-empty list makes
 * ep_events_available() report that events are pending.
 */
struct list_head rdllist;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
 * This is a single linked list that chains all the "struct epitem" that
 * happened while transferring ready events to userspace w/out
 * holding ->wq.lock.
 * While this is not EP_UNACTIVE_PTR, ep_poll_callback() chains new
 * items here (through epitem->next) instead of adding them to rdllist.
 */
struct epitem *ovflist;
/*
 * wakeup_source used when ep_scan_ready_list is running.
 * ep_poll_callback() activates it via __pm_stay_awake() when it puts an
 * item that has its own ws onto ovflist.
 */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
/*
 * File associated with this eventpoll descriptor.
 * NOTE(review): taken from the original annotation; the assignment is
 * not visible in this part of the file -- confirm.
 */
struct file *file;
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
};
/*
 * Wait structure used by the poll hooks.
 * Each instance is one wait-queue node that ties a "struct epitem" to a
 * wait queue head of the monitored file.
 */
struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
struct list_head llink;
/* The "base" pointer is set to the container "struct epitem" */
struct epitem *base;
/*
 * Wait queue item that will be linked to the target file wait
 * queue head.
 */
wait_queue_entry_t wait;
/* The wait queue head that linked the "wait" wait queue item */
wait_queue_head_t *whead;
};
/*
 * Wrapper struct used by poll queueing.
 * Bundles a poll_table with the epitem it is being used for.
 */
struct ep_pqueue {
/*
 * The poll_table itself.
 * NOTE(review): presumably carries the queueing callback that ends up
 * being invoked by the poll machinery -- confirm against the
 * definition of poll_table.
 */
poll_table pt;
/* The epitem this poll queueing operation refers to */
struct epitem *epi;
};
/*
 * Used by the ep_send_events() function as callback private data,
 * i.e. this struct is the argument passed through to the callback.
 */
struct ep_send_events_data {
/* NOTE(review): presumably the capacity of "events" -- confirm */
int maxevents;
/* User-space (__user) pointer to struct epoll_event storage */
struct epoll_event __user *events;
/* NOTE(review): presumably the result reported back by the callback -- confirm */
int res;
};
/*
 * ep_events_available - tell whether the eventpoll has pending events.
 * @ep: eventpoll instance to inspect.
 *
 * Returns 1 when the ready list is non-empty or when ep->ovflist is not
 * EP_UNACTIVE_PTR, and 0 otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	/* Anything already queued on the ready list counts as available. */
	if (!list_empty(&ep->rdllist))
		return 1;

	/* Otherwise fall back to the state of the overflow chain. */
	return ep->ovflist != EP_UNACTIVE_PTR;
}
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);//从等待队列获取epitem
struct eventpoll *ep = epi->ep;//从epitem获取eventpoll
__poll_t pollflags = key_to_poll(key);
int ewake = 0;
spin_lock_irqsave(&ep->wq.lock, flags);
ep_set_busy_poll_napi_id(epi);
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
//判断注册的感兴趣事件
//#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
/*
* Check the events coming with the callback. At this stage, not
* every device reports the events in the "key" parameter of the
* callback. We need to be able to handle both cases here, hence the
* test for "key" != NULL before the event match test.
*/
if (pollflags && !(pollflags & epi->event.events))
goto out_unlock;
/*
* If we are transferring events to userspace, we can hold no locks
* (because we're accessing user memory, and because of linux f_op->poll()
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
//若epoll_wait已经返回,则等待下一次epoll_wait
if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
if (epi->next == EP_UNACTIVE_PTR) {
epi->next = ep->ovflist;
ep->ovflist = epi;
if (epi->ws) {
/*
* Activate ep->ws since epi->ws may get
* deactivated at any time.
*/
__pm_stay_awake(ep->ws);
}
}
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
//将fd/eptiem添加到ready list中
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake_rcu(epi);
}
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
//唤醒进程epoll_wait()的进程
if (waitqueue_active(&ep->wq)) {
if ((epi->event.events & EPOLLEXCLUSIVE) &&
!(pollflags & POLLFREE)) {
switch (pollflags & EPOLLINOUT_BITS) {
case EPOLLIN:
if (epi->event.events & EPOLLIN)
ewake = 1;
break;
case EPOLLOUT:
if (epi->event.events & EPOLLOUT)
ewake = 1;
break;
case 0:
ewake = 1;
break;
}
}
wake_up_locked(&ep->wq);
}
//唤醒file->poll()的进程