![21c620a0f54b27a2085c7a4656ee220c.png](https://img-blog.csdnimg.cn/img_convert/21c620a0f54b27a2085c7a4656ee220c.png)
- This post is best read with a basic understanding of how epoll works.
- Most comments were typed by hand and may contain errors.
## Epoll and the Virtual File System
Let's start from epoll_create. There are two epoll_create syscalls: epoll_create1 and epoll_create. With flags = 0 they behave identically; the only difference is flags = EPOLL_CLOEXEC (the counterpart of open's O_CLOEXEC). O_CLOEXEC means close-on-execve().
The man page says the main use of O_CLOEXEC is in multithreaded programs, to avoid race conditions where one thread opens a file descriptor at the same time as another thread does a fork() + execve().
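For reference, a minimal user-space sketch of the two entry points (an illustrative example, error handling trimmed):

```c
#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
    /* Legacy interface: the size argument is ignored but must be > 0. */
    int epfd_legacy = epoll_create(1);

    /* Modern interface: the fd is atomically closed across execve(). */
    int epfd = epoll_create1(EPOLL_CLOEXEC);

    if (epfd_legacy < 0 || epfd < 0)
        perror("epoll_create");
    return 0;
}
```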
```c
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    return do_epoll_create(flags);
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size <= 0)
        return -EINVAL;

    return do_epoll_create(0);
}
```
Next, let's look at do_epoll_create.
- The epoll instance's file descriptor is conventionally called epfd, a struct eventpoll is called ep, and an epitem is usually epi.
- ep_alloc allocates memory backed by the slab allocator.
- An anonymous inode is an inode with no corresponding file in any file system. It guarantees that only the creator can open this file, and once closed it can never be used again; it is typically used for temporary files.
- The reason for using a virtual file: it makes the epoll instance also pollable. This allows us to monitor epoll instances with epoll, select or even poll! (See the sketch after this list.)
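Since the epoll instance is itself a pollable file, one epoll can monitor another. A minimal user-space sketch of that (error handling omitted):

```c
#include <sys/epoll.h>

int main(void)
{
    int inner = epoll_create1(0);
    int outer = epoll_create1(0);

    /* The inner epoll fd reports EPOLLIN when it has ready events itself. */
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = inner };
    epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

    /* Polling "outer" now transitively watches everything added to "inner". */
    struct epoll_event out[8];
    epoll_wait(outer, out, 8, 0);
    return 0;
}
```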
```c
static int do_epoll_create(int flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;

    /* Check the EPOLL_* constant for consistency. */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;
    /*
     * Create the internal data structure ("struct eventpoll").
     * ep_alloc uses kzalloc; the slab allocator quickens memory allocation.
     */
    error = ep_alloc(&ep);
    if (error < 0)
        return error;
    /*
     * Creates all the items needed to setup an eventpoll file: get an
     * unused file descriptor and an anonymous inode file, then use
     * fd_install to bind them together.
     * The file operations set are poll, release, and llseek.
     * release closes the file and llseek repositions the offset of the
     * open file. See eventpoll_fops below.
     */
    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                              O_RDWR | (flags & O_CLOEXEC));
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    ep->file = file;
    fd_install(fd, file);
    return fd;

out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}
```
epoll's file operations are defined as follows:
```c
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
    .show_fdinfo = ep_show_fdinfo,
#endif
    .release     = ep_eventpoll_release,
    .poll        = ep_eventpoll_poll,
    .llseek      = noop_llseek,
};

/* release closes the file; llseek repositions the offset of the open file. */
__poll_t (*poll) (struct file *, struct poll_table_struct *);
int (*release) (struct inode *, struct file *);
loff_t (*llseek) (struct file *, loff_t, int);
```
## The Red-Black Tree in Epoll
struct eventpoll is fairly complex. For now, remember that it contains an rbtree root, an rdllist (ready list), an ovflist (overflow list for ready events), a pointer to the epfd's file, and two wait queues: poll_wait, the wait queue of the epoll instance itself as a virtual file, and wq, the wait queue of the epoll_wait syscall, i.e. the queue of all waiting processes.
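A trimmed excerpt of the definition (from fs/eventpoll.c around the kernel version this post reads; several fields omitted):

```c
struct eventpoll {
    /* Wait queue used by sys_epoll_wait() */
    wait_queue_head_t wq;

    /* Wait queue used by file->poll() */
    wait_queue_head_t poll_wait;

    /* List of ready file descriptors */
    struct list_head rdllist;

    /* RB tree root used to store monitored fd structs */
    struct rb_root_cached rbr;

    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock.
     */
    struct epitem *ovflist;

    /* The struct file exposing this eventpoll instance */
    struct file *file;

    /* ... */
};
```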
epoll manages the monitored file descriptors with a red-black tree. The tree's Key is a struct epoll_filefd and its Value is a struct epitem. Keys are compared with plain pointer arithmetic: compare the file addresses first, then the fd values. (fdget could compute the file pointer from the fd value, but comparing this way is obviously faster.) epitems are also allocated from a slab cache; ep_insert uses kmem_cache_alloc.
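The slab caches involved, as declared in fs/eventpoll.c, along with the allocation in ep_insert:

```c
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/* In ep_insert(): */
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
    return -ENOMEM;
```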
```c
struct epoll_filefd {
    struct file *file;    /* struct file is defined in <linux/fs.h> */
    int fd;
} __packed;
```
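The key helpers in fs/eventpoll.c set up and compare keys exactly as described, file pointer first, then fd:

```c
/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
    ffd->file = file;
    ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
    return (p1->file > p2->file ? +1:
            (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}
```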
ep_find is a textbook red-black tree lookup.
```c
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
    int kcmp;
    struct rb_node *rbp;
    struct epitem *epi, *epir = NULL;
    struct epoll_filefd ffd;

    ep_set_ffd(&ffd, file, fd);
    for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
        epi = rb_entry(rbp, struct epitem, rbn);
        kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
        if (kcmp > 0)
            rbp = rbp->rb_right;
        else if (kcmp < 0)
            rbp = rbp->rb_left;
        else {
            epir = epi;
            break;
        }
    }

    return epir;
}
```
```c
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
    union {
        /* RB tree node links this structure to the eventpoll RB tree.
         * epitems are the Values of the KV pairs stored in the red-black tree.
         */
        struct rb_node rbn;
        /* Used to free the struct epitem.
         * RCU = Read-Copy-Update, the scheme used to protect the rbtree nodes.
         */
        struct rcu_head rcu;
    };

    /* List header used to link this structure to the eventpoll ready list,
     * "struct eventpoll"->rdllist.
     */
    struct list_head rdllink;

    /*
     * Works together with "struct eventpoll"->ovflist in keeping the
     * single linked chain of items. ovflist (overflow list) is used while
     * rdllist (the ready list) is being copied from kernel space to user
     * space by epoll_wait.
     */
    struct epitem *next;

    /* The file descriptor information this item refers to.
     * The rbtree Key; contains file* and fd. See struct epoll_filefd above.
     */
    struct epoll_filefd ffd;

    /* Number of active wait queues attached to poll operations */
    int nwait;

    /* List containing poll wait queues.
     * This wait queue is the standard wait queue defined in <linux/wait.h>.
     * Items on the pwqlist have type struct eppoll_entry: a glue struct
     * that links an epitem to its wait_queue_entry and wait_queue_head
     * (the head of the target file's wait queue). Given a wait_queue_entry
     * we can recover its epitem and eppoll_entry with pointer arithmetic.
     * Normally, though, a single epitem has only one eppoll_entry linked to
     * one wait queue of one file, so a linked list may seem odd here.
     */
    struct list_head pwqlist;

    /* The "container" of this item:
     * the epoll instance, the core of this piece of code.
     */
    struct eventpoll *ep;

    /* List header used to link this item to the "struct file" items list.
     * Again, this normally contains only one item.
     */
    struct list_head fllink;

    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source __rcu *ws;

    /* The structure that describes the interested events and the source fd.
     * One of the input structs of the epoll system calls.
     */
    struct epoll_event event;
};
```
The red-black tree is protected with RCU (Read-Copy-Update): readers need no lock to access the data, while a writer updates a copy and then has a callback swap the pointer over to the new version. With many readers this improves performance significantly; with many writers it can become a problem. RCU guarantees that readers are never blocked; except across a context switch, they can read lock-free at any time. The writer writes a copy and the callback swaps the pointer at a suitable moment. The period spent waiting for that moment is called the grace period; a CPU performing a context switch is said to pass through a quiescent state, and CPUs report their state, so the grace period is simply the time it takes for all CPUs to pass through a quiescent state. The "garbage collector" invokes the writer's callback once the grace period has elapsed.
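A generic sketch of the RCU pattern (not epoll-specific; struct foo, reader and updater are illustrative names):

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo { int a; };
static struct foo __rcu *gp;            /* shared, RCU-protected pointer */

/* Reader side: never blocks, takes no lock */
static int reader(void)
{
    struct foo *p;
    int val = 0;

    rcu_read_lock();
    p = rcu_dereference(gp);            /* safely fetch the shared pointer */
    if (p)
        val = p->a;
    rcu_read_unlock();
    return val;
}

/* Writer side: copy, update, publish, reclaim after a grace period */
static void updater(int v)
{
    struct foo *old = rcu_dereference_protected(gp, 1); /* sole updater */
    struct foo *new = kmalloc(sizeof(*new), GFP_KERNEL);

    new->a = v;
    rcu_assign_pointer(gp, new);        /* readers now see the new copy */
    synchronize_rcu();                  /* wait out the grace period    */
    kfree(old);                         /* no reader can still hold old */
}
```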
## Epoll's Callback Function
The core of epoll is a callback function. A wait_queue_entry wrapped around this function sits on the monitored file's wait queue; when it fires it can wake up the other threads on that wait queue, and it links its corresponding epitem onto the ready list (via epi->rdllink). The epoll_wait syscall then only needs to look at the ready list to obtain all the information about ready files (the epitem pointer is recovered from rdllink with pointer arithmetic).
wait_queue is defined in <linux/wait.h>. Below is the struct that sits on a wait queue; wait_queue_func_t func is the callback, i.e. the entry's wake-up function, and this is where epoll registers ep_poll_callback.
```c
struct wait_queue_entry {
    unsigned int      flags;
    void              *private;
    wait_queue_func_t func;
    struct list_head  entry;
};

/* wait_queue_func_t func is the wake-up function */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
```
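The helper that plugs a callback into such an entry is defined alongside it in <linux/wait.h>:

```c
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
    wq_entry->flags   = 0;
    wq_entry->private = NULL;
    wq_entry->func    = func;
}
```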
There is one complication: different file types keep their wait_queue_head in different places. Hence the glue struct eppoll_entry, which bundles the wait queue machinery together with the corresponding epitem.
```c
struct eppoll_entry {
    /* List header used to link this structure to the "struct epitem",
     * via its struct list_head pwqlist.
     */
    struct list_head llink;

    /* The "base" pointer is set to the container "struct epitem" */
    struct epitem *base;

    /*
     * Wait queue entry that will be linked to the target file's wait
     * queue. It carries the wake-up function pointer.
     */
    wait_queue_entry_t wait;

    /* The wait queue head that links the "wait" wait queue item.
     * It is stored in different places depending on the particular
     * file implementation; taking tcp as an example, it is sk_wq,
     * stored inside struct sock.
     */
    wait_queue_head_t *whead;
};
```
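The "pointer arithmetic" mentioned above is container_of(); fs/eventpoll.c defines small helpers to hop between these structs:

```c
/* Get the "struct epitem" from a wait queue entry, via its eppoll_entry */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
    return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct eppoll_entry" from a wait queue entry */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
    return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from an epoll queue wrapper (struct ep_pqueue) */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
    return container_of(p, struct ep_pqueue, pt)->epi;
}
```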
So how does epoll put the callback onto a file's wait queue? We can start from where ep_insert registers the callback; the relevant lines are:
```c
struct ep_pqueue epq;
...
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/* ep_item_poll calls the corresponding file's poll() implementation */
revents = ep_item_poll(epi, &epq.pt, 1);
...
```
Below are the definitions of ep_pqueue and poll_table.
```c
/* Glue struct to connect a poll_table with its epitem */
struct ep_pqueue {
    poll_table pt;
    struct epitem *epi;
};

typedef struct poll_table_struct {
    poll_queue_proc _qproc;
    __poll_t _key;
} poll_table;

/*
 * structures and helpers for f_op->poll implementations
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt);

unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait);
```
poll_queue_proc, defined in <linux/poll.h>, is the type of the _qproc hook that a file's f_op->poll invokes through poll_wait(). f_op->poll differs per file type: for a tcp socket it is tcp_poll(), and epoll installs ep_ptable_queue_proc as the _qproc it passes down. _key is the event mask. ep_pqueue is a glue struct connecting a poll_table with its epitem.
```c
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    /* Pointer arithmetic via the glue struct ep_pqueue */
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        /* Initialize wait_queue_entry->func to ep_poll_callback and add
         * the wait_queue_entry to the file's wait queue, i.e. register
         * ep_poll_callback() as the wait queue entry's wakeup function.
         */
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        /* As stated above, nwait is normally 1 */
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}
```
The function that actually ends up on the file's wait queue is ep_poll_callback: it links the epitem onto the ready list and wakes up the waiters.
```c
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    /* Use the glue struct eppoll_entry to get the epitem */
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;
    __poll_t pollflags = key_to_poll(key);
    unsigned long flags;
    int ewake = 0;

    read_lock_irqsave(&ep->lock, flags);

    ep_set_busy_poll_napi_id(epi);

    /*
     * If the event mask does not contain any poll(2) event, we consider the
     * descriptor to be disabled. This condition is likely the effect of the
     * EPOLLONESHOT bit that disables the descriptor when an event is received,
     * until the next EPOLL_CTL_MOD will be issued.
     */
    if (!(epi->event.events & ~EP_PRIVATE_BITS))
        goto out_unlock;

    /*
     * Check the events coming with the callback. We had earlier set _key to
     * ~(__poll_t)0 to listen to all events, but at this stage, not
     * every device reports the events in the "key" parameter of the
     * callback. We need to be able to handle both cases here, hence the
     * test for "key" != NULL before the event match test.
     */
    if (pollflags && !(pollflags & epi->event.events))
        goto out_unlock;

    /*
     * If we are transferring events to userspace, which makes rdllist
     * unusable (via the epoll_wait() syscall), we can hold no locks
     * (because we read user space memory, and because of linux f_op->poll()
     * semantics). All the events that happen during that period of time are
     * chained in ep->ovflist, the overflow list, and requeued onto rdllist
     * later, once the transfer has finished.
     */
    if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
        if (epi->next == EP_UNACTIVE_PTR &&
            chain_epi_lockless(epi))
            ep_pm_stay_awake_rcu(epi);
        goto out_unlock;
    }

    /* If this file is already in the ready list we exit soon; this happens
     * when the user does not call epoll_wait() for a long time. Else, we add it.
     */
    if (!ep_is_linked(epi) &&
        list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
        ep_pm_stay_awake_rcu(epi);
    }

    /*
     * Wake up (if active, i.e. not empty) both the eventpoll wait list and
     * the ->poll() wait list.
     */
    if (waitqueue_active(&ep->wq)) {
        if ((epi->event.events & EPOLLEXCLUSIVE) &&
            !(pollflags & POLLFREE)) {
            switch (pollflags & EPOLLINOUT_BITS) {
            case EPOLLIN:
                if (epi->event.events & EPOLLIN)
                    ewake = 1;
                break;
            case EPOLLOUT:
                if (epi->event.events & EPOLLOUT)
                    ewake = 1;
                break;
            case 0:
                ewake = 1;
                break;
            }
        }
        /* Wait queue used by the epoll_wait() syscall */
        wake_up(&ep->wq);
    }
    /* Wait queue of epoll itself as a virtual file; pwake defers the
     * wakeup until after the lock is released. */
    if (waitqueue_active(&ep->poll_wait))
        pwake++;

out_unlock:
    read_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    if (!(epi->event.events & EPOLLEXCLUSIVE))
        ewake = 1;

    if (pollflags & POLLFREE) {
        /*
         * If we race with ep_remove_wait_queue() it can miss
         * ->whead = NULL and do another remove_wait_queue() after
         * us, so we can't use __remove_wait_queue().
         */
        list_del_init(&wait->entry);
        /*
         * ->whead != NULL protects us from the race with ep_free()
         * or ep_remove(), ep_remove_wait_queue() takes whead->lock
         * held by the caller. Once we nullify it, nothing protects
         * ep/epi or even wait.
         */
        smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
    }

    return ewake;
}
```
ep_item_poll performs vfs_poll on the epitem's file, i.e. calls the file type's own implementation of f_op->poll, such as tcp_poll for a tcp socket.
```c
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
                             int depth)
{
    struct eventpoll *ep;
    bool locked;

    pt->_key = epi->event.events;
    if (!is_file_epoll(epi->ffd.file))
        /* If it is not an epoll file, vfs_poll calls the file's own
         * f_op->poll, e.g. tcp_poll.
         */
        return vfs_poll(epi->ffd.file, pt) & epi->event.events;

    ep = epi->ffd.file->private_data;
    /* If the file is an epoll instance, do another level of poll_wait,
     * calling ep_ptable_queue_proc again. */
    poll_wait(epi->ffd.file, &ep->poll_wait, pt);
    locked = pt && (pt->_qproc == ep_ptable_queue_proc);

    return ep_scan_ready_list(epi->ffd.file->private_data,
                              ep_read_events_proc, &depth, depth,
                              locked) & epi->event.events;
}

static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
    if (unlikely(!file->f_op->poll))
        return DEFAULT_POLLMASK;
    return file->f_op->poll(file, pt);
}

static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
    if (p && p->_qproc && wait_address)
        p->_qproc(filp, wait_address, p);
}
```
The two functions below are defined in <net/sock.h> and <net/ipv4/tcp.c>.
```c
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    ...
    /* "wait" is the poll_table passed in by ep_item_poll */
    struct sock *sk = sock->sk;
    const struct tcp_sock *tp = tcp_sk(sk);

    sock_poll_wait(file, sock, wait);
    ...
}

static inline void sock_poll_wait(struct file *filp, struct socket *sock,
                                  poll_table *p)
{
    if (!poll_does_not_wait(p)) {
        poll_wait(filp, &sock->wq->wait, p);
        /* We need to be sure we are in sync with the
         * socket flags modification.
         *
         * This memory barrier is paired in the wq_has_sleeper.
         */
        smp_mb();
    }
}
```
As you can see, different files' f_op->poll implementations differ only in how they locate the wait_queue_head. Take the epoll file and the tcp socket above: we already know that ep->poll_wait is the epoll file's wait_queue_head, whereas for tcp we need sock_poll_wait to tell us that sock->wq->wait is the tcp socket file's wait_queue_head. Once the wait_queue_head pointer is found, poll_wait can be called directly, which is just an ordinary call to ep_ptable_queue_proc: it puts our wait_queue_entry onto the file's wait queue and registers ep_poll_callback as the entry's wake-up function.
```c
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
```
## ET and LT
With LT (level-triggered), epoll_wait puts the epitem back onto the rdllist before returning, so that the next epoll_wait call can check again whether unhandled events remain; as long as some remain, the event keeps being reported. With ET (edge-triggered), an event that fires is reported by epoll_wait only once. The kernel branch that implements this is excerpted below.
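The re-queueing happens in ep_send_events_proc (fs/eventpoll.c, trimmed to the relevant branch), right after one event has been copied to user space:

```c
/* Inside ep_send_events_proc(), after __put_user() copied the event: */
else if (!(epi->event.events & EPOLLET)) {
    /*
     * If this file has been added with Level
     * Trigger mode, we need to insert back inside
     * the ready list, so that the next call to
     * epoll_wait() will check again the events
     * availability.
     */
    list_add_tail(&epi->rdllink, &ep->rdllist);
    ep_pm_stay_awake(epi);
}
```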
### LT Thundering Herd
After copying data to user space, if the rdllist soon holds unhandled epitems again, all processes on ep->wq and ep->poll_wait are woken up.
```c
if (!list_empty(&ep->rdllist)) {
    /*
     * Wake up (if active) both the eventpoll wait list and
     * the ->poll() wait list (delayed after we release the lock).
     */
    if (waitqueue_active(&ep->wq))
        wake_up(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
        pwake++;
}
```