How Epoll Works: Notes from Reading the Epoll Source

  • This article is best read with a basic understanding of how epoll is implemented.
  • Most of the comments were typed by hand and may contain errors.

Epoll and the Virtual File System

We start with epoll_create. There are two epoll_create syscalls: epoll_create1 and epoll_create behave identically when flag = 0; the only difference is that epoll_create1 accepts flag = EPOLL_CLOEXEC (the counterpart of open's O_CLOEXEC). O_CLOEXEC means close on execve().

The man page says the main use of O_CLOEXEC is, in multithreaded programs, to "avoid race conditions where one thread opens a file descriptor at the same time as another thread does a fork() + execve()."
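For illustration, a userspace sketch (my example, not from the kernel source) that creates an epoll instance with close-on-exec set atomically; the kernel entry points of both syscalls follow below.

#include <sys/epoll.h>
#include <stdio.h>

int main(void)
{
	/* EPOLL_CLOEXEC sets FD_CLOEXEC at creation time, leaving no race
	 * window between epoll_create() and a separate fcntl(F_SETFD). */
	int epfd = epoll_create1(EPOLL_CLOEXEC);

	if (epfd == -1) {
		perror("epoll_create1");
		return 1;
	}
	return 0;
}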

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
  return do_epoll_create(flags);
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
  if (size <= 0)
    return -EINVAL;

  return do_epoll_create(0);
}

Next we look at do_epoll_create.
  • The file descriptor registered by epoll is usually called epfd, a struct eventpoll is called ep, and an epitem is usually epi.
  • ep_alloc allocates memory from a slab cache.
  • An anonymous inode is an inode without a corresponding file in any filesystem; it guarantees that only the creator can open this file and that it cannot be used again once closed, so it is normally used for temporary files.
  • The reason for using a virtual file: it makes the epoll instance also pollable. This allows us to monitor epoll instances with epoll, select or even poll! (See the userspace sketch after do_epoll_create below.)
static int do_epoll_create(int flags)
{
  int error, fd;
  struct eventpoll *ep = NULL;
  struct file *file;

  /* Check the EPOLL_* constant for consistency.  */
  BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

  if (flags & ~EPOLL_CLOEXEC)
    return -EINVAL;
  /*
   * Create the internal data structure ("struct eventpoll").
   * ep_alloc uses kzalloc; the slab cache speeds up allocation.
   */
  error = ep_alloc(&ep);
  if (error < 0)
    return error;
  /*
   * Create all the items needed to set up an eventpoll file:
   * get an unused file descriptor and an anonymous inode file, and use
   * fd_install to bind them together.
   * Set the file operations: poll, release, llseek.
   * release closes the file; llseek repositions the offset of the open file.
   * See eventpoll_fops below.
   */
  fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
  if (fd < 0) {
    error = fd;
    goto out_free_ep;
  }
  file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
         O_RDWR | (flags & O_CLOEXEC));
  if (IS_ERR(file)) {
    error = PTR_ERR(file);
    goto out_free_fd;
  }
  ep->file = file;
  fd_install(fd, file);
  return fd;

out_free_fd:
  put_unused_fd(fd);
out_free_ep:
  ep_free(ep);
  return error;
}
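Since the epoll instance is itself a pollable file, one epoll can monitor another. A minimal userspace sketch (mine, not from the kernel source; error handling trimmed):

#include <sys/epoll.h>
#include <stdio.h>

int main(void)
{
	int inner = epoll_create1(0);
	int outer = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = inner };

	/* The inner epoll fd is backed by the anonymous-inode file created
	 * by do_epoll_create above, so it can be added to another epoll. */
	if (epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev) == -1) {
		perror("epoll_ctl");
		return 1;
	}
	return 0;
}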

The file operations of epoll are defined as follows:

static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
  .show_fdinfo  = ep_show_fdinfo,
#endif
  .release  = ep_eventpoll_release,
  .poll   = ep_eventpoll_poll,
  .llseek   = noop_llseek,
};
/* release closes the file; llseek repositions the offset of the open file. */
__poll_t (*poll) (struct file *, struct poll_table_struct *);
int (*release) (struct inode *, struct file *);
loff_t (*llseek) (struct file *, loff_t, int);

The Red-Black Tree in Epoll

struct eventpoll is fairly complex. For now, remember that it contains an rbtree root, an rdllist ready list, an ovflist overflow list for events that become ready while the ready list is in use, a pointer to the epoll file, and two wait queues: poll_wait, the wait queue of the epoll instance itself as a virtual file, and wq, the wait queue of the epoll_wait syscall, i.e. the queue of all waiting processes.
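For reference, an abridged struct eventpoll (based on fs/eventpoll.c of roughly this kernel generation; fields trimmed and comments condensed):

struct eventpoll {
	/* Protects this structure from epoll_ctl()/epoll_wait() races */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll(): epoll itself as a file */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store the monitored fds (epitems) */
	struct rb_root_cached rbr;

	/* Chains ready epitems that arrive while rdllist is being
	 * transferred to user space */
	struct epitem *ovflist;

	/* The file backing this epoll instance */
	struct file *file;
	/* ... */
};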

epoll manages the monitored file descriptors with a red-black tree. The key of the tree is a struct epoll_filefd, and the value is a struct epitem. Keys are compared directly: first by the file pointer's address, then by the fd value. (fdget could compute the file pointer from the fd number, but comparing this way is obviously faster.) epi is also allocated from a slab cache; ep_insert uses kmem_cache_alloc.

struct epoll_filefd {
  struct file *file; // file defined in <linux/fs.h>
  int fd;
} __packed;
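The key setup and comparison helpers used by ep_find below are tiny (as found in fs/eventpoll.c); note that the comparison orders by the file pointer's address first and only falls back to the fd value on a tie:

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}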

ep_find is a textbook red-black tree lookup.

static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
  int kcmp;
  struct rb_node *rbp;
  struct epitem *epi, *epir = NULL;
  struct epoll_filefd ffd;

  ep_set_ffd(&ffd, file, fd);
  for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
    epi = rb_entry(rbp, struct epitem, rbn);
    kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
    if (kcmp > 0)
      rbp = rbp->rb_right;
    else if (kcmp < 0)
      rbp = rbp->rb_left;
    else {
      epir = epi;
      break;
    }
  }

  return epir;
}
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
  union {
    /* RB tree node links this structure to the eventpoll RB tree.
    * epitems are the values of the KV pairs stored in the RB tree.
    */
    struct rb_node rbn;
    /* Used to free the struct epitem.
    * RCU = Read-Copy-Update; the rcu_head lets the epitem be freed
    * safely after a grace period.
    */
    struct rcu_head rcu;
  };

  /* List header used to link this structure to the eventpoll ready list
  * "struct eventpoll"->rdllist
  */
  struct list_head rdllink;

  /*
   * Works together with "struct eventpoll"->ovflist in keeping the
   * single linked chain of items. ovflist (the overflow list) is used
   * while epoll_wait is copying rdllist (the ready list) from kernel
   * space to user space.
   */
  struct epitem *next;

  /* The file descriptor information this item refers to.
  * RB tree key.
  * Contains file* and fd. See the struct epoll_filefd definition above.
  */
  struct epoll_filefd ffd;

  /* Number of active wait queues attached to poll operations */
  int nwait;

  /* List containing poll wait queues.
  * This is the wait queue the Linux kernel defines in <linux/wait.h>.
  * Items on the pwqlist have type struct eppoll_entry.
  * It is a glue struct that links an epitem to its wait_queue_entry and
  * wait_queue_head (the head of the target file's wait queue). Given a
  * wait_queue_entry we can compute its eppoll_entry and epitem with
  * pointer arithmetic.
  * However, a single epitem normally has only one eppoll_entry linked to
  * one wait queue waiting on one file, so a full list may seem odd.
  */
  struct list_head pwqlist;

  /* The "container" of this item:
  * the epoll instance, the core of this piece of code.
  */
  struct eventpoll *ep;

  /* List header used to link this item to the "struct file" items list.
  * Again, this normally contains only one item.
  */
  struct list_head fllink;

  /* wakeup_source used when EPOLLWAKEUP is set */
  struct wakeup_source __rcu *ws;

  /* The structure that describes the interested events and the source fd.
  * One of the input structs of the epoll system calls.
  */
  struct epoll_event event;
};

The red-black tree nodes are protected with RCU: readers can access them without taking a lock, while a writer makes a copy and, via a callback, later swaps the pointer over to the new version. With many readers this improves performance significantly; with many writers it can become a problem. RCU guarantees that readers never block: they can read lock-free at any point other than across a context switch. The writer writes to a copy, and the pointer is replaced by a callback at a suitable moment. The waiting period before that moment is called the grace period. A CPU passing through a context switch is said to pass a quiescent state, and CPUs report their state; the grace period is simply the time it takes for every CPU to pass through a quiescent state. The garbage collector invokes the writer's callback after the grace period ends.
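A minimal kernel-style sketch of that reader/writer pattern (hypothetical struct cfg, names mine; error handling omitted and not part of eventpoll):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	int val;
};

static struct cfg __rcu *active_cfg;

/* Reader: takes no lock, only marks a read-side critical section. */
static int cfg_read(void)
{
	int v;

	rcu_read_lock();
	v = rcu_dereference(active_cfg)->val;
	rcu_read_unlock();
	return v;
}

/* Writer: publish a copy, wait out a grace period, free the old copy. */
static void cfg_update(int v)
{
	struct cfg *newc = kmalloc(sizeof(*newc), GFP_KERNEL);
	struct cfg *oldc;

	newc->val = v;
	oldc = rcu_dereference_protected(active_cfg, 1);
	rcu_assign_pointer(active_cfg, newc);
	synchronize_rcu();	/* all pre-existing readers have finished */
	kfree(oldc);
}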

Epoll's Callback Function

A core piece of epoll is its callback function. A wait_queue_entry built around this function is queued on the monitored file's wait queue; when the file becomes ready, the callback can wake the other waiters and links the corresponding epitem onto the ready list (via epi->rdllink). That way, the epoll_wait syscall only needs to look at the ready list to get all the information about ready files (the epitem pointer is recovered from rdllink with pointer arithmetic).

The wait queue is defined in <linux/wait.h>. Below is the struct that sits on a wait queue; wait_queue_func_t func is the callback function, i.e. this entry's wake-up function, and epoll registers ep_poll_callback in this slot.

struct wait_queue_entry {
  unsigned int    flags;
  void      *private;
  wait_queue_func_t func;
  struct list_head  entry;
};
/* wait_queue_func_t  func is wake up function */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
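For reference, init_waitqueue_func_entry, which epoll uses below, just fills in this struct and registers func as the wake-up function (as defined in <linux/wait.h>):

static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
	wq_entry->flags = 0;
	wq_entry->private = NULL;
	wq_entry->func = func;
}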

One wrinkle: different file types keep their wait_queue_head in different places. Hence the glue struct eppoll_entry, which wraps the wait queue structures together with the corresponding epitem.

struct eppoll_entry {
  /* List header used to link this structure to the "struct epitem", 
  *  its struct list_head pwqlist.
  */
  struct list_head llink;
  /* The "base" pointer is set to the container "struct epitem" */
  struct epitem *base;
  /*
   * Wait queue entry that will be linked to the target file wait
   * queue. It has a wake up function pointer in it.
   */
  wait_queue_entry_t wait;
  /* The wait queue head that links the "wait" wait queue item.
  * It is stored in different places depending on the particular
  * file implementation; taking TCP as an example, it is sk_wq
  * stored inside struct sock.
  */
  wait_queue_head_t *whead;
};

So how does epoll get the callback onto a wait queue? We can start from where ep_insert registers the callback; the relevant lines are:

struct ep_pqueue epq;
...
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/* ep_item_poll calls the corresponding file's poll() implementation */
revents = ep_item_poll(epi, &epq.pt, 1);
...
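init_poll_funcptr stores ep_ptable_queue_proc into the poll_table and enables all events in _key (as defined in <linux/poll.h>); this ~(__poll_t)0 is what the ep_poll_callback comment later refers to:

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->_qproc = qproc;
	pt->_key   = ~(__poll_t)0; /* all events enabled */
}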

Below are the definitions of ep_pqueue and poll_table.

/* glue struct to connect poll_table with epitem*/
struct ep_pqueue {
  poll_table pt;
  struct epitem *epi;
};
typedef struct poll_table_struct {
  poll_queue_proc _qproc;
  __poll_t _key;
} poll_table;
/*
 * structures and helpers for f_op->poll implementations
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
         poll_table *pt);
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait);
  • poll_queue_proc pairs with the poll in each file type's file operations (f_op->poll) and is declared in <linux/poll.h>. If the file is a TCP socket, the poll implementation is tcp_poll(); the queue proc that epoll registers is ep_ptable_queue_proc.
  • _key is the event mask/flags.
  • ep_pqueue is the glue struct connecting a poll_table to its epitem.
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
			 poll_table *pt)
{
	/* use pointer arithmetic from the glue struct ep_pqueue */
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		/* Initialize wait_queue_entry->func to ep_poll_callback, i.e.
		 * register ep_poll_callback() as the wake-up function, then
		 * add the wait_queue_entry to the file's wait queue.
		 */
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		/* As stated above, nwait is normally 1 */
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}
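The "pointer arithmetic" used throughout is container_of: given a pointer to an embedded member, compute the address of the enclosing struct. The glue helpers look like this (as found in fs/eventpoll.c):

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Get the "struct eppoll_entry" in which the wait queue entry is embedded */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue entry */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
	return (container_of(p, struct eppoll_entry, wait))->base;
}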

The function that actually goes onto the file's wait queue is ep_poll_callback: it puts the item on the ready list and wakes up the other entries in the queue.

static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
  /* use glue struct eppoll_entry to get epitem */
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;
	read_lock_irqsave(&ep->lock, flags);
	ep_set_busy_poll_napi_id(epi);
	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;
	/*
	 * Check the events coming with the callback. We earlier set _key to
	 * ~(__poll_t)0 to listen for all events, but at this stage not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (pollflags && !(pollflags & epi->event.events))
		goto out_unlock;
	/*
	 * If we are transferring events to userspace (via the epoll_wait()
	 * syscall), which makes rdllist unusable, we can hold no locks
	 * (because we read user space memory, and because of Linux
	 * f_op->poll() semantics). All the events that happen during that
	 * period of time are chained in ep->ovflist (the overflow list) and
	 * requeued onto rdllist later, after the transfer has finished.
	 */
	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		if (epi->next == EP_UNACTIVE_PTR &&
		    chain_epi_lockless(epi))
			ep_pm_stay_awake_rcu(epi);
		goto out_unlock;
	}
	/* If this item is already on the ready list we exit right away; this
	* happens when the user does not call epoll_wait() for a long time.
	* Otherwise, we add it.
	*/
	if (!ep_is_linked(epi) &&
	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
		ep_pm_stay_awake_rcu(epi);
	}
	/*
	 * Wake up (if active, i.e. not empty) both the eventpoll wait
	 * list and the ->poll() wait list.
	 */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
    /* Wait queue used by syscall epoll_wait() */
		wake_up(&ep->wq);
	}
  /* ep->poll_wait: epoll's own wait queue as a file; pwake defers this wakeup to outside the lock */
	if (waitqueue_active(&ep->poll_wait))
		pwake++;
out_unlock:
	read_unlock_irqrestore(&ep->lock, flags);
	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);
	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;
	if (pollflags & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->entry);
		/*
		 * ->whead != NULL protects us from the race with ep_free()
		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
		 * held by the caller. Once we nullify it, nothing protects
		 * ep/epi or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}
	return ewake;
}

Inside ep_item_poll, vfs_poll is performed on the epitem's file, i.e. the file type's own f_op->poll implementation is invoked, such as tcp_poll for a TCP socket.

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;
	pt->_key = epi->event.events;
	if (!is_file_epoll(epi->ffd.file))
    /* If it is not an epoll file, vfs_poll calls corresponding f_op->poll 
    * of the file, tcp_poll for example.
    */
		return vfs_poll(epi->ffd.file, pt) & epi->event.events;
	ep = epi->ffd.file->private_data;
  /* If the file is an epoll, do another level of poll_wait, call ep_ptable_queue_proc again*/
	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
	locked = pt && (pt->_qproc == ep_ptable_queue_proc);
	return ep_scan_ready_list(epi->ffd.file->private_data,
				  ep_read_events_proc, &depth, depth,
				  locked) & epi->event.events;
}

static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
	if (unlikely(!file->f_op->poll))
		return DEFAULT_POLLMASK;
	return file->f_op->poll(file, pt);
}

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)
		p->_qproc(filp, wait_address, p);
}

The following two functions are defined in <net/sock.h> and net/ipv4/tcp.c.

unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	...
	/* wait is the poll_table passed in by ep_item_poll */
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);

	sock_poll_wait(file, sock, wait);

	...
}

static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
	if (!poll_does_not_wait(p)) {
		poll_wait(filp, &sock->wq->wait, p);
		/* We need to be sure we are in sync with the
		 * socket flags modification.
		 *
		 * This memory barrier is paired in the wq_has_sleeper.
		 */
		smp_mb();
	}
}

As we can see, different files' f_op->poll implementations differ only in how they locate the wait_queue_head. Compare the epoll file and the TCP socket above: we already know that ep->poll_wait is the epoll file's wait_queue_head, while for TCP, sock_poll_wait tells us that sock->wq->wait is the TCP socket's wait_queue_head. Once the wait_queue_head pointer is found, poll_wait can be called directly; it is an ordinary call to ep_ptable_queue_proc, which puts our wait_queue_entry onto the file's wait queue and registers ep_poll_callback as the entry's wake-up function.

init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);

ET and LT

When epoll_wait is called, LT mode adds the epitem back onto the ready list, so that the next call can check again whether unhandled events remain; as long as unhandled events remain, epoll_wait keeps returning them. In ET mode, a monitored event is returned by epoll_wait only once when it occurs.
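The entire difference comes down to a few lines near the end of ep_send_events_proc (abridged from fs/eventpoll.c; details vary across kernel versions): if EPOLLET is not set, the epitem goes straight back onto the ready list after its events are reported.

if (!(epi->event.events & EPOLLET)) {
	/*
	 * If this file has been added with Level Trigger mode,
	 * we need to insert it back inside the ready list, so
	 * that the next call to epoll_wait() will check again
	 * for event availability.
	 */
	list_add_tail(&epi->rdllink, &ep->rdllist);
	ep_pm_stay_awake(epi);
}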

LT Thundering Herd

After the data has been copied to user space, if unhandled epitems quickly appear on the ready list again, all waiters on ep->wq and ep->poll_wait are woken up:

if (!list_empty(&ep->rdllist)) {
    /*
     * Wake up (if active) both the eventpoll wait list and
     * the ->poll() wait list (delayed after we release the lock).
     */
    if (waitqueue_active(&ep->wq))
      wake_up(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
      pwake++;
  }
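One mitigation is EPOLLEXCLUSIVE (Linux 4.5+), handled by the EPOLLEXCLUSIVE branch in ep_poll_callback above: at most one of the exclusive waiters is woken per event. A userspace sketch (hypothetical epfd/listen_fd):

#include <sys/epoll.h>
#include <stdio.h>

/* Each worker creates its own epfd and adds the shared listening socket
 * with EPOLLEXCLUSIVE, so a single event wakes only one waiter. */
int add_exclusive(int epfd, int listen_fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN | EPOLLEXCLUSIVE,
		.data.fd = listen_fd,
	};

	if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) == -1) {
		perror("epoll_ctl");
		return -1;
	}
	return 0;
}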