[笔记] debug epoll

debug epoll

对于外设的访问可分为阻塞和非阻塞两种方式。阻塞的方案则是访问外设,如果无法取得数据(或写入数据),那么进程陷入睡眠状态。非阻塞的方式则是访问,然后判断此次访问的结果是否符合条件.即轮询的方式.

如果驱动支持poll,那么当一个进程访问file时,如果可访问,那么将所需要的上下文记录下来,否则,在该file内部的等待队列上加入当前进程(下次(软)中断可能会执行)。当file可访问的时候,之前被阻塞的进程可以被唤醒然后顺序执行后续的操作即可。

select和poll

对于select来说,我们管理了很多fd,对每个fd都进行poll询问,如果没有数据,那么就将进程挂在fd对应的等待队列中等待唤醒。那么任意的fd有数据时都可以唤醒select/poll进程,且进程不知道是哪一个fd的回调将自己唤醒的. 所以又得对整个fd集合进行遍历,找到有新事件的fd。

调用关系

在我们进行epoll_ctl的系统调用时, 会进行一些注册工作

  • 注册回调函数
  • 想红黑树中插入对应的epi节点
fs/eventpoll.c: ep_insert
	-> ep_rbtree_insert(ep, epi) // 红黑树插入对应的epi节点
	-> ep_item_poll
		-> include/linux/poll.h: __poll_t vfs_poll() // 其实在这里是通过poll_wait向sock的等待队列中添加一个entry
			-> file->f_op->poll() // 注意, 此处的file类型是socket
				-> net/socket.c:141(file_operations) .poll = sock_poll
					-> sock->ops->poll(file, sock, wait) | flag
						-> 这里的ops和具体的协议有关, 类似地,它也会添加.poll
							-> ...
							-> 最后则调用的签名为typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *)的函数

具体源码:

/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	lockdep_assert_irqs_enabled();

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* now check if we've created too many backpaths */
	error = -EINVAL;
	if (full_check && reverse_path_check())
		goto error_remove_epi;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	// ep_ptable_queue_proc的作用是向sock(TCP语境下)的等待队列中添加新的项, 回调时调用的是ep_poll_callback
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irq(&ep->lock);

	/* record NAPI ID of new item if present */
	ep_set_busy_poll_napi_id(epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);
error_remove_epi:
	spin_lock(&tfile->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&tfile->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
	kmem_cache_free(epi_cache, epi);

	return error;
}

具体流程如下:
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
执行ep_ptable_queue_proc函数, 注:poll_wait非阻塞
执行ep_ptable_queue_proc函数, 注:poll_wait非阻塞
在这里插入图片描述
然后等待事件发生时执行ep_poll_callback
在这里插入图片描述

eventpoll结构

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	// 注意以下两个等待队列
	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize loop detection check */
	u64 gen;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif
};

回调函数 ep_ptable_queue_proc

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		// 初始化eppoll_entry的回调函数为ep_poll_callback
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead; // 如果监听socket, whead是sock结构的sk_sleep成员的地址
		pwq->base = epi;
		// 将pwq->wait添加到whead中.
		if (epi->event.events & EPOLLEXCLUSIVE)
			add_wait_queue_exclusive(whead, &pwq->wait);
		else
			add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

ep_poll_callback

把在红黑树上收到event的epi插入到ep->rdllist中, 这样在epoll_wait返回时rdllist中就是已经就绪的fd了。

eventpoll filesystem

gdb

gdb vmlinux
target remote:1234
b poll_wait
b ep_insert
b ep_item_poll
b ep_ptable_queue_proc
b ep_poll_callback

reference

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值