debug epoll
对于外设的访问可分为阻塞和非阻塞两种方式。阻塞的方案则是访问外设,如果无法取得数据(或写入数据),那么进程陷入睡眠状态。非阻塞的方式则是先访问,然后判断此次访问的结果是否符合条件,即轮询的方式。
如果驱动支持poll,那么当一个进程访问file时,如果可访问,那么将所需要的上下文记录下来,否则,在该file内部的等待队列上加入当前进程(下次(软)中断可能会执行)。当file可访问的时候,之前被阻塞的进程可以被唤醒然后顺序执行后续的操作即可。
select和poll
对于select来说,我们管理了很多fd,对每个fd都进行poll询问,如果没有数据,那么就将进程挂在fd对应的等待队列中等待唤醒。那么任意的fd有数据时都可以唤醒select/poll进程,且进程不知道是哪一个fd的回调将自己唤醒的. 所以又得对整个fd集合进行遍历,找到有新事件的fd。
调用关系
在我们进行epoll_ctl的系统调用时, 会进行一些注册工作
- 注册回调函数
- 向红黑树中插入对应的epi节点
fs/eventpoll.c: ep_insert
-> ep_rbtree_insert(ep, epi) // 红黑树插入对应的epi节点
-> ep_item_poll
-> include/linux/poll.h: __poll_t vfs_poll() // 其实在这里是通过poll_wait向sock的等待队列中添加一个entry
-> file->f_op->poll() // 注意, 此处的file类型是socket
-> net/socket.c:141(file_operations) .poll = sock_poll
-> sock->ops->poll(file, sock, wait) | flag
-> 这里的ops和具体的协议有关, 类似地,它也会添加.poll
-> ...
-> 最后则调用的签名为typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *)的函数
具体源码:
/*
* Must be called with "mtx" held.
*/
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, pwake = 0;
__poll_t revents;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
lockdep_assert_irqs_enabled();
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
RCU_INIT_POINTER(epi->ws, NULL);
}
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
ep_rbtree_insert(ep, epi);
/* now check if we've created too many backpaths */
error = -EINVAL;
if (full_check && reverse_path_check())
goto error_remove_epi;
/* Initialize the poll table using the queue callback */
epq.epi = epi;
// ep_ptable_queue_proc的作用是向sock(TCP语境下)的等待队列中添加新的项, 回调时调用的是ep_poll_callback
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function. Note that after
* this operation completes, the poll callback can start hitting
* the new item.
*/
revents = ep_item_poll(epi, &epq.pt, 1);
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
error = -ENOMEM;
if (epi->nwait < 0)
goto error_unregister;
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
return 0;
error_unregister:
ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
rb_erase_cached(&epi->rbn, &ep->rbr);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
return error;
}
具体流程如下:
执行ep_ptable_queue_proc函数, 注:poll_wait非阻塞
然后等待事件发生时执行ep_poll_callback
eventpoll结构
/*
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
*/
struct eventpoll {
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
struct mutex mtx;
// 注意以下两个等待队列
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* List of ready file descriptors */
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
rwlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
* holding ->lock.
*/
struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
struct file *file;
/* used to optimize loop detection check */
u64 gen;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
};
回调函数 ep_ptable_queue_proc
/*
* This is the callback that is used to add our wait queue to the
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
// 初始化eppoll_entry的回调函数为ep_poll_callback
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead; // 如果监听socket, whead是sock结构的sk_sleep成员的地址
pwq->base = epi;
// 将pwq->wait添加到whead中.
if (epi->event.events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(whead, &pwq->wait);
else
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
}
ep_poll_callback
把在红黑树上收到event的epi插入到ep->rdllist中, 这样在epoll_wait返回时rdllist中就是已经就绪的fd了。
eventpoll filesystem
gdb
gdb vmlinux
target remote:1234
b poll_wait
b ep_insert
b ep_item_poll
b ep_ptable_queue_proc
b ep_poll_callback