http://www.pagefault.info/?p=264
原创文章,转载请注明: 转载自pagefault
本文链接地址: linux kernel中epoll的设计和实现
这里就不贴源码了,源码分析的话,网上一大堆,我这里只是简要的描述下epoll的实现和一些关键的代码片段。
相关的文件在 fs/eventpoll.c中,我看的是2.6.38的内核代码.
1 epoll在创建的时候会调用anon_inode_getfd新建一个file instance,也就是epoll可以看成一个文件。因此我们可以看到epoll_create会返回一个fd.
1
2
|
error = anon_inode_getfd(
"[eventpoll]"
, &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
|
2 epoll所管理的所有的句柄都是放在一个大的结构eventpoll(红黑树)中,而这个结构是保存在file 的private_data域中的(因为epoll本身就是一个文件).这样每次通过epoll fd就可以直接得到eventpoll.
1
2
3
4
5
|
file = fget(epfd);
/* Get the "struct file *" for the target file */
tfile = fget(fd);
...................................
ep = file->private_data;
|
3 每一个加入到epoll监听的句柄(也就是红黑树的一个节点)都是一个epitem.它包含了一个 struct eventpoll *ep,也就是它所属于的eventpoll(epoll实例).
1
2
3
4
5
6
7
8
9
10
11
12
|
if
(!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return
-ENOMEM;
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
|
4 在eventpoll中包含两个wait queue,一个是被epoll_wait使用的(wq),一个是被file->poll使用的(poll_wait).这两个都是属于eventpoll.而在epitem也有一个wait queue(pwqlist),这个queue是fd私有的wait queue,所以它是保存在epitem中的。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
struct
epitem {
/* RB tree node used to link this structure to the eventpoll RB tree */
struct
rb_node rbn;
................................................
/* List containing poll wait queues */
struct
list_head pwqlist;
/* The "container" of this item */
struct
eventpoll *ep;
.............................................
};
struct
eventpoll {
/* Protect the this structure access */
spinlock_t lock;
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
struct
mutex mtx;
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* List of ready file descriptors */
struct
list_head rdllist;
/* RB tree root used to store monitored fd structs */
struct
rb_root rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transfering ready events to userspace w/out
* holding ->lock.
*/
struct
epitem *ovflist;
.................................................
};
|
5 当我们添加一个句柄到一个epoll fd的时候,默认是会包含POLLERR和POLLHUP事件.并将epitem插入到红黑树中(eventpoll).然后会初始化一个poll_table,然后设置它的回调函数为ep_poll_callback,紧接着调用file->poll,如果是socket,则会调用tcp_poll,这个函数将会调用ep_poll_callback.
1
2
3
4
5
6
7
8
9
|
switch
(op) {
case
EPOLL_CTL_ADD:
if
(!epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tfile, fd);
}
else
error = -EEXIST;
break
;
.............................................
|
6 ep_poll_callback这个函数主要是绑定epitem的wait queue的回调为ep_poll_callback.也就是对应fd如果有事件,则就会调用ep_poll_callback。
7 epoll中保存了一个read list(rdllist),所有的已经有通知事件的句柄,都会放到这个list中,而对应的操作就是在ep_poll_callback中,在ep_poll_callback中主要就是由wait queue指针来取得对应的epitem,然后再取得eventpoll,并将这个epitem加入到ready list,唤醒epoll_wait(wq).这里可以看到由于一个句柄只会对应一个epitem,所以在rdllist中,也不会有重复的epitem,在ep_poll_callback会判断是否rdllist中是否已经包含了将要插入的epitem,如果包含,则直接返回.
1
2
3
4
5
6
7
8
|
static
int
ep_poll_callback(wait_queue_t *wait, unsigned mode,
int
sync,
void
*key)
{
struct
epitem *epi = ep_item_from_wait(wait);
struct
eventpoll *ep = epi->ep;
...............................................
/* If this file is already in the ready list we exit soon */
if
(!ep_is_linked(&epi->rdllink))
list_add_tail(&epi->rdllink, &ep->rdllist);
|
8 当系统调用epoll_wait,如果ready list(rdllist)为空,则休眠等待被唤醒。当被唤醒之后(第7条),将rdllist复制(指针)到一个新的list(主要是针对LT),然后调用ep_send_events_proc对这个新的list进行遍历(对应的会从rdllist中删除这个epitem).遍历完毕后,最终会返回给用户对应的数据.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
if
(list_empty(&ep->rdllist)) {
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
__add_wait_queue_exclusive(&ep->wq, &wait);
for
(;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
if
(!list_empty(&ep->rdllist) || timed_out)
break
;
if
(signal_pending(current)) {
res = -EINTR;
break
;
}
spin_unlock_irqrestore(&ep->lock, flags);
if
(!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
|
9 LT和ET的区别是在ep_send_events_proc中处理的,如果是LT,不但会将对应的数据返回给用户,并且会将当前的epitem再次加入到rdllist中。这样子,如果下次再次被唤醒就会给用户空间再次返回事件.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
static
int
ep_send_events_proc(
struct
eventpoll *ep,
struct
list_head *head,
void
*priv)
{
....................................................
if
(__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
return
eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
uevent++;
if
(epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
else
if
(!(epi->event.events & EPOLLET)) {
/*
* If this file has been added with Level
* Trigger mode, we need to insert back inside
* the ready list, so that the next call to
* epoll_wait() will check again the events
* availability. At this point, noone can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
* ep_scan_ready_list() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
}
..............................
}
|
10 eventpol还有一个list,叫做ovflist,主要是解决当内核在传输数据给用户空间(ep_send_events_proc)时的锁(eventpoll->mtx),此时epoll就是将这个时候传递上来的事件保存到ovflist中。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
static
int
ep_poll_callback(wait_queue_t *wait, unsigned mode,
int
sync,
void
*key)
{
......................................
/*
* If we are trasfering events to userspace, we can hold no locks
* (because we're accessing user memory, and because of linux f_op->poll()
* semantics). All the events that happens during that period of time are
* chained in ep->ovflist and requeued later on.
*/
if
(unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
if
(epi->next == EP_UNACTIVE_PTR) {
epi->next = ep->ovflist;
ep->ovflist = epi;
}
goto
out_unlock;
}
.........................
}
|
11 当从ep_send_events_proc返回(ep_scan_ready_list)后,会遍历ovflist,然后将ready的epitem保存到rdllist,以便与下次再次被唤醒时进行操作.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
static
int
ep_scan_ready_list(
struct
eventpoll *ep,
int
(*sproc)(
struct
eventpoll *,
struct
list_head *,
void
*),
void
*priv)
{
...........................................
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
for
(nepi = ep->ovflist; (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
* During the "sproc" callback execution time, items are
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
if
(!ep_is_linked(&epi->rdllink))
list_add_tail(&epi->rdllink, &ep->rdllist);
}
........................
}
|