在完成listen系统调用后,作为TCP Server的进程就可以等待接受连接请求了。当请求到来时,进程需要调用accept系统调用生成一个新的socket,并用之与客户端传输数据。这时进程需要管理的socket有两类:1)等待请求到来并与之建立连接的socket;2)已经与客户端建立的一对一的连接并与之进行数据传输的socket。当这些socket的数量很多时,如何及时获知哪些socket有可读|可写等I/O事件通告到来并对其进行处理,就成了会极大影响TCP Server性能的关键问题。
11.2.1 epoll模型
Linux epoll是一个高效的I/O事件通告机制。下面说明一下TCP是如何使用epoll完成对I/O事件的监控的。使用epoll的模型示例:
/*
 * Skeleton of an epoll-driven TCP server (illustrative only: "..." marks
 * elided code such as address setup, buffer declarations and error handling).
 *
 * Flow: create the listening socket, register it with an epoll instance, then
 * loop on epoll_wait() and dispatch each reported event:
 *   - readable on the listening socket -> accept() and register the new socket
 *   - readable on a connection socket  -> read()
 *   - writable on a connection socket  -> write(), re-arming EPOLLOUT when
 *     only part of the data could be sent
 */
int main(void)
{
    struct epoll_event ev = {0};
    struct epoll_event events[20] = {0};
    int epfd;   /* the epoll instance descriptor */
    int fd;
    int listenfd;
    int sockfd;
    int nfds;
    int i;
    int rfd;
    ssize_t rlen;
    ssize_t wlen;
    listenfd = socket(AF_INET, SOCK_STREAM, 0);
    ...
    bind(listenfd, serveraddr, serveraddrlen);
    ...
    listen(listenfd, 10);
    ...
    epfd = epoll_create(256);
    ev.data.fd = listenfd;
    ev.events = EPOLLIN;
    epoll_ctl(epfd, EPOLL_CTL_ADD, listenfd, &ev); // watch I/O events on listenfd
    while (1) {
        nfds = epoll_wait(epfd, events, 20, -1); // wait for notifications; the process sleeps (no CPU use) until an event wakes it up
        for(i = 0; i < nfds; ++i) {
            if (events[i].events & EPOLLIN) {
                if ((sockfd = events[i].data.fd) < 0) {
                    continue;
                }
                if (sockfd == listenfd) { // readable event on a class-1 socket: a new connection request arrived
                    printf("Registered vm has changed!\n");
                    sockfd = accept(listenfd, clientaddr, clientaddrlen); // accept the request, yielding a new socket descriptor
                    ...
                    if (sockfd >= 0) {
                        /*
                         * Register the new connection exactly once, here.
                         * Doing it after every readable event would re-add
                         * already-registered sockets and fail with EEXIST.
                         */
                        ev.data.fd = sockfd;
                        ev.events = EPOLLIN|EPOLLET;
                        epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
                    }
                } else { // readable event on a class-2 (connected) socket
                    rlen = read(sockfd, buf, sizeof(buf));
                }
                ...
            } else if(events[i].events & EPOLLOUT) { // writable event: sockfd can accept more outgoing data
                sockfd = events[i].data.fd;
                wlen = write(sockfd, data, data_len);
                if (0 <= wlen && wlen < data_len) { // partial write: subscribe to EPOLLOUT so epoll notifies us when sockfd is writable again
                    ev.data.fd = sockfd;
                    ev.events = EPOLLOUT|EPOLLET;
                    epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
                }
                ...
            }
            ...
        }
    }
    ...
    return 0;
}
epoll_create用于产生一个epoll的文件描述符,一个epoll文件描述符对应一个文件描述符集合。
epoll_ctl用于控制这个集合中的成员(加入、删除、变更定制事件等)。在内核中,epoll_ctl会将新的fd加入到一棵红黑树中加以管理。
epoll_wait用于等待集合中成员的I/O事件发生;如果所有成员都没有I/O事件,则保持进程的睡眠状态;否则,进程会被唤醒,epoll_wait会返回所有发生的事件的信息。下面重点研究epoll是如何使进程睡眠,在有I/O事件时内核又如何唤醒进程的。
11.2.2 epoll_ctl内核代码
epoll_ctl内核代码如下:
/*
 * epoll_ctl system call: resolves epfd and fd to their struct file, checks
 * that the target supports poll and that epfd really is an eventpoll file,
 * then adds, removes or modifies the epitem for fd according to op.
 * Returns 0 or a negative errno value.
 */
1788 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1789 struct epoll_event __user *, event)
1790 {
1791 int error;
1792 int did_lock_epmutex = 0;
1793 struct file *file, *tfile;
1794 struct eventpoll *ep;
1795 struct epitem *epi;
1796 struct epoll_event epds;
1797
1798 error = -EFAULT;
1799 if (ep_op_has_event(op) &&
1800 copy_from_user(&epds, event, sizeof(struct epoll_event)))
1801 goto error_return;
1802
1803 /* Get the "struct file *" for the eventpoll file */
1804 error = -EBADF;
1805 file = fget(epfd);
1806 if (!file)
1807 goto error_return;
1808
1809 /* Get the "struct file *" for the target file */
1810 tfile = fget(fd);
1811 if (!tfile)
1812 goto error_fput;
1813
1814 /* The target file descriptor must support poll */
1815 error = -EPERM;
1816 if (!tfile->f_op || !tfile->f_op->poll)
1817 goto error_tgt_fput;
1818
1819 /* Check if EPOLLWAKEUP is allowed */
1820 if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
1821 epds.events &= ~EPOLLWAKEUP;
1822
1823 /*
1824 * We have to check that the file structure underneath the file descriptor
1825 * the user passed to us _is_ an eventpoll file. And also we do not permit
1826 * adding an epoll file descriptor inside itself.
1827 */
1828 error = -EINVAL;
1829 if (file == tfile || !is_file_epoll(file))
1830 goto error_tgt_fput;
1831
1832 /*
1833 * At this point it is safe to assume that the "private_data" contains
1834 * our own data structure.
1835 */
1836 ep = file->private_data;
...
1850 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1851 mutex_lock(&epmutex);
1852 did_lock_epmutex = 1;
1853 }
1854 if (op == EPOLL_CTL_ADD) {
1855 if (is_file_epoll(tfile)) {
1856 error = -ELOOP;
1857 if (ep_loop_check(ep, tfile) != 0) {
1858 clear_tfile_check_list();
1859 goto error_tgt_fput;
1860 }
1861 } else
1862 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1863 }
1864
1865 mutex_lock_nested(&ep->mtx, 0);
1866
1867 /*
1868 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
1869 * above, we can be sure to be able to use the item looked up by
1870 * ep_find() till we release the mutex.
1871 */
1872 epi = ep_find(ep, tfile, fd);
1873
1874 error = -EINVAL;
1875 switch (op) {
1876 case EPOLL_CTL_ADD: // add: only valid when fd is not yet in the set
1877 if (!epi) {
1878 epds.events |= POLLERR | POLLHUP;
1879 error = ep_insert(ep, &epds, tfile, fd);
1880 } else
1881 error = -EEXIST;
1882 clear_tfile_check_list();
1883 break;
1884 case EPOLL_CTL_DEL: // delete: only valid when fd is already in the set
1885 if (epi)
1886 error = ep_remove(ep, epi);
1887 else
1888 error = -ENOENT;
1889 break;
1890 case EPOLL_CTL_MOD: // modify: only valid when fd is already in the set
1891 if (epi) {
1892 epds.events |= POLLERR | POLLHUP;
1893 error = ep_modify(ep, epi, &epds);
1894 } else
1895 error = -ENOENT;
1896 break;
1897 }
1898 mutex_unlock(&ep->mtx);
1899
1900 error_tgt_fput:
1901 if (did_lock_epmutex)
1902 mutex_unlock(&epmutex);
1903
1904 fput(tfile);
1905 error_fput:
1906 fput(file);
1907 error_return:
1908
1909 return error;
1910 }
一个socket调用epoll_ctl加入epoll的文件描述符集合时,会调用ep_insert函数:
/*
 * Creates a new epitem for (tfile, fd), hooks it onto the target file's wait
 * queue through the poll callback, inserts it into ep's RB tree and, if a
 * requested event is already pending, queues it on the ready list and wakes
 * waiters. Returns 0 on success or a negative errno value; on failure the
 * partially set-up item is torn down again.
 */
1231 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1232 struct file *tfile, int fd)
1233 {
1234 int error, revents, pwake = 0;
1235 unsigned long flags;
1236 long user_watches;
1237 struct epitem *epi;
1238 struct ep_pqueue epq;
1239
1240 user_watches = atomic_long_read(&ep->user->epoll_watches);
1241 if (unlikely(user_watches >= max_user_watches))
1242 return -ENOSPC;
1243 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
1244 return -ENOMEM;
1245
1246 /* Item initialization follow here ... */
1247 INIT_LIST_HEAD(&epi->rdllink);
1248 INIT_LIST_HEAD(&epi->fllink);
1249 INIT_LIST_HEAD(&epi->pwqlist);
1250 epi->ep = ep;
1251 ep_set_ffd(&epi->ffd, tfile, fd); // store the struct file pointer that corresponds to sockfd in epi->ffd.file
1252 epi->event = *event;
1253 epi->nwait = 0;
1254 epi->next = EP_UNACTIVE_PTR;
1255 if (epi->event.events & EPOLLWAKEUP) {
1256 error = ep_create_wakeup_source(epi);
1257 if (error)
1258 goto error_create_wakeup_source;
1259 } else {
1260 RCU_INIT_POINTER(epi->ws, NULL);
1261 }
1262
1263 /* Initialize the poll table using the queue callback */
1264 epq.epi = epi;
1265 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); // point epq.pt._qproc at ep_ptable_queue_proc
1266
1267 /*
1268 * Attach the item to the poll hooks and get current event bits.
1269 * We can safely use the file* here because its usage count has
1270 * been increased by the caller of this function. Note that after
1271 * this operation completes, the poll callback can start hitting
1272 * the new item.
1273 */
1274 revents = ep_item_poll(epi, &epq.pt); // query pending I/O events through the function epi->ffd.file->f_op->poll points to; for a socket that function is sock_poll, which in turn calls tcp_poll
1275
1276 /*
1277 * We have to check if something went wrong during the poll wait queue
1278 * install process. Namely an allocation for a wait queue failed due
1279 * high memory pressure.
1280 */
1281 error = -ENOMEM;
1282 if (epi->nwait < 0)
1283 goto error_unregister;
1284
1285 /* Add the current item to the list of active epoll hook for this file */
1286 spin_lock(&tfile->f_lock);
1287 list_add_tail(&epi->fllink, &tfile->f_ep_links);
1288 spin_unlock(&tfile->f_lock);
1289
1290 /*
1291 * Add the current item to the RB tree. All RB tree operations are
1292 * protected by "mtx", and ep_insert() is called with "mtx" held.
1293 */
1294 ep_rbtree_insert(ep, epi);
1295
1296 /* now check if we've created too many backpaths */
1297 error = -EINVAL;
1298 if (reverse_path_check())
1299 goto error_remove_epi;
1300
1301 /* We have to drop the new item inside our item list to keep track of it */
1302 spin_lock_irqsave(&ep->lock, flags);
1303
1304 /* If the file is already "ready" we drop it inside the ready list */
1305 if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
1306 list_add_tail(&epi->rdllink, &ep->rdllist); // a subscribed I/O event is already pending: add epi to ep's rdllist
1307 ep_pm_stay_awake(epi);
1308
1309 /* Notify waiting tasks that events are available */
1310 if (waitqueue_active(&ep->wq))
1311 wake_up_locked(&ep->wq); // wake up a process waiting for event notification, if there is one
1312 if (waitqueue_active(&ep->poll_wait))
1313 pwake++;
1314 }
1315
1316 spin_unlock_irqrestore(&ep->lock, flags);
1317
1318 atomic_long_inc(&ep->user->epoll_watches);
1319
1320 /* We have to call this outside the lock */
1321 if (pwake)
1322 ep_poll_safewake(&ep->poll_wait); // wake up processes that subscribed to I/O events on epfd itself
1323
1324 return 0;
1325
1326 error_remove_epi:
1327 spin_lock(&tfile->f_lock);
1328 if (ep_is_linked(&epi->fllink))
1329 list_del_init(&epi->fllink);
1330 spin_unlock(&tfile->f_lock);
1331
1332 rb_erase(&epi->rbn, &ep->rbr);
1333
1334 error_unregister:
1335 ep_unregister_pollwait(ep, epi);
1336
1337 /*
1338 * We need to do this because an event could have been arrived on some
1339 * allocated wait queue. Note that we don't care about the ep->ovflist
1340 * list, since that is used/cleaned only inside a section bound by "mtx".
1341 * And ep_insert() is called with "mtx" held.
1342 */
1343 spin_lock_irqsave(&ep->lock, flags);
1344 if (ep_is_linked(&epi->rdllink))
1345 list_del_init(&epi->rdllink);
1346 spin_unlock_irqrestore(&ep->lock, flags);
1347
1348 wakeup_source_unregister(ep_wakeup_source(epi));
1349
1350 error_create_wakeup_source:
1351 kmem_cache_free(epi_cache, epi);
1352
1353 return error;
1354 }
tcp_poll函数:
/*
 * poll handler for TCP sockets: registers the caller on the socket's wait
 * queue via sock_poll_wait(), then builds and returns the mask of currently
 * pending events (POLLIN, POLLOUT, POLLPRI, POLLHUP, POLLERR, ...).
 */
433 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
434 {
435 unsigned int mask;
436 struct sock *sk = sock->sk;
437 const struct tcp_sock *tp = tcp_sk(sk);
438
439 sock_poll_wait(file, sk_sleep(sk), wait);// calls, through the function pointer, the ep_ptable_queue_proc installed earlier
440 if (sk->sk_state == TCP_LISTEN)
441 return inet_csk_listen_poll(sk); // readable event of a listening socket, i.e. a new connection has arrived
442
443 /* Socket is not locked. We are protected from async events
444 * by poll logic and correct handling of state changes
445 * made by other threads is impossible in any case.
446 */
447
448 mask = 0;
...
477 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
478 mask |= POLLHUP;
479 if (sk->sk_shutdown & RCV_SHUTDOWN)
480 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
481
482 /* Connected or passive Fast Open socket? */
483 if (sk->sk_state != TCP_SYN_SENT &&
484 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
485 int target = sock_rcvlowat(sk, 0, INT_MAX);
486
487 if (tp->urg_seq == tp->copied_seq &&
488 !sock_flag(sk, SOCK_URGINLINE) &&
489 tp->urg_data)
490 target++;
491
492 /* Potential race condition. If read of tp below will
493 * escape above sk->sk_state, we can be illegally awaken
494 * in SYN_* states. */
495 if (tp->rcv_nxt - tp->copied_seq >= target)
496 mask |= POLLIN | POLLRDNORM; // unread data is available: report a readable event
497
498 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
499 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
500 mask |= POLLOUT | POLLWRNORM; // the socket has room to write: report a writable event
501 } else { /* send SIGIO later */
502 set_bit(SOCK_ASYNC_NOSPACE,
503 &sk->sk_socket->flags);
504 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
505
506 /* Race breaker. If space is freed after
507 * wspace test but before the flags are set,
508 * IO signal will be lost.
509 */
510 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) // the send buffer has room
511 mask |= POLLOUT | POLLWRNORM;
512 }
513 } else
514 mask |= POLLOUT | POLLWRNORM;
515
516 if (tp->urg_data & TCP_URG_VALID) // urgent data is available to read
517 mask |= POLLPRI;
518 }
519 /* This barrier is coupled with smp_wmb() in tcp_reset() */
520 smp_rmb();
521 if (sk->sk_err)
522 mask |= POLLERR; // an error has occurred
523
524 return mask;
525 }
ep_ptable_queue_proc函数:
/*
 * Poll-table queue callback installed by ep_insert(): allocates an
 * eppoll_entry, links it onto the wait queue head passed in by the polled
 * file and onto epi->pwqlist, and counts it in epi->nwait. On allocation
 * failure epi->nwait is set to -1 so the caller can detect the error.
 */
1058 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1059 poll_table *pt)
1060 {
1061 struct epitem *epi = ep_item_from_epqueue(pt);// recover the struct epitem *epi allocated in ep_insert
1062 struct eppoll_entry *pwq;
1063
1064 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1065 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);// set pwq->wait.func to ep_poll_callback
1066 pwq->whead = whead;
1067 pwq->base = epi;
1068 add_wait_queue(whead, &pwq->wait);// add pwq to the sk_sleep(sk) wait queue
1069 list_add_tail(&pwq->llink, &epi->pwqlist); // add pwq to epi's pwqlist
1070 epi->nwait++;
1071 } else {
1072 /* We have to signal that an error occurred */
1073 epi->nwait = -1;
1074 }
1075 }
epoll_ctl返回后,进程会调用epoll_wait函数。
11.2.3 epoll_wait内核代码
epoll_wait内核代码如下:
/*
 * epoll_wait system call: validates maxevents and the user buffer, resolves
 * epfd to its eventpoll structure and delegates the actual waiting to
 * ep_poll(). Returns ep_poll()'s result or a negative errno value.
 */
1916 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1917 int, maxevents, int, timeout)
1918 {
1919 int error;
1920 struct fd f;
1921 struct eventpoll *ep;
1922
1923 /* The maximum number of event must be greater than zero */
1924 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1925 return -EINVAL;
1926
1927 /* Verify that the area passed by the user is writeable */
1928 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
1929 return -EFAULT;
1930
1931 /* Get the "struct file *" for the eventpoll file */
1932 f = fdget(epfd);
1933 if (!f.file)
1934 return -EBADF;
1935
1936 /*
1937 * We have to check that the file structure underneath the fd
1938 * the user passed to us _is_ an eventpoll file.
1939 */
1940 error = -EINVAL;
1941 if (!is_file_epoll(f.file))
1942 goto error_fput;
1943
1944 /*
1945 * At this point it is safe to assume that the "private_data" contains
1946 * our own data structure.
1947 */
1948 ep = f.file->private_data;
1949
1950 /* Time to fish for events ... */
1951 error = ep_poll(ep, events, maxevents, timeout); // the core function
1952
1953 error_fput:
1954 fdput(f);
1955 return error;
1956 }
分析核心函数ep_poll:
/*
 * Core of epoll_wait: if no events are available, puts the current task on
 * ep->wq and sleeps until woken, timed out or interrupted by a signal; then
 * hands the ready events to ep_send_events(). Returns ep_send_events()'s
 * result, 0 when nothing was delivered, or -EINTR.
 */
1553 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1554 int maxevents, long timeout)
1555 {
1556 int res = 0, eavail, timed_out = 0;
1557 unsigned long flags;
1558 long slack = 0;
1559 wait_queue_t wait;
1560 ktime_t expires, *to = NULL;
1561
1562 if (timeout > 0) {
1563 struct timespec end_time = ep_set_mstimeout(timeout);
1564
1565 slack = select_estimate_accuracy(&end_time);
1566 to = &expires;
1567 *to = timespec_to_ktime(end_time);
1568 } else if (timeout == 0) {
1569 /*
1570 * Avoid the unnecessary trip to the wait queue loop, if the
1571 * caller specified a non blocking operation.
1572 */
1573 timed_out = 1;
1574 spin_lock_irqsave(&ep->lock, flags);
1575 goto check_events;
1576 }
1577
1578 fetch_events:
1579 spin_lock_irqsave(&ep->lock, flags);
1580
1581 if (!ep_events_available(ep)) { // ep_events_available(ep) is true when the ep->rdllist queue is not empty
1582 /*
1583 * We don't have any available event to return to the caller.
1584 * We need to sleep here, and we will be wake up by
1585 * ep_poll_callback() when events will become available.
1586 */
1587 init_waitqueue_entry(&wait, current);
1588 __add_wait_queue_exclusive(&ep->wq, &wait);// join ep's wq wait queue and wait to be woken up
1589
1590 for (;;) {
1591 /*
1592 * We don't want to sleep if the ep_poll_callback() sends us
1593 * a wakeup in between. That's why we set the task state
1594 * to TASK_INTERRUPTIBLE before doing the checks.
1595 */
1596 set_current_state(TASK_INTERRUPTIBLE);
1597 if (ep_events_available(ep) || timed_out) // an event was reported, or the wait timed out
1598 break;
1599 if (signal_pending(current)) {
1600 res = -EINTR;
1601 break;
1602 }
1603
1604 spin_unlock_irqrestore(&ep->lock, flags);
1605 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))// schedule() here gives up the CPU; the task is not scheduled again until it times out or is woken up
1606 timed_out = 1;
1607
1608 spin_lock_irqsave(&ep->lock, flags);
1609 }
1610 __remove_wait_queue(&ep->wq, &wait);
1611
1612 set_current_state(TASK_RUNNING);
1613 }
1614 check_events:
1615 /* Is it worth to try to dig for events ? */
1616 eavail = ep_events_available(ep);
1617
1618 spin_unlock_irqrestore(&ep->lock, flags);
1619
1620 /*
1621 * Try to transfer events to user space. In case we get 0 events and
1622 * there's still timeout left over, we go trying again in search of
1623 * more luck.
1624 */
1625 if (!res && eavail &&
1626 !(res = ep_send_events(ep, events, maxevents)) && !timed_out) // return all events to be reported to user space
1627 goto fetch_events;
1628
1629 return res;
1630 }
当一个socket有I/O事件到来时,以可读事件为例,内核会调用sk->sk_data_ready,这个指针指向sock_def_readable:
/*
 * sk_data_ready handler: when the socket's wait queue has sleepers, wakes
 * them with the readable poll bits as the wakeup key, then issues the async
 * notification via sk_wake_async().
 */
2157 static void sock_def_readable(struct sock *sk, int len)
2158 {
2159 struct socket_wq *wq;
2160
2161 rcu_read_lock();
2162 wq = rcu_dereference(sk->sk_wq);
2163 if (wq_has_sleeper(wq)) // some process wants event notification: wake it up
2164 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2165 POLLRDNORM | POLLRDBAND);
2166 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2167 rcu_read_unlock();
2168 }
wake_up_interruptible_sync_poll封装了__wake_up_sync_key,__wake_up_sync_key调用__wake_up_common:
/*
 * Walks wait queue q and invokes each entry's wake function. The walk stops
 * once nr_exclusive entries flagged WQ_FLAG_EXCLUSIVE have reported a
 * successful wakeup.
 */
3159 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3160 int nr_exclusive, int wake_flags, void *key)
3161 {
3162 wait_queue_t *curr, *next;
3163
3164 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { // iterate over every node in the wait queue
3165 unsigned flags = curr->flags;
3166
3167 if (curr->func(curr, mode, wake_flags, key) &&
3168 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) // call each node's wake function; for epoll it is ep_poll_callback, installed in ep_ptable_queue_proc
3169 break;
3170 }
3171 }
ep_poll_callback函数:
/*
 * Wait-queue callback run when a monitored file reports an event: if the
 * event matches the item's subscription, puts the epitem on ep->rdllist
 * (or chains it on ep->ovflist while events are being transferred to user
 * space) and wakes the tasks sleeping on ep->wq / ep->poll_wait.
 * Always returns 1.
 */
969 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
970 {
971 int pwake = 0;
972 unsigned long flags;
973 struct epitem *epi = ep_item_from_wait(wait); // recover the epi set up by ep_ptable_queue_proc
974 struct eventpoll *ep = epi->ep;
975
976 if ((unsigned long)key & POLLFREE) {
977 ep_pwq_from_wait(wait)->whead = NULL;
978 /*
979 * whead = NULL above can race with ep_remove_wait_queue()
980 * which can do another remove_wait_queue() after us, so we
981 * can't use __remove_wait_queue(). whead->lock is held by
982 * the caller.
983 */
984 list_del_init(&wait->task_list);
985 }
986
987 spin_lock_irqsave(&ep->lock, flags);
988
989 /*
990 * If the event mask does not contain any poll(2) event, we consider the
991 * descriptor to be disabled. This condition is likely the effect of the
992 * EPOLLONESHOT bit that disables the descriptor when an event is received,
993 * until the next EPOLL_CTL_MOD will be issued.
994 */
995 if (!(epi->event.events & ~EP_PRIVATE_BITS))
996 goto out_unlock;
997
998 /*
999 * Check the events coming with the callback. At this stage, not
1000 * every device reports the events in the "key" parameter of the
1001 * callback. We need to be able to handle both cases here, hence the
1002 * test for "key" != NULL before the event match test.
1003 */
1004 if (key && !((unsigned long) key & epi->event.events))
1005 goto out_unlock;
1006
1007 /*
1008 * If we are transferring events to userspace, we can hold no locks
1009 * (because we're accessing user memory, and because of linux f_op->poll()
1010 * semantics). All the events that happen during that period of time are
1011 * chained in ep->ovflist and requeued later on.
1012 */
1013 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
1014 if (epi->next == EP_UNACTIVE_PTR) {
1015 epi->next = ep->ovflist;
1016 ep->ovflist = epi;
1017 if (epi->ws) {
1018 /*
1019 * Activate ep->ws since epi->ws may get
1020 * deactivated at any time.
1021 */
1022 __pm_stay_awake(ep->ws);
1023 }
1024
1025 }
1026 goto out_unlock;
1027 }
1028
1029 /* If this file is already in the ready list we exit soon */
1030 if (!ep_is_linked(&epi->rdllink)) { // epi is not on the ready list yet: append it to ep->rdllist
1031 list_add_tail(&epi->rdllink, &ep->rdllist);
1032 ep_pm_stay_awake_rcu(epi);
1033 }
1034
1035 /*
1036 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
1037 * wait list.
1038 */
1039 if (waitqueue_active(&ep->wq))
1040 wake_up_locked(&ep->wq); // wake up the process waiting for notification, i.e. the one queued at line 1588 of ep_poll
1041 if (waitqueue_active(&ep->poll_wait))
1042 pwake++;
1043
1044 out_unlock:
1045 spin_unlock_irqrestore(&ep->lock, flags);
1046
1047 /* We have to call this outside the lock */
1048 if (pwake)
1049 ep_poll_safewake(&ep->poll_wait);
1050
1051 return 1;
1052 }
唤醒进程后,ep_poll会调用ep_send_events将发生的事件收集到一起返回给用户态:
/*
 * Packs the user buffer pointer and its capacity into an ep_send_events_data
 * and runs ep_send_events_proc over the ready list via ep_scan_ready_list().
 * Returns whatever ep_scan_ready_list() returns.
 */
1514 static int ep_send_events(struct eventpoll *ep,
1515 struct epoll_event __user *events, int maxevents)
1516 {
1517 struct ep_send_events_data esed;
1518
1519 esed.maxevents = maxevents;
1520 esed.events = events;
1521
1522 return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1523 }
ep_scan_ready_list中会调用ep_send_events_proc函数:
/*
 * Drains the private ready list "head": re-polls each item and, when events
 * are pending, copies the event mask and user data into the user-space
 * array (at most esed->maxevents entries). Level-triggered items are put
 * back on ep->rdllist; EPOLLONESHOT items are disabled. Returns the number
 * of events delivered, or -EFAULT if the first copy to user space fails.
 */
1434 static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1435 void *priv)
1436 {
1437 struct ep_send_events_data *esed = priv;
1438 int eventcnt;
1439 unsigned int revents;
1440 struct epitem *epi;
1441 struct epoll_event __user *uevent;
1442 struct wakeup_source *ws;
1443 poll_table pt;
1444
1445 init_poll_funcptr(&pt, NULL);
1446
1447 /*
1448 * We can loop without lock because we are passed a task private list.
1449 * Items cannot vanish during the loop because ep_scan_ready_list() is
1450 * holding "mtx" during this call.
1451 */
1452 for (eventcnt = 0, uevent = esed->events;
1453 !list_empty(head) && eventcnt < esed->maxevents;) {// walk the queue of events that have occurred
1454 epi = list_first_entry(head, struct epitem, rdllink);
1455
1456 /*
1457 * Activate ep->ws before deactivating epi->ws to prevent
1458 * triggering auto-suspend here (in case we reactive epi->ws
1459 * below).
1460 *
1461 * This could be rearranged to delay the deactivation of epi->ws
1462 * instead, but then epi->ws would temporarily be out of sync
1463 * with ep_is_linked().
1464 */
1465 ws = ep_wakeup_source(epi);
1466 if (ws) {
1467 if (ws->active)
1468 __pm_stay_awake(ep->ws);
1469 __pm_relax(ws);
1470 }
1471
1472 list_del_init(&epi->rdllink);
1473
1474 revents = ep_item_poll(epi, &pt);// call the socket's poll function (tcp_poll) to check whether an event the user cares about has occurred
1475
1476 /*
1477 * If the event mask intersect the caller-requested one,
1478 * deliver the event to userspace. Again, ep_scan_ready_list()
1479 * is holding "mtx", so no operations coming from userspace
1480 * can change the item.
1481 */
1482 if (revents) {// an event the user cares about occurred: copy the event information back into the user-space epoll_event array
1483 if (__put_user(revents, &uevent->events) ||
1484 __put_user(epi->event.data, &uevent->data)) {
1485 list_add(&epi->rdllink, head);
1486 ep_pm_stay_awake(epi);
1487 return eventcnt ? eventcnt : -EFAULT;
1488 }
1489 eventcnt++;
1490 uevent++;
1491 if (epi->event.events & EPOLLONESHOT)
1492 epi->event.events &= EP_PRIVATE_BITS;
1493 else if (!(epi->event.events & EPOLLET)) {
1494 /*
1495 * If this file has been added with Level
1496 * Trigger mode, we need to insert back inside
1497 * the ready list, so that the next call to
1498 * epoll_wait() will check again the events
1499 * availability. At this point, no one can insert
1500 * into ep->rdllist besides us. The epoll_ctl()
1501 * callers are locked out by
1502 * ep_scan_ready_list() holding "mtx" and the
1503 * poll callback will queue them in ep->ovflist.
1504 */
1505 list_add_tail(&epi->rdllink, &ep->rdllist);
1506 ep_pm_stay_awake(epi);
1507 }
1508 }
1509 }
1510
1511 return eventcnt;
1512 }
epoll_wait返回后,进程就可以遍历epoll_event数组来查询发生了哪些事件。
wake_up_locked封装了__wake_up_locked函数,__wake_up_locked封装了__wake_up_common函数。这次在__wake_up_common函数中curr->func指向的函数是在ep_poll的1587行调用的init_waitqueue_entry中设置的default_wake_function,它会调用try_to_wake_up函数唤醒进程。在__wake_up_common中遍历ep->wq的等待队列时,第一个节点的唤醒函数执行完毕后就会退出循环,以避免惊群效应。这时,同时等待一个epfd中I/O事件的进程中会有一个被唤醒,执行相应的操作,然后继续调用epoll_wait等待事件通告,直到再次被唤醒。
综上所述,事件通知的函数调用过程为:
sock_def_readable->wake_up_interruptible_sync_poll->__wake_up_sync_key->__wake_up_common->
ep_poll_callback->wake_up_locked->__wake_up_locked->__wake_up_common->default_wake_function->try_to_wake_up
整个事件通知机制中,epoll相当于一个联络员,它监听各个socket;socket有事件发生时通知epoll,epoll再去唤醒关心socket事件的进程。当有多于一个线程关心socket的I/O事件时,它们会在epoll中排队,而不是在socket的等待队列中排队。有事件通告时epoll只唤醒队列中的一个线程,其它线程会在下次事件到来时被唤醒。也就是说多线程模型可以避免惊群效应。如果是多进程模型,每个进程有自己独立的epoll集合,它们只能通过sock_def_readable来唤醒,从而无法解决惊群效应。Nginx中的解决方法是设置一个accept lock,但这样做的影响是大幅降低了CPS性能。