Linux select/poll/epoll 实现原理剖析

落寞的温暖

于 2023-03-27 10:30:00 发布

阅读量448

点赞数

文章标签： linux c++ 链表数据结构算法

本文链接：https://blog.csdn.net/u014441159/article/details/129751570

版权

1、IO 复用模型

2、select()

2.1、core_sys_select()

1、IO 复用模型

select(2)，poll(2) 和 epoll(7) 都是 I/O 多路复用的机制。I/O 多路复用就是通过一种机制，一个进程可以监视多个描述符，一旦某个描述符就绪（可读、可写或出错），能够通知程序进行相应的读写操作。但 select(2)，poll(2) 和 epoll(7) 本质上都是同步 I/O，因为他们都需要在读写事件就绪后自己负责读写数据，也就是说这个读写过程是阻塞的，而异步 I/O 则无需自己负责进行读写，异步 I/O 的实现会负责把数据从内核拷贝到用户空间。

有了 IO 复用，就可以调用 select()、poll() 或 epoll()，阻塞在这三个系统调用上（可以设置不阻塞），而不是阻塞在真正的 IO 系统调用上。

进程阻塞于 select() 调用，等待套接字变为可读。当 select() 返回套接字可读这一条件时，再调用 recvfrom() 把所读数据复制到应用进程缓冲区。

2、select()

select(2) 提供一种 fd_set 的数据结构，实际上是一个 long 类型的数组，并提供操作 fd_set 的函数 FD_ZERO()、FD_SET()、FD_CLR() 和 FD_ISSET()。fd_set 中的每一位都能与已打开的文件描述符 fd 建立联系。当调用 select(2) 时，由内核遍历 fd_set 的内容，根据 IO 状态修改 fd_set 的内容，通过将某位设置为 1 标记描述符已经就绪。

#include <sys/select.h>
/*
 * Wait until a descriptor in one of the three sets becomes ready or the
 * timeout expires. The third set is the "exceptional conditions" set
 * (parameter name spelled exceptfds).
 */
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
           struct timeval *timeout);
void FD_ZERO(fd_set *set);
void FD_SET(int fd, fd_set *set);
void FD_CLR(int fd, fd_set *set);
int FD_ISSET(int fd, fd_set *set);

fd_set 其实是一个 unsigned long 类型的数组，数组大小为 16（假设 long 占用 8 个字节），可以表示 1024 个文件描述符的状态。

/// include/uapi/linux/posix_types.h
#undef __FD_SETSIZE
#define __FD_SETSIZE    1024
 
/*
 * Descriptor bitmap: __FD_SETSIZE bits packed into an array of
 * unsigned long words (one bit per descriptor).
 */
typedef struct {
    unsigned long fds_bits[__FD_SETSIZE / (8 * sizeof(long))];
} __kernel_fd_set;

/// @file include/linux/types.h
typedef __kernel_fd_set        fd_set;

2.1、core_sys_select()

当调用 select() 系统调用后，进入 kernel 使用 kern_select() 函数处理：

1）转换超时时间，将其转换为绝对时间（纳秒级）
2）调用 core_sys_select() 函数处理
3）调用 poll_select_finish() 拷贝剩余时间

/// fs/select.c
/*
 * Common implementation behind the select() syscall: turn the optional
 * user-supplied timeval into an end time, run core_sys_select(), and let
 * poll_select_finish() produce the final return value.
 *
 * Returns -EFAULT if the timeval cannot be read from user space and
 * -EINVAL if poll_select_set_timeout() rejects it.
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct __kernel_old_timeval __user *tvp)
{
    struct timespec64 end_time, *to = NULL;
    struct __kernel_old_timeval tv;
    int ret;

    /* A NULL tvp leaves 'to' NULL, so no timeout is passed down. */
    if (tvp) {
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        to = &end_time;
        /* Carry whole seconds out of tv_usec; the rest becomes nanoseconds. */
        if (poll_select_set_timeout(to,
                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
            return -EINVAL;
    }

    ret = core_sys_select(n, inp, outp, exp, to);
    return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); // copy back the remaining time
}

/* select() syscall entry point: forwards all five arguments to kern_select(). */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
        fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp)
{
    return kern_select(n, inp, outp, exp, tvp);
}

core_sys_select() 主要工作是将 user 传入的 fd_set 数据拷贝到 kernel。kernel 中使用 fd_set_bits 结构保存 user 传入的 fd_set。fd_set_bits 只有六个指向 unsigned long 类型的指针。

/// include/linux/poll.h
#define FRONTEND_STACK_ALLOC    256
#define SELECT_STACK_ALLOC    FRONTEND_STACK_ALLOC

/// fs/select.c
/*
 * Kernel-side view of one select() call: six bitmap pointers.
 * in/out/ex hold the sets copied in from the caller, res_in/res_out/res_ex
 * collect the results that are copied back.
 */
typedef struct {
    unsigned long *in, *out, *ex;
    unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/* Number of bits in one long. */
#define FDS_BITPERLONG    (8*sizeof(long))
/* Number of longs needed to hold nr bits (rounded up). */
#define FDS_LONGS(nr)    (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
/* Number of bytes needed to hold nr bits, in whole longs. */
#define FDS_BYTES(nr)    (FDS_LONGS(nr)*sizeof(long))

core_sys_select() 会预先在栈空间上分配 SELECT_STACK_ALLOC（为 256）字节的空间（使用的是 long 数组，long 为 8 字节时共 32 个元素）。这块空间要平分给 6 个位图，每个位图最多 5 个 long（40 字节），因此 n 最大为 320（即 fd 范围为 0～319）。如果 n 大于 320，则栈空间不能保存 user 传入的 fd_set，就需要在堆空间上申请内存，预分配的栈空间不再使用。

然后调用 do_select() 轮询处理 fd_set_bits 中 fd 是否可读或者可写。

最后将结果从 kernel 拷贝到 user 空间。

/// fs/select.c
/*
 * Core of select(): copy the three user fd_sets into kernel bitmaps, run
 * do_select() on them, and copy the three result bitmaps back.
 *
 * n        - number of descriptors to examine (clamped to the fd table's max_fds)
 * inp/outp/exp - user-space read/write/exception sets
 * end_time - timeout passed through to do_select()
 *
 * Returns the do_select() result, or a negative errno: -EINVAL for n < 0,
 * -ENOMEM when the bitmap buffer cannot be obtained, -EFAULT when a result
 * cannot be copied out, -ERESTARTNOHAND when nothing is ready and a signal
 * is pending.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timespec64 *end_time)
{
    fd_set_bits fds;
    void *bits;
    int ret, max_fds;
    size_t size, alloc_size;
    struct fdtable *fdt;
    /* Allocate small arguments on the stack to save memory and be faster */
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fds can increase, so grab it once to avoid race */
    rcu_read_lock();
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds; // max_fds of the current process's fd table; n is clamped to it below
    rcu_read_unlock();
    if (n > max_fds)
        n = max_fds;

    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words.
     */
    size = FDS_BYTES(n);
    bits = stack_fds;
    if (size > sizeof(stack_fds) / 6) { // with 8-byte longs the stack buffer fits n up to 320
        /* Not enough space in on-stack array; must use kmalloc */
        ret = -ENOMEM;
        if (size > (SIZE_MAX / 6))
            goto out_nofds;

        alloc_size = 6 * size;
        bits = kvmalloc(alloc_size, GFP_KERNEL); // dynamically allocated instead of the stack buffer
        if (!bits)
            goto out_nofds;
    } // below: carve the buffer into the six bitmaps of fd_set_bits
    fds.in      = bits;
    fds.out     = bits +   size;
    fds.ex      = bits + 2*size;
    fds.res_in  = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex  = bits + 5*size;
    // copy the user's fd_sets into the kernel bitmaps
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in); // clear the three result bitmaps
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, end_time); // the actual polling happens here

    if (ret < 0)
        goto out;
    /* Nothing ready: report -ERESTARTNOHAND if a signal is pending, else 0. */
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }
    // copy the result bitmaps back to the user's fd_sets
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    /* Only free the buffer if it did not come from the stack array. */
    if (bits != stack_fds)
        kvfree(bits);
out_nofds:
    return ret;
}

2.2、do_select()

do_select() 用轮询的方式检测被监听描述符的状态是否满足条件，若满足条件，则在 fd_set_bits 对应的结果位图（res_in/res_out/res_ex）中标记该描述符。

虽然该轮询的机制是死循环，但是不是一直轮询，当内核轮询一遍文件描述符没有发现任何事件就绪时，会调用 poll_schedule_timeout() 函数将自己睡眠，等待相应的文件或定时器来唤醒自己，然后再继续循环体看看哪些文件已经就绪，以此减少对 CPU 的占用。

/// fs/select.c
/*
 * Poll every descriptor selected in fds->in/out/ex and record the ready
 * ones in fds->res_in/res_out/res_ex.
 *
 * n        - number of descriptors to examine (narrowed to max_select_fd())
 * fds      - the six kernel bitmaps prepared by the caller
 * end_time - deadline, or NULL for no timeout; a zero deadline means
 *            "poll once and return"
 *
 * Returns the number of ready bits recorded, 0 when nothing is ready on
 * timeout or a pending signal, or a negative error (from max_select_fd()
 * or table.error).
 *
 * The loop scans all descriptors once, then sleeps in
 * poll_schedule_timeout() until woken or timed out, and scans again.
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
    ktime_t expire, *to = NULL;
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i, timed_out = 0;
    u64 slack = 0;
    __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
    unsigned long busy_start = 0;

    rcu_read_lock();
    retval = max_select_fd(n, fds); // highest descriptor actually being watched
    rcu_read_unlock();

    if (retval < 0) // error, e.g. presumably a descriptor in the sets is no longer open
        return retval;
    n = retval;

    poll_initwait(&table);
    wait = &table.pt;
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        wait->_qproc = NULL; // zero timeout: we never sleep, so no wait-queue registration
        timed_out = 1;
    }
    // slack estimated from the deadline; handed to poll_schedule_timeout() below
    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    retval = 0;
    for (;;) { // main loop: one full scan per iteration
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
        bool can_busy_loop = false;

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
        // one unsigned long word (BITS_PER_LONG descriptors) per iteration
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            __poll_t mask;
            // the word handled in this iteration
            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) { // nothing requested in this word, skip it
                i += BITS_PER_LONG;
                continue;
            }
            // otherwise test the word bit by bit
            for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                struct fd f;
                if (i >= n)
                    break;
                if (!(bit & all_bits)) // this descriptor is not watched
                    continue;
                mask = EPOLLNVAL;
                f = fdget(i);
                if (f.file) {
                    wait_key_set(wait, in, out, bit,
                             busy_flag);
                    mask = vfs_poll(f.file, wait); // ask the file for its current readiness

                    fdput(f);
                }
                if ((mask & POLLIN_SET) && (in & bit)) {
                    res_in |= bit;
                    retval++;
                    wait->_qproc = NULL; // already have a result, stop registering waiters
                }
                if ((mask & POLLOUT_SET) && (out & bit)) {
                    res_out |= bit;
                    retval++;
                    wait->_qproc = NULL; // already have a result, stop registering waiters
                }
                if ((mask & POLLEX_SET) && (ex & bit)) {
                    res_ex |= bit;
                    retval++;
                    wait->_qproc = NULL; // already have a result, stop registering waiters
                }
                /* got something, stop busy polling */
                if (retval) { // something is ready: return once this scan finishes
                    can_busy_loop = false;
                    busy_flag = 0;

                /*
                 * only remember a returned
                 * POLL_BUSY_LOOP if we asked for it
                 */
                } else if (busy_flag & mask)
                    can_busy_loop = true;

            } // end of the per-bit loop; store this word's results
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
            cond_resched(); // give the scheduler a chance to run something else
        } // one full scan over all descriptors is done
        wait->_qproc = NULL;
        if (retval || timed_out || signal_pending(current))
            break; // ready events, timeout or pending signal: leave the main loop
        if (table.error) {
            retval = table.error;
            break;
        }

        /* only if found POLL_BUSY_LOOP sockets && not out of time */
        if (can_busy_loop && !need_resched()) { // busy-poll instead of sleeping
            if (!busy_start) {
                busy_start = busy_loop_current_time();
                continue;
            }
            if (!busy_loop_timeout(busy_start))
                continue;
        }
        busy_flag = 0;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec64_to_ktime(*end_time);
            to = &expire;
        }
        // sleep in TASK_INTERRUPTIBLE; a zero return means the timeout expired
        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                       to, slack))
            timed_out = 1;
    }

    poll_freewait(&table);

    return retval;
}

2.3、poll_wqueues

poll_wqueues 是为了实现 select/poll 而设计的。poll_wqueues 用于管理 select() 调用时插入到文件等待队列上的所有 wait_queue_entry_t 对象。

poll_wqueues 会预先在栈空间申请 N_INLINE_POLL_ENTRIES 个 poll_table_entry 对象，inline_index 表示 inline_entries 数组索引。如果栈空间使用完，则在堆空间申请一个 page，当作 poll_table_page。

/// include/linux/poll.h
/*
 * Argument handed to a file's poll method: the queueing callback and the
 * event mask of interest.
 */
typedef struct poll_table_struct {
    poll_queue_proc _qproc; /* callback that registers a waiter; may be NULL */
    __poll_t _key;          /* events the caller is interested in */
} poll_table;

/* One waiter that select/poll has queued on a file's wait queue. */
struct poll_table_entry {
    struct file *filp;               /* file being waited on (reference taken in __pollwait) */
    __poll_t key;                    /* events this waiter cares about */
    wait_queue_entry_t wait;         /* the entry linked into the wait queue */
    wait_queue_head_t *wait_address; /* the wait queue it was added to */
};

/*
 * Structures and helpers for select/poll syscall
 *
 * poll_wqueues tracks every poll_table_entry created during one
 * select()/poll() call: first the inline array, then extra pages.
 */
struct poll_wqueues {
    poll_table pt;                    /* passed to the files' poll methods */
    struct poll_table_page *table;    /* head of the list of extra entry pages */
    struct task_struct *polling_task; /* task to wake up */
    int triggered;                    /* set to 1 by __pollwake() */
    int error;                        /* e.g. -ENOMEM from poll_get_entry() */
    int inline_index;                 /* next free slot in inline_entries */
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

/// fs/select.c
/* One extra page of poll_table_entry slots, chained into a list. */
struct poll_table_page {
    struct poll_table_page * next;     /* previously allocated page */
    struct poll_table_entry * entry;   /* next unused slot in entries[] */
    struct poll_table_entry entries[]; /* slots filling the rest of the page */
};

poll_get_entry() 函数可以清晰地看到 poll_table_entry 布局，

/// fs/select.c
/*
 * Hand out the next unused poll_table_entry of a poll_wqueues.
 *
 * Slots come from the inline array first; after that from the head page of
 * p->table, allocating and linking a new page when there is none or it is
 * full. Returns NULL and sets p->error to -ENOMEM if no page can be
 * obtained.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
    struct poll_table_page *page = p->table;
    struct poll_table_page *fresh;

    /* Inline slots embedded in the poll_wqueues are consumed first. */
    if (p->inline_index < N_INLINE_POLL_ENTRIES) {
        struct poll_table_entry *slot = &p->inline_entries[p->inline_index];

        p->inline_index++;
        return slot;
    }

    /* The current head page still has room: use its next entry. */
    if (page && !POLL_TABLE_FULL(page))
        return page->entry++;

    /* No page yet, or the head page is full: get a new one. */
    fresh = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
    if (!fresh) {
        p->error = -ENOMEM;
        return NULL;
    }
    fresh->entry = fresh->entries;
    fresh->next = page; /* new page becomes the list head */
    p->table = fresh;

    return fresh->entry++;
}

poll_initwait() 函数初始化 poll_wqueues 对象

1）将 _qproc 赋值为 __pollwait 函数
2）polling_task 指向当前进程。

/// fs/select.c
/*
 * Prepare a poll_wqueues for a new select()/poll() call: empty entry
 * bookkeeping, the current task as the one to wake, and __pollwait() as
 * the queueing callback.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
    /* Entry bookkeeping starts out empty and error-free. */
    pwq->table = NULL;
    pwq->inline_index = 0;
    pwq->error = 0;
    pwq->triggered = 0;

    pwq->polling_task = current;
    init_poll_funcptr(&pwq->pt, __pollwait);
}

/// include/linux/poll.h
/* Initialise a poll_table: all events enabled, qproc as the queueing callback. */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->_key   = ~(__poll_t)0; /* every event bit set */
    pt->_qproc = qproc;
}

我们知道，vfs_poll() 函数内部，将调用 _qproc 指向的函数，也就是 __pollwait 函数。

__pollwait 函数逻辑如下：

1）设置唤醒函数为 pollwake
2）将 wait 插入到文件等待队列上；

/// fs/select.c
/*
 * Queueing callback installed by poll_initwait(): takes a poll_table_entry
 * from the owning poll_wqueues, fills it in and adds it to wait_address
 * with pollwake() as the wake function.
 *
 * Returns silently when no entry is available (poll_get_entry() has
 * recorded the error in the poll_wqueues).
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                poll_table *p)
{
    /* p is the 'pt' member embedded in a poll_wqueues. */
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    entry->filp = get_file(filp);
    entry->wait_address = wait_address;
    entry->key = p->_key;
    init_waitqueue_func_entry(&entry->wait, pollwake);
    entry->wait.private = pwq;
    /* The entry is fully set up before it becomes visible on the queue. */
    add_wait_queue(wait_address, &entry->wait);
}

当等待事件就绪时，调用 pollwake 函数。pollwake() 首先对 key 做检查，确认等待事件发生，调用 __pollwake() 函数。

/// fs/select.c
/*
 * Wake function of select/poll waiters. A wakeup that carries a key is
 * forwarded to __pollwake() only if the key overlaps the events the entry
 * waits for; a wakeup without a key is always forwarded. Returns 0 when
 * the wakeup is filtered out.
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *pte =
        container_of(wait, struct poll_table_entry, wait);

    if (!key || (key_to_poll(key) & pte->key))
        return __pollwake(wait, mode, sync, key);

    return 0;
}

__pollwake() 函数调用 default_wake_function() 函数，将进程切换为 Running 状态。

/// fs/select.c
/*
 * Mark the owning poll_wqueues as triggered and wake its polling task.
 * The wake-up goes through a temporary wait-queue entry that points at
 * pwq->polling_task; returns the result of default_wake_function().
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /*
     * Although this function is called under waitqueue lock, LOCK
     * doesn't imply write barrier and the users expect write
     * barrier semantics on wakeup functions.  The following
     * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
     * and is paired with smp_store_mb() in poll_schedule_timeout.
     */
    smp_wmb();
    pwq->triggered = 1;

    /*
     * Perform the default wake up operation using a dummy
     * waitqueue.
     *
     * TODO: This is hacky but there currently is no interface to
     * pass in @sync.  @sync is scheduled to be removed and once
     * that happens, wake_up_process() can be used directly.
     */
    return default_wake_function(&dummy_wait, mode, sync, key);
}

2.4、select 优缺点

优点：跨平台
缺点
- 单个进程能够监视的文件描述符的数量存在最大限制，通常是 1024。当然可以更改数量，但由于 select(2) 采用轮询的方式扫描文件描述符，文件描述符数量越多，性能越差
- 每次调用 select(2)，都需要把 fd_set 对象从用户空间拷贝到内核空间，在返回时，从内核空间拷贝到用户空间
- select(2) 返回的是含有整个监视的文件描述符，应用程序需要遍历整个数组才能发现哪些句柄发生了事件
- 会（清空并）修改传入的 fd_set 对象（地址传递），把它同时当作返回结果的存储空间。所以应用程序每次调用前都需要重新拷贝一份，传入副本，以免自己维护的 fd_set 被污染

3、poll()

poll(2) 和 select(2) 类似，没有本质差别，管理多个描述符也是进行轮询，根据描述符的状态进行处理。但是 poll(2) 由用户传入 pollfd 数组，内核中用 poll_list 链表分段管理，不受 FD_SETSIZE（1024）的限制（nfds 不能超过 RLIMIT_NOFILE），并且传入的 fds 在 poll(2) 函数返回后不会清空，就绪事件记录在 revents 成员中。

#include <poll.h>
int poll(struct pollfd *fds, nfds_t nfds, int timeout);

/// include/uapi/asm-generic/poll.h
/* One descriptor monitored by poll(). */
struct pollfd {
    int fd;        /* descriptor to watch */
    short events;  /* events requested by the caller */
    short revents; /* ready events, filled in when poll() returns */
};

3.1、do_sys_poll

当调用 poll() 系统调用后，kernel 处理逻辑如下：

1）转换超时时间，将其转换为绝对时间（纳秒级）
2）调用 do_sys_poll() 函数处理
3）拷贝剩余时间

/// fs/select.c
/*
 * poll() syscall entry point.
 *
 * A non-negative timeout_msecs is converted to an end time; a negative one
 * leaves 'to' NULL (no timeout). If do_sys_poll() returns -ERESTARTNOHAND,
 * the arguments and deadline are saved in the task's restart block so the
 * call can be resumed through do_restart_poll.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        int, timeout_msecs)
{
    struct timespec64 end_time, *to = NULL;
    int ret;

    if (timeout_msecs >= 0) { // convert the millisecond timeout to seconds + nanoseconds
        to = &end_time;
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, to); // do the polling

    if (ret == -ERESTARTNOHAND) {
        struct restart_block *restart_block;

        /* Remember what is needed to restart the call later. */
        restart_block = &current->restart_block;
        restart_block->poll.ufds = ufds;
        restart_block->poll.nfds = nfds;

        if (timeout_msecs >= 0) {
            restart_block->poll.tv_sec = end_time.tv_sec;
            restart_block->poll.tv_nsec = end_time.tv_nsec;
            restart_block->poll.has_timeout = 1;
        } else
            restart_block->poll.has_timeout = 0;

        ret = set_restart_fn(restart_block, do_restart_poll);
    }
    return ret;
}

和 select(2) 一样，poll(2) 也会预先在栈空间申请大小为 POLL_STACK_ALLOC 的内存，栈空间可以处理 30 个文件描述符。不过和 select(2) 不同的是，即使栈空间太小，要从堆上申请内存，预先分配的栈空间也是被使用的。

/// fs/select.c
/*
 * Core of poll(): copy the user's pollfd array into a chain of poll_list
 * chunks, run do_poll() on the chain, then write each revents field back.
 *
 * The first chunk lives in the on-stack buffer; further chunks are
 * kmalloc'ed and freed again before returning.
 *
 * Returns the do_poll() result, or -EINVAL when nfds exceeds
 * RLIMIT_NOFILE, -EFAULT on a user-copy failure, -ENOMEM when a chunk
 * cannot be allocated.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
        struct timespec64 *end_time)
{
    struct poll_wqueues table;
    int err = -EFAULT, fdcount, len;
    /* Allocate small arguments on the stack to save memory and be
       faster - use long to make sure the buffer is aligned properly
       on 64 bit archs to avoid unaligned access */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; // on-stack chunk of POLL_STACK_ALLOC bytes
    struct poll_list *const head = (struct poll_list *)stack_pps;
     struct poll_list *walk = head;
     unsigned long todo = nfds;

    if (nfds > rlimit(RLIMIT_NOFILE))
        return -EINVAL;

    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len)
            break;
        // copy this chunk of pollfds from user space into the kernel
        if (copy_from_user(walk->entries, ufds + nfds-todo,
                    sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;
        // each further chunk holds at most POLLFD_PER_PAGE entries
        len = min(todo, POLLFD_PER_PAGE);
        walk = walk->next = kmalloc(struct_size(walk, entries, len),
                        GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }

    poll_initwait(&table);
    fdcount = do_poll(head, &table, end_time);
    poll_freewait(&table);

    if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
        goto out_fds;
    // copy the revents results back to user space
    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        for (j = walk->len; j; fds++, ufds++, j--)
            unsafe_put_user(fds->revents, &ufds->revents, Efault);
      }
    user_write_access_end();

    err = fdcount;
out_fds:
    /* head itself is on the stack; only the chunks after it are freed. */
    walk = head->next;
    while (walk) {
        struct poll_list *pos = walk;
        walk = walk->next;
        kfree(pos);
    }

    return err;

Efault:
    /* Reached from unsafe_put_user(): close the access window, then clean up. */
    user_write_access_end();
    err = -EFAULT;
    goto out_fds;
}

3.2、do_poll()

和 do_select() 的原理一样，也是通过轮询的方法查看每个 fd 的状态。

/// fs/select.c
/*
 * Poll every pollfd in the chunk list until at least one reports an event,
 * the timeout expires, an error is recorded in wait->error, or a signal is
 * pending.
 *
 * Returns the number of pollfds for which do_pollfd() reported an event;
 * if none, wait->error, or -ERESTARTNOHAND when a signal is pending
 * (0 on plain timeout).
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
           struct timespec64 *end_time)
{
    poll_table* pt = &wait->pt;
    ktime_t expire, *to = NULL;
    int timed_out = 0, count = 0;
    u64 slack = 0;
    __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
    unsigned long busy_start = 0;

    /* Optimise the no-wait case */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        pt->_qproc = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    for (;;) { // main loop: one full scan per iteration
        struct poll_list *walk;
        bool can_busy_loop = false;
        // visit every chunk of the list
        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill poll_table->_qproc, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt, &can_busy_loop,
                          busy_flag)) {
                    count++;
                    pt->_qproc = NULL;
                    /* found something, stop busy polling */
                    busy_flag = 0;
                    can_busy_loop = false;
                }
            }
        }
        /*
         * All waiters have already been registered, so don't provide
         * a poll_table->_qproc to them on the next loop iteration.
         */
        pt->_qproc = NULL;
        if (!count) {
            count = wait->error;
            if (signal_pending(current))
                count = -ERESTARTNOHAND;
        }
        if (count || timed_out)
            break;

        /* only if found POLL_BUSY_LOOP sockets && not out of time */
        if (can_busy_loop && !need_resched()) {
            if (!busy_start) {
                busy_start = busy_loop_current_time();
                continue;
            }
            if (!busy_loop_timeout(busy_start))
                continue;
        }
        busy_flag = 0;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec64_to_ktime(*end_time);
            to = &expire;
        }
        // sleep until woken; a zero return means the timeout expired
        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }
    return count;
}

3.3、poll 优缺点

优点
- select(2) 会修改传入的 fd_set 参数，把它当作返回的空间存储返回的数据，而 poll(2) 不会，返回数据和传入的数据不互相干扰；
- poll(2) 由用户传入 pollfd 数组，内核中使用 poll_list 链表分段保存，不受 FD_SETSIZE（1024）的限制（上限为 RLIMIT_NOFILE）；
缺点
- 每次调用 poll(2)，都需要把 pollfd 链表从用户空间拷贝到内核空间，在返回时，将返回数据从内核空间拷贝到用户空间
- poll(2) 返回的是含有整个 pollfd 链表，应用程序需要遍历整个链表才能发现哪些句柄发生了事件

4、epoll()

相对于 select(2) 来说，epoll(7) 没有描述符个数限制。调用 epoll_ctl(2) 注册事件的时候将相关数据拷入内核，以后调用 epoll_wait(2) 不会像 select(2) 或 poll(2) 那样，每次都从用户空间拷贝数据到内核空间。

并且与 select(2) 或 poll(2) 返回所有事件不同的是，epoll(7) 返回的是处于就绪的事件的列表。

此外 epoll(7) 是基于事件驱动的，在添加事件时会建立回调关系，也就是说，当相应的事件发生时会调用这个回调方法，它会将发生的事件添加到就绪链表中。

/// @file sys/epoll.h
/* User data attached to an epoll_event; the members share one storage. */
typedef union epoll_data
{
  void *ptr;
  int fd;
  uint32_t u32;
  uint64_t u64;
} epoll_data_t;

/* Event description passed to epoll_ctl() and filled in by epoll_wait(). */
struct epoll_event
{
  uint32_t events;    /* Epoll events */
  epoll_data_t data;    /* User data variable */
} __EPOLL_PACKED;

#include <sys/epoll.h>
int epoll_create(int size);
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
int epoll_wait(int epfd, struct epoll_event *events, 
               int maxevents, int timeout);

4.1、eventpoll

epoll(7) 的原理和 select(2) 和 poll(2) 都不一样。epoll(7) 在内核中会保存需要监视事件（文件和期望的就绪状态），这时通过 epoll_ctl(2) 来完成监听事件的添加、修改和移除。所以内核中需要特定的数据结构来保存需要监视的事件。

首先内核用 eventpoll 结构体来管理所有的监视事件：

1）所有监视事件用红黑树串联起来
2）一切皆文件，创建 eventpoll 对象会绑定一个匿名文件的 file 对象。我们可以像操作文件一样操作 eventpoll 对象。

eventpoll 结构几个成员简单备注一下

1）wq：等待队列，等待 epoll_wait() 返回的进程；
2）poll_wait：epoll 文件的等待队列。epfd 也可以注册到其他 epoll 中；
3）rdllist：就绪队列，epoll_wait() 将 rdllist 链表上就绪事件拷贝到用户空间返回；
4）rbr：红黑树根节点，所有 epitem 都在一棵红黑树上；
5）ovflist：当 rdllist 正在使用时，就绪的事件添加到 ovflist 链表上；

/// fs/eventpoll.c
/*
 * Per-epoll-instance state: the set of monitored items (red-black tree),
 * the ready list with its overflow list, and the wait queues used to wake
 * waiters.
 */
struct eventpoll {
    /*
     * This mutex is used to ensure that files are not removed
     * while epoll is using them. This is held during the event
     * collection loop, the file cleanup path, the epoll file exit
     * code and the ctl operations.
     */
    struct mutex mtx;

    /* Wait queue used by sys_epoll_wait() */
    wait_queue_head_t wq;

    /* Wait queue used by file->poll() */
    wait_queue_head_t poll_wait;

    /* List of ready file descriptors */
    struct list_head rdllist;

    /* Lock which protects rdllist and ovflist */
    rwlock_t lock;

    /* RB tree root used to store monitored fd structs */
    struct rb_root_cached rbr;

    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock.
     */
    struct epitem *ovflist;

    /* wakeup_source used when ep_scan_ready_list is running */
    struct wakeup_source *ws;

    /* The user that created the eventpoll descriptor */
    struct user_struct *user;

    /* The anonymous file bound to this eventpoll (set by do_epoll_create()) */
    struct file *file;

    /* used to optimize loop detection check */
    u64 gen;

#ifdef CONFIG_NET_RX_BUSY_POLL
    /* used to track busy poll napi_id */
    unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
    /* tracks wakeup nests for lockdep validation */
    u8 nests;
#endif
};

ep_alloc() 用于申请一个 eventpoll 对象，ovflist 被初始化为 EP_UNACTIVE_PTR，当 rdllist 在使用时，ovflist 被修改为 NULL，开始暂时接受就绪事件。

/// fs/eventpoll.c
/*
 * Allocate and initialise an eventpoll object and store it in *pep.
 *
 * The object is zero-allocated, then its locks, wait queues, ready list
 * and tree root are set up; ovflist starts as EP_UNACTIVE_PTR. Returns 0
 * on success, or -ENOMEM after dropping the user reference taken with
 * get_current_user().
 */
static int ep_alloc(struct eventpoll **pep)
{
    struct user_struct *owner = get_current_user();
    struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

    if (unlikely(!ep)) {
        /* Nothing else to undo: release the user reference. */
        free_uid(owner);
        return -ENOMEM;
    }

    /* Locks. */
    mutex_init(&ep->mtx);
    rwlock_init(&ep->lock);

    /* Wait queues. */
    init_waitqueue_head(&ep->wq);
    init_waitqueue_head(&ep->poll_wait);

    /* Containers start out empty. */
    INIT_LIST_HEAD(&ep->rdllist);
    ep->rbr = RB_ROOT_CACHED;
    ep->ovflist = EP_UNACTIVE_PTR;

    ep->user = owner;
    *pep = ep;

    return 0;
}

向 epoll 注册的 fd，epoll 都使用 epitem 表示。epoll 中所有的 epitem 用红黑树串联起来。

/// fs/eventpoll.c
/*
 * One file descriptor registered with an eventpoll. Each item is linked
 * into the owning eventpoll's red-black tree.
 */
struct epitem {
    union {
        /* RB tree node links this structure to the eventpoll RB tree */
        struct rb_node rbn;
        /* Used to free the struct epitem */
        struct rcu_head rcu;
    };

    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink;

    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;

    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;

    /* Number of active wait queue attached to poll operations */
    int nwait;

    /* List containing poll wait queues */
    struct list_head pwqlist;

    /* The "container" of this item */
    struct eventpoll *ep;

    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;

    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source __rcu *ws;

    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;
};

4.2、epoll_create()

可以看到 size 参数没有任何作用，只要大于 0 就行。

/// fs/eventpoll.c
/*
 * epoll_create() syscall entry point. size is only validated (must be
 * positive, otherwise -EINVAL); it is not forwarded to do_epoll_create().
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
    return (size > 0) ? do_epoll_create(0) : -EINVAL;
}

do_epoll_create() 函数做三件事：

1）调用 ep_alloc() 申请一个 eventpoll 对象；
2）调用 get_unused_fd_flags() 获取一个 fd；
3）调用 anon_inode_getfile() 从匿名文件系统申请一个 file 对象，filp->private_data 指向 eventpoll 对象；另外文件系统 file_operations 是 eventpoll_fops；

/// fs/eventpoll.c
/*
 * Open an eventpoll file descriptor.
 *
 * Allocates the eventpoll object, reserves a descriptor, creates the
 * anonymous "[eventpoll]" file carrying the object as private data, and
 * installs the file under the descriptor.
 *
 * Returns the new descriptor, or a negative errno (-EINVAL for flags other
 * than EPOLL_CLOEXEC, otherwise the error of the failing step).
 */
static int do_epoll_create(int flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;

    /* Check the EPOLL_* constant for consistency.  */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;
    /*
     * Create the internal data structure ("struct eventpoll").
     */
    error = ep_alloc(&ep); // allocate the eventpoll object
    if (error < 0)
        return error;
    /*
     * Creates all the items needed to setup an eventpoll file. That is,
     * a file structure and a free file descriptor.
     */
    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                 O_RDWR | (flags & O_CLOEXEC)); // anonymous-inode file
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    ep->file = file;
    fd_install(fd, file);
    return fd;

    /* Error unwinding, in reverse order of acquisition. */
out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}

anon_inode_getfile() 函数不会申请 inode，而是使用匿名文件统一的 inode。然后调用 alloc_file_pseudo() 申请一个 file 对象，private_data 指向 eventpoll 对象。

/// fs/anon_inodes.c
/*
 * Create a file on the shared anonymous inode.
 *
 * name  - name given to the new file
 * fops  - file operations for the new file
 * priv  - stored in the file's private_data
 * flags - only the O_ACCMODE and O_NONBLOCK bits are passed on
 *
 * Returns the new file, or an ERR_PTR: -ENODEV when the anonymous inode is
 * unavailable, -ENOENT when the fops owner module cannot be pinned, or the
 * error from alloc_file_pseudo().
 */
struct file *anon_inode_getfile(const char *name,
                const struct file_operations *fops,
                void *priv, int flags)
{
    struct file *filp;

    if (IS_ERR(anon_inode_inode))
        return ERR_PTR(-ENODEV);

    if (fops->owner && !try_module_get(fops->owner))
        return ERR_PTR(-ENOENT);

    /*
     * We know the anon_inode inode count is always greater than zero,
     * so ihold() is safe.
     */
    ihold(anon_inode_inode);
    filp = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name,
                 flags & (O_ACCMODE | O_NONBLOCK), fops);
    if (IS_ERR(filp)) {
        /* Drop the inode and module references taken above. */
        iput(anon_inode_inode);
        module_put(fops->owner);
        return filp;
    }

    filp->f_mapping = anon_inode_inode->i_mapping;
    filp->private_data = priv;

    return filp;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);

eventpoll_fops.poll 指向的是 ep_eventpoll_poll 函数。

/// fs/eventpoll.c
/*
 * File operations of epoll files. is_file_epoll() identifies an epoll file
 * by comparing its f_op against this table.
 */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
    .show_fdinfo    = ep_show_fdinfo,
#endif
    .release    = ep_eventpoll_release,
    .poll        = ep_eventpoll_poll,
    .llseek        = noop_llseek,
};

is_file_epoll() 可以判断 file 是否为 epoll 类型的 file，依据是 f_op 是否指向 eventpoll_fops。

/// fs/eventpoll.c
/* Returns 1 if f is an epoll file (its f_op is eventpoll_fops), else 0. */
static inline int is_file_epoll(struct file *f)
{
    if (f->f_op == &eventpoll_fops)
        return 1;

    return 0;
}

epoll_create(2) 系统调用完成了 fd、file 和 eventpoll 三个对象之间的关联，并将 fd 返回给用户态应用程序。每一个 epoll fd 都会对应一个 eventpoll 对象，用户通过该 fd 可以将需要监视的目标事件添加到 eventpoll 中。

4.3、ep_item_poll()

epoll 和 select/poll 相同，也会调用 vfs_poll() 获取文件状态。对于 epfd 文件，也可以被注册到其他 epoll 中，但是 epoll 文件需要特殊处理，直接调用 poll_wait() 函数。

/// fs/eventpoll.c
/*
 * Query the readiness of one item, masked by the events it registered for.
 *
 * Ordinary files are asked through vfs_poll(). When the item is itself an
 * epoll file, the caller is queued on that epoll's poll_wait queue and its
 * ready list is scanned with ep_read_events_proc().
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
                 int depth)
{
    struct eventpoll *ep;
    bool locked;

    pt->_key = epi->event.events; // events this item waits for
    if (!is_file_epoll(epi->ffd.file)) // not an epoll file: go through vfs_poll()
        return vfs_poll(epi->ffd.file, pt) & epi->event.events;
    // epoll file: register via poll_wait() and scan its ready list
    ep = epi->ffd.file->private_data;
    poll_wait(epi->ffd.file, &ep->poll_wait, pt);
    /* Passed as ep_locked, which makes ep_scan_ready_list() skip ep->mtx. */
    locked = pt && (pt->_qproc == ep_ptable_queue_proc);

    return ep_scan_ready_list(epi->ffd.file->private_data,
                  ep_read_events_proc, &depth, depth,
                  locked) & epi->event.events;
}

pt->_qproc 只有在 ep_insert() 函数中被赋值，指向 ep_ptable_queue_proc() 函数，其他时候调用时，其为 NULL。当 _qproc 为 NULL 时，poll_wait() 什么都不做。

4.3.1、ep_scan_ready_list()

ep_scan_ready_list() 遍历就绪链表 rdllist。

1）处理 rdllist 就绪链表。为了减少对 ep->lock 锁的占用，ep_scan_ready_list() 会先将 rdllist 链表替换出来，然后将 ovflist 赋值为 NULL。后续就绪的事件会添加到 ovflist 链表上。
2）处理 ovflist 链表。当 rdllist 链表处理完，接着处理 ovflist 上的就绪链表。处理 ovflist 链表时，lock 锁一直被占用。
3）将处理后的就绪链表，再次替换到 rdllist 链表。此时 rdllist 链表上都是就绪的。

/// fs/eventpoll.c
/*
 * Run 'sproc' over the current ready list of 'ep'.
 *
 * The ready list is moved to a private list so sproc can walk it without
 * holding ep->lock; events arriving meanwhile collect on ep->ovflist and
 * are merged back afterwards, together with whatever sproc left on the
 * private list.
 *
 * ep        - the eventpoll to scan
 * sproc     - callback invoked with the private list and 'priv'
 * priv      - opaque argument for sproc
 * depth     - nesting level used for mutex_lock_nested()
 * ep_locked - true if the caller already holds ep->mtx
 *
 * Returns whatever sproc returned.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
                  __poll_t (*sproc)(struct eventpoll *,
                       struct list_head *, void *),
                  void *priv, int depth, bool ep_locked)
{
    __poll_t res;
    struct epitem *epi, *nepi;
    LIST_HEAD(txlist);

    lockdep_assert_irqs_enabled();

    /*
     * We need to lock this because we could be hit by
     * eventpoll_release_file() and epoll_ctl().
     */

    if (!ep_locked)
        mutex_lock_nested(&ep->mtx, depth);

    /*
     * Steal the ready list, and re-init the original one to the
     * empty list. Also, set ep->ovflist to NULL so that events
     * happening while looping w/out locks, are not lost. We cannot
     * have the poll callback to queue directly on ep->rdllist,
     * because we want the "sproc" callback to be able to do it
     * in a lockless way.
     */
    write_lock_irq(&ep->lock);
    list_splice_init(&ep->rdllist, &txlist);
    WRITE_ONCE(ep->ovflist, NULL);
    write_unlock_irq(&ep->lock);

    /*
     * Now call the callback function.
     */
    res = (*sproc)(ep, &txlist, priv);

    write_lock_irq(&ep->lock);
    /*
     * During the time we spent inside the "sproc" callback, some
     * other events might have been queued by the poll callback.
     * We re-insert them inside the main ready-list here.
     */
    for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
         nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
        /*
         * We need to check if the item is already in the list.
         * During the "sproc" callback execution time, items are
         * queued into ->ovflist but the "txlist" might already
         * contain them, and the list_splice() below takes care of them.
         */
        if (!ep_is_linked(epi)) {
            /*
             * ->ovflist is LIFO, so we have to reverse it in order
             * to keep in FIFO.
             */
            list_add(&epi->rdllink, &ep->rdllist); // add to rdllist
            ep_pm_stay_awake(epi);
        }
    }
    /*
     * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
     * releasing the lock, events will be queued in the normal way inside
     * ep->rdllist.
     */
    WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

    /*
     * Quickly re-inject items left on "txlist".
     */
    list_splice(&txlist, &ep->rdllist); // txlist goes back onto rdllist
    __pm_relax(ep->ws);

    if (!list_empty(&ep->rdllist)) { // rdllist is not empty
        if (waitqueue_active(&ep->wq))
            wake_up(&ep->wq); // wake up tasks waiting on ep->wq (used by epoll_wait)
    }

    write_unlock_irq(&ep->lock);

    if (!ep_locked)
        mutex_unlock(&ep->mtx);

    return res;
}

在调用 ep_scan_ready_list() 函数时，传入了 ep_read_events_proc() 函数。该函数遍历传入的链表，都调用 ep_item_poll() 处理，如果有一个文件就绪，就返回。没有就绪事件的 epi，从就绪链表中删除。

/// fs/eventpoll.c
/*
 * Scan callback that reports whether any item on @head is really ready.
 *
 * @ep:   eventpoll instance being scanned (not used directly here)
 * @head: list of items to inspect, linked through epitem::rdllink
 * @priv: points to an int holding the current nesting depth
 *
 * Returns EPOLLIN | EPOLLRDNORM as soon as one item polls ready,
 * 0 when none does. Items that turn out not to be ready are unlinked
 * from the list on the way.
 */
static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
                   void *priv)
{
    struct epitem *item, *next;
    poll_table pt;
    /* Poll one level deeper than the caller. */
    int depth = *(int *)priv + 1;

    init_poll_funcptr(&pt, NULL);

    list_for_each_entry_safe(item, next, head, rdllink) {
        if (ep_item_poll(item, &pt, depth))
            return EPOLLIN | EPOLLRDNORM;

        /*
         * The poll callback dropped this item into the ready list,
         * but it has none of the events the caller asked for, so it
         * can be removed here.
         */
        __pm_relax(ep_wakeup_source(item));
        list_del_init(&item->rdllink);
    }

    return 0;
}

4.3.2、ep_ptable_queue_proc()

在调用 vfs_poll() 函数时，处理函数是 ep_ptable_queue_proc() 函数。

ep_ptable_queue_proc() 将一个 eppoll_entry 对象挂到某个文件（监视的文件）的等待队列上，并且设置就绪事件发生时，调用的函数是 ep_poll_callback() 函数。

/// fs/eventpoll.c
/*
 * Poll-table queueing callback: hooks a new eppoll_entry for the item
 * behind @pt onto the wait queue head @whead, using ep_poll_callback()
 * as its wakeup function.
 *
 * On success the entry is linked on epi->pwqlist and epi->nwait is
 * incremented. If an error was already recorded (nwait < 0) or the
 * allocation fails, epi->nwait is set to -1 to signal the error.
 * @file is not used by this function.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *entry = NULL;

    /* Do not attempt an allocation once a failure has been recorded. */
    if (epi->nwait >= 0)
        entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

    if (!entry) {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
        return;
    }

    init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
    entry->whead = whead;
    entry->base = epi;

    if (epi->event.events & EPOLLEXCLUSIVE)
        add_wait_queue_exclusive(whead, &entry->wait);
    else
        add_wait_queue(whead, &entry->wait);

    list_add_tail(&entry->llink, &epi->pwqlist);
    epi->nwait++;
}

4.3.3、ep_poll_callback()

可以看到，回调函数 ep_poll_callback() 在事件就绪后，将对应的 epitem 对象添加到就绪链表中，然后唤醒阻塞的进程。

/// fs/eventpoll.c
/*
 * Wakeup callback attached to the wait queue entries that
 * ep_ptable_queue_proc() installs on a monitored file.
 *
 * Queues the epitem owning @wait as ready (on ep->rdllist, or on
 * ep->ovflist while ovflist is active), wakes up tasks sleeping on
 * ep->wq and, after dropping ep->lock, calls ep_poll_safewake() when
 * ep->poll_wait has waiters.
 *
 * @wait: wait queue entry that was hooked on the target file
 * @mode: not used by this function
 * @sync: not used by this function
 * @key:  event bits reported by the waker, decoded with key_to_poll()
 *
 * Returns 1 for items without EPOLLEXCLUSIVE. For EPOLLEXCLUSIVE items
 * it returns 1 only when ep->wq had waiters, POLLFREE was not set, and
 * the EPOLLINOUT_BITS part of the reported flags was either 0 or
 * exactly EPOLLIN / EPOLLOUT with that bit present in the item's
 * interest mask; otherwise 0.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;
    __poll_t pollflags = key_to_poll(key);
    unsigned long flags;
    int ewake = 0;

    read_lock_irqsave(&ep->lock, flags);

    ep_set_busy_poll_napi_id(epi);

    /*
     * If the event mask does not contain any poll(2) event, we consider the
     * descriptor to be disabled. This condition is likely the effect of the
     * EPOLLONESHOT bit that disables the descriptor when an event is received,
     * until the next EPOLL_CTL_MOD will be issued.
     */
    if (!(epi->event.events & ~EP_PRIVATE_BITS)) // no poll events enabled: bail out
        goto out_unlock;

    /*
     * Check the events coming with the callback. At this stage, not
     * every device reports the events in the "key" parameter of the
     * callback. We need to be able to handle both cases here, hence the
     * test for "key" != NULL before the event match test.
     */
    if (pollflags && !(pollflags & epi->event.events))
        goto out_unlock;

    /*
     * If we are transferring events to userspace, we can hold no locks
     * (because we're accessing user memory, and because of linux f_op->poll()
     * semantics). All the events that happen during that period of time are
     * chained in ep->ovflist and requeued later on.
     */
    if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { // the ready list is being processed
        if (chain_epi_lockless(epi)) // chain the item onto ovflist
            ep_pm_stay_awake_rcu(epi);
    } else if (!ep_is_linked(epi)) { // not queued yet: add to rdllist
        /* In the usual case, add event to ready list. */
        if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
            ep_pm_stay_awake_rcu(epi);
    }

    /*
     * Wake up ( if active ) both the eventpoll wait list and the ->poll()
     * wait list.
     */
    if (waitqueue_active(&ep->wq)) { // wake tasks blocked in epoll_wait
        if ((epi->event.events & EPOLLEXCLUSIVE) &&
                    !(pollflags & POLLFREE)) {
            /* Decide the return value for an exclusive item. */
            switch (pollflags & EPOLLINOUT_BITS) {
            case EPOLLIN:
                if (epi->event.events & EPOLLIN)
                    ewake = 1;
                break;
            case EPOLLOUT:
                if (epi->event.events & EPOLLOUT)
                    ewake = 1;
                break;
            case 0:
                ewake = 1;
                break;
            }
        }
        wake_up(&ep->wq);
    }
    if (waitqueue_active(&ep->poll_wait)) // someone is polling the epoll file itself
        pwake++;

out_unlock:
    read_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(ep, epi);

    /* Items without EPOLLEXCLUSIVE always return 1. */
    if (!(epi->event.events & EPOLLEXCLUSIVE))
        ewake = 1;

    if (pollflags & POLLFREE) {
        /*
         * If we race with ep_remove_wait_queue() it can miss
         * ->whead = NULL and do another remove_wait_queue() after
         * us, so we can't use __remove_wait_queue().
         */
        list_del_init(&wait->entry);
        /*
         * ->whead != NULL protects us from the race with ep_free()
         * or ep_remove(), ep_remove_wait_queue() takes whead->lock
         * held by the caller. Once we nullify it, nothing protects
         * ep/epi or even wait.
         */
        smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
    }

    return ewake;
}

4.4、epoll_ctl()

/// fs/eventpoll.c
/*
 * epoll_ctl(2) entry point: copies the user's epoll_event into the
 * kernel when the operation carries one, then hands over to
 * do_epoll_ctl() in blocking mode (nonblock == false).
 *
 * Returns -EFAULT if the event cannot be copied from user space,
 * otherwise the result of do_epoll_ctl().
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)
{
    struct epoll_event kevent;

    /* The user pointer is read only for operations that carry an event. */
    if (ep_op_has_event(op)) {
        if (copy_from_user(&kevent, event, sizeof(kevent)))
            return -EFAULT;
    }

    return do_epoll_ctl(epfd, op, fd, &kevent, false);
}

在添加监视事件的时候，首先要保证该文件没有注册过，如果已经存在，返回 -EEXIST 错误。不过，检测是否注册不仅仅依靠文件描述符，还会比较其绑定的 file 对象的地址。添加时默认会为目标文件加上 EPOLLERR 和 EPOLLHUP 监听事件，然后调用 ep_insert() 函数，其核心工作有两个：（1）将回调函数加入到被监视文件的等待队列上；（2）将要监听的事件（epitem）插入到红黑树中。

/// fs/eventpoll.c
/*
 * Common implementation of EPOLL_CTL_ADD / EPOLL_CTL_DEL / EPOLL_CTL_MOD.
 *
 * @epfd:     descriptor that must refer to an eventpoll file
 * @op:       EPOLL_CTL_* operation
 * @fd:       target descriptor to add, remove or modify
 * @epds:     kernel copy of the caller's epoll_event
 * @nonblock: forwarded to epoll_mutex_lock()
 *
 * Returns the result of ep_insert() / ep_remove() / ep_modify(), or:
 *   -EBADF   epfd or fd is not an open descriptor
 *   -EPERM   the target file cannot be polled
 *   -EINVAL  epfd is not an epoll file, epfd and fd refer to the same
 *            file, EPOLLEXCLUSIVE is misused, MOD targets an
 *            EPOLLEXCLUSIVE item, or op matches no case
 *   -ELOOP   ep_loop_check() reported a problem
 *   -EEXIST  ADD of an already registered file
 *   -ENOENT  DEL/MOD of a file that is not registered
 * or whatever error epoll_mutex_lock() returns.
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
         bool nonblock)
{
    int error;
    int full_check = 0;
    struct fd f, tf;
    struct eventpoll *ep;
    struct epitem *epi;
    struct eventpoll *tep = NULL;

    error = -EBADF;
    f = fdget(epfd);
    if (!f.file)
        goto error_return;

    /* Get the "struct file *" for the target file */
    tf = fdget(fd);
    if (!tf.file)
        goto error_fput;

    /* The target file descriptor must support poll */
    error = -EPERM;
    if (!file_can_poll(tf.file)) // target cannot be polled, so it cannot be monitored
        goto error_tgt_fput;

    /* Check if EPOLLWAKEUP is allowed */
    if (ep_op_has_event(op))
        ep_take_care_of_epollwakeup(epds);

    /*
     * We have to check that the file structure underneath the file descriptor
     * the user passed to us _is_ an eventpoll file. And also we do not permit
     * adding an epoll file descriptor inside itself.
     */
    error = -EINVAL;
    if (f.file == tf.file || !is_file_epoll(f.file))
        goto error_tgt_fput; // an epoll file cannot monitor itself

    /*
     * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
     * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
     * Also, we do not currently supported nested exclusive wakeups.
     */
    if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
        if (op == EPOLL_CTL_MOD)
            goto error_tgt_fput; // EPOLLEXCLUSIVE can only be set at ADD time
        if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
                (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
            goto error_tgt_fput;
    }

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = f.file->private_data;

    /*
     * When we insert an epoll file descriptor, inside another epoll file
     * descriptor, there is the change of creating closed loops, which are
     * better be handled here, than in more critical paths. While we are
     * checking for loops we also determine the list of files reachable
     * and hang them on the tfile_check_list, so we can check that we
     * haven't created too many possible wakeup paths.
     *
     * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
     * the epoll file descriptor is attaching directly to a wakeup source,
     * unless the epoll file descriptor is nested. The purpose of taking the
     * 'epmutex' on add is to prevent complex toplogies such as loops and
     * deep wakeup paths from forming in parallel through multiple
     * EPOLL_CTL_ADD operations.
     */
    error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    if (error)
        goto error_tgt_fput;
    if (op == EPOLL_CTL_ADD) {
        if (!list_empty(&f.file->f_ep_links) ||
                ep->gen == loop_check_gen ||
                        is_file_epoll(tf.file)) {
            /*
             * Drop ep->mtx and take the global epmutex first; ep->mtx
             * is re-acquired below once the loop check is done.
             */
            mutex_unlock(&ep->mtx);
            error = epoll_mutex_lock(&epmutex, 0, nonblock);
            if (error)
                goto error_tgt_fput;
            loop_check_gen++;
            /* From here on the exit path must release epmutex. */
            full_check = 1;
            if (is_file_epoll(tf.file)) {
                error = -ELOOP;
                if (ep_loop_check(ep, tf.file) != 0)
                    goto error_tgt_fput;
            } else {
                get_file(tf.file);
                list_add(&tf.file->f_tfile_llink,
                            &tfile_check_list);
            }
            error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
            if (error)
                goto error_tgt_fput;
            if (is_file_epoll(tf.file)) {
                tep = tf.file->private_data;
                error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
                if (error) {
                    mutex_unlock(&ep->mtx);
                    goto error_tgt_fput;
                }
            }
        }
    }

    /*
     * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
     * above, we can be sure to be able to use the item looked up by
     * ep_find() till we release the mutex.
     */
    epi = ep_find(ep, tf.file, fd); // look the target up in the RB tree

    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) { // ADD of a file that is not registered yet: insert it
            epds->events |= EPOLLERR | EPOLLHUP;
            error = ep_insert(ep, epds, tf.file, fd, full_check);
        } else // already registered: report an error
            error = -EEXIST;
        break;
    case EPOLL_CTL_DEL: 
        if (epi) // DEL of a registered file: remove it
            error = ep_remove(ep, epi);
        else // not registered: report an error
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) { // MOD of a registered file
            /* For an EPOLLEXCLUSIVE item error stays -EINVAL. */
            if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                epds->events |= EPOLLERR | EPOLLHUP;
                error = ep_modify(ep, epi, epds); // update the existing item via ep_modify()
            }
        } else // not registered: report an error
            error = -ENOENT;
        break;
    }
    if (tep != NULL)
        mutex_unlock(&tep->mtx);
    mutex_unlock(&ep->mtx);

error_tgt_fput:
    if (full_check) {
        clear_tfile_check_list();
        loop_check_gen++;
        mutex_unlock(&epmutex);
    }

    fdput(tf);
error_fput:
    fdput(f);
error_return:

    return error;
}

4.4.1、ep_insert()

每个用户最多可以将内核低端内存的 4% 用于 epoll 监视项（上限为 max_user_watches，超过时 ep_insert() 返回 -ENOSPC）。每个监视项占用 (sizeof(struct epitem) + sizeof(struct eppoll_entry))，在 64 位系统中为 128 + 72 = 200 字节。

/// fs/eventpoll.c
/*
 * Register a new file with an eventpoll instance.
 *
 * Allocates an epitem, links it to the target file and into the RB
 * tree, installs the poll hooks through ep_ptable_queue_proc(), and
 * queues the item on the ready list right away if the file already
 * reports events.
 *
 * @ep:         eventpoll instance receiving the item
 * @event:      interest mask and user data, copied into the item
 * @tfile:      target file
 * @fd:         target file descriptor number
 * @full_check: when non-zero, reverse_path_check() is run as well
 *
 * Returns 0 on success, -ENOSPC when the user's watch count has reached
 * max_user_watches, -ENOMEM when the epitem or a wait queue entry could
 * not be allocated, -EINVAL when reverse_path_check() fails, or the
 * error returned by ep_create_wakeup_source().
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
             struct file *tfile, int fd, int full_check)
{
    int error, pwake = 0;
    __poll_t revents;
    long user_watches;
    struct epitem *epi;
    struct ep_pqueue epq;

    lockdep_assert_irqs_enabled();

    user_watches = atomic_long_read(&ep->user->epoll_watches);
    if (unlikely(user_watches >= max_user_watches))
        return -ENOSPC;
    if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) // allocate the epitem
        return -ENOMEM;

    /* Item initialization follow here ... */
    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    ep_set_ffd(&epi->ffd, tfile, fd); // record the monitored file
    epi->event = *event; // record the monitored events
    epi->nwait = 0;
    epi->next = EP_UNACTIVE_PTR;
    if (epi->event.events & EPOLLWAKEUP) {
        error = ep_create_wakeup_source(epi);
        if (error)
            goto error_create_wakeup_source;
    } else {
        RCU_INIT_POINTER(epi->ws, NULL);
    }

    /* Add the current item to the list of active epoll hook for this file */
    spin_lock(&tfile->f_lock);
    list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_lock);

    /*
     * Add the current item to the RB tree. All RB tree operations are
     * protected by "mtx", and ep_insert() is called with "mtx" held.
     */
    ep_rbtree_insert(ep, epi); // insert into the RB tree

    /* now check if we've created too many backpaths */
    error = -EINVAL;
    if (full_check && reverse_path_check())
        goto error_remove_epi;

    /* Initialize the poll table using the queue callback */
    epq.epi = epi;
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); // poll callback

    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function. Note that after
     * this operation completes, the poll callback can start hitting
     * the new item.
     */
    revents = ep_item_poll(epi, &epq.pt, 1); // fetch the file's current state

    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    error = -ENOMEM;
    if (epi->nwait < 0)
        goto error_unregister;

    /* We have to drop the new item inside our item list to keep track of it */
    write_lock_irq(&ep->lock);

    /* record NAPI ID of new item if present */
    ep_set_busy_poll_napi_id(epi);

    /* If the file is already "ready" we drop it inside the ready list */
    if (revents && !ep_is_linked(epi)) { // add to the ready list
        list_add_tail(&epi->rdllink, &ep->rdllist);
        ep_pm_stay_awake(epi);

        /* Notify waiting tasks that events are available */
        if (waitqueue_active(&ep->wq))
            wake_up(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    write_unlock_irq(&ep->lock);

    atomic_long_inc(&ep->user->epoll_watches);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(ep, NULL);

    return 0;

    /* Error unwind: each label undoes the steps taken before its goto. */
error_unregister:
    ep_unregister_pollwait(ep, epi);
error_remove_epi:
    spin_lock(&tfile->f_lock);
    list_del_rcu(&epi->fllink);
    spin_unlock(&tfile->f_lock);

    rb_erase_cached(&epi->rbn, &ep->rbr);

    /*
     * We need to do this because an event could have been arrived on some
     * allocated wait queue. Note that we don't care about the ep->ovflist
     * list, since that is used/cleaned only inside a section bound by "mtx".
     * And ep_insert() is called with "mtx" held.
     */
    write_lock_irq(&ep->lock);
    if (ep_is_linked(epi))
        list_del_init(&epi->rdllink);
    write_unlock_irq(&ep->lock);

    wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
    kmem_cache_free(epi_cache, epi);

    return error;
}

4.4.2、ep_remove()

/// fs/eventpoll.c
/*
 * Unregister an item from an eventpoll instance.
 *
 * Removes the poll hooks, unlinks the item from the target file's
 * f_ep_links list, from the RB tree and (if queued) from the ready
 * list, unregisters its wakeup source, and schedules the item to be
 * freed through call_rcu(). The user's epoll_watches counter is
 * decremented.
 *
 * @ep:  eventpoll instance that owns the item
 * @epi: item to remove
 *
 * Always returns 0.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
    struct file *file = epi->ffd.file;

    lockdep_assert_irqs_enabled();

    /*
     * Removes poll wait queue hooks.
     */
    ep_unregister_pollwait(ep, epi);

    /* Remove the current item from the list of epoll hooks */
    spin_lock(&file->f_lock);
    list_del_rcu(&epi->fllink);
    spin_unlock(&file->f_lock);

    rb_erase_cached(&epi->rbn, &ep->rbr);

    /* Drop the item from the ready list if it is currently queued. */
    write_lock_irq(&ep->lock);
    if (ep_is_linked(epi))
        list_del_init(&epi->rdllink);
    write_unlock_irq(&ep->lock);

    wakeup_source_unregister(ep_wakeup_source(epi));
    /*
     * At this point it is safe to free the eventpoll item. Use the union
     * field epi->rcu, since we are trying to minimize the size of
     * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
     * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
     * use of the rbn field.
     */
    call_rcu(&epi->rcu, epi_rcu_free);

    atomic_long_dec(&ep->user->epoll_watches);

    return 0;
}

4.4.3、ep_modify()

/// fs/eventpoll.c
/*
 * Change the interest mask and user data of an already registered item.
 *
 * After storing the new event, the wakeup source is created or
 * destroyed to match EPOLLWAKEUP, the target file is polled once, and
 * the item is queued on the ready list (waking waiters) if it reports
 * events and is not queued yet.
 *
 * @ep:    eventpoll instance that owns the item
 * @epi:   item to update
 * @event: new interest mask and user data
 *
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
             const struct epoll_event *event)
{
    int pwake = 0;
    poll_table pt;

    lockdep_assert_irqs_enabled();

    init_poll_funcptr(&pt, NULL);

    /*
     * Set the new event interest mask before calling f_op->poll();
     * otherwise we might miss an event that happens between the
     * f_op->poll() call and the new event set registering.
     */
    epi->event.events = event->events; /* need barrier below */
    epi->event.data = event->data; /* protected by mtx */
    if (epi->event.events & EPOLLWAKEUP) {
        /*
         * NOTE(review): the return value of ep_create_wakeup_source()
         * is ignored here, unlike in ep_insert() - presumably treated
         * as best-effort; confirm.
         */
        if (!ep_has_wakeup_source(epi))
            ep_create_wakeup_source(epi);
    } else if (ep_has_wakeup_source(epi)) {
        ep_destroy_wakeup_source(epi);
    }

    /*
     * The following barrier has two effects:
     *
     * 1) Flush epi changes above to other CPUs.  This ensures
     *    we do not miss events from ep_poll_callback if an
     *    event occurs immediately after we call f_op->poll().
     *    We need this because we did not take ep->lock while
     *    changing epi above (but ep_poll_callback does take
     *    ep->lock).
     *
     * 2) We also need to ensure we do not miss _past_ events
     *    when calling f_op->poll().  This barrier also
     *    pairs with the barrier in wq_has_sleeper (see
     *    comments for wq_has_sleeper).
     *
     * This barrier will now guarantee ep_poll_callback or f_op->poll
     * (or both) will notice the readiness of an item.
     */
    smp_mb();

    /*
     * Get current event bits. We can safely use the file* here because
     * its usage count has been increased by the caller of this function.
     * If the item is "hot" and it is not registered inside the ready
     * list, push it inside.
     */
    if (ep_item_poll(epi, &pt, 1)) {
        write_lock_irq(&ep->lock);
        if (!ep_is_linked(epi)) {
            list_add_tail(&epi->rdllink, &ep->rdllist);
            ep_pm_stay_awake(epi);

            /* Notify waiting tasks that events are available */
            if (waitqueue_active(&ep->wq))
                wake_up(&ep->wq);
            if (waitqueue_active(&ep->poll_wait))
                pwake++;
        }
        write_unlock_irq(&ep->lock);
    }

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(ep, NULL);

    return 0;
}

4.5、epoll_wait()

do_epoll_wait() 函数主要调用 ep_poll() 函数。

4.5.1、do_epoll_wait

/// fs/eventpoll.c
/*
 * epoll_wait(2) entry point: forwards all four arguments unchanged to
 * do_epoll_wait() and returns its result.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
        int, maxevents, int, timeout)
{
    return do_epoll_wait(epfd, events, maxevents, timeout);
}

/*
 * Validate the epoll_wait() arguments and run ep_poll() on the
 * eventpoll instance behind @epfd.
 *
 * @epfd:      descriptor that must refer to an eventpoll file
 * @events:    user buffer receiving the ready events
 * @maxevents: capacity of @events; must be in 1..EP_MAX_EVENTS
 * @timeout:   passed through to ep_poll()
 *
 * Returns the result of ep_poll(), or -EINVAL (bad @maxevents or @epfd
 * is not an epoll file), -EFAULT (@events fails access_ok()), -EBADF
 * (@epfd is not an open descriptor).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
             int maxevents, int timeout)
{
    struct fd f;
    int error = -EINVAL;

    /* The maximum number of event must be greater than zero */
    if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
        return -EINVAL;

    /* Verify that the area passed by the user is writeable */
    if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
        return -EFAULT;

    /* Get the "struct file *" for the eventpoll file */
    f = fdget(epfd);
    if (!f.file)
        return -EBADF;

    /*
     * Only an eventpoll file carries our own data structure in
     * "private_data"; for anything else the -EINVAL set above is
     * returned.
     */
    if (is_file_epoll(f.file)) {
        struct eventpoll *ep = f.file->private_data;

        /* Time to fish for events ... */
        error = ep_poll(ep, events, maxevents, timeout);
    }

    fdput(f);
    return error;
}

4.5.2、ep_poll()

ep_poll() 函数逻辑如下

1）设置超时时间；
2）调用 ep_events_available() 判断是否存在就绪事件，如果存在就绪事件，调用 ep_send_events() 将就绪事件拷贝到 user 空间返回；
3）不存在就绪事件，将当前调用进程挂在 ep->wq 等待队列上，然后让出 CPU；

/// fs/eventpoll.c
/*
 * Fetch ready events for epoll_wait(), sleeping if none are available.
 *
 * @ep:        eventpoll instance to harvest from
 * @events:    user buffer receiving the events
 * @maxevents: capacity of @events
 * @timeout:   > 0 converts to an expiry time via ep_set_mstimeout();
 *             0 checks once without sleeping; a negative value leaves
 *             the expiry pointer NULL
 *
 * Returns the value produced by ep_send_events(), 0 when the wait timed
 * out without events, or -EINTR when a signal is pending.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
    int res = 0, eavail, timed_out = 0;
    u64 slack = 0;
    wait_queue_entry_t wait;
    ktime_t expires, *to = NULL;

    lockdep_assert_irqs_enabled();

    if (timeout > 0) { // set up the expiry time
        struct timespec64 end_time = ep_set_mstimeout(timeout);

        slack = select_estimate_accuracy(&end_time);
        to = &expires;
        *to = timespec64_to_ktime(end_time);
    } else if (timeout == 0) {
        /*
         * Avoid the unnecessary trip to the wait queue loop, if the
         * caller specified a non blocking operation. We still need
         * lock because we could race and not see an epi being added
         * to the ready list while in irq callback. Thus incorrectly
         * returning 0 back to userspace.
         */
        timed_out = 1;

        write_lock_irq(&ep->lock);
        eavail = ep_events_available(ep);
        write_unlock_irq(&ep->lock);

        goto send_events;
    }

fetch_events:

    if (!ep_events_available(ep))
        ep_busy_loop(ep, timed_out);

    eavail = ep_events_available(ep);
    if (eavail)
        goto send_events;

    /*
     * Busy poll timed out.  Drop NAPI ID for now, we can add
     * it back in when we have moved a socket with a valid NAPI
     * ID onto the ready list.
     */
    ep_reset_busy_poll_napi_id(ep);

    /* The body below runs exactly once per pass through fetch_events. */
    do {
        /*
         * Internally init_wait() uses autoremove_wake_function(),
         * thus wait entry is removed from the wait queue on each
         * wakeup. Why it is important? In case of several waiters
         * each new wakeup will hit the next waiter, giving it the
         * chance to harvest new event. Otherwise wakeup can be
         * lost. This is also good performance-wise, because on
         * normal wakeup path no need to call __remove_wait_queue()
         * explicitly, thus ep->lock is not taken, which halts the
         * event delivery.
         */
        init_wait(&wait);

        write_lock_irq(&ep->lock);
        /*
         * Barrierless variant, waitqueue_active() is called under
         * the same lock on wakeup ep_poll_callback() side, so it
         * is safe to avoid an explicit barrier.
         */
        __set_current_state(TASK_INTERRUPTIBLE); // prepare to sleep

        /*
         * Do the final check under the lock. ep_scan_ready_list()
         * plays with two lists (->rdllist and ->ovflist) and there
         * is always a race when both lists are empty for short
         * period of time although events are pending, so lock is
         * important.
         */
        eavail = ep_events_available(ep);
        if (!eavail) {
            if (signal_pending(current))
                res = -EINTR;
            else
                __add_wait_queue_exclusive(&ep->wq, &wait);
        }
        write_unlock_irq(&ep->lock);

        if (!eavail && !res) // schedule away: give up the CPU and sleep
            timed_out = !schedule_hrtimeout_range(to, slack,
                                  HRTIMER_MODE_ABS);

        /*
         * We were woken up, thus go and try to harvest some events.
         * If timed out and still on the wait queue, recheck eavail
         * carefully under lock, below.
         */
        eavail = 1;
    } while (0);

    __set_current_state(TASK_RUNNING); // awake again, start processing

    if (!list_empty_careful(&wait.entry)) {
        write_lock_irq(&ep->lock);
        /*
         * If the thread timed out and is not on the wait queue, it
         * means that the thread was woken up after its timeout expired
         * before it could reacquire the lock. Thus, when wait.entry is
         * empty, it needs to harvest events.
         */
        if (timed_out)
            eavail = list_empty(&wait.entry);
        __remove_wait_queue(&ep->wq, &wait);
        write_unlock_irq(&ep->lock);
    }

send_events:
    if (fatal_signal_pending(current)) {
        /*
         * Always short-circuit for fatal signals to allow
         * threads to make a timely exit without the chance of
         * finding more events available and fetching
         * repeatedly.
         */
        res = -EINTR;
    }
    /*
     * Try to transfer events to user space. In case we get 0 events and
     * there's still timeout left over, we go trying again in search of
     * more luck.
     */
    if (!res && eavail &&
        !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
        goto fetch_events;

    return res;
}

4.5.3 ep_send_events()

如果有就绪事件发生，则调用 ep_send_events() 函数做进一步处理。ep_send_events() 函数调用 ep_send_events_proc() 函数处理 rdllist 链表。

/// fs/eventpoll.c
/* Argument block handed from ep_send_events() to ep_send_events_proc(). */
struct ep_send_events_data {
    int maxevents;                      /* capacity of the user buffer */
    struct epoll_event __user *events;  /* user buffer to fill */
    int res;                            /* events delivered, or -EFAULT */
};

/*
 * Deliver ready events to user space.
 *
 * Packs @events and @maxevents into an ep_send_events_data and runs
 * ep_send_events_proc() over the ready list via ep_scan_ready_list().
 *
 * Returns the "res" field filled in by ep_send_events_proc().
 */
static int ep_send_events(struct eventpoll *ep,
              struct epoll_event __user *events, int maxevents)
{
    struct ep_send_events_data xfer;

    xfer.events = events;
    xfer.maxevents = maxevents;

    /* xfer.res is written by the callback before anything else. */
    ep_scan_ready_list(ep, ep_send_events_proc, &xfer, 0, false);

    return xfer.res;
}

ep_send_events_proc() 遍历传入的就绪链表（各 epitem 通过 rdllink 成员链接），将就绪的事件拷贝到 user 空间。

在拷贝前，再次调用 ep_item_poll() 检查是否真的就绪。

如果没有采用 EPOLLET（即默认的水平触发 LT 模式），并且没有设置 EPOLLONESHOT，就绪的事件会被再次添加到 rdllist 链表中，以便下次调用 epoll_wait() 时重新检查；采用 EPOLLET（边沿触发）时则不会重新加入。

/// fs/eventpoll.c
/*
 * Scan callback that copies ready events to the user buffer.
 *
 * @ep:   eventpoll instance being scanned
 * @head: list of candidate items, linked through rdllink
 * @priv: struct ep_send_events_data with the user buffer and capacity;
 *        its "res" field receives the number of events delivered, or
 *        -EFAULT if the very first copy to user space fails
 *
 * Each item is unlinked from @head and polled again; items that report
 * nothing are skipped. After a successful copy, EPOLLONESHOT items have
 * their mask reduced to EP_PRIVATE_BITS, and items without EPOLLET are
 * put back on ep->rdllist.
 *
 * Always returns 0; the outcome is reported through esed->res.
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
                   void *priv)
{
    struct ep_send_events_data *esed = priv;
    __poll_t revents;
    struct epitem *epi, *tmp;
    struct epoll_event __user *uevent = esed->events;
    struct wakeup_source *ws;
    poll_table pt;

    init_poll_funcptr(&pt, NULL);
    esed->res = 0;

    /*
     * We can loop without lock because we are passed a task private list.
     * Items cannot vanish during the loop because ep_scan_ready_list() is
     * holding "mtx" during this call.
     */
    lockdep_assert_held(&ep->mtx);

    list_for_each_entry_safe(epi, tmp, head, rdllink) {
        /* Stop once the user buffer is full. */
        if (esed->res >= esed->maxevents)
            break;

        /*
         * Activate ep->ws before deactivating epi->ws to prevent
         * triggering auto-suspend here (in case we reactive epi->ws
         * below).
         *
         * This could be rearranged to delay the deactivation of epi->ws
         * instead, but then epi->ws would temporarily be out of sync
         * with ep_is_linked().
         */
        ws = ep_wakeup_source(epi);
        if (ws) {
            if (ws->active)
                __pm_stay_awake(ep->ws);
            __pm_relax(ws);
        }

        list_del_init(&epi->rdllink);

        /*
         * If the event mask intersect the caller-requested one,
         * deliver the event to userspace. Again, ep_scan_ready_list()
         * is holding ep->mtx, so no operations coming from userspace
         * can change the item.
         */
        revents = ep_item_poll(epi, &pt, 1);
        if (!revents)
            continue;

        if (__put_user(revents, &uevent->events) ||
            __put_user(epi->event.data, &uevent->data)) {
            /*
             * The copy failed: put the item back at the front of the
             * list and stop. -EFAULT is reported only if nothing has
             * been delivered so far.
             */
            list_add(&epi->rdllink, head);
            ep_pm_stay_awake(epi);
            if (!esed->res)
                esed->res = -EFAULT;
            return 0;
        }
        esed->res++;
        uevent++;
        if (epi->event.events & EPOLLONESHOT)
            epi->event.events &= EP_PRIVATE_BITS;
        else if (!(epi->event.events & EPOLLET)) {
            /*
             * If this file has been added with Level
             * Trigger mode, we need to insert back inside
             * the ready list, so that the next call to
             * epoll_wait() will check again the events
             * availability. At this point, no one can insert
             * into ep->rdllist besides us. The epoll_ctl()
             * callers are locked out by
             * ep_scan_ready_list() holding "mtx" and the
             * poll callback will queue them in ep->ovflist.
             */
            list_add_tail(&epi->rdllink, &ep->rdllist);
            ep_pm_stay_awake(epi);
        }
    }

    return 0;
}

epoll(7) 并不是新添加到系统的黑科技，而是原有系统接口的组合。可以看到，select(2) 和 poll(2) 也利用了虚拟文件系统poll 机制，只不过仅仅是唤醒 do_select() 或者 do_poll() 进程，而 epoll(7) 的实现中就绪文件不仅唤醒 epoll_wait(2) 进程，在这之前还将就绪的事件添加到就绪的队列，减少了唤醒之后的遍历所有文件描述符检查就绪工作，而是仅仅检查处于就绪链表上的事件，复杂度大大减少。

5、总结

select(2)，poll(2) 以轮询的方式检查 fd 集合，轮询一遍没有就绪事件发生就会将自己挂起，等待监视的文件将自己唤醒（超时和信号都可以唤醒）。而 epoll(7) 是设备就绪时，调用回调函数，把就绪 fd 放入就绪链表中，并唤醒在 epoll_wait(2) 中进入睡眠的进程。虽然都要睡眠和交替，但是 select(2) 和 poll(2) 在“醒着”的时候要遍历整个 fd 集合，而 epoll(7) 在“醒着”的时候只要判断一下就绪链表是否为空就行了，这节省了大量的 CPU 时间。这就是回调机制带来的性能提升。

select(2) 和 poll(2) 每次调用都要把 fd 集合从用户态往内核态拷贝一次，而 epoll(7) 只要一次拷贝，这也能节省不少的开销。

select(2) 的超时时间 timeout 参数（struct timeval）精度为微秒，而 poll(2) 和 epoll_wait(2) 的超时参数是以毫秒为单位的 int，因此 select(2) 更加适用于实时性要求比较高的场景。select(2) 可移植性更好，几乎被所有主流平台所支持。

poll(2) 没有最大描述符数量的限制，如果平台支持并且对实时性要求不高，应该使用 poll(2) 而不是 select(2)。

epoll(7) 只能运行在 Linux 平台上，适用于有大量的描述符需要同时轮询的场景，并且这些连接最好是长连接（不会频繁地调用 epoll_ctl(2) 进行事件的注册、修改和移除，频繁的系统调用会降低效率）。如果需要同时监控的描述符少于 1000 个，就没有必要使用 epoll(7)，因为这种应用场景下并不能体现 epoll(7) 的优势。

select(2)/poll(2)/epoll(7) 区别

--	select(2)	poll(2)	epoll(7)
操作方式	遍历	遍历	回调
底层实现	数组	链表	红黑树
IO效率	每次线性遍历，O(n)	每次线性遍历，O(n)	调用回调函数，O(k)
最大连接数	1024	无上限	无上限
fd拷贝	每次调用，从用户态拷贝到内核态	每次调用，从用户态拷贝到内核态	调用 epoll_ctl(2) 时拷贝进内核并保存，之后调用 epoll_wait(2) 只拷贝就绪事件到用户空间