select, poll, epoll详解(二)

最新推荐文章于 2022-05-12 16:07:40 发布

水草

最新推荐文章于 2022-05-12 16:07:40 发布

阅读量2k

点赞数

分类专栏： Windows/Linux 文章标签： select epoll socket select源码

本文链接：https://blog.csdn.net/shltsh/article/details/39349433

版权

Windows/Linux 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

1.select源码解析

2.核心函数do_select

3.select实现总结

1.select源码解析

基于2.6.28内核代码，select主要包含4个函数。

sys_select：处理时间参数，然后调用core_sys_select。
core_sys_select：处理三个fd_set参数(in, out, ex)，然后调用do_select。
do_select：遍历所有的fd，做select/poll的工作。对每个fd调用其poll方法(对socket而言即sock_poll)；如果没有任何fd就绪，则在合适的时机把自己挂起等待，被唤醒后重新遍历。
sock_poll：利用函数指针，来调用具体协议的poll函数，包括tcp_poll, udp_poll, datagram_poll。

/*
 * Location: fs/select.c
 *
 * sys_select - entry point of the select() system call.
 *
 * When the caller supplies a timeout, it is copied in from user space,
 * microseconds beyond one second are folded into the seconds part, and the
 * resulting (seconds, nanoseconds) pair is handed to
 * poll_select_set_timeout(), which fills in end_time. The real work is then
 * delegated to core_sys_select(). Finally poll_select_copy_remaining() is
 * given end_time and the user's timeval pointer (presumably to report the
 * remaining time back to user space -- see that helper).
 *
 * Returns -EFAULT if the timeval cannot be read, -EINVAL if the timeout is
 * rejected, otherwise whatever poll_select_copy_remaining() returns.
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
            fd_set __user *exp, struct timeval __user *tvp)
{
    struct timespec end_time;
    struct timespec *deadline = NULL;   /* stays NULL when no timeout was given */
    struct timeval user_tv;
    int ret;

    if (tvp != NULL) {
        long secs, nsecs;

        /* Fetch the caller's timeout from user space. */
        if (copy_from_user(&user_tv, tvp, sizeof(user_tv)) != 0)
            return -EFAULT;

        /* Normalise: carry whole seconds out of tv_usec, convert the rest to ns. */
        secs = user_tv.tv_sec + (user_tv.tv_usec / USEC_PER_SEC);
        nsecs = (user_tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

        deadline = &end_time;
        if (poll_select_set_timeout(deadline, secs, nsecs) != 0)
            return -EINVAL;
    }

    /* All of the fd_set handling happens in here. */
    ret = core_sys_select(n, inp, outp, exp, deadline);

    /* Hand the result and the timeout state to the copy-back helper. */
    return poll_select_copy_remaining(&end_time, tvp, 1, ret);
}

前面主要是从用户空间拷贝到内核空间，具体工作在core_sys_select函数中实现，而真正的核心内容位于其中的do_select函数里。

// Location: fs/select.c
// Purpose: copy the three user fd_sets (in, out, ex) into kernel bitmaps, run
// do_select() on them, and copy the three result bitmaps back to user space.
// Returns a negative error code (-EINVAL, -ENOMEM, -EFAULT, -ERESTARTNOHAND, or
// whatever get_fd_set()/do_select() report) or do_select()'s non-negative result.
int core_sys_select(int n, fd_set __user* inp, fd_set __user* outp,
    fd_set __user* exp, struct timespec* end_time)
{
    fd_set_bits fds;
    void* bits;
    int ret, max_fds;
    unsigned int size;
    struct fdtable* fdt;
    // Small requests are served from this on-stack buffer, which saves memory and
    // avoids an allocation. (The article gives SELECT_STACK_ALLOC as 256.)
    long stack_fds[SELECT_STACK_ALLOC / sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    // max_fds can grow, so it is sampled once inside an RCU read-side section
    // to avoid racing with an update.
    rcu_read_lock();

    // Fetch the file descriptor table of the calling process.
    // `current` refers to the task executing this system call (kernel
    // convention; its definition is not part of this excerpt).
    fdt = files_fdtable(current->files);

    max_fds = fdt->max_fds;
    rcu_read_unlock();
    // Clamp n (the first argument of select) to the size of the fd table.
    if (n > max_fds)
        n = max_fds;

    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words.
     */
    // If one sixth of stack_fds cannot hold a bitmap, fall back to kmalloc for
    // all six. Either way the buffer is split into six equal slices that
    // initialise the fds structure.
    size = FDS_BYTES(n);  // bytes needed for one bitmap covering n descriptors
    bits = stack_fds;
    if (size > sizeof(stack_fds) / 6) {
        // Not enough space in on-stack array; must use kmalloc
        ret = -ENOMEM;
        bits = kmalloc(6 * size, GFP_KERNEL);
        if (!bits)
            goto out_nofds;
    }
    fds.in = bits;
    fds.out = bits + size;
    fds.ex = bits + 2 * size;
    fds.res_in = bits + 3 * size;
    fds.res_out = bits + 4 * size;
    fds.res_ex = bits + 5 * size;

    // Bring the three request sets in from user space; the first non-zero
    // return from get_fd_set() becomes the result and aborts the call.
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    // The core work happens in do_select. fds is passed by pointer, and the
    // res_* bitmaps it points to carry the results back.
    ret = do_select(n, &fds, end_time);

    if (ret < 0)
        goto out;

    // ret == 0: no descriptor was reported ready.
    if (!ret) {
        // If a signal is pending, fail with -ERESTARTNOHAND and skip the
        // copy-back (the article notes this surfaces to user space as EINTR).
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        // No signal either: report 0 ready descriptors.
        ret = 0;
    }

    // Copy the three result bitmaps back over the caller's fd_sets; any
    // failure turns the result into -EFAULT.
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    if (bits != stack_fds)
        kfree(bits);   // pairs with the kmalloc above; the stack buffer is never freed
out_nofds:
    return ret;
}

2.核心函数do_select

介绍完core_sys_select函数后，接下来就到了其真正处理select逻辑的核心函数do_select了。

// Location: fs/select.c
// Purpose: the actual select loop. Walks every requested fd, calls the file's
// poll method (for sockets this ends up in tcp_poll, udp_poll, datagram_poll,
// etc.), records ready fds in the res_* bitmaps, and sleeps between passes
// until something is ready, the timeout expires, a signal is pending, or the
// wait table reports an error.
// Returns the number of ready bits counted across the in/out/ex sets, or a
// negative error code.
int do_select(int n, fd_set_bits* fds, struct timespec* end_time)
{
    ktime_t expire, * to = NULL;
    struct poll_wqueues table;   // wait-queue bookkeeping; table.pt is the poll_table given to poll methods
    poll_table* wait;
    int retval, i, timed_out = 0;
    unsigned long slack = 0;

    rcu_read_lock();

    // Validate the requested bitmaps against the open files and obtain the new
    // upper bound for the scan (NOTE(review): presumably highest requested fd + 1;
    // confirm against max_select_fd).
    retval = max_select_fd(n, fds);
    rcu_read_unlock();

    if (retval < 0)   // a negative value is an error and is returned unchanged
        return retval;
    n = retval;

    poll_initwait(&table);   // initialise the wait table
    // wait is what poll methods use to queue the current process on their
    // wait queues.
    wait = &table.pt;
    // A zero end_time means "do not block": never register on wait queues and
    // treat the call as already timed out.
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        wait = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = estimate_accuracy(end_time);

    retval = 0;
    // Loop until one of the break conditions below is met.
    for (;;) {
        unsigned long* rinp, * routp, * rexp, * inp, * outp, * exp;

        set_current_state(TASK_INTERRUPTIBLE);  // mark the task as interruptibly sleeping before scanning

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        // Walk all fds, one bitmap word (unsigned long) per iteration.
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            const struct file_operations* f_op = NULL;
            struct file* file = NULL;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            // __NFDBITS is (8 * sizeof(unsigned long)), the number of bits in one
            // bitmap word. If no bit is requested in this word, skip all of its fds.
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            }

            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                int fput_needed;
                if (i >= n)
                    break;
                // Test the bits of all_bits from least significant upwards and
                // skip fds that were not requested in any set.
                if (!(bit & all_bits))
                    continue;
                // Lightweight lookup of the struct file for fd i; fput_needed
                // records whether a reference has to be dropped afterwards.
                file = fget_light(i, &fput_needed);
                if (file) {
                    f_op = file->f_op;
                    mask = DEFAULT_POLLMASK;
                    // poll is a function pointer; for sockets f_op->poll is sock_poll.
                    // Once any fd is ready (retval != 0) NULL is passed instead of
                    // wait, so no further wait-queue registration takes place.
                    if (f_op && f_op->poll)
                        mask = (*f_op->poll)(file, retval ? NULL : wait);
                    // Release the file obtained by fget_light.
                    fput_light(file, fput_needed);
                    // Record readiness per set. retval counts ready bits, so an fd
                    // that is ready in several sets is counted once per set.
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                    }
                }
            }
            // Store this word's results into the output bitmaps (only when non-zero;
            // the caller zeroed them beforehand).
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;

            // Voluntary rescheduling point after each bitmap word, so that scanning
            // a large fd set does not hog the CPU. This is not where select sleeps;
            // the sleep is the schedule_hrtimeout_range() call further down.
            cond_resched();
        }
        // Wait-queue registration is only needed on the first pass.
        wait = NULL;
        // Stop when something is ready, the timeout has expired, or a signal is
        // pending for the current process.
        if (retval || timed_out || signal_pending(current))
            break;
        if (table.error) {
            retval = table.error;
            break;
        }

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec_to_ktime(*end_time);
            to = &expire;
        }

        // Sleep here; a zero return is taken to mean the timeout expired, and the
        // next pass through the loop then ends at the timed_out check above.
        if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
            timed_out = 1;
    }
    // Back to the running state.
    __set_current_state(TASK_RUNNING);

    // Tear down the wait table set up by poll_initwait.
    poll_freewait(&table);

    return retval;
}

前面的这个函数代码很多，实际上最关键的一句就是：
mask = (*f_op->poll)(file, retval ? NULL : wait);

1)返回值mask，用来设置状态值，返回给select函数。
第二个参数wait，是为了让poll函数(通过poll_wait)把当前进程加入设备的等待队列，提供给后面的schedule_hrtimeout_range()休眠及唤醒使用。如果没有任何一个file就绪，则进程会在schedule_hrtimeout_range()中让出CPU，用户空间的select进入休眠(timeout非0时)；cond_resched()只是遍历过程中的一个主动调度点，并不负责休眠。为何这个参数有两种可能性(NULL或wait)呢？
代码中有两个地方可以跳出最外层的for死循环：

if (retval || !__timeout || signal_pending(current))
  break;
if(table.error) {
  retval = table.error;
  break;
}

当retval非零，或者已经超时(在前面2.6.28的代码中对应timed_out为真，包括timeout为0即立刻返回、不阻塞的情况；这里引用的!__timeout是较早内核版本的写法)，或者休眠过程中当前进程收到signal，或者table收到error时，则跳出循环。
这4种异常情况中，后面的3种容易理解。但是第1种情况，retval什么时候非零呢？可以看到前面给retval赋值的相关代码：

retval = max_select_fd(n, fds);
n = retval;
retval = 0;

先获取最大的文件描述符，保存至n。然后在进入for死循环前retval被赋值为0。通过mask比较对应的位，如果可以进行I/O，则retval加1，变成非零，此时第二个参数就变成NULL，原因很简单，因为内核知道此时不会发生任何等待，因此也不需要构造等待队列。另外，当timeout为0时(不阻塞，select立即返回)，wait会被设为NULL，因此这种情况下即使retval为0，即没有可用I/O，poll的第二个参数还是NULL，系统不需要处理等待队列。

2)这里调用的是文件系统的poll函数。不同的文件系统，poll函数自然不同。我们在这里关注的是socket类型，而sockfs的注册在net/socket.c里。
register_filesystem(&sock_fs_type);

// sock_fs_type is defined as follows: the filesystem type descriptor that is
// passed to register_filesystem() for the socket filesystem, named "sockfs".
static struct file_system_type sock_fs_type = {
	.name =		"sockfs",
	.get_sb =	sockfs_get_sb,
	.kill_sb =	kill_anon_super,
};

// file_system_type itself is defined in include/linux/fs.h.
// It describes one filesystem type: its name, flags, and the callbacks used
// to obtain (get_sb) and tear down (kill_sb) a superblock.
struct file_system_type {
	const char *name;
	int fs_flags;
	int (*get_sb) (struct file_system_type *, int,
		       const char *, void *, struct vfsmount *);
	void (*kill_sb) (struct super_block *);
	struct module *owner;
	struct file_system_type * next;
	struct list_head fs_supers;

	struct lock_class_key s_lock_key;
	struct lock_class_key s_umount_key;

	struct lock_class_key i_lock_key;
	struct lock_class_key i_mutex_key;
	struct lock_class_key i_mutex_dir_key;
	struct lock_class_key i_alloc_sem_key;
};

Socket文件系统的相关函数也在net/socket.c里，如下所示：

/*
 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *	in the operation structures but are done directly via the socketcall() multiplexor.
 *
 *	This is the file_operations table for socket files. The .poll entry is the
 *	hook that do_select() reaches through f_op->poll, so select() on a socket
 *	ends up in sock_poll().
 */
static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read,
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.open =		sock_no_open,	/* special open code to disallow open via /proc */
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};

接下来看看sock_poll的实现：

/*
 * sock_poll - poll method for socket files. No kernel lock held - perfect.
 *
 * By convention file->private_data of a socket file holds the matching
 * struct socket. We can't return errors to poll, so it's either yes or no:
 * the call is simply forwarded to the protocol's own poll routine
 * (tcp_poll, udp_poll, datagram_poll; udp_poll is described as a thin layer
 * over datagram_poll) and its event mask is returned unchanged.
 */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
	struct socket *owner = file->private_data;

	return owner->ops->poll(file, owner, wait);
}

以tcp_poll为例，代码位于net/ipv4/tcp.c中。
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
这个就是最终的查询函数了。也就是说，select的主要功能就是对每个socket调用对应协议(这里是TCP)的poll函数进行查询；如果没有想要的数据，则让出CPU进入休眠(防止一直占用CPU)，被唤醒或超时后重新查询，直到有一个连接有想要的消息为止。
从这里可以看出，select的作用就是反复调用poll函数，直到有需要的消息为止。如果select处理的socket很多，机器性能消耗会很大。select有最大数目限制(Windows下同样受FD_SETSIZE限制，默认值为64，可在包含winsock2.h之前重新定义)，Linux下每个进程的select最多能处理FD_SETSIZE(1024)个fd，如果超过此最大值，则只能采用多进程(或改用poll/epoll)。
常见的select多进程模型为：一个进程专门处理accept，成功后将fd通过unix socket传递给子进程处理，父进程可以根据子进程负载均衡分派。

3.select实现总结

基于前一节的源码分析，可以看出select的实现，可以概括为下面几点：
1) 调用copy_from_user，将fd_set从用户空间拷贝到内核空间；
2) 注册回调函数__pollwait()。调用关系为__pollwait() <---- poll_initwait() <---- do_select() <---- core_sys_select() <---- sys_select()

static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);

3) 遍历所有的fd, 调用其对应的poll方法。对于socket文件系统, 对应方法为sock_poll。其会根据具体协议，分别调用tcp_poll, udp_poll及datagram_poll。poll指向的函数会返回当前可否读写的信息。
a). 如果当前可以读写，则返回读写信息;
b). 如果当前不可读写，则阻塞进程，并等待驱动程序唤醒，重新调用poll函数，或超时返回;
c). 底层的驱动程序需要实现这个poll函数。
4) 以tcp_poll举例，其中与select相关的主要工作就是通过poll_wait()调用前面注册的回调__pollwait()。此函数的主要工作就是把current（当前进程）挂到设备的等待队列中，不同的设备有不同的等待队列，对于tcp_poll来说，其等待队列是sk->sk_sleep（注意把进程挂到等待队列中并不代表进程已经睡眠了）。在设备收到一条消息（网络设备）或填写完文件数据（磁盘设备）后，会唤醒设备等待队列上睡眠的进程，这时current便被唤醒了。

// Abridged excerpt of tcp_poll: only the wait-queue registration is shown,
// the rest of the function body is elided by the article.
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) {
  unsigned int mask;
  struct sock *sk = sock->sk;
  struct tcp_sock *tp = tcp_sk(sk);
  
  poll_wait(file, sk->sk_sleep, wait); // queue the current process on the socket's wait queue (sk->sk_sleep); this does not block
                                       // the queue is later woken from interrupt context via wake_up_interruptible(&wait_q)
  ...............
}

5) poll方法(sock_poll)返回时会返回一个描述读写操作是否就绪的mask掩码，根据这个mask掩码给fd_set赋值。
6) 如果遍历完所有的fd，还没有返回一个可读写的mask掩码，则会调用schedule_hrtimeout_range(较早的内核版本中为schedule_timeout)使得调用select的进程（也就是current）进入睡眠。当设备驱动发生自身资源可读写后，会唤醒其等待队列上睡眠的进程。如果超过一定的超时时间（由end_time指定），还是没人唤醒，则调用select的进程会重新被唤醒获得CPU，进而重新遍历fd，判断有没有就绪的fd。
7) 将fd_set从内核空间拷贝到用户空间。