Linux Kernel2.6.9内核源码分析--select

最新推荐文章于 2022-05-17 14:01:13 发布

猿来如此yXy

最新推荐文章于 2022-05-17 14:01:13 发布

阅读量402

点赞数

分类专栏： Linux kernel 文章标签： linux 内核 epoll

本文链接：https://blog.csdn.net/weixin_38537730/article/details/104097648

版权

Linux kernel 专栏收录该内容

24 篇文章 13 订阅

订阅专栏

Linux Kernel2.6.9内核源码分析–select

需要解决的问题：
通过追踪内核源码，查看内核是如何实现select监听的功能

首先来看下select API的定义和参数：
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
参数说明：
int nfds：是一个整数值，表示集合中所有文件描述符的范围，即所有文件描述符的最大值+1
在后面的代码中可以看到，Linux 内核的实现方式是以0 ~ nfds-1作为下标，在进程描述符的files数组中依次查询各个文件描述符有没有事件上报
fd_set *readfds, *writefds, *exceptfds：分别代表监听读/写/异常的文件描述符集，实际上是一个由long型数组构成的位图(bitmap)，第i位对应文件描述符i。调用前把需要监听的文件描述符对应的位置为1；当select返回后，内核会用结果覆盖这些集合：有事件发生的文件描述符对应的位为1，没有事件发生的位被清为0，从而集合不再代表原始的监听集合，因此每次调用select前都需要重新初始化这些文件描述符集。例如监听了fd 0~2的读事件，而只有fd 1可读，则返回后readfds中只有第1位为1.
struct timeval *timeout：超时时间，超过这个时间后，无论有没有监听到事件，都不再阻塞而立刻返回.
返回值：成功时返回就绪的事件个数(即后面do_select中的retval)，超时且没有事件发生时返回0，出错时内核返回负的错误码(如-ENOMEM、-EFAULT)，在用户态表现为返回-1并设置errno

再来看下select API对应的系统调用：
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
1.查看参数n是否超过了最大值max_fdset，超过了n就等于max_fdset
2.从slab中分配6 x FDS_BYTES(n)字节的空间(即6个位图，每个位图n位并按long对齐)，分别为:fds.in, fds.out, fds.ex, fds.res_in, fds.res_out, fds.res_ex，然后通过get_fd_set将user space的参数inp, outp, exp分别copy到fds.in, fds.out, fds.ex
3.调用do_select，在该函数中会sleep，直到有事件发生、timeout或者收到信号 //核心函数
4.通过set_fd_set将fds.res_in, fds.res_out, fds.res_ex copy回user space的inp, outp, exp，从而用户就知道哪个文件描述符有事件发生.

/*
 * sys_select - kernel entry point for the select() system call (excerpt; the
 * dotted lines mark code that the article left out).
 *
 * n:            number of descriptors to examine; clamped to max_fdset below
 * inp/outp/exp: user-space read / write / exception sets; the result sets are
 *               written back through the same pointers
 * tvp:          user-space timeout; not referenced in the lines shown here
 *               (presumably converted into `timeout` in the omitted part)
 *
 * In the lines shown, returns -ENOMEM when the bitmap buffer cannot be
 * allocated, the error from get_fd_set() when reading a set fails, -EFAULT
 * when a result set cannot be written back, and otherwise the value obtained
 * from do_select().
 */
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
    ...............
	/* max_fdset can increase, so grab it once to avoid race */
	max_fdset = current->files->max_fdset;
	if (n > max_fdset)
		n = max_fdset;
	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	ret = -ENOMEM;
	size = FDS_BYTES(n);
	bits = select_bits_alloc(size);
	if (!bits)
		goto out_nofds;
	/* Carve the single allocation into six consecutive bitmaps of `size` bytes each. */
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	/*
	 * Load the three requested sets; the first non-zero return aborts.
	 * NOTE(review): get_fd_set() is not shown here - presumably it copies
	 * the bitmap from user space; confirm against fs/select.c.
	 */
	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	/* Result bitmaps start out empty; do_select() only ever sets bits in them. */
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, &timeout);
    ...........
	/* Hand the result bitmaps back through the caller's own set pointers. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	select_bits_free(bits, size);
out_nofds:
	return ret;
}

再来看 int do_select(int n, fd_set_bits *fds, long *timeout)

void poll_initwait(struct poll_wqueues *pwq) 是将struct poll_wqueues table变量进行初始化:
/*
 * State that do_select() keeps for one select call: initialised by
 * poll_initwait() and released by poll_freewait().
 */
struct poll_wqueues {
/* Handed to each file's poll method as `wait` (do_select uses &table.pt). */
poll_table pt;
/* NOTE(review): presumably the storage for the poll_table_entry records filled in by __pollwait - confirm. */
struct poll_table_page * table;
/* Checked by do_select() before sleeping; a non-zero value becomes its return value. */
int error;
};
其中pt 是poll_table类型，里面只有一个函数指针成员qproc，其类型为:typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
table.pt.qproc = __pollwait 函数,后面再来分析这个函数
table.table = NULL，
table.error = 0
大循环依次遍历每个文件描述符，调用file->f_op->poll，并且只在第一轮遍历时传入table.pt(第一轮结束后wait被置为NULL)：等待队列项只需要注册一次，注册后会一直挂在对应的等待队列上直到poll_freewait，之后的每一轮只需要重新查询事件掩码即可
在前一篇博客eventpoll中有解释到，以本地socket为例，file->f_op->poll，最终会调用到unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)，其会先调用
/*
 * poll_wait - invoke the poll table's queueing callback for one wait queue.
 *
 * Does nothing when either the poll table `p` or `wait_address` is NULL, so a
 * caller that passes a NULL table only gets the event mask computed by the
 * poll method and no wait-queue registration.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && wait_address)
p->qproc(filp, wait_address, p);
}
也就是前面table.pt.qproc，即__pollwait 函数
再来看下__pollwait函数:
/*
 * __pollwait - queueing callback reached through poll_wait() (excerpt; the
 * ellipsis marks omitted code, which is where `table` is defined).
 *
 * Records one (file, wait queue) pair and puts the current task on that wait
 * queue.
 */
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
…
{
/* Take the next free entry and advance the cursor past it. */
struct poll_table_entry * entry = table->entry;
table->entry = entry+1;
/* Take a reference on the file and remember it in the entry. */
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
/* The wait entry refers to the current task and is added to the given wait queue. */
init_waitqueue_entry(&entry->wait, current);
add_wait_queue(wait_address,&entry->wait);
}
}
也就是为当前进程创建一个与该file关联的等待队列项，并把它加入到sk->sk_sleep等待队列；在socket状态变化时，会执行该等待队列项的回调函数，唤醒等待的进程.

调用file->f_op->poll返回后，如果有event，则将res_in/res_out/res_ex中对应的位设定为1.

/*
 * unix_poll - poll method for a unix-domain socket.
 *
 * file: file being polled; only forwarded to poll_wait()
 * sock: socket whose sock->sk state is inspected
 * wait: poll table, may be NULL; forwarded to poll_wait()
 *
 * Calls poll_wait() on sk->sk_sleep first, then builds the event mask from
 * the socket's current state. Returns that mask of POLL* bits.
 */
static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	/* Queue on the socket's wait queue before sampling its state. */
	poll_wait(file, sk->sk_sleep, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= POLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue) ||
	    (sk->sk_shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	return mask;
}

/*
 * do_select - core of select(): poll every requested descriptor and sleep
 * between passes until something is ready, the timeout is used up, or a
 * signal is pending (excerpt; the dotted line marks omitted declarations).
 *
 * n:       number of descriptors to scan (fd 0 .. n-1)
 * fds:     in/out/ex hold the requested bitmaps; res_in/res_out/res_ex
 *          receive the bits that turned out to be ready
 * timeout: written on return with the remaining __timeout
 *          (NOTE(review): __timeout is presumably loaded from *timeout in the
 *          omitted part - confirm)
 *
 * Returns the number of ready bits counted, or table.error when that is
 * non-zero and nothing was ready.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
    .............
	/* step 1: set up the poll table whose callback queues this task on wait queues */
	poll_initwait(&table);
	wait = &table.pt;
	/* A zero timeout never sleeps, so nothing needs to be queued. */
	if (!__timeout)
		wait = NULL;
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		/*
		 * NOTE(review): the task state is changed before the scan,
		 * presumably so a wake-up arriving during the scan is not lost
		 * before schedule_timeout() - confirm.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
		/* One iteration per long-word of the bitmaps; i is the fd number and is advanced inside the body. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			/* Nothing requested in this word: skip all of its descriptors at once. */
			if (all_bits == 0) {
				i += __NFDBITS;
				continue;
			}

			/* Walk the bits of this word; `bit` is the mask for descriptor i. */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				file = fget(i);
				/* A descriptor with no file behind it is skipped in this loop. */
				if (file) {
					f_op = file->f_op;
					/* Files without a poll method get DEFAULT_POLLMASK. */
					mask = DEFAULT_POLLMASK;
					/*
					 * Once retval is non-zero the loop below breaks out
					 * without sleeping, so queueing on further wait
					 * queues is pointless and NULL is passed instead.
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					/* Each ready bit counts separately: a descriptor ready in two sets adds two. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
			}
			/* Store only non-empty result words; the caller zeroed the result bitmaps beforehand. */
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}
		/* Later passes pass NULL, so the queueing callback only runs during the first pass. */
		wait = NULL;
		/* step 3: stop when something is ready, when no time is left, or when a signal is pending */
		if (retval || !__timeout || signal_pending(current))
			break;
		/* table.error becomes the return value when it is non-zero. */
		if(table.error) {
			retval = table.error;
			break;
		}
		/* Sleep, then scan again; schedule_timeout() returns the new value of __timeout. */
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);
	poll_freewait(&table);
	*timeout = __timeout;
	return retval;
}