linux select和poll系统调用源码分析

linux select内核实现原理:

本文对应的linux 内核版本为5.0.3
select系统调用的定义位于内核源码的 fs/select.c 文件中。
在看linux内核对select实现之前,最好先了解用户态程序是怎么使用的,这样有利于我们对select实现的理解,select函数的使用可以自行上网搜索。

select系统调用格式定义如下:
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)

select函数直接调用kern_select函数	
select->kern_select
	
/*
 * kern_select - entry point reached from the select() system call.
 *
 * When the caller supplies a timeout (@tvp != NULL) it is copied in from
 * user space and handed to poll_select_set_timeout(), with any whole
 * seconds contained in tv_usec folded into the seconds argument and the
 * remaining microseconds scaled to nanoseconds.  The real work is done by
 * core_sys_select(); its result is then passed through
 * poll_select_copy_remaining() together with the user's timeval pointer.
 *
 * Returns -EFAULT if the timeval cannot be read, -EINVAL if
 * poll_select_set_timeout() rejects it, otherwise whatever
 * poll_select_copy_remaining() returns.
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timeval __user *tvp)
{
	struct timespec64 end_time;
	struct timespec64 *deadline = NULL;	/* NULL means "no timeout given" */
	struct timeval user_tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&user_tv, tvp, sizeof(user_tv)))
			return -EFAULT;

		/* Convert the user-space timeval into the kernel's timespec64 form. */
		deadline = &end_time;
		if (poll_select_set_timeout(deadline,
				user_tv.tv_sec + (user_tv.tv_usec / USEC_PER_SEC),
				(user_tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, deadline);

	return poll_select_copy_remaining(&end_time, tvp, PT_TIMEVAL, ret);
}

kern_select函数中,如果用户程序定义了超时时间,那么将超时时间复制到内核态,并转换为内核态定时器使用的数据结构
剩下的主要功能是在core_sys_select函数中完成

/*
 * core_sys_select - copy the three fd sets in from user space, run
 * do_select() on them, and copy the three result sets back out.
 *
 * @n:        number of descriptors to examine.  Negative values fail with
 *            -EINVAL; values above the current fd table's max_fds are
 *            clamped to max_fds.
 * @inp:      user-space fd set of descriptors watched for input.
 * @outp:     user-space fd set of descriptors watched for output.
 * @exp:      user-space fd set of descriptors watched for exceptions.
 * @end_time: timeout, passed through to do_select() unchanged.
 *
 * Six bitmaps of FDS_BYTES(n) bytes each are needed (three requests, three
 * results).  They live in the on-stack stack_fds array when they fit and in
 * a kvmalloc() buffer otherwise; the buffer is released at the "out" label.
 *
 * Returns do_select()'s result on success, otherwise -EINVAL, -ENOMEM,
 * -EFAULT, -ERESTARTNOHAND, or the error reported by get_fd_set().
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	size_t size, alloc_size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	/* Fetch the file descriptor table of the current process. */
	fdt = files_fdtable(current->files);
	/* Upper bound on descriptors in that table. */
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	/*
	 * Size of ONE bitmap: one bit per descriptor, where n is the highest
	 * descriptor of interest plus one.  The same size is used for each of
	 * the six bitmaps carved out of "bits" below.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		if (size > (SIZE_MAX / 6))
			goto out_nofds;

		alloc_size = 6 * size;
		bits = kvmalloc(alloc_size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;
	/*
	 * Copy the caller's request sets from user space into the kernel
	 * bitmaps; e.g. inp holds the descriptors the caller wants to read.
	 */
	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	/* Clear the result bitmaps before scanning. */
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: report -ERESTARTNOHAND only if a signal is pending. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	/*
	 * The results are written back into the caller's own input sets,
	 * overwriting what was passed in.  This is why user code has to
	 * re-initialise its fd sets before every call to select().
	 */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	/* Only free when the bitmaps did not fit in the on-stack array. */
	if (bits != stack_fds)
		kvfree(bits);
out_nofds:
	return ret;
}

core_sys_select函数的主要作用 :
1.为用户关心的事件位图和用于返回结果的事件位图分配空间(位图较小时直接使用栈上的stack_fds数组,否则使用kvmalloc分配),然后将用户空间传入的关心事件拷贝到内核空间,
2.调用do_select函数,主要的工作是在do_select函数中进行的。
3.将检测的结果赋值给用户态输入的参数,所以会把用户设置的参数覆盖掉,因此用户每次调用select函数的时候,需要重新设置一下输入参数

/*
 * do_select - scan the requested descriptors and sleep until one is ready.
 *
 * @n:        number of descriptors to consider (narrowed by max_select_fd()).
 * @fds:      request bitmaps (in/out/ex) and result bitmaps (res_in/res_out/
 *            res_ex); result bits are set for every requested event that
 *            a file's poll method reports.
 * @end_time: timeout; NULL means no timeout, an all-zero value means
 *            "scan once and do not sleep".
 *
 * Each pass polls every requested descriptor through vfs_poll().  The loop
 * ends when at least one event was found, the timeout expired, a signal is
 * pending, or the poll table recorded an error.
 *
 * Returns the number of (descriptor, event) matches found, a negative
 * value from max_select_fd(), or table.error.
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	/*
	 * Use the bound reported by max_select_fd() from here on.
	 * NOTE(review): presumably the highest requested fd plus one; confirm
	 * against max_select_fd().
	 */
	n = retval;
	/* Bind the current task to the poll_wqueues structure. */
	poll_initwait(&table);
	/*
	 * This poll_table is important: it is the argument handed to each
	 * file's poll method through vfs_poll() below.
	 */
	wait = &table.pt;
	/*
	 * A timeout of exactly zero means the caller does not want to block:
	 * clear the queueing callback and mark the wait as already timed out.
	 */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);
	/* From here retval counts the ready events found by the scan. */
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/*
		 * Walk every descriptor starting from 0 up to n.  This is why
		 * user space passes maxfd + 1 as the first argument of select().
		 */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			__poll_t mask;
			/* Consume the request bitmaps one long word at a time. */
			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* Nothing requested in this word: skip all its descriptors. */
				i += BITS_PER_LONG;
				continue;
			}
			/* Examine each bit of the current long word. */
			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))//descriptor not requested in any set
					continue;
				f = fdget(i);
				if (f.file) {
					wait_key_set(wait, in, out, bit,
						     busy_flag);
					/* Calls the poll method of the file behind this fd. */
					mask = vfs_poll(f.file, wait);

					fdput(f);
					/*
					 * Match the events reported by the poll method
					 * against what the caller asked for.
					 */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;//one more ready event
						wait->_qproc = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					/* got something, stop busy polling */
					/* (at least one requested event has occurred) */
					if (retval) {
						can_busy_loop = false;
						busy_flag = 0;

					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					} else if (busy_flag & mask)
						can_busy_loop = true;

				}
			}
			/* Publish this word's results, if any, to the result bitmaps. */
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		wait->_qproc = NULL;
		/*
		 * Note: these checks run only after ALL descriptors have been
		 * scanned, not as soon as the first one turns out to be ready.
		 * Leave the loop when something is ready, the timeout expired,
		 * or a signal needs handling.
		 */
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}
		/*
		 * Reaching this point means no descriptor is ready and the
		 * timeout has not expired: put the current task to sleep.
		 */
		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

do_select函数的主要功能:
1.调用poll_initwait(&table)函数,将当前进程与poll_wqueues结构体绑定。
2.如果用户指定的超时时间为0(即不希望阻塞),那么会设置超时标志timed_out,并把wait->_qproc置为NULL,本次调用只扫描一遍而不睡眠
3.进入一个for循环,对用户输入的每个关心的描述符都进行遍历,依次调用该文件描述符对应的操作方法集中的poll函数,如果poll函数轮询的事情发生了,那么会返回所关心的事件。
4.如果没有任何关心的事件发生、没到超时时间、并且没有信号需要处理,那么将当前进程睡眠,被唤醒后再重新扫描一遍
 

/*
 * poll_initwait - prepare a poll_wqueues for one select()/poll() call.
 *
 * Resets the bookkeeping fields, records the calling task as the one to
 * wake up later, and installs __pollwait as the poll_table callback.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	/* Fresh state: no entries used, no error, not yet triggered. */
	pwq->inline_index = 0;
	pwq->table = NULL;
	pwq->error = 0;
	pwq->triggered = 0;

	/* The task that issued the system call; __pollwake() wakes this one. */
	pwq->polling_task = current;

	/*
	 * Pay special attention to this callback: it is what a driver's poll
	 * method ends up invoking when it calls poll_wait().
	 */
	init_poll_funcptr(&pwq->pt, __pollwait);
}


/*
 * __pollwait - poll_table callback installed by poll_initwait().
 *
 * @filp:         file being polled; a reference is taken with get_file().
 * @wait_address: wait queue head supplied by the driver through poll_wait().
 * @p:            poll_table embedded in the caller's struct poll_wqueues.
 *
 * Takes one poll_table_entry from the poll_wqueues, fills it in, and adds
 * its wait queue entry (with pollwake as wake function) to @wait_address.
 * Returns silently if no entry could be obtained.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	/* Obtain one entry from the poll table. */
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	/* Initialise the entry. */
	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;
	/*
	 * Install the wake-up function for this wait queue entry.  Every
	 * poll_table_entry therefore carries its own wake function.
	 */
	init_waitqueue_func_entry(&entry->wait, pollwake);
	/* Set after the init call above so the owning poll_wqueues is what __pollwake() sees. */
	entry->wait.private = pwq;
	/*
	 * Add the entry to the driver's wait queue.  __pollwait is reached
	 * from the driver's poll method, and each driver passes its own
	 * wait_address, so each such wait queue gets one entry from us.
	 */
	add_wait_queue(wait_address, &entry->wait);
}

/*
 * pollwake - wake function attached to every poll_table_entry.
 *
 * A wake-up that carries a key is only forwarded to __pollwake() when the
 * key shares at least one event bit with the entry's own key; a wake-up
 * without a key is always forwarded.  Returns 0 when the wake-up is
 * filtered out, otherwise the result of __pollwake().
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *pte =
		container_of(wait, struct poll_table_entry, wait);

	if (!key || (key_to_poll(key) & pte->key))
		return __pollwake(wait, mode, sync, key);

	/* Keyed wake-up for events this entry is not interested in. */
	return 0;
}


/*
 * __pollwake - mark the poll_wqueues as triggered and wake its task.
 *
 * @wait: wait queue entry whose ->private points at the owning
 *        struct poll_wqueues (set in __pollwait()).
 * @mode, @sync, @key: forwarded unchanged to default_wake_function().
 *
 * Returns the result of default_wake_function().
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	/* pwq->polling_task is the task that called select()/poll(). */
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with smp_store_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

poll_initwait函数会给poll_wqueues结构体初始化,设置一个__pollwait回调函数,这个回调函数会被各个驱动程序的poll函数中调用,
__pollwait函数主要从poll_wqueues中申请一个poll_table_entry,然后给poll_table_entry设置一个唤醒函数pollwake,当驱动调用wake_up_interruptible等函数唤醒等待队列时,这个唤醒函数就会被调用。
最后将poll_table_entry加入到一个等待队列中。值得注意的是,这个等待队列是由驱动程序的poll函数传递进来的参数,每个驱动的poll函数都会传递自己的等待队列,
因此唤醒进程的时候,是根据不同的等待队列唤醒的。例如,a驱动和b驱动都有自己的poll函数分别为a_poll和b_poll,那么在这两个函数中都会调用poll_wait函数,
从而间接调用__pollwait函数,然后将各自的poll_table_entry结构体绑定一个pollwake唤醒函数,然后加入到各自的等待队列中。当a_poll函数轮询的事件发生时,那么在a驱动中会调用
wake_up_interruptible函数将a驱动程序中的等待队列进程唤醒,也就是会调用pollwake函数。从而最终唤醒的是pwq->polling_task进程,这个进程是在poll_initwait函数中被设置的,
因此,poll机制中的整个睡眠和唤醒流程就清晰了。
 

 

下面是一个简单的poll驱动程序:

/*
 * demo_poll - minimal example of a driver poll method.
 *
 * Registers the driver's wait queue head (wq) with the kernel polling
 * code so the polling task can be put to sleep on it and woken up later,
 * then reports readability based on "counter".
 *
 * Returns POLLIN | POLLRDNORM when data is available, 0 otherwise.  A
 * zero mask lets the core polling code put the caller to sleep; a
 * non-zero mask is reported back to the application.
 */
static unsigned int demo_poll(struct file *filp, struct poll_table_struct *pts)
{
	/* Hand the wait queue head and the file (via filp) to the polling core. */
	poll_wait(filp, &wq, pts);

	/* Data is available to read: report the matching event bits. */
	if (counter)
		return POLLIN | POLLRDNORM;

	/* None of the events of interest has happened. */
	return 0;
}

select的缺点
     每次调用select,都必须把fd集合从用户态拷贝到内核态,这个开销在fd很多时会很大;
     文件描述符就绪时,内核会修改readfds、writefds、exceptfds结构,所以每次调用select之前,必须重新将文件描述符注册一遍;
     每次调用select都必须在内核遍历传递进来的所有fd,这个开销在fd很多时会很大(时间复杂度O(n));
             每次都必须循环探测哪些文件描述符就绪(O(n));
             调用前都必须重新设置结构体变量
     单个进程能够监视的文件描述符存在最大的限制

 

poll系统调用的实现

/*
 * poll() system call.
 *
 * A non-negative @timeout_msecs is split into seconds and nanoseconds and
 * turned into a timeout via poll_select_set_timeout(); a negative value
 * means no timeout.  The work is done by do_sys_poll().
 *
 * If do_sys_poll() returns -EINTR, the arguments (and the timeout, if
 * any) are saved in the current task's restart_block with
 * do_restart_poll as the restart function, and -ERESTART_RESTARTBLOCK is
 * returned instead.  Every other result is returned unchanged.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 end_time;
	struct timespec64 *deadline = NULL;
	struct restart_block *rb;
	int ret;

	if (timeout_msecs >= 0) {
		deadline = &end_time;
		poll_select_set_timeout(deadline, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, deadline);
	if (ret != -EINTR)
		return ret;

	/* Interrupted: record everything needed to restart the call. */
	rb = &current->restart_block;
	rb->fn = do_restart_poll;
	rb->poll.ufds = ufds;
	rb->poll.nfds = nfds;

	if (timeout_msecs < 0) {
		rb->poll.has_timeout = 0;
	} else {
		rb->poll.tv_sec = end_time.tv_sec;
		rb->poll.tv_nsec = end_time.tv_nsec;
		rb->poll.has_timeout = 1;
	}

	return -ERESTART_RESTARTBLOCK;
}

poll系统调用的主要功能是在do_sys_poll函数中。

/*
 * do_sys_poll - copy the pollfd array in, run do_poll(), copy revents out.
 *
 * @ufds:     user-space array of struct pollfd.
 * @nfds:     number of entries in @ufds; more than rlimit(RLIMIT_NOFILE)
 *            fails with -EINVAL.
 * @end_time: timeout, passed through to do_poll() unchanged.
 *
 * The entries are held in a chain of struct poll_list chunks.  The first
 * chunk lives in the on-stack stack_pps buffer; further chunks are
 * obtained with kmalloc() and released at the out_fds label (which is why
 * the cleanup starts at head->next rather than head).
 *
 * Returns do_poll()'s result, or -EINVAL, -ENOMEM, or -EFAULT when a copy
 * from or to user space fails.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec64 *end_time)
{
	struct poll_wqueues table;
 	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;
		/*
		 * Copy this chunk of the caller's pollfd array into the
		 * kernel; nfds - todo is the number of entries already copied.
		 */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		/* More entries remain: chain another heap-allocated chunk. */
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	fdcount = do_poll(head, &table, end_time);
	poll_freewait(&table);

	/* Write each entry's revents back to the caller's array, in order. */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
  	}

	err = fdcount;
out_fds:
	/* Free only the kmalloc()ed chunks; head itself is on the stack. */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}

在do_sys_poll函数中主要做如下功能:
1.首先会将用户态的参数拷贝到内核态中,如果在栈上分配的内存不够用,那么会使用kmalloc分配内存,使用链表的形式。
2.调用poll_initwait(&table)函数初始化一个struct poll_wqueues,该函数的作用与在select实现中相同:把当前进程绑定到struct poll_wqueues中,同时也绑定__pollwait回调函数
3.调用do_poll函数
4.将收集到可以操作的文件描述符拷贝到用户空间
 

/*
 * do_poll - poll every entry in the list and sleep until one is ready.
 *
 * @list:     chain of poll_list chunks holding the pollfd entries.
 * @wait:     poll_wqueues prepared by poll_initwait().
 * @end_time: timeout; NULL means no timeout, an all-zero value means
 *            "scan once and do not sleep".
 *
 * Each pass calls do_pollfd() on every entry.  The loop ends when at
 * least one entry reported something, an error or pending signal was
 * found, or the timeout expired.
 *
 * Returns the number of entries for which do_pollfd() returned non-zero;
 * when that is zero, returns wait->error, or -EINTR if a signal is
 * pending.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
		   struct timespec64 *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
					/* Count the entries that have something to report. */
					count++;
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			/* Nothing ready: surface a recorded error or a pending signal. */
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		/* Sleep; a zero return is treated as the timeout having expired. */
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

do_poll函数的主要作用:
1.如果用户指定的超时时间为0(即不希望阻塞),设置超时标志,本次调用只扫描一遍而不睡眠
2.循环遍历用户程序传递下来的文件描述符,对每个描述符调用do_pollfd函数,该函数的作用是调用该文件描述符相对应的poll函数,返回可操作事件的位图。
3.如果超时时间没到,且没有任何信号要处理,那么把当前进程进行睡眠
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值