嵌入式软件开发之------浅析 linux select/poll(十三)

本文深入分析了Linux内核中的poll函数,展示了其如何处理用户态的poll调用,包括设置超时时间、拷贝用户态数据到内核、遍历文件描述符并调用驱动的poll函数等步骤。同时,文章提到了select函数的类似实现,并指出了两者共同存在的性能问题,如频繁的数据拷贝、遍历所有fd和无法精确获知哪个fd有事件等。最后,文章指出当面对大量文件描述符时,epoll相比select和poll更具优势。
摘要由CSDN通过智能技术生成

linux代码版本:linux4.4

导读:之前在学习 linux 驱动的时候,涉及到 file_operations 的 poll 成员(即 file->f_op->poll),它最终由用户态的 select/poll 调用到。select 和 poll 是用户态编程的常用接口:当要对大量文件进行读写时(尤其是 socket),只用 read 和 write 函数显然不是一个好的选择;但 select 和 poll 也有一些局限性,相比之下 epoll 确实存在一些优势。下面就分析 select 和 poll。

一、poll 函数

由于看这部分代码的时候,先看的是 poll,就从 poll 开始。

/*
 * poll system call entry point.
 *
 * ufds:          user-space array of struct pollfd to watch
 * nfds:          number of entries in ufds
 * timeout_msecs: timeout in milliseconds; a negative value leaves the
 *                timeout pointer NULL when calling do_sys_poll()
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is turned
 * into -ERESTART_RESTARTBLOCK after the restart block has been filled in.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

    /* Set up the timeout: split the milliseconds into seconds + nanoseconds */
	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	/* The real work happens in do_sys_poll() */
	ret = do_sys_poll(ufds, nfds, to);

	/*
	 * Interrupted: record the arguments (and the timeout already stored
	 * in end_time, if any) in the task's restart block, with
	 * do_restart_poll as the restart function.
	 */
	if (ret == -EINTR) {
		struct restart_block *restart_block;

		restart_block = &current->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

很明显,核心函数是  do_sys_poll 

/*
 * Core of poll: copy the user's pollfd array into a kernel-side linked list
 * of struct poll_list chunks, run do_poll() over it, then write each
 * entry's revents back to user space.
 *
 * ufds:     user-space array of struct pollfd
 * nfds:     number of entries in ufds
 * end_time: timeout passed through to do_poll(); may be NULL
 *
 * Returns the value produced by do_poll() on success, -EINVAL if nfds
 * exceeds rlimit(RLIMIT_NOFILE), -ENOMEM if a chunk cannot be allocated,
 * or -EFAULT if copying from or to user space fails.
 *
 * NOTE: in this article the body of do_poll() is pasted inline in braces
 * right after the call, so the listing is annotated pseudo-code rather
 * than compilable C.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;
 	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];   /* 32 longs per the original note — presumably POLL_STACK_ALLOC is 256 and sizeof(long) is 8; TODO confirm */
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;
    /* The number of fds to watch exceeds the current process's limit */
	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;
        /* Copy the fds to watch into the list; this copy is repeated on every poll call, which the article counts as a performance cost */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;
   
		todo -= walk->len;   // number of fds still to be copied
		if (!todo)
			break;
        /* Not everything fit in one chunk: allocate another chunk and link it
           into the list. More chunks can be added as long as memory is
           available, which is why poll is more flexible than select. */
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}
    /* Initialise the poll wait-queue table */
	poll_initwait(&table);
	fdcount = do_poll(nfds, head, &table, end_time);
              /*
               * Article annotation: the braces below hold the body of do_poll()
               * pasted inline. Inside it, `list` and `wait` appear to be
               * do_poll()'s parameters, corresponding to `head` and `&table`
               * in the call above.
               */
              {
                	poll_table* pt = &wait->pt;   // original note: "still NULL for now". NOTE(review): pt is the address of wait->pt, so the pointer itself is not NULL — confirm what the note refers to
                	ktime_t expire, *to = NULL;
                	int timed_out = 0, count = 0;
                	u64 slack = 0;
                	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
                	unsigned long busy_end = 0;

                	/* Optimise the no-wait case */
                    /* Zero timeout, i.e. a non-blocking poll */
                	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                		pt->_qproc = NULL;
                		timed_out = 1;
                	}
                    /* Non-zero timeout */
                	if (end_time && !timed_out)
                		slack = select_estimate_accuracy(end_time);

                	for (;;) {
                		struct poll_list *walk;
                		bool can_busy_loop = false;
                        /* Walk every chunk of the list */
                		for (walk = list; walk != NULL; walk = walk->next) {
                			struct pollfd * pfd, * pfd_end;

                			pfd = walk->entries;
                			pfd_end = pfd + walk->len;
                            /* Visit every pollfd stored in this chunk */
                			for (; pfd != pfd_end; pfd++) {
                				/*
                				 * Fish for events. If we found one, record it
                				 * and kill poll_table->_qproc, so we don't
                				 * needlessly register any other waiters after
                				 * this. They'll get immediately deregistered
                				 * when we break out and return.
                				 */
                				if (do_pollfd(pfd, pt, &can_busy_loop,
                					      busy_flag)) {
                				    /* Count of descriptors on which a requested pollfd.events event occurred */
                					count++;
                					pt->_qproc = NULL;
                					/* found something, stop busy polling */
                					busy_flag = 0;
                					can_busy_loop = false;
                				}
                			}
                		}
                		/*
                		 * All waiters have already been registered, so don't provide
                		 * a poll_table->_qproc to them on the next loop iteration.
                		 */
                		pt->_qproc = NULL;
                		/* Nothing ready: pick up wait->error, overridden by -EINTR if a signal is pending */
                		if (!count) {
                			count = wait->error;
                			if (signal_pending(current))
                				count = -EINTR;
                		}
                        /* Stop once count is non-zero (events found or an error code) or the timeout expired */
                		if (count || timed_out)
                			break;

                		/* only if found POLL_BUSY_LOOP sockets && not out of time */
                		if (can_busy_loop && !need_resched()) {
                			if (!busy_end) {
                				busy_end = busy_loop_end_time();
                				continue;
                			}
                			if (!busy_loop_timeout(busy_end))
                				continue;
                		}
                		busy_flag = 0;

                		/*
                		 * If this is the first loop and we have a timeout
                		 * given, then we convert to ktime_t and set the to
                		 * pointer to the expiry value.
                		 */
                		if (end_time && !to) {
                			expire = timespec_to_ktime(*end_time);
                			to = &expire;
                		}
                        /* Sleep: per the article, the drivers' poll functions have already put this task on their wait queues, so it resumes on timeout or on wakeup; a zero return marks the timeout */
                		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                			timed_out = 1;
                	}
                	return count;
                }
	poll_freewait(&table);

	/* Write each entry's revents back to the user's array, chunk by chunk */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
  	}

	err = fdcount;
out_fds:
	/* Free only the kmalloc'ed chunks; head itself lives in stack_pps */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}

由代码可知,主要进行以下几个步骤:

1. 将用户态传递 的pollfd copy 到内核链表中,而链表的好处,就是可以很容易的拓展,这也是相比  select 的一个优点,

2. 接下来就是 遍历 fd.file->poll ,也就和驱动对上了,

3. 将驱动 poll 返回的结果存到 pollfd->revents,读取到事件则将 count 增加,最终返回有事件发生的 fd 数量,并通过 __put_user 将每个 pollfd 的 revents 写回用户态的 ufds 数组。

从代码可知以下非常明显的缺点:

1. 每次调用 poll,都需要将 pollfd 通过 copy_from_user 的方式拷贝到内核,而这个拷贝过程本身就有开销(若访问用户内存时发生缺页,还要触发异常、查询异常表)

2. 每次都要将 所有的 pollfd 轮询一遍,如果活跃数比较少,做很多无用功

3. 返回值只是 pollfd->revents 非零(即有事件发生)的 fd 数量,用户程序还要再轮询一遍 pollfd 数组,才能知道究竟哪些 fd 有事件,这本身就是件低效率的事。

二、 select 函数

select 和 poll 的实现都在 select.c 文件,两个函数的逻辑也基本一致 。

/*
 * select system call entry point.
 *
 * n:              passed to core_sys_select(), which rejects negative
 *                 values and clamps it to the fd table's max_fds
 * inp, outp, exp: user-space fd sets for the in/out/ex conditions; the
 *                 results are written back to the same sets
 * tvp:            user-space timeout, or NULL for no timeout
 *
 * Returns -EFAULT if the timeout cannot be copied from user space,
 * -EINVAL if poll_select_set_timeout() rejects it, otherwise the result
 * of core_sys_select() after poll_select_copy_remaining() has processed it.
 *
 * NOTE: in this article the bodies of core_sys_select() and do_select()
 * are pasted inline in braces right after their calls, so the listing is
 * annotated pseudo-code rather than compilable C.
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
        /* Copy the timeout from user space; this copy happens on every call, which the article counts as a performance cost */
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
        /* Store the timeout into end_time */
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);
          /*
           * Article annotation: the braces below hold the body of
           * core_sys_select() pasted inline; `end_time` inside it appears to
           * be that function's parameter (the `to` pointer passed above).
           */
          {
            	fd_set_bits fds;
            	void *bits;
            	int ret, max_fds;
            	size_t size, alloc_size;
            	struct fdtable *fdt;
            	/* Allocate small arguments on the stack to save memory and be faster */
            	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

            	ret = -EINVAL;
                /* Invalid-argument case */
            	if (n < 0)
            		goto out_nofds;

            	/* max_fds can increase, so grab it once to avoid race */
                /* Get the current process's fd table and read its max_fds */
            	rcu_read_lock();
            	fdt = files_fdtable(current->files);
            	max_fds = fdt->max_fds;
            	rcu_read_unlock();
            	if (n > max_fds)
            		n = max_fds; // clamp the argument

            	/*
            	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
            	 * since we used fdset we need to allocate memory in units of
            	 * long-words. 
            	 */
            	 /* Six bitmaps are needed: in/out/ex, each for input and for results */
            	size = FDS_BYTES(n);
            	bits = stack_fds;
            	if (size > sizeof(stack_fds) / 6) {
            		/* Not enough space in on-stack array; must use kmalloc */
            		ret = -ENOMEM;
            		/* Guard the 6 * size multiplication below against overflow */
            		if (size > (SIZE_MAX / 6))
            			goto out_nofds;

            		alloc_size = 6 * size;
            		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
            		/* Fall back to vmalloc when kmalloc fails for more than a page */
            		if (!bits && alloc_size > PAGE_SIZE)
            			bits = vmalloc(alloc_size);

            		if (!bits)
            			goto out_nofds;
            	}
            	/* Carve the buffer into six consecutive bitmaps of `size` bytes each */
            	fds.in      = bits;
            	fds.out     = bits +   size;
            	fds.ex      = bits + 2*size;
            	fds.res_in  = bits + 3*size;
            	fds.res_out = bits + 4*size;
            	fds.res_ex  = bits + 5*size;

                /* Copy the fd sets passed from user space */
            	if ((ret = get_fd_set(n, inp, fds.in)) ||
            	    (ret = get_fd_set(n, outp, fds.out)) ||
            	    (ret = get_fd_set(n, exp, fds.ex)))
            		goto out;
                /* Zero the bitmaps that will hold the results */
            	zero_fd_set(n, fds.res_in);
            	zero_fd_set(n, fds.res_out);
            	zero_fd_set(n, fds.res_ex);

            	ret = do_select(n, &fds, end_time);
                      /*
                       * Article annotation: the braces below hold the body of
                       * do_select() pasted inline; `fds` inside it is used as a
                       * pointer (fds->in), i.e. the `&fds` passed above.
                       */
                      {
                        	ktime_t expire, *to = NULL;
                        	struct poll_wqueues table;
                        	poll_table *wait;
                        	int retval, i, timed_out = 0;
                        	u64 slack = 0;
                        	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
                        	unsigned long busy_end = 0;

                        	rcu_read_lock();
                        	retval = max_select_fd(n, fds);
                        	rcu_read_unlock();

                        	if (retval < 0)
                        		return retval;
                        	n = retval;

                        	poll_initwait(&table);
                        	wait = &table.pt;
                            /* Zero timeout, i.e. a non-blocking select */
                        	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                        		wait->_qproc = NULL;
                        		timed_out = 1;
                        	}

                        	if (end_time && !timed_out)
                        		slack = select_estimate_accuracy(end_time);

                        	retval = 0;
                        	for (;;) {
                        		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
                        		bool can_busy_loop = false;

                        		inp = fds->in; outp = fds->out; exp = fds->ex;
                        		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
                               /* For every fd set in the incoming bitmaps, call that file's poll function and record the outcome in res_* */
                        		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
                        			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
                        			unsigned long res_in = 0, res_out = 0, res_ex = 0;

                        			in = *inp++; out = *outp++; ex = *exp++;
                        			all_bits = in | out | ex;
                        			/* No fd requested in this long word: skip all of its bits at once */
                        			if (all_bits == 0) {
                        				i += BITS_PER_LONG;
                        				continue;
                        			}

                        			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                        				struct fd f;
                        				if (i >= n)
                        					break;
                        				if (!(bit & all_bits))
                        					continue;
                        				f = fdget(i);
                        				if (f.file) {
                        					const struct file_operations *f_op;
                        					f_op = f.file->f_op;
                        					/* Files without a poll method report DEFAULT_POLLMASK */
                        					mask = DEFAULT_POLLMASK;
                        					if (f_op->poll) {
                        						wait_key_set(wait, in, out,
                        							     bit, busy_flag);
                        						mask = (*f_op->poll)(f.file, wait);
                        					}
                        					fdput(f);
                        					/* Each matching condition is counted separately, so one fd can add up to 3 to retval */
                        					if ((mask & POLLIN_SET) && (in & bit)) {
                        						res_in |= bit;
                        						retval++;
                        						wait->_qproc = NULL;
                        					}
                        					if ((mask & POLLOUT_SET) && (out & bit)) {
                        						res_out |= bit;
                        						retval++;
                        						wait->_qproc = NULL;
                        					}
                        					if ((mask & POLLEX_SET) && (ex & bit)) {
                        						res_ex |= bit;
                        						retval++;
                        						wait->_qproc = NULL;
                        					}
                        					/* got something, stop busy polling */
                        					if (retval) {
                        						can_busy_loop = false;
                        						busy_flag = 0;

                        					/*
                        					 * only remember a returned
                        					 * POLL_BUSY_LOOP if we asked for it
                        					 */
                        					} else if (busy_flag & mask)
                        						can_busy_loop = true;

                        				}
                        			}
                        			/* Store the result words only when something was found */
                        			if (res_in)
                        				*rinp = res_in;
                        			if (res_out)
                        				*routp = res_out;
                        			if (res_ex)
                        				*rexp = res_ex;
                        			cond_resched();
                        		}
                        		wait->_qproc = NULL;
                        		if (retval || timed_out || signal_pending(current))
                        			break;
                        		if (table.error) {
                        			retval = table.error;
                        			break;
                        		}

                        		/* only if found POLL_BUSY_LOOP sockets && not out of time */
                        		if (can_busy_loop && !need_resched()) {
                        			if (!busy_end) {
                        				busy_end = busy_loop_end_time();
                        				continue;
                        			}
                        			if (!busy_loop_timeout(busy_end))
                        				continue;
                        		}
                        		busy_flag = 0;

                        		/*
                        		 * If this is the first loop and we have a timeout
                        		 * given, then we convert to ktime_t and set the to
                        		 * pointer to the expiry value.
                        		 */
                        		if (end_time && !to) {
                        			expire = timespec_to_ktime(*end_time);
                        			to = &expire;
                        		}
                                /* Sleep: per the article, the drivers' poll functions have already put this task on their wait queues, so it resumes on timeout or on wakeup; a zero return marks the timeout */
                        		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                        					   to, slack))
                        			timed_out = 1;
                        	}
                            /* Release the poll table (original note: remove from the wait queues) */
                        	poll_freewait(&table);

                        	return retval;
                        }

            	if (ret < 0)
            		goto out;
            	/* Nothing ready: return -ERESTARTNOHAND if a signal is pending, otherwise 0 */
            	if (!ret) {
            		ret = -ERESTARTNOHAND;
            		if (signal_pending(current))
            			goto out;
            		ret = 0;
            	}
                /* Copy the results back to the user-space fd sets */
            	if (set_fd_set(n, inp, fds.res_in) ||
            	    set_fd_set(n, outp, fds.res_out) ||
            	    set_fd_set(n, exp, fds.res_ex))
            		ret = -EFAULT;

            out:
            	/* Free the bitmap buffer only if it was not the on-stack array */
            	if (bits != stack_fds)
            		kvfree(bits);
            out_nofds:
            	return ret;
            }
	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}

由代码可知:

1. 将用户态传递的 fdset copy 到内核 bitmap。很明显的缺点是:用户态 fd_set 的大小在编译时就固定了(由 FD_SETSIZE 决定,通常为 1024),能监控的 fd 范围也就随之固定

2. 接下来就是 遍历 file->poll 

3. 将驱动 poll 返回的结果存到 fdset。跟 poll 一样,返回值只是数量,无法直接得知是哪个 fd 上的事件,还得用户程序再轮询一遍 fdset

三、 select 和 poll 总结

1. 两者每次调用 都需要重新 copy fd 的集合到内核态,这本身就浪费性能;而 select 的fdset 既做输入值又做输出值,每次还得再设置下 fdset 才行

2. poll 采用的链表方式,显然相比 select bitmap的方式支持的 fd 限制更少

3. 两者都会轮询一遍 所有的 fd ,这本身也是一种效率低的方式

4. 两者都不能精确的得到是哪个 fd 有事件,poll 返回数量,select 返回 fdset 的bitmap,都需要用户程序再轮询一遍是哪个fd 事件

5. poll 的事件类型相比 select 的 in/out/ex 三类,明显支持的情况更多

显然,当 fd 数量特别大的时候(如大量的socket),select 和 poll 的缺点就会很明显,而 epoll 提供了一种更好的方式。 

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值