Linux 系统调用之 poll 源码剖析

关于驱动函数 poll 的详细解析参考 Linux 内核驱动 poll 函数解析

核心逻辑

对 Linux 2.6.36 中 poll 的代码简化如下,只列出了关键步骤以展示核心逻辑。

/*
 * Simplified sketch of do_poll(): local declarations and timeout setup are
 * omitted. Each pass of the outer loop scans every pollfd; the loop ends
 * once at least one descriptor was counted or the timeout flag is set,
 * otherwise the task is scheduled away before the next pass.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
	for (;;) {
		for (walk = list; walk != NULL; walk = walk->next) {   /* iterate over every pfd */
			for (; pfd != pfd_end; pfd++) {
				if (do_pollfd(pfd, pt)) {   /* check whether pfd can be read/written without blocking, and add pfd to the wait queue */
					count++;
					pt = NULL;	/* once something is ready, stop passing a poll_table */
				}
			}
		}

		if (count || timed_out) /* leave the loop if a descriptor is ready or the timeout expired */
			break;

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))    /* schedule; a zero return is treated as a timeout */
			timed_out = 1;
	}
	return count;
}

调用路径

poll
	sys_poll				/* 函数入口,判断是否需要超时设置 */
		do_sys_poll			/* 处理参数传递和结果返回 */
			do_poll			/* 循环遍历所有 pollfd */
				do_pollfd	/* 检测 pollfd,并将其注册到等待队列 */

详细代码分析及注释

sys_poll

系统调用 poll 的入口函数,判断是否需要超时设置。

/*
 * Entry point of the poll system call.
 *
 * A negative timeout_msecs means no deadline is set up: do_sys_poll() is
 * called with a NULL end time. Otherwise the millisecond value is split
 * into whole seconds and the remaining nanoseconds, handed to
 * poll_select_set_timeout() to fill in the deadline, and that deadline is
 * passed on to do_sys_poll().
 *
 * Returns the value produced by do_sys_poll().
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		long, timeout_msecs)
{
	struct timespec deadline;

	/* No timeout requested: poll without an end time. */
	if (timeout_msecs < 0)
		return do_sys_poll(ufds, nfds, NULL);

	poll_select_set_timeout(&deadline, timeout_msecs / MSEC_PER_SEC,
		NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));

	return do_sys_poll(ufds, nfds, &deadline);
}

do_sys_poll

该函数负责将用户空间的 pollfds 复制到内核空间,并将最后结果返回给用户空间。

从代码可以看出,poll 在参数传递方面效率不高:

  • 需要将用户空间的 pollfds 复制到内核空间
  • 在返回给用户空间结果时,也需要逐个将所有 pollfd 的 revents 写回用户空间
  • 用户拿到 pollfd 后需要遍历所有的结果才能确定可用的描述符
/*
 * Copy the caller's pollfd array into kernel memory, run do_poll() on it,
 * and write each entry's revents back to user space.
 *
 * The entries are held in a chain of poll_list chunks: the first chunk
 * lives in the on-stack buffer stack_pps, and any further chunks are
 * obtained with kmalloc() and released before returning.
 *
 * @ufds:     user-space array of pollfd entries
 * @nfds:     number of entries in @ufds
 * @end_time: passed through unchanged to do_poll()
 *
 * Returns the value of do_poll() on success, -EINVAL when nfds exceeds
 * rlimit(RLIMIT_NOFILE), -EFAULT when copying from or to user space fails,
 * and -ENOMEM when a chunk cannot be allocated.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;
 	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;	/* entries still to be copied in */

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	/* The first chunk holds at most N_STACK_PPS entries. */
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		/* Terminate the chain here so cleanup can always walk it. */
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		/* ufds + nfds-todo is the first user entry not yet copied. */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)	/* every entry has been copied in; leave the loop */
			break;

		/* Allocate dynamic memory for the pollfds that did not fit so far */
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	fdcount = do_poll(nfds, head, &table, end_time);
	poll_freewait(&table);

	/* Copy the revents of every pollfd back to user space */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;	/* err is still -EFAULT here */
  	}

	err = fdcount;
out_fds:
	/* head itself is the stack buffer; only the chunks after it are freed. */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}

do_poll

循环遍历每一个 pollfd;一轮遍历结束后,如果有描述符就绪、等待队列出错、有信号待处理或者已经超时,则退出循环并返回,否则进入调度等待,之后重新遍历。

/*
 * Scan every pollfd in the poll_list chain, repeating the scan until
 * something is ready, an error or signal is seen, or the timeout expires.
 *
 * @nfds:     not referenced in this body
 * @list:     head of the poll_list chain to scan
 * @wait:     supplies the poll_table handed to do_pollfd() and the error
 *            value checked after each pass
 * @end_time: NULL for no timeout; a zero tv_sec/tv_nsec pair means a
 *            single pass without waiting
 *
 * Returns the number of entries for which do_pollfd() returned non-zero.
 * When that count is zero, returns wait->error, or -EINTR if a signal is
 * pending; this is 0 when the loop ended only because of the timeout.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	unsigned long slack = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;

		for (walk = list; walk != NULL; walk = walk->next) {	/* iterate over the pollfd chunks */
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Test pfd. If a ready descriptor is found,
				 * count it and drop the poll_table, because
				 * the remaining iterations no longer need it.
				 */
				if (do_pollfd(pfd, pt)) {
					count++;
					pt = NULL;
				}
			}
		}
		/* All wait entries are registered now, so the poll_table is no longer needed */
		pt = NULL;
		if (!count) {
			/* Nothing ready: report a wait-queue error or a pending signal. */
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		/* Schedule; a zero return marks the timeout, and one more pass follows. */
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

do_pollfd

通过调用驱动函数 poll 将文件注册到等待队列,并返回基于位的 mask,指明当前描述符准备好的动作。

/*
 * Poll a single pollfd entry and store the outcome in pollfd->revents.
 *
 * @pollfd: entry to test; its fd and events fields are read, and its
 *          revents field is written
 * @pwait:  poll_table forwarded to the file's poll method; may be NULL
 *
 * Returns the same mask that is stored in revents:
 *   - 0 when fd is negative,
 *   - POLLNVAL when fget_light() yields no file,
 *   - otherwise the result of the file's poll method (or DEFAULT_POLLMASK
 *     when the file has none), restricted to the requested events plus
 *     POLLERR and POLLHUP.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int revents = 0;
	int fd = pollfd->fd;
	int fput_needed;
	struct file *filp;

	if (fd < 0)
		goto out;

	/*
	 * Look up the file behind the descriptor; fput_needed is handed back
	 * to fput_light() below (presumably telling it whether a reference
	 * has to be dropped - confirm against fget_light()).
	 */
	filp = fget_light(fd, &fput_needed);
	if (filp == NULL) {
		revents = POLLNVAL;
		goto out;
	}

	if (filp->f_op && filp->f_op->poll) {
		/* Record the events of interest before calling the poll method. */
		if (pwait)
			pwait->key = pollfd->events | POLLERR | POLLHUP;
		revents = filp->f_op->poll(filp, pwait);
	} else {
		revents = DEFAULT_POLLMASK;
	}

	/* Mask out unneeded events. */
	revents &= pollfd->events | POLLERR | POLLHUP;
	fput_light(filp, fput_needed);

out:
	pollfd->revents = revents;
	return revents;
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

lylhw13_

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值