【Poll】Linux Poll源码

最新推荐文章于 2023-11-04 16:10:32 发布

Obito_uchiha

最新推荐文章于 2023-11-04 16:10:32 发布

阅读量322

点赞数

分类专栏： Linux

本文链接：https://blog.csdn.net/sinat_33822516/article/details/113758344

版权

Linux 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

上一篇博客大致分析了select的源码，其实poll和select的实现基本上是相同的，只是poll改进了select的以下两个缺点：

1. select可以监听的描述符数量是有限的，默认最多1024个。因为select使用fd_set这个数据结构来保存要监听的描述符，而fd_set其实就是一个unsigned long数组，在64位系统上这个数组默认有16个元素，每个元素8字节、每字节8位，也就是16*8*8=1024位，每一位对应一个描述符。具体可以参考我上一篇博客Linux Select源码。

2. 每次select返回后，传入的fd_set都会被内核改写为本次的返回结果，所以用户态的程序每次调用select之前，都需要重新设置要监听的文件描述符，这给开发带来了很大的麻烦，特别是需要监听的fd比较多的时候。

本篇博客将分析poll的源码，看看poll是如何改进select的上述缺点的。

poll函数原型

#include <poll.h>

/*
 * One entry of the array handed to poll().
 *
 * With a 4-byte int the three members add up to 8 bytes (4 + 2 + 2),
 * which is the same size as an unsigned long on a 64-bit (LP64) system.
 */
struct pollfd {
    /* file descriptor */
    int fd;
    /* requested events */
    short events;
    /* returned events */
    short revents;
};
 
/*
 * fds:     array of pollfd entries to watch
 * nfds:    number of entries in fds
 * timeout: wait limit in milliseconds; a negative value means no deadline
 */
int poll(struct pollfd *fds, nfds_t nfds, int timeout);

我们将举个比较简单的demo，根据demo来解析poll系统调用

int sockfd;             //套接字句柄  
struct pollfd pollfds;  
int timeout;  
  
timeout = 5000;  
pollfds.fd = sockfd;                //设置监控sockfd  
pollfds.events = POLLIN|POLLPRI;            //设置监控的事件  
  
for(;;){  
    switch(poll(&pollfds,1,timeout)){       //开始监控  
    case -1:                    //函数调用出错  
        printf("poll error \r\n");  
    break;  
    case 0:  
        printf("time out \r\n");  
    break;  
    default:                    //得到数据返回  
        printf("sockfd have some event \r\n");  
        printf("event value is 0x%x",pollfds.revents);  
    break;  
    }  
}

poll系统调用的源码位于内核的fs/select.c中

sys_poll

/*
 * poll system call entry point.
 *
 * ufds:          user-space array of pollfd entries
 * nfds:          number of entries in ufds
 * timeout_msecs: wait limit in milliseconds; negative means no deadline
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is turned
 * into -ERESTART_RESTARTBLOCK after the restart block has been filled in.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	const int has_timeout = (timeout_msecs >= 0);
	struct timespec end_time;
	struct timespec *deadline = NULL;
	struct restart_block *rb;
	int ret;

	/* Only a non-negative timeout produces a deadline; otherwise NULL is passed on. */
	if (has_timeout) {
		deadline = &end_time;
		poll_select_set_timeout(deadline, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	/* The real work happens here. */
	ret = do_sys_poll(ufds, nfds, deadline);
	if (ret != -EINTR)
		return ret;

	/*
	 * Interrupted: record the arguments (and the deadline already computed,
	 * if any) in the thread's restart block, with do_restart_poll as the
	 * function to resume with.
	 */
	rb = &current_thread_info()->restart_block;
	rb->fn = do_restart_poll;
	rb->poll.ufds = ufds;
	rb->poll.nfds = nfds;
	rb->poll.has_timeout = has_timeout;
	if (has_timeout) {
		rb->poll.tv_sec = end_time.tv_sec;
		rb->poll.tv_nsec = end_time.tv_nsec;
	}

	return -ERESTART_RESTARTBLOCK;
}

do_sys_poll

/*
 * Copy the caller's pollfd array into kernel memory, poll it via do_poll(),
 * then write every entry's revents back to user space.
 *
 * ufds:     user-space array of pollfd entries
 * nfds:     number of entries in ufds
 * end_time: passed through to do_poll() unchanged
 *
 * Returns do_poll()'s result on success, -EINVAL when nfds exceeds
 * rlimit(RLIMIT_NOFILE), -EFAULT when a user copy fails, and -ENOMEM when
 * kmalloc() fails.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;
 	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	/* When only a few entries are passed in, this on-stack buffer alone
	   holds their fd/events data and no heap allocation is needed. */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	/* The stack buffer is viewed as a poll_list: a header (next, len)
	   followed by its trailing entries array.
	   NOTE(review): the original annotation called the header "exactly
	   8 bytes"; with a pointer plus an int that only holds on 32-bit. */
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	/* First chunk: at most N_STACK_PPS entries (presumably the number of
	   pollfd slots that fit in stack_pps - macro not shown here). If all
	   entries fit, the loop below never reaches kmalloc(). */
	len = min_t(unsigned int, nfds, N_STACK_PPS);

	/* Copy the user's entries in, chunk by chunk. Each further chunk is a
	   freshly allocated poll_list holding up to POLLFD_PER_PAGE entries,
	   appended to the chain through ->next. */
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		/* Fill the current chunk; nfds-todo is the number of entries
		   already copied, so this continues where the last chunk ended. */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		/* Allocate the next chunk for the remaining entries. */
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}
	/* Prepare the wait-queue table; presumably this installs the callback
	   that poll_wait() later invokes (helper not shown here). */
	poll_initwait(&table);
	/* The actual polling loop. */
	fdcount = do_poll(nfds, head, &table, end_time);
	poll_freewait(&table);

	/* Copy the resulting events back: only the revents field of each
	   user entry is written, chunk by chunk in the original order. */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
  	}

	err = fdcount;
out_fds:
	/* Free every heap chunk; head itself lives on the stack, so start
	   from head->next. */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}

do_poll

/*
 * Scan every pollfd in the chunk chain until at least one reports events,
 * an error or pending signal is seen, or the deadline passes.
 *
 * nfds:     number of entries (not referenced in this body)
 * list:     head of the poll_list chain holding the entries
 * wait:     wait-queue table; its embedded poll_table is handed to do_pollfd()
 * end_time: deadline, or NULL for none; an all-zero value means "do not wait"
 *
 * Returns the number of entries for which do_pollfd() reported events,
 * otherwise wait->error, or -EINTR when a signal is pending; 0 on timeout
 * with nothing to report.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	unsigned long slack = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	/* Three nested loops: retry loop -> chunks of the chain -> entries
	   within a chunk. */
	for (;;) {
		struct poll_list *walk;

		/* Walk the chain; every node carries its own array of entries. */
		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			/* Visit each entry of this chunk and poll its fd. */
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				/* do_pollfd() invokes the file's poll method;
				   presumably that method calls poll_wait(), which
				   runs the callback stored in _qproc (not shown
				   here). Once one entry has events the callback
				   is cleared: this pass will return anyway, so
				   queueing on further wait queues would only
				   cost time to undo. */
				if (do_pollfd(pfd, pt)) {
					count++;
					pt->_qproc = NULL;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		/* Clearing the callback here means later passes only re-check
		   the entries and do not register again. */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;
			/* A pending signal takes precedence over wait->error. */
			if (signal_pending(current))
				count = -EINTR;
		}
		/* Leave the loop on events, an error, a pending signal, or
		   timeout. */
		if (count || timed_out)
			break;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}
		/* Presumably the task sleeps here until woken or the deadline
		   passes (helper not shown); a zero return is treated as a
		   timeout, so one more scan runs before the loop exits. */
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

do_pollfd

/*
 * Poll a single entry and store the outcome in pollfd->revents.
 *
 * pollfd: the entry to check; its fd and events are read, revents is written
 * pwait:  poll table passed to the file's poll method; its _key is set to
 *         the events being asked for
 *
 * Returns the same mask that was stored in revents: 0 for a negative fd,
 * POLLNVAL when the fd does not resolve to a file, otherwise the reported
 * events limited to the requested ones plus POLLERR and POLLHUP.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int revents = 0;
	int fd = pollfd->fd;
	struct fd f;

	/* Negative descriptors are skipped and report no events. */
	if (fd < 0)
		goto done;

	f = fdget(fd);
	if (!f.file) {
		revents = POLLNVAL;
		goto done;
	}

	if (f.file->f_op && f.file->f_op->poll) {
		/*
		 * POLLERR and POLLHUP are always added to what the caller
		 * asked for, so user space has to be prepared to see them
		 * even when it did not request them.
		 */
		pwait->_key = pollfd->events|POLLERR|POLLHUP;
		/* Ask the file itself which events are ready. */
		revents = f.file->f_op->poll(f.file, pwait);
	} else {
		/* No poll method: fall back to the default mask. */
		revents = DEFAULT_POLLMASK;
	}

	/* Mask out unneeded events. */
	revents &= pollfd->events | POLLERR | POLLHUP;
	fdput(f);

done:
	pollfd->revents = revents;
	return revents;
}

至于执行poll函数可以参考我上一篇博客Linux Select源码，原理是一样的。

总结：

为什么poll解决了文章开篇提到的两个select的缺点呢？

1. 首先poll使用pollfd这个数据结构，它将events（要监听的事件）和revents（实际发生的事件）区分开了：内核返回时只写revents，即使某个fd上没有产生事件，也不会影响这个fd的events，这样用户态就不需要在每次调用前都重新设置了。

而select使用的是fd_set，内核返回的时候只有触发事件的fd对应的位为1，其他位都是0，也就是说没有触发事件的fd对应的位已经被清零了，所以下次调用select之前仍然要重新设置。

2. poll使用了链表加数组的方式解决了select有最大监听数量的缺陷。select使用的是一个预先分配好大小的unsigned long数组，数组的每一个bit代表一个fd是否需要监听，但是预分配的数组大小毕竟是固定的。而poll使用poll_list这个数据结构：poll_list的第一个成员是poll_list *next，指向下一个poll_list；最后一个成员是柔性数组entries，分配空间的时候在结构体后面多分配若干个pollfd的空间，就可以通过poll_list->entries定位到这块用来存储pollfd的内存。这样poll就不再受fd_set固定大小的限制了（不过从do_sys_poll可以看到，nfds仍然不能超过RLIMIT_NOFILE）。

/*
 * One chunk of the chain that holds the pollfd entries being polled.
 *
 * next:    following chunk, or NULL for the last one
 * len:     number of valid elements in entries
 * entries: trailing array of pollfd; its storage is whatever the allocator
 *          placed directly after the header. Declared as a C99 flexible
 *          array member rather than the GNU zero-length form "[0]"; the
 *          layout is the same, but this is standard C.
 */
struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[];
};

从poll和select的内核代码来看，很容易发现一个问题：内核中都使用了三层for循环来遍历所有的文件描述符。假设现在有一个fd触发了事件，那么调用select/poll的进程会被唤醒，唤醒后依然要把所有fd重新遍历一遍（执行三层for循环），遍历完才发现只有一个fd有事件，这样效率其实是很低的。所以select和poll不适用于fd较多且只有少部分活跃的情况，这时就需要epoll了。epoll与poll、select最大的区别就是不再轮询所有fd，而是真正类似于事件监听的方式。

Obito_uchiha

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
【Poll】Linux Poll源码

上一篇博客大致分析了select的源码，其实poll和select的实现基本上是相同，只是poll改进了一些select的缺点 1. select可以监听的描述符是有限的，默认是1024个，因为select使用fd_set这个数据结构来保存要监听的描述符，其实fd_set就是一个unsigned long数组，这个数组默认是16，也就是16*8*8=1024。具体可以参考我上一篇博客Linux Select源码。2. 每次select返回后，response fd_set都被重置了，所以用户态的程.
复制链接

扫一扫