select多路复用源码剖析

最新推荐文章于 2022-09-26 22:48:32 发布

EW_DUST

最新推荐文章于 2022-09-26 22:48:32 发布

阅读量388

点赞数

分类专栏：计算机系统计算机网络文章标签： linux 内核 epoll 源码

本文链接：https://blog.csdn.net/qq_40871466/article/details/104477455

版权

计算机系统同时被 2 个专栏收录

5 篇文章 0 订阅

订阅专栏

计算机网络

1 篇文章 0 订阅

订阅专栏

简单说一下5种IO

阻塞IO：一直等待，直到数据到来。
非阻塞IO：直接返回有没有数据，没有就直接返回错误。
IO复用：将多个IO，放在一起，一个个轮询。
信号驱动：设置一个信号，当有IO的信号的时候告诉我。
异步IO：直接丢给别人做。

可以去看看这个博客。
select是IO复用的一种。
函数原型如下。

/*
 * select - declaration of the I/O-multiplexing call discussed below.
 *
 * __nfds:      highest-numbered file descriptor in any of the sets, plus one
 * __readfds:   set of descriptors watched for reading
 * __writefds:  set of descriptors watched for writing
 * __exceptfds: set of descriptors watched for exceptional conditions
 * __timeout:   timeout for the call
 */
int select (int __nfds, fd_set *__restrict __readfds,
		   fd_set *__restrict __writefds,
		   fd_set *__restrict __exceptfds,
		   struct timeval *__restrict __timeout);

参数也很容易懂.

nfds: 最大的文件描述符+1
readfds:读的描述符集。
writefds：写的描述符集。
exceptfds：异常描述符集。
timeout：超时时间。

直接看看结构体是啥。

/* fd_set for select and pselect.  */
typedef long int __fd_mask;	// one bitmap word (a long int; the author's environment is 64-bit)
// Put simply, fd_set is a bitmap stored as an array of __fd_mask words.
typedef struct
  {
    /* XPG4.2 requires this member name.  Otherwise avoid the name
       from the global namespace.  */
#ifdef __USE_XOPEN
    /* __FD_SETSIZE bits in total, __NFDBITS bits per array element.  */
    __fd_mask fds_bits[__FD_SETSIZE / __NFDBITS];
# define __FDS_BITS(set) ((set)->fds_bits)
#else
    /* Same array, under a name kept out of the global namespace.  */
    __fd_mask __fds_bits[__FD_SETSIZE / __NFDBITS];
# define __FDS_BITS(set) ((set)->__fds_bits)
#endif
  } fd_set;

测试下多少位。

/*
 * Print how many bits an fd_set holds.
 *
 * sizeof yields a size_t, so the value is printed with %zu; passing a
 * size_t to %d is undefined behavior.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    printf("%zu\n", sizeof(fd_set) * 8);
    return 0;
}
// Prints 1024 on the author's machine: an fd_set can track at most 1024
// descriptors. Reportedly the limit can be changed; that is not pursued here.

看看时间结构体，也很简单

/* Time value split into whole seconds and a microsecond part.  */
struct timeval
  {
    __time_t tv_sec;		/* Seconds.  */
    __suseconds_t tv_usec;	/* Microseconds (millionths of a second, not milliseconds).  */
  };

几个常用的操作

/* Access macros for `fd_set'.  */
#define	FD_SET(fd, fdsetp)	__FD_SET (fd, fdsetp)// set the bit for fd
#define	FD_CLR(fd, fdsetp)	__FD_CLR (fd, fdsetp)// clear the bit for fd
#define	FD_ISSET(fd, fdsetp)	__FD_ISSET (fd, fdsetp)// test the bit for fd
#define	FD_ZERO(fdsetp)		__FD_ZERO (fdsetp)// clear every bit in the set
// The implementations are simple as well. Each fallback below is guarded by
// #ifndef, so it is skipped whenever the macro of the same name is already
// defined (as the four defines above do).
#ifndef FD_SET
#define FD_SET(n, p)    (__XFDS_BITS(p, ((n)/NFDBITS)) |= ((fd_mask)1 << ((n) % NFDBITS)))// OR the mask in: sets the bit to 1
#endif
#ifndef FD_CLR
#define FD_CLR(n, p)    (__XFDS_BITS((p), ((n)/NFDBITS)) &= ~((fd_mask)1 << ((n) % NFDBITS))) // AND with the inverted mask: clears the bit
#endif
#ifndef FD_ISSET
#define FD_ISSET(n, p)  ((__XFDS_BITS((p), ((n)/NFDBITS))) & ((fd_mask)1 << ((n) % NFDBITS)))// AND with the plain mask: reads the bit
#endif
#ifndef FD_ZERO
#define FD_ZERO(p)      bzero((char *)(p), sizeof(*(p)))// zeroes the whole set through a bzero() call (the author expected an inline trick such as XOR-ing the set with itself)
#endif

看完大致是啥，我们就去看内核代码：系统调用 select。
我在 linux 2.6 里面找到的是 SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp) 这个东西，在 fs/select.c 里面；有的版本里是 sys_select 这个函数。这是内核版本的差异：SYSCALL_DEFINE5 是用来定义系统调用的宏，展开之后得到的函数就是 sys_select，所以实现都是一样的，将就着看。

select()

/*
 * select() system call entry point.
 *
 * Copies the optional user-space timeout into the kernel, converts it via
 * poll_select_set_timeout(), and delegates the real work to
 * core_sys_select().
 *
 * Returns -EFAULT if the timeout cannot be copied in, -EINVAL if
 * poll_select_set_timeout() rejects it, otherwise the value produced by
 * poll_select_copy_remaining().
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		/* Bring the caller's timeval from user space into kernel memory. */
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		/*
		 * Hand the timeout over as (seconds, nanoseconds): whole seconds
		 * contained in tv_usec are carried into the seconds argument and
		 * the remaining microseconds are scaled to nanoseconds.
		 */
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}
	/* Up to here: the timeout was copied in and converted to a timespec. */
	ret = core_sys_select(n, inp, outp, exp, to);	/* the main work */
	/* Presumably writes the remaining time back to tvp - helper not shown here. */
	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
	return ret;
}

core_sys_select()

后面如果有些函数不知道在哪，可以用grep搜索一下，或者装个vscode直接全局搜索。
大部分都在 include/linux/poll.h 里面，不同内核版本路径可能不一样。

/*
 * core_sys_select - body of select() once the timeout has been converted.
 *
 * @n:        number of descriptors to examine (clamped to the fd table's max_fds)
 * @inp:      user-space read set, may be NULL
 * @outp:     user-space write set, may be NULL
 * @exp:      user-space exception set, may be NULL
 * @end_time: timeout handed through to do_select(), or NULL
 *
 * Copies the three user bitmaps into kernel memory, runs do_select() on
 * them and copies the three result bitmaps back to user space.
 *
 * Returns do_select()'s result, or -EINVAL (n < 0), -ENOMEM (allocation
 * failed), -EFAULT (a user copy failed) or -ERESTARTNOHAND (nothing ready
 * and a signal is pending).
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec *end_time)
{
	fd_set_bits fds;
	/*
	 * For reference, fd_set_bits holds the pointers to the six bitmaps:
	 *
	 * typedef struct {
	 *	unsigned long *in, *out, *ex;
	 *	unsigned long *res_in, *res_out, *res_ex;
	 * } fd_set_bits;
	 */
	void *bits;
	int ret, max_fds;
	unsigned int size;
	struct fdtable *fdt;
	/*
	 * For reference:
	 *
	 * struct fdtable {
	 *	unsigned int max_fds;
	 *	struct file __rcu **fd;       // current fd array
	 *	fd_set *close_on_exec;
	 *	fd_set *open_fds;
	 *	struct rcu_head rcu;
	 *	struct fdtable *next;
	 * };
	 */
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];	/* the on-stack buffer is tried first */

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();	/* begin RCU read-side section */
	fdt = files_fdtable(current->files);	/* fd table of the calling task (current) */
	max_fds = fdt->max_fds;	/* snapshot the table's max_fds */
	rcu_read_unlock();	/* end RCU read-side section */
	if (n > max_fds)	/* clamp n to max_fds */
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);	/* bytes in ONE bitmap of n bits, rounded up to whole longs */
	/*
	 * The helpers live in include/linux/poll.h:
	 *
	 * // How many longwords for "nr" bits?
	 * #define FDS_BITPERLONG	(8*sizeof(long))
	 * #define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
	 * #define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
	 */
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {	/* six bitmaps do not fit in stack_fds: use the heap */
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		bits = kmalloc(6 * size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	/* Carve the buffer into six consecutive bitmaps of `size` bytes each. */
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	/*
	 * get_fd_set() copies one set in from user space, or zero-fills it when
	 * the user pointer is NULL. select() therefore copies its sets on every
	 * call (author's note: this is why select is slow).
	 *
	 * static inline
	 * int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
	 * {
	 *	nr = FDS_BYTES(nr);
	 *	if (ufdset)
	 *		return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;
	 *
	 *	memset(fdset, 0, nr);
	 *	return 0;
	 * }
	 */
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);
	/*
	 * zero_fd_set() clears one result bitmap:
	 *
	 * static inline
	 * void zero_fd_set(unsigned long nr, unsigned long *fdset)
	 * {
	 *	memset(fdset, 0, FDS_BYTES(nr));
	 * }
	 */
	ret = do_select(n, &fds, end_time);	/* the actual polling happens here */

	if (ret < 0)	/* error from do_select() */
		goto out;
	if (!ret) {
		/* Nothing ready: -ERESTARTNOHAND if a signal is pending, else 0. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
	/*
	 * set_fd_set() copies one result bitmap back to user space (a NULL user
	 * pointer is skipped):
	 *
	 * static inline unsigned long __must_check
	 * set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
	 * {
	 *	if (ufdset)
	 *		return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
	 *	return 0;
	 * }
	 */
out:
	if (bits != stack_fds)
		kfree(bits);	/* free only when the bitmaps came from kmalloc() */
out_nofds:
	return ret;
}

do_select

所有的核心都在这个里面了。

/*
 * do_select - poll every descriptor named in the input bitmaps until at
 * least one is ready, the timeout expires, a signal is pending, or an
 * error is recorded in the poll table.
 *
 * @n:        number of descriptors to scan; replaced by max_select_fd()'s result
 * @fds:      the three input bitmaps and the three result bitmaps
 * @end_time: timeout, or NULL; an all-zero value makes the function scan
 *            once without sleeping
 *
 * Returns the number of ready (descriptor, event class) hits counted,
 * 0 when nothing was ready (timeout or pending signal), or a negative
 * error code.
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	/*
	 * For reference - "Structures and helpers for select/poll syscall".
	 * The article notes that select and poll share the same mechanism;
	 * the field notes below are the article author's.
	 *
	 * struct poll_wqueues {
	 *	poll_table pt;
	 *	struct poll_table_page *table;
	 *	struct task_struct *polling_task;	// the task doing the polling
	 *	int triggered;		// set to 1 once the task has been woken, so it does not go back to sleep
	 *	int error;		// error code
	 *	int inline_index;	// index into inline_entries
	 *	struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
	 * };
	 */
	poll_table *wait;
	/*
	 * For reference - "structures and helpers for f_op->poll implementations":
	 *
	 * typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
	 *
	 * typedef struct poll_table_struct {
	 *	poll_queue_proc qproc;
	 *	unsigned long key;
	 * } poll_table;
	 */
	int retval, i, timed_out = 0;
	unsigned long slack = 0;

	rcu_read_lock();
	/*
	 * Per the article: checks the descriptors set in the bitmaps against
	 * the task's open files (each must be open) and returns the highest
	 * one. The helper is not shown here; a negative result is returned
	 * to the caller just below.
	 */
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;
	/*
	 * Initialise the poll table. This installs the callback in table.pt;
	 * it is the function that poll_wait() invokes, and driver poll
	 * implementations (f_op->poll) are expected to call poll_wait().
	 */
	poll_initwait(&table);
	wait = &table.pt;
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		/* Zero timeout: pass no poll table to the drivers and do not sleep. */
		wait = NULL;
		timed_out = 1;
	}

	/*
	 * NOTE(review): the meaning of the estimate is not evident from this
	 * excerpt; the value is only used as the slack argument of
	 * poll_schedule_timeout() below.
	 */
	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {	/* each pass scans every requested descriptor */
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* One iteration per bitmap word; i is the descriptor number. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			const struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* No descriptor requested in this word: skip all of it. */
				i += __NFDBITS;
				continue;
			}

			/* Walk the bits of this word; bit is the mask for descriptor i. */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				int fput_needed;
				if (i >= n)
					break;
				if (!(bit & all_bits))	/* descriptor i was not requested */
					continue;
				file = fget_light(i, &fput_needed);	/* look up the struct file for descriptor i */
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op && f_op->poll) {	/* the file provides a poll method */
						wait_key_set(wait, in, out, bit);
						/*
						 * wait_key_set() stores the events of interest in the
						 * poll table's key:
						 *
						 * static inline void wait_key_set(poll_table *wait, unsigned long in,unsigned long out, unsigned long bit)
						 * {
						 *	if (wait) {
						 *		wait->key = POLLEX_SET;
						 *		if (in & bit)
						 *			wait->key |= POLLIN_SET;
						 *		if (out & bit)
						 *			wait->key |= POLLOUT_SET;
						 *	}
						 * }
						 */
						mask = (*f_op->poll)(file, wait);	/* ask the file for its ready events */
					}
					fput_light(file, fput_needed);	/* release what fget_light() obtained (this does not close the file) */
					/* Record results according to the mask returned by poll. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
						/* A hit was found: later poll calls receive no poll table. */
						wait = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait = NULL;
					}
				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();	/* give the scheduler a chance between words */
		}
		/* The poll table is handed to the drivers during the first pass only. */
		wait = NULL;
		/* Stop when something is ready, the timeout hit, or a signal is pending. */
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {	/* an error was recorded in the poll table */
			retval = table.error;
			break;
		}

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		/*
		 * Sleep until woken or until the timeout expires (per the article,
		 * a wakeup comes either from the timeout or from a polled file
		 * becoming ready). A zero return is recorded as a timeout.
		 */
		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

简单来讲，do_select 循环调用各个 fd 的 poll 来设置结果位，每一轮循环完判断一下：如果有结果、超时了，或者有信号需要处理，就直接返回。
大致流程我们已经全看明白了。
一开始系统调用select,将时间复制到了内核并转换成了内核里面识别的时间，然后调用了core_sys_select，当返回的时候说明处理完毕了，最后修改了一下时间。

core_sys_select 在内核里面开辟了真正的文件描述符位图空间，先将 select 传进来的文件描述符集拷贝了一份，然后又设置了三个表示结果的位图 res_in、res_out、res_ex，然后调用 do_select，等 do_select 返回后，再将 res_in、res_out、res_ex 写回用户空间。

do_select 首先检查了文件描述符对应的文件是否打开，然后初始化了 poll_wqueues 队列，里面含有回调函数，然后调用每个 fd 的 poll() 函数把回调挂上去；每次 poll 返回一个 mask，再根据 mask 设置对应状态，也就是前面的 res_in、res_out、res_ex。最后阻塞进程，等待时间到或者被 wake 唤醒。

最核心的三个问题已经全部出来了

最大只能检测 1024 个 fd
每次调用 select，如果第一轮轮询时所有被监听的描述符都没有事件，就需要把进程挂到每一个被监听的文件描述符的等待队列上一次。有事件到来被唤醒时，并不知道是哪些文件描述符有数据可以读写，需要把所有的文件描述符都轮询一遍才能知道。
需要拷贝bitmap。写回去又要拷贝一次。

如果不纠结poll()函数是怎么个过程就不用继续看了。看到这基本上就已经可以了。

/* Forward declaration: poll_initwait() needs the callback's address. */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);

/*
 * poll_initwait - prepare a poll_wqueues for use.
 *
 * Installs __pollwait as the callback of the embedded poll table, records
 * the current task as the polling task and resets the remaining fields.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);	/* this is where the function pointer is set */
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}


/*
 * Add a new entry.
 *
 * __pollwait - callback installed by poll_initwait(): takes a
 * poll_table_entry, fills it in and adds its wait-queue entry to the wait
 * queue at wait_address.
 *
 * @filp:         file being polled
 * @wait_address: wait-queue head to join
 * @p:            poll table embedded in the caller's poll_wqueues
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	/* container_of() recovers the poll_wqueues whose pt member is p. */
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	/*
	 * For reference (field notes are the article author's):
	 *
	 * struct poll_table_entry {
	 *	struct file *filp;			// the struct file of the polled fd
	 *	unsigned long key;			// event mask waited for on that fd, e.g. POLLIN, POLLOUT, POLLERR
	 *	wait_queue_t wait;			// queue entry standing for the task that called select()
	 *	wait_queue_head_t *wait_address;	// the driver's wait-queue head the entry is queued on
	 * };
	 */
	if (!entry)	/* no entry available: nothing gets registered */
		return;
	get_file(filp);
	entry->filp = filp;
	entry->wait_address = wait_address;
	/*
	 * Remember which events this waiter wants. Per the article: without
	 * the key, a waiter interested in POLLIN could be woken for POLLOUT.
	 */
	entry->key = p->key;
	/*
	 * pollwake becomes the entry's wake function. Presumably the driver
	 * calls it when the awaited condition occurs, waking the polling
	 * task - pollwake() itself is not shown here.
	 */
	init_waitqueue_func_entry(&entry->wait, pollwake); 
	
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

wait = &table.pt
mask = (*f_op->poll)(file, wait); //这个地方 通过wait 成功把__pollwait挂到了对应file 上面。
/* From linux/fs.h, abridged: only the poll member is shown. */
struct file_operations {
	/* ... other members omitted ... */
	/* poll method: takes the file and a poll table, returns an event mask. */
	unsigned int (*poll) (struct file *, struct poll_table_struct *);
	/* ... other members omitted ... */
};
// include/linux/poll.h
/*
 * init_poll_funcptr - store the queueing callback in a poll table and set
 * its key so that all events are enabled.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->_qproc = qproc;
    pt->_key   = ~0UL; /* all events enabled */
}
 
/*
 * poll_wait - forward a registration request to the callback stored in
 * the poll table. Does nothing when the table, its callback or the
 * wait-queue head is missing.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	/* Bail out early unless all three pieces are present. */
	if (!p || !p->_qproc || !wait_address)
		return;

	p->_qproc(filp, wait_address, p);
}
 
/*
 * Poll table: the queueing callback plus the mask of events of interest.
 * NOTE(review): the members carry a leading underscore here, whereas other
 * excerpts in this article use qproc/key - the excerpts appear to come from
 * different kernel versions; confirm against the tree being read.
 */
typedef struct poll_table_struct {
	poll_queue_proc _qproc;	/* callback invoked by poll_wait() */
	unsigned long _key;	/* event mask; init_poll_funcptr() sets it to ~0UL */
} poll_table;

找了半天没有找到 poll 具体干了啥：它是 file_operations 里的一个函数指针，由各个设备驱动（或文件系统）自己实现，所以每个设备都不一样。

给两个图理解一下。

在这里插入图片描述
这图听说可以放大。

博客主要参考两位神仙大佬：

EW_DUST

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
select多路复用源码剖析

简单说一下5种IO阻塞IO：一直等待知道数据到来。非阻塞IO：直接返回有没有数据，没有就直接返回错误。IO复用：将多个IO，放在一起，一个个轮询。信号驱动：设置一个信号，当有IO的信号的时候告诉我。异步IO：直接丢给别人做。可以去看看这个博客。select是IO复用的一种。函数原型如下。int select (int __nfds, fd_set *__restrict ...
复制链接

扫一扫