linux select内核实现原理:
本文对应的linux 内核版本为5.0.3
select系统调用的定义位于内核源码的fs/select.c文件中
在看linux内核对select实现之前,最好先了解用户态程序是怎么使用的,这样有利于我们对select实现的理解,select函数的使用可以自行上网搜索。
select系统调用格式定义如下:
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
fd_set __user *, exp, struct timeval __user *, tvp)
select函数直接调用kern_select函数
select->kern_select
/*
 * kern_select - select() back end taking a struct timeval timeout.
 *
 * When @tvp is non-NULL the timeout is copied in from user space and
 * converted into the timespec64 end time used on the kernel side; when
 * @tvp is NULL the end-time pointer handed to core_sys_select() stays
 * NULL.  The actual work is done by core_sys_select(); its result is
 * then passed, together with @tvp, to poll_select_copy_remaining().
 *
 * Returns -EFAULT if the timeout cannot be read from user space,
 * -EINVAL if poll_select_set_timeout() rejects it, otherwise whatever
 * poll_select_copy_remaining() returns.
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timeval __user *tvp)
{
	struct timespec64 end_time;
	struct timespec64 *deadline = NULL;
	struct timeval user_tv;

	if (tvp != NULL) {
		if (copy_from_user(&user_tv, tvp, sizeof(user_tv)) != 0)
			return -EFAULT;

		deadline = &end_time;
		/*
		 * Convert the user-space timeval into the kernel timeout
		 * representation; whole seconds carried in tv_usec are
		 * folded into the seconds argument.
		 */
		if (poll_select_set_timeout(deadline,
				user_tv.tv_sec + (user_tv.tv_usec / USEC_PER_SEC),
				(user_tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC) != 0)
			return -EINVAL;
	}

	return poll_select_copy_remaining(&end_time, tvp, PT_TIMEVAL,
			core_sys_select(n, inp, outp, exp, deadline));
}
kern_select函数中,如果用户程序定义了超时时间,那么将超时时间复制到内核态,并转换为内核态定时器使用的数据结构
剩下的主要功能是在core_sys_select函数中完成
/*
 * core_sys_select - common back end of select().
 * @n:        number of descriptors to examine; a negative value fails
 *            with -EINVAL, a value above the fd table's max_fds is
 *            clamped to max_fds.
 * @inp:      user-space set of descriptors watched for reading
 * @outp:     user-space set of descriptors watched for writing
 * @exp:      user-space set of descriptors watched for exceptions
 * @end_time: timeout passed through unchanged to do_select()
 *
 * Copies the three input sets into kernel bitmaps, clears the three
 * result bitmaps, runs do_select() and writes the result bitmaps back
 * over the user's sets.
 *
 * Returns the value of do_select() on success.  Error returns visible
 * here: -EINVAL (n < 0), -ENOMEM (bitmap allocation failed or would
 * overflow), the error from get_fd_set(), a negative do_select()
 * result, -ERESTARTNOHAND (nothing ready and a signal is pending) and
 * -EFAULT (a result set could not be written back).
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
fd_set __user *exp, struct timespec64 *end_time)
{
fd_set_bits fds;
void *bits;
int ret, max_fds;
size_t size, alloc_size;
struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
ret = -EINVAL;
if (n < 0)
goto out_nofds;
/* max_fds can increase, so grab it once to avoid race */
rcu_read_lock();
// fetch the file descriptor table of the current process
fdt = files_fdtable(current->files);
// upper bound on descriptors for the current process (max_fds of its fd table)
max_fds = fdt->max_fds;
rcu_read_unlock();
if (n > max_fds)
n = max_fds;
/*
 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
 * since we used fdset we need to allocate memory in units of
 * long-words.
 */
// size is the size in bytes of ONE bitmap (one of inp/outp/exp) covering n
// descriptors, one bit per descriptor; n is the highest descriptor of interest + 1.
size = FDS_BYTES(n);
bits = stack_fds;
// the on-stack buffer must hold all six bitmaps; otherwise fall back to the heap
if (size > sizeof(stack_fds) / 6) {
/* Not enough space in on-stack array; must use kmalloc */
ret = -ENOMEM;
// guard the 6 * size multiplication below against overflow
if (size > (SIZE_MAX / 6))
goto out_nofds;
alloc_size = 6 * size;
bits = kvmalloc(alloc_size, GFP_KERNEL);
if (!bits)
goto out_nofds;
}
// carve the single buffer into six consecutive bitmaps of 'size' bytes each
fds.in = bits;
fds.out = bits + size;
fds.ex = bits + 2*size;
fds.res_in = bits + 3*size;
fds.res_out = bits + 4*size;
fds.res_ex = bits + 5*size;
// copy the user-supplied input sets into kernel space; e.g. inp is the set of
// descriptors the caller wants to read from
if ((ret = get_fd_set(n, inp, fds.in)) ||
(ret = get_fd_set(n, outp, fds.out)) ||
(ret = get_fd_set(n, exp, fds.ex)))
goto out;
// clear the result bitmaps
zero_fd_set(n, fds.res_in);
zero_fd_set(n, fds.res_out);
zero_fd_set(n, fds.res_ex);
ret = do_select(n, &fds, end_time);
if (ret < 0)
goto out;
// nothing ready: report -ERESTARTNOHAND if a signal is pending, else 0
if (!ret) {
ret = -ERESTARTNOHAND;
if (signal_pending(current))
goto out;
ret = 0;
}
// the results are written back into the user's own sets, overwriting what the
// caller passed in, so the caller must set the sets up again before every select()
if (set_fd_set(n, inp, fds.res_in) ||
set_fd_set(n, outp, fds.res_out) ||
set_fd_set(n, exp, fds.res_ex))
ret = -EFAULT;
out:
// only free the buffer when it did not come from the stack
if (bits != stack_fds)
kvfree(bits);
out_nofds:
return ret;
}
core_sys_select函数的主要作用 :
1.为用户关心的事件和返回的已发生事件分配位图空间(优先使用栈上的空间,栈上空间不够时使用kvmalloc分配),然后将用户空间传入的关心事件拷贝到内核空间。
2.调用do_select函数,主要的工作是在do_select函数中进行的。
3.将检测的结果赋值给用户态输入的参数,所以会把用户设置的参数覆盖掉,因此用户每次调用select函数的时候,需要重新设置一下输入参数
/*
 * do_select - scan the requested descriptors and sleep until one is ready.
 * @n:        number of descriptors covered by the bitmaps
 * @fds:      the three input bitmaps (in/out/ex) and the three result
 *            bitmaps (res_in/res_out/res_ex)
 * @end_time: timeout, or NULL; a value of exactly zero means "do not wait"
 *
 * Each pass polls every descriptor whose bit is set in any input bitmap
 * and records ready events in the result bitmaps.  The loop ends when
 * at least one event was found, the timeout expired, a signal is
 * pending or table.error is set; otherwise the task sleeps in
 * poll_schedule_timeout() and scans again.
 *
 * Returns the number of ready events counted, a negative value from
 * max_select_fd(), or table.error.
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
poll_table *wait;
int retval, i, timed_out = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
rcu_read_unlock();
if (retval < 0)
return retval;
// narrow n to the bound computed by max_select_fd()
// NOTE(review): presumably highest set descriptor + 1 - confirm against max_select_fd()
n = retval;
/* bind the current process to the poll_wqueues structure */
poll_initwait(&table);
// this wait pointer matters: it is handed below to the poll function of every file
wait = &table.pt;
// a timeout of exactly zero means "do not wait": drop the queueing callback
// and treat the call as already timed out
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
wait->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
// from here on retval counts the ready events found by the scan
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
bool can_busy_loop = false;
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
// walk all descriptors starting from 0, which is why select() is given maxfd + 1
for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
__poll_t mask;
// the bitmaps are processed one long word at a time
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
// no descriptor of this word is watched: skip the whole word
if (all_bits == 0) {
i += BITS_PER_LONG;
continue;
}
// walk the individual bits of the current long word
for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
struct fd f;
if (i >= n)
break;
if (!(bit & all_bits))// test each bit in turn
continue;
f = fdget(i);
if (f.file) {
wait_key_set(wait, in, out, bit,
busy_flag);
// ask the file, through vfs_poll(), which events are ready
mask = vfs_poll(f.file, wait);
fdput(f);
// match the returned mask against the read/write/exception sets requested
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;// one more ready event counted
wait->_qproc = NULL;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
wait->_qproc = NULL;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
wait->_qproc = NULL;
}
/* got something, stop busy polling */
// at least one watched event has fired
if (retval) {
can_busy_loop = false;
busy_flag = 0;
/*
 * only remember a returned
 * POLL_BUSY_LOOP if we asked for it
 */
} else if (busy_flag & mask)
can_busy_loop = true;
}
}
// store the results of this word if any of its descriptors is ready
if (res_in)
*rinp = res_in;
if (res_out)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
cond_resched();
}
wait->_qproc = NULL;
// note: this check runs only after ALL descriptors were scanned, not as soon as one
// of them is ready; leave the loop if something is ready, the timeout expired or a
// signal is pending
if (retval || timed_out || signal_pending(current))
break;
if (table.error) {
retval = table.error;
break;
}
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
/*
 * If this is the first loop and we have a timeout
 * given, then we convert to ktime_t and set the to
 * pointer to the expiry value.
 */
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
// reaching this point means no descriptor is ready and the timeout has not
// expired: put the current process to sleep
if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
to, slack))
timed_out = 1;
}
poll_freewait(&table);
return retval;
}
do_select函数的主要功能:
1.调用poll_initwait(&table)函数,将当前进程与poll_wqueues结构体绑定。
2.如果用户传入的超时时间为0(即不等待),那么会直接设置超时标志,并且不再向等待队列注册回调
3.进入一个for循环,对用户输入的每个关心的描述符都进行遍历,依次调用该文件描述符对应的操作方法集中的poll函数,如果poll函数轮询的事情发生了,那么会返回所关心的事件。
4.如果没有任何描述符就绪,没到超时时间,且没有信号需要处理,那么将当前进程睡眠
/*
 * poll_initwait - bring a poll_wqueues into its initial state.
 * @pwq: the structure to initialise
 *
 * Clears the bookkeeping fields, records the calling task as the one
 * that is polling and installs __pollwait as the queueing callback of
 * the embedded poll_table.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	/* Reset the bookkeeping fields. */
	pwq->inline_index = 0;
	pwq->table = NULL;
	pwq->error = 0;
	pwq->triggered = 0;

	/* The task issuing the select()/poll() call. */
	pwq->polling_task = current;

	/*
	 * Important: __pollwait is the callback that a driver's poll
	 * function reaches through poll_wait().
	 */
	init_poll_funcptr(&pwq->pt, __pollwait);
}
/*
 * __pollwait - queueing callback installed by poll_initwait().
 * @filp:         file being polled; a reference is taken with get_file()
 * @wait_address: wait queue head handed in by the driver when it calls
 *                poll_wait()
 * @p:            poll_table embedded in the caller's poll_wqueues
 *
 * Obtains a poll_table_entry, fills it in, gives it pollwake() as its
 * wake-up function and adds it to @wait_address.  Does nothing when no
 * entry can be obtained.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *queues = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *slot;

	/* Take one entry out of the poll table. */
	slot = poll_get_entry(queues);
	if (slot == NULL)
		return;

	/* Fill in the entry. */
	slot->key = p->_key;
	slot->wait_address = wait_address;
	slot->filp = get_file(filp);

	/*
	 * Give the wait-queue item its wake-up function, so every
	 * poll_table_entry carries its own wake-up hook.
	 */
	init_waitqueue_func_entry(&slot->wait, pollwake);
	slot->wait.private = queues;

	/*
	 * Queue the entry.  This function is reached from a driver's
	 * poll function and each driver passes its own wait_address, so
	 * each such wait queue gets one item.
	 */
	add_wait_queue(wait_address, &slot->wait);
}
/*
 * pollwake - wake-up function attached to every poll_table_entry.
 *
 * A wake-up that carries a key is ignored (returns 0) when the key
 * shares no event bit with the entry's own key; every other wake-up is
 * forwarded to __pollwake(), whose result is returned.
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry =
		container_of(wait, struct poll_table_entry, wait);

	if (key != NULL) {
		/* Keyed wake-up: drop it unless it matches an event we wait for. */
		if (!(key_to_poll(key) & entry->key))
			return 0;
	}

	return __pollwake(wait, mode, sync, key);
}
/*
 * __pollwake - mark the poll_wqueues as triggered and wake the polling task.
 *
 * Takes the poll_wqueues from wait->private, sets its triggered flag
 * after a write barrier and then runs default_wake_function() on a
 * dummy wait-queue entry built for pwq->polling_task.  Returns the
 * result of default_wake_function().
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct poll_wqueues *pwq = wait->private;
// pwq->polling_task is the process that issued the select() call
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
/*
 * Although this function is called under waitqueue lock, LOCK
 * doesn't imply write barrier and the users expect write
 * barrier semantics on wakeup functions. The following
 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
 * and is paired with smp_store_mb() in poll_schedule_timeout.
 */
smp_wmb();
pwq->triggered = 1;
/*
 * Perform the default wake up operation using a dummy
 * waitqueue.
 *
 * TODO: This is hacky but there currently is no interface to
 * pass in @sync. @sync is scheduled to be removed and once
 * that happens, wake_up_process() can be used directly.
 */
return default_wake_function(&dummy_wait, mode, sync, key);
}
poll_initwait函数会给poll_wqueues结构体初始化,设置一个__pollwait回调函数,这个回调函数会被各个驱动程序的poll函数中调用,
__pollwait函数主要从poll_wqueues中申请一个poll_table_entry,然后给poll_table_entry设置一个唤醒函数,这个唤醒函数会被在调用wake_up_interruptible函数的时候
调用,最后将poll_table_entry加入到一个等待队列中,值得注意的是,这个等待队列是由驱动程序poll传递进来的参数,每个驱动的poll函数都会传递自己的等待队列,
因此唤醒进程的时候,是根据不同的等待队列唤醒的。例如,a驱动和b驱动都有自己的poll函数分别为a_poll和b_poll,那么在这两个函数中都会调用poll_wait函数,
从而间接调用__pollwait函数,然后将各自的poll_table_entry结构体绑定一个pollwake唤醒函数,然后加入到各自的等待队列中。当a_poll函数轮询的事件发生时,那么在a驱动中会调用
wake_up_interruptible函数将a驱动程序中的等待队列进程唤醒,也就是会调用pollwake函数。从而最终唤醒的是pwq->polling_task进程,这个进程是在poll_initwait函数中被设置的,
因此,poll机制中的整个睡眠和唤醒流程就清晰了。
下面是一个简单的poll驱动程序:
/*
 * demo_poll - minimal example of a driver poll method.
 * @filp: file being polled
 * @pts:  poll table supplied by the polling core
 *
 * Returns POLLIN | POLLRDNORM when counter is non-zero, otherwise 0.
 * Per the original author's note, a 0 result lets the polling core put
 * the caller to sleep, while a non-zero mask is reported back to user
 * space.
 */
static unsigned int demo_poll(struct file *filp, struct poll_table_struct *pts)
{
	unsigned int ready;

	/*
	 * Hand the wait queue head and the file to the polling core so
	 * that it can sleep on the queue and be woken through it.
	 */
	poll_wait(filp, &wq, pts);

	/* Data available for reading: report the readable bits; else nothing. */
	ready = counter ? (POLLIN | POLLRDNORM) : 0;

	return ready;
}
select的缺点
每次调用select,都必须把fd集合从用户态拷贝到内核态,这个开销在fd很多时会很大;
文件描述符就绪时,内核会修改readfds、writefds、exceptfds结构,所以每次调用select之前,必须重新将文件描述符注册一遍;
每次调用select都必须在内核遍历传递进来的所有fd,这个开销在fd很多时会很大(时间复杂度O(n));
每次都必须循环探测哪些文件描述符就绪(O(n));
调用前都必须重新设置结构体变量
单个进程能够监视的文件描述符存在最大的限制
poll系统调用的实现
/*
 * poll() system call entry point.
 * @ufds:          user-space array of struct pollfd
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: timeout in milliseconds; a negative value means no
 *                 timeout (the end-time pointer stays NULL)
 *
 * Converts the millisecond timeout into an end time and delegates the
 * work to do_sys_poll().  When do_sys_poll() returns -EINTR, the
 * arguments and the end time are saved in the current task's
 * restart_block with do_restart_poll as the restart function, and
 * -ERESTART_RESTARTBLOCK is returned instead.
 *
 * Returns the result of do_sys_poll(), or -ERESTART_RESTARTBLOCK as
 * described above.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		/* Split the milliseconds into whole seconds and nanoseconds. */
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		/* Record everything needed to restart the call later. */
		restart_block = &current->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			/* Keep the end time already computed for this call. */
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}
poll系统调用的主要功能是在do_sys_poll函数中。
/*
 * do_sys_poll - copy the pollfd array in, poll it, copy revents back.
 * @ufds:     user-space array of struct pollfd
 * @nfds:     number of entries in @ufds
 * @end_time: timeout passed through to do_poll(), or NULL
 *
 * The entries are copied into a chain of poll_list chunks: the first
 * chunk lives in the on-stack buffer, further chunks are obtained with
 * kmalloc().  After do_poll() has run, the revents field of every entry
 * is written back to user space and the kmalloc()ed chunks are freed.
 *
 * Returns the value of do_poll() on success, -EINVAL when @nfds exceeds
 * RLIMIT_NOFILE, -ENOMEM when a chunk cannot be allocated, and -EFAULT
 * when copying from or to user space fails.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
/* Allocate small arguments on the stack to save memory and be
   faster - use long to make sure the buffer is aligned properly
   on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
// number of entries that still have to be copied in from user space
unsigned long todo = nfds;
if (nfds > rlimit(RLIMIT_NOFILE))
return -EINVAL;
// the first chunk holds at most N_STACK_PPS entries
len = min_t(unsigned int, nfds, N_STACK_PPS);
for (;;) {
walk->next = NULL;
walk->len = len;
if (!len)
break;
// copy this chunk of the caller's pollfd array into the kernel
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len))
goto out_fds;
todo -= walk->len;
if (!todo)
break;
// more entries remain: chain another chunk of at most POLLFD_PER_PAGE entries
len = min(todo, POLLFD_PER_PAGE);
size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
walk = walk->next = kmalloc(size, GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
poll_initwait(&table);
fdcount = do_poll(head, &table, end_time);
poll_freewait(&table);
// write the revents of every entry back to the caller's array, chunk by chunk
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = 0; j < walk->len; j++, ufds++)
if (__put_user(fds[j].revents, &ufds->revents))
goto out_fds;
}
err = fdcount;
out_fds:
// free only the chunks after head; head itself is the on-stack buffer
walk = head->next;
while (walk) {
struct poll_list *pos = walk;
walk = walk->next;
kfree(pos);
}
return err;
}
在do_sys_poll函数中主要做如下功能:
1.首先会将用户态的参数拷贝到内核态中,如果在栈上分配的内存不够用,那么会使用kmalloc分配内存,使用链表的形式。
2.调用poll_initwait(&table)函数初始化一个struct poll_wqueues,该函数的作用和在select实现中一样:把当前进程绑定到struct poll_wqueues中,同时也绑定__pollwait回调函数
3.调用do_poll函数
4.将每个pollfd中收集到的就绪事件(revents字段)拷贝回用户空间
/*
 * do_poll - poll every entry of a poll_list chain until something is ready.
 * @list:     chain of poll_list chunks holding the pollfd entries
 * @wait:     poll_wqueues prepared by the caller
 * @end_time: timeout, or NULL; a value of exactly zero means "do not wait"
 *
 * Each pass calls do_pollfd() on every entry and counts the entries for
 * which it returns non-zero.  The loop ends when the count is non-zero
 * or the timeout expired; otherwise the task sleeps in
 * poll_schedule_timeout() and scans again.
 *
 * Returns the number of counted entries.  When none was counted the
 * result is wait->error, or -EINTR if a signal is pending.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
// visit every chunk of the chain and every pollfd inside it
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
/*
 * Fish for events. If we found one, record it
 * and kill poll_table->_qproc, so we don't
 * needlessly register any other waiters after
 * this. They'll get immediately deregistered
 * when we break out and return.
 */
if (do_pollfd(pfd, pt, &can_busy_loop,
busy_flag)) {
// count the entries that have something to report
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
busy_flag = 0;
can_busy_loop = false;
}
}
}
/*
 * All waiters have already been registered, so don't provide
 * a poll_table->_qproc to them on the next loop iteration.
 */
pt->_qproc = NULL;
// nothing ready: report the recorded error, or -EINTR if a signal is pending
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -EINTR;
}
if (count || timed_out)
break;
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
/*
 * If this is the first loop and we have a timeout
 * given, then we convert to ktime_t and set the to
 * pointer to the expiry value.
 */
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
// sleep until woken or until the timeout expires; a zero return marks the timeout
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
do_poll函数的主要作用:
1.如果用户传入的超时时间为0(即不等待),设置超时标志,并且不再向等待队列注册回调
2.循环遍历用户程序传递下来的文件描述符,对每个描述符调用do_pollfd函数,该函数的作用是调用该文件描述符相对应的poll函数,返回可操作事件的位图。
3.如果超时时间没到,且没有任何信号要处理,那么把当前进程进行睡眠