关于驱动函数 poll 的详细解析,请参考《Linux 内核驱动 poll 函数解析》。
核心逻辑
对 Linux 2.6.36 中 select 的代码简化如下,只列出了关键步骤以展示核心逻辑。
/*
 * Simplified sketch of do_select(): only the key steps are kept to show the
 * core logic. Declarations and setup (i, j, bit, mask, file, wait, retval,
 * timed_out, ...) are omitted, so this is not compilable as-is.
 *
 * Outer for(;;): repeat full scans until something is ready, the timeout
 * has expired, or a signal is pending. Returns retval, the number of ready
 * conditions counted during the scan.
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
for(;;) {
for (i = 0; i < n; ++rinp, ++routp, ++rexp) { /* one bitmap word per iteration; i is advanced by the inner loop */
for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) { /* examine the word bit by bit */
mask = (*f_op->poll)(file, wait); /* call the file's poll: it adds the file to the wait queue and returns a mask of operations that can proceed without blocking */
if (readable() || writeable() || exception()){
fds.res++;
retval++;
}
}
cond_resched(); /* check whether a reschedule is needed and reschedule if so */
}
if (retval || timed_out || signal_pending(current)) /* leave the loop if something is ready, the timeout expired, or a signal is pending */
break;
}
return retval;
}
调用路径
select
sys_select /* 系统调用 select 函数入口,判断是否设置超时 */
core_sys_select /* 参数传递和初始化,并将结果返回到用户空间。 */
do_select /* 循环遍历 fd_set */
sys_select
系统调用 select 函数入口
/*
 * Entry point of the select system call (simplified excerpt).
 *
 * The declarations of ret, tv and to are omitted in this excerpt.
 * NOTE(review): presumably the full source first copies *tvp from user
 * space into tv and derives `to` from it -- confirm against the kernel tree.
 */
SYSCALL_DEFINE5(select, int, n, fd_set *, inp, fd_set *, outp,
fd_set *, exp, struct timeval *, tvp)
{
if (tvp) { /* a timeout was supplied by the caller */
if (poll_select_set_timeout(tv))
return -EINVAL;
}
ret = core_sys_select(n, inp, outp, exp, to); /* core logic */
return ret;
}
core_sys_select
参数传递和初始化,调用 do_select(), 并将结果返回到用户空间。
从下面的代码可以看出,select 在参数传递这部分的效率不高:
- 在核心循环中需要逐位遍历所有的描述符,效率不够高
- 每次调用都需要将完整的 fd_set(读、写、异常三个集合)复制回用户空间
- 在返回给用户空间后,用户也需要遍历所有的描述符以确定可用的描述符
/*
 * core_sys_select - copy the caller's three fd_sets into kernel bitmaps,
 * run do_select() over them and copy the result bitmaps back.
 *
 * @n:        number of descriptors to examine; clamped to the current
 *            fdtable's max_fds
 * @inp:      user fd_set to check for readability
 * @outp:     user fd_set to check for writability
 * @exp:      user fd_set to check for exceptional conditions
 * @end_time: timeout, passed through unchanged to do_select()
 *
 * Returns the value produced by do_select(), or a negative errno:
 *   -EINVAL          n is negative
 *   -ENOMEM          the bitmap buffer could not be allocated
 *   -ERESTARTNOHAND  do_select() returned 0 and a signal is pending
 *   -EFAULT          a set_fd_set() call returned non-zero
 * A non-zero return from get_fd_set() is passed back unchanged.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
		    fd_set __user *exp, struct timespec *end_time)
{
	/* Pre-allocated stack space: saves memory and is faster for small sets. */
	long on_stack[SELECT_STACK_ALLOC/sizeof(long)];
	fd_set_bits sets;
	struct fdtable *table;
	unsigned int set_bytes;
	void *buf;
	int limit;
	int rc;

	if (n < 0)
		return -EINVAL;

	/* max_fds can grow; read it once under the RCU read lock to avoid a race. */
	rcu_read_lock();
	table = files_fdtable(current->files);
	limit = table->max_fds;
	rcu_read_unlock();
	if (n > limit)
		n = limit;

	/*
	 * Six bitmaps are needed (in/out/ex, each for both the incoming and
	 * the outgoing direction). They are laid out back to back in one
	 * buffer, each set_bytes long.
	 */
	set_bytes = FDS_BYTES(n);	/* bytes occupied by one fd_set */
	buf = on_stack;
	if (set_bytes > sizeof(on_stack) / 6) {
		/* The stack buffer is too small: allocate dynamically instead. */
		buf = kmalloc(6 * set_bytes, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	sets.in      = buf;
	sets.out     = buf + set_bytes;
	sets.ex      = buf + 2*set_bytes;
	sets.res_in  = buf + 3*set_bytes;
	sets.res_out = buf + 4*set_bytes;
	sets.res_ex  = buf + 5*set_bytes;

	/* Fill the input bitmaps from the descriptors the user specified. */
	rc = get_fd_set(n, inp, sets.in);
	if (rc)
		goto out;
	rc = get_fd_set(n, outp, sets.out);
	if (rc)
		goto out;
	rc = get_fd_set(n, exp, sets.ex);
	if (rc)
		goto out;

	/* The result bitmaps start out empty. */
	zero_fd_set(n, sets.res_in);
	zero_fd_set(n, sets.res_out);
	zero_fd_set(n, sets.res_ex);

	rc = do_select(n, &sets, end_time);	/* core logic */
	if (rc < 0)
		goto out;

	/* Nothing ready but a signal is pending: report -ERESTARTNOHAND. */
	if (rc == 0 && signal_pending(current)) {
		rc = -ERESTARTNOHAND;
		goto out;
	}

	/* Copy the results to user space; stop at the first failing copy. */
	if (set_fd_set(n, inp, sets.res_in) ||
	    set_fd_set(n, outp, sets.res_out) ||
	    set_fd_set(n, exp, sets.res_ex))
		rc = -EFAULT;

out:
	/* Free the buffer only when it came from kmalloc. */
	if (buf != on_stack)
		kfree(buf);
	return rc;
}
do_select
do_select 代码通过轮询调用驱动程序的 poll 函数来实现多路复用。
文件驱动 (*f_op->poll) 函数会做两件事情:
- 调用 poll_wait 将包含当前文件描述符的 poll_table_entry 加入等待队列
- 返回一个 mask 值,表明目前哪些操作是可以无阻塞进行的
/*
 * do_select - repeatedly scan the requested descriptors, calling each
 * file's poll method, until at least one requested condition is ready,
 * the timeout expires, a signal is pending, or table.error is set.
 *
 * @n:        upper bound on descriptors to scan; replaced by the value
 *            returned from max_select_fd()
 * @fds:      input bitmaps (in/out/ex) and result bitmaps (res_in/res_out/res_ex)
 * @end_time: timeout; NULL means no timeout, an all-zero value means a
 *            single scan without waiting
 *
 * Returns the number of ready conditions counted (one per matching bit in
 * each of the three sets), 0 if none, a negative value from
 * max_select_fd(), or table.error.
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
poll_table *wait;
int retval, i, timed_out = 0;
unsigned long slack = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
rcu_read_unlock();
if (retval < 0)
return retval;
n = retval;
poll_initwait(&table);
wait = &table.pt;
/* An all-zero timeout: pass a NULL wait to poll and make only one scan. */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
wait = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = estimate_accuracy(end_time);
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
/* Restart every scan from the first word of each bitmap. */
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
for (i = 0; i < n; ++rinp, ++routp, ++rexp) { /* one unsigned long word of each bitmap per iteration; i is advanced inside the body */
unsigned long in, out, ex, all_bits, bit = 1, mask, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
const struct file_operations *f_op = NULL;
struct file *file = NULL;
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
/* No descriptor requested in this word: skip all of its bits. */
if (all_bits == 0) {
i += __NFDBITS;
continue;
}
for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) { /* examine the word bit by bit */
int fput_needed;
if (i >= n)
break;
if (!(bit & all_bits))
continue;
file = fget_light(i, &fput_needed); /* look up the struct file for descriptor i */
if (file) {
f_op = file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll) { /* does the file provide a poll method? */
/* set up wait for this descriptor; it represents the operations of interest */
wait_key_set(wait, in, out, bit);
/*
 * Call the poll method: it adds the file to the wait queue
 * and returns a mask of operations that can proceed without blocking.
 */
mask = (*f_op->poll)(file, wait);
}
fput_light(file, fput_needed); /* release the file obtained by fget_light() */
/* After any hit, wait is cleared so later poll calls receive NULL. */
if ((mask & POLLIN_SET) && (in & bit)) { /* readable and requested */
res_in |= bit;
retval++;
wait = NULL;
}
if ((mask & POLLOUT_SET) && (out & bit)) { /* writable and requested */
res_out |= bit;
retval++;
wait = NULL;
}
if ((mask & POLLEX_SET) && (ex & bit)) { /* exceptional condition and requested */
res_ex |= bit;
retval++;
wait = NULL;
}
}
}
/* Store only non-zero result words into the result bitmaps. */
if (res_in)
*rinp = res_in;
if (res_out)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
cond_resched(); /* check whether a reschedule is needed and reschedule if so */
}
/* The first full scan is done: every later scan calls poll with a NULL wait. */
wait = NULL;
if (retval || timed_out || signal_pending(current)) /* leave the loop if something is ready, the timeout expired, or a signal is pending */
break;
if (table.error) {
retval = table.error;
break;
}
/*
 * If this is the first loop and we have a timeout
 * given, then we convert to ktime_t and set the to
 * pointer to the expiry value.
 */
if (end_time && !to) {
expire = timespec_to_ktime(*end_time);
to = &expire;
}
/* A zero return from poll_schedule_timeout() is treated as timeout expiry. */
if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
to, slack))
timed_out = 1;
}
poll_freewait(&table);
return retval;
}