关于驱动函数 poll 的详细解析,请参考《Linux 内核驱动 poll 函数解析》。
核心逻辑
对 Linux 2.6.36 中 poll 的代码简化如下,只列出了关键步骤以展示核心逻辑。
/*
 * Simplified sketch of do_poll(). Local declarations and the timeout setup
 * are left out, so this excerpt is illustrative only and is not meant to
 * compile on its own. Each pass of the outer loop tests every pollfd, then
 * either leaves the loop or schedules.
 */
static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time)
{
for (;;) {
for (walk = list; walk != NULL; walk = walk->next) { /* walk every node of the pollfd list */
for (; pfd != pfd_end; pfd++) {
if (do_pollfd(pfd, pt)) { /* test this pfd; pt (when non-NULL) is handed to do_pollfd for wait-queue registration */
count++;
pt = NULL; /* one pfd is ready: later do_pollfd calls get a NULL poll table */
}
}
}
if (count || timed_out) /* leave the loop once a descriptor is ready or the timeout has expired */
break;
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) /* schedule; a zero return is recorded as a timeout */
timed_out = 1;
}
return count;
}
调用路径
poll
sys_poll /* 函数入口,判断是否需要超时设置 */
do_sys_poll /* 处理参数传递和结果返回 */
do_poll /* 循环遍历所有 pollfd */
do_pollfd /* 检测 pollfd,并将其注册到等待队列 */
详细代码分析及注释
sys_poll
系统调用 poll 的入口函数,判断是否需要超时设置。
/*
 * Entry point of the poll system call.
 *
 * When timeout_msecs is non-negative it is split into whole seconds and
 * leftover nanoseconds and handed to poll_select_set_timeout(), and the
 * resulting timespec is passed on to do_sys_poll(). A negative timeout
 * leaves the timespec pointer NULL.
 *
 * Returns the value produced by do_sys_poll().
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		long, timeout_msecs)
{
	struct timespec end_time;
	struct timespec *to = NULL;

	if (timeout_msecs >= 0) {
		/* A timeout was requested: fill in end_time and pass it down. */
		to = &end_time;
		poll_select_set_timeout(to,
					timeout_msecs / MSEC_PER_SEC,
					NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	return do_sys_poll(ufds, nfds, to);
}
do_sys_poll
该函数负责将用户空间的 pollfds 复制到内核空间,并将最后结果返回给用户空间。
从代码可以看出,poll 在参数传递方面不够高效:
- 每次调用都需要将用户空间的全部 pollfd 复制到内核空间
- 在返回结果时,也要逐个写回所有 pollfd 的 revents 字段(而不只是就绪的那些)
- 用户拿到结果后,仍需要遍历所有 pollfd 才能确定哪些描述符就绪
/*
 * Copy the caller's pollfd array into kernel memory, run do_poll() over it,
 * and write each entry's revents back to user space.
 *
 * ufds:     user-space array of nfds pollfd entries
 * nfds:     number of entries; values above rlimit(RLIMIT_NOFILE) are
 *           rejected with -EINVAL
 * end_time: passed through to do_poll() unchanged
 *
 * Returns do_poll()'s result on success, -EFAULT if a copy from or to user
 * space fails, or -ENOMEM if a chunk cannot be allocated.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
/* Number of entries that still have to be copied in from user space. */
unsigned long todo = nfds;
if (nfds > rlimit(RLIMIT_NOFILE))
return -EINVAL;
/* The first chunk is the on-stack buffer, holding at most N_STACK_PPS entries. */
len = min_t(unsigned int, nfds, N_STACK_PPS);
for (;;) {
/* Terminate the list before anything can fail, so the cleanup at out_fds always walks a valid chain. */
walk->next = NULL;
walk->len = len;
if (!len)
break;
/* ufds + nfds - todo is the first user entry that has not been copied yet. */
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len))
goto out_fds;
todo -= walk->len;
if (!todo) /* everything has been copied in (for a small nfds the on-stack chunk alone is enough) */
break;
/* Allocate another chunk for the pollfds that did not fit so far. */
len = min(todo, POLLFD_PER_PAGE);
size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
walk = walk->next = kmalloc(size, GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
poll_initwait(&table);
fdcount = do_poll(nfds, head, &table, end_time);
poll_freewait(&table);
/* Write the revents field of every entry back to user space, in order (only revents is written, not the whole pollfd). */
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = 0; j < walk->len; j++, ufds++)
if (__put_user(fds[j].revents, &ufds->revents))
goto out_fds;
}
err = fdcount;
out_fds:
/* Free only the kmalloc'ed chunks; head itself is the on-stack buffer. */
walk = head->next;
while (walk) {
struct poll_list *pos = walk;
walk = walk->next;
kfree(pos);
}
return err;
}
do_poll
循环遍历所有 pollfd:每一轮都会把全部 pollfd 检测一遍,只要本轮有描述符就绪(或者出错、有信号待处理、已超时),该轮结束后即返回;否则调度等待,之后再重新遍历。注意并不是发现第一个就绪的描述符就立即返回。
/*
 * Core poll loop: test every pollfd in the list, pass after pass, until at
 * least one reports events, wait->error is set, a signal is pending, or the
 * timeout has expired.
 *
 * nfds:     not referenced in this body
 * list:     chain of pollfd chunks to test
 * wait:     supplies the poll_table (wait->pt) and the error slot (wait->error)
 * end_time: NULL for no timeout; a value of exactly zero means "do not wait"
 *
 * Returns the number of pollfds for which do_pollfd() returned a non-zero
 * mask. When there are none, returns wait->error, or -EINTR if a signal is
 * pending.
 */
static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
/* Optimise the no-wait case */
/* A zero end_time drops the poll_table and pre-sets timed_out, so the loop below makes exactly one pass. */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
for (walk = list; walk != NULL; walk = walk->next) { /* walk every chunk of pollfds */
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
/*
 * Test pfd. If it is ready, count it and drop the
 * poll_table: the remaining do_pollfd() calls are then
 * made with a NULL table.
 */
if (do_pollfd(pfd, pt)) {
count++;
pt = NULL;
}
}
}
/* Every entry has been visited once, so the poll_table is not passed again on later passes. */
pt = NULL;
/* Nothing ready: report wait->error, overridden by -EINTR when a signal is pending. */
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -EINTR;
}
if (count || timed_out)
break;
/*
 * If this is the first loop and we have a timeout
 * given, then we convert to ktime_t and set the to
 * pointer to the expiry value.
 */
if (end_time && !to) {
expire = timespec_to_ktime(*end_time);
to = &expire;
}
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) /* schedule; a zero return is recorded as a timeout, after which one final pass is made */
timed_out = 1;
}
return count;
}
do_pollfd
调用文件对应的驱动函数 poll:当 pwait 非空时,驱动会借此把当前进程注册到该文件的等待队列;函数返回一个位掩码 mask,指明该描述符当前已就绪的事件(只保留用户关心的事件以及 POLLERR、POLLHUP),并把它写入 pollfd->revents。
/*
 * Poll a single descriptor and store the outcome in pollfd->revents.
 *
 * A negative fd yields 0. An fd for which fget_light() returns NULL yields
 * POLLNVAL. Otherwise the file's ->poll() method is called when it exists
 * (DEFAULT_POLLMASK is used when it does not), and the result is restricted
 * to the requested events plus POLLERR and POLLHUP.
 *
 * Returns the same mask that was stored in pollfd->revents.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int revents = 0;
	struct file *filp;
	int fput_needed;
	int fd = pollfd->fd;

	if (fd < 0)
		goto out;

	/* Look up the file; fput_needed is handed back to fput_light() below. */
	filp = fget_light(fd, &fput_needed);
	if (filp == NULL) {
		revents = POLLNVAL;
		goto out;
	}

	if (filp->f_op && filp->f_op->poll) {
		/* Record the events of interest in the poll table before calling ->poll(). */
		if (pwait)
			pwait->key = pollfd->events | POLLERR | POLLHUP;
		revents = filp->f_op->poll(filp, pwait);
	} else {
		revents = DEFAULT_POLLMASK;
	}

	/* Keep only the requested events plus the always-reported POLLERR/POLLHUP. */
	revents &= pollfd->events | POLLERR | POLLHUP;
	fput_light(filp, fput_needed);

out:
	pollfd->revents = revents;
	return revents;
}