从应用层面,当上层应用调用poll函数时,会进入内核调用系统调用sys_poll
/*
 * poll(2) system call entry point.
 *
 * ufds:          user-space array of pollfd entries
 * nfds:          number of entries in ufds
 * timeout_msecs: timeout in milliseconds; 0 means "do not wait",
 *                a negative value means "wait forever"
 *
 * Returns whatever do_sys_poll() returns.
 */
asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
			 long timeout_msecs)
{
	/* Zero and negative timeouts are handed on unchanged. */
	s64 timeout_jiffies = timeout_msecs;

	/* Only a positive timeout is a real duration: ms -> jiffies. */
	if (timeout_msecs > 0)
		timeout_jiffies = msecs_to_jiffies(timeout_msecs);

	return do_sys_poll(ufds, nfds, &timeout_jiffies);
}
sys_poll会判断timeout_msecs是否为正数:如果是,就用msecs_to_jiffies把毫秒换算成jiffies;如果不是(0表示不等待,负数表示无限等待),就原样保留timeout_msecs的值。最后把换算结果timeout_jiffies的地址传递给do_sys_poll函数
/*
 * Copy the caller's pollfd array into a chain of poll_list chunks, run
 * do_poll() over that chain, then write every revents field back to
 * user space.
 *
 * ufds:    user-space array of pollfd entries
 * nfds:    number of entries in ufds
 * timeout: handed through to do_poll() unchanged
 *
 * Bails out with -EINVAL when nfds exceeds the caller's RLIMIT_NOFILE
 * rlim_cur.  Otherwise the result is accumulated in err: -EFAULT when a
 * user-space copy fails, -EINTR when do_poll() reported nothing and a
 * signal is pending, else the count from do_poll().
 *
 * NOTE(review): this listing looks abridged.  The "out_fds" label that
 * the gotos target and the final return are not visible, and the
 * braces do not balance (see the note inside the copy-in loop).
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
{
struct poll_wqueues table;
int fdcount, err;
unsigned int i;
struct poll_list *head;
struct poll_list *walk;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *stack_pp = NULL;
/* Do a sanity check on nfds ... */
if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
return -EINVAL;
poll_initwait(&table);
head = NULL;
walk = NULL;
/* i counts the entries that still have to be copied in. */
i = nfds;
err = -ENOMEM;
while(i!=0) {
struct poll_list *pp;
int num, size;
/* The first chunk holds up to N_STACK_PPS entries, later ones up to
   POLLFD_PER_PAGE; never more than what is left. */
if (stack_pp == NULL)
num = N_STACK_PPS;
else
num = POLLFD_PER_PAGE;
if (num > i)
num = i;
size = sizeof(struct poll_list) + sizeof(struct pollfd)*num;
/* The first chunk is placed in the on-stack buffer.
   NOTE(review): no branch that assigns pp for later chunks is visible,
   size is never used, and the closing brace below has no visible
   opener - presumably an else branch was dropped from this listing. */
if (!stack_pp)
stack_pp = pp = (struct poll_list *)stack_pps;
}
pp->next=NULL;
pp->len = num;
/* Append the chunk to the list; walk tracks the tail. */
if (head == NULL)
head = pp;
else
walk->next = pp;
walk = pp;
/* nfds - i entries are already copied, so continue from there. */
if (copy_from_user(pp->entries, ufds + nfds-i,
sizeof(struct pollfd)*num)) {
err = -EFAULT;
goto out_fds;
}
i -= pp->len;
}
fdcount = do_poll(nfds, head, &table, timeout);
/* OK, now copy the revents fields back to user space. */
walk = head;
err = -EFAULT;
while(walk != NULL) {
struct pollfd *fds = walk->entries;
int j;
/* ufds advances in step with j, so it keeps moving across chunks. */
for (j=0; j < walk->len; j++, ufds++) {
if(__put_user(fds[j].revents, &ufds->revents))
goto out_fds;
}
walk = walk->next;
}
err = fdcount;
/* Nothing reported but a signal is pending: report -EINTR instead. */
if (!fdcount && signal_pending(current))
err = -EINTR;
}
首先检查nfds是否超过了RLIMIT_NOFILE的限制(超过则直接返回-EINVAL),然后调用poll_initwait函数
/*
 * Put a poll_wqueues into its initial state before a poll pass.
 *
 * pwq: the structure to initialise; its embedded poll_table is wired
 *      to __pollwait and its bookkeeping fields are reset.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	/* Hook __pollwait into the embedded poll_table. */
	init_poll_funcptr(&pwq->pt, __pollwait);

	/* Start with an empty table, no inline slots used and no error. */
	pwq->table = NULL;
	pwq->inline_index = 0;
	pwq->error = 0;
}
init_poll_funcptr(&pwq->pt, __pollwait)相当于pwq->pt.qproc = __pollwait
在这里就相当于table.pt.qproc = __pollwait
在从用户空间获得信息以后,调用do_poll函数,调用完do_poll函数后,就可以把信息传递给用户空间,说明核心在于do_poll。值得一提的是,因为需要写回用户空间的只有每个pollfd的revents字段,结果非常简单,所以直接调用__put_user
/*
 * Scan every pollfd in the list with do_pollfd() and repeat the scan,
 * sleeping in between, until there is something to report.
 *
 * nfds:    number of descriptors (not referenced in the visible code)
 * list:    chain of poll_list chunks holding the pollfd entries
 * wait:    poll_wqueues whose poll_table is passed to do_pollfd()
 * timeout: remaining wait; the loop stops when *timeout is zero
 *
 * Returns the number of pollfds for which do_pollfd() returned
 * non-zero, or wait->error if that is non-zero, or 0 when the loop
 * ended because of the timeout or a pending signal.
 */
static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, s64 *timeout)
{
int count = 0;
poll_table* pt = &wait->pt;
for (;;) {
struct poll_list *walk;
long __timeout;
/* Only the task state changes here; the task keeps running until
   schedule_timeout() below.  Presumably done before the scan so that
   a wakeup arriving during the scan is not lost - confirm. */
set_current_state(TASK_INTERRUPTIBLE);
/* Outer loop: one poll_list chunk at a time. */
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
/* Inner loop: every pollfd inside the current chunk. */
for (; pfd != pfd_end; pfd++) {
/*
* Fish for events. If we found one, record it
* and kill the poll_table, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt)) {
count++;
pt = NULL;
}
}
}
/*
* All waiters have already been registered, so don't provide
* a poll_table to them on the next loop iteration.
*/
pt = NULL;
/* Stop on any ready descriptor, a zero timeout or a pending signal. */
if (count || !*timeout || signal_pending(current))
break;
/* An error recorded in the wait structure also ends the loop and
   becomes the return value. */
count = wait->error;
if (count)
break;
/* NOTE(review): the listing elides code here; __timeout has no visible
   assignment, so it is presumably computed from *timeout in the
   omitted part. */
...
__timeout = schedule_timeout(__timeout);
/* Presumably schedule_timeout() returns the time left unslept, which
   is added back for the next round; a negative *timeout is left
   untouched - confirm against the elided code. */
if (*timeout >= 0)
*timeout += __timeout;
}
/* Undo the TASK_INTERRUPTIBLE set at the top of the last iteration. */
__set_current_state(TASK_RUNNING);
return count;
}
do_sys_poll函数中,fdcount = do_poll(nfds, head, &table, timeout); walk = head;我们可以看到监听文件描述符的结果将会放在head中,所以要重点关注do_poll的struct poll_list *list将会怎么样被初始化
简单分析do_poll函数
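首先直接进入一个死循环里,在死循环中先用set_current_state把当前进程标记为可中断睡眠状态(TASK_INTERRUPTIBLE)。乍一看有点奇怪:既然已经设置成睡眠状态,为什么直到后面才调用schedule_timeout?原因是set_current_state只是修改了进程的状态字段,并不会让出CPU,真正睡下去要等到调用schedule_timeout。之所以提前设置状态,是为了不丢失唤醒:如果在遍历文件描述符的过程中驱动已经唤醒了本进程,状态会被改回TASK_RUNNING,之后的schedule_timeout就不会真正睡下去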
在这个死循环里有一个两层嵌套循环:外层沿着链表逐个遍历poll_list节点,内层遍历每个节点的entries数组中的各个pollfd。
然后开始遍历poll_list list,真正来监控文件描述符的是do_pollfd(pfd, pt),如果有结果了,计数,然后把pt置为NULL,这样后面的文件描述符就不会再被注册到等待队列上。
if (count || !*timeout || signal_pending(current)):只要已经有事件就绪(count非0)、或者timeout为0、或者有signal在等待处理,三者之一成立就跳出循环。只有当此次遍历所有的pfd都没有在do_pollfd得到非0返回值,并且timeout还没有耗尽,并且也没有signal在等待处理时,才会调用schedule_timeout(__timeout),使当前进程最多休眠__timeout这么长时间。如果在休眠时间内没有被打断(当然即使被打断,后面两行代码也会把剩余的时间加回*timeout,很好地处理掉了),则会再次循环,但是当再次到达if (count || !*timeout || signal_pending(current))时,此时timeout已经耗尽了,不用管另外两个条件是否成立,直接跳出循环。
跳出循环后,先把当前进程重新设置为运行态(TASK_RUNNING),然后返回count。
原来真正用来监听pfd是do_pollfd
/*
 * Poll a single descriptor and record the outcome in pollfd->revents.
 *
 * pollfd: entry to examine; its fd and events fields are read and its
 *         revents field is written
 * pwait:  poll_table handed to the file's poll method (may be NULL)
 *
 * Returns the mask that was stored in revents: 0 for a negative fd,
 * POLLNVAL when the fd has no open file behind it, otherwise the
 * file's poll result (DEFAULT_POLLMASK if it has no poll method)
 * restricted to the requested events plus POLLERR and POLLHUP.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int revents = 0;
	int fd = pollfd->fd;
	int needs_fput;
	struct file *filp;

	/* Negative descriptors are skipped and report nothing. */
	if (fd < 0)
		goto done;

	filp = fget_light(fd, &needs_fput);
	if (filp == NULL) {
		revents = POLLNVAL;
		goto done;
	}

	/* Ask the file itself when it can answer, else use the default. */
	if (filp->f_op && filp->f_op->poll)
		revents = filp->f_op->poll(filp, pwait);
	else
		revents = DEFAULT_POLLMASK;

	/* Mask out unneeded events. */
	revents &= pollfd->events | POLLERR | POLLHUP;
	fput_light(filp, needs_fput);

done:
	pollfd->revents = revents;
	return revents;
}
核心代码是mask = file->f_op->poll(file, pwait);pollfd->revents = mask; 然后返回mask
file是怎么来的,就是通过fd获得的,fget_light函数有点像应用层open函数的逆向过程。调用file的file_operations中的poll函数指针获得一个mask,然后把mask赋值给pollfd->revents,最后将mask返回,然后我们在do_poll来判断计数
通过从应用层往下分析,最终我们知道要实现应用层的poll调用,需要在驱动层为poll函数指针实现一个可以返回mask的函数