Linux 2.6.9 poll System Call: Source Code Analysis
1. Data Structures Involved
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
// Declares the callback type. Different kernel subsystems implement different
// callbacks behind this single interface, which device drivers then invoke.

typedef struct poll_table_struct {
	poll_queue_proc qproc;
} poll_table;
// A wrapper around the callback type, leaving room for future extension
struct poll_table_page {
	struct poll_table_page * next;        // next page in the list
	struct poll_table_entry * entry;      // current (next free) slot in entries[]
	struct poll_table_entry entries[0];   // the poll_table_entry records themselves
};
// Data structure used in the kernel, one page at a time, to hold the wait records for the polled descriptors
struct poll_table_entry {
	struct file * filp;
	wait_queue_t wait;
	wait_queue_head_t * wait_address;
};
// The kernel structure corresponding to a single descriptor
struct poll_wqueues {
	poll_table pt;
	struct poll_table_page * table;
	int error;
};
// Structure used by the wait machinery; one instance exists per poll() call
struct pollfd {
	int fd;
	short events;
	short revents;
};
// The user-space descriptor record
struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[0];
};
// Structure used when copying the user-space pollfd records into the kernel
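For reference, the sizing of these page-backed lists is governed by two macros in fs/select.c (quoted here from 2.6.9):

#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

On a 32-bit machine with 4 KB pages and an 8-byte struct pollfd, POLLFD_PER_PAGE works out to (4096-8)/8 = 511 entries per poll_list node.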
2. Source Analysis of the Corresponding Functions
1. sys_poll
asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
{
	struct poll_wqueues table;
	int fdcount, err;
	unsigned int i;
	struct poll_list *head;
	struct poll_list *walk;
	/* Do a sanity check on nfds ... */
Check the descriptor count
The number of descriptors passed in by the user may not exceed the maximum number of descriptors the process supports.
	if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
		return -EINVAL;
	if (timeout) {
		/* Careful about overflow in the intermediate values */
		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = (unsigned long)(timeout*HZ+999)/1000+1;
		else /* Negative or overflow */
			timeout = MAX_SCHEDULE_TIMEOUT;
	}
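The block above converts the millisecond timeout supplied by user space into jiffies, rounding upward. For example, with HZ=100 a timeout of 1000 ms becomes (1000*100+999)/1000+1 = 101 jiffies, so the process never sleeps for less than the requested interval.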
Initialize the table variable
As the name suggests, poll_initwait() initializes table; note that table is a key variable throughout the execution of poll.
	poll_initwait(&table);
Copy in the descriptors passed by the user
The while loop below builds a linked list in which each node occupies at most one page (typically 4 KB). Each node is headed by a struct poll_list, and the many struct pollfd records are reached through its entries member; the loop copies the user-space struct pollfd array into those entries. A typical caller of poll only monitors a few fds, so the list usually needs just one node, i.e. one page. But when the user passes in many fds, poll must copy every struct pollfd into the kernel on each call, and argument copying plus page allocation then become a performance bottleneck of the poll system call.
	head = NULL;
	walk = NULL;
	i = nfds;
	err = -ENOMEM;
	while(i!=0) {
		struct poll_list *pp;
		pp = kmalloc(sizeof(struct poll_list)+
				sizeof(struct pollfd)*
				(i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i),
				GFP_KERNEL);
		if(pp==NULL)
			goto out_fds;
		pp->next=NULL;
		pp->len = (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i);
		if (head == NULL)
			head = pp;
		else
			walk->next = pp;
		walk = pp;
		if (copy_from_user(pp->entries, ufds + nfds-i,
				sizeof(struct pollfd)*pp->len)) {
			err = -EFAULT;
			goto out_fds;
		}
		i -= pp->len;
	}
Call do_poll() to check whether any descriptor is ready
	fdcount = do_poll(nfds, head, &table, timeout);

	/* OK, now copy the revents fields back to user space. */
Copy the revents fields back to user space
The while loop below copies the revents fields filled in by do_poll() back to user space; when the descriptor count is large, this copy is another performance bottleneck of the poll system call.
	walk = head;
	err = -EFAULT;
	while(walk != NULL) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j=0; j < walk->len; j++, ufds++) {
			if(__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
		}
		walk = walk->next;
	}
	err = fdcount;
	if (!fdcount && signal_pending(current))
		err = -EINTR;
Free the allocated memory
out_fds:
	walk = head;
	while(walk!=NULL) {
		struct poll_list *pp = walk->next;
		kfree(walk);
		walk = pp;
	}
	poll_freewait(&table);
	return err;
}
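To put the kernel code above in context, here is a minimal user-space caller of this system call. This is plain POSIX code, not part of the 2.6.9 source:

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Watch stdin for readability; events/revents are the same
	 * struct pollfd fields that sys_poll copies in and out. */
	struct pollfd pfd = { .fd = STDIN_FILENO, .events = POLLIN };

	int n = poll(&pfd, 1, 5000);	/* timeout in milliseconds */
	if (n < 0)
		perror("poll");
	else if (n == 0)
		printf("timed out, no descriptor ready\n");
	else if (pfd.revents & POLLIN)
		printf("stdin is readable\n");
	return 0;
}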
2. poll_initwait
Set the callback function of the poll_table
Clearly, poll_initwait's main job is to set the callback in table's poll_table member to __pollwait. This __pollwait is needed not only by the poll system call; select uses the very same __pollwait. Put bluntly, __pollwait is the kernel's "house" callback for this kind of asynchronous operation.
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	// i.e. pwq->pt.qproc = __pollwait;
	pwq->error = 0;
	pwq->table = NULL;
}
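For reference, init_poll_funcptr is a one-line inline from include/linux/poll.h, so the commented-out line above is exactly what happens:

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->qproc = qproc;
}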
3. do_poll
static int do_poll(unsigned int nfds, struct poll_list *list,
		struct poll_wqueues *wait, long timeout)
{
	int count = 0;
	poll_table* pt = &wait->pt;

	if (!timeout)
		pt = NULL;

	for (;;) {
		struct poll_list *walk;

		set_current_state(TASK_INTERRUPTIBLE);
		walk = list;
		while(walk != NULL) {
Call do_pollfd
For every descriptor the user passed in, do_pollfd() registers the callback to run when the awaited event occurs on that descriptor, and at the same time checks whether the event is already pending; if so, count is incremented. When the user passes in many fds (say 1000), do_pollfd() is invoked that many times on every scan, which is the other source of poll's efficiency bottleneck.
			do_pollfd(walk->len, walk->entries, &pt, &count);
			walk = walk->next;
		}
		pt = NULL;
Return conditions
The loop is left, and do_poll() returns, when an awaited event is ready, the timeout expires, or the current process receives a signal.
		if (count || !timeout || signal_pending(current))
			break;
		count = wait->error;
		if (count)
			break;
Sleep
schedule_timeout() suspends current and lets other processes run; current runs again either when the timeout expires or when an event current is waiting on wakes it up. It returns the number of jiffies left in the timeout, which is fed back into timeout. Execution then goes back to the start of the for loop and all descriptors are scanned again.
		timeout = schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);
	return count;
}
4. do_pollfd
static void do_pollfd(unsigned int num, struct pollfd * fdpage,
		poll_table ** pwait, int *count)
{
	int i;

	for (i = 0; i < num; i++) {
		int fd;
		unsigned int mask;
		struct pollfd *fdp;

		mask = 0;
		fdp = fdpage+i;
		fd = fdp->fd;
		if (fd >= 0) {
			struct file * file = fget(fd);
			mask = POLLNVAL;
			if (file != NULL) {
				mask = DEFAULT_POLLMASK;
				if (file->f_op && file->f_op->poll)
Call the device driver's poll method
If fd refers to a socket, do_pollfd() calls the poll implementation of the network code; if fd refers to an open file on, say, an ext3 filesystem, it calls the poll implementation of the ext3 driver. In one sentence: file->f_op->poll is implemented by the device driver. What does a driver's poll implementation typically look like? The standard pattern is to call poll_wait() with the device's own wait queue as an argument (devices generally have their own wait queue; a device with no support for asynchronous operation would be painful to use), which in turn invokes the struct poll_table callback. A minimal sketch of such a driver method follows this function.
					mask = file->f_op->poll(file, *pwait);
				mask &= fdp->events | POLLERR | POLLHUP;
				fput(file);
			}
			if (mask) {
				*pwait = NULL;
				(*count)++;
			}
		}
		fdp->revents = mask;
	}
}
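As promised above, here is a minimal sketch of the standard driver-side poll pattern. The names mydev, read_wait, and data_ready are hypothetical, chosen only for illustration; they do not appear in the 2.6.9 source:

struct mydev {				/* hypothetical per-device state */
	wait_queue_head_t read_wait;	/* the device's own wait queue */
	int data_ready;			/* device-specific readiness flag */
};

static unsigned int mydev_poll(struct file *file, poll_table *wait)
{
	struct mydev *dev = file->private_data;
	unsigned int mask = 0;

	/* Register on the device's wait queue; under the poll system
	 * call this ends up invoking __pollwait(). */
	poll_wait(file, &dev->read_wait, wait);

	if (dev->data_ready)		/* report what is ready right now */
		mask |= POLLIN | POLLRDNORM;
	return mask;
}

Note that the method never blocks: it registers current on the wait queue and reports the events that are ready at this moment; blocking, if any, happens back in do_poll().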
5. tcp_poll
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	struct tcp_opt *tp = tcp_sk(sk);

Call poll_wait() to register the callback
The core of tcp_poll is poll_wait(), and poll_wait() simply invokes the callback stored in the struct poll_table; for the poll system call that callback is __pollwait. So this line can be read as essentially one statement: __pollwait(file, sk->sk_sleep, wait); (poll_wait's definition is shown after this function). It also shows that every socket carries its own wait queue, sk_sleep, so the "device wait queue" mentioned above is in fact not a single queue.
	poll_wait(file, sk->sk_sleep, wait);
	// effectively: __pollwait(file, sk->sk_sleep, wait);
	if (sk->sk_state == TCP_LISTEN)
		return tcp_listen_poll(sk, wait);

	/* Socket is not locked. We are protected from async events
	   by poll logic and correct handling of state changes
	   made by another threads is impossible in any case.
	 */
Test the socket's state and report the result to the caller
	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	/*
	 * POLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if PULLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why PULLHUP is incompatible with POLLOUT. --ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM;
	/* Connected? */
	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Potential race condition. If read of tp below will
		 * escape above sk->sk_state, we can be illegally awaken
		 * in SYN_* states. */
		if ((tp->rcv_nxt != tp->copied_seq) &&
		    (tp->urg_seq != tp->copied_seq ||
		     tp->rcv_nxt != tp->copied_seq + 1 ||
		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	return mask;
}
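The poll_wait() wrapper used above is itself trivial; its definition in include/linux/poll.h is:

static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
	if (p && wait_address)
		p->qproc(filp, wait_address, p);
}

The NULL check on p is why do_poll() clears pt after the first full scan (and passes NULL for a zero timeout): once current has been queued on every device, subsequent scans skip the registration step and poll_wait degenerates to a no-op.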
6. __pollwait
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
	struct poll_table_page *table = p->table;
Create the corresponding data structures
	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			__set_current_state(TASK_RUNNING);
			return;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}
Put the current process on the device's wait queue
	/* Add a new entry */
	{
		struct poll_table_entry * entry = table->entry;
		table->entry = entry+1;
		get_file(filp);
		entry->filp = filp;
		entry->wait_address = wait_address;
		init_waitqueue_entry(&entry->wait, current);
		add_wait_queue(wait_address, &entry->wait);
	}
}
__pollwait thus builds the data structure sketched below (one call to __pollwait, i.e. one device poll call, creates exactly one poll_table_entry) and, through the wait member of struct poll_table_entry, hangs current on the device's wait queue. The wait queue here is wait_address, which corresponds to sk->sk_sleep in tcp_poll.
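In text form, the structure built up by successive __pollwait calls looks roughly like this (one poll_table_entry per polled descriptor, with extra pages chained on as needed):

struct poll_wqueues (on sys_poll's stack)
 |-- pt.qproc == __pollwait
 |-- table
       |
       v
 struct poll_table_page --next--> struct poll_table_page --> ...
 |-- entry  (next free slot in entries[])
 |-- entries[]: struct poll_table_entry
       |-- filp          (the struct file, pinned by get_file())
       |-- wait          (wait_queue_t linking current into the queue)
       |-- wait_address  (the device's wait queue head, e.g. sk->sk_sleep)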
3. Weaknesses of poll
As the source analysis above shows, although poll removes select's hard limit on the number of descriptors, it leaves select's other weaknesses unsolved. poll's weaknesses show up mainly in the following four respects:
- On every call, poll copies all struct pollfd records into the kernel; when the descriptor count is large, argument copying and page allocation become a performance bottleneck of the poll system call.
- poll must iterate over every registered descriptor whether it is ready or not, so its execution time grows with the number of descriptors.
- poll must copy the revents fields back to user space; with many descriptors this copy is another performance bottleneck.
- poll only reports whether a descriptor is ready; it does none of the data handling itself. Once a descriptor is ready, the user must perform the actual I/O work.