Linux 2.6.9 poll System Call: Source Code Analysis
1. Data Structures Involved
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
// Declares the callback type. Different kernel subsystems implement different
// callbacks behind this single interface, which device drivers then invoke.

typedef struct poll_table_struct {
	poll_queue_proc qproc;
} poll_table;
// A wrapper around the callback type, leaving room for future extension
struct poll_table_page {
	struct poll_table_page * next;        // next page in the list
	struct poll_table_entry * entry;      // current (next free) slot in entries[]
	struct poll_table_entry entries[0];   // the poll_table_entry records themselves
};
// Data structure used in the kernel, one page at a time, to hold the wait records for the polled descriptors
struct poll_table_entry {
	struct file * filp;
	wait_queue_t wait;
	wait_queue_head_t * wait_address;
};
// The kernel structure corresponding to a single descriptor
struct poll_wqueues {
	poll_table pt;
	struct poll_table_page * table;
	int error;
};
// Structure used by the wait machinery; one instance exists per poll() call
struct pollfd {
	int fd;
	short events;
	short revents;
};
// The user-space descriptor record
struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[0];
};
// Structure used when copying the user-space pollfd records into the kernel
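For reference, the sizing of these page-backed lists is governed by two macros in fs/select.c (quoted here from 2.6.9):

#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

On a 32-bit machine with 4 KB pages and an 8-byte struct pollfd, POLLFD_PER_PAGE works out to (4096-8)/8 = 511 entries per poll_list node.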
2. Source Analysis of the Corresponding Functions
1. sys_poll
asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
{
	struct poll_wqueues table;
	int fdcount, err;
	unsigned int i;
	struct poll_list *head;
	struct poll_list *walk;
	/* Do a sanity check on nfds ... */
Check the descriptor count
The number of descriptors passed in by the user may not exceed the maximum number of descriptors the process supports.
	if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
		return -EINVAL;
	if (timeout) {
		/* Careful about overflow in the intermediate values */
		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = (unsigned long)(timeout*HZ+999)/1000+1;
		else /* Negative or overflow */
			timeout = MAX_SCHEDULE_TIMEOUT;
	}
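The block above converts the millisecond timeout supplied by user space into jiffies, rounding upward. For example, with HZ=100 a timeout of 1000 ms becomes (1000*100+999)/1000+1 = 101 jiffies, so the process never sleeps for less than the requested interval.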
Initialize the table variable
As the name suggests, poll_initwait() initializes table; note that table is a key variable throughout the execution of poll.
	poll_initwait(&table);
Copy in the descriptors passed by the user
The while loop below builds a linked list in which each node occupies at most one page (typically 4 KB). Each node is headed by a struct poll_list, and the many struct pollfd records are reached through its entries member; the loop copies the user-space struct pollfd array into those entries. A typical caller of poll only monitors a few fds, so the list usually needs just one node, i.e. one page. But when the user passes in many fds, poll must copy every struct pollfd into the kernel on each call, and argument copying plus page allocation then become a performance bottleneck of the poll system call.
	head = NULL;
	walk = NULL;
	i = nfds;
	err = -ENOMEM;
	while(i!=0) {
		struct poll_list *pp;
		pp = kmalloc(sizeof(struct poll_list)+
				sizeof(struct pollfd)*
				(i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i),
				GFP_KERNEL);
		if(pp==NULL)
			goto out_fds;
		pp->next=NULL;
		pp->len = (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i);
		if (head == NULL)
			head = pp;
		else
			walk->next = pp;
		walk = pp;
		if (copy_from_user(pp->entries, ufds + nfds-i,
				sizeof(struct pollfd)*pp->len)) {
			err = -EFAULT;
			goto out_fds;
		}
		i -= pp->len;
	}
Call do_poll() to check whether any descriptor is ready
	fdcount = do_poll(nfds, head, &table, timeout);

	/* OK, now copy the revents fields back to user space. */
Copy the revents fields back to user space
The while loop below copies the revents fields filled in by do_poll() back to user space; when the descriptor count is large, this copy is another performance bottleneck of the poll system call.
	walk = head;
	err = -EFAULT;
	while(walk != NULL) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j=0; j < walk->len; j++, ufds++) {
			if(__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
		}
		walk = walk->next;
	}
	err = fdcount;
	if (!fdcount && signal_pending(current))
		err = -EINTR;
Free the allocated memory
out_fds:
	walk = head;
	while(walk!=NULL) {
		struct poll_list *pp = walk->next;
		kfree(walk);
		walk = pp;
	}
	poll_freewait(&table);
	return err;
}
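To put the kernel code above in context, here is a minimal user-space caller of this system call. This is plain POSIX code, not part of the 2.6.9 source:

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Watch stdin for readability; events/revents are the same
	 * struct pollfd fields that sys_poll copies in and out. */
	struct pollfd pfd = { .fd = STDIN_FILENO, .events = POLLIN };

	int n = poll(&pfd, 1, 5000);	/* timeout in milliseconds */
	if (n < 0)
		perror("poll");
	else if (n == 0)
		printf("timed out, no descriptor ready\n");
	else if (pfd.revents & POLLIN)
		printf("stdin is readable\n");
	return 0;
}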
2. poll_initwait
Set the callback function of the poll_table
Clearly, poll_initwait's main job is to set the callback in table's poll_table member to __pollwait. This __pollwait is needed not only by the poll system call; select uses the very same __pollwait. Put bluntly, __pollwait is the kernel's "house" callback for this kind of asynchronous operation.
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	// i.e. pwq->pt.qproc = __pollwait;
	pwq->error = 0;
	pwq->table = NULL;
}
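For reference, init_poll_funcptr is a one-line inline from include/linux/poll.h, so the commented-out line above is exactly what happens:

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->qproc = qproc;
}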
3. do_poll
static int do_poll(unsigned int nfds, struct poll_list *list,
		struct poll_wqueues *wait, long timeout)
{
	int count = 0;
	poll_table* pt = &wait->pt;

	if (!timeout)
		pt = NULL;

	for (;;) {
		struct poll_list *walk;

		set_current_state(TASK_INTERRUPTIBLE);
		walk = list;
		while(walk != NULL) {
Call do_pollfd
For every descriptor the user passed in, do_pollfd() registers the callback to run when the awaited event occurs on that descriptor, and at the same time checks whether the event is already pending; if so, count is incremented. When the user passes in many fds (say 1000), do_pollfd() is invoked that many times on every scan, which is the other source of poll's efficiency bottleneck.
			do_pollfd(walk->len, walk->entries, &pt, &count);
			walk = walk->next;
		}
		pt = NULL;
Return conditions
The loop is left, and do_poll() returns, when an awaited event is ready, the timeout expires, or the current process receives a signal.
		if (count || !timeout || signal_pending(current))
			break;
		count = wait->error;
		if (count)
			break;
Sleep
schedule_timeout() suspends current and lets other processes run; current runs again either when the timeout expires or when an event current is waiting on wakes it up. It returns the number of jiffies left in the timeout, which is fed back into timeout. Execution then goes back to the start of the for loop and all descriptors are scanned again.
		timeout = schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);
	return count;
}
4. do_pollfd
static void do_pollfd(unsigned int num, struct pollfd * fdpage,
		poll_table ** pwait, int *count)
{
	int i;

	for (i = 0; i < num; i++) {
		int fd;
		unsigned int mask;
		struct pollfd *fdp;

		mask = 0;
		fdp = fdpage+i;
		fd = fdp->fd;
		if (fd >= 0) {
			struct file * file = fget(fd);
			mask = POLLNVAL;
			if (file != NULL) {
				mask = DEFAULT_POLLMASK;
				if (file->f_op && file->f_op->poll)
Call the device driver's poll method
If fd refers to a socket, do_pollfd() calls the poll implementation of the network code; if fd refers to an open file on, say, an ext3 filesystem, it calls the poll implementation of the ext3 driver. In one sentence: file->f_op->poll is implemented by the device driver. What does a driver's poll implementation typically look like? The standard pattern is to call poll_wait() with the device's own wait queue as an argument (devices generally have their own wait queue; a device with no support for asynchronous operation would be painful to use), which in turn invokes the struct poll_table callback. A minimal sketch of such a driver method follows this function.
					mask = file->f_op->poll(file, *pwait);
				mask &= fdp->events | POLLERR | POLLHUP;
				fput(file);
			}
			if (mask) {
				*pwait = NULL;
				(*count)++;
			}
		}
		fdp->revents = mask;
	}
}
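As promised above, here is a minimal sketch of the standard driver-side poll pattern. The names mydev, read_wait, and data_ready are hypothetical, chosen only for illustration; they do not appear in the 2.6.9 source:

struct mydev {				/* hypothetical per-device state */
	wait_queue_head_t read_wait;	/* the device's own wait queue */
	int data_ready;			/* device-specific readiness flag */
};

static unsigned int mydev_poll(struct file *file, poll_table *wait)
{
	struct mydev *dev = file->private_data;
	unsigned int mask = 0;

	/* Register on the device's wait queue; under the poll system
	 * call this ends up invoking __pollwait(). */
	poll_wait(file, &dev->read_wait, wait);

	if (dev->data_ready)		/* report what is ready right now */
		mask |= POLLIN | POLLRDNORM;
	return mask;
}

Note that the method never blocks: it registers current on the wait queue and reports the events that are ready at this moment; blocking, if any, happens back in do_poll().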
5. tcp_poll
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	struct tcp_opt *tp = tcp_sk(sk);

Call poll_wait() to register the callback
The core of tcp_poll is poll_wait(), and poll_wait() simply invokes the callback stored in the struct poll_table; for the poll system call that callback is __pollwait. So this line can be read as essentially one statement: __pollwait(file, sk->sk_sleep, wait); (poll_wait's definition is shown after this function). It also shows that every socket carries its own wait queue, sk_sleep, so the "device wait queue" mentioned above is in fact not a single queue.
	poll_wait(file, sk->sk_sleep, wait);
	// effectively: __pollwait(file, sk->sk_sleep, wait);
	if (sk->sk_state == TCP_LISTEN)
		return tcp_listen_poll(sk, wait);

	/* Socket is not locked. We are protected from async events
	   by poll logic and correct handling of state changes
	   made by another threads is impossible in any case.
	 */
Test the socket's state and report the result to the caller
	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	/*
	 * POLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if PULLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why PULLHUP is incompatible with POLLOUT. --ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM;
	/* Connected? */
	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Potential race condition. If read of tp below will
		 * escape above sk->sk_state, we can be illegally awaken
		 * in SYN_* states. */
		if ((tp->rcv_nxt != tp->copied_seq) &&
		    (tp->urg_seq != tp->copied_seq ||
		     tp->rcv_nxt != tp->copied_seq + 1 ||
		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	return mask;
}
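The poll_wait() wrapper used above is itself trivial; its definition in include/linux/poll.h is:

static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
	if (p && wait_address)
		p->qproc(filp, wait_address, p);
}

The NULL check on p is why do_poll() clears pt after the first full scan (and passes NULL for a zero timeout): once current has been queued on every device, subsequent scans skip the registration step and poll_wait degenerates to a no-op.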
6. __pollwait
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
	struct poll_table_page *table = p->table;
Create the corresponding data structures
	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			__set_current_state(TASK_RUNNING);
			return;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}
Put the current process on the device's wait queue
	/* Add a new entry */
	{
		struct poll_table_entry * entry = table->entry;
		table->entry = entry+1;
		get_file(filp);
		entry->filp = filp;
		entry->wait_address = wait_address;
		init_waitqueue_entry(&entry->wait, current);
		add_wait_queue(wait_address, &entry->wait);
	}
}
__pollwait thus builds the data structure sketched below (one call to __pollwait, i.e. one device poll call, creates exactly one poll_table_entry) and, through the wait member of struct poll_table_entry, hangs current on the device's wait queue. The wait queue here is wait_address, which corresponds to sk->sk_sleep in tcp_poll.
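In text form, the structure built up by successive __pollwait calls looks roughly like this (one poll_table_entry per polled descriptor, with extra pages chained on as needed):

struct poll_wqueues (on sys_poll's stack)
 |-- pt.qproc == __pollwait
 |-- table
       |
       v
 struct poll_table_page --next--> struct poll_table_page --> ...
 |-- entry  (next free slot in entries[])
 |-- entries[]: struct poll_table_entry
       |-- filp          (the struct file, pinned by get_file())
       |-- wait          (wait_queue_t linking current into the queue)
       |-- wait_address  (the device's wait queue head, e.g. sk->sk_sleep)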
3. Weaknesses of poll
As the source analysis above shows, although poll removes select's hard limit on the number of descriptors, it leaves select's other weaknesses unsolved. poll's weaknesses show up mainly in the following four respects:
- On every call, poll copies all struct pollfd records into the kernel; when the descriptor count is large, argument copying and page allocation become a performance bottleneck of the poll system call.
- poll must iterate over every registered descriptor whether it is ready or not, so its execution time grows with the number of descriptors.
- poll must copy the revents fields back to user space; with many descriptors this copy is another performance bottleneck.
- poll only reports whether a descriptor is ready; it does none of the data handling itself. Once a descriptor is ready, the user must perform the actual I/O work.