Linux Kernel2.6.9内核源码分析–select
需要解决的问题:
通过追踪内核源码,查看内核是如何实现select监听的功能
首先来看下select API的定义和参数:
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
参数说明:
int nfds:是一个整数值, 表示集合中所有文件描述符的范围,即所有文件描述符的最大值+1
在后面的代码中可以看到,Linux 内核的实现方式是以0 ~ nfds-1作为下标,在进程描述符的files数组中依次查询各个文件描述符有没有事件上报
fd_set *readfds, *writefds, *exceptfds:分别代表监听读/写/异常(错误)事件的文件描述符集,实际上是一个用long型数组实现的位图(bitmap),每个bit对应一个文件描述符.当select返回后,内核会改写这些集合,集合中的内容不再是调用前设置的待监听描述符,因此每次调用select前都需要重新初始化这些文件描述符集.如果某个文件描述符上有事件发生,则将对应集合中该描述符的bit设置为1,没有事件发生就设置为0.如readfds集合中第二个文件描述符有读事件发生,则返回后该位图中第二个bit为1,其余没有读事件的bit被清0,集合不再是调用前传入的内容.
struct timeval *timeout:超时时间.超过这个时间后,无论有没有监听到事件,select都不再阻塞,立刻返回.
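返回值:成功时返回就绪的文件描述符个数(即三个集合中被置位的bit总数,对应do_select中的retval),超时返回0,出错返回-1并设置errno(内核中sys_select返回负的错误码,如-ENOMEM, -EFAULT)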
再来看下select API对应的系统调用:
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
1.查看参数n是否超过了最大值max_fdset,超过了n就等于max_fdset
2.从slab中(通过select_bits_alloc)分配6 x FDS_BYTES(n)字节的空间(FDS_BYTES(n)是以long为单位容纳n个bit所需的字节数),分别为:fds.in, fds.out, fds.ex, fds.res_in, fds.res_out, fds.res_ex,然后通过get_fd_set将user space的参数inp, outp, exp分别copy到fds.in, fds.out, fds.ex
3.调用do_select,在该函数中会sleep,直到有事件发生、timeout或者收到信号 //核心函数
4.通过set_fd_set将fds.res_in, fds.res_out, fds.res_ex拷贝回用户空间的inp, outp, exp,从而用户就知道哪个文件描述符有事件发生.
/*
 * sys_select - kernel-side entry for select() (excerpt: the dotted lines
 * mark code the article left out, including the local declarations and
 * the handling of tvp).
 *
 * n:              number of descriptors to examine; clamped to max_fdset
 * inp/outp/exp:   user-space fd_set pointers for read/write/exception
 * tvp:            user-space timeout (its conversion is in the elided part)
 *
 * Returns the value produced by do_select(), or a negative error code
 * (-ENOMEM if the bitmap allocation fails, the get_fd_set() error, or
 * -EFAULT if writing a result set back fails).
 */
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
...............
/* max_fdset can increase, so grab it once to avoid race */
max_fdset = current->files->max_fdset;
if (n > max_fdset)
n = max_fdset;
/*
* We need 6 bitmaps (in/out/ex for both incoming and outgoing),
* since we used fdset we need to allocate memory in units of
* long-words.
*/
/* Preset the error so the allocation-failure path can jump straight out. */
ret = -ENOMEM;
size = FDS_BYTES(n);
bits = select_bits_alloc(size);
if (!bits)
goto out_nofds;
/* Carve the single allocation into six consecutive bitmaps of `size` bytes. */
fds.in = (unsigned long *) bits;
fds.out = (unsigned long *) (bits + size);
fds.ex = (unsigned long *) (bits + 2*size);
fds.res_in = (unsigned long *) (bits + 3*size);
fds.res_out = (unsigned long *) (bits + 4*size);
fds.res_ex = (unsigned long *) (bits + 5*size);
/*
 * Load the three request sets; the first non-zero return becomes ret.
 * NOTE(review): get_fd_set() is not shown here - presumably it copies
 * the user fd_set into the kernel bitmap; confirm against its definition.
 */
if ((ret = get_fd_set(n, inp, fds.in)) ||
(ret = get_fd_set(n, outp, fds.out)) ||
(ret = get_fd_set(n, exp, fds.ex)))
goto out;
/* Result bitmaps start empty; do_select() only ever sets bits in them. */
zero_fd_set(n, fds.res_in);
zero_fd_set(n, fds.res_out);
zero_fd_set(n, fds.res_ex);
/* `timeout` is prepared in the elided code above. */
ret = do_select(n, &fds, &timeout);
...........
/*
 * Hand the result bitmaps back through the caller's inp/outp/exp;
 * any failure here overrides ret with -EFAULT.
 */
if (set_fd_set(n, inp, fds.res_in) ||
set_fd_set(n, outp, fds.res_out) ||
set_fd_set(n, exp, fds.res_ex))
ret = -EFAULT;
out:
select_bits_free(bits, size);
out_nofds:
return ret;
}
再来看 int do_select(int n, fd_set_bits *fds, long *timeout)
- void poll_initwait(struct poll_wqueues *pwq) 是将struct poll_wqueues table变量进行初始化:
/* Per-call bookkeeping that do_select() sets up with poll_initwait(). */
struct poll_wqueues {
/* Handed to f_op->poll() as `wait` (do_select uses &table.pt). */
poll_table pt;
/* Storage for the wait-queue entries; starts out NULL per the text below. */
struct poll_table_page * table;
/* Checked by do_select() after a scan; a non-zero value aborts the wait. */
int error;
};
其中pt 的类型poll_table是一个只包含函数指针成员qproc的结构体,qproc的类型为:typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
table.pt.qproc = __pollwait 函数,后面再来分析这个函数
table.table = NULL,
table.error = 0
- 大循环依次遍历每个文件描述符,调用file->f_op->poll,并只在第一轮遍历时传入table.pt:第一轮遍历中__pollwait已经把当前进程挂到了各个文件的等待队列上,之后的遍历只需要查询事件,不需要重复挂接,所以第一轮结束后wait被置为NULL(poll_wait在p为NULL时什么都不做)
在前一篇博客eventpoll中有解释到,以本地socket为例,file->f_op->poll,最终会调用到unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait),其会先调用
/*
 * poll_wait - forward a wait-queue registration request to the poll_table's
 * callback. Does nothing when either the table or the wait-queue head is
 * NULL, which is how a caller passing a NULL table skips registration.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && wait_address)
p->qproc(filp, wait_address, p);
}
也就是前面table.pt 即__pollwait 函数
再来看下__pollwait函数:
/*
 * __pollwait - record one (file, wait queue) pair and queue the current
 * task on that wait queue (excerpt: the ellipsis marks elided code, which
 * is where `table` is obtained).
 */
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
…
{
/* Take the next free entry and advance the cursor past it. */
struct poll_table_entry * entry = table->entry;
table->entry = entry+1;
/* NOTE(review): get_file() presumably takes a reference on filp - confirm. */
get_file(filp);
/* Remember which file and which wait queue this entry belongs to. */
entry->filp = filp;
entry->wait_address = wait_address;
/* Bind the wait-queue entry to the current task, then enqueue it. */
init_waitqueue_entry(&entry->wait, current);
add_wait_queue(wait_address,&entry->wait);
}
}
也就是将代表当前进程的等待队列项(entry->wait)加入到sk->sk_sleep等待队列中,在socket状态变化时,执行等待队列项的回调函数,唤醒等待的进程.
- 调用file->f_op->poll返回后,如果有event,则将res_in/res_out/res_ex中对应的bit设定为1.
/*
 * unix_poll - report the current poll event mask of a unix socket.
 *
 * First passes the socket's sk_sleep wait queue to poll_wait() (which
 * registers the caller when `wait` is non-NULL), then builds the mask from
 * the socket's error, shutdown, receive-queue and writability state.
 */
static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	poll_wait(file, sk->sk_sleep, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= POLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue) ||
	    (sk->sk_shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	return mask;
}
/*
 * do_select - scan the requested descriptors, and sleep and rescan until
 * something is ready (excerpt: the dotted line marks elided declarations
 * and setup, including i, retval, table, wait and __timeout).
 *
 * n:       descriptors 0 .. n-1 are examined
 * fds:     request bitmaps (in/out/ex) and result bitmaps (res_in/res_out/res_ex)
 * timeout: on return receives the final value of __timeout
 *
 * Returns retval: the number of (descriptor, event class) matches found,
 * 0 if none, or table.error when that was set during a scan.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
.............
poll_initwait(&table); // step 1: set up the poll_wqueues bookkeeping
wait = &table.pt;
/* A zero timeout never sleeps, so there is nothing to register for. */
if (!__timeout)
wait = NULL;
retval = 0;
/* Each pass of this loop is one full scan of all n descriptors. */
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
/* Mark the task as sleeping before scanning, ahead of schedule_timeout(). */
set_current_state(TASK_INTERRUPTIBLE);
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
/* One iteration per bitmap word; i itself is advanced inside the body. */
for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, mask, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
struct file_operations *f_op = NULL;
struct file *file = NULL;
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
/* Nothing requested in this word: skip all of its descriptors at once. */
if (all_bits == 0) {
i += __NFDBITS;
continue;
}
/* Walk the bits of this word; `bit` is the mask for descriptor i. */
for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
if (i >= n)
break;
if (!(bit & all_bits))
continue;
file = fget(i);
if (file) {
f_op = file->f_op;
/* Files without a poll method get the default mask. */
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll)
mask = (*f_op->poll)(file, retval ? NULL : wait); // once retval != 0 the check below breaks out without sleeping, so no further wait-queue registration is needed and NULL is passed
fput(file);
/* Record a hit only for the event classes the caller asked about. */
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
}
}
}
/* Write back only non-empty result words. */
if (res_in)
*rinp = res_in;
if (res_out)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
}
/* After the first full scan, later scans only query and never re-register. */
wait = NULL;
if (retval || !__timeout || signal_pending(current)) // step 3: stop on a hit, a zero/expired timeout, or a pending signal
break;
/* NOTE(review): table.error is set outside this excerpt - confirm where. */
if(table.error) {
retval = table.error;
break;
}
/* NOTE(review): presumably sleeps and returns the time left - confirm. */
__timeout = schedule_timeout(__timeout);
}
__set_current_state(TASK_RUNNING);
poll_freewait(&table);
*timeout = __timeout;
return retval;
}