1.用户空间
/**********************************************************************************
* include/asm-generic/poll.h
* These are specified by iBCS2
#define POLLIN 0x0001
#define POLLPRI 0x0002
#define POLLOUT 0x0004
#define POLLERR 0x0008
#define POLLHUP 0x0010
#define POLLNVAL 0x0020
struct pollfd {
int fd;
short events;
short revents;
};
*/
/*
 * run_app - shuttle data between app->afd and app->bfd until one side fails.
 *
 * Blocks in poll() (timeout -1) on both descriptors.  Whichever descriptor
 * reports POLLIN is read (up to 128 bytes) and the data is written to the
 * other one.  The function returns when poll() fails with anything other
 * than EINTR/ENOMEM, when a descriptor reports an event without POLLIN,
 * when the read returns <= 0, or when the write fails.
 */
static void run_app(struct app_data_t *app)
{
	struct pollfd fds[2];

	fds[0].fd = app->afd;
	fds[0].events = POLLIN | POLLERR | POLLHUP | POLLNVAL;
	fds[1].fd = app->bfd;
	fds[1].events = POLLIN | POLLERR | POLLHUP | POLLNVAL;

	for (;;) {
		char buf[128];
		int src;
		int dst;
		int n;

		/*
		 * Author's question: does poll() block here, and how does
		 * revents get its value?  (Answered by the kernel walkthrough
		 * that follows in this file.)
		 */
		if (poll(fds, 2, -1) < 0) {
			if (errno == EINTR || errno == ENOMEM)
				continue;
			return;
		}

		/* The first descriptor has priority when both are ready. */
		if (fds[0].revents) {
			if (!(fds[0].revents & POLLIN))
				return;
			src = app->afd;
			dst = app->bfd;
		} else if (fds[1].revents) {
			if (!(fds[1].revents & POLLIN))
				return;
			src = app->bfd;
			dst = app->afd;
		} else {
			/* Nothing reported: poll again. */
			continue;
		}

		if (src < 0 || dst < 0)
			continue;

		n = safe_read(src, buf, sizeof(buf));
		if (n <= 0)
			return;

		if (full_write(dst, buf, n) < 0)
			return;
	}
}
/*这里系统调用为poll(pfd, 2, -1),timeout_msecs == -1 < 0
*do_sys_poll(ufds, nfds, NULL);
*/
2. 内核空间
/*
 * poll(2) system call entry point.
 *
 * @ufds:          user-space array of struct pollfd
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: timeout in milliseconds; a negative value means no timeout
 *                 (a NULL end time is passed to do_sys_poll())
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is converted
 * into -ERESTART_RESTARTBLOCK after the restart block has been filled in.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	/* 2.1] Timeout handling: only set up an end time for timeout >= 0. */
	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	/* 2.2] The main part. */
	ret = do_sys_poll(ufds, nfds, to);

	/* 2.3] #define EINTR 4  -- Interrupted system call */
	if (ret == -EINTR) {
		struct restart_block *restart_block;

		/* Record the arguments so do_restart_poll can resume the call. */
		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}
系统调用int poll(struct pollfd *fds, nfds_t nfds, int timeout)的实现由3部分组成:
1.有关timeout的处理;2.主体函数do_sys_poll;3.特殊情况(返回-EINTR时设置restart_block)的处理;
2.2 do_sys_poll
/*
 * do_sys_poll - copy the user's pollfd array in, poll it, copy revents back.
 *
 * @ufds:     user-space array of struct pollfd
 * @nfds:     number of entries in @ufds
 * @end_time: absolute end time, or NULL for no timeout
 *
 * Returns the value produced by do_poll() on success, -EINVAL if @nfds
 * exceeds RLIMIT_NOFILE, -EFAULT if copying from/to user space fails, or
 * -ENOMEM if a poll_list node cannot be allocated.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;
	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	len = min_t(unsigned int, nfds, N_STACK_PPS);

	/*
	 * This loop builds the poll_list from the input arguments.
	 * Memory is needed for every pollfd; if more pollfds are requested
	 * than fit in the space reserved on the stack, a linked list is
	 * formed.  Only the first node lives in the stack buffer, the
	 * remaining nodes are allocated with kmalloc().
	 *
	 *   struct pollfd {
	 *           int fd;
	 *           short events;
	 *           short revents;
	 *   };
	 *   struct poll_list {
	 *           struct poll_list *next;
	 *           int len;
	 *           struct pollfd entries[0];
	 *   };
	 */
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	/*
	 * Create the poll_wqueues.
	 *
	 * poll_wqueues means "wakeup queues".  Does every pollfd get one?
	 * No: a single poll_wqueues represents this whole poll() call.
	 *
	 * Structures and helpers for select/poll syscall:
	 *
	 *   struct poll_wqueues {
	 *           poll_table pt;
	 *           struct poll_table_page *table;
	 *           struct task_struct *polling_task;
	 *           int triggered;
	 *           int error;
	 *           int inline_index;
	 *           struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
	 *   };
	 *
	 * The memory for "struct poll_wqueues table" comes from the stack.
	 *
	 * Structures and helpers for f_op->poll implementations:
	 *
	 *   typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
	 *                                   struct poll_table_struct *);
	 *
	 * Do not touch the structure directly, use the access functions
	 * poll_does_not_wait() and poll_requested_events() instead.
	 *
	 *   typedef struct poll_table_struct {
	 *           poll_queue_proc _qproc;
	 *           unsigned long _key;
	 *   } poll_table;
	 *
	 *   void poll_initwait(struct poll_wqueues *pwq)
	 *   {
	 *           init_poll_funcptr(&pwq->pt, __pollwait);
	 *           pwq->polling_task = current;
	 *           pwq->triggered = 0;
	 *           pwq->error = 0;
	 *           pwq->table = NULL;
	 *           pwq->inline_index = 0;
	 *   }
	 *
	 *   static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
	 *   {
	 *           pt->_qproc = qproc;
	 *           pt->_key = ~0UL;      (all events enabled)
	 *   }
	 */
	poll_initwait(&table);

	/* do_poll is explained in detail further down. */
	fdcount = do_poll(nfds, head, &table, end_time);

	/*
	 * poll_freewait:
	 *   remove_wait_queue(entry->wait_address, &entry->wait);
	 *   fput(entry->filp);
	 */
	poll_freewait(&table);

	/* For every fd of every poll_list node: copy revents back to the user. */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
	}

	err = fdcount;
out_fds:
	/*
	 * Free the kmalloc()ed poll_list nodes.  The walk starts at
	 * head->next because head itself is the on-stack buffer.
	 */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	/* Normally this is do_poll's return value; the system call returns. */
	return err;
}
/*do_sys_poll函数根据用户输入创建了poll_list, 又创建了poll_wqueues, 但是两者是怎样发生关系的那?
*其他的都是调用do_poll后释放资源。
*/
do_poll是do_sys_poll的核心部分
该函数会调用每个 file 的file_operations 的 poll 函数,如果没有 event 发生且time out 则sleep
/*
 * do_poll - core loop of do_sys_poll().
 *
 * Walks every pollfd in every poll_list node and calls do_pollfd() on it.
 * If no descriptor reported an event, no error/signal is pending and the
 * timeout has not expired, the task sleeps in poll_schedule_timeout() and
 * the scan is repeated after wake-up.
 *
 * @nfds:     number of descriptors (not referenced in this body)
 * @list:     head of the poll_list chain built by do_sys_poll()
 * @wait:     the poll_wqueues of this call
 * @end_time: absolute end time, or NULL for no timeout
 *
 * Returns the number of descriptors with events, wait->error if that is
 * non-zero, -EINTR if a signal is pending, or 0 on timeout.
 */
static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time){
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
/* 1] This walkthrough does not care about end_time, i.e. it assumes
 * end_time == NULL.  A zero end_time means: register no waiters and
 * do exactly one pass. */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
/* This for loop covers the whole poll_list chain. */
for (walk = list; walk != NULL; walk = walk->next) {struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
/* This for loop covers every entry of one node. */
for (; pfd != pfd_end; pfd++) {/*
* Fish for events. If we found one, record it
* and kill poll_table->_qproc, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt)) {
count++;
pt->_qproc = NULL;
}
}
}
/*
* All waiters have already been registered, so don't provide
* a poll_table->_qproc to them on the next loop iteration.
*/
pt->_qproc = NULL;
if (!count) {
/* If count == 0 it can still be assigned here.
 * Open question from the author: where is wait->error set? */
count = wait->error;
if (signal_pending(current))
count = -EINTR;
}
/* The loop exits here.  So where does this function block?  Not in
 * do_pollfd: as shown below, the sleep does not happen there. */
if (count || timed_out)break;
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
* pointer to the expiry value.
*/
if (end_time && !to) {
expire = timespec_to_ktime(*end_time);
to = &expire;
}
/* This is where scheduling happens, i.e. where the current
 * process is put to sleep. */
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))timed_out = 1;
}
return count;
}
do_pollfd
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int revents = 0;
	struct file *filp;
	int need_fput;

	/*
	 * 1] The result starts out as 0 and stays 0 for a negative fd.
	 * (Author's note: write a test to verify that the mask is 0 when
	 * no event has occurred.)
	 */
	if (pollfd->fd < 0)
		goto out;

	filp = fget_light(pollfd->fd, &need_fput);
	if (filp == NULL) {
		/* 2] fd >= 0 but no file behind it: POLLNVAL (0x0020). */
		revents = POLLNVAL;
		goto out;
	}

	/*
	 * 3] A file was found: default to DEFAULT_POLLMASK
	 *    (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM).
	 */
	revents = DEFAULT_POLLMASK;

	/*
	 * 4] Call the file_operations poll method, after storing the events
	 *    the user cares about in pwait->_key.
	 */
	if (filp->f_op && filp->f_op->poll) {
		pwait->_key = pollfd->events | POLLERR | POLLHUP;
		revents = filp->f_op->poll(filp, pwait);
	}

	/* 5] Mask out unneeded events. */
	revents &= pollfd->events | POLLERR | POLLHUP;
	fput_light(filp, need_fput);

out:
	/* Store the result as the fd's returned events. */
	pollfd->revents = revents;
	return revents;
}
调用file operation具体的poll函数 n_tty_poll
/**
 * tty_poll - check tty status
 * @filp: file being polled
 * @wait: poll wait structures to update
 *
 * Call the line discipline polling method to obtain the poll
 * status of the device.
 *
 * Note: tty_poll hands straight over to the line discipline's poll
 * method (n_tty_poll in this walkthrough), which introduces the
 * tty_struct variable.
 */
static unsigned int tty_poll(struct file *filp, poll_table *wait)
{
	struct tty_struct *tty = file_tty(filp);
	struct tty_ldisc *ldisc;
	int status;

	if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll"))
		return 0;

	ldisc = tty_ldisc_ref_wait(tty);
	/* A line discipline without a poll method reports no events. */
	status = ldisc->ops->poll ? (ldisc->ops->poll)(tty, filp, wait) : 0;
	tty_ldisc_deref(ldisc);

	return status;
}
/**
 * n_tty_poll - poll method for N_TTY
 * @tty: terminal device
 * @file: file accessing it
 * @wait: poll table
 *
 * Called when the line discipline is asked to poll() for data or
 * for special events. This code is not serialized with respect to
 * other events save open/close.
 *
 * Note: the event mask is built from the current tty state.  The author
 * asks why poll_wait() is still called; see poll_wait()/__pollwait(),
 * which hook the caller onto the read and write wait queues.
 */
static unsigned int n_tty_poll(struct tty_struct *tty, struct file *file,
			       poll_table *wait)
{
	unsigned int events = 0;

	poll_wait(file, &tty->read_wait, wait);
	poll_wait(file, &tty->write_wait, wait);

	/* Readable conditions. */
	if (input_available_p(tty, TIME_CHAR(tty) ? 0 : MIN_CHAR(tty)))
		events |= POLLIN | POLLRDNORM;
	if (tty->packet && tty->link->ctrl_status)
		events |= POLLPRI | POLLIN | POLLRDNORM;

	/* Hang-up conditions. */
	if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
		events |= POLLHUP;
	if (tty_hung_up_p(file))
		events |= POLLHUP;

	/* Nothing readable and no hang-up: set the wake-up threshold. */
	if ((events & (POLLHUP | POLLIN | POLLRDNORM)) == 0)
		tty->minimum_to_wake =
			(MIN_CHAR(tty) && !TIME_CHAR(tty)) ? MIN_CHAR(tty) : 1;

	/* Writable condition. */
	if (tty->ops->write && !tty_is_writelocked(tty) &&
	    tty_chars_in_buffer(tty) < WAKEUP_CHARS &&
	    tty_write_room(tty) > 0)
		events |= POLLOUT | POLLWRNORM;

	return events;
}
/*
 * poll_wait - invoke the poll table's queueing callback for one wait queue.
 *
 * Does nothing when @p is NULL, when @p has no _qproc callback, or when
 * @wait_address is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !p->_qproc || !wait_address)
		return;

	p->_qproc(filp, wait_address, p);
}
init_poll_funcptr(&pwq->pt, __pollwait);
/*
 * Add a new entry: take a reference on @filp, fill in a poll_table_entry
 * with pollwake() as its wake-up function, and queue it on @wait_address.
 * Silently returns if no entry could be obtained from poll_get_entry().
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p)
{
	struct poll_wqueues *queues = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *slot = poll_get_entry(queues);

	if (slot) {
		get_file(filp);
		slot->filp = filp;
		slot->wait_address = wait_address;
		slot->key = p->_key;

		init_waitqueue_func_entry(&slot->wait, pollwake);
		slot->wait.private = queues;

		add_wait_queue(wait_address, &slot->wait);
	}
}
/*
 * One wait-queue registration made by __pollwait().
 * A single fd may correspond to several poll_table_entry instances
 * (e.g. n_tty_poll calls poll_wait for both read_wait and write_wait).
 */
struct poll_table_entry {
/* file this entry refers to; __pollwait() does get_file() on it */
struct file *filp;
/* event mask copied from poll_table->_key; checked in pollwake() */
unsigned long key;
/* wait queue element; its wake-up function is set to pollwake */
wait_queue_t wait;
/* the wait queue head this entry was added to */
wait_queue_head_t *wait_address;
};
/*
 * add_wait_queue - add the wait_queue_t @wait to the queue headed by @q.
 *
 * The exclusive flag is cleared first, then the entry is inserted while
 * holding q->lock with interrupts saved/disabled.
 */
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long irq_state;

	wait->flags &= ~WQ_FLAG_EXCLUSIVE;

	spin_lock_irqsave(&q->lock, irq_state);
	__add_wait_queue(q, wait);
	spin_unlock_irqrestore(&q->lock, irq_state);
}
/*
 * poll_schedule_timeout - this is where the polling task actually sleeps.
 *
 * do_poll() calls it when the scan found no events (do_pollfd returned 0
 * for every fd), nothing is pending and the timeout has not yet expired.
 *
 * @pwq:     the poll_wqueues of this call
 * @state:   task state to sleep in
 * @expires: absolute expiry time passed to schedule_hrtimeout_range()
 * @slack:   slack passed to schedule_hrtimeout_range()
 *
 * Returns -EINTR without sleeping if pwq->triggered was already set,
 * otherwise the return value of schedule_hrtimeout_range().
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,ktime_t *expires, unsigned long slack)
{
int rc = -EINTR;
set_current_state(state);
/* Skip the sleep if a wake-up already arrived (triggered set by __pollwake). */
if (!pwq->triggered)
rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
* Prepare for the next iteration.
*
* The following set_mb() serves two purposes. First, it's
* the counterpart rmb of the wmb in pollwake() such that data
* written before wake up is always visible after wake up.
* Second, the full barrier guarantees that triggered clearing
* doesn't pass event check of the next iteration. Note that
* this problem doesn't exist for the first iteration as
* add_wait_queue() has full barrier semantics.
*/
set_mb(pwq->triggered, 0);
return rc;
}
/*
 * The condition that controls the sleep is pwq->triggered.  When does it
 * change?
 *   pollwake -> __pollwake -> pwq->triggered = 1
 * poll_wait is called several times, so there are several possible sources
 * that can set pwq->triggered.  As soon as a single event happens the
 * process is woken up, leaves the endless loop and returns the events
 * that occurred.
 *   poll_wait(file, &tty->read_wait, wait);
 *   poll_wait(file, &tty->write_wait, wait);
 */
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
/* wait->private was set to the poll_wqueues in __pollwait() */
struct poll_wqueues *pwq = wait->private;
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
/*
* Although this function is called under waitqueue lock, LOCK
* doesn't imply write barrier and the users expect write
* barrier semantics on wakeup functions. The following
* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
* and is paired with set_mb() in poll_schedule_timeout.
*/
smp_wmb();
pwq->triggered = 1;
/*
* Perform the default wake up operation using a dummy
* waitqueue.
*
* TODO: This is hacky but there currently is no interface to
* pass in @sync. @sync is scheduled to be removed and once
* that happens, wake_up_process() can be used directly.
*/
return default_wake_function(&dummy_wait, mode, sync, key);
}
/*
 * pollwake - wake-up callback installed on every poll_table_entry.
 *
 * When @key is non-NULL it is treated as an event mask; the wake-up is
 * ignored (returns 0) if it shares no bit with the entry's key.  Otherwise
 * the call is forwarded to __pollwake().
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *pte =
		container_of(wait, struct poll_table_entry, wait);

	if (!key || ((unsigned long)key & pte->key))
		return __pollwake(wait, mode, sync, key);

	return 0;
}
/*
 * And when is pollwake itself called?  For example from a wake-up on the
 * tty read wait queue:
*/wake_up_interruptible(&tty->read_wait);
/* Expands to __wake_up() with mode TASK_INTERRUPTIBLE, nr_exclusive 1 and
 * a NULL key (pollwake() does not filter when the key is NULL). */
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
/*
 * __wake_up - run the wake-up callbacks of the entries queued on @q.
 *
 * @q:            wait queue head
 * @mode:         passed through to each entry's wake function
 * @nr_exclusive: passed through to __wake_up_common()
 * @key:          passed through to each entry's wake function
 *
 * The queue is walked by __wake_up_common() while holding q->lock with
 * interrupts saved/disabled.
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
	       int nr_exclusive, void *key)
{
	unsigned long irq_state;

	spin_lock_irqsave(&q->lock, irq_state);
	__wake_up_common(q, mode, nr_exclusive, 0, key);
	spin_unlock_irqrestore(&q->lock, irq_state);
}
/*
 * __wake_up_common - call the wake function of each entry on @q.
 *
 * Iteration stops after an entry whose wake function returned non-zero,
 * that has WQ_FLAG_EXCLUSIVE set, and that brings @nr_exclusive down to 0.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
/* flags are sampled before the callback runs */
unsigned flags = curr->flags;
/* This is where pollwake gets called (for entries added by __pollwait). */
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
怎样从open file返回值得到内核描述文件的file
有多个函数可以 从open file返回值得到内核描述文件的file,其核心都是:
struct files_struct *files = current->files;
struct file *file = fcheck_files(files, fd);
Lightweight file lookup: fget_light -> fcheck_files(files, fd)
/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 * to userspace (i.e. you cannot remember the returned struct file * after
 * returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 * calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 * and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * Returns the struct file for @fd in current->files, or NULL if there is
 * none, if the file has FMODE_PATH set, or if no reference could be taken.
 */
struct file *fget_light(unsigned int fd, int *fput_needed)
{
struct file *file;
struct files_struct *files = current->files;
*fput_needed = 0;
/* files->count == 1: plain lookup, no reference taken, fput_needed stays 0 */
if (atomic_read(&files->count) == 1) {
file = fcheck_files(files, fd);
if (file && (file->f_mode & FMODE_PATH))
file = NULL;
} else {
/* otherwise: look up under rcu_read_lock and take a reference on f_count */
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
if (!(file->f_mode & FMODE_PATH) &&
atomic_long_inc_not_zero(&file->f_count))
*fput_needed = 1;
else
/* Didn't get the reference, someone's freed */
file = NULL;
}
rcu_read_unlock();
}
return file;
}
fget,fget_raw, fget_light, fget_raw_light
/*
 * fget - look up @fd in current->files and take a reference on the file.
 *
 * The lookup runs under rcu_read_lock().  Returns NULL if there is no file
 * for @fd, if the file has FMODE_PATH set, or if f_count could not be
 * incremented because it was already zero.
 */
struct file *fget(unsigned int fd)
{
struct file *file;
struct files_struct *files = current->files;
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
/* File object ref couldn't be taken */
if (file->f_mode & FMODE_PATH ||
!atomic_long_inc_not_zero(&file->f_count))
file = NULL;
}
rcu_read_unlock();
return file;
}