poll 系统调用(1)

1.用户空间

/**********************************************************************************

 * include/asm-generic/poll.h
 * These are specified by iBCS2
#define POLLIN        0x0001
#define POLLPRI        0x0002
#define POLLOUT        0x0004
#define POLLERR        0x0008
#define POLLHUP        0x0010
#define POLLNVAL    0x0020

struct pollfd {
    int fd;
    short events;
    short revents;
};
*/

 
/*
 * Relay data between app->afd and app->bfd.
 *
 * Blocks in poll() (timeout -1) on both descriptors. Whenever one of them
 * reports POLLIN, up to 128 bytes are read from it and written to the other.
 * The function returns when poll() fails with an errno other than EINTR or
 * ENOMEM, when a descriptor reports events that do not include POLLIN, when
 * safe_read() returns <= 0, or when full_write() returns a negative value.
 */
static void run_app(struct app_data_t *app)
{
    struct pollfd pfd[2];

    pfd[0].fd = app->afd;
    pfd[1].fd = app->bfd;
    pfd[0].events = pfd[1].events = POLLIN | POLLERR | POLLHUP | POLLNVAL;

    for (;;) {
        char buf[128];
        int src, dst, len;

        /* Sleeps in the kernel until an event arrives; poll() fills in revents. */
        if (poll(pfd, 2, -1) < 0) {
            if (errno != EINTR && errno != ENOMEM)
                return;
            continue;
        }

        /* pfd[0] is served first when both descriptors have events pending. */
        if (pfd[0].revents) {
            if ((pfd[0].revents & POLLIN) == 0)
                return;
            src = app->afd;
            dst = app->bfd;
        } else if (pfd[1].revents) {
            if ((pfd[1].revents & POLLIN) == 0)
                return;
            src = app->bfd;
            dst = app->afd;
        } else {
            continue;
        }

        /* Nothing to copy unless both ends are valid descriptors. */
        if (src < 0 || dst < 0)
            continue;

        len = safe_read(src, buf, sizeof(buf));
        if (len <= 0)
            return;
        if (full_write(dst, buf, len) < 0)
            return;
    }
}

/*这里系统调用为poll(pfd, 2, -1),timeout_msecs == -1 < 0
 *do_sys_poll(ufds, nfds, NULL);

 */

2. 内核空间

/*
 * poll() system call entry point.
 *
 * Three steps:
 *   1. convert a non-negative millisecond timeout into an end time;
 *   2. run do_sys_poll();
 *   3. if that was interrupted (-EINTR), fill in the thread's restart block
 *      with the arguments and end time, and return -ERESTART_RESTARTBLOCK.
 *
 * Any other result of do_sys_poll() is returned unchanged.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        int, timeout_msecs)
{
    struct timespec end_time;
    struct timespec *to = NULL;
    struct restart_block *restart_block;
    int ret;

    /* A negative timeout leaves `to` NULL, so no end time is passed down. */
    if (timeout_msecs >= 0) {
        to = &end_time;
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, to);
    if (ret != -EINTR)
        return ret;

    /* EINTR (4): interrupted system call - record what is needed to restart. */
    restart_block = &current_thread_info()->restart_block;
    restart_block->fn = do_restart_poll;
    restart_block->poll.ufds = ufds;
    restart_block->poll.nfds = nfds;

    if (to) {
        restart_block->poll.tv_sec = end_time.tv_sec;
        restart_block->poll.tv_nsec = end_time.tv_nsec;
        restart_block->poll.has_timeout = 1;
    } else {
        restart_block->poll.has_timeout = 0;
    }

    return -ERESTART_RESTARTBLOCK;
}
系统调用int poll(struct pollfd *fds, nfds_t nfds, int timeout)的实现由3部分组成:
1.有关timeout的处理;2.主体函数do_sys_poll;3.特殊情况(返回-EINTR时设置restart_block以便重启系统调用)的处理;

2.2 do_sys_poll

/*
 * do_sys_poll - core of the poll() system call.
 * @ufds:     user-space array of pollfd entries
 * @nfds:     number of entries in @ufds
 * @end_time: end time handed through to do_poll(), or NULL
 *
 * Copies the user's pollfd array into a chain of poll_list chunks, sets up
 * one poll_wqueues for the whole call, lets do_poll() do the polling, writes
 * every revents back to user space and frees the kmalloc'ed chunks.
 *
 * Returns the value produced by do_poll() on success, -EINVAL when @nfds
 * exceeds RLIMIT_NOFILE, -EFAULT when a user copy fails, or -ENOMEM when a
 * chunk cannot be allocated.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
        struct timespec *end_time)
{
    struct poll_wqueues table;
    int err = -EFAULT, fdcount, len, size;
    /* Allocate small arguments on the stack to save memory and be
       faster - use long to make sure the buffer is aligned properly
       on 64 bit archs to avoid unaligned access */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
    struct poll_list *const head = (struct poll_list *)stack_pps;
    struct poll_list *walk = head;
    unsigned long todo = nfds;

    if (nfds > rlimit(RLIMIT_NOFILE))
        return -EINVAL;

    len = min_t(unsigned int, nfds, N_STACK_PPS);

    /*
     * Build the poll_list chain from the user's array.
     *
     * The first chunk lives in stack_pps. If more entries were requested
     * than fit there, further chunks of at most POLLFD_PER_PAGE entries are
     * kmalloc'ed and linked through ->next.
     *
     * For reference:
     *
     *   struct pollfd {
     *       int fd;
     *       short events;
     *       short revents;
     *   };
     *   struct poll_list {
     *       struct poll_list *next;
     *       int len;
     *       struct pollfd entries[0];
     *   };
     */
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len)
            break;

        if (copy_from_user(walk->entries, ufds + nfds-todo,
                    sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;

        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }

    /*
     * Set up the poll_wqueues ("wakeup queues"). There is ONE instance for
     * the whole call, not one per pollfd, and `table` lives on this stack.
     *
     * For reference (structures and helpers for the select/poll syscalls):
     *
     *   struct poll_wqueues {
     *       poll_table pt;
     *       struct poll_table_page *table;
     *       struct task_struct *polling_task;
     *       int triggered;
     *       int error;
     *       int inline_index;
     *       struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
     *   };
     *
     * Structures and helpers for f_op->poll implementations; do not touch
     * the structure directly, use the access functions
     * poll_does_not_wait() and poll_requested_events() instead:
     *
     *   typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
     *                                   struct poll_table_struct *);
     *   typedef struct poll_table_struct {
     *       poll_queue_proc _qproc;
     *       unsigned long _key;
     *   } poll_table;
     *
     *   void poll_initwait(struct poll_wqueues *pwq)
     *   {
     *       init_poll_funcptr(&pwq->pt, __pollwait);
     *       pwq->polling_task = current;
     *       pwq->triggered = 0;
     *       pwq->error = 0;
     *       pwq->table = NULL;
     *       pwq->inline_index = 0;
     *   }
     *   static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
     *   {
     *       pt->_qproc = qproc;
     *       pt->_key   = ~0UL;      (all events enabled)
     *   }
     */
    poll_initwait(&table);

    /* The actual polling and sleeping happen in do_poll(). */
    fdcount = do_poll(nfds, head, &table, end_time);

    /*
     * Tear down the wait-queue registrations. Per the walkthrough notes,
     * poll_freewait() does, for each entry:
     *     remove_wait_queue(entry->wait_address, &entry->wait);
     *     fput(entry->filp);
     * (its body is not shown here - confirm against the kernel source).
     */
    poll_freewait(&table);

    /* Write revents of every fd in every chunk back to user space. */
    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
    }

    err = fdcount;
out_fds:
    /* Free the kmalloc'ed chunks; the head chunk is on the stack. */
    walk = head->next;
    while (walk) {
        struct poll_list *pos = walk;
        walk = walk->next;
        kfree(pos);
    }

    /* In the normal case this is do_poll()'s return value. */
    return err;
}

/*do_sys_poll函数根据用户输入创建了poll_list, 又创建了poll_wqueues, 但是两者是怎样发生关系的呢?
 *其余的代码都是在调用do_poll之后把revents写回用户空间并释放资源。

 */

do_poll是do_sys_poll的核心部分

该函数会调用每个 file 的 file_operations 的 poll 函数;如果没有 event 发生且尚未超时,则 sleep,被唤醒后重新轮询

/*
 * do_poll - poll every descriptor in @list, sleeping until one is ready.
 * @nfds:     number of descriptors (not used in the body shown here)
 * @list:     chain of poll_list chunks holding the pollfd entries
 * @wait:     poll_wqueues set up by the caller
 * @end_time: absolute end time, or NULL
 *
 * Returns the number of entries for which do_pollfd() reported events. When
 * none did, returns wait->error if that is non-zero, -EINTR if a signal is
 * pending, or 0 once the wait has timed out.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,

           struct poll_wqueues *wait, struct timespec *end_time)
{
    poll_table* pt = &wait->pt;
    ktime_t expire, *to = NULL;
    int timed_out = 0, count = 0;
    unsigned long slack = 0;

    /*
     * 1] A non-NULL end_time of zero: register no waiters and leave after a
     * single pass. (With end_time == NULL, the case followed in this
     * walkthrough, this block and the slack estimate are both skipped.)
     */

    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        pt->_qproc = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    for (;;) {
        struct poll_list *walk;

        /* Outer loop: every chunk of the poll_list chain. */

        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;

            /* Inner loop: every pollfd entry of this chunk. */

            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill poll_table->_qproc, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt)) {
                    count++;
                    pt->_qproc = NULL;
                }
            }
        }
        /*
         * All waiters have already been registered, so don't provide
         * a poll_table->_qproc to them on the next loop iteration.
         */
        pt->_qproc = NULL;
        if (!count) {
            /*
             * No events: count may still become non-zero here, from
             * wait->error or a pending signal. poll_initwait() zeroes
             * wait->error; where it is set afterwards is not visible in
             * this excerpt (presumably when registering a waiter fails -
             * TODO confirm).
             */
            count = wait->error;
            if (signal_pending(current))
                count = -EINTR;
        }

        /*
         * Leave once there is a result or the timeout has expired. Nothing
         * above sleeps: do_pollfd() only samples each file's state. The
         * sleep happens in poll_schedule_timeout() below.
         */

        if (count || timed_out)
            break;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec_to_ktime(*end_time);
            to = &expire;
        }

        /* This is where the current task is put to sleep. */

        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }
    return count;
}

do_pollfd


/*
 * Sample one descriptor for pollable events.
 *
 * Only events in pollfd->events (plus POLLERR and POLLHUP, which are always
 * reported) survive the final masking. The result is stored in
 * pollfd->revents and also returned:
 *   - fd < 0:                        0
 *   - fd >= 0 but no file for it:    POLLNVAL
 *   - file without a poll method:    DEFAULT_POLLMASK, masked
 *   - otherwise:                     file->f_op->poll(), masked
 *
 * Before the file's poll method is called, pwait->_key is set to the events
 * of interest. The method uses pwait for waiting if pwait->_qproc is
 * non-NULL.
 *
 * For reference:
 *   #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
    unsigned int mask = 0;
    const int fd = pollfd->fd;

    if (fd >= 0) {
        int fput_needed;
        struct file *file = fget_light(fd, &fput_needed);

        if (file == NULL) {
            /* Valid-looking number, but no open file behind it. */
            mask = POLLNVAL;
        } else {
            /* Kept as int so the conversion into _key matches the masking. */
            const int wanted = pollfd->events | POLLERR | POLLHUP;

            if (file->f_op && file->f_op->poll) {
                pwait->_key = wanted;
                mask = file->f_op->poll(file, pwait);
            } else {
                mask = DEFAULT_POLLMASK;
            }

            /* Mask out unneeded events. */
            mask &= wanted;
            fput_light(file, fput_needed);
        }
    }

    pollfd->revents = mask;
    return mask;
}


调用file operation具体的poll函数 n_tty_poll

/**
 *    tty_poll    -    check tty status
 *    @filp: file being polled
 *    @wait: poll wait structures to update
 *
 *    Looks up the tty_struct behind @filp and hands the request to the
 *    poll method of its line discipline (n_tty_poll for N_TTY), returning
 *    the event mask that method reports. Returns 0 when the paranoia
 *    check fails or the line discipline has no poll method.
 */
static unsigned int tty_poll(struct file *filp, poll_table *wait)
{
    struct tty_struct *tty = file_tty(filp);
    struct tty_ldisc *ld;
    int ret;

    if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll"))
        return 0;

    ld = tty_ldisc_ref_wait(tty);
    ret = ld->ops->poll ? (ld->ops->poll)(tty, filp, wait) : 0;
    tty_ldisc_deref(ld);

    return ret;
}

/**
 *    n_tty_poll        -    poll method for N_TTY
 *    @tty: terminal device
 *    @file: file accessing it
 *    @wait: poll table
 *
 *    Called when the line discipline is asked to poll() for data or
 *    for special events. This code is not serialized with respect to
 *    other events save open/close.
 *
 *    Builds the event mask from the current tty state. The two
 *    poll_wait() calls are still needed: through the poll table's _qproc
 *    callback (__pollwait) they queue the caller on the tty's read and
 *    write wait queues, so a later wake-up on either queue wakes the
 *    polling task. poll_wait() does nothing when _qproc is NULL.
 *
 *    Returns the mask of events currently pending.
 */
static unsigned int n_tty_poll(struct tty_struct *tty, struct file *file,
                            poll_table *wait)
{
    unsigned int mask = 0;

    /* Register on both queues before sampling the state. */
    poll_wait(file, &tty->read_wait, wait);
    poll_wait(file, &tty->write_wait, wait);
    if (input_available_p(tty, TIME_CHAR(tty) ? 0 : MIN_CHAR(tty)))
        mask |= POLLIN | POLLRDNORM;
    if (tty->packet && tty->link->ctrl_status)
        mask |= POLLPRI | POLLIN | POLLRDNORM;
    if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
        mask |= POLLHUP;
    if (tty_hung_up_p(file))
        mask |= POLLHUP;
    /* Nothing readable and no hangup: set the wake-up threshold. */
    if (!(mask & (POLLHUP | POLLIN | POLLRDNORM))) {
        if (MIN_CHAR(tty) && !TIME_CHAR(tty))
            tty->minimum_to_wake = MIN_CHAR(tty);
        else
            tty->minimum_to_wake = 1;
    }
    if (tty->ops->write && !tty_is_writelocked(tty) &&
            tty_chars_in_buffer(tty) < WAKEUP_CHARS &&
            tty_write_room(tty) > 0)
        mask |= POLLOUT | POLLWRNORM;
    return mask;
}

/*
 * Hand @wait_address to the poll table's queueing callback.
 *
 * Does nothing when @p is NULL, when it carries no _qproc callback, or when
 * @wait_address is NULL. Otherwise calls p->_qproc(filp, wait_address, p).
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (!p || !p->_qproc || !wait_address)
        return;

    p->_qproc(filp, wait_address, p);
}

init_poll_funcptr(&pwq->pt, __pollwait);

/*
 * Add a new entry.
 *
 * This is the _qproc callback installed by
 * init_poll_funcptr(&pwq->pt, __pollwait), so poll_wait() ends up here.
 * It recovers the enclosing poll_wqueues from @p, takes a poll_table_entry
 * from it, fills the entry in and queues it on @wait_address with pollwake
 * as its wake-up function. Returns silently if no entry is available.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                poll_table *p)
{
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    /* Take a reference on the file for as long as the entry refers to it. */
    get_file(filp);
    entry->filp = filp;
    entry->wait_address = wait_address;
    /* Events of interest; pollwake() compares the wake-up key against this. */
    entry->key = p->_key;
    init_waitqueue_func_entry(&entry->wait, pollwake);
    /* __pollwake() reads this back to find the poll_wqueues to trigger. */
    entry->wait.private = pwq;
    add_wait_queue(wait_address, &entry->wait);
}
/*
 * One registration of a poller on one wait queue.
 *
 * A single fd may own several of these: n_tty_poll(), for example, calls
 * poll_wait() twice (read_wait and write_wait), and each call goes through
 * __pollwait(), which fills in a separate entry.
 */
struct poll_table_entry {
    struct file *filp;                  /* file being polled */
    unsigned long key;                  /* events of interest, copied from poll_table->_key */
    wait_queue_t wait;                  /* node queued on *wait_address */
    wait_queue_head_t *wait_address;    /* wait queue the node was added to */
};

/*
 * Add the wait_queue_t @wait to the queue headed by the wait_queue_head_t @q.
 *
 * Clears WQ_FLAG_EXCLUSIVE on @wait first, then does the insertion with
 * q->lock held and interrupts saved/restored.
 */
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
    unsigned long flags;

    wait->flags &= ~WQ_FLAG_EXCLUSIVE;
    spin_lock_irqsave(&q->lock, flags);
    __add_wait_queue(q, wait);
    spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * Sleep until woken or until the timeout expires.
 *
 * do_poll() calls this when a full pass over the descriptors found no
 * events and the wait has not timed out yet. The task sleeps only if
 * pwq->triggered is still 0. Otherwise the function returns -EINTR at once
 * without scheduling.
 *
 * Returns the value of schedule_hrtimeout_range(), or -EINTR when the sleep
 * was skipped. do_poll() treats a return of 0 as "timed out".
 */

int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
              ktime_t *expires, unsigned long slack)
{
    int rc = -EINTR;

    set_current_state(state);
    if (!pwq->triggered)
        rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
    __set_current_state(TASK_RUNNING);

    /*
     * Prepare for the next iteration.
     *
     * The following set_mb() serves two purposes.  First, it's
     * the counterpart rmb of the wmb in pollwake() such that data
     * written before wake up is always visible after wake up.
     * Second, the full barrier guarantees that triggered clearing
     * doesn't pass event check of the next iteration.  Note that
     * this problem doesn't exist for the first iteration as
     * add_wait_queue() has full barrier semantics.
     */
    set_mb(pwq->triggered, 0);

    return rc;
}

/*
 * Mark the poll_wqueues as triggered and wake the polling task.
 *
 * pwq->triggered is what decides whether poll_schedule_timeout() sleeps.
 * It is set here, on the path pollwake() -> __pollwake().
 *
 * Because a poll method may call poll_wait() several times, e.g.
 *     poll_wait(file, &tty->read_wait, wait);
 *     poll_wait(file, &tty->write_wait, wait);
 * several wait queues can lead here. A wake-up on any one of them is enough
 * to wake the task, which then re-polls and returns the events that
 * occurred.
 */
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    /* wait->private was set to the owning poll_wqueues by __pollwait(). */
    struct poll_wqueues *pwq = wait->private;
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /*
     * Although this function is called under waitqueue lock, LOCK
     * doesn't imply write barrier and the users expect write
     * barrier semantics on wakeup functions.  The following
     * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
     * and is paired with set_mb() in poll_schedule_timeout.
     */
    smp_wmb();
    pwq->triggered = 1;

    /*
     * Perform the default wake up operation using a dummy
     * waitqueue.
     *
     * TODO: This is hacky but there currently is no interface to
     * pass in @sync.  @sync is scheduled to be removed and once
     * that happens, wake_up_process() can be used directly.
     */
    return default_wake_function(&dummy_wait, mode, sync, key);
}

/*
 * Wake-up callback installed on every poll_table_entry by __pollwait().
 *
 * When the waker supplies a non-NULL @key that shares no bit with the
 * entry's events of interest, the wake-up is ignored and 0 is returned.
 * Otherwise (including @key == NULL) the work is passed to __pollwake().
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *entry =
        container_of(wait, struct poll_table_entry, wait);
    const unsigned long events = (unsigned long)key;

    if (key && (events & entry->key) == 0)
        return 0;

    return __pollwake(wait, mode, sync, key);
}

/*

 *又是何时pollwake被调用?

 */

wake_up_interruptible(&tty->read_wait);

/*
 * Wake waiters on wait queue x with mode TASK_INTERRUPTIBLE, nr_exclusive 1
 * and a NULL key. With a NULL key, pollwake() skips its event filter.
 */
#define wake_up_interruptible(x)    __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
/*
 * Wake up waiters on @q.
 *
 * Takes q->lock with interrupts saved, runs __wake_up_common() with
 * wake_flags 0, and releases the lock again.
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, void *key)
{
    unsigned long flags;

    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, 0, key);
    spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * Walk the wait queue and call each entry's wake-up function.
 *
 * The walk stops early once @nr_exclusive entries carrying
 * WQ_FLAG_EXCLUSIVE have reported a successful wake-up (non-zero return).
 * Entries without that flag never stop the walk.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        /* Read the flags before the callback runs. */
        unsigned flags = curr->flags;
        /*
         * For entries queued by __pollwait() this callback is pollwake
         * (registered through init_waitqueue_func_entry).
         */
        if (curr->func(curr, mode, wake_flags, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

怎样从open file返回值得到内核描述文件的file

有多个函数可以 从open file返回值得到内核描述文件的file,其核心都是:

struct files_struct *files = current->files;

struct file *file = fcheck_files(files, fd);

Lightweight file lookup: fget_light -> fcheck_files(files, fd)

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * Returns the struct file for @fd in the current task's file table, or NULL
 * when there is none, when the file has FMODE_PATH set, or when a reference
 * could not be taken.
 */
struct file *fget_light(unsigned int fd, int *fput_needed)
{
    struct file *file;
    struct files_struct *files = current->files;

    *fput_needed = 0;
    if (atomic_read(&files->count) == 1) {
        /* Table not shared: plain lookup, no reference taken. */
        file = fcheck_files(files, fd);
        if (file && (file->f_mode & FMODE_PATH))
            file = NULL;
    } else {
        /* Shared table: look up under RCU and take a reference. */
        rcu_read_lock();
        file = fcheck_files(files, fd);
        if (file) {
            if (!(file->f_mode & FMODE_PATH) &&
                atomic_long_inc_not_zero(&file->f_count))
                *fput_needed = 1;
            else
                /* Didn't get the reference, someone's freed */
                file = NULL;
        }
        rcu_read_unlock();
    }

    return file;
}

fget,fget_raw, fget_light, fget_raw_light

/*
 * Look up @fd in the current task's file table and take a reference.
 *
 * The lookup runs under rcu_read_lock(). Returns NULL when no file is
 * installed at @fd, when the file has FMODE_PATH set, or when its reference
 * count could not be incremented from a non-zero value.
 */
struct file *fget(unsigned int fd)
{
    struct files_struct *files = current->files;
    struct file *file;

    rcu_read_lock();
    file = fcheck_files(files, fd);
    /* File object ref couldn't be taken */
    if (file && ((file->f_mode & FMODE_PATH) ||
                 !atomic_long_inc_not_zero(&file->f_count)))
        file = NULL;
    rcu_read_unlock();

    return file;
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值