poll系统调用(2)

应用层使用 poll/select 的地方很多,例如蓝牙等,因此有必要弄明白它们的实现。

应用层打开文件后得到的是一个整型 fd,然后把一组关心的 fd/事件和 timeout 传给内核。

如果 poll 关心的事件没有发生,当前进程就进入睡眠,最长睡眠 timeout 指定的时间;当睡眠时间到或者事件发生时,进程被唤醒。

进程被唤醒后,再次调用 fd 对应 file 的 poll 函数,检查事件是否发生。

如果事件已经发生或者 timeout 到期,就从 poll 系统调用返回,并把发生的事件返回给用户空间。

输入参数:

poll(struct pollfd __user * ufds, unsigned int nfds,int timeout_msecs)
{
    timeout_msecs 的单位是毫秒(ms);只有 timeout_msecs >= 0 时才会设置超时,为负数时不设置超时(传给 do_sys_poll 的 timespec 指针为 NULL)。
}

/*
 * poll(2) system call entry.
 *
 * @ufds:          user-space array of struct pollfd to examine
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: timeout in milliseconds; a negative value means that no
 *                 deadline is computed and NULL is handed to do_sys_poll()
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is turned into
 * -ERESTART_RESTARTBLOCK after the restart block has been filled in.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        int, timeout_msecs)
{
    const int has_timeout = (timeout_msecs >= 0);
    struct timespec end_time;
    struct timespec *deadline = NULL;
    struct restart_block *rb;
    int ret;

    /*
     * 1. Only a non-negative timeout is converted into a deadline
     *    (seconds + nanoseconds); otherwise deadline stays NULL.
     */
    if (has_timeout) {
        deadline = &end_time;
        poll_select_set_timeout(deadline, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, deadline);
    if (ret != -EINTR)
        return ret;

    /*
     * 2. The call was interrupted: remember the arguments (and the
     *    deadline computed above, if any) in the per-thread restart
     *    block and name do_restart_poll as the function to resume with.
     *    NOTE(review): how and when the restart is actually triggered
     *    is decided outside this function.
     */
    rb = &current_thread_info()->restart_block;
    rb->fn = do_restart_poll;
    rb->poll.ufds = ufds;
    rb->poll.nfds = nfds;
    rb->poll.has_timeout = has_timeout;
    if (has_timeout) {
        rb->poll.tv_sec = end_time.tv_sec;
        rb->poll.tv_nsec = end_time.tv_nsec;
    }

    return -ERESTART_RESTARTBLOCK;
}

/*到这里只剩下do_sys_poll*/

/*
 * Number of struct pollfd entries that fit into the stack_pps buffer once
 * the leading struct poll_list header has been accounted for. Must be
 * expanded in a scope where an array named stack_pps is visible.
 */
#define N_STACK_PPS \
    ((sizeof(stack_pps) - sizeof(struct poll_list)) / sizeof(struct pollfd))

/*在内存中的分布:
 *    struct poll_list + N_STACK_PPS*struct pollfd
 **
/
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
        struct timespec *end_time)
{
    struct poll_wqueues table;
     int err = -EFAULT, fdcount, len, size;
/* Allocate small arguments on the stack to save memory and be
    faster - use long to make sure the buffer is aligned properly
    on 64 bit archs to avoid unaligned access
*sizeof(long)是8个字节?
**/

    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
    struct poll_list *const head = (struct poll_list *)stack_pps;
     struct poll_list *walk = head;
     unsigned long todo = nfds;

    if (nfds > rlimit(RLIMIT_NOFILE))
        return -EINVAL;

/*把来自用户空间的struct pollfd复制到内核空间
 **
/
    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len)
            break;

        if (copy_from_user(walk->entries, ufds + nfds-todo,
                    sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;

        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }

/*初始化结构体struct poll_wqueues
 **
/
    poll_initwait(&table);

/*得到检测的事件或者timeout
 **
/
    fdcount = do_poll(nfds, head, &table, end_time);

/*释放用于poll而申请的资源
 **/

    poll_freewait(&table);

    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;
        /* 把监测到的事件赋值到用户空间
          **
/
        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
      }

    err = fdcount;
out_fds:
    walk = head->next;
/*释放用于申请的资源
 **
/
    while (walk) {
        struct poll_list *pos = walk;
        walk = walk->next;
        kfree(pos);
    }

    return err;

}



static int do_poll(unsigned int nfds,  struct poll_list *list,
           struct poll_wqueues *wait, struct timespec *end_time)
{
    poll_table* pt = &wait->pt;
    ktime_t expire, *to = NULL;
    int timed_out = 0, count = 0;
    unsigned long slack = 0;

    /* Optimise the no-wait case */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        pt->_qproc = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    for (;;) {
        struct poll_list *walk;

        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd * pfd, * pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill poll_table->_qproc, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */

                if (do_pollfd(pfd, pt)) {
                    count++;
                    pt->_qproc = NULL;
                }
            }
        }
        /*
         * All waiters have already been registered, so don't provide
         * a poll_table->_qproc to them on the next loop iteration.
         */

        pt->_qproc = NULL;
        if (!count) {
            count = wait->error;
            if (signal_pending(current))
                count = -EINTR;
        }
        if (count || timed_out)
            break;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */

        if (end_time && !to) {
            expire = timespec_to_ktime(*end_time);
            to = &expire;
        }

        if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
            timed_out = 1;
    }
    return count;
}



/***************************************************************************************************/
/* One descriptor to be polled, together with requested and reported events. */
struct pollfd {
    int fd;         /* descriptor to check; do_pollfd() ignores negative values */
    short events;   /* event mask the caller is interested in */
    short revents;  /* event mask reported back; written by do_pollfd() */
};
/*
 * One chunk of pollfd entries copied in from user space. Chunks are chained
 * through ->next; the entries follow the header directly in memory, so a
 * chunk is allocated as sizeof(struct poll_list) + len * sizeof(struct pollfd).
 */
struct poll_list {
    struct poll_list *next;     /* next chunk, or NULL for the last one */
    int len;                    /* number of valid elements in entries[] */
    struct pollfd entries[];    /* C99 flexible array member (was entries[0]) */
};

/*
 * Structures and helpers for select/poll syscall
 *
 * struct poll_wqueues is the per-call state set up by poll_initwait().
 */
struct poll_wqueues {
    poll_table pt;                      /* handed to f_op->poll(); __pollwait() recovers the container from it */
    struct poll_table_page *table;      /* reset to NULL by poll_initwait(); presumably overflow storage for entries - confirm */
    struct task_struct *polling_task;   /* task that called poll_initwait(); woken by __pollwake() */
    int triggered;                      /* set to 1 by __pollwake(), cleared in poll_schedule_timeout() */
    int error;                          /* returned by do_poll() when no events were found */
    int inline_index;                   /* starts at 0; presumably the next free slot in inline_entries - confirm */
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};
/*
 * One poll_table_entry is filled in by __pollwait() for each wait queue a
 * polled file registers on (original note: one entry per polled file).
 */
struct poll_table_entry {
    struct file *filp;                  /* file being polled; reference taken with get_file() */
    unsigned long key;                  /* copy of poll_table->_key at registration time */
    wait_queue_t wait;                  /* wait queue entry added to *wait_address */
    wait_queue_head_t *wait_address;    /* wait queue head the entry was added to */
};

/*
 * structures and helpers for f_op->poll implementations
 *
 * poll_queue_proc is the callback type stored in poll_table->_qproc; it is
 * invoked by poll_wait() with the file, the wait queue head to register on
 * and the poll_table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 */
typedef struct poll_table_struct {
    poll_queue_proc _qproc;     /* registration callback; poll_wait() does nothing when it is NULL */
    unsigned long _key;         /* event mask; ~0UL after init_poll_funcptr(), narrowed by do_pollfd() */
} poll_table;


/*
 * poll_initwait - prepare a poll_wqueues for a new poll/select call.
 *
 * Records the current task as the poller, clears the bookkeeping fields and
 * installs __pollwait as the registration callback of the embedded
 * poll_table.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
    /* Bookkeeping: nothing triggered, no error, no entries used yet. */
    pwq->inline_index = 0;
    pwq->table = NULL;
    pwq->error = 0;
    pwq->triggered = 0;

    /* The task to wake up is whoever is running right now. */
    pwq->polling_task = current;

    init_poll_funcptr(&pwq->pt, __pollwait);
}

/*
 * init_poll_funcptr - initialise a poll_table.
 * @pt:    table to initialise
 * @qproc: callback that poll_wait() will invoke for this table
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    /* Start with every event bit enabled. */
    pt->_key = ~0UL;
    pt->_qproc = qproc;
}

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 *
 * A negative fd yields 0; an fd with no struct file behind it yields
 * POLLNVAL (which is not filtered by pollfd->events).
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
    unsigned int mask;
    int fd;

    mask = 0;
    fd = pollfd->fd;
    if (fd >= 0) {
        int fput_needed;
        struct file *file;

        /* Look up the struct file behind the descriptor. */
        file = fget_light(fd, &fput_needed);
        mask = POLLNVAL;
        if (file != NULL) {
            /* Files without a poll handler report DEFAULT_POLLMASK. */
            mask = DEFAULT_POLLMASK;
            if (file->f_op && file->f_op->poll) {
                /* Tell the handler which events are being watched. */
                pwait->_key = pollfd->events|POLLERR|POLLHUP;
                mask = file->f_op->poll(file, pwait);
            }
            /* Mask out unneeded events. */
            mask &= pollfd->events | POLLERR | POLLHUP;
            fput_light(file, fput_needed);
        }
    }
    pollfd->revents = mask;

    return mask;
}

static const struct file_operations tty_fops = {
    .llseek        = no_llseek,
    .read        = tty_read,
    .write        = tty_write,
    .poll        = tty_poll,
    .unlocked_ioctl    = tty_ioctl,
    .compat_ioctl    = tty_compat_ioctl,
    .open        = tty_open,
    .release    = tty_release,
    .fasync        = tty_fasync,
};

/*
 * tty_poll - poll handler for tty files.
 * @filp: file being polled
 * @wait: poll_table forwarded to the line discipline
 *
 * Forwards the request to the poll operation of the tty's line discipline,
 * if it has one. Returns 0 when the paranoia check fails or the line
 * discipline has no poll operation.
 */
static unsigned int tty_poll(struct file *filp, poll_table *wait)
{
    struct tty_struct *tty = file_tty(filp);
    struct tty_ldisc *ldisc;
    int events;

    if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll"))
        return 0;

    /* Hold a reference on the line discipline for the duration of the call. */
    ldisc = tty_ldisc_ref_wait(tty);
    events = ldisc->ops->poll ? (ldisc->ops->poll)(tty, filp, wait) : 0;
    tty_ldisc_deref(ldisc);

    return events;
}

/*
 * n_tty_poll - poll operation of the n_tty line discipline.
 * @tty:  tty being polled
 * @file: file the poll was issued on
 * @wait: poll_table used to register on the tty wait queues
 *
 * Returns the mask of events that are currently pending on @tty.
 */
static unsigned int n_tty_poll(struct tty_struct *tty, struct file *file,
                            poll_table *wait)
{
    unsigned int mask = 0;

    /*
     * poll_wait() only adds the poll request to
     *     wait_queue_head_t write_wait;
     *     wait_queue_head_t read_wait;
     * It does not block the current process here; execution continues.
     */
    poll_wait(file, &tty->read_wait, wait);
    poll_wait(file, &tty->write_wait, wait);

    /*
     * POLLIN: reported when input_available_p() says enough input is
     * available (original note: tty->read_cnt >= 1 - confirm against
     * input_available_p()).
     */
    if (input_available_p(tty, TIME_CHAR(tty) ? 0 : MIN_CHAR(tty)))
        mask |= POLLIN | POLLRDNORM;
    if (tty->packet && tty->link->ctrl_status)
        mask |= POLLPRI | POLLIN | POLLRDNORM;
    if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
        mask |= POLLHUP;
    if (tty_hung_up_p(file))
        mask |= POLLHUP;
    /* Nothing readable and no hangup: set how much input is needed for a wakeup. */
    if (!(mask & (POLLHUP | POLLIN | POLLRDNORM))) {
        if (MIN_CHAR(tty) && !TIME_CHAR(tty))
            tty->minimum_to_wake = MIN_CHAR(tty);
        else
            tty->minimum_to_wake = 1;
    }

    /*
     * POLLOUT: reported when the driver has a write operation, the tty is
     * not write-locked, fewer than WAKEUP_CHARS characters are buffered
     * and there is room left to write.
     */
    if (tty->ops->write && !tty_is_writelocked(tty) &&
            tty_chars_in_buffer(tty) < WAKEUP_CHARS &&
            tty_write_room(tty) > 0)
        mask |= POLLOUT | POLLWRNORM;
    return mask;
}


/*
 * poll_wait - register on a wait queue through the poll_table callback.
 * @filp:         file being polled
 * @wait_address: wait queue head to register on
 * @p:            poll_table supplying the callback
 *
 * Does nothing when @p, its _qproc callback or @wait_address is NULL.
 * The callback is invoked directly; this function itself does not sleep.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (!p || !p->_qproc || !wait_address)
        return;

    p->_qproc(filp, wait_address, p);
}

/*
 * __pollwait - add a new entry: hook the polling task onto @wait_address.
 * @filp:         file being polled; a reference is taken with get_file()
 * @wait_address: wait queue head to add the entry to
 * @p:            poll_table embedded in the caller's struct poll_wqueues
 *
 * Silently returns when poll_get_entry() cannot provide an entry.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                poll_table *p)
{
    struct poll_wqueues *queues;
    struct poll_table_entry *slot;

    /* The poll_table is a member of poll_wqueues; step back to the container. */
    queues = container_of(p, struct poll_wqueues, pt);
    slot = poll_get_entry(queues);
    if (!slot)
        return;

    get_file(filp);

    /* Remember what was registered and where. */
    slot->filp = filp;
    slot->key = p->_key;
    slot->wait_address = wait_address;

    /* Wake-ups on this entry go through pollwake with the poll_wqueues as context. */
    init_waitqueue_func_entry(&slot->wait, pollwake);
    slot->wait.private = queues;

    add_wait_queue(wait_address, &slot->wait);
}

/*
 * __pollwake - wake the task that is polling.
 * @wait: wait queue entry whose ->private holds the struct poll_wqueues
 * @mode, @sync, @key: passed through unchanged to default_wake_function()
 *
 * Marks the poll_wqueues as triggered and then wakes pwq->polling_task.
 * Returns the result of default_wake_function().
 */
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    /* Temporary wait queue entry that targets the polling task. */
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /*
     * Although this function is called under waitqueue lock, LOCK
     * doesn't imply write barrier and the users expect write
     * barrier semantics on wakeup functions.  The following
     * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
     * and is paired with set_mb() in poll_schedule_timeout.
     */
    smp_wmb();
    /* Checked by poll_schedule_timeout() before it goes to sleep. */
    pwq->triggered = 1;

    /*
     * Perform the default wake up operation using a dummy
     * waitqueue.
     *
     * TODO: This is hacky but there currently is no interface to
     * pass in @sync.  @sync is scheduled to be removed and once
     * that happens, wake_up_process() can be used directly.
     */
    return default_wake_function(&dummy_wait, mode, sync, key);
}


/*
 * Repeated for reference next to the code that uses it: this is the same
 * struct poll_table_entry definition already shown earlier in this article
 * (a real C file would contain it only once).
 */
struct poll_table_entry {
    struct file *filp;                  /* file being polled */
    unsigned long key;                  /* event mask captured at registration */
    wait_queue_t wait;                  /* entry linked into *wait_address */
    wait_queue_head_t *wait_address;    /* wait queue head the entry sits on */
};


poll是怎样阻塞进程的

/*
 * poll_schedule_timeout - put the polling task to sleep until woken or timed out.
 * @pwq:     poll state; ->triggered is checked before sleeping and cleared after
 * @state:   task state to sleep in
 * @expires: absolute expiry time (HRTIMER_MODE_ABS), passed on unchanged
 * @slack:   timer slack passed on to schedule_hrtimeout_range()
 *
 * Returns -EINTR when the sleep was skipped because ->triggered was already
 * set; otherwise returns whatever schedule_hrtimeout_range() returned.
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
              ktime_t *expires, unsigned long slack)
{
    int rc = -EINTR;

    /* Set the sleep state first, then test triggered, then sleep. */
    set_current_state(state);
    if (!pwq->triggered)
        rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
    __set_current_state(TASK_RUNNING);

    /*
     * Prepare for the next iteration.
     *
     * The following set_mb() serves two purposes.  First, it's
     * the counterpart rmb of the wmb in pollwake() such that data
     * written before wake up is always visible after wake up.
     * Second, the full barrier guarantees that triggered clearing
     * doesn't pass event check of the next iteration.  Note that
     * this problem doesn't exist for the first iteration as
     * add_wait_queue() has full barrier semantics.
     */
    set_mb(pwq->triggered, 0);

    return rc;
}

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:    timeout value (ktime_t), may be NULL
 * @delta:      slack allowed on the expiry time (unsigned long)
 * @mode:       timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
 *
 * Convenience wrapper around schedule_hrtimeout_range_clock() that always
 * uses CLOCK_MONOTONIC; the return value is passed through unchanged.
 **/
int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
                     const enum hrtimer_mode mode)
{
    const int clock_id = CLOCK_MONOTONIC;

    return schedule_hrtimeout_range_clock(expires, delta, mode, clock_id);
}

/*
 * schedule_hrtimeout_range_clock - sleep until woken or until a timer expires.
 * @expires: expiry time; NULL means sleep without a timer
 * @delta:   slack applied to the expiry time
 * @mode:    hrtimer mode used to initialise and start the timer
 * @clock:   clock id the timer is initialised on
 *
 * Returns 0 when @expires is zero or when t.task is NULL after the sleep,
 * and -EINTR otherwise (including the NULL @expires case).
 * The caller sets the task state before calling; this function only
 * resets it to TASK_RUNNING.
 */
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
                   const enum hrtimer_mode mode, int clock)
{
    struct hrtimer_sleeper t;

    /*
     * Optimize when a zero timeout value is given. It does not
     * matter whether this is an absolute or a relative time.
     */
    if (expires && !expires->tv64) {
        __set_current_state(TASK_RUNNING);
        return 0;
    }

    /*
     * A NULL parameter means "infinite"
     */

    if (!expires) {
        schedule();
        __set_current_state(TASK_RUNNING);
        return -EINTR;
    }

    /* Timer lives on this stack frame; give it the expiry range. */
    hrtimer_init_on_stack(&t.timer, clock, mode);
    hrtimer_set_expires_range_ns(&t.timer, *expires, delta);

    hrtimer_init_sleeper(&t, current);

    hrtimer_start_expires(&t.timer, mode);
    /* Timer is not active after starting: treat it as already expired. */
    if (!hrtimer_active(&t.timer))
        t.task = NULL;

    /* Only give up the CPU if the timer is still pending. */
    if (likely(t.task))
        schedule();

    hrtimer_cancel(&t.timer);
    destroy_hrtimer_on_stack(&t.timer);

    __set_current_state(TASK_RUNNING);

    /*
     * t.task == NULL is reported as a timeout (0).
     * NOTE(review): presumably t.task is also cleared by the sleeper's
     * timer callback on expiry - confirm in the hrtimer code.
     */
    return !t.task ? 0 : -EINTR;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值