poll系统调用(2)

最新推荐文章于 2023-05-10 20:49:41 发布

shuai_wen

最新推荐文章于 2023-05-10 20:49:41 发布

阅读量1.1k

点赞数

分类专栏： tty driver

本文链接：https://blog.csdn.net/u011279649/article/details/18663011

版权

tty driver 专栏收录该内容

27 篇文章 1 订阅

订阅专栏

应用层使用 poll/select 的地方很多，如蓝牙等，因此弄明白它们的实现很有必要。

应用层打开文件后得到的是一个整型 fd，然后把一组关心的 fd/事件和 timeout 传给内核。

如果所关心的事件没有发生，当前进程就进入睡眠，最长睡眠 timeout 指定的时间；当睡眠时间到或者事件发生时，进程被唤醒；

进程唤醒后，继续调用fd 对应file的 poll函数，监测事件。

如果事件发生，或者timeout,就从poll系统调用返回，发生的事件返回给用户空间。

输入参数：

poll(struct pollfd __user *ufds, unsigned int nfds, int timeout_msecs)
{
   timeout_msecs 的单位是毫秒（ms）；为负数时不设置超时，一直等待直到事件发生或被信号打断。
}

/*
* poll() system call entry point.
*
* @ufds:          user-space array of struct pollfd to watch
* @nfds:          number of entries in @ufds
* @timeout_msecs: timeout in milliseconds; a negative value leaves the
*                 timeout pointer NULL, which the sleep path treats as
*                 "wait forever"
*
* Returns whatever do_sys_poll() returns, except that -EINTR is replaced
* by -ERESTART_RESTARTBLOCK after the restart block has been filled in.
*/
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        int, timeout_msecs)
{
    struct timespec end_time;
    struct timespec *to = NULL;
    struct restart_block *rb;
    int ret;

    /* 1. Only a non-negative timeout is converted into an end time. */
    if (timeout_msecs >= 0) {
        to = &end_time;
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, to);
    if (ret != -EINTR)
        return ret;

    /*
    * 2. The call was interrupted: record everything do_restart_poll
    *    needs in the per-thread restart block.
    *    NOTE(review): how -ERESTART_RESTARTBLOCK is consumed (automatic
    *    restart vs. conversion to an error seen by user space) is decided
    *    outside this function and is not visible here - confirm against
    *    the signal-delivery code.
    */
    rb = &current_thread_info()->restart_block;
    rb->fn = do_restart_poll;
    rb->poll.ufds = ufds;
    rb->poll.nfds = nfds;

    /* 'to' is non-NULL exactly when timeout_msecs >= 0 (set above). */
    if (to) {
        rb->poll.tv_sec = end_time.tv_sec;
        rb->poll.tv_nsec = end_time.tv_nsec;
        rb->poll.has_timeout = 1;
    } else {
        rb->poll.has_timeout = 0;
    }

    return -ERESTART_RESTARTBLOCK;
}

到这里只剩下 do_sys_poll 了。

/*
* Number of struct pollfd entries that fit in the on-stack buffer after
* the struct poll_list header. Relies on a variable named stack_pps being
* in scope at the point of use.
*/
#define N_STACK_PPS \
    ((sizeof(stack_pps) - sizeof(struct poll_list)) / sizeof(struct pollfd))

/*
* do_sys_poll - copy the pollfd array in, poll it, copy the results out.
*
* @ufds:     user-space array of struct pollfd
* @nfds:     number of entries in @ufds
* @end_time: end time of the wait, or NULL when no timeout was requested
*
* Layout of the on-stack buffer stack_pps:
*   struct poll_list + N_STACK_PPS * struct pollfd
* Entries that do not fit there go into further kmalloc'ed poll_list
* chunks linked through ->next.
*
* Returns the value of do_poll() on success, -EINVAL when @nfds exceeds
* RLIMIT_NOFILE, -ENOMEM when a chunk cannot be allocated, and -EFAULT
* when copying from or to user space fails.
**/
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
       struct timespec *end_time)
{
   struct poll_wqueues table;
    int err = -EFAULT, fdcount, len, size;
/* Allocate small arguments on the stack to save memory and be
   faster - use long to make sure the buffer is aligned properly
   on 64 bit archs to avoid unaligned access
* (sizeof(long) depends on the architecture: typically 8 bytes on
*  64-bit Linux targets and 4 bytes on 32-bit ones.)
**/
   long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
   struct poll_list *const head = (struct poll_list *)stack_pps;
    struct poll_list *walk = head;
    unsigned long todo = nfds;

   if (nfds > rlimit(RLIMIT_NOFILE))
       return -EINVAL;

/* Copy the struct pollfd array from user space into kernel space,
* one chunk per loop iteration. The first chunk is the stack buffer.
**/
   len = min_t(unsigned int, nfds, N_STACK_PPS);
   for (;;) {
       /* Finish initialising the current chunk (head or a fresh kmalloc). */
       walk->next = NULL;
       walk->len = len;
       if (!len)
           break;

       /* nfds - todo is the index of the first entry not yet copied. */
       if (copy_from_user(walk->entries, ufds + nfds-todo,
                   sizeof(struct pollfd) * walk->len))
           goto out_fds;

       todo -= walk->len;
       if (!todo)
           break;

       len = min(todo, POLLFD_PER_PAGE);
       size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
       /* On failure the previous chunk's ->next is NULL, so the list
        * walked at out_fds stays well-formed. */
       walk = walk->next = kmalloc(size, GFP_KERNEL);
       if (!walk) {
           err = -ENOMEM;
           goto out_fds;
       }
   }

/* Initialise the struct poll_wqueues used for this call.
**/
   poll_initwait(&table);

/* Wait until an event is detected, an error occurs or the timeout expires.
**/
   fdcount = do_poll(nfds, head, &table, end_time);

/* Release the resources acquired for polling.
**/
   poll_freewait(&table);

   for (walk = head; walk; walk = walk->next) {
       struct pollfd *fds = walk->entries;
       int j;
       /* Copy the detected events (revents) back to user space.
        **/
       for (j = 0; j < walk->len; j++, ufds++)
           if (__put_user(fds[j].revents, &ufds->revents))
               goto out_fds;
   }

   err = fdcount;
out_fds:
   /* head lives in stack_pps, so only the chunks after it are freed. */
   walk = head->next;
/* Free the kmalloc'ed chunks.
**/
   while (walk) {
       struct poll_list *pos = walk;
       walk = walk->next;
       kfree(pos);
   }

   return err;

}

/*
* do_poll - scan all pollfd entries, sleeping between scans until done.
*
* @nfds:     number of descriptors (not used in the body shown here)
* @list:     chain of poll_list chunks holding the kernel copy of the pollfds
* @wait:     poll_wqueues prepared by poll_initwait()
* @end_time: end time of the wait, NULL for no timeout
*
* Returns the number of entries for which do_pollfd() reported a non-zero
* mask; if there were none, wait->error, or -EINTR when a signal is
* pending; 0 when the timeout expired with nothing to report.
*/
static int do_poll(unsigned int nfds, struct poll_list *list,
           struct poll_wqueues *wait, struct timespec *end_time)
{
   poll_table* pt = &wait->pt;
   ktime_t expire, *to = NULL;
   int timed_out = 0, count = 0;
   unsigned long slack = 0;

   /* Optimise the no-wait case */
   /* A zero end time means "do one scan, register no waiters, return". */
   if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
       pt->_qproc = NULL;
       timed_out = 1;
   }

   if (end_time && !timed_out)
       slack = select_estimate_accuracy(end_time);

   for (;;) {
       struct poll_list *walk;

       /* One full pass over every entry of every chunk. */
       for (walk = list; walk != NULL; walk = walk->next) {
           struct pollfd * pfd, * pfd_end;

           pfd = walk->entries;
           pfd_end = pfd + walk->len;
           for (; pfd != pfd_end; pfd++) {
               /*
               * Fish for events. If we found one, record it
               * and kill poll_table->_qproc, so we don't
               * needlessly register any other waiters after
               * this. They'll get immediately deregistered
               * when we break out and return.
               */
               if (do_pollfd(pfd, pt)) {
                   count++;
                   pt->_qproc = NULL;
               }
           }
       }
       /*
       * All waiters have already been registered, so don't provide
       * a poll_table->_qproc to them on the next loop iteration.
       */
       pt->_qproc = NULL;
       /* Nothing ready: report a recorded error or a pending signal. */
       if (!count) {
           count = wait->error;
           if (signal_pending(current))
               count = -EINTR;
       }
       /* timed_out set in an earlier pass means this was the final scan. */
       if (count || timed_out)
           break;

       /*
       * If this is the first loop and we have a timeout
       * given, then we convert to ktime_t and set the to
       * pointer to the expiry value.
       */
       if (end_time && !to) {
           expire = timespec_to_ktime(*end_time);
           to = &expire;
       }

       /* A zero return marks the timeout as expired; the loop then
        * performs one more scan before leaving. */
       if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
           timed_out = 1;
   }
   return count;
}

/***************************************************************************************************/
/*
* One entry of the array handed to poll(). do_sys_poll() copies these in
* from user space and copies only ->revents back out.
*/
struct pollfd {
   int fd;          /* descriptor to watch; do_pollfd() reports 0 for fd < 0 */
   short events;    /* requested events; POLLERR|POLLHUP are always added */
   short revents;   /* result mask written by do_pollfd() */
};
/*
* One chunk of the kernel-side copy of the pollfd array. The first chunk
* is carved out of the caller's stack buffer, later ones are kmalloc'ed
* with room for ->len entries after the header.
*
* ->entries is a C99 flexible array member rather than the GNU
* zero-length-array extension; sizeof(struct poll_list) is unchanged.
*/
struct poll_list {
   struct poll_list *next;    /* next chunk, NULL at the end of the chain */
   int len;                   /* number of valid elements in entries[] */
   struct pollfd entries[];   /* trailing storage, sized by the allocator */
};

/*
* Structures and helpers for select/poll syscall
*
* One poll_wqueues exists per poll call; it is set up by poll_initwait()
* and recovered from its embedded poll_table with container_of() in
* __pollwait().
*/
struct poll_wqueues {
   poll_table pt;                      /* callback + key handed to f_op->poll */
   struct poll_table_page *table;      /* NULL after poll_initwait(); presumably overflow storage - confirm in poll_get_entry() */
   struct task_struct *polling_task;   /* task to wake; set to current at init */
   int triggered;                      /* set to 1 by __pollwake(), cleared by poll_schedule_timeout() */
   int error;                          /* returned by do_poll() when no event was found */
   int inline_index;                   /* 0 after poll_initwait(); presumably next free inline_entries slot - confirm */
   struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};
/*
* One registration made by __pollwait(): a (file, wait queue) pair.
* A single file can own several entries, e.g. n_tty_poll() registers both
* its read and its write wait queue.
*/
struct poll_table_entry {
   struct file *filp;                 /* file being polled; reference taken with get_file() */
   unsigned long key;                 /* copy of poll_table->_key at registration time */
   wait_queue_t wait;                 /* wait-queue node; callback pollwake, private = owning poll_wqueues */
   wait_queue_head_t *wait_address;   /* queue head the node was added to */
};

/*
* structures and helpers for f_op->poll implementations
*
* poll_queue_proc is the callback type stored in poll_table->_qproc.
* poll_wait() invokes it as _qproc(filp, wait_address, table); the
* implementation installed by poll_initwait() is __pollwait().
*/
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/*
* Do not touch the structure directly, use the access functions
* poll_does_not_wait() and poll_requested_events() instead.
*
* _qproc: registration callback; when NULL, poll_wait() does nothing.
* _key:   event mask of interest; initialised to ~0UL and overwritten per
*         descriptor by do_pollfd() before f_op->poll is called.
*/
typedef struct poll_table_struct {
   poll_queue_proc _qproc;
   unsigned long _key;
} poll_table;

/*
* poll_initwait - prepare a poll_wqueues for one poll call.
* @pwq: structure to initialise
*
* Installs __pollwait as the registration callback, remembers the current
* task as the one to wake, and resets all bookkeeping fields.
*/
void poll_initwait(struct poll_wqueues *pwq)
{
    /* Bookkeeping starts out empty. */
    pwq->table = NULL;
    pwq->inline_index = 0;
    pwq->error = 0;
    pwq->triggered = 0;

    /* The task that will sleep in poll is the one calling us. */
    pwq->polling_task = current;

    init_poll_funcptr(&pwq->pt, __pollwait);
}

/*
* init_poll_funcptr - set up a poll_table.
* @pt:    table to initialise
* @qproc: callback that poll_wait() will invoke to register a waiter
*/
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->_key = ~0UL;    /* all events enabled */
    pt->_qproc = qproc;
}

/*
* do_pollfd - query one descriptor for the events its caller asked about.
* @pollfd: kernel copy of the entry; ->revents is written with the result
* @pwait:  poll_table passed on to the file's poll handler, which uses it
*          to register a waiter when pwait->_qproc is non-NULL
*
* Returns the same mask that is stored in pollfd->revents:
*   0                  for a negative fd,
*   POLLNVAL           when the fd does not resolve to a file,
*   otherwise the handler's result (or DEFAULT_POLLMASK when the file has
*   no poll handler) restricted to events | POLLERR | POLLHUP.
*/
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
    unsigned int result = 0;
    struct file *filp;
    int needs_fput;
    int fd = pollfd->fd;

    if (fd < 0)
        goto store;

    /* Resolve the descriptor to its struct file. */
    filp = fget_light(fd, &needs_fput);
    if (filp == NULL) {
        result = POLLNVAL;
        goto store;
    }

    if (filp->f_op && filp->f_op->poll) {
        /* Tell the handler which events this entry cares about. */
        pwait->_key = pollfd->events|POLLERR|POLLHUP;
        result = filp->f_op->poll(filp, pwait);
    } else {
        result = DEFAULT_POLLMASK;
    }

    /* Mask out unneeded events. */
    result &= pollfd->events | POLLERR | POLLHUP;
    fput_light(filp, needs_fput);

store:
    pollfd->revents = result;
    return result;
}

/*
* File operations installed for tty files. The .poll hook is what
* do_pollfd() reaches through file->f_op->poll.
*/
static const struct file_operations tty_fops = {
    /* lifetime */
    .open           = tty_open,
    .release        = tty_release,
    /* data path */
    .llseek         = no_llseek,
    .read           = tty_read,
    .write          = tty_write,
    /* readiness and notification */
    .poll           = tty_poll,
    .fasync         = tty_fasync,
    /* control */
    .unlocked_ioctl = tty_ioctl,
    .compat_ioctl   = tty_compat_ioctl,
};

/*
* tty_poll - poll handler of tty files.
* @filp: file being polled
* @wait: poll_table forwarded unchanged to the line discipline
*
* Delegates to the line discipline's poll operation while holding a
* reference on it. Returns 0 when the paranoia check fails or the line
* discipline has no poll operation.
*/
static unsigned int tty_poll(struct file *filp, poll_table *wait)
{
    struct tty_struct *tty = file_tty(filp);
    struct tty_ldisc *ldisc;
    int events;

    if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll"))
        return 0;

    ldisc = tty_ldisc_ref_wait(tty);
    events = ldisc->ops->poll ? ldisc->ops->poll(tty, filp, wait) : 0;
    tty_ldisc_deref(ldisc);

    return events;
}

/*
* n_tty_poll - poll operation of the N_TTY line discipline.
* @tty:  terminal being polled
* @file: file the poll was issued on
* @wait: poll_table used to register on the tty's wait queues
*
* Returns the mask of currently pending events.
*/
static unsigned int n_tty_poll(struct tty_struct *tty, struct file *file,
                            poll_table *wait)
{
    unsigned int events = 0;

    /*
    * poll_wait() only hands the read and write wait queues to the
    * poll_table callback; it does not block here, execution continues.
    */
    poll_wait(file, &tty->read_wait, wait);
    poll_wait(file, &tty->write_wait, wait);

    /*
    * Readable side.
    * NOTE(review): the exact threshold is decided by input_available_p(),
    * which is not shown here.
    */
    if (input_available_p(tty, TIME_CHAR(tty) ? 0 : MIN_CHAR(tty)))
        events |= POLLIN | POLLRDNORM;
    if (tty->packet && tty->link->ctrl_status)
        events |= POLLPRI | POLLIN | POLLRDNORM;

    /* Hang-up side: other end closed, or this file was hung up. */
    if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
        events |= POLLHUP;
    if (tty_hung_up_p(file))
        events |= POLLHUP;

    /* Nothing readable and no hang-up: set the wake-up threshold. */
    if ((events & (POLLHUP | POLLIN | POLLRDNORM)) == 0)
        tty->minimum_to_wake =
            (MIN_CHAR(tty) && !TIME_CHAR(tty)) ? MIN_CHAR(tty) : 1;

    /*
    * Writable side: the driver can write, writing is not locked, fewer
    * than WAKEUP_CHARS are buffered and there is room left.
    */
    if (tty->ops->write && !tty_is_writelocked(tty) &&
            tty_chars_in_buffer(tty) < WAKEUP_CHARS &&
            tty_write_room(tty) > 0)
        events |= POLLOUT | POLLWRNORM;

    return events;
}

/*
* poll_wait - let the poll_table register interest in a wait queue.
* @filp:         file being polled
* @wait_address: wait queue head that will signal a state change
* @p:            poll_table of the current poll call
*
* Does nothing when there is no table, no registration callback or no
* wait queue. It never sleeps itself; it only calls the callback.
*/
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (!p || !p->_qproc || !wait_address)
        return;

    p->_qproc(filp, wait_address, p);
}

/*
* __pollwait - registration callback installed by poll_initwait().
* @filp:         file being polled
* @wait_address: wait queue head to hook into
* @p:            poll_table embedded in the caller's poll_wqueues
*
* Takes a poll_table_entry, pins the file, and adds the entry's wait
* node (callback pollwake, private = owning poll_wqueues) to the queue.
* Returns silently when no entry is available.
* NOTE(review): whether poll_get_entry() records an error in that case is
* not visible here - confirm.
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
               poll_table *p)
{
    struct poll_wqueues *owner = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *slot = poll_get_entry(owner);

    if (slot == NULL)
        return;

    get_file(filp);

    /* Describe the registration. */
    slot->filp = filp;
    slot->key = p->_key;
    slot->wait_address = wait_address;

    /* Arm the wait node, then publish it on the queue last. */
    init_waitqueue_func_entry(&slot->wait, pollwake);
    slot->wait.private = owner;
    add_wait_queue(wait_address, &slot->wait);
}

/*
* __pollwake - wake the task sleeping in poll.
* @wait: wait-queue node whose ->private points at the owning poll_wqueues
* @mode, @sync, @key: forwarded unchanged to default_wake_function()
*
* Marks the poll_wqueues as triggered and wakes pwq->polling_task through
* a temporary wait-queue node. Returns default_wake_function()'s result.
*/
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
   struct poll_wqueues *pwq = wait->private;
   /* Temporary node that targets the polling task. */
   DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

   /*
   * Although this function is called under waitqueue lock, LOCK
   * doesn't imply write barrier and the users expect write
   * barrier semantics on wakeup functions. The following
   * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
   * and is paired with set_mb() in poll_schedule_timeout.
   */
   smp_wmb();
   /* Checked by poll_schedule_timeout() before it goes to sleep. */
   pwq->triggered = 1;

   /*
   * Perform the default wake up operation using a dummy
   * waitqueue.
   *
   * TODO: This is hacky but there currently is no interface to
   * pass in @sync. @sync is scheduled to be removed and once
   * that happens, wake_up_process() can be used directly.
   */
   return default_wake_function(&dummy_wait, mode, sync, key);
}

/*
* Repeated here for reference next to __pollwait()/__pollwake(); this is
* the same definition of struct poll_table_entry that appears earlier in
* this listing, not a second type.
*/
struct poll_table_entry {
   struct file *filp;                 /* file being polled */
   unsigned long key;                 /* poll_table->_key at registration time */
   wait_queue_t wait;                 /* node added to *wait_address */
   wait_queue_head_t *wait_address;   /* queue head the node sits on */
};

poll是怎样阻塞进程的

/*
* poll_schedule_timeout - put the polling task to sleep until woken or timed out.
* @pwq:     poll_wqueues of the current poll call
* @state:   task state to sleep in (do_poll() passes TASK_INTERRUPTIBLE)
* @expires: absolute expiry time, or NULL for no timeout
* @slack:   allowed slack passed to schedule_hrtimeout_range()
*
* Returns -EINTR without sleeping when a wake-up was already recorded in
* pwq->triggered; otherwise returns schedule_hrtimeout_range()'s result
* (do_poll() treats 0 as "timeout expired").
*/
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
              ktime_t *expires, unsigned long slack)
{
   int rc = -EINTR;

   /* The state is set before ->triggered is tested; presumably so that a
    * wake-up arriving between the test and the sleep is not lost. */
   set_current_state(state);
   if (!pwq->triggered)
       rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
   __set_current_state(TASK_RUNNING);

   /*
   * Prepare for the next iteration.
   *
   * The following set_mb() serves two purposes. First, it's
   * the counterpart rmb of the wmb in pollwake() such that data
   * written before wake up is always visible after wake up.
   * Second, the full barrier guarantees that triggered clearing
   * doesn't pass event check of the next iteration. Note that
   * this problem doesn't exist for the first iteration as
   * add_wait_queue() has full barrier semantics.
   */
   set_mb(pwq->triggered, 0);

   return rc;
}

/**
* schedule_hrtimeout_range - sleep until timeout
* @expires:   timeout value (pointer to ktime_t, may be NULL)
* @delta:   slack allowed on the expiry time (unsigned long)
* @mode:   timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
*
* Thin wrapper that selects CLOCK_MONOTONIC and returns the result of
* schedule_hrtimeout_range_clock() unchanged.
**/
int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
                     const enum hrtimer_mode mode)
{
    const int clock = CLOCK_MONOTONIC;

    return schedule_hrtimeout_range_clock(expires, delta, mode, clock);
}

/*
* schedule_hrtimeout_range_clock - sleep until a high-resolution timeout on a given clock.
* @expires: expiry time, or NULL to sleep without a timeout
* @delta:   slack applied to the expiry time
* @mode:    timer mode, passed to the hrtimer calls
* @clock:   clock the timer runs on
*
* The caller is expected to have set the task state beforehand; every
* return path sets it back to TASK_RUNNING.
*
* Returns 0 when the expiry value is zero or when t.task is NULL at the
* end (the timeout path), and -EINTR otherwise.
*/
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
                   const enum hrtimer_mode mode, int clock)
{
   struct hrtimer_sleeper t;

   /*
   * Optimize when a zero timeout value is given. It does not
   * matter whether this is an absolute or a relative time.
   */
   if (expires && !expires->tv64) {
       __set_current_state(TASK_RUNNING);
       return 0;
   }

   /*
   * A NULL parameter means "infinite"
   */
   if (!expires) {
       schedule();
       __set_current_state(TASK_RUNNING);
       return -EINTR;
   }

   /* Arm an on-stack timer for [*expires, *expires + delta]. */
   hrtimer_init_on_stack(&t.timer, clock, mode);
   hrtimer_set_expires_range_ns(&t.timer, *expires, delta);

   /* Presumably records current in t.task for the expiry handler - confirm. */
   hrtimer_init_sleeper(&t, current);

   hrtimer_start_expires(&t.timer, mode);
   /* Timer is not pending after starting it: treat as already expired. */
   if (!hrtimer_active(&t.timer))
       t.task = NULL;

   /* Sleep only while the timeout has not been flagged as expired. */
   if (likely(t.task))
       schedule();

   hrtimer_cancel(&t.timer);
   destroy_hrtimer_on_stack(&t.timer);

   __set_current_state(TASK_RUNNING);

   /* t.task == NULL means the timeout fired; otherwise something else woke us. */
   return !t.task ? 0 : -EINTR;
}