Linux 进程间通信 pipe 实现原理

目录

1、pipefs

2、pipe_inode_info

2.1、pipe_inode_info

2.2、create_pipe_files()

2.3、alloc_pipe_info()

3、pipe_readable/pipe_read

4、pipe_writable/pipe_write

5、pipe_poll

6、wait_queue

6.1、wait_event

6.2、wake_up

6.3、autoremove_wake_function


本文基于 Linux-5.10 版本分析,借助 pipe 分析等待队列,为下篇文章分析 select/poll/epoll 的实现做准备。

pipe 是 Linux 系统中一种比较常用的进程间通信方法,不过使用 pipe 通信的两个进程必须有亲缘关系,通常是父子进程。

pipe 和 pipe2 返回成功时,pipefd 保存两个文件描述符:pipefd[0] 用于读,pipefd[1] 用于写。

#include <unistd.h>
#include <fcntl.h> /* O_* constants */

int pipe(int pipefd[2]);
int pipe2(int pipefd[2], int flags);

1、pipefs

Linux 一切皆文件,pipe 也属于一个(伪)文件系统,名字为 pipefs,在启动的时候注册到 Linux 系统中。

/// fs/pipe.c
/*
 * File-system type descriptor for the pipe pseudo file system ("pipefs").
 * init_pipe_fs() registers it and mounts it internally via kern_mount().
 */
static struct file_system_type pipe_fs_type = {
  .name   = "pipefs",
  .init_fs_context = pipefs_init_fs_context,
  .kill_sb  = kill_anon_super,
};

/*
 * Register pipefs and create its kernel-internal mount (pipe_mnt).
 *
 * Returns 0 on success.  If registration fails its error is returned;
 * if mounting fails the file system is unregistered again and the
 * error encoded in the returned pointer is reported.
 */
static int __init init_pipe_fs(void)
{
  int rc;

  rc = register_filesystem(&pipe_fs_type);
  if (rc)
    return rc;

  pipe_mnt = kern_mount(&pipe_fs_type);
  if (!IS_ERR(pipe_mnt))
    return 0;

  /* Mount failed: roll the registration back. */
  rc = PTR_ERR(pipe_mnt);
  unregister_filesystem(&pipe_fs_type);
  return rc;
}

fs_initcall(init_pipe_fs);

当调用 pipe() 或者 pipe2() 时,kernel 都是调用 do_pipe2() 处理。

/// fs/pipe.c
/*
 * pipe2() system call entry: forwards the user-space fd array and the
 * caller-supplied flags to do_pipe2().
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
  return do_pipe2(fildes, flags);
}

/*
 * pipe() system call entry: same as pipe2() with flags fixed to 0.
 */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
  return do_pipe2(fildes, 0);
}

do_pipe2() 调用 __do_pipe_flags() 创建两个 struct file 对象,分别对应于两个 fd,然后返回。

/// fs/pipe.c
/*
 * Common implementation of pipe() and pipe2().
 *
 * Creates the two files and reserves two descriptors through
 * __do_pipe_flags(), then copies the descriptor pair to user space.
 * The descriptors are installed only after the copy succeeded; on a
 * faulting copy both files and both reserved descriptors are dropped.
 *
 * Returns 0 on success, -EFAULT if @fildes cannot be written, or the
 * error reported by __do_pipe_flags().
 */
static int do_pipe2(int __user *fildes, int flags)
{
  struct file *files[2];
  int fd[2];
  int error;

  error = __do_pipe_flags(fd, files, flags);
  if (error)
    return error;

  if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
    /* User buffer is bad: undo everything that was set up. */
    fput(files[0]);
    fput(files[1]);
    put_unused_fd(fd[0]);
    put_unused_fd(fd[1]);
    return -EFAULT;
  }

  fd_install(fd[0], files[0]);
  fd_install(fd[1], files[1]);
  return 0;
}

__do_pipe_flags() 承担 pipe 创建的主要工作:

  • 1)调用 create_pipe_files() 函数创建两个 struct file 对象

  • 2)获取两个未使用的文件描述符 fd

/// fs/pipe.c
/*
 * Create the pipe's two struct file objects and reserve two unused
 * file descriptors for them (the descriptors are NOT installed here).
 *
 * @fd:    output; fd[0] is the read descriptor, fd[1] the write descriptor.
 * @files: output; filled in by create_pipe_files().
 * @flags: only O_CLOEXEC, O_NONBLOCK, O_DIRECT and O_NOTIFICATION_PIPE
 *         are accepted.
 *
 * Returns 0 on success, -EINVAL for unsupported flags, or the error from
 * create_pipe_files() / get_unused_fd_flags().
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
  const int allowed = O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE;
  int rd_fd, wr_fd;
  int ret;

  if (flags & ~allowed)
    return -EINVAL;

  ret = create_pipe_files(files, flags);
  if (ret)
    return ret;

  rd_fd = get_unused_fd_flags(flags);
  if (rd_fd < 0) {
    ret = rd_fd;
    goto err_put_files;
  }

  wr_fd = get_unused_fd_flags(flags);
  if (wr_fd < 0) {
    ret = wr_fd;
    goto err_put_rd_fd;
  }

  audit_fd_pair(rd_fd, wr_fd);
  fd[0] = rd_fd;
  fd[1] = wr_fd;
  return 0;

 err_put_rd_fd:
  put_unused_fd(rd_fd);
 err_put_files:
  fput(files[0]);
  fput(files[1]);
  return ret;
}

pipe 文件系统,file_operations 定义如下:包含了读写函数 pipe_read() 和 pipe_write()

/// fs/pipe.c
/*
 * File operations shared by both ends of a pipe; installed on the inode
 * by get_pipe_inode() and on both files by create_pipe_files().
 * Reads and writes go through pipe_read()/pipe_write(), polling through
 * pipe_poll().
 */
const struct file_operations pipefifo_fops = {
  .open   = fifo_open,
  .llseek   = no_llseek,
  .read_iter  = pipe_read,
  .write_iter = pipe_write,
  .poll   = pipe_poll,
  .unlocked_ioctl = pipe_ioctl,
  .release  = pipe_release,
  .fasync   = pipe_fasync,
  .splice_write = iter_file_splice_write,
};

2、pipe_inode_info

2.1、pipe_inode_info

pipe_inode_info 是 pipe 实现的核心,保存在 file 的 private_data 中。

完整的定义如下,介绍几个重要的成员

  • 1)rd_wait/wr_wait:读和写等待队列。当 pipe 可读或者可写时,会唤醒阻塞在这两队列上的进程;

  • 2)head/tail:分别表示 pipe 数据生产者和消费者位置,对应循环队列 bufs 下标;更新 head/tail 时需要持有 rd_wait.lock 自旋锁;

  • 3)max_usage/ring_size:循环队列 bufs 大小,默认是 16;

/// include/linux/pipe_fs_i.h
/*
 * Per-pipe state, stored in inode->i_pipe and in file->private_data of
 * both ends.  The field notes describe what the code quoted in this
 * article does with each member.
 */
struct pipe_inode_info {
  /* NOTE(review): presumably the lock taken by __pipe_lock() - confirm */
  struct mutex mutex;
  /* readers sleep on rd_wait, writers sleep on wr_wait */
  wait_queue_head_t rd_wait, wr_wait;
  /* producer position; advanced by pipe_write() under rd_wait.lock */
  unsigned int head;
  /* consumer position; advanced by pipe_read() under rd_wait.lock */
  unsigned int tail;
  /* slot limit handed to pipe_full() */
  unsigned int max_usage;
  /* number of slots in bufs[]; (ring_size - 1) is used as the index mask */
  unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
  /* set by pipe_read() when it releases a PIPE_BUF_FLAG_LOSS buffer */
  bool note_loss;
#endif
  /* number of buffers charged to ->user by alloc_pipe_info() */
  unsigned int nr_accounted;
  /* reader count; when zero pipe_write() fails with -EPIPE */
  unsigned int readers;
  /* writer count; when zero a read on an empty pipe returns instead of sleeping */
  unsigned int writers;
  /* set to 2 by get_pipe_inode(); presumably counts struct file users - confirm */
  unsigned int files;
  /* both start at 1; pipe_poll() compares w_counter with filp->f_version */
  unsigned int r_counter;
  unsigned int w_counter;
  /* set by pipe_poll(); makes pipe_write() always wake readers */
  bool poll_usage;
  /* spare page kept by pipe_write() until it is attached to a ring slot */
  struct page *tmp_page;
  /* fasync lists signalled through kill_fasync() with POLL_IN / POLL_OUT */
  struct fasync_struct *fasync_readers;
  struct fasync_struct *fasync_writers;
  /* the ring of pipe buffers, allocated in alloc_pipe_info() */
  struct pipe_buffer *bufs;
  /* user the buffers are accounted to */
  struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
  /* NOTE(review): presumably set up by watch_queue_init() for O_NOTIFICATION_PIPE - confirm */
  struct watch_queue *watch_queue;
#endif
};

pipe_buffer 保存 pipe 数据,保存在 page 指向的页面。

  • offset:数据在页面的起始偏移

  • len:数据长度。

/// include/linux/pipe_fs_i.h
/*
 * One slot of the pipe ring: a page plus the window of valid data in it.
 */
struct pipe_buffer {
  /* page that holds the data */
  struct page *page;
  /* valid data is page[offset, offset + len) */
  unsigned int offset, len;
  /* per-buffer callbacks; pipe_write() installs anon_pipe_buf_ops */
  const struct pipe_buf_operations *ops;
  /* PIPE_BUF_FLAG_* bits, e.g. PACKET, CAN_MERGE, WHOLE, LOSS */
  unsigned int flags;
  /* NOTE(review): not used by the code shown here; presumably owner-private data */
  unsigned long private;
};

/*
 * Callbacks attached to a pipe_buffer through pipe_buffer::ops.
 * Each hook receives the owning pipe and the buffer it acts on.
 */
struct pipe_buf_operations {
  /*
   * ->confirm() verifies that the data in the pipe buffer is there
   * and that the contents are good. If the pages in the pipe belong
   * to a file system, we may need to wait for IO completion in this
   * hook. Returns 0 for good, or a negative error value in case of
   * error.  If not present all pages are considered good.
   */
  int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

  /*
   * When the contents of this pipe buffer has been completely
   * consumed by a reader, ->release() is called.
   */
  void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

  /*
   * Attempt to take ownership of the pipe buffer and its contents.
   * ->try_steal() returns %true for success, in which case the contents
   * of the pipe (the buf->page) is locked and now completely owned by the
   * caller. The page may then be transferred to a different mapping, the
   * most often used case is insertion into different file address space
   * cache.
   */
  bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

  /*
   * Get a reference to the pipe buffer.
   */
  bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

pipe 使用循环队列保存数据,队列由 bufs 指向。队列大小默认 16,队列元素是 pipe_buffer 结构。

通过 head 和 tail 可以很方便地判断 pipe 是否为空,是否满,或者计算还有多少个 buffer 可以供用户填充数据。

head/tail 不是直接对应 bufs 队列的下标,在使用时,需要结合 ring_size 转换为下标(后面代码部分会讲解,转换非常简单)。

/// include/linux/pipe_fs_i.h
/**
 * pipe_empty - Test whether the pipe ring holds no buffers
 * @head: The pipe ring head pointer (producer position)
 * @tail: The pipe ring tail pointer (consumer position)
 *
 * Return: true when the two positions coincide, false otherwise.
 */
static inline bool pipe_empty(unsigned int head, unsigned int tail)
{
  bool no_data = (tail == head);

  return no_data;
}

/**
 * pipe_occupancy - Count the slots currently in use in the pipe ring
 * @head: The pipe ring head pointer (producer position)
 * @tail: The pipe ring tail pointer (consumer position)
 *
 * The subtraction is done in unsigned arithmetic, so the result stays
 * correct after @head has wrapped around past @tail.
 *
 * Return: number of used slots.
 */
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
  unsigned int used = head - tail;

  return used;
}

/**
 * pipe_full - Test whether the pipe ring has reached its slot limit
 * @head: The pipe ring head pointer (producer position)
 * @tail: The pipe ring tail pointer (consumer position)
 * @limit: The maximum number of slots that may be in use
 *
 * Return: true when the occupancy is at or above @limit.
 */
static inline bool pipe_full(unsigned int head, unsigned int tail,
           unsigned int limit)
{
  unsigned int used = pipe_occupancy(head, tail);

  return used >= limit;
}

2.2、create_pipe_files()

上文讲到,create_pipe_files() 会创建两个 file 对象。逻辑如下:

  • 1)调用 get_pipe_inode() 创建一个 inode

  • 2)调用 alloc_file_pseudo() 分配一个 file

  • 3)调用 alloc_file_clone() 复制一个 file

可以看到,调用 alloc_file_pseudo() 时使用的是 O_WRONLY,表示只能写,申请的 file 保存在 res[1]。调用 alloc_file_clone() 复制已经申请的 file 时,使用的是 O_RDONLY,表示只能读,返回的 file 保存在 res[0]。

另外,file::private_data 被赋值为 inode->i_pipe,其实就是 pipe_inode_info 结构。

最后,file 对应的操作函数是 pipefifo_fops。

/// fs/pipe.c
/*
 * create_pipe_files - build the two struct file objects of a new pipe.
 * @res:   output array; res[0] receives the read end, res[1] the write end.
 * @flags: pipe2() flags; O_NONBLOCK is applied to both ends, O_DIRECT to
 *         the write end only, O_NOTIFICATION_PIPE enables the watch queue.
 *
 * Both files share one inode and one pipe_inode_info (inode->i_pipe),
 * which is stored in each file's private_data.
 *
 * Returns 0 on success, -ENFILE if no inode could be obtained, or the
 * error from watch_queue_init(), alloc_file_pseudo() or alloc_file_clone().
 */
int create_pipe_files(struct file **res, int flags)
{
  struct inode *inode = get_pipe_inode();
  struct file *f;
  int error;

  if (!inode)
    return -ENFILE;

  if (flags & O_NOTIFICATION_PIPE) {
    error = watch_queue_init(inode->i_pipe);
    if (error) {
      free_pipe_info(inode->i_pipe);
      iput(inode);
      return error;
    }
  }

  /* Write end: allocated first, ends up in res[1]. */
  f = alloc_file_pseudo(inode, pipe_mnt, "",
        O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
        &pipefifo_fops);
  if (IS_ERR(f)) {
    free_pipe_info(inode->i_pipe);
    iput(inode);
    return PTR_ERR(f);
  }

  f->private_data = inode->i_pipe;

  /* Read end: a clone of the write-end file opened O_RDONLY. */
  res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
          &pipefifo_fops);
  if (IS_ERR(res[0])) {
    put_pipe_info(inode, inode->i_pipe);
    fput(f);
    return PTR_ERR(res[0]);
  }
  res[0]->private_data = inode->i_pipe;
  res[1] = f;
  stream_open(inode, res[0]);
  stream_open(inode, res[1]);
  return 0;
}

get_pipe_inode() 返回一个 inode 对象:

  • 1)调用 new_inode_pseudo() 函数返回一个 inode

  • 2)调用 alloc_pipe_info() 函数返回一个 pipe_inode_info,将其保存在 inode->i_pipe 中。

  • 3)inode->i_fop 赋值为 pipefifo_fops

/// fs/pipe.c
/*
 * Allocate an inode on the pipefs mount and attach a freshly allocated
 * pipe_inode_info to it.
 *
 * The new pipe starts with one reader, one writer and two file users.
 * Returns the inode, or NULL if either allocation fails (the inode is
 * released again when only the pipe allocation failed).
 */
static struct inode * get_pipe_inode(void)
{
  struct pipe_inode_info *info;
  struct inode *ino = new_inode_pseudo(pipe_mnt->mnt_sb);

  if (!ino)
    return NULL;

  ino->i_ino = get_next_ino();

  info = alloc_pipe_info();
  if (!info) {
    iput(ino);
    return NULL;
  }

  ino->i_pipe = info;
  info->files = 2;
  info->writers = 1;
  info->readers = 1;
  ino->i_fop = &pipefifo_fops;

  /*
   * Start out flagged dirty: mark_inode_dirty() then believes the
   * inode already sits on the dirty list and never moves it there.
   */
  ino->i_state = I_DIRTY;
  ino->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
  ino->i_uid = current_fsuid();
  ino->i_gid = current_fsgid();
  ino->i_atime = ino->i_mtime = ino->i_ctime = current_time(ino);

  return ino;
}

2.3、alloc_pipe_info()

从 alloc_pipe_info() 函数可以看到 pipe_inode_info 结构的布局。首先说一下几个变量:

  • 1)PIPE_DEF_BUFFERS:循环队列默认大小,为 16

  • 2)pipe_max_size:表示 pipe 最大 size,默认为 1048576 字节,也就是 1024 KB,或者 1MB。

  • 3)PIPE_MIN_DEF_BUFFERS:循环队列最小大小,为 2

too_many_pipe_buffers_soft() 会判断当前 user 是否创建了太多的 pipe,如果是,循环队列大小被限制到 PIPE_MIN_DEF_BUFFERS。

pipe_inode_info 对象创建成功后,就给其成员赋值。阻塞队列头部 rd_wait/wr_wait 也被初始化。

/// fs/pipe.c
/*
 * alloc_pipe_info - allocate and initialise a pipe_inode_info.
 *
 * The ring starts with PIPE_DEF_BUFFERS slots, clamped to pipe_max_size
 * for callers without CAP_SYS_RESOURCE, and shrunk to
 * PIPE_MIN_DEF_BUFFERS when an unprivileged user is over the soft limit.
 * The chosen number of buffers is charged to the current user.
 *
 * Returns the new pipe, or NULL on allocation failure or when an
 * unprivileged user is over the hard limit (the accounting and the user
 * reference are undone in that case).
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
  struct pipe_inode_info *pipe;
  unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
  struct user_struct *user = get_current_user();
  unsigned long user_bufs;
  unsigned int max_size = READ_ONCE(pipe_max_size);

  pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
  if (pipe == NULL)
    goto out_free_uid;

  if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
    pipe_bufs = max_size >> PAGE_SHIFT;

  /* Charge the buffers to the user before checking the limits. */
  user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

  if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
    /* Over the soft limit: re-account with the minimum ring size. */
    user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
    pipe_bufs = PIPE_MIN_DEF_BUFFERS;
  }

  if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
    goto out_revert_acct; // over the hard limit: undo the accounting and fail

  pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
           GFP_KERNEL_ACCOUNT);

  if (pipe->bufs) {
    init_waitqueue_head(&pipe->rd_wait);
    init_waitqueue_head(&pipe->wr_wait);
    pipe->r_counter = pipe->w_counter = 1;
    pipe->max_usage = pipe_bufs;
    pipe->ring_size = pipe_bufs;
    pipe->nr_accounted = pipe_bufs;
    pipe->user = user;
    mutex_init(&pipe->mutex);
    return pipe;
  }

out_revert_acct:
  (void) account_pipe_buffers(user, pipe_bufs, 0);
  kfree(pipe);
out_free_uid:
  free_uid(user);
  return NULL;
}

3、pipe_readable/pipe_read

pipe_readable() 表示 pipe 可读,满足以下任一条件即可:

  • 1)pipe 非空

  • 2)没有 writer(writers 为 0,此时不会再有新数据,读操作无需继续等待)

/// fs/pipe.c
/*
 * A sleeping reader may proceed when the ring holds data or when no
 * writer is left.  All three fields are sampled without holding the
 * pipe lock.
 */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
  unsigned int prod = READ_ONCE(pipe->head);
  unsigned int cons = READ_ONCE(pipe->tail);
  unsigned int nr_writers = READ_ONCE(pipe->writers);

  if (!pipe_empty(prod, cons))
    return true;

  return !nr_writers;
}

pipe_read() 负责从 pipe 中读出数据。如果 pipe 保存有数据,就从 tail 标志的 pipe_buffer 开始读取数据,直到读取到足够的数据,或者 pipe 没有数据。

从 tail 到 bufs 下标转换也是比较简单:1)mask = pipe->ring_size - 1;2)tail & mask 就是下标。可以看到 ring_size 一定是 2^N 大小。

/// fs/pipe.c
/*
 * pipe_read - ->read_iter handler: copy data out of the pipe into @to.
 * @iocb: I/O control block; iocb->ki_filp is the file being read.
 * @to:   destination iterator; its remaining count is the request size.
 *
 * Consumes buffers starting at pipe->tail until the request is satisfied
 * or the ring is empty.  If nothing was read and writers still exist,
 * the caller sleeps on pipe->rd_wait unless the file is O_NONBLOCK.
 *
 * Returns the number of bytes read; 0 for a zero-length request or when
 * the pipe is empty and has no writers; otherwise a negative errno
 * (-EAGAIN, -ERESTARTSYS, -EFAULT, -ENOBUFS or the pipe_buf_confirm()
 * error).
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
  size_t total_len = iov_iter_count(to); // number of bytes requested
  struct file *filp = iocb->ki_filp;
  struct pipe_inode_info *pipe = filp->private_data;
  bool was_full, wake_next_reader = false;
  ssize_t ret;

  /* Null read succeeds. */
  if (unlikely(total_len == 0))
    return 0;

  ret = 0;
  __pipe_lock(pipe);

  /*
   * We only wake up writers if the pipe was full when we started
   * reading in order to avoid unnecessary wakeups.
   *
   * But when we do wake up writers, we do so using a sync wakeup
   * (WF_SYNC), because we want them to get going and generate more
   * data for us.
   */
  was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
  for (;;) {
    /* Read ->head with a barrier vs post_one_notification() */
    unsigned int head = smp_load_acquire(&pipe->head);
    unsigned int tail = pipe->tail;
    unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
/// ...
#endif

    if (!pipe_empty(head, tail)) { // the pipe holds data
      struct pipe_buffer *buf = &pipe->bufs[tail & mask];
      size_t chars = buf->len;
      size_t written;
      int error;

      if (chars > total_len) {
        if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
          if (ret == 0)
            ret = -ENOBUFS;
          break;
        }
        chars = total_len;
      }

      error = pipe_buf_confirm(pipe, buf);
      if (error) {
        if (!ret)
          ret = error;
        break;
      }
      // copy chars bytes starting at page[offset]
      written = copy_page_to_iter(buf->page, buf->offset, chars, to);
      if (unlikely(written < chars)) {
        if (!ret)
          ret = -EFAULT;
        break;
      }
      ret += chars;
      buf->offset += chars;
      buf->len -= chars;

      /* Was it a packet buffer? Clean up and exit */
      if (buf->flags & PIPE_BUF_FLAG_PACKET) {
        total_len = chars;
        buf->len = 0;
      }

      if (!buf->len) { // buffer fully consumed: release it and advance tail
        pipe_buf_release(pipe, buf);
        spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
        if (buf->flags & PIPE_BUF_FLAG_LOSS)
          pipe->note_loss = true;
#endif
        tail++;
        pipe->tail = tail;
        spin_unlock_irq(&pipe->rd_wait.lock);
      }
      total_len -= chars;
      if (!total_len) // destination is full: done
        break;  /* common path: read succeeded */
      if (!pipe_empty(head, tail))  /* More to do? */
        continue; // keep reading more data
    } // end of the non-empty branch

    if (!pipe->writers) // no writers, so no more data can arrive: stop
      break;
    if (ret)
      break;
    if (filp->f_flags & O_NONBLOCK) { // non-blocking: return -EAGAIN
      ret = -EAGAIN;
      break;
    }
    __pipe_unlock(pipe);

    /*
     * We only get here if we didn't actually read anything.
     *
     * However, we could have seen (and removed) a zero-sized
     * pipe buffer, and might have made space in the buffers
     * that way.
     *
     * You can't make zero-sized pipe buffers by doing an empty
     * write (not even in packet mode), but they can happen if
     * the writer gets an EFAULT when trying to fill a buffer
     * that already got allocated and inserted in the buffer
     * array.
     *
     * So we still need to wake up any pending writers in the
     * _very_ unlikely case that the pipe was full, but we got
     * no data.
     */
    if (unlikely(was_full)) // wake up a writer
      wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
    kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

    /*
     * But because we didn't read anything, at this point we can
     * just return directly with -ERESTARTSYS if we're interrupted,
     * since we've done any required wakeups and there's no need
     * to mark anything accessed. And we've dropped the lock.
     */
    // wait until the pipe is readable; return at once if interrupted
    if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
      return -ERESTARTSYS;

    __pipe_lock(pipe);
    was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
    wake_next_reader = true;
  } // end for
  /* Nothing left for another reader: do not pass the wakeup on. */
  if (pipe_empty(pipe->head, pipe->tail))
    wake_next_reader = false;
  __pipe_unlock(pipe);

  if (was_full)
    wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
  if (wake_next_reader)
    wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
  kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
  if (ret > 0)
    file_accessed(filp);
  return ret;
}

wake_up_interruptible_sync_poll/wait_event_interruptible_exclusive 后面讲解。

4、pipe_writable/pipe_write

pipe_writable() 表示 pipe 可写,满足以下任一条件即可:

  • 1)pipe 未满

  • 2)没有 reader(readers 为 0,此时写操作不会继续等待,而是以 EPIPE 失败)

/// fs/pipe.c
/*
 * A sleeping writer may proceed when the ring is below its slot limit
 * or when no reader is left.  The reader count is only sampled when the
 * ring is full.  No pipe lock is held while sampling.
 */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
  unsigned int prod = READ_ONCE(pipe->head);
  unsigned int cons = READ_ONCE(pipe->tail);
  unsigned int limit = READ_ONCE(pipe->max_usage);

  if (!pipe_full(prod, cons, limit))
    return true;

  return !READ_ONCE(pipe->readers);
}

pipe_write() 负责向 pipe 中写入数据。如果 pipe 还有空间,就从 head 标志的 pipe_buffer 开始写入数据,直到写入全部的数据,或者 pipe 没有空间。

/// fs/pipe.c
/*
 * pipe_write - ->write_iter handler: copy data from @from into the pipe.
 * @iocb: I/O control block; iocb->ki_filp is the file being written.
 * @from: source iterator; its remaining count is the request size.
 *
 * First tries to append the sub-page remainder of the request to the
 * last buffer (when that buffer allows merging), then fills new ring
 * slots one page at a time starting at pipe->head.  When the ring is
 * full the caller sleeps on pipe->wr_wait unless the file is O_NONBLOCK.
 *
 * Returns the number of bytes written, 0 for a zero-length request, or
 * a negative errno: -EPIPE (after sending SIGPIPE) when there is no
 * reader, -EFAULT, -ENOMEM, -EAGAIN, -ERESTARTSYS, or the error from
 * pipe_buf_confirm() / file_update_time().
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
  struct file *filp = iocb->ki_filp;
  struct pipe_inode_info *pipe = filp->private_data;
  unsigned int head;
  ssize_t ret = 0;
  size_t total_len = iov_iter_count(from);
  ssize_t chars;
  bool was_empty = false;
  bool wake_next_writer = false;

  /* Null write succeeds. */
  if (unlikely(total_len == 0))
    return 0;

  __pipe_lock(pipe);

  if (!pipe->readers) { // no readers: writing is pointless
    send_sig(SIGPIPE, current, 0);
    ret = -EPIPE;
    goto out;
  }

#ifdef CONFIG_WATCH_QUEUE
/// ...
#endif

  /*
   * If it wasn't empty we try to merge new data into
   * the last buffer.
   *
   * That naturally merges small writes, but it also
   * page-aligns the rest of the writes for large writes
   * spanning multiple pages.
   */
  head = pipe->head;
  was_empty = pipe_empty(head, pipe->tail);
  chars = total_len & (PAGE_SIZE-1);
  if (chars && !was_empty) { // not empty: try to append to the last pipe_buffer
    unsigned int mask = pipe->ring_size - 1;
    struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
    int offset = buf->offset + buf->len;

    if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
        offset + chars <= PAGE_SIZE) {
      ret = pipe_buf_confirm(pipe, buf);
      if (ret)
        goto out;

      ret = copy_page_from_iter(buf->page, offset, chars, from);
      if (unlikely(ret < chars)) {
        ret = -EFAULT;
        goto out;
      }

      buf->len += ret;
      if (!iov_iter_count(from)) // everything written: done
        goto out;
    }
  }

  for (;;) {
    if (!pipe->readers) {
      send_sig(SIGPIPE, current, 0);
      if (!ret)
        ret = -EPIPE;
      break;
    }

    head = pipe->head;
    if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
      unsigned int mask = pipe->ring_size - 1;
      struct pipe_buffer *buf = &pipe->bufs[head & mask];
      struct page *page = pipe->tmp_page;
      int copied;

      if (!page) { // no spare page cached: allocate one
        page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
        if (unlikely(!page)) {
          ret = ret ? : -ENOMEM;
          break;
        }
        pipe->tmp_page = page;
      }

      /* Allocate a slot in the ring in advance and attach an
       * empty buffer.  If we fault or otherwise fail to use
       * it, either the reader will consume it or it'll still
       * be there for the next write.
       */
      spin_lock_irq(&pipe->rd_wait.lock);

      head = pipe->head;
      if (pipe_full(head, pipe->tail, pipe->max_usage)) {
        spin_unlock_irq(&pipe->rd_wait.lock); // let a reader drain the pipe
        continue; // retry
      }

      pipe->head = head + 1;
      spin_unlock_irq(&pipe->rd_wait.lock);

      /* Insert it into the buffer array */
      buf = &pipe->bufs[head & mask];
      buf->page = page;
      buf->ops = &anon_pipe_buf_ops;
      buf->offset = 0;
      buf->len = 0;
      if (is_packetized(filp))
        buf->flags = PIPE_BUF_FLAG_PACKET;
      else
        buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
      pipe->tmp_page = NULL;
      // fill the page
      copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
      if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
        if (!ret)
          ret = -EFAULT;
        break;
      }
      ret += copied;
      buf->offset = 0;
      buf->len = copied;

      if (!iov_iter_count(from)) // everything written: stop
        break;
    }

    if (!pipe_full(head, pipe->tail, pipe->max_usage))
      continue;

    /* Wait for buffer space to become available. */
    if (filp->f_flags & O_NONBLOCK) { // non-blocking: return -EAGAIN
      if (!ret)
        ret = -EAGAIN;
      break;
    }
    if (signal_pending(current)) {
      if (!ret)
        ret = -ERESTARTSYS;
      break;
    }

    /*
     * We're going to release the pipe lock and wait for more
     * space. We wake up any readers if necessary, and then
     * after waiting we need to re-check whether the pipe
     * become empty while we dropped the lock.
     */
    __pipe_unlock(pipe);
    if (was_empty) // data was added to an empty pipe: wake up a reader
      wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
    kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
    // block until the pipe is writable
    wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
    __pipe_lock(pipe);
    was_empty = pipe_empty(pipe->head, pipe->tail);
    wake_next_writer = true;
  } // end for
out:
  /* No room left for another writer: do not pass the wakeup on. */
  if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
    wake_next_writer = false;
  __pipe_unlock(pipe);

  /*
   * If we do do a wakeup event, we do a 'sync' wakeup, because we
   * want the reader to start processing things asap, rather than
   * leave the data pending.
   *
   * This is particularly important for small writes, because of
   * how (for example) the GNU make jobserver uses small writes to
   * wake up pending jobs
   *
   * Epoll nonsensically wants a wakeup whether the pipe
   * was already empty or not.
   */
  if (was_empty || pipe->poll_usage)
    wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
  kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
  if (wake_next_writer)
    wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
  if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
    int err = file_update_time(filp);
    if (err)
      ret = err;
    sb_end_write(file_inode(filp)->i_sb);
  }
  return ret;
}

5、pipe_poll

pipe_poll 通过 poll_wait() 将 caller 挂到 rd_wait/wr_wait 等待队列上(通常如此),然后返回 file 当前的可读或者可写的状态。

/// fs/pipe.c
/*
 * pipe_poll - ->poll handler for both ends of a pipe.
 * @filp: the pipe file being polled; f_mode selects the read and/or
 *        write side checks.
 * @wait: poll table handed to poll_wait().
 *
 * Registers on rd_wait and/or wr_wait first and only then samples
 * head/tail, so a state change after the sample still produces a wakeup.
 *
 * Returns the event mask: EPOLLIN|EPOLLRDNORM when data is available,
 * EPOLLHUP when there is no writer, EPOLLOUT|EPOLLWRNORM when there is
 * room, EPOLLERR when there is no reader.
 */
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
  __poll_t mask;
  struct pipe_inode_info *pipe = filp->private_data;
  unsigned int head, tail;

  /* Epoll has some historical nasty semantics, this enables them */
  WRITE_ONCE(pipe->poll_usage, true);

  /*
   * Reading pipe state only -- no need for acquiring the semaphore.
   *
   * But because this is racy, the code has to add the
   * entry to the poll table _first_ ..
   */
  if (filp->f_mode & FMODE_READ)
    poll_wait(filp, &pipe->rd_wait, wait);
  if (filp->f_mode & FMODE_WRITE)
    poll_wait(filp, &pipe->wr_wait, wait);

  /*
   * .. and only then can you do the racy tests. That way,
   * if something changes and you got it wrong, the poll
   * table entry will wake you up and fix it.
   */
  head = READ_ONCE(pipe->head);
  tail = READ_ONCE(pipe->tail);

  mask = 0;
  if (filp->f_mode & FMODE_READ) {
    if (!pipe_empty(head, tail))
      mask |= EPOLLIN | EPOLLRDNORM; // readable
    if (!pipe->writers && filp->f_version != pipe->w_counter)
      mask |= EPOLLHUP;
  }

  if (filp->f_mode & FMODE_WRITE) {
    if (!pipe_full(head, tail, pipe->max_usage))
      mask |= EPOLLOUT | EPOLLWRNORM; // writable
    /*
     * Most Unices do not set EPOLLERR for FIFOs but on Linux they
     * behave exactly like pipes for poll().
     */
    if (!pipe->readers)
      mask |= EPOLLERR;
  }

  return mask;
}

poll_wait() 只是在 p、p->_qproc 和 wait_address 均非空时,简单地调用 poll_table::_qproc 指向的函数。

/// include/linux/poll.h
/*
 * structures and helpers for f_op->poll implementations
 */
/*
 * Callback type stored in poll_table::_qproc.  poll_wait() calls it with
 * the polled file, the wait-queue head to register on, and the table.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
                                struct poll_table_struct *);

/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 */
typedef struct poll_table_struct {
  /* registration callback; poll_wait() does nothing when it is NULL */
  poll_queue_proc _qproc;
  /* NOTE(review): presumably the requested event mask - confirm */
  __poll_t _key;
} poll_table;

/*
 * Hand @wait_address to the poll table's registration callback.
 * Does nothing when the table, its callback, or the wait-queue head
 * is missing.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address,
                             poll_table *p)
{
  if (!p || !p->_qproc || !wait_address)
    return;

  p->_qproc(filp, wait_address, p);
}

6、wait_queue

等待队列使进程可以睡眠(让出 CPU)地等待某一特定的事件发生。当等待的条件为 true 时,进程就会被唤醒,重新抢占 CPU 开始执行。

wait_queue_head_t 定义等待队列头部,wait_queue_entry 定义等待队列元素。wait_queue_entry 结构有一个指针成员 func,进程被唤醒时,func 指向的函数会被调用。

/// include/linux/wait.h
/*
 * Wake callback of a wait-queue entry.  __wake_up_common() calls it for
 * every entry it visits; a negative return stops the walk, a non-zero
 * return counts as a successful wakeup.
 */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, 
                                 unsigned mode, int flags, void *key);
/* One waiter linked into a wait_queue_head. */
struct wait_queue_entry {
  /* WQ_FLAG_* bits, e.g. WQ_FLAG_EXCLUSIVE or WQ_FLAG_BOOKMARK */
  unsigned int    flags;
  /* the waiting task for task-based entries; NULL for func-only entries */
  void      *private;
  /* callback run when the queue is woken */
  wait_queue_func_t func;
  /* linkage into wait_queue_head::head */
  struct list_head  entry;
};

/* Head of a wait queue: a spinlock protecting a list of wait_queue_entry. */
struct wait_queue_head {
  /* taken around every list modification and around the wakeup walk */
  spinlock_t    lock;
  /* list of queued wait_queue_entry::entry nodes */
  struct list_head  head;
};
typedef struct wait_queue_head wait_queue_head_t;

宏 init_waitqueue_head 用来初始化 wait_queue_head_t 头部。

/// include/linux/wait.h
/*
 * init_waitqueue_head - initialise a wait-queue head.
 * Declares one static lock_class_key per expansion site and passes it,
 * together with the stringified argument as the lock name, to
 * __init_waitqueue_head().
 */
#define init_waitqueue_head(wq_head)            \
  do {                  \
    static struct lock_class_key __key;       \
                    \
    __init_waitqueue_head((wq_head), #wq_head, &__key);   \
  } while (0)

函数 __init_waitqueue_head() 定义如下:

/// kernel/sched/wait.c
/*
 * __init_waitqueue_head - set up the lock and the empty list of a head.
 * @wq_head: wait-queue head to initialise.
 * @name:    lock name handed to lockdep.
 * @key:     lock class key handed to lockdep.
 */
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name,
                           struct lock_class_key *key)
{
  spin_lock_init(&wq_head->lock);
  lockdep_set_class_and_name(&wq_head->lock, key, name);
  INIT_LIST_HEAD(&wq_head->head);
}

init_waitqueue_entry() 和 init_waitqueue_func_entry() 两个函数用于初始化 wait_queue_entry,不同的是后者可以指定 wait_queue_func_t 函数。

/// include/linux/wait.h
/*
 * Prepare a wait-queue entry for task @p: cleared flags and
 * default_wake_function() as the wake callback.  The list linkage is
 * not touched here.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, 
                                        struct task_struct *p)
{
  wq_entry->func    = default_wake_function;
  wq_entry->private = p;
  wq_entry->flags   = 0;
}

/*
 * Prepare a wait-queue entry that is woken through the caller-supplied
 * callback @func; no task is attached (private is NULL) and the flags
 * are cleared.  The list linkage is not touched here.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
  wq_entry->func    = func;
  wq_entry->private = NULL;
  wq_entry->flags   = 0;
}

6.1、wait_event

在 pipe_read()/pipe_write() 函数中,当 pipe 中没有数据或者没有空间时,使用 wait_event_interruptible_exclusive() 等待可读或者可写事件发生。

/// include/linux/wait.h
/*
 * Slow path of wait_event_interruptible_exclusive(): runs the generic
 * ___wait_event() loop in TASK_INTERRUPTIBLE state as an exclusive
 * waiter, with an initial result of 0 and schedule() as the sleep step.
 */
#define __wait_event_interruptible_exclusive(wq, condition)     \
  ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,      \
          schedule())

/*
 * wait_event_interruptible_exclusive - sleep until @condition is true.
 * Evaluates to 0 when the condition is (or becomes) true; otherwise to
 * the value produced by the slow path when it is interrupted.
 * The condition is tested once up front so the slow path is skipped
 * when no waiting is needed.
 */
#define wait_event_interruptible_exclusive(wq, condition)     \
({                    \
  int __ret = 0;                \
  might_sleep();                \
  if (!(condition))             \
    __ret = __wait_event_interruptible_exclusive(wq, condition);  \
  __ret;                  \
})

___wait_event 也是宏定义:

  • 1)调用 init_wait_entry() 函数初始化 wait_queue_entry

  • 2)调用 prepare_to_wait_event() 将 wait_queue_entry 添加到等待队列中,然后将当前进程切换为 state 的状态,比如 TASK_INTERRUPTIBLE

  • 3)finish_wait() 将当前进程切换为 RUNNING 状态

/// include/linux/wait.h
/*
 * ___wait_event - generic wait loop behind the wait_event*() macros.
 * @wq_head:   wait-queue head to sleep on.
 * @condition: expression re-evaluated after every wakeup.
 * @state:     task state to sleep in.
 * @exclusive: non-zero to queue as a WQ_FLAG_EXCLUSIVE waiter.
 * @ret:       initial value of the result.
 * @cmd:       statement executed to actually sleep (e.g. schedule()).
 *
 * Each iteration queues the entry and sets the task state through
 * prepare_to_wait_event(), then checks the condition.  When the state is
 * interruptible and prepare_to_wait_event() returned non-zero, the loop
 * exits with that value and skips finish_wait().
 */
#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)   \
({                    \
  __label__ __out;              \
  struct wait_queue_entry __wq_entry;         \
  long __ret = ret; /* explicit shadow */       \
                    \
  init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);  \
  for (;;) {                \
    long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
                    \
    if (condition)              \
      break;              \
                    \
    if (___wait_is_interruptible(state) && __int) {     \
      __ret = __int;            \
      goto __out;           \
    }               \
                    \
    cmd;                \
  }                 \
  finish_wait(&wq_head, &__wq_entry);         \
__out:  __ret;                  \
})

init_wait_entry() 定义如下:private 指向当前进程,func 指向 autoremove_wake_function() 函数。

/// kernel/sched/wait.c
/*
 * Initialise a wait-queue entry for the current task: unlinked list
 * node, autoremove_wake_function() as the wake callback, and the
 * caller-supplied @flags.
 */
void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
{
  INIT_LIST_HEAD(&wq_entry->entry);
  wq_entry->func = autoremove_wake_function;
  wq_entry->private = current;
  wq_entry->flags = flags;
}

prepare_to_wait_event() 和 finish_wait() 函数定义如下:

/// kernel/sched/wait.c
/*
 * prepare_to_wait_event - queue @wq_entry on @wq_head and set the task state.
 * @wq_head:  wait-queue head to queue on.
 * @wq_entry: the caller's entry.
 * @state:    task state to switch to before sleeping.
 *
 * Returns 0 after queueing the entry (if it was not queued already) and
 * setting the state, or -ERESTARTSYS when a signal is pending for
 * @state; in that case the entry is removed from the queue instead.
 */
long prepare_to_wait_event(struct wait_queue_head *wq_head,
                           struct wait_queue_entry *wq_entry, int state)
{
  unsigned long flags;
  long ret = 0;

  spin_lock_irqsave(&wq_head->lock, flags);
  if (signal_pending_state(state, current)) { // a signal is pending
    /*
     * Exclusive waiter must not fail if it was selected by wakeup,
     * it should "consume" the condition we were waiting for.
     *
     * The caller will recheck the condition and return success if
     * we were already woken up, we can not miss the event because
     * wakeup locks/unlocks the same wq_head->lock.
     *
     * But we need to ensure that set-condition + wakeup after that
     * can't see us, it should wake up another exclusive waiter if
     * we fail.
     */
    list_del_init(&wq_entry->entry);
    ret = -ERESTARTSYS;
  } else {
    if (list_empty(&wq_entry->entry)) {
      // not queued yet: exclusive waiters go to the tail, others to the head
      if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
        __add_wait_queue_entry_tail(wq_head, wq_entry);
      else
        __add_wait_queue(wq_head, wq_entry);
    }
    set_current_state(state); // set the task state
  }
  spin_unlock_irqrestore(&wq_head->lock, flags);

  return ret;
}

/*
 * finish_wait - leave the wait loop.
 * @wq_head:  wait-queue head the entry may still be queued on.
 * @wq_entry: the caller's entry.
 *
 * Puts the task back into TASK_RUNNING and unlinks the entry if the
 * waker has not already removed it.
 */
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
  unsigned long flags;

  __set_current_state(TASK_RUNNING); // set the task state
  /*
   * We can check for list emptiness outside the lock
   * IFF:
   *  - we use the "careful" check that verifies both
   *    the next and prev pointers, so that there cannot
   *    be any half-pending updates in progress on other
   *    CPU's that we haven't seen yet (and that might
   *    still change the stack area.
   * and
   *  - all other users take the lock (ie we can only
   *    have _one_ other CPU that looks at or modifies
   *    the list).
   */
  if (!list_empty_careful(&wq_entry->entry)) {
    spin_lock_irqsave(&wq_head->lock, flags);
    list_del_init(&wq_entry->entry);
    spin_unlock_irqrestore(&wq_head->lock, flags);
  }
}

6.2、wake_up

当某一事件发生,使用 wake_up 唤醒睡眠的进程。

/// include/linux/wait.h
/*
 * Wake TASK_INTERRUPTIBLE waiters on queue @x through
 * __wake_up_sync_key(), passing the poll event mask @m as the key.
 */
#define wake_up_interruptible_sync_poll(x, m)         \
  __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

__wake_up_sync_key() 函数直接调用 __wake_up_common_lock() 函数。

bookmark 用于标记遍历位置:wq_head 是一个双向链表,当一次遍历的元素过多时,__wake_up_common() 会在当前位置插入 bookmark 后返回,__wake_up_common_lock() 释放并重新获取锁后,再从 bookmark 处继续遍历。

/// kernel/sched/wait.c
/*
 * __wake_up_sync_key - synchronous wakeup of waiters on @wq_head.
 * @wq_head: wait-queue head; a NULL head is silently ignored.
 * @mode:    task-state mask handed to the wake callbacks.
 * @key:     opaque value handed to the wake callbacks.
 *
 * Wakes at most one exclusive waiter (nr_exclusive = 1) and passes
 * WF_SYNC as the wake flag.
 */
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
      void *key)
{
  if (unlikely(!wq_head))
    return;

  __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
}

/*
 * __wake_up_common_lock - run __wake_up_common() under wq_head->lock.
 *
 * A local bookmark entry is passed down.  Whenever __wake_up_common()
 * leaves the bookmark queued (WQ_FLAG_BOOKMARK set), the lock is dropped
 * and re-taken and the walk is resumed, carrying over the remaining
 * nr_exclusive count.
 */
static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
      int nr_exclusive, int wake_flags, void *key)
{
  unsigned long flags;
  wait_queue_entry_t bookmark;

  /* The bookmark is a placeholder entry: no task and no callback. */
  bookmark.flags = 0;
  bookmark.private = NULL;
  bookmark.func = NULL;
  INIT_LIST_HEAD(&bookmark.entry);

  do {
    spin_lock_irqsave(&wq_head->lock, flags);
    nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
            wake_flags, key, &bookmark);
    spin_unlock_irqrestore(&wq_head->lock, flags);
  } while (bookmark.flags & WQ_FLAG_BOOKMARK);
}

__wake_up_common() 函数遍历等待队列 wq_head。每次遍历超过 WAITQUEUE_WALK_BREAK_CNT 个元素且队列尚未遍历完时,插入 bookmark 后返回。每个元素都调用 func 指向的函数。

/// kernel/sched/wait.c
/*
 * __wake_up_common - walk the wait queue and call each entry's callback.
 * @wq_head:      wait-queue head; its lock must be held by the caller.
 * @mode:         task-state mask handed to the callbacks.
 * @nr_exclusive: how many exclusive waiters may be woken.
 * @wake_flags:   flags handed to the callbacks.
 * @key:          opaque value handed to the callbacks.
 * @bookmark:     optional placeholder used to pause and resume the walk.
 *
 * The walk stops when a callback returns a negative value, when the
 * allowed number of exclusive waiters has been woken, or after more than
 * WAITQUEUE_WALK_BREAK_CNT entries were visited with entries remaining;
 * in the last case the bookmark is inserted so the caller can resume.
 *
 * Returns the remaining nr_exclusive count.
 */
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
      int nr_exclusive, int wake_flags, void *key,
      wait_queue_entry_t *bookmark)
{
  wait_queue_entry_t *curr, *next;
  int cnt = 0;

  lockdep_assert_held(&wq_head->lock);

  /* Resume after a queued bookmark, otherwise start at the first entry. */
  if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
    curr = list_next_entry(bookmark, entry);

    list_del(&bookmark->entry);
    bookmark->flags = 0;
  } else
    curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);

  /* Nothing (left) to walk. */
  if (&curr->entry == &wq_head->head)
    return nr_exclusive;

  list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
    unsigned flags = curr->flags;
    int ret;

    /* Skip bookmark placeholders found on the list. */
    if (flags & WQ_FLAG_BOOKMARK)
      continue;

    ret = curr->func(curr, mode, wake_flags, key);
    if (ret < 0)
      break;
    if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
      break;

    if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
        (&next->entry != &wq_head->head)) {
      bookmark->flags = WQ_FLAG_BOOKMARK;
      list_add_tail(&bookmark->entry, &next->entry);
      break;
    }
  }

  return nr_exclusive;
}

6.3、autoremove_wake_function

init_wait_entry() 函数使用到了 autoremove_wake_function() 函数

  • 1)调用 default_wake_function() 函数唤醒进程

  • 2)将 entry 从等待队列删除。

/// kernel/sched/wait.c
/*
 * Wake callback that also unlinks the wait entry on success.
 *
 * Forwards to default_wake_function(); when that reports a non-zero result
 * the entry is removed from its queue with list_del_init_careful().
 * Returns whatever default_wake_function() returned.
 */
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode,
                             int sync, void *key)
{
  int woken = default_wake_function(wq_entry, mode, sync, key);

  /* Wakeup did not happen: leave the entry queued. */
  if (!woken)
    return woken;

  list_del_init_careful(&wq_entry->entry);
  return woken;
}

default_wake_function() 调用 try_to_wake_up() 函数。

/// kernel/sched/core.c
/*
 * Default wait-queue wake callback: wake the task stored in curr->private.
 *
 * @mode is passed to try_to_wake_up() as the state mask and @wake_flags is
 * forwarded unchanged; @key is not used. With CONFIG_SCHED_DEBUG enabled, a
 * one-time warning fires if @wake_flags has any bit set other than WF_SYNC.
 * Returns the result of try_to_wake_up().
 */
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
        void *key)
{
  struct task_struct *task = curr->private;

  WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);

  return try_to_wake_up(task, mode, wake_flags);
}

try_to_wake_up() 函数承担唤醒的主要工作,逻辑比较复杂,其中关键的一步是切换进程状态(将 p->state 置为 TASK_RUNNING 或 TASK_WAKING)。

/// kernel/sched/core.c
/*
 * try_to_wake_up - wake up task @p if it is in one of the states in @state.
 * @p:          the task to wake
 * @state:      mask of task states that may be woken; tested against p->state
 * @wake_flags: wake modifier flags (WF_*), forwarded to the ttwu_* helpers
 *
 * Runs with preemption disabled. When @p is the current task the state is
 * switched to TASK_RUNNING without taking any lock; otherwise p->pi_lock is
 * held while the state is checked and the task is made runnable.
 *
 * Return: 1 if p->state matched @state (a wakeup was performed or the task
 * was handed to one of the ttwu_* paths), 0 if the state did not match.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
  unsigned long flags;
  int cpu, success = 0;

  preempt_disable();
  if (p == current) {
    /*
     * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
     * == smp_processor_id()'. Together this means we can special
     * case the whole 'p->on_rq && ttwu_runnable()' case below
     * without taking any locks.
     *
     * In particular:
     *  - we rely on Program-Order guarantees for all the ordering,
     *  - we're serialized against set_special_state() by virtue of
     *    it disabling IRQs (this allows not taking ->pi_lock).
     */
    if (!(p->state & state))
      goto out;

    success = 1;
    trace_sched_waking(p);
    p->state = TASK_RUNNING; // switch the task state
    trace_sched_wakeup(p);
    goto out;
  }

  /*
   * If we are going to wake up a thread waiting for CONDITION we
   * need to ensure that CONDITION=1 done by the caller can not be
   * reordered with p->state check below. This pairs with smp_store_mb()
   * in set_current_state() that the waiting thread does.
   */
  raw_spin_lock_irqsave(&p->pi_lock, flags);
  smp_mb__after_spinlock();
  /* State does not match the requested mask: nothing to wake. */
  if (!(p->state & state))
    goto unlock;

  trace_sched_waking(p);

  /* We're going to change ->state: */
  success = 1;

  /*
   * Ensure we load p->on_rq _after_ p->state, otherwise it would
   * be possible to, falsely, observe p->on_rq == 0 and get stuck
   * in smp_cond_load_acquire() below.
   *
   * sched_ttwu_pending()     try_to_wake_up()
   *   STORE p->on_rq = 1       LOAD p->state
   *   UNLOCK rq->lock
   *
   * __schedule() (switch to task 'p')
   *   LOCK rq->lock        smp_rmb();
   *   smp_mb__after_spinlock();
   *   UNLOCK rq->lock
   *
   * [task p]
   *   STORE p->state = UNINTERRUPTIBLE   LOAD p->on_rq
   *
   * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
   * __schedule().  See the comment for smp_mb__after_spinlock().
   *
   * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
   */
  smp_rmb();
  if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
    goto unlock;

#ifdef CONFIG_SMP
  /*
   * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
   * possible to, falsely, observe p->on_cpu == 0.
   *
   * One must be running (->on_cpu == 1) in order to remove oneself
   * from the runqueue.
   *
   * __schedule() (switch to task 'p')  try_to_wake_up()
   *   STORE p->on_cpu = 1      LOAD p->on_rq
   *   UNLOCK rq->lock
   *
   * __schedule() (put 'p' to sleep)
   *   LOCK rq->lock        smp_rmb();
   *   smp_mb__after_spinlock();
   *   STORE p->on_rq = 0       LOAD p->on_cpu
   *
   * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
   * __schedule().  See the comment for smp_mb__after_spinlock().
   *
   * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
   * schedule()'s deactivate_task() has 'happened' and p will no longer
   * care about it's own p->state. See the comment in __schedule().
   */
  smp_acquire__after_ctrl_dep();

  /*
   * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
   * == 0), which means we need to do an enqueue, change p->state to
   * TASK_WAKING such that we can unlock p->pi_lock before doing the
   * enqueue, such as ttwu_queue_wakelist().
   */
  p->state = TASK_WAKING;

  /*
   * If the owning (remote) CPU is still in the middle of schedule() with
   * this task as prev, considering queueing p on the remote CPUs wake_list
   * which potentially sends an IPI instead of spinning on p->on_cpu to
   * let the waker make forward progress. This is safe because IRQs are
   * disabled and the IPI will deliver after on_cpu is cleared.
   *
   * Ensure we load task_cpu(p) after p->on_cpu:
   *
   * set_task_cpu(p, cpu);
   *   STORE p->cpu = @cpu
   * __schedule() (switch to task 'p')
   *   LOCK rq->lock
   *   smp_mb__after_spin_lock()    smp_cond_load_acquire(&p->on_cpu)
   *   STORE p->on_cpu = 1    LOAD p->cpu
   *
   * to ensure we observe the correct CPU on which the task is currently
   * scheduling.
   */
  if (smp_load_acquire(&p->on_cpu) &&
      ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
    goto unlock;

  /*
   * If the owning (remote) CPU is still in the middle of schedule() with
   * this task as prev, wait until its done referencing the task.
   *
   * Pairs with the smp_store_release() in finish_task().
   *
   * This ensures that tasks getting woken will be fully ordered against
   * their previous state and preserve Program Order.
   */
  smp_cond_load_acquire(&p->on_cpu, !VAL);

  /* Pick the CPU to run on; the block below handles a change of CPU. */
  cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
  if (task_cpu(p) != cpu) {
    if (p->in_iowait) {
      delayacct_blkio_end(p);
      atomic_dec(&task_rq(p)->nr_iowait);
    }

    wake_flags |= WF_MIGRATED;
    psi_ttwu_dequeue(p);
    set_task_cpu(p, cpu);
  }
#else
  cpu = task_cpu(p);
#endif /* CONFIG_SMP */

  ttwu_queue(p, cpu, wake_flags);
unlock:
  raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
  /* Statistics are recorded only when the state matched (success == 1). */
  if (success)
    ttwu_stat(p, task_cpu(p), wake_flags);
  preempt_enable();

  return success;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值