目录
本文基于 Linux-5.10 版本分析,借助 pipe 分析等待队列,为下篇文章分析 select/poll/epoll 的实现做准备。
pipe 是 Linux 系统中一种比较常用的进程间通信方法,不过使用 pipe 通信的两个进程必须有亲缘关系,通常是父子进程。
pipe 和 pipe2 返回成功时,pipefd 保存两个文件描述符:pipefd[0] 用于读,pipefd[1] 用于写。
#include <unistd.h>
#include <fcntl.h> /* O_* constants */
int pipe(int pipefd[2]);
int pipe2(int pipefd[2], int flags);
1、pipefs
Linux 一切皆文件,pipe 也属于一个(伪)文件系统,名字为 pipefs,在启动的时候注册到 Linux 系统中。
/// fs/pipe.c
/* Pseudo filesystem type for pipes; registered and mounted by init_pipe_fs(). */
static struct file_system_type pipe_fs_type = {
	.kill_sb		= kill_anon_super,
	.init_fs_context	= pipefs_init_fs_context,
	.name			= "pipefs",
};
/*
 * Register pipefs and mount it inside the kernel; the resulting mount is
 * stored in pipe_mnt.
 *
 * Returns 0 on success or a negative errno. If mounting fails the
 * filesystem registration is rolled back.
 */
static int __init init_pipe_fs(void)
{
	int err;

	err = register_filesystem(&pipe_fs_type);
	if (err)
		return err;

	pipe_mnt = kern_mount(&pipe_fs_type);
	if (!IS_ERR(pipe_mnt))
		return 0;

	/* Mount failed: undo the registration before reporting the error. */
	err = PTR_ERR(pipe_mnt);
	unregister_filesystem(&pipe_fs_type);
	return err;
}

fs_initcall(init_pipe_fs);
当调用 pipe() 或者 pipe2() 时,kernel 都是调用 do_pipe2() 处理。
/// fs/pipe.c
/*
 * pipe2() system call entry: forwards the user buffer and the caller's flags
 * to do_pipe2() and returns its result.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
return do_pipe2(fildes, flags);
}
/*
 * pipe() system call entry: identical to pipe2() with flags == 0.
 */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
return do_pipe2(fildes, 0);
}
do_pipe2() 调用 __do_pipe_flags() 创建两个 struct file 对象,分别对应于两个 fd,然后返回。
/// fs/pipe.c
/*
 * Common implementation of pipe() and pipe2().
 *
 * @fildes: user-space array of two ints that receives the descriptors
 * @flags:  flags validated by __do_pipe_flags()
 *
 * The descriptors are only installed after they have been copied to user
 * space successfully, so a faulting @fildes never leaves live fds behind.
 *
 * Returns 0 on success, -EFAULT if @fildes cannot be written, or the error
 * from __do_pipe_flags().
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (error)
		return error;

	if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
		/* Nothing was installed yet: drop both files and both reserved fds. */
		fput(files[0]);
		fput(files[1]);
		put_unused_fd(fd[0]);
		put_unused_fd(fd[1]);
		return -EFAULT;
	}

	fd_install(fd[0], files[0]);
	fd_install(fd[1], files[1]);
	return 0;
}
__do_pipe_flags() 承担 pipe 创建的主要工作:
-
1)调用 create_pipe_files() 函数创建两个 struct file 对象
-
2)获取两个未使用的文件描述符 fd
/// fs/pipe.c
/*
 * Create the two pipe files and reserve a descriptor for each of them.
 *
 * @fd:    out: fd[0] is the read descriptor, fd[1] the write descriptor
 * @files: out: the matching struct file pointers from create_pipe_files()
 * @flags: only O_CLOEXEC, O_NONBLOCK, O_DIRECT and O_NOTIFICATION_PIPE
 *         are accepted
 *
 * The descriptors are reserved but not installed; that is left to the caller.
 *
 * Returns 0 on success, -EINVAL for unsupported flags, or the error from
 * create_pipe_files() / get_unused_fd_flags().
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int rfd, wfd;
	int ret;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	ret = create_pipe_files(files, flags);
	if (ret)
		return ret;

	rfd = get_unused_fd_flags(flags);
	if (rfd < 0) {
		ret = rfd;
		goto out_put_files;
	}

	wfd = get_unused_fd_flags(flags);
	if (wfd < 0) {
		ret = wfd;
		goto out_put_rfd;
	}

	audit_fd_pair(rfd, wfd);
	fd[0] = rfd;
	fd[1] = wfd;
	return 0;

out_put_rfd:
	put_unused_fd(rfd);
out_put_files:
	fput(files[0]);
	fput(files[1]);
	return ret;
}
pipe 文件系统,file_operations 定义如下:包含了读写函数 pipe_read() 和 pipe_write()
/// fs/pipe.c
/*
 * File operations used for pipe files; create_pipe_files() passes this table
 * for both ends and get_pipe_inode() stores it in inode->i_fop.
 */
const struct file_operations pipefifo_fops = {
	/* lifetime */
	.open		= fifo_open,
	.release	= pipe_release,

	/* data path */
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.splice_write	= iter_file_splice_write,
	.llseek		= no_llseek,

	/* readiness notification and control */
	.poll		= pipe_poll,
	.fasync		= pipe_fasync,
	.unlocked_ioctl	= pipe_ioctl,
};
2、pipe_inode_info
2.1、pipe_inode_info
pipe_inode_info 是 pipe 实现的核心,保存在 file 的 private_data 中。
完整的定义如下,介绍几个重要的成员
-
1)rd_wait/wr_wait:读和写等待队列。当 pipe 可读或者可写时,会唤醒阻塞在这两队列上的进程;
-
2)head/tail:分别表示 pipe 数据生产者和消费者的位置,与 (ring_size - 1) 相与之后得到循环队列 bufs 的下标;修改 head/tail 时需要持有 rd_wait.lock 自旋锁;
-
3)max_usage/ring_size:ring_size 是循环队列 bufs 的槽位总数,max_usage 是允许使用的槽位数上限(pipe_full() 用它判断是否已满);两者默认都是 16;
/// include/linux/pipe_fs_i.h
/*
 * struct pipe_inode_info - per-pipe state, stored in inode->i_pipe and in
 * file->private_data of both pipe files.
 *
 * mutex:          initialised by alloc_pipe_info().
 *                 NOTE(review): presumably the lock taken by __pipe_lock() —
 *                 confirm against the helper's definition.
 * rd_wait:        wait queue readers sleep on in pipe_read(); its spinlock is
 *                 also held while head or tail is updated.
 * wr_wait:        wait queue writers sleep on in pipe_write().
 * head:           producer position; advanced by pipe_write().
 * tail:           consumer position; advanced by pipe_read().
 * max_usage:      slot limit passed to pipe_full().
 * ring_size:      number of slots in bufs; (ring_size - 1) is used as the
 *                 index mask for head and tail.
 * note_loss:      set by pipe_read() when it releases a buffer flagged
 *                 PIPE_BUF_FLAG_LOSS (CONFIG_WATCH_QUEUE only).
 * nr_accounted:   number of buffers charged to user by alloc_pipe_info().
 * readers:        reader count; pipe_write() fails with -EPIPE when it is 0.
 * writers:        writer count; pipe_read() stops waiting when it is 0.
 * files:          set to 2 by get_pipe_inode().
 *                 NOTE(review): presumably the number of struct file objects
 *                 referencing this pipe — confirm.
 * r_counter:      initialised to 1 by alloc_pipe_info().
 * w_counter:      initialised to 1; compared with filp->f_version in
 *                 pipe_poll() when deciding whether to report EPOLLHUP.
 * poll_usage:     set by pipe_poll(); makes pipe_write() wake readers even if
 *                 the pipe was not empty.
 * tmp_page:       spare page cached by pipe_write() for the next buffer.
 * fasync_readers: fasync list signalled with POLL_IN by pipe_write().
 * fasync_writers: fasync list signalled with POLL_OUT by pipe_read().
 * bufs:           the ring of pipe_buffer slots.
 * user:           the user the buffers are accounted to.
 * watch_queue:    CONFIG_WATCH_QUEUE only.
 *                 NOTE(review): presumably set up by watch_queue_init() for
 *                 O_NOTIFICATION_PIPE pipes — confirm.
 */
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
bool poll_usage;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};
pipe_buffer 保存 pipe 数据,保存在 page 指向的页面。
-
offset:数据在页面的起始偏移
-
len:数据长度。
/// include/linux/pipe_fs_i.h
/*
 * struct pipe_buffer - one slot of the pipe ring.
 *
 * page:    page that holds the data of this slot.
 * offset:  offset of the first unread byte inside page.
 * len:     number of unread bytes starting at offset.
 * ops:     operations for this buffer (see struct pipe_buf_operations).
 * flags:   PIPE_BUF_FLAG_* bits, e.g. PIPE_BUF_FLAG_CAN_MERGE or
 *          PIPE_BUF_FLAG_PACKET as set by pipe_write().
 * private: NOTE(review): not used by the code shown here; presumably private
 *          data owned by ops — confirm.
 */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};
/*
 * struct pipe_buf_operations - callbacks attached to a pipe_buffer through
 * its ops pointer. Each hook receives the owning pipe and the buffer.
 */
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);
/*
* Get a reference to the pipe buffer.
*/
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};
pipe 使用循环队列保存数据,队列由 bufs 指向。队列大小默认 16,队列元素是 pipe_buffer 结构。
通过 head 和 tail 可以很方便地判断 pipe 是否为空,是否满,或者计算还有多少个 buffer 可以供用户填充数据。
head/tail 不是直接对应 bufs 队列的下标,在使用时,需要结合 ring_size 转换为下标(后面代码部分会讲解,转换非常简单)。
/// include/linux/pipe_fs_i.h
/**
 * pipe_empty - Tell whether the pipe ring holds no buffers
 * @head: The pipe ring head pointer (producer position)
 * @tail: The pipe ring tail pointer (consumer position)
 *
 * Return: true when no slot lies between @tail and @head.
 */
static inline bool pipe_empty(unsigned int head, unsigned int tail)
{
	/* Zero occupancy is the same thing as head == tail for unsigned ints. */
	return head - tail == 0;
}
/**
 * pipe_occupancy - Count the slots currently in use in the pipe ring
 * @head: The pipe ring head pointer (producer position)
 * @tail: The pipe ring tail pointer (consumer position)
 *
 * Return: the number of slots between @tail and @head.
 */
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
	/* Unsigned subtraction stays correct after head has wrapped around. */
	const unsigned int used = head - tail;

	return used;
}
/**
 * pipe_full - Tell whether the pipe ring has reached its usage limit
 * @head: The pipe ring head pointer (producer position)
 * @tail: The pipe ring tail pointer (consumer position)
 * @limit: The maximum number of slots that may be in use
 *
 * Return: true when the occupancy is at or above @limit.
 */
static inline bool pipe_full(unsigned int head, unsigned int tail,
			     unsigned int limit)
{
	const unsigned int used = pipe_occupancy(head, tail);

	if (used < limit)
		return false;
	return true;
}
2.2、create_pipe_files()
上文讲到,create_pipe_files() 会创建两个 file 对象。逻辑如下:
-
1)调用 get_pipe_inode() 创建一个 inode
-
2)调用 alloc_file_pseudo() 分配一个 file
-
3)调用 alloc_file_clone() 复制一个 file
可以看到,调用 alloc_file_pseudo() 时使用的是 O_WRONLY,表示只能写,申请到的 file 保存在 res[1]。调用 alloc_file_clone() 复制已经申请的 file 时,使用的是 O_RDONLY,表示只能读,返回的 file 保存在 res[0]。
另外,file::private_data 被赋值为 inode->i_pipe,其实就是 pipe_inode_info 结构。
最后,file 对应的操作函数是 pipefifo_fops。
/// fs/pipe.c
int create_pipe_files(struct file **res, int flags)
{
struct inode *inode = get_pipe_inode();
struct file *f;
int error;
if (!inode)
return -ENFILE;
if (flags & O_NOTIFICATION_PIPE) {
error = watch_queue_init(inode->i_pipe);
if (error) {
free_pipe_info(inode->i_pipe);
iput(inode);
return error;
}
}
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
&pipefifo_fops);
if (IS_ERR(f)) {
free_pipe_info(inode->i_pipe);
iput(inode);
return PTR_ERR(f);
}
f->private_data = inode->i_pipe;
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
&pipefifo_fops);
if (IS_ERR(res[0])) {
put_pipe_info(inode, inode->i_pipe);
fput(f);
return PTR_ERR(res[0]);
}
res[0]->private_data = inode->i_pipe;
res[1] = f;
stream_open(inode, res[0]);
stream_open(inode, res[1]);
return 0;
}
get_pipe_inode() 返回一个 inode 对象:
-
1)调用 new_inode_pseudo() 函数返回一个 inode
-
2)调用 alloc_pipe_info() 函数返回一个 pipe_inode_info,将其保存在 inode->i_pipe 中。
-
3)inode->i_fop 赋值为 pipefifo_fops
/// fs/pipe.c
/*
 * Allocate an inode on the pipefs superblock and attach a freshly allocated
 * pipe_inode_info to it.
 *
 * The pipe starts with one reader, one writer and files == 2.
 *
 * Returns the inode, or NULL if either allocation fails.
 */
static struct inode * get_pipe_inode(void)
{
	struct pipe_inode_info *pipe;
	struct inode *inode;

	inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	if (!inode)
		return NULL;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe) {
		iput(inode);
		return NULL;
	}

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = 1;
	pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;
}
2.3、alloc_pipe_info()
从 alloc_pipe_info() 函数可以看到 pipe_inode_info 结构是如何初始化的。首先说一下几个变量
-
1)PIPE_DEF_BUFFERS:循环队列默认大小,为 16
-
2)pipe_max_size:表示 pipe 最大 size,默认为 1048576 字节,也就是 1024 KB,或者 1MB。
-
3)PIPE_MIN_DEF_BUFFERS:循环队列最小大小,为 2
too_many_pipe_buffers_soft() 会判断当前 user 是否创建了太多的 pipe,如果是,循环队列大小被限制到 PIPE_MIN_DEF_BUFFERS。
pipe_inode_info 对象创建成功后,就给其成员赋值。阻塞队列头部 rd_wait/wr_wait 也被初始化。
/// fs/pipe.c
/*
 * Allocate a pipe_inode_info together with its ring of pipe_buffer slots and
 * charge the slots to the current user.
 *
 * The ring starts at PIPE_DEF_BUFFERS slots, is clamped to pipe_max_size for
 * callers without CAP_SYS_RESOURCE, and shrinks to PIPE_MIN_DEF_BUFFERS when
 * an unprivileged user is over the soft limit.
 *
 * Returns the initialised pipe, or NULL on allocation failure or when an
 * unprivileged user is over the hard limit.
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long nr_slots = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (!pipe)
		goto out_free_uid;

	if (nr_slots * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		nr_slots = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, nr_slots);

	/* Over the soft limit: re-account with the minimum ring size. */
	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, nr_slots, PIPE_MIN_DEF_BUFFERS);
		nr_slots = PIPE_MIN_DEF_BUFFERS;
	}

	/* Over the hard limit: give the accounting back and fail. */
	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(nr_slots, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);
	if (!pipe->bufs)
		goto out_revert_acct;

	init_waitqueue_head(&pipe->rd_wait);
	init_waitqueue_head(&pipe->wr_wait);
	pipe->r_counter = pipe->w_counter = 1;
	pipe->max_usage = nr_slots;
	pipe->ring_size = nr_slots;
	pipe->nr_accounted = nr_slots;
	pipe->user = user;
	mutex_init(&pipe->mutex);
	return pipe;

out_revert_acct:
	(void) account_pipe_buffers(user, nr_slots, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}
3、pipe_readable/pipe_read
pipe_readable() 表示 pipe 可读(读操作不需要继续等待),满足以下任一条件即可:
-
1)pipe 非空
-
2)pipe 已经没有 writer(writers 为 0),此时继续等待也不会有新数据
/// fs/pipe.c
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
unsigned int head = READ_ONCE(pipe->head);
unsigned int tail = READ_ONCE(pipe->tail);
unsigned int writers = READ_ONCE(pipe->writers);
return !pipe_empty(head, tail) || !writers;
}
pipe_read() 负责从 pipe 中读出数据。如果 pipe 保存有数据,就从 tail 标志的 pipe_buffer 开始读取数据,直到读取到足够的数据,或者 pipe 没有数据。
从 tail 到 bufs 下标转换也是比较简单:1)mask = pipe->ring_size - 1;2)tail & mask 就是下标。可以看到 ring_size 一定是 2^N 大小。
/// fs/pipe.c
/*
 * pipe_read - read_iter handler for pipes: copy data from the ring into @to.
 *
 * Consumes buffers starting at pipe->tail until @to is full or the ring is
 * empty. If nothing has been read yet, writers still exist and the file is
 * not O_NONBLOCK, the caller sleeps on pipe->rd_wait until pipe_readable()
 * holds and then retries.
 *
 * Returns the number of bytes copied; 0 for a zero-length read or when the
 * pipe is empty and has no writers; or, if no byte was copied, a negative
 * errno (-EAGAIN, -EFAULT, -ENOBUFS, the pipe_buf_confirm() error, or
 * -ERESTARTSYS when the sleep is interrupted).
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
size_t total_len = iov_iter_count(to); // number of bytes the caller wants to read
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
bool was_full, wake_next_reader = false;
ssize_t ret;
/* Null read succeeds. */
if (unlikely(total_len == 0))
return 0;
ret = 0;
__pipe_lock(pipe);
/*
* We only wake up writers if the pipe was full when we started
* reading in order to avoid unnecessary wakeups.
*
* But when we do wake up writers, we do so using a sync wakeup
* (WF_SYNC), because we want them to get going and generate more
* data for us.
*/
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
for (;;) {
/* Read ->head with a barrier vs post_one_notification() */
unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
#ifdef CONFIG_WATCH_QUEUE
/// ...
#endif
if (!pipe_empty(head, tail)) { // the pipe holds data
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
if (chars > total_len) {
if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
if (ret == 0)
ret = -ENOBUFS;
break;
}
chars = total_len;
}
error = pipe_buf_confirm(pipe, buf);
if (error) {
if (!ret)
ret = error;
break;
}
// copy chars bytes starting at buf->offset inside the page
written = copy_page_to_iter(buf->page, buf->offset, chars, to);
if (unlikely(written < chars)) {
if (!ret)
ret = -EFAULT;
break;
}
ret += chars;
buf->offset += chars;
buf->len -= chars;
/* Was it a packet buffer? Clean up and exit */
if (buf->flags & PIPE_BUF_FLAG_PACKET) {
total_len = chars;
buf->len = 0;
}
if (!buf->len) { // buffer fully consumed: release it and advance tail
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
if (buf->flags & PIPE_BUF_FLAG_LOSS)
pipe->note_loss = true;
#endif
tail++;
pipe->tail = tail;
spin_unlock_irq(&pipe->rd_wait.lock);
}
total_len -= chars;
if (!total_len) // the destination is full: done
break; /* common path: read succeeded */
if (!pipe_empty(head, tail)) /* More to do? */
continue; // keep reading from the next buffer
} // end !pipe_empty
if (!pipe->writers) // no writers left, so no more data can arrive
break;
if (ret)
break;
if (filp->f_flags & O_NONBLOCK) { // non-blocking: report -EAGAIN
ret = -EAGAIN;
break;
}
__pipe_unlock(pipe);
/*
* We only get here if we didn't actually read anything.
*
* However, we could have seen (and removed) a zero-sized
* pipe buffer, and might have made space in the buffers
* that way.
*
* You can't make zero-sized pipe buffers by doing an empty
* write (not even in packet mode), but they can happen if
* the writer gets an EFAULT when trying to fill a buffer
* that already got allocated and inserted in the buffer
* array.
*
* So we still need to wake up any pending writers in the
* _very_ unlikely case that the pipe was full, but we got
* no data.
*/
if (unlikely(was_full)) // wake up a writer
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
/*
* But because we didn't read anything, at this point we can
* just return directly with -ERESTARTSYS if we're interrupted,
* since we've done any required wakeups and there's no need
* to mark anything accessed. And we've dropped the lock.
*/
// sleep until the pipe is readable; return right away if interrupted
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;
__pipe_lock(pipe);
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true;
} // end for
if (pipe_empty(pipe->head, pipe->tail))
wake_next_reader = false;
__pipe_unlock(pipe);
if (was_full)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
if (ret > 0)
file_accessed(filp);
return ret;
}
wake_up_interruptible_sync_poll/wait_event_interruptible_exclusive 后面讲解。
4、pipe_writable/pipe_write
pipe_writable 表示 pipe 可写(写操作不需要继续等待),满足以下任一条件即可:
-
1)pipe 未满
-
2)pipe 已经没有 reader(readers 为 0),此时写操作会以 EPIPE 失败,无需等待
/// fs/pipe.c
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
unsigned int head = READ_ONCE(pipe->head);
unsigned int tail = READ_ONCE(pipe->tail);
unsigned int max_usage = READ_ONCE(pipe->max_usage);
return !pipe_full(head, tail, max_usage) ||
!READ_ONCE(pipe->readers);
}
pipe_write() 负责向 pipe 中写入数据。如果 pipe 还有空间,就从 head 标志的 pipe_buffer 开始写入数据,直到写完全部数据,或者 pipe 没有空间。
/// fs/pipe.c
/*
 * pipe_write - write_iter handler for pipes: copy data from @from into the
 * ring.
 *
 * If the pipe is not empty, the sub-page remainder of the write is first
 * appended to the most recently filled buffer, provided that buffer has
 * PIPE_BUF_FLAG_CAN_MERGE and enough room. The rest is written into fresh
 * pages, one ring slot at pipe->head at a time. When the ring is full the
 * function stops (-EAGAIN for O_NONBLOCK, -ERESTARTSYS on a pending signal,
 * both only if nothing was written yet) or sleeps on pipe->wr_wait.
 *
 * With no readers, SIGPIPE is sent to the caller and -EPIPE is returned if
 * nothing was written.
 *
 * Returns the number of bytes written or a negative errno. An error from
 * file_update_time() replaces a positive result.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
bool wake_next_writer = false;
/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;
__pipe_lock(pipe);
if (!pipe->readers) { // no reader: writing is pointless
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}
#ifdef CONFIG_WATCH_QUEUE
/// ...
#endif
/*
* If it wasn't empty we try to merge new data into
* the last buffer.
*
* That naturally merges small writes, but it also
* page-aligns the rest of the writes for large writes
* spanning multiple pages.
*/
head = pipe->head;
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1);
if (chars && !was_empty) { // not empty: try to append to the last used buffer
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}
buf->len += ret;
if (!iov_iter_count(from)) // everything written: done
goto out;
}
}
for (;;) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;
if (!page) { // no cached page: allocate one
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}
/* Allocate a slot in the ring in advance and attach an
* empty buffer. If we fault or otherwise fail to use
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
spin_lock_irq(&pipe->rd_wait.lock);
head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
spin_unlock_irq(&pipe->rd_wait.lock); // ring became full under the spinlock
continue; // retry from the top of the loop
}
pipe->head = head + 1;
spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
pipe->tmp_page = NULL;
// fill the page with up to PAGE_SIZE bytes from the source
copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;
if (!iov_iter_count(from)) // everything written: done
break;
}
if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;
/* Wait for buffer space to become available. */
if (filp->f_flags & O_NONBLOCK) { // non-blocking: report -EAGAIN
if (!ret)
ret = -EAGAIN;
break;
}
if (signal_pending(current)) {
if (!ret)
ret = -ERESTARTSYS;
break;
}
/*
* We're going to release the pipe lock and wait for more
* space. We wake up any readers if necessary, and then
* after waiting we need to re-check whether the pipe
* become empty while we dropped the lock.
*/
__pipe_unlock(pipe);
if (was_empty) // the pipe was empty before this write: wake a reader
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
// sleep until the pipe is writable; the loop re-checks signals afterwards
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(pipe->head, pipe->tail);
wake_next_writer = true;
} // end for
out:
if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
wake_next_writer = false;
__pipe_unlock(pipe);
/*
* If we do do a wakeup event, we do a 'sync' wakeup, because we
* want the reader to start processing things asap, rather than
* leave the data pending.
*
* This is particularly important for small writes, because of
* how (for example) the GNU make jobserver uses small writes to
* wake up pending jobs
*
* Epoll nonsensically wants a wakeup whether the pipe
* was already empty or not.
*/
if (was_empty || pipe->poll_usage)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
int err = file_update_time(filp);
if (err)
ret = err;
sb_end_write(file_inode(filp)->i_sb);
}
return ret;
}
5、pipe_poll
pipe_poll 通过 poll_wait() 将 caller 登记到 rd_wait/wr_wait 等待队列上(pipe_poll 本身并不阻塞),然后返回 file 当前的可读或者可写的状态。
/// fs/pipe.c
/*
 * pipe_poll - poll handler for pipes.
 *
 * Hands rd_wait and/or wr_wait (depending on the file's access mode) to
 * poll_wait(), then samples head/tail without taking the pipe lock and
 * returns the current readiness mask. It also sets pipe->poll_usage, which
 * pipe_write() checks when deciding whether to wake readers.
 */
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
__poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head, tail;
/* Epoll has some historical nasty semantics, this enables them */
WRITE_ONCE(pipe->poll_usage, true);
/*
* Reading pipe state only -- no need for acquiring the semaphore.
*
* But because this is racy, the code has to add the
* entry to the poll table _first_ ..
*/
if (filp->f_mode & FMODE_READ)
poll_wait(filp, &pipe->rd_wait, wait);
if (filp->f_mode & FMODE_WRITE)
poll_wait(filp, &pipe->wr_wait, wait);
/*
* .. and only then can you do the racy tests. That way,
* if something changes and you got it wrong, the poll
* table entry will wake you up and fix it.
*/
head = READ_ONCE(pipe->head);
tail = READ_ONCE(pipe->tail);
mask = 0;
if (filp->f_mode & FMODE_READ) {
if (!pipe_empty(head, tail))
mask |= EPOLLIN | EPOLLRDNORM; // the file is readable
if (!pipe->writers && filp->f_version != pipe->w_counter)
mask |= EPOLLHUP;
}
if (filp->f_mode & FMODE_WRITE) {
if (!pipe_full(head, tail, pipe->max_usage))
mask |= EPOLLOUT | EPOLLWRNORM; // the file is writable
/*
* Most Unices do not set EPOLLERR for FIFOs but on Linux they
* behave exactly like pipes for poll().
*/
if (!pipe->readers)
mask |= EPOLLERR;
}
return mask;
}
poll_wait() 简单地调用 poll_table::_qproc 指向的函数,并把 file 和等待队列头部 wait_address 传给它;具体如何把调用者挂到等待队列上,由 _qproc 的实现(例如 select/poll/epoll 各自的回调)决定。
/// include/linux/poll.h
/*
 * structures and helpers for f_op->poll implementations
 */
/*
 * Callback type stored in poll_table::_qproc. poll_wait() invokes it with
 * the polled file, the wait queue head and the poll table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
struct poll_table_struct *);
/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 *
 * _qproc: callback run by poll_wait(); poll_wait() does nothing when it is
 *         NULL.
 * _key:   NOTE(review): presumably the event mask the poller asked for —
 *         not used by the code shown here, confirm.
 */
typedef struct poll_table_struct {
poll_queue_proc _qproc;
__poll_t _key;
} poll_table;
/*
 * Hand @wait_address to the poll table's queueing callback.
 *
 * Does nothing when @p, its _qproc callback, or @wait_address is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address,
			     poll_table *p)
{
	if (!p)
		return;
	if (!p->_qproc)
		return;
	if (!wait_address)
		return;

	p->_qproc(filp, wait_address, p);
}
6、wait_queue
等待队列使进程可以睡眠(让出 CPU)地等待某一特定的事件发生。当等待的条件为 true 时,进程就会被唤醒,重新参与调度并继续执行。
wait_queue_head_t 定义等待队列头部,wait_queue_entry 定义等待队列元素。wait_queue_entry 结构有一个指针成员 func,进程被唤醒时,func 指向的函数会被调用。
/// include/linux/wait.h
/*
 * Wake-up callback of a wait queue entry. __wake_up_common() calls it as
 * func(entry, mode, wake_flags, key) and inspects the return value:
 * negative stops the walk, non-zero counts as a successful wake-up.
 */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry,
unsigned mode, int flags, void *key);
/*
 * struct wait_queue_entry - one waiter on a wait queue.
 *
 * flags:   WQ_FLAG_* bits (e.g. WQ_FLAG_EXCLUSIVE, WQ_FLAG_BOOKMARK).
 * private: the waiting task when set up by init_waitqueue_entry() or
 *          init_wait_entry(); NULL for init_waitqueue_func_entry().
 * func:    callback invoked by __wake_up_common() for this entry.
 * entry:   list linkage into wait_queue_head::head.
 */
struct wait_queue_entry {
unsigned int flags;
void *private;
wait_queue_func_t func;
struct list_head entry;
};
/*
 * struct wait_queue_head - head of a wait queue.
 *
 * lock: spinlock held while the list is modified or walked.
 * head: list of wait_queue_entry objects linked through their entry field.
 */
struct wait_queue_head {
spinlock_t lock;
struct list_head head;
};
typedef struct wait_queue_head wait_queue_head_t;
宏 init_waitqueue_head 用来初始化 wait_queue_head_t 头部。
/// include/linux/wait.h
/*
 * Initialise a wait queue head. Each expansion defines its own static
 * lock_class_key and passes the stringified argument as the name to
 * __init_waitqueue_head().
 */
#define init_waitqueue_head(wq_head) \
do { \
static struct lock_class_key __key; \
\
__init_waitqueue_head((wq_head), #wq_head, &__key); \
} while (0)
函数 __init_waitqueue_head() 定义如下:
/// kernel/sched/wait.c
/*
 * Set up an empty wait queue head: an empty waiter list and an initialised
 * spinlock that carries the given lockdep class @key and @name.
 */
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name,
			   struct lock_class_key *key)
{
	INIT_LIST_HEAD(&wq_head->head);

	spin_lock_init(&wq_head->lock);
	lockdep_set_class_and_name(&wq_head->lock, key, name);
}
init_waitqueue_entry() 和 init_waitqueue_func_entry() 两个函数用于初始化 wait_queue_entry,不同的是后者可以指定 wait_queue_func_t 函数。
/// include/linux/wait.h
/*
 * Prepare a wait queue entry for task @p: no flags, woken through
 * default_wake_function(). The list linkage is left untouched.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry,
					struct task_struct *p)
{
	wq_entry->func = default_wake_function;
	wq_entry->private = p;
	wq_entry->flags = 0;
}
/*
 * Prepare a wait queue entry with a caller-supplied wake-up callback @func:
 * no flags and no private data. The list linkage is left untouched.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
	wq_entry->func = func;
	wq_entry->private = NULL;
	wq_entry->flags = 0;
}
6.1、wait_event
在 pipe_read()/pipe_write() 函数中,当 pipe 中没有数据或者没有空间时,使用 wait_event_interruptible_exclusive() 等待可读或者可写事件发生。
/// include/linux/wait.h
/*
 * Slow path of wait_event_interruptible_exclusive(): expands ___wait_event()
 * with state TASK_INTERRUPTIBLE, exclusive = 1, initial result 0 and
 * schedule() as the sleeping command.
 */
#define __wait_event_interruptible_exclusive(wq, condition) \
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
schedule())
/*
 * Sleep on @wq as an exclusive, interruptible waiter until @condition is
 * true. Evaluates to 0 if the condition was already true or became true,
 * otherwise to the value produced by
 * __wait_event_interruptible_exclusive().
 */
#define wait_event_interruptible_exclusive(wq, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_interruptible_exclusive(wq, condition); \
__ret; \
})
___wait_event 也是宏定义:
-
1)调用 init_wait_entry() 函数初始化 wait_queue_entry
-
2)调用 prepare_to_wait_event() 将 wait_queue_entry 添加到等待队列中,然后将当前进程切换为 state 的状态,比如 TASK_INTERRUPTIBLE
-
3)finish_wait() 将当前进程切换为 RUNNING 状态
/// include/linux/wait.h
/*
 * Core wait loop shared by the wait_event*() macros.
 *
 * wq_head:   wait queue head to sleep on
 * condition: expression re-evaluated after every prepare_to_wait_event()
 * state:     task state to sleep in
 * exclusive: non-zero to queue the entry with WQ_FLAG_EXCLUSIVE
 * ret:       initial value of the result
 * cmd:       statement executed to actually sleep (e.g. schedule())
 *
 * Each iteration queues the entry and sets the task state, then leaves the
 * loop if the condition holds. If prepare_to_wait_event() returned non-zero
 * and the state is interruptible, that value becomes the result and
 * finish_wait() is skipped via the __out label. Otherwise cmd runs and the
 * loop repeats.
 */
#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \
({ \
__label__ __out; \
struct wait_queue_entry __wq_entry; \
long __ret = ret; /* explicit shadow */ \
\
init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
for (;;) { \
long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
\
if (condition) \
break; \
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
goto __out; \
} \
\
cmd; \
} \
finish_wait(&wq_head, &__wq_entry); \
__out: __ret; \
})
init_wait_entry() 定义如下:private 指向当前进程,func 指向 autoremove_wake_function() 函数。
/// kernel/sched/wait.c
/*
 * Initialise a wait queue entry for the current task: the given @flags,
 * autoremove_wake_function() as wake-up callback, and an empty list linkage.
 */
void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
{
	INIT_LIST_HEAD(&wq_entry->entry);
	wq_entry->func = autoremove_wake_function;
	wq_entry->private = current;
	wq_entry->flags = flags;
}
prepare_to_wait_event() 和 finish_wait() 函数定义如下:
/// kernel/sched/wait.c
/*
 * Queue @wq_entry on @wq_head (if it is not queued already) and put the
 * current task into @state, all under wq_head->lock.
 *
 * If a signal is pending for @state, the entry is removed from the queue
 * instead and -ERESTARTSYS is returned; otherwise 0 is returned. Exclusive
 * entries are appended at the tail; non-exclusive ones go through
 * __add_wait_queue().
 */
long prepare_to_wait_event(struct wait_queue_head *wq_head,
struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
long ret = 0;
spin_lock_irqsave(&wq_head->lock, flags);
if (signal_pending_state(state, current)) { // a signal is pending
/*
* Exclusive waiter must not fail if it was selected by wakeup,
* it should "consume" the condition we were waiting for.
*
* The caller will recheck the condition and return success if
* we were already woken up, we can not miss the event because
* wakeup locks/unlocks the same wq_head->lock.
*
* But we need to ensure that set-condition + wakeup after that
* can't see us, it should wake up another exclusive waiter if
* we fail.
*/
list_del_init(&wq_entry->entry);
ret = -ERESTARTSYS;
} else {
if (list_empty(&wq_entry->entry)) {
// not queued yet: add the entry to wq_head
if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
__add_wait_queue_entry_tail(wq_head, wq_entry);
else
__add_wait_queue(wq_head, wq_entry);
}
set_current_state(state); // set the task state
}
spin_unlock_irqrestore(&wq_head->lock, flags);
return ret;
}
/*
 * End a wait started with prepare_to_wait_event(): set the current task back
 * to TASK_RUNNING and, if @wq_entry is still queued, remove it from
 * @wq_head under the queue lock.
 */
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
__set_current_state(TASK_RUNNING); // set the task state
/*
* We can check for list emptiness outside the lock
* IFF:
* - we use the "careful" check that verifies both
* the next and prev pointers, so that there cannot
* be any half-pending updates in progress on other
* CPU's that we haven't seen yet (and that might
* still change the stack area.
* and
* - all other users take the lock (ie we can only
* have _one_ other CPU that looks at or modifies
* the list).
*/
if (!list_empty_careful(&wq_entry->entry)) {
spin_lock_irqsave(&wq_head->lock, flags);
list_del_init(&wq_entry->entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
6.2、wake_up
当某一事件发生,使用 wake_up 唤醒睡眠的进程。
/// include/linux/wait.h
/*
 * Wake waiters in TASK_INTERRUPTIBLE on wait queue @x through
 * __wake_up_sync_key(), passing the poll event mask @m (converted with
 * poll_to_key()) as the wake-up key.
 */
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
__wake_up_sync_key() 函数直接调用 __wake_up_common_lock() 函数。
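bookmark 是一个临时插入等待队列的标记节点:wq_head 是一个双向链表,当一次遍历处理的元素过多时,__wake_up_common() 会把 bookmark 插到下一个待处理元素之前并返回,__wake_up_common_lock() 释放锁后重新加锁,再从 bookmark 的位置继续遍历,直到 bookmark 不再被设置 WQ_FLAG_BOOKMARK 为止。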
/// kernel/sched/wait.c
/*
 * Wake up waiters on @wq_head whose state matches @mode, passing @key to
 * their callbacks. Calls __wake_up_common_lock() with nr_exclusive = 1 and
 * the WF_SYNC wake flag. A NULL @wq_head is ignored.
 */
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
void *key)
{
if (unlikely(!wq_head))
return;
__wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
}
static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
wait_queue_entry_t bookmark;
bookmark.flags = 0;
bookmark.private = NULL;
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);
do {
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
} while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
__wake_up_common() 函数遍历等待队列 wq_head。每次最多连续处理 WAITQUEUE_WALK_BREAK_CNT 个左右的元素,然后插入 bookmark 并返回,由调用者决定是否继续。每个元素都调用 func 指向的函数。
/// kernel/sched/wait.c
/*
 * Walk the wait queue and call each entry's func callback. The caller must
 * hold wq_head->lock.
 *
 * The walk starts at the first entry, or just after @bookmark if the
 * bookmark is flagged WQ_FLAG_BOOKMARK (the bookmark is then unlinked and
 * its flag cleared). It stops when
 *   - a callback returns a negative value,
 *   - a callback returns non-zero for a WQ_FLAG_EXCLUSIVE entry and that
 *     uses up the @nr_exclusive budget, or
 *   - more than WAITQUEUE_WALK_BREAK_CNT entries were processed and more
 *     remain; @bookmark is then inserted before the next entry and flagged
 *     so the caller can resume.
 *
 * Returns the remaining @nr_exclusive budget.
 */
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key,
wait_queue_entry_t *bookmark)
{
wait_queue_entry_t *curr, *next;
int cnt = 0;
lockdep_assert_held(&wq_head->lock);
if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
curr = list_next_entry(bookmark, entry);
list_del(&bookmark->entry);
bookmark->flags = 0;
} else
curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
/* Nothing (left) to walk. */
if (&curr->entry == &wq_head->head)
return nr_exclusive;
list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
int ret;
/* Skip bookmark entries found in the list. */
if (flags & WQ_FLAG_BOOKMARK)
continue;
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
(&next->entry != &wq_head->head)) {
bookmark->flags = WQ_FLAG_BOOKMARK;
list_add_tail(&bookmark->entry, &next->entry);
break;
}
}
return nr_exclusive;
}
6.3、autoremove_wake_function
init_wait_entry() 函数使用到了 autoremove_wake_function() 函数
-
1)调用 default_wake_function() 函数唤醒进程
-
2)将 entry 从等待队列删除。
/// kernel/sched/wait.c
/*
 * Wake-up callback installed by init_wait_entry(): wake the task through
 * default_wake_function() and, only if that reports success, unlink the
 * entry from its wait queue.
 *
 * Returns the result of default_wake_function().
 */
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode,
			     int sync, void *key)
{
	int woken = default_wake_function(wq_entry, mode, sync, key);

	if (!woken)
		return woken;

	list_del_init_careful(&wq_entry->entry);
	return woken;
}
default_wake_function() 调用 try_to_wake_up() 函数。
/// kernel/sched/core.c
/*
 * Default wake-up callback: wake the task stored in curr->private with
 * try_to_wake_up() and return its result. @key is unused. With
 * CONFIG_SCHED_DEBUG enabled, warns once if @wake_flags contains anything
 * other than WF_SYNC.
 */
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
return try_to_wake_up(curr->private, mode, wake_flags);
}
try_to_wake_up() 函数做唤醒的主要工作,比较复杂,有一点是切换进程状态。
/// kernel/sched/core.c
/*
 * try_to_wake_up - wake up task @p if its current state is in @state.
 *
 * @p:          task to wake
 * @state:      mask of task states that are allowed to be woken
 * @wake_flags: WF_* flags passed on to the runqueue helpers
 *
 * Waking the current task is handled without taking any lock by setting
 * p->state to TASK_RUNNING directly. For other tasks the check and the
 * wake-up are done under p->pi_lock. If the task is still on a runqueue,
 * ttwu_runnable() is tried first; otherwise the task is marked TASK_WAKING
 * and queued through ttwu_queue_wakelist() or ttwu_queue(), after picking a
 * CPU on SMP.
 *
 * Returns 1 if p->state matched @state, 0 otherwise.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
if (!(p->state & state))
goto out;
success = 1;
trace_sched_waking(p);
p->state = TASK_RUNNING; // switch the task state
trace_sched_wakeup(p);
goto out;
}
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
* reordered with p->state check below. This pairs with smp_store_mb()
* in set_current_state() that the waiting thread does.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
if (!(p->state & state))
goto unlock;
trace_sched_waking(p);
/* We're going to change ->state: */
success = 1;
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
* be possible to, falsely, observe p->on_rq == 0 and get stuck
* in smp_cond_load_acquire() below.
*
* sched_ttwu_pending() try_to_wake_up()
* STORE p->on_rq = 1 LOAD p->state
* UNLOCK rq->lock
*
* __schedule() (switch to task 'p')
* LOCK rq->lock smp_rmb();
* smp_mb__after_spinlock();
* UNLOCK rq->lock
*
* [task p]
* STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* A similar smb_rmb() lives in try_invoke_on_locked_down_task().
*/
smp_rmb();
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
goto unlock;
#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
*
* One must be running (->on_cpu == 1) in order to remove oneself
* from the runqueue.
*
* __schedule() (switch to task 'p') try_to_wake_up()
* STORE p->on_cpu = 1 LOAD p->on_rq
* UNLOCK rq->lock
*
* __schedule() (put 'p' to sleep)
* LOCK rq->lock smp_rmb();
* smp_mb__after_spinlock();
* STORE p->on_rq = 0 LOAD p->on_cpu
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
* schedule()'s deactivate_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
/*
* We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
* == 0), which means we need to do an enqueue, change p->state to
* TASK_WAKING such that we can unlock p->pi_lock before doing the
* enqueue, such as ttwu_queue_wakelist().
*/
p->state = TASK_WAKING;
/*
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, considering queueing p on the remote CPUs wake_list
* which potentially sends an IPI instead of spinning on p->on_cpu to
* let the waker make forward progress. This is safe because IRQs are
* disabled and the IPI will deliver after on_cpu is cleared.
*
* Ensure we load task_cpu(p) after p->on_cpu:
*
* set_task_cpu(p, cpu);
* STORE p->cpu = @cpu
* __schedule() (switch to task 'p')
* LOCK rq->lock
* smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
* STORE p->on_cpu = 1 LOAD p->cpu
*
* to ensure we observe the correct CPU on which the task is currently
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
goto unlock;
/*
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_task().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
#else
cpu = task_cpu(p);
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
preempt_enable();
return success;
}