pipe函数内核实现

最新推荐文章于 2022-11-07 21:12:25 发布

玛丽奥ZJY

最新推荐文章于 2022-11-07 21:12:25 发布

阅读量419

点赞数

分类专栏：【linux kernel】

【linux kernel】专栏收录该内容

26 篇文章 8 订阅

订阅专栏

pipe源码分析

本文基于linux kernel 4.13 分析，与通用的2.6差距较大。请读者自行甄别本文的特性，是否符合自己当前环境。

本文要解决的问题

1：pipe源码分析

2：pipe大小限制

3：如果没有读（写）端了，那么我写（读）操作会发生什么。

父子进程之间通信，首先想到的是pipe函数，pipe函数返回2个fd。通常，fork前先调用pipe，fd[0]负责读，fd[1]负责写。

示例程序往往是这样

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Minimal pipe demo: the parent writes "helloworld\n" into the pipe, the
 * child reads it back and prints it.
 *
 * Each side closes the pipe end it does not use, and the read is bounded by
 * the size of the receive buffer.
 */
int main(void)
{
    static const char msg[] = "helloworld\n";
    int pipefd[2];
    pid_t pid;
    char s[100] = {0};

    if (pipe(pipefd) < 0) {
        perror("pipe");
        exit(1);
    }

    pid = fork();
    if (pid < 0) {
        /* Without this check a failed fork() would run the reader branch
         * in the only process and block forever on an empty pipe. */
        perror("fork");
        exit(1);
    }

    if (pid > 0) { /* parent: writer */
        close(pipefd[0]); /* the parent never reads */
        printf("fater writing....\n");
        if (write(pipefd[1], msg, strlen(msg)) < 0)
            perror("write");
        close(pipefd[1]);
    } else { /* child: reader */
        ssize_t n;

        close(pipefd[1]); /* the child never writes */
        printf("child reading....\n");
        /* Leave room for the terminating NUL; never read more than s holds. */
        n = read(pipefd[0], s, sizeof(s) - 1);
        if (n < 0) {
            perror("read");
            exit(1);
        }
        s[n] = '\0';
        printf("read result: %s\n", s);
        close(pipefd[0]);
    }

    return 0;
}

通常，还有人告诉你，上面程序中，父进程还需要关闭pipefd[0]，子进程还需要关闭pipefd[1]。因为fork后，“子进程继承父进程文件描述符”，这里就不纠结细节了。

第一节：fd的创建以及关联

pipe() 系统调用对应的内核函数是sys_pipe，它由SYSCALL_DEFINE1宏来生成，由下面函数可以看到，sys_pipe实际调用了sys_pipe2函数。

fs/pipe.c(kernel 4.13)

/*
 * pipe2(2): build the two pipe files plus two descriptor numbers via
 * __do_pipe_flags(), copy the descriptor pair to user space, and only then
 * bind each descriptor to its file.
 *
 * Returns 0 on success, the error from __do_pipe_flags(), or -EFAULT when
 * the descriptor pair cannot be copied to @fildes.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
   struct file *files[2];
   int fd[2];
   int error;

   /* __do_pipe_flags() produces both fd numbers and both struct file objects. */
   error = __do_pipe_flags(fd, files, flags);
   if (error)
       return error;

   if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
       /* User buffer not writable: drop the files and give the fds back. */
       fput(files[0]);
       fput(files[1]);
       put_unused_fd(fd[0]);
       put_unused_fd(fd[1]);
       return -EFAULT;
   }

   /* The core step: associate each int fd with its struct file. */
   fd_install(fd[0], files[0]);
   fd_install(fd[1], files[1]);
   return 0;
}

/*
 * pipe(2): forwards to sys_pipe2() with flags set to 0 and returns its
 * result unchanged.
 */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
   return sys_pipe2(fildes, 0);
}

我们先看fd_install的实现，再看看__do_pipe_flags是如何创建fd和files的。
fs/file.c
/*
 * Bind @file to descriptor number @fd in the file table of the calling task
 * (current->files) by delegating to __fd_install().
 */
void fd_install(unsigned int fd, struct file *file)
{
   __fd_install(current->files, fd, file);
}

/*
 * Store @file in slot @fd of the descriptor table belonging to @files.
 * The update is done with files->file_lock held.
 */
void __fd_install(struct files_struct *files, unsigned int fd,
       struct file *file)
{
   struct fdtable *fdt;
   spin_lock(&files->file_lock);
   fdt = files_fdtable(files);
   /* The slot must still be empty; installing over a live entry is a bug. */
   BUG_ON(fdt->fd[fd] != NULL);
   /* Publish the pointer: from here on fdt->fd[fd] resolves to @file.
    * NOTE(review): presumably paired with lockless RCU readers - confirm. */
   rcu_assign_pointer(fdt->fd[fd], file);
   spin_unlock(&files->file_lock);
}

current用来描述当前进程，是一个struct task_struct类型的结构体。

每个current进程都有一个文件描述符表，即 current->files->fdt

所以上面的代码中，__fd_install 函数的核心功能就是执行 current->files->fdt->fd[fd] = file。这样，一个int型的fd就和一个struct file型的files对应起来了，当前进程通过任何一个fd都能找到唯一的一个files。

fd我们都知道，但是 struct file 是什么东西？那现在我们来看看__do_pipe_flags函数，上面说了，它创建了一对fd和一对files。根据pipe的用法，我们能猜到，这两个fd肯定是有关系的，否则不可能一个fd写，另一个fd能读，而且只有有亲缘关系的进程间才能这样。

又因为一个fd能关联到一个files，那么也就是说，我们需要实现两个files有关联，这样，两个fd就自然有关联了。

/*
 * Create the two pipe files and reserve two descriptor numbers for them.
 *
 * @fd:    out, fd[0] is the number reserved for the read side, fd[1] for the
 *         write side
 * @files: out, filled in by create_pipe_files()
 * @flags: only O_CLOEXEC, O_NONBLOCK and O_DIRECT are accepted
 *
 * Returns 0 on success or a negative error code. The descriptors are only
 * reserved here; the caller still has to bind them with fd_install().
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
   int error;
   int fdw, fdr;

   /* Reject any flag bit outside the supported set. */
   if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
       return -EINVAL;

   error = create_pipe_files(files, flags);
   if (error)
       return error;

   /* Reserve a descriptor number for the read side. */
   error = get_unused_fd_flags(flags);
   if (error < 0)
       goto err_read_pipe;
   fdr = error;

   /* Reserve a descriptor number for the write side. */
   error = get_unused_fd_flags(flags);
   if (error < 0)
       goto err_fdr;
   fdw = error;

   audit_fd_pair(fdr, fdw);
   fd[0] = fdr;
   fd[1] = fdw;
   return 0;

   /* Unwind in reverse order of acquisition. */
err_fdr:
   put_unused_fd(fdr);
err_read_pipe:
   fput(files[0]);
   fput(files[1]);
   return error;
}

__do_pipe_flags函数关键是 create_pipe_files，而其他的函数例如get_unused_fd_flags，顾名思义，获取一个可用的fd号而已，通过这个函数获取一个fdr和fdw。

着重讲create_pipe_files函数，他创建了2个有关联的struct file 的 files。

/*
 * Create the two struct file objects of a pipe. Both are allocated on the
 * same path (hence the same dentry and inode) and both carry the same
 * inode->i_pipe in ->private_data.
 *
 * @res:   out, res[0] is the FMODE_READ file, res[1] the FMODE_WRITE file
 * @flags: O_NONBLOCK is propagated to both files, O_DIRECT only to the
 *         write side
 *
 * Returns 0 on success, -ENFILE or -ENOMEM on failure.
 */
int create_pipe_files(struct file **res, int flags)
{
   int err;
   struct inode *inode = get_pipe_inode();
   struct file *f;
   struct path path;
   static struct qstr name = { .name = "" };

   if (!inode)
       return -ENFILE;

   err = -ENOMEM;
   path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
   if (!path.dentry)
       goto err_inode;
   path.mnt = mntget(pipe_mnt);

   /* Attach the inode to the freshly allocated dentry. */
   d_instantiate(path.dentry, inode);

   err = -ENFILE;
   /* Write side first. */
   f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
   if (IS_ERR(f))
       goto err_dentry;

   f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
   /* Cache the pipe on the file so read/write need not walk to the inode. */
   f->private_data = inode->i_pipe;

   /* Read side, allocated on the same path. */
   res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
   if (IS_ERR(res[0]))
       goto err_file;

   /* NOTE(review): presumably the extra path reference is for the second
    * file sharing this path - confirm against alloc_file(). */
   path_get(&path);
   res[0]->private_data = inode->i_pipe;
   res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
   res[1] = f;
   return 0;

err_file:
   put_filp(f);
err_dentry:
   free_pipe_info(inode->i_pipe);
   path_put(&path);
   return err;

   /* Reached only when the dentry allocation failed. */
err_inode:
   free_pipe_info(inode->i_pipe);
   iput(inode);
   return err;
}

上面涉及到了文件系统的各个方面，所以比较复杂，我们这里不详细说各个数据结构的作用，

inode很重要，通过get_pipe_inode函数获取。

inode->i_fop = &pipefifo_fops；//指定了一对操作函数，告诉vfs如何读写等操作。

inode->i_pipe = pipe；

后续我们会关注，pipe中的数据结构。

pipe->bufs

pipe->tmp_page

files创建：

files[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);

files[1] = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);

从第二个参数就知道，这两个file限制了各自的功能，也就意味着，对应的两个fd也限制了各自的功能。

d_instantiate(path.dentry, inode); 相当于执行了 path.dentry->d_inode = inode；

这样，alloc_file函数中，file->f_path = *path，就相当于file[x]->f_path.dentry->d_inode = inode，也即两个file指向了一个inode。

这个就是所谓的两个file关联。

下面这两句话，只是简单地把pipe放在file中，方便取值，否则给定一个files，如果需要获取pipe，就需要从file->f_path.dentry->d_inode->i_pipe去取得，这显然不合适。

files[0]->private_data = inode->i_pipe;

files[1]->private_data = inode->i_pipe;

至此sys_pipe函数执行完毕，它创建了2个互相关联的file，然后创建了2个fd，fd与file一一对应。

其次file都指向了同一个inode，我们可以想象，如果自己接下去实现读写，那么写的数据，肯定放在inode中了，事实也是如此。

第二节通过fd[0]读和通过fd[1]写
读写操作，在用户态，都是read和write，其对应的内核函数分别是sys_read和sys_write。我们先看写操作，

fs/read_write.c

/*
 * write(2): look up the struct file behind @fd, run vfs_write() at the
 * file's current position and store the updated position back unless the
 * write failed.
 *
 * Returns the value of vfs_write(), or -EBADF when @fd has no file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
       size_t, count)
{
   struct fd f = fdget_pos(fd);
   ssize_t written;
   loff_t offset;

   if (!f.file)
       return -EBADF;

   offset = file_pos_read(f.file);
   written = vfs_write(f.file, buf, count, &offset);
   if (written >= 0)
       file_pos_write(f.file, offset);
   fdput_pos(f);

   return written;
}
入参很简单，一个fd，一个存放数据的buf，一个要写入数据的长度值count。

fdget_pos 函数返回值是一个struct fd，不要被名字迷惑，它并不是那个int型的fd。

这个函数通过fd，返回一个file结构，以及一些flag。通过第一节我们知道了一个fd是如何和一个file关联的，我们也能想到，取得方法也很简单，current->files->fdt->fd[fd]，这样我们就能取到fd对应的file了。找到file后，执行了vfs_write，这里不想多谈虚拟文件系统，说白了就是一个中间层。

/*
 * Generic write path: check that @file may be written and that the user
 * buffer is accessible, then dispatch to __vfs_write().
 *
 * Returns the result of __vfs_write(), or:
 *   -EBADF  when the file lacks FMODE_WRITE,
 *   -EINVAL when the file lacks FMODE_CAN_WRITE,
 *   -EFAULT when access_ok() rejects @buf,
 *   the negative value from rw_verify_area() when that check fails.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
   ssize_t res;

   if (!(file->f_mode & FMODE_WRITE))
       return -EBADF;
   if (!(file->f_mode & FMODE_CAN_WRITE))
       return -EINVAL;
   if (unlikely(!access_ok(VERIFY_READ, buf, count)))
       return -EFAULT;

   res = rw_verify_area(WRITE, file, pos, count);
   if (res < 0)
       return res;

   /* rw_verify_area() hands back the byte count that will be attempted. */
   count = res;
   file_start_write(file);
   res = __vfs_write(file, buf, count, pos);
   if (res > 0) {
       /* Only a write that moved data triggers fsnotify and the byte count. */
       fsnotify_modify(file);
       add_wchar(current, res);
   }
   inc_syscw(current);
   file_end_write(file);

   return res;
}

首先判断，这个fd对应的file是否可写，这与第一节中：

files[1] = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);

这句话相辅相成，所以pipe创建的两个fd一个只能读，一个只能写，就是这个道理。

接着调用__vfs_write。这里的f_op，就是第一节中alloc_file函数设置的pipefifo_fops。pipefifo_fops没有设置 .write，只设置了 .write_iter = pipe_write，所以__vfs_write会经由new_sync_write调用 file->f_op->write_iter，也就是 pipe_write 函数。

pipe_write函数比较长，就不一一列举了，我们需要知道的是，write之后的数据，被放进了哪里：

struct pipe_inode_info *pipe = filp->private_data;//取出pipe，这个第一节中说过。

pipe中有一个字段，用来管理写入的数据：

/**

*struct pipe_inode_info - a linux kernel pipe

*@mutex: mutex protecting the whole thing

*@wait: reader/writer wait point in case of empty/full pipe

*@nrbufs: the number of non-empty pipe buffers in this pipe

*@buffers: total number of buffers (should be a power of 2)

*@curbuf: the current pipe buffer entry

*@tmp_page: cached released page

*@readers: number of current readers of this pipe

*@writers: number of current writers of this pipe

*@files: number of struct file referring this pipe (protected by ->i_lock)

*@waiting_writers: number of writers blocked waiting for room

*@r_counter: reader counter

*@w_counter: writer counter

*@fasync_readers: reader side fasync

*@fasync_writers: writer side fasync

*@bufs: the circular array of pipe buffers

*@user: the user who created this pipe

**/

/* Per-pipe state; every field is described in the kernel-doc block above. */
struct pipe_inode_info {

struct mutex mutex;

wait_queue_head_t wait;

unsigned int nrbufs, curbuf, buffers;

unsigned int readers;

unsigned int writers;

unsigned int files;

unsigned int waiting_writers;

unsigned int r_counter;

unsigned int w_counter;

struct page *tmp_page;

struct fasync_struct *fasync_readers;

struct fasync_struct *fasync_writers;

struct pipe_buffer *bufs;// holds the data written into the pipe: a circular array of pipe_buffer slots indexed from curbuf, not a linked list

struct user_struct *user;

};

pipe_write中，有个大循环，就是不断往buf中写数据的功能：

// Main loop of pipe_write(): each iteration copies at most PAGE_SIZE bytes
// into one buffer slot, and sleeps when every slot is in use.
   for (;;) {
       int bufs;

       // No reader left: send SIGPIPE to the current task, and report
       // -EPIPE unless some bytes were already written.
       if (!pipe->readers) {
           send_sig(SIGPIPE, current, 0);
           if (!ret)
               ret = -EPIPE;
           break;
       }
       bufs = pipe->nrbufs;
       // A free slot exists in the buffer array.
       if (bufs < pipe->buffers) {
           // Index of the first free slot; the mask wraps it around the array.
           int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
           struct pipe_buffer *buf = pipe->bufs + newbuf;
           struct page *page = pipe->tmp_page;
           int copied;

           // No cached page in pipe->tmp_page: allocate a fresh one and
           // cache it, so it is kept if the copy below fails.
           if (!page) {
               page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
               if (unlikely(!page)) {
                   ret = ret ? : -ENOMEM;
                   break;
               }
               pipe->tmp_page = page;
           }
           /* Always wake up, even if the copy fails. Otherwise
           * we lock up (O_NONBLOCK-)readers that sleep due to
           * syscall merging.
           * FIXME! Is this really true?
           */
           do_wakeup = 1;
           copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
           if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
               if (!ret)
                   ret = -EFAULT;
               // Short copy although the iterator still holds data: the
               // copy failed, so stop here (this is the error path,
               // not the "all data written" path).
               break;
           }
           ret += copied;

           /* Insert it into the buffer array */
           // Record the filled page in the slot.
           buf->page = page;
           buf->ops = &anon_pipe_buf_ops;
           buf->offset = 0;
           buf->len = copied;
           buf->flags = 0;
           if (is_packetized(filp)) {
               buf->ops = &packet_pipe_buf_ops;
               buf->flags = PIPE_BUF_FLAG_PACKET;
           }
           pipe->nrbufs = ++bufs;
           // The page now belongs to the slot; drop it from the cache.
           pipe->tmp_page = NULL;

           // Nothing left to write: done.
           if (!iov_iter_count(from))
               break;
       }
       // Still a free slot: go around again without sleeping.
       if (bufs < pipe->buffers)
           continue;
       // Every slot is in use. A non-blocking file returns to user space
       // (-EAGAIN if nothing was written).
       if (filp->f_flags & O_NONBLOCK) {
           if (!ret)
               ret = -EAGAIN;
           break;
       }
       if (signal_pending(current)) {
           if (!ret)
               ret = -ERESTARTSYS;
           break;
       }
       // Wake readers before sleeping so they can drain the buffers.
       if (do_wakeup) {
           wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
           kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
           do_wakeup = 0;
       }
       // Sleep until woken, counted as a waiting writer meanwhile.
       pipe->waiting_writers++;
       pipe_wait(pipe);
       pipe->waiting_writers--;
   }

任何读写操作的逻辑近乎类似，无论socket的读写还是pipe的读写，上面的逻辑其实几句话就可以概括：

1：每次write调用，就在pipe->bufs队列中申请一个可用的buf，以及申请一个page。

2：拷贝数据至page，然后page放入buf中。

3：如果write的数据全部处理完毕，则return。

4：如果write的数据没有被处理完毕，判断当前buf是否够用，够用则执行1；不够用，则判断当前fd是否是阻塞的，阻塞的就睡眠进程，非阻塞的就返回EAGAIN。

所以，问题来了

问题1：一个pipe最大能写多大的数据？

答案是pipe->buffers*PAGE_SIZE。管道中尚未被读走的数据达到该数之后，write将被阻塞（非阻塞模式下返回EAGAIN）。

pipe->buffers*PAGE_SIZE = 16 * 4k = 65536字节

不过pipe->buffers值往往不都是16个，详情请看get_pipe_inode中，对该队列申请大小的判断条件，这里不再展开。

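对于网上说的PIPE_BUF限制：PIPE_BUF（Linux上为4096字节）并不是管道容量的上限，而是一次write保证原子性的最大长度，即不超过PIPE_BUF的写入不会与其他进程的写入交错。当前内核版本中，管道的容量由上面的pipe->buffers*PAGE_SIZE决定。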

问题2：如果每次write 1字节，那么write16次，buf队列就不够用了

非也，pipe_write在你实际写操作前，有个merge的操作

/* We try to merge small writes */
   chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
   if (pipe->nrbufs && chars != 0) {
       int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
                           (pipe->buffers - 1);
       struct pipe_buffer *buf = pipe->bufs + lastbuf;
       int offset = buf->offset + buf->len;

       if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) {
           ret = pipe_buf_confirm(pipe, buf);
           if (ret)
               goto out;

           ret = copy_page_from_iter(buf->page, offset, chars, from);
           if (unlikely(ret < chars)) {
               ret = -EFAULT;
               goto out;
           }
           do_wakeup = 1;
           buf->len += ret;
           if (!iov_iter_count(from))
               goto out;
       }

pipe_read这里就不进行分析了，就是循环buf队列，得到想要的数据，如果只获取了一个buf中的部分数据，则记录下offset即可，下次从offset处接着获取。

第三节 pipe的另一端关闭了，会发生什么

首先，pipe如何判断有没有对端？

struct pipe_inode_info 有2个字段readers以及writers，创建pipe时，即在get_pipe_inode中，各自赋值为1。

我们把文章中的程序改成这样

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Demo: write to a pipe after every read end has been closed.
 *
 * The parent closes its read end and sleeps; the child closes both of its
 * ends and exits. The parent then writes. SIGPIPE is left at its default
 * disposition here, which terminates the process, so the lines after
 * write() are normally never reached.
 */
int main(void)
{
    int pipefd[2];
    pid_t pid;
    ssize_t ret;
    int saved_errno;
    char s[4*1024*16+1] = {0};

    if (pipe(pipefd) < 0) {
        perror("pipe");
        exit(1);
    }

    pid = fork();
    if (pid < 0) {
        perror("fork");
        exit(1);
    }

    if (pid > 0) { /* parent */
        close(pipefd[0]);
        sleep(1); /* give the child time to close its read end */

        ret = write(pipefd[1], s, sizeof(s));
        saved_errno = errno; /* printf() below may overwrite errno */
        printf("ret:%zd\n", ret);
        if (ret < 0) {
            errno = saved_errno;
            perror("");
        }
        close(pipefd[1]);
    } else { /* child: drop both ends and leave */
        close(pipefd[0]);
        close(pipefd[1]);
    }

    return 0;
}

父进程关闭自己的读端，然后让出cpu，子进程关闭自己的读写两端。父进程再往写端写数据。

我的系统什么都没有打出，显然write后面没有运行。

我们捕获一下SIGPIPE信号

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * SIGPIPE handler. Only async-signal-safe calls are allowed in a handler,
 * so the message goes out through write(2) instead of printf()/perror().
 * errno is saved and restored so the interrupted code sees it unchanged.
 */
void handler(int signo)
{
    static const char msg[] = "sigpipe\n";
    int saved_errno = errno;

    (void)signo;
    if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0) {
        /* nothing useful can be done about a failed write here */
    }
    errno = saved_errno;
}

/*
 * Demo: same scenario as before (write to a pipe with no reader left), but
 * SIGPIPE is caught. The process survives the signal, write() returns -1
 * and errno is EPIPE, which perror() reports as "Broken pipe".
 */
int main(void)
{
    int pipefd[2];
    pid_t pid;
    ssize_t ret;
    int saved_errno;
    char s[4*1024*16+1] = {0};
    struct sigaction sig;

    /* Zero the whole struct and empty the mask: an uninitialized sa_mask
     * would block an arbitrary set of signals while the handler runs. */
    memset(&sig, 0, sizeof(sig));
    sig.sa_handler = handler;
    sigemptyset(&sig.sa_mask);
    sig.sa_flags = 0;
    if (sigaction(SIGPIPE, &sig, NULL) < 0) {
        perror("sigaction");
        exit(1);
    }

    if (pipe(pipefd) < 0) {
        perror("pipe");
        exit(1);
    }

    pid = fork();
    if (pid < 0) {
        perror("fork");
        exit(1);
    }

    if (pid > 0) { /* parent */
        close(pipefd[0]);
        sleep(1); /* give the child time to close its read end */

        ret = write(pipefd[1], s, sizeof(s));
        saved_errno = errno; /* printf() below may overwrite errno */
        printf("ret:%zd\n", ret);
        if (ret < 0) {
            errno = saved_errno;
            perror("");
        }
        close(pipefd[1]);
    } else { /* child: drop both ends and leave */
        close(pipefd[0]);
        close(pipefd[1]);
    }

    return 0;
}

果然，有结果了，提示 Broken pipe。

我回过头看看pipe_write发现有这么一段代码：

   if (!pipe->readers) {
       send_sig(SIGPIPE, current, 0);
       ret = -EPIPE;
       goto out;
   }

但是在pipe_read中，当发现write端不存在时，只是返回0，不会产生SIGPIPE。

看这个判断，核心就是pipe->readers字段，我们的示例程序父子进程全都执行了close(fd[0])，这样，pipe的read端就彻底关闭了。至于为什么，我会在后续的父子进程描述符继承中讲，这里就简单讲一下。

首先，fork前，fd[0] 对应的file，其引用计数为1；fork后，子进程继承了父进程的描述符，copy_process函数中把file引用计数加1，这样，file的引用计数为2。所以父子进程需要各自执行close(fd[0])，才能把读端file的引用计数减成0，file才能彻底释放，对应的pipe->readers才能变为0。

玛丽奥ZJY

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
pipe函数内核实现

pipe源码分析本文基于linux kernel 4.13 分析，与通用的2.6差距较大。请读者自行甄别本文的特性，是否符合自己当前环境。本文要解决的问题1：pipe源码分析2：pipe大小限制3：如果没有读（写）端了，那么我写（读）操作会发生什么。父子进程之间通信，首先想到的是pipe函数，pipe函数返回2个fd。通常，fork前先调用pipe，fd[0]负责读，fd...
复制链接

扫一扫

专栏目录