linux的零复制splice、tee

最新推荐文章于 2024-06-03 10:05:01 发布

dupengchuan

最新推荐文章于 2024-06-03 10:05:01 发布

阅读量1.3k

点赞数

分类专栏： linux接口 linux命令文章标签： linux tee splice 零复制

本文链接：https://blog.csdn.net/dupengchuan/article/details/51184463

版权

linux命令同时被 2 个专栏收录

9 篇文章 0 订阅

订阅专栏

linux接口

2 篇文章 0 订阅

订阅专栏

要说零复制，就要先说管道pipe。

pipe在linux的实现中，用的是生产者消费者的模型，在linux/pipe_fs_i.h中我们能看到以下的代码：

/* Default number of pipe_buffer slots given to a newly created pipe. */
#define PIPE_DEF_BUFFERS    16
//...
/*
 * Per-pipe bookkeeping kept by the kernel (excerpt from linux/pipe_fs_i.h).
 *
 * NOTE(review): the field notes below follow the kernel-doc of the upstream
 * header from the kernel generation this excerpt was taken from; the layout
 * has changed in later kernels, so confirm against the version in use.
 */
struct pipe_inode_info {
    struct mutex mutex;                     /* protects the whole structure */
    wait_queue_head_t wait;                 /* readers/writers sleep here when the pipe is empty/full */
    unsigned int nrbufs, curbuf, buffers;   /* non-empty buffers, current buffer index, total slots in bufs */
    unsigned int readers;                   /* current readers of this pipe */
    unsigned int writers;                   /* current writers of this pipe */
    unsigned int files;                     /* struct file references to this pipe */
    unsigned int waiting_writers;           /* writers blocked waiting for room */
    unsigned int r_counter;                 /* reader counter */
    unsigned int w_counter;                 /* writer counter */
    struct page *tmp_page;                  /* cached released page */
    struct fasync_struct *fasync_readers;   /* reader-side fasync list */
    struct fasync_struct *fasync_writers;   /* writer-side fasync list */
    struct pipe_buffer *bufs;               /* array of pipe buffers (PIPE_DEF_BUFFERS entries by default) */
};

其中bufs就是一个指向管道缓冲区的指针，而管道缓冲区的结构如下：

/*
 * One slot of a pipe: describes a run of bytes inside a single page rather
 * than holding the bytes itself (excerpt from linux/pipe_fs_i.h).
 *
 * NOTE(review): field notes follow the upstream kernel-doc; confirm against
 * the kernel version in use.
 */
struct pipe_buffer {
    struct page *page;                      /* page that contains the data of this buffer */
    unsigned int offset, len;               /* start and length of the data inside the page */
    const struct pipe_buf_operations *ops;  /* operations associated with this buffer */
    unsigned int flags;                     /* pipe buffer flags */
    unsigned long private;                  /* private data owned by ops */
};

其中page是指向包含pipe buffer的页，是物理上的页地址，不是虚拟地址，这样方便进程间的通信。

在创建管道缓冲区时，会创建PIPE_DEF_BUFFERS个pipe_buffer大小的空间给bufs，也就是bufs指向一个大小是PIPE_DEF_BUFFERS的pipe_buffer数组。一个页大小是4k，那么linux的管道缓冲区大小就是64k了。
在使用管道缓冲区时，就和生产者消费者的模型一样，一边把数据写进去，另一边把数据取出来，满时写阻塞，空时读阻塞。在写入的时候，为了效率，linux会倾向于以页为单位的写，因此缓冲区满时未必是64k的数据。

接下来就是说splice了，
#include <fcntl.h>
ssize_t splice(int fd_in, loff_t *off_in, int fd_out,loff_t *off_out, size_t len, unsigned int flags);
成功返回spliced的字节数，出错-1
这个函数中，fd_in和fd_out中至少有一个必须是管道，off_in、off_out分别是两个文件描述符的偏移，如果其对应的文件描述符不是普通文件（例如是管道），那么就不能有偏移量，就要设为NULL；当是NULL时，就是从文件当前位置读/写，结束后会更新偏移的位置。len就是最多要移动的字节数，至于flags自己看manpage。

它之所以能零复制，就是利用了管道作为中介，先把数据“复制”到管道，然后再从管道中读取即可：
int fd_pair[2];
pipe(fd_pair);
splice(source_file,...,fd_pair[1],...);
splice(fd_pair[0],...,destination_file,...);
可是要注意的是，其实我们并没有真的把数据复制进管道缓冲区，我们只是修改了管道缓冲区的page指针、偏移、长度，使它指向源数据的实际物理地址，然后再从管道中读出来，整个过程都没有涉及用户空间和内核空间之间的复制，在内核中也没有多余的复制，因此是零复制（复制了一次，但术语是叫零复制）。

我们要注意用splice传送超过缓冲区64k的文件时，一次调用搬不完，要循环调用并更新剩余的长度：

/*
 * Move filesize bytes from sourcefd to dstfd through the pipe, one pipe-full
 * at a time. Stops on the first error or if the source ends early.
 */
while (filesize > 0) {
       /* splice advances off_in itself, so filesize alone is the number of bytes still to move. */
       len = splice(sourcefd,&off_in,pipe_pair[1],NULL,filesize,SPLICE_F_MOVE);
       if (len < 0) {
           perror("splice");
           break;
       }
       if (len == 0)   /* source ended before filesize bytes were moved */
           break;
       filesize -= len;

       /* Drain everything just queued; the output side may take it in several pieces. */
       while (len > 0) {
           ssize_t moved = splice(pipe_pair[0], NULL,dstfd,&off_out,len, SPLICE_F_MOVE);
           if (moved <= 0)
               break;
           len -= moved;
       }
       if (len > 0) {  /* inner loop gave up with data still in the pipe */
           perror("splice");
           break;
       }
}

还有一个tee函数，这也是一个零复制函数
#include <fcntl.h>
ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
成功返回“复制”的字节数，出错-1
EINVAL fd_in or fd_out does not refer to a pipe; or fd_in and fd_out refer to the same pipe.
这个函数是用于两个不同管道之间的零复制：它把fd_in管道中的数据“复制”到fd_out管道，但不会消耗fd_in中的数据，之后仍然可以从fd_in中读到这些数据，就相当于把两个管道连通。

测试
下面是段性能比较，测试程序是这样的，读入一个文件，然后分别用read-write、mmap、splice三种方法复制这个文件，我使用的是一个300多mb的视频文件来测试，最终得到的测试结果是：
最上面的数字是真正复制所用的时间，下面的是time的输出，因为还有其他的影响，因此用户时间+系统时间！=函数工作时间
read-write:

0.820000
real    0m11.919s
user    0m0.028s
sys     0m0.996s

mmap:

0.830000

real    0m10.109s
user    0m0.312s
sys     0m0.676s

splice:

0.550000
real    0m10.643s
user    0m0.000s
sys     0m0.732s

mmap的时间居然和read-write差不多。。。不过我们可以看出用mmap的系统时间（sys）比read-write少30%左右，可是用户时间（user）比较大，因为mmap这个操作本身就是一个消耗很大的函数，如果需要长时间使用这个文件的话，那么就可以冲淡mmap的消耗。splice是最快的方法，用户时间很少，因为它的工作就是直接在内核完成，不需要频繁地在用户空间和内核空间之间切换。

更新：
发现mmap分段复制比一次性直接复制要快，不过还是比splice慢

	/*
	 * Copy the mapping in 4096-byte pieces instead of one big memcpy.
	 * total is an off_t so it cannot overflow on files of 2 GiB or more.
	 * NOTE(review): dst and src are advanced with +=, so they must be char *
	 * here (arithmetic on void * is only a GNU extension).
	 */
	size_t size;
	off_t total=0;
	while(total < statbuf.st_size) {
		size = statbuf.st_size-total > 4096 ? 4096 : (size_t)(statbuf.st_size-total);
		memcpy(dst,src,size);
		dst += size;
		src += size;
		total += (off_t)size;
	}

编写程序时出现的问题：
在下面的程序中，对于splice，编译器说SPLICE_F_MOVE未声明，当时我只好用它的数值0x1来代替。。。原因是splice和SPLICE_F_MOVE都是GNU扩展，必须在包含任何头文件之前写上#define _GNU_SOURCE，<fcntl.h>才会声明它们。
还有mmap时，我把文件和映射都设成只写，结果就是permission denied，读写打开文件（mmap也是读写）就可以。原因是用MAP_SHARED加PROT_WRITE映射时，文件描述符必须以O_RDWR方式打开，否则mmap返回EACCES。

/* Must precede every #include: glibc declares splice() and SPLICE_F_MOVE only when _GNU_SOURCE is defined. */
#define _GNU_SOURCE

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>

#include<fcntl.h>
#include<sys/mman.h>
#include<sys/stat.h>
#include<unistd.h>

/* Size of each output-file-name buffer in main(). */
#define MAX 1024
/* Size of the bounce buffer used by do_std_copy(). */
#define BUF_SIZE 4096

/* Signature shared by the three copy routines so testfun() can time any of them. */
typedef void (*p_func)(void);

/* Input descriptor and one output descriptor per copy method (read/write, mmap, splice); opened in main(). */
int in_fd,out_fd1,out_fd2,out_fd3;
/* fstat() result for in_fd, filled in by main(); its st_size drives the mmap copy. */
struct stat statbuf;

/* Copy in_fd to out_fd1 with read()/write(). */
void do_std_copy();
/* Copy in_fd to out_fd2 through two memory mappings. */
void do_mmap_copy();
/* Copy in_fd to out_fd3 with splice() through a pipe. */
void do_splice_copy();
/* Rewind in_fd, run one copy routine and print the clock() time it took. */
void testfun(p_func pa);

/*
 * Benchmark driver: copies the file named by argv[1] to "<name>1", "<name>2"
 * and "<name>3" using read/write, mmap and splice respectively; testfun()
 * prints the time measured for each method.
 *
 * Returns 0 on success, 1 on a usage error, an over-long file name, or when a
 * file cannot be opened or stat'ed.
 */
int main(int argc,char **argv)
{
	if(argc!=2) {
		fprintf(stderr,"usage: %s <file>\n",argc>0 ? argv[0] : "copytest");
		return 1;
	}

	char outfile1[MAX],outfile2[MAX],outfile3[MAX];
	p_func pa[3] = {do_std_copy,do_mmap_copy,do_splice_copy};

	/* The name, one suffix digit and the terminating NUL must all fit in MAX bytes. */
	if(strlen(argv[1])+2 > MAX) {
		fprintf(stderr,"file name too long\n");
		return 1;
	}
	snprintf(outfile1,sizeof(outfile1),"%s1",argv[1]);
	snprintf(outfile2,sizeof(outfile2),"%s2",argv[1]);
	snprintf(outfile3,sizeof(outfile3),"%s3",argv[1]);

	if((in_fd = open(argv[1],O_RDONLY))<0) {
		perror("open in_fd faild");
		return 1;
	}
	if((out_fd1 = open(outfile1,O_WRONLY|O_CREAT|O_TRUNC,0777))<0) {
		perror("open out_fd1 faild");
		return 1;
	}
	/* A shared writable mapping needs the descriptor opened read/write. */
	if((out_fd2 = open(outfile2,O_RDWR|O_CREAT|O_TRUNC,0777))<0){
		perror("open out_fd2 faild");
		return 1;
	}
	if((out_fd3 = open(outfile3,O_WRONLY|O_CREAT|O_TRUNC,0777))<0) {
		perror("open out_fd3 faild");
		return 1;
	}

	/* The mmap copy sizes its mappings from statbuf, so a failed fstat must not go unnoticed. */
	if(fstat(in_fd,&statbuf)<0) {
		perror("fstat in_fd faild");
		return 1;
	}

	for(int i = 0; i<3; ++i)
		testfun(pa[i]);

	close(in_fd);
	close(out_fd1);
	close(out_fd2);
	close(out_fd3);
	return 0;
}

/*
 * Times one copy routine.
 *
 * Rewinds in_fd to the start of the file, runs pa() and prints the clock()
 * time it consumed, in seconds, on its own line.
 */
void testfun(p_func pa)
{
	lseek(in_fd,0,SEEK_SET);

	const clock_t started = clock();
	pa();
	const clock_t finished = clock();

	const double seconds = (double)(finished-started)/CLOCKS_PER_SEC;
	printf("%f\n",seconds);
}


/*
 * Copies in_fd to out_fd1 with plain read()/write() through a BUF_SIZE buffer.
 *
 * Reads until end of file. A short write is continued rather than treated as
 * a failure; a failing read() or write() is reported and exits with status 1.
 */
void do_std_copy()
{
	char buffer[BUF_SIZE];
	ssize_t bytes;

	while((bytes = read(in_fd,buffer,sizeof(buffer))) > 0) {
		ssize_t done = 0;

		/* write() may accept only part of the buffer; keep going until it is all out. */
		while(done < bytes) {
			ssize_t n = write(out_fd1,buffer+done,(size_t)(bytes-done));
			if(n < 0) {
				perror("write errno");
				exit(1);
			}
			done += n;
		}
	}
	/* The loop also ends on a read error, which must not be mistaken for EOF. */
	if(bytes < 0) {
		perror("read errno");
		exit(1);
	}
}

/*
 * Copies in_fd to out_fd2 by mapping both files and doing a single memcpy.
 *
 * The output file is first resized to statbuf.st_size with ftruncate so the
 * destination mapping is backed by the file. Any failure is reported with
 * perror and the function returns without copying.
 */
void do_mmap_copy()
{
	const size_t length = (size_t)statbuf.st_size;

	if(ftruncate(out_fd2,statbuf.st_size) < 0) {
		perror("ftruncate faild");
		return;
	}
	/* Had the file been extended with lseek (leaving a hole) instead, a byte
	 * would have to be written at the end for the new size to take effect. */

	/* mmap rejects a zero length with EINVAL; an empty source is already fully copied by the ftruncate. */
	if(length == 0) return;

	void *src = mmap(NULL,length,PROT_READ,MAP_SHARED,in_fd,0);
	if(src==MAP_FAILED) {
		perror("mmap map src faild");
		return;
	}
	void *dst = mmap(NULL,length,PROT_READ|PROT_WRITE,MAP_SHARED,out_fd2,0);
	if(dst==MAP_FAILED) {
		munmap(src,length);
		perror("mmap map dst faild");
		return;
	}

	memcpy(dst,src,length);
	munmap(src,length);
	munmap(dst,length);
}

/*
 * Copies in_fd to out_fd3 with splice(), using a pipe as the in-kernel relay.
 *
 * Each round queues up to CHUNK bytes from in_fd into the pipe and then
 * drains exactly that many bytes into out_fd3, until in_fd reaches end of
 * file. Failures are reported and end the copy; the pipe is closed on every
 * path.
 */
void do_splice_copy()
{
	/* 64 KiB is the pipe size the article describes (16 buffers of 4 KiB). */
	enum { CHUNK = 64*1024 };
	int pipefd[2];

	if(pipe(pipefd) < 0) {
		perror("pipe faild");
		return;
	}

	for(;;) {
		ssize_t queued = splice(in_fd,NULL,pipefd[1],NULL,CHUNK,SPLICE_F_MOVE);
		if(queued < 0) {
			perror("splice in_fd faild");
			break;
		}
		if(queued == 0) break;	/* end of input */

		/* The output side may take the data in several pieces; leave nothing behind in the pipe. */
		while(queued > 0) {
			ssize_t drained = splice(pipefd[0],NULL,out_fd3,NULL,(size_t)queued,SPLICE_F_MOVE);
			if(drained < 0) {
				perror("splice out_fd3 faild");
				goto out;
			}
			if(drained == 0) {
				/* errno is not set in this case, so perror would mislead. */
				fprintf(stderr,"splice out_fd3 faild: no progress\n");
				goto out;
			}
			queued -= drained;
		}
	}
out:
	close(pipefd[0]);
	close(pipefd[1]);
}