在测试机器上,用cat方式合并的速度大约是3-4G/min,面对合并后1.4T的文件,至少需要6个小时。后来改用mmap读取+多线程写同一个文件(20个线程)的方式,速度大约14G/min,约100min即可完成处理,明显快很多。这是单机情形下较快的读写方式:综合测试结果来看,fread和fwrite不如这种模式快,这种模式也比cat快(cat底层应该是用fread/fwrite或者read/write)。代码如下:
用法:程序接受两个命令行参数,第一个是待合并文件的列表文件(每行一个文件路径),第二个是输出文件;线程数默认为20,需要修改的话直接改代码中的th
#include <iostream>
#include <string>
#include <fstream>
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <zconf.h>
#include <fcntl.h> //open
#include <sys/mman.h> //mmap
#include <sys/types.h> //fstat
#include <unistd.h>
#include <thread>
#include <vector>
#include <map>
#include <mutex>
// Make standard-library names usable without qualification in the rest of the file.
using namespace std;

// Path of the merged output file; assigned in main from the second command-line argument.
static string outFile{};
// Byte offset in the output file at which each input file's data starts, keyed by input path.
map<string, long long> acc{};
// Size in bytes of each input file, keyed by input path.
map<string, long long> sampleSize{};
// Number of worker threads started per batch.
int th{20};
// Largest chunk, in bytes, mapped and written in one step (200 MiB).
int bufSize{200 * 1024 * 1024};
// Not referenced by any code visible in this file.
long headersize{25511};
// Splits line_info at every occurrence of sep.
//
// elements is cleared first and then receives the fields in order. Adjacent,
// leading or trailing separators yield empty fields, so the result always
// holds exactly (number of separators + 1) entries; an empty input produces a
// single empty field.
void line_split(std::string line_info, char sep, std::vector<std::string> &elements) {
    elements.clear();
    std::string::size_type fieldStart = 0;
    for (;;) {
        const std::string::size_type sepPos = line_info.find(sep, fieldStart);
        if (sepPos == std::string::npos) {
            break;
        }
        // Field is the text between the previous separator and this one.
        elements.emplace_back(line_info, fieldStart, sepPos - fieldStart);
        fieldStart = sepPos + 1;
    }
    // Whatever follows the last separator (possibly nothing) is the final field.
    elements.emplace_back(line_info, fieldStart, std::string::npos);
}
void* process(string path){
if(sampleSize[path]==0){
return (void*)0;
}
int outVcf=open(outFile.c_str(),O_WRONLY);
long long startPos=acc[path];
lseek(outVcf,startPos,SEEK_SET);
cout<<path<<"\t"<<startPos<<endl;
int copysz=200*104*1024;
int fd=open(path.c_str(),O_RDONLY);
off_t sz=sampleSize[path];
off_t cursz=0;
char* ptr;
while(sz-cursz>0){
if(sz-cursz>bufSize){
copysz=bufSize;
}else{
copysz=sz-cursz;
}
ptr=(char*)mmap(NULL,copysz,PROT_READ,MAP_SHARED,fd,cursz);
cursz+=copysz;
if(ptr!=MAP_FAILED){
write(outVcf,ptr,copysz);
}
munmap(ptr,copysz);
}
close(outVcf);
close(fd);
}
// Entry point: merges the files named in a list file into one output file.
//
// argv[1]: text file with one input path per line (blank lines are ignored).
// argv[2]: path of the merged output file.
//
// Pass 1 records each input's size and its start offset in the output
// (`sampleSize` / `acc`) and sizes the output file. Pass 2 runs `process` for
// every input on worker threads, `th` at a time.
// Returns 0 on success and 1 when the arguments, the list file, an input file
// or the output file cannot be used.
int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "merge")
                  << " <file-list> <output-file>" << std::endl;
        return 1;
    }

    std::ifstream list(argv[1]);
    if (!list) {
        std::cerr << "cannot open file list: " << argv[1] << std::endl;
        return 1;
    }
    outFile.assign(argv[2]);

    // Pass 1: build the offset table. The paths are remembered so the list
    // file is read only once and both passes see the same inputs.
    std::vector<std::string> paths;
    std::string path;
    long long pos = 0;
    while (std::getline(list, path)) {
        if (path.empty()) {
            continue;
        }
        // The tables are keyed by path, so a repeated path would overwrite the
        // earlier offset and leave an unwritten hole in the output.
        if (acc.count(path) != 0) {
            std::cerr << "duplicate input path in list: " << path << std::endl;
            return 1;
        }
        // stat() by path: no descriptor is opened, so nothing can leak here.
        struct stat info;
        if (::stat(path.c_str(), &info) != 0) {
            perror(path.c_str());
            return 1;
        }
        acc[path] = pos;
        sampleSize[path] = info.st_size;
        pos += info.st_size;
        paths.push_back(path);
    }
    list.close();

    // Create the output file if needed and give it exactly the merged size so
    // no stale bytes from an earlier, longer file remain after the last input.
    const int outFd = open(outFile.c_str(), O_WRONLY | O_CREAT, 0644);
    if (outFd < 0) {
        perror(outFile.c_str());
        return 1;
    }
    if (ftruncate(outFd, static_cast<off_t>(pos)) != 0) {
        perror(outFile.c_str());
        close(outFd);
        return 1;
    }
    close(outFd);

    // Pass 2: copy in batches of `th` threads; a batch is joined completely
    // before the next one starts. The maps are not modified from here on.
    const std::size_t batchSize = th > 0 ? static_cast<std::size_t>(th) : 1;
    std::vector<std::thread> workers;
    workers.reserve(batchSize);
    const auto joinAll = [&workers]() {
        for (std::thread& worker : workers) {
            worker.join();
        }
        workers.clear();
    };
    for (const std::string& input : paths) {
        workers.emplace_back(process, input);
        if (workers.size() == batchSize) {
            joinAll();
        }
    }
    joinAll();
    return 0;
}