MPI-IO Application Scenarios
There are N compute nodes (CN), each with a 200 GB SSD. A large data file is logically stored in distributed fashion across the SSDs of the N compute nodes; each node holds its piece as <temp-dir>/data.bin. Compared with centralized storage on a parallel file system, keeping the large file on the compute nodes' local disks offers far higher aggregate IO bandwidth; in theory the usable bandwidth grows linearly with the node count N.
Most CN nodes in today's supercomputing clusters carry SSDs delivering roughly 300 MB/s of IO bandwidth, and clusters generally use high-speed InfiniBand networks with about 2~3 GB/s of bandwidth per node. For big-data applications, depending on the application's IO and compute pattern, the node-local SSDs can be exploited to obtain substantial aggregate bandwidth and accelerate the computation; for example, 64 nodes at 300 MB/s each give a theoretical aggregate of 64 × 300 MB/s ≈ 18.8 GB/s. (This amounts to using the CN-local SSDs as a burst buffer; see UnifyFS.)
Scenario 1
Each node accesses the data of the large file sequentially; logically, the entire file is read once, in order, on every compute node. The N nodes form a ring, so each node has an upstream and a downstream neighbor. Every compute node runs two threads: an IO thread that reads local data and sends it to the downstream node, and a receive/forward thread that receives data from the upstream node and forwards it downstream. Both threads also apply the computation to the data; with asynchronous IO and asynchronous send/receive, disk, network, and compute overlap in time, improving performance.
Source code
#include <mpi.h>
#include <glog/logging.h>
static inline void
mpi_check_error(int errcode, const char *file, const char *function, int linenum)
{
if(errcode!=MPI_SUCCESS)
{
char errstr[MPI_MAX_ERROR_STRING];
int resultlen;
MPI_Error_string(errcode,errstr,&resultlen);
printf("%s.%s.%d: MPI errror %s\n", file,function,linenum,errstr);
MPI_Abort(MPI_COMM_WORLD,errcode);
}
return ;
}
#define MPI_CHECK(e) do{int errcode=(e); mpi_check_error(errcode,__FILE__, __FUNCTION__,__LINE__);}while(0)
static void initLog(int &argc, char **&argv)
{
google::InitGoogleLogging(argv[0]);
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InstallFailureSignalHandler();
//FLAGS_logbufsecs=0;
//FLAGS_log_dir="/tmp/";
//FLAGS_log_dir="./";
FLAGS_minloglevel = 0;
FLAGS_logtostderr=1;
FLAGS_colorlogtostderr = true;
FLAGS_max_log_size = 1024;
FLAGS_stop_logging_if_full_disk = true;
}
static void closeLog()
{
google::ShutdownGoogleLogging();
}
namespace {
static const size_t KB=1024UL;
static const size_t MB=KB*KB;
static const size_t GB=KB*KB*KB;
}
namespace {
static const int BUFSZ=512*MB;
MPI_Comm comm;
int rank;
int nnode;
int prev;
int next;
}
static void
init()
{
int provided;
MPI_CHECK(MPI_Init_thread(nullptr,nullptr,MPI_THREAD_MULTIPLE,&provided));
LOG_ASSERT(provided==MPI_THREAD_MULTIPLE)<<"wrong MPI thread level";
comm=MPI_COMM_WORLD;
MPI_Comm_size(comm,&::nnode);
MPI_Comm_rank(comm,&::rank);
::prev=::rank-1;
::next=::rank+1;
if(::prev<0) ::prev=::nnode-1;
if(::next==nnode) ::next=0;
LOG(INFO)<<"rank="<<::rank<<" prev="<<::prev<<" next="<<::next;
return;
}
static void
fini()
{
MPI_CHECK(MPI_Finalize());
return;
}
static void
compute_something(const char *buffer, int count)
{
//must be thread-safe: called concurrently from the IO thread and the recv/forward thread
return;
}
void run(const char *node_local_filename)
{
MPI_CHECK(MPI_Barrier(::comm));
MPI_File fh;
MPI_CHECK(MPI_File_open(MPI_COMM_SELF,node_local_filename,MPI_MODE_RDONLY,MPI_INFO_NULL,&fh));
MPI_Offset file_size;
MPI_CHECK(MPI_File_get_size(fh, &file_size));
LOG(INFO)<<"rank#"<<::rank<<": "<<node_local_filename<<", size="<<file_size;
#pragma omp parallel sections num_threads(2)
{
//file read&send thread
#pragma omp section
{
//double buffers
char *read_bufs[2];
read_bufs[0]=new char[2*BUFSZ];
read_bufs[1]=read_bufs[0]+BUFSZ;
MPI_Request read_req,send_req;
read_req=send_req=MPI_REQUEST_NULL;
int current=0;
MPI_Offset read_offset=0;
int loops=0;
size_t total_read=0;
char *compute_buffer=nullptr;
double tic,toc;
tic=MPI_Wtime();
//issue read from file
MPI_CHECK(MPI_File_iread_at(fh,read_offset,read_bufs[current],BUFSZ,MPI_BYTE,&read_req));
while(1)
{
MPI_CHECK(MPI_Wait(&send_req,MPI_STATUS_IGNORE));
MPI_Status status;
MPI_CHECK(MPI_Wait(&read_req,&status));
LOG_ASSERT(send_req==MPI_REQUEST_NULL)<<"wrong send_req";
LOG_ASSERT(read_bufs[current]!=nullptr)<<"wrong read_bufs[current]";
int count;
MPI_CHECK(MPI_Get_count(&status,MPI_BYTE,&count));
total_read+=count;
read_offset+=count;
//issue next read from file
MPI_CHECK(MPI_File_iread_at(fh,read_offset,read_bufs[(current+1)%2],BUFSZ,MPI_BYTE,&read_req));
//token
int tag=::rank;
//issue send to next
MPI_CHECK(MPI_Isend(read_bufs[current],count,MPI_BYTE,::next,tag,::comm,&send_req));
//EOF
if(count==0)
{
MPI_CHECK(MPI_Wait(&send_req,MPI_STATUS_IGNORE));
if(read_req!=MPI_REQUEST_NULL)
MPI_CHECK(MPI_Wait(&read_req,MPI_STATUS_IGNORE));
break;
}
if(count>0)
compute_something(read_bufs[current],count);
//double buffers toggle
current=(current+1)%2;
loops++;
}
toc=MPI_Wtime();
LOG(INFO)<<"rank="<<::rank<<" read file time used: "<<toc-tic;
MPI_CHECK(MPI_File_close(&fh));
delete [] read_bufs[0];
LOG_ASSERT(total_read==file_size)<<"wrong read file";
LOG(INFO)<<"rank="<<::rank<<" read break while"
<<", loops="<<loops
<<", total_read="<<total_read;
}//io thread
//recv/send forward thread
#pragma omp section
{
int current=0;
//double buffers
char *recv_bufs[2];
recv_bufs[0]=new char[2*BUFSZ];
recv_bufs[1]=recv_bufs[0]+BUFSZ;
size_t *recv_counts=new size_t[::nnode];
for(int i=0; i<::nnode; i++)
{
recv_counts[i]=0;
}
MPI_Request recv_req, send_req;
recv_req=send_req=MPI_REQUEST_NULL;
int tag=0;
MPI_CHECK(MPI_Irecv(recv_bufs[current],BUFSZ,MPI_BYTE,::prev,tag,::comm, &recv_req));
double tic,toc;
tic=MPI_Wtime();
while(tag<::nnode)
{
MPI_Status status;
int count;
MPI_CHECK(MPI_Wait(&send_req,MPI_STATUS_IGNORE));
MPI_CHECK(MPI_Wait(&recv_req,&status));
MPI_CHECK(MPI_Get_count(&status,MPI_BYTE,&count));
recv_counts[tag]+=count;
//issue next recv op
if(count>0)
{
MPI_CHECK(MPI_Irecv(recv_bufs[(current+1)%2],BUFSZ,MPI_BYTE,::prev,tag,::comm,&recv_req));
}
else if(count==0)
{
//next token=tag+1
if(tag+1<::nnode)
MPI_CHECK(MPI_Irecv(recv_bufs[(current+1)%2],BUFSZ,MPI_BYTE,::prev,tag+1,::comm,&recv_req));
}
LOG_ASSERT(tag==status.MPI_TAG)<<"wrong tag";
LOG_ASSERT(tag>=0 && tag<::nnode)<<"wrong tag";
if(tag==::next)
{
if(count==0)
{
//forward the EOF marker (zero-length message)
MPI_CHECK(MPI_Isend(recv_bufs[current],0,MPI_BYTE,::next,tag,::comm,&send_req));
}
else
{
//send a 1-byte token instead of the payload, so the originating node never receives back the data its own IO thread already read; the ring is effectively one hop shorter
MPI_CHECK(MPI_Isend(recv_bufs[current],1,MPI_BYTE,::next,tag,::comm,&send_req));
}
}
else if(tag==::rank)
{
LOG_ASSERT(count==0 || count==1)<<"wrong count";
}
else
{
MPI_CHECK(MPI_Isend(recv_bufs[current],count,MPI_BYTE,::next,tag,::comm,&send_req));
}
//EOF, tag++
if(count==0) tag++;
if(tag!=::rank)
{
if(count>0)
compute_something(recv_bufs[current],count);
}
//file data from all nodes has been received
if(tag==::nnode) break;
//double buffers toggle
current=(current+1)%2;
}//while(tag<::nnode)
LOG(INFO)<<"rank="<<::rank<<" recv/send break while";
MPI_CHECK(MPI_Wait(&send_req,MPI_STATUS_IGNORE));
MPI_CHECK(MPI_Wait(&recv_req,MPI_STATUS_IGNORE));
toc=MPI_Wtime();
LOG(INFO)<<"rank="<<::rank<<" recv/forward time used: "<<toc-tic;
//----------------------------------------------------------------------------
//sanity-check statistics
size_t total_data_size=file_size;
for(int i=0; i<::nnode; i++)
{
LOG(INFO)<<"rank="<<::rank<<" recv_counts["<<i<<"]="<<recv_counts[i];
if(i!=::rank)
total_data_size+=recv_counts[i];
}
if(total_data_size>::GB)
LOG(INFO)<<"rank="<<::rank<<" total_data_size="<<(double)total_data_size/::GB<<" GB";
else if(total_data_size>::MB)
LOG(INFO)<<"rank="<<::rank<<" total_data_size="<<(double)total_data_size/::MB<<" MB";
//-----------------------------------------------------------------------------------------
delete [] recv_counts;
delete [] recv_bufs[0];
}//recv & send thread
}//#pragma omp parallel sections
}
int main(int argc, char **argv)
{
const char *node_local_filename="/tmp/xxgg.dir/data.bin";
initLog(argc,argv);
init();
run(node_local_filename);
fini();
closeLog();
}
The program uses count==0 to signal EOF and uses the message tag to record the packet's originating IO node. The tag also acts as a token: only the node with rank==tag reads and injects its file data into the ring, which preserves the logical order of the data.
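The zero-length-message idiom is worth isolating: the receive call alone cannot distinguish EOF from payload, but MPI_Get_count on the returned status can. A minimal standalone sketch of just this mechanism (two ranks; names are illustrative and this is not part of the programs above):
#include <mpi.h>
#include <stdio.h>
#include <string.h>
//minimal sketch: rank 0 streams one chunk to rank 1, then a zero-length
//message as the EOF sentinel; rank 1 detects it via MPI_Get_count
int main(int argc, char **argv)
{
MPI_Init(&argc,&argv);
int rank;
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
char buf[16];
if(rank==0)
{
strcpy(buf,"payload");
MPI_Send(buf,8,MPI_BYTE,1,0,MPI_COMM_WORLD); //data chunk
MPI_Send(buf,0,MPI_BYTE,1,0,MPI_COMM_WORLD); //EOF: count==0
}
else if(rank==1)
{
while(1)
{
MPI_Status status;
MPI_Recv(buf,sizeof(buf),MPI_BYTE,0,0,MPI_COMM_WORLD,&status);
int count;
MPI_Get_count(&status,MPI_BYTE,&count);
if(count==0){ printf("EOF\n"); break; } //sentinel seen
printf("got %d bytes\n",count);
}
}
MPI_Finalize();
return 0;
}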
Performance data
Nodes | File size (GB) | Time (s) |
---|---|---|
32 | 282.6 | 74 |
64 | 626.7 | 164 |
Both runs sustain roughly 3.8 GB/s of aggregate throughput (282.6 GB / 74 s and 626.7 GB / 164 s).
Scenario 2
Each node accesses the data of the large file, but the order of the data does not matter. In each time step, all N nodes read a chunk from their local file simultaneously and send it to their downstream node; the chunks then circulate around the ring to the other nodes, after which the next time step begins. In this scheme every node's local disk works at the same time, so disk bandwidth is used much more efficiently.
Source code
#include <mpi.h>
#include <glog/logging.h>
static inline void
mpi_check_error(int errcode, const char *file, const char *function, int linenum)
{
if(errcode!=MPI_SUCCESS)
{
char errstr[MPI_MAX_ERROR_STRING];
int resultlen;
MPI_Error_string(errcode,errstr,&resultlen);
printf("%s.%s.%d: MPI errror %s\n", file,function,linenum,errstr);
MPI_Abort(MPI_COMM_WORLD,errcode);
}
return ;
}
#define MPI_CHECK(e) do{int errcode=(e); mpi_check_error(errcode,__FILE__, __FUNCTION__,__LINE__);}while(0)
static void initLog(int &argc, char **&argv)
{
google::InitGoogleLogging(argv[0]);
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InstallFailureSignalHandler();
//google::InstallFailureWriter(&FatalMessageDump);
//FLAGS_logbufsecs=0;
//FLAGS_log_dir="/tmp/";
FLAGS_log_dir="./";
FLAGS_minloglevel = 0;
FLAGS_logtostderr=1;
FLAGS_colorlogtostderr = true;
FLAGS_max_log_size = 1024;
FLAGS_stop_logging_if_full_disk = true;
}
static void closeLog()
{
google::ShutdownGoogleLogging();
}
namespace {
static const size_t KB=1024UL;
static const size_t MB=KB*KB;
static const size_t GB=KB*KB*KB;
}
namespace {
static const int BUFSZ=100*MB;
MPI_Comm comm;
int rank;
int nnode;
int prev;
int next;
}
static void
init()
{
int provided;
MPI_CHECK(MPI_Init_thread(nullptr,nullptr,MPI_THREAD_MULTIPLE,&provided));
LOG_ASSERT(provided==MPI_THREAD_MULTIPLE)<<"wrong MPI thread level";
comm=MPI_COMM_WORLD;
MPI_Comm_size(comm,&::nnode);
MPI_Comm_rank(comm,&::rank);
::prev=::rank-1;
::next=::rank+1;
if(::prev<0) ::prev=::nnode-1;
if(::next==nnode) ::next=0;
LOG(INFO)<<"rank="<<::rank<<" prev="<<::prev<<" next="<<::next;
return;
}
static void
fini()
{
MPI_CHECK(MPI_Finalize());
return;
}
static void
compute_something(const char *buffer, int count)
{
return;
}
void run(const char *node_local_file)
{
//const char *node_local_file="/tmp/xxgg.dir/data.bin";
MPI_CHECK(MPI_Barrier(::comm));
//-----------------------------------------------------------------------------------
//Open local file for reading
MPI_File fh;
MPI_CHECK(MPI_File_open(MPI_COMM_SELF,node_local_file,MPI_MODE_RDONLY,MPI_INFO_NULL,&fh));
MPI_Offset file_size;
MPI_CHECK(MPI_File_get_size(fh, &file_size));
LOG(INFO)<<"rank#"<<::rank<<": "<<node_local_file<<", size="<<file_size;
//----------------------------------------------
//compute the global number of IO loops
int io_loops=file_size/BUFSZ;
if(file_size%BUFSZ) io_loops++;
int g_io_loops;
MPI_CHECK(MPI_Allreduce(&io_loops,&g_io_loops,1,MPI_INT,MPI_MAX,::comm));
LOG(INFO)<<"rank#"<<::rank<<" io_loops="<<io_loops<<", global_io_loops="<<g_io_loops;
//------------------------------------------------
//allocate memory resources
//double buffers for reading
char *read_bufs[2];
read_bufs[0]=new char[2*BUFSZ];
read_bufs[1]=read_bufs[0]+BUFSZ;
//double buffers for recv
char *recv_bufs[2];
recv_bufs[0]=new char[2*BUFSZ];
recv_bufs[1]=recv_bufs[0]+BUFSZ;
//----------------------------------------
//MPI_Request for file
MPI_Request read_req, io_send_req;
//MPI_Request for recv/send
MPI_Request recv_req,send_req;
read_req=io_send_req=MPI_REQUEST_NULL;
recv_req=send_req=MPI_REQUEST_NULL;
//----------------------------------------------------
//double buffers index
int io_current,recv_current;
io_current=0;
recv_current=0;
MPI_Offset read_offset=0;
size_t total_read=0;
MPI_CHECK(MPI_File_iread_at(fh,read_offset,read_bufs[io_current],BUFSZ,MPI_BYTE,&read_req));
MPI_CHECK(MPI_Irecv(recv_bufs[recv_current],BUFSZ,MPI_BYTE,::prev,MPI_ANY_TAG,::comm, &recv_req));
size_t *recv_counts=new size_t[::nnode];
for(int i=0; i<::nnode; i++)
{
recv_counts[i]=0;
}
int *flags_finished=new int[::nnode];
for(int i=0; i<::nnode; i++)
{
flags_finished[i]=0;
}
double tic,toc;
tic=MPI_Wtime();
for(int io=0; io<g_io_loops; io++)
{
MPI_Status status;
MPI_CHECK(MPI_Wait(&read_req,&status));
int count;
MPI_CHECK(MPI_Get_count(&status,MPI_BYTE,&count));
total_read+=count;
read_offset+=count;
if(count==0) flags_finished[::rank]++;
int tag=::rank;
MPI_CHECK(MPI_Wait(&io_send_req,MPI_STATUS_IGNORE));
//send to next
MPI_CHECK(MPI_Isend(read_bufs[io_current],count,MPI_BYTE,::next,tag,::comm,&io_send_req));
//issue next read
MPI_CHECK(MPI_File_iread_at(fh,read_offset,read_bufs[(io_current+1)%2],BUFSZ,MPI_BYTE,&read_req));
if(count>0)
compute_something(read_bufs[io_current],count);
io_current=(io_current+1)%2;
while(1){
MPI_Status status;
MPI_CHECK(MPI_Wait(&recv_req,&status));
MPI_CHECK(MPI_Get_count(&status,MPI_BYTE,&count));
int tag=status.MPI_TAG;
LOG_ASSERT(tag>=0 && tag<::nnode)<<"wrong tag";
LOG_ASSERT(tag!=::rank)<<"wrong tag";
MPI_CHECK(MPI_Wait(&send_req,MPI_STATUS_IGNORE));
MPI_CHECK(MPI_Irecv(recv_bufs[(recv_current+1)%2],BUFSZ,MPI_BYTE,::prev,MPI_ANY_TAG,::comm, &recv_req));
if(tag!=::next)
{
MPI_CHECK(MPI_Isend(recv_bufs[recv_current],count,MPI_BYTE,::next,tag,::comm,&send_req));
}
if(count>0)
compute_something(recv_bufs[recv_current],count);
recv_counts[tag]+=count;
if(count==0) flags_finished[tag]++;
recv_current=(recv_current+1)%2;
if(tag==::next) break;
}
if(send_req!=MPI_REQUEST_NULL)
MPI_CHECK(MPI_Wait(&send_req,MPI_STATUS_IGNORE));
}
toc=MPI_Wtime();
LOG(INFO)<<"rank="<<::rank<<" total time used: "<<toc-tic;
//the last time step posted an Irecv that no message will ever match; cancel it before cleanup
MPI_CHECK(MPI_Cancel(&recv_req));
MPI_CHECK(MPI_Wait(&recv_req,MPI_STATUS_IGNORE));
LOG_ASSERT(total_read==file_size)<<"wrong total_read";
//----------------------------------------------------------------------------
//sanity-check statistics
size_t total_data_size=file_size;
for(int i=0; i<::nnode; i++)
{
LOG(INFO)<<"rank="<<::rank<<" recv_counts["<<i<<"]="<<recv_counts[i]
<<" flags_finished["<<i<<"]="<<flags_finished[i];
if(i!=::rank)
total_data_size+=recv_counts[i];
}
if(total_data_size>::GB)
LOG(INFO)<<"rank="<<::rank<<" total_data_size="<<(double)total_data_size/::GB<<" GB";
else if(total_data_size>::MB)
LOG(INFO)<<"rank="<<::rank<<" total_data_size="<<(double)total_data_size/::MB<<" MB";
delete [] recv_counts;
delete [] flags_finished;
delete [] read_bufs[0];
delete [] recv_bufs[0];
MPI_CHECK(MPI_File_close(&fh));
}
int main(int argc, char **argv)
{
const char *node_local_file="/tmp/xxgg.dir/data.bin";
initLog(argc,argv);
init();
run(node_local_file);
fini();
closeLog();
}
The program estimates the global number of IO loops from the buffer size. If the node-local data sizes are roughly balanced, the fraction of payload-carrying packets is higher and the scheme is more efficient.
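The estimate itself is a ceiling division followed by a global maximum, so every rank executes the same number of time steps even when file sizes differ. A minimal sketch of just that step (the helper name is illustrative; the calls mirror the io_loops/g_io_loops logic in run() above):
#include <mpi.h>
//compute how many read steps every rank must execute so the ring stays in
//lockstep even when node-local file sizes differ
static int global_io_loops(MPI_Offset file_size, int bufsz, MPI_Comm comm)
{
int io_loops=(int)((file_size+bufsz-1)/bufsz); //ceil(file_size/bufsz)
int g_io_loops;
MPI_Allreduce(&io_loops,&g_io_loops,1,MPI_INT,MPI_MAX,comm);
//ranks with smaller files still run g_io_loops steps; their extra reads
//return count==0 and circulate as EOF markers (counted in flags_finished)
return g_io_loops;
}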
Performance data
Nodes | File size (GB) | Time (s) |
---|---|---|
32 | 293.0 | 50 |
64 | 577.7 | 109 |
This corresponds to roughly 5.9 GB/s (293.0 GB / 50 s) and 5.3 GB/s (577.7 GB / 109 s) of aggregate throughput, clearly higher than Scenario 1, since all local disks read concurrently.
Summary
- File reads and data receive/forward use non-blocking calls
- Double buffering decouples the stages, so IO, network, and compute proceed concurrently (see the sketch below)
- The count and tag arguments of the MPI transfer calls carry special messages: count==0 marks EOF, and the tag records the originating node and acts as a token
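As a distilled view of the double-buffering pattern shared by both programs, the sketch below (function and parameter names are illustrative, and the computation is stubbed out) overlaps the read of the next chunk with the send and processing of the current one, exactly as the IO thread of Scenario 1 does:
#include <mpi.h>
static void process(const char *data, int n) { (void)data; (void)n; } //computation stub
//double-buffering skeleton: while buffer `cur` is being sent downstream and
//processed, the next chunk is already being read into the other buffer
void pipeline(MPI_File fh, int next, MPI_Comm comm, int bufsz)
{
char *bufs[2];
bufs[0]=new char[2*(size_t)bufsz];
bufs[1]=bufs[0]+bufsz;
MPI_Request read_req=MPI_REQUEST_NULL, send_req=MPI_REQUEST_NULL;
MPI_Offset off=0;
int cur=0;
MPI_File_iread_at(fh,off,bufs[cur],bufsz,MPI_BYTE,&read_req);
while(1)
{
MPI_Status st;
int count;
MPI_Wait(&send_req,MPI_STATUS_IGNORE); //previous send finished
MPI_Wait(&read_req,&st); //current read finished
MPI_Get_count(&st,MPI_BYTE,&count);
off+=count;
//issue the NEXT read before touching the current buffer
MPI_File_iread_at(fh,off,bufs[(cur+1)%2],bufsz,MPI_BYTE,&read_req);
MPI_Isend(bufs[cur],count,MPI_BYTE,next,0,comm,&send_req);
if(count==0) //EOF sentinel was just sent
{
MPI_Wait(&send_req,MPI_STATUS_IGNORE);
MPI_Wait(&read_req,MPI_STATUS_IGNORE);
break;
}
process(bufs[cur],count); //overlaps the in-flight read and send
cur=(cur+1)%2;
}
delete [] bufs[0];
}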
Build and job scripts
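The original listing does not include a compile command; something along the following lines should work, assuming an MPI C++ compiler wrapper plus the glog/gflags development packages are installed (the source file name, flags, and output name are illustrative, chosen to match the job script below):
mpicxx -O2 -fopenmp test_mpiio.cpp -lglog -lgflags -o test_mpiio.out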
Generating a data file of random size
#!/bin/bash
if [ -z "${TEST_TMPDIR}" ]; then
echo "need to set TEST_TMPDIR"
exit 1
fi
################################################################
A=${1:-1000}
B=${2:-20000}
# seed bash's RANDOM from the nanosecond clock, then pick COUNT in [A, A+B)
RANDOM=$(date +%s%N | cut -b10-19)
COUNT=$(($A+$RANDOM%$B))
echo "COUNT=$COUNT"
################################################################
mkdir -p ${TEST_TMPDIR}/
rm -f ${TEST_TMPDIR}/*
dd if=/dev/urandom of=${TEST_TMPDIR}/data.bin bs=1M count=$COUNT
ls -l ${TEST_TMPDIR}/data.bin
Slurm job script
#!/bin/bash
#SBATCH -N 64
#SBATCH -c 2
export TEST_TMPDIR=/tmp/xxgg.dir
#-----------------------------------
srun ./gen_temp_file.sh
echo "genenrate temp file OK"
#-----------------------------------
srun ./test_mpiio.out
#-----------------------------------
srun rm -f ${TEST_TMPDIR}/*
echo "delete temp file OK"
#-----------------------------------
date