钩子函数:
GKFS是一个兼容POSIX标准的用户空间文件系统,针对用户空间IO栈中最底层的系统调用层进行拦截。hook()是一个钩子函数,根据系统调用号判断拦截的是哪个系统调用,各个系统调用又有一个专门的钩子函数hook_xxx()处理。由于GKFS面向的是HPC环境,而传统POSIX提供的许多函数在HPC应用中是用不上的(比如chmod,rename等等),因此GKFS没有对这些不常用的系统调用进行实现。
/**
 * Central syscall interception hook.
 *
 * Dispatches on the intercepted syscall number and forwards the raw
 * arguments to the matching hook_xxx() handler. Each handler then decides
 * whether the call targets GKFS or should be passed through to the kernel.
 * Syscalls not listed here are never intercepted.
 *
 * @param syscall_number number of the intercepted syscall
 * @param arg0..arg5     raw syscall arguments; each case casts them to the
 *                       types the corresponding handler expects
 * @param result         out parameter receiving the handler's return value
 * @return HOOKED when the syscall was consumed here, NOT_HOOKED to let the
 *         kernel execute it as usual
 */
static inline int hook(long syscall_number,
                       long arg0, long arg1, long arg2,
                       long arg3, long arg4, long arg5,
                       long *result)
{
    switch (syscall_number) {
    // dispatch on the syscall number to the specialized hook_xxx() handler
    case SYS_open:
        *result = hook_openat(AT_FDCWD,
                              reinterpret_cast<char*>(arg0),
                              static_cast<int>(arg1),
                              static_cast<mode_t>(arg2));
        break;
    // AT_FDCWD as dirfd makes the *at() call resolve relative paths against
    // the process's current working directory
    // (FDCWD: File Descriptor Current Working Directory)
    case SYS_creat:
        *result = hook_openat(AT_FDCWD,
                              reinterpret_cast<const char*>(arg0),
                              O_WRONLY | O_CREAT | O_TRUNC,
                              static_cast<mode_t>(arg1));
        break;
    case SYS_openat:
        *result = hook_openat(static_cast<int>(arg0),
                              reinterpret_cast<const char*>(arg1),
                              static_cast<int>(arg2),
                              static_cast<mode_t>(arg3));
        break;
    case SYS_close:
        *result = hook_close(static_cast<int>(arg0));
        break;
    case SYS_stat:
        *result = hook_stat(reinterpret_cast<char*>(arg0),
                            reinterpret_cast<struct stat*>(arg1));
        break;
    case SYS_lstat:
        *result = hook_lstat(reinterpret_cast<char*>(arg0),
                             reinterpret_cast<struct stat*>(arg1));
        break;
    case SYS_fstat:
        *result = hook_fstat(static_cast<int>(arg0),
                             reinterpret_cast<struct stat*>(arg1));
        break;
    case SYS_newfstatat:
        *result = hook_fstatat(static_cast<int>(arg0),
                               reinterpret_cast<const char*>(arg1),
                               reinterpret_cast<struct stat *>(arg2),
                               static_cast<int>(arg3));
        break;
    case SYS_read:
        *result = hook_read(static_cast<unsigned int>(arg0),
                            reinterpret_cast<void*>(arg1),
                            static_cast<size_t>(arg2));
        break;
    // pread/pwrite suit concurrent I/O on one file from multiple threads;
    // since Linux 2.6 the syscalls carry the "64" suffix
    case SYS_pread64:
        *result = hook_pread(static_cast<unsigned int>(arg0),
                             reinterpret_cast<char *>(arg1),
                             static_cast<size_t>(arg2),
                             static_cast<loff_t>(arg3));
        break;
    case SYS_pwrite64:
        *result = hook_pwrite(static_cast<unsigned int>(arg0),
                              reinterpret_cast<const char *>(arg1),
                              static_cast<size_t>(arg2),
                              static_cast<loff_t>(arg3));
        break;
    case SYS_write:
        *result = hook_write(static_cast<unsigned int>(arg0),
                             reinterpret_cast<const char *>(arg1),
                             static_cast<size_t>(arg2));
        break;
    case SYS_writev:
        *result = hook_writev(static_cast<unsigned long>(arg0),
                              reinterpret_cast<const struct iovec *>(arg1),
                              static_cast<unsigned long>(arg2));
        break;
    // noted as unsupported by GKFS
    case SYS_pwritev:
        *result = hook_pwritev(static_cast<unsigned long>(arg0),
                               reinterpret_cast<const struct iovec *>(arg1),
                               static_cast<unsigned long>(arg2),
                               static_cast<unsigned long>(arg3),
                               static_cast<unsigned long>(arg4));
        break;
    // Linux has no remove() syscall; deleting a file is done via unlink
    case SYS_unlink:
        *result = hook_unlinkat(AT_FDCWD,
                                reinterpret_cast<const char *>(arg0),
                                0);
        break;
    case SYS_unlinkat:
        *result = hook_unlinkat(static_cast<int>(arg0),
                                reinterpret_cast<const char*>(arg1),
                                static_cast<int>(arg2));
        break;
    case SYS_rmdir:
        *result = hook_unlinkat(AT_FDCWD,
                                reinterpret_cast<const char *>(arg0),
                                AT_REMOVEDIR);
        break;
    // noted as unsupported by GKFS
    case SYS_symlink:
        *result = hook_symlinkat(reinterpret_cast<const char *>(arg0),
                                 AT_FDCWD,
                                 reinterpret_cast<const char *>(arg1));
        break;
    // noted as unsupported by GKFS
    case SYS_symlinkat:
        *result = hook_symlinkat(reinterpret_cast<const char *>(arg0),
                                 static_cast<int>(arg1),
                                 reinterpret_cast<const char *>(arg2));
        break;
    // checks the caller's permissions on a file, or whether the file exists
    case SYS_access:
        *result = hook_access(reinterpret_cast<const char*>(arg0),
                              static_cast<int>(arg1));
        break;
    case SYS_faccessat:
        *result = hook_faccessat(static_cast<int>(arg0),
                                 reinterpret_cast<const char*>(arg1),
                                 static_cast<int>(arg2));
        break;
    case SYS_lseek:
        *result = hook_lseek(static_cast<unsigned int>(arg0),
                             static_cast<off_t>(arg1),
                             static_cast<unsigned int>(arg2));
        break;
    case SYS_truncate:
        *result = hook_truncate(reinterpret_cast<const char*>(arg0),
                                static_cast<long>(arg1));
        break;
    case SYS_ftruncate:
        *result = hook_ftruncate(static_cast<unsigned int>(arg0),
                                 static_cast<unsigned long>(arg1));
        break;
    case SYS_dup:
        *result = hook_dup(static_cast<unsigned int>(arg0));
        break;
    case SYS_dup2:
        *result = hook_dup2(static_cast<unsigned int>(arg0),
                            static_cast<unsigned int>(arg1));
        break;
    // noted as unsupported by GKFS
    case SYS_dup3:
        *result = hook_dup3(static_cast<unsigned int>(arg0),
                            static_cast<unsigned int>(arg1),
                            static_cast<int>(arg2));
        break;
    case SYS_getdents:
        *result = hook_getdents(static_cast<unsigned int>(arg0),
                                reinterpret_cast<struct linux_dirent *>(arg1),
                                static_cast<unsigned int>(arg2));
        break;
    case SYS_getdents64:
        *result = hook_getdents64(static_cast<unsigned int>(arg0),
                                  reinterpret_cast<struct linux_dirent64 *>(arg1),
                                  static_cast<unsigned int>(arg2));
        break;
    case SYS_mkdirat:
        *result = hook_mkdirat(static_cast<unsigned int>(arg0),
                               reinterpret_cast<const char *>(arg1),
                               static_cast<mode_t>(arg2));
        break;
    case SYS_mkdir:
        *result = hook_mkdirat(AT_FDCWD,
                               reinterpret_cast<const char *>(arg0),
                               static_cast<mode_t>(arg1));
        break;
    // noted as unsupported by GKFS
    case SYS_chmod:
        *result = hook_fchmodat(AT_FDCWD,
                                reinterpret_cast<char*>(arg0),
                                static_cast<mode_t>(arg1));
        break;
    // noted as unsupported by GKFS
    case SYS_fchmod:
        *result = hook_fchmod(static_cast<unsigned int>(arg0),
                              static_cast<mode_t>(arg1));
        break;
    // noted as unsupported by GKFS
    case SYS_fchmodat:
        *result = hook_fchmodat(static_cast<unsigned int>(arg0),
                                reinterpret_cast<char*>(arg1),
                                static_cast<mode_t>(arg2));
        break;
    // changes the calling process's current working directory
    case SYS_chdir:
        *result = hook_chdir(reinterpret_cast<const char *>(arg0));
        break;
    case SYS_fchdir:
        *result = hook_fchdir(static_cast<unsigned int>(arg0));
        break;
    case SYS_getcwd:
        *result = hook_getcwd(reinterpret_cast<char *>(arg0),
                              static_cast<unsigned long>(arg1));
        break;
    // noted as unsupported by GKFS
    case SYS_readlink:
        *result = hook_readlinkat(AT_FDCWD,
                                  reinterpret_cast<const char *>(arg0),
                                  reinterpret_cast<char *>(arg1),
                                  static_cast<int>(arg2));
        break;
    // noted as unsupported by GKFS
    case SYS_readlinkat:
        *result = hook_readlinkat(static_cast<int>(arg0),
                                  reinterpret_cast<const char *>(arg1),
                                  reinterpret_cast<char *>(arg2),
                                  static_cast<int>(arg3));
        break;
    // manipulates the properties of an open file descriptor
    case SYS_fcntl:
        *result = hook_fcntl(static_cast<unsigned int>(arg0),
                             static_cast<unsigned int>(arg1),
                             static_cast<unsigned long>(arg2));
        break;
    // noted as unsupported by GKFS
    case SYS_rename:
        *result = hook_renameat(AT_FDCWD,
                                reinterpret_cast<const char *>(arg0),
                                AT_FDCWD,
                                reinterpret_cast<const char *>(arg1),
                                0);
        break;
    // noted as unsupported by GKFS
    case SYS_renameat:
        *result = hook_renameat(static_cast<int>(arg0),
                                reinterpret_cast<const char *>(arg1),
                                static_cast<int>(arg2),
                                reinterpret_cast<const char *>(arg3),
                                0);
        break;
    // noted as unsupported by GKFS
    case SYS_renameat2:
        *result = hook_renameat(static_cast<int>(arg0),
                                reinterpret_cast<const char *>(arg1),
                                static_cast<int>(arg2),
                                reinterpret_cast<const char *>(arg3),
                                static_cast<unsigned int>(arg4));
        break;
    case SYS_fstatfs:
        *result = hook_fstatfs(static_cast<unsigned int>(arg0),
                               reinterpret_cast<struct statfs *>(arg1));
        break;
    // file system statistics (type, total size, free space, file count, ...)
    case SYS_statfs:
        *result = hook_statfs(reinterpret_cast<const char *>(arg0),
                              reinterpret_cast<struct statfs *>(arg1));
        break;
    default:
        /*
         * Ignore any other syscalls
         * i.e.: pass them on to the kernel
         * as would normally happen.
         */
#if LOG_SYSCALLS
        CTX->log()->trace("Syscall [{}, {}] Passthrough", syscall_names[syscall_number], syscall_number);
#endif
        return NOT_HOOKED;
    }
#if LOG_SYSCALLS
    CTX->log()->trace("Syscall [{}, {}] Intercepted", syscall_names[syscall_number], syscall_number);
#endif
    return HOOKED;
}
SYS_open:
在GKFS中,钩子函数的主要工作是判断一个系统调用是内部的还是外部的,也即判断一个系统调用应当由GKFS处理还是由本地文件系统处理。
/**
 * Hook for the open/creat/openat syscalls.
 *
 * Resolves `cpath` relative to `dirfd` and decides whether the open targets
 * GKFS (handled by adafs_open()) or an external path (passed through to the
 * kernel untouched).
 *
 * @param dirfd directory file descriptor the path is resolved against
 * @param cpath pathname to open
 * @param flags open(2) flags (read-only, write-only, create, ...)
 * @param mode  creation mode (permission bits) used when O_CREAT is given
 * @return a file descriptor on success, or a negative errno value
 */
int hook_openat(int dirfd, const char *cpath, int flags, mode_t mode) {
    CTX->log()->trace("{}() called with fd: {}, path: {}, flags: {}, mode: {}",
                      __func__, dirfd, cpath, flags, mode);
    std::string resolved;
    auto rstatus = CTX->relativize_fd_path(dirfd, cpath, resolved);
    switch(rstatus) {
        case RelativizeStatus::fd_unknown:
            // dirfd is not tracked by GKFS: let the kernel handle the call
            return syscall_no_intercept(SYS_openat, dirfd, cpath, flags, mode);
        case RelativizeStatus::external:
            // resolved path lies outside the GKFS mount point
            return syscall_no_intercept(SYS_openat, dirfd, resolved.c_str(), flags, mode);
        case RelativizeStatus::fd_not_a_dir:
            return -ENOTDIR;
        case RelativizeStatus::internal:
            // path is internal to GKFS: intercept and use the GKFS implementation
            return with_errno(adafs_open(resolved, mode, flags));
        default:
            // BUG FIX: the format string has two placeholders but only
            // __func__ was supplied; also pass the unknown status value
            CTX->log()->error("{}() relativize status unknown: {}", __func__,
                              static_cast<int>(rstatus));
            return -EINVAL;
    }
}
在钩子函数hook_openat()中,如果判断路径是外部的,那就使用syscall_no_intercept()将其转发给本地文件系统,如果路径是内部的,那么进一步调用GKFS的相应的实现,GKFS实现的函数都用adafs作为前缀,比如adafs_open()。
/*GKFS自定义的文件系统操作函数,用于处理打开文件的请求。
*文件打开后,将文件描述符注册进打开文件表,并返回打开文件的描述符。
*
* 工作流程:首先获取文件的元数据,成功获取后在打开文件表中注册该文件
*/
int adafs_open(const std::string& path, mode_t mode, int flags) {
if(flags & O_PATH){
CTX->log()->error("{}() `O_PATH` flag is not supported", __func__);
errno = ENOTSUP;
return -1;
}
if(flags & O_APPEND){
CTX->log()->error("{}() `O_APPEND` flag is not supported", __func__);
errno = ENOTSUP;
return -1;
}
bool exists = true;
auto md = adafs_metadata(path); //获取文件元数据
if (!md) {//通过是否找到元数据来判断路径是否存在
if(errno == ENOENT) {//元数据不存在
exists = false;
} else {//和服务器通信失败
CTX->log()->error("{}() error while retriving stat to file", __func__);
return -1;
}
}
if (!exists) {
if (! (flags & O_CREAT)) {
// file doesn't exists and O_CREAT was not set
errno = ENOENT;
return -1;
}
/*** CREATION ***/
assert(flags & O_CREAT);
//GKFS的open()只支持创建不存在的文件,不支持创建不存在的目录
if(flags & O_DIRECTORY){
CTX->log()->error("{}() O_DIRECTORY use with O_CREAT. NOT SUPPORTED", __func__);
errno = ENOTSUP;
return -1;
}
// no access check required here. If one is using our FS they have the permissions.
if(adafs_mk_node(path, mode | S_IFREG)) {
CTX->log()->error("{}() error creating non-existent file: {}",
__func__, strerror(errno));
return -1;
}
} else {
/* File already exists */
if(flags & O_EXCL) {
// File exists and O_EXCL was set
errno = EEXIST;
return -1;
}
#ifdef HAS_SYMLINKS
if (md->is_link()) {
if (flags & O_NOFOLLOW) {
CTX->log()->warn("{}() symlink found and O_NOFOLLOW flag was specified", __func__);
errno = ELOOP;
return -1;
}
return adafs_open(md->target_path(), mode, flags);
}
#endif
if(S_ISDIR(md->mode())) {
return adafs_opendir(path);
}
/*** Regular file exists ***/
assert(S_ISREG(md->mode()));
if( (flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY)) ) {
if(adafs_truncate(path, md->size(), 0)) {
CTX->log()->error("{}() error truncating file", __func__);
return -1;
}
}
}
return CTX->file_map()->add(std::make_shared<OpenFile>(path, flags));
}
在钩子函数中,判断文件路径是否为内部的,只是把路径和挂载目录作比对,如果路径名落在挂载目录下,那就判断为内部文件,然而,虽然这是一个合法的内部路径,但是它未必存在,所以还要调用adafs_metadata()从服务器上获取它的元数据,如果元数据存在,说明以前确实创建过这个路径,如果不存在,那么根据Linux-open()的语义,检查打开文件的时候指定的标志flags,看是否设置了O_CREAT,如果设置了,说明用户的意思是"如果文件不存在就创建这个文件",所以,下一步调用adafs_mk_node()。在GKFS中,不论是创建目录还是创建文件,都实现为在服务器上新增一条相应的元数据,都会调用adafs_mk_node(),所以这里的代码为了区别开目录和文件,在mode上又设置了S_IFREG这个标志,说明创建的是普通文件。但其实目录和文件的元数据形式都是一样的,这里只是为了提高代码的可读性。
创建元数据最终调用的是服务器上的create_metadentry(),这个函数根据文件系统的配置来决定如何填充元数据的各个字段,文件系统的配置主要就是用在这里。
元数据创建成功后,客户端在预加载上下文的打开文件表中新增一项打开文件。
回到获取元数据的话题,以上内容讲的是获取失败的情况,但如果获取成功的话,说明路径确实存在,进一步GKFS需要判断路径是一个文件还是一个目录,如果是目录,那么调用adafs_opendir()来打开目录。
int adafs_opendir(const std::string& path) {
auto md = adafs_metadata(path);
if (!md) {
return -1;
}
if (!S_ISDIR(md->mode())) {
CTX->log()->debug("{}() path is not a directory", __func__);
errno = ENOTDIR;
return -1;
}
//因为打开文件表是在客户端本地维护的,所以需要在本地为打开文件分配空间
//所以make_shared<OpneDir>在客户端调用
auto open_dir = std::make_shared<OpenDir>(path);
rpc_send::get_dirents(*open_dir);
return CTX->file_map()->add(open_dir);
}
虽然在当前调用栈中可以保证目录元数据是存在的,但是打开目录作为另外一个比较独立的模块,它可能在别的地方被调用,从“打开目录”这样的语境来考虑,首先也是应该先检查目录路径是否存在,所以调用adafs_metadata(),如果存在,进一步,使用get_dirents()从服务器把该目录下的所有文件和目录的元数据拉回本地,缓存起来。
/**
 * Fetches all directory entries of `open_dir` from every node in the
 * cluster via non-blocking RPCs and adds them to the OpenDir object.
 *
 * Directory entries are hashed across the whole cluster, so every host may
 * hold a share of them; hence an 8 MiB receive buffer is split evenly among
 * all hosts and one bulk-transfer RPC is issued per host.
 *
 * @param open_dir directory whose entries are to be populated
 * @throws std::runtime_error on any RPC setup/transfer failure
 */
void get_dirents(OpenDir& open_dir){
    CTX->log()->trace("{}() called", __func__);
    auto const root_dir = open_dir.path();
    // All hosts that may store entries of this directory (i.e. all of them,
    // since entries are hashed over the whole cluster).
    auto const targets = CTX->distributor()->locate_directory_metadata(root_dir);
    auto const host_size = targets.size();
    std::vector<hg_handle_t> rpc_handles(host_size);
    std::vector<margo_request> rpc_waiters(host_size);
    std::vector<rpc_get_dirents_in_t> rpc_in(host_size);
    std::vector<char*> recv_buffers(host_size); // start address of each host's slice of the receive buffer
    /* preallocate receiving buffer. The actual size is not known yet.
     *
     * On C++14 make_unique function also zeroes the newly allocated buffer.
     * It turns out that this operation is increadibly slow for such a big
     * buffer. Moreover we don't need a zeroed buffer here.
     *
     * The buffer lives in user space; GKFS itself is a user-space FS.
     */
    auto recv_buff = std::unique_ptr<char[]>(new char[RPC_DIRENTS_BUFF_SIZE]); // RPC_DIRENTS_BUFF_SIZE is 8 MiB
    const unsigned long int per_host_buff_size = RPC_DIRENTS_BUFF_SIZE / host_size; // per-host slice of the buffer
    hg_return_t hg_ret;
    for(const auto& target_host: targets){
        CTX->log()->trace("{}() target_host: {}", __func__, target_host);
        // Setup rpc input parameters for each host
        rpc_in[target_host].path = root_dir.c_str();
        recv_buffers[target_host] = recv_buff.get() + (target_host * per_host_buff_size);
        hg_ret = margo_bulk_create( // expose the slice for RDMA bulk transfer
                ld_margo_rpc_id, 1,
                reinterpret_cast<void**>(&recv_buffers[target_host]),
                &per_host_buff_size,
                HG_BULK_WRITE_ONLY, &(rpc_in[target_host].bulk_handle));
        if(hg_ret != HG_SUCCESS){
            throw std::runtime_error("Failed to create margo bulk handle");
        }
        hg_ret = margo_create_wrap_helper(rpc_get_dirents_id, target_host, rpc_handles[target_host]);
        if (hg_ret != HG_SUCCESS) {
            // BUG FIX: the runtime_error used to be constructed but never
            // thrown, silently ignoring a failed handle creation. Clean up
            // the resources created so far, then actually throw.
            for (uint64_t i = 0; i <= target_host; i++) {
                margo_bulk_free(rpc_in[i].bulk_handle);
                if (i != target_host) {
                    // the handle for the current host was never created
                    margo_destroy(rpc_handles[i]);
                }
            }
            throw std::runtime_error("Failed to create margo handle");
        }
        // Send RPC
        CTX->log()->trace("{}() Sending RPC to host: {}", __func__, target_host);
        hg_ret = margo_iforward(rpc_handles[target_host],
                                &rpc_in[target_host],
                                &rpc_waiters[target_host]);
        if (hg_ret != HG_SUCCESS) {
            CTX->log()->error("{}() Unable to send non-blocking get_dirents on {} to recipient {}", __func__, root_dir, target_host);
            for (uint64_t i = 0; i <= target_host; i++) {
                margo_bulk_free(rpc_in[i].bulk_handle);
                margo_destroy(rpc_handles[i]);
            }
            throw std::runtime_error("Failed to forward non-blocking rpc request");
        }
    }
    for(unsigned int target_host = 0; target_host < host_size; target_host++){ // collect replies from every host
        hg_ret = margo_wait(rpc_waiters[target_host]); // wait for this host's RPC to complete
        if (hg_ret != HG_SUCCESS) {
            throw std::runtime_error(fmt::format("Failed while waiting for rpc completion. [root dir: {}, target host: {}]", root_dir, target_host));
        }
        rpc_get_dirents_out_t out{}; // output parameters of the RPC
        hg_ret = margo_get_output(rpc_handles[target_host], &out);
        if (hg_ret != HG_SUCCESS) {
            throw std::runtime_error(fmt::format("Failed to get rpc output.. [path: {}, target host: {}]", root_dir, target_host));
        }
        if (out.err) {
            CTX->log()->error("{}() Sending RPC to host: {}", __func__, target_host);
            throw std::runtime_error(fmt::format("Failed to retrieve dir entries from host '{}'. "
                                                 "Error '{}', path '{}'", target_host, strerror(out.err), root_dir));
        }
        // Buffer layout: dirents_size bools (is-directory flags) followed by
        // the same number of NUL-terminated entry names.
        bool* bool_ptr = reinterpret_cast<bool*>(recv_buffers[target_host]);
        char* names_ptr = recv_buffers[target_host] + (out.dirents_size * sizeof(bool));
        for(unsigned int i = 0; i < out.dirents_size; i++){ // walk the entry list
            FileType ftype = (*bool_ptr)? FileType::directory : FileType::regular;
            bool_ptr++;
            // Check that we are not outside the recv_buff for this specific host
            assert((names_ptr - recv_buffers[target_host]) > 0);
            assert(static_cast<unsigned long int>(names_ptr - recv_buffers[target_host]) < per_host_buff_size);
            auto name = std::string(names_ptr);
            names_ptr += name.size() + 1;
            open_dir.add(name, ftype);
        }
        margo_free_output(rpc_handles[target_host], &out);
        margo_bulk_free(rpc_in[target_host].bulk_handle);
        margo_destroy(rpc_handles[target_host]);
    }
}
这里打开目录有两方面的开销,一是内存开销:因为不知道目录下有多少条目,所以客户端直接开辟了一个很大的一般情况下足以装下所有条目的内存,有8MB之大;二是通信开销、计算开销:元数据通过哈希函数均匀地分布在集群中,同一文件夹下的文件和目录,它们可能分布在各个服务器上,所以拉取目录项的时候,遍历集群中所有主机,所有主机又遍历自己的元数据库,元数据库是一个键值存储的数据库,根据元数据的键来判断它是否下属于一个目录。
写到这里,笔者不禁思考,打开目录为什么一定要立马把所有目录项都拉到客户端?GKFS中文件和目录的名字空间其实是扁平的,不像传统文件存储那样,有一个层状的组织,在传统文件系统中,查找一个文件需要逐层打开目录,逐层查找,但是在GKFS的键值存储数据库中,获取数据和元数据的时间是线性的,不管路径名有多深,只需要进行一次哈希函数值的计算就能确定它的数据和元数据在哪个服务器上,所以在GKFS中,获取目录项是否有意义?在GKFS 0.6.2的实现中,就笔者所知,只有调用adafs_rmdir()来删除目录的时候,获取所有目录项才有意义。GKFS还支持了一个系统调用getdents(),这个调用的语义是获取目录项,笔者想,能不能延迟目录项的拉取,仅在确实需要用到它们的时候,比如用户调用了getdents()的时候,才把目录项从服务器上拉回来呢?
打开目录成功后,在打开文件表中增加一项。
回到元数据获取成功的话题,如果路径是一个文件,那么还要看用户是否以截断的方式打开并且可读可写,如果否,不做处理,一般打开,在打开文件表中新增一项,完成;如果是,那么就要进入另外一个模块adafs_truncate()了,该函数第二个参数是文件原来的大小,第三个参数是截断后的大小,以截断的方式打开一个文件,就是要将文件内容清空,所以这里第三个参数的值为0。
//对于大型文件,truncate 可以用来快速分配空间,而不必实际写入数据
//old_size说明文件原来的大小,这个参数是要从文件元数据中获得的,new_size说明需要将文件的大小调整为多大
/**
 * Shrinks a file to `new_size` bytes. Backs both the truncate() syscall and
 * the O_TRUNC handling in adafs_open().
 *
 * @param path     file to truncate
 * @param old_size current size, as recorded in the file's metadata
 * @param new_size requested size; must satisfy 0 <= new_size <= old_size
 * @return 0 on success, -1 on failure
 */
int adafs_truncate(const std::string& path, off_t old_size, off_t new_size) {
    assert(new_size >= 0);
    assert(new_size <= old_size);
    if (old_size == new_size) {
        // already the requested size; nothing to do
        return 0;
    }
    // Update the metadata first so the recorded size stays consistent ...
    if (rpc_send::decr_size(path, new_size) != 0) {
        CTX->log()->debug("{}() failed to decrease size", __func__);
        return -1;
    }
    // ... then drop the chunk data beyond the new end of the file.
    if (rpc_send::trunc_data(path, old_size, new_size) != 0) {
        CTX->log()->debug("{}() failed to truncate data", __func__);
        return -1;
    }
    return 0;
}
Linux的truncate()调用也是adafs_truncate()这个函数来实现的。truncate()的语义是,截断一个文件,那么新的大小应当小于原来的大小,所以在adafs_truncate()里面,第一步就是一个断言assert(),确保这个大小关系得到满足;紧接着,因为文件的大小变了,而文件大小又是元数据的字段之一,所以元数据也要修改,调用decr_size()保持元数据一致;最后,删除文件尾部的数据。adafs_truncate()结束之后,程序控制权返回到adafs_open(),它在打开文件表中新增一项。
以上就是SYS_open()要做的事。
SYS_close:
关闭文件的工作很简单。删除打开文件表中对应的条目就可以了。
SYS_stat:
这个函数指定一个路径,可以查询它的状态信息。
//获取文件的状态信息,Linux文件系统中“状态信息”这个概念差不多就是GKFS的“元数据”概念
/**
 * Hook for the stat syscall: fills `buf` with the status of `path`.
 * In GKFS "status information" roughly corresponds to the file's metadata.
 * Note: the last path component is NOT resolved as a link here
 * (resolve_last_link == false).
 */
int hook_stat(const char* path, struct stat* buf) {
    CTX->log()->trace("{}() called with path '{}'", __func__, path);
    std::string rel_path;
    const bool is_internal = CTX->relativize_path(path, rel_path, false);
    if (!is_internal) {
        // external path: let the kernel serve the call
        return syscall_no_intercept(SYS_stat, rel_path.c_str(), buf);
    }
    return with_errno(adafs_stat(rel_path, buf));
}
GKFS使用relativize_path()来判断路径是否在GKFS内部,这个函数的第三个参数resolve_last_link是一个布尔类型,默认为true,它用来指示要解析的路径是否为一个链接,如果是链接,那么它会调用POSIX提供的realpath()来获取链接目标的全路径,然后通过rel_path返回。接下来,调用adafs_stat()。
/**
 * GKFS implementation of stat(): fetches the file's metadata from the
 * server and converts it into a struct stat. Fields GKFS does not maintain
 * are filled with defaults by metadata_to_stat().
 *
 * @param path         GKFS-internal path
 * @param buf          output stat structure
 * @param follow_links whether symbolic links are resolved
 * @return 0 on success, -1 on failure
 */
int adafs_stat(const string& path, struct stat* buf, bool follow_links) {
    const auto md = adafs_metadata(path, follow_links);
    if (md) {
        // metadata is available in md; translate it into the stat structure
        metadata_to_stat(path, *md, *buf);
        return 0;
    }
    return -1;
}
Linux定义了一个stat结构体,POSIX标准的stat()函数会把这个结构体的各个字段都填上,但是GKFS不是POSIX完全支持的,这个结构体要求的一些信息,GKFS并没有维护,GKFS的文献说这是因为HPC应用通常用不上这些特性。GKFS实现stat(),只是获取文件元数据,然后使用metadata_to_stat()填充这个结构体的部分字段,元数据里面没有的字段就填充为一些默认的值。
SYS_lstat:
这个函数和stat()类似,不同之处就在于解析路径的时候resolve_last_link为默认值true。虽然POSIX标准的lstat()的语义为获取符号链接的状态信息,但是GKFS实现为获取符号链接指向的文件的信息。
//获取链接文件本身而非链接目标的状态信息
/**
 * Hook for the lstat syscall.
 * Unlike hook_stat(), the default resolve_last_link == true is used here,
 * so for internal paths GKFS stats the target of a link rather than the
 * link itself (deviating from POSIX lstat semantics).
 */
int hook_lstat(const char* path, struct stat* buf) {
    CTX->log()->trace("{}() called with path '{}'", __func__, path);
    std::string rel_path;
    const bool is_internal = CTX->relativize_path(path, rel_path);
    if (!is_internal) {
        // external path: pass through to the kernel
        return syscall_no_intercept(SYS_lstat, rel_path.c_str(), buf);
    }
    return with_errno(adafs_stat(rel_path, buf));
}
SYS_fstat:
该函数也用于获取文件的状态信息,只不过该函数接受的参数是一个文件描述符,通过查询打开文件表来获取描述符所指文件的全路径,然后把全路径传给adafs_stat()。
//该函数也用于获取文件状态信息,只不过传入参数要求是一个描述符而非路径
/**
 * Hook for the fstat syscall: like stat, but takes an open file descriptor.
 * The fd is looked up in the client-side open-file table to recover the
 * file's full path, which is then handed to adafs_stat().
 */
int hook_fstat(unsigned int fd, struct stat* buf) {
    CTX->log()->trace("{}() called with fd '{}'", __func__, fd);
    if (!CTX->file_map()->exist(fd)) {
        // fd is not tracked by GKFS: pass through to the kernel
        return syscall_no_intercept(SYS_fstat, fd, buf);
    }
    const auto path = CTX->file_map()->get(fd)->path();
    return with_errno(adafs_stat(path, buf));
}
SYS_newfstatat:
该函数结尾的at意为在某一路径下,所以函数第一个参数就是一个目录的描述符,第二个参数是目录下的文件,首先使用relativize_fd_path()获取目录的路径,然后把目录路径和文件路径拼接起来,形成一个全路径,通过resolved变量返回。全路径对GKFS来说是很重要的,因为它的哈希函数就是根据全路径来计算值的。relativize_fd_path()同时还负责解析全路径是否是GKFS内部的,如果是,那就调用adafs_stat()。
//获取状态信息:指定一个打开目录描述符dirfd,一个该目录下的路径cpath,flags是一些标志位,指定函数的行为
/**
 * Hook for the newfstatat syscall: stats `cpath` relative to the open
 * directory `dirfd`.
 *
 * relativize_fd_path() joins the directory path and cpath into a full path
 * (returned via `resolved`) and classifies it as internal or external.
 * Flags other than AT_EMPTY_PATH are accepted but ignored by the GKFS path.
 *
 * @param dirfd directory file descriptor the path is resolved against
 * @param cpath pathname relative to dirfd
 * @param buf   output stat structure
 * @param flags fstatat flags; AT_EMPTY_PATH is rejected with -ENOTSUP
 * @return 0 on success, or a negative errno value
 */
int hook_fstatat(int dirfd, const char * cpath, struct stat * buf, int flags) {
    CTX->log()->trace("{}() called with path '{}' and fd {}", __func__, cpath, dirfd);
    // AT_EMPTY_PATH (stat the directory referred to by dirfd itself) is not supported
    if(flags & AT_EMPTY_PATH) {
        CTX->log()->error("{}() AT_EMPTY_PATH flag not supported", __func__);
        return -ENOTSUP;
    }
    std::string resolved;
    auto rstatus = CTX->relativize_fd_path(dirfd, cpath, resolved);
    switch(rstatus) {
        case RelativizeStatus::fd_unknown:
            return syscall_no_intercept(SYS_newfstatat, dirfd, cpath, buf, flags);
        case RelativizeStatus::external:
            return syscall_no_intercept(SYS_newfstatat, dirfd, resolved.c_str(), buf, flags);
        case RelativizeStatus::fd_not_a_dir:
            return -ENOTDIR;
        case RelativizeStatus::internal:
            return with_errno(adafs_stat(resolved, buf));
        default:
            // BUG FIX: the format string has two placeholders but only
            // __func__ was supplied; also pass the unknown status value
            CTX->log()->error("{}() relativize status unknown: {}", __func__,
                              static_cast<int>(rstatus));
            return -EINVAL;
    }
}
该函数名中有一个new的前缀,它新的特性主要是相比上文几个获取文件状态的函数,可以接受一个flags参数,设置一些标志位,但是GKFS并不支持这些标志期望的功能,如果flags设置了AT_EMPTY_PATH,GKFS会报错,如果设置了其他值,GKFS直接无视,调用adafs_stat()的时候根本就没有用这些标志。
SYS_read:
读一个内部文件,调用adafs_read(),第一个参数是文件描述符,第二个参数是一个用户缓冲区的指针,数据将从服务器读到这个缓冲区,第三个参数count说明要读多少字节。
/**
 * GKFS implementation of read(): reads `count` bytes from the file behind
 * `fd` at its current offset into `buf`, then advances the offset by the
 * number of bytes actually read.
 *
 * @return bytes read, or -1 on error
 */
ssize_t adafs_read(int fd, void* buf, size_t count) {
    auto file = CTX->file_map()->get(fd); // OpenFile entry for this fd
    const auto offset = file->pos();      // current read/write position
    const auto nread = adafs_pread(file, static_cast<char*>(buf), count, offset);
    // Advance the position in the open-file table only on a successful read
    if (nread > 0) {
        file->pos(offset + nread);
    }
    return nread;
}
接下来,调用adafs_pread(),这个函数最后一个参数说明从文件的哪个位置开始读,这个函数其实是对Linux的pread()的实现,pread()主要用于多个线程并行地从一个文件的不同位置读,所以它提供了一个接口,可以指定读的位置,而不是像read()一样,只能从文件当前的位置开始读。为了调用adafs_pread(),所以adafs_read()先要使用描述符查询到打开文件读写指针当前的位置,再把这个位置传给它,说到这里,不禁要问,用户使用read()的时候就不能控制从哪里开始读吗,当然可以,结合lseek()调整文件的读写指针就行了。
/**
 * GKFS implementation of pread(): reads `count` bytes starting at `offset`
 * without touching the file's position. The actual data transfer is done by
 * rpc_send::read() against the daemons holding the chunks.
 *
 * @return bytes read, or -1 with errno set
 */
ssize_t adafs_pread(std::shared_ptr<OpenFile> file, char * buf, size_t count, off64_t offset) {
    const bool is_regular = (file->type() == FileType::regular);
    if (!is_regular) {
        // the open-file table only holds regular files and directories
        assert(file->type() == FileType::directory);
        CTX->log()->warn("{}() cannot read from directory", __func__);
        errno = EISDIR;
        return -1;
    }
    CTX->log()->trace("{}() count: {}, offset: {}", __func__, count, offset);
    // Zeroing buffer before read is only relevant for sparse files. Otherwise sparse regions contain invalid data.
#if defined(ZERO_BUFFER_BEFORE_READ)
    memset(buf, 0, sizeof(char)*count);
#endif
    const auto nread = rpc_send::read(file->path(), buf, offset, count);
    if (nread < 0) {
        CTX->log()->warn("{}() rpc_send::read failed with ret {}", __func__, nread);
    }
    // XXX check that we don't try to read past end of the file
    return nread; // return read size or -1 as error
}
adafs_pread()返回之后,adafs_read()还要更新读写指针的位置,读了多少字节,就将其往后偏移多少字节,以便下次接着从这个最后读完的位置继续读。在adafs_pread()中,它先确保描述符指向的是一个文件而不是目录,然后,调用rpc_send这个命名空间里面的read()。
/**
 * Sends RPC read requests to every node that holds chunks of the requested
 * byte range and pulls the data into `buf` via one shared bulk handle.
 * (NOTE(review): the original comment said "push all chunks that belong to
 * him" — copy-pasted from the write path; this function reads.)
 *
 * @param path      GKFS-internal path of the file
 * @param buf       destination buffer (at least read_size bytes)
 * @param offset    byte offset at which the read starts
 * @param read_size number of bytes to read
 * @return total bytes read across all daemons, or -1 with errno set
 */
ssize_t read(const string& path, void* buf, const off64_t offset, const size_t read_size) {
    // Calculate chunkid boundaries and numbers so that daemons know in which interval to look for chunks
    auto chnk_start = chnk_id_for_offset(offset, CHUNKSIZE); // first chunk number
    auto chnk_end = chnk_id_for_offset((offset + read_size - 1), CHUNKSIZE);
    // Collect all chunk ids within count that have the same destination so that those are sent in one rpc bulk transfer
    map<uint64_t, vector<uint64_t>> target_chnks{};
    // contains the recipient ids, used to access the target_chnks map. First idx is chunk with potential offset
    vector<uint64_t> targets{};
    // targets for the first and last chunk as they need special treatment
    uint64_t chnk_start_target = 0;
    uint64_t chnk_end_target = 0;
    for (uint64_t chnk_id = chnk_start; chnk_id <= chnk_end; chnk_id++) {
        auto target = CTX->distributor()->locate_data(path, chnk_id);
        if (target_chnks.count(target) == 0) {
            target_chnks.insert(make_pair(target, vector<uint64_t>{chnk_id}));
            targets.push_back(target);
        } else
            target_chnks[target].push_back(chnk_id);
        // set first and last chnk targets
        if (chnk_id == chnk_start)
            chnk_start_target = target;
        if (chnk_id == chnk_end)
            chnk_end_target = target;
    }
    // some helper variables for async RPC
    auto target_n = targets.size();
    vector<hg_handle_t> rpc_handles(target_n);
    vector<margo_request> rpc_waiters(target_n);
    vector<rpc_read_data_in_t> rpc_in(target_n);
    // register local target buffer for bulk access for margo instance
    auto bulk_buf = buf;
    hg_bulk_t rpc_bulk_handle = nullptr;
    auto size = make_shared<size_t>(read_size);
    auto ret = margo_bulk_create(ld_margo_rpc_id, 1, &bulk_buf, size.get(), HG_BULK_WRITE_ONLY, &rpc_bulk_handle);
    if (ret != HG_SUCCESS) {
        CTX->log()->error("{}() Failed to create rpc bulk handle", __func__);
        errno = EBUSY;
        return -1;
    }
    // Issue non-blocking RPC requests and wait for the result later
    for (unsigned int i = 0; i < target_n; i++) {
        auto target = targets[i];
        auto total_chunk_size = target_chnks[target].size() * CHUNKSIZE; // one daemon may hold several chunks
        // non-boundary chunks are always transferred whole; only the first and
        // last chunk of the range may be partial
        if (target == chnk_start_target) // receiver of first chunk must subtract the offset from first chunk
            total_chunk_size -= chnk_lpad(offset, CHUNKSIZE);
        if (target == chnk_end_target) // receiver of last chunk must subtract
            total_chunk_size -= chnk_rpad(offset + read_size, CHUNKSIZE);
        // Fill RPC input
        rpc_in[i].path = path.c_str();
        rpc_in[i].host_id = target;
        rpc_in[i].host_size = CTX->hosts().size();
        rpc_in[i].offset = chnk_lpad(offset, CHUNKSIZE);// first offset in targets is the chunk with a potential offset
        rpc_in[i].chunk_n = target_chnks[target].size(); // number of chunks handled by that destination
        rpc_in[i].chunk_start = chnk_start; // chunk start id of this write
        rpc_in[i].chunk_end = chnk_end; // chunk end id of this write
        rpc_in[i].total_chunk_size = total_chunk_size; // total size to write
        rpc_in[i].bulk_handle = rpc_bulk_handle;
        margo_create_wrap_helper(rpc_read_data_id, target, rpc_handles[i]);
        // Send RPC non-blockingly: returns once forwarded, without waiting
        // for the daemon to finish the operation
        ret = margo_iforward(rpc_handles[i], &rpc_in[i], &rpc_waiters[i]);
        if (ret != HG_SUCCESS) {
            CTX->log()->error("{}() Unable to send non-blocking rpc for path {} and recipient {}", __func__, path,
                              target);
            errno = EBUSY;
            for (uint64_t j = 0; j < i + 1; j++) {
                margo_destroy(rpc_handles[j]);
            }
            // free bulk handles for buffer
            margo_bulk_free(rpc_bulk_handle);
            return -1;
        }
    }
    // Wait for RPC responses and then get response and add it to out_size which is the read size
    // All potential outputs are served to free resources regardless of errors, although an errorcode is set.
    ssize_t out_size = 0;
    bool error = false;
    for (unsigned int i = 0; i < target_n; i++) {
        // XXX We might need a timeout here to not wait forever for an output that never comes?
        ret = margo_wait(rpc_waiters[i]);
        if (ret != HG_SUCCESS) {
            CTX->log()->error("{}() Unable to wait for margo_request handle for path {} recipient {}", __func__, path,
                              targets[i]);
            error = true;
            errno = EBUSY;
        }
        // decode response
        rpc_data_out_t out{};
        ret = margo_get_output(rpc_handles[i], &out);
        if (ret != HG_SUCCESS) {
            CTX->log()->error("{}() Failed to get rpc output for path {} recipient {}", __func__, path, targets[i]);
            error = true;
            errno = EIO;
        }
        if (out.err != 0) {
            CTX->log()->error("{}() Daemon reported error: {}", __func__, out.err);
            error = true;
            errno = out.err;
        }
        out_size += static_cast<size_t>(out.io_size);
        margo_free_output(rpc_handles[i], &out);
        margo_destroy(rpc_handles[i]);
    }
    // free bulk handles for buffer
    margo_bulk_free(rpc_bulk_handle);
    return (error) ? -1 : out_size;
}
rpc_send::read()把读的字节级的范围转化为块范围,然后使用哈希函数定位这些块在哪些服务器上,然后向目标服务器发起读数据的RPC请求。服务器的rpc_srv_read_data()负责响应来自客户端的RPC请求。
static hg_return_t rpc_srv_read_data(hg_handle_t handle) {
/*
* 1. Setup
*/
rpc_read_data_in_t in{};
rpc_data_out_t out{};
hg_bulk_t bulk_handle = nullptr;
// Set default out for error
out.err = EIO;
out.io_size = 0;
// Getting some information from margo
auto ret = margo_get_input(handle, &in);
if (ret != HG_SUCCESS) {
ADAFS_DATA->spdlogger()->error("{}() Could not get RPC input data with err {}", __func__, ret);
return rpc_cleanup_respond(&handle, &in, &out, &bulk_handle);
}
auto hgi = margo_get_info(handle);
auto mid = margo_hg_info_get_instance(hgi);
auto bulk_size = margo_bulk_get_size(in.bulk_handle);
ADAFS_DATA->spdlogger()->debug("{}() path: {}, size: {}, offset: {}", __func__,
in.path, bulk_size, in.offset);
/*
* 2. Set up buffers for pull bulk transfers
*/
void* bulk_buf; // buffer for bulk transfer
vector<char*> bulk_buf_ptrs(in.chunk_n); // buffer-chunk offsets
// create bulk handle and allocated memory for buffer with buf_sizes information
ret = margo_bulk_create(mid, 1, nullptr, &in.total_chunk_size, HG_BULK_READWRITE, &bulk_handle);
if (ret != HG_SUCCESS) {
ADAFS_DATA->spdlogger()->error("{}() Failed to create bulk handle", __func__);
return rpc_cleanup_respond(&handle, &in, &out, static_cast<hg_bulk_t*>(nullptr));
}
// access the internally allocated memory buffer and put it into buf_ptrs
uint32_t actual_count;
ret = margo_bulk_access(bulk_handle, 0, in.total_chunk_size, HG_BULK_READWRITE, 1, &bulk_buf,
&in.total_chunk_size, &actual_count);
if (ret != HG_SUCCESS || actual_count != 1) {
ADAFS_DATA->spdlogger()->error("{}() Failed to access allocated buffer from bulk handle", __func__);
return rpc_cleanup_respond(&handle, &in, &out, &bulk_handle);
}
//之所以要让客户端封装这两个变量作为参数传递过来,是因为服务器要使用哈希分布器,而创建一个哈希分布器又需要用这两个参数来初始化
auto const host_id = in.host_id;//本机ID
auto const host_size = in.host_size;//集群的大小
SimpleHashDistributor distributor(host_id, host_size);
auto path = make_shared<string>(in.path);
// chnk_ids used by this host
vector<uint64_t> chnk_ids_host(in.chunk_n);
// counter to track how many chunks have been assigned
auto chnk_id_curr = static_cast<uint64_t>(0);
// chnk sizes per chunk for this host
vector<uint64_t> chnk_sizes(in.chunk_n);
// local and origin offsets for bulk operations
vector<uint64_t> local_offsets(in.chunk_n);
vector<uint64_t> origin_offsets(in.chunk_n);
// how much size is left to assign chunks for reading
auto chnk_size_left_host = in.total_chunk_size;
// temporary traveling pointer
auto chnk_ptr = static_cast<char*>(bulk_buf);
// temporary variables
auto transfer_size = (bulk_size <= CHUNKSIZE) ? bulk_size : CHUNKSIZE;
// tasks structures
vector<ABT_task> abt_tasks(in.chunk_n);//一个task读一个块
vector<ABT_eventual> task_eventuals(in.chunk_n);
vector<struct read_chunk_args> task_args(in.chunk_n);
/*
* 3. Calculate chunk sizes that correspond to this host and start tasks to read from disk
*/
// Start to look for a chunk that hashes to this host with the first chunk in the buffer
for (auto chnk_id_file = in.chunk_start; chnk_id_file < in.chunk_end || chnk_id_curr < in.chunk_n; chnk_id_file++) {
// Continue if chunk does not hash to this host
if (distributor.locate_data(in.path, chnk_id_file) != host_id)
continue;
chnk_ids_host[chnk_id_curr] = chnk_id_file; // save this id to host chunk list
// Only relevant in the first iteration of the loop and if the chunk hashes to this host
if (chnk_id_file == in.chunk_start && in.offset > 0) {
// if only 1 destination and 1 chunk (small read) the transfer_size == bulk_size
auto offset_transfer_size = (in.offset + bulk_size <= CHUNKSIZE) ? bulk_size : static_cast<size_t>(
CHUNKSIZE - in.offset);
// Setting later transfer offsets
local_offsets[chnk_id_curr] = 0;
origin_offsets[chnk_id_curr] = 0;
bulk_buf_ptrs[chnk_id_curr] = chnk_ptr;
chnk_sizes[chnk_id_curr] = offset_transfer_size;
// util variables
chnk_ptr += offset_transfer_size;
chnk_size_left_host -= offset_transfer_size;
} else {
local_offsets[chnk_id_curr] = in.total_chunk_size - chnk_size_left_host;
// origin offset of a chunk is dependent on a given offset in a write operation
if (in.offset > 0)
origin_offsets[chnk_id_curr] =
(CHUNKSIZE - in.offset) + ((chnk_id_file - in.chunk_start) - 1) * CHUNKSIZE;
else
origin_offsets[chnk_id_curr] = (chnk_id_file - in.chunk_start) * CHUNKSIZE;
// last chunk might have different transfer_size
if (chnk_id_curr == in.chunk_n - 1)
transfer_size = chnk_size_left_host;
bulk_buf_ptrs[chnk_id_curr] = chnk_ptr;
chnk_sizes[chnk_id_curr] = transfer_size;
// util variables
chnk_ptr += transfer_size;
chnk_size_left_host -= transfer_size;
}
// Delegate chunk I/O operation to local FS to an I/O dedicated ABT pool,对每个块的读都用一个task来负载
// Starting tasklets for parallel I/O
ABT_eventual_create(sizeof(ssize_t), &task_eventuals[chnk_id_curr]); // written file return value
auto& task_arg = task_args[chnk_id_curr];
task_arg.path = path.get();
task_arg.buf = bulk_buf_ptrs[chnk_id_curr];
task_arg.chnk_id = chnk_ids_host[chnk_id_curr];
task_arg.size = chnk_sizes[chnk_id_curr];
// only the first chunk gets the offset. the chunks are sorted on the client side
task_arg.off = (chnk_id_file == in.chunk_start) ? in.offset : 0;
task_arg.eventual = task_eventuals[chnk_id_curr];
auto abt_ret = ABT_task_create(RPC_DATA->io_pool(), read_file_abt, &task_args[chnk_id_curr],
&abt_tasks[chnk_id_curr]);
if (abt_ret != ABT_SUCCESS) {
ADAFS_DATA->spdlogger()->error("{}() task create failed", __func__);
cancel_abt_io(&abt_tasks, &task_eventuals, chnk_id_curr + 1);
return rpc_cleanup_respond(&handle, &in, &out, &bulk_handle);
}
chnk_id_curr++;
}
// Sanity check that all chunks where detected in previous loop
if (chnk_size_left_host != 0)
ADAFS_DATA->spdlogger()->warn("{}() Not all chunks were detected!!! Size left {}", __func__,
chnk_size_left_host);
/*
* 4. Read task results and accumulate in out.io_size
*/
out.err = 0;
out.io_size = 0;
for (chnk_id_curr = 0; chnk_id_curr < in.chunk_n; chnk_id_curr++) {
ssize_t* task_read_size = nullptr;
// wait causes the calling ult to go into BLOCKED state, implicitly yielding to the pool scheduler
auto abt_ret = ABT_eventual_wait(task_eventuals[chnk_id_curr], (void**) &task_read_size);
if (abt_ret != ABT_SUCCESS) {
ADAFS_DATA->spdlogger()->error(
"{}() Failed to wait for read task for chunk {}",
__func__, chnk_id_curr);
out.err = EIO;
break;
}
assert(task_read_size != nullptr);
if(*task_read_size < 0){
if(-(*task_read_size) == ENOENT) {
continue;
}
ADAFS_DATA->spdlogger()->warn(
"{}() Read task failed for chunk {}",
__func__, chnk_id_curr);
out.err = -(*task_read_size);
break;
}
if(*task_read_size == 0) {
continue;
}
ret = margo_bulk_transfer(mid, HG_BULK_PUSH, hgi->addr, in.bulk_handle, origin_offsets[chnk_id_curr],
bulk_handle, local_offsets[chnk_id_curr], *task_read_size);
if (ret != HG_SUCCESS) {
ADAFS_DATA->spdlogger()->error(
"{}() Failed push chnkid {} on path {} to client. origin offset {} local offset {} chunk size {}",
__func__, chnk_id_curr, in.path, origin_offsets[chnk_id_curr], local_offsets[chnk_id_curr],
chnk_sizes[chnk_id_curr]);
out.err = EIO;
break;
}
out.io_size += *task_read_size; // add task read size to output size
}
/*
* 5. Respond and cleanup
*/
ADAFS_DATA->spdlogger()->debug("{}() Sending output response, err: {}", __func__, out.err);
ret = rpc_cleanup_respond(&handle, &in, &out, &bulk_handle);
// free tasks after responding
cancel_abt_io(&abt_tasks, &task_eventuals, in.chunk_n);
return ret;
}
服务器接收到请求后,对每个块的读都创建一个线程,这些线程执行的是read_file_abt()这个函数,它们可以并行,这一点也体现了GKFS是一个并行的文件系统。
/**
* Used by an argobots threads. Argument args has the following fields:
* const std::string* path;
char* buf;
const rpc_chnk_id_t* chnk_id;
size_t size;
off64_t off;
ABT_eventual* eventual;
* This function is driven by the IO pool. so there is a maximum allowed number of concurrent IO operations per daemon.
* This function is called by tasklets, as this function cannot be allowed to block.
* @return read_size<ssize_t> is put into eventual and returned that way
*/
void read_file_abt(void* _arg) {
//unpack args
auto* arg = static_cast<struct read_chunk_args*>(_arg);
const std::string& path = *(arg->path);
try {
ADAFS_DATA->storage()->read_chunk(path, arg->chnk_id,
arg->buf, arg->size, arg->off, arg->eventual);
} catch (const std::system_error& serr){
ADAFS_DATA->spdlogger()->error("{}() Error reading chunk {} of file {}", __func__, arg->chnk_id, path);
ssize_t read = -(serr.code().value());
ABT_eventual_set(arg->eventual, &read, sizeof(ssize_t));
}
}
在read_file_abt()中就需要调用服务器本地文件系统了。
/**
 * Reads up to @size bytes of chunk @chunk_id of @file_path into @buff,
 * starting at @offset within the chunk, and publishes the total byte count
 * through @eventual. The underlying VFS may serve less than requested per
 * call, so pread64 is looped until everything is read or EOF is reached.
 * @throws std::system_error if the chunk file cannot be opened or read
 */
void ChunkStorage::read_chunk(const std::string& file_path, unsigned int chunk_id,
                              char * buff, size_t size, off64_t offset, ABT_eventual& eventual) const {
    assert((offset + size) <= chunksize);
    auto chunk_path = absolute(get_chunk_path(file_path, chunk_id));
    int fd = open(chunk_path.c_str(), O_RDONLY);
    if(fd < 0) {
        log->error("Failed to open chunk file for read. File: {}, Error: {}", chunk_path, std::strerror(errno));
        throw std::system_error(errno, std::system_category(), "Failed to open chunk file for read");
    }
    size_t tot_read = 0;
    ssize_t read = 0;
    // A chunk can be large (e.g. 512KB); the local VFS may need several calls.
    do {
        read = pread64(fd,
                       buff + tot_read,
                       size - tot_read,
                       offset + tot_read);
        if(read == 0) {
            break; // EOF: chunk shorter than requested, report partial size
        }
        if (read < 0) {
            // BUGFIX: capture errno before logging (the log call may clobber it)
            auto read_err = errno;
            log->error("Failed to read chunk file. File: {}, size: {}, offset: {}, Error: {}",
                       chunk_path, size, offset, std::strerror(read_err));
            // BUGFIX: close the descriptor before throwing; it leaked before
            close(fd);
            throw std::system_error(read_err, std::system_category(), "Failed to read chunk file");
        }
#ifndef NDEBUG
        if(tot_read + read < size) {
            log->warn("Read less bytes than requested: {}/{}. Total read was {}", read, size - tot_read, size);
        }
#endif
        assert(read > 0);
        tot_read += read;
    } while (tot_read != size);
    // Hand the total byte count to the waiting RPC handler.
    // NOTE(review): eventual was created with sizeof(ssize_t); this relies on
    // sizeof(size_t) == sizeof(ssize_t) on the target platform.
    ABT_eventual_set(eventual, &tot_read, sizeof(size_t));
    auto err = close(fd);
    if (err < 0) {
        // Non-fatal: data was already read and published
        log->error("Failed to close chunk file after read. File: {}, Error: {}",
                   chunk_path, std::strerror(errno));
        //throw std::system_error(errno, std::system_category(), "Failed to close chunk file");
    }
}
首先用open()打开块文件,然后在一个循环中多次调用pread(),保证把要求的所有数据读完,然后把读到的字节数通过ABT_eventual同步给rpc_srv_read_data();rpc_srv_read_data()使用ABT_eventual_wait()阻塞自己,直到所有读文件块的任务都结束,它对每个任务读了多少字节做一个合计,把总共读取的字节数返回给客户端。同时,客户端向服务器读数据使用的是远程直接内存访问(RDMA),rpc_srv_read_data()还要使用这个协议把读出的数据送回客户端内存。
SYS_lseek():
第一个参数fd说明是哪个文件的读写指针要调整,第二个参数offset和第三个参数whence结合,一起说明指针要调整到什么位置。
/**
 * Repositions the read/write offset of an open file.
 * @param adafs_fd open file whose position is adjusted
 * @param offset displacement, interpreted relative to @whence
 * @param whence SEEK_SET / SEEK_CUR / SEEK_END; SEEK_DATA and SEEK_HOLE are
 *        not supported
 * @return the resulting position, or -1 with errno set on error
 */
off_t adafs_lseek(shared_ptr<OpenFile> adafs_fd, off_t offset, unsigned int whence) {
    switch (whence) { // whence selects the base the offset is relative to
        case SEEK_SET:
            CTX->log()->debug("{}() whence is SEEK_SET", __func__);
            // set the position to offset absolutely
            adafs_fd->pos(offset);
            break;
        case SEEK_CUR:
            CTX->log()->debug("{}() whence is SEEK_CUR", __func__);
            // move offset bytes relative to the current position
            adafs_fd->pos(adafs_fd->pos() + offset);
            break;
        case SEEK_END: {
            CTX->log()->debug("{}() whence is SEEK_END", __func__);
            off64_t file_size; // current file size, fetched from the metadata holder
            auto err = rpc_send::get_metadentry_size(adafs_fd->path(), file_size);
            if (err < 0) {
                // BUGFIX: err carries a negated error code; POSIX requires
                // errno to be a positive value, so negate it back
                errno = -err;
                return -1;
            }
            // move offset bytes relative to the end of the file
            adafs_fd->pos(file_size + offset);
            break;
        }
        case SEEK_DATA: // not supported
            CTX->log()->warn("{}() SEEK_DATA whence is not supported", __func__);
            // We do not support this whence yet
            errno = EINVAL;
            return -1;
        case SEEK_HOLE: // not supported
            CTX->log()->warn("{}() SEEK_HOLE whence is not supported", __func__);
            // We do not support this whence yet
            errno = EINVAL;
            return -1;
        default:
            CTX->log()->warn("{}() unknown whence {}", __func__, whence);
            errno = EINVAL;
            return -1;
    }
    return adafs_fd->pos();
}
如果whence为SEEK_SET,说明这次调用是要对指针位置进行直接设置,所以直接把offset的值赋给它;如果whence为SEEK_CUR,那就是要从当前位置再偏移offset字节;如果whence为SEEK_END,那么从文件末尾偏移offset字节。
另外,adafs_lseek()有两个版本,使用了C++函数重载的特性。这两个函数的区别主要是第一个参数的类型不同。
// fd-based overload: resolve the descriptor through the client-side open-file
// map, then delegate to the shared_ptr<OpenFile> overload above.
off_t adafs_lseek(unsigned int fd, off_t offset, unsigned int whence) {
    auto open_file = CTX->file_map()->get(fd);
    return adafs_lseek(open_file, offset, whence);
}
SYS_pread64:
这个系统调用最后还是通过adafs_pread()来实现。
SYS_write():
写文件和读文件的框架和逻辑大体相似,只不过写的时候会改变文件的大小,这就需要同时对元数据进行更新,因为元数据里面有一个字段就是文件大小。所以调用update_metadentry_size()更新元数据,然后再调用write()向服务器写数据。
/**
 * Writes @count bytes from @buf to @file at @offset.
 * Because a write can grow the file, the size field of the metadata is
 * updated on the responsible daemon first, then the data itself is written.
 * @return number of bytes written, or a negative value on error (errno set
 *         to EISDIR when @file is a directory)
 */
ssize_t adafs_pwrite(std::shared_ptr<OpenFile> file, const char * buf, size_t count, off64_t offset) {
    if (file->type() != FileType::regular) {
        assert(file->type() == FileType::directory);
        // BUGFIX: this is the write path; the message previously said "read"
        CTX->log()->warn("{}() cannot write to directory", __func__);
        errno = EISDIR;
        return -1;
    }
    auto path = make_shared<string>(file->path());
    CTX->log()->trace("{}() count: {}, offset: {}", __func__, count, offset);
    auto append_flag = file->get_flag(OpenFile_flags::append);
    ssize_t ret = 0;
    long updated_size = 0;
    // Key point: the metadata (file size) must be updated before writing data
    ret = rpc_send::update_metadentry_size(*path, count, offset, append_flag, updated_size);
    if (ret != 0) {
        CTX->log()->error("{}() update_metadentry_size failed with ret {}", __func__, ret);
        return ret; // ERR
    }
    ret = rpc_send::write(*path, buf, append_flag, offset, count, updated_size);
    if (ret < 0) {
        CTX->log()->warn("{}() rpc_send::write failed with ret {}", __func__, ret);
    }
    return ret; // return written size or -1 as error
}
写和读的另外一个不同之处在于这个写可能是追加写,所以更新元数据和写数据之前还要先使用adafs_lseek()把读写指针调整到文件末尾。
SYS_pwrite64:
这个写不用配合lseek()指定写的位置,写完之后也不会更新读写指针,直接在函数接口最后一个参数指定写的位置就行了,它最后也调用adafs_pwrite()来实现。
SYS_writev:
这是Linux的聚合写,什么意思呢?如果想从不连续的多个缓冲区把数据写入文件,使用write()有两种办法:一是再开辟一个大的缓冲区,把多个缓冲区的数据复制到这个缓冲区中,然后再把这个缓冲区中的数据写入文件;二是每个缓冲区都调用一次write()向文件写入自己的数据。而Linux的writev()把多个缓冲区的起始地址和大小都封装进iov这个数组中,iov是一个结构体数组,每个结构体元素有两个成员,一是数据存放的缓冲区的起始地址,二是要写的数据的大小。writev()可以一次性把这些缓冲区的数据写入文件,避免了方法一再次复制数据的开销,并且只进行一次系统调用,所以它适合聚合写。
/**
 * Gather-write: writes the contents of several user-space buffers to one
 * file, POSIX pwritev semantics, implemented as one adafs_pwrite() per
 * iovec entry.
 * @return total bytes written (0 for a zero-length request), or -1 if the
 *         very first write fails
 */
ssize_t adafs_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) {
    CTX->log()->trace("{}() called with fd {}, op num {}, offset {}",
                      __func__, fd, iovcnt, offset);
    auto file = CTX->file_map()->get(fd);
    auto pos = offset; // keep track of current position
    ssize_t written = 0;
    ssize_t ret;
    for (int i = 0; i < iovcnt; ++i) {
        auto count = (iov + i)->iov_len;
        if (count == 0) {
            continue; // nothing to write for this entry
        }
        auto buf = (iov + i)->iov_base;
        ret = adafs_pwrite(file, static_cast<char*>(buf), count, pos);
        if (ret == -1) {
            // BUGFIX: fail only when nothing was written at all; a partial
            // writev must report the bytes already written. Previously a
            // zero-length request (iovcnt == 0 or all iov_len == 0) was also
            // wrongly turned into -1; POSIX requires 0 in that case.
            if (written == 0)
                return -1;
            break;
        }
        written += ret;
        pos += ret;
        // short write: stop here and report the partial byte count
        if (static_cast<size_t>(ret) < count) {
            break;
        }
    }
    return written;
}
但是,在GKFS中,它只是实现了writev()的语义,但并没有做到高效率、低开销,因为它实际上还是针对每个缓冲区都调用一次adafs_pwrite()。不过,在分布式系统中确实也只能这样实现了。
SYS_pwritev:
不支持。
SYS_unlinkat:
这个函数用于删除一个目录下的文件,dirfd是目录描述符,path是这个目录下的文件的名字,flags指定unlinkat()这个函数的行为。
/**
 * Hook for the unlinkat syscall: removes file @cpath relative to directory
 * @dirfd; with AT_REMOVEDIR in @flags it removes a directory instead (the
 * directory must be empty, otherwise the call fails).
 * @return 0 on success, negative errno-style code on failure; paths outside
 *         the mount point are forwarded to the kernel untouched
 */
int hook_unlinkat(int dirfd, const char * cpath, int flags) {
    CTX->log()->trace("{}() called with path '{}' dirfd {}, flags {}",
                      __func__, cpath, dirfd, flags);
    // AT_REMOVEDIR is the only flag we understand
    if ((flags & ~AT_REMOVEDIR) != 0) {
        CTX->log()->error("{}() Flags unknown: {}", __func__, flags);
        return -EINVAL;
    }
    std::string resolved;
    auto rstatus = CTX->relativize_fd_path(dirfd, cpath, resolved, false);
    switch(rstatus) {
        case RelativizeStatus::fd_unknown: // dirfd is not in our open-file table
            return syscall_no_intercept(SYS_unlinkat, dirfd, cpath, flags);
        case RelativizeStatus::external: // path lives outside the mount point
            return syscall_no_intercept(SYS_unlinkat, dirfd, resolved.c_str(), flags);
        case RelativizeStatus::fd_not_a_dir:
            return -ENOTDIR;
        case RelativizeStatus::internal:
            // AT_REMOVEDIR selects directory removal (only succeeds when empty)
            if(flags & AT_REMOVEDIR) {
                return with_errno(adafs_rmdir(resolved));
            } else {
                return with_errno(adafs_rm_node(resolved));
            }
        default:
            // BUGFIX: the format string had a second placeholder with no
            // matching argument; pass the status value explicitly
            CTX->log()->error("{}() relativize status unknown: {}", __func__,
                              static_cast<int>(rstatus));
            return -EINVAL;
    }
}
如果flags没有设置AT_REMOVEDIR,说明要删除的是一个文件,调用adafs_rm_node(),通过获取元数据判断文件是否存在,如果存在,再判断文件是否为空。
/**
 * Removes a node (a file in GekkoFS terms) together with all of its chunks.
 * Internally this sends a broadcast (i.e. n RPCs) so every daemon cleans its
 * chunk folder for that path; a node with no chunk data only needs a single
 * RPC to the metadata holder.
 * @param path path identifying the node to remove
 * @return 0 on success, -1 if the node does not exist
 */
int adafs_rm_node(const std::string& path) {
    // Fetch the metadata first: we can only remove something that exists.
    auto md = adafs_metadata(path);
    if (!md)
        return -1;
    // Chunk data only exists for non-empty regular files. If there is none,
    // removing just the metadentry suffices (second argument == true).
    const bool has_data = S_ISREG(md->mode()) && md->size() != 0;
    return rpc_send::rm_node(path, !has_data);
}
删除文件不仅要删除数据,还要删除元数据,如果文件还有数据,那么就要向整个集群广播删除信息,如果没有数据,那只用向存储了元数据的服务器单播,rm_node()的第二个参数remove_metadentry_only就是用来选择单播还是广播的。总之,消息播放出去后,收到消息的服务器都会调用remove_node(),如果文件元数据在本机,那就删除它,同时,如果本机有这个文件的数据块,那么也全部删除。
int rm_node(const std::string& path, const bool remove_metadentry_only) {
hg_return_t ret;
int err = 0; // assume we succeed
// if metadentry should only removed only, send only 1 rpc to remove the metadata
// else send an rpc to all hosts and thus broadcast chunk_removal.
auto rpc_target_size = remove_metadentry_only ? static_cast<uint64_t>(1) : CTX->hosts().size();
CTX->log()->debug("{}() Creating Mercury handles for all nodes ...", __func__);
vector<hg_handle_t> rpc_handles(rpc_target_size);
vector<margo_request> rpc_waiters(rpc_target_size);
vector<rpc_rm_node_in_t> rpc_in(rpc_target_size);
// Send rpc to all nodes as all of them can have chunks for this path
for (size_t i = 0; i < rpc_target_size; i++) {
// fill in
rpc_in[i].path = path.c_str();
// create handle
// if only the metadentry needs to removed send one rpc to metadentry's responsible node
if (remove_metadentry_only)
ret = margo_create_wrap(rpc_rm_node_id, path, rpc_handles[i]);
else
ret = margo_create_wrap_helper(rpc_rm_node_id, i, rpc_handles[i]);
if (ret != HG_SUCCESS) {
CTX->log()->warn("{}() Unable to create Mercury handle", __func__);
// We use continue here to remove at least some data
// XXX In the future we can discuss RPC retrying. This should be a function to be used in general
errno = EBUSY;
err = -1;
}
// send async rpc
ret = margo_iforward(rpc_handles[i], &rpc_in[i], &rpc_waiters[i]);
if (ret != HG_SUCCESS) {
CTX->log()->warn("{}() Unable to create Mercury handle", __func__);
errno = EBUSY;
err = -1;
}
}
// Wait for RPC responses and then get response
for (size_t i = 0; i < rpc_target_size; i++) {
// XXX We might need a timeout here to not wait forever for an output that never comes?
ret = margo_wait(rpc_waiters[i]);
if (ret != HG_SUCCESS) {
CTX->log()->warn("{}() Unable to wait for margo_request handle for path {} recipient {}", __func__, path, i);
errno = EBUSY;
err = -1;
}
rpc_err_out_t out{};
ret = margo_get_output(rpc_handles[i], &out);
if (ret == HG_SUCCESS) {
CTX->log()->debug("{}() Got response success: {}", __func__, out.err);
if (err != 0) {
errno = out.err;
err = -1;
}
} else {
// something is wrong
errno = EBUSY;
err = -1;
CTX->log()->error("{}() while getting rpc output", __func__);
}
/* clean up resources consumed by this rpc */
margo_free_output(rpc_handles[i], &out);
margo_destroy(rpc_handles[i]);
}
return err;
}
如果flags设置了AT_REMOVEDIR,那说明要删除的是一个目录,调用另一个函数adafs_rmdir()。因为只能删除存在的目录,所以先获取一下元数据,如果获取成功,才说明目录确实存在;由于只能删除一个空目录,所以先判断一下目录里面还有没有内容:临时定义一个打开目录OpenDir的对象,调用get_dirents()获取目录项,然后看目录项的数量是否为0,为0才说明目录是空的,然后才能调用rm_node()。
/**
 * Removes an empty directory.
 * Emptiness is determined by creating a temporary OpenDir object, pulling all
 * of its directory entries to the client and counting them. This is costly,
 * but directory removal is not a frequent operation, so there is no pressing
 * need to optimize it.
 * @param path directory path
 * @return 0 on success, -1 with errno set (ENOENT / ENOTDIR / ENOTEMPTY)
 */
int adafs_rmdir(const std::string& path) {
    // The directory must exist ...
    auto metadata = adafs_metadata(path);
    if (!metadata) {
        CTX->log()->debug("{}() path does not exists: '{}'", __func__, path);
        errno = ENOENT;
        return -1;
    }
    // ... and actually be a directory.
    if (!S_ISDIR(metadata->mode())) {
        CTX->log()->debug("{}() path is not a directory", __func__);
        errno = ENOTDIR;
        return -1;
    }
    // Only an empty directory may be removed: fetch its entries and count them.
    auto open_dir = std::make_shared<OpenDir>(path);
    rpc_send::get_dirents(*open_dir);
    if (open_dir->size() != 0) {
        errno = ENOTEMPTY;
        return -1;
    }
    // No entries left: only the metadentry has to go (second arg == true).
    return rpc_send::rm_node(path, true);
}
SYS_unlink:
SYS_unlink和SYS_unlinkat很相似,两者主要是语义上有区别,unlinkat的at支持用户指定要删除哪个目录下的文件,而unlink默认删除的是当前工作目录下的文件,所以SYS_unlink调用钩子函数hook_unlinkat()的时候,把指定目录路径的参数设置为AT_FDCWD这个宏,这是Linux定义的当前工作目录描述符的宏。
case SYS_unlink: // Linux has no dedicated remove() syscall for files; file deletion is implemented via unlink
*result = hook_unlinkat(AT_FDCWD,
reinterpret_cast<const char *>(arg0),
0);
SYS_rmdir:
删除一个目录,把目录描述符设置为AT_FDCWD,flags设置为AT_REMOVEDIR,然后调用hook_unlinkat(),hook_unlinkat()最终会调用adafs_rmdir()。
case SYS_rmdir: // remove a directory: dirfd = AT_FDCWD (cwd), flags = AT_REMOVEDIR, which routes hook_unlinkat() to adafs_rmdir()
*result = hook_unlinkat(AT_FDCWD,
reinterpret_cast<const char *>(arg0),
AT_REMOVEDIR);
SYS_mkdirat:
在指定目录下创建一个子目录。调用adafs_mk_node(),把mode设置上S_IFDIR,从代码实现来看,实际上创建目录和创建文件都是一样的。
/**
 * Hook for the mkdirat syscall: creates directory @cpath relative to @dirfd
 * with permissions @mode. Internally a directory is created like a file,
 * just with S_IFDIR OR'ed into the mode.
 * @return 0 on success, negative errno-style code on failure; paths outside
 *         the mount point are forwarded to the kernel untouched
 */
int hook_mkdirat(int dirfd, const char * cpath, mode_t mode) {
    CTX->log()->trace("{}() called with fd: {}, path: {}, mode: {}",
                      __func__, dirfd, cpath, mode);
    std::string resolved;
    auto rstatus = CTX->relativize_fd_path(dirfd, cpath, resolved);
    switch(rstatus) {
        case RelativizeStatus::external: // path lives outside the mount point
            return syscall_no_intercept(SYS_mkdirat, dirfd, resolved.c_str(), mode);
        case RelativizeStatus::fd_unknown: // dirfd is not in our open-file table
            return syscall_no_intercept(SYS_mkdirat, dirfd, cpath, mode);
        case RelativizeStatus::fd_not_a_dir:
            return -ENOTDIR;
        case RelativizeStatus::internal:
            // a directory is created exactly like a file, plus S_IFDIR
            return with_errno(adafs_mk_node(resolved, mode | S_IFDIR));
        default:
            // BUGFIX: the format string had a second placeholder with no
            // matching argument; pass the status value explicitly
            CTX->log()->error("{}() relativize status unknown: {}", __func__,
                              static_cast<int>(rstatus));
            return -EINVAL;
    }
}
SYS_access:
Linux的access()调用用于查看文件是否存在以及文件的访问权限,第二个参数是一个掩码,可以设置为F_OK、R_OK、W_OK和X_OK,但实际GKFS拦截下这个系统调用之后,并没有用上这些掩码,它只用到了文件的路径:根据全路径计算哈希值,得到目标服务器的ID,然后向目标服务器查询这个全路径是否存在,存在则返回0,不存在则返回-1并将errno设为ENOENT,也就是说无论用户要检查的是文件的什么权限,只要文件存在,就返回0。
/**
 * access() implementation: a pure existence check.
 * The permission mask is intentionally ignored — as long as the path has
 * metadata, access is considered granted.
 * @return 0 if the path exists, -1 with errno = ENOENT otherwise
 */
int adafs_access(const std::string& path, const int mask, bool follow_links) {
    // Existence is probed by fetching the metadata; mask is not evaluated.
    if (!adafs_metadata(path, follow_links)) {
        errno = ENOENT;
        return -1;
    }
    return 0;
}
SYS_dup:
实现这个函数不需要和服务器通信,只需要在客户端的打开文件表里面找到要复制的打开文件,然后再新分配一个描述符,在打开文件表只中添加一个新描述符和旧文件的映射关系,然后把新的描述符返回。在调用这个函数的程序看来,它复制了一个文件,所有这个函数的名字是duplication的前三个字母。
SYS_getcwd:
该函数用于获取当前工作目录,当前工作目录保存在客户端预加载上下文这个类里面,类的成员cwd就是当前工作目录的路径,GKFS把这个路径返回。
/**
 * Hook for the getcwd syscall. The current working directory is cached in
 * the client preload context; it is simply copied into the caller's buffer.
 * @return number of bytes copied including the terminating NUL, or -ERANGE
 *         when the buffer is too small
 */
int hook_getcwd(char * buf, unsigned long size) {
    CTX->log()->trace("{}() called with size {}", __func__, size);
    const auto& cwd = CTX->cwd();
    // one extra byte is needed for the terminating NUL
    if (cwd.size() + 1 > size) {
        CTX->log()->error("{}() buffer too small to host current working dir", __func__);
        return -ERANGE;
    }
    strcpy(buf, cwd.c_str());
    // like the kernel's getcwd(2), report the bytes copied including the NUL
    return cwd.size() + 1;
}
SYS_statfs:
该函数用于获取文件系统的统计信息。
/**
 * Fills a statfs structure with cluster-wide chunk statistics.
 * Only the block-related fields carry real data in GekkoFS; file counts and
 * the filesystem id are reported as zero.
 * @param buf output statfs structure
 * @return always 0
 */
int adafs_statfs(struct statfs* buf) {
    CTX->log()->trace("{}() called", __func__);
    // aggregate chunk statistics collected from every daemon in the cluster
    auto stats = rpc_send::chunk_stat();
    buf->f_type = 0;                      // filesystem type (none registered)
    buf->f_bsize = stats.chunk_size;      // block size == chunk size
    buf->f_blocks = stats.chunk_total;    // total chunks across the cluster
    buf->f_bfree = stats.chunk_free;      // free chunks
    buf->f_bavail = stats.chunk_free;     // free chunks for unprivileged users
    buf->f_files = 0;                     // total file nodes (not tracked)
    buf->f_ffree = 0;                     // free file nodes (not tracked)
    buf->f_fsid = {0, 0};                 // filesystem id (unused)
    buf->f_namelen = PATH_MAX_LEN;        // upper bound on file name length
    buf->f_frsize = 0;                    // fragment size (unused)
    buf->f_flags =                        // mount flags
        ST_NOATIME | ST_NODIRATIME | ST_NOSUID | ST_NODEV | ST_SYNCHRONOUS;
    return 0;
}
chunck_stat()查询集群中所有服务器,每个服务器调用本地的statfs(),统计本服务器存放数据块的目录的统计信息,返回的信息封装在sfs这个结构体里面,目录下的总存储空间和空闲存储空间是以字节为单位的,GKFS提供的是块级别的抽象,它把字节单位转换为块单位,然后返回给客户端,客户端在adafs_statfs()中合计各个服务器返回的信息,填充statfs结构体。
/**
 * Queries every host in the cluster for its chunk statistics and aggregates
 * them into one ChunkStat.
 * @return {chunk_size, total chunks, free chunks} summed over all daemons
 * @throws std::runtime_error if an RPC cannot be created, forwarded, waited
 *         for, or its output retrieved
 */
ChunkStat chunk_stat() {
    CTX->log()->trace("{}()", __func__);
    // BUGFIX: value-initialize so no uninitialized bytes go over the wire
    rpc_chunk_stat_in_t in{};
    auto const host_size = CTX->hosts().size();
    std::vector<hg_handle_t> rpc_handles(host_size);
    std::vector<margo_request> rpc_waiters(host_size);
    hg_return_t hg_ret;
    for (unsigned int target_host = 0; target_host < host_size; ++target_host) {
        // Setup rpc input parameters for each host
        hg_ret = margo_create_wrap_helper(rpc_chunk_stat_id, target_host,
                                          rpc_handles[target_host]);
        if (hg_ret != HG_SUCCESS) {
            // BUGFIX: release the handles created so far before bailing out
            // (they leaked before)
            for (unsigned int i = 0; i < target_host; i++) {
                margo_destroy(rpc_handles[i]);
            }
            throw std::runtime_error("Failed to create margo handle");
        }
        // Send RPC
        CTX->log()->trace("{}() Sending RPC to host: {}", __func__, target_host);
        hg_ret = margo_iforward(rpc_handles[target_host],
                                &in,
                                &rpc_waiters[target_host]);
        if (hg_ret != HG_SUCCESS) {
            CTX->log()->error("{}() Unable to send non-blocking chunk_stat to recipient {}", __func__, target_host);
            for (unsigned int i = 0; i <= target_host; i++) {
                margo_destroy(rpc_handles[i]);
            }
            throw std::runtime_error("Failed to forward non-blocking rpc request");
        }
    }
    unsigned long chunk_size = CHUNKSIZE;
    unsigned long chunk_total = 0;
    unsigned long chunk_free = 0;
    for (unsigned int target_host = 0; target_host < host_size; ++target_host) {
        hg_ret = margo_wait(rpc_waiters[target_host]);
        if (hg_ret != HG_SUCCESS) {
            // BUGFIX: destroy the remaining handles before throwing
            for (unsigned int i = target_host; i < host_size; i++) {
                margo_destroy(rpc_handles[i]);
            }
            throw std::runtime_error(fmt::format("Failed while waiting for rpc completion. target host: {}", target_host));
        }
        rpc_chunk_stat_out_t out{};
        hg_ret = margo_get_output(rpc_handles[target_host], &out);
        if (hg_ret != HG_SUCCESS) {
            // BUGFIX: destroy the remaining handles before throwing
            for (unsigned int i = target_host; i < host_size; i++) {
                margo_destroy(rpc_handles[i]);
            }
            throw std::runtime_error(fmt::format("Failed to get rpc output for target host: {}", target_host));
        }
        // Aggregate the statistics of all hosts; every daemon must report
        // the same chunk size
        assert(out.chunk_size == chunk_size);
        chunk_total += out.chunk_total;
        chunk_free += out.chunk_free;
        margo_free_output(rpc_handles[target_host], &out);
        margo_destroy(rpc_handles[target_host]);
    }
    return {chunk_size, chunk_total, chunk_free};
}