学习GlusterFS的过程中,对GlusterFS中几个重要的数据结构的简单分析,持续更新中…
1. frame以及frame->local
用来保存volfile内容的一个数据结构,同时还承担了其它重要功能。代表了该xlator的frame,同时一个frame对象还主要用于记录一个xlator与父子节点间的调用关系,比如记录其子节点的frame信息,父节点的frame信息。Frame相当于一个纽带,将原本独立的xlator的信息联系到了一起。
alamofs/libglusterfs/src/stack.h:
/* One frame of a call stack: ties a single translator (xlator) invocation to
 * its parent frame and to the call stack (root) it belongs to. */
struct _call_frame_t {
call_stack_t *root; /* stack root: records who the current user is; sent to the server to identify who is working in the system */
call_frame_t *parent; /* previous BP */
struct list_head frames; /* list hook; list_head has two members, prev (previous node) and next (following node) */
void *local; /* local variables: mainly records context (the original note describes it as the volfile options etc. -- TODO confirm); void * because every translator stores different content; lets the context be restored when an asynchronous request comes back */
xlator_t *this; /* implicit object: the translator this frame belongs to */
ret_fn_t ret; /* op_return address: the return function (original note: the function the translator is to execute) */
int32_t ref_count;
gf_lock_t lock;
void *cookie; /* unique cookie */
gf_boolean_t complete;
glusterfs_fop_t op;
struct timeval begin; /* when this frame was created */
struct timeval end; /* when this frame completed */
const char *wind_from; /* origin of the wind operation (top-most ancestor in the chain) */
const char *wind_to; /* destination of the wind operation (bottom-most descendant in the chain) */
const char *unwind_from; /* origin of the unwind operation (bottom-most descendant in the chain) */
const char *unwind_to; /* destination of the unwind operation (top-most ancestor in the chain) */
};
/* Doubly linked list node: each node points at the following and the
 * preceding node. */
struct list_head {
struct list_head *next;
struct list_head *prev;
};
2. this以及this->private
实际结构体是_xlator,主要用来保存translator。其中的private成员用来保存volfile中每一个节点的option,由于各translator的内容不同,所以它也是void *类型。
alamofs/libglusterfs/src/xlator.h:
/* A translator (xlator) node: its identity and links, the operation tables
 * and lifecycle callbacks, and its private data. */
typedef struct _xlator xlator_t;
struct _xlator {
/* Built during parsing */
char *name;
char *type;
xlator_t *next;
xlator_t *prev;
xlator_list_t *parents;
xlator_list_t *children;
dict_t *options;
/* Set after doing dlopen() */
void *dlhandle;
struct xlator_fops *fops;
struct xlator_cbks *cbks;
struct xlator_dumpops *dumpops;
struct list_head volume_options; /* list of volume_option_t */
void (*fini)(xlator_t *this); /* callback that tears the translator down */
int32_t (*init)(xlator_t *this); /* callback that initialises the translator */
int32_t (*reconfigure)(xlator_t *this, dict_t *options);
int32_t (*mem_acct_init)(xlator_t *this);
event_notify_fn_t notify;
gf_loglevel_t loglevel; /* Log level for translator */
/* for latency measurement */
fop_latency_t latencies[GF_FOP_MAXVALUE];
/* Misc */
eh_t *history; /* event history context */
glusterfs_ctx_t *ctx;
glusterfs_graph_t *graph; /* not set for fuse */
inode_table_t *itable;
char init_succeeded;
void *private; /* translator-private data (original note: holds this translator's options) */
struct mem_acct *mem_acct;
uint64_t winds;
char switched;
/* for the memory pool of 'frame->local' */
struct mem_pool *local_pool; /* memory pool */
gf_boolean_t is_autoloaded;
};
/* Yield the `private` member of the NFS translator `nfsxl`. The whole
 * expansion is parenthesised so it composes safely inside larger
 * expressions. */
#define nfs_state(nfsxl) ((nfsxl)->private)
/* Yield the `foppool` member of the struct nfs_state that the `private`
 * member of translator `nfxl` points to. */
#define nfs_fop_mempool(nfxl) (((struct nfs_state *)((nfxl)->private))->foppool)
THIS
THIS的使用,是一个全局的变量
/*
 * Allocate zero-initialised memory for `nmemb` elements of `size` bytes.
 *
 * When memory accounting is disabled for the current context the request is
 * forwarded to CALLOC() unchanged.  Otherwise the payload is surrounded by
 * GF_MEM_HEADER_SIZE / GF_MEM_TRAILER_SIZE bytes and registered through
 * gf_mem_set_acct_info() against the current translator (THIS).
 *
 * @param nmemb    number of elements
 * @param size     size of one element in bytes
 * @param type     accounting type passed on to gf_mem_set_acct_info()
 * @param typestr  accounting type name passed on to gf_mem_set_acct_info()
 * @return pointer to the zeroed payload, or NULL if the request is too large
 *         to represent or the underlying calloc() fails
 */
void *
__gf_calloc(size_t nmemb, size_t size, uint32_t type, const char *typestr)
{
    size_t tot_size = 0;
    size_t req_size = 0;
    size_t max_payload = 0;
    char *ptr = NULL;
    xlator_t *xl = NULL;

    if (!THIS->ctx->mem_acct_enable)
        return CALLOC(nmemb, size);

    xl = THIS; /* THIS is used here */

    /* calloc(1, tot_size) below bypasses calloc's own multiplication check,
     * so reject any request whose payload plus header and trailer would wrap
     * size_t and yield an undersized buffer. */
    max_payload = (size_t)-1 - GF_MEM_HEADER_SIZE - GF_MEM_TRAILER_SIZE;
    if (size != 0 && nmemb > max_payload / size) {
        gf_msg_nomem("", GF_LOG_ALERT, (size_t)-1);
        return NULL;
    }

    req_size = nmemb * size;
    tot_size = req_size + GF_MEM_HEADER_SIZE + GF_MEM_TRAILER_SIZE;

    ptr = calloc(1, tot_size);
    if (!ptr) {
        gf_msg_nomem("", GF_LOG_ALERT, tot_size);
        return NULL;
    }

    /* ptr is passed by address, so the callee may update it before it is
     * returned to the caller. */
    gf_mem_set_acct_info(xl, &ptr, req_size, type, typestr);

    return (void *)ptr;
}
3. loc
要找到文件系统中要操作的文件的inode,在nfs v3中有两种方法:pargfid + name 和 gfid;而在本地的文件系统中还可以通过fd来获得文件的属性信息。
此外还有dentry,也就是文件的名字,可以通过dentry来找到文件的inode。
alamofs/libglusterfs/src/xlator.h:
/* Location of a file: the information used to find the inode an operation
 * applies to, either by gfid or by pargfid + name. */
typedef struct _loc loc_t;
struct _loc {
    const char *path;  /* of little use: path resolution mostly does not rely on it */
    const char *name;  /* name of the file */
    inode_t *inode;    /* holds the file's attribute information */
    inode_t *parent;   /* inode of the parent */
    uint64_t dataset;  /* records which file system this location is in */
    /* Currently all location based operations are through 'gfid' of inode.
     * But the 'inode->gfid' only gets set in higher most layer (as in,
     * 'fuse', 'protocol/server', or 'nfs/server'). So if translators want
     * to send fops on a inode before the 'inode->gfid' is set, they have to
     * make use of below 'gfid' fields
     */
    uuid_t gfid;       /* the file's inode can be found directly through gfid */
    uuid_t pargfid;    /* ...or through pargfid + name */
};
4. inode
记录了文件的属性
alamofs/libglusterfs/src/inode.h:
/* In-memory inode: records a file's attributes together with its open fds,
 * its directory entries and its membership in the inode table lists. */
struct _inode {
inode_table_t *table; /* the table this inode belongs to */
uint64_t *startime; /* start time? (the original notes are unsure of the meaning) */
uuid_t gfid;
uint64_t dataset;
gf_lock_t lock;
uint64_t nlookup;
uint64_t size; /* dentry count */
uint32_t fd_count; /* Opened fd count */
uint32_t ref; /* reference count on this inode */
ia_type_t ia_type; /* what kind of file */
struct list_head fd_list; /* list of open files on this inode */
struct list_head dentry_list; /* list of directory entries for this inode */
struct list_head hash; /* hash table pointers */
struct list_head list; /* active/lru/purge */
struct _inode_ctx *_ctx; /* replacement for dict_t *(inode->ctx) */
bool in_invalidate_list; /* Set if inode is in table invalidate list */
bool invalidate_sent; /* Set it if invalidator_fn is called for inode */
bool in_lru_list; /* Set if inode is in table lru list */
};
5. dict
函数的扩展参数,内容是k-v结构,比方说操作系统文件的扩展属性(EA)等参数,acl
attr:标准属性,使用stat进行查看
ea:扩展属性,使用getfattr查看
都是POSIX的内容,有user、trusted、ugo、acl等权限
alamofs/libglusterfs/src/dict.h:
/* Key/value dictionary used to carry extended parameters of a call (for
 * example extended attributes).  Entries are data_pair_t objects. */
struct _dict {
unsigned char is_static : 1; /* one-bit flag */
int32_t hash_size;
int32_t count;
int32_t refcount;
data_pair_t **members; /* array of pair pointers; presumably the hash buckets sized by hash_size -- TODO confirm */
data_pair_t *members_list;
char *extra_free;
char *extra_stdfree;
gf_lock_t lock;
data_pair_t *members_internal;
data_pair_t free_pair; /* a pair embedded in the dict itself */
gf_boolean_t free_pair_in_use; /* presumably tracks whether free_pair is occupied -- TODO confirm */
};
/* One key/value entry: carries a key string, a value, and three link
 * pointers (hash_next, prev, next) to other pairs. */
struct _data_pair {
struct _data_pair *hash_next;
struct _data_pair *prev;
struct _data_pair *next;
data_t *value;
char *key;
};
6. stack
用来记录是谁创建的,是frame相关的内容
alamofs/libglusterfs/src/stack.h:
/* Call stack: records who created the call (uid/gid/pid, groups, lock owner,
 * client) and anchors the frames that make up the call. */
typedef struct _call_stack_t call_stack_t;
struct _call_stack_t {
    union {
        struct list_head all_frames; /* head of the frame chain (the ancestor) */
        struct {
            call_stack_t *next_call;
            call_stack_t *prev_call;
        };
    };
    call_pool_t *pool; /* pool pointer; the original notes ask "which pool?" -- TODO confirm */
    gf_lock_t stack_lock;
    client_t *client;
    uint64_t unique;
    void *state; /* pointer to request state */
    uid_t uid;
    gid_t gid;
    pid_t pid;
    uint16_t ngrps;
    uint32_t groups_small[SMALL_GROUP_COUNT];
    uint32_t *groups_large;
    uint32_t *groups;
    gf_lkowner_t lk_owner;
    glusterfs_ctx_t *ctx;
    struct list_head myframes; /* List of call_frame_t that go to make the call stack */
    int32_t op;
    int8_t type;
    struct timeval tv;
};
//=======call_pool_t
/* Pool of call stacks: a list anchor (viewable either as a list_head or as
 * a next/prev pair of call_stack_t pointers), a counter, a lock, and the
 * memory pools for frames and stacks. */
struct call_pool {
union {
struct list_head all_frames;
struct {
call_stack_t *next_call;
call_stack_t *prev_call;
} all_stacks;
};
int64_t cnt;
gf_lock_t lock;
struct mem_pool *frame_mem_pool;
struct mem_pool *stack_mem_pool;
};
7. fd
/* Open file descriptor object: reference counted, linked to the inode it
 * was opened on. */
struct _fd {
uint64_t pid;
int32_t flags;
char *key;
int32_t refcount; /* reference count */
struct list_head inode_list;
struct _inode *inode;
gf_lock_t lock; /* used ONLY for manipulating
'struct _fd_ctx' array (_ctx).*/
struct _fd_ctx *_ctx;
int xl_count; /* Number of xl referred in this fd */
struct fd_lk_ctx *lk_ctx;
gf_boolean_t anonymous; /* geo-rep anonymous fd */
};
8. gfid
inode的id,用来在文件系统中找到文件的inode的
9. ctx
记录各种配置信息的变量。
10. fuse
/* State of the fuse bridge: device fd, mount options, worker-thread
 * bookkeeping and tunables.  Presumably stored in the fuse translator's
 * `private` pointer -- TODO confirm. */
struct fuse_private {
int fd;
uint32_t proto_minor;
char *volfile;
size_t volfile_size;
char *mount_point;
struct iobuf *iobuf;
pthread_t fuse_thread;
char fuse_thread_started;
uint32_t direct_io_mode;
size_t *msg0_len_p;
double entry_timeout;
double negative_timeout;
double attribute_timeout;
pthread_cond_t sync_cond;
pthread_mutex_t sync_mutex;
char event_recvd;
char init_recvd;
gf_boolean_t strict_volfile_check;
fuse_handler_t **fuse_ops;
fuse_handler_t **fuse_ops0;
pthread_mutex_t fuse_dump_mutex;
int fuse_dump_fd;
glusterfs_graph_t *next_graph;
xlator_t *active_subvol;
pid_t client_pid;
gf_boolean_t client_pid_set;
unsigned uid_map_root;
gf_boolean_t acl;
gf_boolean_t selinux;
gf_boolean_t read_only;
int32_t fopen_keep_cache;
int32_t gid_cache_timeout;
gf_boolean_t enable_ino32;
/* This is the mount option for disabling the root-squash for the
mount irrespective of whether the root-squash option for the
volume is set or not. But this option is honoured only for
the trusted clients. For non trusted clients this value does
not have any effect and the volume option for root-squash is
honoured.
*/
gf_boolean_t no_root_squash;
fdtable_t *fdtable;
gid_cache_t gid_cache;
char *fuse_mountopts;
/* For fuse-reverse-validation */
struct list_head invalidate_list;
pthread_cond_t invalidate_cond;
pthread_mutex_t invalidate_mutex;
gf_boolean_t reverse_fuse_thread_started;
uint64_t invalidate_count;
/* For communicating with separate mount thread. */
int status_pipe[2];
/* for fuse queue length and congestion threshold */
int background_qlen;
int congestion_threshold;
/* for using fuse-kernel readdirp*/
gf_boolean_t use_readdirp;
/* fini started, helps prevent multiple epoll worker threads
* firing up the fini routine */
gf_boolean_t fini_invoked;
/* resolve gid with getgrouplist() instead of /proc/%d/status */
gf_boolean_t resolve_gids;
/* Enable or disable capability support */
gf_boolean_t capability;
/* Enable or disable throttle */
gf_boolean_t throttle;
int32_t outstanding_limit;
sem_t sem;
uuid_t volume_id;
char volume_id_str[37];
/* LRU Limit, if not set, default is 64k for now */
uint32_t lru_limit;
uint32_t invalidate_limit;
};
fuse_state_t
/* Per-request state of the fuse bridge: the request header, the locations
 * and fd being operated on, resolver state, and the operation arguments. */
typedef struct {
void *pool;
xlator_t *this;
xlator_t *active_subvol;
inode_table_t *itable;
loc_t loc;
loc_t loc2;
fuse_in_header_t *finh;
int32_t flags;
off_t off;
size_t size;
unsigned long nlookup;
fd_t *fd;
dict_t *xattr;
dict_t *xdata;
char *name;
char is_revalidate;
gf_boolean_t truncate_needed;
gf_lock_t lock;
uint64_t lk_owner;
/* used within resolve_and_resume */
/* */
fuse_resolve_t resolve;
fuse_resolve_t resolve2;
loc_t *loc_now;
fuse_resolve_t *resolve_now;
void *resume_fn;
int valid;
int mask;
dev_t rdev;
mode_t mode;
mode_t umask;
struct iatt attr;
struct gf_flock lk_lock;
struct iovec vector; /* I/O vector (base pointer + length) */
uuid_t gfid;
uint32_t io_flags;
int32_t fd_no;
sem_t *sem; /* a system semaphore; sem_t is a union type */
} fuse_state_t;
struct iovec vector
io流,真实的数据存储在该集合内部;
/* One I/O buffer descriptor: the actual data lives in the memory it
 * describes. */
struct iovec {
void *iov_base; /* start address of the buffer */
size_t iov_len; /* length of the buffer */
};
sem_t
/usr/include/bits/semaphore.h
/* Semaphore type as declared in /usr/include/bits/semaphore.h: a union of a
 * byte array of __SIZEOF_SEM_T bytes and a long used for alignment. */
typedef union
{
char __size[__SIZEOF_SEM_T];
long int __align;
} sem_t;
rpc_transport
/* An RPC transport endpoint: its operation table, lifecycle callbacks,
 * peer/local address info, counters and throttling state. */
struct rpc_transport {
struct rpc_transport_ops *ops;
rpc_transport_t *listener; /* listener transport to which
* request for creation of this
* transport came from. valid only
* on server process.
*/
void *private;
struct _client_t *xl_private;
void *xl; /* Used for THIS */
void *mydata;
pthread_mutex_t lock;
int32_t refcount;
int32_t outstanding_rpc_count;
glusterfs_ctx_t *ctx;
dict_t *options;
char *name;
void *dnscache;
void *drc_client;
data_t *buf;
int32_t (*init)(rpc_transport_t *this); /* initialise the transport */
void (*fini)(rpc_transport_t *this); /* tear the transport down */
int (*reconfigure)(rpc_transport_t *this, dict_t *options);
rpc_transport_notify_t notify;
void *notify_data;
peer_info_t peerinfo;
peer_info_t myinfo;
uint64_t total_bytes_read;
uint64_t total_bytes_write;
struct list_head list;
int bind_insecure;
void *dl_handle; /* handle of dlopen() */
char *ssl_name;
dict_t *clnt_options; /* store options received from
* client */
uint64_t rpcsvc_request_sent_count;
uint64_t rpcsvc_request_received_count;
uint64_t pingcnt;
struct timeval last_sent;
struct timeval last_received;
gf_boolean_t throttled;
gf_boolean_t special_client;
gf_boolean_t special_backup;
int outstanding_limit_std;
int borrowed_cnt;
uint64_t timer_loops;
gf_boolean_t rebalance;
bonus_status_t bonus_status;
gf_boolean_t readonly_client;
gf_boolean_t no_root_squash_client;
#define TRANSPORT_SOCKET 0
#define TRANSPORT_RDMA 1
/* 0 for socket, 1 for rdma */
int trans_type;
/* variables for rdma sequential read/write */
int32_t lock_owner;
pthread_mutex_t *pollcq_lock;
gf_boolean_t
rdma_server_multithreading; /* server (nfs server/brick server) use
multithreading*/
rpc_sequential_request_t r;
rpc_sequential_request_t w;
};
rpc_transport_ops
/* Table of function pointers a transport implementation provides: submit
 * request/reply, connect/listen/disconnect, name/address queries, and
 * throttling. */
struct rpc_transport_ops {
/* no need of receive op, msg will be delivered through an event
* notification
*/
int32_t (*submit_request)(rpc_transport_t *this, rpc_transport_req_t *req);
int32_t (*submit_reply)(rpc_transport_t *this,
rpc_transport_reply_t *reply);
int32_t (*connect)(rpc_transport_t *this, int port);
int32_t (*listen)(rpc_transport_t *this);
int32_t (*disconnect)(rpc_transport_t *this);
int32_t (*get_peername)(rpc_transport_t *this, char *hostname, int hostlen);
int32_t (*get_peeraddr)(rpc_transport_t *this, char *peeraddr, int addrlen,
struct sockaddr_storage *sa, socklen_t sasize);
int32_t (*get_myname)(rpc_transport_t *this, char *hostname, int hostlen);
int32_t (*get_myaddr)(rpc_transport_t *this, char *peeraddr, int addrlen,
struct sockaddr_storage *sa, socklen_t sasize);
int32_t (*throttle)(rpc_transport_t *this, gf_boolean_t onoff);
};
event_slot_epoll
/* One registration slot of the epoll event table: the fd, its bookkeeping
 * counters/flags, and the handler with its user data. */
struct event_slot_epoll {
int fd;
int events;
int gen;
int ref;
int do_close;
int in_handler;
int handled_error;
void *data;
event_handler_t handler; /* stores the callback function */
gf_lock_t lock;
};
event_pool
/* Event pool: the operation table in use, the registered slots (poll and
 * epoll variants), and the poller-thread bookkeeping. */
struct event_pool {
struct event_ops *ops;
int fd;
int breaker[2];
int count;
struct event_slot_poll *reg;
struct event_slot_epoll *ereg[EVENT_EPOLL_TABLES];
int slots_used[EVENT_EPOLL_TABLES];
int used;
int changed;
pthread_mutex_t mutex;
pthread_cond_t cond;
void *evcache;
int evcache_size;
/* NOTE: Currently used only when event processing is done using
* epoll. */
int eventthreadcount; /* number of event threads to execute. */
pthread_t pollers[EVENT_MAX_THREADS]; /* poller thread_id store,
* and live status */
int destroy;
int activethreadcount;
};
event_ops event_ops_epoll
在gluster里使用了大量的函数指针,所以如何找到指针对应的函数,非常有助于代码的阅读,下面2个struct,定义了对应的event函数
/* Table of function pointers that an event backend implements; every
 * operation other than `new` takes the event pool it operates on. */
struct event_ops {
struct event_pool *(*new)(int count, int eventthreadcount);
int (*event_register)(struct event_pool *event_pool, int fd,
event_handler_t handler, void *data, int poll_in,
int poll_out);
int (*event_select_on)(struct event_pool *event_pool, int fd, int idx,
int poll_in, int poll_out);
int (*event_unregister)(struct event_pool *event_pool, int fd, int idx);
int (*event_unregister_close)(struct event_pool *event_pool, int fd,
int idx);
int (*event_dispatch)(struct event_pool *event_pool);
int (*event_reconfigure_threads)(struct event_pool *event_pool,
int newcount);
int (*event_pool_destroy)(struct event_pool *event_pool);
int (*event_handled)(struct event_pool *event_pool, int fd, int idx,
int gen);
};
/* The epoll backend: binds each event_ops slot to the function that really
 * implements it, so a call through the pointer can be traced to its
 * *_epoll implementation. */
struct event_ops event_ops_epoll = {
.new = event_pool_new_epoll,
.event_register = event_register_epoll,
.event_select_on = event_select_on_epoll,
.event_unregister = event_unregister_epoll,
.event_unregister_close = event_unregister_close_epoll,
.event_dispatch = event_dispatch_epoll,
.event_reconfigure_threads = event_reconfigure_threads_epoll,
.event_pool_destroy = event_pool_destroy_epoll,
.event_handled = event_handled_epoll,
};
/*
 * Start the configured number of epoll poller threads and then block on the
 * first one.
 *
 * The thread count is taken from event_pool->eventthreadcount, capped at
 * EVENT_MAX_THREADS and raised to 1 when it is not positive.  The thread in
 * slot 0 stays joinable; every other thread is detached.  Failing to set up
 * slot 0 stops the start-up loop, while failures for later slots are
 * skipped.
 *
 * @param event_pool  pool whose pollers are started
 * @return result of the last pthread_create() call, or -1 if none was made
 */
static int
event_dispatch_epoll(struct event_pool *event_pool)
{
    struct event_thread_data *worker_arg = NULL;
    pthread_t tid;
    int nthreads = 0;
    int idx = 0;
    int ret = -1;

    pthread_mutex_lock(&event_pool->mutex);
    {
        /* Clamp the requested poller count into [1, EVENT_MAX_THREADS]. */
        nthreads = event_pool->eventthreadcount;
        if (nthreads > EVENT_MAX_THREADS)
            nthreads = EVENT_MAX_THREADS;
        if (nthreads <= 0)
            nthreads = 1;

        event_pool->activethreadcount++;

        for (idx = 0; idx < nthreads; idx++) {
            worker_arg = GF_CALLOC(1, sizeof(*worker_arg),
                                   gf_common_mt_event_pool);
            if (!worker_arg) {
                /* Slot 0 is the thread we join on, so give up without it;
                 * losing any other poller is tolerated. */
                if (idx == 0)
                    break;
                continue;
            }

            worker_arg->event_pool = event_pool;
            worker_arg->event_index = idx + 1;

            /* This is the statement that actually creates the thread. */
            ret = pthread_create(&tid, NULL, event_dispatch_epoll_worker,
                                 worker_arg);
            if (ret) {
                gf_msg("epoll", GF_LOG_WARNING, 0,
                       LG_MSG_START_EPOLL_THREAD_FAILED,
                       "Failed to start thread for index %d", idx);
                /* The worker never started, so its argument is still ours. */
                GF_FREE(worker_arg);
                if (idx == 0)
                    break;
                continue;
            }

            event_pool->pollers[idx] = tid;
            /* Only the thread in slot 0 stays joinable; detach errors for
             * the others are ignored. */
            if (idx != 0)
                pthread_detach(event_pool->pollers[idx]);
        }
    }
    pthread_mutex_unlock(&event_pool->mutex);

    /* Block on the joinable first poller, if one was started. */
    if (event_pool->pollers[0] != 0)
        pthread_join(event_pool->pollers[0], NULL);

    pthread_mutex_lock(&event_pool->mutex);
    {
        event_pool->activethreadcount--;
    }
    pthread_mutex_unlock(&event_pool->mutex);

    return ret;
}
nfs3_local —— nfs3_call_state_t *cs
用于在 fop 与其回调之间通信状态的结构。并非所有成员都一直被使用。用法取决于 fop 和 NFS 请求。
/* Call state used to pass information between a fop and its callback for an
 * NFSv3 request.  Not every member is used by every request; which ones are
 * depends on the fop and the NFS request. */
struct nfs3_local {
GF_REF_DECL;
rpcsvc_request_t *req;
xlator_t *vol;
nfs3_resume_fn_t resume_fn;
xlator_t *nfsx;
struct nfs3_state *nfs3state;
struct iatt buf;
/* The list hook to attach this call state to the inode's queue till
* the opening of the fd on the inode completes.
*/
struct list_head openwait_q;
/* Per-NFSv3 Op state */
struct nfs3_fh parent;
struct nfs3_fh fh;
struct nfs3_fh old_fh;
fd_t *fd;
uint32_t accessbits;
int operrno;
count3 dircount;
count3 maxcount;
struct statvfs fsstat;
gf_dirent_t entries;
struct iatt stbuf;
struct iatt preparent;
struct iatt postparent;
int32_t setattr_valid;
nfstime3 timestamp;
loc_t oploc;
int writetype;
count3 datacount;
offset3 dataoffset;
struct iobuf *iob;
struct iobref *iobref;
createmode3 createmode;
uint64_t cookieverf;
int sattrguardcheck;
char *pathname;
char *old_pathname;
ftype3 mknodtype;
specdata3 devnums;
cookie3 cookie;
struct iovec datavec;
mode_t mode;
/* NFSv3 FH resolver state */
int hardresolved;
int softresolved;
int src_softresolved;
gf_boolean_t retry_resloved;
struct nfs3_fh resolvefh;
loc_t resolvedloc;
int resolve_ret;
int resolve_errno;
int hashidx;
fd_t *resolve_dir_fd;
char *resolventry;
nfs3_lookup_type_t lookuptype;
gf_dirent_t *hashmatch;
gf_dirent_t *entrymatch;
off_t lastentryoffset;
struct flock flock;
args args;
nlm4_lkowner_t lkowner;
char cookiebytes[1024];
struct nfs3_fh lockfh;
int monitor;
rpc_transport_t *trans;
call_frame_t *frame;
/* ACL */
aclentry aclentry[NFS_ACL_MAX_ENTRIES];
aclentry daclentry[NFS_ACL_MAX_ENTRIES];
int aclcount;
char aclxattr[NFS_ACL_MAX_ENTRIES * 8 + 4];
int daclcount;
char daclxattr[NFS_ACL_MAX_ENTRIES * 8 + 4];
};
rpcsvc_request
/* One incoming RPC request on the server side: the transport it arrived on,
 * call identification, caller credentials, payload buffers and reply
 * staging state. */
struct rpcsvc_request {
/* connection over which this request came. */
rpc_transport_t *trans;
rpcsvc_t *svc;
rpcsvc_program_t *prog;
/* The identifier for the call from client.
* Needed to pair the reply with the call.
*/
uint32_t xid;
int prognum;
int progver;
int procnum;
int type;
/* Uid and gid filled by the rpc-auth module during the authentication
* phase.
*/
uid_t uid;
gid_t gid;
pid_t pid;
gf_lkowner_t lk_owner;
uint64_t gfs_id;
/* Might want to move this to AUTH_UNIX specific state since this array
* is not available for every authentication scheme.
*/
gid_t *auxgids;
gid_t auxgidsmall[SMALL_GROUP_COUNT];
gid_t *auxgidlarge;
int auxgidcount;
/* The RPC message payload, contains the data required
* by the program actors. This is the buffer that will need to
* be de-xdred by the actor.
*/
struct iovec msg[MAX_IOVEC];
int count;
struct iobref *iobref;
/* Status of the RPC call, whether it was accepted or denied. */
int rpc_status;
/* In case, the call was denied, the RPC error is stored here
* till the reply is sent.
*/
int rpc_err;
/* In case the failure happened because of an authentication problem
* , this value needs to be assigned the correct auth error number.
*/
int auth_err;
/* There can be cases of RPC requests where the reply needs to
* be built from multiple sources. E.g. where even the NFS reply
* can contain a payload, as in the NFSv3 read reply. Here the RPC header
* ,NFS header and the read data are brought together separately from
* different buffers, so we need to stage the buffers temporarily here
* before all of them get added to the connection's transmission list.
*/
struct list_head txlist;
/* While the reply record is being built, this variable keeps track
* of how many bytes have been added to the record.
*/
size_t payloadsize;
/* The credentials extracted from the rpc request */
rpcsvc_auth_data_t cred;
/* The verifier extracted from the rpc request. In request side
* processing this contains the verifier sent by the client, on reply
* side processing, it is filled with the verifier that will be
* sent to the client.
*/
rpcsvc_auth_data_t verf;
/* Execute this request's actor function as a synctask? */
gf_boolean_t synctask;
/* Container for a RPC program wanting to store a temp
* request-specific item.
*/
void *private;
/* Container for transport to store request-specific item */
void *trans_private;
/* we need to ref the 'iobuf' in case of 'synctasking' it */
struct iobuf *hdr_iobuf;
/* pointer to cached reply for use in DRC */
drc_cached_op_t *reply;
/* list for sequential read/write */
struct list_head list;
};
rpcsvc_actor_t glusterfs3_3_fop_actors
很神奇的写法,惊叹。。。
./rpc/rpc-lib/src/rpcsvc.h
/* Descriptor of one RPC procedure: its name and number, the actor function
 * that handles it, and per-procedure options.  The field order here is the
 * order positional initialisers of rpcsvc_actor_t must follow. */
typedef struct rpcsvc_actor_desc {
char procname[RPCSVC_NAME_MAX];
int procnum;
rpcsvc_actor actor;
/* Handler for cases where the RPC requests fragments are large enough
* to benefit from being decoded into aligned memory addresses. While
* decoding the request in a non-vectored manner, due to the nature of
* the XDR scheme, RPC cannot guarantee memory aligned addresses for
* the resulting message-specific structures. Allowing a specialized
* handler for letting the RPC program read the data from the network
* directly into its aligned buffers.
*/
rpcsvc_vector_sizer vector_sizer;
/* Can actor be ran on behalf an unprivileged requestor? */
gf_boolean_t unprivileged;
drc_op_type_t op_type;
} rpcsvc_actor_t;
服务器各方法的入口
位置:./xlators/protocol/server/src/server-rpc-fops.c
/* Entry points of the server-side operations: the table is indexed by
 * GFS3_OP_* and each entry is
 * {procname, procnum, actor, vector_sizer, unprivileged, op_type}.
 * Only WRITE supplies a vector sizer; every entry here uses DRC_NA. */
rpcsvc_actor_t glusterfs3_3_fop_actors[GLUSTER_FOP_PROCCNT] = {
[GFS3_OP_NULL] = {"NULL", GFS3_OP_NULL, server_null, NULL, 0, DRC_NA},
[GFS3_OP_STAT] = {"STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0, DRC_NA},
[GFS3_OP_READLINK] = {"READLINK", GFS3_OP_READLINK, server3_3_readlink,
NULL, 0, DRC_NA},
[GFS3_OP_MKNOD] = {"MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0,
DRC_NA},
[GFS3_OP_MKDIR] = {"MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0,
DRC_NA},
[GFS3_OP_UNLINK] = {"UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0,
DRC_NA},
[GFS3_OP_RMDIR] = {"RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0,
DRC_NA},
[GFS3_OP_SYMLINK] = {"SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0,
DRC_NA},
[GFS3_OP_RENAME] = {"RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0,
DRC_NA},
[GFS3_OP_LINK] = {"LINK", GFS3_OP_LINK, server3_3_link, NULL, 0, DRC_NA},
[GFS3_OP_TRUNCATE] = {"TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate,
NULL, 0, DRC_NA},
[GFS3_OP_OPEN] = {"OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0, DRC_NA},
[GFS3_OP_READ] = {"READ", GFS3_OP_READ, server3_3_readv, NULL, 0, DRC_NA},
[GFS3_OP_WRITE] = {"WRITE", GFS3_OP_WRITE, server3_3_writev,
server3_3_writev_vecsizer, 0, DRC_NA},
[GFS3_OP_STATFS] = {"STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0,
DRC_NA},
[GFS3_OP_FLUSH] = {"FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0,
DRC_NA},
[GFS3_OP_FSYNC] = {"FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0,
DRC_NA},
[GFS3_OP_SETXATTR] = {"SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr,
NULL, 0, DRC_NA},
[GFS3_OP_GETXATTR] = {"GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr,
NULL, 0, DRC_NA},
[GFS3_OP_REMOVEXATTR] = {"REMOVEXATTR", GFS3_OP_REMOVEXATTR,
server3_3_removexattr, NULL, 0, DRC_NA},
[GFS3_OP_OPENDIR] = {"OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0,
DRC_NA},
[GFS3_OP_FSYNCDIR] = {"FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir,
NULL, 0, DRC_NA},
[GFS3_OP_ACCESS] = {"ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0,
DRC_NA},
[GFS3_OP_CREATE] = {"CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0,
DRC_NA},
[GFS3_OP_FTRUNCATE] = {"FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate,
NULL, 0, DRC_NA},
[GFS3_OP_FSTAT] = {"FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0,
DRC_NA},
[GFS3_OP_LK] = {"LK", GFS3_OP_LK, server3_3_lk, NULL, 0, DRC_NA},
[GFS3_OP_LOOKUP] = {"LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0,
DRC_NA},
[GFS3_OP_READDIR] = {"READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0,
DRC_NA},
[GFS3_OP_INODELK] = {"INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0,
DRC_NA},
[GFS3_OP_FINODELK] = {"FINODELK", GFS3_OP_FINODELK, server3_3_finodelk,
NULL, 0, DRC_NA},
[GFS3_OP_ENTRYLK] = {"ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0,
DRC_NA},
[GFS3_OP_FENTRYLK] = {"FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk,
NULL, 0, DRC_NA},
[GFS3_OP_XATTROP] = {"XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0,
DRC_NA},
[GFS3_OP_FXATTROP] = {"FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop,
NULL, 0, DRC_NA},
[GFS3_OP_FGETXATTR] = {"FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr,
NULL, 0, DRC_NA},
[GFS3_OP_FSETXATTR] = {"FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr,
NULL, 0, DRC_NA},
[GFS3_OP_RCHECKSUM] = {"RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum,
NULL, 0, DRC_NA},
[GFS3_OP_SETATTR] = {"SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0,
DRC_NA},
[GFS3_OP_FSETATTR] = {"FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr,
NULL, 0, DRC_NA},
[GFS3_OP_READDIRP] = {"READDIRP", GFS3_OP_READDIRP, server3_3_readdirp,
NULL, 0, DRC_NA},
[GFS3_OP_RELEASE] = {"RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0,
DRC_NA},
[GFS3_OP_RELEASEDIR] = {"RELEASEDIR", GFS3_OP_RELEASEDIR,
server3_3_releasedir, NULL, 0, DRC_NA},
[GFS3_OP_FREMOVEXATTR] = {"FREMOVEXATTR", GFS3_OP_FREMOVEXATTR,
server3_3_fremovexattr, NULL, 0, DRC_NA},
[GFS3_OP_FALLOCATE] = {"FALLOCATE", GFS3_OP_FALLOCATE, server3_3_fallocate,
NULL, 0, DRC_NA},
[GFS3_OP_DISCARD] = {"DISCARD", GFS3_OP_DISCARD, server3_3_discard, NULL, 0,
DRC_NA},
[GFS3_OP_ZEROFILL] = {"ZEROFILL", GFS3_OP_ZEROFILL, server3_3_zerofill,
NULL, 0, DRC_NA},
[GFS3_OP_IPC] = {"IPC", GFS3_OP_IPC, server3_3_ipc, NULL, 0, DRC_NA},
};
nfs3_fh
用来保证一个字节流确实是 GlusterFS NFS 里面的文件句柄
/* GlusterFS NFSv3 file handle.  Declared packed; per the comment before
 * `dsid` it must be exactly NFS3_FHSIZE (64) bytes, which the trailing
 * padding array is sized to achieve. */
struct nfs3_fh {
/* Used to ensure that a bunch of bytes are actually a GlusterFS NFS
file handle. Should contain ":OGL"
*/
char ident[4];
/* UUID that identifies an export. The value stored in exportid
* depends on the usage of gluster nfs. If the DVM is enabled using
* the nfs.dynamic-volumes option then exportid will contain the UUID
* of the volume so that gnfs is able to identify volumes uniquely
* through volume additions,deletions,migrations, etc.
*
* When not using dvm, exportid contains the index of the volume
* based on the position of the volume in the list of subvolumes
* for gnfs.
*/
uuid_t exportid;
/* File/dir gfid. */
uuid_t gfid;
uuid_t mountid;
/* This structure must be exactly NFS3_FHSIZE (64) bytes long.
Having the structure shorter results in buffer overflows
during XDR decoding.
*/
/* Dataset id, to distinguish snapshot or volume */
uint64_t dsid;
unsigned char padding[NFS3_FHSIZE - GF_NFSFH_STATIC_SIZE];
} __attribute__((__packed__));
rpcsvc_actor_t nfs3svc_actors
RPC的表,将操作和参数对应起来
/* RPC table of the NFSv3 service: pairs each procedure with its handler.
 * Entries are positional (no designated indices), each being
 * {procname, procnum, actor, vector_sizer, unprivileged, op_type}.
 * Only WRITE supplies a vector sizer. */
rpcsvc_actor_t nfs3svc_actors[NFS3_PROC_COUNT] = {
{"NULL", NFS3_NULL, nfs3svc_null, NULL, 0, DRC_IDEMPOTENT},
{"GETATTR", NFS3_GETATTR, nfs3svc_getattr, NULL, 0, DRC_IDEMPOTENT},
{"SETATTR", NFS3_SETATTR, nfs3svc_setattr, NULL, 0, DRC_NON_IDEMPOTENT},
{"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0, DRC_IDEMPOTENT},
{"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0, DRC_IDEMPOTENT},
{"READLINK", NFS3_READLINK, nfs3svc_readlink, NULL, 0, DRC_IDEMPOTENT},
{"READ", NFS3_READ, nfs3svc_read, NULL, 0, DRC_IDEMPOTENT},
{"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0,
DRC_NON_IDEMPOTENT},
{"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0, DRC_NON_IDEMPOTENT},
{"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0, DRC_NON_IDEMPOTENT},
{"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink, NULL, 0, DRC_NON_IDEMPOTENT},
{"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0, DRC_NON_IDEMPOTENT},
{"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0, DRC_NON_IDEMPOTENT},
{"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0, DRC_NON_IDEMPOTENT},
{"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0, DRC_NON_IDEMPOTENT},
{"LINK", NFS3_LINK, nfs3svc_link, NULL, 0, DRC_NON_IDEMPOTENT},
{"READDIR", NFS3_READDIR, nfs3svc_readdir, NULL, 0, DRC_IDEMPOTENT},
{"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp, NULL, 0, DRC_IDEMPOTENT},
{"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0, DRC_IDEMPOTENT},
{"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0, DRC_IDEMPOTENT},
{"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf, NULL, 0, DRC_IDEMPOTENT},
{"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0, DRC_IDEMPOTENT}
};
fuse_handler_t函数指针
位于 ./xlators/mount/fuse/src/fuse-bridge.c 中。
这是fuse的接口,也是客户端的入口,作为文件系统的“桥节点”。
/* Signature shared by all fuse request handlers: the translator, the fuse
 * request header, and the message body. */
typedef void(fuse_handler_t)(xlator_t *this, fuse_in_header_t *finh, void *msg);
/* Dispatch table indexed by fuse opcode.  Opcodes that appear only in
 * comments below have no entry and are therefore left NULL; some entries
 * depend on FUSE_KERNEL_MINOR_VERSION. */
static fuse_handler_t *fuse_std_ops[FUSE_OP_HIGH] = {
[FUSE_LOOKUP] = fuse_lookup,
[FUSE_FORGET] = fuse_forget,
[FUSE_GETATTR] = fuse_getattr,
[FUSE_SETATTR] = fuse_setattr,
[FUSE_READLINK] = fuse_readlink,
[FUSE_SYMLINK] = fuse_symlink,
[FUSE_MKNOD] = fuse_mknod,
[FUSE_MKDIR] = fuse_mkdir,
[FUSE_UNLINK] = fuse_unlink,
[FUSE_RMDIR] = fuse_rmdir,
[FUSE_RENAME] = fuse_rename,
[FUSE_LINK] = fuse_link,
[FUSE_OPEN] = fuse_open,
[FUSE_READ] = fuse_readv,
[FUSE_WRITE] = fuse_write,
[FUSE_STATFS] = fuse_statfs,
[FUSE_RELEASE] = fuse_release,
[FUSE_FSYNC] = fuse_fsync,
[FUSE_SETXATTR] = fuse_setxattr,
[FUSE_GETXATTR] = fuse_getxattr,
[FUSE_LISTXATTR] = fuse_listxattr,
[FUSE_REMOVEXATTR] = fuse_removexattr,
[FUSE_FLUSH] = fuse_flush,
[FUSE_INIT] = fuse_init,
[FUSE_OPENDIR] = fuse_opendir,
[FUSE_READDIR] = fuse_readdir,
[FUSE_RELEASEDIR] = fuse_releasedir,
[FUSE_FSYNCDIR] = fuse_fsyncdir,
[FUSE_GETLK] = fuse_getlk,
[FUSE_SETLK] = fuse_setlk,
/* FUSE_SETLKW is routed to the same handler as FUSE_SETLK */
[FUSE_SETLKW] = fuse_setlk,
[FUSE_ACCESS] = fuse_access,
[FUSE_CREATE] = fuse_create,
/* [FUSE_INTERRUPT] */
/* [FUSE_BMAP] */
[FUSE_DESTROY] = fuse_destroy,
/* [FUSE_IOCTL] */
/* [FUSE_POLL] */
/* [FUSE_NOTIFY_REPLY] */
#if FUSE_KERNEL_MINOR_VERSION >= 16
[FUSE_BATCH_FORGET] = fuse_batch_forget,
#endif
#if FUSE_KERNEL_MINOR_VERSION >= 19
#ifdef FALLOC_FL_KEEP_SIZE
[FUSE_FALLOCATE] = fuse_fallocate,
#endif /* FALLOC_FL_KEEP_SIZE */
#endif
#if FUSE_KERNEL_MINOR_VERSION >= 21
[FUSE_READDIRPLUS] = fuse_readdirp,
#endif
};
fuse_opcode
fuse操作对应的编号
/* Numeric codes of the fuse operations.  The numbering is not contiguous:
 * 7 and 19 are not assigned in this list. */
enum fuse_opcode {
FUSE_LOOKUP = 1,
FUSE_FORGET = 2, /* no reply */
FUSE_GETATTR = 3,
FUSE_SETATTR = 4,
FUSE_READLINK = 5,
FUSE_SYMLINK = 6,
FUSE_MKNOD = 8,
FUSE_MKDIR = 9,
FUSE_UNLINK = 10,
FUSE_RMDIR = 11,
FUSE_RENAME = 12,
FUSE_LINK = 13,
FUSE_OPEN = 14,
FUSE_READ = 15,
FUSE_WRITE = 16,
FUSE_STATFS = 17,
FUSE_RELEASE = 18,
FUSE_FSYNC = 20,
FUSE_SETXATTR = 21,
FUSE_GETXATTR = 22,
FUSE_LISTXATTR = 23,
FUSE_REMOVEXATTR = 24,
FUSE_FLUSH = 25,
FUSE_INIT = 26,
FUSE_OPENDIR = 27,
FUSE_READDIR = 28,
FUSE_RELEASEDIR = 29,
FUSE_FSYNCDIR = 30,
FUSE_GETLK = 31,
FUSE_SETLK = 32,
FUSE_SETLKW = 33,
FUSE_ACCESS = 34,
FUSE_CREATE = 35,
FUSE_INTERRUPT = 36,
FUSE_BMAP = 37,
FUSE_DESTROY = 38,
FUSE_IOCTL = 39,
FUSE_POLL = 40,
FUSE_NOTIFY_REPLY = 41,
FUSE_BATCH_FORGET = 42,
FUSE_FALLOCATE = 43,
FUSE_READDIRPLUS = 44,
/* CUSE specific operations */
CUSE_INIT = 4096,
};
volfile
/var/lib/alamod/vols/vol1/vol1.Al1Xtao.data-pool2-vol1-brick.vol
volume vol1-posix
type storage/posix
option posix on
option volume-id 2fc63729-770f-4287-af1a-5c2c2b64ffb7
option directory /data/pool2/vol1-brick
end-volume
volume vol1-huntlog
type features/huntlog
option huntlog-filter off
option huntlog off
option directory /data/pool2/vol1-brick
option brick-path /data/pool2/vol1-brick
option volume-name vol1
option brick-index 1
subvolumes vol1-posix
end-volume
volume vol1-access-control
type features/access-control
subvolumes vol1-huntlog
end-volume
volume vol1-locks
type features/locks
subvolumes vol1-access-control
end-volume
volume vol1-io-queues
type performance/io-queues
option io-queues on
subvolumes vol1-locks
end-volume
volume vol1-marker
type features/marker
option inode-quota off
option quota off
option gsync-force-xtime off
option xtime off
option quota-version 0
option timestamp-file /var/lib/alamod/vols/vol1/marker.tstamp
option volume-uuid 2fc63729-770f-4287-af1a-5c2c2b64ffb7
subvolumes vol1-io-queues
end-volume
volume vol1
type features/quota
option deem-statfs off
option timeout 0
option server-quota off
option volume-uuid vol1
subvolumes vol1-marker
end-volume
volume /data/pool2/vol1-brick
type debug/io-stats
option count-fop-hits off
option frame-latency-measurement off
option latency-measurement off
option io-stats on
option io-stats-global-switch off
option log-level INFO
subvolumes vol1
end-volume
volume vol1-server
type protocol/server
option transport.socket.keepalive-count 5
option transport.socket.keepalive-interval 2
option transport.socket.keepalive-time 20
option auth.addr./data/pool2/vol1-brick.allow *
option auth.login.c8730cf8-037b-440a-9461-95d4f63609e7.password 4bd06a73-e935-4e45-8a30-e45bc609e0bb
option auth.login./data/pool2/vol1-brick.allow c8730cf8-037b-440a-9461-95d4f63609e7
option transport-type tcp
subvolumes /data/pool2/vol1-brick
end-volume