GlusterFS Source Code Analysis: Important Data Structures

Notes taken while studying GlusterFS: a brief analysis of several important data structures in the codebase. A work in progress, continuously updated.

1. frame and frame->local

A call_frame_t represents one xlator's slice of an in-flight operation. Besides carrying that xlator's per-call context, a frame mainly records the call relationship between an xlator and its parent and child nodes: it points back to its parent's frame, and frames for its children are chained off it in turn. The frame is the thread that ties the otherwise independent xlators together into a single call.

alamofs/libglusterfs/src/stack.h:

struct _call_frame_t {
    call_stack_t *root;   /* stack root: identifies who is making the call
                             (the current user working in the system); this
                             is what gets sent to the server */
    call_frame_t *parent; /* previous BP */
    struct list_head frames; /* linkage into the frame list: a prev and a
                                next pointer */
    void *local;    /* local variables: this xlator's per-call context.
                       Every translator's context looks different, hence
                       the void *. It is what lets the callback restore
                       its context when an asynchronous request completes,
                       possibly on another thread */
    xlator_t *this; /* implicit object: the translator this frame belongs to */
    ret_fn_t ret;   /* op_return address: the function the translator runs
                       on unwind */
    int32_t ref_count;
    gf_lock_t lock;
    void *cookie; /* unique cookie */
    gf_boolean_t complete;

    glusterfs_fop_t op;
    struct timeval begin; /* when this frame was created */
    struct timeval end;   /* when this frame completed */
    const char *wind_from;   /* where the wind started: the topmost ancestor on the chain */
    const char *wind_to;     /* where the wind ended: the bottommost descendant */
    const char *unwind_from; /* where the unwind started: the bottommost descendant */
    const char *unwind_to;   /* where the unwind ended: the topmost ancestor */
};

struct list_head {
    struct list_head *next;
    struct list_head *prev;
};
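
To make the wind/unwind bookkeeping concrete, here is a minimal sketch of how an xlator passes a stat call down to its first child and returns the result upward. my_stat and my_stat_cbk are hypothetical names; STACK_WIND, STACK_UNWIND_STRICT and FIRST_CHILD are the real macros from stack.h and xlator.h (callback signatures vary slightly across versions):

int32_t
my_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
            int32_t op_ret, int32_t op_errno, struct iatt *buf,
            dict_t *xdata)
{
    /* runs when the child answers: pops this frame and resumes the
     * parent frame through frame->ret */
    STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
    return 0;
}

int32_t
my_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
    /* creates a child frame linked to `frame` and invokes the child's
     * stat fop, registering my_stat_cbk as the return address */
    STACK_WIND(frame, my_stat_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->stat, loc, xdata);
    return 0;
}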

2. this and this->private

The underlying struct is _xlator, which represents a translator. Inside it sits private, which holds the translator's own state, typically built in init() from the options of its volfile node; since every translator's state differs, it too is a void *. (A sketch of this pattern follows the nfs_state macros below.)

alamofs/libglusterfs/src/xlator.h:

typedef struct _xlator xlator_t;

struct _xlator {
    /* Built during parsing */
    char *name;
    char *type;
    xlator_t *next;
    xlator_t *prev;
    xlator_list_t *parents;
    xlator_list_t *children;
    dict_t *options;

    /* Set after doing dlopen() */
    void *dlhandle;
    struct xlator_fops *fops;
    struct xlator_cbks *cbks;
    struct xlator_dumpops *dumpops;
    struct list_head volume_options; /* list of volume_option_t */

    void (*fini)(xlator_t *this);    /* callback: tears this translator down */
    int32_t (*init)(xlator_t *this); /* callback: initializes this translator */
    int32_t (*reconfigure)(xlator_t *this, dict_t *options);
    int32_t (*mem_acct_init)(xlator_t *this);
    event_notify_fn_t notify;

    gf_loglevel_t loglevel; /* Log level for translator */

    /* for latency measurement */
    fop_latency_t latencies[GF_FOP_MAXVALUE];

    /* Misc */
    eh_t *history; /* event history context */
    glusterfs_ctx_t *ctx;		
    glusterfs_graph_t *graph; /* not set for fuse */
    inode_table_t *itable;
    char init_succeeded;
    void *private;                      /* this translator's own state, built from its options */
    struct mem_acct *mem_acct;
    uint64_t winds;
    char switched;

    /* for the memory pool of 'frame->local' */
    struct mem_pool *local_pool; /* memory pool for frame->local allocations */
    gf_boolean_t is_autoloaded;
};

#define nfs_state(nfsxl) (nfsxl)->private
#define nfs_fop_mempool(nfxl) (((struct nfs_state *)nfs_state(nfxl))->foppool)
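
To illustrate how private typically gets filled, here is a minimal init() sketch. struct my_conf and the "directory" option are hypothetical; GF_CALLOC, dict_get_str and gf_strdup are the usual libglusterfs helpers, and gf_common_mt_char merely stands in for a proper per-xlator memory type:

struct my_conf {
    char *directory;
};

int32_t
init(xlator_t *this)
{
    struct my_conf *conf = NULL;
    char *dir = NULL;

    conf = GF_CALLOC(1, sizeof(*conf), gf_common_mt_char);
    if (!conf)
        return -1;

    /* this->options is the dict parsed from this volfile node */
    if (dict_get_str(this->options, "directory", &dir) == 0)
        conf->directory = gf_strdup(dir);

    this->private = conf; /* fops read it back later via this->private */
    return 0;
}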

THIS

THIS is used as a global: a per-thread pointer to the xlator whose code is currently executing.
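
For reference, the macro lives in libglusterfs/src/globals.h; its definition is approximately the following (exact form varies by version):

/* __glusterfs_this_location() returns a per-thread xlator_t ** slot,
 * so reading or assigning THIS is thread-local */
#define THIS (*__glusterfs_this_location())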

void *
__gf_calloc(size_t nmemb, size_t size, uint32_t type, const char *typestr)
{
    size_t tot_size = 0;
    size_t req_size = 0;
    char *ptr = NULL;
    xlator_t *xl = NULL;

    if (!THIS->ctx->mem_acct_enable)
        return CALLOC(nmemb, size);

    xl = THIS; /* THIS used here */

    req_size = nmemb * size;
    tot_size = req_size + GF_MEM_HEADER_SIZE + GF_MEM_TRAILER_SIZE;

    ptr = calloc(1, tot_size);

    if (!ptr) {
        gf_msg_nomem("", GF_LOG_ALERT, tot_size);
        return NULL;
    }
    gf_mem_set_acct_info(xl, &ptr, req_size, type, typestr);

    return (void *)ptr;
}

3. loc

To operate on a file, the file system must first find its inode. In NFSv3 there are two ways to locate an inode: directly by gfid, or by pargfid + name; a local file system can additionally reach a file's attributes through an fd.

There is also the dentry, i.e. the file's name, through which the file's inode can be found. A sketch of filling a loc_t follows the struct below.

alamofs/libglusterfs/src/xlator.h:

typedef struct _loc loc_t;

struct _loc {
    const char *path;  /* of little use: path resolution mostly does not rely on it */
    const char *name;  /* the file's name */
    inode_t *inode;    /* holds the file's attribute information */
    inode_t *parent;   /* the parent directory's inode */
    uint64_t dataset;  /* records which file system (dataset) this belongs to */
    /* Currently all location based operations are through 'gfid' of inode.
     * But the 'inode->gfid' only gets set in higher most layer (as in,
     * 'fuse', 'protocol/server', or 'nfs/server'). So if translators want
     * to send fops on a inode before the 'inode->gfid' is set, they have to
     * make use of below 'gfid' fields
     */
    uuid_t gfid;     /* the file's inode can be found directly via this gfid */
    uuid_t pargfid;  /* or via pargfid + name */
};
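
A minimal sketch of filling a loc_t for a lookup by pargfid + name, assuming parent_inode has already been resolved and "file.txt" is just an example name. inode_ref, inode_new and loc_wipe are standard libglusterfs helpers (older trees spell gf_uuid_copy as uuid_copy):

loc_t loc = {0,};

loc.parent = inode_ref(parent_inode);
gf_uuid_copy(loc.pargfid, parent_inode->gfid);
loc.name = "file.txt";
loc.inode = inode_new(parent_inode->table); /* placeholder for the result */

/* ... STACK_WIND a lookup with &loc ... */

loc_wipe(&loc); /* drops the inode references taken above */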

4. inode

Records the file's attributes.

alamofs/libglusterfs/src/inode.h:

struct _inode {
    inode_table_t *table; /* the table this inode belongs to */
    uint64_t *startime;   /* start time? */
    uuid_t gfid;
    uint64_t dataset;
    gf_lock_t lock;
    uint64_t nlookup;
    uint64_t size;                /* dentry count */
    uint32_t fd_count;            /* Opened fd count */
    uint32_t ref;                 /* reference count on this inode */
    ia_type_t ia_type;            /* what kind of file */
    struct list_head fd_list;     /* list of open files on this inode */
    struct list_head dentry_list; /* list of directory entries for this inode */
    struct list_head hash;        /* hash table pointers */
    struct list_head list;        /* active/lru/purge */

    struct _inode_ctx *_ctx; /* replacement for dict_t *(inode->ctx) */
    bool in_invalidate_list; /* Set if inode is in table invalidate list */
    bool invalidate_sent;    /* Set it if invalidator_fn is called for inode */
    bool in_lru_list;        /* Set if inode is in table lru list */
};
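
Each xlator can stash one uint64_t (usually a cast pointer) per inode in _ctx, keyed by the xlator itself. A minimal sketch using the classic inode_ctx_put / inode_ctx_get helpers (helper names vary a little between versions); my_state and struct my_ctx are hypothetical:

uint64_t value = 0;

/* store: keyed by `this`, so different xlators never collide */
inode_ctx_put(inode, this, (uint64_t)(uintptr_t)my_state);

/* retrieve */
if (inode_ctx_get(inode, this, &value) == 0) {
    struct my_ctx *ctx = (struct my_ctx *)(uintptr_t)value;
    /* ... use ctx ... */
}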

5. dict

Extended arguments for fops, stored as key-value pairs: things like a file's extended attributes (EAs), ACLs, and similar parameters.

attr: standard attributes, inspected with stat

ea: extended attributes, inspected with getfattr

All of this is POSIX territory: the user and trusted xattr namespaces, ugo permission bits, ACLs, and so on. A short usage sketch follows the struct definitions below.

alamofs/libglusterfs/src/dict.h:

struct _dict {
    unsigned char is_static : 1;
    int32_t hash_size;
    int32_t count;
    int32_t refcount;
    data_pair_t **members;
    data_pair_t *members_list;
    char *extra_free;
    char *extra_stdfree;
    gf_lock_t lock;
    data_pair_t *members_internal;
    data_pair_t free_pair;
    gf_boolean_t free_pair_in_use;
};
struct _data_pair {
    struct _data_pair *hash_next;
    struct _data_pair *prev;
    struct _data_pair *next;
    data_t *value;
    char *key;
};
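
A short usage sketch with the standard dict helpers (dict_new, dict_set_str, dict_get_str, dict_unref); "user.foo" is just an example key:

dict_t *xdata = dict_new();
char *val = NULL;

if (!xdata)
    return -1;

if (dict_set_str(xdata, "user.foo", "bar") == 0 &&
    dict_get_str(xdata, "user.foo", &val) == 0)
    gf_log(this->name, GF_LOG_INFO, "user.foo = %s", val);

dict_unref(xdata); /* dicts are reference counted */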

6. stack

Records who initiated the call; it is the root object that frames belong to: every frame of a call hangs off one call_stack_t.

alamofs/libglusterfs/src/stack.h:

typedef struct _call_stack_t call_stack_t;

struct _call_stack_t {
    union {
        struct list_head all_frames; /* head of the frame chain: the oldest ancestor */
        struct {
            call_stack_t *next_call;
            call_stack_t *prev_call;
        };
    };
    call_pool_t *pool;     /* the pool this stack was allocated from (see call_pool below) */
    gf_lock_t stack_lock;
    client_t *client;
    uint64_t unique;
    void *state; /* pointer to request state */
    uid_t uid;
    gid_t gid;
    pid_t pid;
    uint16_t ngrps;
    uint32_t groups_small[SMALL_GROUP_COUNT];
    uint32_t *groups_large;
    uint32_t *groups;
    gf_lkowner_t lk_owner;
    glusterfs_ctx_t *ctx;

    struct list_head myframes; /* List of call_frame_t that go to make the call stack */

    int32_t op;
    int8_t type;
    struct timeval tv;
};

//=======call_pool_t
struct call_pool {
    union {
        struct list_head all_frames;
        struct {
            call_stack_t *next_call;
            call_stack_t *prev_call;
        } all_stacks;
    };
    int64_t cnt;
    gf_lock_t lock;
    struct mem_pool *frame_mem_pool;
    struct mem_pool *stack_mem_pool;
};
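
A top-level xlator (fuse, the NFS server) starts a new stack by drawing from this pool. A sketch using the real create_frame() helper; finh stands for the fuse_in_header_t of the incoming request and is assumed from context:

call_frame_t *frame = NULL;

/* allocates a call_stack_t plus its first frame out of ctx->pool */
frame = create_frame(this, this->ctx->pool);
if (!frame)
    return -ENOMEM;

/* identify the caller; this travels with frame->root */
frame->root->uid = finh->uid;
frame->root->gid = finh->gid;
frame->root->pid = finh->pid;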

7. fd

An open-file object: it holds a reference to its inode and, like the inode, carries per-xlator context in _ctx. A short usage sketch follows the struct.

struct _fd {
    uint64_t pid;
    int32_t flags;
    char *key;
    int32_t refcount;           /* reference count */
    struct list_head inode_list;
    struct _inode *inode;
    gf_lock_t lock; /* used ONLY for manipulating
                       'struct _fd_ctx' array (_ctx).*/
    struct _fd_ctx *_ctx;
    int xl_count; /* Number of xl referred in this fd */
    struct fd_lk_ctx *lk_ctx;
    gf_boolean_t anonymous; /* geo-rep anonymous fd */
};
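
fd objects are reference counted and, like inodes, carry a per-xlator context slot. A minimal sketch with fd_ref/fd_unref and fd_ctx_set/fd_ctx_get (the stored value is an opaque uint64_t, usually a cast pointer; my_state is hypothetical):

uint64_t value = 0;

fd_ref(fd); /* take a reference while we hold the fd */

fd_ctx_set(fd, this, (uint64_t)(uintptr_t)my_state);

if (fd_ctx_get(fd, this, &value) == 0) {
    /* ... use the per-xlator state ... */
}

fd_unref(fd); /* drop the reference */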

8. gfid

The inode's ID, used to locate a file's inode within the file system.

9. ctx

The variable that records all kinds of configuration information (glusterfs_ctx_t, the process-wide context).

10. fuse

struct fuse_private {
    int fd;
    uint32_t proto_minor;
    char *volfile;
    size_t volfile_size;
    char *mount_point;
    struct iobuf *iobuf;

    pthread_t fuse_thread;
    char fuse_thread_started;

    uint32_t direct_io_mode;
    size_t *msg0_len_p;

    double entry_timeout;
    double negative_timeout;
    double attribute_timeout;

    pthread_cond_t sync_cond;
    pthread_mutex_t sync_mutex;
    char event_recvd;

    char init_recvd;

    gf_boolean_t strict_volfile_check;

    fuse_handler_t **fuse_ops;
    fuse_handler_t **fuse_ops0;
    pthread_mutex_t fuse_dump_mutex;
    int fuse_dump_fd;

    glusterfs_graph_t *next_graph;
    xlator_t *active_subvol;

    pid_t client_pid;
    gf_boolean_t client_pid_set;
    unsigned uid_map_root;
    gf_boolean_t acl;
    gf_boolean_t selinux;
    gf_boolean_t read_only;
    int32_t fopen_keep_cache;
    int32_t gid_cache_timeout;
    gf_boolean_t enable_ino32;
    /* This is the mount option for disabling the root-squash for the
       mount irrespective of whether the root-squash option for the
       volume is set or not. But this option is honoured only for
       the trusted clients. For non-trusted clients this value does
       not have any effect and the volume option for root-squash is
       honoured.
    */
    gf_boolean_t no_root_squash;
    fdtable_t *fdtable;
    gid_cache_t gid_cache;
    char *fuse_mountopts;

    /* For fuse-reverse-validation */
    struct list_head invalidate_list;
    pthread_cond_t invalidate_cond;
    pthread_mutex_t invalidate_mutex;
    gf_boolean_t reverse_fuse_thread_started;
    uint64_t invalidate_count;
    /* For communicating with separate mount thread. */
    int status_pipe[2];

    /* for fuse queue length and congestion threshold */
    int background_qlen;
    int congestion_threshold;

    /* for using fuse-kernel readdirp*/
    gf_boolean_t use_readdirp;

    /* fini started, helps prevent multiple epoll worker threads
     * firing up the fini routine */
    gf_boolean_t fini_invoked;

    /* resolve gid with getgrouplist() instead of /proc/%d/status */
    gf_boolean_t resolve_gids;

    /* Enable or disable capability support */
    gf_boolean_t capability;

    /* Enable or disable throttle */
    gf_boolean_t throttle;
    int32_t outstanding_limit;
    sem_t sem;
    uuid_t volume_id;
    char volume_id_str[37];

    /* LRU Limit, if not set, default is 64k for now */
    uint32_t lru_limit;
    uint32_t invalidate_limit;
};

fuse_state_t

typedef struct {
    void *pool;
    xlator_t *this;
    xlator_t *active_subvol;
    inode_table_t *itable;
    loc_t loc;
    loc_t loc2;
    fuse_in_header_t *finh;
    int32_t flags;
    off_t off;
    size_t size;
    unsigned long nlookup;
    fd_t *fd;
    dict_t *xattr;
    dict_t *xdata;
    char *name;
    char is_revalidate;
    gf_boolean_t truncate_needed;
    gf_lock_t lock;
    uint64_t lk_owner;

    /* used within resolve_and_resume */
    /* */
    fuse_resolve_t resolve;
    fuse_resolve_t resolve2;

    loc_t *loc_now;
    fuse_resolve_t *resolve_now;

    void *resume_fn;

    int valid;
    int mask;
    dev_t rdev;
    mode_t mode;
    mode_t umask;
    struct iatt attr;
    struct gf_flock lk_lock;
    
    struct iovec vector;    /* the I/O vector; see below */

    uuid_t gfid;
    uint32_t io_flags;
    int32_t fd_no;

    sem_t *sem;             /* a system semaphore; sem_t is a union type (see below) */
} fuse_state_t;

struct iovec vector

The I/O vector: the real data is stored in the buffers this structure points into.

struct iovec {
    void *iov_base;
    size_t iov_len;
};

sem_t

/usr/include/bits/semaphore.h

typedef union
{
    char __size[__SIZEOF_SEM_T];
    long int __align;
} sem_t; 

rpc_transport

struct rpc_transport {
    struct rpc_transport_ops *ops;
    rpc_transport_t *listener; /* listener transport to which
                                * request for creation of this
                                * transport came from. valid only
                                * on server process.
                                */

    void *private;
    struct _client_t *xl_private;
    void *xl; /* Used for THIS */
    void *mydata;
    pthread_mutex_t lock;
    int32_t refcount;

    int32_t outstanding_rpc_count;

    glusterfs_ctx_t *ctx;
    dict_t *options;
    char *name;
    void *dnscache;
    void *drc_client;
    data_t *buf;
    int32_t (*init)(rpc_transport_t *this); /* initialization */
    void (*fini)(rpc_transport_t *this);    /* teardown */
    int (*reconfigure)(rpc_transport_t *this, dict_t *options);
    rpc_transport_notify_t notify;
    void *notify_data;
    peer_info_t peerinfo;
    peer_info_t myinfo;

    uint64_t total_bytes_read;
    uint64_t total_bytes_write;

    struct list_head list;
    int bind_insecure;
    void *dl_handle; /* handle of dlopen() */
    char *ssl_name;
    dict_t *clnt_options; /* store options received from
                           * client */
    uint64_t rpcsvc_request_sent_count;
    uint64_t rpcsvc_request_received_count;
    uint64_t pingcnt;
    struct timeval last_sent;
    struct timeval last_received;
    gf_boolean_t throttled;
    gf_boolean_t special_client;
    gf_boolean_t special_backup;

    int outstanding_limit_std;
    int borrowed_cnt;

    uint64_t timer_loops;
    gf_boolean_t rebalance;
    bonus_status_t bonus_status;

    gf_boolean_t readonly_client;
    gf_boolean_t no_root_squash_client;

#define TRANSPORT_SOCKET 0
#define TRANSPORT_RDMA 1
    /* 0 for socket, 1 for rdma */
    int trans_type;

    /* variables for rdma sequential read/write */
    int32_t lock_owner;
    pthread_mutex_t *pollcq_lock;
    gf_boolean_t
        rdma_server_multithreading; /* server (nfs server/brick server) use
                                       multithreading*/
    rpc_sequential_request_t r;
    rpc_sequential_request_t w;
};

rpc_transport_ops

struct rpc_transport_ops {
    /* no need of receive op, msg will be delivered through an event
     * notification
     */
    int32_t (*submit_request)(rpc_transport_t *this, rpc_transport_req_t *req);
    int32_t (*submit_reply)(rpc_transport_t *this,
                            rpc_transport_reply_t *reply);
    int32_t (*connect)(rpc_transport_t *this, int port);
    int32_t (*listen)(rpc_transport_t *this);
    int32_t (*disconnect)(rpc_transport_t *this);
    int32_t (*get_peername)(rpc_transport_t *this, char *hostname, int hostlen);
    int32_t (*get_peeraddr)(rpc_transport_t *this, char *peeraddr, int addrlen,
                            struct sockaddr_storage *sa, socklen_t sasize);
    int32_t (*get_myname)(rpc_transport_t *this, char *hostname, int hostlen);
    int32_t (*get_myaddr)(rpc_transport_t *this, char *peeraddr, int addrlen,
                          struct sockaddr_storage *sa, socklen_t sasize);
    int32_t (*throttle)(rpc_transport_t *this, gf_boolean_t onoff);
};
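
These ops are not filled in by hand: rpc_transport_load() dlopen()s the transport shared object (socket or rdma) and resolves the exported "tops" table. A simplified sketch of that wiring, after rpc/rpc-lib/src/rpc-transport.c (error handling trimmed):

/* each transport .so exports "struct rpc_transport_ops tops = {...}" */
handle = dlopen(name, RTLD_NOW);

trans->ops = dlsym(handle, "tops");
trans->init = dlsym(handle, "init");
trans->fini = dlsym(handle, "fini");
trans->reconfigure = dlsym(handle, "reconfigure");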

event_slot_epoll

struct event_slot_epoll {
    int fd;
    int events;
    int gen;
    int ref;
    int do_close;
    int in_handler;
    int handled_error;
    void *data;
    event_handler_t handler;    /* stores the callback function */
    gf_lock_t lock;
};

event_pool

struct event_pool {
    struct event_ops *ops;

    int fd;
    int breaker[2];

    int count;
    struct event_slot_poll *reg;
    struct event_slot_epoll *ereg[EVENT_EPOLL_TABLES];
    int slots_used[EVENT_EPOLL_TABLES];

    int used;
    int changed;

    pthread_mutex_t mutex;
    pthread_cond_t cond;

    void *evcache;
    int evcache_size;

    /* NOTE: Currently used only when event processing is done using
     * epoll. */
    int eventthreadcount; /* number of event threads to execute. */
    pthread_t pollers[EVENT_MAX_THREADS]; /* poller thread_id store,
                                           * and live status */
    int destroy;
    int activethreadcount;
};

event_ops event_ops_epoll

Gluster uses function pointers extensively, so knowing how to trace a pointer back to the function it refers to helps a great deal when reading the code. The two definitions below declare the event ops and bind them to their epoll implementations.

struct event_ops {
    struct event_pool *(*new)(int count, int eventthreadcount);

    int (*event_register)(struct event_pool *event_pool, int fd,
                          event_handler_t handler, void *data, int poll_in,
                          int poll_out);

    int (*event_select_on)(struct event_pool *event_pool, int fd, int idx,
                           int poll_in, int poll_out);

    int (*event_unregister)(struct event_pool *event_pool, int fd, int idx);

    int (*event_unregister_close)(struct event_pool *event_pool, int fd,
                                  int idx);

    int (*event_dispatch)(struct event_pool *event_pool);

    int (*event_reconfigure_threads)(struct event_pool *event_pool,
                                     int newcount);
    int (*event_pool_destroy)(struct event_pool *event_pool);
    int (*event_handled)(struct event_pool *event_pool, int fd, int idx,
                         int gen);
};

// points each op at its actual implementation
struct event_ops event_ops_epoll = {
    .new = event_pool_new_epoll,
    .event_register = event_register_epoll,
    .event_select_on = event_select_on_epoll,
    .event_unregister = event_unregister_epoll,
    .event_unregister_close = event_unregister_close_epoll,
    .event_dispatch = event_dispatch_epoll,
    .event_reconfigure_threads = event_reconfigure_threads_epoll,
    .event_pool_destroy = event_pool_destroy_epoll,
    .event_handled = event_handled_epoll,
};

/* Attempts to start the # of configured pollers, ensuring at least the first
 * is started in a joinable state */
static int
event_dispatch_epoll(struct event_pool *event_pool)
{
    int i = 0;
    pthread_t t_id;
    int pollercount = 0;
    int ret = -1;
    struct event_thread_data *ev_data = NULL;

    /* Start the configured number of pollers */
    pthread_mutex_lock(&event_pool->mutex);
    {
        pollercount = event_pool->eventthreadcount;

        /* Set to MAX if greater */
        if (pollercount > EVENT_MAX_THREADS)
            pollercount = EVENT_MAX_THREADS;

        /* Default pollers to 1 in case this is incorrectly set */
        if (pollercount <= 0)
            pollercount = 1;

        event_pool->activethreadcount++;

        for (i = 0; i < pollercount; i++) {
            ev_data = GF_CALLOC(1, sizeof(*ev_data), gf_common_mt_event_pool);
            if (!ev_data) {
                if (i == 0) {
                    /* Creating the 0'th thread must succeed:
                     * it is the joinable one we wait on */
                    break;
                } else {
                    /* Failure to create the other threads
                     * is a lesser evil, and is ignored */
                    continue;
                }
            }

            ev_data->event_pool = event_pool;
            ev_data->event_index = i + 1;

            // the call that actually creates a poller thread
            ret = pthread_create(&t_id, NULL, 
                                 event_dispatch_epoll_worker,
                                 ev_data);
            if (!ret) {
                event_pool->pollers[i] = t_id;

                /* mark all threads other than the one at index 0
                 * as detached. Errors can be ignored: undetached
                 * threads would linger as zombies, and the thread
                 * counts are decreased */
                if (i != 0)
                    pthread_detach(event_pool->pollers[i]);
            } else {
                gf_msg("epoll", GF_LOG_WARNING, 0,
                       LG_MSG_START_EPOLL_THREAD_FAILED,
                       "Failed to start thread for index %d", i);
                if (i == 0) {
                    GF_FREE(ev_data);
                    break;
                } else {
                    GF_FREE(ev_data);
                    continue;
                }
            }
        }
    }
    pthread_mutex_unlock(&event_pool->mutex);

    /* Just wait for the first thread, that is created in a joinable state
     * and will never die, ensuring this function never returns */
    if (event_pool->pollers[0] != 0)
        pthread_join(event_pool->pollers[0], NULL);

    pthread_mutex_lock(&event_pool->mutex);
    {
        event_pool->activethreadcount--;
    }
    pthread_mutex_unlock(&event_pool->mutex);

    return ret;
}

nfs3_local (nfs3_call_state_t *cs)

A structure used to communicate state between a fop and its callback. Not all members are in use at all times; usage depends on the fop and the NFS request.

struct nfs3_local {
    GF_REF_DECL;

    rpcsvc_request_t *req;
    xlator_t *vol;
    nfs3_resume_fn_t resume_fn;
    xlator_t *nfsx;
    struct nfs3_state *nfs3state;
    struct iatt buf;

    /* The list hook to attach this call state to the inode's queue till
     * the opening of the fd on the inode completes.
     */
    struct list_head openwait_q;

    /* Per-NFSv3 Op state */
    struct nfs3_fh parent;
    struct nfs3_fh fh;
    struct nfs3_fh old_fh;
    fd_t *fd;
    uint32_t accessbits;
    int operrno;
    count3 dircount;
    count3 maxcount;
    struct statvfs fsstat;
    gf_dirent_t entries;
    struct iatt stbuf;
    struct iatt preparent;
    struct iatt postparent;
    int32_t setattr_valid;
    nfstime3 timestamp;
    loc_t oploc;
    int writetype;
    count3 datacount;
    offset3 dataoffset;
    struct iobuf *iob;
    struct iobref *iobref;
    createmode3 createmode;
    uint64_t cookieverf;
    int sattrguardcheck;
    char *pathname;
    char *old_pathname;
    ftype3 mknodtype;
    specdata3 devnums;
    cookie3 cookie;
    struct iovec datavec;
    mode_t mode;

    /* NFSv3 FH resolver state */
    int hardresolved;
    int softresolved;
    int src_softresolved;
    gf_boolean_t retry_resloved;
    struct nfs3_fh resolvefh;
    loc_t resolvedloc;
    int resolve_ret;
    int resolve_errno;
    int hashidx;
    fd_t *resolve_dir_fd;
    char *resolventry;
    nfs3_lookup_type_t lookuptype;
    gf_dirent_t *hashmatch;
    gf_dirent_t *entrymatch;
    off_t lastentryoffset;
    struct flock flock;
    args args;
    nlm4_lkowner_t lkowner;
    char cookiebytes[1024];
    struct nfs3_fh lockfh;
    int monitor;
    rpc_transport_t *trans;
    call_frame_t *frame;

    /* ACL */
    aclentry aclentry[NFS_ACL_MAX_ENTRIES];
    aclentry daclentry[NFS_ACL_MAX_ENTRIES];
    int aclcount;
    char aclxattr[NFS_ACL_MAX_ENTRIES * 8 + 4];
    int daclcount;
    char daclxattr[NFS_ACL_MAX_ENTRIES * 8 + 4];
};

rpcsvc_request

struct rpcsvc_request {
    /* connection over which this request came. */
    rpc_transport_t *trans;

    rpcsvc_t *svc;

    rpcsvc_program_t *prog;

    /* The identifier for the call from client.
     * Needed to pair the reply with the call.
     */
    uint32_t xid;

    int prognum;

    int progver;

    int procnum;

    int type;

    /* Uid and gid filled by the rpc-auth module during the authentication
     * phase.
     */
    uid_t uid;
    gid_t gid;
    pid_t pid;

    gf_lkowner_t lk_owner;
    uint64_t gfs_id;

    /* Might want to move this to AUTH_UNIX specific state since this array
     * is not available for every authentication scheme.
     */
    gid_t *auxgids;
    gid_t auxgidsmall[SMALL_GROUP_COUNT];
    gid_t *auxgidlarge;
    int auxgidcount;

    /* The RPC message payload, contains the data required
     * by the program actors. This is the buffer that will need to
     * be de-xdred by the actor.
     */
    struct iovec msg[MAX_IOVEC];
    int count;

    struct iobref *iobref;

    /* Status of the RPC call, whether it was accepted or denied. */
    int rpc_status;

    /* In case, the call was denied, the RPC error is stored here
     * till the reply is sent.
     */
    int rpc_err;

    /* In case the failure happened because of an authentication problem
     * , this value needs to be assigned the correct auth error number.
     */
    int auth_err;

    /* There can be cases of RPC requests where the reply needs to
     * be built from multiple sources. E.g. where even the NFS reply
     * can contain a payload, as in the NFSv3 read reply. Here the RPC header
     * ,NFS header and the read data are brought together separately from
     * different buffers, so we need to stage the buffers temporarily here
     * before all of them get added to the connection's transmission list.
     */
    struct list_head txlist;

    /* While the reply record is being built, this variable keeps track
     * of how many bytes have been added to the record.
     */
    size_t payloadsize;

    /* The credentials extracted from the rpc request */
    rpcsvc_auth_data_t cred;

    /* The verifier extracted from the rpc request. In request side
     * processing this contains the verifier sent by the client; on reply
     * side processing, it is filled with the verifier that will be
     * sent to the client.
     */
    rpcsvc_auth_data_t verf;

    /* Execute this request's actor function as a synctask? */
    gf_boolean_t synctask;

    /* Container for a RPC program wanting to store a temp
     * request-specific item.
     */
    void *private;

    /* Container for transport to store request-specific item */
    void *trans_private;

    /* we need to ref the 'iobuf' in case of 'synctasking' it */
    struct iobuf *hdr_iobuf;

    /* pointer to cached reply for use in DRC */
    drc_cached_op_t *reply;

    /* list for sequential read/write */
    struct list_head list;
};

rpcsvc_actor_t glusterfs3_3_fop_actors

A strikingly elegant idiom: C99 designated array initializers map each opcode straight to its actor.

./rpc/rpc-lib/src/rpcsvc.h

typedef struct rpcsvc_actor_desc {
    char procname[RPCSVC_NAME_MAX];
    int procnum;
    rpcsvc_actor actor;

    /* Handler for cases where the RPC requests fragments are large enough
     * to benefit from being decoded into aligned memory addresses. While
     * decoding the request in a non-vectored manner, due to the nature of
     * the XDR scheme, RPC cannot guarantee memory aligned addresses for
     * the resulting message-specific structures. Allowing a specialized
     * handler for letting the RPC program read the data from the network
     * directly into its aligned buffers.
     */
    rpcsvc_vector_sizer vector_sizer;

    /* Can actor be ran on behalf an unprivileged requestor? */
    gf_boolean_t unprivileged;
    drc_op_type_t op_type;
} rpcsvc_actor_t;

Entry points for each of the server-side fops.

Location: ./xlators/protocol/server/src/server-rpc-fops.c

rpcsvc_actor_t glusterfs3_3_fop_actors[GLUSTER_FOP_PROCCNT] = {
    [GFS3_OP_NULL] = {"NULL", GFS3_OP_NULL, server_null, NULL, 0, DRC_NA},
    [GFS3_OP_STAT] = {"STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0, DRC_NA},
    [GFS3_OP_READLINK] = {"READLINK", GFS3_OP_READLINK, server3_3_readlink,
                          NULL, 0, DRC_NA},
    [GFS3_OP_MKNOD] = {"MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0,
                       DRC_NA},
    [GFS3_OP_MKDIR] = {"MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0,
                       DRC_NA},
    [GFS3_OP_UNLINK] = {"UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0,
                        DRC_NA},
    [GFS3_OP_RMDIR] = {"RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0,
                       DRC_NA},
    [GFS3_OP_SYMLINK] = {"SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0,
                         DRC_NA},
    [GFS3_OP_RENAME] = {"RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0,
                        DRC_NA},
    [GFS3_OP_LINK] = {"LINK", GFS3_OP_LINK, server3_3_link, NULL, 0, DRC_NA},
    [GFS3_OP_TRUNCATE] = {"TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate,
                          NULL, 0, DRC_NA},
    [GFS3_OP_OPEN] = {"OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0, DRC_NA},
    [GFS3_OP_READ] = {"READ", GFS3_OP_READ, server3_3_readv, NULL, 0, DRC_NA},
    [GFS3_OP_WRITE] = {"WRITE", GFS3_OP_WRITE, server3_3_writev,
                       server3_3_writev_vecsizer, 0, DRC_NA},
    [GFS3_OP_STATFS] = {"STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0,
                        DRC_NA},
    [GFS3_OP_FLUSH] = {"FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0,
                       DRC_NA},
    [GFS3_OP_FSYNC] = {"FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0,
                       DRC_NA},
    [GFS3_OP_SETXATTR] = {"SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr,
                          NULL, 0, DRC_NA},
    [GFS3_OP_GETXATTR] = {"GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr,
                          NULL, 0, DRC_NA},
    [GFS3_OP_REMOVEXATTR] = {"REMOVEXATTR", GFS3_OP_REMOVEXATTR,
                             server3_3_removexattr, NULL, 0, DRC_NA},
    [GFS3_OP_OPENDIR] = {"OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0,
                         DRC_NA},
    [GFS3_OP_FSYNCDIR] = {"FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir,
                          NULL, 0, DRC_NA},
    [GFS3_OP_ACCESS] = {"ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0,
                        DRC_NA},
    [GFS3_OP_CREATE] = {"CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0,
                        DRC_NA},
    [GFS3_OP_FTRUNCATE] = {"FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate,
                           NULL, 0, DRC_NA},
    [GFS3_OP_FSTAT] = {"FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0,
                       DRC_NA},
    [GFS3_OP_LK] = {"LK", GFS3_OP_LK, server3_3_lk, NULL, 0, DRC_NA},
    [GFS3_OP_LOOKUP] = {"LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0,
                        DRC_NA},
    [GFS3_OP_READDIR] = {"READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0,
                         DRC_NA},
    [GFS3_OP_INODELK] = {"INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0,
                         DRC_NA},
    [GFS3_OP_FINODELK] = {"FINODELK", GFS3_OP_FINODELK, server3_3_finodelk,
                          NULL, 0, DRC_NA},
    [GFS3_OP_ENTRYLK] = {"ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0,
                         DRC_NA},
    [GFS3_OP_FENTRYLK] = {"FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk,
                          NULL, 0, DRC_NA},
    [GFS3_OP_XATTROP] = {"XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0,
                         DRC_NA},
    [GFS3_OP_FXATTROP] = {"FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop,
                          NULL, 0, DRC_NA},
    [GFS3_OP_FGETXATTR] = {"FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr,
                           NULL, 0, DRC_NA},
    [GFS3_OP_FSETXATTR] = {"FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr,
                           NULL, 0, DRC_NA},
    [GFS3_OP_RCHECKSUM] = {"RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum,
                           NULL, 0, DRC_NA},
    [GFS3_OP_SETATTR] = {"SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0,
                         DRC_NA},
    [GFS3_OP_FSETATTR] = {"FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr,
                          NULL, 0, DRC_NA},
    [GFS3_OP_READDIRP] = {"READDIRP", GFS3_OP_READDIRP, server3_3_readdirp,
                          NULL, 0, DRC_NA},
    [GFS3_OP_RELEASE] = {"RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0,
                         DRC_NA},
    [GFS3_OP_RELEASEDIR] = {"RELEASEDIR", GFS3_OP_RELEASEDIR,
                            server3_3_releasedir, NULL, 0, DRC_NA},
    [GFS3_OP_FREMOVEXATTR] = {"FREMOVEXATTR", GFS3_OP_FREMOVEXATTR,
                              server3_3_fremovexattr, NULL, 0, DRC_NA},
    [GFS3_OP_FALLOCATE] = {"FALLOCATE", GFS3_OP_FALLOCATE, server3_3_fallocate,
                           NULL, 0, DRC_NA},
    [GFS3_OP_DISCARD] = {"DISCARD", GFS3_OP_DISCARD, server3_3_discard, NULL, 0,
                         DRC_NA},
    [GFS3_OP_ZEROFILL] = {"ZEROFILL", GFS3_OP_ZEROFILL, server3_3_zerofill,
                          NULL, 0, DRC_NA},
    [GFS3_OP_IPC] = {"IPC", GFS3_OP_IPC, server3_3_ipc, NULL, 0, DRC_NA},
};

nfs3_fh

Used to verify that a stream of bytes really is a file handle from GlusterFS NFS.

struct nfs3_fh {
    /* Used to ensure that a bunch of bytes are actually a GlusterFS NFS
       file handle. Should contain ":OGL"
     */
    char ident[4];

    /* UUID that identifies an export. The value stored in exportid
     * depends on the usage of gluster nfs. If the DVM is enabled using
     * the nfs.dynamic-volumes option then exportid will contain the UUID
     * of the volume so that gnfs is able to identify volumes uniquely
     * through volume additions,deletions,migrations, etc.
     *
     * When not using dvm, exportid contains the index of the volume
     * based on the position of the volume in the list of subvolumes
     * for gnfs.
     */
    uuid_t exportid;

    /* File/dir gfid. */
    uuid_t gfid;
    uuid_t mountid;
    /* This structure must be exactly NFS3_FHSIZE (64) bytes long.
       Having the structure shorter results in buffer overflows
       during XDR decoding.
    */
    /* Dataset id, to distinguish snapshot or volume */
    uint64_t dsid;
    unsigned char padding[NFS3_FHSIZE - GF_NFSFH_STATIC_SIZE];
} __attribute__((__packed__));
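
Validating a handle therefore boils down to checking the ident magic (cf. nfs3_fh_validate() in xlators/nfs/server/src/nfs3-fh.c; this standalone helper is a sketch and its name is hypothetical):

int
fh_is_gluster_nfs(struct nfs3_fh *fh)
{
    /* the first four bytes must spell the ":OGL" magic */
    return (fh->ident[0] == ':' && fh->ident[1] == 'O' &&
            fh->ident[2] == 'G' && fh->ident[3] == 'L');
}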

rpcsvc_actor_t nfs3svc_actors

The RPC dispatch table, mapping each NFSv3 operation to its handler (and, where needed, a vector sizer).

rpcsvc_actor_t nfs3svc_actors[NFS3_PROC_COUNT] = {
    {"NULL", NFS3_NULL, nfs3svc_null, NULL, 0, DRC_IDEMPOTENT},
    {"GETATTR", NFS3_GETATTR, nfs3svc_getattr, NULL, 0, DRC_IDEMPOTENT},
    {"SETATTR", NFS3_SETATTR, nfs3svc_setattr, NULL, 0, DRC_NON_IDEMPOTENT},
    {"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0, DRC_IDEMPOTENT},
    {"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0, DRC_IDEMPOTENT},
    {"READLINK", NFS3_READLINK, nfs3svc_readlink, NULL, 0, DRC_IDEMPOTENT},
    {"READ", NFS3_READ, nfs3svc_read, NULL, 0, DRC_IDEMPOTENT},
    {"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0,
     DRC_NON_IDEMPOTENT},
    {"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0, DRC_NON_IDEMPOTENT},
    {"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0, DRC_NON_IDEMPOTENT},
    {"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink, NULL, 0, DRC_NON_IDEMPOTENT},
    {"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0, DRC_NON_IDEMPOTENT},
    {"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0, DRC_NON_IDEMPOTENT},
    {"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0, DRC_NON_IDEMPOTENT},
    {"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0, DRC_NON_IDEMPOTENT},
    {"LINK", NFS3_LINK, nfs3svc_link, NULL, 0, DRC_NON_IDEMPOTENT},
    {"READDIR", NFS3_READDIR, nfs3svc_readdir, NULL, 0, DRC_IDEMPOTENT},
    {"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp, NULL, 0, DRC_IDEMPOTENT},
    {"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0, DRC_IDEMPOTENT},
    {"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0, DRC_IDEMPOTENT},
    {"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf, NULL, 0, DRC_IDEMPOTENT},
    {"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0, DRC_IDEMPOTENT}
};

fuse_handler_t function pointers

./xlators/mount/fuse/src/fuse-bridge.c

The FUSE interface. This is the client-side entry point, acting as the file system's "bridge node".

typedef void(fuse_handler_t)(xlator_t *this, fuse_in_header_t *finh, void *msg); 

static fuse_handler_t *fuse_std_ops[FUSE_OP_HIGH] = {
    [FUSE_LOOKUP] = fuse_lookup,
    [FUSE_FORGET] = fuse_forget,
    [FUSE_GETATTR] = fuse_getattr,
    [FUSE_SETATTR] = fuse_setattr,
    [FUSE_READLINK] = fuse_readlink,
    [FUSE_SYMLINK] = fuse_symlink,
    [FUSE_MKNOD] = fuse_mknod,
    [FUSE_MKDIR] = fuse_mkdir,
    [FUSE_UNLINK] = fuse_unlink,
    [FUSE_RMDIR] = fuse_rmdir,
    [FUSE_RENAME] = fuse_rename,
    [FUSE_LINK] = fuse_link,
    [FUSE_OPEN] = fuse_open,
    [FUSE_READ] = fuse_readv,
    [FUSE_WRITE] = fuse_write,
    [FUSE_STATFS] = fuse_statfs,
    [FUSE_RELEASE] = fuse_release,
    [FUSE_FSYNC] = fuse_fsync,
    [FUSE_SETXATTR] = fuse_setxattr,
    [FUSE_GETXATTR] = fuse_getxattr,
    [FUSE_LISTXATTR] = fuse_listxattr,
    [FUSE_REMOVEXATTR] = fuse_removexattr,
    [FUSE_FLUSH] = fuse_flush,
    [FUSE_INIT] = fuse_init,
    [FUSE_OPENDIR] = fuse_opendir,
    [FUSE_READDIR] = fuse_readdir,
    [FUSE_RELEASEDIR] = fuse_releasedir,
    [FUSE_FSYNCDIR] = fuse_fsyncdir,
    [FUSE_GETLK] = fuse_getlk,
    [FUSE_SETLK] = fuse_setlk,
    [FUSE_SETLKW] = fuse_setlk,
    [FUSE_ACCESS] = fuse_access,
    [FUSE_CREATE] = fuse_create,
    /* [FUSE_INTERRUPT] */
    /* [FUSE_BMAP] */
    [FUSE_DESTROY] = fuse_destroy,
/* [FUSE_IOCTL] */
/* [FUSE_POLL] */
/* [FUSE_NOTIFY_REPLY] */

#if FUSE_KERNEL_MINOR_VERSION >= 16
    [FUSE_BATCH_FORGET] = fuse_batch_forget,
#endif

#if FUSE_KERNEL_MINOR_VERSION >= 19
#ifdef FALLOC_FL_KEEP_SIZE
    [FUSE_FALLOCATE] = fuse_fallocate,
#endif /* FALLOC_FL_KEEP_SIZE */
#endif

#if FUSE_KERNEL_MINOR_VERSION >= 21
    [FUSE_READDIRPLUS] = fuse_readdirp,
#endif
};

fuse_opcode

The numeric codes assigned to FUSE operations.

enum fuse_opcode {
    FUSE_LOOKUP = 1,
    FUSE_FORGET = 2, /* no reply */
    FUSE_GETATTR = 3,
    FUSE_SETATTR = 4,
    FUSE_READLINK = 5,
    FUSE_SYMLINK = 6,
    FUSE_MKNOD = 8,
    FUSE_MKDIR = 9,
    FUSE_UNLINK = 10,
    FUSE_RMDIR = 11,
    FUSE_RENAME = 12,
    FUSE_LINK = 13,
    FUSE_OPEN = 14,
    FUSE_READ = 15,
    FUSE_WRITE = 16,
    FUSE_STATFS = 17,
    FUSE_RELEASE = 18,
    FUSE_FSYNC = 20,
    FUSE_SETXATTR = 21,
    FUSE_GETXATTR = 22,
    FUSE_LISTXATTR = 23,
    FUSE_REMOVEXATTR = 24,
    FUSE_FLUSH = 25,
    FUSE_INIT = 26,
    FUSE_OPENDIR = 27,
    FUSE_READDIR = 28,
    FUSE_RELEASEDIR = 29,
    FUSE_FSYNCDIR = 30,
    FUSE_GETLK = 31,
    FUSE_SETLK = 32,
    FUSE_SETLKW = 33,
    FUSE_ACCESS = 34,
    FUSE_CREATE = 35,
    FUSE_INTERRUPT = 36,
    FUSE_BMAP = 37,
    FUSE_DESTROY = 38,
    FUSE_IOCTL = 39,
    FUSE_POLL = 40,
    FUSE_NOTIFY_REPLY = 41,
    FUSE_BATCH_FORGET = 42,
    FUSE_FALLOCATE = 43,
    FUSE_READDIRPLUS = 44,

    /* CUSE specific operations */
    CUSE_INIT = 4096,
};
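
Opcode, table, and handler meet in the FUSE reader loop. A simplified sketch of the dispatch, after fuse_thread_proc() in fuse-bridge.c (the bounds check and error path are approximations; msg_start stands for the start of the message read from /dev/fuse):

fuse_in_header_t *finh = (fuse_in_header_t *)msg_start;
void *msg = finh + 1; /* the opcode-specific body follows the header */

if (finh->opcode >= FUSE_OP_HIGH || !priv->fuse_ops[finh->opcode]) {
    send_fuse_err(this, finh, ENOSYS); /* unknown op: report to the kernel */
} else {
    /* the opcode indexes straight into the handler table */
    priv->fuse_ops[finh->opcode](this, finh, msg);
}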

volfile

An example brick-side volfile. Each volume ... end-volume block defines one xlator instance, and subvolumes wires them into the graph; the file is written leaf-first, with storage/posix at the bottom of the graph and protocol/server at the top:

/var/lib/alamod/vols/vol1/vol1.Al1Xtao.data-pool2-vol1-brick.vol

volume vol1-posix
    type storage/posix
    option posix on
    option volume-id 2fc63729-770f-4287-af1a-5c2c2b64ffb7
    option directory /data/pool2/vol1-brick
end-volume

volume vol1-huntlog
    type features/huntlog
    option huntlog-filter off
    option huntlog off
    option directory /data/pool2/vol1-brick
    option brick-path /data/pool2/vol1-brick
    option volume-name vol1
    option brick-index 1
    subvolumes vol1-posix
end-volume

volume vol1-access-control
    type features/access-control
    subvolumes vol1-huntlog
end-volume

volume vol1-locks
    type features/locks
    subvolumes vol1-access-control
end-volume

volume vol1-io-queues
    type performance/io-queues
    option io-queues on
    subvolumes vol1-locks
end-volume

volume vol1-marker
    type features/marker
    option inode-quota off
    option quota off
    option gsync-force-xtime off
    option xtime off
    option quota-version 0
    option timestamp-file /var/lib/alamod/vols/vol1/marker.tstamp
    option volume-uuid 2fc63729-770f-4287-af1a-5c2c2b64ffb7
    subvolumes vol1-io-queues
end-volume

volume vol1
    type features/quota
    option deem-statfs off
    option timeout 0
    option server-quota off
    option volume-uuid vol1
    subvolumes vol1-marker
end-volume

volume /data/pool2/vol1-brick
    type debug/io-stats
    option count-fop-hits off
    option frame-latency-measurement off
    option latency-measurement off
    option io-stats on
    option io-stats-global-switch off
    option log-level INFO
    subvolumes vol1
end-volume

volume vol1-server
    type protocol/server
    option transport.socket.keepalive-count 5
    option transport.socket.keepalive-interval 2
    option transport.socket.keepalive-time 20
    option auth.addr./data/pool2/vol1-brick.allow *
    option auth.login.c8730cf8-037b-440a-9461-95d4f63609e7.password 4bd06a73-e935-4e45-8a30-e45bc609e0bb
    option auth.login./data/pool2/vol1-brick.allow c8730cf8-037b-440a-9461-95d4f63609e7
    option transport-type tcp
    subvolumes /data/pool2/vol1-brick
end-volume