学习destor（一）

最新推荐文章于 2024-08-25 08:42:52 发布

persimmon_xh

最新推荐文章于 2024-08-25 08:42:52 发布

阅读量1.3k

点赞数 4

分类专栏：数据去重

本文链接：https://blog.csdn.net/persimmon_xh/article/details/113059509

版权

数据去重专栏收录该内容

3 篇文章 1 订阅

订阅专栏

从do_delete.c开始看

什么是recipe
为了找什么是recipe，首先需要先回到recipestore.h文件中，里面有一个结构体叫backupversion，这个backupversion我理解为处理指纹序列的模块，里面包含.meta,.recipe,和.record。
在backupversion中，这三个东西是以FILE文件指针的形式来表示的，所以在delete之前它们应该已经写入磁盘了，并不是常驻内存（猜测）。

/*
 * Per-backup-version state: three stdio streams (metadata, recipe, record)
 * together with the buffers and offsets used alongside them.
 */
struct backupVersion {

	sds path; /* NOTE(review): presumably the location of this version's files -- confirm */
	int32_t bv_num; /* backup version number start from 0 */

	int deleted; /* NOTE(review): presumably non-zero once this version is deleted -- confirm */

	int64_t number_of_files; /* count of file recipe metas in this version */
	int64_t number_of_chunks; /* count of chunk pointers in this version */

	sds fname_prefix; /* The prefix of the file names */

	FILE *metadata_fp; /* stream holding the file recipe metas */
	FILE *recipe_fp; /* stream holding the chunk pointers */
	FILE *record_fp; /* stream holding the records */

	/* the write buffer of recipe meta */
	char *metabuf;
	int metabufoff; /* NOTE(review): presumably the current write offset into metabuf -- confirm */

	/* the write buffer of records */
	char *recordbuf;
	int recordbufoff; /* NOTE(review): presumably the current write offset into recordbuf -- confirm */

	/* NOTE(review): looks like a buffer for one segment plus its length and offset -- confirm */
	char* segmentbuf;
	int segmentlen;
	int segmentbufoff;
};

顺便就看看这个头文件中的其他结构体：

fileRecipeMeta
file recipe的元数据。包含chunknum和filesize还有一个不理解的filename，不知道sds是什么东西？

sds是动态字符串库SDS（Simple Dynamic Strings）提供的字符串类型，在util/sds.h中有说明

/* Point to the meta of a file recipe */
struct fileRecipeMeta {
	int64_t chunknum; /* number of chunk pointers that belong to this file */
	int64_t filesize; /* NOTE(review): presumably the file size in bytes -- confirm */
	sds filename; /* name of the file, stored as an sds dynamic string */
};

segmentRecipe
这里说每个recipe中包含segment。
每一个预取的segment都被组织在一张哈希表中，目的是为了优化查找
这是逻辑局部性的基本单元（实现逻辑局部性？）

/*
 * Each recipe consists of segments.
 * Each prefetched segment is organized as a hash table for optimizing lookup.
 * It is the basic unit of logical locality.
 * */
struct segmentRecipe {
	/* Identifier of this segment. */
	segmentid id;
	/* Map fingerprints in the segment to their container IDs.*/
	GHashTable *kvpairs;
};

chunkPointer
指向chunk的指针。里面包含了chunk的指纹，container的id和chunk的大小。注意看注释。

/*
 * One entry of a recipe: either a reference to a stored chunk or a
 * segment boundary marker.
 *
 * If id == CHUNK_SEGMENT_START or CHUNK_SEGMENT_END,
 * it is a flag of segment boundary.
 * If id == CHUNK_SEGMENT_START,
 * size indicates the length of the segment in terms of # of chunks.
 */
struct chunkPointer {
	fingerprint fp; /* fingerprint of the chunk */
	containerid id; /* container ID, or a segment boundary flag (see above) */
	int32_t size; /* chunk size, or the segment length for a start marker */
};

所以总结，recipe应该是菜单一类的东西，用来指示文件（如struct fileRecipeMeta），其中包含segment recipe，segment recipe用来实现局部性。

回爷爷家了。回了再继续看…
我又回来了…最近真的太难了

好了知道了recipe 是什么，现在开始看do_delete.c

delete_an_entry函数
index_delete(fp,*id)函数在index.c中，其中又调用了kvstore_delete(fp, id);（套娃…），然后kvstore_delete又等价于 kvstore_htable_delete，然后再找到kvstore_htable.c文件，然后找到kvstore_htable_delete。
g_hash_table相当于fingerprint to container index。key是指纹，value是container的ID。

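这个函数首先在哈希表中找到key对应的键值对kv，然后得到kv对应的value数组，遍历数组找到值为id的位置i，先把value[i]置为TEMPORARY_ID。如果i不是最后一个位置，并且value[i+1]不是TEMPORARY_ID，就调用memmove函数，把从value[i+1]开始的(destor.index_value_length - i - 1)个元素（即(destor.index_value_length - i - 1) * sizeof(int64_t)个字节）向前移动到value[i]的位置。最后如果value[0]是TEMPORARY_ID，说明所有ID都已经被删除，就把整个键值对从哈希表中移除。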

/* A simple wrap.
 * Just to make the interfaces of the index module more consistent.
 *
 * Takes the container ID by pointer, dereferences it and forwards
 * (fp, *id) to index_delete().
 */
static inline void delete_an_entry(fingerprint *fp, int64_t *id){
	index_delete(fp, *id);
}

/*
 * Index-level delete: forwards (fp, id) unchanged to kvstore_delete.
 * NOTE(review): kvstore_delete appears to be a function pointer bound by
 * init_kvstore(), so the key-value store is presumably expected to be
 * initialised before this is called -- confirm.
 */
inline void index_delete(fingerprint *fp, int64_t id){
	kvstore_delete(fp, id);
}

/*
 * Bind the kvstore_* entry points to the backend selected by
 * destor.index_key_value_store.
 *
 * Only the hash-table backend (INDEX_KEY_VALUE_HTABLE) is handled; for any
 * other value a WARNING is issued and the process exits with status 1.
 */
void init_kvstore() {
	/* Reject unknown backends up front. */
	if (destor.index_key_value_store != INDEX_KEY_VALUE_HTABLE) {
		WARNING("Invalid key-value store!");
		exit(1);
	}

	init_kvstore_htable();

	/* Route the generic operations to the hash-table implementation. */
	kvstore_lookup = kvstore_htable_lookup;
	kvstore_update = kvstore_htable_update;
	kvstore_delete = kvstore_htable_delete;
	close_kvstore = close_kvstore_htable;
}

/*
 * Remove the 'id' from the kvpair identified by 'key'.
 *
 * The value of a kvpair is an array of destor.index_value_length container
 * IDs in which deleted slots hold TEMPORARY_ID. The first slot equal to
 * 'id' is cleared; if live IDs follow it they are shifted down by one and
 * the vacated last slot is cleared too, so no stale duplicate of the final
 * ID is left behind. Once value[0] is TEMPORARY_ID the kvpair holds no ID
 * and is removed from the hash table.
 *
 * Does nothing when 'key' is not present in the table.
 */
void kvstore_htable_delete(char* key, int64_t id){
	kvpair kv = g_hash_table_lookup(htable, key);
	if(!kv)
		return;

	int64_t *value = get_value(kv);
	int last = destor.index_value_length - 1;
	int i;
	for(i=0; i<=last; i++){
		if(value[i] != id)
			continue;

		value[i] = TEMPORARY_ID;
		/*
		 * If index exploits physical locality,
		 * the value length is 1. (correct)
		 * If index exploits logical locality,
		 * the deleted one should be in the end. (correct)
		 */
		/* NOTICE: If the backups are not deleted in FIFO order, this assert should be commented */
		assert((i == last) || value[i+1] == TEMPORARY_ID);
		if(i < last && value[i+1] != TEMPORARY_ID){
			/* If the next ID is not TEMPORARY_ID, close the gap. */
			memmove(&value[i], &value[i+1], (last - i) * sizeof(int64_t));
			/*
			 * memmove leaves the old last element in place as well;
			 * without this reset the final ID would appear twice.
			 */
			value[last] = TEMPORARY_ID;
		}
		break;
	}

	/*
	 * If all IDs are deleted, the kvpair is removed.
	 */
	if(value[0] == TEMPORARY_ID){
		/* This kvpair can be removed. */
		g_hash_table_remove(htable, key);
	}
}

static void* read_recipe_for_deletion(void *arg)函数
就是读出arg指向的备份版本中每个文件的chunk指针（也就是结构体chunkPointer），为每个chunk指针新建一个chunk并把它压入delete_recipe_queue中。为什么要释放cp？因为read_next_n_chunk_pointers在读的过程中malloc了空间，所以用完之后要释放。释放r也是同样的道理。
最后new一个新的chunk的指针，设置为end边界压入队列。

/*
 * Thread body: push every chunk referenced by a backup version onto
 * delete_recipe_queue.
 *
 * arg is the struct backupVersion* to read. The queue receives one
 * CHUNK_FILE_START marker, then one chunk per chunk pointer (fingerprint,
 * size and container id copied from the recipe), then one CHUNK_FILE_END
 * marker, after which the queue is terminated.
 *
 * If the recipe ends early (a reader returns NULL) the walk stops, but the
 * end marker is still pushed and the queue is still terminated.
 *
 * Always returns NULL.
 */
static void* read_recipe_for_deletion(void *arg) {
    struct backupVersion* bv = (struct backupVersion*)arg;

    struct chunk *marker = new_chunk(0);
    SET_CHUNK(marker, CHUNK_FILE_START);
    sync_queue_push(delete_recipe_queue, marker);

    /* 64-bit counters: number_of_files and chunknum are int64_t. */
    int64_t i, j;
    int k;
    int stream_ended = 0;

    for (i = 0; i < bv->number_of_files && !stream_ended; i++) {

        struct fileRecipeMeta *r = read_next_file_recipe_meta(bv);
        if (r == NULL)
            break;

        for (j = 0; j < r->chunknum; j++) {
            struct chunkPointer* cp = read_next_n_chunk_pointers(bv, 1, &k);
            if (cp == NULL) {
                /* No chunk pointer left although the meta promised more. */
                stream_ended = 1;
                break;
            }

            struct chunk* entry = new_chunk(0);
            memcpy(&entry->fp, &cp->fp, sizeof(fingerprint));
            entry->size = cp->size;
            entry->id = cp->id;

            sync_queue_push(delete_recipe_queue, entry);
            /* cp was allocated by read_next_n_chunk_pointers. */
            free(cp);
        }

        free_file_recipe_meta(r);
    }

    marker = new_chunk(0);
    SET_CHUNK(marker, CHUNK_FILE_END);
    sync_queue_push(delete_recipe_queue, marker);

    sync_queue_term(delete_recipe_queue);
    return NULL;
}

arg是什么？
发现void* read_recipe_for_deletion这个函数只有在pthread_create中调用了。
startJIDPtr是什么？
startJIDPtr的内容指向的是备份系统的number，arg也是指向备份系统的number的指针。见4
g_hash_table_new_full函数
是glib库中的函数。glib库里有两个函数可以用于创建hash表，分别是g_hash_table_new()和g_hash_table_new_full()，它们的原型如下：

/* Create a hash table: hash_func hashes a key, key_equal_func compares two keys. */
GHashTable * g_hash_table_new(GHashFunc hash_func, GEqualFunc key_equal_func);
/*
 * Same as above, plus destroy callbacks that GLib invokes on the key and on
 * the value when an entry is removed or destroyed.
 */
GHashTable * g_hash_table_new_full(GHashFunc hash_func, GEqualFunc key_equal_func, GDestroyNotify key_destroy_func, GDestroyNotify value_destroy_func);

其中hash_func是一个函数，它为key创建一个hash值；key_equal_func用于比较两个key是否相等；
key_destroy_func当你从hash表里删除、销毁一个条目时，glib库会自动调用它释放key所占用的内存空间，
这对于key是动态分配内存的hash表来说非常有用；value_destroy_func的作用与key_destroy_func相似，
只是它释放的是value占用的内存空间。

来自https://blog.csdn.net/plusboy/article/details/1496215

struct backupVersion* open_backup_version(int number) 函数
打开一个备份的版本。参数是number，也就是说startJIDPtr的内容指向的是备份系统的number，arg也是指向备份系统的number的指针。
外层遍历备份系统的文件，内层遍历chunk，首先读取每一个chunk的指针chunkPointer，然后将chunk对应的id插入哈希表中。
循环结束后，建立一个新的chunk c。
void sync_queue_push(SyncQueue* s_queue, void* item)函数。在utils/sync_queue.c中。这个函数主要是互斥地访问s_queue,并将item插入s_queue中。在这里是将新建立的chunk c插入到delete_recipe_queue中。

/*
 * Push 'item' onto 's_queue' under the queue mutex.
 *
 * - If the mutex cannot be locked, a message is printed and the item is
 *   not queued.
 * - If the queue has already been terminated (term == 1), the item is
 *   silently dropped; it is not freed here.
 * - If max_size > 0 and the queue is full, the caller waits on max_work
 *   until there is room.
 *
 * After the push, min_work is broadcast.
 */
void sync_queue_push(SyncQueue* s_queue, void* item) {
	if (pthread_mutex_lock(&s_queue->mutex) != 0) {
		puts("failed to lock!");
		return;
	}

	if (s_queue->term == 1) {
		pthread_mutex_unlock(&s_queue->mutex);
		return;
	}

	/* Bounded queue: wait for room (max_size <= 0 means unbounded). */
	while (s_queue->max_size > 0
			&& queue_size(s_queue->queue) >= s_queue->max_size) {
		pthread_cond_wait(&s_queue->max_work, &s_queue->mutex);
	}

	queue_push(s_queue->queue, item);

	pthread_cond_broadcast(&s_queue->min_work);

	if (pthread_mutex_unlock(&s_queue->mutex)) {
		/* This is the unlock path; the message used to say "lock". */
		puts("failed to unlock!");
		return;
	}
}

destor.backup_retention_time是什么？
表示有多少个备份被保留（retained），当backup_retention_time为负数时，表示所有的备份都被保留。还是不懂，是留在内存中的意思吗？还是备份的意思？
struct fileRecipeMeta* read_next_file_recipe_meta(struct backupVersion* b) 该函数读入备份版本b中的下一个file recipe meta（记为r），最后返回r。


/*
 * Read the next file recipe meta from b->metadata_fp.
 *
 * Record layout, as consumed here: an int name length, that many bytes of
 * file name (not NUL-terminated in the stream), then chunknum and filesize
 * as int64_t.
 *
 * Returns a meta created by new_file_recipe_meta(), or NULL when the
 * record cannot be read completely or the stored name length is negative.
 *
 * NOTE: the read counter is a function-level static, so it is shared by
 * every backupVersion read in this process and is never reset.
 */
struct fileRecipeMeta* read_next_file_recipe_meta(struct backupVersion* b) {

	static int read_file_num;

	assert(read_file_num <= b->number_of_files);

	int len;
	if (fread(&len, sizeof(len), 1, b->metadata_fp) != 1 || len < 0)
		return NULL;

	char filename[len + 1];

	/* Count bytes rather than items so an empty name (len == 0) is valid. */
	if (fread(filename, 1, len, b->metadata_fp) != (size_t) len)
		return NULL;
	filename[len] = 0;

	/* Read both counters before allocating, so a short read leaks nothing. */
	int64_t chunknum;
	int64_t filesize;
	if (fread(&chunknum, sizeof(chunknum), 1, b->metadata_fp) != 1
			|| fread(&filesize, sizeof(filesize), 1, b->metadata_fp) != 1)
		return NULL;

	struct fileRecipeMeta* r = new_file_recipe_meta(filename);
	r->chunknum = chunknum;
	r->filesize = filesize;

	read_file_num++;

	return r;
}

struct chunkPointer* read_next_n_chunk_pointers(struct backupVersion* b, int n, int *k)
这个函数读入的chunk pointer个数是n和b->number_of_chunks - read_chunk_num（剩余未读的个数）两者中较小的那个，实际读到的个数通过*k返回。
注意：int num = (b->number_of_chunks - read_chunk_num) > n ?
n : (b->number_of_chunks - read_chunk_num), i; 这里面的,i表示的是声明i这个变量，和前面的num的赋值无关

/*
 * Read up to n chunk pointers from b->recipe_fp.
 *
 * The number actually read is min(n, chunk pointers not yet read) and is
 * stored in *k. Segment boundary markers found in the stream are skipped
 * and do not count.
 *
 * Returns a malloc'ed array of *k chunk pointers that the caller must
 * free(). Returns NULL with *k == 0 at the end of the stream, when
 * n <= 0, when the allocation fails, or when the recipe stream ends before
 * the requested pointers could be read.
 *
 * NOTE: the read counter is a function-level static, so it is shared by
 * every backupVersion read in this process and is never reset.
 */
struct chunkPointer* read_next_n_chunk_pointers(struct backupVersion* b, int n,
		int *k) {

	/* Total number of read chunks. */
	static int read_chunk_num;

	*k = 0;

	int64_t remaining = b->number_of_chunks - read_chunk_num;
	if (n <= 0 || remaining <= 0) {
		/* Nothing requested, or it's the stream end. */
		return NULL;
	}

	int num = remaining > n ? n : (int) remaining;

	struct chunkPointer *cp = malloc(sizeof(*cp) * num);
	if (cp == NULL)
		return NULL;

	int filled = 0;
	while (filled < num) {
		struct chunkPointer *slot = &cp[filled];

		if (fread(&slot->fp, sizeof(fingerprint), 1, b->recipe_fp) != 1
				|| fread(&slot->id, sizeof(containerid), 1, b->recipe_fp) != 1
				|| fread(&slot->size, sizeof(int32_t), 1, b->recipe_fp) != 1) {
			/*
			 * Short read: stop instead of testing a stale id, which
			 * could keep matching a boundary flag forever.
			 */
			free(cp);
			return NULL;
		}

		/* Ignore segment boundaries: the slot is overwritten by the next read. */
		if (slot->id == 0 - CHUNK_SEGMENT_START || slot->id == 0 - CHUNK_SEGMENT_END)
			continue;

		filled++;
	}

	*k = num;

	read_chunk_num += num;
	assert(read_chunk_num <= b->number_of_chunks);

	return cp;
}

void sync_queue_term(SyncQueue* s_queue)
这个函数互斥地把term的值修改为1（通过上锁的方式实现互斥），然后广播min_work条件变量，最后解锁。

/*
 * Mark 's_queue' as terminated.
 *
 * With the queue mutex held, sets term to 1 and broadcasts min_work.
 * If the mutex cannot be acquired, prints a message and leaves the queue
 * untouched.
 */
void sync_queue_term(SyncQueue* s_queue) {
	int lock_rc = pthread_mutex_lock(&s_queue->mutex);

	if (lock_rc == 0) {
		s_queue->term = 1;
		pthread_cond_broadcast(&s_queue->min_work);
		pthread_mutex_unlock(&s_queue->mutex);
		return;
	}

	puts("failed to lock!");
}