Linux Rsync文件增量同步的一个简单C++实现Demo

heyAmos

已于 2023-08-02 17:18:43 修改

阅读量1k

点赞数 1

分类专栏： C++ 文章标签： C++ Linux Rsync 算法

于 2020-08-23 14:30:00 首次发布

本文链接：https://blog.csdn.net/yinxiaobao97/article/details/108181980

版权

C++ 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

原理及整体流程简介

算法原理（我读的这篇blog，感谢！）：Rsync同步算法__飞翔的企鹅_的博客-CSDN博客。

不成熟 的实现流程：

假设客户端是待更新文件A，服务端是文件A_new （当然，文件同步应该是双向的，这里只是为了方便介绍原理）

客户端整体流程：

1.文件分块获取chunks （每个chunk带有24B的信息）

2.发送chunks给服务端，（服务端处理chunks）

3.接受来自服务端的patches, 重组文件

服务端整体流程：

1.接收客户端的chunks(与客户端step2对应)

2.rolling匹配出相同分块，将patches发送给客户端(与客户端step3对应)

客户端的实现代码

Chunk结构：

#define CHUNK_SIZE 4096
/* Signature record for one fixed-size block of a file.
 * send_chunk() writes these objects to the peer as raw bytes, so member order
 * and sizes are effectively part of the wire format -- do not reorder.
 * The byte sizes noted below are the author's figures (24B total); Md5 and
 * Hash32 are declared elsewhere -- TODO confirm their sizes. */
struct Chunk {     /* chunk signature */
    Md5    md5;    /* strong checksum of the block (MD5, 16B) */
    int    index;  /* chunk number within the file (4B) */
    Hash32 hash32; /* weak 32-bit hash of the block (4B) */
    /* Fill in this record from one block of data.
     * buf:    block data; exactly CHUNK_SIZE bytes are passed to both hashes.
     * number: chunk number to store in `index` (defaults to 0).
     * NOTE(review): hash32.push() is called on the existing hash32 object, so
     * this presumably expects a freshly constructed Chunk -- confirm. */
    void   init(const char* buf, int number = 0) {
        index = number;
        hash32.push(buf, CHUNK_SIZE);
        md5.constructor(buf, CHUNK_SIZE);
    }
};  //

step1.文件分块获取chunks （每个chunk带有24B的信息）

/**
 * Split a file into fixed-size blocks and build one Chunk signature per block.
 *
 * Only whole CHUNK_SIZE blocks are signed: a trailing partial block
 * (st_size % CHUNK_SIZE bytes) produces no Chunk.
 *
 * @param filepath  path of the file to sign; it is only read, never modified.
 * @return one Chunk per whole block, where element i has index i.
 */
vector<Chunk> file_to_chunk(char *filepath) {
    /* Read-only access is sufficient; O_RDWR would needlessly fail on files
     * the process may read but not write. */
    int         fd = Open(filepath, O_RDONLY, 0);
    struct stat fs;
    Fstat(fd, &fs);
    int n_chunk = fs.st_size / CHUNK_SIZE; /* number of whole blocks */
    cout << "文件被分成的块数："<<n_chunk << endl;
    vector<Chunk> cks(n_chunk);
    char          buf[CHUNK_SIZE];
    for (int i = 0; i < n_chunk; ++i) {  // i : chunk number
        memset(buf, 0, sizeof(buf));
        Read(fd, buf, CHUNK_SIZE);
        cks[i].init(buf, i);
    }
    close(fd); /* the descriptor was previously leaked on every call */
    return cks;
}

step2.发送chunks给服务端，（服务端会拿chunks去进行匹配）

关键问题1：服务端如何知道chunks的大小？

1.先发送一个chunks的个数 (4B的 uint); （此时服务端应该期待着一个4B的数据到来）

2.再发送sizeof(Chunk) *n 的字节流 (服务端知道chunks的个数，那么经过计算也知道了该read多少个字节流)

/* 写入n个Chunk 到fd(也许这应该是server的fd) */
void send_chunk(int fd, const vector<Chunk> &cks, uint n) {
    Rio_writen(fd, &n, sizeof(n)); /* #protocol: 先传输cks的个数, 约定为4B的int */
    Rio_writen(fd, (void *)cks.data(),
               sizeof(Chunk) * n); /* 然后将n个chunk转换成字节流，传输 */
}

step3.接受来自服务端的patches, 重组文件

这里的patches是不带数据结构的

patches有两种类型：

1.成功匹配块号（只需要传输块号）

2.未成功匹配的碎片（那么需要传输碎片大小，加上碎片数据）

关键问题2：如何区分patches是块号还是碎片？

根据上面信息，patches可以编码成字节流

1.块号格式：[index(4B)] , 接收到index>=0则说明这是一个块号

2.碎片格式：[index(4B)][data(-index B)], 接收到index<0则说明这是一个碎片，需要继续接受 -index B的数据流

/* 接受来自sfd的patches,  old_file_path：老文件, new_file_path:新文件temp，
 * 后期可以删除老文件，改名新文件 */
void recv_patches(int sfd, char *old_file_path, char *new_file_path) {
    int         old_fd = Open(old_file_path, O_RDONLY, 0);
    int         new_fd = Open(new_file_path, O_WRONLY | O_CREAT | O_TRUNC, 777);
    struct stat old_stat;
    Fstat(old_fd, &old_stat);

    /* step 3.1 先接收文件大小 */
    off_t new_file_size;
    Rio_readn(sfd, &new_file_size, sizeof(new_file_size));
    cout << "将要接受的文件大小:" << new_file_size / 1024.0 / 1024 << "MB"
         << endl;

    off_t i = 0;
    char  buf[CHUNK_SIZE];
    while (1) {
        int index;
        if (!Rio_readn(sfd, &index, sizeof(index))) break;
        if (index >= 0) { /* 此时index代表块号 */
            Lseek(old_fd, index * CHUNK_SIZE, SEEK_SET);
            Rio_readn(old_fd, buf, CHUNK_SIZE);
            Rio_writen(new_fd, buf, CHUNK_SIZE);
            i += CHUNK_SIZE;
        } else { /* 此时|index|代表patch的大小 */
            int nn = -index;
            while (nn > 0) {
                int n = min(nn, CHUNK_SIZE);
                Rio_readn(sfd, buf, n);
                Rio_writen(new_fd, buf, n);
                nn -= n;
            }
        }
    }
}

服务端的实现代码

1.接收客户端的chunks(与客户端step2对应)

/**
 * Receive the chunk signatures written by send_chunk().
 *
 * Expects a 4-byte count followed by that many raw Chunk records.
 *
 * @param fd  descriptor to read from.
 * @return the chunks received; fewer than announced if the stream ends early.
 */
vector<Chunk> recv_chunk(int fd) {
    uint n = 0;
    Rio_readn(fd, &n, sizeof(n)); /* 1. the chunk count comes first */
    /* %u / %zu match the unsigned and size_t arguments; %d did not. */
    printf("接收到%u个chunk, 一共接受%zu个字节\n", n, n * sizeof(Chunk));
    vector<Chunk> cks;
    /* Read straight into a properly aligned Chunk instead of reinterpreting a
     * char buffer. Like the sender, this treats Chunk as plain bytes. */
    Chunk ck;
    for (uint k = 0; k < n; ++k) {
        if (Rio_readn(fd, &ck, sizeof(ck)) != static_cast<ssize_t>(sizeof(ck)))
            break; /* truncated stream: do not append a garbage record */
        cks.push_back(ck);
    }
    return cks;
}

2.rolling匹配出相同分块，将patches发送给客户端(与客户端step3对应)

/* Write one patch record to clientfd and add its size to cnt_send_byte.
 *   index >= 0 : index is a chunk number; only the 4-byte header is sent.
 *   index <  0 : |index| is the literal length; the header is followed by
 *                that many bytes taken from data. */
void send_patch(int clientfd, int index, char* data) {
    Rio_writen(clientfd, &index, sizeof(index));
    cnt_send_byte += 4;
    if (index >= 0) return; /* chunk reference: no payload follows */

    const int payload_len = -index;
    Rio_writen(clientfd, data, payload_len);
    cnt_send_byte += payload_len;
}

/**
 * Compare the local (new) file against the chunk signatures received from the
 * client and stream the resulting patches to clientfd.
 *
 * Output, in order:
 *   1. the local file size (sizeof(off_t) bytes);
 *   2. patch records written through send_patch(): a non-negative index tells
 *      the client to reuse one of its own chunks, a negative index -k is
 *      followed by k literal bytes of the local file.
 *
 * @param dest      chunk signatures from the client. A chunk's `index` is used
 *                  to index back into dest, so dest[k].index must equal k.
 * @param filename  path of the local file to diff.
 * @param clientfd  descriptor the patches are written to.
 */
void rolling_compare(vector<Chunk>& dest, const char* filename, int clientfd) {
    unordered_map<int, int> mp; /* weak hash (hash32) -> chunk index */
    for (Chunk& ck : dest) mp[ck.hash32] = ck.index;

    int         ffd = Open(filename, O_RDONLY, 0);
    struct stat st; /* not named `fstat`, which would shadow the libc call */
    Fstat(ffd, &st);
    const off_t file_size = st.st_size;

    /* The new file's size is sent ahead of the patches. */
    Rio_writen(clientfd, &st.st_size, sizeof(st.st_size));
    printf("file size: %.2fMB\n", st.st_size / 1024.0 / 1024);

    if (file_size == 0) { /* mmap() rejects a zero length; nothing to send */
        close(ffd);
        return;
    }

    char* file_begin_ptr = static_cast<char*>(
        Mmap(NULL, file_size, PROT_READ, MAP_SHARED, ffd, 0));

    /* Send file bytes [from, to) as literal patches. The length travels as a
     * negated int, so long runs are split into pieces that always fit. */
    auto send_literal = [&](off_t from, off_t to) {
        const off_t max_piece = static_cast<off_t>(1) << 30;
        while (from < to) {
            const off_t piece = (to - from < max_piece) ? (to - from) : max_piece;
            send_patch(clientfd, -static_cast<int>(piece), file_begin_ptr + from);
            from += piece;
        }
    };

    off_t i = 0; /* offset just past the current CHUNK_SIZE window */
    /* `<=` (not `<`) so a final block-aligned window is still tested for a
     * match instead of always being sent as literal data. */
    while (i + CHUNK_SIZE <= file_size) {
        const off_t last = i; /* start of data not yet sent */
        Hash32      hs(file_begin_ptr + i, CHUNK_SIZE);
        i += CHUNK_SIZE;
        while (1) {
            /* Check the cheap weak hash first; MD5 is only computed for
             * windows whose weak hash is known. One lookup serves both the
             * test and the chunk index. */
            const auto hit = mp.find(static_cast<int>(hs));
            if (hit != mp.end() &&
                Md5(file_begin_ptr + i - CHUNK_SIZE, CHUNK_SIZE) ==
                    dest[hit->second].md5) {
                send_literal(last, i - CHUNK_SIZE); /* unmatched bytes first */
                send_patch(clientfd, dest[hit->second].index,
                           nullptr); /* then the chunk number */
                break;
            }
            if (i == file_size) { /* window reached the end of the file */
                send_literal(last, i);
                break;
            }
            /* Slide the window forward by one byte. */
            hs.push(file_begin_ptr[i]);
            hs.pop(file_begin_ptr[i - CHUNK_SIZE]);
            ++i;
        }
    }
    send_literal(i, file_size); /* tail shorter than one chunk, if any */

    munmap(file_begin_ptr, file_size);
    close(ffd);
}

一个简单的测试（只是一个简单的例子）

1.创建一个file的文件，输入4.3MB的文本信息，

2.复制file文件，改名为file_new，随机更改里面的3个连续字母

3.目标是客户端的file同步为file_new，暂存为file_tmp

结果：

1.内容一致性

结果检查，文件同步成功

2.流量节省度

监控数据网络传输的字节：

服务端一共接受26376B = 0.03MB（客户端发送过来的Chunks）

服务端一共发送8769B = 0.01MB （发送patches给客户端，客户端拿到patches重组）

在这个例子中（注意：仅仅是这个例子），只用了0.04MB同步了4.3MB的文件，数据流量节省了 99.1%。

分析：

1.chunk_size影响流量节省度

2.文件内容分块相似度起决定性影响（如果文件的大部分分块没有变化，需要传输的数据就很少）

全部代码：GitHub - Nspyia/simple_rsync_demo: A simple implementation of Rsync algorithm.

heyAmos

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
Linux Rsync文件增量同步的一个简单C++实现Demo

原理及整体流程简介算法原理（我读的这篇bolg，感谢！）：https://blog.csdn.net/gdutliuyun827/article/details/72457984。不成熟的实现流程：假设客户端是待更新文件A，服务端是文件A_new （当然，文件同步应该是双向的，这里只是为了方便介绍原理）客户端整体流程：1.文件分块获取chunks（每个chunk带有24B的信息）2.发送chunks给服务端，（服务端处理chunk...
复制链接

扫一扫