Redis 之BIO与RIO

最新推荐文章于 2024-02-17 19:50:45 发布

andyhuabing

最新推荐文章于 2024-02-17 19:50:45 发布

阅读量2.8k

点赞数 1

分类专栏：分布式存储

本文链接：https://blog.csdn.net/andyhuabing/article/details/52585101

版权

分布式存储专栏收录该内容

12 篇文章 0 订阅

订阅专栏

一、BIO 之后台IO操作
BIO : Background I/O service for Redis.

负责我们需要在后台执行的操作。现在redis的版本中只有两类的操作，后台的close及fsync 系统调用。
为了避免一个文件最后的owner在执行close操作带来的unlink使得阻塞server，将这类操作用单独的后台线程来执行

将数据从内存真正写入磁盘这点非常重要：需要调用 fsync() 把文件数据和文件元信息强制刷新到磁盘（fdatasync() 则主要刷新文件数据）。这个调用速度比较慢，而其调用频度又会很高，所以不能让它的 IO 阻塞现有的主流程，有必要放到后台线程去执行。

REDIS 允许有三种不同的策略:

<span style="font-size:18px;">/* Append only defines */
// Never fsync explicitly; flushing is left to the kernel (the article suggests roughly every 30 seconds by default -- TODO confirm for the target OS).
#define AOF_FSYNC_NO 0  
// fsync after every write to the AOF.
#define AOF_FSYNC_ALWAYS 1 
// fsync once per second.
#define AOF_FSYNC_EVERYSEC 2 
// Policy used by default: fsync once per second.
#define CONFIG_DEFAULT_AOF_FSYNC AOF_FSYNC_EVERYSEC</span>

AOF_FSYNC_EVERYSEC 在性能与数据安全这两点之间是一个很好的折中。

<span style="font-size:18px;">/* 刷新缓存区的内容到磁盘中 */
void flushAppendOnlyFile(int force) {
	int sync_in_progress = 0;
	
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC) // 这个判定是否后台正在执行 fsync 
        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;

	// 这里根据时间进行判定
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall trough, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }
    
    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */
	//在进行写入操作的时候，还监听了延迟、write函数由于aof_buf一般不大很快就能返回，而阻塞的是fdatasync导致write等待
    latencyStartMonitor(latency);
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    latencyEndMonitor(latency);
    
    // 调用偏移量
    server.aof_current_size += nwritten;

	// 优化内存重复使用性
    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }
    
    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        latencyStartMonitor(latency);
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        // 交由后台执行fsync操作
        if (!sync_in_progress) 
        	aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}</span>

用法很简单、下面重点看下BIO内部是如何实现的.

<span style="font-size:18px;">/* Background job opcodes */
#define BIO_CLOSE_FILE    0 /* Deferred close(2) syscall. */
#define BIO_AOF_FSYNC     1 /* Deferred AOF fsync. */
#define BIO_NUM_OPS       2</span>

主要两类作业类型：1.close 2.aof_fsync

<span style="font-size:18px;">//使用互斥量+条件变量，作为线程的保护条件
static pthread_mutex_t bio_mutex[REDIS_BIO_NUM_OPS];
static pthread_cond_t bio_condvar[REDIS_BIO_NUM_OPS];

//两类作业的队列、工作与挂起队列
static list *bio_jobs[REDIS_BIO_NUM_OPS];
static unsigned long long bio_pending[REDIS_BIO_NUM_OPS];

/* One queued background job. The layout is private to this file: the public
 * API never exposes it. */
struct bio_job {
    /* Creation time of the job. */
    time_t time;
    /* Job-specific argument pointers. When more than three values are needed,
     * pass a pointer to a structure (or similar) through one of them. */
    void *arg1;
    void *arg2;
    void *arg3;
};

/* Set up the background I/O system.
 *
 * For every job type this initializes the mutex, the condition variable, the
 * job list and the pending counter, and then starts one thread running
 * bioProcessBackgroundJobs for that type. If a thread cannot be created the
 * failure is logged and the process exits with status 1. */
void bioInit(void) {
    pthread_attr_t thread_attr;
    pthread_t tid;
    size_t stack_bytes;
    int type;

    /* Per-type state: counters, queues, locks and condition variables. */
    for (type = 0; type < BIO_NUM_OPS; type++) {
        pthread_mutex_init(&bio_mutex[type],NULL);
        pthread_cond_init(&bio_condvar[type],NULL);
        bio_jobs[type] = listCreate();
        bio_pending[type] = 0;
    }

    /* The default stack may be small on some systems: start from the
     * reported default and double it until it reaches
     * REDIS_THREAD_STACK_SIZE. */
    pthread_attr_init(&thread_attr);
    pthread_attr_getstacksize(&thread_attr,&stack_bytes);
    if (stack_bytes == 0) {
        /* Some systems (Solaris) report zero; doubling zero would never
         * terminate, so start from one. */
        stack_bytes = 1;
    }
    for (; stack_bytes < REDIS_THREAD_STACK_SIZE; stack_bytes *= 2)
        ;
    pthread_attr_setstacksize(&thread_attr, stack_bytes);

    /* Spawn one worker per job type. The single argument a thread function
     * accepts carries the job type that worker is responsible for. */
    for (type = 0; type < BIO_NUM_OPS; type++) {
        void *thread_arg = (void*)(unsigned long) type;
        int rc = pthread_create(&tid,&thread_attr,bioProcessBackgroundJobs,thread_arg);

        if (rc != 0) {
            serverLog(LL_WARNING,"Fatal: Can't initialize Background Jobs.");
            exit(1);
        }
        bio_threads[type] = tid;
    }
}

/* Queue a background job of the given type.
 *
 * type:          index of the job queue (a BIO_* opcode).
 * arg1..arg3:    opaque arguments stored in the job as-is.
 *
 * The job records its creation time, is appended to the tail of the queue
 * for 'type', the pending counter for that type is incremented and the
 * condition variable for that type is signalled, all while holding that
 * type's mutex. */
void bioCreateBackgroundJob(int type, void *arg1, void *arg2, void *arg3) {
    struct bio_job *new_job = zmalloc(sizeof(struct bio_job));
    pthread_mutex_t *queue_lock = &bio_mutex[type];

    new_job->time = time(NULL);
    new_job->arg1 = arg1;
    new_job->arg2 = arg2;
    new_job->arg3 = arg3;

    pthread_mutex_lock(queue_lock);
    listAddNodeTail(bio_jobs[type],new_job); /* enqueue at the tail */
    bio_pending[type] += 1;                  /* one more pending job */
    pthread_cond_signal(&bio_condvar[type]);
    pthread_mutex_unlock(queue_lock);
}
 
/* Thread entry point of a background I/O worker.
 *
 * arg: the job type (a BIO_* opcode) cast to a pointer; the thread only
 *      serves the queue of that type.
 *
 * Returns NULL right away when the type is out of range. Otherwise it never
 * returns: it takes jobs from the head of the queue one at a time, runs them
 * with the queue mutex released, and sleeps on the condition variable while
 * the queue is empty. */
void *bioProcessBackgroundJobs(void *arg) {
    struct bio_job *job;
    unsigned long type = (unsigned long) arg;
    sigset_t sigset;
    int sigmask_err;

    /* Check that the type is within the right interval. */
    if (type >= BIO_NUM_OPS) {
        serverLog(LL_WARNING,
            "Warning: bio thread started with wrong type %lu",type);
        return NULL;
    }

    /* Make the thread killable at any time, so that bioKillThreads()
     * can work reliably. */
    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

    pthread_mutex_lock(&bio_mutex[type]);
    /* Block SIGALRM so we are sure that only the main thread will
     * receive the watchdog signal. */
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGALRM);
    /* pthread_sigmask() reports failure through its return value and does
     * not set errno, so the returned code is what must be passed to
     * strerror(). */
    sigmask_err = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
    if (sigmask_err != 0)
        serverLog(LL_WARNING,
            "Warning: can't mask SIGALRM in bio.c thread: %s", strerror(sigmask_err));

    while(1) {
        listNode *ln;

        /* The loop always starts with the lock held. If there is nothing to
         * do, wait until a job is queued and re-check. */
        if (listLength(bio_jobs[type]) == 0) {
            pthread_cond_wait(&bio_condvar[type],&bio_mutex[type]);
            continue;
        }

        /* Take the job at the head of the queue. The node stays in the list
         * until the job has been processed. */
        ln = listFirst(bio_jobs[type]);
        job = ln->value;
        /* It is now possible to unlock the background system as we now have
         * a stand alone job structure to process. */
        pthread_mutex_unlock(&bio_mutex[type]);

        /* Process the job according to its type (this is where the blocking
         * system call actually runs). */
        if (type == BIO_CLOSE_FILE) {
            close((long)job->arg1);
        } else if (type == BIO_AOF_FSYNC) {
            aof_fsync((long)job->arg1);
        } else {
            serverPanic("Wrong job type in bioProcessBackgroundJobs().");
        }
        zfree(job);

        /* Lock again before reiterating the loop, if there are no longer
         * jobs to process we'll block again in pthread_cond_wait(). */
        pthread_mutex_lock(&bio_mutex[type]);
        /* Remove the completed job's node from the queue and account for it. */
        listDelNode(bio_jobs[type],ln);
        bio_pending[type]--;
    }
}</span>

总结：
1、针对耗时的 close及fsync 进行另起线程后台执行、可以避免主线程阻塞问题。
2、对于高性能的文件刷新还有一些好的创意、这个好好再细看下。

二、RIO (统一buffer、file、socket不同对象IO操作)
I/O操作对于每个系统来说都是必不可少的一部分、而I/O操作的好坏，在一定程度上也会影响着系统的效率问题。

提供三个方面内容：
1、读写操作、获取偏移量操作等相关的回调函数。
rio可以处理buffer、file、socket三种不同类型的I/O对象，不同的rio对象底层使用相应的系统调用完成
read、write、tell、flush操作。比如，对于file rio对象，底层通过fwrite函数完成写操作，通过fread
函数完成读操作。
2、校验和操作。rio使用了CRC64算法计算校验和，具体实现可以参看crc64.h和crc64.c文件。
3、IO变量。_rio中的io成员是一个联合体，针对不同的I/O情况进行不同的处理：当执行内存buffer的I/O操作时，
使用rio.buffer结构体；当执行文件I/O操作时，使用rio.file结构体；当执行socket的I/O操作时，使用rio.fdset结构体。

先看一下 struct rio 结构：

<span style="font-size:18px;">// 系统IO操作的封装
/* A stream-like I/O abstraction: a set of backend callbacks plus the state
 * shared by all backends (checksum, byte counter, chunk limit) and a union
 * holding the state specific to the backend in use. */
struct _rio {
    /* Backend functions.
     * Since this functions do not tolerate short writes or reads the return
     * value is simplified to: zero on error, non zero on complete success. */
    /* Read 'len' bytes from the stream into 'buf'. */
    size_t (*read)(struct _rio *, void *buf, size_t len);
    /* Write 'len' bytes from 'buf' to the stream. */
    size_t (*write)(struct _rio *, const void *buf, size_t len);
    /* Report the current read/write offset. */
    off_t (*tell)(struct _rio *);
    /* Flush the stream. */
    int (*flush)(struct _rio *);
    /* The update_cksum method if not NULL is used to compute the checksum of
     * all the data that was read or written so far. The method should be
     * designed so that can be called with the current checksum, and the buf
     * and len fields pointing to the new block of data to add to the checksum
     * computation. */
    void (*update_cksum)(struct _rio *, const void *buf, size_t len);

    /* The current checksum */
    uint64_t cksum;

    /* number of bytes read or written */
    size_t processed_bytes;

    /* maximum single read or write chunk size; rioRead()/rioWrite() treat
     * zero as "no limit" */
    size_t max_processing_chunk;

    /* Backend-specific vars: one member per kind of I/O target. */
    union {
        /* In-memory buffer target: the buffer and the current offset. */
        struct {
            sds ptr;
            off_t pos;
        } buffer;

        /* Stdio file pointer target. */
        struct {
            FILE *fp;
            off_t buffered; /* Bytes written since last fsync. */
            off_t autosync; /* fsync after 'autosync' bytes written. */
        } file;

        /* Multiple FDs target (used to write the same data to N sockets). */
        struct {
            int *fds;       /* File descriptors (array). */
            int *state;     /* Error state of each fd. 0 (if ok) or errno. */
            int numfds;     /* Number of file descriptors. */
            off_t pos;      /* Offset. */
            sds buf;        /* Buffer. */
        } fdset;
    } io;
};</span>

再看rio统一定义的读写方法：

<span style="font-size:18px;">/* Write 'len' bytes from 'buf' to the rio stream.
 *
 * The data is handed to the backend in pieces of at most
 * r->max_processing_chunk bytes (no limit when that field is zero). For
 * each piece the checksum callback, if set, is invoked before the backend
 * write, and r->processed_bytes is advanced after it succeeds.
 *
 * Returns 1 when everything was written, 0 as soon as a backend write
 * reports failure. */
static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
    const char *cursor = buf;
    size_t remaining = len;

    while (remaining > 0) {
        /* Size of this piece: capped by the chunk limit when one is set. */
        size_t chunk = remaining;
        if (r->max_processing_chunk != 0 && r->max_processing_chunk < remaining)
            chunk = r->max_processing_chunk;

        /* Fold the new data into the checksum before writing it. */
        if (r->update_cksum != NULL)
            r->update_cksum(r,cursor,chunk);

        if (r->write(r,cursor,chunk) == 0)
            return 0;

        cursor += chunk;
        remaining -= chunk;
        r->processed_bytes += chunk;
    }
    return 1;
}

/* Read 'len' bytes from the rio stream into 'buf'.
 *
 * The data is requested from the backend in pieces of at most
 * r->max_processing_chunk bytes (no limit when that field is zero). For
 * each piece the checksum callback, if set, is invoked after the backend
 * read succeeds, and r->processed_bytes is advanced.
 *
 * Returns 1 when everything was read, 0 as soon as a backend read reports
 * failure. */
static inline size_t rioRead(rio *r, void *buf, size_t len) {
    char *cursor = buf;
    size_t remaining = len;

    while (remaining > 0) {
        /* Size of this piece: capped by the chunk limit when one is set. */
        size_t chunk = remaining;
        if (r->max_processing_chunk != 0 && r->max_processing_chunk < remaining)
            chunk = r->max_processing_chunk;

        if (r->read(r,cursor,chunk) == 0)
            return 0;

        /* Fold the data just read into the checksum. */
        if (r->update_cksum != NULL)
            r->update_cksum(r,cursor,chunk);

        cursor += chunk;
        remaining -= chunk;
        r->processed_bytes += chunk;
    }
    return 1;
}</span>

每次通过 rio 读写数据时，如果设置了 update_cksum 回调，都会用本次读写的数据更新校验和；所用的算法就是 CRC64。

下面继续分析 buffer IO和File IO及Socket IO.
rioFileIO使用标准C流式文件IO进行流式IO操作
rioBufferIO使用sds进行内存流式IO操作
rioFdsetIO使用多个socket fd写数据的IO操作

<span style="font-size:18px;">/* Initial rio state for the in-memory buffer backend: the rioBuffer*
 * callbacks, no checksum callback, zeroed counters and no chunk limit. */
static const rio rioBufferIO = {
    .read = rioBufferRead,
    .write = rioBufferWrite,
    .tell = rioBufferTell,
    .flush = rioBufferFlush,
    .update_cksum = NULL,
    .cksum = 0,                 /* current checksum */
    .processed_bytes = 0,       /* bytes read or written */
    .max_processing_chunk = 0,  /* read/write chunk size */
    .io = { { NULL, 0 } }       /* union for io-specific vars */
};

static const rio rioFileIO = {
    rioFileRead,
    rioFileWrite,
    rioFileTell,
    rioFileFlush,
    NULL,           /* update_checksum */
    0,              /* current checksum */
    0,              /* bytes read or written */
    0,              /* read/write chunk size */
    { { NULL, 0 } } /* union for io-specific vars */
};

static const rio rioFdsetIO = {
    rioFdsetRead,
    rioFdsetWrite,
    rioFdsetTell,
    rioFdsetFlush,
    NULL,           /* update_checksum */
    0,              /* current checksum */
    0,              /* bytes read or written */
    0,              /* read/write chunk size */
    { { NULL, 0 } } /* union for io-specific vars */
};</span>

以上的几个函数都很简单、稍微看下就能明白意思，就不细讲了。这里说下file write函数，有个细节是
每次写入后会把写入的字节数累加到 rio.file.buffered；当设置了 autosync 且累计值达到该阈值时，会先 fflush 再 fsync 把数据刷新到磁盘，并把 buffered 清零。

<span style="font-size:18px;">/* Write callback of the file backend.
 *
 * Writes 'buf' to the stdio stream as a single item of 'len' bytes and adds
 * 'len' to the count of bytes written since the last fsync. When autosync
 * is enabled (non-zero) and that count has reached it, the stream is
 * flushed, fsynced, and the count is reset to zero.
 *
 * Returns the value returned by fwrite() for that single item. */
static size_t rioFileWrite(rio *r, const void *buf, size_t len) {
    size_t items_written = fwrite(buf,len,1,r->io.file.fp);

    r->io.file.buffered += len;

    /* Nothing more to do unless autosync is on and its threshold is reached. */
    if (!r->io.file.autosync || r->io.file.buffered < r->io.file.autosync)
        return items_written;

    fflush(r->io.file.fp);
    aof_fsync(fileno(r->io.file.fp));
    r->io.file.buffered = 0;
    return items_written;
}</span>

Redis中的rio模块还封装了一些辅助生成AOF协议的函数:

<span style="font-size:18px;">// Writes count as a string in the form "*<count>\r\n" to the rio object and returns the number of bytes written (the leading character is presumably the 'prefix' argument -- confirm against the implementation).
size_t rioWriteBulkCount(rio *r, char prefix, int count);

// Writes a binary-safe string to the rio object in the form "$<count>\r\n<payload>\r\n".
size_t rioWriteBulkString(rio *r, const char *buf, size_t len);

// Writes a long long value to the rio object in the form "$<count>\r\n<payload>\r\n".
size_t rioWriteBulkLongLong(rio *r, long long l);

// Writes a double value to the rio object in the form "$<count>\r\n<payload>\r\n".
size_t rioWriteBulkDouble(rio *r, double d);
</span>

总结：
1、rio提供了基于文件流、内存流以及多个socket fd的读、写、位置通告、校验和操作方法
2、若设置了校验和方法，写入前、读取后会进行校验和更新操作
3、提供了用于写Redis协议的高层API函数

andyhuabing

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Redis 之BIO与RIO

一、BIO 之后台IO操作BIO : Background I/O service for Redis. 负责我们需要在后台执行的操作。现在redis的版本中只有两类的操作，后台的close及fsync 系统调用。为了避免一个文件最后的owner在执行close操作带来的unlink使得阻塞server，将这类操作用单独的后台线程来执行将数据从内存写入磁盘这点非常重要，即f
复制链接

扫一扫