He3DB团队架构解读:He3DB数据预热系统设计与实现

He3DB 是由移动云数据库团队研发的一款计算/存储分离的云原生数据库。He3DB通过计算/存储分离、数据冷热分层和压缩、智能中间件等技术,兼顾高性能与低成本,在获得高性能的同时,最大化地帮助客户节省数据库使用成本。

架构概览

He3DB选择对象存储(S3)承载数据持久化。相较于本地盘,对象存储具有高效、安全、可靠三大优势:其高效性体现在可处理大量数据、快速读写数据、支持多种数据格式等方面;安全性在于数据加密、身份验证、访问权限控制等多层保障;可靠性则体现在数据持久、多地备份、自动修复等特性上。He3DB数据预热系统的设计充分考虑了对象存储分布式架构所提供的高吞吐能力,将原PostgreSQL(PG)预热系统的单进程架构改造为多进程架构,使预热速度大幅提升。
He3DB数据预热系统架构概览

He3DB数据库系统在启动时,按照autoprewarm.blocks文件中记录的block信息,启动8个“数据抽取进程”,从S3持久化存储中并行抽取数据并加载到He3DB实例的共享缓存中。

核心代码

/*
 * apw_load_buffers
 *
 * Read the block list from AUTOPREWARM_FILE into a dynamic shared memory
 * segment, sort it, and hand it out one database at a time to workers
 * started with apw_start_database_worker().  The range of records for each
 * worker is published through apw_state (prewarm_start_idx,
 * prewarm_stop_idx, database).
 *
 * Before moving on to the next database, this function polls
 * apw_state->thread_running_size until it is below THREAD_POOL_SIZE; before
 * reporting completion it polls until the counter is zero.  This function
 * only increments that counter -- presumably the workers decrement it when
 * they finish (not visible here; confirm).
 *
 * Returns without loading anything when another process has already claimed
 * the dump file (apw_state->pid_using_dumpfile is set) or when the file does
 * not exist.  Any other open/parse failure is raised with ereport(ERROR).
 */
static void
apw_load_buffers(void)
{
    FILE       *file = NULL;
    int         num_elements,
                i;
    BlockInfoRecord *blkinfo;
    dsm_segment *seg;
    ereport(LOG,(errmsg("autoprewarm start !")));
    /*
     * Skip the prewarm if the dump file is in use; otherwise, prevent any
     * other process from writing it while we're using it.
     */
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    if (apw_state->pid_using_dumpfile == InvalidPid)
        apw_state->pid_using_dumpfile = MyProcPid;
    else
    {
        /*
         * Someone else owns the file.  Note that pid_using_dumpfile is read
         * for the log message after the lock has been released.
         */
        LWLockRelease(&apw_state->lock);
        ereport(LOG,
                (errmsg("skipping prewarm because block dump file is being written by PID %lu",
                        (unsigned long) apw_state->pid_using_dumpfile)));
        return;
    }
    LWLockRelease(&apw_state->lock);
    /*
     * Open the block dump file.  Exit quietly if it doesn't exist, but report
     * any other error.
     */
    file = AllocateFile(AUTOPREWARM_FILE, "r");
    if (!file)
    {
        if (errno == ENOENT)
        {
            /* Give up our claim on the dump file before returning. */
            LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
            apw_state->pid_using_dumpfile = InvalidPid;
            LWLockRelease(&apw_state->lock);
            return;             /* No file to load. */
        }
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read file \"%s\": %m",
                        AUTOPREWARM_FILE)));
    }
    /* First line of the file is a record count, written as "<<N>>". */
    if (fscanf(file, "<<%d>>\n", &num_elements) != 1)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read from file \"%s\": %m",
                        AUTOPREWARM_FILE)));
    /*
     * Allocate a dynamic shared memory segment to store the record data.
     *
     * NOTE(review): num_elements comes straight from the file and is used
     * as the allocation size and loop bound without a range check (zero or
     * negative values are not rejected here).
     */
    seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0);
    blkinfo = (BlockInfoRecord *) dsm_segment_address(seg);
    /*
     * Read records, one per line.  Each line holds five comma-separated
     * unsigned values: database, tablespace, filenode, fork number, block
     * number.
     */
    for (i = 0; i < num_elements; i++)
    {
        /* Scanned into a plain unsigned first, then stored in the record. */
        unsigned    forknum;
        if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
                   &blkinfo[i].tablespace, &blkinfo[i].filenode,
                   &forknum, &blkinfo[i].blocknum) != 5)
            ereport(ERROR,
                    (errmsg("autoprewarm block dump file is corrupted at line %d",
                            i + 1)));
        blkinfo[i].forknum = forknum;
    }
    FreeFile(file);
    /*
     * Sort the blocks to be loaded.  The grouping loop below relies on
     * records of the same database being adjacent after the sort
     * (ordering is defined by apw_compare_blockinfo, not visible here).
     */
    pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord),
             apw_compare_blockinfo);
    /* Populate shared memory state. */
    apw_state->block_info_handle = dsm_segment_handle(seg);
    apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
    apw_state->prewarmed_blocks = 0;
    /* Take ownership of the latch we wait on after each worker launch. */
    OwnLatch(&apw_state->prewarmLatch);
    /* Get the info position of the first block of the next database. */
    while (apw_state->prewarm_start_idx < num_elements)
    {
        int         j = apw_state->prewarm_start_idx;
        Oid         current_db = blkinfo[j].database;
        /*
         * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
         * not belong to this database.
         */
        j++;
        while (j < num_elements)
        {
            if (current_db != blkinfo[j].database)
            {
                /*
                 * Combine BlockInfoRecords for global objects with those of
                 * the database.
                 */
                if (current_db != InvalidOid)
                    break;
                current_db = blkinfo[j].database;
            }
            j++;
        }
        /*
         * If we reach this point with current_db == InvalidOid, then only
         * BlockInfoRecords belonging to global objects exist.  We can't
         * prewarm without a database connection, so just bail out.
         */
        if (current_db == InvalidOid)
            break;
        /* Configure stop point and database for next per-database worker. */
        apw_state->prewarm_stop_idx = j;
        apw_state->database = current_db;
        Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
        /* If we've run out of free buffers, don't launch another worker. */
        if (!have_free_buffer())
            break;
        /*
         * Likewise, don't launch if we've already been told to shut down.
         * (The launch would fail anyway, but we might as well skip it.)
         */
        if (ShutdownRequestPending)
            break;
        /*
         * Count the worker we are about to start, then start a per-database
         * worker to load blocks for this database.
         *
         * NOTE(review): the comment this replaces said the call returns only
         * once the per-database worker exits.  Given the running-worker
         * counter and the throttle below, that presumably no longer holds;
         * confirm against apw_start_database_worker().
         */
        LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
        apw_state->thread_running_size++;
        LWLockRelease(&apw_state->lock);
        
        apw_start_database_worker();
        
        /*
         * Block until prewarmLatch is set, then reset it.  The start/stop
         * indexes and database in apw_state are overwritten on the next
         * iteration, so presumably the worker sets this latch once it has
         * picked them up -- confirm on the worker side.
         *
         * NOTE(review): the WaitLatch() result is discarded, so a wakeup
         * caused by postmaster death is not distinguished from the latch
         * being set.
         */
        WaitLatch(&apw_state->prewarmLatch,
                       WL_LATCH_SET | WL_POSTMASTER_DEATH, 0,
                       WAIT_EVENT_BGWORKER_PREWARM_START);
        ResetLatch(&apw_state->prewarmLatch);
        
        /*
         * Throttle: poll once per second until the running-worker counter
         * drops below THREAD_POOL_SIZE.  The counter is read here without
         * holding apw_state->lock.
         *
         * NOTE(review): this loop does not look at ShutdownRequestPending.
         */
        while(apw_state->thread_running_size >= THREAD_POOL_SIZE)
        {
            sleep(1);
        }
        /* Prepare for next database. */
        apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx;
    }
    /*
     * Clean up: detach from the segment, then clear the published handle and
     * our claim on the dump file under the lock.
     *
     * NOTE(review): this happens before the wait for outstanding workers
     * below; it is only safe if every counted worker already holds its own
     * attachment to the segment -- confirm.
     */
    dsm_detach(seg);
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    apw_state->block_info_handle = DSM_HANDLE_INVALID;
    apw_state->pid_using_dumpfile = InvalidPid;
    LWLockRelease(&apw_state->lock);
    /*
     * Poll once per second until the running-worker counter reaches zero.
     * As above, the counter is read without the lock and the loop does not
     * check ShutdownRequestPending.
     */
    while(apw_state->thread_running_size > 0)
    {
        sleep(1);
    }
    /* Report our success, if we were able to finish. */
    if (!ShutdownRequestPending)
        ereport(LOG,
                (errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
                        apw_state->prewarmed_blocks, num_elements)));
}
/*
 * apw_load_buffers
 *
 * NOTE(review): this definition is a verbatim repeat of the
 * apw_load_buffers definition that immediately precedes it in this file; a
 * single C translation unit cannot contain both.
 *
 * Read the block list from AUTOPREWARM_FILE into a dynamic shared memory
 * segment, sort it, and hand it out one database at a time to workers
 * started with apw_start_database_worker().  The range of records for each
 * worker is published through apw_state (prewarm_start_idx,
 * prewarm_stop_idx, database).
 *
 * Before moving on to the next database, this function polls
 * apw_state->thread_running_size until it is below THREAD_POOL_SIZE; before
 * reporting completion it polls until the counter is zero.  This function
 * only increments that counter -- presumably the workers decrement it when
 * they finish (not visible here; confirm).
 *
 * Returns without loading anything when another process has already claimed
 * the dump file (apw_state->pid_using_dumpfile is set) or when the file does
 * not exist.  Any other open/parse failure is raised with ereport(ERROR).
 */
static void
apw_load_buffers(void)
{
    FILE       *file = NULL;
    int         num_elements,
                i;
    BlockInfoRecord *blkinfo;
    dsm_segment *seg;
    ereport(LOG,(errmsg("autoprewarm start !")));
    /*
     * Skip the prewarm if the dump file is in use; otherwise, prevent any
     * other process from writing it while we're using it.
     */
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    if (apw_state->pid_using_dumpfile == InvalidPid)
        apw_state->pid_using_dumpfile = MyProcPid;
    else
    {
        /*
         * Someone else owns the file.  Note that pid_using_dumpfile is read
         * for the log message after the lock has been released.
         */
        LWLockRelease(&apw_state->lock);
        ereport(LOG,
                (errmsg("skipping prewarm because block dump file is being written by PID %lu",
                        (unsigned long) apw_state->pid_using_dumpfile)));
        return;
    }
    LWLockRelease(&apw_state->lock);
    /*
     * Open the block dump file.  Exit quietly if it doesn't exist, but report
     * any other error.
     */
    file = AllocateFile(AUTOPREWARM_FILE, "r");
    if (!file)
    {
        if (errno == ENOENT)
        {
            /* Give up our claim on the dump file before returning. */
            LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
            apw_state->pid_using_dumpfile = InvalidPid;
            LWLockRelease(&apw_state->lock);
            return;             /* No file to load. */
        }
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read file \"%s\": %m",
                        AUTOPREWARM_FILE)));
    }
    /* First line of the file is a record count, written as "<<N>>". */
    if (fscanf(file, "<<%d>>\n", &num_elements) != 1)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read from file \"%s\": %m",
                        AUTOPREWARM_FILE)));
    /*
     * Allocate a dynamic shared memory segment to store the record data.
     *
     * NOTE(review): num_elements comes straight from the file and is used
     * as the allocation size and loop bound without a range check (zero or
     * negative values are not rejected here).
     */
    seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0);
    blkinfo = (BlockInfoRecord *) dsm_segment_address(seg);
    /*
     * Read records, one per line.  Each line holds five comma-separated
     * unsigned values: database, tablespace, filenode, fork number, block
     * number.
     */
    for (i = 0; i < num_elements; i++)
    {
        /* Scanned into a plain unsigned first, then stored in the record. */
        unsigned    forknum;
        if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
                   &blkinfo[i].tablespace, &blkinfo[i].filenode,
                   &forknum, &blkinfo[i].blocknum) != 5)
            ereport(ERROR,
                    (errmsg("autoprewarm block dump file is corrupted at line %d",
                            i + 1)));
        blkinfo[i].forknum = forknum;
    }
    FreeFile(file);
    /*
     * Sort the blocks to be loaded.  The grouping loop below relies on
     * records of the same database being adjacent after the sort
     * (ordering is defined by apw_compare_blockinfo, not visible here).
     */
    pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord),
             apw_compare_blockinfo);
    /* Populate shared memory state. */
    apw_state->block_info_handle = dsm_segment_handle(seg);
    apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
    apw_state->prewarmed_blocks = 0;
    /* Take ownership of the latch we wait on after each worker launch. */
    OwnLatch(&apw_state->prewarmLatch);
    /* Get the info position of the first block of the next database. */
    while (apw_state->prewarm_start_idx < num_elements)
    {
        int         j = apw_state->prewarm_start_idx;
        Oid         current_db = blkinfo[j].database;
        /*
         * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
         * not belong to this database.
         */
        j++;
        while (j < num_elements)
        {
            if (current_db != blkinfo[j].database)
            {
                /*
                 * Combine BlockInfoRecords for global objects with those of
                 * the database.
                 */
                if (current_db != InvalidOid)
                    break;
                current_db = blkinfo[j].database;
            }
            j++;
        }
        /*
         * If we reach this point with current_db == InvalidOid, then only
         * BlockInfoRecords belonging to global objects exist.  We can't
         * prewarm without a database connection, so just bail out.
         */
        if (current_db == InvalidOid)
            break;
        /* Configure stop point and database for next per-database worker. */
        apw_state->prewarm_stop_idx = j;
        apw_state->database = current_db;
        Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
        /* If we've run out of free buffers, don't launch another worker. */
        if (!have_free_buffer())
            break;
        /*
         * Likewise, don't launch if we've already been told to shut down.
         * (The launch would fail anyway, but we might as well skip it.)
         */
        if (ShutdownRequestPending)
            break;
        /*
         * Count the worker we are about to start, then start a per-database
         * worker to load blocks for this database.
         *
         * NOTE(review): the comment this replaces said the call returns only
         * once the per-database worker exits.  Given the running-worker
         * counter and the throttle below, that presumably no longer holds;
         * confirm against apw_start_database_worker().
         */
        LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
        apw_state->thread_running_size++;
        LWLockRelease(&apw_state->lock);
        
        apw_start_database_worker();
        
        /*
         * Block until prewarmLatch is set, then reset it.  The start/stop
         * indexes and database in apw_state are overwritten on the next
         * iteration, so presumably the worker sets this latch once it has
         * picked them up -- confirm on the worker side.
         *
         * NOTE(review): the WaitLatch() result is discarded, so a wakeup
         * caused by postmaster death is not distinguished from the latch
         * being set.
         */
        WaitLatch(&apw_state->prewarmLatch,
                       WL_LATCH_SET | WL_POSTMASTER_DEATH, 0,
                       WAIT_EVENT_BGWORKER_PREWARM_START);
        ResetLatch(&apw_state->prewarmLatch);
        
        /*
         * Throttle: poll once per second until the running-worker counter
         * drops below THREAD_POOL_SIZE.  The counter is read here without
         * holding apw_state->lock.
         *
         * NOTE(review): this loop does not look at ShutdownRequestPending.
         */
        while(apw_state->thread_running_size >= THREAD_POOL_SIZE)
        {
            sleep(1);
        }
        /* Prepare for next database. */
        apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx;
    }
    /*
     * Clean up: detach from the segment, then clear the published handle and
     * our claim on the dump file under the lock.
     *
     * NOTE(review): this happens before the wait for outstanding workers
     * below; it is only safe if every counted worker already holds its own
     * attachment to the segment -- confirm.
     */
    dsm_detach(seg);
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    apw_state->block_info_handle = DSM_HANDLE_INVALID;
    apw_state->pid_using_dumpfile = InvalidPid;
    LWLockRelease(&apw_state->lock);
    /*
     * Poll once per second until the running-worker counter reaches zero.
     * As above, the counter is read without the lock and the loop does not
     * check ShutdownRequestPending.
     */
    while(apw_state->thread_running_size > 0)
    {
        sleep(1);
    }
    /* Report our success, if we were able to finish. */
    if (!ShutdownRequestPending)
        ereport(LOG,
                (errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
                        apw_state->prewarmed_blocks, num_elements)));
}

性能测试

在个人研发环境中使用移动云S3测试1GB数据预热加载到内存的速度:1并发55秒,4并发25秒,8并发19秒。
在这里插入图片描述

以上就是He3DB数据预热系统设计与实现。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值