He3DB是由移动云数据库团队研发的一款计算/存储分离的云原生数据库。He3DB通过计算/存储分离、数据冷热分层和压缩、智能中间件等技术,保证高性能与低成本兼得,在获得高性能的同时,最大化地帮助客户节省数据库使用成本。
架构概览
He3DB选择对象存储承载数据持久化,相较于本地盘,S3具有高效、安全、可靠的三大优势。其高效性体现在可处理大量数据、快速读写数据、支持多种数据格式等方面;安全性在于数据加密、身份验证、访问权限控制等多层保障;可靠性则保证了数据的持久性、多地备份、自动修复等特性。He3DB数据预热系统设计充分考虑了对象存储分布式架构所提供的高吞吐能力,将原PG预热系统的单进程架构改造为多进程架构,获得预热速度的大幅提升。
He3DB数据库系统在启动时,按照autoprewarm.blocks中记录的block文件,启动8个“数据抽取进程”从S3持久化存储中并行抽取数据加载到He3DB实例共享缓存中。
核心代码
/*
 * apw_load_buffers
 *
 * Leader-side driver for autoprewarm.  Reads the block dump file
 * (AUTOPREWARM_FILE) into a dynamic shared memory segment, sorts the
 * records, and hands one contiguous range of records per database to a
 * per-database worker.
 *
 * The leader does not wait for a worker to exit before launching the next
 * one.  After each launch it waits once on apw_state->prewarmLatch and then
 * polls apw_state->thread_running_size until fewer than THREAD_POOL_SIZE
 * workers are outstanding.  Before reporting, it polls the same counter
 * until it drops to zero.
 *
 * NOTE(review): the counter is only incremented here; it is presumably
 * decremented by the workers when they finish -- confirm against the worker
 * code.
 *
 * Takes no arguments and returns nothing.  Failures to read or parse the
 * dump file are reported with ereport(ERROR).
 */
static void
apw_load_buffers(void)
{
    FILE       *file = NULL;
    int         num_elements,
                i;
    BlockInfoRecord *blkinfo;
    dsm_segment *seg;
    unsigned long busy_pid;

    ereport(LOG,(errmsg("autoprewarm start !")));

    /*
     * Skip the prewarm if the dump file is in use; otherwise, prevent any
     * other process from writing it while we're using it.
     */
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    if (apw_state->pid_using_dumpfile == InvalidPid)
        apw_state->pid_using_dumpfile = MyProcPid;
    else
    {
        /* Snapshot the owner while still holding the lock. */
        busy_pid = (unsigned long) apw_state->pid_using_dumpfile;
        LWLockRelease(&apw_state->lock);
        ereport(LOG,
                (errmsg("skipping prewarm because block dump file is being written by PID %lu",
                        busy_pid)));
        return;
    }
    LWLockRelease(&apw_state->lock);

    /*
     * Open the block dump file.  Exit quietly if it doesn't exist, but report
     * any other error.
     */
    file = AllocateFile(AUTOPREWARM_FILE, "r");
    if (!file)
    {
        if (errno == ENOENT)
        {
            LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
            apw_state->pid_using_dumpfile = InvalidPid;
            LWLockRelease(&apw_state->lock);
            return;             /* No file to load. */
        }
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read file \"%s\": %m",
                        AUTOPREWARM_FILE)));
    }

    /* First line of the file is a record count. */
    if (fscanf(file, "<<%d>>\n", &num_elements) != 1)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read from file \"%s\": %m",
                        AUTOPREWARM_FILE)));

    /*
     * The count comes from a file on disk, so validate it before using it
     * in a size computation: a negative value would be converted to a huge
     * unsigned size in the dsm_create() call below.
     */
    if (num_elements < 0)
        ereport(ERROR,
                (errmsg("autoprewarm block dump file \"%s\" has invalid record count %d",
                        AUTOPREWARM_FILE, num_elements)));

    /*
     * An empty dump means there is nothing to load.  Release the file and
     * our claim on it instead of requesting a zero-byte segment.
     */
    if (num_elements == 0)
    {
        FreeFile(file);
        LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
        apw_state->pid_using_dumpfile = InvalidPid;
        LWLockRelease(&apw_state->lock);
        return;
    }

    /* Allocate a dynamic shared memory segment to store the record data. */
    seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0);
    blkinfo = (BlockInfoRecord *) dsm_segment_address(seg);

    /* Read records, one per line. */
    for (i = 0; i < num_elements; i++)
    {
        unsigned    forknum;

        if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
                   &blkinfo[i].tablespace, &blkinfo[i].filenode,
                   &forknum, &blkinfo[i].blocknum) != 5)
            ereport(ERROR,
                    (errmsg("autoprewarm block dump file is corrupted at line %d",
                            i + 1)));
        /* Scanned into a plain unsigned, then stored into the record. */
        blkinfo[i].forknum = forknum;
    }

    FreeFile(file);

    /* Sort the blocks to be loaded. */
    pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord),
             apw_compare_blockinfo);

    /* Populate shared memory state. */
    apw_state->block_info_handle = dsm_segment_handle(seg);
    apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
    apw_state->prewarmed_blocks = 0;

    /*
     * NOTE(review): the latch is owned here but never disowned in this
     * function; confirm that a restarted leader can own it again.
     */
    OwnLatch(&apw_state->prewarmLatch);

    /* Get the info position of the first block of the next database. */
    while (apw_state->prewarm_start_idx < num_elements)
    {
        int         j = apw_state->prewarm_start_idx;
        Oid         current_db = blkinfo[j].database;

        /*
         * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
         * not belong to this database.
         */
        j++;
        while (j < num_elements)
        {
            if (current_db != blkinfo[j].database)
            {
                /*
                 * Combine BlockInfoRecords for global objects with those of
                 * the database.
                 */
                if (current_db != InvalidOid)
                    break;
                current_db = blkinfo[j].database;
            }
            j++;
        }

        /*
         * If we reach this point with current_db == InvalidOid, then only
         * BlockInfoRecords belonging to global objects exist.  We can't
         * prewarm without a database connection, so just bail out.
         */
        if (current_db == InvalidOid)
            break;

        /* Configure stop point and database for next per-database worker. */
        apw_state->prewarm_stop_idx = j;
        apw_state->database = current_db;
        Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);

        /* If we've run out of free buffers, don't launch another worker. */
        if (!have_free_buffer())
            break;

        /*
         * Likewise, don't launch if we've already been told to shut down.
         * (The launch would fail anyway, but we might as well skip it.)
         */
        if (ShutdownRequestPending)
            break;

        /*
         * Account for the new worker, then start it.  The leader does not
         * wait for the worker to exit here, so several per-database workers
         * can be running at once.
         */
        LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
        apw_state->thread_running_size++;
        LWLockRelease(&apw_state->lock);
        apw_start_database_worker();

        /*
         * Wait for the latch to be set before touching the shared range
         * fields again.
         *
         * NOTE(review): presumably the worker sets this latch once it has
         * picked up its start/stop indexes and database -- confirm.  The
         * WaitLatch result is not inspected, so postmaster death is not
         * acted upon here.
         */
        WaitLatch(&apw_state->prewarmLatch,
                  WL_LATCH_SET | WL_POSTMASTER_DEATH, 0,
                  WAIT_EVENT_BGWORKER_PREWARM_START);
        ResetLatch(&apw_state->prewarmLatch);

        /*
         * Throttle: poll once per second until fewer than THREAD_POOL_SIZE
         * workers are outstanding.  The counter is read without the lock.
         */
        while(apw_state->thread_running_size >= THREAD_POOL_SIZE)
        {
            sleep(1);
        }

        /* Prepare for next database. */
        apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx;
    }

    /*
     * Clean up.  The leader drops its own mapping of the segment and
     * releases the dump file before waiting for outstanding workers.
     */
    dsm_detach(seg);
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    apw_state->block_info_handle = DSM_HANDLE_INVALID;
    apw_state->pid_using_dumpfile = InvalidPid;
    LWLockRelease(&apw_state->lock);

    /* Poll until every launched worker has been accounted for. */
    while(apw_state->thread_running_size > 0)
    {
        sleep(1);
    }

    /* Report our success, if we were able to finish. */
    if (!ShutdownRequestPending)
        ereport(LOG,
                (errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
                        apw_state->prewarmed_blocks, num_elements)));
}
/*
 * apw_load_buffers
 *
 * Leader-side driver for autoprewarm.  Reads the block dump file
 * (AUTOPREWARM_FILE) into a dynamic shared memory segment, sorts the
 * records, and hands one contiguous range of records per database to a
 * per-database worker.
 *
 * The leader does not wait for a worker to exit before launching the next
 * one.  After each launch it waits once on apw_state->prewarmLatch and then
 * polls apw_state->thread_running_size until fewer than THREAD_POOL_SIZE
 * workers are outstanding.  Before reporting, it polls the same counter
 * until it drops to zero.
 *
 * NOTE(review): the counter is only incremented here; it is presumably
 * decremented by the workers when they finish -- confirm against the worker
 * code.
 *
 * Takes no arguments and returns nothing.  Failures to read or parse the
 * dump file are reported with ereport(ERROR).
 */
static void
apw_load_buffers(void)
{
    FILE       *file = NULL;
    int         num_elements,
                i;
    BlockInfoRecord *blkinfo;
    dsm_segment *seg;
    unsigned long busy_pid;

    ereport(LOG,(errmsg("autoprewarm start !")));

    /*
     * Skip the prewarm if the dump file is in use; otherwise, prevent any
     * other process from writing it while we're using it.
     */
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    if (apw_state->pid_using_dumpfile == InvalidPid)
        apw_state->pid_using_dumpfile = MyProcPid;
    else
    {
        /* Snapshot the owner while still holding the lock. */
        busy_pid = (unsigned long) apw_state->pid_using_dumpfile;
        LWLockRelease(&apw_state->lock);
        ereport(LOG,
                (errmsg("skipping prewarm because block dump file is being written by PID %lu",
                        busy_pid)));
        return;
    }
    LWLockRelease(&apw_state->lock);

    /*
     * Open the block dump file.  Exit quietly if it doesn't exist, but report
     * any other error.
     */
    file = AllocateFile(AUTOPREWARM_FILE, "r");
    if (!file)
    {
        if (errno == ENOENT)
        {
            LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
            apw_state->pid_using_dumpfile = InvalidPid;
            LWLockRelease(&apw_state->lock);
            return;             /* No file to load. */
        }
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read file \"%s\": %m",
                        AUTOPREWARM_FILE)));
    }

    /* First line of the file is a record count. */
    if (fscanf(file, "<<%d>>\n", &num_elements) != 1)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read from file \"%s\": %m",
                        AUTOPREWARM_FILE)));

    /*
     * The count comes from a file on disk, so validate it before using it
     * in a size computation: a negative value would be converted to a huge
     * unsigned size in the dsm_create() call below.
     */
    if (num_elements < 0)
        ereport(ERROR,
                (errmsg("autoprewarm block dump file \"%s\" has invalid record count %d",
                        AUTOPREWARM_FILE, num_elements)));

    /*
     * An empty dump means there is nothing to load.  Release the file and
     * our claim on it instead of requesting a zero-byte segment.
     */
    if (num_elements == 0)
    {
        FreeFile(file);
        LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
        apw_state->pid_using_dumpfile = InvalidPid;
        LWLockRelease(&apw_state->lock);
        return;
    }

    /* Allocate a dynamic shared memory segment to store the record data. */
    seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0);
    blkinfo = (BlockInfoRecord *) dsm_segment_address(seg);

    /* Read records, one per line. */
    for (i = 0; i < num_elements; i++)
    {
        unsigned    forknum;

        if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
                   &blkinfo[i].tablespace, &blkinfo[i].filenode,
                   &forknum, &blkinfo[i].blocknum) != 5)
            ereport(ERROR,
                    (errmsg("autoprewarm block dump file is corrupted at line %d",
                            i + 1)));
        /* Scanned into a plain unsigned, then stored into the record. */
        blkinfo[i].forknum = forknum;
    }

    FreeFile(file);

    /* Sort the blocks to be loaded. */
    pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord),
             apw_compare_blockinfo);

    /* Populate shared memory state. */
    apw_state->block_info_handle = dsm_segment_handle(seg);
    apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
    apw_state->prewarmed_blocks = 0;

    /*
     * NOTE(review): the latch is owned here but never disowned in this
     * function; confirm that a restarted leader can own it again.
     */
    OwnLatch(&apw_state->prewarmLatch);

    /* Get the info position of the first block of the next database. */
    while (apw_state->prewarm_start_idx < num_elements)
    {
        int         j = apw_state->prewarm_start_idx;
        Oid         current_db = blkinfo[j].database;

        /*
         * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
         * not belong to this database.
         */
        j++;
        while (j < num_elements)
        {
            if (current_db != blkinfo[j].database)
            {
                /*
                 * Combine BlockInfoRecords for global objects with those of
                 * the database.
                 */
                if (current_db != InvalidOid)
                    break;
                current_db = blkinfo[j].database;
            }
            j++;
        }

        /*
         * If we reach this point with current_db == InvalidOid, then only
         * BlockInfoRecords belonging to global objects exist.  We can't
         * prewarm without a database connection, so just bail out.
         */
        if (current_db == InvalidOid)
            break;

        /* Configure stop point and database for next per-database worker. */
        apw_state->prewarm_stop_idx = j;
        apw_state->database = current_db;
        Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);

        /* If we've run out of free buffers, don't launch another worker. */
        if (!have_free_buffer())
            break;

        /*
         * Likewise, don't launch if we've already been told to shut down.
         * (The launch would fail anyway, but we might as well skip it.)
         */
        if (ShutdownRequestPending)
            break;

        /*
         * Account for the new worker, then start it.  The leader does not
         * wait for the worker to exit here, so several per-database workers
         * can be running at once.
         */
        LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
        apw_state->thread_running_size++;
        LWLockRelease(&apw_state->lock);
        apw_start_database_worker();

        /*
         * Wait for the latch to be set before touching the shared range
         * fields again.
         *
         * NOTE(review): presumably the worker sets this latch once it has
         * picked up its start/stop indexes and database -- confirm.  The
         * WaitLatch result is not inspected, so postmaster death is not
         * acted upon here.
         */
        WaitLatch(&apw_state->prewarmLatch,
                  WL_LATCH_SET | WL_POSTMASTER_DEATH, 0,
                  WAIT_EVENT_BGWORKER_PREWARM_START);
        ResetLatch(&apw_state->prewarmLatch);

        /*
         * Throttle: poll once per second until fewer than THREAD_POOL_SIZE
         * workers are outstanding.  The counter is read without the lock.
         */
        while(apw_state->thread_running_size >= THREAD_POOL_SIZE)
        {
            sleep(1);
        }

        /* Prepare for next database. */
        apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx;
    }

    /*
     * Clean up.  The leader drops its own mapping of the segment and
     * releases the dump file before waiting for outstanding workers.
     */
    dsm_detach(seg);
    LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
    apw_state->block_info_handle = DSM_HANDLE_INVALID;
    apw_state->pid_using_dumpfile = InvalidPid;
    LWLockRelease(&apw_state->lock);

    /* Poll until every launched worker has been accounted for. */
    while(apw_state->thread_running_size > 0)
    {
        sleep(1);
    }

    /* Report our success, if we were able to finish. */
    if (!ShutdownRequestPending)
        ereport(LOG,
                (errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
                        apw_state->prewarmed_blocks, num_elements)));
}
性能测试
在个人研发环境中使用移动云S3测试1GB数据预热加载到内存的速度:1并发55秒,4并发25秒,8并发19秒。
以上就是He3DB数据预热系统设计与实现。