PostgreSQL: Why does the checkpointer impact performance so much? - 2

Postgres2015全国用户大会将于11月20至21日在北京丽亭华苑酒店召开。本次大会嘉宾阵容强大,国内顶级PostgreSQL数据库专家将悉数到场,并特邀欧洲、俄罗斯、日本、美国等国家和地区的数据库方面专家助阵:

  • Postgres-XC项目的发起人铃木市一(SUZUKI Koichi)
  • Postgres-XL的项目发起人Mason Sharp
  • pgpool的作者石井达夫(Tatsuo Ishii)
  • PG-Strom的作者海外浩平(Kaigai Kohei)
  • Greenplum研发总监姚延栋
  • 周正中(德哥), PostgreSQL中国用户会创始人之一
  • 汪洋,平安科技数据库技术部经理
  • ……


 
  • 2015年度PG大象会报名地址:http://postgres2015.eventdove.com/
  • PostgreSQL中国社区: http://postgres.cn/
  • PostgreSQL专业1群: 3336901(已满)
  • PostgreSQL专业2群: 100910388
  • PostgreSQL专业3群: 150657323



接着上一篇讲解检查点最重的操作 CheckPointGuts@ src/backend/access/transam/xlog.c。

检查点最重量级的函数如下:
CheckPointGuts@src/backend/access/transam/xlog.c

static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
        CheckPointCLOG();   // src/backend/access/transam/clog.c
        CheckPointSUBTRANS();  // src/backend/access/transam/subtrans.c
        CheckPointMultiXact();  // src/backend/access/transam/multixact.c
        CheckPointPredicate();  // src/backend/storage/lmgr/predicate.c
        CheckPointRelationMap();  // src/backend/utils/cache/relmapper.c
        CheckPointReplicationSlots();  //  src/backend/replication/slot.c
        CheckPointSnapBuild();   // src/backend/replication/logical/snapbuild.c
        CheckPointLogicalRewriteHeap();  // src/backend/access/heap/rewriteheap.c
        CheckPointBuffers(flags);       // src/backend/storage/buffer/bufmgr.c
        CheckPointTwoPhase(checkPointRedo);  //  src/backend/access/transam/twophase.c
}

分解到每个调用:
1. 将commit log在buffer中的脏数据刷到pg_clog目录下对应的文件中。
CheckPointCLOG@src/backend/access/transam/clog.c

/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 */
void
CheckPointCLOG(void)
{
        /* Flush dirty CLOG pages to disk */
        TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
        SimpleLruFlush(ClogCtl, true);
        TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}

2. 将subtrans log在buffer中的脏数据刷到pg_subtrans 目录下对应的文件中。
CheckPointSUBTRANS@src/backend/access/transam/subtrans.c

/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 */
void
CheckPointSUBTRANS(void)
{
        /*
         * Flush dirty SUBTRANS pages to disk
         *
         * This is not actually necessary from a correctness point of view. We do
         * it merely to improve the odds that writing of dirty pages is done by
         * the checkpoint process and not by backends.
         */
        TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true);
        SimpleLruFlush(SubTransCtl, true);
        TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}

3. 将 MultiXact  log在buffer中的脏数据刷到 pg_multixact 目录下对应的文件中。
CheckPointMultiXact@src/backend/access/transam/multixact.c

/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 */
void
CheckPointMultiXact(void)
{
        TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);

        /* Flush dirty MultiXact pages to disk */
        SimpleLruFlush(MultiXactOffsetCtl, true);
        SimpleLruFlush(MultiXactMemberCtl, true);

        TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
}

4. Flush dirty SLRU (simple least-recently-used) pages to disk
CheckPointPredicate@src/backend/storage/lmgr/predicate.c

/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 *
 * We don't have any data that needs to survive a restart, but this is a
 * convenient place to truncate the SLRU.
 */
void
CheckPointPredicate(void)
{
        int                     tailPage;

        LWLockAcquire(OldSerXidLock, LW_EXCLUSIVE);

        /* Exit quickly if the SLRU is currently not in use. */
        if (oldSerXidControl->headPage < 0)
        {
                LWLockRelease(OldSerXidLock);
                return;
        }

        if (TransactionIdIsValid(oldSerXidControl->tailXid))
        {
                /* We can truncate the SLRU up to the page containing tailXid */
                tailPage = OldSerXidPage(oldSerXidControl->tailXid);
        }
        else
        {
                /*
                 * The SLRU is no longer needed. Truncate to head before we set head
                 * invalid.
                 *
                 * XXX: It's possible that the SLRU is not needed again until XID
                 * wrap-around has happened, so that the segment containing headPage
                 * that we leave behind will appear to be new again. In that case it
                 * won't be removed until XID horizon advances enough to make it
                 * current again.
                 */
                tailPage = oldSerXidControl->headPage;
                oldSerXidControl->headPage = -1;
        }

        LWLockRelease(OldSerXidLock);

        /* Truncate away pages that are no longer required */
        SimpleLruTruncate(OldSerXidSlruCtl, tailPage);

        /*
         * Flush dirty SLRU pages to disk
         *
         * This is not actually necessary from a correctness point of view. We do
         * it merely as a debugging aid.
         *
         * We're doing this after the truncation to avoid writing pages right
         * before deleting the file in which they sit, which would be completely
         * pointless.
         */
        SimpleLruFlush(OldSerXidSlruCtl, true);
}

前面4个调用,全部用到了SimpleLruFlush来完成刷缓存的动作。
SimpleLruFlush@src/backend/access/transam/slru.c

/*
 * Flush dirty pages to disk during checkpoint or database shutdown
 */
void
SimpleLruFlush(SlruCtl ctl, bool checkpoint)
{
        SlruShared      shared = ctl->shared;
        SlruFlushData fdata;
        int                     slotno;
        int                     pageno = 0;
        int                     i;
        bool            ok;

        /*
         * Find and write dirty pages
         */
        fdata.num_files = 0;

        LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);  // 注意每次都要获取排他锁

        for (slotno = 0; slotno < shared->num_slots; slotno++)
        {
                SlruInternalWritePage(ctl, slotno, &fdata);   // 这个可能会是比较重的操作

                /*
                 * When called during a checkpoint, we cannot assert that the slot is
                 * clean now, since another process might have re-dirtied it already.
                 * That's okay.
                 */
                Assert(checkpoint ||
                           shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
                           (shared->page_status[slotno] == SLRU_PAGE_VALID &&
                                !shared->page_dirty[slotno]));
        }

        LWLockRelease(shared->ControlLock);

        /*
         * Now fsync and close any files that were open
         */
        ok = true;
        for (i = 0; i < fdata.num_files; i++)
        {
                if (ctl->do_fsync && pg_fsync(fdata.fd[i]))
                {
                        slru_errcause = SLRU_FSYNC_FAILED;
                        slru_errno = errno;
                        pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
                        ok = false;
                }

                if (CloseTransientFile(fdata.fd[i]))
                {
                        slru_errcause = SLRU_CLOSE_FAILED;
                        slru_errno = errno;
                        pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
                        ok = false;
                }
        }
        if (!ok)
                SlruReportIOError(ctl, pageno, InvalidTransactionId);
}


5. 将rel mapper文件缓存写入文件, 什么是rel mapper文件呢?
rel mapper存储了一些数据库全局对象和文件ID的映射关系,一般的对象这种关系存储在全局对象pg_class.relfilenode中。

 * For most tables, the physical file underlying the table is specified by
 * pg_class.relfilenode.  However, that obviously won't work for pg_class
 * itself, nor for the other "nailed" catalogs for which we have to be able
 * to set up working Relation entries without access to pg_class.  It also
 * does not work for shared catalogs, since there is no practical way to
 * update other databases' pg_class entries when relocating a shared catalog.
 * Therefore, for these special catalogs (henceforth referred to as "mapped
 * catalogs") we rely on a separately maintained file that shows the mapping
 * from catalog OIDs to filenode numbers.  Each database has a map file for
 * its local mapped catalogs, and there is a separate map file for shared
 * catalogs.  Mapped catalogs have zero in their pg_class.relfilenode entries.

rel mapping文件名:
每个数据库有一个pg_filenode.map文件,全局还有一个pg_filenode.map文件。
这些文件分别放在表空间/database_oid/目录和global/目录下。

/*
 * The map file is critical data: we have no automatic method for recovering
 * from loss or corruption of it.  We use a CRC so that we can detect
 * corruption.  To minimize the risk of failed updates, the map file should
 * be kept to no more than one standard-size disk sector (ie 512 bytes),
 * and we use overwrite-in-place rather than playing renaming games.
 * The struct layout below is designed to occupy exactly 512 bytes, which
 * might make filesystem updates a bit more efficient.
 *
 * Entries in the mappings[] array are in no particular order.  We could
 * speed searching by insisting on OID order, but it really shouldn't be
 * worth the trouble given the intended size of the mapping sets.
 */
#define RELMAPPER_FILENAME              "pg_filenode.map"

CheckPointRelationMap@src/backend/utils/cache/relmapper.c

/*
 * CheckPointRelationMap
 *
 * This is called during a checkpoint.  It must ensure that any relation map
 * updates that were WAL-logged before the start of the checkpoint are
 * securely flushed to disk and will not need to be replayed later.  This
 * seems unlikely to be a performance-critical issue, so we use a simple
 * method: we just take and release the RelationMappingLock.  This ensures
 * that any already-logged map update is complete, because write_relmap_file
 * will fsync the map file before the lock is released.
 */
void
CheckPointRelationMap(void)
{
        LWLockAcquire(RelationMappingLock, LW_SHARED); // 隐式fsync, 加锁前会自动完成fsync.
        LWLockRelease(RelationMappingLock);
}


6. 将流复制replication slots信息刷到pg_replslot目录下对应的文件中。
CheckPointReplicationSlots@src/backend/replication/slot.c

/*
 * Flush all replication slots to disk.
 *
 * This needn't actually be part of a checkpoint, but it's a convenient
 * location.
 */
void
CheckPointReplicationSlots(void)
{
        int                     i;

        elog(DEBUG1, "performing replication slot checkpoint");

        /*
         * Prevent any slot from being created/dropped while we're active. As we
         * explicitly do *not* want to block iterating over replication_slots or
         * acquiring a slot we cannot take the control lock - but that's OK,
         * because holding ReplicationSlotAllocationLock is strictly stronger, and
         * enough to guarantee that nobody can change the in_use bits on us.
         */
        LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED);

        for (i = 0; i < max_replication_slots; i++)
        {
                ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
                char            path[MAXPGPATH];

                if (!s->in_use)
                        continue;

                /* save the slot to disk, locking is handled in SaveSlotToPath() */
                sprintf(path, "pg_replslot/%s", NameStr(s->data.name));
                SaveSlotToPath(s, path, LOG);
        }
        LWLockRelease(ReplicationSlotAllocationLock);
}


7. 清理逻辑复制不再需要的序列化快照,即删除pg_logical/snapshots目录下已过期(LSN早于cutoff)的快照文件。注意该函数只做清理,并不刷脏数据。
CheckPointSnapBuild@src/backend/replication/logical/snapbuild.c

/*
 * Remove all serialized snapshots that are not required anymore because no
 * slot can need them. This doesn't actually have to run during a checkpoint,
 * but it's a convenient point to schedule this.
 *
 * NB: We run this during checkpoints even if logical decoding is disabled so
 * we cleanup old slots at some point after it got disabled.
 */
void
CheckPointSnapBuild(void)
{
        XLogRecPtr      cutoff;
        XLogRecPtr      redo;
        DIR                *snap_dir;
        struct dirent *snap_de;
        char            path[MAXPGPATH];

        /*
         * We start of with a minimum of the last redo pointer. No new replication
         * slot will start before that, so that's a safe upper bound for removal.
         */
        redo = GetRedoRecPtr();

        /* now check for the restart ptrs from existing slots */
        cutoff = ReplicationSlotsComputeLogicalRestartLSN();

        /* don't start earlier than the restart lsn */
        if (redo < cutoff)
                cutoff = redo;

        snap_dir = AllocateDir("pg_logical/snapshots");
        while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
        {
                uint32          hi;
                uint32          lo;
                XLogRecPtr      lsn;
                struct stat statbuf;
                if (strcmp(snap_de->d_name, ".") == 0 ||
                        strcmp(snap_de->d_name, "..") == 0)
                        continue;

                snprintf(path, MAXPGPATH, "pg_logical/snapshots/%s", snap_de->d_name);

                if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
                {
                        elog(DEBUG1, "only regular files expected: %s", path);
                        continue;
                }

                /*
                 * temporary filenames from SnapBuildSerialize() include the LSN and
                 * everything but are postfixed by .$pid.tmp. We can just remove them
                 * the same as other files because there can be none that are
                 * currently being written that are older than cutoff.
                 *
                 * We just log a message if a file doesn't fit the pattern, it's
                 * probably some editors lock/state file or similar...
                 */
                if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
                {
                        ereport(LOG,
                                        (errmsg("could not parse file name \"%s\"", path)));
                        continue;
                }

                lsn = ((uint64) hi) << 32 | lo;

                /* check whether we still need it */
                if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
                {
                        elog(DEBUG1, "removing snapbuild snapshot %s", path);

                        /*
                         * It's not particularly harmful, though strange, if we can't
                         * remove the file here. Don't prevent the checkpoint from
                         * completing, that'd be cure worse than the disease.
                         */
                        if (unlink(path) < 0)
                        {
                                ereport(LOG,
                                                (errcode_for_file_access(),
                                                 errmsg("could not remove file \"%s\": %m",
                                                                path)));
                                continue;
                        }
                }
        }
        FreeDir(snap_dir);
}

8. 清理逻辑复制不再需要的rewrite映射文件,并将pg_logical/mappings目录下仍需保留的映射文件fsync到磁盘。
CheckPointLogicalRewriteHeap@src/backend/access/heap/rewriteheap.c

/* ---
 * Perform a checkpoint for logical rewrite mappings
 *
 * This serves two tasks:
 * 1) Remove all mappings not needed anymore based on the logical restart LSN
 * 2) Flush all remaining mappings to disk, so that replay after a checkpoint
 *        only has to deal with the parts of a mapping that have been written out
 *        after the checkpoint started.
 * ---
 */
void
CheckPointLogicalRewriteHeap(void)
{
        XLogRecPtr      cutoff;
        XLogRecPtr      redo;
        DIR                *mappings_dir;
        struct dirent *mapping_de;
        char            path[MAXPGPATH];

        /*
         * We start of with a minimum of the last redo pointer. No new decoding
         * slot will start before that, so that's a safe upper bound for removal.
         */
        redo = GetRedoRecPtr();

        /* now check for the restart ptrs from existing slots */
        cutoff = ReplicationSlotsComputeLogicalRestartLSN();

        /* don't start earlier than the restart lsn */
        if (cutoff != InvalidXLogRecPtr && redo < cutoff)
                cutoff = redo;

        mappings_dir = AllocateDir("pg_logical/mappings");
        while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
        {
                struct stat statbuf;
                Oid                     dboid;
                Oid                     relid;
                XLogRecPtr      lsn;
                TransactionId rewrite_xid;
                TransactionId create_xid;
                uint32          hi,
                                        lo;

                if (strcmp(mapping_de->d_name, ".") == 0 ||
                        strcmp(mapping_de->d_name, "..") == 0)
                        continue;

                snprintf(path, MAXPGPATH, "pg_logical/mappings/%s", mapping_de->d_name);
                if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
                        continue;

                /* Skip over files that cannot be ours. */
                if (strncmp(mapping_de->d_name, "map-", 4) != 0)
                        continue;

                if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
                                   &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6)
                        elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
                lsn = ((uint64) hi) << 32 | lo;

                if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
                {
                        elog(DEBUG1, "removing logical rewrite file \"%s\"", path);
                        if (unlink(path) < 0)
                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not remove file \"%s\": %m", path)));
                }
                else
                {
                        int                     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);

                        /*
                         * The file cannot vanish due to concurrency since this function
                         * is the only one removing logical mappings and it's run while
                         * CheckpointLock is held exclusively.
                         */
                        if (fd < 0)
                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not open file \"%s\": %m", path)));

                        /*
                         * We could try to avoid fsyncing files that either haven't
                         * changed or have only been created since the checkpoint's start,
                         * but it's currently not deemed worth the effort.
                         */
                        else if (pg_fsync(fd) != 0)
                                ereport(ERROR,
                                                (errcode_for_file_access(),
                                                 errmsg("could not fsync file \"%s\": %m", path)));
                        CloseTransientFile(fd);
                }
        }
        FreeDir(mappings_dir);
}


9. 将预提交(2PC)事务状态相关脏数据刷入pg_twophase 目录下对应的文件。
如果没有开启2PC(#max_prepared_transactions = 0 ),这里不需要操作。
CheckPointTwoPhase(checkPointRedo)@src/backend/access/transam/twophase.c

/*
 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
 *
 * We must fsync the state file of any GXACT that is valid and has a PREPARE
 * LSN <= the checkpoint's redo horizon.  (If the gxact isn't valid yet or
 * has a later LSN, this checkpoint is not responsible for fsyncing it.)
 *
 * This is deliberately run as late as possible in the checkpoint sequence,
 * because GXACTs ordinarily have short lifespans, and so it is quite
 * possible that GXACTs that were valid at checkpoint start will no longer
 * exist if we wait a little bit.
 *
 * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
 * each time.  This is considered unusual enough that we don't bother to
 * expend any extra code to avoid the redundant fsyncs.  (They should be
 * reasonably cheap anyway, since they won't cause I/O.)
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
        TransactionId *xids;
        int                     nxids;
        char            path[MAXPGPATH];
        int                     i;
......
        if (max_prepared_xacts <= 0)
                return;                                 /* nothing to do */
......


10. 将shared buffer中的在检查点之前(这个说法并不严谨,也可能包含检查点开始后某一个时间差内产生的脏数据,见BufferSync@src/backend/storage/buffer/bufmgr.c)产生的脏数据块刷入磁盘,但是同样可能需要全扫描整个缓存内存区。
原因下一篇再讲。
CheckPointBuffers(flags)@src/backend/storage/buffer/bufmgr.c

/*
 * CheckPointBuffers
 *
 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
 *
 * Note: temporary relations do not participate in checkpoints, so they don't
 * need to be flushed.
 */
void
CheckPointBuffers(int flags)
{
        TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
        CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
        BufferSync(flags);
        CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
        TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
        smgrsync();
        CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
        TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
}


[小结]
1. 从锁冲突角度来分析,可能会带来较大影响的有刷commit log,刷buffer。
2. 从数量级和IO层面分析,主观判断除了 CheckPointBuffers(flags)@src/backend/storage/buffer/bufmgr.c,其他几个刷缓存的动作应该都很快,不会有太大的冲突或影响。
但是这些都只是主观判断,还需要有测试数据来提供支撑。
跟踪锁冲突的次数和耗时,跟踪每个刷缓存函数的耗时。
跟踪的内容将留到后面的篇幅来讲。

[参考]
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值