PostgreSQL: Why does the checkpointer impact performance so much? - 5

Postgres2015全国用户大会将于11月20至21日在北京丽亭华苑酒店召开。本次大会嘉宾阵容强大,国内顶级PostgreSQL数据库专家将悉数到场,并特邀欧洲、俄罗斯、日本、美国等国家和地区的数据库方面专家助阵:

  • Postgres-XC项目的发起人铃木市一(SUZUKI Koichi)
  • Postgres-XL的项目发起人Mason Sharp
  • pgpool的作者石井达夫(Tatsuo Ishii)
  • PG-Strom的作者海外浩平(Kaigai Kohei)
  • Greenplum研发总监姚延栋
  • 周正中(德哥), PostgreSQL中国用户会创始人之一
  • 汪洋,平安科技数据库技术部经理
  • ……


 
  • 2015年度PG大象会报名地址:http://postgres2015.eventdove.com/
  • PostgreSQL中国社区: http://postgres.cn/
  • PostgreSQL专业1群: 3336901(已满)
  • PostgreSQL专业2群: 100910388
  • PostgreSQL专业3群: 150657323



我们前面分析了checkpoint的原理,并且对checkpoint的整个过程做了跟踪,但是并未发现到底是什么造成tps的下降。
本文主要针对PostgreSQL的lwlock重点跟踪一下:
src/backend/storage/lmgr/lwlock.c

                TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(l), T_ID(l), mode);

                for (;;)
                {
                        /* "false" means cannot accept cancel/die interrupt here. */
                        PGSemaphoreLock(&proc->sem, false);
                        if (!proc->lwWaiting)
                                break;
                        extraWaits++;
                }

                TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(l), T_ID(l), mode);

src/include/storage/lwlock.h

/* Lock request modes accepted by the LWLock machinery. */
typedef enum LWLockMode
{
        LW_EXCLUSIVE,                   /* exclusive access (single holder) */
        LW_SHARED,                      /* shared access (multiple concurrent holders) */
        LW_WAIT_UNTIL_FREE                      /* A special mode used in PGPROC->lwlockMode,
                                                                 * when waiting for lock to become free. Not
                                                                 * to be used as LWLockAcquire argument */
} LWLockMode;
lwlock-wait-start (char *, int, LWLockMode) Probe that fires when an LWLock was not immediately available and a server process has begun to wait for the lock to become available. arg0 is the LWLock's tranche. arg1 is the LWLock's offset within its tranche. arg2 is the requested lock mode, either exclusive or shared.
lwlock-wait-done (char *, int, LWLockMode) Probe that fires when a server process has been released from its wait for an LWLock (it does not actually have the lock yet). arg0 is the LWLock's tranche. arg1 is the LWLock's offset within its tranche. arg2 is the requested lock mode, either exclusive or shared.

$ vi test.sql
\setrandom id 1 50000000
update tbl set info=now(),crt_time=now() where id=:id;
$ pgbench -M prepared -n -r -f ./test.sql -c 3 -j 3 -P 2 -T 100000000
progress: 776.0 s, 13169.7 tps, lat 0.226 ms stddev 0.063
progress: 778.0 s, 7930.0 tps, lat 0.377 ms stddev 0.096
progress: 780.0 s, 7660.1 tps, lat 0.390 ms stddev 0.108
progress: 782.0 s, 8118.8 tps, lat 0.368 ms stddev 0.098
progress: 784.0 s, 8576.5 tps, lat 0.348 ms stddev 0.093
progress: 786.0 s, 9251.4 tps, lat 0.323 ms stddev 0.085
progress: 788.0 s, 9812.2 tps, lat 0.304 ms stddev 0.078
progress: 790.0 s, 10036.8 tps, lat 0.297 ms stddev 0.074
progress: 792.0 s, 10359.1 tps, lat 0.288 ms stddev 0.081
progress: 794.0 s, 10675.3 tps, lat 0.280 ms stddev 0.077
progress: 796.0 s, 10904.1 tps, lat 0.274 ms stddev 0.071
progress: 798.0 s, 11470.2 tps, lat 0.260 ms stddev 0.076
progress: 800.0 s, 11909.3 tps, lat 0.250 ms stddev 0.074
progress: 802.0 s, 12297.4 tps, lat 0.243 ms stddev 0.073
progress: 804.0 s, 12463.8 tps, lat 0.239 ms stddev 0.075
progress: 806.0 s, 12513.3 tps, lat 0.238 ms stddev 0.076
progress: 808.0 s, 12669.9 tps, lat 0.235 ms stddev 0.068

以13000为TPS,3个并发连接,一次请求平均需要约230.8微秒.
1000000*(1/(13000/3))

digoal=# select pid,query from pg_stat_activity;
 pid  |                         query                         
------+-------------------------------------------------------
 4249 | update tbl set info=now(),crt_time=now() where id=$1;
 5527 | autovacuum: VACUUM ANALYZE digoal.tbl
 4250 | update tbl set info=now(),crt_time=now() where id=$1;
 4251 | update tbl set info=now(),crt_time=now() where id=$1;
 4134 | select pid,query from pg_stat_activity;
(5 rows)
ps -ewf|grep 1484
postgres  1484 30863  4 12:35 ?        00:02:17 postgres: wal writer process 

跟踪这几个进程的锁等待情况。

# stap -DMAXSKIPPED=100000 -v 11111 -e '
/* Measure LWLock wait durations per pid for the three pgbench backends
 * (4249-4251) and the wal writer (1484), reporting once per second. */
global s_var, e_var, stat_var;

/* probe lwlock__wait__start(const char *, int, LWLockMode); */
probe process("/opt/pgsql/bin/postgres").mark("lwlock__wait__start") {
  s_var[pid()] = gettimeofday_us()
}

/* probe lwlock__wait__done(const char *, int, LWLockMode);
 * BUG FIX: this probe must attach to lwlock__wait__done, not
 * lwlock__wait__start again -- otherwise the script measures the interval
 * between two consecutive wait-starts instead of the actual wait time. */
probe process("/opt/pgsql/bin/postgres").mark("lwlock__wait__done") {
  e_var[pid()] = gettimeofday_us()
  if ( s_var[pid()] > 0 && ((pid() >= 4249 && pid() <= 4251) || pid() == 1484) )
    stat_var[pid()] <<< e_var[pid()] - s_var[pid()]
}

/* every second: dump per-pid wait statistics, then reset all state */
probe timer.s(1) {
  foreach ([v1] in stat_var +) {
    if ( @count(stat_var[v1]) >0 ) {
      printf("pid: %d, min: %d, max: %d, avg: %d, sum: %d, count: %d\n", v1, @min(stat_var[v1]), @max(stat_var[v1]), @avg(stat_var[v1]), @sum(stat_var[v1]), @count(stat_var[v1]))
    }
  }
  printf("----------------------------------end-----------------------------\n")
  delete s_var
  delete e_var
  delete stat_var
}'

当检查点发生时,WAL writer process出现了比较多的lwlock等待。
1484是wal writer process,其他几个是数据库测试进程。
从count列可以看出,检查点发生后,数据库测试进程的等待次数变少了。
从avg来看,等待的平均时间发生了细微的变化,大概增加了1微秒的平均时间,这个等待并不足以造成性能下降。
因为正常的一次请求需要用掉约230.8微秒,1微秒的增量可以忽略。

pid: 1484, min: 0, max: 3, avg: 1, sum: 25, count: 19
pid: 4249, min: 0, max: 2, avg: 0, sum: 161, count: 214
pid: 4251, min: 0, max: 3, avg: 0, sum: 181, count: 227
pid: 4250, min: 0, max: 2, avg: 0, sum: 181, count: 232
----------------------------------end-----------------------------
pid: 4249, min: 0, max: 3, avg: 0, sum: 134, count: 150
pid: 4250, min: 0, max: 5, avg: 1, sum: 167, count: 157
pid: 4251, min: 0, max: 3, avg: 0, sum: 145, count: 168
pid: 1484, min: 0, max: 3, avg: 0, sum: 142, count: 255
----------------------------------end-----------------------------
pid: 4250, min: 0, max: 6, avg: 2, sum: 187, count: 75
pid: 4251, min: 0, max: 3, avg: 1, sum: 88, count: 83
pid: 4249, min: 0, max: 3, avg: 0, sum: 102, count: 107
pid: 1484, min: 0, max: 6, avg: 0, sum: 302, count: 467
----------------------------------end-----------------------------
pid: 4250, min: 0, max: 6, avg: 2, sum: 198, count: 84
pid: 4251, min: 0, max: 4, avg: 1, sum: 124, count: 102
pid: 4249, min: 0, max: 2, avg: 0, sum: 98, count: 108
pid: 1484, min: 0, max: 4, avg: 0, sum: 380, count: 633
----------------------------------end-----------------------------
pid: 4250, min: 0, max: 6, avg: 2, sum: 197, count: 77
pid: 4251, min: 0, max: 4, avg: 1, sum: 106, count: 102
pid: 4249, min: 0, max: 2, avg: 0, sum: 101, count: 111
pid: 1484, min: 0, max: 4, avg: 0, sum: 396, count: 719
----------------------------------end-----------------------------
pid: 4249, min: 0, max: 4, avg: 1, sum: 88, count: 85
pid: 4251, min: 0, max: 6, avg: 1, sum: 102, count: 91
pid: 4250, min: 0, max: 6, avg: 2, sum: 221, count: 108
pid: 1484, min: 0, max: 4, avg: 0, sum: 360, count: 705
----------------------------------end-----------------------------
pid: 4250, min: 0, max: 7, avg: 2, sum: 211, count: 93
pid: 4249, min: 0, max: 13, avg: 1, sum: 152, count: 124
pid: 4251, min: 0, max: 4, avg: 1, sum: 132, count: 130
pid: 1484, min: 0, max: 5, avg: 0, sum: 419, count: 730
----------------------------------end-----------------------------


从另一个方面来观察,看看检查点发生时单次请求,PG所有函数的处理时间如何?

postgres  3167  2767 93 13:59 ?        00:00:47 postgres: digoal digoal [local] UPDATE

但是函数量太大,会死机没法跟踪,即使不死机,也不合理,因为正常1毫秒以内的请求,使用STAP跟踪后,额外开销比较大,会上升到100多毫秒。

# stap -v 11111 -x 3167 -e '
/* Per-function latency profile for one PostgreSQL backend (-x 3167):
 * time every function under src/ and report the 10 heaviest per second.
 * NOTE(review): f_start is keyed by (execname, pid, tid, cpu); a recursive
 * call overwrites the entry timestamp, so recursive functions are
 * under-measured -- acceptable for this coarse survey. */
global f_start[999999],f_stop[999999]

probe process("/opt/pgsql/bin/postgres").function("*@/opt/soft_bak/postgresql-9.4.1/src/*").call { 
  /* record the entry timestamp for this execname/pid/tid/cpu tuple */
  f_start[execname(), pid(), tid(), cpu()] = gettimeofday_ns()
}

probe process("/opt/pgsql/bin/postgres").function("*@/opt/soft_bak/postgresql-9.4.1/src/*").return { 
  t=gettimeofday_ns()   /* return timestamp */
  a=execname()
  b=cpu()
  c=pid()
  d=pp()                /* full probe point string: identifies the function */
  e=tid()
  if (f_start[a,c,e,b]) {
    /* accumulate elapsed nanoseconds per (execname, probe point) */
    f_stop[a,d] <<< t - f_start[a,c,e,b]
  }
}

probe timer.s(1) {
  /* top 10 probe points by total time spent, then reset for the next second */
  foreach ([a,d] in f_stop @sum - limit 10 ) { 
    printf("avg_ns:%d, sum_ns:%d, cnt:%d, execname:%s, pp:%s\n", @avg(f_stop[a,d]), @sum(f_stop[a,d]), @count(f_stop[a,d]), a, d)
  }
delete f_start
delete f_stop
}'

例如:

开启stap
digoal=> update tbl set info=now() where id=1;
UPDATE 1
Time: 1.129 ms
开启stap
digoal=> update tbl set info=now() where id=1;
UPDATE 1
Time: 167.001 ms


可以缩小 function("*@/opt/soft_bak/postgresql-9.4.1/src/*")的范围,再跟踪。
例如我们首先怀疑是锁的问题,那么可以缩小范围到 function("*@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/*")
  
  
# stap -v 11111 -x 3167 -e '
/* Same profiler as before, restricted to the lock-manager sources
 * (src/backend/storage/lmgr). Reports the 10 heaviest probe points
 * by total time once per second. */
global entry_ts[999999],fn_elapsed[999999]

probe process("/opt/pgsql/bin/postgres").function("*@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/*").call {
  /* stamp function entry, keyed by execname/pid/tid/cpu */
  entry_ts[execname(), pid(), tid(), cpu()] = gettimeofday_ns()
}

probe process("/opt/pgsql/bin/postgres").function("*@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/*").return {
  now = gettimeofday_ns()
  who = execname()
  point = pp()
  if (entry_ts[who, pid(), tid(), cpu()]) {
    /* fold elapsed ns into the per-(execname, probe point) aggregate */
    fn_elapsed[who, point] <<< now - entry_ts[who, pid(), tid(), cpu()]
  }
}

probe timer.s(1) {
  foreach ([who, point] in fn_elapsed @sum - limit 10 ) {
    printf("avg_ns:%d, sum_ns:%d, cnt:%d, execname:%s, pp:%s\n", @avg(fn_elapsed[who, point]), @sum(fn_elapsed[who, point]), @count(fn_elapsed[who, point]), who, point)
  }
printf("-------------------------end--------------------------\n")
delete entry_ts
delete fn_elapsed
}'

结果截取:

-------------------------end--------------------------
avg_ns:3108, sum_ns:80113700, cnt:25772, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LWLockRelease@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lwlock.c:1118").return
avg_ns:3237, sum_ns:74622113, cnt:23051, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LWLockAcquire@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lwlock.c:465").return
avg_ns:10610, sum_ns:37179631, cnt:3504, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LockRelationOid@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lmgr.c:87").return
avg_ns:7832, sum_ns:34298340, cnt:4379, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LockAcquire@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lock.c:672").return
-------------------------end-------------------------- 发生检查点后,请求量变少了.
avg_ns:3122, sum_ns:70382918, cnt:22539, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LWLockRelease@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lwlock.c:1118").return
avg_ns:3302, sum_ns:66548991, cnt:20152, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LWLockAcquire@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lwlock.c:465").return
avg_ns:10591, sum_ns:32493427, cnt:3068, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LockRelationOid@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lmgr.c:87").return
avg_ns:7832, sum_ns:30037745, cnt:3835, execname:postgres, pp:process("/opt/pgsql9.4.1/bin/postgres").function("LockAcquire@/opt/soft_bak/postgresql-9.4.1/src/backend/storage/lmgr/lock.c:672").return
-------------------------end--------------------------

每秒需要获得和释放几百万次的LWLock。排名前4的耗时较大。
当发生检查点时,请求量变少了,和前一篇跟踪LWLock等待的结果是一样的,请求量变少。
看样子也不是这个原因导致的性能下降。
原因还是回到代码,从第四篇的跟踪来看,性能下降出现在FLUSH BUFFER的循环中,也就是从这里开始:

        TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);

        /*
         * Loop over all buffers again, and write the ones (still) marked with
         * BM_CHECKPOINT_NEEDED.  In this loop, we start at the clock sweep point
         * since we might as well dump soon-to-be-recycled buffers first.
         *
         * Note that we don't read the buffer alloc count here --- that should be
         * left untouched till the next BgBufferSync() call.
         */
        buf_id = StrategySyncStart(NULL, NULL);
        num_to_scan = NBuffers;
        num_written = 0;
        while (num_to_scan-- > 0)
        {
......


StrategySyncStart是指根据LRU算法,让checkpointer从 nextVictimBuffer 开始sync。

/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
        int                     result;

        /* take the freelist lock exclusively so the clock-sweep state is
         * read (and the alloc counter reset) as one consistent snapshot */
        LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
        result = StrategyControl->nextVictimBuffer;
        if (complete_passes)
                *complete_passes = StrategyControl->completePasses;
        if (num_buf_alloc)
        {
                /* report and reset the allocation counter under the lock */
                *num_buf_alloc = StrategyControl->numBufferAllocs;
                StrategyControl->numBufferAllocs = 0;
        }
        LWLockRelease(BufFreelistLock);
        return result;
}


但是对于我们的测试用例来说,实际上几乎所有的dirty buffer都是热的。从哪里开始都一样。
所以,性能下降与这个并无关联:
1. sync buffer时,可能有短暂的锁冲突,但是,从我们前面LWLock和再前面的跟踪结果来看影响极小,几乎可以忽略。
2. 检查点会将脏块同步到磁盘,然后脏块会变成非脏块,但是它依旧还在shared buffer中,所以紧接着的操作也不需要物理读(md read)。
  
  
$ pgbench -M prepared -n -r -f ./test.sql -P 1 -c 26 -j 26 -T 10000000
progress: 1.0 s, 13550.0 tps, lat 1.891 ms stddev 3.474
progress: 2.0 s, 15505.7 tps, lat 1.655 ms stddev 3.549
progress: 3.0 s, 18030.0 tps, lat 1.443 ms stddev 3.210
progress: 4.0 s, 20844.0 tps, lat 1.258 ms stddev 2.867
progress: 5.0 s, 25317.6 tps, lat 1.025 ms stddev 2.127
progress: 6.0 s, 31146.7 tps, lat 0.833 ms stddev 1.702
progress: 7.0 s, 40137.8 tps, lat 0.646 ms stddev 0.975
progress: 8.0 s, 51564.9 tps, lat 0.502 ms stddev 0.368
progress: 9.0 s, 53915.6 tps, lat 0.480 ms stddev 0.311
progress: 10.0 s, 56830.9 tps, lat 0.456 ms stddev 0.281
progress: 11.0 s, 57333.8 tps, lat 0.452 ms stddev 0.283
progress: 12.0 s, 58938.8 tps, lat 0.439 ms stddev 0.270
progress: 13.0 s, 59608.3 tps, lat 0.434 ms stddev 0.265
progress: 14.0 s, 59014.8 tps, lat 0.439 ms stddev 0.281
progress: 15.0 s, 60241.9 tps, lat 0.430 ms stddev 0.259
progress: 16.0 s, 61337.1 tps, lat 0.422 ms stddev 0.248
progress: 17.0 s, 60821.9 tps, lat 0.426 ms stddev 0.259
progress: 18.0 s, 60850.2 tps, lat 0.425 ms stddev 0.265

我们可以来跟踪一下物理读和物理写,你就可以看到,检查点发生后,其实并没有物理读。
smgr-md-read-start (ForkNumber, BlockNumber, Oid, Oid, Oid, int) Probe that fires when beginning to read a block from a relation. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer.
smgr-md-read-done (ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int) Probe that fires when a block read is complete. arg0 and arg1 contain the fork and block numbers of the page. arg2, arg3, and arg4 contain the tablespace, database, and relation OIDs identifying the relation. arg5 is the ID of the backend which created the temporary relation for a local buffer, or InvalidBackendId (-1) for a shared buffer. arg6 is the number of bytes actually read, while arg7 is the number requested (if these are different it indicates trouble).
src/backend/storage/buffer/bufmgr.c

/*
 * ReadBuffer_common -- common logic for all ReadBuffer variants
 *
 * *hit is set to true if the request was satisfied from shared buffer cache.
 */
static Buffer
ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                  BlockNumber blockNum, ReadBufferMode mode,
                                  BufferAccessStrategy strategy, bool *hit)
{
......
        probe buffer__read__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool);
        probe buffer__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool, bool);

跟踪读写:

# stap -DMAXSKIPPED=100000 -v 11111 -e '
/* Measure physical reads (smgr md read) and buffer writes per pid.
 * The second key distinguishes the event class: 1 = md read, 2 = buffer
 * write. Statistics are printed and reset once per second. */
global s_var, e_var, stat_var;

/* probe smgr__md__read__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); */
probe process("/opt/pgsql/bin/postgres").mark("smgr__md__read__start") {
  s_var[pid(),1] = gettimeofday_us()
}

/* probe smgr__md__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); */
probe process("/opt/pgsql/bin/postgres").mark("smgr__md__read__done") {
  e_var[pid(),1] = gettimeofday_us()
  if ( s_var[pid(),1] > 0 )
    stat_var[pid(),1] <<< e_var[pid(),1] - s_var[pid(),1]
}

/* probe buffer__write__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool); */
probe process("/opt/pgsql/bin/postgres").mark("buffer__write__start") {
  s_var[pid(),2] = gettimeofday_us()
}

/* probe buffer__write__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool, bool); */
probe process("/opt/pgsql/bin/postgres").mark("buffer__write__done") {
  e_var[pid(),2] = gettimeofday_us()
  if ( s_var[pid(),2] > 0 )
    stat_var[pid(),2] <<< e_var[pid(),2] - s_var[pid(),2]
}

/* every second: dump per-(pid, event-class) latency stats, then reset */
probe timer.s(1) {
  foreach ([v1,v2] in stat_var +) {
    if ( @count(stat_var[v1,v2]) >0 ) {
      printf("pid: %d, what: %d, min: %d, max: %d, avg: %d, sum: %d, count: %d\n", v1, v2, @min(stat_var[v1,v2]), @max(stat_var[v1,v2]), @avg(stat_var[v1,v2]), @sum(stat_var[v1,v2]), @count(stat_var[v1,v2]))
    }
  }
  printf("----------------------------------end-----------------------------\n")
  delete s_var
  delete e_var
  delete stat_var
}'

开启压力测试:

$ pgbench -M prepared -n -r -f ./test.sql -P 1 -c 26 -j 26 -T 10000000
......
progress: 56.0 s, 60813.0 tps, lat 0.426 ms stddev 0.271
progress: 57.0 s, 59748.9 tps, lat 0.433 ms stddev 0.390
progress: 58.0 s, 61135.6 tps, lat 0.423 ms stddev 0.398
progress: 59.0 s, 64174.6 tps, lat 0.403 ms stddev 0.333
progress: 60.0 s, 59503.6 tps, lat 0.435 ms stddev 0.282
progress: 61.0 s, 60662.4 tps, lat 0.427 ms stddev 0.271
progress: 62.0 s, 60582.2 tps, lat 0.427 ms stddev 0.274
progress: 63.0 s, 61523.6 tps, lat 0.421 ms stddev 0.258
progress: 64.0 s, 62394.7 tps, lat 0.415 ms stddev 0.251
progress: 65.0 s, 60892.6 tps, lat 0.425 ms stddev 0.277
progress: 66.0 s, 60703.2 tps, lat 0.426 ms stddev 0.280
progress: 67.0 s, 61629.9 tps, lat 0.420 ms stddev 0.261
progress: 68.0 s, 61231.5 tps, lat 0.423 ms stddev 0.270

当tps进入顶峰,现在所有的热块都在内存中,可以看到现在没有物理读了。

----------------------------------end-----------------------------
----------------------------------end-----------------------------
----------------------------------end-----------------------------
----------------------------------end-----------------------------
----------------------------------end-----------------------------
----------------------------------end-----------------------------

执行检查点:

progress: 20.0 s, 60582.5 tps, lat 0.427 ms stddev 0.265
progress: 21.0 s, 40974.3 tps, lat 0.624 ms stddev 1.782
progress: 22.0 s, 14963.3 tps, lat 1.754 ms stddev 4.243
progress: 23.0 s, 16382.7 tps, lat 1.577 ms stddev 4.138
progress: 24.0 s, 20179.2 tps, lat 1.286 ms stddev 2.839
progress: 25.0 s, 23749.2 tps, lat 1.101 ms stddev 2.816
progress: 26.0 s, 27464.7 tps, lat 0.945 ms stddev 2.581
progress: 27.0 s, 33850.6 tps, lat 0.764 ms stddev 2.011
progress: 28.0 s, 43315.9 tps, lat 0.600 ms stddev 1.134
progress: 29.0 s, 50216.6 tps, lat 0.516 ms stddev 0.492
progress: 30.0 s, 52857.0 tps, lat 0.490 ms stddev 0.323
progress: 31.0 s, 53428.6 tps, lat 0.485 ms stddev 0.322
progress: 32.0 s, 54710.5 tps, lat 0.473 ms stddev 0.303
progress: 33.0 s, 57824.2 tps, lat 0.448 ms stddev 0.289
progress: 34.0 s, 59573.5 tps, lat 0.435 ms stddev 0.273
progress: 35.0 s, 58162.8 tps, lat 0.445 ms stddev 0.413
progress: 36.0 s, 57951.4 tps, lat 0.447 ms stddev 0.435

跟踪结果,我们只看到checkpointer在write buffer,并没有看到pgbench的连接产生物理读(md read):

----------------------------------end-----------------------------
pid: 18027, what: 2, min: 42, max: 101, avg: 49, sum: 57592, count: 1154
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 42, max: 124, avg: 51, sum: 520545, count: 10079
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 44, max: 191, avg: 65, sum: 464196, count: 7139
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 42, max: 145, avg: 53, sum: 467482, count: 8799
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 42, max: 131, avg: 53, sum: 447230, count: 8366
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 37, max: 175, avg: 57, sum: 429778, count: 7465
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 42, max: 10551, avg: 65, sum: 331686, count: 5065
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 41, max: 158, avg: 61, sum: 397299, count: 6429
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 29, max: 1173, avg: 61, sum: 672717, count: 10976
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 26, max: 11093, avg: 58, sum: 933458, count: 16027
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 27, max: 448, avg: 56, sum: 937812, count: 16633
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 27, max: 112, avg: 55, sum: 937420, count: 16996
----------------------------------end-----------------------------
pid: 18027, what: 2, min: 27, max: 98, avg: 54, sum: 431355, count: 7901
----------------------------------end-----------------------------


到目前为止,只能说,LWLOCK,WRITE BUFFER都没有对性能造成影响,我们并没有找到执行checkpoint时,带来的性能下降的真正原因。
下一篇将揭晓答案。

[参考]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值