This article compares the speed of traditional data loading (COPY) against pg_bulkload, in four scenarios:
1. COPY into an unlogged table.
2. COPY into a logged table.
3. pg_bulkload into an unlogged table.
4. pg_bulkload into a logged table.
The test environment is as follows.
Database build options:
pg93@db-172-16-3-150-> pg_config |grep CONFIG
CONFIGURE = '--prefix=/home/pg93/pgsql9.3.3' '--with-pgport=1922' '--with-perl' '--with-tcl' '--with-python' '--with-openssl' '--with-pam' '--without-ldap' '--with-libxml' '--with-libxslt' '--enable-thread-safety' '--with-wal-blocksize=64' '--with-blocksize=32' '--enable-dtrace' '--enable-debug' '--enable-cassert'
Database configuration (postgresql.conf):
listen_addresses = '0.0.0.0' # what IP address(es) to listen on;
port = 1921 # (change requires restart)
max_connections = 100 # (change requires restart)
superuser_reserved_connections = 3 # (change requires restart)
unix_socket_directories = '.' # comma-separated list of directories
unix_socket_permissions = 0700 # begin with 0 to use octal notation
tcp_keepalives_idle = 60 # TCP_KEEPIDLE, in seconds;
tcp_keepalives_interval = 10 # TCP_KEEPINTVL, in seconds;
tcp_keepalives_count = 10 # TCP_KEEPCNT;
shared_buffers = 4096MB # min 128kB
maintenance_work_mem = 512MB # min 1MB
shared_preload_libraries = 'pg_stat_statements' # (change requires restart)
vacuum_cost_delay = 10 # 0-100 milliseconds
vacuum_cost_limit = 10000 # 1-10000 credits
bgwriter_delay = 10ms # 10-10000ms between rounds
wal_level = hot_standby # minimal, archive, or hot_standby
synchronous_commit = off # synchronization level;
wal_buffers = 16MB # min 32kB, -1 sets based on shared_buffers
wal_writer_delay = 10ms # 1-10000 milliseconds
checkpoint_segments = 512 # in logfile segments, min 1, 16MB each
archive_mode = on # allows archiving to be done
archive_command = '/bin/date' # command to use to archive a logfile segment
max_wal_senders = 32 # max number of walsender processes
hot_standby = on # "on" allows queries during recovery
wal_receiver_status_interval = 1s # send replies at least this often
hot_standby_feedback = on # send info from standby to prevent
random_page_cost = 2.0 # same scale as above
effective_cache_size = 96GB
log_destination = 'csvlog' # Valid values are combinations of
logging_collector = on # Enable capturing of stderr and csvlog
log_directory = 'pg_log' # directory where log files are written,
log_truncate_on_rotation = on # If on, an existing log file with the
log_min_messages = log # values in order of decreasing detail:
log_checkpoints = on
log_connections = on
log_disconnections = on
log_error_verbosity = verbose # terse, default, or verbose messages
log_lock_waits = on # log lock waits >= deadlock_timeout
log_statement = 'ddl' # none, ddl, mod, all
log_timezone = 'PRC'
track_activities = on
track_counts = on
track_functions = all # none, pl, all
track_activity_query_size = 1024 # (change requires restart)
autovacuum = off # Enable autovacuum subprocess? 'on'
log_autovacuum_min_duration = 0 # -1 disables, 0 logs all actions and
autovacuum_naptime = 3s # time between autovacuum runs
autovacuum_vacuum_scale_factor = 0.0002 # fraction of table size before vacuum
autovacuum_analyze_scale_factor = 0.0001 # fraction of table size before analyze
datestyle = 'iso, mdy'
timezone = 'PRC'
lc_messages = 'C' # locale for system error message
lc_monetary = 'C' # locale for monetary formatting
lc_numeric = 'C' # locale for number formatting
lc_time = 'C' # locale for time formatting
default_text_search_config = 'pg_catalog.english'
pg_stat_statements.max = 1000
pg_stat_statements.track = all
Databases and collation:
pg93@db-172-16-3-150-> psql
psql (9.3.3)
Type "help" for help.
digoal=# \l
                             List of databases
   Name    |  Owner   | Encoding | Collate | Ctype |   Access privileges
-----------+----------+----------+---------+-------+-----------------------
 digoal    | postgres | UTF8     | C       | C     |
 postgres  | postgres | UTF8     | C       | C     |
 stats     | postgres | UTF8     | C       | C     | =Tc/postgres         +
           |          |          |         |       | postgres=CTc/postgres+
           |          |          |         |       | stats=CTc/postgres
 statsrepo | postgres | UTF8     | C       | C     |
 template0 | postgres | UTF8     | C       | C     | =c/postgres          +
           |          |          |         |       | postgres=CTc/postgres
 template1 | postgres | UTF8     | C       | C     | =c/postgres          +
           |          |          |         |       | postgres=CTc/postgres
(6 rows)
Generate 50 million rows of test data:
digoal=# create table test(id int primary key, info text, crt_time timestamp);
CREATE TABLE
digoal=# insert into test select generate_series(1,50000000),md5(random()::text),clock_timestamp();
INSERT 0 50000000
digoal=# \dt+ test
                     List of relations
 Schema | Name | Type  |  Owner   |  Size   | Description
--------+------+-------+----------+---------+-------------
 public | test | table | postgres | 3634 MB |
(1 row)
digoal=# \di+
                           List of relations
 Schema |   Name    | Type  |  Owner   | Table |  Size   | Description
--------+-----------+-------+----------+-------+---------+-------------
 public | test_pkey | index | postgres | test  | 1063 MB |
(1 row)
Export the table to a CSV file:
digoal=# copy test to '/ssd3/pg93/test.dmp' with (format csv, delimiter ',', null '\N', quote '"', force_quote *) ;
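With force_quote * every row of test is written as a single CSV line (none of the columns contain embedded newlines), so a quick sanity check of the dump is simply comparing the line count to the 50 million rows exported:

pg93@db-172-16-3-150-> wc -l /ssd3/pg93/test.dmp
pg93@db-172-16-3-150-> head -1 /ssd3/pg93/test.dmp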
The COPY load test (logged table):
digoal=# truncate test;
TRUNCATE TABLE
digoal=# \timing
Load the data with COPY:
digoal=# copy test from '/ssd3/pg93/test.dmp' with (format csv, delimiter ',', null '\N', quote '"');
COPY 50000000
Time: 411245.879 ms
Switch the table to unlogged and repeat the test:
digoal=# update pg_class set relpersistence='u' where relname='test';
UPDATE 1
digoal=# update pg_class set relpersistence='u' where relname='test_pkey';
UPDATE 1
Load the data with COPY:
digoal=# copy test from '/ssd3/pg93/test.dmp' with (format csv, delimiter ',', null '\N', quote '"');
COPY 50000000
Time: 363699.466 ms
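Updating pg_class directly is a catalog hack that serves this test; for reference, PostgreSQL 9.5 and later provide a supported way to switch persistence (not available on the 9.3.3 used here):

alter table test set unlogged;
-- and back again:
alter table test set logged;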
Next, try loading the data with pg_bulkload, which bypasses the shared buffer.
First, install pg_bulkload:
wget http://pgfoundry.org/frs/download.php/3566/pg_bulkload-3.1.5.tar.gz
[root@db-172-16-3-150 ~]# export PATH=/home/pg93/pgsql9.3.3/bin:$PATH
[root@db-172-16-3-150 ~]# cd /opt/soft_bak/pg_bulkload-3.1.5
[root@db-172-16-3-150 pg_bulkload-3.1.5]# which pg_config
/home/pg93/pgsql9.3.3/bin/pg_config
[root@db-172-16-3-150 pg_bulkload-3.1.5]# make
[root@db-172-16-3-150 pg_bulkload-3.1.5]# make install
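As a quick sanity check, the pg_bulkload binary should now sit in the same bin directory as the server (command only, output omitted):

[root@db-172-16-3-150 pg_bulkload-3.1.5]# which pg_bulkload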
Clear the test table, create the pg_bulkload extension, and switch the table and index back to logged:
pg93@db-172-16-3-150-> psql
psql (9.3.3)
Type "help" for help.
digoal=# truncate test;
TRUNCATE TABLE
digoal=# create extension pg_bulkload;
digoal=# update pg_class set relpersistence ='p' where relname='test_pkey';
UPDATE 1
digoal=# update pg_class set relpersistence ='p' where relname='test';
UPDATE 1
Restart the database using the postgresql script provided by pg_bulkload:
pg93@db-172-16-3-150-> postgresql stop -m fast
waiting for server to shut down..... done
server stopped
pg93@db-172-16-3-150-> postgresql start
server starting
Note: pg_bulkload connects to the /tmp socket by default. If unix_socket_directories is configured elsewhere, either change it to /tmp or add /tmp as an additional Unix-domain socket directory.
pg93@db-172-16-3-150-> pg_bulkload -i /ssd3/pg93/test.dmp -O test -l /ssd3/test.log -o "TYPE=CSV" -o "WRITER=PARALLEL" -h $PGDATA -p $PGPORT -d $PGDATABASE
NOTICE: BULK LOAD START
ERROR: query failed: ERROR: could not establish connection to parallel writer
DETAIL: could not connect to server: No such file or directory
Is the server running locally and accepting
connections on Unix domain socket "/tmp/.s.PGSQL.1921"?
HINT: Refer to the following if it is an authentication error. Specifies the authentication method to without the need for a password in pg_hba.conf (ex. trust or ident), or specify the password to the password file of the operating system user who ran PostgreSQL server. If cannot use these solution, specify WRITER=DIRECT.
DETAIL: query was: SELECT * FROM pg_bulkload($1)
Modify the Unix socket directory configuration to add /tmp, then restart the database using the postgresql script provided by pg_bulkload:
vi $PGDATA/postgresql.conf
unix_socket_directories = '.,/tmp'
pg93@db-172-16-3-150-> postgresql restart -m fast
waiting for server to shut down.... done
server stopped
server starting
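Before retrying the load, a minimal connectivity check confirms the server is now listening on the /tmp socket:

pg93@db-172-16-3-150-> psql -h /tmp -p $PGPORT -d $PGDATABASE -c 'select 1;'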
Re-run pg_bulkload:
pg_bulkload -i /ssd3/pg93/test.dmp -O test -l /ssd3/pg93/test.log -o "TYPE=CSV" -o "WRITER=PARALLEL" -h $PGDATA -p $PGPORT -d $PGDATABASE
While the load is running, a new directory named pg_bulkload appears under $PGDATA; it stores the load status information. If the load is interrupted, restarting the database with the postgresql script repairs it automatically; alternatively, run pg_bulkload in recovery mode before starting the database with pg_ctl (a sketch follows the listing below).
pg93@db-172-16-3-150-> cd $PGDATA
pg93@db-172-16-3-150-> ll pg_bulkload/
total 4.0K
-rw------- 1 pg93 pg93 512 Mar 28 09:36 16384.34315.loadstatus
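This loadstatus file is what drives the repair. Going by pg_bulkload's documented recovery mode, the manual invocation is along these lines, run before pg_ctl start:

pg_bulkload -r -D $PGDATA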
The load log:
[root@db-172-16-3-150 pg93]# cat test.log
pg_bulkload 3.1.5 on 2014-03-28 13:32:31.32559+08
INPUT = /ssd3/pg93/test.dmp
PARSE_BADFILE = /ssd4/pg93/pg_root/pg_bulkload/20140328133231_digoal_public_test.prs.dmp
LOGFILE = /ssd3/pg93/test.log
LIMIT = INFINITE
PARSE_ERRORS = 0
CHECK_CONSTRAINTS = NO
TYPE = CSV
SKIP = 0
DELIMITER = ,
QUOTE = "\""
ESCAPE = "\""
NULL =
OUTPUT = public.test
MULTI_PROCESS = YES
VERBOSE = NO
WRITER = DIRECT
DUPLICATE_BADFILE = /ssd4/pg93/pg_root/pg_bulkload/20140328133231_digoal_public_test.dup.csv
DUPLICATE_ERRORS = 0
ON_DUPLICATE_KEEP = NEW
TRUNCATE = NO

0 Rows skipped.
50000000 Rows successfully loaded.
0 Rows not loaded due to parse errors.
0 Rows not loaded due to duplicate errors.
0 Rows replaced with new rows.

Run began on 2014-03-28 13:32:31.32559+08
Run ended on 2014-03-28 13:35:13.019018+08
CPU 1.55s/128.55u sec elapsed 161.69 sec
With pg_bulkload's direct writer in multi-process (parallel) mode, the load took 161 seconds in total, more than twice as fast as the 411 seconds of a plain COPY into a logged table.
Switch to an unlogged table and repeat the test with pg_bulkload:
digoal=# update pg_class set relpersistence ='u' where relname='test';
UPDATE 1
digoal=# update pg_class set relpersistence ='u' where relname='test_pkey';
UPDATE 1
digoal=# truncate test;
TRUNCATE TABLE
digoal=# checkpoint;
CHECKPOINT
$ pg_bulkload -i /ssd3/pg93/test.dmp -O test -l /ssd3/pg93/test.log -o "TYPE=CSV" -o "WRITER=PARALLEL" -h $PGDATA -p $PGPORT -d $PGDATABASE
pg_bulkload 3.1.5 on 2014-03-28 13:36:15.602787+08
INPUT = /ssd3/pg93/test.dmp
PARSE_BADFILE = /ssd4/pg93/pg_root/pg_bulkload/20140328133615_digoal_public_test.prs.dmp
LOGFILE = /ssd3/pg93/test.log
LIMIT = INFINITE
PARSE_ERRORS = 0
CHECK_CONSTRAINTS = NO
TYPE = CSV
SKIP = 0
DELIMITER = ,
QUOTE = "\""
ESCAPE = "\""
NULL =
OUTPUT = public.test
MULTI_PROCESS = YES
VERBOSE = NO
WRITER = DIRECT
DUPLICATE_BADFILE = /ssd4/pg93/pg_root/pg_bulkload/20140328133615_digoal_public_test.dup.csv
DUPLICATE_ERRORS = 0
ON_DUPLICATE_KEEP = NEW
TRUNCATE = NO

0 Rows skipped.
50000000 Rows successfully loaded.
0 Rows not loaded due to parse errors.
0 Rows not loaded due to duplicate errors.
0 Rows replaced with new rows.

Run began on 2014-03-28 13:36:15.602787+08
Run ended on 2014-03-28 13:38:57.506558+08
CPU 2.26s/129.23u sec elapsed 161.90 sec
The load again took 161 seconds in total, more than twice as fast as the 363 seconds of a plain COPY into an unlogged table.
Because the direct writer bypasses the shared buffer and writes the data files directly, pg_bulkload performs the same whether the target table is unlogged or logged.
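A caveat that follows from this: rows loaded in direct mode are not in the WAL stream, so they cannot be recovered by PITR or replayed on a standby. If recoverability matters, take a fresh base backup after the load. A minimal sketch (the backup directory is illustrative; -x is the 9.3-era flag for including the required WAL):

pg93@db-172-16-3-150-> pg_basebackup -D /ssd3/pg93/basebackup -F t -x -p $PGPORT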
Finally, the result for direct mode without multi-process: 256 seconds, still faster than the 363 seconds of COPY into the unlogged table.
pg93@db-172-16-3-150-> pg_bulkload -i /ssd3/pg93/test.dmp -O test -l /ssd3/pg93/test.log -o "TYPE=CSV" -o "WRITER=DIRECT" -h $PGDATA -p $PGPORT -d $PGDATABASE
pg_bulkload 3.1.5 on 2014-03-28 13:41:10.934578+08
INPUT = /ssd3/pg93/test.dmp
PARSE_BADFILE = /ssd4/pg93/pg_root/pg_bulkload/20140328134110_digoal_public_test.prs.dmp
LOGFILE = /ssd3/pg93/test.log
LIMIT = INFINITE
PARSE_ERRORS = 0
CHECK_CONSTRAINTS = NO
TYPE = CSV
SKIP = 0
DELIMITER = ,
QUOTE = "\""
ESCAPE = "\""
NULL =
OUTPUT = public.test
MULTI_PROCESS = NO
VERBOSE = NO
WRITER = DIRECT
DUPLICATE_BADFILE = /ssd4/pg93/pg_root/pg_bulkload/20140328134110_digoal_public_test.dup.csv
DUPLICATE_ERRORS = 0
ON_DUPLICATE_KEEP = NEW
TRUNCATE = NO

0 Rows skipped.
49999998 Rows successfully loaded.
0 Rows not loaded due to parse errors.
0 Rows not loaded due to duplicate errors.
0 Rows replaced with new rows.

Run began on 2014-03-28 13:41:10.934578+08
Run ended on 2014-03-28 13:45:27.007941+08
CPU 10.68s/243.64u sec elapsed 256.07 sec

http://blog.163.com/digoal@126/blog/static/163877040201422883734398/
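Putting the numbers from the runs above side by side:

method                                   | target table | elapsed
-----------------------------------------+--------------+---------
copy                                     | logged       | 411 sec
copy                                     | unlogged     | 363 sec
pg_bulkload (WRITER=PARALLEL)            | logged       | 161 sec
pg_bulkload (WRITER=PARALLEL)            | unlogged     | 161 sec
pg_bulkload (WRITER=DIRECT, single proc) | unlogged     | 256 sec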