基于tpc-ds 和 doris的测试
生成1G数据
./dsdgen -SCALE 1GB -DIR /data8/ryh/doris/apache-doris-be-1.1.5-bin-x86_64/data1G/ -CHILD 2 -TERMINATE
数据导入
使用tpc-ds生成数据后,可以使用以下方式导入
cat customer_demographics.dat | sed 's/.$//' | curl --location-trusted -u root -H "label:customer_demographics" -H "timeout:1200" -H "column_separator:|" -T - http://127.0.0.1:8230/api/test_db/customer_demographics/_stream_load
cat inventory.dat | sed 's/.$//' | curl --location-trusted -u root -H "label:inventory" -H "timeout:1200" -H "column_separator:|" -T - http://127.0.0.1:8230/api/test_db/inventory/_stream_load
建表语句
这里提供一些基于doris格式的tpc-ds建表语句:
CREATE TABLE dbgen_version (
`dv_version` varchar(16),
`dv_create_date` DATETIME,
`dv_create_time` DATETIME,
`dv_cmdline_args` varchar(200)
)
DUPLICATE KEY(`dv_version`)
DISTRIBUTED BY HASH(`dv_version`) BUCKETS 1
PROPERTIES (
"replication_num" = "3"
);
CREATE TABLE customer_address (
`ca_address_sk` INT NOT NULL ,
`ca_address_id` CHAR(16) NOT NULL ,
`ca_street_number` CHAR(10) NULL ,
`ca_street_name` VARCHAR(60) NULL ,
`ca_street_type` CHAR(15) NULL ,
`ca_suite_number` CHAR(10) NULL ,
`ca_city` VARCHAR(60) NULL ,
`ca_county` VARCHAR(30) NULL ,
`ca_state` CHAR(2) NULL ,
`ca_zip` CHAR(10) NULL ,
`ca_country` VARCHAR(20) NULL ,
`ca_gmt_offset` DECIMAL(5,2) NULL ,
`ca_location_type` CHAR(20) NULL
)
DUPLICATE KEY(`ca_address_sk`)
DISTRIBUTED BY HASH(`ca_address_sk`) BUCKETS 1
PROPERTIES (
"replication_num" = "3"
);
CREATE TABLE customer_demographics (
`cd_demo_sk` INT NOT NULL,
`cd_gender` CHAR(1) NULL,
`cd_marital_status` CHAR(1) NULL,
`cd_education_status` CHAR(20) NULL,
`cd_purchase_estimate` INT NULL,
`cd_credit_rating` CHAR(10) NULL,
`cd_dep_count` INT NULL,
`cd_dep_employed_count` INT NULL,
`cd_dep_college_count` INT NULL
)
DUPLICATE KEY(`cd_demo_sk`)
DISTRIBUTED BY HASH(`cd_demo_sk`) BUCKETS 1
PROPERTIES (
“replication_num” = “3”
);
CREATE TABLE date_dim (
`d_date_sk` INT NOT NULL,
`d_date_id` CHAR(16) NOT NULL,
`d_date` DATETIME NULL,
`d_month_seq` INT NULL,
`d_week_seq` INT NULL,
`d_quarter_seq` INT NULL,
`d_year` INT NULL,
`d_dow` INT NULL,
`d_moy` INT NULL,
`d_dom` INT NULL,
`d_qoy` INT NULL,
`d_fy_year` INT NULL,
`d_fy_quarter_seq` INT NULL,
`d_fy_week_seq` INT NULL,
`d_day_name` CHAR(9) NULL,
`d_quarter_name` CHAR(6) NULL,
`d_holiday` CHAR(1) NULL,
`d_weekend` CHAR(1) NULL,
`d_following_holiday` CHAR(1) NULL,
`d_first_dom` INT NULL,
`d_last_dom` INT NULL,
`d_same_day_ly` INT NULL,
`d_same_day_lq` INT NULL,
`d_current_day` CHAR(1) NULL,
`d_current_week` CHAR(1) NULL,
`d_current_month` CHAR(1) NULL,
`d_current_quarter` CHAR(1) NULL,
`d_current_year` CHAR(1) NULL
)
DUPLICATE KEY(`d_date_id`)
DISTRIBUTED BY HASH(`d_date_sk`) BUCKETS 1
PROPERTIES (
“replication_num” = “3”
);
CREATE TABLE warehouse (
`w_warehouse_sk` INT NOT NULL,
`w_warehouse_id` CHAR(16) NOT NULL,
`w_warehouse_name` VARCHAR(20) NULL,
`w_warehouse_sq_ft` INT NULL,
`w_street_number` CHAR(10) NULL,
`w_street_name` VARCHAR(60) NULL,
`w_street_type` CHAR(15) NULL,
`w_suite_number` CHAR(10) NULL,
`w_city` VARCHAR(60) NULL,
`w_county` VARCHAR(30) NULL,
`w_state` CHAR(2) NULL,
`w_zip` CHAR(10) NULL,
`w_country` VARCHAR(20) NULL,
`w_gmt_offset` DECIMAL(5,2) NULL
)
DUPLICATE KEY(`w_warehouse_id`)
DISTRIBUTED BY HASH(`w_warehouse_sk`) BUCKETS 1
PROPERTIES (
“replication_num” = “3”
);
CREATE TABLE ship_mode (
`sm_ship_mode_sk` INT NOT NULL,
`sm_ship_mode_id` CHAR(16) NOT NULL,
`sm_type` CHAR(30) NULL,
`sm_code` CHAR(10) NULL,
`sm_carrier` CHAR(20) NULL,
`sm_contract` CHAR(20) NULL
)
DUPLICATE KEY(`sm_ship_mode_id`)
DISTRIBUTED BY HASH(`sm_ship_mode_sk`) BUCKETS 1
PROPERTIES (
“replication_num” = “3”
);
CREATE TABLE time_dim (
`t_time_sk` INT NOT NULL,
`t_time_id` CHAR(16) NOT NULL,
`t_time` INT NULL,
`t_hour` INT NULL,
`t_minute` INT NULL,
`t_second` INT NULL,
`t_am_pm` CHAR(2) NULL,
`t_shift` CHAR(20) NULL,
`t_sub_shift` CHAR(20) NULL,
`t_meal_time` CHAR(20) NULL
)
DUPLICATE KEY(`t_time_id`)
DISTRIBUTED BY HASH(`t_time_sk`) BUCKETS 1
PROPERTIES (
“replication_num” = “3”
);
CREATE TABLE reason (
`r_reason_sk` INT NOT NULL,
`r_reason_id` CHAR(16) NOT NULL,
`r_reason_desc` CHAR(100) NULL
)
DUPLICATE KEY(`r_reason_id`)
DISTRIBUTED BY HASH(`r_reason_sk`) BUCKETS 1
PROPERTIES (
“replication_num” = “3”
);
CREATE TABLE income_band (
`ib_income_band_sk` INT NOT NULL,
`ib_lower_bound` INT NULL,
`ib_upper_bound` INT NULL
)
DUPLICATE KEY(`ib_income_band_sk`)
DISTRIBUTED BY HASH(`ib_income_band_sk`) BUCKETS 1
PROPERTIES (
“replication_num” = “3”
);
查询语句生成
在使用的时候会出现以下报错
ERROR: Substitution'_END' is used before being initialized at line 59 in ../query_templates/query3.tpl
我们需要添加这个在每个 query0.tpl 文件,一共99个文件
./dsqgen -input ../query_templates/templates.lst -directory ../query_templates -dialect oracle -scale 1GB -OUTPUT_DIR /data/tpc-ds/data