【大数据SQL测试】tpch压测

16 篇文章 0 订阅
12 篇文章 0 订阅

TPC-H生成数据

下载工具

网盘链接:https://pan.baidu.com/s/1u3_bwZbyWKrFGqdVzV2_7g 密码:sgnw

解压原文件
修改配置
cd dbgen
cp makefile.suite makefile
vim makefile
修改为如下内容(gcc要是小写的,因为这个问题百度了一小时左右-_-||)
#makefile.suite 的更改参数如下
CC      = gcc
# Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata)
#                                  SQLSERVER, SYBASE, ORACLE, VECTORWISE
# Current values for MACHINE are:  ATT, DOS, HP, IBM, ICL, MVS, 
#                                  SGI, SUN, U2200, VMS, LINUX, WIN32 
# Current values for WORKLOAD are:  TPCH
DATABASE = SQLSERVER     #程序给定参数没有postgresql ,修改tpcd.h 添加POSTGRESQL脚本
MACHINE = LINUX
WORKLOAD = TPCH
执行make,dbgen命令生成表结构和数据

如果你之前生成过数据表,可以执行make clean先清除一下再执行以下命令。

make

执行dbgen,生成数据表,大小为2G。根据自己需要设定参数。

./dbgen  -s 2
use spark beeline
drop database if exists tpch_50;
create database tpch_50;
use tpch_50;
drop table if exists region;
create table region (
        r_regionkey long,
        r_name string,
        r_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/region.tbl' overwrite into table region;
drop table if exists nation;
create table nation (
        n_nationkey bigint,
        n_name string,
        n_regionkey bigint,
        n_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/nation.tbl' overwrite into table nation;
drop table if exists supplier;
create table supplier (
        s_suppkey  bigint,
        s_name string,
        s_address string,
        s_nationkey bigint,
        s_phone string,
        s_acctbal double,
        s_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/supplier.tbl' overwrite into table supplier;
drop table if exists customer;
create table customer (
        c_custkey bigint,
        c_name string,
        c_address string,
        c_nationkey bigint,
        c_phone string,
        c_acctbal double,
        c_mktsegment string,
        c_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/customer.tbl' overwrite into table customer;
drop table if exists part;
create table part (
        p_partkey bigint,
        p_name string,
        p_mfgr string,
        p_brand string,
        p_type string,
        p_size bigint,
        p_container string,
        p_retailprice double,
        p_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/part.tbl' overwrite into table part;
drop table if exists partsupp;
create table partsupp (
        ps_partkey bigint,
        ps_suppkey bigint,
        ps_availqty bigint,
        ps_supplycost double,
        ps_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/partsupp.tbl' overwrite into table partsupp;
drop table if exists orders;
create table orders (
        o_orderkey bigint,
        o_custkey bigint,
        o_orderstatus string,
        o_totalprice double,
        o_orderdate date,
        o_orderpriority string,
        o_clerk string,
        o_shippriority bigint,
        o_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/orders.tbl' overwrite into table orders;
drop table if exists lineitem;
create table lineitem (
        l_orderkey bigint,
        l_partkey bigint,
        l_suppkey bigint,
        l_linenumber bigint,
        l_quantity double,
        l_extendedprice double,
        l_discount double,
        l_tax double,
        l_returnflag string,
        l_linestatus string,
        l_shipdate date,
        l_commitdate date,
        l_receiptdate date,
        l_shipinstruct string,
        l_shipmode string,
        l_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
load data local inpath '/opt/spark/d100/tpch/lineitem.tbl' overwrite into table lineitem;
[root@ccb-st-online-k8s-node-1 tpch]# du -h *
4.0K    create_tpch.sql
1.2G    customer.tbl
64K     dbgen
12K     dists.dss
37G     lineitem.tbl
4.0K    nation.tbl
8.3G    orders.tbl
5.7G    partsupp.tbl
1.2G    part.tbl
60K     qgen
100K    query/tpch1s
104K    query
4.0K    readme.txt
4.0K    region.tbl
69M     supplier.tbl
0: jdbc:hive2://localhost:10003/> select count(*) from partsupp;
+-----------+--+
| count(1)  |
+-----------+--+
| 40000000  |
+-----------+--+
1 row selected (14.98 seconds)
0: jdbc:hive2://localhost:10003/> select count(*) from lineitem;
+------------+--+
|  count(1)  |
+------------+--+
| 300005811  |
+------------+--+
1 row selected (102.19 seconds)
0: jdbc:hive2://localhost:10003/> select l_orderkey,o_orderdate,o_shippriority from customer,orders,lineitem where c_mktsegment = 'BUILDING' and c_custkey = o_custkey and l_orderkey = o_orderkey and o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15' group by l_orderkey,o_orderdate,o_shippriority order by o_orderdate limit 10;
+-------------+--------------+-----------------+--+
| l_orderkey  | o_orderdate  | o_shippriority  |
+-------------+--------------+-----------------+--+
| 251171297   | 1994-11-15   | 0               |
| 57621312    | 1994-11-15   | 0               |
| 287115875   | 1994-11-15   | 0               |
| 260093319   | 1994-11-15   | 0               |
| 120297251   | 1994-11-15   | 0               |
| 72791619    | 1994-11-15   | 0               |
| 262117955   | 1994-11-15   | 0               |
| 179265990   | 1994-11-15   | 0               |
| 266580966   | 1994-11-15   | 0               |
| 121695717   | 1994-11-15   | 0               |
+-------------+--------------+-----------------+--+
10 rows selected (224.628 seconds)
能不能查询一次删除一次    
1. spark基于数据分区数量来动态调整Task数,完成小数据量partition合并
2. 基于多stage复杂SQL的broadcastjoin来规避可能的数据倾斜
3. 优化spark sql执行过程中根据数据量动态的去调整执行计划
1. 优化spark sql执行过程中探测分区数据量大小及行数等信息
2. 优化spark sql执行过程中动态调整数据量大小 连续partition合并在一起的策略
3. 优化spark sql执行过程中根据数据量大小 合并调整reduce task个数
4. 优化多stage复杂SQL的执行过程,将小表转成broadcast来优化执行过程
5. 统计spark中partition的行数,用来统计可能发生数据倾斜的分区
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

笑起来贼好看

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值