TPC-H生成数据
下载工具
网盘链接:https://pan.baidu.com/s/1u3_bwZbyWKrFGqdVzV2_7g 密码:sgnw
解压原文件
修改配置
cd dbgen
cp makefile.suite makefile
vim makefile
修改为如下内容(注意:CC = gcc 中的 gcc 必须是小写的,写成大写会导致 make 找不到编译器——这个问题曾排查了约一小时)
#makefile.suite 的更改参数如下
CC = gcc
# Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata)
# SQLSERVER, SYBASE, ORACLE, VECTORWISE
# Current values for MACHINE are: ATT, DOS, HP, IBM, ICL, MVS,
# SGI, SUN, U2200, VMS, LINUX, WIN32
# Current values for WORKLOAD are: TPCH
DATABASE = SQLSERVER #程序给定参数没有postgresql ,修改tpcd.h 添加POSTGRESQL脚本
MACHINE = LINUX
WORKLOAD = TPCH
执行make,dbgen命令生成表结构和数据
如果你之前生成过数据表,可以执行make clean先清除一下再执行以下命令。
make
执行 dbgen 生成数据表。-s 指定 scale factor,约对应生成数据的总 GB 数:下例 -s 2 生成约 2GB,请根据自己需要设定(本文后续示例库 tpch_50 及其查询结果使用的是 -s 50 生成的约 50GB 数据)。
./dbgen -s 2
use spark beeline
-- Recreate the target database from scratch.
-- Fix: Hive's DROP DATABASE defaults to RESTRICT and fails when the
-- database still contains tables; CASCADE is required for a clean rebuild
-- on re-runs of this script.
drop database if exists tpch_50 cascade;
create database tpch_50;
use tpch_50;
-- REGION dimension (fixed 5 rows at every scale factor).
-- Fix: r_regionkey was declared as `long`, which is a Spark-SQL-only alias
-- and not a valid Hive type; use standard `bigint`, consistent with the
-- key columns of every other table in this script.
drop table if exists region;
create table region (
r_regionkey bigint,
r_name string,
r_comment string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- NOTE(review): dbgen emits a trailing '|' per line; Hive ignores the
-- resulting extra empty field since the table declares fewer columns — confirm.
load data local inpath '/opt/spark/d100/tpch/region.tbl' overwrite into table region;
-- NATION dimension (fixed 25 rows at every scale factor).
DROP TABLE IF EXISTS nation;
CREATE TABLE nation (
    n_nationkey bigint,
    n_name      string,
    n_regionkey bigint,
    n_comment   string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- OVERWRITE replaces any previously loaded data for this table.
LOAD DATA LOCAL INPATH '/opt/spark/d100/tpch/nation.tbl' OVERWRITE INTO TABLE nation;
-- SUPPLIER dimension (10,000 rows per scale factor unit).
DROP TABLE IF EXISTS supplier;
CREATE TABLE supplier (
    s_suppkey   bigint,
    s_name      string,
    s_address   string,
    s_nationkey bigint,
    s_phone     string,
    s_acctbal   double,
    s_comment   string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- OVERWRITE replaces any previously loaded data for this table.
LOAD DATA LOCAL INPATH '/opt/spark/d100/tpch/supplier.tbl' OVERWRITE INTO TABLE supplier;
-- CUSTOMER dimension (150,000 rows per scale factor unit).
DROP TABLE IF EXISTS customer;
CREATE TABLE customer (
    c_custkey    bigint,
    c_name       string,
    c_address    string,
    c_nationkey  bigint,
    c_phone      string,
    c_acctbal    double,
    c_mktsegment string,
    c_comment    string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- OVERWRITE replaces any previously loaded data for this table.
LOAD DATA LOCAL INPATH '/opt/spark/d100/tpch/customer.tbl' OVERWRITE INTO TABLE customer;
-- PART dimension (200,000 rows per scale factor unit).
DROP TABLE IF EXISTS part;
CREATE TABLE part (
    p_partkey     bigint,
    p_name        string,
    p_mfgr        string,
    p_brand       string,
    p_type        string,
    p_size        bigint,
    p_container   string,
    p_retailprice double,
    p_comment     string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- OVERWRITE replaces any previously loaded data for this table.
LOAD DATA LOCAL INPATH '/opt/spark/d100/tpch/part.tbl' OVERWRITE INTO TABLE part;
-- PARTSUPP association table (part x supplier availability and cost).
DROP TABLE IF EXISTS partsupp;
CREATE TABLE partsupp (
    ps_partkey    bigint,
    ps_suppkey    bigint,
    ps_availqty   bigint,
    ps_supplycost double,
    ps_comment    string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- OVERWRITE replaces any previously loaded data for this table.
LOAD DATA LOCAL INPATH '/opt/spark/d100/tpch/partsupp.tbl' OVERWRITE INTO TABLE partsupp;
-- ORDERS fact table (1,500,000 rows per scale factor unit).
DROP TABLE IF EXISTS orders;
CREATE TABLE orders (
    o_orderkey      bigint,
    o_custkey       bigint,
    o_orderstatus   string,
    o_totalprice    double,
    o_orderdate     date,
    o_orderpriority string,
    o_clerk         string,
    o_shippriority  bigint,
    o_comment       string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- OVERWRITE replaces any previously loaded data for this table.
LOAD DATA LOCAL INPATH '/opt/spark/d100/tpch/orders.tbl' OVERWRITE INTO TABLE orders;
-- LINEITEM fact table — the largest TPC-H table (~6,000,000 rows per
-- scale factor unit); dominates load time and query cost.
DROP TABLE IF EXISTS lineitem;
CREATE TABLE lineitem (
    l_orderkey      bigint,
    l_partkey       bigint,
    l_suppkey       bigint,
    l_linenumber    bigint,
    l_quantity      double,
    l_extendedprice double,
    l_discount      double,
    l_tax           double,
    l_returnflag    string,
    l_linestatus    string,
    l_shipdate      date,
    l_commitdate    date,
    l_receiptdate   date,
    l_shipinstruct  string,
    l_shipmode      string,
    l_comment       string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- OVERWRITE replaces any previously loaded data for this table.
LOAD DATA LOCAL INPATH '/opt/spark/d100/tpch/lineitem.tbl' OVERWRITE INTO TABLE lineitem;
[root@ccb-st-online-k8s-node-1 tpch]# du -h *
4.0K create_tpch.sql
1.2G customer.tbl
64K dbgen
12K dists.dss
37G lineitem.tbl
4.0K nation.tbl
8.3G orders.tbl
5.7G partsupp.tbl
1.2G part.tbl
60K qgen
100K query/tpch1s
104K query
4.0K readme.txt
4.0K region.tbl
69M supplier.tbl
0: jdbc:hive2://localhost:10003/> select count(*) from partsupp;
+-----------+--+
| count(1) |
+-----------+--+
| 40000000 |
+-----------+--+
1 row selected (14.98 seconds)
0: jdbc:hive2://localhost:10003/> select count(*) from lineitem;
+------------+--+
| count(1) |
+------------+--+
| 300005811 |
+------------+--+
1 row selected (102.19 seconds)
0: jdbc:hive2://localhost:10003/> select l_orderkey,o_orderdate,o_shippriority from customer,orders,lineitem where c_mktsegment = 'BUILDING' and c_custkey = o_custkey and l_orderkey = o_orderkey and o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15' group by l_orderkey,o_orderdate,o_shippriority order by o_orderdate limit 10;
+-------------+--------------+-----------------+--+
| l_orderkey | o_orderdate | o_shippriority |
+-------------+--------------+-----------------+--+
| 251171297 | 1994-11-15 | 0 |
| 57621312 | 1994-11-15 | 0 |
| 287115875 | 1994-11-15 | 0 |
| 260093319 | 1994-11-15 | 0 |
| 120297251 | 1994-11-15 | 0 |
| 72791619 | 1994-11-15 | 0 |
| 262117955 | 1994-11-15 | 0 |
| 179265990 | 1994-11-15 | 0 |
| 266580966 | 1994-11-15 | 0 |
| 121695717 | 1994-11-15 | 0 |
+-------------+--------------+-----------------+--+
10 rows selected (224.628 seconds)
待确认:能否在每执行完一次查询后,随即删除该次查询涉及的数据(即"查询一次、删除一次"的用法)?
1. spark基于数据分区数量来动态调整Task数,完成小数据量partition合并
2. 基于多 stage 复杂 SQL 的 broadcast join 来规避可能的数据倾斜
3. 优化spark sql执行过程中根据数据量动态的去调整执行计划
1. 优化spark sql执行过程中探测分区数据量大小及行数等信息
2. 优化spark sql执行过程中动态调整数据量大小 连续partition合并在一起的策略
3. 优化spark sql执行过程中根据数据量大小 合并调整reduce task个数
4. 优化多stage复杂SQL的执行过程,将小表转成broadcast来优化执行过程
5. 统计spark中partition的行数,用来统计可能发生数据倾斜的分区