Preparing the benchmark tool
Clone hive-testbench
git clone git@github.com:hortonworks/hive-testbench.git
While using it we hit the "130: command too long" problem and fixed it by patching the code, as described below.
Build the data-generation jar
bash ${HIVE_TESTBENCH_HOME}/tpcds-build.sh
This generates tpcds-gen-1.0-SNAPSHOT.jar under ${HIVE_TESTBENCH_HOME}/tpcds-gen/target/.
One problem has to be fixed during the build:
[image: build error]
It is caused by the shell command line being too long at execution time; the fix is as follows:
[image: code change]
References:
https://www.baifachuan.com/posts/3696fe4b.html
https://blog.51cto.com/u_12539341/7580883 (found later; not verified)
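After applying the fix and rebuilding, a quick sanity check is to confirm the jar is in place:
ls -lh ${HIVE_TESTBENCH_HOME}/tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar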
Generating the data
Modify the tpcds-setup.sh script:
1. Change the hive command
By default the script connects through beeline; we use the hive CLI to create the databases and tables, so make the following change:
#HIVE="beeline -n hive -u 'jdbc:hive2://localhost:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2?tez.queue.name=default' "
HIVE="hive"
2. Create the databases in advance
tpcds_text_${SCALE}
tpcds_bin_partitioned_${FORMAT}_${SCALE}
Pick SCALE according to how much data you want to generate; FORMAT defaults to orc.
Our xitong account has no permission to create databases, so create them first with the superuser account work, as sketched below.
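For example, a minimal sketch of creating both databases under the work account, assuming SCALE=500 and the default FORMAT=orc (adjust names to your setup):
hive -e "create database if not exists tpcds_text_500; create database if not exists tpcds_bin_partitioned_orc_500;"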
3. Delete all the database-creation SQL
That is, remove every "create database if not exists" statement, since the databases already exist.
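One way to locate the statements that need deleting (a sketch; which files contain them depends on the hive-testbench version):
grep -rn "create database if not exists" tpcds-setup.sh settings/ ddl-tpcds/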
Modify the settings
In each of the following files:
settings/init.sql
settings/load-flat.sql
settings/load-partitioned.sql
comment out this setting:
-- hive.optimize.sort.dynamic.partition.threshold=0;
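If you would rather not edit the three files by hand, something like the following should work, assuming each file contains the line `set hive.optimize.sort.dynamic.partition.threshold=0;` verbatim (`&` in the sed replacement re-inserts the matched line after the `-- ` comment marker):
sed -i 's/^set hive.optimize.sort.dynamic.partition.threshold=0;/-- &/' settings/init.sql settings/load-flat.sql settings/load-partitioned.sql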
Generate the data and create the tables in Hive
tpcds-setup.sh scale_factor [directory]
./tpcds-setup.sh 500 /tmp/wyf/tcpds
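With the invocation above, the raw text data lands under /tmp/wyf/tcpds/500 (that is, <directory>/<scale_factor>), which you can verify once generation finishes:
hdfs dfs -ls /tmp/wyf/tcpds/500
hdfs dfs -du -h /tmp/wyf/tcpds/500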
Things to note about the script when generating data; the full modified script follows:
#!/bin/bash

function usage {
    echo "Usage: tpcds-setup.sh scale_factor [temp_directory]"
    exit 1
}

function runcommand {
    if [ "X$DEBUG_SCRIPT" != "X" ]; then
        $1
    else
        $1 2>/dev/null
    fi
}

if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
    echo "Please build the data generator with ./tpcds-build.sh first"
    exit 1
fi

which hive > /dev/null 2>&1
if [ $? -ne 0 ]; then
    echo "Script must be run where Hive is installed"
    exit 1
fi

# Tables in the TPC-DS schema.
DIMS="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_returns inventory"

# Get the parameters.
SCALE=$1
DIR=$2
if [ "X$BUCKET_DATA" != "X" ]; then
    BUCKETS=13
    RETURN_BUCKETS=13
else
    BUCKETS=1
    RETURN_BUCKETS=1
fi
if [ "X$DEBUG_SCRIPT" != "X" ]; then
    set -x
fi

# Sanity checking.
if [ X"$SCALE" = "X" ]; then
    usage
fi
if [ X"$DIR" = "X" ]; then
    DIR=/tmp/tpcds-generate
fi
if [ $SCALE -eq 1 ]; then
    echo "Scale factor must be greater than 1"
    exit 1
fi

# Do the actual data load. (This is where the data gets generated; you can
# check the directory you specified to confirm the data is there.)
hdfs dfs -mkdir -p ${DIR}
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
if [ $? -ne 0 ]; then
    echo "Generating data at scale factor $SCALE."
    (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
fi
echo hdfs dfs -ls ${DIR}/${SCALE}
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
if [ $? -ne 0 ]; then
    echo "Data generation failed, exiting."
    exit 1
fi

hadoop fs -chmod -R 777 ${DIR}/${SCALE}
echo $DIR
echo "TPC-DS text data generation complete."

#HIVE="beeline -n hive -u 'jdbc:hive2://localhost:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2?tez.queue.name=default' "
HIVE="hive"
DEBUG_SCRIPT=bash

# Create the text/flat tables as external tables. These will later be converted to ORCFile.
#DATABASE=dwd_insys_test
echo "Loading text data into external tables."
# dap_dev here is the database that holds the external textfile tables over the data generated on HDFS.
runcommand "$HIVE -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql --hivevar DB='dap_dev' --hivevar LOCATION=${DIR}/${SCALE}"

# Create the partitioned and bucketed tables.
if [ "X$FORMAT" = "X" ]; then
    FORMAT=orc
fi

LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
SILENCE="2> /dev/null 1> /dev/null"
if [ "X$DEBUG_SCRIPT" != "X" ]; then
    SILENCE=""
fi

echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE

i=1
total=24
DATABASE=dwd_insys_test
echo $DATABASE
#DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
MAX_REDUCERS=2500 # maximum number of useful reducers for any scale
REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})

# Added to make troubleshooting easier.
DEBUG_SCRIPT=ON

# Populate the smaller tables.
for t in ${DIMS}
do
    echo ddl-tpcds/bin_partitioned/${t}.sql
    COMMAND="$HIVE -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
        --hivevar DB=${DATABASE} --hivevar SOURCE=dap_dev \
        --hivevar SCALE=${SCALE} \
        --hivevar REDUCERS=${REDUCERS} \
        --hivevar FILE=${FORMAT}"
    echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
    i=`expr $i + 1`
done

echo "SOURCE: "$SOURCE
# Populate the fact tables.
for t in ${FACTS}
do
    echo ddl-tpcds/bin_partitioned/${t}.sql
    COMMAND="$HIVE --hiveconf hive.exec.max.dynamic.partitions=1000 --hiveconf hive.exec.max.dynamic.partitions.pernode=200 \
        --hiveconf hive.exec.dynamic.partition=true --hiveconf hive.exec.dynamic.partition.mode=nonstrict \
        -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
        --hivevar DB=${DATABASE} \
        --hivevar SCALE=${SCALE} \
        --hivevar SOURCE=dap_dev --hivevar BUCKETS=${BUCKETS} \
        --hivevar RETURN_BUCKETS=${RETURN_BUCKETS} --hivevar REDUCERS=${REDUCERS} --hivevar FILE=${FORMAT}"
    echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
    i=`expr $i + 1`
done

echo $LOAD_FILE
# Load parallelism; raising -j makes the load run faster.
make -j 10 -f $LOAD_FILE

echo "Loading constraints"
runcommand "$HIVE -f ddl-tpcds/bin_partitioned/add_constraints.sql --hivevar DB=${DATABASE}"

echo "Data loaded into database ${DATABASE}."
The run produces a .mk (makefile) file at the end. If the load fails partway through, just edit this file and re-run it instead of regenerating the table data, e.g. make -j 5 -f ./all_query_rss.mk.
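Since each table is a separate make target named after the table (per the echo -e lines in the script above), a single failed table can also be re-run on its own; for example, with SCALE=500 and FORMAT=orc the load file is load_orc_500.mk, so:
make -f load_orc_500.mk store_sales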
Querying the data: modeled on the data-generation script, I wrote a similar script that runs the queries concurrently, which is more convenient:
#!/bin/bash
LOAD_FILE=all_query.mk
database_name=dwd_insys_test
current_path=`pwd`
# NOTE: the original value of this line was garbled in the post; hive-testbench
# ships the TPC-DS queries under sample-queries-tpcds, so that is assumed here.
queries_dir=${current_path}/sample-queries-tpcds
name='tpcds_500g_1_'
rm -rf logs
mkdir logs
SILENCE="2> /dev/null 1> /dev/null"
if [ "X$DEBUG_SCRIPT" != "X" ]; then
    SILENCE=""
fi
total=$(ls ${queries_dir} | wc -l)
echo $total
# Build the "all" target from one entry per query file.
for t in `ls ${queries_dir}`
do
    all="$all $t"
done
echo -e "all: ${all}" > $LOAD_FILE
# One make target per query, run through spark-sql; stdout/stderr go to logs/<query>.log.
for t in `ls ${queries_dir}`
do
    COMMAND="spark-sql --queue root.wyf --name ${name}$t --executor-cores 4 --executor-memory 4G --database $database_name \
        --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.initialExecutors=2 \
        --conf spark.dynamicAllocation.minExecutors=2 --conf spark.dynamicAllocation.maxExecutors=300 \
        --conf spark.dynamicAllocation.schedulerBacklogTimeout=1s --conf spark.dynamicAllocation.executorIdleTimeout=30s \
        --conf spark.sql.shuffle.partitions=400 -f ${queries_dir}/${t}"
    echo -e "${t}:\n\t@$COMMAND > logs/${t}.log 2>&1" >> $LOAD_FILE
    i=`expr $i + 1`
done
make -j 5 -f $LOAD_FILE
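Each query's stdout and stderr go to logs/<query>.log, so a quick way to spot failed queries afterwards is, for example:
grep -ril "exception\|error" logs/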