关于Spark-Rapids的GPU化改造
在现存的基于HiveOnSpark的数据仓库上,将Spark-Rapids作为Hive的计算引擎来替代常规版本的Spark,使数仓能够充分调用GPU计算资源。
1.安装CUDA相关驱动
前往驱动下载页面下载相关显卡的驱动,我这里使用的是RTX2070
https://www.nvidia.com/Download/index.aspx?lang=en-us
#安装驱动
./驱动.run
#安装完成后,查看驱动是否安装成功
nvidia-smi
#同样在官网下载好相应的CUDA驱动后进行安装
./cuda_11.0.3_450.51.06_linux.run
#在安装CUDA时会自动选择安装显卡驱动,为了避免不必要的麻烦,记得取消驱动安装,只安装CUDA相关
#安装好后添加相关环境变量
export PATH=$PATH:/usr/local/cuda-11.0/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/lib64
2.配置相关依赖
#创建依赖路径并下载依赖
mkdir -p /opt/sparkRapidsPlugin
cd /opt/sparkRapidsPlugin
wget https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.3.0/rapids-4-spark_2.12-0.3.0.jar
wget https://repo1.maven.org/maven2/ai/rapids/cudf/0.17/cudf-0.17-cuda11.jar
#创建相关环境变量
export SPARK_RAPIDS_DIR=/opt/sparkRapidsPlugin
export SPARK_CUDF_JAR=${SPARK_RAPIDS_DIR}/cudf-0.17-cuda11.jar
export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_DIR}/rapids-4-spark_2.12-0.3.0.jar
#创建GPU感知脚本
vim getGpusResources.sh
########################
#!/usr/bin/env bash
# GPU discovery script for Spark resource scheduling.
# Queries nvidia-smi for every visible GPU index and prints a single JSON
# ResourceInformation object, e.g. with two GPUs:
#   {"name": "gpu", "addresses":["0","1"]}
# The sed pipeline joins the one-index-per-line output with "," so each
# address ends up as its own quoted JSON array element.
ADDRS=`nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e'$!ba' -e 's/\n/","/g'`
# BUGFIX: the echo must be a single line; splitting it left a bare
# \"name\"... line that the shell would try to execute as a command.
echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
########################
3.配置Spark
修改spark-env.sh
SPARK_WORKER_OPTS="-Dspark.worker.resource.gpu.amount=1 -Dspark.worker.resource.gpu.discoveryScript=/opt/sparkRapidsPlugin/getGpusResources.sh"
随后启动Spark,在Web UI的Resources栏中看到GPU资源即为配置成功
#配置默认的spark-defaults.conf 使其默认调用GPU进行计算
############################################################
# Submit applications to YARN.
spark.master yarn
# Script each executor runs to discover the GPU addresses it owns.
spark.executor.resource.gpu.discoveryScript /opt/sparkRapidsPlugin/getGpusResources.sh
# Put the cuDF and RAPIDS plugin jars on the executor classpath.
spark.executor.extraClassPath /opt/sparkRapidsPlugin/cudf-0.17-cuda11.jar:/opt/sparkRapidsPlugin/rapids-4-spark_2.12-0.3.0.jar
# Number of tasks allowed to submit work to the GPU at the same time.
spark.rapids.sql.concurrentGpuTasks 1
spark.driver.memory 2G
spark.executor.memory 4G
spark.executor.cores 4
# 0.25 GPU per task: with 4 cores per executor, 4 tasks share one GPU.
spark.task.resource.gpu.amount 0.25
# Pinned (page-locked) host memory pool for faster CPU<->GPU transfers.
spark.rapids.memory.pinnedPool.size 2G
# Scheduling tweak commonly recommended for RAPIDS jobs — do not wait
# for data-local slots.
spark.locality.wait 0s
spark.sql.files.maxPartitionBytes 512m
spark.sql.shuffle.partitions 10
# Enable the RAPIDS SQL plugin so eligible query plans run on the GPU.
spark.plugins com.nvidia.spark.SQLPlugin
# Ship both jars with every submitted application.
spark.jars /opt/sparkRapidsPlugin/rapids-4-spark_2.12-0.3.0.jar,/opt/sparkRapidsPlugin/cudf-0.17-cuda11.jar
# Each executor is allocated exactly one GPU.
spark.executor.resource.gpu.amount 1
############################################################
随后即可进行一些简单测试
// Run inside spark-shell (which auto-imports spark.implicits._, providing
// the $-column syntax and the toDF conversion used below).
val df = sc.makeRDD(1 to 100000000,6).toDF
val df2 = sc.makeRDD(1 to 100000000,6).toDF
// Join two 100M-row DataFrames on equal values and count the matches —
// a shuffle-plus-join workload the RAPIDS plugin can offload to the GPU.
df.select($"value" as "a").join(df2.select($"value" as "b"),$"a" === $"b").count
// Run the test both with and without the GPU spark-defaults.conf applied.
// Observed result: ~30s on the CPU path vs ~6s with RTX 2070 acceleration.
4.配置作为数据仓库引擎
在原版的数据仓库中,已经配置好了HiveOnSpark相关配置信息,随后要改变Hive对Spark引擎的一些默认调度
#进入$hive/conf 修改/创建spark-defaults.conf
############################################################
# Submit Hive's Spark jobs to YARN.
spark.master yarn
# Script each executor runs to discover the GPU addresses it owns.
spark.executor.resource.gpu.discoveryScript /opt/sparkRapidsPlugin/getGpusResources.sh
# Kryo serialization with the GPU registrator — the registrator class is
# the one the RAPIDS plugin documents for use when Kryo is enabled.
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.kryo.registrator com.nvidia.spark.rapids.GpuKryoRegistrator
# Put the cuDF and RAPIDS plugin jars on the executor classpath.
spark.executor.extraClassPath /opt/sparkRapidsPlugin/cudf-0.17-cuda11.jar:/opt/sparkRapidsPlugin/rapids-4-spark_2.12-0.3.0.jar
# Number of tasks allowed to submit work to the GPU at the same time.
spark.rapids.sql.concurrentGpuTasks 1
spark.driver.memory 2G
spark.executor.memory 4G
spark.executor.cores 4
# 0.25 GPU per task: with 4 cores per executor, 4 tasks share one GPU.
spark.task.resource.gpu.amount 0.25
# Pinned (page-locked) host memory pool for faster CPU<->GPU transfers.
spark.rapids.memory.pinnedPool.size 2G
# Scheduling tweak commonly recommended for RAPIDS jobs — do not wait
# for data-local slots.
spark.locality.wait 0s
spark.sql.files.maxPartitionBytes 512m
spark.sql.shuffle.partitions 10
# Enable the RAPIDS SQL plugin so eligible query plans run on the GPU.
spark.plugins com.nvidia.spark.SQLPlugin
# Ship both jars with every submitted application.
spark.jars /opt/sparkRapidsPlugin/rapids-4-spark_2.12-0.3.0.jar,/opt/sparkRapidsPlugin/cudf-0.17-cuda11.jar
# Each executor is allocated exactly one GPU.
spark.executor.resource.gpu.amount 1
############################################################
启动Hive,并进行一些简单操作,通过nvidia-smi查看显卡资源是否被调用
若被调用则成功使用数据仓库进行GPU资源的计算
5.数据仓库测试
测试1
-- Test 1: merge the 2020-06-15 increment into a zipper-style (拉链表)
-- order table.
--   old = rows still currently valid (sentinel partition dt='9999-99-99');
--   new = today's orders, with status-change timestamps pivoted in from the
--         status log.
-- FULL OUTER JOIN on id; nvl(new.x, old.x) prefers the fresh value and
-- falls back to the previous snapshot for orders with no change today.
select
nvl(new.id,old.id),
nvl(new.order_status,old.order_status),
nvl(new.user_id,old.user_id),
nvl(new.province_id,old.province_id),
nvl(new.payment_way,old.payment_way),
nvl(new.delivery_address,old.delivery_address),
nvl(new.out_trade_no,old.out_trade_no),
nvl(new.tracking_no,old.tracking_no),
nvl(new.create_time,old.create_time),
nvl(new.payment_time,old.payment_time),
nvl(new.cancel_time,old.cancel_time),
nvl(new.finish_time,old.finish_time),
nvl(new.refund_time,old.refund_time),
nvl(new.refund_finish_time,old.refund_finish_time),
nvl(new.expire_time,old.expire_time),
nvl(new.feight_fee,old.feight_fee),
nvl(new.feight_fee_reduce,old.feight_fee_reduce),
nvl(new.activity_reduce_amount,old.activity_reduce_amount),
nvl(new.coupon_reduce_amount,old.coupon_reduce_amount),
nvl(new.original_amount,old.original_amount),
nvl(new.final_amount,old.final_amount),
-- End-of-validity date for the row:
--   cancelled / refund-finished / expired -> the date of that event;
--   finished 7 days ago with no refund    -> closes today (refund window
--                                            elapsed);
--   otherwise the row stays open, marked by the sentinel '9999-99-99'.
case
when new.cancel_time is not null then date_format(new.cancel_time,'yyyy-MM-dd')
when new.finish_time is not null and date_add(date_format(new.finish_time,'yyyy-MM-dd'),7)='2020-06-15' and new.refund_time is null then '2020-06-15'
when new.refund_finish_time is not null then date_format(new.refund_finish_time,'yyyy-MM-dd')
when new.expire_time is not null then date_format(new.expire_time,'yyyy-MM-dd')
else '9999-99-99'
end
from
(
-- old: the current full snapshot — only rows whose validity is still open.
select
id,
order_status,
user_id,
province_id,
payment_way,
delivery_address,
out_trade_no,
tracking_no,
create_time,
payment_time,
cancel_time,
finish_time,
refund_time,
refund_finish_time,
expire_time,
feight_fee,
feight_fee_reduce,
activity_reduce_amount,
coupon_reduce_amount,
original_amount,
final_amount
from dwd_order_info
where dt='9999-99-99'
)old
full outer join
(
-- new: today's order increment; each timestamp column is looked up from the
-- per-order status map built in the "times" subquery below.
select
oi.id,
oi.order_status,
oi.user_id,
oi.province_id,
oi.payment_way,
oi.delivery_address,
oi.out_trade_no,
oi.tracking_no,
oi.create_time,
times.ts['1002'] payment_time,
times.ts['1003'] cancel_time,
times.ts['1004'] finish_time,
times.ts['1005'] refund_time,
times.ts['1006'] refund_finish_time,
oi.expire_time,
feight_fee,
feight_fee_reduce,
activity_reduce_amount,
coupon_reduce_amount,
original_amount,
final_amount
from
(
select
*
from ods_order_info
where dt='2020-06-15'
)oi
left join
(
-- Pivot the status log into one map per order: status code -> operate_time
-- (e.g. ts['1002'] is the payment time). NOTE(review): assumes at most one
-- event per status code per order; collect_set would otherwise merge
-- duplicates unpredictably — confirm against the log's grain.
select
order_id,
str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=') ts
from ods_order_status_log
where dt='2020-06-15'
group by order_id
)times
on oi.id=times.order_id
)new
on old.id=new.id;
-- 普通集群8.540秒完成
-- GPU集群2.375秒完成
测试2
-- 测试2
select
nvl(1d_ago.user_id,old.user_id),
nvl(old.login_date_first,'2020-06-15'),
if(1d_ago.user_id is not null,'2020-06-15',old.login_date_last),
nvl(1d_ago.login_count,0),
if(1d_ago.user_id is not null,1,0),
nvl(old.login_last_7d_count,0)+nvl(1d_ago.login_count,0)- nvl(7d_ago.login_count,0),
nvl(old.login_last_7d_day_count,0)+if(1d_ago.user_id is null,0,1)- if(7d_ago.user_id is null,0,1),
nvl(old.login_last_30d_count,0)+nvl(</