一.用户画像clickhouse建表
-- 原始数据表
-- Raw user-portrait daily snapshot (local replicated shard table).
-- FIX 1: the ZooKeeper replica path used '/test/' although the table lives in
--        ads_user_portrait; aligned it with the database name, matching the
--        user_upload_id_local convention used elsewhere in this file.
-- FIX 2: PARTITION BY on a raw DateTime column creates one partition per
--        distinct timestamp and quickly hits ClickHouse's "too many parts"
--        limit; partition by calendar day instead (queries are unaffected).
CREATE TABLE ads_user_portrait.ads_it_user_picture_da_local ON CLUSTER '{layer}'
(
    `cur_date` DateTime COMMENT '日期',
    `user_id` UInt64 COMMENT '用户id',
    `age` Nullable(Int32) COMMENT '年龄',
    `gender` String COMMENT '性别',
    `birthday` Nullable(DateTime) COMMENT '出生日期',
    `recent_stay_province_id` Nullable(Int32) COMMENT '常驻省份id',
    `recent_stay_province_name` String COMMENT '常驻省份',
    `recent_stay_city_id` Nullable(Int32) COMMENT '常驻城市id',
    `recent_stay_city_name` String COMMENT '常驻城市',
    `mobile` String COMMENT '手机号',
    `openid` String COMMENT '微信号',
    `certificate` String COMMENT '是否实名认证',
    `useable` String COMMENT '是否有用车资格',
    `is_member` String COMMENT '是否会员',
    `sys_ver` String COMMENT '手机系统',
    `brand` String COMMENT '品牌',
    `model` String COMMENT '机型',
    `register_time` Nullable(DateTime) COMMENT '注册时间',
    `first_active_time` Nullable(DateTime) COMMENT '首次活跃时间',
    `last_active_time` Nullable(DateTime) COMMENT '最近活跃时间',
    `first_order_time` Nullable(DateTime) COMMENT '首单时间',
    `last_order_time` Nullable(DateTime) COMMENT '最近订单时间',
    `active_count_d` Nullable(Int32) COMMENT '当日打开应用次数',
    `active_count_3d` Nullable(Int32) COMMENT '近3日打开应用次数',
    `active_count_7d` Nullable(Int32) COMMENT '近7日打开应用次数',
    `active_count_15d` Nullable(Int32) COMMENT '近15日打开应用次数',
    `active_count_30d` Nullable(Int32) COMMENT '近30日打开应用次数',
    `active_count_90d` Nullable(Int32) COMMENT '近90日打开应用次数',
    `active_count_180d` Nullable(Int32) COMMENT '近180日打开应用次数',
    `active_days_3d` Nullable(Int32) COMMENT '近3日活跃天数',
    `active_days_7d` Nullable(Int32) COMMENT '近7日活跃天数',
    `active_days_15d` Nullable(Int32) COMMENT '近15日活跃天数',
    `active_days_30d` Nullable(Int32) COMMENT '近30日活跃天数',
    `active_days_90d` Nullable(Int32) COMMENT '近90日活跃天数',
    `active_days_180d` Nullable(Int32) COMMENT '近180日活跃天数',
    `order_count_d` Nullable(Int32) COMMENT '当日订单数',
    `order_count_3d` Nullable(Int32) COMMENT '近3日订单数',
    `order_count_7d` Nullable(Int32) COMMENT '近7日订单数',
    `order_count_15d` Nullable(Int32) COMMENT '近15日订单数',
    `order_count_30d` Nullable(Int32) COMMENT '近30日订单数',
    `order_count_90d` Nullable(Int32) COMMENT '近90日订单数',
    `order_count_180d` Nullable(Int32) COMMENT '近180日订单数',
    `order_days_3d` Nullable(Int32) COMMENT '近3日订单天数',
    `order_days_7d` Nullable(Int32) COMMENT '近7日订单天数',
    `order_days_15d` Nullable(Int32) COMMENT '近15日订单天数',
    `order_days_30d` Nullable(Int32) COMMENT '近30日订单天数',
    `order_days_90d` Nullable(Int32) COMMENT '近90日订单天数',
    `order_days_180d` Nullable(Int32) COMMENT '近180日订单天数',
    `peccancy_count_3d` Nullable(Int32) COMMENT '近3日违规次数',
    `peccancy_count_7d` Nullable(Int32) COMMENT '近7日违规次数',
    `peccancy_count_15d` Nullable(Int32) COMMENT '近15日违规次数',
    `peccancy_count_30d` Nullable(Int32) COMMENT '近30日违规次数',
    `peccancy_count_90d` Nullable(Int32) COMMENT '近90日违规次数',
    `peccancy_count_180d` Nullable(Int32) COMMENT '近180日违规次数',
    `event_day` String COMMENT '操作日期'
)
ENGINE = ReplicatedMergeTree(
    '/clickhouse/tables/{layer}-{shard}/ads_user_portrait/ads_it_user_picture_da_local',
    '{replica}'
)
PARTITION BY toYYYYMMDD(cur_date)
ORDER BY user_id
SETTINGS storage_policy = 'default_hot_cold';
-- Distributed facade over the local shard table; rand() sharding key.
-- FIX: the local table is created in ads_user_portrait (see above) and the
--      submit script writes to jdbc:clickhouse://.../ads_user_portrait, but
--      this statement referenced the `test` database and would either fail
--      or point at the wrong table.
CREATE TABLE ads_user_portrait.ads_it_user_picture_da_all ON CLUSTER '{layer}'
AS ads_user_portrait.ads_it_user_picture_da_local
ENGINE = Distributed('{layer}', ads_user_portrait, ads_it_user_picture_da_local, rand());
-- 用户id上传表
-- Staging table for manually uploaded user-id cohorts (local replicated shard).
CREATE TABLE ads_user_portrait.user_upload_id_local ON CLUSTER '{layer}' (
    `group_id`    UInt64 COMMENT '分群id',
    `user_id`     UInt64 COMMENT '用户id',
    `create_time` String COMMENT '日期',
    `day`         String COMMENT '分区日期'
)
ENGINE = ReplicatedMergeTree(
    '/clickhouse/tables/{layer}-{shard}/ads_user_portrait/user_upload_id_local',
    '{replica}'
)
PARTITION BY day
ORDER BY group_id
SETTINGS storage_policy = 'default_hot_cold';
-- Cluster-wide distributed view over user_upload_id_local; rand() sharding key.
CREATE TABLE ads_user_portrait.user_upload_id_all ON CLUSTER '{layer}'
AS ads_user_portrait.user_upload_id_local
ENGINE = Distributed('{layer}', ads_user_portrait, user_upload_id_local, rand());
二.读hive表写clickhouse
## 利用spark写入
import org.apache.spark.sql.{SaveMode, SparkSession}

/**
 * Runs a Hive query and appends the result into a ClickHouse table over JDBC.
 *
 * args(0) — Hive SELECT statement to execute
 * args(1) — ClickHouse user
 * args(2) — ClickHouse password
 * args(3) — JDBC url including the target database
 * args(4) — target table name
 * args(5) — optional number of JDBC write partitions; defaults to "1"
 *           (use 1 when writing through a Distributed table)
 */
object UserPortraitDataToCK {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("UserPortraitDataToCK")
      .master("yarn")
      .getOrCreate()

    // FIX: the submit script historically passed only five arguments, so the
    // unconditional args(5) threw ArrayIndexOutOfBoundsException on every run.
    // Treat the partition count as optional with a safe default of 1.
    val numPartitions = if (args.length > 5) args(5) else "1"

    val df = spark.sql(args(0))

    val props = new java.util.Properties
    // Consistently use setProperty (the original mixed put/setProperty).
    props.setProperty("driver", "ru.yandex.clickhouse.ClickHouseDriver")
    props.setProperty("user", args(1))
    props.setProperty("password", args(2))

    df.write
      .mode(SaveMode.Append)
      .option("batchsize", "500000")
      .option("isolationLevel", "NONE") // ClickHouse has no transactions
      .option("numPartitions", numPartitions)
      .jdbc(args(3), args(4), props)

    spark.close()
  }
}
## args(0)---查询hive表的语句
## args(1)---连接clickhouse的用户
## args(2)---连接clickhouse的密码
## args(3)---clickhouse的url+库名
## args(4)---clickhouse目标表名
## args(5)---设置分区数(分布式ck下设置1)
三.提交spark任务shell
#!/bin/bash
# Submit the Hive -> ClickHouse daily export Spark job.
# Usage: ./submit.sh [YYYYMMDD]   (defaults to yesterday's date)

if [ -n "$1" ]; then
    Y_DATE=$1
else
    Y_DATE=$(date -d "1 day ago" +%Y%m%d)
fi

sql="select * from ads.ads_it_user_picture_da where event_day='${Y_DATE}'"
user="###"
password="###"
ckurl="jdbc:clickhouse://x.x.x.x:8123/ads_user_portrait"
cktable="ads_it_user_picture_da_all"
# FIX: the Scala job reads a sixth argument (numPartitions) which was never
# passed, crashing with ArrayIndexOutOfBoundsException. Per the job notes,
# use 1 partition when writing through a Distributed table.
partitions="1"

submit_spark_job() {
    spark-submit --jars clickhouse-jdbc-0.2.jar,guava-28.0-jre.jar \
        --class com.songguo.UserPortraitDataToCK \
        --master yarn --deploy-mode cluster \
        --driver-memory 4g \
        --executor-memory 6g \
        --executor-cores 2 \
        songguo.jar "${sql}" "${user}" "${password}" "${ckurl}" "${cktable}" "${partitions}"
}

echo "-----函数开始执行-----"
submit_spark_job &&
echo "-----函数执行完毕-----"