一.用户画像clickhouse建表
-- 原始数据表
-- Raw user-portrait daily snapshot (local replicated shard table).
-- FIX 1: the ZooKeeper replica path used '/test/' although the table lives in
--        ads_user_portrait; aligned it with the database name, matching the
--        user_upload_id_local convention used elsewhere in this file.
-- FIX 2: PARTITION BY on a raw DateTime column creates one partition per
--        distinct timestamp and quickly hits ClickHouse's "too many parts"
--        limit; partition by calendar day instead (queries are unaffected).
CREATE TABLE ads_user_portrait.ads_it_user_picture_da_local ON CLUSTER '{layer}'
(
    `cur_date` DateTime COMMENT '日期',
    `user_id` UInt64 COMMENT '用户id',
    `age` Nullable(Int32) COMMENT '年龄',
    `gender` String COMMENT '性别',
    `birthday` Nullable(DateTime) COMMENT '出生日期',
    `recent_stay_province_id` Nullable(Int32) COMMENT '常驻省份id',
    `recent_stay_province_name` String COMMENT '常驻省份',
    `recent_stay_city_id` Nullable(Int32) COMMENT '常驻城市id',
    `recent_stay_city_name` String COMMENT '常驻城市',
    `mobile` String COMMENT '手机号',
    `openid` String COMMENT '微信号',
    `certificate` String COMMENT '是否实名认证',
    `useable` String COMMENT '是否有用车资格',
    `is_member` String COMMENT '是否会员',
    `sys_ver` String COMMENT '手机系统',
    `brand` String COMMENT '品牌',
    `model` String COMMENT '机型',
    `register_time` Nullable(DateTime) COMMENT '注册时间',
    `first_active_time` Nullable(DateTime) COMMENT '首次活跃时间',
    `last_active_time` Nullable(DateTime) COMMENT '最近活跃时间',
    `first_order_time` Nullable(DateTime) COMMENT '首单时间',
    `last_order_time` Nullable(DateTime) COMMENT '最近订单时间',
    `active_count_d` Nullable(Int32) COMMENT '当日打开应用次数',
    `active_count_3d` Nullable(Int32) COMMENT '近3日打开应用次数',
    `active_count_7d` Nullable(Int32) COMMENT '近7日打开应用次数',
    `active_count_15d` Nullable(Int32) COMMENT '近15日打开应用次数',
    `active_count_30d` Nullable(Int32) COMMENT '近30日打开应用次数',
    `active_count_90d` Nullable(Int32) COMMENT '近90日打开应用次数',
    `active_count_180d` Nullable(Int32) COMMENT '近180日打开应用次数',
    `active_days_3d` Nullable(Int32) COMMENT '近3日活跃天数',
    `active_days_7d` Nullable(Int32) COMMENT '近7日活跃天数',
    `active_days_15d` Nullable(Int32) COMMENT '近15日活跃天数',
    `active_days_30d` Nullable(Int32) COMMENT '近30日活跃天数',
    `active_days_90d` Nullable(Int32) COMMENT '近90日活跃天数',
    `active_days_180d` Nullable(Int32) COMMENT '近180日活跃天数',
    `order_count_d` Nullable(Int32) COMMENT '当日订单数',
    `order_count_3d` Nullable(Int32) COMMENT '近3日订单数',
    `order_count_7d` Nullable(Int32) COMMENT '近7日订单数',
    `order_count_15d` Nullable(Int32) COMMENT '近15日订单数',
    `order_count_30d` Nullable(Int32) COMMENT '近30日订单数',
    `order_count_90d` Nullable(Int32) COMMENT '近90日订单数',
    `order_count_180d` Nullable(Int32) COMMENT '近180日订单数',
    `order_days_3d` Nullable(Int32) COMMENT '近3日订单天数',
    `order_days_7d` Nullable(Int32) COMMENT '近7日订单天数',
    `order_days_15d` Nullable(Int32) COMMENT '近15日订单天数',
    `order_days_30d` Nullable(Int32) COMMENT '近30日订单天数',
    `order_days_90d` Nullable(Int32) COMMENT '近90日订单天数',
    `order_days_180d` Nullable(Int32) COMMENT '近180日订单天数',
    `peccancy_count_3d` Nullable(Int32) COMMENT '近3日违规次数',
    `peccancy_count_7d` Nullable(Int32) COMMENT '近7日违规次数',
    `peccancy_count_15d` Nullable(Int32) COMMENT '近15日违规次数',
    `peccancy_count_30d` Nullable(Int32) COMMENT '近30日违规次数',
    `peccancy_count_90d` Nullable(Int32) COMMENT '近90日违规次数',
    `peccancy_count_180d` Nullable(Int32) COMMENT '近180日违规次数',
    `event_day` String COMMENT '操作日期'
)
ENGINE = ReplicatedMergeTree(
    '/clickhouse/tables/{layer}-{shard}/ads_user_portrait/ads_it_user_picture_da_local',
    '{replica}'
)
PARTITION BY toYYYYMMDD(cur_date)
ORDER BY user_id
SETTINGS storage_policy = 'default_hot_cold';
-- Distributed facade over the local shard table; rand() sharding key.
-- FIX: the local table is created in ads_user_portrait (see above) and the
--      submit script writes to jdbc:clickhouse://.../ads_user_portrait, but
--      this statement referenced the `test` database and would either fail
--      or point at the wrong table.
CREATE TABLE ads_user_portrait.ads_it_user_picture_da_all ON CLUSTER '{layer}'
AS ads_user_portrait.ads_it_user_picture_da_local
ENGINE = Distributed('{layer}', ads_user_portrait, ads_it_user_picture_da_local, rand());
-- 用户id上传表
-- Staging table for manually uploaded user-id cohorts (local replicated shard).
CREATE TABLE ads_user_portrait.user_upload_id_local ON CLUSTER '{layer}' (
    `group_id`    UInt64 COMMENT '分群id',
    `user_id`     UInt64 COMMENT '用户id',
    `create_time` String COMMENT '日期',
    `day`         String COMMENT '分区日期'
)
ENGINE = ReplicatedMergeTree(
    '/clickhouse/tables/{layer}-{shard}/ads_user_portrait/user_upload_id_local',
    '{replica}'
)
PARTITION BY day
ORDER BY group_id
SETTINGS storage_policy = 'default_hot_cold';
-- Cluster-wide distributed view over user_upload_id_local; rand() sharding key.
CREATE TABLE ads_user_portrait.user_upload_id_all ON CLUSTER '{layer}'
AS ads_user_portrait.user_upload_id_local
ENGINE = Distributed('{layer}', ads_user_portrait, user_upload_id_local, rand());
二.读hive表写clickhouse
## 利用spark写入
import org.apache.spark.sql.{SaveMode, SparkSession}

/**
 * Runs a Hive query and appends the result into a ClickHouse table over JDBC.
 *
 * args(0) — Hive SELECT statement to execute
 * args(1) — ClickHouse user
 * args(2) — ClickHouse password
 * args(3) — JDBC url including the target database
 * args(4) — target table name
 * args(5) — optional number of JDBC write partitions; defaults to "1"
 *           (use 1 when writing through a Distributed table)
 */
object UserPortraitDataToCK {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("UserPortraitDataToCK")
      .master("yarn")
      .getOrCreate()

    // FIX: the submit script historically passed only five arguments, so the
    // unconditional args(5) threw ArrayIndexOutOfBoundsException on every run.
    // Treat the partition count as optional with a safe default of 1.
    val numPartitions = if (args.length > 5) args(5) else "1"

    val df = spark.sql(args(0))

    val props = new java.util.Properties
    // Consistently use setProperty (the original mixed put/setProperty).
    props.setProperty("driver", "ru.yandex.clickhouse.ClickHouseDriver")
    props.setProperty("user", args(1))
    props.setProperty("password", args(2))

    df.write
      .mode(SaveMode.Append)
      .option("batchsize", "500000")
      .option("isolationLevel", "NONE") // ClickHouse has no transactions
      .option("numPartitions", numPartitions)
      .jdbc(args(3), args(4), props)

    spark.close()
  }
}
## args(0)---查询hive表的语句
## args(1)---连接clickhouse的用户
## args(2)---连接clickhouse的密码
## args(3)---clickhouse的url+库名
## args(4)---clickhouse目标表名
## args(5)---设置分区数(分布式ck下设置1)
三.提交spark任务shell
#!/bin/bash
# Submit the Hive -> ClickHouse daily export Spark job.
# Usage: ./submit.sh [YYYYMMDD]   (defaults to yesterday's date)

if [ -n "$1" ]; then
    Y_DATE=$1
else
    Y_DATE=$(date -d "1 day ago" +%Y%m%d)
fi

sql="select * from ads.ads_it_user_picture_da where event_day='${Y_DATE}'"
user="###"
password="###"
ckurl="jdbc:clickhouse://x.x.x.x:8123/ads_user_portrait"
cktable="ads_it_user_picture_da_all"
# FIX: the Scala job reads a sixth argument (numPartitions) which was never
# passed, crashing with ArrayIndexOutOfBoundsException. Per the job notes,
# use 1 partition when writing through a Distributed table.
partitions="1"

submit_spark_job() {
    spark-submit --jars clickhouse-jdbc-0.2.jar,guava-28.0-jre.jar \
        --class com.songguo.UserPortraitDataToCK \
        --master yarn --deploy-mode cluster \
        --driver-memory 4g \
        --executor-memory 6g \
        --executor-cores 2 \
        songguo.jar "${sql}" "${user}" "${password}" "${ckurl}" "${cktable}" "${partitions}"
}

echo "-----函数开始执行-----"
submit_spark_job &&
echo "-----函数执行完毕-----"