环境准备
Hive on Spark
spark一定是不带hive的spark
mapreduce.job.queuename=hive
Spark on Hive(Spark-Sql)
spark一般自带hive,只需要知道hive的元数据(metastore)
关系型数据库用sqoop直接从mysql到hdfs
业务数据
db_log vim application.properties 改日期 重置为0
#业务日期
mock.date=2020-06-27
#是否重置
mock.clear=0
[vanas@hadoop102 db_log]$ java -jar gmall2020-mock-db-2020-04-01.jar
添加
生产任务提交到hive队列里
[vanas@hadoop102 bin]$ vim mysql_to_hdfs.sh
-Dmapreduce.job.queuename=hive
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer -Dmapreduce.job.queuename=hive /origin_data/gmall/db/$1/$do_date
导入第一次
[vanas@hadoop102 bin]$ mysql_to_hdfs.sh first 2020-06-27
第二次导入
[vanas@hadoop102 bin]$ mysql_to_hdfs.sh all 2020-06-28
日志数据
日志数据和业务数据日期要保持一致 102,103都要改
cd applog
[vanas@hadoop102 applog]$ vim application.properties
[vanas@hadoop103 applog]$ vim application.properties
#业务日期
mock.date=2020-06-27
hadoop集群
zookeeper
Kafka
f1.sh 102,103到kafka
f2.sh 104 kafka 到hdfs
启动前把上述通道打通
启动
log.sh
/origin_data/gmall/log/topic_log/2020-06-27
tips
改系统时间,生产环境绝对禁止
1.清除3台集群kafka下的logs的所有文件
2.去zookeeper中,删除一个节点 /kafka
采集需要改2个时间:
1.系统时间 ,仅仅是为了hdfs的目录
2.application.properties的时间,是生成的数据时间
spark
tar -zxf spark-2.4.5-bin-without-hive.tgz -C /opt/module/
mv spark-2.4.5-bin-without-hive/ spark-2.4.5
vim /opt/module/hive/conf/spark-defaults.conf
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs://hadoop102:8020/spark-history
spark.executor.memory 1g
spark.driver.memory 1g
hadoop fs -mkdir /spark-history
hadoop fs -mkdir /spark-jars
hadoop fs -put /opt/module/spark-2.4.5/jars/* /spark-jars
JDK
idea和maven问题
idea
一切都改成1.8
Maven
在settings.xml中添加
<profile>
<id>jdk18</id>
<activation>
<activeByDefault>true</activeByDefault>
<jdk>1.8</jdk>
</activation>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<maven.compiler.compilerVersion>1.8</maven.compiler.compilerVersion>
</properties>
</profile>
hive错误
查看日志 /tmp/vanas/hive.log
yarn application -list
yarn application -kill xxxx
数仓搭建-ODS层
ODS层(用户行为数据)
hive
create database gmall;
use gmall;
建表
要求 分区表,能解析lzo文件,一个字段 字符串,外部表(防止误删)
表名:ods_log
建表语句:
-- ODS raw app-log table: one JSON record per row, kept verbatim in the single `line` column.
drop table if exists ods_log;
-- EXTERNAL: dropping the table keeps the HDFS files (protects against accidental deletion).
CREATE EXTERNAL TABLE ods_log (`line` string)
PARTITIONED BY (`dt` string)
STORED AS
-- LZO input format so compressed log files stay splittable (requires .index files, built separately).
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_log';
数据装载
load data inpath '/origin_data/gmall/log/topic_log/2020-06-27' overwrite into table ods_log partition(dt='2020-06-27');
建索引
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer -Dmapreduce.job.queuename=hive /warehouse/gmall/ods/ods_log/dt=2020-06-27
装载脚本
确定导入哪天的数据?
昨天的数据
指定具体日期的数据
“$APP” => ${APP} 这样更好
“$sql” 引号不可以省略
[vanas@hadoop102 bin]$ vim hdfs_to_ods_log.sh
[vanas@hadoop102 bin]$ chmod +x hdfs_to_ods_log.sh
#!/bin/bash
# hdfs_to_ods_log.sh: load one day of raw app-log data from HDFS into the
# ODS layer, then rebuild the LZO index on the new partition.
# Usage: hdfs_to_ods_log.sh [yyyy-mm-dd]
APP=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop-3.1.3/bin/hadoop
# Use the date passed as $1; when omitted, default to yesterday.
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo ================== 日志日期为 $do_date ==================
# OVERWRITE makes the load idempotent: rerunning the script for the same day
# replaces the partition instead of appending duplicate data (matches the
# behavior of the business-data load script).
sql="
load data inpath '/origin_data/gmall/log/topic_log/$do_date' overwrite into table ${APP}.ods_log partition(dt='$do_date');
"
$hive -e "$sql"
# DistributedLzoIndexer writes .index files so the LZO partition is splittable by MR jobs.
$hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer -Dmapreduce.job.queuename=hive /warehouse/gmall/ods/ods_log/dt=$do_date
启动脚本
[vanas@hadoop102 bin]$ ./hdfs_to_ods_log.sh 2020-06-27
hive (gmall)> select * from ods_log limit 10;
单引号 双引号 反引号
要把xxx 当做一个整体 会用到引号
[vanas@hadoop102 bin]$ str='hello world'
[vanas@hadoop102 bin]$ echo $str
hello world
[vanas@hadoop102 bin]$ str="hello world"
[vanas@hadoop102 bin]$ echo $str
hello world
当里面有变量时
双引会解析里面的变量
单引不会解析 原样显示
[vanas@hadoop102 bin]$ str="hello $HADOOP_HOME"
[vanas@hadoop102 bin]$ echo $str
hello /opt/module/hadoop-3.1.3
[vanas@hadoop102 bin]$ str='hello $HADOOP_HOME'
[vanas@hadoop102 bin]$ echo $str
hello $HADOOP_HOME
当需要输出结果时 使用反引
如果引号嵌套使用,则以最外层的算
ODS层(业务数据)
建表的分隔符要与sqoop里的分隔符保持一致
注意2个特殊表 不需要分区
共24张表
--3.3.1 订单表(增量及更新)
--hive (gmall)>
drop table if exists ods_order_info;
create external table ods_order_info (
`id` string COMMENT '订单号',
`final_total_amount` decimal(16,2) COMMENT '订单金额',
`order_status` string COMMENT '订单状态',
`user_id` string COMMENT '用户id',
`out_trade_no` string COMMENT '支付流水号',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '操作时间',
`province_id` string COMMENT '省份ID',
`benefit_reduce_amount` decimal(16,2) COMMENT '优惠金额',
`original_total_amount` decimal(16,2) COMMENT '原价金额',
`feight_fee` decimal(16,2) COMMENT '运费'
) COMMENT '订单表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_order_info/';
--3.3.2 订单详情表(增量)
--hive (gmall)>
drop table if exists ods_order_detail;
create external table ods_order_detail(
`id` string COMMENT '订单编号',
`order_id` string COMMENT '订单号',
`user_id` string COMMENT '用户id',
`sku_id` string COMMENT '商品id',
`sku_name` string COMMENT '商品名称',
`order_price` decimal(16,2) COMMENT '商品价格',
`sku_num` bigint COMMENT '商品数量',
`create_time` string COMMENT '创建时间',
`source_type` string COMMENT '来源类型',
`source_id` string COMMENT '来源编号'
) COMMENT '订单详情表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_order_detail/';
--3.3.3 SKU商品表(全量)
--hive (gmall)>
drop table if exists ods_sku_info;
create external table ods_sku_info(
`id` string COMMENT 'skuId',
`spu_id` string COMMENT 'spuid',
`price` decimal(16,2) COMMENT '价格',
`sku_name` string COMMENT '商品名称',
`sku_desc` string COMMENT '商品描述',
`weight` string COMMENT '重量',
`tm_id` string COMMENT '品牌id',
`category3_id` string COMMENT '品类id',
`create_time` string COMMENT '创建时间'
) COMMENT 'SKU商品表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_sku_info/';
--3.3.4 用户表(增量及更新)
--hive (gmall)>
drop table if exists ods_user_info;
create external table ods_user_info(
`id` string COMMENT '用户id',
`name` string COMMENT '姓名',
`birthday` string COMMENT '生日',
`gender` string COMMENT '性别',
`email` string COMMENT '邮箱',
`user_level` string COMMENT '用户等级',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '操作时间'
) COMMENT '用户表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_user_info/';
--3.3.5 商品一级分类表(全量)
--hive (gmall)>
drop table if exists ods_base_category1;
create external table ods_base_category1(
`id` string COMMENT 'id',
`name` string COMMENT '名称'
) COMMENT '商品一级分类表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_category1/';
--3.3.6 商品二级分类表(全量)
--hive (gmall)>
-- Level-2 product category dimension (full daily snapshot).
drop table if exists ods_base_category2;
create external table ods_base_category2(
    -- backticks added on category1_id for consistency with every other column definition
    `id` string COMMENT 'id',
    `name` string COMMENT '名称',
    `category1_id` string COMMENT '一级品类id'
) COMMENT '商品二级分类表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_category2/';
--3.3.7 商品三级分类表(全量)
--hive (gmall)>
-- Level-3 product category dimension (full daily snapshot).
drop table if exists ods_base_category3;
create external table ods_base_category3(
    -- backticks added on category2_id for consistency with every other column definition
    `id` string COMMENT 'id',
    `name` string COMMENT '名称',
    `category2_id` string COMMENT '二级品类id'
) COMMENT '商品三级分类表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_category3/';
--3.3.8 支付流水表(增量)
--hive (gmall)>
drop table if exists ods_payment_info;
create external table ods_payment_info(
`id` bigint COMMENT '编号',
`out_trade_no` string COMMENT '对外业务编号',
`order_id` string COMMENT '订单编号',
`user_id` string COMMENT '用户编号',
`alipay_trade_no` string COMMENT '支付宝交易流水编号',
`total_amount` decimal(16,2) COMMENT '支付金额',
`subject` string COMMENT '交易内容',
`payment_type` string COMMENT '支付类型',
`payment_time` string COMMENT '支付时间'
) COMMENT '支付流水表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_payment_info/';
--3.3.9 省份表(特殊)
--hive (gmall)>
drop table if exists ods_base_province;
create external table ods_base_province (
`id` bigint COMMENT '编号',
`name` string COMMENT '省份名称',
`region_id` string COMMENT '地区ID',
`area_code` string COMMENT '地区编码',
`iso_code` string COMMENT 'iso编码,superset可视化使用'
) COMMENT '省份表'
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_province/';
--3.3.10 地区表(特殊)
--hive (gmall)>
drop table if exists ods_base_region;
create external table ods_base_region (
`id` string COMMENT '编号',
`region_name` string COMMENT '地区名称'
) COMMENT '地区表'
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_region/';
--3.3.11 品牌表(全量)
--hive (gmall)>
drop table if exists ods_base_trademark;
create external table ods_base_trademark (
`tm_id` string COMMENT '编号',
`tm_name` string COMMENT '品牌名称'
) COMMENT '品牌表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_trademark/';
--3.3.12 订单状态表(增量)
--hive (gmall)>
drop table if exists ods_order_status_log;
create external table ods_order_status_log (
`id` string COMMENT '编号',
`order_id` string COMMENT '订单ID',
`order_status` string COMMENT '订单状态',
`operate_time` string COMMENT '修改时间'
) COMMENT '订单状态表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_order_status_log/';
--3.3.13 SPU商品表(全量)
--hive (gmall)>
drop table if exists ods_spu_info;
create external table ods_spu_info(
`id` string COMMENT 'spuid',
`spu_name` string COMMENT 'spu名称',
`category3_id` string COMMENT '品类id',
`tm_id` string COMMENT '品牌id'
) COMMENT 'SPU商品表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_spu_info/';
--3.3.14 商品评论表(增量)
--hive (gmall)>
drop table if exists ods_comment_info;
create external table ods_comment_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户ID',
`sku_id` string COMMENT '商品sku',
`spu_id` string COMMENT '商品spu',
`order_id` string COMMENT '订单ID',
`appraise` string COMMENT '评价',
`create_time` string COMMENT '评价时间'
) COMMENT '商品评论表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_comment_info/';
--3.3.15 退单表(增量)
--hive (gmall)>
drop table if exists ods_order_refund_info;
create external table ods_order_refund_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户ID',
`order_id` string COMMENT '订单ID',
`sku_id` string COMMENT '商品ID',
`refund_type` string COMMENT '退款类型',
`refund_num` bigint COMMENT '退款件数',
`refund_amount` decimal(16,2) COMMENT '退款金额',
`refund_reason_type` string COMMENT '退款原因类型',
`create_time` string COMMENT '退款时间'
) COMMENT '退单表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_order_refund_info/';
--3.3.16 加购表(全量)
--hive (gmall)>
drop table if exists ods_cart_info;
create external table ods_cart_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户id',
`sku_id` string COMMENT 'skuid',
`cart_price` decimal(16,2) COMMENT '放入购物车时价格',
`sku_num` bigint COMMENT '数量',
`sku_name` string COMMENT 'sku名称 (冗余)',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '修改时间',
`is_ordered` string COMMENT '是否已经下单',
`order_time` string COMMENT '下单时间',
`source_type` string COMMENT '来源类型',
`source_id` string COMMENT '来源编号'
) COMMENT '加购表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_cart_info/';
--3.3.17 商品收藏表(全量)
--hive (gmall)>
drop table if exists ods_favor_info;
create external table ods_favor_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户id',
`sku_id` string COMMENT 'skuid',
`spu_id` string COMMENT 'spuid',
`is_cancel` string COMMENT '是否取消',
`create_time` string COMMENT '收藏时间',
`cancel_time` string COMMENT '取消时间'
) COMMENT '商品收藏表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_favor_info/';
--3.3.18 优惠券领用表(新增及变化)
--hive (gmall)>
-- Coupon usage table (new and changed records per day).
drop table if exists ods_coupon_use;
create external table ods_coupon_use(
    `id` string COMMENT '编号',
    `coupon_id` string COMMENT '优惠券ID',
    -- COMMENT metadata fixed: user_id/order_id previously carried copy-pasted
    -- 'skuid'/'spuid' descriptions from another table.
    `user_id` string COMMENT '用户ID',
    `order_id` string COMMENT '订单ID',
    `coupon_status` string COMMENT '优惠券状态',
    `get_time` string COMMENT '领取时间',
    `using_time` string COMMENT '使用时间(下单)',
    `used_time` string COMMENT '使用时间(支付)'
) COMMENT '优惠券领用表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_coupon_use/';
--3.3.19 优惠券表(全量)
--hive (gmall)>
drop table if exists ods_coupon_info;
create external table ods_coupon_info(
`id` string COMMENT '购物券编号',
`coupon_name` string COMMENT '购物券名称',
`coupon_type` string COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
`condition_amount` decimal(16,2) COMMENT '满额数',
`condition_num` bigint COMMENT '满件数',
`activity_id` string COMMENT '活动编号',
`benefit_amount` decimal(16,2) COMMENT '减金额',
`benefit_discount` decimal(16,2) COMMENT '折扣',
`create_time` string COMMENT '创建时间',
`range_type` string COMMENT '范围类型 1、商品 2、品类 3、品牌',
`spu_id` string COMMENT '商品id',
`tm_id` string COMMENT '品牌id',
`category3_id` string COMMENT '品类id',
`limit_num` bigint COMMENT '最多领用次数',
`operate_time` string COMMENT '修改时间',
`expire_time` string COMMENT '过期时间'
) COMMENT '优惠券表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_coupon_info/';
--3.3.20 活动表(全量)
--hive (gmall)>
drop table if exists ods_activity_info;
create external table ods_activity_info(
`id` string COMMENT '编号',
`activity_name` string COMMENT '活动名称',
`activity_type` string COMMENT '活动类型',
`start_time` string COMMENT '开始时间',
`end_time` string COMMENT '结束时间',
`create_time` string COMMENT '创建时间'
) COMMENT '活动表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_activity_info/';
--3.3.21 活动订单关联表(增量)
--hive (gmall)>
drop table if exists ods_activity_order;
create external table ods_activity_order(
`id` string COMMENT '编号',
`activity_id` string COMMENT '活动ID',
`order_id` string COMMENT '订单id',
`create_time` string COMMENT '下单时间'
) COMMENT '活动订单关联表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_activity_order/';
--3.3.22 优惠规则表(全量)
--hive (gmall)>
drop table if exists ods_activity_rule;
create external table ods_activity_rule(
`id` string COMMENT '编号',
`activity_id` string COMMENT '活动ID',
`condition_amount` decimal(16,2) COMMENT '满减金额',
`condition_num` bigint COMMENT '满减件数',
`benefit_amount` decimal(16,2) COMMENT '优惠金额',
`benefit_discount` decimal(16,2) COMMENT '优惠折扣',
`benefit_level` string COMMENT '优惠级别'
) COMMENT '优惠规则表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_activity_rule/';
--3.3.23 编码字典表(全量)
--hive (gmall)>
drop table if exists ods_base_dic;
create external table ods_base_dic(
`dic_code` string COMMENT '编号',
`dic_name` string COMMENT '编码名称',
`parent_code` string COMMENT '父编码',
`create_time` string COMMENT '创建日期',
`operate_time` string COMMENT '操作日期'
) COMMENT '编码字典表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_dic/';
导入脚本
单纯load不需要建索引
全量表
load data ... into ...
新增表
新增和变化
特殊
为什么日志需要建索引,业务数据不需要索引?
1.日志flume直接采集过来的,没有索引,所以导入到ods的时候需要手动建索引
2.业务数据从mysql=>hdfs的时候,已经索引了,所以就不需要建了
Sql1 是21张表没有省份和地区表
Sql2 是省份表和地区表
[vanas@hadoop102 bin]$ vim hdfs_to_ods_db.sh
[vanas@hadoop102 bin]$ chmod +x hdfs_to_ods_db.sh
#!/bin/bash
# hdfs_to_ods_db.sh: load one day of Sqoop-imported business data from HDFS
# into the gmall ODS tables.
# Usage: hdfs_to_ods_db.sh first|all [yyyy-mm-dd]
#   first -> initial load: the 21 partitioned tables plus the two
#            non-partitioned dimension tables (province, region)
#   all   -> daily load: only the 21 partitioned tables
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date given as $2; when omitted, default to yesterday.
if [ -n "$2" ] ;then
do_date=$2
else
do_date=`date -d "-1 day" +%F`
fi
# sql1: the 21 daily-partitioned ODS tables. OVERWRITE makes reruns idempotent
# (the day's partition is replaced, not appended to).
sql1="
load data inpath '/origin_data/$APP/db/order_info/$do_date' OVERWRITE into table ${APP}.ods_order_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_detail/$do_date' OVERWRITE into table ${APP}.ods_order_detail partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/sku_info/$do_date' OVERWRITE into table ${APP}.ods_sku_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/user_info/$do_date' OVERWRITE into table ${APP}.ods_user_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/payment_info/$do_date' OVERWRITE into table ${APP}.ods_payment_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category1/$do_date' OVERWRITE into table ${APP}.ods_base_category1 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category2/$do_date' OVERWRITE into table ${APP}.ods_base_category2 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category3/$do_date' OVERWRITE into table ${APP}.ods_base_category3 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_trademark/$do_date' OVERWRITE into table ${APP}.ods_base_trademark partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/activity_info/$do_date' OVERWRITE into table ${APP}.ods_activity_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/activity_order/$do_date' OVERWRITE into table ${APP}.ods_activity_order partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/cart_info/$do_date' OVERWRITE into table ${APP}.ods_cart_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/comment_info/$do_date' OVERWRITE into table ${APP}.ods_comment_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/coupon_info/$do_date' OVERWRITE into table ${APP}.ods_coupon_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/coupon_use/$do_date' OVERWRITE into table ${APP}.ods_coupon_use partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/favor_info/$do_date' OVERWRITE into table ${APP}.ods_favor_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_refund_info/$do_date' OVERWRITE into table ${APP}.ods_order_refund_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_status_log/$do_date' OVERWRITE into table ${APP}.ods_order_status_log partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/spu_info/$do_date' OVERWRITE into table ${APP}.ods_spu_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/activity_rule/$do_date' OVERWRITE into table ${APP}.ods_activity_rule partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_dic/$do_date' OVERWRITE into table ${APP}.ods_base_dic partition(dt='$do_date');
"
# sql2: province and region are static dimension tables -> no dt partition,
# loaded only once on the initial ("first") run.
sql2="
load data inpath '/origin_data/$APP/db/base_province/$do_date' OVERWRITE into table ${APP}.ods_base_province;
load data inpath '/origin_data/$APP/db/base_region/$do_date' OVERWRITE into table ${APP}.ods_base_region;
"
case $1 in
"first"){
$hive -e "$sql1$sql2"
};;
"all"){
$hive -e "$sql1"
};;
*){
echo "你执行的姿势不对,换个姿势再来"
};;
esac
导入
装载数据不走mr
[vanas@hadoop102 bin]$ hdfs_to_ods_db.sh first 2020-06-27
之后再导入
[vanas@hadoop102 bin]$ hdfs_to_ods_db.sh all 2020-06-28
DWD层
19张 临时表2张 共21 个
用户行为日志
数据的分组:
1.启动日志
2.事件日志
把不同的事件放在不同的表中
启动日志
插入数据:
数据来源:ods层的ods_log
insert overwrite table dwd_start_log partition(dt='2020-06-27')
select
get_json_object(line,'$.common.ar'),
get_json_object(line,'$.common.ba'),
...
from ods_log
where dt='2020-06-27'
and get_json_object(line,'$.start') is not null
如何在sql中解析json
get_json_object('json字符串','$.common.ar')
{"name":"lisi","age":10,"girls":["zhiling","fengjie"]}
get_json_object('{"name":"lisi","age":10,"girls":["zhiling","fengjie"]}','$.age');
get_json_object('{"name":"lisi","age":10,"girls":["zhiling","fengjie"]}','$.girls[0]');
get_json_object函数使用
取第一个json对象
hive (gmall)> select get_json_object('[{"name":"大郎","sex":"男","age":"25"},{"name":"西门庆","sex":"男","age":"47"}]','$[0]');
OK
_c0
{"name":"大郎","sex":"男","age":"25"}
Time taken: 0.524 seconds, Fetched: 1 row(s)
取出第一个json的age字段的值
hive (gmall)> SELECT get_json_object('[{"name":"大郎","sex":"男","age":"25"},{"name":"西门庆","sex":"男","age":"47"}]',"$[0].age");
OK
_c0
25
页面埋点日志
启动日志
列式存储 + 压缩
parquet本身可以切片 所以不需要在建索引
建表
drop table if exists dwd_start_log;
CREATE EXTERNAL TABLE dwd_start_log(
`area_code` string,
`brand` string,
`channel` string,
`model` string,
`mid_id` string,
`os` string,
`user_id` string,
`version_code` string,
`entry` string,
`loading_time` bigint,
`open_ad_id` string,
`open_ad_ms` bigint,
`open_ad_skip_ms` bigint,
`ts` bigint
)
PARTITIONED BY (dt string)
stored as parquet
LOCATION '/warehouse/gmall/dwd/dwd_start_log'
TBLPROPERTIES('parquet.compression'='lzo');
装载数据
SET mapreduce.job.queuename=hive;
当读取的一张表有lzo文件时 并且lzo文件建了索引 需要设置
不然会把索引文件当成输入文件
默认:hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_start_log partition(dt='2020-06-27')
select
get_json_object(line,'$.common.ar'),
get_json_object(line,'$.common.ba'),
get_json_object(line,'$.common.ch'),
get_json_object(line,'$.common.md'),
get_json_object(line,'$.common.mid'),
get_json_object(line,'$.common.os'),
get_json_object(line,'$.common.uid'),
get_json_object(line,'$.common.vc'),
get_json_object(line,'$.start.entry'),
get_json_object(line,'$.start.loading_time'),
get_json_object(line,'$.start.open_ad_id'),
get_json_object(line,'$.start.open_ad_ms'),
get_json_object(line,'$.start.open_ad_skip_ms'),
get_json_object(line,'$.ts')
from ods_log
where dt='2020-06-27'
and get_json_object(line,'$.start') is not null;
select * from dwd_start_log;
页面日志
建表
drop table if exists dwd_page_log;
CREATE EXTERNAL TABLE dwd_page_log(
`area_code` string,
`brand` string,
`channel` string,
`model` string,
`mid_id` string,
`os` string,
`user_id` string,
`version_code` string,
`during_time` bigint,
`page_item` string,
`page_item_type` string,
`last_page_id` string,
`page_id` string,
`source_type` string,
`ts` bigint
)
PARTITIONED BY (dt string)
stored as parquet
LOCATION '/warehouse/gmall/dwd/dwd_page_log'
TBLPROPERTIES('parquet.compression'='lzo');
数据导入
insert overwrite table dwd_page_log partition(dt='2020-06-27')
select
get_json_object(line,'$.common.ar'),
get_json_object(line,'$.common.ba'),
get_json_object(line,'$.common.ch'),
get_json_object(line,'$.common.md'),
get_json_object(line,'$.common.mid'),
get_json_object(line,'$.common.os'),
get_json_object(line,'$.common.uid'),
get_json_object(line,'$.common.vc'),
get_json_object(line,'$.page.during_time'),
get_json_object(line,'$.page.item'),
get_json_object(line,'$.page.item_type'),
get_json_object(line,'$.page.last_page_id'),
get_json_object(line,'$.page.page_id'),
get_json_object(line,'$.page.sourceType'),
get_json_object(line,'$.ts')
from ods_log
where dt = '2020-06-27'
and get_json_object(line,'$.page') is not null
动作日志
UDTF
package com.vanas.gmall.hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import java.util.ArrayList;
import java.util.List;
/**
* @author Vanas
* @create 2020-06-28 8:45 下午
*/
/**
 * Hive UDTF that explodes a JSON-array string into one row per array element,
 * e.g. '[{"a":1},{"b":2}]' -> two rows: '{"a":1}' and '{"b":2}'.
 */
public class ExplodeJsonArray extends GenericUDTF {
@Override
public void close() throws HiveException {
// No resources to release.
}
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
// Validate the call site: argOIs describes the arguments; each StructField
// carries an ObjectInspector reporting that argument's type name.
List<? extends StructField> allStructFieldRefs = argOIs.getAllStructFieldRefs();
if (allStructFieldRefs.size() != 1) {
throw new UDFArgumentException("explode_json_array函数的参数个数只能为1");
}
if (!"string".equals(allStructFieldRefs.get(0).getFieldObjectInspector().getTypeName())) {
throw new UDFArgumentException("explode_json_array函数的参数类型只能为string");
}
// Declare the output schema: a single string column named "item".
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("item");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);
}
@Override
public void process(Object[] args) throws HiveException {
String jsonStr = args[0].toString();
JSONArray jsonArray = new JSONArray(jsonStr);
for (int i = 0; i < jsonArray.length(); i++) {
String[] result = new String[1];
result[0] = jsonArray.getString(i);
// forward() takes an array (or list) because a UDTF may emit several columns per row;
// here each output row has exactly one column.
forward(result);
}
}
}
package com.atguigu.hive.udtf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import java.util.ArrayList;
import java.util.List;
/**
* udtf: 一进多出
* 1进: [{}, {}]
* 多出: "{}", "{}"
* @Author lzc
* @Date 2020/6/29 14:26
*/
@Description(name = "explode_json_array", value = "explode json array ....")
public class ExplodeJsonArray extends GenericUDTF {
/*
 Responsibilities of initialize():
 1. validate the input arguments
 2. return an object inspector describing the expected output schema
*/
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
// 1. Validate the input.
// 1.1 Fetch the argument descriptors.
List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
if(inputFields.size() != 1)
throw new UDFArgumentException("函数 explode_json_array 需要正好1个参数");
ObjectInspector inspector = inputFields.get(0).getFieldObjectInspector();
if(inspector.getCategory() != ObjectInspector.Category.PRIMITIVE || !inspector.getTypeName().equals("string")){
throw new UDFArgumentException("函数 explode_json_array 参数类型不匹配, 必须是一个字符串");
}
// 2. Return the inspector for the output schema: one string column named "action".
List<String> names = new ArrayList<>();
names.add("action");
List<ObjectInspector> inspectors = new ArrayList<>();
inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(names, inspectors);
}
/*
 Process one input value of the form [{}, {}, ...],
 emitting one output row per array element.
*/
@Override
public void process(Object[] args) throws HiveException {
String jsonArrayString = args[0].toString();
JSONArray jsonArray = new JSONArray(jsonArrayString);
for (int i = 0; i < jsonArray.length(); i++) {
String col = jsonArray.getString(i);
String[] cols = new String[1];
cols[0] = col;
forward(cols); // forward() takes an array because a UDTF may explode into multiple columns
}
}
/**
 * Close resources.
 * Nothing to do for this UDTF.
 */
@Override
public void close() throws HiveException {
}
}
打包
上传到hdfs
/user/hive/jars
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pIXYbmMU-1600789772846)(/Users/vanas/Library/Application Support/typora-user-images/截屏2020-06-28下午9.56.26.png)]
create function explode_json_array as 'com.vanas.gmall.hive.udtf.ExplodeJsonArray' using jar 'hdfs://hadoop102:8020/user/hive/jars/hive-0213-1.0-SNAPSHOT.jar';
Lateral View关于炸裂函数复习
实际上查询的表是一个侧写过的虚拟表,不再是test表了
虚拟表会在原有表的基础上增加若干列,如array,则是1列,map为2列(K和V)
c1 c2 c3 c4(array)
1001 aaa bbb [a,b,c]
1002 ccc ddd [a,b]
1001 aaa bbb [c,d]
c5:多出来的字段名
tmp的含义:可以把c5这一列看成一个表,tmp就是这个中间表的表名
在查询时考虑当c5列名与前面表的字段名重复时,可以用 tmp.c5来区分
#个人理解:
侧写的过程理解为 以复杂字段为关联条件的两表join
其中一张表为主表,另一张表为复杂字段内的元素为最小粒度的表
select
from tbl lateral view explode(c4) tmp as c5,c6
tbl.c1 tbl.c2 tbl.c3 tbl.c4(array) tmp.c5
1001 aaa bbb [a,b,c] a
1001 aaa bbb [a,b,c] b
1001 aaa bbb [a,b,c] c
1002 ccc ddd [a,b] a
1002 ccc ddd [a,b] b
1001 aaa bbb [c,d] c
1001 aaa bbb [c,d] d
建表
drop table if exists dwd_action_log;
CREATE EXTERNAL TABLE dwd_action_log(
`area_code` string,
`brand` string,
`channel` string,
`model` string,
`mid_id` string,
`os` string,
`user_id` string,
`version_code` string,
`during_time` bigint,
`page_item` string,
`page_item_type` string,
`last_page_id` string,
`page_id` string,
`source_type` string,
`action_id` string,
`item` string,
`item_type` string,
`ts` bigint
)
PARTITIONED BY (dt string)
stored as parquet
LOCATION '/warehouse/gmall/dwd/dwd_action_log'
TBLPROPERTIES('parquet.compression'='lzo');
导入
-- Load action log: common/page fields come from the whole line; action fields come from
-- each element of the "actions" JSON array, exploded by the custom UDTF explode_json_array.
-- HiveInputFormat is required so the LZO index files under ods_log are honored
-- (same setting the sibling dwd_display_log load uses).
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_action_log partition(dt='2020-06-27')
select
get_json_object(line,'$.common.ar'),
get_json_object(line,'$.common.ba'),
get_json_object(line,'$.common.ch'),
get_json_object(line,'$.common.md'),
get_json_object(line,'$.common.mid'),
get_json_object(line,'$.common.os'),
get_json_object(line,'$.common.uid'),
get_json_object(line,'$.common.vc'),
get_json_object(line,'$.page.during_time'),
get_json_object(line,'$.page.item'),
get_json_object(line,'$.page.item_type'),
get_json_object(line,'$.page.last_page_id'),
get_json_object(line,'$.page.page_id'),
get_json_object(line,'$.page.sourceType'),
get_json_object(action,'$.action_id'),
get_json_object(action,'$.item'),
get_json_object(action,'$.item_type'),
get_json_object(action,'$.ts')
from ods_log lateral view explode_json_array(get_json_object(line,'$.actions')) tmp as action
where dt = '2020-06-27'
and get_json_object(line,'$.actions') is not null;
-- quick sanity check after the load
select * from dwd_action_log limit 10;
曝光日志
同上述动作日志,也需要炸裂开
建表
-- DWD display (impression) log: one row per displayed item, exploded from the
-- "displays" JSON array. Same pattern as dwd_action_log.
drop table if exists dwd_display_log;
CREATE EXTERNAL TABLE dwd_display_log(
`area_code` string,
`brand` string,
`channel` string,
`model` string,
`mid_id` string,
`os` string,
`user_id` string,
`version_code` string,
`during_time` bigint,
`page_item` string,
`page_item_type` string,
`last_page_id` string,
`page_id` string,
`source_type` string,
`ts` bigint,
`display_type` string,
`item` string,
`item_type` string,
`order` bigint
)
PARTITIONED BY (dt string)
stored as parquet
LOCATION '/warehouse/gmall/dwd/dwd_display_log'
TBLPROPERTIES('parquet.compression'='lzo');
导入
-- HiveInputFormat makes Hive honor the LZO index files of ods_log.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_display_log partition(dt='2020-06-27')
select
get_json_object(line,'$.common.ar'),
get_json_object(line,'$.common.ba'),
get_json_object(line,'$.common.ch'),
get_json_object(line,'$.common.md'),
get_json_object(line,'$.common.mid'),
get_json_object(line,'$.common.os'),
get_json_object(line,'$.common.uid'),
get_json_object(line,'$.common.vc'),
get_json_object(line,'$.page.during_time'),
get_json_object(line,'$.page.item'),
get_json_object(line,'$.page.item_type'),
get_json_object(line,'$.page.last_page_id'),
get_json_object(line,'$.page.page_id'),
get_json_object(line,'$.page.sourceType'),
get_json_object(line,'$.ts'),
-- display fields are read from the exploded array element, not the whole line
get_json_object(displays,'$.displayType'),
get_json_object(displays,'$.item'),
get_json_object(displays,'$.item_type'),
get_json_object(displays,'$.order')
from ods_log lateral view explode_json_array(get_json_object(line,'$.displays')) tmp as displays
where dt='2020-06-27'
and get_json_object(line,'$.displays') is not null;
错误日志
One to One
udf知识补充
知识补充
简单UDF 已过时,支持基本数据类型
package com.vanas.gmall.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
/**
 * Demo of the legacy "simple" UDF API (deprecated): Hive resolves the
 * evaluate(...) method by reflection. It only supports primitive (simple)
 * data types — it cannot handle complex types such as array/map/struct.
 * @author Vanas
 * @create 2020-06-28 11:07 PM
 */
public class TestSimpleUDF extends UDF {
// Called once per input row; stub implementation returns an empty string.
public String evaluate(String arg) {
return "";
}
}
通用UDF,支持复杂数据类型
package com.vanas.gmall.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
/**
 * Skeleton of the modern GenericUDF API, which supports complex data types
 * (array/map/struct) via ObjectInspectors. All three methods are stubs.
 * @author Vanas
 * @create 2020-06-28 11:09 PM
 */
public class TestGenericUDF extends GenericUDF {
// Validates argument types once per query and returns the output type inspector.
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
return null;
}
// Called once per row with lazily-evaluated arguments.
public Object evaluate(DeferredObject[] arguments) throws HiveException {
return null;
}
// Text shown in EXPLAIN output.
public String getDisplayString(String[] children) {
return null;
}
}
如何定义一个udtf:
1.继承一个类
2.重写方法
使用:
1.打包成jar
2.放到hdfs上(copy到hive的lib目录下)
3.创建永久的hive的函数
UDF
如果操作的类型是简单类型,一般选择UDF
如果要操作的是复杂类型,如 array、struct(结构体)、map、list,
一般选择GenericUDF
package com.vanas.gmall.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONObject;
import java.util.ArrayList;
/**
 * GenericUDF converting a JSON-array string into a Hive array&lt;struct&gt;.
 * Call shape: json_array_to_struct_array(json_array, key1..keyN, 'name1:type1'..'nameN:typeN')
 * — N key names picking fields out of each JSON object, followed by N
 * constant 'field:type' specs defining the output struct schema.
 * @author Vanas
 * @create 2020-06-28 11:13 PM
 */
public class JsonArrayToStructArray extends GenericUDF {
/**
 * Validates arguments and builds the output ObjectInspector.
 * Runs once per query (not per row).
 */
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
// Why pass the key names at all? They select which JSON fields to extract and
// pair up positionally with the 'field:type' specs in the second half.
// e.g. json_array_to_struct_array(json_array,'id','name','age','id:string','name:string','age:int')
if (arguments.length < 3) {
throw new UDFArgumentException("json_array_to_struct_array至少需要3个参数");
}
for (int i = 0; i < arguments.length; i++) {
// Every argument must be a string; 'id:string' etc. are constant strings,
// while arguments[i] is the ObjectInspector describing that argument.
if (!"string".equals(arguments[i].getTypeName())) {
throw new UDFArgumentException("json_array_to_struct_array的第" + (i + 1) + "个参数类型应为String");
}
}
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
// With 1 json arg + N keys + N specs, the specs start at index (length + 1) / 2.
for (int i = (arguments.length + 1) / 2; i < arguments.length; i++) {
if (!(arguments[i] instanceof ConstantObjectInspector)) {
throw new UDFArgumentException("json_array_to_struct_array的第" + (i + 1) + "个参数类型应为string常量");
}
// Split 'field:type' into the struct field name and its Hive type.
String field = ((ConstantObjectInspector) arguments[i]).getWritableConstantValue().toString();
String[] split = field.split(":");
fieldNames.add(split[0]);
switch (split[1]) {
case "string":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
break;
case "boolean":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
break;
case "tinyint":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaByteObjectInspector);
break;
case "smallint":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaShortObjectInspector);
break;
case "int":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
break;
case "bigint":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
break;
case "float":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaFloatObjectInspector);
break;
case "double":
fieldOIs.add(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector);
break;
default:
throw new UDFArgumentException("json_array_to_struct_array 不支持" + split[1] + "类型");
}
}
// Output type: array<struct<...>> built from the collected names/types.
return ObjectInspectorFactory.getStandardListObjectInspector(ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs));
}
/**
 * Per-row evaluation. Hive's standard object representations:
 * array -> Java List, map -> Java Map, struct -> Java List of field values,
 * so array<struct> is returned as a List of Lists.
 */
public Object evaluate(DeferredObject[] arguments) throws HiveException {
//array: a Java array or List
//map: a Java Map
//struct: a List of field values
//array<struct>: List<List>
if (arguments[0].get() == null) {
return null;
}
String strArray = arguments[0].get().toString();
JSONArray jsonArray = new JSONArray(strArray);
ArrayList<Object> array = new ArrayList<>();
for (int i = 0; i < jsonArray.length(); i++) {
ArrayList<Object> struct = new ArrayList<>();
JSONObject jsonObject = jsonArray.getJSONObject(i);
// Keys occupy argument positions 1 .. (length+1)/2 - 1.
for (int j = 1; j < (arguments.length + 1) / 2; j++) {
String key = arguments[j].get().toString();
if (jsonObject.has(key)) {
// NOTE(review): org.json returns Integer for small numbers — for a declared
// bigint field Hive expects Long; verify against real 'ts' values.
struct.add(jsonObject.get(key));
} else {
struct.add(null);
}
}
array.add(struct);
}
return array;
}
// Text shown in EXPLAIN plans.
public String getDisplayString(String[] children) {
return getStandardDisplayString("json_array_to_struct_array", children);
}
}
打包
放入hdfs中
/user/hive/jars
创建函数
-- Register the permanent UDF json_array_to_struct_array (JSON array string -> array<struct>).
create function json_array_to_struct_array as 'com.vanas.gmall.hive.udf.JsonArrayToStructArray' using jar 'hdfs://hadoop102:8020/user/hive/jars/hive-0213-1.0-SNAPSHOT.jar';
建表
-- DWD error log: one row per log line (1:1, no explode); the nested "actions"
-- and "displays" arrays are kept as array<struct> columns built by the UDF.
drop table if exists dwd_error_log;
CREATE EXTERNAL TABLE dwd_error_log(
`area_code` string,
`brand` string,
`channel` string,
`model` string,
`mid_id` string,
`os` string,
`user_id` string,
`version_code` string,
`page_item` string,
`page_item_type` string,
`last_page_id` string,
`page_id` string,
`source_type` string,
`entry` string,
`loading_time` string,
`open_ad_id` string,
`open_ad_ms` string,
`open_ad_skip_ms` string,
`actions` array<struct<action_id:string,item:string,item_type:string,ts:bigint>>,
-- `order` is a reserved word, hence the backquotes
`displays` array<struct<display_type:string,item:string,item_type:string,`order`:int>>,
`ts` string,
`error_code` string,
`msg` string
)
PARTITIONED BY (dt string)
stored as parquet
LOCATION '/warehouse/gmall/dwd/dwd_error_log'
TBLPROPERTIES('parquet.compression'='lzo');
导入
-- Load error log rows (lines that carry an "err" object). The nested arrays are
-- converted with json_array_to_struct_array: first the JSON keys to read, then
-- the output struct field:type specs (note 'displayType' JSON key maps to the
-- display_type struct field).
insert overwrite table dwd_error_log partition(dt='2020-06-27')
select
get_json_object(line,'$.common.ar'),
get_json_object(line,'$.common.ba'),
get_json_object(line,'$.common.ch'),
get_json_object(line,'$.common.md'),
get_json_object(line,'$.common.mid'),
get_json_object(line,'$.common.os'),
get_json_object(line,'$.common.uid'),
get_json_object(line,'$.common.vc'),
get_json_object(line,'$.page.item'),
get_json_object(line,'$.page.item_type'),
get_json_object(line,'$.page.last_page_id'),
get_json_object(line,'$.page.page_id'),
get_json_object(line,'$.page.sourceType'),
get_json_object(line,'$.start.entry'),
get_json_object(line,'$.start.loading_time'),
get_json_object(line,'$.start.open_ad_id'),
get_json_object(line,'$.start.open_ad_ms'),
get_json_object(line,'$.start.open_ad_skip_ms'),
json_array_to_struct_array(get_json_object(line,'$.actions'),'action_id','item','item_type','ts','action_id:string','item:string','item_type:string','ts:bigint'),
json_array_to_struct_array(get_json_object(line,'$.displays'),'displayType','item','item_type','order','display_type:string','item:string','item_type:string','order:int'),
get_json_object(line,'$.ts'),
get_json_object(line,'$.err.error_code'),
get_json_object(line,'$.err.msg')
from ods_log
where dt = '2020-06-27'
and get_json_object(line,'$.err') is not null;
业务数据
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xk74zCfP-1600789772848)(/Users/vanas/Library/Application Support/typora-user-images/截屏2020-06-29下午5.50.03.png)]
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
订单 | √ | √ | √ | √ | 件数/金额 | |||
订单详情 | √ | √ | √ | 件数/金额 | ||||
支付 | √ | √ | 金额 | |||||
加购 | √ | √ | √ | 件数/金额 | ||||
收藏 | √ | √ | √ | 个数 | ||||
评价 | √ | √ | √ | 个数 | ||||
退款 | √ | √ | √ | 件数/金额 | ||||
优惠券领用 | √ | √ | √ | 个数 |
维度表大部分采用的是全量同步,基本上都是全量表,还有小部分是新增及变化
事实表 一般都是增量同步
维度表
商品维度表(全量表)
建表
-- SKU dimension (full snapshot per day): denormalizes sku + spu + trademark +
-- the 3-level category hierarchy into one wide dimension table.
DROP TABLE IF EXISTS `dwd_dim_sku_info`;
CREATE EXTERNAL TABLE `dwd_dim_sku_info` (
`id` string COMMENT '商品id',
`spu_id` string COMMENT 'spuid',
`price` decimal(16,2) COMMENT '商品价格',
`sku_name` string COMMENT '商品名称',
`sku_desc` string COMMENT '商品描述',
`weight` decimal(16,2) COMMENT '重量',
`tm_id` string COMMENT '品牌id',
`tm_name` string COMMENT '品牌名称',
`category3_id` string COMMENT '三级分类id',
`category2_id` string COMMENT '二级分类id',
`category1_id` string COMMENT '一级分类id',
`category3_name` string COMMENT '三级分类名称',
`category2_name` string COMMENT '二级分类名称',
`category1_name` string COMMENT '一级分类名称',
`spu_name` string COMMENT 'spu名称',
`create_time` string COMMENT '创建时间'
)
COMMENT '商品维度表'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_sku_info/'
tblproperties ("parquet.compression"="lzo");
装载数据
-- HiveInputFormat so the LZO-indexed ODS tables are read correctly.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_dim_sku_info partition(dt='2020-06-27')
select
sku.id,
sku.spu_id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.tm_id,
ob.tm_name,
sku.category3_id,
c2.id category2_id,
c1.id category1_id,
c3.name category3_name,
c2.name category2_name,
c1.name category1_name,
spu.spu_name,
sku.create_time
from
(
select * from ods_sku_info where dt='2020-06-27'
)sku
join
(
select * from ods_base_trademark where dt='2020-06-27'
)ob on sku.tm_id=ob.tm_id
join
(
select * from ods_spu_info where dt='2020-06-27'
)spu on spu.id = sku.spu_id
join
-- walk the category hierarchy: sku -> c3 -> c2 -> c1
(
select * from ods_base_category3 where dt='2020-06-27'
)c3 on sku.category3_id=c3.id
join
(
select * from ods_base_category2 where dt='2020-06-27'
)c2 on c3.category2_id=c2.id
join
(
select * from ods_base_category1 where dt='2020-06-27'
)c1 on c2.category1_id=c1.id;
优惠券信息表(全量)
建表
-- Coupon dimension (full snapshot per day), straight copy of the ODS partition.
drop table if exists dwd_dim_coupon_info;
create external table dwd_dim_coupon_info(
`id` string COMMENT '购物券编号',
`coupon_name` string COMMENT '购物券名称',
`coupon_type` string COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
`condition_amount` decimal(16,2) COMMENT '满额数',
`condition_num` bigint COMMENT '满件数',
`activity_id` string COMMENT '活动编号',
`benefit_amount` decimal(16,2) COMMENT '减金额',
`benefit_discount` decimal(16,2) COMMENT '折扣',
`create_time` string COMMENT '创建时间',
`range_type` string COMMENT '范围类型 1、商品 2、品类 3、品牌',
`spu_id` string COMMENT '商品id',
`tm_id` string COMMENT '品牌id',
`category3_id` string COMMENT '品类id',
`limit_num` bigint COMMENT '最多领用次数',
`operate_time` string COMMENT '修改时间',
`expire_time` string COMMENT '过期时间'
) COMMENT '优惠券信息表'
PARTITIONED BY (`dt` string)
-- NOTE(review): the row format clause is irrelevant for a parquet table (only
-- meaningful for text storage); harmless but could be removed.
row format delimited fields terminated by '\t'
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_coupon_info/'
tblproperties ("parquet.compression"="lzo");
装载
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_dim_coupon_info partition(dt='2020-06-27')
select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
spu_id,
tm_id,
category3_id,
limit_num,
operate_time,
expire_time
from ods_coupon_info
where dt='2020-06-27';
活动维度表(全量)
建表
-- Activity dimension, FIRST version: includes rule columns (condition/benefit).
-- Superseded below because joining rules multiplies rows (grain mismatch).
drop table if exists dwd_dim_activity_info;
create external table dwd_dim_activity_info(
`id` string COMMENT '编号',
`activity_name` string COMMENT '活动名称',
`activity_type` string COMMENT '活动类型',
`condition_amount` decimal(16,2) COMMENT '满减金额',
`condition_num` bigint COMMENT '满减件数',
`benefit_amount` decimal(16,2) COMMENT '优惠金额',
`benefit_discount` decimal(16,2) COMMENT '优惠折扣',
`benefit_level` string COMMENT '优惠级别',
`start_time` string COMMENT '开始时间',
`end_time` string COMMENT '结束时间',
`create_time` string COMMENT '创建时间'
) COMMENT '活动信息表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_activity_info/'
tblproperties ("parquet.compression"="lzo");
装载数据
活动维度表与事实表粒度要保持一致 (所以activity_id 和 另外一个字段比如benefit_level 一起组成外键)
但是我们的数据有欠缺
所以换一种方法解决
left join 用join也行
-- FIRST version load: activity info left-joined to its rules. One activity with
-- multiple rules yields multiple rows — the grain problem that motivated v2 below.
insert overwrite table dwd_dim_activity_info partition(dt='2020-06-27')
select
info.id,
info.activity_name,
info.activity_type,
rule.condition_amount,
rule.condition_num,
rule.benefit_amount,
rule.benefit_discount,
rule.benefit_level,
info.start_time,
info.end_time,
info.create_time
from
(
select * from ods_activity_info where dt='2020-06-27'
)info
left join
(
select * from ods_activity_rule where dt='2020-06-27'
)rule on info.id = rule.activity_id;
删表
drop table dwd_dim_activity_info;
删hdfs的内存
建表
-- Activity dimension, FINAL version: one row per activity (rule columns dropped
-- so the grain matches the fact tables).
drop table if exists dwd_dim_activity_info;
create external table dwd_dim_activity_info
(
`id` string COMMENT '编号',
`activity_name` string COMMENT '活动名称',
`activity_type` string COMMENT '活动类型',
`start_time` string COMMENT '开始时间',
`end_time` string COMMENT '结束时间',
`create_time` string COMMENT '创建时间'
) COMMENT '活动信息表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_activity_info/'
tblproperties ("parquet.compression" = "lzo");
导入
-- FINAL version load: straight copy from ods_activity_info (no rule join, so
-- exactly one row per activity).
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_dim_activity_info partition (dt = '2020-06-27')
select id,
activity_name,
activity_type,
start_time,
end_time,
create_time
from ods_activity_info
where dt = '2020-06-27';
脚本已经联动更改
地区维度表(特殊)
本身数据量小,没必要压缩浪费cpu资源
建表
-- Province dimension (special: tiny, unpartitioned — overwritten as a whole).
DROP TABLE IF EXISTS `dwd_dim_base_province`;
CREATE EXTERNAL TABLE `dwd_dim_base_province` (
`id` string COMMENT 'id',
`province_name` string COMMENT '省市名称',
`area_code` string COMMENT '地区编码',
`iso_code` string COMMENT 'ISO编码',
`region_id` string COMMENT '地区id',
`region_name` string COMMENT '地区名称'
)
COMMENT '地区省市表'
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_base_province/'
tblproperties ("parquet.compression"="lzo");
装载
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
-- Join provinces to regions to denormalize the region name in.
insert overwrite table dwd_dim_base_province
select
bp.id,
bp.name,
bp.area_code,
bp.iso_code,
bp.region_id,
br.region_name
from ods_base_province bp
join ods_base_region br
on bp.region_id=br.id;
时间维度表(特殊)
特殊之处:不是不变,是可以提前预知的
业务系统中是不会有时间维度表
数仓中肯定有这个表的,多一个时间维度用来补充日期相关信息(周、季度、节假日等)
处理节假日时,有2种方法
1.查日历 ,手动
2.通过第三方接口
-- Date dimension (special: fully known in advance, loaded once from a file via
-- the tmp table below because parquet tables cannot take a plain LOAD).
DROP TABLE IF EXISTS `dwd_dim_date_info`;
CREATE EXTERNAL TABLE `dwd_dim_date_info`(
`date_id` string COMMENT '日',
`week_id` string COMMENT '周',
`week_day` string COMMENT '周的第几天',
`day` string COMMENT '每月的第几天',
`month` string COMMENT '第几月',
`quarter` string COMMENT '第几季度',
`year` string COMMENT '年',
`is_workday` string COMMENT '是否是工作日',
`holiday_id` string COMMENT '是否是节假日'
)
row format delimited fields terminated by '\t'
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_date_info/'
tblproperties ("parquet.compression"="lzo");
因为是有压缩的
所以不能直接load
需要建临时表
-- Staging table for the date dimension: plain text so LOAD DATA works; the
-- parquet target is then filled with insert overwrite ... select.
DROP TABLE IF EXISTS `dwd_dim_date_info_tmp`;
CREATE EXTERNAL TABLE `dwd_dim_date_info_tmp`(
`date_id` string COMMENT '日',
`week_id` string COMMENT '周',
`week_day` string COMMENT '周的第几天',
`day` string COMMENT '每月的第几天',
`month` string COMMENT '第几月',
`quarter` string COMMENT '第几季度',
`year` string COMMENT '年',
`is_workday` string COMMENT '是否是工作日',
`holiday_id` string COMMENT '是否是节假日'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwd/dwd_dim_date_info_tmp/';
将数据导入临时表
-- Stage the raw TSV into the text tmp table (LOAD is a pure file move).
load data local inpath '/opt/module/db_log/date_info.txt' into table dwd_dim_date_info_tmp;
将数据导入正式表
-- Rewrite into the compressed parquet target via a real query.
insert overwrite table dwd_dim_date_info select * from dwd_dim_date_info_tmp;
事实表
订单详情表
1.选择业务(确定事实表)
2.声明粒度
3.确认维度(维度表的相关id)
时间,地区,商品
4.确认事实(确定度量值)
金额
事务型事实表——般都是增量表
周期型——全量同步
累计型——新增及变化
订单明细事实表(事务型事实表)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
订单详情 | √ | √ | √ | √ | 件数/金额 |
看表主要来源里的数据是否能拿到 拿不到就和别的表join
-- Order-detail fact (transaction grain: one row per order line). The *_d
-- columns are order-level amounts allocated down to each line.
drop table if exists dwd_fact_order_detail;
create external table dwd_fact_order_detail (
`id` string COMMENT '订单编号',
`order_id` string COMMENT '订单号',
`user_id` string COMMENT '用户id',
`sku_id` string COMMENT 'sku商品id',
`sku_name` string COMMENT '商品名称',
`order_price` decimal(16,2) COMMENT '商品价格',
`sku_num` bigint COMMENT '商品数量',
`create_time` string COMMENT '创建时间',
`province_id` string COMMENT '省份ID',
`source_type` string COMMENT '来源类型',
`source_id` string COMMENT '来源编号',
`original_amount_d` decimal(20,2) COMMENT '原始价格分摊',
`final_amount_d` decimal(20,2) COMMENT '购买价格分摊',
`feight_fee_d` decimal(20,2) COMMENT '分摊运费',
`benefit_reduce_amount_d` decimal(20,2) COMMENT '分摊优惠'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_fact_order_detail/'
tblproperties ("parquet.compression"="lzo");
数据来源2张表 订单详情表ods_order_detail 和 订单表dos_order_info
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-RHu17wJ8-1600789772849)(/Users/vanas/Library/Application Support/typora-user-images/截屏2020-06-29下午6.54.34.png)]
‘购买价格分摊’ =‘原始价格分摊’+‘分摊运费’-‘分摊优惠’
分摊运费 可能会造成精度损失,需要补偿
od.order_price*od.sku_num --当前明细总值
od.order_price*od.sku_num/oi.original_total_amount*oi.final_total_amount
od.order_price*od.sku_num/oi.original_total_amount*oi.feight_fee
od.order_price*od.sku_num/oi.original_total_amount*oi.benefit_reduce_amount
round(oi.feight_fee*od.order_price*od.sku_num/oi.original_total_amount,2)--运费分摊
-- Derivation draft of the allocation load (the polished version follows below).
-- Each order-level amount is allocated to lines pro-rata by line value; round()
-- loses pennies, so one row per order (rn = 1) is compensated by adding back
-- (order total - sum of allocated shares). The original draft had a typo ("fei")
-- and an unaliased derived table; both fixed, and the two missing allocated
-- columns added so the column count matches the table.
insert overwrite table dwd_fact_order_detail partition(dt='2020-06-27')
select
id,
order_id,
user_id,
sku_id,
sku_name,
order_price,
sku_num,
create_time,
province_id,
source_type,
source_id,
original_amount_d,
if(rn=1,final_amount_d+final_total_amount-sum_div_final_amount,final_amount_d),
if(rn=1,feight_fee_d+feight_fee-sum_div_feight_fee,feight_fee_d),
if(rn=1,benefit_reduce_amount_d+benefit_reduce_amount-sum_div_benefit_reduce_amount,benefit_reduce_amount_d)
from
(
select
od.id,
od.order_id,
od.user_id,
od.sku_id,
od.sku_name,
od.order_price,
od.sku_num,
od.create_time,
oi.province_id,
od.source_type,
od.source_id,
round(od.order_price*od.sku_num,2) original_amount_d, --当前明细总值(原始价格分摊)
round(od.order_price*od.sku_num/oi.original_total_amount*oi.final_total_amount,2) final_amount_d,--购买价格分摊
round(oi.feight_fee*od.order_price*od.sku_num/oi.original_total_amount,2) feight_fee_d,--运费分摊
round(od.order_price*od.sku_num/oi.original_total_amount*oi.benefit_reduce_amount,2) benefit_reduce_amount_d,--优惠分摊
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.final_total_amount,2)) over(partition by od.order_id) sum_div_final_amount,
sum(round(oi.feight_fee*od.order_price*od.sku_num/oi.original_total_amount,2)) over(partition by od.order_id) sum_div_feight_fee,--分摊运费之和
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.benefit_reduce_amount,2)) over(partition by od.order_id) sum_div_benefit_reduce_amount,
oi.final_total_amount,
oi.feight_fee feight_fee, --订单总运费
oi.benefit_reduce_amount,
row_number() over(partition by od.order_id order by od.id) rn
from
(
select * from ods_order_detail where dt='2020-06-27'
)od
join
(
select * from ods_order_info where dt='2020-06-27'
)oi
on od.order_id=oi.id
)t1;
数据装载
-- Final order-detail load. Allocation formula: line share = order amount *
-- (line value / order original total), rounded to 2 dp; rounding drift is
-- pushed onto one row per order (rn = 1) as total - (sum_of_shares - own_share).
-- Fixed: the 5th select column was sku_num but the table column is sku_name.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_fact_order_detail partition(dt='2020-06-27')
select
id,
order_id,
user_id,
sku_id,
sku_name,
order_price,
sku_num,
create_time,
province_id,
source_type,
source_id,
original_amount_d,
if(rn=1,final_total_amount-(sum_div_final_amount-final_amount_d),final_amount_d),
if(rn=1,feight_fee-(sum_div_feight_fee-feight_fee_d),feight_fee_d),
if(rn=1,benefit_reduce_amount-(sum_div_benefit_reduce_amount-benefit_reduce_amount_d),benefit_reduce_amount_d)
from
(
select
od.id,
od.order_id,
od.user_id,
od.sku_id,
od.sku_name,
od.order_price,
od.sku_num,
od.create_time,
oi.province_id,
od.source_type,
od.source_id,
round(od.order_price*od.sku_num,2) original_amount_d,
round(od.order_price*od.sku_num/oi.original_total_amount*oi.final_total_amount,2) final_amount_d,
round(od.order_price*od.sku_num/oi.original_total_amount*oi.feight_fee,2) feight_fee_d,
round(od.order_price*od.sku_num/oi.original_total_amount*oi.benefit_reduce_amount,2) benefit_reduce_amount_d,
-- rn marks the single row per order that absorbs the rounding remainder
row_number() over(partition by od.order_id order by od.id desc) rn,
oi.final_total_amount,
oi.feight_fee,
oi.benefit_reduce_amount,
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.final_total_amount,2)) over(partition by od.order_id) sum_div_final_amount,
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.feight_fee,2)) over(partition by od.order_id) sum_div_feight_fee,
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.benefit_reduce_amount,2)) over(partition by od.order_id) sum_div_benefit_reduce_amount
from
(
select * from ods_order_detail where dt='2020-06-27'
) od
join
(
select * from ods_order_info where dt='2020-06-27'
) oi
on od.order_id=oi.id
)t1;
支付事实表(事务型事实表)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
支付 | √ | √ | √ | 金额 |
表中字段来自订单流水
省份ID需要join
-- Payment fact (transaction grain). province_id is not on the payment record,
-- so it is joined in from the order table.
drop table if exists dwd_fact_payment_info;
create external table dwd_fact_payment_info (
`id` string COMMENT '',
`out_trade_no` string COMMENT '对外业务编号',
`order_id` string COMMENT '订单编号',
`user_id` string COMMENT '用户编号',
`alipay_trade_no` string COMMENT '支付宝交易流水编号',
`payment_amount` decimal(16,2) COMMENT '支付金额',
`subject` string COMMENT '交易内容',
`payment_type` string COMMENT '支付类型',
`payment_time` string COMMENT '支付时间',
`province_id` string COMMENT '省份ID'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_fact_payment_info/'
tblproperties ("parquet.compression"="lzo");
装载数据
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_fact_payment_info partition(dt='2020-06-27')
select
pi.id,
pi.out_trade_no,
pi.order_id,
pi.user_id,
pi.alipay_trade_no,
-- source column is total_amount; lands in payment_amount by position
pi.total_amount,
pi.subject,
pi.payment_type,
pi.payment_time,
oi.province_id
from
(
select * from ods_payment_info where dt='2020-06-27'
)pi
join
(
select id, province_id from ods_order_info where dt='2020-06-27'
)oi
on pi.order_id = oi.id;
退款事实表(事务型事实表)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
退款 | √ | √ | √ | 件数/金额 |
建表
-- Refund fact (transaction grain), straight copy of the daily ODS partition.
-- NOTE(review): stored as text (no parquet/lzo clause), unlike other dwd facts
-- — confirm this inconsistency is intentional.
drop table if exists dwd_fact_order_refund_info;
create external table dwd_fact_order_refund_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户ID',
`order_id` string COMMENT '订单ID',
`sku_id` string COMMENT '商品ID',
`refund_type` string COMMENT '退款类型',
`refund_num` bigint COMMENT '退款件数',
`refund_amount` decimal(16,2) COMMENT '退款金额',
`refund_reason_type` string COMMENT '退款原因类型',
`create_time` string COMMENT '退款时间'
) COMMENT '退款事实表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwd/dwd_fact_order_refund_info/';
装载
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_fact_order_refund_info partition(dt='2020-06-27')
select
id,
user_id,
order_id,
sku_id,
refund_type,
refund_num,
refund_amount,
refund_reason_type,
create_time
from ods_order_refund_info
where dt='2020-06-27';
评价事实表(事务型事实表)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
评价 | √ | √ | √ | 个数 |
新建
-- Comment/appraisal fact (transaction grain), straight copy of the daily ODS partition.
drop table if exists dwd_fact_comment_info;
create external table dwd_fact_comment_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户ID',
`sku_id` string COMMENT '商品sku',
`spu_id` string COMMENT '商品spu',
`order_id` string COMMENT '订单ID',
`appraise` string COMMENT '评价',
`create_time` string COMMENT '评价时间'
) COMMENT '评价事实表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwd/dwd_fact_comment_info/';
装载
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_fact_comment_info partition(dt='2020-06-27')
select
id,
user_id,
sku_id,
spu_id,
order_id,
appraise,
create_time
from ods_comment_info
where dt='2020-06-27';
加购事实表(周期型快照事实表,每日快照)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
加购 | √ | √ | √ | 件数/金额 |
购物车应该有地区 此数据没有载入地区
建表
-- Cart fact (periodic snapshot, daily). Column typo fixed: srouce_id ->
-- source_id, matching the comment '来源编号' and the load query's source_id.
drop table if exists dwd_fact_cart_info;
create external table dwd_fact_cart_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户id',
`sku_id` string COMMENT 'skuid',
`cart_price` string COMMENT '放入购物车时价格',
`sku_num` string COMMENT '数量',
`sku_name` string COMMENT 'sku名称 (冗余)',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '修改时间',
`is_ordered` string COMMENT '是否已经下单。1为已下单;0为未下单',
`order_time` string COMMENT '下单时间',
`source_type` string COMMENT '来源类型',
`source_id` string COMMENT '来源编号'
) COMMENT '加购事实表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwd/dwd_fact_cart_info/';
装载
-- Daily snapshot load: the whole cart table is copied into today's partition.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_fact_cart_info partition(dt='2020-06-27')
select
id,
user_id,
sku_id,
cart_price,
sku_num,
sku_name,
create_time,
operate_time,
is_ordered,
order_time,
source_type,
source_id
from ods_cart_info
where dt='2020-06-27';
收藏事实表(周期型快照事实表,每日快照)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
收藏 | √ | √ | √ | 个数 |
建表
-- Favorites fact (periodic snapshot, daily), straight copy of the daily ODS partition.
drop table if exists dwd_fact_favor_info;
create external table dwd_fact_favor_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户id',
`sku_id` string COMMENT 'skuid',
`spu_id` string COMMENT 'spuid',
`is_cancel` string COMMENT '是否取消',
`create_time` string COMMENT '收藏时间',
`cancel_time` string COMMENT '取消时间'
) COMMENT '收藏事实表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwd/dwd_fact_favor_info/';
加载
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_fact_favor_info partition(dt='2020-06-27')
select
id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from ods_favor_info
where dt='2020-06-27';
优惠券领用事实表(累积型快照事实表)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
优惠券领用 | √ | √ | √ | 个数 |
领用时间-使用下单时间-使用支付时间 此数据没有优惠券过期时间
累积型快照后续需要修改,生产环境中hive修改数据 都是select 字段 判断修改 再insert overwrite到表中
所以累积型快照事实表必须是分区表
按天分区每天只存优惠券领取记录,如何利用使用记录去改老数据?
通过get_time来获得位于哪个分区,能join上的就是要改的
mysql中日期格式’%y-%m-%d’
hive中日期格式 ‘yyyy-MM-dd’
-- Returns the get_time dates of every coupon touched today, i.e. the set of
-- existing partitions that today's changes may need to rewrite.
select
date_format(get_time,'yyyy-MM-dd')
from ods_coupon_use
where dt='2020-06-27'
当哪个是null哪个就更新
id | get_time | using_time | used_time | |
---|---|---|---|---|
old | 1001 | 2020-06-20 | null | null |
new | 1001 | 2020-06-20 | 2020-06-29 | 2020-06-29 |
Hive 分区:
静态分区 dt=’…’
动态分区 hive根据最后一个字段的值,自动进入相应分区
1.开启动态分区(默认开启的)
2.模式:
严格模式:至少有一个静态分区
非严格模式:全部都是动态分区
建表
-- Coupon-usage fact (accumulating snapshot): rows are updated as a coupon moves
-- from claimed -> used-in-order -> paid. Partitioned by claim date (get_time)
-- so old partitions can be selectively rewritten.
drop table if exists dwd_fact_coupon_use;
create external table dwd_fact_coupon_use(
`id` string COMMENT '编号',
`coupon_id` string COMMENT '优惠券ID',
`user_id` string COMMENT 'userid',
`order_id` string COMMENT '订单id',
`coupon_status` string COMMENT '优惠券状态',
`get_time` string COMMENT '领取时间',
`using_time` string COMMENT '使用时间(下单)',
`used_time` string COMMENT '使用时间(支付)'
) COMMENT '优惠券领用事实表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwd/dwd_fact_coupon_use/';
装载
第一次需要设置非严格模式,因为没有静态分区
-- Accumulating-snapshot merge: old = the existing rows in partitions touched by
-- today's changes; new = today's ODS changes. FULL OUTER JOIN on id, then take
-- the new value when present, else keep the old. The dynamic partition column
-- (last select expr) is derived from get_time. Non-strict mode is required
-- because there is no static partition spec.
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_fact_coupon_use partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.coupon_id is null,old.coupon_id,new.coupon_id),
if(new.user_id is null,old.user_id,new.user_id),
if(new.order_id is null,old.order_id,new.order_id),
if(new.coupon_status is null,old.coupon_status,new.coupon_status),
if(new.get_time is null,old.get_time,new.get_time),
if(new.using_time is null,old.using_time,new.using_time),
if(new.used_time is null,old.used_time,new.used_time),
date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')
from
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from dwd_fact_coupon_use
-- only read the partitions today's changes can affect
where dt in
(
select
date_format(get_time,'yyyy-MM-dd')
from ods_coupon_use
where dt='2020-06-27'
)
)old
full outer join
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from ods_coupon_use
where dt='2020-06-27'
)new
on old.id=new.id;
系统函数(concat、concat_ws、STR_TO_MAP)
STR_TO_MAP(String text, String listDelimiter, String keyValueDelimiter)
订单事实表(累积型快照事实表)
时间 | 用户 | 地区 | 商品 | 优惠券 | 活动 | 编码 | 度量值 | |
---|---|---|---|---|---|---|---|---|
订单 | √ | √ | √ | √ | 件数/金额 |
订单的状态使用order_status
将之转为一行后,join后才能是一行
关于列转行:使用collect_set去重,collect_list
将order_status 为key operater_time为 value组成map
Map:的目的是取结构的时候好取
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hdGKp0e7-1600789772850)(/Users/vanas/Library/Application Support/typora-user-images/截屏2020-06-30下午6.16.51.png)]
-- Pivot the status log to one row per order: collect "status=operate_time"
-- pairs, join them with commas, then parse into a map keyed by status code.
select
order_id, str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=') tms
from ods_order_status_log
where dt='2020-06-27'
group by order_id
建表
-- Order fact (accumulating snapshot): one row per order with one timestamp
-- column per lifecycle status, filled in as the order progresses.
drop table if exists dwd_fact_order_info;
create external table dwd_fact_order_info (
`id` string COMMENT '订单编号',
`order_status` string COMMENT '订单状态',
`user_id` string COMMENT '用户id',
`out_trade_no` string COMMENT '支付流水号',
`create_time` string COMMENT '创建时间(未支付状态)',
`payment_time` string COMMENT '支付时间(已支付状态)',
`cancel_time` string COMMENT '取消时间(已取消状态)',
`finish_time` string COMMENT '完成时间(已完成状态)',
`refund_time` string COMMENT '退款时间(退款中状态)',
`refund_finish_time` string COMMENT '退款完成时间(退款完成状态)',
`province_id` string COMMENT '省份ID',
`activity_id` string COMMENT '活动ID',
`original_total_amount` decimal(16,2) COMMENT '原价金额',
`benefit_reduce_amount` decimal(16,2) COMMENT '优惠金额',
`feight_fee` decimal(16,2) COMMENT '运费',
`final_total_amount` decimal(16,2) COMMENT '订单金额'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_fact_order_info/'
tblproperties ("parquet.compression"="lzo");
旧表:3表关联ods_order_info,ods_activity_order,ods_order_status_log
装载数据
-- Accumulating-snapshot merge for orders, same pattern as dwd_fact_coupon_use:
-- old = existing rows in partitions affected today, new = today's changes built
-- from status-log (pivoted to a status->time map) + order info + activity link.
-- Dynamic partition key is the order's create date.
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_fact_order_info partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.order_status is null,old.order_status,new.order_status),
if(new.user_id is null,old.user_id,new.user_id),
if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001 = created (unpaid)
if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),--1002 = paid
if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),--1003 = cancelled
if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),--1004 = finished
if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),--1005 = refunding
if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),--1006 = refund done
if(new.province_id is null,old.province_id,new.province_id),
if(new.activity_id is null,old.activity_id,new.activity_id),
if(new.original_total_amount is null,old.original_total_amount,new.original_total_amount),
if(new.benefit_reduce_amount is null,old.benefit_reduce_amount,new.benefit_reduce_amount),
if(new.feight_fee is null,old.feight_fee,new.feight_fee),
if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount),
-- dynamic partition value: the order's create date
date_format(if(new.tms['1001'] is null,old.create_time,new.tms['1001']),'yyyy-MM-dd')
from
(
select
id,
order_status,
user_id,
out_trade_no,
create_time,
payment_time,
cancel_time,
finish_time,
refund_time,
refund_finish_time,
province_id,
activity_id,
original_total_amount,
benefit_reduce_amount,
feight_fee,
final_total_amount
from dwd_fact_order_info
-- only the partitions that today's orders were created in
where dt
in
(
select
date_format(create_time,'yyyy-MM-dd')
from ods_order_info
where dt='2020-06-27'
)
)old
full outer join
(
select
info.id,
info.order_status,
info.user_id,
info.out_trade_no,
info.province_id,
act.activity_id,
log.tms,
info.original_total_amount,
info.benefit_reduce_amount,
info.feight_fee,
info.final_total_amount
from
(
-- pivot the status log: map of status code -> operate_time per order
select
order_id,
str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=') tms
from ods_order_status_log
where dt='2020-06-27'
group by order_id
)log
join
(
select * from ods_order_info where dt='2020-06-27'
)info
on log.order_id=info.id
-- left join: not every order belongs to an activity
left join
(
select order_id,activity_id from ods_activity_order where dt='2020-06-27'
)act
on log.order_id=act.order_id
)new
on old.id=new.id;
用户维度表(拉链表)
用户表中的数据每日既有可能新增,也有可能修改,但修改频率并不高,属于缓慢变化维度,此处采用拉链表存储用户维度数据。
既能保留历史数据,又不会造成数据冗余
拉链表读数据:都会进行全表扫描,所以分不分区都一样
拉链表修改数据:以用户的创建时间为分区,不用全表扫描提高效率
本项目没有分区,维度表数据量肯定没事实表大 ,也不会经常改动所以没有分区
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-C2tM6yX6-1600789772851)(/Users/vanas/Library/Application Support/typora-user-images/截屏2020-06-30下午7.07.43.png)]
拉链表制作流程
insert overwrite本身就是安全的,所以临时表可以省略
建表
-- User dimension zipper (SCD type-2) table: one row per user per validity
-- interval [start_date, end_date]; the open (current) row uses the sentinel
-- end_date '9999-99-99'.
drop table if exists dwd_dim_user_info_his;
create external table dwd_dim_user_info_his(
`id` string COMMENT '用户id',
`name` string COMMENT '姓名',
`birthday` string COMMENT '生日',
`gender` string COMMENT '性别',
`email` string COMMENT '邮箱',
`user_level` string COMMENT '用户等级',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '操作时间',
`start_date` string COMMENT '有效开始日期',
`end_date` string COMMENT '有效结束日期'
) COMMENT '用户拉链表' -- fixed: previously mislabeled '订单拉链表' (order zipper table)
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_user_info_his/'
tblproperties ("parquet.compression"="lzo");
Tips:建表时的分隔符什么时候用?
如果将来导入的数据是以load形式的,就非常必要
导数据的问题
不要重复执行,不然表的数据会越来越大
步骤0:初始化拉链表(首次独立执行)
加入开链时间和闭链时间
生产环境:单独使用sqoop将业务系统中的用户表的全部数据导入至HDFS,可通过一个临时表,将数据导入至拉链表
建立拉链表:
-- (Repeated DDL from the section above, kept with the initialization steps.)
-- User dimension zipper (SCD type-2) table.
drop table if exists dwd_dim_user_info_his;
create external table dwd_dim_user_info_his(
`id` string COMMENT '用户id',
`name` string COMMENT '姓名',
`birthday` string COMMENT '生日',
`gender` string COMMENT '性别',
`email` string COMMENT '邮箱',
`user_level` string COMMENT '用户等级',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '操作时间',
`start_date` string COMMENT '有效开始日期',
`end_date` string COMMENT '有效结束日期'
) COMMENT '用户拉链表' -- fixed: previously mislabeled '订单拉链表' (order zipper table)
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_user_info_his/'
tblproperties ("parquet.compression"="lzo");
初始化拉链表:
-- One-off initialization of the user zipper table: every user currently in the
-- ODS partition becomes an open chain record.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_dim_user_info_his
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
'2020-06-14',
-- '9999-99-99' is a string sentinel meaning "still effective"; it is not a real
-- calendar date — it only needs to sort after every genuine yyyy-MM-dd value.
'9999-99-99'
from ods_user_info oi
where oi.dt='2020-06-27';
-- NOTE(review): start_date '2020-06-14' does not match the source partition
-- dt='2020-06-27' — confirm which date the chains should open on.
步骤1:制作当日变动数据(包括新增,修改)每日执行
ods_user_info中每个分区中存储的就是当日变动数据
-- Step 1: today's changed/new user rows (each ods_user_info daily partition
-- already contains exactly that day's inserts and updates).
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
-- NOTE(review): as written this overwrites the WHOLE zipper table with only
-- today's rows, discarding all accumulated history. Presumably this snippet is
-- illustrative (or should target a staging table) — confirm before running;
-- the real daily maintenance is Step 2 below via the _tmp table.
insert overwrite table dwd_dim_user_info_his
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
'2020-06-27',
'9999-99-99'
from ods_user_info oi
where oi.dt='2020-06-27';
步骤2:先合并变动信息,再追加新增信息,插入到临时表中
-- Staging table for rebuilding the user zipper table: same schema as
-- dwd_dim_user_info_his; rebuilt in full each day, then swapped back (Step 3).
drop table if exists dwd_dim_user_info_his_tmp;
create external table dwd_dim_user_info_his_tmp(
`id` string COMMENT '用户id',
`name` string COMMENT '姓名',
`birthday` string COMMENT '生日',
`gender` string COMMENT '性别',
`email` string COMMENT '邮箱',
`user_level` string COMMENT '用户等级',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '操作时间',
`start_date` string COMMENT '有效开始日期',
`end_date` string COMMENT '有效结束日期'
) COMMENT '用户拉链临时表' -- fixed: previously mislabeled '订单拉链临时表' (order zipper tmp table)
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_user_info_his_tmp/'
tblproperties ("parquet.compression"="lzo");
装载数据
-- Step 2: rebuild the zipper into the staging table.
-- Branch 1 of the union: today's changed/new users open a fresh chain record.
-- Branch 2: every existing chain record, with its end_date closed to
-- (today - 1) when the same user appears in today's batch and the record was
-- still open; untouched records pass through unchanged.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_dim_user_info_his_tmp
select * from
(
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
'2020-06-27' start_date,
'9999-99-99' end_date
from ods_user_info where dt='2020-06-27'
union all
select
uh.id,
uh.name,
uh.birthday,
uh.gender,
uh.email,
uh.user_level,
uh.create_time,
uh.operate_time,
uh.start_date,
-- Close the open record only when the user changed today; date_add(ui.dt,-1)
-- makes the old interval end the day before the new one starts.
if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1), uh.end_date) end_date
from dwd_dim_user_info_his uh left join
(
select
*
from ods_user_info
where dt='2020-06-27'
) ui on uh.id=ui.id
)his
order by his.id, start_date;
步骤3:把临时表覆盖给拉链表
-- Step 3: swap the rebuilt zipper data back into the main table.
-- Columns are listed explicitly (instead of select *) so a future schema change
-- in either table fails loudly rather than silently misaligning columns.
insert overwrite table dwd_dim_user_info_his
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
start_date,
end_date
from dwd_dim_user_info_his_tmp;
累积型快照事实表 与 拉链表区别?
相同:新增及变化、需要改老数据、追踪生命周期
区别:
累积周期快照:事实表,追踪表周期变化 ,业务数据在一行中
拉链表:维度表,缓慢变化维,一个状态一行
DWD导入脚本
地区维度表 第一次导,之后不需要重复导
时间维度表手动导入
用户维度表单独处理
first : sq1+sq2
All:sq1+sq3
记得把用户临时表建出来
执行脚本
#!/bin/bash
# DWD-layer load driver. $1 = load mode ("first" | "all"), $2 = business date.
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the date passed as $2 when present; otherwise default to yesterday.
if [ -n "$2" ] ;then
do_date=$2
else
do_date=`date -d "-1 day" +%F`
fi
sql1="
set mapreduce.job.queuename=hive;
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table ${APP}.dwd_dim_sku_info partition(dt='$do_date')
select
sku.id,
sku.spu_id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.tm_id,
ob.tm_name,
sku.category3_id,
c2.id category2_id,
c1.id category1_id,
c3.name category3_name,
c2.name category2_name,
c1.name category1_name,
spu.spu_name,
sku.create_time
from
(
select * from ${APP}.ods_sku_info where dt='$do_date'
)sku
join
(
select * from ${APP}.ods_base_trademark where dt='$do_date'
)ob on sku.tm_id=ob.tm_id
join
(
select * from ${APP}.ods_spu_info where dt='$do_date'
)spu on spu.id = sku.spu_id
join
(
select * from ${APP}.ods_base_category3 where dt='$do_date'
)c3 on sku.category3_id=c3.id
join
(
select * from ${APP}.ods_base_category2 where dt='$do_date'
)c2 on c3.category2_id=c2.id
join
(
select * from ${APP}.ods_base_category1 where dt='$do_date'
)c1 on c2.category1_id=c1.id;
insert overwrite table ${APP}.dwd_dim_coupon_info partition(dt='$do_date')
select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
spu_id,
tm_id,
category3_id,
limit_num,
operate_time,
expire_time
from ${APP}.ods_coupon_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_dim_activity_info partition(dt='$do_date')
select
info.id,
info.activity_name,
info.activity_type,
info.start_time,
info.end_time,
info.create_time
from ${APP}.ods_activity_info info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_order_detail partition(dt='$do_date')
select
id,
order_id,
user_id,
sku_id,
sku_num,
order_price,
sku_num,
create_time,
province_id,
source_type,
source_id,
original_amount_d,
if(rn=1,final_total_amount-(sum_div_final_amount-final_amount_d),final_amount_d),
if(rn=1,feight_fee-(sum_div_feight_fee-feight_fee_d),feight_fee_d),
if(rn=1,benefit_reduce_amount-(sum_div_benefit_reduce_amount-benefit_reduce_amount_d),benefit_reduce_amount_d)
from
(
select
od.id,
od.order_id,
od.user_id,
od.sku_id,
od.sku_name,
od.order_price,
od.sku_num,
od.create_time,
oi.province_id,
od.source_type,
od.source_id,
round(od.order_price*od.sku_num,2) original_amount_d,
round(od.order_price*od.sku_num/oi.original_total_amount*oi.final_total_amount,2) final_amount_d,
round(od.order_price*od.sku_num/oi.original_total_amount*oi.feight_fee,2) feight_fee_d,
round(od.order_price*od.sku_num/oi.original_total_amount*oi.benefit_reduce_amount,2) benefit_reduce_amount_d,
row_number() over(partition by od.order_id order by od.id desc) rn,
oi.final_total_amount,
oi.feight_fee,
oi.benefit_reduce_amount,
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.final_total_amount,2)) over(partition by od.order_id) sum_div_final_amount,
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.feight_fee,2)) over(partition by od.order_id) sum_div_feight_fee,
sum(round(od.order_price*od.sku_num/oi.original_total_amount*oi.benefit_reduce_amount,2)) over(partition by od.order_id) sum_div_benefit_reduce_amount
from
(
select * from ${APP}.ods_order_detail where dt='$do_date'
) od
join
(
select * from ${APP}.ods_order_info where dt='$do_date'
) oi
on od.order_id=oi.id
)t1;
insert overwrite table ${APP}.dwd_fact_payment_info partition(dt='$do_date')
select
pi.id,
pi.out_trade_no,
pi.order_id,
pi.user_id,
pi.alipay_trade_no,
pi.total_amount,
pi.subject,
pi.payment_type,
pi.payment_time,
oi.province_id
from
(
select * from ${APP}.ods_payment_info where dt='$do_date'
)pi
join
(
select id, province_id from ${APP}.ods_order_info where dt='$do_date'
)oi
on pi.order_id = oi.id;
insert overwrite table ${APP}.dwd_fact_order_refund_info partition(dt='$do_date')
select
id,
user_id,
order_id,
sku_id,
refund_type,
refund_num,
refund_amount,
refund_reason_type,
create_time
from ${APP}.ods_order_refund_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_comment_info partition(dt='$do_date')
select
id,
user_id,
sku_id,
spu_id,
order_id,
appraise,
create_time
from ${APP}.ods_comment_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_cart_info partition(dt='$do_date')
select
id,
user_id,
sku_id,
cart_price,
sku_num,
sku_name,
create_time,
operate_time,
is_ordered,
order_time,
source_type,
source_id
from ${APP}.ods_cart_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_favor_info partition(dt='$do_date')
select
id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from ${APP}.ods_favor_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_coupon_use partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.coupon_id is null,old.coupon_id,new.coupon_id),
if(new.user_id is null,old.user_id,new.user_id),
if(new.order_id is null,old.order_id,new.order_id),
if(new.coupon_status is null,old.coupon_status,new.coupon_status),
if(new.get_time is null,old.get_time,new.get_time),
if(new.using_time is null,old.using_time,new.using_time),
if(new.used_time is null,old.used_time,new.used_time),
date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')
from
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from ${APP}.dwd_fact_coupon_use
where dt in
(
select
date_format(get_time,'yyyy-MM-dd')
from ${APP}.ods_coupon_use
where dt='$do_date'
)
)old
full outer join
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from ${APP}.ods_coupon_use
where dt='$do_date'
)new
on old.id=new.id;
insert overwrite table ${APP}.dwd_fact_order_info partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.order_status is null,old.order_status,new.order_status),
if(new.user_id is null,old.user_id,new.user_id),
if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001对应未支付状态
if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),
if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),
if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),
if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),
if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),
if(new.province_id is null,old.province_id,new.province_id),
if(new.activity_id is null,old.activity_id,new.activity_id),
if(new.original_total_amount is null,old.original_total_amount,new.original_total_amount),
if(new.benefit_reduce_amount is null,old.benefit_reduce_amount,new.benefit_reduce_amount),
if(new.feight_fee is null,old.feight_fee,new.feight_fee),
if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount),
date_format(if(new.tms['1001'] is null,old.create_time,new.tms['1001']),'yyyy-MM-dd')
from
(
select
id,
order_status,
user_id,
out_trade_no,
create_time,
payment_time,
cancel_time,
finish_time,
refund_time,
refund_finish_time,
province_id,
activity_id,
original_total_amount,
benefit_reduce_amount,
feight_fee,
final_total_amount
from ${APP}.dwd_fact_order_info
where dt
in
(
select
date_format(create_time,'yyyy-MM-dd')
from ${APP}.ods_order_info
where dt='$do_date'
)
)old
full outer join
(
select
info.id,
info.order_status,
info.user_id,
info.out_trade_no,
info.province_id,
act.activity_id,
log.tms,
info.original_total_amount,
info.benefit_reduce_amount,
info.feight_fee,
info.final_total_amount
from
(
select
order_id,
str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=') tms
from ${APP}.ods_order_status_log
where dt='$do_date'
group by order_id
)log
join
(
select * from ${APP}.ods_order_info where dt='$do_date'
)info
on log.order_id=info.id
left join
(
select * from ${APP}.ods_activity_order where dt='$do_date'
)act
on log.order_id=act.order_id
)new
on old.id=new.id;
"
sql2="
insert overwrite table ${APP}.dwd_dim_base_province
select
bp.id,
bp.name,
bp.area_code,
bp.iso_code,
bp.region_id,
br.region_name
from ${APP}.ods_base_province bp
join ${APP}.ods_base_region br
on bp.region_id=br.id;
"
sql3="
insert overwrite table ${APP}.dwd_dim_user_info_his_tmp
select * from
(
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
'$do_date' start_date,
'9999-99-99' end_date
from ${APP}.ods_user_info where dt='$do_date'
union all
select
uh.id,
uh.name,
uh.birthday,
uh.gender,
uh.email,
uh.user_level,
uh.create_time,
uh.operate_time,
uh.start_date,
if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1), uh.end_date) end_date
from ${APP}.dwd_dim_user_info_his uh left join
(
select
*
from ${APP}.ods_user_info
where dt='$do_date'
) ui on uh.id=ui.id
)his
order by his.id, start_date;
insert overwrite table ${APP}.dwd_dim_user_info_his
select * from ${APP}.dwd_dim_user_info_his_tmp;
"
# Dispatch on load mode: "first" runs the daily facts/dims (sql1) plus the
# one-off province dimension load (sql2); "all" runs the daily facts/dims plus
# the user zipper-table maintenance (sql3). Any other value silently does
# nothing.
case $1 in
"first"){
$hive -e "$sql1$sql2"
};;
"all"){
$hive -e "$sql1$sql3"
};;
esac
初次导入
(1)时间维度表
参照时间维度表(特殊)数据装载
(2)用户维度表
参照订单事实表(累积型快照事实表)拉链表初始化
(3)其余表
ods_to_dwd_db.sh first 2020-06-27
每日定时导入
ods_to_dwd_db.sh all 2020-06-28
检查结果(8个事实表,6个维度表+2个维度临时表)
show tables like 'dwd_fact*';
show tables like 'dwd_dim*';
Time taken: 0.043 seconds, Fetched: 8 row(s)
小问题
关于动态分区根据最后一个字段进行分区,如果为空就会默认进入 __HIVE_DEFAULT_PARTITION__分区
hive (gmall)> select * from dwd_fact_order_info;
hive (gmall)> show partitions dwd_fact_order_info;
OK
partition
dt=2020-06-27
dt=__HIVE_DEFAULT_PARTITION__
Time taken: 0.204 seconds, Fetched: 2 row(s)
create_time出现了null值
hive (gmall)> select * from dwd_fact_order_info where create_time is null;
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-24uKnTGS-1600789772852)(/Users/vanas/Library/Application Support/typora-user-images/截屏2020-06-30下午8.16.11.png)]
第一次没有create_time的情况
新数据没有问题,老数据会跑到default中
如果开发中需要老数据,需要单独导
DWS
什么叫宽表?
一个表列很多的时候(包含一些维度)
建宽表的意义?
优化数仓,重复计算的问题(把需要的数据放到中间层)
宽表创建原理
有多少维度表,就应该创建多少个宽表
每个宽表就是以这个维度为基础,分析的多个指标
如何建宽表?
dwd层与需求关系不大
宽表需要考虑到后续的需求,带着维度去看度量值的,一个维度一个宽表
维度表为核心
DWS层统计每个主题的当日行为
DWT层统计的是累积值
宽表不是万能的,当无法满足时去dwd层拿
业务术语
用户
流量相关,用设备id
下单、支付的指标,用userid
新增用户
首次联网使用应用的用户
活跃用户(DAU Daily Active User)
最普遍的认知:打开应用的用户即为活跃用户
不同的公司可能有不同的定义
周(月)活跃用户
某个自然周(月)内启动过应用的用户,该周(月)内的多次启动只记一个活跃用户
月活跃率
月活跃用户与截止到该月累计的用户总和之间的比例。
沉默用户
用户仅在安装当天(次日)启动一次,后续时间无再启动行为。该指标可以反映新增用户质量和用户与APP的匹配程度。
版本分布
不同版本的周内各天新增用户数,活跃用户数和启动次数。利于判断APP各个版本之间的优劣和用户行为习惯。
本周回流用户
上周未启动过应用,本周启动了应用的用户。
连续n周活跃用户
连续n周,每周至少启动一次。
忠诚用户
连续活跃n周以上的用户
连续活跃用户
连续几周及以上活跃的用户
近期流失用户
连续n(2<= n <= 4)周没有启动应用的用户。(第n+1周没有启动过)
留存用户
某段时间内的新增用户,经过一段时间后,仍然使用应用的被认作是留存用户;这部分用户占当时新增用户的比例即是留存率。
例如,5月份新增用户200,这200人在6月份启动过应用的有100人,7月份启动过应用的有80人,8月份启动过应用的有50人;则5月份新增用户一个月后的留存率是50%,二个月后的留存率是40%,三个月后的留存率是25%。
用户新鲜度
每天启动应用的新老用户比例,即新增用户数占活跃用户数的比例。
单次使用时长
每次启动使用的时间长度。
日使用时长
累计一天内的使用时间长度。
启动次数计算标准
IOS平台应用退到后台就算一次独立的启动;Android平台我们规定,两次启动之间的间隔小于30秒,被计算一次启动。用户在使用过程中,若因收发短信或接电话等退出应用30秒又再次返回应用中,那这两次行为应该是延续而非独立的,所以可以被算作一次使用行为,即一次启动。业内大多使用30秒这个标准,但用户还是可以自定义此时间间隔。
系统函数
collect_set函数
去重collect_set函数,不去重collect_list函数
nvl函数
NVL(表达式1,表达式2)
如果表达式1为空值,NVL返回值为表达式2的值,否则返回表达式1的值。
其表达式的值可以是数字型、字符型和日期型。但是表达式1和表达式2的数据类型必须为同一个类型。
coalesce
返回第一个非空的值
日期处理函数
--date_format函数(根据格式整理日期)
hive (gmall)> select date_format('2020-06-14','yyyy-MM');
2020-06
--date_add函数(加减日期)
hive (gmall)> select date_add('2020-06-14',-1);
2020-06-13
hive (gmall)> select date_add('2020-06-14',1);
2020-06-15
--next_day函数
--(1)取当前天的下一个周一
hive (gmall)> select next_day('2020-03-12','MO');
2020-03-16
--说明:星期一到星期日的英文(Monday,Tuesday、Wednesday、Thursday、Friday、Saturday、Sunday)取2个或3个
--(2)取当前周的周一 (本周一=下个周一 - 7)
hive (gmall)> select date_add(next_day('2020-07-03','MO'),-7);
2020-06-29
--本周日 = 下个周一 - 1
select date_add(next_day('2020-07-03','MO'),-1);
2020-07-05
--last_day函数(求当月最后一天日期)
hive (gmall)> select last_day('2020-06-14');
2020-06-30
--last_day和next_day函数一般用来求日活月活用的较多
DWS层(用户行为)
每日设备行为
此表用来统计流量相关或者pv uv
pv :页面总访问次数
uv:页面总访问人数
建表
字段少,因为没有设备维护表,所以选了些固定不变的字段,没选os
-- Daily device-activity summary (one row per device per day): launch count plus
-- per-page visit counts, used for PV/UV style traffic metrics.
drop table if exists dws_uv_detail_daycount;
create external table dws_uv_detail_daycount
(
`mid_id` string,
`brand` string,
`model` string,
`login_count` bigint COMMENT '活跃次数',
`page_stats` array<struct<page_id:string,page_count:bigint>> COMMENT '页面访问统计'
)
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_daycount'
-- added for consistency: every other DWS table declares lzo-compressed parquet
tblproperties ("parquet.compression"="lzo");
数据装载
数据来源dwd
Login_count 来源于启动日志dwd_start_log
来源于页面的表dwd_page_log
分组后,能取的字段只有3种
1.分组字段
2.聚合值
3.常量值
left join 和 full outer join一样 也没必要进行nvl判断因为start肯定包含了mid等信息
后面的表连接一个 on就行
-- Load the daily device summary: launch counts from the start log, per-page
-- visit counts from the page log, full-outer-joined on the device identity
-- (mid_id, brand, model) so devices appearing in only one source are kept.
with
tmp_start as
(
-- launches per device for the day
select
mid_id,
brand,
model,
count(*) login_count
from dwd_start_log
where dt='2020-06-27'
group by mid_id,brand,model
),
tmp_page as
(
-- per-device array of (page_id, visit count) structs
select
mid_id,
brand,
model,
collect_set(named_struct('page_id',page_id,'page_count',page_count)) page_stats
from
(
select
mid_id,
brand,
model,
page_id,
count(*) page_count
from dwd_page_log
where dt='2020-06-27'
group by mid_id,brand,model,page_id
)tmp
group by mid_id,brand,model
)
insert overwrite table dws_uv_detail_daycount partition(dt='2020-06-27')
select
nvl(tmp_start.mid_id,tmp_page.mid_id),
nvl(tmp_start.brand,tmp_page.brand),
nvl(tmp_start.model,tmp_page.model),
tmp_start.login_count,
tmp_page.page_stats
from tmp_start
full outer join tmp_page
on tmp_start.mid_id=tmp_page.mid_id
and tmp_start.brand=tmp_page.brand
and tmp_start.model=tmp_page.model;
DWS层(业务)
每日会员行为
建表
加入购物车次数:统计加入购物车的次数
-- Daily member-behavior summary: one row per user per day with login, cart,
-- order, and payment aggregates, plus a per-sku order breakdown as an array of
-- structs.
drop table if exists dws_user_action_daycount;
create external table dws_user_action_daycount
(
user_id string comment '用户 id',
login_count bigint comment '登录次数',
cart_count bigint comment '加入购物车次数',
order_count bigint comment '下单次数',
order_amount decimal(16,2) comment '下单金额',
payment_count bigint comment '支付次数',
payment_amount decimal(16,2) comment '支付金额',
order_detail_stats array<struct<sku_id:string,sku_num:bigint,order_count:bigint,order_amount:decimal(20,2)>> comment '下单明细统计'
) COMMENT '每日用户行为'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action_daycount/'
tblproperties ("parquet.compression"="lzo");
有些用户没有登入,所以是null,这样会造成大量数据倾斜
mid_id
user_id 有一部分user_id 为null
解决办法 统计会员的登入次数
-- Per-user login (app launch) counts for the day. Start-log rows without a
-- user_id are anonymous sessions; filtering them out avoids one huge NULL
-- group (the data-skew hotspot described above).
select
user_id,
count(*) login_count
from dwd_start_log
where dt='2020-06-27'
and user_id is not null -- fixed typo: was "is not noll", which does not parse
group by user_id
购物车事实表拿不到历史过程,所以需要从日志dwd_action_log中拿
-- Per-user add-to-cart counts. The cart fact table only holds current state,
-- so the historical "cart_add" events are taken from the action log instead.
select
user_id,
count(*) cart_count
from dwd_action_log
where dt ='2020-06-27'
and user_id is not null
and action_id='cart_add'
group by user_id
-- Per-user order count and total order amount for the day.
select
user_id,
count(*) order_count,
sum(final_total_amount) order_amount
from dwd_fact_order_info
where dt='2020-06-27'
group by user_id
-- Per-user payment count and total paid amount for the day.
select
user_id,
count(*) payment_count,
sum(payment_amount) payment_amount
from dwd_fact_payment_info -- fixed typo: was "dwd_facct_payment_info" (cf. the script version below)
where dt ='2020-06-27'
group by user_id
有关下单明细统计,需要转变结构体
-- Per-user, per-sku order detail aggregates; this is the flat form that the
-- next snippet folds into an array of structs.
select
user_id,
sku_id,
sum(sku_num) sku_num,
count(*) order_count,
sum(final_amount_d)order_amount
from dwd_fact_order_detail
where dt ='2020-06-27'
group by user_id,sku_id
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rN58q9bV-1600789772853)(/Users/vanas/Desktop/截屏2020-07-01下午6.49.50.png)]
-- Fold the per-sku aggregates into one array of (sku_id, sku_num, order_count,
-- order_amount) structs per user, matching the order_detail_stats column type.
select
user_id,
collect_set(named_struct('sku_id',sku_id,'sku_num',sku_num,'order_count',order_count,'order_amount',order_amount))
from
(
select
user_id,
sku_id,
sum(sku_num) sku_num,
count(*) order_count,
sum(final_amount_d)order_amount
from dwd_fact_order_detail
where dt ='2020-06-27'
group by user_id,sku_id
)t1
group by user_id
注意需要精度转换
cast(sum(final_amount_d) as decimal(20,2)) order_amount
数据装载
拼接
left join 登入放在最左边,其他操作都是基于登入后的
提前声明子查询,每个子查询用 ","分隔 这样可读性较好
-- Load the daily member summary. tmp_login anchors the left joins: every other
-- activity (cart, order, payment) presupposes a logged-in session.
-- Fixed: the CTEs previously filtered dt='2020-06-14' while the insert wrote
-- partition dt='2020-06-27', i.e. June-14 aggregates landed in the June-27
-- partition; all dates are now '2020-06-27' to match the surrounding walkthrough.
with
tmp_login as
(
-- per-user login counts; NULL user_ids (anonymous sessions) excluded
select
user_id,
count(*) login_count
from dwd_start_log
where dt='2020-06-27'
and user_id is not null
group by user_id
),
tmp_cart as
(
-- add-to-cart events come from the action log (cart fact has no history)
select
user_id,
count(*) cart_count
from dwd_action_log
where dt='2020-06-27'
and user_id is not null
and action_id='cart_add'
group by user_id
),tmp_order as
(
select
user_id,
count(*) order_count,
sum(final_total_amount) order_amount
from dwd_fact_order_info
where dt='2020-06-27'
group by user_id
) ,
tmp_payment as
(
select
user_id,
count(*) payment_count,
sum(payment_amount) payment_amount
from dwd_fact_payment_info
where dt='2020-06-27'
group by user_id
),
tmp_order_detail as
(
-- per-sku order breakdown folded into an array of structs per user;
-- cast keeps the amount within the struct's decimal(20,2) field type
select
user_id,
collect_set(named_struct('sku_id',sku_id,'sku_num',sku_num,'order_count',order_count,'order_amount',order_amount)) order_stats
from
(
select
user_id,
sku_id,
sum(sku_num) sku_num,
count(*) order_count,
cast(sum(final_amount_d) as decimal(20,2)) order_amount
from dwd_fact_order_detail
where dt='2020-06-27'
group by user_id,sku_id
)tmp
group by user_id
)
insert overwrite table dws_user_action_daycount partition(dt='2020-06-27')
select
tmp_login.user_id,
login_count,
nvl(cart_count,0),
nvl(order_count,0),
nvl(order_amount,0.0),
nvl(payment_count,0),
nvl(payment_amount,0.0),
order_stats
from tmp_login
left join tmp_cart on tmp_login.user_id=tmp_cart.user_id
left join tmp_order on tmp_login.user_id=tmp_order.user_id
left join tmp_payment on tmp_login.user_id=tmp_payment.user_id
left join tmp_order_detail on tmp_login.user_id=tmp_order_detail.user_id;
每日商品行为
-- Daily product-behavior summary: one row per sku per day covering order,
-- payment, refund, cart, favorite, and review activity.
drop table if exists dws_sku_action_daycount;
create external table dws_sku_action_daycount
(
sku_id string comment 'sku_id',
order_count bigint comment '被下单次数',
order_num bigint comment '被下单件数',
order_amount decimal(16,2) comment '被下单金额',
payment_count bigint comment '被支付次数',
payment_num bigint comment '被支付件数',
payment_amount decimal(16,2) comment '被支付金额',
refund_count bigint comment '被退款次数',
refund_num bigint comment '被退款件数',
refund_amount decimal(16,2) comment '被退款金额',
cart_count bigint comment '被加入购物车次数',
favor_count bigint comment '被收藏次数',
appraise_good_count bigint comment '好评数',
appraise_mid_count bigint comment '中评数',
appraise_bad_count bigint comment '差评数',
appraise_default_count bigint comment '默认评价数'
) COMMENT '每日商品行为'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_sku_action_daycount/'
tblproperties ("parquet.compression"="lzo");
多表join的连接条件问题(COALESCE函数)
a
full outer join b on a.id = b.id
full outer join c on nvl(a.id,b.id) = c.id
full outer join d on coalesce(a.id,b.id,c.id)=d.id
full outer join e on coalesce(a.id,b.id,c.id,d.id)=e.ida
数据装载
其中payment可以直接从dwd_fact_payment_info表中获取订单号,可以不必那么复杂
补0,进行数据合并,要求必须是数值类型,用union再聚合,不用join这样效率高
with
tmp_order as
(
select
sku_id,
count(*) order_count,
sum(sku_num) order_num,
sum(final_amount_d) order_amount
from dwd_fact_order_detail
where dt='2020-06-27'
group by sku_id
),
-- Per-sku payment aggregates: order detail rows whose parent order was paid
-- today. Orders can be paid the day after creation, so both today's and
-- yesterday's order partitions are scanned for a matching payment_time.
-- Fixed: a full-width Chinese period was fused to the table name
-- ("dwd_fact_order_info。"), which breaks the statement.
tmp_payment as
(
select
sku_id,
count(*) payment_count,
sum(sku_num) payment_num,
sum(final_amount_d) payment_amount
from dwd_fact_order_detail
where dt='2020-06-27'
and order_id in
(
select
id
from dwd_fact_order_info -- dwd_fact_payment_info could be used here instead
where (dt='2020-06-27'
or dt=date_add('2020-06-27',-1))
and date_format(payment_time,'yyyy-MM-dd')='2020-06-27'
)
group by sku_id
),
tmp_refund as
(
select
sku_id,
count(*) refund_count,
sum(refund_num) refund_num,
sum(refund_amount) refund_amount
from dwd_fact_order_refund_info
where dt='2020-06-27'
group by sku_id
),
tmp_cart as
(
select
item sku_id,
count(*) cart_count
from dwd_action_log
where dt='2020-06-27'
and user_id is not null
and action_id='cart_add'
group by item
),tmp_favor as
(
select
item sku_id,
count(*) favor_count
from dwd_action_log
where dt='2020-06-27'
and user_id is not null
and action_id='favor_add'
group by item
),
tmp_appraise as
(
select
sku_id,
sum(if(appraise='1201',1,0)) appraise_good_count,
sum(if(appraise='1202',1,0)) appraise_mid_count,
sum(if(appraise='1203',1,0)) appraise_bad_count,
sum(if(appraise='1204',1,0)) appraise_default_count
from dwd_fact_comment_info
where dt='2020-06-27'
group by sku_id
)
insert overwrite table dws_sku_action_daycount partition(dt='2020-06-27')
select
sku_id,
sum(order_count),
sum(order_num),
sum(order_amount),
sum(payment_count),
sum(payment_num),
sum(payment_amount),
sum(refund_count),
sum(refund_num),
sum(refund_amount),
sum(cart_count),
sum(favor_count),
sum(appraise_good_count),
sum(appraise_mid_count),
sum(appraise_bad_count),
sum(appraise_default_count)
from
(
select
sku_id,
order_count,
order_num,
order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_order
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
payment_count,
payment_num,
payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_payment
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
refund_count,
refund_num,
refund_amount,
0 cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_refund
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_cart
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_favor
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
0 favor_count,
appraise_good_count,
appraise_mid_count,
appraise_bad_count,
appraise_default_count
from tmp_appraise
)tmp
group by sku_id;
每日活动统计
建表
-- Daily activity (promotion) summary: one row per activity per day with
-- exposure, order, and payment aggregates alongside the activity attributes.
drop table if exists dws_activity_info_daycount;
create external table dws_activity_info_daycount(
`id` string COMMENT '编号',
`activity_name` string COMMENT '活动名称',
`activity_type` string COMMENT '活动类型',
`start_time` string COMMENT '开始时间',
`end_time` string COMMENT '结束时间',
`create_time` string COMMENT '创建时间',
`display_count` bigint COMMENT '曝光次数',
`order_count` bigint COMMENT '下单次数',
`order_amount` decimal(20,2) COMMENT '下单金额',
`payment_count` bigint COMMENT '支付次数',
`payment_amount` decimal(20,2) COMMENT '支付金额'
) COMMENT '每日活动信息表'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_activity_info_daycount/'
tblproperties ("parquet.compression"="lzo");
数据装载
-- Load the daily activity summary: order/payment aggregates from the order
-- fact (scanning today's and yesterday's partitions because payment can lag
-- creation by a day), exposure counts from the display log, and activity
-- attributes from the dimension table.
with
tmp_op as
(
-- conditional aggregation: count/sum only events whose own timestamp falls
-- on the target day, regardless of which partition the row lives in
select
activity_id,
sum(if(date_format(create_time,'yyyy-MM-dd')='2020-06-14',1,0)) order_count,
sum(if(date_format(create_time,'yyyy-MM-dd')='2020-06-14',final_total_amount,0)) order_amount,
sum(if(date_format(payment_time,'yyyy-MM-dd')='2020-06-14',1,0)) payment_count,
sum(if(date_format(payment_time,'yyyy-MM-dd')='2020-06-14',final_total_amount,0)) payment_amount
from dwd_fact_order_info
where (dt='2020-06-14' or dt=date_add('2020-06-14',-1))
and activity_id is not null
group by activity_id
),
tmp_display as
(
-- exposure events: display-log items whose item_type marks them as activities
select
item activity_id,
count(*) display_count
from dwd_display_log
where dt='2020-06-14'
and item_type='activity_id'
group by item
),
tmp_activity as
(
select
*
from dwd_dim_activity_info
where dt='2020-06-14'
)
insert overwrite table dws_activity_info_daycount partition(dt='2020-06-14')
select
nvl(tmp_op.activity_id,tmp_display.activity_id),
tmp_activity.activity_name,
tmp_activity.activity_type,
tmp_activity.start_time,
tmp_activity.end_time,
tmp_activity.create_time,
tmp_display.display_count,
tmp_op.order_count,
tmp_op.order_amount,
tmp_op.payment_count,
tmp_op.payment_amount
from tmp_op
full outer join tmp_display on tmp_op.activity_id=tmp_display.activity_id
left join tmp_activity on nvl(tmp_op.activity_id,tmp_display.activity_id)=tmp_activity.id;
每日地区统计
建表
-- Daily per-province summary: active-device, order, and payment aggregates
-- joined onto the province dimension attributes.
drop table if exists dws_area_stats_daycount;
create external table dws_area_stats_daycount(
`id` bigint COMMENT '编号',
`province_name` string COMMENT '省份名称',
`area_code` string COMMENT '地区编码',
`iso_code` string COMMENT 'iso编码',
`region_id` string COMMENT '地区ID',
`region_name` string COMMENT '地区名称',
`login_count` bigint COMMENT '活跃设备数', -- fixed: was declared string though it stores a count (cf. sibling count columns)
`order_count` bigint COMMENT '下单次数',
`order_amount` decimal(20,2) COMMENT '下单金额',
`payment_count` bigint COMMENT '支付次数',
`payment_amount` decimal(20,2) COMMENT '支付金额'
) COMMENT '每日地区信息表'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_area_stats_daycount/'
tblproperties ("parquet.compression"="lzo");
数据装载
-- Load the daily per-province summary. The province dimension drives the
-- result (left joins), so every province appears even with zero activity.
with
tmp_login as
(
-- active devices per area code, from the start log
select
area_code,
count(*) login_count
from dwd_start_log
where dt='2020-06-14'
group by area_code
),
tmp_op as
(
-- order/payment aggregates per province; scans today's and yesterday's
-- partitions because payment can lag order creation by a day
select
province_id,
sum(if(date_format(create_time,'yyyy-MM-dd')='2020-06-14',1,0)) order_count,
sum(if(date_format(create_time,'yyyy-MM-dd')='2020-06-14',final_total_amount,0)) order_amount,
sum(if(date_format(payment_time,'yyyy-MM-dd')='2020-06-14',1,0)) payment_count,
sum(if(date_format(payment_time,'yyyy-MM-dd')='2020-06-14',final_total_amount,0)) payment_amount
from dwd_fact_order_info
where (dt='2020-06-14' or dt=date_add('2020-06-14',-1))
group by province_id
)
insert overwrite table dws_area_stats_daycount partition(dt='2020-06-14')
select
pro.id,
pro.province_name,
pro.area_code,
pro.iso_code,
pro.region_id,
pro.region_name,
nvl(tmp_login.login_count,0),
nvl(tmp_op.order_count,0),
nvl(tmp_op.order_amount,0.0),
nvl(tmp_op.payment_count,0),
nvl(tmp_op.payment_amount,0.0)
from dwd_dim_base_province pro
left join tmp_login on pro.area_code=tmp_login.area_code
left join tmp_op on pro.id=tmp_op.province_id;
DWS层数据导入脚本
#!/bin/bash
APP=gmall
hive=/opt/module/hive/bin/hive
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set mapreduce.job.queuename=hive;
with
tmp_start as
(
select
mid_id,
brand,
model,
count(*) login_count
from ${APP}.dwd_start_log
where dt='$do_date'
group by mid_id,brand,model
),
tmp_page as
(
select
mid_id,
brand,
model,
collect_set(named_struct('page_id',page_id,'page_count',page_count)) page_stats
from
(
select
mid_id,
brand,
model,
page_id,
count(*) page_count
from ${APP}.dwd_page_log
where dt='$do_date'
group by mid_id,brand,model,page_id
)tmp
group by mid_id,brand,model
)
insert overwrite table ${APP}.dws_uv_detail_daycount partition(dt='$do_date')
select
nvl(tmp_start.mid_id,tmp_page.mid_id),
nvl(tmp_start.brand,tmp_page.brand),
nvl(tmp_start.model,tmp_page.model),
tmp_start.login_count,
tmp_page.page_stats
from tmp_start
full outer join tmp_page
on tmp_start.mid_id=tmp_page.mid_id
and tmp_start.brand=tmp_page.brand
and tmp_start.model=tmp_page.model;
-- Daily per-user activity summary for $do_date: login / cart / order /
-- payment counts plus a per-sku order-stats array, one row per active user.
with
tmp_login as
(
-- login events per user; this CTE drives the join, so only users with a
-- start-log entry today get a row
select
user_id,
count(*) login_count
from ${APP}.dwd_start_log
where dt='$do_date'
and user_id is not null
group by user_id
),
tmp_cart as
(
-- cart-add actions per user
select
user_id,
count(*) cart_count
from ${APP}.dwd_action_log
where dt='$do_date'
and user_id is not null
and action_id='cart_add'
group by user_id
),tmp_order as
(
-- order count and total amount per user
select
user_id,
count(*) order_count,
sum(final_total_amount) order_amount
from ${APP}.dwd_fact_order_info
where dt='$do_date'
group by user_id
) ,
tmp_payment as
(
-- payment count and total amount per user
select
user_id,
count(*) payment_count,
sum(payment_amount) payment_amount
from ${APP}.dwd_fact_payment_info
where dt='$do_date'
group by user_id
),
tmp_order_detail as
(
-- per-user array of per-sku order stats, packed as structs so the whole
-- detail fits in a single column of the wide table
select
user_id,
collect_set(named_struct('sku_id',sku_id,'sku_num',sku_num,'order_count',order_count,'order_amount',order_amount)) order_stats
from
(
select
user_id,
sku_id,
sum(sku_num) sku_num,
count(*) order_count,
cast(sum(final_amount_d) as decimal(20,2)) order_amount
from ${APP}.dwd_fact_order_detail
where dt='$do_date'
group by user_id,sku_id
)tmp
group by user_id
)
-- NOTE(review): left joins off tmp_login drop users who ordered/paid today
-- without a start-log entry -- confirm that is the intended definition.
insert overwrite table ${APP}.dws_user_action_daycount partition(dt='$do_date')
select
tmp_login.user_id,
login_count,
nvl(cart_count,0),
nvl(order_count,0),
nvl(order_amount,0.0),
nvl(payment_count,0),
nvl(payment_amount,0.0),
order_stats
from tmp_login
left outer join tmp_cart on tmp_login.user_id=tmp_cart.user_id
left outer join tmp_order on tmp_login.user_id=tmp_order.user_id
left outer join tmp_payment on tmp_login.user_id=tmp_payment.user_id
left outer join tmp_order_detail on tmp_login.user_id=tmp_order_detail.user_id;
-- Daily per-sku action summary for $do_date. Each CTE produces one metric
-- family; the final query zero-pads every branch to the same 16-column shape,
-- UNION ALLs them, and SUMs per sku_id (sparse-matrix aggregation pattern).
with
tmp_order as
(
-- orders placed today, per sku
select
sku_id,
count(*) order_count,
sum(sku_num) order_num,
sum(final_amount_d) order_amount
from ${APP}.dwd_fact_order_detail
where dt='$do_date'
group by sku_id
),
tmp_payment as
(
-- details of orders PAID today; the order itself may have been created
-- yesterday, hence the two-partition lookup on dwd_fact_order_info
select
sku_id,
count(*) payment_count,
sum(sku_num) payment_num,
sum(final_amount_d) payment_amount
from ${APP}.dwd_fact_order_detail
where dt='$do_date'
and order_id in
(
select
id
from ${APP}.dwd_fact_order_info
where (dt='$do_date'
or dt=date_add('$do_date',-1))
and date_format(payment_time,'yyyy-MM-dd')='$do_date'
)
group by sku_id
),
tmp_refund as
(
-- refunds requested today, per sku
select
sku_id,
count(*) refund_count,
sum(refund_num) refund_num,
sum(refund_amount) refund_amount
from ${APP}.dwd_fact_order_refund_info
where dt='$do_date'
group by sku_id
),
tmp_cart as
(
-- cart-add actions today; item carries the sku id for this action type
select
item sku_id,
count(*) cart_count
from ${APP}.dwd_action_log
where dt='$do_date'
and user_id is not null
and action_id='cart_add'
group by item
),tmp_favor as
(
-- favorite-add actions today
select
item sku_id,
count(*) favor_count
from ${APP}.dwd_action_log
where dt='$do_date'
and user_id is not null
and action_id='favor_add'
group by item
),
tmp_appraise as
(
-- appraisal counts today; 1201/1202/1203/1204 = good/mid/bad/default codes
select
sku_id,
sum(if(appraise='1201',1,0)) appraise_good_count,
sum(if(appraise='1202',1,0)) appraise_mid_count,
sum(if(appraise='1203',1,0)) appraise_bad_count,
sum(if(appraise='1204',1,0)) appraise_default_count
from ${APP}.dwd_fact_comment_info
where dt='$do_date'
group by sku_id
)
insert overwrite table ${APP}.dws_sku_action_daycount partition(dt='$do_date')
select
sku_id,
sum(order_count),
sum(order_num),
sum(order_amount),
sum(payment_count),
sum(payment_num),
sum(payment_amount),
sum(refund_count),
sum(refund_num),
sum(refund_amount),
sum(cart_count),
sum(favor_count),
sum(appraise_good_count),
sum(appraise_mid_count),
sum(appraise_bad_count),
sum(appraise_default_count)
from
(
-- each branch fills its own columns and zeroes the rest, so the outer
-- SUM .. GROUP BY sku_id effectively full-outer-joins all six CTEs
select
sku_id,
order_count,
order_num,
order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_order
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
payment_count,
payment_num,
payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_payment
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
refund_count,
refund_num,
refund_amount,
0 cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_refund
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
cart_count,
0 favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_cart
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
favor_count,
0 appraise_good_count,
0 appraise_mid_count,
0 appraise_bad_count,
0 appraise_default_count
from tmp_favor
union all
select
sku_id,
0 order_count,
0 order_num,
0 order_amount,
0 payment_count,
0 payment_num,
0 payment_amount,
0 refund_count,
0 refund_num,
0 refund_amount,
0 cart_count,
0 favor_count,
appraise_good_count,
appraise_mid_count,
appraise_bad_count,
appraise_default_count
from tmp_appraise
)tmp
group by sku_id;
-- Daily per-province stats: login activity (by area_code) plus order/payment
-- metrics (by province_id), left-joined onto the full province dimension so
-- every province gets a row even with zero activity.
with
tmp_login as
(
-- start-log events per area code
select
area_code,
count(*) login_count
from ${APP}.dwd_start_log
where dt='$do_date'
group by area_code
),
tmp_op as
(
-- orders created / paid today; yesterday's partition is scanned too
-- because an order created yesterday may have been paid today
select
province_id,
sum(if(date_format(create_time,'yyyy-MM-dd')='$do_date',1,0)) order_count,
sum(if(date_format(create_time,'yyyy-MM-dd')='$do_date',final_total_amount,0)) order_amount,
sum(if(date_format(payment_time,'yyyy-MM-dd')='$do_date',1,0)) payment_count,
sum(if(date_format(payment_time,'yyyy-MM-dd')='$do_date',final_total_amount,0)) payment_amount
from ${APP}.dwd_fact_order_info
where (dt='$do_date' or dt=date_add('$do_date',-1))
group by province_id
)
insert overwrite table ${APP}.dws_area_stats_daycount partition(dt='$do_date')
select
pro.id,
pro.province_name,
pro.area_code,
pro.iso_code,
pro.region_id,
pro.region_name,
nvl(tmp_login.login_count,0),
nvl(tmp_op.order_count,0),
nvl(tmp_op.order_amount,0.0),
nvl(tmp_op.payment_count,0),
nvl(tmp_op.payment_amount,0.0)
from ${APP}.dwd_dim_base_province pro
left join tmp_login on pro.area_code=tmp_login.area_code
left join tmp_op on pro.id=tmp_op.province_id;
-- Daily per-activity stats: orders/payments attributed to an activity,
-- display (exposure) counts from the display log, enriched with the
-- activity dimension.
with
tmp_op as
(
-- orders created / paid today per activity; yesterday's partition is also
-- scanned to catch orders created yesterday and paid today
select
activity_id,
sum(if(date_format(create_time,'yyyy-MM-dd')='$do_date',1,0)) order_count,
sum(if(date_format(create_time,'yyyy-MM-dd')='$do_date',final_total_amount,0)) order_amount,
sum(if(date_format(payment_time,'yyyy-MM-dd')='$do_date',1,0)) payment_count,
sum(if(date_format(payment_time,'yyyy-MM-dd')='$do_date',final_total_amount,0)) payment_amount
from ${APP}.dwd_fact_order_info
where (dt='$do_date' or dt=date_add('$do_date',-1))
and activity_id is not null
group by activity_id
),
tmp_display as
(
-- display-log exposures whose item is an activity id
select
item activity_id,
count(*) display_count
from ${APP}.dwd_display_log
where dt='$do_date'
and item_type='activity_id'
group by item
),
tmp_activity as
(
-- today's snapshot of the activity dimension
select
*
from ${APP}.dwd_dim_activity_info
where dt='$do_date'
)
insert overwrite table ${APP}.dws_activity_info_daycount partition(dt='$do_date')
select
nvl(tmp_op.activity_id,tmp_display.activity_id),
tmp_activity.activity_name,
tmp_activity.activity_type,
tmp_activity.start_time,
tmp_activity.end_time,
tmp_activity.create_time,
tmp_display.display_count,
tmp_op.order_count,
tmp_op.order_amount,
tmp_op.payment_count,
tmp_op.payment_amount
from tmp_op
full outer join tmp_display on tmp_op.activity_id=tmp_display.activity_id
left join tmp_activity on nvl(tmp_op.activity_id,tmp_display.activity_id)=tmp_activity.id;
"
$hive -e "$sql"
DWT
这层的数据,都不分区
会员主题宽表
全量表,每天都要更新
不要每天重新计算,用原来的值结合新数据计算
建表
-- User topic wide table (DWT layer, unpartitioned full snapshot).
-- Fixes vs. the original notes:
--  * order_last_30d_amount was bigint -> amounts are decimal(16,2)
--  * payment_count / payment_last_30d_count were decimal(16,2) -> counts are bigint
--  * added storage/location/compression clauses and terminator to match the
--    sibling DWT tables (dwt_sku_topic etc.)
drop table if exists dwt_user_topic;
create external table dwt_user_topic
(
user_id string comment '用户id',
login_date_first string comment '首次登录时间',
login_date_last string comment '末次登录时间',
login_count bigint comment '累积登录天数',
login_last_30d_count bigint comment '最近30日登录天数',
order_date_first string comment '首次下单时间',
order_date_last string comment '末次下单时间',
order_count bigint comment '累积下单次数',
order_amount decimal(16,2) comment '累积下单金额',
order_last_30d_count bigint comment '最近30日下单次数',
order_last_30d_amount decimal(16,2) comment '最近30日下单金额',
payment_date_first string comment '首次支付时间',
payment_date_last string comment '末次支付时间',
payment_count bigint comment '累积支付次数',
payment_amount decimal(16,2) comment '累积支付金额',
payment_last_30d_count bigint comment '最近30日支付次数',
payment_last_30d_amount decimal(16,2) comment '最近30日支付金额'
)COMMENT '用户主题宽表'
stored as parquet
location '/warehouse/gmall/dwt/dwt_user_topic/'
tblproperties ("parquet.compression"="lzo");
数据分为2部分 累积的 和 30天的
select
login_count,
order_count,
order_amount,
payment_count,
payment_amount
from dws_user_action_daycount
where dt='2020-06-27'
select
user_id,
count(*),
sum(order_count),
sum(order_amount),
sum(payment_count),
sum(payment_amount)
from dws_user_action_daycount
where dt>=date_add('2020-06-27',-29)
and dt<='2020-06-27'
group by user_id
把这两个查询合并到一起
sum(if(login_count>0,1,0)) login_last_30d_count,
这里直接用count就行
sum 不可去掉的因为 group by聚合
-- Combined example: one scan computes both today's metrics (dt = business
-- date inside if()) and the 30-day rolling metrics (plain sums over the
-- 30-day window). Fix: the upper bound was written '2020=06-27' (equals
-- sign instead of hyphen), which compares '=' (0x3D) > '-' (0x2D) and so
-- silently accepted every 2020 partition instead of capping at 06-27.
select
user_id,
sum(if(dt='2020-06-27',login_count,0)) login_count,
sum(if(dt='2020-06-27',order_count,0)) order_count,
sum(if(dt='2020-06-27',order_amount,0)) order_amount,
sum(if(dt='2020-06-27',payment_count,0)) payment_count,
sum(if(dt='2020-06-27',payment_amount,0)) payment_amount,
sum(if(login_count>0,1,0)) login_last_30d_count,
sum(order_count) order_last_30d_count,
sum(order_amount) order_last_30d_amount,
sum(payment_count) payment_last_30d_count,
sum(payment_amount) payment_last_30d_amount
from dws_user_action_daycount
where dt>=date_add( '2020-06-27',-29)
and dt<='2020-06-27'
group by user_id
数据装载
-- Incremental full-snapshot load of dwt_user_topic: full outer join of the
-- existing snapshot (old) with the last-30-day aggregation (new); cumulative
-- columns add old+new, first/last dates are carried forward or stamped today.
insert overwrite table dwt_user_topic
select
nvl(new.user_id,old.user_id),
-- first login: keep old value; stamp today only for users first seen today
if(old.login_date_first is null and new.login_count>0,'2020-06-27',old.login_date_first),
-- last login: today if active today, else carry forward
if(new.login_count>0,'2020-06-27',old.login_date_last),
-- cumulative login DAYS: +1 at most per run
nvl(old.login_count,0)+if(new.login_count>0,1,0),
nvl(new.login_last_30d_count,0),
if(old.order_date_first is null and new.order_count>0,'2020-06-27',old.order_date_first),
if(new.order_count>0,'2020-06-27',old.order_date_last),
nvl(old.order_count,0)+nvl(new.order_count,0),
nvl(old.order_amount,0)+nvl(new.order_amount,0),
nvl(new.order_last_30d_count,0),
nvl(new.order_last_30d_amount,0),
if(old.payment_date_first is null and new.payment_count>0,'2020-06-27',old.payment_date_first),
if(new.payment_count>0,'2020-06-27',old.payment_date_last),
nvl(old.payment_count,0)+nvl(new.payment_count,0),
nvl(old.payment_amount,0)+nvl(new.payment_amount,0),
nvl(new.payment_last_30d_count,0),
nvl(new.payment_last_30d_amount,0)
from
dwt_user_topic old
full outer join
(
select
user_id,
sum(if(dt='2020-06-27',login_count,0)) login_count,
sum(if(dt='2020-06-27',order_count,0)) order_count,
sum(if(dt='2020-06-27',order_amount,0)) order_amount,
sum(if(dt='2020-06-27',payment_count,0)) payment_count,
sum(if(dt='2020-06-27',payment_amount,0)) payment_amount,
sum(if(login_count>0,1,0)) login_last_30d_count,
sum(order_count) order_last_30d_count,
sum(order_amount) order_last_30d_amount,
sum(payment_count) payment_last_30d_count,
sum(payment_amount) payment_last_30d_amount
from dws_user_action_daycount
-- NOTE(review): no dt<= upper bound here (the combined example above has
-- one); harmless if no future partitions exist -- confirm.
where dt>=date_add( '2020-06-27',-29)
group by user_id
)new
on old.user_id=new.user_id;
设备主题宽表
新增用户 就去找 ‘首次活跃时间’
建表
-- Device (UV) topic wide table: one row per device, unpartitioned snapshot.
drop table if exists dwt_uv_topic;
create external table dwt_uv_topic
(
`mid_id` string,
`brand` string,
`model` string,
`login_date_first` string comment '首次活跃时间',
`login_date_last` string comment '末次活跃时间',
`login_day_count` bigint comment '当日活跃次数',
`login_count` bigint comment '累积活跃天数'
)
stored as parquet
location '/warehouse/gmall/dwt/dwt_uv_topic';
Login date first问题
区别
只有老用户才有login_count=0,所以加不加 new.login_count>0都一样
--这两个表达式是等价的
if(old.login_date_first is null,'2020-06-27',old.login_date_first)
nvl(old.login_date_first,'2020-06-27')
if(old.login_date_first is null and new.login_count>0,'2020-06-27',old.login_date_first)
数据装载
-- Incremental load of dwt_uv_topic from today's dws_uv_detail_daycount.
-- Fixes vs. the original notes:
--  * the source table was misspelled 'dwt_uv_top' (table does not exist)
--  * brand/model were emitted in the order model, brand -- the DDL declares
--    mid_id, brand, model, so model values landed in the brand column
insert overwrite table dwt_uv_topic
select
nvl(new.mid_id,old.mid_id),
nvl(new.brand,old.brand),
nvl(new.model,old.model),
-- first-active: a device absent from old is new today
if(old.mid_id is null,'2020-06-27',old.login_date_first),
-- last-active: stamped today when the device appears in today's partition
if(new.mid_id is not null,'2020-06-27',old.login_date_last),
-- today's activity count (0 for devices inactive today)
if(new.mid_id is not null, new.login_count,0),
-- cumulative active DAYS: +1 at most per run
nvl(old.login_count,0)+if(new.login_count>0,1,0)
from dwt_uv_topic old
full outer join
(
select
*
from dws_uv_detail_daycount
where dt='2020-06-27'
)new
on old.mid_id=new.mid_id;
商品主题宽表
建表
-- SKU topic wide table: cumulative and rolling-30-day order/payment/refund/
-- cart/favor/appraise metrics per sku; unpartitioned snapshot.
drop table if exists dwt_sku_topic;
create external table dwt_sku_topic
(
sku_id string comment 'sku_id',
spu_id string comment 'spu_id',
order_last_30d_count bigint comment '最近30日被下单次数',
order_last_30d_num bigint comment '最近30日被下单件数',
order_last_30d_amount decimal(16,2) comment '最近30日被下单金额',
order_count bigint comment '累积被下单次数',
order_num bigint comment '累积被下单件数',
order_amount decimal(16,2) comment '累积被下单金额',
payment_last_30d_count bigint comment '最近30日被支付次数',
payment_last_30d_num bigint comment '最近30日被支付件数',
payment_last_30d_amount decimal(16,2) comment '最近30日被支付金额',
payment_count bigint comment '累积被支付次数',
payment_num bigint comment '累积被支付件数',
payment_amount decimal(16,2) comment '累积被支付金额',
refund_last_30d_count bigint comment '最近三十日退款次数',
refund_last_30d_num bigint comment '最近三十日退款件数',
refund_last_30d_amount decimal(16,2) comment '最近三十日退款金额',
refund_count bigint comment '累积退款次数',
refund_num bigint comment '累积退款件数',
refund_amount decimal(16,2) comment '累积退款金额',
cart_last_30d_count bigint comment '最近30日被加入购物车次数',
cart_count bigint comment '累积被加入购物车次数',
favor_last_30d_count bigint comment '最近30日被收藏次数',
favor_count bigint comment '累积被收藏次数',
appraise_last_30d_good_count bigint comment '最近30日好评数',
appraise_last_30d_mid_count bigint comment '最近30日中评数',
appraise_last_30d_bad_count bigint comment '最近30日差评数',
appraise_last_30d_default_count bigint comment '最近30日默认评价数',
appraise_good_count bigint comment '累积好评数',
appraise_mid_count bigint comment '累积中评数',
appraise_bad_count bigint comment '累积差评数',
appraise_default_count bigint comment '累积默认评价数'
)COMMENT '商品主题宽表'
stored as parquet
location '/warehouse/gmall/dwt/dwt_sku_topic/'
tblproperties ("parquet.compression"="lzo");
数据装载
-- Incremental load of dwt_sku_topic. Fixes vs. the original notes:
--  * payment_num / payment_amount accumulators both added new.payment_count
--    (copy-paste error) -- now add new.payment_num / new.payment_amount
--  * the inner day-filters used dt='2020-06-14' while the WHERE and the
--    sku_info snapshot in the SAME statement used '2020-06-27'; aligned all
--    literals to the business date 2020-06-27
insert overwrite table dwt_sku_topic
select
nvl(new.sku_id,old.sku_id),
sku_info.spu_id,
nvl(new.order_count30,0),
nvl(new.order_num30,0),
nvl(new.order_amount30,0),
nvl(old.order_count,0) + nvl(new.order_count,0),
nvl(old.order_num,0) + nvl(new.order_num,0),
nvl(old.order_amount,0) + nvl(new.order_amount,0),
nvl(new.payment_count30,0),
nvl(new.payment_num30,0),
nvl(new.payment_amount30,0),
nvl(old.payment_count,0) + nvl(new.payment_count,0),
nvl(old.payment_num,0) + nvl(new.payment_num,0),
nvl(old.payment_amount,0) + nvl(new.payment_amount,0),
nvl(new.refund_count30,0),
nvl(new.refund_num30,0),
nvl(new.refund_amount30,0),
nvl(old.refund_count,0) + nvl(new.refund_count,0),
nvl(old.refund_num,0) + nvl(new.refund_num,0),
nvl(old.refund_amount,0) + nvl(new.refund_amount,0),
nvl(new.cart_count30,0),
nvl(old.cart_count,0) + nvl(new.cart_count,0),
nvl(new.favor_count30,0),
nvl(old.favor_count,0) + nvl(new.favor_count,0),
nvl(new.appraise_good_count30,0),
nvl(new.appraise_mid_count30,0),
nvl(new.appraise_bad_count30,0),
nvl(new.appraise_default_count30,0) ,
nvl(old.appraise_good_count,0) + nvl(new.appraise_good_count,0),
nvl(old.appraise_mid_count,0) + nvl(new.appraise_mid_count,0),
nvl(old.appraise_bad_count,0) + nvl(new.appraise_bad_count,0),
nvl(old.appraise_default_count,0) + nvl(new.appraise_default_count,0)
from
dwt_sku_topic old
full outer join
(
-- one scan yields both today's metrics (if(dt=...)) and the 30-day sums
select
sku_id,
sum(if(dt='2020-06-27', order_count,0 )) order_count,
sum(if(dt='2020-06-27',order_num ,0 )) order_num,
sum(if(dt='2020-06-27',order_amount,0 )) order_amount ,
sum(if(dt='2020-06-27',payment_count,0 )) payment_count,
sum(if(dt='2020-06-27',payment_num,0 )) payment_num,
sum(if(dt='2020-06-27',payment_amount,0 )) payment_amount,
sum(if(dt='2020-06-27',refund_count,0 )) refund_count,
sum(if(dt='2020-06-27',refund_num,0 )) refund_num,
sum(if(dt='2020-06-27',refund_amount,0 )) refund_amount,
sum(if(dt='2020-06-27',cart_count,0 )) cart_count,
sum(if(dt='2020-06-27',favor_count,0 )) favor_count,
sum(if(dt='2020-06-27',appraise_good_count,0 )) appraise_good_count,
sum(if(dt='2020-06-27',appraise_mid_count,0 ) ) appraise_mid_count ,
sum(if(dt='2020-06-27',appraise_bad_count,0 )) appraise_bad_count,
sum(if(dt='2020-06-27',appraise_default_count,0 )) appraise_default_count,
sum(order_count) order_count30 ,
sum(order_num) order_num30,
sum(order_amount) order_amount30,
sum(payment_count) payment_count30,
sum(payment_num) payment_num30,
sum(payment_amount) payment_amount30,
sum(refund_count) refund_count30,
sum(refund_num) refund_num30,
sum(refund_amount) refund_amount30,
sum(cart_count) cart_count30,
sum(favor_count) favor_count30,
sum(appraise_good_count) appraise_good_count30,
sum(appraise_mid_count) appraise_mid_count30,
sum(appraise_bad_count) appraise_bad_count30,
sum(appraise_default_count) appraise_default_count30
from dws_sku_action_daycount
where dt >= date_add ('2020-06-27', -29)
group by sku_id
)new
on new.sku_id = old.sku_id
left join
(select * from dwd_dim_sku_info where dt='2020-06-27') sku_info
on nvl(new.sku_id,old.sku_id)= sku_info.id;
活动主题宽表
建表
-- Activity topic wide table: per-activity daily and cumulative metrics.
drop table if exists dwt_activity_topic;
create external table dwt_activity_topic(
`id` string COMMENT '编号',
`activity_name` string COMMENT '活动名称',
`activity_type` string COMMENT '活动类型',
`start_time` string COMMENT '开始时间',
`end_time` string COMMENT '结束时间',
`create_time` string COMMENT '创建时间',
`display_day_count` bigint COMMENT '当日曝光次数',
`order_day_count` bigint COMMENT '当日下单次数',
`order_day_amount` decimal(20,2) COMMENT '当日下单金额',
`payment_day_count` bigint COMMENT '当日支付次数',
`payment_day_amount` decimal(20,2) COMMENT '当日支付金额',
`display_count` bigint COMMENT '累积曝光次数',
`order_count` bigint COMMENT '累积下单次数',
`order_amount` decimal(20,2) COMMENT '累积下单金额',
`payment_count` bigint COMMENT '累积支付次数',
`payment_amount` decimal(20,2) COMMENT '累积支付金额'
) COMMENT '活动主题宽表'
-- NOTE(review): text row format combined with a parquet.compression
-- tblproperty is contradictory -- the property is a no-op unless the table
-- is STORED AS PARQUET like the other DWT tables; confirm intended format.
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwt/dwt_activity_topic/'
tblproperties ("parquet.compression"="lzo");
数据装载
-- Incremental load of dwt_activity_topic: day columns come from today's
-- daycount row; cumulative columns add old + new.
-- Fix: the source partition filter was dt='2020-06-14', a leftover from a
-- different run date; these notes use business date 2020-06-27 throughout.
insert overwrite table dwt_activity_topic
select
nvl(new.id,old.id),
nvl(new.activity_name,old.activity_name),
nvl(new.activity_type,old.activity_type),
nvl(new.start_time,old.start_time),
nvl(new.end_time,old.end_time),
nvl(new.create_time,old.create_time),
nvl(new.display_count,0),
nvl(new.order_count,0),
nvl(new.order_amount,0.0),
nvl(new.payment_count,0),
nvl(new.payment_amount,0.0),
nvl(new.display_count,0)+nvl(old.display_count,0),
nvl(new.order_count,0)+nvl(old.order_count,0),
nvl(new.order_amount,0.0)+nvl(old.order_amount,0.0),
nvl(new.payment_count,0)+nvl(old.payment_count,0),
nvl(new.payment_amount,0.0)+nvl(old.payment_amount,0.0)
from
(
select
*
from dwt_activity_topic
)old
full outer join
(
select
*
from dws_activity_info_daycount
where dt='2020-06-27'
)new
on old.id=new.id;
地区主题宽表
建表
-- Area (province) topic wide table: daily and rolling-30-day metrics.
-- Fix: login_day_count / login_last_30d_count were declared string; they
-- hold counts and are compared/summed numerically -> bigint.
drop table if exists dwt_area_topic;
create external table dwt_area_topic(
`id` bigint COMMENT '编号',
`province_name` string COMMENT '省份名称',
`area_code` string COMMENT '地区编码',
`iso_code` string COMMENT 'iso编码',
`region_id` string COMMENT '地区ID',
`region_name` string COMMENT '地区名称',
`login_day_count` bigint COMMENT '当天活跃设备数',
`login_last_30d_count` bigint COMMENT '最近30天活跃设备数',
`order_day_count` bigint COMMENT '当天下单次数',
`order_day_amount` decimal(16,2) COMMENT '当天下单金额',
`order_last_30d_count` bigint COMMENT '最近30天下单次数',
`order_last_30d_amount` decimal(16,2) COMMENT '最近30天下单金额',
`payment_day_count` bigint COMMENT '当天支付次数',
`payment_day_amount` decimal(16,2) COMMENT '当天支付金额',
`payment_last_30d_count` bigint COMMENT '最近30天支付次数',
`payment_last_30d_amount` decimal(16,2) COMMENT '最近30天支付金额'
) COMMENT '地区主题宽表'
-- NOTE(review): text row format + parquet.compression tblproperty is
-- contradictory; the property only takes effect with STORED AS PARQUET.
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwt/dwt_area_topic/'
tblproperties ("parquet.compression"="lzo");
数据装载
-- Incremental load of dwt_area_topic from the DWS area daycount.
-- Fix: all literals were '2020-06-14', a leftover from another run date;
-- aligned to the business date 2020-06-27 used throughout these notes.
insert overwrite table dwt_area_topic
select
nvl(old.id,new.id),
nvl(old.province_name,new.province_name),
nvl(old.area_code,new.area_code),
nvl(old.iso_code,new.iso_code),
nvl(old.region_id,new.region_id),
nvl(old.region_name,new.region_name),
nvl(new.login_day_count,0),
nvl(new.login_last_30d_count,0),
nvl(new.order_day_count,0),
nvl(new.order_day_amount,0.0),
nvl(new.order_last_30d_count,0),
nvl(new.order_last_30d_amount,0.0),
nvl(new.payment_day_count,0),
nvl(new.payment_day_amount,0.0),
nvl(new.payment_last_30d_count,0),
nvl(new.payment_last_30d_amount,0.0)
from
(
select
*
from dwt_area_topic
)old
full outer join
(
-- NOTE(review): -30 scans 31 partitions inclusive; the user/sku examples
-- use -29 for a 30-day window -- confirm the intended window length.
select
id,
province_name,
area_code,
iso_code,
region_id,
region_name,
sum(if(dt='2020-06-27',login_count,0)) login_day_count,
sum(if(dt='2020-06-27',order_count,0)) order_day_count,
sum(if(dt='2020-06-27',order_amount,0.0)) order_day_amount,
sum(if(dt='2020-06-27',payment_count,0)) payment_day_count,
sum(if(dt='2020-06-27',payment_amount,0.0)) payment_day_amount,
sum(login_count) login_last_30d_count,
sum(order_count) order_last_30d_count,
sum(order_amount) order_last_30d_amount,
sum(payment_count) payment_last_30d_count,
sum(payment_amount) payment_last_30d_amount
from dws_area_stats_daycount
where dt>=date_add('2020-06-27',-30)
group by id,province_name,area_code,iso_code,region_id,region_name
)new
on old.id=new.id;
DWT层数据导入脚本
#!/bin/bash
# DWT-layer daily loader: rebuilds the topic wide tables from DWS daycounts.
APP=gmall
hive=/opt/module/hive/bin/hive
# Use $1 as the business date when given; otherwise default to yesterday.
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set mapreduce.job.queuename=hive;
-- Device topic load. Fix: brand/model were emitted as model, brand while
-- the dwt_uv_topic DDL declares mid_id, brand, model -- model values were
-- landing in the brand column.
insert overwrite table ${APP}.dwt_uv_topic
select
nvl(new.mid_id,old.mid_id),
nvl(new.brand,old.brand),
nvl(new.model,old.model),
-- first-active: devices missing from the old snapshot are new today
if(old.mid_id is null,'$do_date',old.login_date_first),
if(new.mid_id is not null,'$do_date',old.login_date_last),
if(new.mid_id is not null, new.login_count,0),
-- cumulative active days: +1 at most per run
nvl(old.login_count,0)+if(new.login_count>0,1,0)
from
(
select
*
from ${APP}.dwt_uv_topic
)old
full outer join
(
select
*
from ${APP}.dws_uv_detail_daycount
where dt='$do_date'
)new
on old.mid_id=new.mid_id;
-- User topic load: full outer join of old snapshot with 30-day aggregation;
-- cumulative columns add old+new, first/last dates carried forward or
-- stamped with the business date.
insert overwrite table ${APP}.dwt_user_topic
select
nvl(new.user_id,old.user_id),
if(old.login_date_first is null and new.login_count>0,'$do_date',old.login_date_first),
if(new.login_count>0,'$do_date',old.login_date_last),
nvl(old.login_count,0)+if(new.login_count>0,1,0),
nvl(new.login_last_30d_count,0),
if(old.order_date_first is null and new.order_count>0,'$do_date',old.order_date_first),
if(new.order_count>0,'$do_date',old.order_date_last),
nvl(old.order_count,0)+nvl(new.order_count,0),
nvl(old.order_amount,0)+nvl(new.order_amount,0),
nvl(new.order_last_30d_count,0),
nvl(new.order_last_30d_amount,0),
if(old.payment_date_first is null and new.payment_count>0,'$do_date',old.payment_date_first),
if(new.payment_count>0,'$do_date',old.payment_date_last),
nvl(old.payment_count,0)+nvl(new.payment_count,0),
nvl(old.payment_amount,0)+nvl(new.payment_amount,0),
nvl(new.payment_last_30d_count,0),
nvl(new.payment_last_30d_amount,0)
from
${APP}.dwt_user_topic old
full outer join
(
select
user_id,
sum(if(dt='$do_date',login_count,0)) login_count,
sum(if(dt='$do_date',order_count,0)) order_count,
sum(if(dt='$do_date',order_amount,0)) order_amount,
sum(if(dt='$do_date',payment_count,0)) payment_count,
sum(if(dt='$do_date',payment_amount,0)) payment_amount,
sum(if(login_count>0,1,0)) login_last_30d_count,
sum(order_count) order_last_30d_count,
sum(order_amount) order_last_30d_amount,
sum(payment_count) payment_last_30d_count,
sum(payment_amount) payment_last_30d_amount
from ${APP}.dws_user_action_daycount
-- NOTE(review): -30 scans 31 partitions inclusive; the worked example
-- earlier in these notes uses -29 -- confirm the intended window length.
where dt>=date_add( '$do_date',-30)
group by user_id
)new
on old.user_id=new.user_id;
-- SKU topic load. Fix: the payment_num and payment_amount accumulators both
-- added new.payment_count (copy-paste error); they now add new.payment_num
-- and new.payment_amount respectively.
insert overwrite table ${APP}.dwt_sku_topic
select
nvl(new.sku_id,old.sku_id),
sku_info.spu_id,
nvl(new.order_count30,0),
nvl(new.order_num30,0),
nvl(new.order_amount30,0),
nvl(old.order_count,0) + nvl(new.order_count,0),
nvl(old.order_num,0) + nvl(new.order_num,0),
nvl(old.order_amount,0) + nvl(new.order_amount,0),
nvl(new.payment_count30,0),
nvl(new.payment_num30,0),
nvl(new.payment_amount30,0),
nvl(old.payment_count,0) + nvl(new.payment_count,0),
nvl(old.payment_num,0) + nvl(new.payment_num,0),
nvl(old.payment_amount,0) + nvl(new.payment_amount,0),
nvl(new.refund_count30,0),
nvl(new.refund_num30,0),
nvl(new.refund_amount30,0),
nvl(old.refund_count,0) + nvl(new.refund_count,0),
nvl(old.refund_num,0) + nvl(new.refund_num,0),
nvl(old.refund_amount,0) + nvl(new.refund_amount,0),
nvl(new.cart_count30,0),
nvl(old.cart_count,0) + nvl(new.cart_count,0),
nvl(new.favor_count30,0),
nvl(old.favor_count,0) + nvl(new.favor_count,0),
nvl(new.appraise_good_count30,0),
nvl(new.appraise_mid_count30,0),
nvl(new.appraise_bad_count30,0),
nvl(new.appraise_default_count30,0) ,
nvl(old.appraise_good_count,0) + nvl(new.appraise_good_count,0),
nvl(old.appraise_mid_count,0) + nvl(new.appraise_mid_count,0),
nvl(old.appraise_bad_count,0) + nvl(new.appraise_bad_count,0),
nvl(old.appraise_default_count,0) + nvl(new.appraise_default_count,0)
from
(
select
sku_id,
spu_id,
order_last_30d_count,
order_last_30d_num,
order_last_30d_amount,
order_count,
order_num,
order_amount ,
payment_last_30d_count,
payment_last_30d_num,
payment_last_30d_amount,
payment_count,
payment_num,
payment_amount,
refund_last_30d_count,
refund_last_30d_num,
refund_last_30d_amount,
refund_count,
refund_num,
refund_amount,
cart_last_30d_count,
cart_count,
favor_last_30d_count,
favor_count,
appraise_last_30d_good_count,
appraise_last_30d_mid_count,
appraise_last_30d_bad_count,
appraise_last_30d_default_count,
appraise_good_count,
appraise_mid_count,
appraise_bad_count,
appraise_default_count
from ${APP}.dwt_sku_topic
)old
full outer join
(
-- one scan yields both today's metrics (if(dt=...)) and the 30-day sums
select
sku_id,
sum(if(dt='$do_date', order_count,0 )) order_count,
sum(if(dt='$do_date',order_num ,0 )) order_num,
sum(if(dt='$do_date',order_amount,0 )) order_amount ,
sum(if(dt='$do_date',payment_count,0 )) payment_count,
sum(if(dt='$do_date',payment_num,0 )) payment_num,
sum(if(dt='$do_date',payment_amount,0 )) payment_amount,
sum(if(dt='$do_date',refund_count,0 )) refund_count,
sum(if(dt='$do_date',refund_num,0 )) refund_num,
sum(if(dt='$do_date',refund_amount,0 )) refund_amount,
sum(if(dt='$do_date',cart_count,0 )) cart_count,
sum(if(dt='$do_date',favor_count,0 )) favor_count,
sum(if(dt='$do_date',appraise_good_count,0 )) appraise_good_count,
sum(if(dt='$do_date',appraise_mid_count,0 ) ) appraise_mid_count ,
sum(if(dt='$do_date',appraise_bad_count,0 )) appraise_bad_count,
sum(if(dt='$do_date',appraise_default_count,0 )) appraise_default_count,
sum(order_count) order_count30 ,
sum(order_num) order_num30,
sum(order_amount) order_amount30,
sum(payment_count) payment_count30,
sum(payment_num) payment_num30,
sum(payment_amount) payment_amount30,
sum(refund_count) refund_count30,
sum(refund_num) refund_num30,
sum(refund_amount) refund_amount30,
sum(cart_count) cart_count30,
sum(favor_count) favor_count30,
sum(appraise_good_count) appraise_good_count30,
sum(appraise_mid_count) appraise_mid_count30,
sum(appraise_bad_count) appraise_bad_count30,
sum(appraise_default_count) appraise_default_count30
from ${APP}.dws_sku_action_daycount
where dt >= date_add ('$do_date', -30)
group by sku_id
)new
on new.sku_id = old.sku_id
left join
(select * from ${APP}.dwd_dim_sku_info where dt='$do_date') sku_info
on nvl(new.sku_id,old.sku_id)= sku_info.id;
-- Activity topic load: day columns from today's daycount row, cumulative
-- columns add old + new.
insert overwrite table ${APP}.dwt_activity_topic
select
nvl(new.id,old.id),
nvl(new.activity_name,old.activity_name),
nvl(new.activity_type,old.activity_type),
nvl(new.start_time,old.start_time),
nvl(new.end_time,old.end_time),
nvl(new.create_time,old.create_time),
nvl(new.display_count,0),
nvl(new.order_count,0),
nvl(new.order_amount,0.0),
nvl(new.payment_count,0),
nvl(new.payment_amount,0.0),
nvl(new.display_count,0)+nvl(old.display_count,0),
nvl(new.order_count,0)+nvl(old.order_count,0),
nvl(new.order_amount,0.0)+nvl(old.order_amount,0.0),
nvl(new.payment_count,0)+nvl(old.payment_count,0),
nvl(new.payment_amount,0.0)+nvl(old.payment_amount,0.0)
from
(
select
*
from ${APP}.dwt_activity_topic
)old
full outer join
(
select
*
from ${APP}.dws_activity_info_daycount
where dt='$do_date'
)new
on old.id=new.id;
-- Area topic load: day columns + rolling-30-day columns per province,
-- full outer join against the previous snapshot.
insert overwrite table ${APP}.dwt_area_topic
select
nvl(old.id,new.id),
nvl(old.province_name,new.province_name),
nvl(old.area_code,new.area_code),
nvl(old.iso_code,new.iso_code),
nvl(old.region_id,new.region_id),
nvl(old.region_name,new.region_name),
nvl(new.login_day_count,0),
nvl(new.login_last_30d_count,0),
nvl(new.order_day_count,0),
nvl(new.order_day_amount,0.0),
nvl(new.order_last_30d_count,0),
nvl(new.order_last_30d_amount,0.0),
nvl(new.payment_day_count,0),
nvl(new.payment_day_amount,0.0),
nvl(new.payment_last_30d_count,0),
nvl(new.payment_last_30d_amount,0.0)
from
(
select
*
from ${APP}.dwt_area_topic
)old
full outer join
(
select
id,
province_name,
area_code,
iso_code,
region_id,
region_name,
sum(if(dt='$do_date',login_count,0)) login_day_count,
sum(if(dt='$do_date',order_count,0)) order_day_count,
sum(if(dt='$do_date',order_amount,0.0)) order_day_amount,
sum(if(dt='$do_date',payment_count,0)) payment_day_count,
sum(if(dt='$do_date',payment_amount,0.0)) payment_day_amount,
sum(login_count) login_last_30d_count,
sum(order_count) order_last_30d_count,
sum(order_amount) order_last_30d_amount,
sum(payment_count) payment_last_30d_count,
sum(payment_amount) payment_last_30d_amount
from ${APP}.dws_area_stats_daycount
-- NOTE(review): -30 scans 31 partitions inclusive -- confirm window length.
where dt>=date_add('$do_date',-30)
group by id,province_name,area_code,iso_code,region_id,region_name
)new
on old.id=new.id;
"
$hive -e "$sql"
数仓搭建-ADS层
ads层数据 没有分区,也没有列式存储是直接往mysql中导的
所以插入数据时不需要overwrite
新增->沉默->留存
设备主题
活跃设备数(日、周、月)
需求定义:
日活:当日活跃的设备数
周活:当周活跃的设备数
月活:当月活跃的设备数
Azkaban 处理多个工作流之间依赖的能力不足,所以在建表时加上周末/月末标记字段
这样的好处:可以看到周活月活的动态变化
CH中的wuz是可以的
建表
-- ADS active-device counts: one row per run date, with week-end/month-end
-- flags so the final weekly/monthly figures can be selected downstream.
drop table if exists ads_uv_count;
create external table ads_uv_count(
`dt` string COMMENT '统计日期',
`day_count` bigint COMMENT '当日用户数量',
`wk_count` bigint COMMENT '当周用户数量',
`mn_count` bigint COMMENT '当月用户数量',
`is_weekend` string COMMENT 'Y,N是否是周末,用于得到本周最终结果',
`is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果'
) COMMENT '活跃设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count/';
导入数据
日活:可以使用当日活跃次数>0,末次活跃时间是今天
周活:末次活跃时间在本周内(大于等于周一,小于等于周日)
月活:末次活跃时间在本月内(大于等于1日小于等于这个月的最后一天
保证最后活跃时间的年月和今天的年月保持一致
-- Example query (no insert): daily/weekly/monthly active device counts
-- derived from last-active dates in the device topic table.
select
'2020-06-27',
-- daily active: last active today
sum(if(login_date_last='2020-06-27',1,0)),
-- weekly active: last-active date within this Monday..Sunday week
sum(if(login_date_last>=date_add(next_day('2020-06-27','monday'),-7) and login_date_last<=date_add(next_day('2020-06-27','monday'),-1),1,0)),
-- monthly active: last-active date in the same calendar month
sum(if(date_format(login_date_last,'yyyy-MM')=date_format('2020-06-27','yyyy-MM'),1,0)),
-- week-end flag: is today the Sunday of this week
if('2020-06-27'=date_add(next_day('2020-06-27','monday'),-1),'Y','N'),
-- month-end flag: is today the last day of the month
if(last_day('2020-06-27')='2020-06-27','Y','N')
from dwt_uv_topic;
每日新增设备
建表
-- ADS daily new-device counts, one row per run date.
drop table if exists ads_new_mid_count;
create external table ads_new_mid_count
(
`create_date` string comment '创建时间' ,
`new_mid_count` BIGINT comment '新增设备数量'
) COMMENT '每日新增设备信息数量'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_mid_count/';
数据载入
-- NOTE(review): this counts ALL devices active on 2020-06-27, not only new
-- ones; new devices are those whose first-active date is today -- confirm
-- against dwt_uv_topic (login_date_first='2020-06-27').
select '2020-06-27',
count(*)
from dws_uv_detail_daycount
where dt = '2020-06-27';
留存率
2020-06-24 的 2 日 留存
stat_date | create_date | retention_day | retention_count | new_mid_count | retention_ratio |
---|---|---|---|---|---|
2020-06-26 | 2020-06-24 | 2 | 3000 | 10000 | 0.30 |
建表
-- ADS n-day retention: one row per (stat_date, create_date, retention_day).
drop table if exists ads_user_retention_day_rate;
create external table ads_user_retention_day_rate
(
`stat_date` string comment '统计日期',
`create_date` string comment '设备新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数量',
`new_mid_count` bigint comment '设备新增数量',
`retention_ratio` decimal(16,2) comment '留存率'
) COMMENT '每日用户留存情况'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
-- 2020-06-27的 1日留存率
--2020-06-27的新增用户 2020-06-24的1日留存用户(2020-06-28的活跃)
--2020-06-28
-- 2020-06-27的 2日留存率
--2020-06-27的新增用户 2020-06-29的活跃
--2020-06-29
-- 2020-06-27的 3日留存率
--2020-06-27的新增用户 2020-06-30的活跃
--2020-06-30
--2020-06-30日的计算任务
--2020-06-27的3日留存
新增用户
留存用户
--2020-06-28的2日留存
--2020-06-29的1日留存
载入数据
--2020-06-30日的计算任务
-- Retention load for stat date 2020-06-30: 3-day retention of the 06-27
-- cohort, 2-day of 06-28, 1-day of 06-29.
-- Fix: every sum was written sum(`if`(a) and b, 1, 0) -- the closing paren
-- of if() was misplaced so the AND condition fell outside the function,
-- which is a syntax error. Rewritten as sum(if(a and b,1,0)).
insert into table ads_user_retention_day_rate
--3-day retention of the 2020-06-27 cohort
select '2020-06-30',
'2020-06-27',
3,
sum(if(login_date_first = '2020-06-27' and login_date_last = '2020-06-30', 1, 0)),
sum(if(login_date_first = '2020-06-27', 1, 0)),
sum(if(login_date_first = '2020-06-27' and login_date_last = '2020-06-30', 1, 0))/
sum(if(login_date_first = '2020-06-27', 1, 0))
from dwt_uv_topic
union all
--2-day retention of the 2020-06-28 cohort
select '2020-06-30',
'2020-06-28',
2,
sum(if(login_date_first = '2020-06-28' and login_date_last = '2020-06-30', 1, 0)),
sum(if(login_date_first = '2020-06-28', 1, 0)),
sum(if(login_date_first = '2020-06-28' and login_date_last = '2020-06-30', 1, 0))/
sum(if(login_date_first = '2020-06-28', 1, 0))
from dwt_uv_topic
--1-day retention of the 2020-06-29 cohort
union all
select '2020-06-30',
'2020-06-29', --date_add('2020-06-30',-1)
1,
sum(if(login_date_first = '2020-06-29' and login_date_last = '2020-06-30', 1, 0)),
sum(if(login_date_first = '2020-06-29', 1, 0)),
sum(if(login_date_first = '2020-06-29' and login_date_last = '2020-06-30', 1, 0))/
sum(if(login_date_first = '2020-06-29', 1, 0))
from dwt_uv_topic;
优化
做下过滤提高效率
-- Optimized retention load: each branch pre-filters on the cohort's
-- first-active date so the scan touches only that cohort's rows.
-- Fix: same misplaced if() parenthesis as the unoptimized version --
-- sum(`if`(a) and b,1,0) is a syntax error; rewritten as sum(if(a and b,1,0)).
insert into table ads_user_retention_day_rate
--3-day retention of the 2020-06-27 cohort
select '2020-06-30',
'2020-06-27',
3,
sum(if(login_date_first = '2020-06-27' and login_date_last = '2020-06-30', 1, 0)),
sum(if(login_date_first = '2020-06-27', 1, 0)),
sum(if(login_date_first = '2020-06-27' and login_date_last = '2020-06-30', 1, 0))/
sum(if(login_date_first = '2020-06-27', 1, 0))
from dwt_uv_topic
where login_date_first='2020-06-27'
union all
--2-day retention of the 2020-06-28 cohort
select '2020-06-30',
'2020-06-28',
2,
sum(if(login_date_first = '2020-06-28' and login_date_last = '2020-06-30', 1, 0)),
sum(if(login_date_first = '2020-06-28', 1, 0)),
sum(if(login_date_first = '2020-06-28' and login_date_last = '2020-06-30', 1, 0))/
sum(if(login_date_first = '2020-06-28', 1, 0))
from dwt_uv_topic
where login_date_first='2020-06-28'
--1-day retention of the 2020-06-29 cohort
union all
select '2020-06-30',
'2020-06-29',
1,
sum(if(login_date_first = '2020-06-29' and login_date_last = '2020-06-30', 1, 0)),
sum(if(login_date_first = '2020-06-29', 1, 0)),
sum(if(login_date_first = '2020-06-29' and login_date_last = '2020-06-30', 1, 0))/
sum(if(login_date_first = '2020-06-29', 1, 0))
from dwt_uv_topic
where login_date_first='2020-06-29';
放入脚本
沉默用户数
需求定义:
沉默用户:只在安装当天启动过,且启动时间是在7天前(当天启动,7天后没有启动)
-- ADS: daily count of silent devices (launched only on install day, 7+ days ago).
DROP TABLE IF EXISTS ads_silent_count;
CREATE EXTERNAL TABLE ads_silent_count
(
    `dt`           STRING COMMENT '统计日期',
    `silent_count` BIGINT COMMENT '沉默设备数'
)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_silent_count';
数据载入
2个条件 安装当天启动过 启动时间时7天前
-- Silent devices: exactly one login ever (so first == last login day) and that
-- single login happened at least 7 days before the stat date.
-- Fix: '<' excluded the boundary day; the loader script dwt_to_ads.sh uses '<='.
select '2020-06-27' `dt`,
       count(*)
from dwt_uv_topic
where login_count = 1
  and login_date_first <= date_add('2020-06-27', -7);
本周回流用户数
需求定义:
本周回流用户:上周未活跃,本周活跃的设备,且不是本周新增设备
回流=本周活跃-本周新增-上周活跃,即最后活跃在本周-首次活跃在本周
上周活跃:不能用最后活跃时间在上周 这样会丢失一部分数据:上周活跃本周也活跃
从dws
建表
-- ADS: weekly count of returning devices (active this week, inactive last week, not new).
DROP TABLE IF EXISTS ads_back_count;
CREATE EXTERNAL TABLE ads_back_count
(
    `dt`            STRING COMMENT '统计日期',
    `wk_dt`         STRING COMMENT '统计日期所在周',
    `wastage_count` BIGINT COMMENT '回流设备数'
)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_back_count';
装载数据
-- Returning devices = active this week AND not first seen this week AND not active last week.
-- Fix: the week-end bound was built from '2020-06-25' (typo); both bounds must use the stat date.
insert into table ads_back_count
select
    '2020-06-27',
    -- Monday..Sunday range of the week containing the stat date
    concat(date_add(next_day('2020-06-27','MO'),-7),'_', date_add(next_day('2020-06-27','MO'),-1)),
    count(*)
from
(
    -- active this week, but first seen before this week (excludes new devices)
    select
        mid_id
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('2020-06-27','MO'),-7)
      and login_date_last<= date_add(next_day('2020-06-27','MO'),-1)
      and login_date_first<date_add(next_day('2020-06-27','MO'),-7)
)current_wk
left join
(
    -- active at some point last week (cannot use login_date_last alone: a device
    -- active in both weeks would be missed)
    select
        mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('2020-06-27','MO'),-7*2)
      and dt<= date_add(next_day('2020-06-27','MO'),-7-1)
    group by mid_id
)last_wk
on current_wk.mid_id=last_wk.mid_id
where last_wk.mid_id is null;  -- anti-join: keep only devices NOT active last week
流失用户数
需求定义:
流失用户:最近7天未活跃的设备
建表
-- ADS: daily count of churned devices (no activity in the last 7 days).
DROP TABLE IF EXISTS ads_wastage_count;
CREATE EXTERNAL TABLE ads_wastage_count
(
    `dt`            STRING COMMENT '统计日期',
    `wastage_count` BIGINT COMMENT '流失设备数'
)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_wastage_count';
载入
-- Churned devices: last login is 7 or more days before the stat date.
INSERT INTO TABLE ads_wastage_count
SELECT
    '2020-06-27',
    COUNT(*)
FROM dwt_uv_topic
WHERE login_date_last <= date_add('2020-06-27', -7);
最近连续三周活跃用户数
建表
-- ADS: devices active in each of the last three whole weeks.
DROP TABLE IF EXISTS ads_continuity_wk_count;
CREATE EXTERNAL TABLE ads_continuity_wk_count
(
    `dt`               STRING COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日期',
    `wk_dt`            STRING COMMENT '持续时间',
    `continuity_count` BIGINT COMMENT '活跃次数'
)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_continuity_wk_count';
载入
Union all 纵向拼接
join 横向拼接
-- Devices active in every one of the last 3 whole weeks (Mon..Sun):
-- take the distinct actives of each week, UNION ALL, and keep mid_ids seen 3 times.
-- Fixes vs. the original:
--   * next_day takes the weekday as its OWN second argument; the original passed
--     'mon' to date_add instead: date_add(next_day(d), 'mon', n) is invalid.
--   * '>' / '<' silently dropped each week's Monday and Sunday; use '>=' / '<='
--     with the same bounds as dwt_to_ads.sh.
--   * wk_dt uses '_' as separator, consistent with the other ADS tables.
select '2020-06-27',
       concat_ws('_', date_add(next_day('2020-06-27', 'MO'), -7 * 3), date_add(next_day('2020-06-27', 'MO'), -1)),
       count(*)
from (
    select mid_id
    from (
        -- week 3 ago
        select mid_id
        from dws_uv_detail_daycount
        where dt >= date_add(next_day('2020-06-27', 'MO'), -7 * 3)
          and dt <= date_add(next_day('2020-06-27', 'MO'), -7 * 2 - 1)
        group by mid_id
        union all
        -- week 2 ago
        select mid_id
        from dws_uv_detail_daycount
        where dt >= date_add(next_day('2020-06-27', 'MO'), -7 * 2)
          and dt <= date_add(next_day('2020-06-27', 'MO'), -7 - 1)
        group by mid_id
        union all
        -- last week
        select mid_id
        from dws_uv_detail_daycount
        where dt >= date_add(next_day('2020-06-27', 'MO'), -7)
          and dt <= date_add(next_day('2020-06-27', 'MO'), -1)
        group by mid_id
    ) t1
    group by mid_id
    having count(*) = 3  -- present in all three weekly sets
) t2;
最近七天内连续三天活跃用户数
其实求出最近连续3天活跃的设备id
数据来源:dws_uv_detail_daycount
日期 | 设备id | 排名 | 日期-排名 |
---|---|---|---|
07-01 | 1 | 1 | 06-30 |
07-02 | 1 | 2 | 06-30 |
07-03 | 1 | 3 | 06-30 |
07-05 | 1 | 4 | 07-01 |
07-06 | 1 | 5 | 07-01 |
建表
-- ADS: devices active on 3+ consecutive days within the last 7 days.
DROP TABLE IF EXISTS ads_continuity_uv_count;
CREATE EXTERNAL TABLE ads_continuity_uv_count
(
    `dt`               STRING COMMENT '统计日期',
    `wk_dt`            STRING COMMENT '最近7天日期',
    `continuity_count` BIGINT
) COMMENT '连续活跃设备数'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_continuity_uv_count';
数据装载
-- Devices active on >=3 consecutive days in the 7-day window ending 2020-06-16.
-- Trick: rank days per device; for consecutive days, dt - rank is constant, so
-- grouping by (mid_id, dt - rank) finds runs, and HAVING count(*)>=3 keeps runs
-- of length 3 or more. The outer group by mid_id dedups devices with 2+ runs.
insert into table ads_continuity_uv_count
select
'2020-06-16',
concat(date_add('2020-06-16',-6),'_','2020-06-16'),
count(*)
from
(
-- t4: one row per qualifying device
select mid_id
from
(
-- t3: one row per (device, run) with run length >= 3
select mid_id
from
(
-- t2: constant date_dif identifies each consecutive run
select
mid_id,
date_sub(dt,rank) date_dif
from
(
-- t1: rank active days per device within the window
select
mid_id,
dt,
rank() over(partition by mid_id order by dt) rank
from dws_uv_detail_daycount
where dt>=date_add('2020-06-16',-6) and dt<='2020-06-16'
)t1
)t2
group by mid_id,date_dif
having count(*)>=3
)t3
group by mid_id
)t4;
思路2:
连续3天特点:日期相减 datediff=2
Dt | mid_id | lead(dt,2,'1970-01-01') lead_2 | datediff(lead_2,dt) |
---|---|---|---|
07-01 | 1 | 07-03 | 2 |
07-02 | 1 | 07-05 | 3 |
07-03 | 1 | 07-06 | 3 |
07-05 | 1 | 1970-01-01 | - |
07-06 | 1 | 1970-01-01 | - |
-- Approach 2: a device is 3-day-consecutive iff some day dt has an active day
-- exactly two rows later (lead by 2) with datediff == 2.
-- Fixes vs. the original:
--   * the window filter used -7 (an 8-day window) while the label says the last
--     7 days; -6 matches the rank-based version above.
--   * Hive requires an alias on every derived table; the outermost one was missing.
select '2020-06-27',
       concat(date_add('2020-06-27', -6), '_', '2020-06-27'),
       count(*)
from (
    select mid_id
    from (
        select mid_id,
               datediff(lead_2, dt) diff
        from (
            -- second-next active day per device; sentinel '1970-01-01' when absent
            select dt,
                   mid_id,
                   lead(dt, 2, '1970-01-01') over (partition by mid_id order by dt) lead_2
            from dws_uv_detail_daycount
            where dt >= date_add('2020-06-27', -6)
        ) t1
    ) t2
    where diff = 2          -- dt, dt+1, dt+2 all active
    group by mid_id         -- dedup devices with several such runs
) t3;
会员主题
会员主题信息
-- ADS: daily member-topic summary (activity, growth, payment ratios).
DROP TABLE IF EXISTS ads_user_topic;
CREATE EXTERNAL TABLE ads_user_topic
(
    `dt`                    STRING COMMENT '统计日期',
    `day_users`             STRING COMMENT '活跃会员数',
    `day_new_users`         STRING COMMENT '新增会员数',
    `day_new_payment_users` STRING COMMENT '新增消费会员数',
    `payment_users`         STRING COMMENT '总付费会员数',
    `users`                 STRING COMMENT '总会员数',
    `day_users2users`       DECIMAL(16,2) COMMENT '会员活跃率',
    `payment_users2users`   DECIMAL(16,2) COMMENT '会员付费率',
    `day_new_users2users`   DECIMAL(16,2) COMMENT '会员新鲜度'
) COMMENT '会员主题信息表'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_user_topic';
数据装载
-- Member-topic daily metrics from the user DWT table.
-- Fix: 'payment_amount != null' always evaluates to NULL in SQL three-valued
-- logic, so the payment ratio was always NULL; count paying users via
-- payment_count > 0 (as dwt_to_ads.sh does).
select '2020-06-27',
       sum(if(login_date_last = '2020-06-27', 1, 0)),     -- day_users
       sum(if(login_date_first = '2020-06-27', 1, 0)),    -- day_new_users
       sum(if(payment_date_first = '2020-06-27', 1, 0)),  -- day_new_payment_users
       sum(if(payment_count > 0, 1, 0)),                  -- payment_users
       count(*),                                          -- users
       sum(if(login_date_last = '2020-06-27', 1, 0)) / count(*),   -- activity ratio
       sum(if(payment_count > 0, 1, 0)) / count(*),                -- payment ratio
       sum(if(login_date_first = '2020-06-27', 1, 0)) / sum(if(login_date_last = '2020-06-27', 1, 0))  -- freshness
from dwt_user_topic;
漏斗分析
统计“浏览首页->浏览商品详情页->加入购物车->下单->支付”的转化率
思路:统计各个行为的人数,然后计算比值。
-- ADS: funnel home -> good_detail -> cart -> order -> payment, with step ratios.
DROP TABLE IF EXISTS ads_user_action_convert_day;
CREATE EXTERNAL TABLE ads_user_action_convert_day
(
    `dt`                             STRING COMMENT '统计日期',
    `home_count`                     BIGINT COMMENT '浏览首页人数',
    `good_detail_count`              BIGINT COMMENT '浏览商品详情页人数',
    `home2good_detail_convert_ratio` DECIMAL(16,2) COMMENT '首页到商品详情转化率',
    `cart_count`                     BIGINT COMMENT '加入购物车的人数',
    `good_detail2cart_convert_ratio` DECIMAL(16,2) COMMENT '商品详情页到加入购物车转化率',
    `order_count`                    BIGINT COMMENT '下单人数',
    `cart2order_convert_ratio`       DECIMAL(16,2) COMMENT '加入购物车到下单转化率',
    `payment_amount`                 BIGINT COMMENT '支付人数',
    `order2payment_convert_ratio`    DECIMAL(16,2) COMMENT '下单到支付的转化率'
) COMMENT '用户行为漏斗分析'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_user_action_convert_day/';
确定数据来源
页面访问信息:来源于dwd_page_log
购物车和支付下单:dws_user_action_daycount
先计算出来每个用户都浏览过的页面
第一种
-- Approach 1: count distinct devices that visited 'home' / 'good_detail' on the day.
-- Inner query builds one row per device with the set of funnel pages it touched;
-- outer query turns set membership into 0/1 and sums.
select
'2020-06-27' dt,
sum(if(array_contains(pages,'home'),1,0)) home_count,
sum(if(array_contains(pages,'good_detail'),1,0)) good_detail_count
from
(
-- one row per device, pages deduped by collect_set
select
mid_id,
collect_set(page_id) pages
from dwd_page_log
where dt='2020-06-27'
and page_id in ('home','good_detail')
group by mid_id
)tmp
第二种
-- Approach 2: explode the per-device page_stats array from the DWS daycount table.
-- page_stats is already one entry per (device, page), so each exploded row is a
-- distinct visitor of that page — no extra dedup needed.
select sum(`if`(page_id = 'home', 1, 0)),
sum(`if`(page_id = 'good_detail', 1, 0))
from (
select mid_id,
page.page_id
from dws_uv_detail_daycount lateral view explode(page_stats) tmp as page
where dt = '2020-06-27'
and page.page_id in ('home', 'good_detail')
--group by mid_id, page.page_id
) t1;
装载数据
-- Funnel load: page-visit counts (tmp_uv) joined with cart/order/payment user
-- counts (tmp_cop); each ratio is step_n+1 / step_n * 100.
-- Fix: the final join had no ON clause; both CTEs are single-row so the result
-- was the same, but an explicit key matches dwt_to_ads.sh and avoids a
-- cartesian-product warning. Also terminates the statement with ';'.
with
tmp_uv as
(
    -- distinct visitors of 'home' and 'good_detail'
    select
        '2020-06-14' dt,
        sum(if(array_contains(pages,'home'),1,0)) home_count,
        sum(if(array_contains(pages,'good_detail'),1,0)) good_detail_count
    from
    (
        select
            mid_id,
            collect_set(page_id) pages
        from dwd_page_log
        where dt='2020-06-14'
        and page_id in ('home','good_detail')
        group by mid_id
    )tmp
),
tmp_cop as
(
    -- users who carted / ordered / paid that day
    select
        '2020-06-14' dt,
        sum(if(cart_count>0,1,0)) cart_count,
        sum(if(order_count>0,1,0)) order_count,
        sum(if(payment_count>0,1,0)) payment_count
    from dws_user_action_daycount
    where dt='2020-06-14'
)
insert into table ads_user_action_convert_day
select
    tmp_uv.dt,
    tmp_uv.home_count,
    tmp_uv.good_detail_count,
    tmp_uv.good_detail_count/tmp_uv.home_count*100,
    tmp_cop.cart_count,
    tmp_cop.cart_count/tmp_uv.good_detail_count*100,
    tmp_cop.order_count,
    tmp_cop.order_count/tmp_cop.cart_count*100,
    tmp_cop.payment_count,
    tmp_cop.payment_count/tmp_cop.order_count*100
from tmp_uv
join tmp_cop
on tmp_uv.dt=tmp_cop.dt;
商品主题
商品个数信息
建表
-- ADS: daily SKU / SPU totals.
DROP TABLE IF EXISTS ads_product_info;
CREATE EXTERNAL TABLE ads_product_info
(
    `dt`      STRING COMMENT '统计日期',
    `sku_num` STRING COMMENT 'sku个数',
    `spu_num` STRING COMMENT 'spu个数'
) COMMENT '商品个数信息'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_product_info';
装载数据
-- SKU count is the row count of the SKU topic table; SPU count dedups spu_id
-- first. The two single-row aggregates are joined on the constant dt.
insert into table ads_product_info
select
'2020-06-14' dt,
sku_num,
spu_num
from
(
-- total SKUs
select
'2020-06-14' dt,
count(*) sku_num
from
dwt_sku_topic
) tmp_sku_num
join
(
-- total distinct SPUs
select
'2020-06-14' dt,
count(*) spu_num
from
(
select
spu_id
from
dwt_sku_topic
group by
spu_id
) tmp_spu_id
) tmp_spu_num
on
tmp_sku_num.dt=tmp_spu_num.dt;
商品销量排名
歧义:
1.商品销量是统计的当天,还是累计
2.销量的统计 是以销售额还是销售的数量
当天来源于dws
累计的来源于dwt
金额排行
建表
-- ADS: top-10 SKUs by payment amount per day.
drop table if exists ads_product_sale_topN;
create external table ads_product_sale_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品ID',
    `payment_amount` bigint COMMENT '销量'
) COMMENT '商品销量TopN'  -- fixed: was '商品个数信息', copy-pasted from ads_product_info
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_sale_topN';
装载数据
-- Top-10 SKUs by single-day payment amount.
-- Fix: the comma after the date literal was missing, so '2020-06-27' was
-- aliased as sku_id and the real sku_id column was dropped from the output.
select '2020-06-27' dt,
       sku_id,
       payment_amount
from dws_sku_action_daycount
where dt = '2020-06-27'
order by payment_amount desc
limit 10;
商品收藏排名
建表
-- ADS: top-10 SKUs by daily favorite count.
DROP TABLE IF EXISTS ads_product_favor_topN;
CREATE EXTERNAL TABLE ads_product_favor_topN
(
    `dt`          STRING COMMENT '统计日期',
    `sku_id`      STRING COMMENT '商品ID',
    `favor_count` BIGINT COMMENT '收藏量'
) COMMENT '商品收藏TopN'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_product_favor_topN';
装载数据
-- Top-10 SKUs by favorite count for the day, straight from the SKU daycount table.
insert into table ads_product_favor_topN
select
'2020-06-14' dt,
sku_id,
favor_count
from
dws_sku_action_daycount
where
dt='2020-06-14'
order by favor_count desc
limit 10;
商品加入购物车排名
建表
-- ADS: top-10 SKUs by daily add-to-cart count.
DROP TABLE IF EXISTS ads_product_cart_topN;
CREATE EXTERNAL TABLE ads_product_cart_topN
(
    `dt`         STRING COMMENT '统计日期',
    `sku_id`     STRING COMMENT '商品ID',
    `cart_count` BIGINT COMMENT '加入购物车次数'
) COMMENT '商品加入购物车TopN'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_product_cart_topN';
装载数据
-- Top-10 SKUs by add-to-cart count for the day.
insert into table ads_product_cart_topN
select
'2020-06-14' dt,
sku_id,
cart_count
from
dws_sku_action_daycount
where
dt='2020-06-14'
order by cart_count desc
limit 10;
商品退款率排名(最近30天)
建表
-- ADS: top-10 SKUs by 30-day refund ratio.
DROP TABLE IF EXISTS ads_product_refund_topN;
CREATE EXTERNAL TABLE ads_product_refund_topN
(
    `dt`           STRING COMMENT '统计日期',
    `sku_id`       STRING COMMENT '商品ID',
    `refund_ratio` DECIMAL(16,2) COMMENT '退款率'
) COMMENT '商品退款率TopN'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_product_refund_topN';
数据装载
-- 30-day refund ratio per SKU (percentage); Hive division by 0 yields NULL,
-- so never-paid SKUs sort last under 'desc' rather than erroring.
insert into table ads_product_refund_topN
select
'2020-06-14',
sku_id,
refund_last_30d_count/payment_last_30d_count*100 refund_ratio
from dwt_sku_topic
order by refund_ratio desc
limit 10;
商品差评率
建表
-- ADS: top-10 SKUs by daily bad-review ratio.
DROP TABLE IF EXISTS ads_appraise_bad_topN;
CREATE EXTERNAL TABLE ads_appraise_bad_topN
(
    `dt`                 STRING COMMENT '统计日期',
    `sku_id`             STRING COMMENT '商品ID',
    `appraise_bad_ratio` DECIMAL(16,2) COMMENT '差评率'
) COMMENT '商品差评率TopN'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_appraise_bad_topN';
装载数据
-- Bad-review ratio = bad / (good + mid + bad + default) per SKU for the day.
insert into table ads_appraise_bad_topN
select
'2020-06-14' dt,
sku_id,
appraise_bad_count/(appraise_good_count+appraise_mid_count+appraise_bad_count+appraise_default_count) appraise_bad_ratio
from
dws_sku_action_daycount
where
dt='2020-06-14'
order by appraise_bad_ratio desc
limit 10;
分组topn
用开窗函数
营销主题(用户+商品+购买行为)
GMV 订单金额总和
下单数目统计
建表
-- ADS: daily order totals (count, amount, distinct ordering users).
DROP TABLE IF EXISTS ads_order_daycount;
CREATE EXTERNAL TABLE ads_order_daycount
(
    dt           STRING COMMENT '统计日期',
    order_count  BIGINT COMMENT '单日下单笔数',
    order_amount BIGINT COMMENT '单日下单金额',
    order_users  BIGINT COMMENT '单日下单用户数'
) COMMENT '每日订单总计表'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_order_daycount';
装载
转时间戳
-- Daily order totals; order_users counts users with at least one order that day.
insert into table ads_order_daycount
select
'2020-06-14',
sum(order_count),
sum(order_amount),
sum(if(order_count>0,1,0))
from dws_user_action_daycount
where dt='2020-06-14';
支付信息统计
建表
-- ADS: daily payment totals plus average order-to-payment latency.
drop table if exists ads_payment_daycount;
create external table ads_payment_daycount(
    dt string comment '统计日期',
    order_count bigint comment '单日支付笔数',
    order_amount bigint comment '单日支付金额',
    payment_user_count bigint comment '单日支付人数',
    payment_sku_count bigint comment '单日支付商品数',
    payment_avg_time decimal(16,2) comment '下单到支付的平均时长,取分钟数'
) comment '每日支付总计表'  -- fixed: was '每日订单总计表', copy-pasted from ads_order_daycount
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_payment_daycount';
数据装载
-- Payment-day metrics assembled from three single-row CTEs joined on the
-- constant dt: user-level totals, paid-SKU count, and average order->payment
-- latency in minutes (orders with no payment_time are excluded).
with tmp_payment as (
-- per-user totals for the day
select '2020-06-27' dt,
sum(payment_count) payment_count,
sum(payment_amount) payment_amount,
sum(if(payment_count > 0, 1, 0)) payment_user_count
from dws_user_action_daycount
where dt = '2020-06-27'
),
tmp_sku as (
-- number of distinct SKUs paid for that day
select '2020-06-27' dt,
sum(`if`(payment_num > 0, 1, 0)) payment_sku_count
from dws_sku_action_daycount
where dt = '2020-06-27'
),
tmp_time as (
-- mean minutes between order creation and payment
select '2020-06-27' dt,
sum(unix_timestamp(payment_time) - unix_timestamp(create_time)) / count(*) / 60 payment_avg_time
from dwd_fact_order_info
where dt = '2020-06-27'
and payment_time is not null
)
select tmp_payment.dt,
payment_count,
payment_amount,
payment_user_count,
payment_sku_count,
payment_avg_time
from tmp_payment
join tmp_sku on tmp_payment.dt = tmp_sku.dt
join tmp_time on tmp_time.dt = tmp_payment.dt
单独join
-- Equivalent form with inline derived tables instead of CTEs; same three
-- single-row aggregates joined on the constant dt.
select
tmp_payment.dt,
tmp_payment.payment_count,
tmp_payment.payment_amount,
tmp_payment.payment_user_count,
tmp_skucount.payment_sku_count,
tmp_time.payment_avg_time
from
(
-- per-user totals for the day
select
'2020-06-27' dt,
sum(payment_count) payment_count,
sum(payment_amount) payment_amount,
sum(if(payment_count>0,1,0)) payment_user_count
from dws_user_action_daycount
where dt='2020-06-27'
)tmp_payment
join
(
-- number of distinct SKUs paid for that day
select
'2020-06-27' dt,
sum(if(payment_count>0,1,0)) payment_sku_count
from dws_sku_action_daycount
where dt='2020-06-27'
)tmp_skucount on tmp_payment.dt=tmp_skucount.dt
join
(
-- mean minutes between order creation and payment
select
'2020-06-27' dt,
sum(unix_timestamp(payment_time)-unix_timestamp(create_time))/count(*)/60 payment_avg_time
from dwd_fact_order_info
where dt='2020-06-27'
and payment_time is not null
)tmp_time on tmp_payment.dt=tmp_time.dt;
品牌复购率
单次复购率:单次复购人数/购买1次以上的人数
多次复购率:多次复购人数/购买1次以上的人数
时间跨度:月,季度,年。。。
数据来源:dws_user_action_daycount
dwd_dim_sku_info
建表
-- ADS: monthly repurchase-rate statistics per brand and level-1 category.
-- Fix: 'if exists' added to the drop — without it the script errors on a fresh
-- warehouse, unlike every other ADS DDL in this file.
drop table if exists ads_sale_tm_category1_stat_mn;
create external table ads_sale_tm_category1_stat_mn
(
    tm_id string comment '品牌id',
    category1_id string comment '1级品类id ',
    category1_name string comment '1级品类名称 ',
    buycount bigint comment '购买人数',
    buy_twice_last bigint comment '两次以上购买人数',
    buy_twice_last_ratio decimal(16,2) comment '单次复购率',
    buy_3times_last bigint comment '三次以上购买人数',
    buy_3times_last_ratio decimal(16,2) comment '多次复购率',
    stat_mn string comment '统计月份',
    stat_date string comment '统计日期'
) COMMENT '复购率统计'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';
装载数据
-- Monthly repurchase rate per (brand, level-1 category): explode each user's
-- per-SKU order stats for the month, attach SKU dimensions, aggregate order
-- counts per (user, brand, category), then bucket users by >=1 / >=2 / >=3
-- purchases.
with tmp_order as (
-- one row per (user, sku) order-count entry in the month
select user_id,
order_detail_stats_struct.sku_id sku_id,
order_detail_stats_struct.order_count order_count
from dws_user_action_daycount lateral view explode(order_detail_stats) tmp as order_detail_stats_struct
where date_format(dt, 'yyyy-MM') = date_format('2020-06-27', 'yyyy-MM')
),
tmp_sku as (
-- SKU dimension snapshot for the stat date
select id,
tm_id,
category1_id,
category1_name
from dwd_dim_sku_info
where dt = '2020-06-27'
)
select tm_id,
category1_id,
category1_name,
sum(`if`(order_count >= 1, 1, 0)) buycount,
sum(`if`(order_count >= 2, 1, 0)) buy_twice_last,
sum(`if`(order_count >= 2, 1, 0)) / sum(`if`(order_count >= 1, 1, 0)) * 100 buy_twice_last_ratio,
sum(`if`(order_count >= 3, 1, 0)) buy_3times_last,
sum(`if`(order_count >= 3, 1, 0)) / sum(`if`(order_count >= 1, 1, 0)) * 100 buy_3times_last_ratio,
date_format('2020-06-27','yyyy-MM') stat_mn,
'2020-06-27'
from (
-- total monthly orders per user within each (brand, category)
select user_id,
category1_id,
category1_name,
tm_id,
sum(order_count) order_count
from tmp_order
join tmp_sku on tmp_sku.id = tmp_order.sku_id
group by user_id, category1_id, category1_name, tm_id
) t1
group by tm_id, category1_id, category1_name
-- Same repurchase-rate computation as above, with an extra pre-aggregation per
-- (user, sku) and an INSERT into the ADS table (stat month 2020-06-24 run).
with
tmp_order as
(
-- monthly order count per (user, sku)
select
user_id,
sku_id,
sum(order_count) order_count
from
(
select
user_id,
order_detail.sku_id,
order_detail.order_count
from dws_user_action_daycount lateral view explode(order_detail_stats) tmp as order_detail
where date_format(dt,'yyyy-MM')=date_format('2020-06-24','yyyy-MM')
)tmp
group by user_id,sku_id
),
tmp_sku as
(
-- SKU dimension snapshot for the stat date
select
id,
tm_id,
category1_id,
category1_name
from dwd_dim_sku_info
where dt='2020-06-24'
)
insert into table ads_sale_tm_category1_stat_mn
select
tm_id,
category1_id,
category1_name,
sum(if(order_count>=1,1,0)) buycount,
sum(if(order_count>=2,1,0)) buyTwiceLast,
sum(if(order_count>=2,1,0))/sum( if(order_count>=1,1,0)) buyTwiceLastRatio,
sum(if(order_count>=3,1,0)) buy3timeLast ,
sum(if(order_count>=3,1,0))/sum( if(order_count>=1,1,0)) buy3timeLastRatio ,
date_format('2020-06-24' ,'yyyy-MM') stat_mn,
'2020-06-24' stat_date
from
(
-- total monthly orders per user within each (brand, category)
select
category1_id,
category1_name,
user_id,
tm_id,
sum(order_count) order_count
from tmp_order
join tmp_sku
on tmp_order.sku_id=tmp_sku.id
group by category1_id,category1_name,user_id,tm_id
)t1
group by category1_id,category1_name,tm_id
地区主题
iso国际化通用编码
建表
-- ADS: per-province daily activity, order, and payment metrics.
DROP TABLE IF EXISTS ads_area_topic;
CREATE EXTERNAL TABLE ads_area_topic
(
    `dt`                 STRING COMMENT '统计日期',
    `id`                 BIGINT COMMENT '编号',
    `province_name`      STRING COMMENT '省份名称',
    `area_code`          STRING COMMENT '地区编码',
    `iso_code`           STRING COMMENT 'iso编码',
    `region_id`          STRING COMMENT '地区ID',
    `region_name`        STRING COMMENT '地区名称',
    `login_day_count`    BIGINT COMMENT '当天活跃设备数',
    `order_day_count`    BIGINT COMMENT '当天下单次数',
    `order_day_amount`   DECIMAL(16,2) COMMENT '当天下单金额',
    `payment_day_count`  BIGINT COMMENT '当天支付次数',
    `payment_day_amount` DECIMAL(16,2) COMMENT '当天支付金额'
) COMMENT '地区主题宽表'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_area_topic/';
载入数据
-- Straight projection of the area DWT wide table tagged with the stat date.
insert into table ads_area_topic
select
'2020-06-14',
id,
province_name,
area_code,
iso_code,
region_id,
region_name,
login_day_count,
order_day_count,
order_day_amount,
payment_day_count,
payment_day_amount
from dwt_area_topic;
ADS层导入脚本
Insert into会导致大量的小文件
解决办法 把历史数据先拿出来 2个子查询再union
vim dwt_to_ads.sh
#!/bin/bash
# dwt_to_ads.sh: load every ADS-layer table for one business date, on the hive queue.
hive=/opt/module/hive/bin/hive
APP=gmall
# Use the date passed as $1; when no argument is given, default to yesterday.
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set mapreduce.job.queuename=hive;
insert into table ${APP}.ads_uv_count
select
'$do_date' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
if(date_add(next_day('$do_date','MO'),-1)='$do_date','Y','N') ,
if(last_day('$do_date')='$do_date','Y','N')
from
(
select
'$do_date' dt,
count(*) ct
from ${APP}.dwt_uv_topic
where login_date_last='$do_date'
)daycount join
(
select
'$do_date' dt,
count (*) ct
from ${APP}.dwt_uv_topic
where login_date_last>=date_add(next_day('$do_date','MO'),-7)
and login_date_last<= date_add(next_day('$do_date','MO'),-1)
) wkcount on daycount.dt=wkcount.dt
join
(
select
'$do_date' dt,
count (*) ct
from ${APP}.dwt_uv_topic
where date_format(login_date_last,'yyyy-MM')=date_format('$do_date','yyyy-MM')
)mncount on daycount.dt=mncount.dt;
insert into table ${APP}.ads_new_mid_count
select
login_date_first,
count(*)
from ${APP}.dwt_uv_topic
where login_date_first='$do_date'
group by login_date_first;
insert into table ${APP}.ads_silent_count
select
'$do_date',
count(*)
from ${APP}.dwt_uv_topic
where login_date_first=login_date_last
and login_date_last<=date_add('$do_date',-7);
insert into table ${APP}.ads_back_count
select
'$do_date',
concat(date_add(next_day('$do_date','MO'),-7),'_', date_add(next_day('$do_date','MO'),-1)),
count(*)
from
(
select
mid_id
from ${APP}.dwt_uv_topic
where login_date_last>=date_add(next_day('$do_date','MO'),-7)
and login_date_last<= date_add(next_day('$do_date','MO'),-1)
and login_date_first<date_add(next_day('$do_date','MO'),-7)
)current_wk
left join
(
select
mid_id
from ${APP}.dws_uv_detail_daycount
where dt>=date_add(next_day('$do_date','MO'),-7*2)
and dt<= date_add(next_day('$do_date','MO'),-7-1)
group by mid_id
)last_wk
on current_wk.mid_id=last_wk.mid_id
where last_wk.mid_id is null;
insert into table ${APP}.ads_wastage_count
select
'$do_date',
count(*)
from
(
select
mid_id
from ${APP}.dwt_uv_topic
where login_date_last<=date_add('$do_date',-7)
group by mid_id
)t1;
insert into table ${APP}.ads_user_retention_day_rate
select
'$do_date',--统计日期
date_add('$do_date',-1),--新增日期
1,--留存天数
sum(if(login_date_first=date_add('$do_date',-1) and login_date_last='$do_date',1,0)),--$do_date的1日留存数
sum(if(login_date_first=date_add('$do_date',-1),1,0)),--$do_date新增
sum(if(login_date_first=date_add('$do_date',-1) and login_date_last='$do_date',1,0))/sum(if(login_date_first=date_add('$do_date',-1),1,0))*100
from ${APP}.dwt_uv_topic
union all
select
'$do_date',--统计日期
date_add('$do_date',-2),--新增日期
2,--留存天数
sum(if(login_date_first=date_add('$do_date',-2) and login_date_last='$do_date',1,0)),--$do_date的2日留存数
sum(if(login_date_first=date_add('$do_date',-2),1,0)),--$do_date新增
sum(if(login_date_first=date_add('$do_date',-2) and login_date_last='$do_date',1,0))/sum(if(login_date_first=date_add('$do_date',-2),1,0))*100
from ${APP}.dwt_uv_topic
union all
select
'$do_date',--统计日期
date_add('$do_date',-3),--新增日期
3,--留存天数
sum(if(login_date_first=date_add('$do_date',-3) and login_date_last='$do_date',1,0)),--$do_date的3日留存数
sum(if(login_date_first=date_add('$do_date',-3),1,0)),--$do_date新增
sum(if(login_date_first=date_add('$do_date',-3) and login_date_last='$do_date',1,0))/sum(if(login_date_first=date_add('$do_date',-3),1,0))*100
from ${APP}.dwt_uv_topic;
insert into table ${APP}.ads_continuity_wk_count
select
'$do_date',
concat(date_add(next_day('$do_date','MO'),-7*3),'_',date_add(next_day('$do_date','MO'),-1)),
count(*)
from
(
select
mid_id
from
(
select
mid_id
from ${APP}.dws_uv_detail_daycount
where dt>=date_add(next_day('$do_date','monday'),-7)
and dt<=date_add(next_day('$do_date','monday'),-1)
group by mid_id
union all
select
mid_id
from ${APP}.dws_uv_detail_daycount
where dt>=date_add(next_day('$do_date','monday'),-7*2)
and dt<=date_add(next_day('$do_date','monday'),-7-1)
group by mid_id
union all
select
mid_id
from ${APP}.dws_uv_detail_daycount
where dt>=date_add(next_day('$do_date','monday'),-7*3)
and dt<=date_add(next_day('$do_date','monday'),-7*2-1)
group by mid_id
)t1
group by mid_id
having count(*)=3
)t2;
insert into table ${APP}.ads_continuity_uv_count
select
'$do_date',
concat(date_add('$do_date',-6),'_','$do_date'),
count(*)
from
(
select mid_id
from
(
select mid_id
from
(
select
mid_id,
date_sub(dt,rank) date_dif
from
(
select
mid_id,
dt,
rank() over(partition by mid_id order by dt) rank
from ${APP}.dws_uv_detail_daycount
where dt>=date_add('$do_date',-6) and dt<='$do_date'
)t1
)t2
group by mid_id,date_dif
having count(*)>=3
)t3
group by mid_id
)t4;
insert into table ${APP}.ads_user_topic
select
'$do_date',
sum(if(login_date_last='$do_date',1,0)),
sum(if(login_date_first='$do_date',1,0)),
sum(if(payment_date_first='$do_date',1,0)),
sum(if(payment_count>0,1,0)),
count(*),
sum(if(login_date_last='$do_date',1,0))/count(*),
sum(if(payment_count>0,1,0))/count(*),
sum(if(login_date_first='$do_date',1,0))/sum(if(login_date_last='$do_date',1,0))
from ${APP}.dwt_user_topic;
with
tmp_uv as
(
select
'$do_date' dt,
sum(if(array_contains(pages,'home'),1,0)) home_count,
sum(if(array_contains(pages,'good_detail'),1,0)) good_detail_count
from
(
select
mid_id,
collect_set(page_id) pages
from ${APP}.dwd_page_log
where dt='$do_date'
and page_id in ('home','good_detail')
group by mid_id
)tmp
),
tmp_cop as
(
select
'$do_date' dt,
sum(if(cart_count>0,1,0)) cart_count,
sum(if(order_count>0,1,0)) order_count,
sum(if(payment_count>0,1,0)) payment_count
from ${APP}.dws_user_action_daycount
where dt='$do_date'
)
insert into table ${APP}.ads_user_action_convert_day
select
tmp_uv.dt,
tmp_uv.home_count,
tmp_uv.good_detail_count,
tmp_uv.good_detail_count/tmp_uv.home_count*100,
tmp_cop.cart_count,
tmp_cop.cart_count/tmp_uv.good_detail_count*100,
tmp_cop.order_count,
tmp_cop.order_count/tmp_cop.cart_count*100,
tmp_cop.payment_count,
tmp_cop.payment_count/tmp_cop.order_count*100
from tmp_uv
join tmp_cop
on tmp_uv.dt=tmp_cop.dt;
insert into table ${APP}.ads_product_info
select
'$do_date' dt,
sku_num,
spu_num
from
(
select
'$do_date' dt,
count(*) sku_num
from
${APP}.dwt_sku_topic
) tmp_sku_num
join
(
select
'$do_date' dt,
count(*) spu_num
from
(
select
spu_id
from
${APP}.dwt_sku_topic
group by
spu_id
) tmp_spu_id
) tmp_spu_num
on
tmp_sku_num.dt=tmp_spu_num.dt;
insert into table ${APP}.ads_product_sale_topN
select
'$do_date' dt,
sku_id,
payment_amount
from
${APP}.dws_sku_action_daycount
where
dt='$do_date'
order by payment_amount desc
limit 10;
insert into table ${APP}.ads_product_favor_topN
select
'$do_date' dt,
sku_id,
favor_count
from
${APP}.dws_sku_action_daycount
where
dt='$do_date'
order by favor_count desc
limit 10;
insert into table ${APP}.ads_product_cart_topN
select
'$do_date' dt,
sku_id,
cart_count
from
${APP}.dws_sku_action_daycount
where
dt='$do_date'
order by cart_count desc
limit 10;
insert into table ${APP}.ads_product_refund_topN
select
'$do_date',
sku_id,
refund_last_30d_count/payment_last_30d_count*100 refund_ratio
from ${APP}.dwt_sku_topic
order by refund_ratio desc
limit 10;
insert into table ${APP}.ads_appraise_bad_topN
select
'$do_date' dt,
sku_id,
appraise_bad_count/(appraise_good_count+appraise_mid_count+appraise_bad_count+appraise_default_count) appraise_bad_ratio
from
${APP}.dws_sku_action_daycount
where
dt='$do_date'
order by appraise_bad_ratio desc
limit 10;
insert into table ${APP}.ads_order_daycount
select
'$do_date',
sum(order_count),
sum(order_amount),
sum(if(order_count>0,1,0))
from ${APP}.dws_user_action_daycount
where dt='$do_date';
insert into table ${APP}.ads_payment_daycount
select
tmp_payment.dt,
tmp_payment.payment_count,
tmp_payment.payment_amount,
tmp_payment.payment_user_count,
tmp_skucount.payment_sku_count,
tmp_time.payment_avg_time
from
(
select
'$do_date' dt,
sum(payment_count) payment_count,
sum(payment_amount) payment_amount,
sum(if(payment_count>0,1,0)) payment_user_count
from ${APP}.dws_user_action_daycount
where dt='$do_date'
)tmp_payment
join
(
select
'$do_date' dt,
sum(if(payment_count>0,1,0)) payment_sku_count
from ${APP}.dws_sku_action_daycount
where dt='$do_date'
)tmp_skucount on tmp_payment.dt=tmp_skucount.dt
join
(
select
'$do_date' dt,
sum(unix_timestamp(payment_time)-unix_timestamp(create_time))/count(*)/60 payment_avg_time
from ${APP}.dwd_fact_order_info
where dt='$do_date'
and payment_time is not null
)tmp_time on tmp_payment.dt=tmp_time.dt;
with
tmp_order as
(
select
user_id,
order_stats_struct.sku_id sku_id,
order_stats_struct.order_count order_count
from ${APP}.dws_user_action_daycount lateral view explode(order_detail_stats) tmp as order_stats_struct
where date_format(dt,'yyyy-MM')=date_format('$do_date','yyyy-MM')
),
tmp_sku as
(
select
id,
tm_id,
category1_id,
category1_name
from ${APP}.dwd_dim_sku_info
where dt='$do_date'
)
insert into table ${APP}.ads_sale_tm_category1_stat_mn
select
tm_id,
category1_id,
category1_name,
sum(if(order_count>=1,1,0)) buycount,
sum(if(order_count>=2,1,0)) buyTwiceLast,
sum(if(order_count>=2,1,0))/sum( if(order_count>=1,1,0)) buyTwiceLastRatio,
sum(if(order_count>=3,1,0)) buy3timeLast ,
sum(if(order_count>=3,1,0))/sum( if(order_count>=1,1,0)) buy3timeLastRatio ,
date_format('$do_date' ,'yyyy-MM') stat_mn,
'$do_date' stat_date
from
(
select
tmp_order.user_id,
tmp_sku.category1_id,
tmp_sku.category1_name,
tmp_sku.tm_id,
sum(order_count) order_count
from tmp_order
join tmp_sku
on tmp_order.sku_id=tmp_sku.id
group by tmp_order.user_id,tmp_sku.category1_id,tmp_sku.category1_name,tmp_sku.tm_id
)tmp
group by tm_id, category1_id, category1_name;
insert into table ${APP}.ads_area_topic
select
'$do_date',
id,
province_name,
area_code,
iso_code,
region_id,
region_name,
login_day_count,
order_day_count,
order_day_amount,
payment_day_count,
payment_day_amount
from ${APP}.dwt_area_topic;
"
$hive -e "$sql"