离线数仓(五)
1、数仓搭建
1.1、业务数据生成
~1.建表语句
1)通过SQLyog创建数据库gmall
2)设置数据库编码
3)导入建表语句(1建表脚本)
4)重复步骤3的导入方式,依次导入:2商品分类数据插入脚本、3函数脚本、4存储过程脚本。
~2.生成业务数据
CALL init_data('2019-02-16',1000,200,300,TRUE);
1.2、业务数据导入数仓
~1.sqoop安装
- 1)上传安装包sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz到虚拟机中
- 2)解压sqoop安装包到指定目录,如:
$ tar -zxf sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz -C /opt/module/
- 3)拷贝jdbc驱动到sqoop的lib目录下,如:
$ cp mysql-connector-java-5.1.27-bin.jar /opt/module/sqoop-1.4.6.bin__hadoop-2.0.4-alpha/lib/
~2.Sqoop定时导入脚本
1)在/home/atguigu/bin目录下创建脚本sqoop_import.sh
[atguigu@hadoop103 bin]$ vim sqoop_import.sh
在脚本中填写如下内容
#!/bin/bash
# sqoop_import.sh — pull business tables from MySQL (gmall) into HDFS staging dirs.
# Usage: sqoop_import.sh <table_name|all> <yyyy-mm-dd>
#   $1 : table to import, or "all" for every table
#   $2 : business date; data lands in /origin_data/gmall/db/<table>/<date>
db_date=$2
echo $db_date
db_name=gmall
# Generic importer.
#   $1 : table name (used only to build the target dir)
#   $2 : free-form SELECT ending in "where ..." so that " and \$CONDITIONS" can be appended
# Notes:
#   - --delete-target-dir makes re-runs idempotent for the same date.
#   - $CONDITIONS is required by sqoop's --query for split handling (single-quoted
#     so the shell does not expand it; sqoop substitutes it at runtime).
#   - --null-string/--null-non-string write NULLs as \N, which Hive reads as NULL.
#   - NOTE(review): plaintext --password; consider --password-file in real deployments.
import_data() {
/opt/module/sqoop/bin/sqoop import \
--connect jdbc:mysql://hadoop102:3306/$db_name \
--username root \
--password 000000 \
--target-dir /origin_data/$db_name/db/$1/$db_date \
--delete-target-dir \
--num-mappers 1 \
--fields-terminated-by "\t" \
--query "$2"' and $CONDITIONS;' \
--null-string \\N \
--null-non-string \\N
}
# Full extract: SKU dimension (small table, no date filter).
import_sku_info(){
import_data "sku_info" "select
id, spu_id, price, sku_name, sku_desc, weight, tm_id,
category3_id, create_time
from sku_info where 1=1"
}
# Full extract: user dimension.
import_user_info(){
import_data "user_info" "select
id, name, birthday, gender, email, user_level,
create_time
from user_info where 1=1"
}
# Full extract: category level-1 lookup.
import_base_category1(){
import_data "base_category1" "select
id, name from base_category1 where 1=1"
}
# Full extract: category level-2 lookup.
import_base_category2(){
import_data "base_category2" "select
id, name, category1_id from base_category2 where 1=1"
}
# Full extract: category level-3 lookup.
import_base_category3(){
import_data "base_category3" "select id, name, category2_id from base_category3 where 1=1"
}
# Incremental extract: order lines created on $db_date (joined to order_info
# for the order's create_time; comma join is MySQL-side SQL, not sqoop syntax).
import_order_detail(){
import_data "order_detail" "select
od.id,
order_id,
user_id,
sku_id,
sku_name,
order_price,
sku_num,
o.create_time
from order_info o, order_detail od
where o.id=od.order_id
and DATE_FORMAT(create_time,'%Y-%m-%d')='$db_date'"
}
# Incremental extract: payments made on $db_date.
import_payment_info(){
import_data "payment_info" "select
id,
out_trade_no,
order_id,
user_id,
alipay_trade_no,
total_amount,
subject,
payment_type,
payment_time
from payment_info
where DATE_FORMAT(payment_time,'%Y-%m-%d')='$db_date'"
}
# Incremental extract: orders created OR updated on $db_date.
import_order_info(){
import_data "order_info" "select
id,
total_amount,
order_status,
user_id,
payment_way,
out_trade_no,
create_time,
operate_time
from order_info
where (DATE_FORMAT(create_time,'%Y-%m-%d')='$db_date' or DATE_FORMAT(operate_time,'%Y-%m-%d')='$db_date')"
}
# Dispatch on the first argument; "all" imports every table in dependency order.
case $1 in
"base_category1")
import_base_category1
;;
"base_category2")
import_base_category2
;;
"base_category3")
import_base_category3
;;
"order_info")
import_order_info
;;
"order_detail")
import_order_detail
;;
"sku_info")
import_sku_info
;;
"user_info")
import_user_info
;;
"payment_info")
import_payment_info
;;
"all")
import_base_category1
import_base_category2
import_base_category3
import_order_info
import_order_detail
import_sku_info
import_user_info
import_payment_info
;;
esac
2)增加脚本执行权限
[atguigu@hadoop103 bin]$ chmod 777 sqoop_import.sh
3)执行脚本导入数据
[atguigu@hadoop103 bin]$ sqoop_import.sh all 2021-08-11
~3.sqoop导入数据的NULL值问题
①默认sqoop在import数据时,会将MySQL的NULL值转为字符串'null'
②hive中使用\N代表NULL类型
③如果希望在import时,将MySQL的NULL值转为自己期望的值,需要使用--null-string和--null-non-string
--null-string:当MySQL的string类型列为NULL时,导入到hive时,使用什么来代替!
--null-string a:如果MySQL中,当前列是字符串类型(varchar,char),假如这列值为NULL,
导入到hive时,使用a来代替!
--null-non-string:当MySQL的非string类型列为NULL时,导入到hive时,使用什么来代替!
--null-non-string b:如果MySQL中,当前列不是字符串类型(varchar,char),假如这列值为NULL,
导入到hive时,使用b来代替!
④如果在导出时,希望将指定的值导出为MySQL的NULL类型,需要使用
--input-null-string和--input-null-non-string
--input-null-string a:在hive导出到MySQL时,如果hive中string类型的列的值为a,导出到MySQL中,使用NULL代替!
--input-null-non-string b:在hive导出到MySQL时,如果hive中非string类型的列的值为b,导出到MySQL中,使用NULL代替!
1.3、ODS 层
~1.创建订单表
hive (gmall)>
-- ODS order table: raw order_info rows as tab-delimited text, one partition per load date.
DROP TABLE IF EXISTS ods_order_info;
CREATE EXTERNAL TABLE ods_order_info (
    `id`           string        COMMENT '订单编号',
    `total_amount` decimal(10,2) COMMENT '订单金额',
    `order_status` string        COMMENT '订单状态',
    `user_id`      string        COMMENT '用户id',
    `payment_way`  string        COMMENT '支付方式',
    `out_trade_no` string        COMMENT '支付流水号',
    `create_time`  string        COMMENT '创建时间',
    `operate_time` string        COMMENT '操作时间'
) COMMENT '订单表'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_order_info/';
~2. 创建订单详情表
hive (gmall)>
-- ODS order-line table: raw order_detail rows, partitioned by load date.
DROP TABLE IF EXISTS ods_order_detail;
CREATE EXTERNAL TABLE ods_order_detail (
    `id`          string COMMENT '订单详情编号',
    `order_id`    string COMMENT '订单号',
    `user_id`     string COMMENT '用户id',
    `sku_id`      string COMMENT '商品id',
    `sku_name`    string COMMENT '商品名称',
    `order_price` string COMMENT '商品单价',
    `sku_num`     string COMMENT '商品数量',
    `create_time` string COMMENT '创建时间'
) COMMENT '订单明细表'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_order_detail/';
~3.创建商品表
hive (gmall)>
-- ODS SKU dimension table, partitioned by load date.
DROP TABLE IF EXISTS ods_sku_info;
CREATE EXTERNAL TABLE ods_sku_info (
    `id`           string        COMMENT 'skuId',
    `spu_id`       string        COMMENT 'spuid',
    `price`        decimal(10,2) COMMENT '价格',
    `sku_name`     string        COMMENT '商品名称',
    `sku_desc`     string        COMMENT '商品描述',
    `weight`       string        COMMENT '重量',
    `tm_id`        string        COMMENT '品牌id',
    `category3_id` string        COMMENT '品类id',
    `create_time`  string        COMMENT '创建时间'
) COMMENT '商品表'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_sku_info/';
~4.创建用户表
hive (gmall)>
-- ODS user dimension table, partitioned by load date.
DROP TABLE IF EXISTS ods_user_info;
CREATE EXTERNAL TABLE ods_user_info (
    `id`          string COMMENT '用户id',
    `name`        string COMMENT '姓名',
    `birthday`    string COMMENT '生日',
    `gender`      string COMMENT '性别',
    `email`       string COMMENT '邮箱',
    `user_level`  string COMMENT '用户等级',
    `create_time` string COMMENT '创建时间'
) COMMENT '用户信息'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_user_info/';
~5.创建商品一级分类表
hive (gmall)>
-- ODS level-1 category lookup table, partitioned by load date.
DROP TABLE IF EXISTS ods_base_category1;
CREATE EXTERNAL TABLE ods_base_category1 (
    `id`   string COMMENT 'id',
    `name` string COMMENT '名称'
) COMMENT '商品一级分类'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category1/';
~6.创建商品二级分类表
hive (gmall)>
-- ODS level-2 category lookup table, partitioned by load date.
DROP TABLE IF EXISTS ods_base_category2;
CREATE EXTERNAL TABLE ods_base_category2 (
    `id`           string COMMENT ' id',
    `name`         string COMMENT '名称',
    -- fix: backticks added so the identifier style matches every other column in this file
    `category1_id` string COMMENT '一级品类id'
) COMMENT '商品二级分类'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category2/';
~7. 创建商品三级分类表
hive (gmall)>
-- ODS level-3 category lookup table, partitioned by load date.
DROP TABLE IF EXISTS ods_base_category3;
CREATE EXTERNAL TABLE ods_base_category3 (
    `id`           string COMMENT ' id',
    `name`         string COMMENT '名称',
    -- fix: backticks added so the identifier style matches every other column in this file
    `category2_id` string COMMENT '二级品类id'
) COMMENT '商品三级分类'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category3/';
~8.创建支付流水表
hive (gmall)>
-- ODS payment table: raw payment_info rows, partitioned by load date.
DROP TABLE IF EXISTS ods_payment_info;
CREATE EXTERNAL TABLE ods_payment_info (
    `id`              bigint        COMMENT '编号',
    `out_trade_no`    string        COMMENT '对外业务编号',
    `order_id`        string        COMMENT '订单编号',
    `user_id`         string        COMMENT '用户编号',
    `alipay_trade_no` string        COMMENT '支付宝交易流水编号',
    `total_amount`    decimal(16,2) COMMENT '支付金额',
    `subject`         string        COMMENT '交易内容',
    `payment_type`    string        COMMENT '支付类型',
    `payment_time`    string        COMMENT '支付时间'
) COMMENT '支付流水表'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_payment_info/';
~9.ODS层数据导入脚本
[atguigu@hadoop102 myscripts]$ vim ods_db.sh
#!/bin/bash
# ods_db.sh — load one day's sqoop-staged files from HDFS into the ODS partitions.
# Usage: ods_db.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
APP=gmall
hive=/opt/module/hive/bin/hive
# Use the given date if present, otherwise default to yesterday.
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
# NOTE(review): the "$APP" inside the sql string closes and reopens the outer
# double quote; bash concatenates the pieces, so the statement expands correctly.
# "load data inpath" MOVES the HDFS files into the table partition.
sql="
load data inpath '/origin_data/$APP/db/order_info/$do_date' OVERWRITE into table "$APP".ods_order_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_detail/$do_date' OVERWRITE into table "$APP".ods_order_detail partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/sku_info/$do_date' OVERWRITE into table "$APP".ods_sku_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/user_info/$do_date' OVERWRITE into table "$APP".ods_user_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/payment_info/$do_date' OVERWRITE into table "$APP".ods_payment_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category1/$do_date' OVERWRITE into table "$APP".ods_base_category1 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category2/$do_date' OVERWRITE into table "$APP".ods_base_category2 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category3/$do_date' OVERWRITE into table "$APP".ods_base_category3 partition(dt='$do_date');
"
$hive -e "$sql"
1.4、DWD层
对ODS层数据进行判空过滤。对商品分类表进行维度退化(降维)。
~1.创建订单表
hive (gmall)>
-- DWD order table: cleaned orders stored as snappy-compressed parquet.
DROP TABLE IF EXISTS dwd_order_info;
CREATE EXTERNAL TABLE dwd_order_info (
    `id`           string        COMMENT '',
    `total_amount` decimal(10,2) COMMENT '',
    `order_status` string        COMMENT ' 1 2 3 4 5',
    `user_id`      string        COMMENT 'id',
    `payment_way`  string        COMMENT '',
    `out_trade_no` string        COMMENT '',
    `create_time`  string        COMMENT '',
    `operate_time` string        COMMENT ''
)
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/gmall/dwd/dwd_order_info/'
TBLPROPERTIES ("parquet.compression"="snappy");
~2.创建订单详情表
hive (gmall)>
-- DWD order-line table: cleaned order_detail rows as snappy parquet.
DROP TABLE IF EXISTS dwd_order_detail;
CREATE EXTERNAL TABLE dwd_order_detail (
    `id`          string COMMENT '',
    -- fix: was decimal(10,2); order_id is an identifier and the ODS source
    -- column (ods_order_detail.order_id) is string, so decimal would mangle it
    `order_id`    string COMMENT '',
    `user_id`     string COMMENT 'id',
    `sku_id`      string COMMENT 'id',
    `sku_name`    string COMMENT '',
    `order_price` string COMMENT '',
    `sku_num`     string COMMENT '',
    `create_time` string COMMENT ''
)
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/gmall/dwd/dwd_order_detail/'
TBLPROPERTIES ("parquet.compression"="snappy");
~3.创建用户表
hive (gmall)>
-- DWD user dimension table: cleaned users as snappy parquet.
DROP TABLE IF EXISTS dwd_user_info;
CREATE EXTERNAL TABLE dwd_user_info (
    `id`          string COMMENT 'id',
    `name`        string COMMENT '',
    `birthday`    string COMMENT '',
    `gender`      string COMMENT '',
    `email`       string COMMENT '',
    `user_level`  string COMMENT '',
    `create_time` string COMMENT ''
)
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/gmall/dwd/dwd_user_info/'
TBLPROPERTIES ("parquet.compression"="snappy");
~4. 创建支付流水表
hive (gmall)>
-- DWD payment table: cleaned payments as snappy parquet.
DROP TABLE IF EXISTS dwd_payment_info;
CREATE EXTERNAL TABLE dwd_payment_info (
    `id`              bigint        COMMENT '',
    `out_trade_no`    string        COMMENT '',
    `order_id`        string        COMMENT '',
    `user_id`         string        COMMENT '',
    `alipay_trade_no` string        COMMENT '',
    `total_amount`    decimal(16,2) COMMENT '',
    `subject`         string        COMMENT '',
    -- fix: column was misspelled payment_tpe; source column is payment_type
    `payment_type`    string        COMMENT '',
    `payment_time`    string        COMMENT ''
)
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/gmall/dwd/dwd_payment_info/'
TBLPROPERTIES ("parquet.compression"="snappy");
~5.创建商品表(增加分类)
hive (gmall)>
-- DWD SKU dimension with the category hierarchy denormalized onto each row.
DROP TABLE IF EXISTS dwd_sku_info;
CREATE EXTERNAL TABLE dwd_sku_info (
    `id`             string        COMMENT 'skuId',
    `spu_id`         string        COMMENT 'spuid',
    `price`          decimal(10,2) COMMENT '',
    `sku_name`       string        COMMENT '',
    `sku_desc`       string        COMMENT '',
    `weight`         string        COMMENT '',
    `tm_id`          string        COMMENT 'id',
    -- fix: the '1id'/'3id' comments were swapped relative to the column names
    `category3_id`   string        COMMENT '3id',
    `category2_id`   string        COMMENT '2id',
    `category1_id`   string        COMMENT '1id',
    `category3_name` string        COMMENT '3',
    `category2_name` string        COMMENT '2',
    `category1_name` string        COMMENT '1',
    `create_time`    string        COMMENT ''
)
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/gmall/dwd/dwd_sku_info/'
TBLPROPERTIES ("parquet.compression"="snappy");
~6.DWD层数据导入脚本
[atguigu@hadoop103 myscripts]$ vim dwd_db.sh
#!/bin/bash
# dwd_db.sh — clean one day's ODS partitions into the DWD tables:
# drop rows with NULL ids, and denormalize the 3-level category tree onto SKUs.
# Usage: dwd_db.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# Dynamic partitioning (dt carried through from the ODS select) requires
# nonstrict mode. The sku insert uses a static partition because the joins
# already filter every input to dt='$do_date'.
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
use gmall;
insert overwrite table dwd_order_info partition(dt)
select * from ods_order_info
where dt='$do_date' and id is not null;
insert overwrite table dwd_order_detail partition(dt)
select * from ods_order_detail
where dt='$do_date' and id is not null;
insert overwrite table dwd_user_info partition(dt)
select * from ods_user_info
where dt='$do_date' and id is not null;
insert overwrite table dwd_payment_info partition(dt)
select * from ods_payment_info
where dt='$do_date' and id is not null;
insert overwrite table dwd_sku_info PARTITION(dt='$do_date')
SELECT
t1.id, t1.spu_id, t1.price,
t1.sku_name,
t1.sku_desc,
t1.weight,
t1.tm_id,
t1.category3_id,
t3.id category2_id,
t4.id category1_id,
t2.name category3_name,
t3.name category2_name,
t4.name category1_name,
t1.create_time
FROM
(select * from ods_sku_info where dt='$do_date') t1
join
(select * from ods_base_category3 where dt='$do_date') t2
on t1.category3_id=t2.id
join
(select * from ods_base_category2 where dt='$do_date') t3
on t2.category2_id=t3.id
join
(select * from ods_base_category1 where dt='$do_date' )t4
on t3.category1_id=t4.id
"
hive -e "$sql"
1.5、DWS层之用户行为宽表
~1.创建用户行为宽表
hive (gmall)>
-- DWS daily per-user activity wide table (orders, payments, comments).
DROP TABLE IF EXISTS dws_user_action;
CREATE EXTERNAL TABLE dws_user_action (
    user_id        string        COMMENT '用户 id',
    order_count    bigint        COMMENT '下单次数 ',
    order_amount   decimal(16,2) COMMENT '下单金额 ',
    payment_count  bigint        COMMENT '支付次数',
    payment_amount decimal(16,2) COMMENT '支付金额 ',
    comment_count  bigint        COMMENT '评论次数'
) COMMENT '每日用户行为宽表'
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/gmall/dws/dws_user_action/';
~2.用户行为数据宽表导入脚本
[atguigu@hadoop103 myscripts]$ vim dws_db_wide.sh
#!/bin/bash
# dws_db_wide.sh — build one day's per-user activity wide table (dws_user_action)
# by unioning order / payment / comment counts and summing per user.
# Usage: dws_db_wide.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# Each branch of the union contributes its own metric and zero-fills the
# others, so the outer group-by/sum collapses to one row per user.
sql="
use gmall;
insert overwrite TABLE dws_user_action PARTITION(dt='$do_date')
select
user_id,sum(order_count),sum(order_amount),
sum(payment_count),sum(payment_amount),
sum(comment_count)
from
(select
user_id,count(*) order_count,sum(total_amount) order_amount,
0 payment_count,0 payment_amount,0 comment_count
from dwd_order_info
where dt='$do_date'
GROUP by user_id
union all
select
user_id,0 order_count,0 order_amount,
count(*) payment_count,sum(total_amount) payment_amount,
0 comment_count
from dwd_payment_info
where dt='$do_date'
GROUP by user_id
union all
select
user_id,0 order_count,0 order_amount,
0 payment_count,0 payment_amount,
count(*) comment_count
from dwd_comment_log
where dt='$do_date'
GROUP by user_id) tmp
GROUP by user_id
"
hive -e "$sql"
2、需求一:GMV成交总额
GMV:是一定时间段内的总成交金额
2.1、ADS层
~1.建表语句
hive (gmall)>
-- ADS daily GMV summary: one row per statistics date.
DROP TABLE IF EXISTS ads_gmv_sum_day;
CREATE EXTERNAL TABLE ads_gmv_sum_day (
    `dt`          string        COMMENT '统计日期',
    `gmv_count`   bigint        COMMENT '当日gmv订单个数',
    `gmv_amount`  decimal(16,2) COMMENT '当日gmv订单总金额',
    `gmv_payment` decimal(16,2) COMMENT '当日支付金额'
) COMMENT 'GMV'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_gmv_sum_day/';
~2.数据脚本导入
[atguigu@hadoop103 myscripts]$ vim ads_db_gmv.sh
#!/bin/bash
# ads_db_gmv.sh — append one day's GMV totals (order count/amount, payment
# amount) from dws_user_action into ads_gmv_sum_day.
# Usage: ads_db_gmv.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# "insert into" appends; re-running the same date will duplicate that row.
sql="
use gmall;
insert into table ads_gmv_sum_day
select
'$do_date',
sum(order_count) gmv_count,
sum(order_amount) gmv_amount,
sum(payment_amount) gmv_payment
from dws_user_action
where dt='$do_date'
"
hive -e "$sql"
3、需求二:转化率之用户新鲜度及漏斗分析
转化率:单日实际做xxx事情的人数 / 当日日活= xxx单日转化率
用户新鲜度: 用户新鲜度也是转化率的一种
指当日新增用户 / 当日日活用户
3.1、 ADS层之新增用户占日活跃用户比率(用户新鲜度)
~1.建表语句
hive (gmall)>
-- ADS user freshness: daily new-device share of active devices.
DROP TABLE IF EXISTS ads_user_convert_day;
CREATE EXTERNAL TABLE ads_user_convert_day (
    `dt`          string        COMMENT '统计日期',
    `uv_m_count`  bigint        COMMENT '当日活跃设备',
    `new_m_count` bigint        COMMENT '当日新增设备',
    `new_m_ratio` decimal(10,2) COMMENT '当日新增占日活的比率'
) COMMENT '转化率'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_user_convert_day/';
~2.数据导入
hive (gmall)>
-- Compute user freshness for one day: new devices / active devices * 100.
insert into table ads_user_convert_day
select
'2021-08-11',
uv_m_count,
new_m_count,
-- ratio as a percentage, rounded to 2 decimals
cast(new_m_count/uv_m_count*100 as decimal(10,2))
from
-- daily active device count for the day
(select
day_count uv_m_count
from ads_uv_count
where dt='2021-08-11') t1
join
-- new device count for the same day; both sides are single rows, so the
-- unconditioned join is a 1x1 cross join producing exactly one row
(SELECT
new_mid_count new_m_count
from ads_new_mid_count
where create_date='2021-08-11') t2
3.2、ADS层之用户行为漏斗分析
~1.建表语句
hive (gmall)>
-- ADS funnel table: visit -> order -> payment conversion per day.
DROP TABLE IF EXISTS ads_user_action_convert_day;
CREATE EXTERNAL TABLE ads_user_action_convert_day (
    `dt`                            string        COMMENT '统计日期',
    `total_visitor_m_count`         bigint        COMMENT '总访问人数',
    `order_u_count`                 bigint        COMMENT '下单人数',
    `visitor2order_convert_ratio`   decimal(10,2) COMMENT '访问到下单转化率',
    `payment_u_count`               bigint        COMMENT '支付人数',
    `order2payment_convert_ratio`   decimal(10,2) COMMENT '下单到支付的转化率'
) COMMENT '用户行为漏斗分析'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_user_action_convert_day/';
~2.数据导入
hive (gmall)>
-- Funnel for one day: visitors -> users who ordered -> users who paid,
-- with each step's conversion ratio as a percentage.
insert into table ads_user_action_convert_day
select
'2021-08-11',
total_visitor_m_count,
order_u_count,
cast(order_u_count/total_visitor_m_count*100 as decimal(10,2)) visitor2order_convert_ratio,
payment_u_count,
cast(payment_u_count/order_u_count*100 as decimal(10,2)) order2payment_convert_ratio
from
-- distinct visitors for the day, from the app start log
(select
count(*) total_visitor_m_count
from
(select
user_id
from dwd_start_log
where dt='2021-08-11'
group by user_id) t1) t3
join
-- users with at least one order / at least one payment that day; both sides
-- are single rows, so the unconditioned join yields exactly one row
(select
sum(if(order_count>0,1,0)) order_u_count,
sum(if(payment_count>0,1,0)) payment_u_count
from dws_user_action
where dt='2021-08-11') t2
4、需求三:品牌复购率
复购率: 重复购买的概率!
同一件商品 购买过的人数有 10 人
购买同一件商品 二次的人数有 8 人
购买同一件商品 三次的人数有 6人
此商品,单次复购率:8/10
多(至少购买3次)次复购率: 6/10
4.1、DWS层
~1.用户购买商品明细表(宽表)
hive (gmall)>
-- DWS daily per-user per-SKU purchase detail wide table.
DROP TABLE IF EXISTS dws_sale_detail_daycount;
CREATE EXTERNAL TABLE dws_sale_detail_daycount
(
    user_id            string        COMMENT '用户 id',
    sku_id             string        COMMENT '商品 Id',
    user_gender        string        COMMENT '用户性别',
    user_age           string        COMMENT '用户年龄',
    user_level         string        COMMENT '用户等级',
    order_price        decimal(10,2) COMMENT '商品价格',
    sku_name           string        COMMENT '商品名称',
    sku_tm_id          string        COMMENT '品牌id',
    sku_category3_id   string        COMMENT '商品三级品类id',
    sku_category2_id   string        COMMENT '商品二级品类id',
    sku_category1_id   string        COMMENT '商品一级品类id',
    sku_category3_name string        COMMENT '商品三级品类名称',
    sku_category2_name string        COMMENT '商品二级品类名称',
    sku_category1_name string        COMMENT '商品一级品类名称',
    spu_id             string        COMMENT '商品 spu',
    sku_num            int           COMMENT '购买个数',
    -- NOTE(review): counts/amounts typed as string in the original; kept as-is
    -- because downstream queries aggregate them via implicit conversion
    order_count        string        COMMENT '当日下单单数',
    order_amount       string        COMMENT '当日下单金额'
) COMMENT '用户购买商品明细表'
PARTITIONED BY (`dt` string)
STORED AS parquet
-- fix: directory name matched neither the table name nor the file's
-- location convention (was .../dws_user_sale_detail_daycount/)
LOCATION '/warehouse/gmall/dws/dws_sale_detail_daycount/'
TBLPROPERTIES ("parquet.compression"="snappy");
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim dws_sale.sh
#!/bin/bash
# dws_sale.sh — build one day's user+SKU purchase detail wide table
# (dws_sale_detail_daycount) by joining user dim, sku dim and paid order lines.
# Usage: dws_sale.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# CTEs: t1 = user attributes, t2 = sku attributes, t3 = paid order lines,
# t4 = per (sku,user) order count and amount.
# NOTE(review): t3/t4 scan ods_order_detail/ods_payment_info without a dt
# filter — presumably intentional for this tutorial, but verify; and t3 can
# return multiple rows per (sku_id,user_id), which the t4-t3 join fans out.
sql="
use gmall;
with t1 as
(select
id user_id,gender user_gender,
ceil(months_between('$do_date',birthday)/12) user_age,
user_level
from dwd_user_info
where dt='$do_date'),
t2 as
(select
id sku_id,price order_price,sku_name,tm_id sku_tm_id,
category3_id sku_category3_id,
category2_id sku_category2_id,
category1_id sku_category1_id,
category3_name sku_category3_name,
category2_name sku_category2_name,
category1_name sku_category1_name,
spu_id spu_id
from dwd_sku_info
where dt='$do_date'),
t3 as
(select
orderdatail.sku_num,orderdatail.sku_id,orderdatail.user_id
from ods_order_detail orderdatail join ods_payment_info payment
on orderdatail.order_id = payment.order_id
),
t4 as
(select
orderdatail.sku_id,orderdatail.user_id,
count(*) order_count,
sum(orderdatail.order_price*orderdatail.sku_num) order_amount
from ods_order_detail orderdatail join ods_payment_info payment
on orderdatail.order_id = payment.order_id
group by orderdatail.sku_id,orderdatail.user_id)
insert overwrite TABLE dws_sale_detail_daycount PARTITION(dt='$do_date')
select
t1.user_id,t2.sku_id,t1.user_gender,t1.user_age,t1.user_level,
t2.order_price,t2.sku_name,t2.sku_tm_id,t2.sku_category3_id,
t2.sku_category2_id,t2.sku_category1_id,
t2.sku_category3_name,t2.sku_category2_name,t2.sku_category1_name,
t2.spu_id,t3.sku_num,t4.order_count,t4.order_amount
from t4 join t3
on t4.sku_id=t3.sku_id and t4.user_id=t3.user_id
join t1 on t1.user_id=t3.user_id
join t2 on t3.sku_id=t2.sku_id
"
hive -e "$sql"
4.2、ADS层品牌复购率
~1.建表语句
hive (gmall)>
-- ADS monthly brand/level-1-category repurchase-rate table.
-- fix: added "if exists" so the script does not fail on first run
drop table if exists ads_sale_tm_category1_stat_mn;
create external table ads_sale_tm_category1_stat_mn
(
    tm_id                string        comment '品牌id',
    category1_id         string        comment '1级品类id ',
    category1_name       string        comment '1级品类名称 ',
    buycount             bigint        comment '购买人数',
    buy_twice_last       bigint        comment '两次以上购买人数',
    buy_twice_last_ratio decimal(10,2) comment '单次复购率',
    buy_3times_last      bigint        comment '三次以上购买人数',
    buy_3times_last_ratio decimal(10,2) comment '多次复购率',
    stat_mn              string        comment '统计月份',
    stat_date            string        comment '统计日期'
) COMMENT '复购率统计'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim ads_sale.sh
#!/bin/bash
# ads_sale.sh — append the monthly brand repurchase-rate stats (for the month
# containing $do_date) into ads_sale_tm_category1_stat_mn.
# Usage: ads_sale.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# fix: the SQL previously hard-coded '2020-02-16', silently ignoring $do_date.
sql="
use gmall;
INSERT into TABLE ads_sale_tm_category1_stat_mn
select
sku_tm_id, sku_category1_id,sku_category1_name,
sum(if(order_count_per_mn>=1,1,0)) buycount,
sum(if(order_count_per_mn>=2,1,0)) buy_twice_last,
cast(sum(if(order_count_per_mn>=2,1,0))/sum(if(order_count_per_mn>=1,1,0))*100 as decimal(10,2)) buy_twice_last_ratio,
sum(if(order_count_per_mn>=3,1,0)) buy_3times_last,
cast(sum(if(order_count_per_mn>=3,1,0)) / sum(if(order_count_per_mn>=1,1,0)) * 100 as decimal(10,2)) buy_3times_last_ratio,
date_format('$do_date','yyyy-MM') stat_mn,
'$do_date'
from
(select
user_id,sku_tm_id,count(order_count) order_count_per_mn,sku_category1_id,sku_category1_name
from dws_sale_detail_daycount
where date_format(dt,'yyyy-MM')=date_format('$do_date','yyyy-MM')
group by sku_tm_id,user_id,sku_category1_id,sku_category1_name ) tmp
group by sku_tm_id,sku_category1_id,sku_category1_name
"
hive -e "$sql"
5、 需求四:各用户等级对应的复购率前十的商品排行
5.1、DWS层
使用用户购买明细表宽表(dws_sale_detail_daycount)作为DWS数据
5.2、 ADS层
~1.建表语句
-- ADS table: top repurchase-rate products per user level.
-- fix: added "if exists" so the script does not fail on first run
drop table if exists ads_ul_rep_ratio;
create table ads_ul_rep_ratio(
    user_level      string        comment '用户等级' ,
    sku_id          string        comment '商品id',
    buy_count       bigint        comment '购买总人数',
    buy_twice_count bigint        comment '两次购买总数',
    buy_twice_rate  decimal(10,2) comment '二次复购率',
    -- fix: backticked because RANK is a reserved word in newer Hive versions
    `rank`          string        comment '排名' ,
    state_date      string        comment '统计日期'
) COMMENT '复购率统计'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_ul_rep_ratio/';
~2.导入数据脚本
[atguigu@hadoop103 myscripts]$ vim ads_ul_rep_ratio.sh
#!/bin/bash
# ads_ul_rep_ratio.sh — append, per user level, the top-10 products by
# second-purchase rate (over all history up to $do_date) into ads_ul_rep_ratio.
# Usage: ads_ul_rep_ratio.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# fix: the SQL previously hard-coded '2020-02-16', silently ignoring $do_date.
# tmp  : per (level, sku, user) purchase count up to $do_date
# tmp2 : per (level, sku) buyer counts and second-purchase rate
# tmp3 : rank skus within each level by that rate
sql="
use gmall;
INSERT INTO TABLE ads_ul_rep_ratio
SELECT
user_level,sku_id,buy_count,buy_twice_count,buy_twice_rate,rn,'$do_date'
from
(select
user_level,sku_id,buy_count,buy_twice_count,buy_twice_rate,
rank() over(PARTITION by user_level order by buy_twice_rate desc)rn
from
(select
user_level,sku_id,
sum(if(total_buy_count_per_person>=1,1,0)) buy_count,
sum(if(total_buy_count_per_person>=2,1,0)) buy_twice_count,
cast(sum(if(total_buy_count_per_person>=2,1,0))/sum(if(total_buy_count_per_person>=1,1,0))*100 as decimal(10,2)) buy_twice_rate
from
(select
user_level,sku_id,user_id,count(*) total_buy_count_per_person
from dws_sale_detail_daycount
where dt<='$do_date'
group by user_level,sku_id,user_id) tmp
group by user_level,sku_id) tmp2
where buy_twice_rate>0) tmp3
where rn<=10
"
hive -e "$sql"
6、需求五:新付费用户数
6.1、 DWS层
~1.建表语句
drop table if exists dws_pay_user_detail;
create external table dws_pay_user_detail(
`user_id` string comment '付费用户id',
`name` string comment '付费用户姓名',
`birthday` string COMMENT '',
`gender` string COMMENT '',
`email` string COMMENT '',
`user_level` string COMMENT ''
) COMMENT '付费用户表'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_pay_user_detail/';
~2.导入数据脚本
[atguigu@hadoop103 myscripts]$ vim dws_pay_user_detail.sh
#!/bin/bash
# dws_pay_user_detail.sh — record users who paid for the FIRST time on $do_date:
# buyers of the day, anti-joined against users already in dws_pay_user_detail,
# then enriched with user-dimension attributes.
# Usage: dws_pay_user_detail.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# fix: the SQL previously hard-coded '2020-02-16', silently ignoring $do_date.
sql="
use gmall;
insert overwrite table dws_pay_user_detail PARTITION(dt='$do_date')
SELECT
ui.id,ui.name,ui.birthday,ui.gender,ui.email,ui.user_level
from
(select
distinct t1.user_id
from
(select
user_id
from dws_sale_detail_daycount
where dt='$do_date') t1
left join dws_pay_user_detail pu
on t1.user_id=pu.user_id
where pu.user_id is null) t2 join
(select * from dwd_user_info where dt='$do_date') ui
on t2.user_id=ui.id
"
hive -e "$sql"
6.2、ADS层
~1.建表语句
-- ADS daily paying-user count.
drop table if exists ads_pay_user_count;
create external table ads_pay_user_count(
    dt        string COMMENT '统计日期',
    pay_count bigint COMMENT '付费用户数'
) COMMENT '付费用户表'
stored as parquet
-- fix: ADS table was located under /dws/; every other ADS table lives under /ads/
location '/warehouse/gmall/ads/ads_pay_user_count/';
~2.导入数据脚本
[atguigu@hadoop103 myscripts]$ vim ads_pay_user_count.sh
#!/bin/bash
# ads_pay_user_count.sh — append the number of new paying users for $do_date
# (rows in that day's dws_pay_user_detail partition) into ads_pay_user_count.
# Usage: ads_pay_user_count.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# fix: the SQL previously hard-coded '2020-02-16', silently ignoring $do_date.
sql="
use gmall;
insert into table ads_pay_user_count
select
'$do_date',count(*)
from dws_pay_user_detail
where dt='$do_date'
"
hive -e "$sql"
7、需求六:付费用户率
7.1、DWS层
使用付费用户数以及新增用户表作为数据源
7.2、ADS层
~1.建表语句
-- ADS paying-user ratio: cumulative paying users over total users per day.
drop table if exists ads_pay_user_ratio;
create external table ads_pay_user_ratio (
    dt              string        comment '统计日期',
    pay_count       bigint        comment '总付费用户数',
    user_count      bigint        comment '总用户数',
    pay_count_ratio decimal(10,2) COMMENT '付费用户比率'
) COMMENT '付费用户率表'
stored as parquet
-- fix: ADS table was located under /dws/ and lacked the trailing slash used elsewhere
location '/warehouse/gmall/ads/ads_pay_user_ratio/';
~2.导入数据脚本
[atguigu@hadoop103 myscripts]$ vim ads_pay_user_ratio.sh
#!/bin/bash
# ads_pay_user_ratio.sh — append the paying-user ratio for $do_date:
# cumulative paying users (all partitions up to the day) over that day's
# total users, as a percentage.
# Usage: ads_pay_user_ratio.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# fix: the SQL previously hard-coded '2020-02-16', silently ignoring $do_date.
# Both subqueries return a single row, so the unconditioned join is a 1x1 cross join.
sql="
use gmall;
insert INTO TABLE ads_pay_user_ratio
SELECT
'$do_date',pay_count,user_count,
cast(pay_count/user_count*100 as decimal(10,2))
from
(select
sum(pay_count) pay_count
from ads_pay_user_count
where dt<='$do_date') t1
join
(SELECT
count(*) user_count
from dwd_user_info
where dt='$do_date') t2
"
hive -e "$sql"
8、需求七:每个用户最近一次购买时间
8.1、DWS层
使用用户行为宽表作为DWS层数据
8.2、ADS层
~1.建表语句
-- ADS table: each user's most recent payment date.
drop table if exists ads_user_last_pay;
create external table ads_user_last_pay(
    user_id  string comment '用户id',
    pay_date string comment '最近一次购买时间'
) COMMENT '用户最近一次购买时间表'
stored as parquet
-- fix: ADS table was located under /dws/; every other ADS table lives under /ads/
location '/warehouse/gmall/ads/ads_user_last_pay/';
~2.导入数据脚本
[atguigu@hadoop103 myscripts]$ vim ads_user_last_pay.sh
#!/bin/bash
# ads_user_last_pay.sh — rebuild each user's latest payment date from the full
# history of dws_user_action (max dt over partitions with payment_count>0).
# Usage: ads_user_last_pay.sh [yyyy-mm-dd]; the date argument only affects the
# log line — the query intentionally scans all partitions.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert overwrite TABLE ads_user_last_pay
select
user_id,
max(dt)
from dws_user_action
where payment_count>0
GROUP by user_id
"
hive -e "$sql"
9、需求八:商品每日销量排行Top10
9.1、DWS层
使用用户购买商品明细宽表作为DWS数据
9.2、ADS层
~1.建表语句
-- ADS daily top-10 products by order count.
drop table if exists ads_goods_order_count_day;
create external table ads_goods_order_count_day(
    dt          string comment '统计日期',
    sku_id      string comment '商品id',
    order_count bigint comment '下单次数'
) COMMENT '商品下单top10'
stored as parquet
-- fix: ADS table was located under /dws/; every other ADS table lives under /ads/
location '/warehouse/gmall/ads/ads_goods_order_count_day/';
~2.导入数据脚本
[atguigu@hadoop103 myscripts]$ vim ads_goods_order_count_day.sh
#!/bin/bash
# ads_goods_order_count_day.sh — append the day's top-10 SKUs by order-line
# count from dws_sale_detail_daycount into ads_goods_order_count_day.
# Usage: ads_goods_order_count_day.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# fix: the SQL previously hard-coded '2020-02-16', silently ignoring $do_date.
sql="
use gmall;
insert into TABLE ads_goods_order_count_day
select
'$do_date',sku_id,count(*) order_count
from dws_sale_detail_daycount
where dt='$do_date'
group by sku_id
order by order_count desc
limit 10
"
hive -e "$sql"
10、需求九:统计每个月订单付款率
10.1、DWS层
采用用户行为宽表作为DWS层
10.2、ADS层
~1.建表语句
-- ADS monthly order-to-payment conversion table.
drop table if exists ads_order2pay_mn;
create external table ads_order2pay_mn (
    `dt`                          string        COMMENT '统计日期',
    `order_u_count`               bigint        COMMENT '下单人数',
    `payment_u_count`             bigint        COMMENT '支付人数',
    `order2payment_convert_ratio` decimal(10,2) COMMENT '下单到支付的转化率'
) COMMENT ''
row format delimited fields terminated by '\t'
-- fix: path contained stray spaces ('/ads/ ads_order2pay_mn /'), which would
-- create an HDFS directory with spaces in its name
location '/warehouse/gmall/ads/ads_order2pay_mn/';
~2.导入数据脚本
[atguigu@hadoop103 myscripts]$ vim ads_order2pay_mn.sh
#!/bin/bash
# ads_order2pay_mn.sh — append the order-to-payment conversion for the month
# containing $do_date (summed over dws_user_action) into ads_order2pay_mn.
# Usage: ads_order2pay_mn.sh [yyyy-mm-dd]; defaults to yesterday when no date is given.
if [ -n "$1" ]
then
# use the date passed on the command line
do_date=$1
else
# no argument: default to yesterday
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
# fix: the SQL previously hard-coded '2020-02-16', silently ignoring $do_date.
sql="
use gmall;
insert into TABLE ads_order2pay_mn
SELECT
'$do_date',
sum(order_count) order_u_count,
sum(payment_count) payment_u_count,
cast(sum(payment_count)/ sum(order_count) * 100 as decimal(10,2)) order2payment_convert_ratio
from dws_user_action
where date_format(dt,'yyyy-MM')=date_format('$do_date','yyyy-MM')
"
hive -e "$sql"