# 1. Prepare transaction_details.csv: find duplicated IDs and clean quoted fields
# Count how many transaction_id values occur more than once
cat transaction_details.csv|awk -F ',' '{a[$1]++}END{for(i in a){if(a[i]>1) c++}; print c}'
# => 101 duplicated transaction_id values
cat transaction_details.csv|awk -F ',' '{a[$1]++}END{for(i in a){if(a[i]>1) print i}}'>transaction_repeat_ids.log
# Save the duplicated transaction_ids to transaction_repeat_ids.log for later use
# Handle address fields that are double-quoted and contain embedded commas
cat transaction_details.csv|awk -F '"' 'BEGIN{a=0}{if(length($2)>0 &&$2~/,/) a++}END{print a}'
# => 1525 rows contain a double-quoted field with an embedded comma
cat transaction_details.csv|awk -F '"' 'BEGIN{a=0}{if(length($2)>0 && $2~/,/){gsub(/,/," ",$2);print $1""$2""$3}else print $0}'>transaction_details2.csv
# Replace the embedded commas with spaces, strip the quotes, and write every row to transaction_details2.csv
# NOTE(review): rows whose quoted field contains NO comma keep their quotes here —
# confirm no such rows exist, or the quotes will leak into the Hive table
wc -l transaction_details.csv
wc -l transaction_details2.csv
# Verify no rows were lost by the transformation
# 2. Drop store_review rows that have no review score
# Done directly in the shell
cat store_review.csv|awk -F ',' '{if(length($3)==1) a++}END{print a}'
# => 63 rows have no score
# NOTE(review): length($3)==1 presumably matches a lone \r on empty fields
# (CRLF line endings) — confirm against the raw file
cat store_review.csv|awk -F ',' '{if(length($3)>1) print $0}'>store_review2.csv
# Redirect only the rows that do have a score into store_review2.csv
cat store_review2.csv |awk -F ',' 'BEGIN{a=0}{if(length($3)==1) a++}END{print a}'
# => 0, confirming store_review2.csv contains no score-less rows
# Row-count check: 1001 - 938 = 63
wc -l store_review.csv
1001 store_review.csv
wc -l store_review2.csv
938 store_review2.csv
# 3. Upload the cleaned files to HDFS, one directory per ODS external table
hdfs dfs -put customer_details.csv /test/hive/pro01_electronic_consume/ods_customer
hdfs dfs -put transaction_details2.csv /test/hive/pro01_electronic_consume/ods_transaction_details
hdfs dfs -put store_details.csv /test/hive/pro01_electronic_consume/ods_store
hdfs dfs -put transaction_repeat_ids.log /test/hive/pro01_electronic_consume/ods_transaction_repeat_ids
hdfs dfs -put store_review2.csv /test/hive/pro01_electronic_consume/ods_store_review
# 4. Create the database and tables
-- Create the ODS database and the external customer table over the raw CSV.
CREATE DATABASE IF NOT EXISTS ods_electronic_biz_analysis_db;
USE ods_electronic_biz_analysis_db;

-- External table over /ods_customer; the CSV header row is skipped.
CREATE EXTERNAL TABLE IF NOT EXISTS ods_customer (
    customer_id INT,
    first_name  STRING,
    last_name   STRING,
    email       STRING,
    gender      STRING,
    address     STRING,
    country     STRING,
    language    STRING,
    job         STRING,
    credit_type STRING,
    credit_no   STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/test/hive/pro01_electronic_consume/ods_customer'
TBLPROPERTIES ("skip.header.line.count"="1");
-- External table over the cleaned transaction file (transaction_details2.csv).
-- FIX: the price column was declared as `price_id decimal`. A bare DECIMAL in
-- Hive defaults to DECIMAL(10,0), silently truncating the fractional part of
-- every price, and the DWD-layer insert below selects the column as `price`
-- (which did not exist). Renamed and given an explicit precision/scale.
create external table if not exists ods_transaction_details(
transaction_id int,
customer_id int,
store_id int,
price decimal(10,2),  -- was: price_id decimal (=> DECIMAL(10,0), cents lost)
product string,       -- product name
`date` string,        -- raw date text (slash-separated); normalized downstream
`time` string         -- raw 12-hour clock text; normalized downstream
)row format delimited
fields terminated by ','
location '/test/hive/pro01_electronic_consume/ods_transaction_details'
tblproperties("skip.header.line.count"="1");
-- External table over the raw store master file.
CREATE EXTERNAL TABLE IF NOT EXISTS ods_store (
    store_id        INT,
    store_name      STRING,
    employee_number INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/test/hive/pro01_electronic_consume/ods_store'
TBLPROPERTIES ("skip.header.line.count"="1");
-- External table over the filtered review file (store_review2.csv).
CREATE EXTERNAL TABLE IF NOT EXISTS ods_store_review (
    transaction_id INT,
    store_id       INT,
    review_score   INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/test/hive/pro01_electronic_consume/ods_store_review'
TBLPROPERTIES ("skip.header.line.count"="1");
-- External table over the duplicated-id log produced earlier. The file holds a
-- single value per line, so the default SerDe and delimiter are sufficient.
CREATE EXTERNAL TABLE IF NOT EXISTS ods_transaction_repeat_ids (
    transaction_id INT
)
LOCATION '/test/hive/pro01_electronic_consume/ods_transaction_repeat_ids';
-- Smoke-check each ODS table after creation.
SELECT * FROM ods_customer LIMIT 5;
SELECT * FROM ods_transaction_details LIMIT 5;
SELECT * FROM ods_store LIMIT 5;
SELECT * FROM ods_store_review LIMIT 5;
SELECT * FROM ods_transaction_repeat_ids LIMIT 5;
-- The review table contains transaction_ids that are absent from the
-- transaction table; compare counts over the same id range to confirm.
SELECT COUNT(1)
FROM ods_store_review
WHERE transaction_id BETWEEN 5001 AND 6000;
SELECT COUNT(1)
FROM ods_transaction_details
WHERE transaction_id BETWEEN 5001 AND 6000;
#DWD层建库表
#加密
#customer
# address aes kgckb16_20220114
# credit_no md5 20220114
# 分析按卡统计是否倾斜
# 分析按职位统计是否倾斜
# 分析按性别统计是否倾斜
#transaction_details
# yyyy-MM partition
# 添加日期维度表:dim_date(年,季,月,周,是否工作日,段位)
#NOTE(review): this one-liner was likely abandoned because, inside double
#quotes, the shell expands the backticks around `date` as command substitution
#and corrupts the HQL; the query was moved into hql.log and run with `hive -f`
#instead (see below).
#hive -e "select min(date_format(replace(`date`,'/','-'),'yyyy-MM-dd')),max(date_format(replace(`date`,'/','-'),'yyyy-MM-dd'))
#from ods_transaction_details;" >hive_dt.log
# Interactive session log: compute the min/max transaction dates with a query
# file, then wrap the lookup in a small script that captures them as variables.
[root@single01 pro01_electronic_consume]# vim hql.log
#------------------------------------------------------------------------------------------------------------------
select min(date_format(replace(`date`,'/','-'),'yyyy-MM-dd')),max(date_format(replace(`date`,'/','-'),'yyyy-MM-dd')) from ods_electronic_biz_analysis_db.ods_transaction_details;
#------------------------------------------------------------------------------------------------------------------
# Running the query from a file via `hive -f` avoids the shell backtick
# expansion that the quoted `hive -e` one-liner above suffered from.
dt=`hive -f hql.log`
echo $dt
[root@single01 pro01_electronic_consume]# vim date_dim_maker.sh
#----------------------------------------------------------------------
#!/bin/bash
source /etc/profile
# Word-split hive's output into an array; elements 2 and 3 are echoed as the
# min and max dates (elements 0/1 are presumably header/log tokens — confirm
# against the actual hive output).
dt=(`hive -f hql.log `)
echo ${dt[2]}
echo ${dt[3]}
#---------------------------------------------------------------------
[root@single01 pro01_electronic_consume]# chmod u+x date_dim_maker.sh
[root@single01 pro01_electronic_consume]# ./date_dim_maker.sh
# 扩展列:年,季,月,周,是否工作日,段位(0~3)
# 时间段:凌晨0~6,6~12,12~18,18~24
#
#
#DWS
聚合:
店铺:年,季,月,周,日,段,员工数量,交易数量,金额,独立客户总数
# 按月是否存在倾斜
# 按季度是否存在倾斜
# 按周是否存在倾斜
# 按时间段求均值:用户消费均值(段总额/独立客户数量)
用户: 用户ID,年,季度,月,周,交易数量,金额
#DWT
聚合:
用户: 用户ID,年,季度,月,交易数量,交易数量排名,金额,交易金额排名
#DWM
聚合:
店铺:店铺ID,年,季,月,周,员工数量,交易数量,金额,独立客户总数,总平均评分,评分数,差评比率,好评比率
用户: 用户ID,交易数量,交易排名,金额,交易金额排名
商品:商品名称,金额,金额排名,交易商品数量,交易次数,交易次数排名,独立客户数,独立客户数排名
# 按商品统计是否存在倾斜
业务
transaction_id at store_id in transaction_details
transaction_id at store_id in store_review
了解客户评价的覆盖率
客户:评价订单数/实际购买总订单数
商品:评价订单数/实际购买总订单数
店铺:评价订单数/实际购买订单总数
根据评分了解客户分布情况
平均分
评分值比重
差评率
根据交易了解客户的分布情况
金额
均额
频次
店铺数
店铺订单数
店铺订单总额
月度的订单总数
月度的订单总额
月度的订单均值
#---------------------------------------------------------------------------------------------------------------
# Extend the transactions with the date-dimension columns required by the
# analysis: year, quarter, month, week-of-month, workday flag, time period.
# FIXES vs. the original:
#   * `dt` was declared both as a regular column AND as the partition column,
#     which Hive rejects ("Column repeated in partitioning columns") — it is
#     now only the partition column;
#   * the INSERT was missing the PARTITION(dt) clause required for a
#     partitioned target, and `select *` left the computed columns misaligned
#     with the target schema — columns are now listed explicitly, with the
#     dynamic partition value last;
#   * the source is the partitioned staging table dwd_transaction_details_par
#     created below (the original referenced a non-existent
#     ods_electronic_biz_analysis_db.transaction_details_par).
create external table dwd_transaction_details(
transaction_id string,
customer_id int,
store_id int,
price decimal(10,2),  -- explicit scale; bare DECIMAL would be DECIMAL(10,0)
product string,
`date` string,
`time` string,
years string,
`quarter` int,
months int,
weeks int,        -- week-of-month: ceil(day/7)
isworkday string, -- 'yes' for Monday-Friday, 'no' otherwise
period int        -- 1:00-06, 2:06-12, 3:12-18, 4:18-24
)
partitioned by (dt string)
row format delimited
fields terminated by ',';
-- Dynamic-partition insert: enable dynamic partitioning and select the
-- partition column last.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_transaction_details partition(dt)
select transaction_id,customer_id,store_id,price,product,`date`,`time`,
year(replace(`date`,'/','-')) years,
quarter(replace(`date`,'/','-')) `quarter`,
month(replace(`date`,'/','-')) months,
ceil(day(replace(`date`,'/','-'))/7) weeks,
-- dayofweek(): 1 = Sunday ... 7 = Saturday, so 2..6 are Monday-Friday
if(dayofweek(replace(`date`,'/','-')) between 2 and 6,'yes','no') isworkday,
ceil((split(`time`,':')[0]+1)/6) period,
dt
from dwd_transaction_details_par;
#----------------------------------------------------------------------------
# Stage the ODS transactions into a table partitioned by month (dt = yyyy-MM).
# FIXES vs. the original:
#   * source table name corrected to ods_transaction_details — no table named
#     `transaction_details` exists in ods_electronic_biz_analysis_db;
#   * explicit DECIMAL precision/scale so fractional prices survive (bare
#     DECIMAL is DECIMAL(10,0) in Hive);
#   * dynamic-partition settings enabled before the PARTITION(dt) insert.
# NOTE(review): the ODS DDL declares the price column as `price_id`; rename it
# there (or alias it in this SELECT) — confirm which name is live.
create external table dwd_transaction_details_par(
transaction_id string,  -- string so regenerated IDs for duplicates can be non-numeric
customer_id int,
store_id int,
price decimal(10,2),
product string,
`date` string,
`time` string
)
partitioned by (dt string)
row format delimited
fields terminated by ',';
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_transaction_details_par partition(dt)
select transaction_id,customer_id,store_id,price,product,`date`,`time`,
date_format(replace(`date`,'/','-'),'yyyy-MM') dt
from ods_electronic_biz_analysis_db.ods_transaction_details;
#------------------------------------------------------------------------------------------------
# Normalize the 12-hour `time` strings to a 24-hour HH:mm string.
# FIX: the original only handled "PM and hour <> 12" (add 12h), so
# 12:xx AM (midnight) was left as 12:xx instead of 00:xx; an explicit
# 12 AM branch now subtracts 12 hours (43200 s).
select transaction_id,`date`,`time`,
case
    -- PM hours other than 12 PM: shift forward 12 hours
    when substr(`time`, -2, 2) = 'PM' and split(`time`,':')[0] <> 12
        then from_unixtime(unix_timestamp(`time`,'HH:mm')+43200,'HH:mm')
    -- 12 AM (midnight): shift back 12 hours so 12:xx becomes 00:xx
    when substr(`time`, -2, 2) = 'AM' and split(`time`,':')[0] = 12
        then from_unixtime(unix_timestamp(`time`,'HH:mm')-43200,'HH:mm')
    -- AM hours and 12 PM are already correct
    else from_unixtime(unix_timestamp(`time`,'HH:mm'),'HH:mm')
end as t
from dwd_transaction_details;
#-----------------------------------------------------------------------------------------
# Sqoop: import one id-range slice of MySQL table test.order_info into the
# Hive table test.order_info_par as the static partition id_sec='8', using a
# single mapper. --delete-target-dir makes the import re-runnable.
# NOTE(review): a plaintext --password is visible in `ps` output and shell
# history — prefer -P (interactive prompt) or --password-file for anything
# beyond a throwaway test.
sqoop import \
--connect jdbc:mysql://single01:3306/test \
--username root \
--password kb16 \
--table order_info \
--where "id>700000 and id<=800000" \
--delete-target-dir \
--fields-terminated-by ',' \
-m 1 \
--hive-import \
--hive-database test \
--hive-table order_info_par \
--hive-partition-key 'id_sec' \
--hive-partition-value '8'
#-----------------------------------------------------------------------------------------