项目数据分层 ODS DW DM

数据获取 专栏收录该内容
6 篇文章 0 订阅

项目数据分层 ODS DW DM

ODS

ODS 全称是 Operational Data Store,操作数据存储.“面向主题的”,数据运营层,也叫ODS层,是最接近数据源中数据的一层,数据源中的数据,经过抽取、洗净、传输,也就说传说中的 ETL 之后,装入本层。本层的数据,总体上大多是按照源头业务系统的分类方式而分类的。
撰写脚本 执行数据全量导入 MySQL ===》Hive
适合之前表数据可能存在修改操作

#!/bin/bash
sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table user \
--delete-target-dir \
--target-dir /snbap/ods/ods_user \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table cart \
--delete-target-dir \
--target-dir /snbap/ods/ods_cart \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table code_brand \
--delete-target-dir \
--target-dir /snbap/ods/ods_code_brand \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table code_category \
--delete-target-dir \
--target-dir /snbap/ods/ods_code_category \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table code_city \
--delete-target-dir \
--target-dir /snbap/ods/ods_code_city \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table code_education \
--delete-target-dir \
--target-dir /snbap/ods/ods_code_education \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table code_email_suffix \
--delete-target-dir \
--target-dir /snbap/ods/ods_code_email_suffix \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table code_goods \
-m 1 \
--delete-target-dir \
--target-dir /snbap/ods/ods_code_goods \
--fields-terminated-by '\001'


sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table code_profession \
--delete-target-dir \
--target-dir /snbap/ods/ods_code_profession \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table user_addr \
--delete-target-dir \
--target-dir /snbap/ods/ods_user_addr \
--fields-terminated-by '\001'

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table user_extend \
--delete-target-dir \
--target-dir /snbap/ods/ods_user_extend \
--fields-terminated-by '\001'

观察可知大部分代码都相似,可以写一个循环,简化脚本。

#!/bin/bash
list="cart code_brand code_category code_city code_education code_email_suffix code_goods code_profession user user_addr user_extend"

for tablename in $list;do
	echo '正在创建bap_${tablename}'
	
    sqoop import \
    --connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
    --driver com.mysql.jdbc.Driver \
    --username root \
    --password root \
    --table ${tablename} \
    --delete-target-dir \
    --target-dir /snbap/ods/ods_${tablename} \
    --fields-terminated-by '\001'
done  

撰写脚本 执行数据增量导入 MySQL ===》Hive
适合之前表数据不会修改
此脚本若传入参数则其表分区为传入参数,不传参数则为当前日期的前一天。

#!/bin/bash
args=$1
dt=
if [ ${#args} == 0 ]
    then 
    	dt=`date -d '1 days ago' +%Y%m%d`
    else
    	dt=$1
fi

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table user_order \
--target-dir /snbap/ods_tmp/ods_user_order \
--fields-terminated-by '\001' \
--check-column order_id \
--incremental append \
--last-value 0

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table order_delivery \
--target-dir /snbap/ods_tmp/ods_order_delivery \
--fields-terminated-by '\001' \
--check-column order_id \
--incremental append \
--last-value 0

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table order_item \
--target-dir /snbap//ods_tmp/ods_order_item \
--fields-terminated-by '\001' \
--check-column order_id \
--incremental append \
--last-value 0

sqoop import \
--connect jdbc:mysql://192.168.64.188:3306/snbap_ods \
--driver com.mysql.jdbc.Driver \
--username root \
--password root \
--table biz_trade \
--target-dir /snbap/ods_tmp/ods_biz_trade \
--fields-terminated-by '\001' \
--check-column order_id \
--incremental append \
--last-value 0

echo "执行sqoop语句完成"

echo "加载数据到分区表"
hive -e "load data inpath '/snbap/ods_tmp/ods_user_order/*' into table snbap_ods.ods_user_order partition(dt='$dt')"
hive -e "load data inpath '/snbap/ods_tmp/ods_order_delivery/*' into table snbap_ods.ods_order_delivery partition(dt='$dt')"
hive -e "load data inpath '/snbap/ods_tmp/ods_order_item/*' into table snbap_ods.ods_order_item partition(dt='$dt')"
hive -e "load data inpath '/snbap/ods_tmp/ods_biz_trade/*' into table snbap_ods.ods_biz_trade partition(dt='$dt')"

撰写Flume配置文件 监控目标文件夹 ===》Hdfs

# 监控文件夹,sink到HDFS
################################pc端采集#######################################
##############################################################################
	#组件声明
	a1.sources = s1
	a1.channels = c1
	a1.sinks = k1
	#初始化数据源
	a1.sources.s1.type=spooldir
	a1.sources.s1.spoolDir = /root/project/snbap/flume/data/pc
	a1.sources.s1.fileHeader =false
	#初始化通道
	a1.channels.c1.type=memory
	a1.channels.c1.capacity=10000
	a1.channels.c1.transactionCapacity=10000
	#初始化数据槽
	a1.sinks.k1.type=hdfs
	a1.sinks.k1.hdfs.path=/snbap/ods/user_pc_click_log/dt=%Y%m%d
	a1.sinks.k1.hdfs.filePrefix=pc-
	a1.sinks.k1.hdfs.rollInterval=30
	a1.sinks.k1.hdfs.rollSize=134217728
	a1.sinks.k1.hdfs.rollCount=100000
	a1.sinks.k1.hdfs.useLocalTimeStamp=true
	a1.sinks.k1.hdfs.fileType=DataStream
	#关联组件
	a1.sources.s1.channels=c1
	a1.sinks.k1.channel=c1


################################app端采集######################################
##############################################################################
	#组件声明
	a1.sources = s1
	a1.channels = c1
	a1.sinks = k1
	#初始化数据源
	a1.sources.s1.type=spooldir
	a1.sources.s1.spoolDir = /root/project/snbap/flume/data/app
	a1.sources.s1.fileHeader =false
	#初始化通道
	a1.channels.c1.type=memory
	a1.channels.c1.capacity=10000
	a1.channels.c1.transactionCapacity=10000
	#初始化数据槽
	a1.sinks.k1.type=hdfs
	a1.sinks.k1.hdfs.path=/snbap/ods/user_app_click_log/dt=%Y%m%d
	a1.sinks.k1.hdfs.filePrefix=app-
	a1.sinks.k1.hdfs.rollInterval=30
	a1.sinks.k1.hdfs.rollSize=134217728
	a1.sinks.k1.hdfs.rollCount=100000
	a1.sinks.k1.hdfs.useLocalTimeStamp=true
	a1.sinks.k1.hdfs.fileType=DataStream
	#关联组件
	a1.sources.s1.channels=c1
	a1.sinks.k1.channel=c1

flume-ng agent -name a1 -c /opt/software/flume/flume190/conf/ -f /root/project/snbap/flume/conf/log1.conf -Dflume.root.logger=INFO,console

如果有日志表有分区,需要修复分区,如:

alter table ods_user_pc_click_log add partition(dt='20210805');
msck repair table ods_user_app_click_log;

DW

数据仓库层(DW),是数据仓库的主体.在这里,从 ODS 层中获得的数据按照主题建立各种数据模型。这一层和维度建模会有比较深的联系

DWD

数据导入脚本 load_data_dwd.sql

# 加载dwd_user表
insert overwrite table snbap_dwd.dwd_user
select
user_id, 
user_name, 
user_gender, 
user_birthday, 
user_age, 
constellation, 
province, 
city, 
city_level, 
e_mail, 
op_mail, 
mobile, 
num_seg_mobile, 
op_mobile, 
register_time, 
login_ip, 
login_source, 
request_user, 
total_score, 
used_score, 
is_blacklist, 
is_married, 
education, 
monthly_income, 
profession, 
create_date, 
current_timestamp() dw_date
from snbap_ods.ods_user
;

#加载dwd_user_extend表
insert into table snbap_dwd.dwd_user_extend
SELECT 
user_id,
user_gender,
is_pregnant_woman,
is_have_children,
is_have_car,
phone_brand,
phone_brand_level,
phone_cnt,
change_phone_cnt,
is_majia,
majia_account_cnt,
loyal_model,
shopping_type_model,
weight,
height,
current_timestamp() dw_date
FROM snbap_ods.ods_user_extend;

#加载dwd_user_app_click_log表
insert overwrite table snbap_dwd.dwd_user_app_click_log partition(dt='${hivevar:param_dt}')
SELECT 
log_id,
user_id,
imei,
regexp_extract(log_time,'\"(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})\"') log_time,
regexp_extract(os_version,'(.*?) ') visit_os,
regexp_extract(os_version," (.*?)$") os_version,
app_name,
app_version,
device_token,
regexp_extract(visit_ip,'\"(.*?)\"') visit_ip,
province,
city,
current_timestamp() dw_date
FROM snbap_ods.ods_user_app_click_log;

#加载dwd_user_pc_click_log表
insert overwrite table snbap_dwd.dwd_user_pc_click_log partition(dt='${hivevar:param_dt}')
SELECT 
log_id,
user_id,
session_id,
cookie_id,
regexp_extract(visit_time,'\"(.*?)\"') visit_time,
visit_url,
visit_os,
browser_name,
regexp_extract(visit_ip,'\"(.*?)\"') visit_ip,
province,
city,
page_id,
goods_id,
brand_id,
current_timestamp() dw_date
FROM snbap_ods.ods_user_pc_click_log;

#加载dwd_biz_trade表
insert into table snbap_dwd.dwd_biz_trade
partition(dt='${hivevar:param_dt}')
SELECT trade_id, order_id, user_id, amount, trade_type, trade_time, current_timestamp() dw_date
FROM snbap_ods.ods_biz_trade;

#加载dwd_cart表
insert into table snbap_dwd.dwd_cart
partition(dt='${hivevar:param_dt}')
SELECT cart_id, session_id, user_id, goods_id, goods_num, add_time, cancle_time, sumbit_time, create_date, current_timestamp() dw_date
FROM snbap_ods.ods_cart;

#加载dwd_code_brand
insert into table snbap_dwd.dwd_code_brand
SELECT brand_id, brand_name,current_timestamp() dw_date
FROM snbap_ods.ods_code_brand;

#加载dwd_code_category
insert into table snbap_dwd.dwd_code_category
SELECT first_category_id, first_category_name, second_category_id, second_category_name, third_category_id, third_category_name, category_id,current_timestamp() dw_date
FROM snbap_ods.ods_code_category;

#加载dwd_code_city
insert into table snbap_dwd.dwd_code_city
SELECT id, city, province,current_timestamp() dw_date
FROM snbap_ods.ods_code_city;

#加载dwd_code_education
insert into table snbap_dwd.dwd_code_education
SELECT id, edu_name,current_timestamp() dw_date
FROM snbap_ods.ods_code_education;

#加载dwd_code_email_suffix
insert into table snbap_dwd.dwd_code_email_suffix
SELECT id, suffix, op_mail,current_timestamp() dw_date
FROM snbap_ods.ods_code_email_suffix;

#加载dwd_code_goods
insert into table snbap_dwd.dwd_code_goods
SELECT goods_id, goods_name, goods_no, first_category_id, first_category_name, second_category_id, second_category_name, third_category_id, third_category_name, brand_id, brand_name, price,current_timestamp() dw_date
FROM snbap_ods.ods_code_goods;

#加载dwd_code_profession
insert into table snbap_dwd.dwd_code_profession
SELECT id, pro_name,current_timestamp() dw_date
FROM snbap_ods.ods_code_profession;

#加载dwd_order_delivery
insert into table snbap_dwd.dwd_order_delivery
partition(dt='${hivevar:param_dt}')
SELECT order_id, order_no, consignee, area_id, area_name, address, mobile, phone, coupon_id, coupon_money, carriage_money, create_time, update_time, addr_id,current_timestamp() dw_date 
FROM snbap_ods.ods_order_delivery;

#加载dwd_order_item
insert into table snbap_dwd.dwd_order_item
partition(dt='${hivevar:param_dt}')
SELECT user_id, order_id, order_no, goods_id, goods_no, goods_name, goods_amount, brand_id, brand_name, curr_price, market_price, discount, cost_price, first_cart, first_cart_name, second_cart, second_cart_name, third_cart, third_cart_name, goods_desc,current_timestamp() dw_date  
FROM snbap_ods.ods_order_item;

#加载dwd_user
insert into table snbap_dwd.dwd_user
SELECT user_id, user_name, user_gender, user_birthday, user_age, constellation, province, city, city_level, e_mail, op_mail, mobile, num_seg_mobile, op_mobile, register_time, login_ip, login_source, request_user, total_score, used_score, is_blacklist, is_married, education, monthly_income, profession, create_date,
current_timestamp() dw_date 
FROM snbap_ods.ods_user;

#加载dwd_user_addr
insert into table snbap_dwd.dwd_user_addr
SELECT user_id, order_addr, user_order_flag, addr_id, arear_id,current_timestamp() dw_date
FROM snbap_ods.ods_user_addr;

#加载dwd_user_app_click_log
insert into table snbap_dwd.dwd_user_app_click_log
partition(dt='${hivevar:param_dt}')
SELECT log_id, user_id, imei, log_time, visit_os, os_version, app_name, app_version, device_token, visit_ip, province, city,current_timestamp() dw_date 
FROM snbap_ods.ods_user_app_click_log;

#加载dwd_user_extend
insert into table snbap_dwd.dwd_user_extend
SELECT user_id, user_gender, is_pregnant_woman, is_have_children, is_have_car, phone_brand, phone_brand_level, phone_cnt, change_phone_cnt, is_majia, majia_account_cnt, loyal_model, shopping_type_model, weight, height,current_timestamp() dw_date
FROM snbap_ods.ods_user_extend;

#加载dwd_user_order
insert into table snbap_dwd.dwd_user_order
partition(dt='${hivevar:param_dt}')
SELECT order_id, order_no, order_date, user_id, user_name, order_money, order_type, order_status, pay_status, pay_type, order_source, update_time,current_timestamp() dw_date
FROM snbap_ods.ods_user_order;

#加载dwd_user_pc_click_log
insert into table snbap_dwd.dwd_user_pc_click_log
partition(dt='${hivevar:param_dt}')
SELECT log_id, user_id, session_id, cookie_id, visit_time, visit_url, visit_os, browser_name, visit_ip, province, city, page_id, goods_id, brand_id,current_timestamp() dw_date
FROM snbap_ods.ods_user_pc_click_log;

执行脚本 load_data_dwd.sh

#!/bin/bash
args=$1
dt=
if [ ${#args} == 0 ]
    then 
    	dt=`date -d '1 days ago' +%Y%m%d`
    else
    	dt=$1
fi
hive --hivevar param_dt=${dt} -f load_data_dwd.sql

DWS

数据导入脚本 load_data_dws.sql

#dws_user_app_pv
insert overwrite table snbap_dws.dws_user_app_pv
partition(dt='${hivevar:param_dt}')
SELECT 
user_id, 
imei, 
log_time,
hour(log_time) log_hour,
visit_os, 
os_version, 
`app_name`, 
app_version, 
device_token, 
visit_ip, 
province, 
city, 
dw_date
FROM snbap_dwd.dwd_user_app_click_log
where dt=${hivevar:param_dt};


#dws_user_pc_pv
insert overwrite table snbap_dws.dws_user_pc_pv
partition(dt='${hivevar:param_dt}')
SELECT 
user_id, 
session_id, 
cookie_id, 
min(visit_time) in_time, 
max(visit_time) out_time,
case when min(visit_time) = max(visit_time) then 3 else max(visit_time) - min(visit_time) end stay_time,
count(1) pv ,
visit_os, 
browser_name, 
visit_ip, 
province, 
city, 
current_timestamp() dw_date
FROM snbap_dwd.dwd_user_pc_click_log
where dt=${hivevar:param_dt}
group by 
user_id, 
session_id, 
cookie_id,
visit_os, 
browser_name, 
visit_ip, 
province, 
city;

#snbap_dws.dws_user_basic
insert overwrite table snbap_dws.dws_user_basic
SELECT 
u.user_id, 
u.user_name, 
u.user_gender, 
u.user_birthday, 
u.user_age, 
u.constellation, 
u.province, 
u.city, 
u.city_level, 
u.e_mail mail, 
u.op_mail, 
u.mobile, 
u.num_seg_mobile, 
u.op_mobile, 
u.register_time, 
u.login_ip, 
u.login_source, 
u.request_user,
u.total_score, 
u.used_score, 
u.is_blacklist, 
u.is_married, 
u.education, 
u.monthly_income, 
u.profession, 
ue.is_pregnant_woman, 
ue.is_have_children, 
ue.is_have_car, 
ue.phone_brand, 
ue.phone_brand_level, 
ue.phone_cnt, 
ue.change_phone_cnt, 
ue.is_maja, 
ue.majia_account_cnt, 
ue.loyal_model, 
ue.shopping_type_model, 
ue.weight, 
ue.height 
FROM snbap_dwd.dwd_user u
join snbap_dwd.dwd_user_extend ue on u.user_id = ue.user_id;

#snbap_dws.dws_user_visit_month1
set hivevar param_dt = 20210526;
insert overwrite table snbap_dws.dws_user_visit_month1
partition(dt='${hivevar:param_dt}')
select
user_id,`type`,content,cnt,
row_number() over(distribute by user_id,`type` sort by cnt desc ) rn
from
(
select
user_id,
'visit_ip' as `type`,
visit_ip content,
count(1) cnt
from snbap_dwd.dwd_user_pc_click_log 
where visit_time<date_add('${hivevar:param_dt}',1)and visit_time>=date_sub('${hivevar:param_dt}',29)
group by
user_id,visit_ip
union all
select
user_id,
'browser_name' as `type`,
browser_name content,
count(1) cnt
from snbap_dwd.dwd_user_pc_click_log 
where visit_time<date_add('${hivevar:param_dt}',1)and visit_time>=date_sub('${hivevar:param_dt}',29)
group by
user_id,browser_name
union all
select
user_id,
'cookie_id' as `type`,
cookie_id content,
count(1) cnt
from snbap_dwd.dwd_user_pc_click_log 
where visit_time<date_add('${hivevar:param_dt}',1)and visit_time>=date_sub('${hivevar:param_dt}',29)
group by
user_id,cookie_id
union all
select
user_id,
'visit_os' as `type`,
visit_os content,
count(1) cnt
from snbap_dwd.dwd_user_pc_click_log 
where visit_time<date_add('${hivevar:param_dt}',1)and visit_time>=date_sub('${hivevar:param_dt}',29)
group by
user_id,visit_os
)t
;

执行脚本 load_data_dws.sh

#!/bin/bash
args=$1
dt=
if [ ${#args} == 0 ]
    then 
    	dt=`date -d '1 days ago' +%Y%m%d`
    else
    	dt=$1
fi
hive --hivevar param_dt=${dt} -f load_data_dwd.sql

DM

数据报表层,这一层是提供为数据产品使用的结果数据

  • 0
    点赞
  • 0
    评论
  • 1
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

©️2021 CSDN 皮肤主题: 深蓝海洋 设计师:CSDN官方博客 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值