库存管理子系统:
领料单(领料单号,领料人,商品号,数量,日期)
进料单(进料单号,订单号,进料人,收料人,日期)
库存(商品号,库房号,库存量,日期)
库房(库房号,仓库管理员,地点,库存商品描述)
人事管理子系统:
员工(员工号,姓名,性别,年龄,文化程度,部门号)
部门(部门号,部门名称,部门主管,电话)
主题一: 销售
固有信息: 员工号,顾客号,商品号,数量,单价,日期
员工信息: 员工号,姓名,性别,年龄,文化程度,部门号
顾客信息: 顾客号,姓名,性别,年龄,文化程度,地址,电话
商品信息: 商品号, 商品名称, 单价, 重量, 体积, ...
主题二: 供应商
固有信息: 供应商号,供应商名,地址,电话
商品信息: 商品号, 商品名称, 单价, 重量, 体积, ...
主题三: 顾客
固有信息: 顾客号,姓名,性别,年龄,文化程度,地址,电话, 职业, ...
购物信息: 顾客号,商品号, 商品名,售价,购买数量, 购买日期....
在每个主题当中,包含了相关主题的所有信息,同时又抛弃了与分析处理无关的数据
创建数据库和表
- 在mysql中创建对应的数据库和表
-- Source (OLTP-style) MySQL database holding the raw sales data.
-- NOTE(review): MySQL "utf8" is the legacy 3-byte subset; utf8mb4 is normally
-- what you want for full Unicode — confirm before relying on this charset.
CREATE DATABASE IF NOT EXISTS sales_source DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
-- Product master table in the MySQL source database.
-- Integer display width (INT(11)) is deprecated since MySQL 8.0.17 and has no
-- storage/range semantics, so plain INT is used instead.
CREATE TABLE product
(
    -- surrogate key, auto-assigned
    product_code     INT          NOT NULL AUTO_INCREMENT,
    product_name     VARCHAR(128) NOT NULL,
    product_category VARCHAR(256) NOT NULL,
    PRIMARY KEY (product_code)
);
- 使用数据生成脚本插入数据
source '/root/data/generate_data.sql'
Hive准备工作
Hive只是为数据仓库提供了一种解决方案, 并不能说Hive就是数据仓库
hive 启动元数据服务
nohup hive --service metastore &
hive 启动 hiveserver2 服务
nohup hive --service hiveserver2 &
全路径启动beeline
/opt/install/hive/bin/beeline -u jdbc:hive2://localhost:10000
rds层建库建表
- 使用建表语句创建rds层表
source '/root/data/rds_create.sql'
加载数据到rds层
ETL 抽取
- 全量抽取customer表
# Full extract: import the whole MySQL sales_source.customer table into the
# Hive table sales_rds.customer.
# --hive-overwrite replaces existing data on every run (idempotent full load);
# --target-dir temp is the transient HDFS staging directory sqoop writes to
# before the Hive load.
sqoop import \
--connect jdbc:mysql://localhost:3306/sales_source \
--username root \
--password root \
--table customer \
--hive-import \
--hive-table sales_rds.customer \
--hive-overwrite \
--target-dir temp
- 全量导入product表
# Full extract: import the whole MySQL sales_source.product table into the
# Hive table sales_rds.product, same pattern as the customer load above
# (overwrite on each run, staged under the HDFS dir "temp").
sqoop import \
--connect jdbc:mysql://localhost:3306/sales_source \
--username root \
--password root \
--table product \
--hive-import \
--hive-table sales_rds.product \
--hive-overwrite \
--target-dir temp
- 增量抽取 sales_order
# Incremental extract: append only rows of sales_order whose entry_date
# (--check-column) is greater than --last-value; existing Hive data is kept.
# Field/line terminators must match the sales_rds.sales_order table layout.
# NOTE(review): '1900-1-1' is a non-ISO date literal — presumably MySQL coerces
# it when comparing against entry_date; confirm, or use '1900-01-01'.
sqoop import \
--connect jdbc:mysql://localhost:3306/sales_source \
--username root \
--password root \
--table sales_order \
--hive-import \
--hive-table sales_rds.sales_order \
--fields-terminated-by '\t' \
--lines-terminated-by '\n' \
--check-column entry_date \
--incremental append \
--last-value '1900-1-1'
- 指定Hive表之间的字段分隔符: --fields-terminated-by '\t'
- 指定Hive表之间的行分隔符: --lines-terminated-by '\n'
- 检查列, 通过检查某一列的值实现递增: --check-column entry_date
- 增量的方式, 可以选择append或者是lastmodified: --incremental append
- 上一次检查列最大的值: --last-value '1900-1-1'
- 删除job
sqoop job --delete myjob
创建job
# Create a saved sqoop job "myjob" wrapping the incremental import; the job
# stores the growing --last-value in the sqoop metastore, so each execution
# automatically resumes from the previous high-water mark.
# Note: the space in "-- import" is required by sqoop job syntax.
sqoop job \
--create myjob \
-- import \
--connect jdbc:mysql://localhost:3306/sales_source \
--username root \
--password root \
--table sales_order \
--hive-import \
--hive-table sales_rds.sales_order \
--fields-terminated-by '\t' \
--lines-terminated-by '\n' \
--check-column entry_date \
--incremental append \
--last-value '1900-1-1'
- 查看job
sqoop job --list
- 执行job
sqoop job --exec myjob
- 在mysql中新增数据
-- Append two new orders so the incremental sqoop job has fresh rows to pick up.
-- Each order gets a random customer (1-6), product (1-3) and amount (1000-9999).
-- The column list is explicit: INSERT without one breaks silently on schema change.
-- NOTE(review): column names inferred from the rds/dw queries in this file
-- (order_number, customer_number, product_code, order_date, entry_date,
-- order_amount) — confirm against the actual sales_order DDL.
USE sales_source;
SET @customer_number := FLOOR(1+RAND()*6);
SET @product_code := FLOOR(1+RAND()* 3);
SET @order_date := '2018-11-23';
SET @amount := FLOOR(1000+RAND()*9000);
INSERT INTO sales_order
    (order_number, customer_number, product_code, order_date, entry_date, order_amount)
VALUES (100001, @customer_number, @product_code, @order_date, @order_date, @amount);
SET @customer_number := FLOOR(1+RAND()*6);
SET @product_code := FLOOR(1+RAND()* 3);
SET @order_date := '2018-11-24';
SET @amount := FLOOR(1000+RAND()*9000);
INSERT INTO sales_order
    (order_number, customer_number, product_code, order_date, entry_date, order_amount)
VALUES (100002, @customer_number, @product_code, @order_date, @order_date, @amount);
COMMIT;
- 执行job
sqoop job --exec myjob
- 在hive中查看数据条数, 可以看到多了两条数据, 一共100002条数据
+----------+
|   _c0    |
+----------+
|  100002  |
+----------+
- 删除job
sqoop job --delete myjob
防止在执行的时候手动输入密码
# Store the DB password in an HDFS file so sqoop jobs need no interactive input.
# -n is required: a trailing newline would become part of the password.
echo -n "root" > sqoopPWD.pwd
hdfs dfs -mkdir -p /sqoop/pwd
hdfs dfs -put sqoopPWD.pwd /sqoop/pwd/
# 400 = owner read-only, keeps the password file private.
hdfs dfs -chmod 400 /sqoop/pwd/sqoopPWD.pwd
- 删除job
sqoop job --delete myjob
- 创建job
# Recreate the saved job, now reading the password from the HDFS file instead
# of embedding it on the command line (--password-file replaces --password),
# so the job can run unattended. Everything else matches the earlier job.
sqoop job \
--create myjob \
-- import \
--connect jdbc:mysql://localhost:3306/sales_source \
--username root \
--password-file /sqoop/pwd/sqoopPWD.pwd \
--table sales_order \
--hive-import \
--hive-table sales_rds.sales_order \
--fields-terminated-by '\t' \
--lines-terminated-by '\n' \
--check-column entry_date \
--incremental append \
--last-value '1900-1-1'
- 执行job
sqoop job --exec myjob
从rds层抽取传到DW层
- 加载 dim_product 表
-- Load the product dimension: one row per rds product, with a row_number()
-- surrogate key, a fixed version tag '1.0' and an open-ended validity
-- window (2018-1-1 .. 2050-1-1).
insert into sales_dw.dim_product
select
    row_number() over (order by p.product_code) as product_sk,
    p.product_code,
    p.product_name,
    p.product_category,
    '1.0',
    '2018-1-1',
    '2050-1-1'
from sales_rds.product p;
- 加载 dim_customer 表
-- Load the customer dimension: one row per rds customer, with a row_number()
-- surrogate key, a fixed version tag '1.0' and an open-ended validity
-- window (2018-1-1 .. 2050-1-1).
insert into sales_dw.dim_customer
select
    row_number() over (order by c.customer_number) as customer_sk,
    c.customer_number,
    c.customer_name,
    c.customer_street_address,
    c.customer_zip_code,
    c.customer_city,
    c.customer_state,
    '1.0',
    '2018-1-1',
    '2050-1-1'
from sales_rds.customer c;
- 加载 dim_order 表
-- Load the order dimension: one row per rds sales order, with a row_number()
-- surrogate key, a fixed version tag '1.0' and an open-ended validity
-- window (2018-1-1 .. 2050-1-1).
insert into sales_dw.dim_order
select
    row_number() over (order by o.order_number) as order_sk,
    o.order_number,
    '1.0',
    '2018-1-1',
    '2050-1-1'
from sales_rds.sales_order o;
编写生成日期的脚本
#!/bin/bash
# Generate dim_date.csv with one row per day: <sk>,<date>,<month>,<month name>,<quarter>,<year>
# covering the start date (inclusive) up to the end date (exclusive).
# Usage: $0 <start-date> <end-date>
if [ $# -ne 2 ]; then
    echo "Usage: $0 <start-date> <end-date>" >&2
    exit 1
fi
# Start and end dates from the positional arguments
date1=$1
date2=$2
# Current date being emitted, normalized to yyyy-mm-dd
tmpdate=$(date -d "$date1" +%F)
# Epoch seconds of both endpoints, used only to compute the day count
startSec=$(date -d "$date1" +%s)
endSec=$(date -d "$date2" +%s)
# Loop counter, which doubles as the surrogate key (starts at 1)
min=1
# Number of days to emit (86400 seconds per day)
max=$(( (endSec - startSec) / 86400 ))
# Truncate the output file first: the loop appends, so re-running the
# script would otherwise duplicate every row.
> ./dim_date.csv
while [ "$min" -le "$max" ]
do
    month=$(date -d "$tmpdate" +%m)
    month_name=$(date -d "$tmpdate" +%B)
    year=$(date -d "$tmpdate" +%Y)
    # 10# forces decimal: "08"/"09" would otherwise be rejected as bad octal
    quarter=$(( (10#$month - 1) / 3 + 1 ))
    echo "${min},${tmpdate},${month},${month_name},${quarter},${year}" >> ./dim_date.csv
    # Advance to the next calendar day relative to the start date
    tmpdate=$(date -d "+$min day $date1" +%F)
    min=$(( min + 1 ))
done
获得数据后上传至hdfs中date表的目录下
hdfs dfs -put dim_date.csv /hive/warehouse/sales_dw.db/dim_date
加载事实表
- 设置动态分区
set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict;
- 设置最大分区个数
set hive.exec.max.dynamic.partitions=10000;
set hive.exec.max.dynamic.partitions.pernode=10000;
- 这里我们按照月份分区
-- Load the sales fact table, dynamically partitioned by order month.
-- Each rds order row is resolved to its surrogate keys via the four dimensions;
-- the partition column is the yyyy-MM prefix of the order date.
insert into table sales_dw.fact_sales_order partition (order_date)
select
    ord.order_sk,
    cus.customer_sk,
    prd.product_sk,
    dd.date_sk                  as order_date_sk,
    so.order_amount,
    substr(so.order_date, 1, 7) as order_date
from sales_rds.sales_order so
join sales_dw.dim_order    ord on so.order_number    = ord.order_number
join sales_dw.dim_customer cus on so.customer_number = cus.customer_number
join sales_dw.dim_product  prd on so.product_code    = prd.product_code
join sales_dw.dim_date     dd  on date(so.order_date) = dd.date;
dm层
想求一下 2018年10月20这一天的指标
顾客,产品,日期,当天订单个数, 当天的订单金额, 近2天的订单个数, 近2天的订单金额
create database if not exists sales_dm;
-- DM-layer wide table: per (customer, product, date) order metrics computed
-- around a base day (see the load query below).
-- NOTE(review): "tow_" is presumably a typo for "two_" (the 2-day window);
-- kept as-is because the column names are the table's public interface.
CREATE TABLE `sales_dm.dm_order`(
`customer_sk` int,
`customer_number` int,
`customer_name` varchar(128),
`customer_street_address` varchar(256),
`customer_zip_code` int,
`customer_city` varchar(32),
`customer_state` varchar(32),
`product_sk` int,
`product_code` int,
`product_name` varchar(128),
`product_category` varchar(256),
`date_sk` int,
`date` date,
`month` tinyint,
`month_name` varchar(16),
`quarter` tinyint,
`year` int,
-- one_*: metrics for the base day itself; tow_* ("two"): last-2-days window
`one_order_cnt` bigint,
`tow_order_cnt` bigint,
`one_order_amount` decimal(28,2),
`tow_order_amount` decimal(28,2))
;
-- Per (customer, product, day) metrics relative to the base day 2018-10-20:
--   one_* = orders/amount on the base day itself        (datediff = 0)
--   tow_* = orders/amount over the last two days        (datediff 0 or 1)
-- The WHERE clause restricts rows to 2018-10-19..2018-10-20 so the datediff
-- conditions can only hit 0 or 1.
from
(
select
dc.customer_sk,
dc.customer_number,
dc.customer_name ,
dc.customer_street_address,
dc.customer_zip_code ,
dc.customer_city ,
dc.customer_state ,
dp.product_sk,
dp.product_code ,
dp.product_name,
dp.product_category,
dd.date_sk ,
dd.`date`,
dd.month ,
dd.month_name,
dd.quarter ,
dd.year ,
sum(case when datediff('2018-10-20',dd.date)=0 then 1 else 0 end) one_order_cnt,
sum(case when datediff('2018-10-20',dd.date)<=1 then 1 else 0 end) tow_order_cnt,
sum(case when datediff('2018-10-20',dd.date)=0 then fso.order_amount else 0 end) one_order_amount,
sum(case when datediff('2018-10-20',dd.date)<=1 then fso.order_amount else 0 end) tow_order_amount
from sales_dw.fact_sales_order fso
join sales_dw.dim_customer dc on fso.customer_sk=dc.customer_sk
join sales_dw.dim_product dp on fso.product_sk=dp.product_sk
join sales_dw.dim_date dd on fso.order_date_sk=dd.date_sk
where dd.date>='2018-10-19' and dd.date<='2018-10-20'
group by
dc.customer_sk,
dc.customer_number,
dc.customer_name ,
dc.customer_street_address,
dc.customer_zip_code ,
dc.customer_city ,
dc.customer_state ,
dp.product_sk,
dp.product_code ,
dp.product_name,
dp.product_category,
dd.date_sk ,
dd.`date`,
dd.month ,
dd.month_name,
dd.quarter ,
dd.year
) temp
insert into table sales_dm.dm_order
-- In Hive's FROM-first multi-insert syntax the SELECT has no FROM clause of
-- its own; the original "select * from temp;" plus the stray trailing ';'
-- was invalid (compare the dimension loads above, which all use "select *").
select *;