数据仓库模拟环境
上一篇说了很多数据仓库和维度模型的理论,从本篇开始落地实操,用一个小而完整的示例说明维度模型及其相关的ETL技术。示例数据库和ETL的SQL实现是在《Dimensional Data Warehousing with MySQL: A Tutorial》基础上做了些修改,增加了Kettle实现的部分。本篇详细说明数据仓库模拟实验环境搭建过程。
实验环境:
操作系统:windows XP
工具:Powerdesigner 16.5 ,sqlserver 2005
实验环境搭建过程:
1. 设计ERD
2. 建立源数据数据库和数据仓库数据库
3. 建立源库表
4. 建立数据仓库表
5. 建立过渡表
6. 生成源库测试数据
7. 生成日期维度数据
源数据数据库初始ERD如图(二)- 1所示(自己用Powerdesigner设计的图,由于没保存,借用CSDN上别人画的图)
数据仓库数据库初始ERD如图(二)- 2所示
图(二)- 1
图(二)- 2
如下为sqlserver创建数据库、相应表 以及生成源数据的SQL:
1 --建立数据库 2 if exists (select 1 from sys.databases where name ='source') 3 drop database source; 4 create database source; 5 if exists (select 1 from sys.databases where name ='DW') 6 drop database source; 7 CREATE DATABASE DW; 8 9 ---建立数据仓库表 10 use DW; 11 ---建立过渡表 12 -- 建立产品过渡表 13 CREATE TABLE product_stg ( 14 product_code INT, 15 product_name VARCHAR(30), 16 product_category VARCHAR(30) 17 ); 18 19 -- 建立客户过渡表 20 CREATE TABLE customer_stg ( 21 customer_number INT, 22 customer_name VARCHAR(30), 23 customer_street_address VARCHAR(30), 24 customer_zip_code INT, 25 customer_city VARCHAR(30), 26 customer_state VARCHAR(2) 27 ); 28 /*==============================================================*/ 29 /* Table: Customer_dim */ 30 /*==============================================================*/ 31 create table Customer_dim ( 32 Customer_sk int not null, 33 Customer_number int null, 34 Customer_name varchar(50) null, 35 Customer_street_address varchar(50) null, 36 Customer_zip_code int null, 37 Customer_city varchar(30) null, 38 Customer_state varchar(2) null, 39 effective_date datetime null, 40 expiry_date datetime null, 41 constraint PK_CUSTOMER_DIM primary key (Customer_sk) 42 ) 43 go 44 45 /*==============================================================*/ 46 /* Table: Product_dim */ 47 /*==============================================================*/ 48 create table Product_dim ( 49 Product_sk int not null, 50 Product_code int null, 51 Product_name varchar(30) null, 52 Product_category varchar(30) null, 53 effective_date datetime null, 54 expire_date datetime null, 55 constraint PK_PRODUCT_DIM primary key (Product_sk) 56 ) 57 go 58 59 /*==============================================================*/ 60 /* Table: Sales_order_fact */ 61 /*==============================================================*/ 62 create table Sales_order_fact ( 63 Order_sk int null, 64 Customer_sk int null, 65 Product_sk int null, 66 Date_sk int null, 67 ord_order_sk int null, 68 Order_date_sk int null, 69 Order_amount decimal(10,2) null 70 ) 71 go 72 73 /*==============================================================*/ 74 /* Table: date_dim */ 75 /*==============================================================*/ 76 create table date_dim ( 77 Date_sk int identity(1,1) not null, 78 date datetime null, 79 Month_name varchar(30) null, 80 Month int null, 81 quarter int null, 82 year int null, 83 effective_date datetime null, 84 expire_date datetime null, 85 constraint PK_DATE_DIM primary key (Date_sk) 86 ) 87 go 88 89 /*==============================================================*/ 90 /* Table: order_dim */ 91 /*==============================================================*/ 92 create table order_dim ( 93 order_sk int not null, 94 order_number int null, 95 effective_date datetime null, 96 expire_date datetime null, 97 constraint PK_ORDER_DIM primary key (order_sk) 98 ) 99 go 100 101 alter table Sales_order_fact 102 add constraint FK_SALES_OR_REFERENCE_CUSTOMER foreign key (Customer_sk) 103 references Customer_dim (Customer_sk) 104 go 105 106 alter table Sales_order_fact 107 add constraint FK_SALES_OR_REFERENCE_PRODUCT_ foreign key (Product_sk) 108 references Product_dim (Product_sk) 109 go 110 111 alter table Sales_order_fact 112 add constraint FK_SALES_OR_REFERENCE_DATE_DIM foreign key (Date_sk) 113 references date_dim (Date_sk) 114 go 115 116 alter table Sales_order_fact 117 add constraint FK_SALES_OR_REFERENCE_ORDER_DI foreign key (ord_order_sk) 118 references order_dim (order_sk) 119 go 120 121 ---建立源数据库表 122 use source; 123 /*==============================================================*/ 124 /* Table: Customer */ 125 /*==============================================================*/ 126 create table Customer ( 127 Customer_number int not null, 128 Customer_name varchar(50) null, 129 Customer_street_address varchar(50) null, 130 Customer_zip_code int null, 131 Customer_city varchar(30) null, 132 Customer_state varchar(2) null, 133 constraint PK_CUSTOMER primary key (Customer_number) 134 ) 135 go 136 137 /*==============================================================*/ 138 /* Table: Product */ 139 /*==============================================================*/ 140 create table Product ( 141 Product_code int not null, 142 Product_name varchar(30) null, 143 Product_category varchar(30) null, 144 constraint PK_PRODUCT primary key (Product_code) 145 ) 146 go 147 148 /*==============================================================*/ 149 /* Table: Sales_Order */ 150 /*==============================================================*/ 151 create table Sales_Order ( 152 Order_number int not null, 153 Customer_number int null, 154 Product_code int null, 155 Order_date datetime null, 156 Entry_date datetime null, 157 Order_mouunt int null, 158 constraint PK_SALES_ORDER primary key (Order_number) 159 ) 160 go 161 162 alter table Sales_Order 163 add constraint FK_SALES_OR_REFERENCE_CUSTOMER foreign key (Customer_number) 164 references Customer (Customer_number) 165 go 166 167 alter table Sales_Order 168 add constraint FK_SALES_OR_REFERENCE_PRODUCT foreign key (Product_code) 169 references Product (Product_code) 170 go 171 ---生成客户表数据 172 USE source; 173 174 -- 生成客户表测试数据 175 INSERT INTO customer(Customer_number,customer_name, customer_street_address,customer_zip_code,customer_city, customer_state) 176 VALUES(1,'Really Large Customers', '7500 Louise Dr.',17050, 'Mechanicsburg','PA'); 177 INSERT INTO customer(Customer_number,customer_name, customer_street_address,customer_zip_code,customer_city, customer_state) 178 VALUES(2,'Small Stores', '2500 Woodland St.',17055, 'Pittsburgh','PA'); 179 INSERT INTO customer(Customer_number,customer_name, customer_street_address,customer_zip_code,customer_city, customer_state) 180 VALUES(3,'Medium Retailers','1111 Ritter Rd.',17055,'Pittsburgh','PA'); 181 INSERT INTO customer(Customer_number,customer_name, customer_street_address,customer_zip_code,customer_city, customer_state) 182 VALUES(4,'Good Companies','9500 Scott St.',17050,'Mechanicsburg','PA'); 183 INSERT INTO customer(Customer_number,customer_name, customer_street_address,customer_zip_code,customer_city, customer_state) 184 VALUES(5,'Wonderful Shops','3333 Rossmoyne Rd.',17050,'Mechanicsburg','PA'); 185 INSERT INTO customer(Customer_number,customer_name, customer_street_address,customer_zip_code,customer_city, customer_state) 186 VALUES(6,'Loyal Clients','7070 Ritter Rd.',17055,'Pittsburgh','PA'); 187 INSERT INTO customer(Customer_number,customer_name, customer_street_address,customer_zip_code,customer_city, customer_state) 188 VALUES(7,'Distinguished Partners','9999 Scott St.',17050,'Mechanicsburg','PA'); 189 use source; 190 -- 生成产品表测试数据 191 INSERT INTO product(Product_code,product_name,product_category ) VALUES(1,'Hard Disk Drive', 'Storage'); 192 INSERT INTO product(Product_code,product_name,product_category ) VALUES (2,'Floppy Drive', 'Storage'); 193 INSERT INTO product(Product_code,product_name,product_category ) VALUES (3,'LCD Panel', 'Monitor'); 194 195 INSERT INTO sales_order VALUES 196 (1, 1, 1, '2013-02-01', '2013-02-01', 1000); 197 INSERT INTO sales_order VALUES (2, 2, 2, '2013-02-10', '2013-02-10', 1000); 198 INSERT INTO sales_order VALUES (3, 3, 3, '2013-03-01', '2013-03-01', 4000); 199 INSERT INTO sales_order VALUES (4, 4, 1, '2013-04-15', '2013-04-15', 4000) ; 200 INSERT INTO sales_order VALUES (5, 5, 2, '2013-05-20', '2013-05-20', 6000) ; 201 INSERT INTO sales_order VALUES (6, 6, 3, '2013-07-30', '2013-07-30', 6000) ; 202 INSERT INTO sales_order VALUES (7, 7, 1, '2013-09-01', '2013-09-01', 8000) ; 203 INSERT INTO sales_order VALUES (8, 1, 2, '2013-11-10', '2013-11-10', 8000) ; 204 INSERT INTO sales_order VALUES (9, 2, 3, '2014-01-05', '2014-01-05', 1000); 205 INSERT INTO sales_order VALUES (10, 3, 1, '2014-02-10', '2014-02-10', 1000) ; 206 INSERT INTO sales_order VALUES (11, 4, 2, '2014-03-15', '2014-03-15', 2000); 207 INSERT INTO sales_order VALUES (12, 5, 3, '2014-04-20', '2014-04-20', 2500) ; 208 INSERT INTO sales_order VALUES (13, 6, 1, '2014-05-30', '2014-05-30', 3000) ; 209 INSERT INTO sales_order VALUES (14, 7, 2, '2014-06-01', '2014-06-01', 3500) ; 210 INSERT INTO sales_order VALUES (15, 1, 3, '2014-07-15', '2014-07-15', 4000) ; 211 INSERT INTO sales_order VALUES (16, 2, 1, '2014-08-30', '2014-08-30', 4500) ; 212 INSERT INTO sales_order VALUES (17, 3, 2, '2014-09-05', '2014-09-05', 1000); 213 INSERT INTO sales_order VALUES (18, 4, 3, '2014-10-05', '2014-10-05', 1000) ; 214 INSERT INTO sales_order VALUES (19, 5, 1, '2015-01-10', '2015-01-10', 4000); 215 INSERT INTO sales_order VALUES (20, 6, 2, '2015-02-20', '2015-02-20', 4000); 216 INSERT INTO sales_order VALUES (21, 7, 3, '2015-02-28', '2015-02-28', 4000);
数据仓库架构当中,一般会有一个ODS层做数据缓冲,这里为了简单起见,将ODS层的事情放在DW库的STG表当中。
实验进行至此,需要生成日期维度数据。日期包含时间,由于数据仓库是用来存储历史数据以作分析,时间对于数据仓库非常重要,每一个数据仓库当中必然会有一个时间维度。
日期维度表生成数据脚本:
declare @start_date datetime declare @end_date datetime set @start_date=convert(datetime,'2001-01-01',120) set @end_date=convert(datetime,'2020-12-31',120) while @start_date<@end_date begin INSERT dbo.date_dim ( date ,Month_name ,Month ,quarter ,year ,effective_date ,expire_date) values ( @start_date ,datename(month,@start_date) ,month(@start_date) ,case when month(@start_date)<=3 then 1 when month(@start_date)<=6 then 2 when month(@start_date)<=9 then 3 else 4 end ,year(@start_date) ,convert(datetime,'1900-01-01',120) ,convert(datetime,'9999-12-31',120)); set @start_date= dateadd(day,1,@start_date); end;