数仓搭建之DWD层
4.1 DWD层启动表数据解析
4.1.1 创建启动表
1)建表语句
hive (gmall)>
drop table if exists dwd_start_log;
CREATE EXTERNAL TABLE dwd_start_log(
mid_id
string,
user_id
string,
version_code
string,
version_name
string,
lang
string,
source
string,
os
string,
area
string,
model
string,
brand
string,
sdk_version
string,
gmail
string,
height_width
string,
app_time
string,
network
string,
lng
string,
lat
string,
entry
string,
open_ad_type
string,
action
string,
loading_time
string,
detail
string,
extend1
string
)
PARTITIONED BY (dt string)
location ‘/warehouse/gmall/dwd/dwd_start_log/’;
4.1.2 向启动表导入数据
hive (gmall)>
insert overwrite table dwd_start_log
PARTITION (dt=‘2019-02-10’)
select
get_json_object(line,’
.
m
i
d
′
)
m
i
d
i
d
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.mid') mid_id, get_json_object(line,'
.mid′)midid,getjsonobject(line,′.uid’) user_id,
get_json_object(line,’
.
v
c
′
)
v
e
r
s
i
o
n
c
o
d
e
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.vc') version_code, get_json_object(line,'
.vc′)versioncode,getjsonobject(line,′.vn’) version_name,
get_json_object(line,’
.
l
′
)
l
a
n
g
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.l') lang, get_json_object(line,'
.l′)lang,getjsonobject(line,′.sr’) source,
get_json_object(line,’
.
o
s
′
)
o
s
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.os') os, get_json_object(line,'
.os′)os,getjsonobject(line,′.ar’) area,
get_json_object(line,’
.
m
d
′
)
m
o
d
e
l
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.md') model, get_json_object(line,'
.md′)model,getjsonobject(line,′.ba’) brand,
get_json_object(line,’
.
s
v
′
)
s
d
k
v
e
r
s
i
o
n
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.sv') sdk_version, get_json_object(line,'
.sv′)sdkversion,getjsonobject(line,′.g’) gmail,
get_json_object(line,’
.
h
w
′
)
h
e
i
g
h
t
w
i
d
t
h
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.hw') height_width, get_json_object(line,'
.hw′)heightwidth,getjsonobject(line,′.t’) app_time,
get_json_object(line,’
.
n
w
′
)
n
e
t
w
o
r
k
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.nw') network, get_json_object(line,'
.nw′)network,getjsonobject(line,′.ln’) lng,
get_json_object(line,’
.
l
a
′
)
l
a
t
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.la') lat, get_json_object(line,'
.la′)lat,getjsonobject(line,′.entry’) entry,
get_json_object(line,’
.
o
p
e
n
a
d
t
y
p
e
′
)
o
p
e
n
a
d
t
y
p
e
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.open_ad_type') open_ad_type, get_json_object(line,'
.openadtype′)openadtype,getjsonobject(line,′.action’) action,
get_json_object(line,’
.
l
o
a
d
i
n
g
t
i
m
e
′
)
l
o
a
d
i
n
g
t
i
m
e
,
g
e
t
j
s
o
n
o
b
j
e
c
t
(
l
i
n
e
,
′
.loading_time') loading_time, get_json_object(line,'
.loadingtime′)loadingtime,getjsonobject(line,′.detail’) detail,
get_json_object(line,’$.extend1’) extend1
from ods_start_log
where dt=‘2019-02-10’;
3)测试
hive (gmall)> select * from dwd_start_log limit 2;
4.1.3 DWD层启动表加载数据脚本
1)在hadoop102的/home/atguigu/bin目录下创建脚本
[atguigu@hadoop102 bin]$ vim dwd_start_log.sh
在脚本中编写如下内容
#!/bin/bash
定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive
如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
h
i
v
e
−
e
"
hive -e "
hive−e"sql"
2)增加脚本执行权限
[atguigu@hadoop102 bin]$ chmod 777 dwd_start_log.sh
3)脚本使用
[atguigu@hadoop102 module]$ dwd_start_log.sh 2019-02-11
4)查询导入结果
hive (gmall)>
select * from dwd_start_log where dt=‘2019-02-11’ limit 2;
5)脚本执行时间
企业开发中一般在每日凌晨30分~1点