ods-data-import.sql
hdfs dfs -mkdir -p /weblog/preprocessed
hdfs dfs -mkdir -p /weblog/clickstream/pageviews
hdfs dfs -mkdir -p /weblog/clickstream/visits
hdfs dfs -mkdir -p /weblog/dim_time
hdfs dfs -put part-m-00000 /weblog/preprocessed
hdfs dfs -put part-r-00000 /weblog/clickstream/pageviews
hdfs dfs -put part-r-00000 /weblog/clickstream/visits
hdfs dfs -put dim_time_dat.txt /weblog/dim_time
set hive.exec.mode.local.auto=true;
---------------------
导入清洗结果数据到贴源数据表ods_weblog_origin
load data inpath '/weblog/preprocessed/' overwrite into table ods_weblog_origin partition(datestr='20181101');
show partitions ods_weblog_origin;
select count(*) from ods_weblog_origin;
---------------------------------------------------------------------------
导入点击流模型pageviews数据到ods_click_pageviews表
load data local inpath '/weblog/clickstream/pageviews' overwrite into table ods_click_pageviews partition(datestr='20181101');
select count(*) from ods_click_pageviews;
-----------------------------------------------------------------------------
导入点击流模型visit数据到ods_click_stream_visit表
load data inpath '/weblog/clickstream/visits' overwrite into table ods_click_stream_visit partition(datestr='20181101');
----------------------------------------------------------------------------------------------------------------------
时间维度表数据导入
参考数据《dim_time_dat.txt》
load data inpath '/weblog/dim_time' overwrite into table t_dim_time;
load data local inpath '/root/hivedata/part-r-00000' into table ods_click_pageviews partition(datestr='20181101');
load data local inpath '/root/hivedata/part-r-00000' into table ods_click_stream_visit partition(datestr='20181101');
load data local inpath '/root/hivedata/dim_time.dat' into table t_dim_time;
创建表:
原始数据表:对应mr清洗完之后的数据,而不是原始日志数据
drop table if exists ods_weblog_origin;
create table ods_weblog_origin(
valid string,
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
http_referer string,
http_user_agent string)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';
---------------------------------------------------
点击流pageview表
drop table if exists ods_click_pageviews;
create table ods_click_pageviews(
session string,
remote_addr string,
remote_user string,
time_local string,
request string,
visit_step string,
page_staylong string,
http_referer string,
http_user_agent string,
body_bytes_sent string,
status string)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';
-----------------------------------------------
点击流visit表
drop table if exists ods_click_stream_visit;
create table ods_click_stream_visit(
session string,
remote_addr string,
inTime string,
outTime string,
inPage string,
outPage string,
referal string,
pageVisits int)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';
-------------------------------------------
维度表示例:
drop table if exists t_dim_time;
create table t_dim_time(date_key int,year string,month string,day string,hour string) row format delimited fields terminated by ',';
操作表:
建表——明细宽表 dw_weblog_detail
drop table dw_weblog_detail;
create table dw_weblog_detail(
valid string, --有效标识
remote_addr string, --来源IP
remote_user string, --用户标识
time_local string, --访问完整时间
daystr string, --访问日期
timestr string, --访问时间
month string, --访问月
day string, --访问日
hour string, --访问时
request string, --请求的url
status string, --响应码
body_bytes_sent string, --传输字节数
http_referer string, --来源url
ref_host string, --来源的host
ref_path string, --来源的路径
ref_query string, --来源参数query
ref_query_id string, --来源参数query的值
http_user_agent string --客户终端标识
)
partitioned by(datestr string);
-------------------------------------------------------------------------------
通过查询插入数据到明细宽表 dw_weblog_detail中
分步:
--抽取refer_url到中间表 t_ods_tmp_referurl
--也就是将来访url分离出host path query query id
drop table if exists t_ods_tmp_referurl;
create table t_ods_tmp_referurl as
SELECT a.*,b.*
FROM ods_weblog_origin a
LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as host, path, query, query_id;
--抽取转换time_local字段到中间表明细表 t_ods_tmp_detail
drop table if exists t_ods_tmp_detail;
create table t_ods_tmp_detail as
select b.*,substring(time_local,0,10) as daystr,
substring(time_local,12) as tmstr,
substring(time_local,6,2) as month,
substring(time_local,9,2) as day,
substring(time_local,12,2) as hour
From t_ods_tmp_referurl b;
以上语句可以改写成:
insert into table dw_weblog_detail partition(datestr='20181101')
select c.valid,c.remote_addr,c.remote_user,c.time_local,
substring(c.time_local,0,10) as daystr,
substring(c.time_local,12) as tmstr,
substring(c.time_local,6,2) as month,
substring(c.time_local,9,2) as day,
substring(c.time_local,12,2) as hour,
c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
from
(SELECT
a.valid,a.remote_addr,a.remote_user,a.time_local,
a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id
FROM ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as ref_host, ref_path, ref_query, ref_query_id) c;
show partitions dw_weblog_detail;
我的操作版:
hive启动:
nohup /export/servers/hive/bin/hive --service metastore &
nohup /export/servers/hive/bin/hive --service hiveserver2 &
! connect jdbc:hive2://node-1:10000
load data local inpath '/root/hivedata/part-m-00000' into table ods_weblog_origin partition(datestr="20181101");
select * from ods_weblog_origin;
| ods_weblog_origin.valid | ods_weblog_origin.remote_addr | ods_weblog_origin.remote_user | ods_weblog_origin.time_local | ods_weblog_origin.request | ods_weblog_origin.status | ods_weblog_origin.body_bytes_sent | ods_weblog_origin.http_referer | ods_weblog_origin.http_user_agent | ods_weblog_origin.datestr |
+--------------------------+--------------------------------+--------------------------------+-------------------------------+-----------------------------------------------+---------------------------+------------------------------------+---------------------------------+------------------------------------+----------------------------+--+
| false | 194.237.142.21 | - | 2018-11-01 06:49:18 | /wp-content/uploads/2013/07/rstudio-git3.png | 304 | 0 | "-" | "Mozilla/4.0(compatible;)" | 20181101 |
| false | 163.177.71.12 | - | 2018-11-01 06:49:33 | / | 200 | 20 | "-" | "DNSPod-Monitor/1.0" | 20181101 |
| false | 163.177.71.12 | - | 2018-11-01 06:49:36 | / | 200 | 20 | "-" | "DNSPod-Monitor/1.0" | 20181101 |
| false | 101.226.68.137 | - | 2018-11-01 06:49:42 | / | 200 | 20 | "-" | "DNSPod-Monitor/1.0" | 20181101 |
| false | 101.226.68.137 | - | 2018-11-01 06:49:45 | / | 200 | 20 | "-" | "DNSPod-Monitor/1.0" | 20181101 |
+--------------------------+--------------------------------+--------------------------------+-------------------------------+-----------------------------------------------+---------------------------+------------------------------------+---------------------------------+------------------------------------+----------------------------+--+
load data local inpath '/root/hivedata/part-r-00000' into table ods_click_pageviews partition(datestr="20181101");
| ods_click_pageviews.session | ods_click_pageviews.remote_addr | ods_click_pageviews.remote_user | ods_click_pageviews.time_local | ods_click_pageviews.request | ods_click_pageviews.visit_step | ods_click_pageviews.page_staylong | ods_click_pageviews.http_referer | ods_click_pageviews.http_user_agent | ods_click_pageviews.body_bytes_sent | ods_click_pageviews.status | ods_click_pageviews.datestr |
+---------------------------------------+----------------------------------+----------------------------------+---------------------------------+------------------------------+---------------------------------+------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------------------------+-----------------------------+------------------------------+--+
| 7e81b5a1-8b54-437a-ab29-51b215b14fa9 | 1.80.249.223 | - | 2018-11-01 07:57:33 | /hadoop-hive-intro/ | 1 | 60 | "http://www.google.com.hk/url?sa=t&rct=j&q=hive%E7%9A%84%E5%AE%89%E8%A3%85&source=web&cd=2&ved=0CC4QFjAB&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%68%61%64%6f%6f%70%2d%68%69%76%65%2d%69%6e%74%72%6f%2f&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt" | "Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0" | 14764 | 200 | 20181101 |
| 5ec73e05-f291-4996-b2d1-13f15cd8331c | 101.226.167.201 | - | 2018-11-01 09:30:36 | /hadoop-mahout-roadmap/ | 1 | 60 | "http://blog.fens.me/hadoop-mahout-roadmap/" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 10335 | 200 | 20181101 |
| fb2c9ed1-09c0-442b-8903-5132b46250ba | 101.226.167.205 | - | 2018-11-01 09:30:32 | /hadoop-family-roadmap/ | 1 | 60 | "http://blog.fens.me/hadoop-family-roadmap/" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 11715 | 200 | 20181101 |
| 6cf55c75-47b8-425a-83d1-8efc8c94e518 | 101.226.169.215 | - | 2018-11-01 10:07:31 | /about | 1 | 60 | "http://blog.fens.me/about" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 5 | 301 | 20181101 |
| 182549fa-aca2-49ea-88ce-6aa238bbf1dd | 110.211.10.14 | - | 2018-11-01 13:31:10 | /hadoop-mahout-roadmap/ | 1 | 60 | "http://f.dataguru.cn/forum.php?mod=viewthread&tid=175501" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;WOW64;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MALN;InfoPath.2;.NET4.0C;MediaCenterPC6.0)" | 10335 | 200 | 20181101 |
+---------------------------------------+----------------------------------+----------------------------------+---------------------------------+------------------------------+---------------------------------+------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------------------------+-----------------------------+------------------------------+--+
load data local inpath '/root/hivedata/part-r-00000' into table ods_click_stream_visit partition(datestr="20181101");
| ods_click_stream_visit.session | ods_click_stream_visit.remote_addr | ods_click_stream_visit.intime | ods_click_stream_visit.outtime | ods_click_stream_visit.inpage | ods_click_stream_visit.outpage | ods_click_stream_visit.referal | ods_click_stream_visit.pagevisits | ods_click_stream_visit.datestr |
+---------------------------------------+-------------------------------------+--------------------------------+---------------------------------+--------------------------------+---------------------------------+------------------------------------------------+------------------------------------+---------------------------------+--+
| 005fd135-d9ec-4ac2-ba16-706004359d42 | 195.110.138.105 | 2018-11-01 19:10:52 | 2018-11-01 19:10:52 | /cassandra-clustor/ | /cassandra-clustor/ | "https://www.google.com.tw/" | 1 | 20181101 |
| 00a9e4d8-dfb2-4d32-b180-eb2383112317 | 66.249.66.84 | 2018-11-01 07:44:41 | 2018-11-01 07:44:41 | /finance-rhive-repurchase/ | /finance-rhive-repurchase/ | "-" | 1 | 20181101 |
| 0173589c-2fbb-4d60-ad65-44d7468ec0ab | 159.226.251.170 | 2018-11-01 08:26:18 | 2018-11-01 08:26:18 | /hadoop-mahout-roadmap/ | /hadoop-mahout-roadmap/ | "http://f.dataguru.cn/thread-177375-1-1.html" | 1 | 20181101 |
| 065d5e7a-d7fe-45a3-b7b4-a5f053ecc4ab | 221.179.193.202 | 2018-11-02 00:44:07 | 2018-11-02 00:44:07 | /hadoop-mahout-roadmap/ | /hadoop-mahout-roadmap/ | "-" | 1 | 20181101 |
| 14c9ea27-dd75-476e-bea4-7f01d9ee4953 | 218.18.232.228 | 2018-11-01 09:53:55 | 2018-11-01 09:53:55 | /hadoop-family-roadmap/ | /hadoop-family-roadmap/ | "https://www.google.com.hk/" | 1 | 20181101 |
+---------------------------------------+-------------------------------------+--------------------------------+---------------------------------+--------------------------------+---------------------------------+------------------------------------------------+------------------------------------+---------------------------------+--+
load data local inpath '/root/hivedata/dim_time.dat' into table t_dim_time;
+----------------------+------------------+-------------------+-----------------+------------------+--+
| t_dim_time.date_key | t_dim_time.year | t_dim_time.month | t_dim_time.day | t_dim_time.hour |
+----------------------+------------------+-------------------+-----------------+------------------+--+
| 1 | 2018 | 10 | 31 | 22 |
| 2 | 2018 | 10 | 31 | 23 |
| 3 | 2018 | 11 | 01 | 00 |
| 4 | 2018 | 11 | 01 | 01 |
| 5 | 2018 | 11 | 01 | 02 |
+----------------------+------------------+-------------------+-----------------+------------------+--+
5 rows selected (0.129 seconds)
#建议开启自动本地化执行模式 便于后续分析的方便
set hive.exec.mode.local.auto=true;
----------------------------------------------------------------------------------------------------------------
#kuangbiao
# ods_weblog_origin.time_local
# 2018-11-01 06:49:18
select substring(time_local,12,2) from ods_weblog_origin limit 1;
+------+--+
| _c0 |
+------+--+
| 06 |
+------+--+
year month day:time_local
host path query : http_referer
--------------------------------------------------------------------------------------------------------------------
#创建宽表:
#建表——明细宽表 dw_weblog_detail
drop table dw_weblog_detail;
create table dw_weblog_detail(
valid string, --有效标识
remote_addr string, --来源IP
remote_user string, --用户标识
time_local string, --访问完整时间
daystr string, --访问日期
timestr string, --访问时间
month string, --访问月
day string, --访问日
hour string, --访问时
request string, --请求的url
status string, --响应码
body_bytes_sent string, --传输字节数
http_referer string, --来源url
ref_host string, --来源的host
ref_path string, --来源的路径
ref_query string, --来源参数query
ref_query_id string, --来源参数query的值
http_user_agent string --客户终端标识
)
partitioned by(datestr string);
感觉这个有点问题
insert
into table dw_weblog_detail partition(datestr='20181101')
select
c.valid,c.remote_addr,c.remote_user,c.time_local,
substring(c.time_local,0,10) as datestr,
substring(c.time_local,12) as tmstr,
substring(c.time_local,6,2) as month,
substring(c.time_local,9,2) as day,
substring(c.time_local,12,2) as hour,
c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
(select
a.valid,a.remote_addr,a.remote_user,a.time_local,
a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id
from ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer,"\"", ""),'HOST','QUERY','QUERY:id') b as ref_host, ref_path,ref_query,ref_query_id) c; (me)
######################################################################################
insert into table dw_weblog_detail partition(datestr='20181101')
select c.valid,c.remote_addr,c.remote_user,c.time_local,
substring(c.time_local,0,10) as daystr,
substring(c.time_local,12) as tmstr,
substring(c.time_local,6,2) as month,
substring(c.time_local,9,2) as day,
substring(c.time_local,12,2) as hour,
c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
from
(SELECT
a.valid,a.remote_addr,a.remote_user,a.time_local,
a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id
FROM ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as ref_host, ref_path, ref_query, ref_query_id) c;
select * from dw_weblog_detail limit 5;
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+--------------------------------+----------------------------+----------------------------+-----------------------------+--------------------------------+-----------------------------------+---------------------------+--+
| dw_weblog_detail.valid | dw_weblog_detail.remote_addr | dw_weblog_detail.remote_user | dw_weblog_detail.time_local | dw_weblog_detail.daystr | dw_weblog_detail.timestr | dw_weblog_detail.month | dw_weblog_detail.day | dw_weblog_detail.hour | dw_weblog_detail.request | dw_weblog_detail.status | dw_weblog_detail.body_bytes_sent | dw_weblog_detail.http_referer | dw_weblog_detail.ref_host | dw_weblog_detail.ref_path | dw_weblog_detail.ref_query | dw_weblog_detail.ref_query_id | dw_weblog_detail.http_user_agent | dw_weblog_detail.datestr |
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+--------------------------------+----------------------------+----------------------------+-----------------------------+--------------------------------+-----------------------------------+---------------------------+--+
| false | 194.237.142.21 | - | 2018-11-01 06:49:18 | 2018-11-01 | 06:49:18 | 11 | 01 | 06 | /wp-content/uploads/2013/07/rstudio-git3.png | 304 | 0 | "-" | NULL | NULL | NULL | NULL | "Mozilla/4.0(compatible;)" | 20181101 |
| false | 163.177.71.12 | - | 2018-11-01 06:49:33 | 2018-11-01 | 06:49:33 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 163.177.71.12 | - | 2018-11-01 06:49:36 | 2018-11-01 | 06:49:36 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 101.226.68.137 | - | 2018-11-01 06:49:42 | 2018-11-01 | 06:49:42 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
| false | 101.226.68.137 | - | 2018-11-01 06:49:45 | 2018-11-01 | 06:49:45 | 11 | 01 | 06 | / | 200 | 20 | "-" | NULL | NULL | NULL | NULL | "DNSPod-Monitor/1.0" | 20181101 |
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+--------------------------------+----------------------------+----------------------------+-----------------------------+--------------------------------+-----------------------------------+---------------------------+--+
---------------------------------------------------------------------------------------------------
show partitions dw_weblog_detail;
+-------------------+--+
| partition |
+-------------------+--+
| datestr=20181101 |
+-------------------+--+
-----------------------------------------------------------------------------------------------
北京多少人?
----------------
基础指标分析: