数据入库----step3(我的草稿箱)

ods-data-import.sql

hdfs dfs -mkdir -p /weblog/preprocessed
hdfs dfs -mkdir -p /weblog/clickstream/pageviews
hdfs dfs -mkdir -p /weblog/clickstream/visits
hdfs dfs -mkdir -p /weblog/dim_time

hdfs dfs -put part-m-00000 /weblog/preprocessed
hdfs dfs -put part-r-00000 /weblog/clickstream/pageviews
hdfs dfs -put part-r-00000 /weblog/clickstream/visits
hdfs dfs -put dim_time_dat.txt /weblog/dim_time

set hive.exec.mode.local.auto=true;

---------------------

导入清洗结果数据到贴源数据表ods_weblog_origin

load data inpath '/weblog/preprocessed/' overwrite into table ods_weblog_origin partition(datestr='20181101');
show partitions ods_weblog_origin;
select count(*) from ods_weblog_origin;
---------------------------------------------------------------------------
导入点击流模型pageviews数据到ods_click_pageviews表

load data local inpath '/weblog/clickstream/pageviews' overwrite into table ods_click_pageviews partition(datestr='20181101');
select count(*) from ods_click_pageviews;
-----------------------------------------------------------------------------
导入点击流模型visit数据到ods_click_stream_visit表

load data inpath '/weblog/clickstream/visits' overwrite into table ods_click_stream_visit partition(datestr='20181101');

----------------------------------------------------------------------------------------------------------------------
时间维度表数据导入
参考数据《dim_time_dat.txt》
load data inpath '/weblog/dim_time' overwrite into table t_dim_time;

load data local inpath '/root/hivedata/part-r-00000' into table ods_click_pageviews partition(datestr='20181101');
load data local inpath '/root/hivedata/part-r-00000' into table ods_click_stream_visit partition(datestr='20181101');
load data local inpath '/root/hivedata/dim_time.dat' into table t_dim_time;

创建表:

原始数据表:对应mr清洗完之后的数据,而不是原始日志数据

drop table if exists ods_weblog_origin;
create table ods_weblog_origin(
valid string,
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
http_referer string,
http_user_agent string)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';

---------------------------------------------------
点击流pageview表

drop table if exists ods_click_pageviews;
create table ods_click_pageviews(
session string,
remote_addr string,
remote_user string,
time_local string,
request string,
visit_step string,
page_staylong string,
http_referer string,
http_user_agent string,
body_bytes_sent string,
status string)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';

-----------------------------------------------
点击流visit表

drop table if exists ods_click_stream_visit;
create table ods_click_stream_visit(
session     string,
remote_addr string,
inTime      string,
outTime     string,
inPage      string,
outPage     string,
referal     string,
pageVisits  int)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';

-------------------------------------------
维度表示例:
drop table if exists t_dim_time;
create table t_dim_time(date_key int,year string,month string,day string,hour string) row format delimited fields terminated by ',';



 

操作表:

建表——明细宽表 dw_weblog_detail

drop table dw_weblog_detail;
create table dw_weblog_detail(
valid           string, --有效标识
remote_addr     string, --来源IP
remote_user     string, --用户标识
time_local      string, --访问完整时间
daystr          string, --访问日期
timestr         string, --访问时间
month           string, --访问月
day             string, --访问日
hour            string, --访问时
request         string, --请求的url
status          string, --响应码
body_bytes_sent string, --传输字节数
http_referer    string, --来源url
ref_host        string, --来源的host
ref_path        string, --来源的路径
ref_query       string, --来源参数query
ref_query_id    string, --来源参数query的值
http_user_agent string --客户终端标识
)
partitioned by(datestr string);
-------------------------------------------------------------------------------
通过查询插入数据到明细宽表  dw_weblog_detail中


分步:
--抽取refer_url到中间表  t_ods_tmp_referurl 
--也就是将来访url分离出host  path  query  query id

drop table if exists t_ods_tmp_referurl;
create table t_ods_tmp_referurl as
SELECT a.*,b.*
FROM ods_weblog_origin a 
LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as host, path, query, query_id; 

--抽取转换time_local字段到中间表明细表 t_ods_tmp_detail

drop table if exists t_ods_tmp_detail;
create table t_ods_tmp_detail as 
select b.*,substring(time_local,0,10) as daystr,
substring(time_local,12) as tmstr,
substring(time_local,6,2) as month,
substring(time_local,9,2) as day,
substring(time_local,12,2) as hour
From t_ods_tmp_referurl b;

以上语句可以改写成:
insert into table dw_weblog_detail partition(datestr='20181101')
select c.valid,c.remote_addr,c.remote_user,c.time_local,
substring(c.time_local,0,10) as daystr,
substring(c.time_local,12) as tmstr,
substring(c.time_local,6,2) as month,
substring(c.time_local,9,2) as day,
substring(c.time_local,12,2) as hour,
c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
from
(SELECT 
a.valid,a.remote_addr,a.remote_user,a.time_local,
a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id 
FROM ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as ref_host, ref_path, ref_query, ref_query_id) c;


show partitions dw_weblog_detail;




 

我的操作版:

hive启动:

nohup /export/servers/hive/bin/hive --service metastore &
nohup /export/servers/hive/bin/hive --service hiveserver2 &

! connect jdbc:hive2://node-1:10000



load data local inpath '/root/hivedata/part-m-00000' into table ods_weblog_origin partition(datestr="20181101");

select * from  ods_weblog_origin;
| ods_weblog_origin.valid  | ods_weblog_origin.remote_addr  | ods_weblog_origin.remote_user  | ods_weblog_origin.time_local  |           ods_weblog_origin.request           | ods_weblog_origin.status  | ods_weblog_origin.body_bytes_sent  | ods_weblog_origin.http_referer  | ods_weblog_origin.http_user_agent  | ods_weblog_origin.datestr  |
+--------------------------+--------------------------------+--------------------------------+-------------------------------+-----------------------------------------------+---------------------------+------------------------------------+---------------------------------+------------------------------------+----------------------------+--+
| false                    | 194.237.142.21                 | -                              | 2018-11-01 06:49:18           | /wp-content/uploads/2013/07/rstudio-git3.png  | 304                       | 0                                  | "-"                             | "Mozilla/4.0(compatible;)"         | 20181101                   |
| false                    | 163.177.71.12                  | -                              | 2018-11-01 06:49:33           | /                                             | 200                       | 20                                 | "-"                             | "DNSPod-Monitor/1.0"               | 20181101                   |
| false                    | 163.177.71.12                  | -                              | 2018-11-01 06:49:36           | /                                             | 200                       | 20                                 | "-"                             | "DNSPod-Monitor/1.0"               | 20181101                   |
| false                    | 101.226.68.137                 | -                              | 2018-11-01 06:49:42           | /                                             | 200                       | 20                                 | "-"                             | "DNSPod-Monitor/1.0"               | 20181101                   |
| false                    | 101.226.68.137                 | -                              | 2018-11-01 06:49:45           | /                                             | 200                       | 20                                 | "-"                             | "DNSPod-Monitor/1.0"               | 20181101                   |
+--------------------------+--------------------------------+--------------------------------+-------------------------------+-----------------------------------------------+---------------------------+------------------------------------+---------------------------------+------------------------------------+----------------------------+--+




load data local inpath '/root/hivedata/part-r-00000' into table ods_click_pageviews partition(datestr="20181101");
|      ods_click_pageviews.session      | ods_click_pageviews.remote_addr  | ods_click_pageviews.remote_user  | ods_click_pageviews.time_local  | ods_click_pageviews.request  | ods_click_pageviews.visit_step  | ods_click_pageviews.page_staylong  |          ods_click_pageviews.http_referer          |        ods_click_pageviews.http_user_agent         | ods_click_pageviews.body_bytes_sent  | ods_click_pageviews.status  | ods_click_pageviews.datestr  |
+---------------------------------------+----------------------------------+----------------------------------+---------------------------------+------------------------------+---------------------------------+------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------------------------+-----------------------------+------------------------------+--+
| 7e81b5a1-8b54-437a-ab29-51b215b14fa9  | 1.80.249.223                     | -                                | 2018-11-01 07:57:33             | /hadoop-hive-intro/          | 1                               | 60                                 | "http://www.google.com.hk/url?sa=t&rct=j&q=hive%E7%9A%84%E5%AE%89%E8%A3%85&source=web&cd=2&ved=0CC4QFjAB&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%68%61%64%6f%6f%70%2d%68%69%76%65%2d%69%6e%74%72%6f%2f&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt" | "Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0" | 14764                                | 200                         | 20181101                     |
| 5ec73e05-f291-4996-b2d1-13f15cd8331c  | 101.226.167.201                  | -                                | 2018-11-01 09:30:36             | /hadoop-mahout-roadmap/      | 1                               | 60                                 | "http://blog.fens.me/hadoop-mahout-roadmap/"       | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 10335                                | 200                         | 20181101                     |
| fb2c9ed1-09c0-442b-8903-5132b46250ba  | 101.226.167.205                  | -                                | 2018-11-01 09:30:32             | /hadoop-family-roadmap/      | 1                               | 60                                 | "http://blog.fens.me/hadoop-family-roadmap/"       | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 11715                                | 200                         | 20181101                     |
| 6cf55c75-47b8-425a-83d1-8efc8c94e518  | 101.226.169.215                  | -                                | 2018-11-01 10:07:31             | /about                       | 1                               | 60                                 | "http://blog.fens.me/about"                        | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MediaCenterPC6.0;MDDR;.NET4.0C;.NET4.0E;.NETCLR1.1.4322;TabletPC2.0);360Spider" | 5                                    | 301                         | 20181101                     |
| 182549fa-aca2-49ea-88ce-6aa238bbf1dd  | 110.211.10.14                    | -                                | 2018-11-01 13:31:10             | /hadoop-mahout-roadmap/      | 1                               | 60                                 | "http://f.dataguru.cn/forum.php?mod=viewthread&tid=175501" | "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;WOW64;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;MALN;InfoPath.2;.NET4.0C;MediaCenterPC6.0)" | 10335                                | 200                         | 20181101                     |
+---------------------------------------+----------------------------------+----------------------------------+---------------------------------+------------------------------+---------------------------------+------------------------------------+----------------------------------------------------+----------------------------------------------------+--------------------------------------+-----------------------------+------------------------------+--+


load data local inpath '/root/hivedata/part-r-00000' into table ods_click_stream_visit partition(datestr="20181101");

|    ods_click_stream_visit.session     | ods_click_stream_visit.remote_addr  | ods_click_stream_visit.intime  | ods_click_stream_visit.outtime  | ods_click_stream_visit.inpage  | ods_click_stream_visit.outpage  |         ods_click_stream_visit.referal         | ods_click_stream_visit.pagevisits  | ods_click_stream_visit.datestr  |
+---------------------------------------+-------------------------------------+--------------------------------+---------------------------------+--------------------------------+---------------------------------+------------------------------------------------+------------------------------------+---------------------------------+--+
| 005fd135-d9ec-4ac2-ba16-706004359d42  | 195.110.138.105                     | 2018-11-01 19:10:52            | 2018-11-01 19:10:52             | /cassandra-clustor/            | /cassandra-clustor/             | "https://www.google.com.tw/"                   | 1                                  | 20181101                        |
| 00a9e4d8-dfb2-4d32-b180-eb2383112317  | 66.249.66.84                        | 2018-11-01 07:44:41            | 2018-11-01 07:44:41             | /finance-rhive-repurchase/     | /finance-rhive-repurchase/      | "-"                                            | 1                                  | 20181101                        |
| 0173589c-2fbb-4d60-ad65-44d7468ec0ab  | 159.226.251.170                     | 2018-11-01 08:26:18            | 2018-11-01 08:26:18             | /hadoop-mahout-roadmap/        | /hadoop-mahout-roadmap/         | "http://f.dataguru.cn/thread-177375-1-1.html"  | 1                                  | 20181101                        |
| 065d5e7a-d7fe-45a3-b7b4-a5f053ecc4ab  | 221.179.193.202                     | 2018-11-02 00:44:07            | 2018-11-02 00:44:07             | /hadoop-mahout-roadmap/        | /hadoop-mahout-roadmap/         | "-"                                            | 1                                  | 20181101                        |
| 14c9ea27-dd75-476e-bea4-7f01d9ee4953  | 218.18.232.228                      | 2018-11-01 09:53:55            | 2018-11-01 09:53:55             | /hadoop-family-roadmap/        | /hadoop-family-roadmap/         | "https://www.google.com.hk/"                   | 1                                  | 20181101                        |
+---------------------------------------+-------------------------------------+--------------------------------+---------------------------------+--------------------------------+---------------------------------+------------------------------------------------+------------------------------------+---------------------------------+--+

load data local inpath '/root/hivedata/dim_time.dat' into table t_dim_time;
+----------------------+------------------+-------------------+-----------------+------------------+--+
| t_dim_time.date_key  | t_dim_time.year  | t_dim_time.month  | t_dim_time.day  | t_dim_time.hour  |
+----------------------+------------------+-------------------+-----------------+------------------+--+
| 1                    | 2018             | 10                | 31              | 22               |
| 2                    | 2018             | 10                | 31              | 23               |
| 3                    | 2018             | 11                | 01              | 00               |
| 4                    | 2018             | 11                | 01              | 01               |
| 5                    | 2018             | 11                | 01              | 02               |
+----------------------+------------------+-------------------+-----------------+------------------+--+
5 rows selected (0.129 seconds)


#建议开启自动本地化执行模式  便于后续分析的方便
set hive.exec.mode.local.auto=true;
----------------------------------------------------------------------------------------------------------------

#kuangbiao

# ods_weblog_origin.time_local 
#  2018-11-01 06:49:18            
select substring(time_local,12,2) from ods_weblog_origin limit 1;
+------+--+
| _c0  |
+------+--+
| 06   |
+------+--+

year month day:time_local 

host path query : http_referer 
--------------------------------------------------------------------------------------------------------------------
#创建宽表:
#建表——明细宽表 dw_weblog_detail
drop table dw_weblog_detail;
create table dw_weblog_detail(
valid           string, --有效标识
remote_addr     string, --来源IP
remote_user     string, --用户标识
time_local      string, --访问完整时间
daystr          string, --访问日期
timestr         string, --访问时间
month           string, --访问月
day             string, --访问日
hour            string, --访问时
request         string, --请求的url
status          string, --响应码
body_bytes_sent string, --传输字节数
http_referer    string, --来源url
ref_host        string, --来源的host
ref_path        string, --来源的路径
ref_query       string, --来源参数query
ref_query_id    string, --来源参数query的值
http_user_agent string --客户终端标识
)
partitioned by(datestr string);


感觉这个有点问题

insert                  
	into table dw_weblog_detail partition(datestr='20181101')
select 
	c.valid,c.remote_addr,c.remote_user,c.time_local,
	substring(c.time_local,0,10) as datestr,
	substring(c.time_local,12) as tmstr,
	substring(c.time_local,6,2) as month,
	substring(c.time_local,9,2) as day,
	substring(c.time_local,12,2) as hour,
	c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
(select  
		a.valid,a.remote_addr,a.remote_user,a.time_local,
          a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id 
 from ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer,"\"", ""),'HOST','QUERY','QUERY:id') b as ref_host, ref_path,ref_query,ref_query_id) c; (me)
 ######################################################################################
 insert into table dw_weblog_detail partition(datestr='20181101')
select c.valid,c.remote_addr,c.remote_user,c.time_local,
substring(c.time_local,0,10) as daystr,
substring(c.time_local,12) as tmstr,
substring(c.time_local,6,2) as month,
substring(c.time_local,9,2) as day,
substring(c.time_local,12,2) as hour,
c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
from
(SELECT 
a.valid,a.remote_addr,a.remote_user,a.time_local,
a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id 
FROM ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as ref_host, ref_path, ref_query, ref_query_id) c;
 
 select * from dw_weblog_detail limit 5;

+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+--------------------------------+----------------------------+----------------------------+-----------------------------+--------------------------------+-----------------------------------+---------------------------+--+
| dw_weblog_detail.valid  | dw_weblog_detail.remote_addr  | dw_weblog_detail.remote_user  | dw_weblog_detail.time_local  | dw_weblog_detail.daystr  | dw_weblog_detail.timestr  | dw_weblog_detail.month  | dw_weblog_detail.day  | dw_weblog_detail.hour  |           dw_weblog_detail.request            | dw_weblog_detail.status  | dw_weblog_detail.body_bytes_sent  | dw_weblog_detail.http_referer  | dw_weblog_detail.ref_host  | dw_weblog_detail.ref_path  | dw_weblog_detail.ref_query  | dw_weblog_detail.ref_query_id  | dw_weblog_detail.http_user_agent  | dw_weblog_detail.datestr  |
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+--------------------------------+----------------------------+----------------------------+-----------------------------+--------------------------------+-----------------------------------+---------------------------+--+
| false                   | 194.237.142.21                | -                             | 2018-11-01 06:49:18          | 2018-11-01               | 06:49:18                  | 11                      | 01                    | 06                     | /wp-content/uploads/2013/07/rstudio-git3.png  | 304                      | 0                                 | "-"                            | NULL                       | NULL                       | NULL                        | NULL                           | "Mozilla/4.0(compatible;)"        | 20181101                  |
| false                   | 163.177.71.12                 | -                             | 2018-11-01 06:49:33          | 2018-11-01               | 06:49:33                  | 11                      | 01                    | 06                     | /                                             | 200                      | 20                                | "-"                            | NULL                       | NULL                       | NULL                        | NULL                           | "DNSPod-Monitor/1.0"              | 20181101                  |
| false                   | 163.177.71.12                 | -                             | 2018-11-01 06:49:36          | 2018-11-01               | 06:49:36                  | 11                      | 01                    | 06                     | /                                             | 200                      | 20                                | "-"                            | NULL                       | NULL                       | NULL                        | NULL                           | "DNSPod-Monitor/1.0"              | 20181101                  |
| false                   | 101.226.68.137                | -                             | 2018-11-01 06:49:42          | 2018-11-01               | 06:49:42                  | 11                      | 01                    | 06                     | /                                             | 200                      | 20                                | "-"                            | NULL                       | NULL                       | NULL                        | NULL                           | "DNSPod-Monitor/1.0"              | 20181101                  |
| false                   | 101.226.68.137                | -                             | 2018-11-01 06:49:45          | 2018-11-01               | 06:49:45                  | 11                      | 01                    | 06                     | /                                             | 200                      | 20                                | "-"                            | NULL                       | NULL                       | NULL                        | NULL                           | "DNSPod-Monitor/1.0"              | 20181101                  |
+-------------------------+-------------------------------+-------------------------------+------------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+-----------------------------------------------+--------------------------+-----------------------------------+--------------------------------+----------------------------+----------------------------+-----------------------------+--------------------------------+-----------------------------------+---------------------------+--+
---------------------------------------------------------------------------------------------------

show partitions dw_weblog_detail;

+-------------------+--+
|     partition     |
+-------------------+--+
| datestr=20181101  |
+-------------------+--+

-----------------------------------------------------------------------------------------------

北京多少人?

----------------
基础指标分析:






 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值