hive命令

–创建数据库

create database if not exists db_yhd;
use db_yhd;

CREATE TABLE db_yhd.track_log(
id              STRING,
url             STRING,
referer         STRING,
keyword         STRING,
type            STRING,
guid            STRING,
pageId          STRING,
moduleId        STRING,
linkId          STRING,
attachedInfo    STRING,
sessionId       STRING,
trackerU        STRING,
trackerType     STRING,
ip              STRING,
trackerSrc      STRING,
cookie          STRING,
orderCode       STRING,
trackTime       STRING,
endUserId       STRING,
firstLink       STRING,
sessionViewNo   STRING,
productId       STRING,
curMerchantId   STRING,
provinceId      STRING,
cityId          STRING,
fee             STRING,
edmActivity     STRING,
edmEmail        STRING,
edmJobId        STRING,
ieVersion       STRING,
platform        STRING,
internalKeyword STRING,
resultSum       STRING,
currentPage     STRING,
linkPosition    STRING,
buttonPosition  STRING
)
PARTITIONED BY (date_str STRING ,hour_str STRING )
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n';

--加载数据
LOAD DATA LOCAL INPATH '/opt/datas/2015082818'  INTO TABLE db_yhd.track_log PARTITION (date_str="20150828", hour_str="18"); 
LOAD DATA LOCAL INPATH '/opt/datas/2015082819'  INTO TABLE db_yhd.track_log PARTITION (date_str="20150828", hour_str="19"); 



--查看分区信息
show partitions db_yhd.track_log;

--抽样

select count(1) as count from db_yhd.track_log where date_str="20150828";

select date_str,url from db_yhd.track_log where date_str="20150828" limit 100;

/**
	统计分析每日各时段的PV和UV
		PV:pageview网页浏览器,URL不能为空
		UV:uniqueVisitor 唯一访客数,多少人访问
*/

select
   date_str,hour_str,count(url) as pv 
from
    db_yhd.track_log 
where 
    date_str="20150828" and length(trim(url))>0
group by
    date_str,hour_str
	
+-----------+-----------+--------+--+
| date_str  | hour_str  |   pv   |
+-----------+-----------+--------+--+
| 20150828  | 18        | 35880  |
| 20150828  | 19        | 33317  |
+-----------+-----------+--------+--+
select
   date_str,count(url) as pv 
from
    db_yhd.track_log 
where 
    date_str="20150828" and length(trim(url))>0
group by
    date_str

+-----------+--------+--+
| date_str  |   pv   |
+-----------+--------+--+
| 20150828  | 69197  |
+-----------+--------+--+
--发现有很多数据丢失,属于所谓"脏数据":56937
select
   count(url) as c 
from
    db_yhd.track_log 
where 
    date_str="20150828" and length(trim(url)) ==0
	
--每日各时段UV统计
--guid是唯一ID,去重即可得到结果

select
   date_str,hour_str,count(distinct guid) as uv 
from
    db_yhd.track_log 
where 
    date_str="20150828"  and length(trim(url)) ==0
group by
    date_str,hour_str
	
+-----------+-----------+--------+--+
| date_str  | hour_str  |   uv   |
+-----------+-----------+--------+--+
| 20150828  | 18        | 23938  |
| 20150828  | 19        | 22330  |
+-----------+-----------+--------+--+

select
   date_str,count(distinct guid) as uv 
from
    db_yhd.track_log 
where 
    date_str="20150828" 
group by
    date_str
+-----------+--------+--+
| date_str  |   uv   |
+-----------+--------+--+
| 20150828  | 39007  |
+-----------+--------+--+

/**
统计一条的UV
	结果发现一天UV比各个时段加起来小
	整个很正常,6:59开始到7点之后用户
	
*/

--合并结果
+-----------+-----------+--------+--------+
| date_str  | hour_str  |   pv   |   uv   |
+-----------+-----------+--------+--------+
| 20150828  | 18        | 35880  | 23938  |
| 20150828  | 19        | 33317  | 22330  |
+-----------+-----------+--------+--------+

select
   date_str,hour_str,
   count(url) as pv ,
   count(distinct guid) as uv 
from
    db_yhd.track_log 
where 
    date_str="20150828"  and length(trim(url)) ==0
group by
    date_str,hour_str

	
	
--保存结果
drop table if exists db_yhd.tb_pvuv_result;
create table db_yhd.tb_pvuv_result
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
AS 
SELECT 
   date_str,hour_str,
   count(url) as pv ,
   count(distinct guid) as uv 
from
    db_yhd.track_log 
where 
    date_str="20150828"  and length(trim(url)) ==0
group by
    date_str,hour_str
	
--将Hive结果集中数据,导出到MYSQL中
--创建MYSQL表
create table tb_result_pv(
    id int not null AUTO_INCREMENT,
    date_str varchar(255) not null,
    hour_str varchar(255) not null,
    pv int not null,
    uv int not null ,
    PRIMARY KEY(id)
)
--使用sqoop导出结果
bin/sqoop export \
--connect jdbc:mysql://bigdata-hpsk01.huadian.com/test \
--username root \
--password 123456 \
--table tb_result_pv \
--columns date_str,hour_str,pv,uv \
--export-dir /user/hive/warehouse/db_yhd.db/tb_pvuv_result \
--input-fields-terminated-by ','



--统计每日各省份PU和UV
--企业中往往实现一个功能之后,其他功能复制改吧改吧,就O了。
SELECT 
   date_str,provinceId,
   count(url) as pv ,
   count(distinct guid) as uv 
from
    db_yhd.track_log 
where 
    date_str="20150828"  and length(trim(url)) >0 and length(trim(provinceId)) >0
group by
    date_str,provinceId
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值