sqoop 的数据导出
导出数据到mysql数据库中:
1.在hive中创建数据库,创建表,加载数据。
create database if not exists db_yhd;
use db_yhd;
模拟业务,创建分区表:
CREATE TABLE db_yhd.track_log(
id STRING,
url STRING,
referer STRING,
keyword STRING,
type STRING,
guid STRING,
pageId STRING,
moduleId STRING,
linkId STRING,
attachedInfo STRING,
sessionId STRING,
trackerU STRING,
trackerType STRING,
ip STRING,
trackerSrc STRING,
cookie STRING,
orderCode STRING,
trackTime STRING,
endUserId STRING,
firstLink STRING,
sessionViewNo STRING,
productId STRING,
curMerchantId STRING,
provinceId STRING,
cityId STRING,
fee STRING,
edmActivity STRING,
edmEmail STRING,
edmJobId STRING,
ieVersion STRING,
platform STRING,
internalKeyword STRING,
resultSum STRING,
currentPage STRING,
linkPosition STRING,
buttonPosition STRING
)
PARTITIONED BY (date_str STRING ,hour_str STRING ) //选取天,小时字段作为分区标准
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n';
/opt/datas/2015082818
/opt/datas/2015082819
文件路径
--加载数据
LOAD DATA LOCAL INPATH '/opt/datas/2015082818' INTO TABLE db_yhd.track_log PARTITION (date_str="20150828", hour_str="18");
LOAD DATA LOCAL INPATH '/opt/datas/2015082819' INTO TABLE db_yhd.track_log PARTITION (date_str="20150828", hour_str="19");
查看效果:
select count(1) as count from db_yhd.track_log where date_str="20150828";
举例:
统计分析每日各时段的PV和UV
PV:pageview网页浏览器,URL不能为空
UV:uniqueVisitor 唯一访客数,多少人访问
PV
select
date_str,hour_str,count(url) as pv
from
db_yhd.track_log
where
date_str="20150828" and length(trim(url))>0
group by
date_str,hour_str
--发现有很多数据丢失,属于所谓"脏数据":58184
select
count(url) as c
from
db_yhd.track_log
where
date_str="20150828" and length(trim(url)) ==0
--每日各时段UV统计
--guid是唯一ID,去重即可得到结果
每天不同时段:
select
date_str,hour_str,count(distinct guid) as uv
from
db_yhd.track_log
where
date_str="20150828" and length(trim(url)) >0
group by
date_str,hour_str
每天数据:
select
date_str,count(distinct guid) as uv
from
db_yhd.track_log
where
date_str="20150828"
group by
date_str
统计一条的UV
结果发现一天UV比各个时段加起来小
这个很正常,6:59开始到7点之后用户
--合并结果
select
date_str,hour_str,
count(url) as pv ,
count(distinct guid) as uv
from
db_yhd.track_log
where
date_str="20150828" and length(trim(url)) >0
group by
date_str,hour_str
--保存结果 --》放入一个新表
drop table if exists db_yhd.tb_pvuv_result;
create table db_yhd.tb_pvuv_result
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
AS
SELECT
date_str,hour_str,
count(url) as pv ,
count(distinct guid) as uv
from
db_yhd.track_log
where
date_str="20150828" and length(trim(url)) ==0
group by
date_str,hour_str
--将Hive结果集中数据,导出到MYSQL中
--创建MYSQL表
将数据通过sqoop导出到mysql数据库需要先建好表:
create table tb_result_pv(
id int not null AUTO_INCREMENT,
date_str varchar(255) not null,
hour_str varchar(255) not null,
pv int not null,
uv int not null ,
PRIMARY KEY(id)
)
--使用sqoop导出结果
bin/sqoop export \
--connect jdbc:mysql://bigdata-hpsk01.huadian.com/test \
--username root \
--password 123456 \
--table tb_result_pv \
--columns date_str,hour_str,pv,uv \
--export-dir /user/hive/warehouse/db_yhd.db/tb_pvuv_result \
--input-fields-terminated-by ','
导入成功: