Hive的简单使用
创建表语句
create table city(province_code int,
province_name string,
city_code int,
city_name string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
如果需要将本地文件加载到Hive:
load data local inpath '/home/bigdata/hive/city.txt' into table city;
基本的查询语句
select * from city order by city_code desc limit 10;
select province_name,count(city_name) as cnt from city group by province_name order by cnt desc limit 10; //统计城市最多的省份
select city_code,count(2) as cnt from user where access='WIFI' group by city_code order by cnt desc limit 5; //统计wifi环境下用户最多的5个城市
select access,city_code,count(uid) as cnti from user group by access,city_code having cnti>2 order by cnti desc; //统计用户数>=3的access和city_code组合
select access,count(1) as cnts from user where lower(model) like '%ne%' group by access order by cnts desc; //机型中带有ne(不区分大小写),不同access及对应用户数
select count(distinct province_name) from city; //统计省份总数
select count(1)
from
(
select province_name from city group by province_name
) a; //统计省份总数Hive的复杂使用
INSERT OVERWRITE TABLE user_daily PARTITION (p_data='2017- 01-01')
SELECT * FROM user; //插入分区
show partitions user_daily; //显示分区
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_data)
SELECT *,'2017-09-02' FROM user
UNION ALL
SELECT *,'2017-09-03' FROM user
; //动态插入分区
ALTER TABLE user_daily PARTITION
(p_data='2017-01-01') RENAME TO PARTITION
(p_data='20170101'); //更改分区名字
ALTER TABLE user_daily DROP PARTITION(p_data='20170101'); //删除
select sum(if(access='WIFI',1,0))/count(1) from user; //使用WIFI的占比
//统计尾号在0-3 4-7 8-9之间的id
select
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end as interval,
count(*) as cnt
from user
group by
case
when uid % 10 in (0, 1, 2, 3) then '0-3'
when uid % 10 in (4, 5, 6, 7) then '4-7'
else '8-9'
end;
//去重
select collect_set(access) from user;
//不去重
select collect_list(access) from user;
//右边的表和左边表相同的字段(以左边的表为准)
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
left join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
//(以右边的表为准)
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
right join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
//内链接
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
inner join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
//全连接
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
full join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
//分组topN
select access,city_code, uid
from
(
select uid,access,city_code,
row_number() over (partition by access order by city_code desc) as row_num
from user
)a
where row_num=1;
//累计
select p_date,
sum(cnt) over(order by p_date asc rows between unbounded preceding and current row)
from
(
select p_date, count(*) as cnt
from user_daily
where p_date between '2017-09-01' and '2017-09-30'
group by p_date
)a
;
//累计
select p_data,
sum(cnt) over(order by p_data asc rows between unbounded preceding and current row)
from
(
select p_data,count(*) as cnt
from user_daily
where p_data between '2017-09-01' and '2017-09-30'
group by p_data
)a
;