hive的基本操作
创建表
create table city(
province_code INT,
province_name string,
city_code INT,
city_name string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
导入本地源表
load data local inpath '/home/bigdata/hive/city.txt' into table city;
注意
sql语句不能用tab键修改格式,用两个空格即可。
设置表头列名显示
set hive.cli.print.header=true;
显示拥有最多市的10个省份
select province_name,count(*) as count from city group by province_name order by count desc limit 10;
查看省的数目
select count(distinct province_name) from city;
查出只有一个市的省份
select
province_name,cnt
from
(
select
province_name,count(*)
as
cnt
from
city
group by
province_name
)a
where
cnt = 1;
外表和内表
外表
CREATE EXTERNAL TABLE city_ex(
province_code int,
province_name string,
city_code int,
city_name string)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
location '/user/dang/city';
内表
create table city_in(
province_code int,
province_name string,
city_code int,
city_name string)
row format delimited
fields terminated by ','
lines terminated by '\n'
location '/user/dang/city';
注意
- location后加的是hdfs目录
- 文件导入内表时,文件在原文件夹下消失,并进入内表中,当内表被删除时,文件连同文件夹一起被删除掉;
- 文件导入外表时,文件在原文件夹下消失,外表删除时,文件在原来目录下依然存在。
分区表
静态分区
create table user_daily(
uid int,
city_code int,
model string,
access string
)
partitioned by (p_date string);
insert overwrite table user_daily partition (p_date = '2107-01-01')
select * from user;
设置动态分区
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_date)
SELECT *,'2017-09-02' FROM user
UNION ALL
SELECT *,'2017-09-03' FROM user
;
显示所有的分区表
show partitions user_daily;
修改或删除分区
ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');
DROP TABLE user_daily partition (p_date ='20170101');
- if判断计算百分比
select sum(if(access ='2G',1,0))/count(1) from user;
- case判断
select
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end as interval,
count(*) as cnt
from user
group by
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end;
- 集合
select collect_set(access) from user;
select collect_list(access) from user;
- join
左连接
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
left join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
右连接
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
right join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
内链接
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
inner join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
完全外连接
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
full join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 200;
分组topN(窗口函数)
1)分组
(2)排序
(3)top1
select access,city_code,uid
from
(
select uid,access,city_code,
row_number() over (partition by access order by city_code desc) as row_num
from user
)a
where row_num = 1;
累计
unbounded preceding 第一行
current row 当前行
select p_date,
sum(cnt) over (order by p_date asc rows between unbounded preceding and current row)
from
(
select p_date, count(*) as cnt
from user_daily
where p_date between '2017-09-01' and '2017-09-30'
group by p_date
)a;