1.创建数据库/创建数据表
CREATE DATABASE smx; //创建数据库
show databases;//查看自己的数据库是否创建成功
use smx;//使用数据库
//创建数据表
CREATE TABLE `city`(
`province_code` INT,
`province_name` String,
`city_code` INT,
`city_name` string
)
row FORMAT delimited
fields terminated by ','//每个区域之间用,隔开
lines terminated by '\n';//每行用换行隔开
load data local inpath '/home/bigdata/hive/city.txt' into table city ;//把本地的内容传到自己的table表中
2.数据表的查询操作
select * from city ;//查看自己的city表内容
select * from city limit 10;//查看自己的city表中前十行的内容
show create table city;//查看建表语句
desc city ;//查看表信息
alter table city rename to city_new;//修改表名
create table city like city_new;//复制表内容
drop database smx;//删除库
drop table city_new;//删除表
select count(*) from city;//查看city表中有多少行
select * from city order by city_code desc limit 10;//按照city_code降序排列前十个(排序使用order by)
select * from city order by city_code asc limit 10;//按照city_code升序排列前十个
select province_name,count(city_name) as cnt from city group by province_name order by cnt desc limit 10;//查找省里市数排名前十的(分组使用group by)
select count(distinct province_name) from city;//查找city表里有多少省(去重使用distinct)
select count(1)
from
(
select province_name from city group by province_name
)a;//表数量大时,推荐使用这种方法。筛选出省名作为新表a,从新表a中查询数量即可
select province_name,count(city_name) as cnt from city group by province_name having cnt=1;//查找只有一个市的省份
select
province_name,cnt
from
(
select
province_name,count(1) as cnt//count(1)代表第一个字段
from
city
group by
province_name
)a
where cnt=1
;
3.外表/内表
(1)外表
use smx;
//创建外部表
CREATE EXTERNAL TABLE city_ex(//创建外表要使用external在create table之间
province_code int,
province_name string,
city_code int,
city_name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '/user/smx/city';//创建外表要使用location地址
show create table city_ex;//查看建表语句
drop table city_ex;//删除外表。外表删除后在hadoop中查看仍会存在 hadoop fs -ls /user/smx/city
(2)内表
use smx;
//创建内部表
CREATE TABLE city_in(
province_code int,
province_name string,
city_code int,
city_name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '/user/smx/city';
show create table city_in;//查看建表语句
drop table city_in;//删除内表。内表删除后在hadoop中查看不存在了 hadoop fs -ls /user/smx/city
4.分区
(1)静态分区
use smx;
//创建一个分区表
create table user_daily(
uid int,
city_code int,
model string,
access string
)
partitioned by(p_date string);
INSERT OVERWRITE TABLE user_daily PARTITION (p_date='2017-09-01') //在分区表中插入一个数据
SELECT * FROM user;
show partitions user_daily;//查看分区表有哪天的数据
(2)动态分区
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_date)
SELECT *,'2017-09-02' FROM user
UNION ALL
SELECT *,'2017-09-03' FROM user
;
show partitions user_daily;//查看动态分区结果
INSERT OVERWRITE TABLE user_daily PARTITION (p_date='2017-01-01')//再静态插入一个新的日期
SELECT * FROM user;
ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');//更改日期格式
ALTER TABLE user_daily drop PARTITION (p_date='20170101');//删除某个日期的
select * from user_daily where p_date='2017-09-01'
5.if/case when/collect
(1)if
//统计网络为WIFI的比例
select sum(if(access='WIFI',1,0))/count(1) from user;
sum(if((access='WIFI',1,0)))//如果access='WIFI'则返回1,否则返回0,然后求sum即为access='WIFI'的总数
(2)case when
//把uid的尾数按照0-3,4-7,8-9分组统计数目
select
case
when uid % 10 in (0, 1, 2, 3) then '0-3'
when uid % 10 in (4, 5, 6, 7) then '4-7'
else '8-9'
end as interval,
count(*) as cnt
from user
group by
case
when uid % 10 in (0, 1, 2, 3) then '0-3'
when uid % 10 in (4, 5, 6, 7) then '4-7'
else '8-9'
end;
(3)collect
select collect_list(access) from user;//查看access值且去重的结果
collect_set是去重的集合
collect_list不去重的集合
6.hive的各种join
对于表user和表city:
(1)左连接left outer join
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
left join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
(2)右连接right outer join
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
right join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
(3)内连接(取交集)inner join
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
inner join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
(4)全连接(取并集)full outer join
select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
full outer join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;
7.窗口函数
(1)分组求Top N(求分组里面的最大值)
select access,city_code, uid
from
(
select uid, access, city_code,
row_number() over (partition by access order by city_code desc) as row_num//partition by access根据access分组,order by city_code desc根据city_code递减排序,row_number() over的值就表示每组内部排序后的顺序编号(组内连续的唯一的)
from user
) a
where row_num = 1;//找分组里的第一个
(2)累计
select p_date,
sum(cnt) over(order by p_date asc rows between unbounded preceding and current row)//unbounded preceding第一行,current row当前行
from
(
select p_date, count(*) as cnt
from user_daily
where p_date between '2017-09-01' and '2017-09-30'
group by p_date
)a
;