20170916Hive学习总结

1.创建数据库/创建数据表

CREATE DATABASE smx; //创建数据库
show databases;//查看自己的数据库是否创建成功
use smx;//使用数据库
//创建数据表
CREATE TABLE `city`(
  `province_code` INT,
  `province_name` String,
  `city_code` INT,
  `city_name` string
  )
row FORMAT delimited
fields terminated by ','//每个区域之间用,隔开
lines terminated by '\n';//每行用换行隔开
load data local inpath '/home/bigdata/hive/city.txt' into table city ;//把本地的内容传到自己的table表中

2.数据表的查询操作

select * from city ;//查看自己的city表内容  
select * from city limit 10;//查看自己的city表中前十行的内容
show create table city;//查看建表语句
desc city ;//查看表信息
alter table city rename to city_new;//修改表名
create table city like city_new;//复制表内容
drop database smx;//删除库
drop table city_new;//删除表
select count(*) from city;//查看city表中有多少行
select * from city order by city_code desc limit 10;//按照city_code降序排列前十个(排序使用order by)
select * from city order by city_code asc limit 10;//按照city_code升序排列前十个
select province_name,count(city_name) as cnt from city group by province_name order by cnt desc limit 10;//查找省里市数排名前十的(分组使用group by)
select count(distinct province_name) from city;//查找city表里有多少省(去重使用distinct)
select count(1) 
from 
(
    select  province_name from city group by province_name
)a;//表数量大时,推荐使用这种方法。筛选出省名作为新表a,从新表a中查询数量即可
select province_name,count(city_name) as cnt from city group by province_name having cnt=1;//查找只有一个市的省份
select 
  province_name,cnt
from
 (
    select
       province_name,count(1) as cnt//count(1)代表第一个字段
    from 
       city
    group by
       province_name    
)a
where cnt=1
;

3.外表/内表
(1)外表

use smx;
//创建外部表
CREATE EXTERNAL TABLE city_ex(//创建外表要使用externalcreate table之间
  province_code int,
  province_name string,
  city_code int,
  city_name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n' 
LOCATION '/user/smx/city';//创建外表要使用location地址

show create table city_ex;//查看建表语句

drop table city_ex;//删除外表。外表删除后在hadoop中查看仍会存在 hadoop fs -ls /user/smx/city

(2)内表

use smx;
//创建内部表
CREATE TABLE city_in(
  province_code int,
  province_name string,
  city_code int,
  city_name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n' 
LOCATION '/user/smx/city';

show create table city_in;//查看建表语句

drop table city_in;//删除内表。内表删除后在hadoop中查看不存在了 hadoop fs -ls /user/smx/city

4.分区
(1)静态分区

use smx;
//创建一个分区表
create table user_daily(
 uid int,
 city_code int,
 model string,
 access string
)
partitioned by(p_date string);

INSERT OVERWRITE TABLE user_daily PARTITION (p_date='2017-09-01') //在分区表中插入一个数据
SELECT * FROM user;

show partitions user_daily;//查看分区表有哪天的数据

(2)动态分区

set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_date) 
SELECT *,'2017-09-02' FROM user
UNION ALL
SELECT *,'2017-09-03' FROM user
;

show partitions user_daily;//查看动态分区结果

INSERT OVERWRITE TABLE user_daily PARTITION (p_date='2017-01-01')//再静态插入一个新的日期
SELECT * FROM user;

ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');//更改日期格式
ALTER TABLE user_daily drop PARTITION (p_date='20170101');//删除某个日期的
select * from user_daily where p_date='2017-09-01'

5.if/case when/collect
(1)if

//统计网络为WIFI的比例
select sum(if(access='WIFI',1,0))/count(1) from user;
sum(if((access='WIFI',1,0)))//如果access='WIFI'则返回1,否则返回0,然后求sum即为access='WIFI'的总数

(2)case when

//把uid的尾数按照0-3,4-7,8-9分组统计数目
select 
  case 
    when uid % 10 in (0, 1, 2, 3) then '0-3'    
    when uid % 10 in (4, 5, 6, 7) then '4-7'
    else '8-9'
   end as interval,
   count(*) as cnt
from user 
group by 
  case 
    when uid % 10 in (0, 1, 2, 3) then '0-3'    
    when uid % 10 in (4, 5, 6, 7) then '4-7'
    else '8-9'
   end;

(3)collect

select collect_list(access) from user;//查看access值且去重的结果

collect_set是去重的集合
collect_list不去重的集合

6.hive的各种join
对于表user和表city:
(1)左连接left outer join

select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
left join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;

(2)右连接right outer join

select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
right join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;

(3)内连接(取交集)inner join

select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
inner join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;

(4)全连接(取并集)full outer join

select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
full outer join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 20;

7.窗口函数
(1)分组求Top N(求分组里面的最大值)

select access,city_code, uid
from
(
  select uid, access, city_code,
  row_number() over (partition by access order by city_code desc) as row_num//partition by access根据access分组,order by city_code desc根据city_code递减排序,row_number() over的值就表示每组内部排序后的顺序编号(组内连续的唯一的)
  from user
) a
where row_num = 1;//找分组里的第一个

(2)累计

select p_date, 
  sum(cnt) over(order by p_date asc rows between unbounded preceding and current row)//unbounded preceding第一行,current row当前行
from
(
  select p_date, count(*) as cnt
  from user_daily   
  where p_date between '2017-09-01' and '2017-09-30'
  group by p_date
)a
; 
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值