# 1、ROW_NUMBER，RANK()，DENSE_RANK()

partition by：类似hive的建表，分区的意思；
order by ：排序，默认是升序，加desc降序；
rank：表示别名

## 1.1 案例

### 1.1.2导入数据

--执行下述语句
-----------------------------------------------
drop table datatable;
CREATE table datatable (
province string,
city string,
people int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

overwrite into table temp.datatable;
----------------------------------------------
--查看结果
hive (temp)> select * from datatable;
OK
province      city  people



## 1.2 按照人口降序排列，生成衍生变量

select province,city,
rank() over (order by people desc) rank,
dense_rank() over (order by people desc) dense_rank,
row_number() over(order by people desc) row_number
from datatable
group by province,city,people;

--结果
province        city    rank    dense_rank      row_number



row_number：顺序下来
rank:在遇到数据相同项时,会留下空位5,(第一列4,4,6)
dense_rank:在遇到数据相同项时,不会留下空位,(红框内第一列,4,4,5)

### 1.3 分组按照省份分区，再按照人口降序排列，生成衍生变量

select province,city,
rank() over (partition by province order by people desc) rank,
dense_rank() over (partition by province order by people desc) dense_rank,
row_number() over(partition by province order by people desc) row_number
from datatable
group by province,city,people;

--结果
province        city    rank    dense_rank      row_number



# 2取TOPN数据

## 2.1 按照国家提取TOP3

### 2.1.2导入数据

--执行下述语句
-----------------------------------------------
drop table temp.tripdata;
CREATE table datatable (
country string,
city string,
Visitors int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

overwrite into table temp.tripdata;
----------------------------------------------

----------------------------------------------
--查看结果
hive (temp)> select * from tripdata;
country        city   visitors


---格式：select 品牌,count/sum/其它() as num  from table_name order by num limit 10;

select country,city,visitors
from tripdata
order by visitors desc
limit 5;

country city    visitors



## 2.2 按照国家、城市提取TOP3

--取top10品牌下各品牌的top10渠道 ,格式:
select
a.*
from
(
select 品牌,渠道,count/sum/其它() as num row_number() over (partition by 品牌 order by num desc ) rank
from table_name
where 品牌限制条件
group by 品牌,渠道
)a
where a.rank<=10

select a.*
from (
select country,city,visitors, row_number() over (partition by country order by visitors desc ) rank
from tripdata
order by country,visitors desc
) a
where a.rank<=3;

--结果
a.country       a.city  a.visitors      a.rank



## 2.3 按照国家提取TOP5

--取top10品牌下各品牌的top10渠道中各渠道的top10档期 ,格式:
select  a.*
from
(
select 品牌,渠道,档期,count/sum/其它() as num row_number() over (partition by 品牌,渠道 order by num desc ) rank
from table_name
where 品牌,渠道 限制条件
group by 品牌,渠道,档期
)a
where  a.rank<=10

select a.*
from (
select country,city,visitors, row_number() over (partition by city order by visitors desc ) rank
from tripdata
order by country,city,visitors desc
) a
where a.rank<=3;

--结果
a.country       a.city  a.visitors      a.rank


===============================================================

Hive在0.11.0版本开始加入了row_number、rank、dense_rank分析函数，可以查询分组排序后的top值

row_number() over ([partition col1] [order by col2])
rank() over ([partition col1] [order by col2])
dense_rank() over ([partition col1] [order by col2])

col1、col2都可以是多个字段，用','分隔

1）row_number：不管col2字段的值是否相等，行号一直递增，比如：有两条记录的值相等，但一个是第一，一个是第二
2）rank：上下两条记录的col2相等时，记录的行号是一样的，但下一个col2值的行号递增N（N是重复的次数），比如：有两条并列第一，下一个是第三，没有第二
3）dense_rank：上下两条记录的col2相等时，下一个col2值的行号递增1，比如：有两条并列第一，下一个是第二

row_number可以实现分页查询

1. hive> create table t(name string, sub string, score int) row format delimited fields terminated by '\t';

1. a chinese 98
2. a english 90
3. d chinese 88
4. c english 82
5. c math 98
6. b math 89
7. b chinese 79
8. z english 90
9. z math 89
10. z chinese 80
11. e math 99
12. e english 87
13. d english 90

1、row_number
1. hive (test)> select *, row_number() over (partition by sub order by score) as od from t;

2、rank
1. hive (test)> select *, rank() over (partition by sub order by score) as od from t;

3、dense_ran
1. hive (test)> select *, dense_rank() over (partition by sub order by score desc) from t;

1. select * from (select *, row_number() over (partition by sub order by score desc) as od from t ) t where od<=3;

1. hive (test)> select od from (select *, row_number() over (partition by sub order by score desc) as od from t ) t where sub='chinese' and score=80;

1. hive (test)> select * from (select *, row_number() over () as rn from t) t1 where rn between 1 and 5;

• 擅长领域：