数据聚合和采样

最新推荐文章于 2024-07-06 03:08:00 发布

adzuxhs5024

最新推荐文章于 2024-07-06 03:08:00 发布

阅读量515

点赞数

文章标签：大数据数据库

原文链接：http://www.cnblogs.com/StephenMeng/p/9870121.html

版权

基础聚合 
-- 常见的聚合函数 count, sum, avg, max, min
-- 聚合函数不允许嵌套 如：avg(count(*)) error!
set hive.map.aggr=true;      -- mapper端预聚合，提高性能，但消耗较多内存
#注意：不能直接select没有出现在group by从句中的字段，否则报错 why?
select name, gender_age.gender, count(*) as row_cnt  -- error!
from employee
group by gender_age.gender;  -- 后跟一个或多个字段，不能加别名。

如何select没有出现在group by中的字段？
    1)使用collect_set函数；
select gender_age.gender, collect_set(gender_age.age)[0] as random_age, count(*) as row_cnt
from employee
group by gender_age.gender;

-- select中使用多个聚合函数
select gender_age.gender, avg(gender_age.age) as avg_age, count(*) as row_cnt
from employee
group by gender_age.gender;

    2)使用分析函数(不需要 group by)
-- 聚合函数内嵌条件函数
-- 求男性的平均年龄
select sum(case when gender_age.gender = "Male" then gender_age.age else 0 end) /
sum(case when gender_age.gender = "Male" then 1 else null end) as man_age_avg    -- 0 == null
from employee;
select avg(gender_age.age) as avg_age
from employee
where gender_age.gender = 'Male'
group by gender_age.gender;
# 第一种方法效率要比第二种高

-- 求age_sum, woman_age_sum
select sum(coalesce(gender_age.age, 0)) as age_sum,
sum(if(gender_age.gender = "Female", gender_age.age, 0)) as woman_age_sum
from employee;

-- 聚合函数中使用distinct 
-- 统计员工中有几种性别，几种不同的姓名
select count(distinct gender_age.gender) as sex_uni_cnt,
count(distinct name) as name_uni_cnt
from employee;
#注意：count和distinct一起使用时，使用一个reducer，会降低性能，解决方法：分开使用
select count(*) as sex_uni_cnt
from (
select distinct gender_age.gender from employee
) a;

#注意：聚合时若遇到为null值的字段，会忽略该行，如：
-- 创建含有null行的表
create table t as select * from (
select employee_id - 99 as val1,
employee_id - 98 as val2 from employee_hr
where employee_id <= 101
union all
select null as val1, 2 as val2 from employee_hr
where employee_id = 100
) a;
+---------+---------+--+
| t.val1  | t.val2  |
+---------+---------+--+
| 1       | 2       |
| NULL    | 2       |
| 2       | 3       |
+---------+---------+--+
select sum(val1 + val2) from t;     --第二行会被忽略，解决方法：
select sum(coalesce(val1, 0) + val2) from t;

高级聚合
-- grouping sets:表示用union all连接多个group by的结果集，在job中的某一阶段完成更高效
内层的grouping sets定义在每个union all中的group by要实现的数据。
select name, work_place[0] as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name, work_place[0] grouping sets( (name, work_place[0]) );  -- 1个参数，毋需union all
<==>
select name, work_place[0] as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name, work_place[0];

select name, work_place[0] as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name, work_place[0] grouping sets((name), (work_place[0])); -- union all连接两个group by结果集
<==>
select name, null as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name
union all
select null as name, work_place[0] as main_place, count(employee_id) as employee_id_cnt
from employee_id
group by work_place[0];

select name, work_place[0] as main_place, count(employee_id) as employee_id_cnt
from employee_id
group by name, work_place[0] grouping sets((name, work_place[0]), name, work_place[0], ());  -- () 不分组

#煮鱼：grouping sets的内容为表或记录的别名所指向的列时，可能会出现问题，官方已修复。如：
select gender_age.gender, gender_age.age,
count(name) as name_uni_cnt
from employee
group by gender_age.gender, gender_age.age grouping sets((gender_age.gender, gender_age.age));

-- 汇总 rollup ：group by从句的扩展，其效率高，查询开销最小。 创建n + 1级聚合，n为group by中的列数
group by a, b, c with rollup <==> grouping set((a, b, c), (a, b), (a), ())
-- cube  对分组字段所有可能的组合进行汇总
group by a, b, c with cube 

-- grouping_id函数：显示字段是否聚合的位向量的10进制形式
select grouping_id, bin(cast(grouping_id as bigint)) as bit_vector,
name, start_date, count(employee_id) emp_id_cont
from employee_hr
group by start_date, name with cube 
order by start_date;

-- having 对组的数据集进行过滤,过滤掉一些不需要的组
将员工按年龄分组，统计:人数<=3的年龄组
select gender_age.age, count(distinct employee_id) as cnt
from employee_id
group by gender_age.age 
    having cnt <= 1;
<==>
select a.age, a.cnt
from (
select gender_age.age as age, count(distinct employee_id) as cnt    -- 内查询中的字段一定要起个别名
from employee_id
group by gender_age.age
) a
where a.cnt <= 3;  -- where中不支持UDAF函数，要起个别名

⭐︎⭐︎⭐︎分析函数，在指定窗口内统计数据，格式：
Function(arg1,..., argn) OVER ([PARTITION BY <...>] [ORDER BY <....>] [<window_clause>])
Function(arg1,..., argn) :
    标准聚合函数：SUM, COUNT and AVG支持'distinct'(Hive 2.1.0),能够后跟order by或窗口从句(Hive 2.2.0)
        如查询出不同的部门中有多少人：count(distict employee_id) partition by apart
    分析函数：
        row_number(): 根据partition和order的规定为表中的所有行编号，从1开始编号,且编号唯一
        排序函数(如下)以及lead， log在over中使用时不支持开窗：
            ntile():将排序好的数据集分桶，并且给每行分配一个合适的桶号，适用于将数据等分？
            rank():按照排序字段在分区内编号。 如:升序的分数 60 60 80 90 -> 1 1 3 4     从1开始编号，相同的值编号相同
            dense_rank():分区内密集编号。对相邻的不同数据，步进1 。  如:60 60 80 90 -> 1 1 2 3     
            cume_dist():升序时，定义为值小于或等于当前行值的行数除以所在分区或查询结果集的总行数
            percent_rank():对分组字段按照rank( )进行编号,只不过从零开始编号，求百分比：用它们的编号号除以最大的编号， 如：60 60 80 90 -> 0 0 2 3 -> 0/3 0/3 1/3 2/3
    窗口函数：
        lead(filed[, offset[, default]]):访问窗口内指定行向下前进offset行后的数据
        lag(field[, offset[, default]]):访问窗口内指定行向上滞后offset行后的数据
        first_value(field[, boolea]):从有序的结果集中返回第一个值, boolea为true时跳过null值
        last_value(field[, boolea]):从有序的结果集中返回最后一个值
over + :
    标准的聚合函数(Hive 2.1.0)
    + partition by :要求分区字段为一个或多个基本类型的字段
    + partition by + order by：分区或排序字段为一个或多个任意类型的字段
        window_clause：分区内开窗,Function每次会统计窗口内的所有记录
        作用：细颗粒度
        类型：
          ⚠️：升序时，star_expr必须小于end_expr,否则整列值为null或报异常
          行类型窗口 范围类型窗口
          (ROWS | RANGE) BETWEEN (UNBOUNDED | [num]) PRECEDING AND ([num] PRECEDING | CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
          (ROWS | RANGE) BETWEEN CURRENT ROW AND (CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
          (ROWS | RANGE) BETWEEN [num] FOLLOWING AND (UNBOUNDED | [num]) FOLLOWING   -- 当前行下方的某个范围内开窗
                [num]告诉窗口函数执行时参考的记录数,会将当前记录之前或之后的num条记录纳入统计范围内，
                [unbounded] 将当前记录之前或之后的所有记录纳入统计范围内
          两窗口的不同点：range仅支持一个排序字段，且此字段只能为数字或日期。
          若定义了order by,缺失窗口从句。默认为：range between unbounded preceding and current row 窗口为分区的起始行到当前行
          order by和窗口从句都缺失。默认为：rows between unbounded preceding and unbounded following 窗口为当前分区的所有行

select name, dept_num, salary,
count(*) over (partition by dept_num) as row_cnt,  -- 分区内求和
sum(salary) over (partition by dept_num order by dept_num) as deptTotal,   -- 分区内数据累加求和
sum(salary) over (order by dept_num) as runingTotol1,   -- 表内数据累加求和
sum(salary) over (order by dept_num, name rows unbounded preceding) as runingTotol2  -- dept_num全局排序，name局部排序
from employee_contract;

select name, dept_num, salary,
count(*) over (partition by dept_num) as row_cnt,
sum(salary) over(partition by dept_num order by dept_num) as deptTotal,  
sum(salary) over(order by dept_num) as runingTotol1,   
sum(salary) over(order by dept_num, name rows unbounded preceding) as runingTotol2
from employee_contract
order by dept_num, name;
<==>独立的窗口从句格式:
select name, dept_num, salary,
count(*) over row_cnt,
sum(salary) over deptTotal,  
sum(salary) over runingTotol1,   
sum(salary) over runingTotol2
from employee_contract
order by dept_num, name
window
row_cnt as (partition by dept_num),
overdeptTotal as (partition by dept_num order by dept_num),
runingTotol1 as (order by dept_num),
runingTotol2 as (order by dept_num, name rows unbounded preceding);

-- 范围查询
select name, dept_num, salary, start_year,
max(salary) over (partition by dept_num order by start_year 
range between 2 preceding and current row) as win1
from (
select name, dept_num, salary, year(start_date) as start_year
from employee_contract
) a;

抽样
1. 随机取样??
select name from employee_id
distribute by rand() sort by rand()
limit 5;
2. 分桶表抽样
SELECT * FROM <BucketTable_Name> 
TABLESAMPLE(BUCKET <specified bucket number to sample> OUT OF <total number of buckets> ON [colname|RAND()]) table_alias;
select name from employee_id_buckets
tablesample(bucket 1 out of 2 on rand()) a;
3. 块抽样
SELECT * FROM <Table_Name> 
TABLESAMPLE(N PERCENT|ByteLengthLiteral|N ROWS) s;
按百分比大小抽样
select name from employee_id_buckets
tablesample (20 percent) a;
按行数抽样
select name from employee_id
tablesample (5 rows) a;
按字节大小抽样
select name from employee_id
tablesample (700B) a;

参考

Oracle开发之：窗口函数
 LanguageManual WindowingAndAnalytics
PERCENT_RANK
CUME_DIST (Transact-SQL)
为什么group by后面不能使用列的别名
《Hive Essentials》
《Programing Hive》
《Practical Hive》

转载于:https://www.cnblogs.com/StephenMeng/p/9870121.html

adzuxhs5024

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
数据聚合和采样

基础聚合 -- 常见的聚合函数 count, sum, avg, max, min-- 聚合函数不允许嵌套如：avg(count(*)) error!set hive.map.aggr=true; -- mapper端预聚合，提高性能，但消耗较多内存#注意：不能直接select没有出现在group by从句中的字段，否则报错 why?select name, gen...
复制链接

扫一扫