集合函数
sort_array(字段名);----对字段进行排序
size(Map<k,v>);-------计算map个数,返回int
map_keys(Map<k,v>);-------字段中所有的key值 返回数组
map_values(Map<k,v>);-------字段中所有的values值,返回数组
条件控制函数
case when
select id,name
case
when age<28 then '青年'-----------------如果是复杂数据类型,需要添加字段名例如:info.age...........
when age>=28 and age<40 then '中年'
else '老年'
end
from t_user;
if
select move_name,actors
if(array_contains(actors,'周星驰'),'好电影','烂片')--------如果actors字段中含有‘周星驰’,返回‘好电影’,否则返回‘烂片’
from t_move;
窗口分析函数-----处理复杂分析报表的函数
分组Top n: row_number()over()---------分组后前几名(区别于group的是,group服务于聚合函数,只能选出max或者min)
思路:分组→排序→不同组分别排序号→根据top n的n做筛选
例如:查询不同性别中年龄最大的两个人
select * from
(select id,name,sex
row_number(-----打标记) over(partition by sex----分组 order by-------排序age desc)
as rn from t_rn) tmp
where rn<3;
窗口求和函数:sum() over---------实现逐行累加
例子: 求出每个人截止当月的总额
create table t_access_time(user string,mounth string,values int)
row format delimited fields terminated by ' ';
load data local inpath '/home/hadoop/logs/user.dat' into t_access_time;
----先求每个人每月的总金额
create table t_access_amount as
select user,month,sum(values) as amount
from t_access_time
group by user,month;
传统方法:
累计到当月总额的思路:做自连接,去除大于本月时间的数据,以A为例
create table t_tmp as
select a.values as a_anount,b.use as uid,b.month as month,b.values as amount
from t_access_amount a join t_access_amount b
on a.user=b.user
where a.mounth<=b.mounth;
select uid,b_month,max(amount),sum(a_amount)
from t_tmp
group by uid,b_month;
使用窗口求和函数:sum() over
select * from t_access_amount
select user,month,values,
sum(values) over(partition by user-----按用户分组
order by month------按月份排序
rows between-------加载XXXX行之间的数据
unbounded --------无边界的
preceding------向上、向前
and current row-------直到当前行
) as accumulate
from t_access_amount;