常用函数
数值计算函数 字符串函数 日期函数 条件函数 统计函数(多行变一行) 表格生成函数(一行拆分多行)
开窗函数 统计分析函数(count,sum,avg,min,max) 序列函数(row_number,rank,dense_rank,ntile) 排名占比函数(percent_rank,cume_dist) 行函数(lag,lead,first_value,last_value) 分组聚合函数(grouping sets,cube,rollup)
数值计算函数
select
round ( 3.1415926 , 4 ) ,
floor( 3.1415926 ) ,
ceil( 3.1415926 ) ,
rand( ) ,
pow( 2 , 4 ) ,
sqrt( 16 ) ,
abs( - 3.9 ) ,
pmod( 9 , 4 )
字符串函数
select
concat( 'abc' , 'def' , 'gh' ) ,
concat_ws( ',' , 'abc' , 'def' , 'gh' ) ,
concat_ws( '|' , array( 'a' , 'b' , 'c' ) ) ,
format_number( 5.23456 , 3 ) ,
substr( 'abcde' , 3 , 2 ) ,
instr( 'abcdf' , 'df' ) ,
locate( 'a' , 'abcda' , 2 ) ,
length( 'abc' ) ,
str_to_map( 'k1=v1,k2=v2' , ',' , '=' ) ,
upper( 'abSEd' ) ,
lower( 'abSEd' ) ,
trim( ' abc ' ) ,
regexp_replace( 'foobar' , 'oo|ar' , '' ) ,
get_json_object(
'{"device_info":{"device_id":"03E7A750","brand":"苹果"},"age":28}' ,
'$.device_info'
) ,
get_json_object(
'{"device_info":{"device_id":"03E7A750","brand":"苹果"},"age":28}' ,
'$.device_info.device_id'
) ,
split( 'ab,cd,ef' , ',' ) ,
find_in_set( 'ab' , 'ef,ab,de' )
日期函数
select
from_unixtime(
cast( 1622367594000 / 1000 as bigint ) ,
'yyyyMMdd'
) ,
unix_timestamp( '20210530' ) ,
to_date( '2021-05-30 17:39:54' ) ,
day ( '2021-05-30 17:39:54' ) ,
weekofyear( '2021-05-30 17:39:54' ) ,
datediff( '2021-05-30' , '2021-05-20' ) ,
date_add( '2021-05-30' , 10 ) ,
date_sub( '2021-06-09' , 10 ) ,
add_months( '2020-07-15' , - 1 ) ,
dayofweek( '2020-04-12' ) ,
case
when pmod( datediff( '2021-05-30' , '2018-01-01' ) + 1 , 7 ) = 1 then '周一'
when pmod( datediff( '2021-05-30' , '2018-01-01' ) + 1 , 7 ) = 2 then '周二'
when pmod( datediff( '2021-05-30' , '2018-01-01' ) + 1 , 7 ) = 3 then '周三'
when pmod( datediff( '2021-05-30' , '2018-01-01' ) + 1 , 7 ) = 4 then '周四'
when pmod( datediff( '2021-05-30' , '2018-01-01' ) + 1 , 7 ) = 5 then '周五'
when pmod( datediff( '2021-05-30' , '2018-01-01' ) + 1 , 7 ) = 6 then '周六'
else '周日'
end week_day,
'{{ macros.hive.max_partition(table="table_name",field = "p_date") }}'
条件函数
select
coalesce ( null , null , '50' ) ,
nvl( null , '100' )
统计函数(多行变一行)
select
percentile_approx( view_cnt, array( 0.5 , 0.8 ) ) ,
collect_set( city_id) ,
collect_list( city_id) ,
hivemall. to_ordered_list( f. city_id, f. city_id)
表格生成函数(一行拆分多行)
explode(ARRAY) 搭配lateral view
select
t. str
from (
) as f
lateral view
explode( split( f. strs, ',' ) ) t as str
posexplode(ARRAY) 搭配lateral view
select
f. id,
tt. pos,
concat_ws( '.' , get_json_object( tt. table_info, '$.hiveDb' ) , get_json_object( tt. table_info, '$.hiveTable' ) )
from
table_name as f
lateral view
posexplode( json_to_array( get_json_object( f. args, '$.tables' ) ) ) tt as pos, table_info
where
f. dt = '2020-11-30'
and f. action = 'UPDATE_HIVE_TABLE_LIFE_CYCLE'
explode(MAP) 搭配lateral view
select
f. user_id,
t. flag,
t. score
from (
select
f. user_id,
t. flag,
t. score,
row_number( ) over ( partition by f. user_id order by cast( t. score as bigint ) desc , t. flag) rn
from
table_name as f
lateral view
explode( str_to_map( regexp_replace( f. interests_l1, '\\{|\\}|"' , '' ) ) ) t as flag, score
where
f. p_daterange = 'timedecay'
and f. p_date = '20201215'
and f. user_id = '1575703519'
) f
where f. rn <= 5
开窗函数
窗口函数用于实现分组内所有数据连续累积的统计。 – 如果不指定rows between关键词,默认为从起点到当前行; – 如果不指定order by,则将分组内所有值累加; rows between也就是window子句: – preceding:往前; following:往后; current row:当前行; unbounded:窗口边界; – unbounded preceding 表示窗口内第一行起点; unbounded following:表示窗口内最后一行终点。
统计分析函数(count,sum,avg,min,max)
select
f. user_id,
f. city_id,
f. dt,
f. view_cnt,
sum ( f. view_cnt) over (
partition by f. city_id
order by f. dt, f. user_id
) as view_cnt1,
sum ( f. view_cnt) over (
partition by f. city_id
order by f. dt, f. user_id
rows between unbounded preceding and current row
) as view_cnt2,
sum ( f. view_cnt) over (
partition by f. city_id
) as view_cnt3,
sum ( f. view_cnt) over (
partition by f. city_id
order by f. dt, f. user_id
rows between 3 preceding and current row
) as view_cnt4,
sum ( f. view_cnt) over (
partition by f. city_id
order by f. dt, f. user_id
rows between 3 preceding and 1 following
) as view_cnt5,
sum ( f. view_cnt) over (
partition by f. city_id
order by f. dt, f. user_id
rows between current row and unbounded following
) as view_cnt6
from
table_name as f
where
f. dt between '2021-05-30' and '2021-06-06'
order by
f. city_id,
f. dt,
f. user_id
序列函数(row_number,rank,dense_rank,ntile)
select
f. user_id,
f. city_id,
f. dt,
f. view_cnt,
row_number( ) over (
partition by f. city_id
order by f. view_cnt
) as rn,
rank( ) over (
partition by f. city_id
order by f. view_cnt
) as rk,
dense_rank( ) over (
partition by f. city_id
order by f. view_cnt
) as dense_rk
from
table_name as f
where
f. dt = '2021-05-30'
order by
f. city_id,
view_cnt,
f. user_id
select
f. user_id,
f. city_id,
f. dt,
f. view_cnt,
ntile( 2 ) over (
partition by f. city_id
order by f. view_cnt, f. user_id
) as ntile2,
ntile( 3 ) over (
partition by f. city_id
order by f. view_cnt, f. user_id
) as ntile3,
ntile( 4 ) over (
partition by f. city_id
order by f. view_cnt, f. user_id
) as ntile4
from
table_name as f
where
f. dt = '2021-05-30'
order by
f. city_id,
view_cnt,
f. user_id
排名占比函数(percent_rank,cume_dist)
select
f. user_id,
f. city_id,
f. dt,
f. view_cnt,
rank( ) over (
partition by f. city_id
order by f. view_cnt
) as rk,
count ( 1 ) over (
partition by f. city_id
) as cnt,
percent_rank( ) over (
partition by f. city_id
order by f. view_cnt
) as percent_rk
from
table_name as f
where
f. dt = '2021-05-30'
order by
f. city_id,
view_cnt,
f. user_id
select
f. user_id,
f. city_id,
f. dt,
f. view_cnt,
cume_dist( ) over ( order by f. view_cnt) as all_le_rate,
cume_dist( ) over ( partition by f. city_id order by f. view_cnt) as city_le_rate,
cume_dist( ) over ( partition by f. city_id order by f. view_cnt desc ) as city_ge_rate
from
table_name as f
where
f. dt = '2021-05-30'
and f. city_id <= 'D0006'
order by
f. city_id,
view_cnt,
f. user_id
行函数(lag,lead,first_value,last_value)
select
f. user_id,
f. dt,
f. view_cnt,
lag( f. dt, 1 , '' ) over ( partition by f. user_id order by f. dt) as pre_dt,
lead( f. dt, 1 , '' ) over ( partition by f. user_id order by f. dt) as next_dt,
first_value( f. dt) over ( partition by f. user_id order by f. dt) as first_dt,
last_value( f. dt) over ( partition by f. user_id order by f. dt) as last_dt
from (
select
f. user_id,
f. dt,
sum ( f. view_cnt) as view_cnt
from
table_name as f
where
f. dt between '2021-05-30' and '2021-06-06'
group by
f. user_id,
f. dt
) as f
order by
f. user_id,
f. dt
分组聚合函数(grouping sets,cube,rollup)