SQL之常用函数汇总


数值计算函数

select
    round(3.1415926,4), -- 3.1416                       -- round(double a[, int d]) 保留对应精度,默认d=0                                  
    floor(3.1415926), -- 3                              -- 向下取整                                                  
    ceil(3.1415926), -- 4                               -- 向上取整 等价于ceiling                                                                 
    rand(), -- 0.5577432776034763                       -- rand([int seed]) 如果指定种子seed,则会取到一个稳定的随机数序列                                                  
    pow(2,4), -- 16.0                                   -- 幂函数 等价于power                                
    sqrt(16), -- 4.0                                    -- 开根号                                                                    
    abs(-3.9), -- 3.9                                   -- 绝对值                                                                
    pmod(9,4)  -- 1                                     -- 取余函数 

字符串函数

select 
    concat('abc','def','gh'), -- abcdefgh               -- 字符串连接函数
    concat_ws(',','abc','def','gh'), -- abc,def,gh      -- concat_ws(string SEP, string A, string B...) 带连接字符的字符串连接函数
    concat_ws('|',array('a','b','c')), -- a|b|c         -- 数组转换成字符串
    format_number(5.23456,3), -- 5.235                  -- format_number(number x, int d) 小数位格式化成字符串函数 等价于substring
    substr('abcde',3,2), -- cd                          -- substr(string A, int start[, int len]) 默认len为截取到最后
    instr('abcdf','df'), -- 4                           -- 字符串查找函数,返回字符串首次出现的位置 
    locate('a','abcda',2), -- 5                         -- locate(string substr, string str[, int pos]) 返回字符串从pos后查找首次出现的位置 
    length('abc'), -- 3                                 -- 字符串长度
    str_to_map('k1=v1,k2=v2',',','='), -- {"k2":"v2","k1":"v1"}
                                                        -- str_to_map(text[, delimiter1, delimiter2]) 按照给定的分隔符转换成map结构
    upper('abSEd'), --  ABSED                           -- 转大写
    lower('abSEd'), --  absed                           -- 转小写
    trim(' abc '), -- abc                               -- 去空格,ltrim/rtrim 为 左边/右边去空格
    regexp_replace('foobar', 'oo|ar', ''), -- fb        -- 正则表达式替换函数
    get_json_object(
        '{"device_info":{"device_id":"03E7A750","brand":"苹果"},"age":28}',
        '$.device_info'
        ),                                              -- 解析json返回path指定的内容 获取device_info的值
    get_json_object(
        '{"device_info":{"device_id":"03E7A750","brand":"苹果"},"age":28}',
        '$.device_info.device_id'
        ),                                              -- 解析json返回path指定的内容 获取device_info中device_id的值
    split('ab,cd,ef',','), -- ["ab","cd","ef"]          -- split(string str, string pat) 按照分割符分割,返回分割后的字符串数组 
    find_in_set('ab','ef,ab,de') -- 2                   -- 集合查找函数,返回str在strlist第一次出现的位置,未出现返回0

日期函数

select 
    from_unixtime(
        cast(1622367594000/1000 as bigint),
        'yyyyMMdd'
    ), --  20210530                                     -- UNIX时间戳转日期函数 时间戳要转换成bigint 用cast(filed as bigint)
    unix_timestamp('20210530'), -- 1622367594           -- unix_timestamp(string date[, string pattern])
    to_date('2021-05-30 17:39:54'), -- 2021-05-30	    -- to_date(string timestamp)
    day('2021-05-30 17:39:54'), -- 30                   -- 类似还有year, month, hour, minute, second,注意输入格式为date格式!
    weekofyear('2021-05-30 17:39:54'), -- 21            -- weekofyear (string date) 返回日期在当前的周数
    datediff('2021-05-30','2021-05-20'), -- 10          -- datediff(string enddate, string startdate) 返回日期差
    date_add('2021-05-30',10), --  2021-06-09           -- date_add (string startdate, int days)
    date_sub('2021-06-09',10), --  2021-05-30           -- date_sub(string startdate, int days)
    add_months('2020-07-15',-1), --  2020-06-15         -- 
    dayofweek('2020-04-12'), -- 1:周一, 7 周日          -- 返回周几
    case 
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 1 then '周一'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 2 then '周二'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 3 then '周三'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 4 then '周四'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 5 then '周五'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 6 then '周六'
        else '周日'
    end week_day,                                       -- 返回周几
    '{{ macros.hive.max_partition(table="table_name",field = "p_date") }}'
                                                        -- 返回table的最新分区

条件函数

select
   coalesce(null,null,'50'), -- 50                     -- 返回参数中的第一个非空值
   nvl(null,'100') -- 100                              -- 若第一个参数NULL,则返回第二个参数

统计函数(多行变一行)

select
    percentile_approx(view_cnt,array(0.5,0.8)),         --  percentile_approx(DOUBLE col, array(p1 [, p2]...) [, B]) 返回近似的第pth个百分位数
    collect_set(city_id),                               --  将 col 字段进行去重,合并成一个数组
    collect_list(city_id),                              --  将 col 字段合并成一个数组,不去重
    hivemall.to_ordered_list(f.city_id,f.city_id)       --  hivemall.to_ordered_list(col,order_col) 将col字段按order_col排序后合并成一个数组,不去重

表格生成函数(一行拆分多行)

explode(ARRAY) 搭配lateral view

select 
    t.str
from(
    -- 解析出来的字段select  regexp_replace('[496496737761,496556893761,496514794761]','\\[|]','') as strs
) as f 
lateral view 
    explode(split(f.strs,',')) t as str

posexplode(ARRAY) 搭配lateral view

select 
    f.id,
    tt.pos,
    concat_ws('.',get_json_object(tt.table_info,'$.hiveDb'),get_json_object(tt.table_info,'$.hiveTable'))
from 
    table_name as f
lateral view 
    posexplode(json_to_array(get_json_object(f.args,'$.tables'))) tt as pos, table_info
where 
    f.dt = '2020-11-30'
    and f.action = 'UPDATE_HIVE_TABLE_LIFE_CYCLE'

explode(MAP) 搭配lateral view

-- 求每个用户得分score top5 数据
select 
    f.user_id,
    t.flag,
    t.score
from( -- 解析string类型的interests_l1 字段里的值,一行变多行。
    select
        f.user_id,
        t.flag,
        t.score,
        row_number() over (partition by f.user_id order by cast(t.score as bigint) desc, t.flag) rn
    from
        table_name as f 
    lateral view 
        explode( str_to_map(regexp_replace(f.interests_l1,'\\{|\\}|"','')) ) t as flag, score
    where
        f.p_daterange = 'timedecay'
        and f.p_date = '20201215'
        and f.user_id = '1575703519' -- 测试用户
)f 
where f.rn <= 5

开窗函数


  • 窗口函数用于实现分组内所有数据连续累积的统计。
    – 如果不指定rows between关键词,默认为从起点到当前行;
    – 如果不指定order by,则将分组内所有值累加;
  • rows between也就是window子句:
    – preceding:往前; following:往后; current row:当前行; unbounded:窗口边界;
    – unbounded preceding 表示窗口内第一行起点; unbounded following:表示窗口内最后一行终点。

统计分析函数(count,sum,avg,min,max)

select 
    f.user_id,
    f.city_id,
    f.dt,
    f.view_cnt,
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id
    ) as view_cnt1,                                     -- 默认为从起点到当前行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between unbounded preceding and current row
    ) as view_cnt2,                                     -- 从起点到当前行,结果同view_cnt1 
    sum(f.view_cnt) over(
        partition by f.city_id
    ) as view_cnt3,                                     -- 分组内所有行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between 3 preceding and current row
    ) as view_cnt4,                                     -- 当前行+往前3行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between 3 preceding and 1 following
    ) as view_cnt5,                                     -- 当前行+往前3行+往后1行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between current row and unbounded following
    ) as view_cnt6                                      -- 当前行+往后所有行 
from 
    table_name as f 
where 
    f.dt between '2021-05-30' and '2021-06-06'
order by -- 加个排序,便于验证数据
    f.city_id,
    f.dt,
    f.user_id 

序列函数(row_number,rank,dense_rank,ntile)

-- row_number,rank,dense_rank
	select 
	    f.user_id,
	    f.city_id,
	    f.dt,
	    f.view_cnt,
	    row_number() over(
	        partition by f.city_id 
	        order by f.view_cnt
	    ) as rn,                                            -- 分组内序号
	    rank() over(
	        partition by f.city_id 
	        order by f.view_cnt
	    ) as rk,                                            -- 分组排名相等留下空位的序号
	    dense_rank() over(
	        partition by f.city_id 
	        order by f.view_cnt
	    ) as dense_rk                                       -- 分组排名相等不会留下空位的序号
	from 
	    table_name as f 
	where 
	    f.dt = '2021-05-30' 
	order by -- 加个排序,便于验证数据
	    f.city_id,
	    view_cnt,
	    f.user_id 
-- ntile -- 将分组数据按照顺序切分成n片,返回当前切片值,若切片不均匀,默认增加第一个切片的分布。
	select 
	    f.user_id,
	    f.city_id,
	    f.dt,
	    f.view_cnt,
	    ntile(2) over(
	        partition by f.city_id 
	        order by f.view_cnt,f.user_id
	    ) as ntile2,                                    -- 分组内将数据分成2片,切片值范围1、2
	    ntile(3) over(
	        partition by f.city_id 
	        order by f.view_cnt,f.user_id
	    ) as ntile3,                                    -- 分组内将数据分成3片,切片值范围1、2、3
	    ntile(4) over(
	        partition by f.city_id 
	        order by f.view_cnt,f.user_id
	    ) as ntile4                                     -- 分组内将数据分成4片,切片值范围1、2、3、4
	from 
	    table_name as f 
	where 
	    f.dt = '2021-05-30' 
	order by -- 加个排序,便于验证数据
	    f.city_id,
	    view_cnt,
	    f.user_id 

排名占比函数(percent_rank,cume_dist)

-- percent_rank 计算窗口内数据rank值排名占比,(分组内当前行的rank值-1)/(分组内总行数-1)
    -- 实际场景未使用过
    select 
        f.user_id,
        f.city_id,
        f.dt,
        f.view_cnt,
        rank() over(
            partition by f.city_id 
            order by f.view_cnt
        ) as rk,                        -- 分组内排名rank值
        count(1) over(
            partition by f.city_id 
        ) as cnt,                       -- 分组内总行数
        percent_rank() over(
            partition by f.city_id 
            order by f.view_cnt
        ) as percent_rk                 -- (分组内当前行的rank值-1)/(分组内总行数-1)
    from 
        table_name as f 
    where 
        f.dt = '2021-05-30' 
    order by -- 加个排序,便于验证数据
        f.city_id,
        view_cnt,
        f.user_id 
-- cume_dist() 计算窗口内小于(或大于)等于当前值的数据排名占比,小于(或大于)等于当前值的行数/分组内总行数。
    select 
        f.user_id,
        f.city_id,
        f.dt,
        f.view_cnt,
        cume_dist() over(order by f.view_cnt) as all_le_rate,                               -- 全国数据,访问次数顺排占比
        cume_dist() over(partition by f.city_id order by f.view_cnt) as city_le_rate,       -- 城市分组,访问次数顺排占比
        cume_dist() over(partition by f.city_id order by f.view_cnt desc ) as city_ge_rate  -- 城市分组,访问次数倒排占比
    from 
        table_name as f 
    where 
        f.dt = '2021-05-30' 
        -- 数据量多,限制城市,方便验证数据计算正确性
        and f.city_id <= 'D0006' 
    order by -- 加个排序,便于验证数据
        f.city_id,
        view_cnt,
        f.user_id 

行函数(lag,lead,first_value,last_value)

select 
    f.user_id,
    f.dt,
    f.view_cnt,
    lag(f.dt,1,'') over(partition by f.user_id order by f.dt) as pre_dt,        -- 计算用户上一次访问时间,不存在则为空字符
    lead(f.dt,1,'') over(partition by f.user_id order by f.dt) as next_dt,      -- 计算用户下一次访问时间
    first_value(f.dt) over(partition by f.user_id order by f.dt) as first_dt,   -- 截止当前行,计算用户第一次访问时间
    last_value(f.dt) over(partition by f.user_id order by f.dt) as last_dt      -- 截止当前行,计算用户最后一次访问时间
from( -- 计算用户每天访问次数
    select 
        f.user_id,
        f.dt,
        sum(f.view_cnt) as view_cnt
    from 
        table_name as f 
    where 
        f.dt between '2021-05-30' and '2021-06-06'
    group by 
        f.user_id,
        f.dt
) as f
order by -- 加个排序,便于验证数据
    f.user_id,
    f.dt 

分组聚合函数(grouping sets,cube,rollup)

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值