SQL之常用函数汇总

最新推荐文章于 2024-05-09 15:40:32 发布

Jon Shen

最新推荐文章于 2024-05-09 15:40:32 发布

阅读量311

点赞数 1

分类专栏：数据分析进阶之路码农日常文章标签： sql

本文链接：https://blog.csdn.net/VictorJones529/article/details/130529091

版权

数据分析进阶之路同时被 2 个专栏收录

9 篇文章 14 订阅

订阅专栏

码农日常

8 篇文章 0 订阅

订阅专栏

常用函数

数值计算函数
字符串函数
日期函数
条件函数
统计函数（多行变一行）
表格生成函数（一行拆分多行）
开窗函数

数值计算函数

select
    round(3.1415926,4), -- 3.1416                       -- round(double a[, int d]) 保留对应精度，默认d=0                                  
    floor(3.1415926), -- 3                              -- 向下取整                                                  
    ceil(3.1415926), -- 4                               -- 向上取整 等价于ceiling                                                                 
    rand(), -- 0.5577432776034763                       -- rand([int seed]) 如果指定种子seed，则会取到一个稳定的随机数序列                                                  
    pow(2,4), -- 16.0                                   -- 幂函数 等价于power                                
    sqrt(16), -- 4.0                                    -- 开根号                                                                    
    abs(-3.9), -- 3.9                                   -- 绝对值                                                                
    pmod(9,4)  -- 1                                     -- 取余函数

字符串函数

select 
    concat('abc','def','gh'), -- abcdefgh               -- 字符串连接函数
    concat_ws(',','abc','def','gh'), -- abc,def,gh      -- concat_ws(string SEP, string A, string B...) 带连接字符的字符串连接函数
    concat_ws('|',array('a','b','c')), -- a|b|c         -- 数组转换成字符串
    format_number(5.23456,3), -- 5.235                  -- format_number(number x, int d) 小数位格式化成字符串函数 等价于substring
    substr('abcde',3,2), -- cd                          -- substr(string A, int start[, int len]) 默认len为截取到最后
    instr('abcdf','df'), -- 4                           -- 字符串查找函数，返回字符串首次出现的位置 
    locate('a','abcda',2), -- 5                         -- locate(string substr, string str[, int pos]) 返回字符串从pos后查找首次出现的位置 
    length('abc'), -- 3                                 -- 字符串长度
    str_to_map('k1=v1,k2=v2',',','='), -- {"k2":"v2","k1":"v1"}
                                                        -- str_to_map(text[, delimiter1, delimiter2]) 按照给定的分隔符转换成map结构
    upper('abSEd'), --  ABSED                           -- 转大写
    lower('abSEd'), --  absed                           -- 转小写
    trim(' abc '), -- abc                               -- 去空格，ltrim/rtrim 为 左边/右边去空格
    regexp_replace('foobar', 'oo|ar', ''), -- fb        -- 正则表达式替换函数
    get_json_object(
        '{"device_info":{"device_id":"03E7A750","brand":"苹果"},"age":28}',
        '$.device_info'
        ),                                              -- 解析json返回path指定的内容 获取device_info的值
    get_json_object(
        '{"device_info":{"device_id":"03E7A750","brand":"苹果"},"age":28}',
        '$.device_info.device_id'
        ),                                              -- 解析json返回path指定的内容 获取device_info中device_id的值
    split('ab,cd,ef',','), -- ["ab","cd","ef"]          -- split(string str, string pat) 按照分割符分割，返回分割后的字符串数组 
    find_in_set('ab','ef,ab,de') -- 2                   -- 集合查找函数,返回str在strlist第一次出现的位置，未出现返回0

日期函数

select 
    from_unixtime(
        cast(1622367594000/1000 as bigint),
        'yyyyMMdd'
    ), --  20210530                                     -- UNIX时间戳转日期函数 时间戳要转换成bigint 用cast(filed as bigint)
    unix_timestamp('20210530'), -- 1622367594           -- unix_timestamp(string date[, string pattern])
    to_date('2021-05-30 17:39:54'), -- 2021-05-30	    -- to_date(string timestamp)
    day('2021-05-30 17:39:54'), -- 30                   -- 类似还有year, month, hour, minute, second，注意输入格式为date格式！
    weekofyear('2021-05-30 17:39:54'), -- 21            -- weekofyear (string date) 返回日期在当前的周数
    datediff('2021-05-30','2021-05-20'), -- 10          -- datediff(string enddate, string startdate) 返回日期差
    date_add('2021-05-30',10), --  2021-06-09           -- date_add (string startdate, int days)
    date_sub('2021-06-09',10), --  2021-05-30           -- date_sub(string startdate, int days)
    add_months('2020-07-15',-1), --  2020-06-15         -- 
    dayofweek('2020-04-12'), -- 1：周一， 7 周日          -- 返回周几
    case 
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 1 then '周一'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 2 then '周二'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 3 then '周三'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 4 then '周四'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 5 then '周五'
        when pmod(datediff('2021-05-30','2018-01-01') + 1,7) = 6 then '周六'
        else '周日'
    end week_day,                                       -- 返回周几
    '{{ macros.hive.max_partition(table="table_name",field = "p_date") }}'
                                                        -- 返回table的最新分区

条件函数

select
   coalesce(null,null,'50'), -- 50                     -- 返回参数中的第一个非空值
   nvl(null,'100') -- 100                              -- 若第一个参数NULL,则返回第二个参数

统计函数（多行变一行）

select
    percentile_approx(view_cnt,array(0.5,0.8)),         --  percentile_approx(DOUBLE col, array(p1 [, p2]...) [, B]) 返回近似的第pth个百分位数
    collect_set(city_id),                               --  将 col 字段进行去重，合并成一个数组
    collect_list(city_id),                              --  将 col 字段合并成一个数组，不去重
    hivemall.to_ordered_list(f.city_id,f.city_id)       --  hivemall.to_ordered_list(col,order_col) 将col字段按order_col排序后合并成一个数组,不去重

表格生成函数（一行拆分多行）

explode(ARRAY) 搭配lateral view

select 
    t.str
from(
    -- 解析出来的字段select  regexp_replace('[496496737761,496556893761,496514794761]','\\[|]','') as strs
) as f 
lateral view 
    explode(split(f.strs,',')) t as str

posexplode(ARRAY) 搭配lateral view

select 
    f.id,
    tt.pos,
    concat_ws('.',get_json_object(tt.table_info,'$.hiveDb'),get_json_object(tt.table_info,'$.hiveTable'))
from 
    table_name as f
lateral view 
    posexplode(json_to_array(get_json_object(f.args,'$.tables'))) tt as pos, table_info
where 
    f.dt = '2020-11-30'
    and f.action = 'UPDATE_HIVE_TABLE_LIFE_CYCLE'

explode(MAP) 搭配lateral view

-- 求每个用户得分score top5 数据
select 
    f.user_id,
    t.flag,
    t.score
from( -- 解析string类型的interests_l1 字段里的值，一行变多行。
    select
        f.user_id,
        t.flag,
        t.score,
        row_number() over (partition by f.user_id order by cast(t.score as bigint) desc, t.flag) rn
    from
        table_name as f 
    lateral view 
        explode( str_to_map(regexp_replace(f.interests_l1,'\\{|\\}|"','')) ) t as flag, score
    where
        f.p_daterange = 'timedecay'
        and f.p_date = '20201215'
        and f.user_id = '1575703519' -- 测试用户
)f 
where f.rn <= 5

开窗函数

窗口函数用于实现分组内所有数据连续累积的统计。
– 如果不指定rows between关键词,默认为从起点到当前行；
– 如果不指定order by，则将分组内所有值累加；
rows between也就是window子句：
– preceding：往前; following：往后; current row：当前行; unbounded：窗口边界;
– unbounded preceding 表示窗口内第一行起点; unbounded following：表示窗口内最后一行终点。

统计分析函数(count,sum,avg,min,max)

select 
    f.user_id,
    f.city_id,
    f.dt,
    f.view_cnt,
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id
    ) as view_cnt1,                                     -- 默认为从起点到当前行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between unbounded preceding and current row
    ) as view_cnt2,                                     -- 从起点到当前行，结果同view_cnt1 
    sum(f.view_cnt) over(
        partition by f.city_id
    ) as view_cnt3,                                     -- 分组内所有行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between 3 preceding and current row
    ) as view_cnt4,                                     -- 当前行+往前3行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between 3 preceding and 1 following
    ) as view_cnt5,                                     -- 当前行+往前3行+往后1行
    sum(f.view_cnt) over(
        partition by f.city_id 
        order by f.dt,f.user_id 
        rows between current row and unbounded following
    ) as view_cnt6                                      -- 当前行+往后所有行 
from 
    table_name as f 
where 
    f.dt between '2021-05-30' and '2021-06-06'
order by -- 加个排序，便于验证数据
    f.city_id,
    f.dt,
    f.user_id

序列函数(row_number,rank,dense_rank,ntile)

-- row_number,rank,dense_rank
	select 
	    f.user_id,
	    f.city_id,
	    f.dt,
	    f.view_cnt,
	    row_number() over(
	        partition by f.city_id 
	        order by f.view_cnt
	    ) as rn,                                            -- 分组内序号
	    rank() over(
	        partition by f.city_id 
	        order by f.view_cnt
	    ) as rk,                                            -- 分组排名相等留下空位的序号
	    dense_rank() over(
	        partition by f.city_id 
	        order by f.view_cnt
	    ) as dense_rk                                       -- 分组排名相等不会留下空位的序号
	from 
	    table_name as f 
	where 
	    f.dt = '2021-05-30' 
	order by -- 加个排序，便于验证数据
	    f.city_id,
	    view_cnt,
	    f.user_id 
-- ntile -- 将分组数据按照顺序切分成n片，返回当前切片值，若切片不均匀，默认增加第一个切片的分布。
	select 
	    f.user_id,
	    f.city_id,
	    f.dt,
	    f.view_cnt,
	    ntile(2) over(
	        partition by f.city_id 
	        order by f.view_cnt,f.user_id
	    ) as ntile2,                                    -- 分组内将数据分成2片,切片值范围1、2
	    ntile(3) over(
	        partition by f.city_id 
	        order by f.view_cnt,f.user_id
	    ) as ntile3,                                    -- 分组内将数据分成3片,切片值范围1、2、3
	    ntile(4) over(
	        partition by f.city_id 
	        order by f.view_cnt,f.user_id
	    ) as ntile4                                     -- 分组内将数据分成4片,切片值范围1、2、3、4
	from 
	    table_name as f 
	where 
	    f.dt = '2021-05-30' 
	order by -- 加个排序，便于验证数据
	    f.city_id,
	    view_cnt,
	    f.user_id

排名占比函数(percent_rank,cume_dist)

-- percent_rank 计算窗口内数据rank值排名占比，(分组内当前行的rank值-1)/(分组内总行数-1)
    -- 实际场景未使用过
    select 
        f.user_id,
        f.city_id,
        f.dt,
        f.view_cnt,
        rank() over(
            partition by f.city_id 
            order by f.view_cnt
        ) as rk,                        -- 分组内排名rank值
        count(1) over(
            partition by f.city_id 
        ) as cnt,                       -- 分组内总行数
        percent_rank() over(
            partition by f.city_id 
            order by f.view_cnt
        ) as percent_rk                 -- (分组内当前行的rank值-1)/(分组内总行数-1)
    from 
        table_name as f 
    where 
        f.dt = '2021-05-30' 
    order by -- 加个排序，便于验证数据
        f.city_id,
        view_cnt,
        f.user_id 
-- cume_dist() 计算窗口内小于(或大于)等于当前值的数据排名占比，小于(或大于)等于当前值的行数/分组内总行数。
    select 
        f.user_id,
        f.city_id,
        f.dt,
        f.view_cnt,
        cume_dist() over(order by f.view_cnt) as all_le_rate,                               -- 全国数据，访问次数顺排占比
        cume_dist() over(partition by f.city_id order by f.view_cnt) as city_le_rate,       -- 城市分组，访问次数顺排占比
        cume_dist() over(partition by f.city_id order by f.view_cnt desc ) as city_ge_rate  -- 城市分组，访问次数倒排占比
    from 
        table_name as f 
    where 
        f.dt = '2021-05-30' 
        -- 数据量多，限制城市，方便验证数据计算正确性
        and f.city_id <= 'D0006' 
    order by -- 加个排序，便于验证数据
        f.city_id,
        view_cnt,
        f.user_id

行函数(lag,lead,first_value,last_value)

select 
    f.user_id,
    f.dt,
    f.view_cnt,
    lag(f.dt,1,'') over(partition by f.user_id order by f.dt) as pre_dt,        -- 计算用户上一次访问时间，不存在则为空字符
    lead(f.dt,1,'') over(partition by f.user_id order by f.dt) as next_dt,      -- 计算用户下一次访问时间
    first_value(f.dt) over(partition by f.user_id order by f.dt) as first_dt,   -- 截止当前行，计算用户第一次访问时间
    last_value(f.dt) over(partition by f.user_id order by f.dt) as last_dt      -- 截止当前行，计算用户最后一次访问时间
from( -- 计算用户每天访问次数
    select 
        f.user_id,
        f.dt,
        sum(f.view_cnt) as view_cnt
    from 
        table_name as f 
    where 
        f.dt between '2021-05-30' and '2021-06-06'
    group by 
        f.user_id,
        f.dt
) as f
order by -- 加个排序，便于验证数据
    f.user_id,
    f.dt