工作常用sql总结

爱创新的小萌

已于 2023-08-21 14:01:35 修改

阅读量3.3k

点赞数 3

分类专栏：数据分析 sql 文章标签： sql 数据分析数据库

于 2022-09-29 19:47:49 首次发布

本文链接：https://blog.csdn.net/qq_42410633/article/details/127067825

版权

数据分析同时被 2 个专栏收录

4 篇文章

订阅专栏

sql

3 篇文章

订阅专栏

本文详细介绍了在Presto和Spark中常用的数据分析SQL函数，包括聚合函数（count、sum、avg、max、min、percentile_approx）、数组操作（array_agg、collect_set）、连表函数（join、leftjoin、rightjoin）、时间函数、字符串操作以及窗口函数row_number()over()等。这些函数在大数据分析中起着关键作用，帮助处理和计算大规模数据。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

这篇文章主要记录个人做数据分析日常使用的sql函数，语法介绍的是presto与spark(语法同hive)两种常用的大数据sql，文章会持续更新，看的时候可以点击右下角第一个只显示目录，老铁们看着爽的话欢迎留言666。
两种sql的基础数据类型
在这里插入图片描述

聚合函数：

count：

统计个数,不会统计为null的数据

-- distinct 去重函数
select count(distinct member_id) as member_num from table

sum：

总和

select sum(amount) as all_amount from table

avg：

平均值

select avg(amount) as avg_amount from table

max：

最大值

select max(amount) as max_amount from table

min：

最小值

select min(amount) as min_amount from table

percentile_approx：

取百分位数

select percentile_approx(amount,0.75) as up_quartile from table

array_agg：

presto聚合为数组

select 
	member_id,
	-- array_distinct对数组进行去重操作
	array_distinct(array_agg(class)) as class 
from 
	table 
group by 
	member_id

collect_set：

spark聚合为数组

select 
	member_id,
	-- collect_set聚合数组已自动去重
	collect_set(class) as class 
from 
	table 
group by 
	member_id

连表函数

join，inner join

(这俩函数一样，join为inner join 缩写)
在这里插入图片描述

内联函数，当条件成立时右表和左表生成新的数据表，左表和右表仅保留符合条件数据。当左表，或右表连接条件不唯一时会出现重复数据。以下数据仅保留table1和table2，table3都具有的member_id。

select
  count(distinct t1.memebr_id) as member_num
from
  table t1
  join table2 t2 on t1.member_id = t2.member_id
  join table3 t3 on t1.member_id = t3.member_id

left join, right join

在这里插入图片描述

左连接，当条件成立时右表加入左表生成新的数据表。左表数据全部保留，右表数据当符合条件时保留下来加入左表，不符合条件则为空。当左表，或右表连接条件不唯一时会出现重复数据。以下数据member_num1为t1表内所有用户数，member_num2为table2中含有table1表中member_id的用户数

select
  count(distinct t1.memebr_id) as member_num1
  count(distinct t2.memebr_id) as member_num2
from
  table t1
  left join table2 t2 on t1.member_id = t2.member_id

union all

上下连接
连接条件：上表和下表表字段必须一致，以下返回table1和table2的数据

select
  member_id,
  amount
from
  table
union all
select
  member_id,
  amount2 as amount
from
  table2

时间函数

文本时间格式转化时间戳

presto：

select to_unixtime(cast('2018-09-05' as timestamp))

spark:

select to_unix_timestamp('2018-09-05' ，'yyyy-mm-dd')

时间戳转化文本时间格式

presto：


select format_datetime(from_unixtime(1234567890),'yyyy-MM-dd')

apark：

select from_unixtime(1234567890,'yyyy-MM-dd')

获取昨日字符日期

-- presto spark通用
select date_sub(current_date,1)

日期计算

后边日期减去前边日期

-- spark计算天数差距
select datediff('2022-02-03','2022-02-01')

-- presto计算天数差距
select date_diff('day',cast('2022-02-03' as date),cast('2022-02-01' as date))

--查看当日日期是当年的第几个星期，第几天week(当年第几个星期)，doy(当年第几天)，day(当月第几天)
SELECT extract(day FROM TIMESTAMP '2019-08-12 01:00:00.123456')

数组

创建数组

select array(1,2) -- saprk

查看数组长度

select cardinality(array) -- presto
select size(array) -- spark

数组排序

select array_sort(array) --presto排序
select sort_array(array,true) -- spark排序 true从小到大，false从大到小

查看数组第几个数据

select array[1]

数组合并

select array_union(array1,array2) --返回数组1和数组2去重后的数据

select array_except(array(1, 2, 3), array(1, 3, 5)) --返回在数组1不在数组2的数

判断元素是否在数组中

-- presto
select contains(x, element)

-- spark
select array_contains(x, element)

拆分数组

数组拆分的时候会从一行数据变为多行数据，下面展示一下拆分后的形式

memberid1	topic_name1
memberid1	topic_name2
memberid2	topic_name1
memberid2	topic_name2

-- presto
select
  member_id,
  CROSS JOIN unnest(topic_name_set) mt2 (topic_name)
from
  table

-- spark
select
  member_id,
  LATERAL VIEW explode(topic_name_set) t AS topic_name
from
  table

合并数组，从字符串变为字符串

-- presto文字聚合把每个memebr_id的content_text内容放在一个字段里
select 
	member_id,
	array_join(array_distinct(array_agg(content_text)), ',') as content_text
from 
	table
group by
	member_id

-- spark文字聚合把每个memebr_id的content_text内容放在一个字段里
select 
	member_id,
	concat_ws(',',collect_set(content_text)) as content_text
from 
	table
group by
	member_id

字符串操作

模糊匹配

like

-- 筛选出名字中带飞的学生 
select
  student_name,
  score
from
  table
where
  student_name like '%飞%'

not like

-- 筛选出名字中不带飞的学生 
select
  student_name,
  score
from
  table
where
  student_name not like '%飞%'

rlike

spark多条件模糊匹配

-- 筛选出名字中带叶、萧、王的学生 
select
  student_name,
  score
from
  table
where
  student_name rlike '叶|萧|王'

regexp_like

presto多条件模糊匹配

-- 筛选出名字中带叶、萧、王的学生 
select
  student_name,
  score
from
  table
where
  regexp_like(student_name,'叶|萧|王')

字符串分割

split

根据中间某一个或某一段字符进行切割，返回为数组形式的数据

-- 结果'a'
select split('a-b', '-')[1] as str

substr，substring

从第几位开始截取，截取几位

select substr('abcdefg',1,3) as a -- 结果'abc'
select substr('abcdefg',-3) as a -- 结果'efg'
select substr('abcdefg',3) as a -- 结果'cdefg'

字符串拼接

concat

把字符串拼接在一起

select concat('a','b','c')--结果abc
-- presto需要转化为字符串类型才能进行字符串拼接，spark不需要,cast转化数据格式
select concat(cast(34 as varchar),'%')--结果'34%'
-- 输出百分比格式小数，round向上取整到第几位
select concat(cast(round(0.3427887*100,2) as varchar),'%') as a

concat_ws

用第一个字符串把后边字符串拼接在一起

select concat_ws('-','2022','09','29') as p_date --结果'2022-09-29'

正则字符串操作

regexp_extract_all

返回所有符合匹配条件的结果

select regexp_extract('1a 2b 14m', '\d+') as a -- 结果[1, 2, 14]

regexp_extract

返回第一个符合匹配条件的结果

select regexp_extract('1a 2b 14m', '\d+') as a -- 结果'1'
select regexp_extract('1a 2b 14m', '(\d+)([a-z+])') as a -- 结果'1a'
select regexp_extract('1a 2b 14m', '(\d+)([a-z+])',2) as a -- 结果'a'
select regexp_extract('1a 2b 14m', '(1a )(.*?)( 14m)',2) as a -- 结果'2b'

regexp_replace

字符串替换

select regexp_replace('1a 2b 14m', '\d+[ab] ') as a --结果14m
select regexp_replace('1a 2b 14m', '(\d+)([ab]) ', '3c$2 ') as a -- 结果'3ca 3cb 14m'

常用函数

case when

判断函数，一般当需要判断的条件较多时使用，当只有两种情况时一般使用if

select
  case
    when a > 1000 then '大于1000'
    when a > 500 then '500到1000'
    when a > 300 then '300到500'
    else '小于300'
  end as price
from
  table

if

--找出分数大于60的学生有多少人
select 
	count(distinct if(score>60,member_id,null)) as student_num 
from 
	table

order by

排序函数，desc从大到小，asc默认从小到大

--筛选班级学生数，并按照从大到小的顺序排序
select
  class_name,
  count(distinct student_id) as student_num
from
  table
where
  p_date = date_sub(current_date, 1)
  and category = '语文'
group by
  class_name
order by
  student_num desc

取json格式里的数据json_extract

json_extract(json, json_path) 第一个参数位json格式的数据，第二个为需要取出的数据在该json数据的路径

select json_format(JSON '{"a": [1,2,3], "b": 2}') as a -- 创建json格式数据
select json_extract(json_format(JSON '{"a": [1,2,3], "b": 2}'),'$.a[0]') as a -- 结果1

窗口函数row_number() over()

row_number() over( partition by … order by …)函数主要为数据进行不聚合的排序，多用于复杂场景，比如用户第一次付款和第二次付款相隔时间；用户支付最大款项的时间，名字…多个数据时；

--用户第一次付款和第二次付款相隔时间
select
 member_id,
 t2.time - t1.time as gap_time
from
 (
   select
     member_id,
     time,
     paid_amount,
     row_number() over(
       partition by member_id
       order by
         time
     ) as paid_order
   from
     table
 ) t1
 left join (
   select
     member_id,
     time,
     paid_amount,
     row_number() over(
       partition by member_id
       order by
         time
     ) as paid_order
   from
     table
 ) t2 on t1.member_id = t2.member_id
where
 t1.paid_order = 1
 and t2.paid_order = 2

其他常见的排名窗口函数：

ROW_NUMBER函数可以为分区中的每行数据分配一个序列号，序列号从1开始。
RANK函数返回当前行在分区中的名次。如果存在名次相同的数据，后续的排名将会产生跳跃。
DENSE_RANK函数返回当前行在分区中的名次。即使存在名次相同的数据，后续的排名也是连续值。
PERCENT_RANK函数以百分比的形式返回当前行在分区中的名次。如果存在名次相同的数据，后续的排名将会产生跳跃。
CUME_DIST函数计算当前行在分区内的累积分布。
NTILE函数将分区内的数据分为N等份，并返回当前行所在的分片位置。
在这里插入图片描述

————————————————
原文链接：https://blog.csdn.net/liangmengbk/article/details/124256665

后言：

本文只展示了常用函数，未概其全，另外博主水平有限，在函数上如果有理解误差的地方欢迎评论指出。最后附铁子们两个地方：
presto：
https://prestodb.io/docs/current/index.html
spark：
https://spark.apache.org/docs/latest/api/sql/