目录
1、如何快速生成多行的序列
-- Goal: produce one column holding 1, 2, 3, 4, 5 using nothing but a SELECT.
-- split() turns the literal into an array; explode() emits one row per element.
SELECT explode(split('1,2,3,4,5', ',')) AS num;

-- Goal: produce one column holding 1..100 (similar in spirit to Python's range()).
-- Spark SQL function reference: https://spark.apache.org/docs/3.1.2/api/sql/index.html
-- sequence(start, stop, step): arg 1 = first value, arg 2 = last value,
-- arg 3 = step (defaults to 1). Both ends are inclusive.
SELECT explode(sequence(1, 100)) AS num;
2、如何快速生成表数据
快速生成列
-- One column with the values 1, 2, 3, 4, 5.
SELECT explode(split('1,2,3,4,5', ',')) AS num;

-- One column with the values 1..100.
-- explode() is required here: sequence() on its own returns a single row that
-- holds the whole array, not one row per number.
SELECT explode(sequence(1, 100, 1)) AS num;

-- Start at 0 and step by 20: 0, 20, 40, 60, 80, 100.
SELECT explode(sequence(0, 100, 20)) AS num;
快速生成内容
-- Two rows by two columns: the first row holds '男' / 'M', the second '女' / 'F'.
-- stack(n, v1, v2, ...) spreads the listed values across n rows.
SELECT stack(2, '男', 'M', '女', 'F') AS (sex1, sex2);

-- Two rows by one column: 'M' and 'F'.
SELECT stack(2, 'M', 'F') AS sex;
永久视图
-- Permanent view: define (or redefine) t3 as a two-row, single-column result.
CREATE OR REPLACE VIEW t3 AS
SELECT stack(2, 'M', 'F') AS sex;

-- Read the view back.
SELECT *
FROM t3;
临时视图
-- Temporary view: define (or redefine) t4 as a two-row, single-column result.
CREATE OR REPLACE TEMPORARY VIEW t4 AS
SELECT stack(2, '男', '女') AS sex;

-- Read the view back.
SELECT *
FROM t4;
复制表
-- Create a table from a query result (CREATE TABLE ... AS SELECT)
-- inside the test_sql database.
USE test_sql;

CREATE TABLE t5 AS
SELECT stack(2, 'M', 'F') AS sex;

-- Read the new table back.
SELECT *
FROM t5;
缓存相关内容
-- Cache a query result under the name t6.
CACHE TABLE t6 AS
SELECT stack(2, 'M', 'F') AS sex;

-- Remove the cached data of one table.
UNCACHE TABLE t6;

-- Remove every cached entry.
CLEAR CACHE;
3、开窗函数
创建数据
-- Sample data for the window-function examples below:
-- one row per (cookie, dateStr) with a pv number.
CREATE OR REPLACE TEMPORARY VIEW t1 (cookie, dateStr, pv) AS
VALUES
    ('cookie1', '2022-10-05', 80),
    ('cookie1', '2022-10-06', 75),
    ('cookie1', '2022-10-07', 75),
    ('cookie1', '2022-10-08', 79),
    ('cookie1', '2022-10-09', 85),
    ('cookie1', '2022-10-10', 71),
    ('cookie2', '2022-10-05', 80),
    ('cookie2', '2022-10-06', 74),
    ('cookie2', '2022-10-07', 79),
    ('cookie2', '2022-10-08', 92),
    ('cookie2', '2022-10-09', 95);

-- Dump the fixture.
SELECT *
FROM t1;
-- Output:
-- cookie1,2022-10-05,80
-- cookie1,2022-10-06,75
-- cookie1,2022-10-07,75
-- cookie1,2022-10-08,79
-- cookie1,2022-10-09,85
-- cookie1,2022-10-10,71
-- cookie2,2022-10-05,80
-- cookie2,2022-10-06,74
-- cookie2,2022-10-07,79
-- cookie2,2022-10-08,92
-- cookie2,2022-10-09,95
排序函数
-- Ranking functions, numbering each cookie's rows from the highest pv down.
-- They differ only in how tied rows are numbered:
--   row_number -> 1,2,3,4    dense_rank -> 1,2,2,3    rank -> 1,2,2,4
SELECT
    *,
    row_number() OVER (PARTITION BY cookie ORDER BY pv DESC) AS rn
FROM t1;

SELECT
    *,
    dense_rank() OVER (PARTITION BY cookie ORDER BY pv DESC) AS rn
FROM t1;

SELECT
    *,
    rank() OVER (PARTITION BY cookie ORDER BY pv DESC) AS rn
FROM t1;
平分函数
ntile(N): 在进行打标记的时候, 会根据N将窗口内的数据划分为等份的N份, 每一份打上相同的标记
-- ntile(4): split each cookie's rows (highest pv first) into 4 buckets and tag
-- every row with its bucket number. dateStr breaks the tie between rows with
-- equal pv so the bucket assignment is repeatable.
SELECT
    *,
    ntile(4) OVER (PARTITION BY cookie ORDER BY pv DESC, dateStr) AS rn4
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-09,85,1
-- cookie1,2022-10-05,80,1
-- cookie1,2022-10-08,79,2
-- cookie1,2022-10-06,75,2
-- cookie1,2022-10-07,75,3
-- cookie1,2022-10-10,71,4
聚合函数
/*
第二类: 和聚合函数组合使用
可以通过窗口实现级联求各种值或者累计求各种值的操作:
当后续需要将当前行与之前或者之后的相关某几行放在一起计算的时候, 可以使用这个方案
说明:
1- 如果排序字段存在重复值, 默认会将重复的范围内全部数据级联计算在一起
2- 如果没有排序字段, 整个窗口会全部打开, 不管执行到哪一行, 都是针对整个窗口进行级联计算
3- 可以通过rows between方式来锁定窗口的范围:
N preceding : 表示往前的N行, N的取值可以是具体的数据, 也可以是关键词(unbounded(边界))
N following : 表示往后的N行, N的取值可以是具体的数据, 也可以是关键词(unbounded(边界))
current row : 当前行
*/
-- Cumulative sum per cookie, highest pv first, with no explicit frame.
-- Rows that tie on the ORDER BY value are accumulated together, which is why
-- both pv = 75 rows show 394 in the output below.
SELECT
    *,
    sum(pv) OVER (PARTITION BY cookie ORDER BY pv DESC) AS rn1
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-09,85,85
-- cookie1,2022-10-05,80,165
-- cookie1,2022-10-08,79,244
-- cookie1,2022-10-06,75,394
-- cookie1,2022-10-07,75,394
-- cookie1,2022-10-10,71,465
-- Running total from the first row of the window up to the current row.
-- ROWS frames count physical rows, so dateStr is added as a tie-breaker to
-- keep the order of equal pv values (and therefore the totals) repeatable.
SELECT
    *,
    sum(pv) OVER (
        PARTITION BY cookie
        ORDER BY pv, dateStr
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS rn2
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-10,71,71
-- cookie1,2022-10-06,75,146
-- cookie1,2022-10-07,75,221
-- cookie1,2022-10-08,79,300
-- cookie1,2022-10-05,80,380
-- cookie1,2022-10-09,85,465
-- Sum of the previous row, the current row and the next row.
-- dateStr is a tie-breaker so that the neighbours of rows with equal pv are
-- the same on every run.
SELECT
    *,
    sum(pv) OVER (
        PARTITION BY cookie
        ORDER BY pv, dateStr
        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
    ) AS rn3
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-10,71,146
-- cookie1,2022-10-06,75,221
-- cookie1,2022-10-07,75,229
-- cookie1,2022-10-08,79,234
-- cookie1,2022-10-05,80,244
-- cookie1,2022-10-09,85,165
-- Sum from the current row through the last row of the window.
-- dateStr is a tie-breaker so that rows with equal pv keep a repeatable order.
SELECT
    *,
    sum(pv) OVER (
        PARTITION BY cookie
        ORDER BY pv, dateStr
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS rn4
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-10,71,465
-- cookie1,2022-10-06,75,394
-- cookie1,2022-10-07,75,319
-- cookie1,2022-10-08,79,244
-- cookie1,2022-10-05,80,165
-- cookie1,2022-10-09,85,85
-- Sum over the entire window: every row's rn6 is the total pv of its cookie.
SELECT
    *,
    sum(pv) OVER (
        PARTITION BY cookie
        ORDER BY pv
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS rn6
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-10,71,465
-- cookie1,2022-10-06,75,465
-- cookie1,2022-10-07,75,465
-- cookie1,2022-10-08,79,465
-- cookie1,2022-10-05,80,465
-- cookie1,2022-10-09,85,465
向上向下窗口函数
/*
lag(字段, 往前第N行, 默认值): 可以实现将对应的字段的前N行的值和当前行放置到同一行中, 如果没有, 设置为默认值
lead(字段, 往后第N行, 默认值): 可以实现将对应的字段的后N行的值和当前行放置到同一行中, 如果没有, 设置为默认值
以上的两个分析函数, 必须带上排序操作
first_value(字段) : 用于将字段的第一行的值 和 每一行放置在一起
last_value(字段) : 用于将字段的最后一行的值 和 每一行放置在一起
注意:
last_value 如果带有排序字段, 默认窗口只开到当前行, 取到的就是当前行自己的值
解决方案:
方案一: 去除排序字段, 但是弊端没有排序操作, 可能最后一行不是你所想要的最后一行的值
方案二: 保留排序操作, 通过 rows between 强制打开窗口最大范围
rows between unbounded preceding and unbounded following
应用场景: 当我们需要将当前行和之前或者之后的某一行进行计算操作的时候, 需要通过这类分析函数, 将其放置到同一行进行处理
比如说: 转换率计算
*/
-- lag(pv, 2, 0): put the pv from two rows earlier (by dateStr, per cookie)
-- next to the current row; 0 is used when no such row exists.
SELECT
    *,
    lag(pv, 2, 0) OVER (PARTITION BY cookie ORDER BY dateStr) AS rn1
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-05,80,0
-- cookie1,2022-10-06,75,0
-- cookie1,2022-10-07,75,80
-- cookie1,2022-10-08,79,75
-- cookie1,2022-10-09,85,75
-- cookie1,2022-10-10,71,79
-- lead(pv, 2, 0): put the pv from two rows later (by dateStr, per cookie)
-- next to the current row; 0 is used when no such row exists.
SELECT
    *,
    lead(pv, 2, 0) OVER (PARTITION BY cookie ORDER BY dateStr) AS rn2
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-05,80,75
-- cookie1,2022-10-06,75,79
-- cookie1,2022-10-07,75,85
-- cookie1,2022-10-08,79,71
-- cookie1,2022-10-09,85,0
-- cookie1,2022-10-10,71,0
-- first_value(pv): attach the pv of the first row of the window to every row.
-- The window is ordered by dateStr descending, so the first row is the latest date.
SELECT
    *,
    first_value(pv) OVER (PARTITION BY cookie ORDER BY dateStr DESC) AS rn3
FROM t1;
-- Output for cookie1:
-- cookie1,2022-10-10,71,71
-- cookie1,2022-10-09,85,71
-- cookie1,2022-10-08,79,71
-- cookie1,2022-10-07,75,71
-- cookie1,2022-10-06,75,71
-- cookie1,2022-10-05,80,71
-- last_value(pv) with an ORDER BY and no explicit frame: rn4 ends up equal to
-- the current row's own pv, so the result is not useful.
SELECT
    *,
    last_value(pv) OVER (PARTITION BY cookie ORDER BY dateStr) AS rn4
FROM t1;

-- Fix: either drop the ORDER BY, or keep it and widen the frame to the whole window.
-- Here the ordering is kept and the frame is opened from UNBOUNDED PRECEDING to
-- UNBOUNDED FOLLOWING, so rn5 is the pv of the window's last row.
SELECT
    *,
    last_value(pv) OVER (
        PARTITION BY cookie
        ORDER BY dateStr
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS rn5
FROM t1;