nvl空字段赋值 nvl(comm, -1)
count(*) 与 sum(1) 等价,都统计全部行数
LATERAL VIEW udtf(expression) tableAlias AS columnAlias
split()切成后为数组
explode()将 hive 一列中复杂的 Array 或者 Map 结构拆分成多行。
CAST('1' AS INT) 字符串'1'转换成整数1;如果强制类型转换失败,如执行
CAST('X' AS INT),表达式返回空值 NULL
concat 拼接函数
collect_list 将某字段的值进行汇总,产生array类型字段。
collect_set 将某字段的值进行去重汇总,产生array类型字段。
concat_ws(separator,[string|array(string)]) separator是分隔符,数组按照分隔符进行拼接
CURRENT ROW:当前行
n PRECEDING:往前 n 行数据
n FOLLOWING:往后 n 行数据
UNBOUNDED:起点,
UNBOUNDED PRECEDING 表示从前面的起点,
UNBOUNDED FOLLOWING 表示到后面的终点
LAG(col,n,default_val):往前第 n 行数据
LEAD(col,n, default_val):往后第 n 行数据
NTILE(n):把有序窗口的行分发到指定数量的组中,各个组有编号,编号从 1 开始,对
于每一行,NTILE 返回此行所属的组的编号。注意:n 必须为 int 类型。
datediff(CURRENT_DATE(), "1990-06-04");
date_add(current_date(), 90);
current_date 返回当前日期
next_day 返回指定日期之后一周中特定的日期
date_format
last_day 获取每个月的最后一天
get_json_object
get_json_object(json_txt, path): 从一个JSON字符串中取出指定路径对应的数据!
核心:path怎么写?
$: 代表根对象
. : 获取子元素的操作符
[] : 获取一个数组中子元素的操作符
round( , ) 保留几位小数
-- Window-aggregation demo: seven SUM(cost) variants over the same rows,
-- differing only in partition / ordering / frame clause.
SELECT name,
       orderdate,
       cost,
       SUM(cost) OVER () AS sample1,                                                                                      -- grand total over all rows
       SUM(cost) OVER (PARTITION BY name) AS sample2,                                                                     -- total per name
       SUM(cost) OVER (PARTITION BY name ORDER BY orderdate) AS sample3,                                                  -- running total per name (default frame)
       SUM(cost) OVER (PARTITION BY name ORDER BY orderdate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS sample4, -- like sample3 with the frame spelled out (identical when orderdate has no ties)
       SUM(cost) OVER (PARTITION BY name ORDER BY orderdate ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS sample5,         -- previous row + current row
       SUM(cost) OVER (PARTITION BY name ORDER BY orderdate ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS sample6,         -- previous + current + next row
       SUM(cost) OVER (PARTITION BY name ORDER BY orderdate ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS sample7  -- current row through end of partition
FROM business;
1.最多连胜次数
透过题目看本质:开窗函数算是一个打标记,然后我通过打的标记,去找对应的规律。得出最终的结果。
题目解析:
2.连胜的最大天数
3.直播间访客峰值
4.相互关注
5.统计累积访问次数
需求:我们有如下的用户访问数据
要求:使用SQL统计出每个用户的累积访问次数,如下表所示:
-- Create the sandbox database idempotently so the whole script can be re-run
-- without a "database already exists" error.
CREATE DATABASE IF NOT EXISTS test_sql;
USE test_sql;
-- Problem 1: cumulative monthly visit counts per user.
-- IF NOT EXISTS keeps the DDL re-runnable; delimiter quoting aligned with
-- the test3 DDL elsewhere in this script (single quotes).
CREATE TABLE IF NOT EXISTS test_sql.test1 (
    userId     STRING,   -- user identifier, e.g. 'u01'
    visitDate  STRING,   -- visit date as unpadded 'yyyy/M/d' text (see fixture rows), not a DATE
    visitCount INT       -- visits on that date
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Fixture rows for test1. Dates are unpadded 'yyyy/M/d' strings;
-- the row ('u02', '2017/1/23', 6) appears twice in the data.
INSERT INTO TABLE test_sql.test1 VALUES
    ('u01', '2017/1/21', 5),
    ('u02', '2017/1/23', 6),
    ('u03', '2017/1/22', 8),
    ('u04', '2017/1/20', 3),
    ('u01', '2017/1/23', 6),
    ('u01', '2017/2/21', 8),
    ('u02', '2017/1/23', 6),
    ('u01', '2017/2/22', 4);
-- Cap shuffle parallelism for this tiny demo dataset (Spark SQL's default of
-- 200 shuffle partitions is wasteful here).
set spark.sql.shuffle.partitions=4;
-- Cumulative monthly visit counts per user.
-- BUG FIX: the original used substr(visitDate, 0, 6) to derive the month, but
-- visitDate is unpadded ('2017/1/21', '2017/11/21'), so the 6-char prefix is
-- '2017/1' for months 1, 10, 11 and 12 alike — they all collapse into one
-- bucket. Normalize '/' to '-' and format to an unambiguous 'yyyy-MM' instead.
SELECT *,
       SUM(sum1) OVER (PARTITION BY userId
                       ORDER BY visitMonth
                       ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS sum2  -- running total per user
FROM (SELECT userId,
             date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') AS visitMonth,
             SUM(visitCount) AS sum1  -- subtotal for that month
      FROM test1
      GROUP BY userId, date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM')) t
ORDER BY userId, visitMonth;
-- Optimization 1: month derived via separator-normalize + date_format, which
-- handles unpadded dates correctly. Restructured as a named CTE for clarity.
WITH monthly AS (
    -- per-user, per-month visit subtotal
    SELECT userId,
           date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') AS visitMonth,
           SUM(visitCount) AS sum1
    FROM test1
    GROUP BY userId, date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM')
)
SELECT *,
       SUM(sum1) OVER (PARTITION BY userId
                       ORDER BY visitMonth
                       ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS sum2  -- running total per user
FROM monthly
ORDER BY userId, visitMonth;
6.2017年11月的新客数
数据准备
-- Orders table for problem 6 (new customers in November 2017).
-- IF NOT EXISTS keeps the DDL re-runnable.
CREATE TABLE IF NOT EXISTS test_sql.test3 (
    dt       STRING,           -- order date as 'yyyy-MM-dd' text (see fixture rows)
    order_id STRING,
    user_id  STRING,
    amount   DECIMAL(10, 2)    -- exact decimal, appropriate for money
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Fixture row for test3 (one INSERT per row here; a single multi-row VALUES
-- list would batch these more efficiently).
INSERT INTO TABLE test_sql.test3 VALUES ('2017-01-01','10029028','1000003251',33.57);
INSERT INTO TABLE test_sql.test3 VALUES ('201