1、按行数滑动
时间窗口函数over(partition by ... order by ... rows between .. and ..)
hive之窗口函数理解与实践_luffy的博客-CSDN博客
案例:取每条数据 对应的 指定行范围内的数据集
-- 1.1 获取每条数据的比较对象
-- 【重点】比较对象限定在:同一发生区域city_code、同一事件类型sjlbdm、同一月内
-- 【思路】比较对象以map<string,string>类型存储在hive表,其中key-事件唯一标识,value-事件内容
-- 【实现】str_to_map函数、collect_list函数、窗口函数
-- 【注意点】str_to_map函数是根据英文逗号来识别一个key-value对的,所以要先处理掉文本中可能存在的英文逗号,否则会出现null。
-- 【函数】collect_list(xx) 把xx组合成list。如果后接时间窗口函数over(partition by ... order by ... rows between .. and ..),则组合的xx来源时间窗口内的数据;否则,xx取全表数据。
-- 1.1 Rebuild issues_map_3d: for every issue row, attach a map of the other
-- issues it should be compared against.
-- Comparison scope (window partition): same city_code, same event type
-- sjlbdm, same calendar month of fsrq_rq.
-- Frame "1 following .. unbounded following" (ordered by xxzjbh): only rows
-- AFTER the current one enter the map, so a row never contains itself and
-- each unordered pair of issues is materialised exactly once.
-- contentmap entries: key = xxzjbh (unique event id), value = sjqk_jyqk with
-- ',' removed, because str_to_map splits entries on ',' and key/value on ':'.
-- NOTE(review): a ':' inside sjqk_jyqk would also confuse str_to_map — only
-- commas are stripped here; confirm the event text cannot contain ':'.
drop table if exists issues_map_3d;
create table issues_map_3d as
select
a.xxzjbh,
a.sjqk_jyqk,
a.sjbt,
a.city_code,
a.sjlbdm,
from_unixtime(unix_timestamp(a.fsrq_rq,'yyyy-MM-dd HH:mm:ss'),'yyyy-MM-dd') as fsday,
from_unixtime(unix_timestamp(a.fsrq_rq,'yyyy-MM-dd HH:mm:ss'),'yyyy-MM') as fsmonth,
str_to_map(
concat_ws(",",collect_list(concat_ws(':', cast(xxzjbh as string), regexp_replace(sjqk_jyqk, ',', '')))
over(
partition by city_code, sjlbdm, from_unixtime(unix_timestamp(a.fsrq_rq,'yyyy-MM-dd HH:mm:ss'),'yyyy-MM')
order by xxzjbh rows between 1 following and unbounded following)
)
) as contentmap
from issues a;
【以下是拓展:使用udtf接收hive的map<String,String>类型数据。如果只看时间窗口函数,可以跳过1.2】
-- 1.2【自定义的udtf函数说明】java 类中用LazyMap接收hive的map<String,String>类型数据
-- step1、编写sql,内容如下:
use jiangsu;
-- Disable vectorized query execution.
-- NOTE(review): presumably required because the LazyMap-based UDTF below
-- cannot consume vectorized batches — confirm against the UDTF implementation.
set hive.vectorized.execution.enabled=false;
set hive.vectorized.execution.reduce.enabled=false;
-- Pull the UDTF jar from HDFS and register it for this session only
-- ("temporary function" lives until the session ends).
add jar hdfs://tqHadoopCluster/cf/validate-and-count-lazymap-result-0.0.1-bin.jar;
create temporary function checkudf_lazymap as 'com.tianque.IssueCheckLazyMapUDTF';
-- 1.2 Rebuild issues_check_lazymap_3d: run the similarity UDTF over each
-- row's candidate map twice — threshold 0.95 feeds the "is same" columns
-- (t1), threshold 0.80 feeds the "is similar" columns (t2).
-- checkudf_lazymap(content, candidate_map, threshold) emits columns named
-- check_result and check_ids via its own output schema, which is why the
-- lateral views carry no explicit "AS col1, col2" alias list.
drop table if exists issues_check_lazymap_3d;
create table issues_check_lazymap_3d as
select
a.xxzjbh as xxzjbh,
a.sjqk_jyqk as sjqk_jyqk,
a.sjbt as sjbt,
a.city_code as city_code,
a.sjlbdm as sjlbdm,
a.fsday as fsday,
a.fsmonth as fsmonth,
t1.check_result as issame,
t1.check_ids as sameids,
t2.check_result as issimilarity,
t2.check_ids as similarityids
from issues_map_3d a
lateral view checkudf_lazymap(sjqk_jyqk, contentmap, 0.95) t1
lateral view checkudf_lazymap(sjqk_jyqk, contentmap, 0.80) t2;
2、 按时间滑动
Hive实现时间滑动窗口计算_一个写湿的程序猿的博客-CSDN博客_hive滑动窗口
案例:取每条数据 对应的 近7天内的数据集
备注:案例用inner join实现自身关联。当某条数据在其时间窗口内没有其他可匹配的数据时,join on条件中限定a.id!=b.id会把该条数据整条消去(可见下面(1)、(2)的结果中都没有a.id=11的数据)。如果想保留这种数据,可以使用left join。
-- 测试: 取每条数据对应的前7天数据
-- Test fixture table for the 7-day sliding-window queries below.
-- createdate: event timestamp kept as a string ('yyyy-MM-dd HH:mm:ss' shape);
-- day: the calendar date actually used by the datediff window arithmetic.
drop table if exists test;
create table test(
id int,
createdate string,
day date
)
row format delimited fields terminated by '\001';
-- Fixture rows: ids 1-11 spanning 2022-07-26 .. 2022-08-18.
insert into test values (1, '2022-08-01 20:33:10', '2022-08-01');
insert into test values (2, '2022-08-02 05:34:10', '2022-08-02');
insert into test values (3, '2022-08-07 13:11:26', '2022-08-07');
-- NOTE(review): '35:26:26' is not a valid time of day (likely a typo); kept
-- unchanged because the recorded result sets below contain this exact value.
insert into test values (4, '2022-08-08 35:26:26', '2022-08-08');
insert into test values (5, '2022-08-06 08:56:11', '2022-08-06');
insert into test values (6, '2022-08-10 08:56:11', '2022-08-10');
insert into test values (7, '2022-08-15 08:56:11', '2022-08-15');
insert into test values (8, '2022-08-12 08:56:11', '2022-08-12');
insert into test values (9, '2022-08-10 11:13:12', '2022-08-10');
insert into test values (10, '2022-08-18 11:13:12', '2022-08-18');
-- id=11 is the earliest row, so it has no preceding rows in its own window;
-- the inner-join queries below never show it on the left side (see remark above).
insert into test values (11, '2022-07-26 11:13:12', '2022-07-26');
-- (1)取t1中每条数据对应的前7天数据,且不包含自身(id是主键)
-- (1) Self-join: pair each row with every other row dated 0-6 days earlier
--     (same day included); the id inequality keeps the row out of its own set.
SELECT t1.id, t1.createdate, t2.id, t2.createdate
FROM test t1
INNER JOIN test t2
    ON t1.id <> t2.id
   AND datediff(t1.day, t2.day) BETWEEN 0 AND 6;
-- 结果
-- +--------+----------------------+--------+----------------------+
-- | t1.id | t1.createdate | t2.id | t2.createdate |
-- +--------+----------------------+--------+----------------------+
-- | 3 | 2022-08-07 13:11:26 | 1 | 2022-08-01 20:33:10 |
-- | 3 | 2022-08-07 13:11:26 | 2 | 2022-08-02 05:34:10 |
-- | 3 | 2022-08-07 13:11:26 | 5 | 2022-08-06 08:56:11 |
-- | 10 | 2022-08-18 11:13:12 | 7 | 2022-08-15 08:56:11 |
-- | 10 | 2022-08-18 11:13:12 | 8 | 2022-08-12 08:56:11 |
-- | 6 | 2022-08-10 08:56:11 | 3 | 2022-08-07 13:11:26 |
-- | 6 | 2022-08-10 08:56:11 | 9 | 2022-08-10 11:13:12 |
-- | 6 | 2022-08-10 08:56:11 | 4 | 2022-08-08 35:26:26 |
-- | 6 | 2022-08-10 08:56:11 | 5 | 2022-08-06 08:56:11 |
-- | 5 | 2022-08-06 08:56:11 | 1 | 2022-08-01 20:33:10 |
-- | 5 | 2022-08-06 08:56:11 | 2 | 2022-08-02 05:34:10 |
-- | 4 | 2022-08-08 35:26:26 | 3 | 2022-08-07 13:11:26 |
-- | 4 | 2022-08-08 35:26:26 | 2 | 2022-08-02 05:34:10 |
-- | 4 | 2022-08-08 35:26:26 | 5 | 2022-08-06 08:56:11 |
-- | 8 | 2022-08-12 08:56:11 | 3 | 2022-08-07 13:11:26 |
-- | 8 | 2022-08-12 08:56:11 | 9 | 2022-08-10 11:13:12 |
-- | 8 | 2022-08-12 08:56:11 | 4 | 2022-08-08 35:26:26 |
-- | 8 | 2022-08-12 08:56:11 | 6 | 2022-08-10 08:56:11 |
-- | 8 | 2022-08-12 08:56:11 | 5 | 2022-08-06 08:56:11 |
-- | 2 | 2022-08-02 05:34:10 | 1 | 2022-08-01 20:33:10 |
-- | 7 | 2022-08-15 08:56:11 | 9 | 2022-08-10 11:13:12 |
-- | 7 | 2022-08-15 08:56:11 | 8 | 2022-08-12 08:56:11 |
-- | 7 | 2022-08-15 08:56:11 | 6 | 2022-08-10 08:56:11 |
-- | 9 | 2022-08-10 11:13:12 | 3 | 2022-08-07 13:11:26 |
-- | 9 | 2022-08-10 11:13:12 | 4 | 2022-08-08 35:26:26 |
-- | 9 | 2022-08-10 11:13:12 | 6 | 2022-08-10 08:56:11 |
-- | 9 | 2022-08-10 11:13:12 | 5 | 2022-08-06 08:56:11 |
-- | 1 | 2022-08-01 20:33:10 | 11 | 2022-07-26 11:13:12 |
-- +--------+----------------------+--------+----------------------+
-- (2)取t1中每条数据对应的前7天数据,且不包含自身(id是主键),同时同一天的事件时间 不晚于 当前事件时间
-- (2) Same 0-6 day window as (1), additionally requiring the matched event
--     to be no later than the current one; createdate is a zero-padded
--     'yyyy-MM-dd HH:mm:ss' string, so lexicographic >= matches time order.
SELECT t1.id, t1.createdate, t2.id, t2.createdate
FROM test t1
INNER JOIN test t2
    ON t1.id <> t2.id
   AND datediff(t1.day, t2.day) BETWEEN 0 AND 6
   AND t1.createdate >= t2.createdate;
-- 1 2022-08-01 20:33:10 11 2022-07-26 11:13:12
-- 2 2022-08-02 05:34:10 1 2022-08-01 20:33:10
-- 3 2022-08-07 13:11:26 1 2022-08-01 20:33:10
-- 3 2022-08-07 13:11:26 2 2022-08-02 05:34:10
-- 3 2022-08-07 13:11:26 5 2022-08-06 08:56:11
-- 4 2022-08-08 35:26:26 2 2022-08-02 05:34:10
-- 4 2022-08-08 35:26:26 3 2022-08-07 13:11:26
-- 4 2022-08-08 35:26:26 5 2022-08-06 08:56:11
-- 5 2022-08-06 08:56:11 1 2022-08-01 20:33:10
-- 5 2022-08-06 08:56:11 2 2022-08-02 05:34:10
-- 6 2022-08-10 08:56:11 3 2022-08-07 13:11:26
-- 6 2022-08-10 08:56:11 4 2022-08-08 35:26:26
-- 6 2022-08-10 08:56:11 5 2022-08-06 08:56:11
-- 7 2022-08-15 08:56:11 6 2022-08-10 08:56:11
-- 7 2022-08-15 08:56:11 8 2022-08-12 08:56:11
-- 7 2022-08-15 08:56:11 9 2022-08-10 11:13:12
-- 8 2022-08-12 08:56:11 3 2022-08-07 13:11:26
-- 8 2022-08-12 08:56:11 4 2022-08-08 35:26:26
-- 8 2022-08-12 08:56:11 5 2022-08-06 08:56:11
-- 8 2022-08-12 08:56:11 6 2022-08-10 08:56:11
-- 8 2022-08-12 08:56:11 9 2022-08-10 11:13:12
-- 9 2022-08-10 11:13:12 3 2022-08-07 13:11:26
-- 9 2022-08-10 11:13:12 4 2022-08-08 35:26:26
-- 9 2022-08-10 11:13:12 5 2022-08-06 08:56:11
-- 9 2022-08-10 11:13:12 6 2022-08-10 08:56:11
-- 10 2022-08-18 11:13:12 7 2022-08-15 08:56:11
-- 10 2022-08-18 11:13:12 8 2022-08-12 08:56:11
【以下是把数据行转列,可以不看】
-- (3)把每条数据 近7天 的数据用map组装成一行
-- (3) Collapse each row's trailing-7-day neighbours into one
--     map<done_id, done_createdate> column, one output row per source row.
-- left join (not inner) so a row with no neighbours still survives — see the
-- remark above the fixture data about keeping id=11.
-- NOTE(review): rows with no neighbours produce NULL done_id/done_createdate;
-- concat_ws/str_to_map then yield an empty-string entry — verify downstream
-- code tolerates the resulting map.
-- NOTE(review): /*+ repartition(240) */ is Spark SQL hint syntax; plain Hive
-- ignores it — confirm which engine runs this script.
drop table if exists test_3d;
create table test_3d as
select /*+ repartition(240) */
id,
createdate,
str_to_map(
-- ',' is stripped from values because str_to_map splits entries on ','
-- and key/value on ':'
concat_ws(",",collect_list(concat_ws(':', cast(done_id as string), regexp_replace(done_createdate, ',', ''))))
) as contentmap
from
(
-- pair every row with all other rows dated 0..6 days before it
select t1.id,t1.createdate,t2.id as done_id,t2.createdate as done_createdate
from
test t1
left join
test t2
on datediff(t1.day, t2.day) between 0 and 6 and t1.id!=t2.id
) t
group by t.id,t.createdate;
3、切片函数 ntile
hive之切片排序窗口函数_南枫&的博客-CSDN博客_hive 切片
案例:把数据分成指定数量的分片。
-- step1: 把tq_yybj_issue表中的数据分成240个分片,每个分片的数量 ≈ tq_yybj_issue表总数据量/240(ntile在不能整除时,靠前的分片会多1条)。同分片的数据会被标记相同的分片编码group_num。
-- step1: slice tq_yybj_issue into 240 roughly equal buckets ordered by ajbh;
--        rows in the same bucket share the same group_num tag.
DROP TABLE IF EXISTS tq_yybj_group_issue;
CREATE TABLE tq_yybj_group_issue AS
SELECT
    ajbh,
    ajxq,
    ntile(240) OVER (ORDER BY ajbh) AS group_num
FROM tq_yybj_issue;
-- step2: 按分组组合数据
-- step2: fold each bucket's rows into a single map (ajbh -> ajxq) column.
SELECT
    str_to_map(
        concat_ws(",", collect_list(concat_ws(':', ajbh, ajxq)))
    ) AS content_group_map
FROM tq_yybj_group_issue
GROUP BY group_num;
拓展:
从窗口中获取的数据集可以结合算法udf做 相似算法比较、算法的批量预测等等,应用代码可见spark sql获取指定区间的数据集_风路丞的博客-CSDN博客