1、HQL每天购课总额前三的用户
原始数据
with tbl_order as
(
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 2 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 3 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 4 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
)
select * from tbl_order;
2019-01-01 1 数学 50
2019-01-01 2 数学 50
2019-01-01 3 数学 50
2019-01-01 4 数学 50
2019-01-01 1 数学 50
2019-01-01 1 数学 50
2019-01-01 1 数学 50
2019-01-01 1 数学 50
Time taken: 0.156 seconds, Fetched 8 row(s)
示例
with tbl_order as
(
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 2 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 3 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 4 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
union all
select '2019-01-01' as dt, 1 as uid, '数学' as course_id, 50 as price
)
select dt, uid,total_price,rk
from(
select dt, uid, sum(price) as total_price, rank() over (partition by dt order by sum(price) desc) as rk
from tbl_order
group by dt,uid
) a
WHERE rk < 4;
2019-01-01 1 250 1
2019-01-01 4 50 2
2019-01-01 3 50 2
2019-01-01 2 50 2
Time taken: 1.433 seconds, Fetched 4 row(s)
2、HQL实现每日订单量和未填地址订单量以及未填地址订单量N日后的变化
参考:https://blog.51cto.com/u_10120275/2482148
with order_tbl as
(
-- 20200320分区
select 1 as order_id, '天津' as address, 20200319 as trade_time, 20200320 as dt union all
select 2 as order_id, null as address, 20200319 as trade_time, 20200320 as dt union all
select 3 as order_id, '北京' as address, 20200320 as trade_time, 20200320 as dt union all
select 4 as order_id, '北京' as address, 20200320 as trade_time, 20200320 as dt union all
select 5 as order_id, null as address, 20200320 as trade_time, 20200320 as dt union all
select 6 as order_id, null as address, 20200320 as trade_time, 20200320 as dt union all
select 7 as order_id, null as address, 20200320 as trade_time, 20200320 as dt union all
-- 20200321分区
select 1 as order_id, '天津' as address, 20200319 as trade_time, 20200321 as dt union all
select 2 as order_id, '天津' as address, 20200319 as trade_time, 20200321 as dt union all
select 3 as order_id, '北京' as address, 20200320 as trade_time, 20200321 as dt union all
select 4 as order_id, '北京' as address, 20200320 as trade_time, 20200321 as dt union all
select 5 as order_id, '北京' as address, 20200320 as trade_time, 20200321 as dt union all
select 6 as order_id, null as address, 20200320 as trade_time, 20200321 as dt union all
select 7 as order_id, null as address, 20200320 as trade_time, 20200321 as dt union all
select 8 as order_id, '北京' as address, 20200321 as trade_time, 20200321 as dt union all
select 9 as order_id, null as address, 20200321 as trade_time, 20200321 as dt union all
select 10 as order_id, null as address, 20200321 as trade_time, 20200321 as dt union all
-- 20200322分区
select 1 as order_id, '天津' as address, 20200319 as trade_time, 20200322 as dt union all
select 2 as order_id, '天津' as address, 20200319 as trade_time, 20200322 as dt union all
select 3 as order_id, '北京' as address, 20200320 as trade_time, 20200322 as dt union all
select 4 as order_id, '北京' as address, 20200320 as trade_time, 20200322 as dt union all
select 5 as order_id, '北京' as address, 20200320 as trade_time, 20200322 as dt union all
select 6 as order_id, '北京' as address, 20200320 as trade_time, 20200322 as dt union all
select 7 as order_id, null as address, 20200320 as trade_time, 20200322 as dt union all
select 8 as order_id, '北京' as address, 20200321 as trade_time, 20200322 as dt union all
select 9 as order_id, null as address, 20200321 as trade_time, 20200322 as dt union all
select 10 as order_id, null as address, 20200321 as trade_time, 20200322 as dt union all
select 11 as order_id, '北京' as address, 20200322 as trade_time, 20200322 as dt union all
select 12 as order_id, null as address, 20200322 as trade_time, 20200322 as dt union all
select 13 as order_id, null as address, 20200322 as trade_time, 20200322 as dt
)
select trade_time, order_cnt, 1_day_later, 2_day_later, 3_day_later
from
(
select
trade_time,
order_cnt,
no_address_cnt,
LEAD(no_address_cnt, 1, 0 ) over (partition by trade_time order by dt asc) as 1_day_later,
LEAD(no_address_cnt, 2, 0 ) over (partition by trade_time order by dt asc) as 2_day_later,
LEAD(no_address_cnt, 3, 0 ) over (partition by trade_time order by dt asc) as 3_day_later,
row_number () over(partition by trade_time ORDER BY dt) as rn
from
(
select dt, trade_time, count(1) as order_cnt, sum(case when address is null then 1 else 0 end) as no_address_cnt from order_tbl group by dt, trade_time
) a
) b
where rn = 1
order by trade_time desc
trade_time order_cnt 1_day_later 2_day_later 3_day_later
20200322 3 0 0 0
20200321 3 2 0 0
20200320 5 2 1 0
20200319 2 0 0 0
Time taken: 3.778 seconds, Fetched 4 row(s)
3、HQL实现用户访问记录表,即访问A又访问B的用户数
参考:https://blog.51cto.com/u_10120275/2482648
3.1、方法一
with user_visit as
(
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'B' as url union all
select 1 as user_id, 'C' as url union all
select 2 as user_id, 'B' as url union all
select 2 as user_id, 'B' as url union all
select 3 as user_id, 'A' as url union all
select 4 as user_id, 'A' as url union all
select 4 as user_id, 'B' as url union all
select 5 as user_id, 'C' as url union all
select 6 as user_id, 'A' as url union all
select 6 as user_id, 'B' as url
)
--即访问A,又访问B页面的用户数
select count(user_id)
from
(
select user_id, collect_set(url) as url_set
from user_visit
where url = 'A' or url = 'B'
group by user_id
) a
where size(url_set) = 2;
3
Time taken: 0.61 seconds, Fetched 1 row(s)
3.1、方法二
spark-SQL执行示例
with user_visit as
(
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'B' as url union all
select 1 as user_id, 'C' as url union all
select 2 as user_id, 'B' as url union all
select 2 as user_id, 'B' as url union all
select 3 as user_id, 'A' as url union all
select 4 as user_id, 'A' as url union all
select 4 as user_id, 'B' as url union all
select 5 as user_id, 'C' as url union all
select 6 as user_id, 'A' as url union all
select 6 as user_id, 'B' as url
)
--即访问A,又访问B页面的用户数
select count(a.user_id)
from
(
select distinct user_id, url from user_visit where url = 'A'
) a
join
(
select distinct user_id, url from user_visit where url = 'B'
) b;
用上面SQL初试小刀,笛卡尔积报错如下:
2021-04-16 20:10:34,513 INFO internal.SessionState: username : s_xiaojin
Error in query: Detected implicit cartesian product for INNER join between logical plans
Aggregate [user_id#739, url#740]
+- Union
:- Project [1 AS user_id#739, A AS url#740]
: +- OneRowRelation
:- Project [1 AS user_id#741, A AS url#742]
: +- OneRowRelation
:- LocalRelation <empty>, [user_id#743, url#744]
:- LocalRelation <empty>, [user_id#745, url#746]
:- LocalRelation <empty>, [user_id#747, url#748]
:- LocalRelation <empty>, [user_id#749, url#750]
:- Project [3 AS user_id#751, A AS url#752]
: +- OneRowRelation
:- Project [4 AS user_id#753, A AS url#754]
: +- OneRowRelation
:- LocalRelation <empty>, [user_id#755, url#756]
:- LocalRelation <empty>, [user_id#757, url#758]
:- Project [6 AS user_id#759, A AS url#760]
: +- OneRowRelation
+- LocalRelation <empty>, [user_id#761, url#762]
and
Aggregate [user_id#763, url#764]
+- Union
:- LocalRelation <empty>, [user_id#763, url#764]
:- LocalRelation <empty>, [user_id#741, url#742]
:- Project [1 AS user_id#743, B AS url#744]
: +- OneRowRelation
:- LocalRelation <empty>, [user_id#745, url#746]
:- Project [2 AS user_id#747, B AS url#748]
: +- OneRowRelation
:- Project [2 AS user_id#749, B AS url#750]
: +- OneRowRelation
:- LocalRelation <empty>, [user_id#751, url#752]
:- LocalRelation <empty>, [user_id#753, url#754]
:- Project [4 AS user_id#755, B AS url#756]
: +- OneRowRelation
:- LocalRelation <empty>, [user_id#757, url#758]
:- LocalRelation <empty>, [user_id#759, url#760]
+- Project [6 AS user_id#761, B AS url#762]
+- OneRowRelation
Join condition is missing or trivial.
Either: use the CROSS JOIN syntax to allow cartesian products between these
relations, or: enable implicit cartesian products by setting the configuration
variable spark.sql.crossJoin.enabled=true;
正确示例
方式一、
with user_visit as
(
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'B' as url union all
select 1 as user_id, 'C' as url union all
select 2 as user_id, 'B' as url union all
select 2 as user_id, 'B' as url union all
select 3 as user_id, 'A' as url union all
select 4 as user_id, 'A' as url union all
select 4 as user_id, 'B' as url union all
select 5 as user_id, 'C' as url union all
select 6 as user_id, 'A' as url union all
select 6 as user_id, 'B' as url
)
--即访问A,又访问B页面的用户数
select count(distinct a.user_id)
from
(
select distinct user_id, url from user_visit where url = 'A'
) a
CROSS JOIN
(
select distinct user_id, url from user_visit where url = 'B'
) b
on a.user_id=b.user_id;
3
Time taken: 1.946 seconds, Fetched 1 row(s)
方式二、
set spark.sql.crossJoin.enabled=true;
with user_visit as
(
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'B' as url union all
select 1 as user_id, 'C' as url union all
select 2 as user_id, 'B' as url union all
select 2 as user_id, 'B' as url union all
select 3 as user_id, 'A' as url union all
select 4 as user_id, 'A' as url union all
select 4 as user_id, 'B' as url union all
select 5 as user_id, 'C' as url union all
select 6 as user_id, 'A' as url union all
select 6 as user_id, 'B' as url
)
--即访问A,又访问B页面的用户数
select count(distinct a.user_id)
from
(
select distinct user_id, url from user_visit where url = 'A'
) a
JOIN
(
select distinct user_id, url from user_visit where url = 'B'
) b
on a.user_id=b.user_id;
3
Time taken: 1.768 seconds, Fetched 1 row(s)
hive 执行示例
Driver: Hive JDBC (version 1.2.2-mdh2.0.2-SNAPSHOT)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 1.2.2-mdh2.0.2-SNAPSHOT by Apache Hive
0: jdbc:hive2://xxx.hive.hadooppdt> set hive.strict.checks.cartesian.product
0: jdbc:hive2://xxx.hive.hadooppdt> ;
+----------------------------------------------------+--+
| set |
+----------------------------------------------------+--+
| hive.strict.checks.cartesian.product is undefined |
+----------------------------------------------------+--+
1 row selected (0.079 seconds)
0: jdbc:hive2://xxx.hive.hadooppdt> set hive.strict.checks.cartesian.product=false;
No rows affected (0.002 seconds)
0: jdbc:hive2://xxx.hive.hadooppdt> set hive.strict.checks.cartesian.product
0: jdbc:hive2://xxx.hive.hadooppdt> ;
+---------------------------------------------+--+
| set |
+---------------------------------------------+--+
| hive.strict.checks.cartesian.product=false |
+---------------------------------------------+--+
1 row selected (0.003 seconds)
set hive.strict.checks.cartesian.product=false;
with user_visit as
(
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'B' as url union all
select 1 as user_id, 'C' as url union all
select 2 as user_id, 'B' as url union all
select 2 as user_id, 'B' as url union all
select 3 as user_id, 'A' as url union all
select 4 as user_id, 'A' as url union all
select 4 as user_id, 'B' as url union all
select 5 as user_id, 'C' as url union all
select 1 as user_id, 'A' as url
)
--即访问A,又访问B页面的用户数
select count(a.user_id)
from
(
select distinct user_id, url from user_visit where url = 'A'
) a
join
(
select distinct user_id, url from user_visit where url = 'B'
) b;
+------+--+
| _c0 |
+------+--+
| 9 |
+------+--+
1 row selected (170.511 seconds)
正确示例
set hive.strict.checks.cartesian.product=false;
with user_visit as
(
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'A' as url union all
select 1 as user_id, 'B' as url union all
select 1 as user_id, 'C' as url union all
select 2 as user_id, 'B' as url union all
select 2 as user_id, 'B' as url union all
select 3 as user_id, 'A' as url union all
select 4 as user_id, 'A' as url union all
select 4 as user_id, 'B' as url union all
select 5 as user_id, 'C' as url union all
select 1 as user_id, 'A' as url
)
--即访问A,又访问B页面的用户数
select count(distinct a.user_id)
from
(
select distinct user_id, url from user_visit where url = 'A'
) a
join
(
select distinct user_id, url from user_visit where url = 'B'
) b;
+------+--+
| _c0 |
+------+--+
| 3 |
+------+--+
1 row selected (194.018 seconds)