1、用户表操作如下:
log_action:
uid time action
1 2019-09-07 12:22:23 read
1 2019-09-07 12:23:23 write
1 2019-09-07 12:26:23 like
1 2019-09-07 12:20:23 share
3 2019-09-07 12:28:23 like
3 2019-09-07 12:29:00 read
3 2019-09-07 12:32:33 comment
4 2019-09-07 12:22:23 read
4 2019-09-07 12:16:18 like
使用hql语句,找到每一个用户在表中的最后一次行为?
select
uid,time,action
from
(
select
uid,time,action,
row_number() over(distribute by uid sort by time desc) rn
from log_action
) a
where rn = 1
;
-- last_value
2、
words表:
uid contents
i love china china china
china is good and i i like love
使用hql语句实现词频的top3统计
word cnt
china 4
i 3
love 2
SELECT
word,
count(1) cnt
FROM words
LATERAL VIEW EXPLODE(split(contents,' ')) t as word
group by word
order by count(1) desc
limit 3
;
-- row_number
3、有如下数据:(建表语句+sql查询)
id names
1 aa,bb,cc,dd,ee
2 aa,bb,ff,ww,qq
3 aa,cc,rr,yy
4 aa,bb,dd,oo,pp
求英雄的出场排名top3的出场次数及出场率
SELECT
name,
cnt,
cnt / scnt lv
FROM
(
SELECT
name,
cnt,
rank() over(distribute by name sort by cnt desc) rn,
sum(cnt) over() scnt
FROM
(
SELECT
name,
count(1) cnt
FROM heros
LATERAL VIEW EXPLODE(SPLIT(names,',')) t as name
group by name
) a
) b
WHERE rn < 4
;
4、有如下数据:
PLATFORM USER_ID CLICK_TIME CLICK_URL
WEB 12332321 2013-03-21 13:48:31.324 /home/
WEB 12332321 2013-03-21 13:48:32.954 /selectcat/er/
WEB 12332321 2013-03-21 13:48:46.365 /er/viewad/12.html
WEB 12332321 2013-03-21 13:48:53.651 /er/viewad/13.html
WEB 12332321 2013-03-21 13:49:13.435 /er/viewad/24.html
WEB 12332321 2013-03-21 13:49:35.876 /selectcat/che/
WEB 12332321 2013-03-21 13:49:56.398 /che/viewad/93.html
WEB 12332321 2013-03-21 13:50:03.143 /che/viewad/10.html
WEB 12332321 2013-03-21 13:50:34.265 /home/
WAP 32483923 2013-03-21 23:58:41.123 /m/home/
WAP 32483923 2013-03-21 23:59:16.123 /m/selectcat/fang/
WAP 32483923 2013-03-21 23:59:45.123 /m/fang/33.html
WAP 32483923 2013-03-22 00:00:23.984 /m/fang/54.html
WAP 32483923 2013-03-22 00:00:54.043 /m/selectcat/er/
WAP 32483923 2013-03-22 00:01:16.576 /m/er/49.html
…… …… …… ……
现需将数据整理成如下数据结构:
PLATFORM USER_ID SEQ FROM_URL TO_URL
WEB 12332321 1 NULL /home/
WEB 12332321 2 /home/ /selectcat/er/
WEB 12332321 3 /selectcat/er/ /er/viewad/12.html
WEB 12332321 4 /er/viewad/12.html /er/viewad/13.html
WEB 12332321 5 /er/viewad/13.html /er/viewad/24.html
WEB 12332321 6 /er/viewad/24.html /selectcat/che/
WEB 12332321 7 /selectcat/che/ /che/viewad/93.html
WEB 12332321 8 /che/viewad/93.html /che/viewad/10.html
WEB 12332321 9 /che/viewad/10.html /home/
WAP 32483923 1 NULL /m/home/
WAP 32483923 2 /m/home/ /m/selectcat/fang/
WAP 32483923 3 /m/selectcat/fang/ /m/fang/33.html
WAP 32483923 4 /m/fang/33.html /m/fang/54.html
WAP 32483923 5 /m/fang/54.html /m/selectcat/er/
WAP 32483923 6 /m/selectcat/er/ /m/er/49.html
…… …… …… …… ……
说明:PLATFORM和USER_ID还是代表平台和用户ID;SEQ字段代表用户按时间排序后的访问顺序,FROM_URL和TO_URL分别代表用户从哪一页跳转到哪一页。对于某个平台上某个用户的第一条访问记录,其FROM_URL是NULL(空值)。
请写出实现语句
select table01.platform,table01.userid,
row_number() over(distribute by platform,userid sort by clicktime) as seq,
lag(clickurl,1) over(distribute by platform,userid sort by clicktime) as fromurl,
table01.clickurl as tourl
from table01;
5、
user_id month amt
1,20170101,100
3,20170101,20
4,20170101,30
1,20170102,200
2,20170102,240
3,20170102,30
4,20170102,2
1,20170101,180
2,20170101,250
3,20170101,30
4,20170101,260
......
......
5,20170101,100
5,20170201,210
5,20170301,320
5,20170401,500
[Hive SQL]统计amt连续3个月,环比增长>50%的user
CREATE TABLE IF NOT EXISTS bg1.amt(
user_id int,
dt string,
amt int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
;
LOAD DATA LOCAL INPATH '/root/amt' OVERWRITE INTO TABLE BG1.amt;
SELECT * FROM bg1.amt;
set hive.exec.mode.local.auto=true;
with tmp as(
SELECT
user_id,
from_unixtime(unix_timestamp(dt,'yyyyMMdd'),'yyyy-MM-01') mon,
sum(amt) amt
FROM bg1.amt
GROUP BY user_id,from_unixtime(unix_timestamp(dt,'yyyyMMdd'),'yyyy-MM-01')
),tmp1 as (
SELECT
*,
lag(mon,1,mon) over(distribute by user_id sort by mon asc) lagmon,
lag(amt,1,amt) over(distribute by user_id sort by mon asc) lagamt
FROM tmp
),tmp2 as (
SELECT
*,
months_between(mon,lagmon) mc,
(amt - lagamt) / lagamt * 100 ac
FROM tmp1
),tmp3 as (
SELECT
*,
case when mc = 1 and ac > 50 then 1 else 0 end tr
FROM tmp2
),tmp4 as (
SELECT
*,
sum(if(tr=0,1,0)) over(distribute by user_id sort by mon asc) gr
FROM tmp3
)
SELECT
user_id
FROM tmp4
GROUP BY
user_id,gr
HAVING COUNT(1) >= 3
;
hive经典查询,top-N统计,英雄出场率,判断月份连续
最新推荐文章于 2024-05-08 10:43:08 发布