hive的练习题
1:编写sql实现每个用户截止到每月为止的最大单月访问次数和累计到该月的总访问次数
数据: userid,month,visits
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5
A,2015-03,16
A,2015-03,22
B,2015-03,23
B,2015-03,10
B,2015-03,1
create table visits(
userid string,
month string,
visits int
)
row format delimited fields terminated by ','
;
load data local inpath '/hivedata/visits.txt' overwrite into table visits;
-- Q1: per user, the max single-month visits so far and the running total
-- of visits up to each month. The inner query first collapses duplicate
-- (userid, month) rows into one monthly sum.
select
userid,
month,
visits,
-- with SORT BY inside the window, the default frame is
-- "unbounded preceding .. current row", i.e. "up to this month"
max(visits) over(distribute by userid sort by month) maxvisit,
sum(visits) over(distribute by userid sort by month) totalvisit
from (
select
userid,
month,
sum(visits) visits
from visits
group by userid,month
) t1 ;
2: 求出每个栏目的被观看次数及累计观看时长
数据: vedio表
Uid channel min
1 1 23
2 1 12
3 1 12
4 1 32
5 1 342
6 2 13
7 2 34
8 2 13
9 2 134
参考答案: create table video(
Uid int,
channel string,
min int )
row format delimited fields terminated by ' '
;
load data local inpath '/home/hivedata/video.txt' into table video;
select
channel,
count(Uid) num, --也可以写成count(1)
sum(min) total
from
video
group by channel ;
3: 编写连续7天登录的总人数
数据: t1表
Uid dt login_status(1登录成功,0异常)
1 2019-07-11 1
1 2019-07-12 1
1 2019-07-13 1
1 2019-07-14 1
1 2019-07-15 1
1 2019-07-16 1
1 2019-07-17 1
1 2019-07-18 1
2 2019-07-11 1
2 2019-07-12 1
2 2019-07-13 0
2 2019-07-14 1
2 2019-07-15 1
2 2019-07-16 0
2 2019-07-17 1
2 2019-07-18 0
3 2019-07-11 1
3 2019-07-12 1
3 2019-07-13 1
3 2019-07-14 1
3 2019-07-15 1
3 2019-07-16 1
3 2019-07-17 1
3 2019-07-18 1
-- Q3: table of daily login results (login_status 1 = success, 0 = failure).
create table login1(
uid int,
dt string,
login_status int
) row format delimited fields terminated by ' '
;
-- BUG FIX: the original loaded into table `login`, which is never created;
-- the table defined above is `login1`.
load data local inpath '/hivedata/login.txt' into table login1;
-- Q3 solution 1: classic "date minus row_number" trick — within a run of
-- consecutive login dates, date_sub(dt, rn) is constant, so grouping by
-- that anchor date groups each consecutive streak.
select
uid,
dt
from
(
select
t1.uid uid,
date_sub(t1.dt,t1.rm) dt
from (
select
uid,
dt,
row_number() over(distribute by uid sort by dt) rm
from
login1
where login_status=1
) t1
)t2
group by uid,dt
-- BUG FIX: "7 consecutive days" means a streak of AT LEAST 7; `= 7`
-- missed users with 8+ consecutive days (uids 1 and 3 in the sample data).
having count(uid) >= 7 ;
-- Q3 solution 2: same date-minus-row_number trick in a single subquery.
select
tmp.uid uid,
tmp.dt
from(
select
uid,
date_sub(dt,row_number() over(distribute by uid sort by dt)) as dt,
login_status
from login1
where login_status=1
) tmp
group by uid,dt
-- BUG FIX: `> 7` required an 8-day streak; the requirement is 7 or more.
having count(uid) >= 7
;
4: 编写sql语句实现每班前三名,分数一样并列,同时求出前三名 按名次排序的一次的分差:
数据: stu表
Stu_no class score
1 1901 90
2 1901 90
3 1901 83
4 1901 60
5 1902 66
6 1902 23
7 1902 99
8 1902 67
9 1902 87
-- Q4: top 3 per class (row_number gives distinct ranks even on tied
-- scores) plus the score gap to the previous rank.
-- NOTE(review): with score sorted DESC, score - lag(score) is <= 0 for
-- ranks 2 and 3, and nvl(...,0) makes rank 1's "gap" equal to its own
-- score — confirm this matches the intended output format.
select
tmp.class,
tmp.stu_no,
tmp.score,
tmp.score-nvl(LAG(tmp.score) over(distribute by tmp.class sort by rn),0),
tmp.rn
from(
select
class,
stu_no,
score,
row_number() over(distribute by class sort by score desc) rn
from student
) tmp
where tmp.rn<4
;
-- Q4: per-student score table.
create table student(
stu_no int,
class string,
score int
)
row format delimited fields terminated by '\t'
;
-- BUG FIX: the original loaded into table `stu`, but the table created
-- above is `student`.
load data local inpath '/home/hivedata/stu.txt' into table student;
select
class,
stu_no,
score,
dr,
score-nvl(lag(score) over(distribute by class sort by dr),0)
from (
select
class,
stu_no,
score,
row_number() over(distribute by class sort by score desc) dr
from student) t1
where t1.dr<4 ;
5:每个店铺的当月销售额和累计到当月的总销售额
数据:
店铺,月份,金额
a,01,150
a,01,200
b,01,1000
b,01,800
c,01,250
c,01,220
b,01,6000
a,02,2000
a,02,3000
b,02,1000
create table shop1(
shopid string,
month string,
money int )
row format delimited fields terminated by ',' ;
load data local inpath '/home/hivedata/shop1.txt' overwrite into table shop1;
-- Q5: each shop's monthly sales and the cumulative total up to that month.
-- BUG FIX: the original used m + lag(m, 1, 0), which only adds the
-- PREVIOUS month instead of accumulating all months so far; a windowed
-- sum() with ORDER BY gives the true running total. The redundant outer
-- GROUP BY is dropped as well.
select
shopid,
month,
tmp.m,
sum(tmp.m) over(partition by shopid order by month) totalsale
from(
select
shopid,
month,
sum(money) m
from
shop1
group by shopid,month
) tmp;
-- Q5 solution 2: windowed sum() over the monthly aggregate.
select
t1.shopid,
t1.month,
t1.m,
sum(t1.m) over(partition by shopid order by month) totalsale
from
(select
shopid,
month,
sum(money) m
from
shop1
group by shopid,month)t1
-- BUG FIX: the statement was missing its terminating semicolon, so the
-- following section's text would have been parsed as part of this query.
;
6:订单及订单类型行列转换
t1表:
order_id order_type order_time
111 N 10:00
111 A 10:05
111 B 10:10
是用hql获取结果如下:
order_id order_type_1 order_type_2 order_time_1 order_time_2
111 N A 10:00 10:05
111 A B 10:05 10:10
-- Q6: order events, one row per (order, type, time).
create table t11
(order_id int,
order_type string,
order_time string)
row format delimited fields terminated by ' '
-- BUG FIX: the original CREATE and LOAD had no statement terminators,
-- so they would not parse as two statements.
;
load data local inpath '/home/hadoop/exam/t11' into table t11;
select
*
from
(select
order_id,
order_type,
lead(order_type,1) over(partition by order_id order by order_time) order_type1,
order_time,
lead(order_time,1) over(partition by order_id order by order_time) order_time1
from
t11)t1
where t1.order_type1 is not null
7://找出用户在表中的第一次行为
select
t1.id,
t1.time,
t1.action
from
(select
id,
time,
action,
first_value(time) over(partition by id order by time) first
from
test2)t1
where t1.time=t1.first;
8: 某APP每天访问数据存放在表access_log里面,包含日期字段 ds,用户类型字段user_type,用户账号user_id,用户访问时间 log_time,请使用hive的hql语句实现如下需求
(1)、每天整体的访问UV、PV?
(2)、每天每个类型的访问UV、PV?
(3)、每天每个类型中最早访问时间和最晚访问时间?
(4)、每天每个类型中访问次数最高的10个用户?
没有数据!
9:每个用户连续登陆的最大天数
login表
uid,udate
1,2019-08-01
1,2019-08-02
1,2019-08-03
2,2019-08-01
2,2019-08-02
3,2019-08-01
3,2019-08-03
4,2019-07-28
4,2019-07-29
4,2019-08-01
4,2019-08-02
4,2019-08-03
结果如下:
uid cnt_days
1 3
2 2
3 1
4 3
select
b.uid,
max(b.c)
from (
select
uid,
count(a.e) c
from (
select
uid,
udate d,
date_sub(udate,row_number() over(partition by uid order by udate)) as e
from login) a
group by a.uid,a.e) b
group by b.uid;
select
uid,
max(tmp1.total)
from(
select
tmp.uid,
count(tmp.dt) total
from(
select
uid,
date_sub(udate,row_number() over(partition by uid order by udate)) as dt
from login
) tmp
group by tmp.uid,tmp.dt
) tmp1
group by tmp1.uid
;
10:使用hive的hql实现男女各自第一名及其它
id sex chinese math
0 0 70 50
1 0 90 70
2 1 80 90
1、男女各自语文第一名(0:男,1:女)
2、男生成绩语文大于80,女生数学成绩大于70
create table cource
(id int,
sex int,
chinese int,
math int)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/cource.txt' into table cource;
1:
-- Q10-1: top Chinese score for each sex.
-- BUG FIX: first_value() is a window function, not an aggregate, so the
-- subquery emits one row per student; DISTINCT collapses the duplicates
-- down to one row per sex.
select
distinct
tmp.sex,
fir
from
(
select
sex,
first_value(chinese) over(partition by sex order by chinese desc) fir
from
cource) tmp
;
select
sex,
first_value(chinese) over(partition by sex order by chinese desc) fir
from
cource;
或者:
select
*
from
(select
id,
sex,
chinese,
math,
row_number() over(partition by sex order by chinese desc) rr
from
cource)t1
where t1.rr = 1
2:
-- Q10-2: male students with Chinese > 80 plus female students with
-- math > 70.
select
*
from
cource
where sex=0 and chinese>80
-- the two branches are disjoint (sex differs), so UNION ALL skips an
-- unnecessary dedup stage
union all
select
*
from
cource
-- BUG FIX: the original referenced a non-existent column `manth`;
-- the math-score column is `math`.
where sex=1 and math>70;
10:课程行转列
数据:
t1表
id course
1,a
1,b
1,c
1,e
2,a
2,c
2,d
2,f
3,a
3,b
3,c
3,e
根据编写sql,得到结果如下(表中的1表示选修,表中的0表示未选修):
id a b c d e f
1 1 1 1 0 1 0
2 1 0 1 1 0 1
3 1 1 1 0 1 0
select id,
sum(case course when 'a' then 1 else 0 end) a,
sum(case course when 'b' then 1 else 0 end) b,
sum(case course when 'c' then 1 else 0 end) c,
sum(case course when 'd' then 1 else 0 end) d,
sum(case course when 'e' then 1 else 0 end) e,
sum(case course when 'f' then 1 else 0 end) f
from t1 group by id ;
-- Subject pivot (rows -> columns) with a per-student total.
select
id,
-- BUG FIX: Hive quotes identifiers with backticks, not double quotes,
-- and a query mixing aggregates with a plain column needs GROUP BY.
sum(case when subject="语文" then score else 0 end) as `语文`,
sum(case when subject="数学" then score else 0 end) as `数学`,
sum(case when subject="英语" then score else 0 end) as `英语`,
sum(case when subject="政治" then score else 0 end) as `政治`,
sum(score) as total
from
score
group by id;
11:编写sql实现如下:
数据: t1表
uid tags
1 1,2,3
2 2,3
3 1,2
编写sql实现如下结果:
uid tag
1 1
1 2
1 3
2 2
2 3
3 1
3 2
create table t111
(uid int,
tags string)
row format delimited fields terminated by ' ';
load data local inpath '/home/hadoop/exam/t111' into table t111
select
uid,
tag
from t111
lateral view explode(split(tags,',')) tags as tag
12:用户标签连接查询
数据:
T1表:
Tags
1,2,3
1,2
2,3
T2表:
Id lab
1 A
2 B
3 C
根据T1和T2表的数据,编写sql实现如下结果:
ids tags
1,2,3 A,B,C
1,2 A,B
2,3 B,C
create table tab1(
tags array<string>
)
row format delimited fields terminated by ' '
collection items terminated by ','
;
create table tab2(
id int,
lab string
)
row format delimited fields terminated by ' ' ;
select tags,cj from tab1 lateral view explode(tags) tags as cj;
select
tmp01.tags,
collect_list(tmp01.lab)
from(
select
tmp.tags,
tab2.lab
from(
select tags,cj from tab1 lateral view explode(tags) tags as cj
) tmp inner join tab2
on tmp.cj=tab2.id) tmp01
group by tmp01.tags;
13:用户标签组合
数据:
t1表:
id tag flag
a b 2
a b 1
a b 3
c d 6
c d 8
c d 8
编写sql实现如下结果:
id tag flag
a b 1|2|3
c d 6|8
create table s1
(id string,
tag string,
flag string)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/s1' into table s1;
select
id,
tag,
concat_ws('|',collect_set(flag))
from
s1
group by id,tag;
-- Q13 solution 2: sort flags first so the concatenated list is ordered.
select
t1.id,
t1.tag,
-- BUG FIX: collect_list keeps duplicates (group c/d would yield 6|8|8);
-- collect_set dedupes to match the expected output 6|8.
concat_ws("|",collect_set(t1.flag))
from
(select
id id,
tag tag,
flag flag
from s1
order by flag)t1
group by t1.id,t1.tag;
14: 用户标签行列互换
用户标签行列互换
t1表
uid name tags
1 goudan chihuo,huaci
2 mazi sleep
3 laotie paly
编写sql实现如下结果:
uid name tag
1 goudan chihuo
1 goudan huaci
2 mazi sleep
3 laotie paly
create table k1
(
uid int,
name string,
tags array<string>
)
row format delimited fields terminated by ' '
collection items terminated by ',';
load data local inpath '/home/hivedata/k1' into table k1
select uid,name,cj from k1 lateral view explode(tags) tags as cj;
15:词频统计
词频统计
数据:
t1表:
uid contents
1 i|love|china
2 china|is|good|i|i|like
统计结果如下,如果出现次数一样,则按照content名称排序:
content cnt
i 3
china 2
good 1
like 1
love 1
is 1
create table l1
(uid int,
contents string)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/l1' into table l1;
-- BUG FIX: '\|' in a Hive string literal is just '|', and split() treats
-- a bare '|' as the regex alternation operator, splitting between every
-- character; '\\|' passes an escaped pipe to the regex engine.
select explode(split(contents,'\\|')) from l1;
select cj from l1 lateral view explode(split(contents,'\\|')) contents as cj ;
-- Q15: word frequency over pipe-separated contents.
select
t1.cj cj,
count(t1.cj) cnt
from (
select cj from l1 lateral view explode(split(contents,'\\|')) contents as cj
) t1
group by t1.cj
-- BUG FIX: the spec says ties on count are broken by content name,
-- which requires a secondary sort key.
order by cnt desc, cj;
select ct, count(1) cn from (select ct from l1 lateral view explode(split(contents,"\\|")) tmp as ct) tmp group by ct order by cn desc,ct ;
16: 兴趣行转列
t1表
name sex hobby
janson 男 打乒乓球、游泳、看电影
tom 男 打乒乓球、看电影
hobby最多3个值,使用hql实现结果如下:
name sex hobby1 hobby2 hobby3
janson 男 打乒乓球 游泳 看电影
tom 男 打乒乓球 看电影
create table ho(
name string,
sex string,
hobby array<string>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
;
load data local inpath '/home/hivedata/hobby.txt' overwrite into table ho;
select
name,
sex,
case when true then nvl(hobby[0],"") else null end hobby1,
case when true then nvl(hobby[1],"") else null end hobby2,
case when true then nvl(hobby[2],"") else null end hobby3
from ho
;
17:求top3英雄及其pick率----------------------
1 亚索,挖掘机,艾瑞,洛,卡沙
2 亚索,盖伦,奥巴马,牛头,皇子
3 亚索,盖伦,艾瑞,宝石,琴女
4 亚索,盖伦,赵信,老鼠,锤石
create table heros(
id int,
names array<string>
)
row format delimited fields terminated by '\t'
collection items terminated by ',';
load data local inpath '/home/hivedata/heros.txt' overwrite into table heros;
-- Q17: top-3 heroes by pick count and their pick rate.
-- BUG FIX: the original grouped by (cj, tal) and divided tal by
-- sum(tal) within that group, which is always 1. Pick rate is a hero's
-- pick count divided by the total number of matches (one row per match
-- in heros), obtained here via a single-row cross join.
select
h.cj,
count(1) tal,
round(count(1)/max(t.total),2) pickrate
from (
select id, cj
from heros
lateral view explode(names) names as cj
) h
cross join (select count(1) total from heros) t
group by h.cj
order by tal desc
limit 3;
select
name,
count(1) ann,
round(count(1)/totalcn,2)
from (
select
count(name) over() totalcn,
name
from
heros
lateral view explode(names) t as name
) t1
group by name,totalcn
order by ann desc
limit 3;
18:时间格式转换
数据: t1表
20190730 20190731
编写sql实现如下的结果:
2019-07-30 2019-07-31
create table dt(
dt string
);
load data local inpath '/home/hivedata/dt.txt' overwrite into table dt;
select from_unixtime(
unix_timestamp(dt,"yyyyMMdd"),"yyyy-MM-dd"
)
from
dt
;
19:两个表A 和B ,均有key 和value 两个字段,写一个SQL语 句,将B表中的value值置成A表中相同key值对应的value值
A:
key value
k1 123
k2 234
k3 235
B:
key value
k1 111
k2 222
k5 246
使用hive的hql实现,结果是B表数据如下:
k1 123
k2 234
k5 246
create table A
(key string,
value int)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/A' into table A;
create table B
(key string,
value int)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/B' into table B;
-- Q19: for every key in B, take A's value when the key exists in A;
-- keys present only in B keep their own value.
select
    t_b.key,
    if(t_a.key is null, t_b.value, t_a.value) value
from B t_b
left join A t_a
    on t_b.key = t_a.key;
2111:有用户表user(uid,name)以及黑名单表Banuser(uid)
1、用left join方式写sql查出所有不在黑名单的用户信息
2、用not exists方式写sql查出所有不在黑名单的用户信息
create table u(
id string,
name string )
row format delimited fields terminated by ',' ;
create table banuser(
id string
)
;
load data local inpath '/hivedata/banuser.txt' overwrite into table banuser; load data local inpath '/hivedata/u.txt' overwrite into table u;
1、
select
u.id,
u.name
from u
left join banuser b
on u.id=b.id
where
b.id is null ;
2、
select
u.id,
u.name
from
u
where
not exists (
select 1 from banuser b where u.id=b.id
)
;
20访问日志正则提取
表t1(注:数据时正常的访问日志数据,分隔符全是空格)
8.35.201.160 - - [16/May/2018:17:38:21 +0800] "GET /uc_server/data/avatar/000/01/54/22_avatar_middle.jpg HTTP/1.1" 200 5396
使用hive的hql实现结果如下:
ip dt url
8.35.201.160 2018-5-16 17:38:21 /uc_server/data/avatar/000/01/54/22_avatar_middle.jpg
-- Q20: raw access-log table, one whole log line per row.
create table login_log( log string )
-- BUG FIX: missing statement terminator between CREATE and LOAD.
;
load data local inpath '/hivedata/login_log.txt' overwrite into table login_log;
select regexp_extract(log,"([0-9.]+\\d+) - - \\[(.+ \\+\\d+)\\] .+(GET) (.+) (HTTP)\\S+ (\\d+) (\\d+)",1), from_unixtime(unix_timestamp(regexp_extract(log,"([0-9.]+\\d+) - - \\[(.+ \\+\\d+)\\] .+(GET) (.+) (HTTP)\\S+ (\\d+) (\\d+)",2),"dd/MMM/yyyy:HH:mm:ss Z"),"yyyy-MM-dd HH:mm:ss"), regexp_extract(log,"([0-9.]+\\d+) - - \\[(.+ \\+\\d+)\\] .+(GET) (.+) (HTTP)\\S+ (\\d+) (\\d+)",4) from login_log where log regexp "([0-9.]+\\d+) - - \\[(.+ \\+\\d+)\\] .+(GET) (.+) (HTTP)\\S+ (\\d+) (\\d+)" ;
21 使用hive的hql查询用户所在部门
dpt表
dpt_id dpt_name
1 产品
2 技术
user_dpt表
user_id dpt_id
1 1
2 1
3 2
4 2
5 3
result表
user_id dpt_id dpt_name
1 1 产品
2 1 产品
3 2 技术
4 2 技术
5 3 其他部门
create table dpt(
dpt_id int,
dpt_name string
)
row format delimited fields terminated by '\t' ;
load data local inpath '/hivedata/dpt.txt' overwrite into table dpt;
create table user_dpt(
user_id int,
dpt_id int )
row format delimited fields terminated by '\t' ;
load data local inpath '/hivedata/user_dpt.txt' overwrite into table user_dpt;
select
t1.user_id,
t1.dpt_id,
nvl(t2.dpt_name,"其他部门")
from
user_dpt t1
left join dpt t2
on t1.dpt_id=t2.dpt_id ;
22 查出每个学期每门课程最高分记录
course_score表:
id,name,course,score,term
1,zhangsan,数学,80,2015
2,lisi,语文,90,2016
3,lisi,数学,70,2016
4,wangwu,化学,80,2017
5,zhangsan,语文,85,2015
6,zhangsan,化学,80,2015
7,wangwuww,化学,90,2017
8,zhangsanwww,语文,85,2015
编写sql完成如下查询,最好一次查询实现,也可以写多次查询实现:
1、查出每个学期每门课程高分记录(包含全部5个字段)
2、查出单个学期中语文课在90分以上的学生的数学成绩记录(包含全部5个字段)
-- Q22: per-term course scores.
create table test4
(id int,
name string,
course string,
score int,
term int)
row format delimited fields terminated by ','
-- BUG FIX: the CREATE statement was missing its terminating semicolon,
-- so the LOAD below would fail to parse.
;
load data local inpath '/home/hivedata/test4' into table test4;
select
t1.*
from
test4 t1
inner join
(select
course,
max(score) score,
term
from
test4
group by course,term)t2
on t2.score = t1.score and t2.term = t1.term and t1.course = t2.course;
select
test4.*
from test4
inner join
(select
id,name,course,score,term
from test4
where score>=90 and course='语文')t
on
test4.term = t.term
where test4.course='数学'
----------------------------and t.id is not null
23:设计数据库表,用来存放学生基本信息,课程信息,学生的课 程及成绩,并给出sql语句,查询平均成绩大于85的所有学生
stu_1
id,name,age,addr
1,zs1,22,bj
2,zs2,22,bj
3,zs3,22,bj
4,zs4,22,bj
5,zs5,22,bj
course_1
cid,cname
1,语文
2,数学
3,政治
4,美术
5,历史
course_sc
id,cid,score
1,1,87
1,2,92
1,3,69
2,2,83
2,3,92
2,4,87
2,5,83
create table stu_1(
id string,
name string,
age int,
addr string )
row format delimited fields terminated by ',' ;
create table course_1(
cid string,
cname string
)
row format delimited fields terminated by ',' ;
create table course_sc(
id string,
cid string,
score int )
row format delimited fields terminated by ',' ;
load data local inpath '/hivedata/course_1.txt' overwrite into table course_1; load data local inpath '/hivedata/stu_1.txt' overwrite into table stu_1; load data local inpath '/hivedata/course_sc.txt' overwrite into table course_sc;
-- Q23: students whose average score across all courses exceeds 85.
select
    cs.id,
    avg(score) avgscore
from course_sc cs
group by cs.id
having avg(score) > 85;
24:每个渠道的下单用户数、订单总金额
请使用hive hql查询出2019-08-06号 每个渠道的下单用户数、订单总金额。 hql语句实现,结果表头如下: channel user_num order_amount
create table order_34(
order_id bigint,
user_id int,
amount double,
channel string,
time string comment 'yyyy-MM-dd HH:mm:ss'
)
partitioned by(dt string comment 'yyyy-MM-dd')
row format delimited fields terminated by ',' ;
1,100,19,a,2019-08-06 19:00:00
2,101,19,b,2019-08-06 19:00:01
3,100,19,a,2019-08-05 19:00:00
4,101,19,b,2019-08-05 19:00:01
5,102,19,a,2019-08-06 19:00:00
6,102,19,a,2019-08-06 19:00:01
load data local inpath '/hivedata/order_34.txt' into table order_34 partition(dt='2019-08-06');
hive hql查询出2019-08-06号 每个渠道的下单用户数、订单总金额。
select
channel,
count(distinct user_id),
sum(amount)
from order_34
where dt='2019-08-06' and to_date(time)='2019-08-06'
group by
channel,to_date(time) ;
25:登录且阅读的用户数,已经阅读书籍数量及其它
有如下三张表:
表A(登录表):
ds user_id
2019-08-06 1
2019-08-06 2
2019-08-06 3
2019-08-06 4
表B(阅读表):
ds user_id read_num
2019-08-06 1 2
2019-08-06 2 3
2019-08-06 3 6
表C(付费表):
ds user_id price
2019-08-06 1 55.6
2019-08-06 2 55.8
基于上述三张表,请使用hive的hql语句实现如下需求:
(1)、用户登录并且当天有阅读的用户数,以及阅读书籍数量
(2)、用户登录并且阅读,但是没有付费的用户数
(3)、用户登录并且付费,付费用户书籍和金额
create table if not exists lg(
ds string,
user_id string
)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/lg.txt' into table lg;
create table if not exists read(
ds string,
user_id string,
read_num int)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/read.txt' into table read;
create table if not exists pay(
ds string,
user_id string,
price double)
row format delimited fields terminated by ' ';
load data local inpath '/home/hivedata/pay.txt' into table pay;
(1)、用户登录并且当天有阅读的用户数,以及阅读书籍数量
select
count(1),
sum(read_num)
from read r
join lg l
on l.user_id = r.user_id and l.ds = r.ds
group by r.ds;
(2)、用户登录并且阅读,但是没有付费的用户数
select
r.ds ds,
count(1)
from read r
left semi join lg l
on l.user_id = r.user_id and l.ds = r.ds
left join pay p
on r.ds=p.ds and r.user_id=p.user_id
where p.user_id is null
group by r.ds;
(3)、用户登录并且付费,付费用户id和金额
select
p.user_id,
p.price
from pay p
left join lg
on p.user_id=lg.user_id and p.ds=lg.ds
where lg.user_id is not null;
26:高消费者报表
有三个表,分别是:
区域(district) 区域中有两个字段分别是区域Id(disid)和区域名称(disname)
城市(city) 城市有两个字段分别是城市ID(cityid)和区域ID(disid)
订单(order) 订单有四个字段分别是订单ID(orderid)、用户ID(userid)、城市ID(cityid)和消费金 额(amount)。
district表:
disid disname
1 华中
2 西南
city表:
cityid disid
1 1
2 1
3 2
4 2
5 2
order表:
oid userid cityid amount
1 1 1 1223.9
2 1 1 9999.9
3 2 2 2322
4 2 2 8909
5 2 3 6789
6 2 3 798
7 3 4 56786
8 4 5 78890
高消费者是消费金额大于1W的用户,使用hive hql生成如下报表: 区域名 高消费者人数 消费总额
create table district(
disid string,
disname string )
row format delimited fields terminated by '\t' ;
create table city(
cityid string,
disid string )
row format delimited fields terminated by '\t' ;
create table order_29(
oid string,
userid string,
cityid string,
amount double )
row format delimited fields terminated by '\t' ;
load data local inpath '/hivedata/district.txt' into table district;
load data local inpath '/hivedata/city.txt' into table city;
load data local inpath '/hivedata/order_29.txt' into table order_29;
select
b.disid,
c.disname,
a.userid,
sum(a.amount)
from order_29 a join city b
on a.cityid=b.cityid
join district c
on b.disid = c.disid
group by b.disid,c.disname,a.userid;
27:请使用sql计算pv.uv
数据:t1表
uid dt url
1 2019-08-06 http://www.baidu.com
2 2019-08-06 http://www.baidu.com
3 2019-08-06 http://www.baidu.com
3 2019-08-06 http://www.soho.com
3 2019-08-06 http://www.meituan.com
3 2019-08-06
结果如下:
date uv pv
2019-08-06 3 5
create table uv_pv(
uid string,
dt string,
url string
)
row format delimited fields terminated by ',';
load data local inpath '/hivedata/uv_pv.txt' overwrite into table uv_pv;
select
dt,
count(distinct uid),
count(url)
from uv_pv
group by dt
;
28:使用hive的hql实现买过商品3的用户及其昨日消费
orderid userid productid price timestamp date
123,00101,3,200,1535945356,2019-08-28
124,00100,1,200,1535945356,2019-08-28
125,00101,3,200,1535945356,2019-08-29
126,00101,2,200,1535945356,2019-08-29
127,00102,5,200,1535945356,2019-08-29
128,00103,3,200,1535945356,2019-08-29
129,00103,3,200,1535945356,2019-08-29
create table buy_log(
orderid int,
userid string,
productid int,
price double,
tm string,
dt string
)
row format delimited fields terminated by ',';
load data local inpath '/home/hivedata/buy_log.txt' overwrite into table buy_log;
select
userid userid,
sum(case when dt=date_sub(current_date,1) then price else 0 end) pay
from
(select
userid,
price,
dt
from buy_log
where productid=3 and (dt=current_date or dt=date_sub(current_date,1))) t1
group by userid,dt
having pay>0
;
29:统计为用户配重了角色的用户角色数量
用户表和角色表,统计为用户配重了角色的用户角色数量(hql)
用户表t1:
uid uname role
1 zs 1,1,2,2,2
2 ls 1,2
3 ww 2,3,3
角色表t2:
roleid rolename
1,唤醒师
2,召唤师
3,魔法师
结果如下:
uid uname roleid cnt
1 zs 1 召唤师 2
1 zs 2 唤醒师 3
3 ww 3 魔法师 2
-- Q29: users with their role-id arrays.
-- BUG FIX: a stray dangling `select` keyword before this CREATE was
-- removed; it would have made the CREATE statement unparseable.
create table user_role(
uid int,
uname string,
role array<int>
)
row format delimited fields terminated by '\t'
collection items terminated by ',';
load data local inpath '/home/hivedata/user_role.txt' overwrite into table user_role;
create table role(
roleid int,
rolename string)
row format delimited fields terminated by ',';
load data local inpath '/home/hivedata/role.txt' overwrite into table role;
select
tmp.uid,
tmp.uname,
tmp.roleid,
role.rolename,
count(tmp.roleid) --count(1)也可以
from role join
(
select
uid,
uname,
roleid
from user_role
lateral view explode(role) t as roleid) tmp
on tmp.roleid=role.roleid
group by tmp.uid,tmp.uname,tmp.roleid,role.rolename;
[外链图片转存失败(img-KXeS0TZH-1569338883147)(E:\千峰培训\hadoop第四周笔记\1568824364983.png)]
30: 统计两个人的通话总时长
有如下通话记录:
呼叫 被叫 时长
goudan haoge 01:01:01
goudan mazi 00:11:21
goudan laowang 00:19:01
goudan Jingba 00:21:01
goudan weige 01:31:17
haoge mazi 00:51:01
haoge mazi 01:11:19
haoge laowang 00:00:21
haoge laowang 00:23:01
laowang mazi 01:18:01
laowang weige 00:18:00
Jingba haoge 00:01:01
Jingba haoge 00:00:06
Jingba haoge 00:02:04
Jingba haoge 00:02:54
haoge laowang 01:00:13
haoge laowang 00:01:01
haoge goudan 00:01:01
使用hql统计两个人的通话总时长(用户之间互相通话的时长)?
create table callme(
huname string,
huiname string,
times string
)
row format delimited fields terminated by ' '
;
load data local inpath '/home/hivedata/callme.txt' into table callme;
-- Q30: total talk time between each ordered pair (caller, callee),
-- adding the reverse direction's total when one exists.
-- NOTE(review): summing unix_timestamp("HH:mm:ss") values and
-- re-formatting with from_unixtime only works while each total stays
-- under 24 hours, and unix_timestamp of a time-only string is
-- timezone-dependent — confirm the session timezone and data ranges
-- before relying on this.
select
tmp1.huname,
tmp1.huiname,
-- forward total + reverse total, minus one day's worth of epoch offset
-- introduced by adding two absolute timestamps together
from_unixtime( (unix_timestamp(tmp1.sumt,"HH:mm:ss") + unix_timestamp(nvl(tmp2.sumt,'00:00:00'), "HH:mm:ss") - unix_timestamp("24:00:00","HH:mm:ss") ),"HH:mm:ss")
from(
-- per (caller, callee) total duration, formatted back to HH:mm:ss
select
huname,
huiname,
from_unixtime(sum(unix_timestamp(times,"HH:mm:ss")),"HH:mm:ss" ) sumt
from
callme
group by huname, huiname
) tmp1
left join
(
-- the same totals, joined in the reverse direction (callee -> caller)
select
huiname,
huname,
from_unixtime(sum(unix_timestamp(times,"HH:mm:ss")),"HH:mm:ss" ) sumt
from
callme
group by huname, huiname
) tmp2
on tmp1.huiname = tmp2.huname and tmp2.huiname = tmp1.huname
;
31:统计三个表中互不重复的数
ABC三个hive表 每个表中都只有一列int类型且列名相同,求三个表中互不重复的数
a.txt b.txt c.txt
1 2 1
2 3 2
3 11 3
4 12 11
5 14 5
6 15 6
7 16 7
8 18 8
9 35 20
6 30
7 40
8
-- Q31: three one-column tables of ints.
create table a1(
id int
);
create table b1(
id int
);
create table c1(
id int
);
-- BUG FIX: the original issued a single LOAD (into c1) BEFORE table c1
-- existed, and never loaded a1 or b1; create all three tables first,
-- then load each file into its own table.
load data local inpath '/home/hivedata/a1.txt' into table a1;
load data local inpath '/home/hivedata/b1.txt' into table b1;
load data local inpath '/home/hivedata/c1.txt' into table c1;
-- Q31: values that appear in exactly one of the three tables.
-- BUG FIX: if a value occurred twice WITHIN a single table it would be
-- counted twice and wrongly excluded even though it appears in only one
-- table; dedupe each branch before the union.
select
tmp.id
from
(select distinct
id
from
a1
union all
select distinct
id
from
b1
union all
select distinct
id
from
c1) tmp
group by tmp.id
having count(tmp.id)=1
;
32
laowang mazi 01:18:01
laowang weige 00:18:00
Jingba haoge 00:01:01
Jingba haoge 00:00:06
Jingba haoge 00:02:04
Jingba haoge 00:02:54
haoge laowang 01:00:13
haoge laowang 00:01:01
haoge goudan 00:01:01
使用hql统计两个人的通话总时长(用户之间互相通话的时长)?
create table callme(
huname string,
huiname string,
times string
)
row format delimited fields terminated by ' '
;
load data local inpath '/home/hivedata/callme.txt' into table callme;
select
tmp1.huname,
tmp1.huiname,
from_unixtime( (unix_timestamp(tmp1.sumt,"HH:mm:ss") + unix_timestamp(nvl(tmp2.sumt,'00:00:00'), "HH:mm:ss") - unix_timestamp("24:00:00","HH:mm:ss") ),"HH:mm:ss")
from(
select
huname,
huiname,
from_unixtime(sum(unix_timestamp(times,"HH:mm:ss")),"HH:mm:ss" ) sumt
from
callme
group by huname, huiname
) tmp1
left join
(
select
huiname,
huname,
from_unixtime(sum(unix_timestamp(times,"HH:mm:ss")),"HH:mm:ss" ) sumt
from
callme
group by huname, huiname
) tmp2
on tmp1.huiname = tmp2.huname and tmp2.huiname = tmp1.huname
;