create table youtube_ori(
videoid string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedid array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
--SDNkMu8Z w00dy911 630 People&Blogs 186 10181 3.49 494 257 rjnbgpPJUks qlC39j5JImU
create table youtube_orc(
videoid string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedid array<string>)
clustered by (uploader) into 8 buckets
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as orc;
create table youtube_user_ori(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as textfile;
create table youtube_user_orc(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as orc;
yarn jar ~/0707youtube-0.0.1-SNAPSHOT.jar \
> com.atguigu.youtube_syllabus.etl.VideoETLRunner \
> /youtube/video/2008/0222 \
> /youtube/output/video/2008/0222
load data inpath "/youtube/output/video/2008/0222" into table youtube_ori;
load data inpath "/youtube/user/2008/0903" into table youtube_user_ori;
insert into table youtube_orc select * from youtube_ori;
insert into table youtube_user_orc select * from youtube_user_ori;
====================================================
统计视频观看数 top10
bin/hive -e "select videoid,
views
from yt0706.youtube_orc
order by views desc limit 3;";
select videoid,
views
from youtube_orc
order by views desc limit 10;
====================================================
统计视频类别热度 top10
1) 即统计每个类别有多少个视频,显示出包含视频最多的前10 个类别。
select category_name,
count(t1.videoid) as hot
from
(select videoid,
category_name
from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 5;
select category_name as category,
count(t1.videoid) as hot
from
(select videoid,
category_name
from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 10;
category hot
Comedy 46
Entertainment 41
Music 23
Film 21
Animation 21
Blogs 20
====================================================
^^^^统计视频类别总播放数 top5
select category_name,
sum(t1.views) as hot
from
(select views,
category_name
from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 5;
category_name hot
Comedy 128836256
Entertainment 91643078
Music 67456436
Blogs 29017066
People 29017066
====================================================
统计出视频观看数最高的 20 个视频 的所属类别 以及类别包含这 top20 视频的个数
select category_name as category,
count(t2.videoid) as hot_with_views
from
(select videoid,
category_name
from
(select videoid,
category,
views
from youtube_orc
order by views desc limit 20) t1
lateral view explode(category) t_catetory as category_name) t2
group by category_name
order by hot_with_views desc;
(select videoid,
category,
views
from youtube_orc
order by views desc limit 20) t1
videoid category views
dmh0bheirng ["comedy"] 42513417
0xxi-hvprra ["comedy"] 20282464
1dmvu08zvpa ["entertainment"] 16087899
rb-wugnygv0 ["entertainment"] 15712924
qja5fazf1a8 ["music"] 15256922
(select videoid,
category_name
from t1 lateral view explode(category) t_catetory as category_name) t2
videoid category_name
dmh0bheirng comedy
0xxi-hvprra comedy
1dmvu08zvpa entertainment
rb-wugnygv0 entertainment
qja5fazf1a8 music
select category_name as category,
count(t2.videoid) as hot_with_views
from
t2
group by category_name
order by hot_with_views desc;
category hot_with_views
entertainment 6
comedy 6
music 5
people 2
blogs 2
una 1
====================================================
统计视频观看数 top50 所关联视频的 所属类别的热度排名
t1
select
relatedid
from
youtube_orc
order by
views desc
limit 50;
根据 views top50的 1列数据, 共50行
t2:
select
videoid
from
t1 lateral view explode(relatedid) explode_relatedid as videoid
videoid
yuo6yjlvxe8
vqpnwbo-r4e
目的:得到关联视频id的所有的数据信息,必须join操作
t4:
select
distinct videoid,
category
from
youtube_orc t3 join t2 on t3.videoid=t2.videoid;
videoid category(没有炸开)
7wt5FiZQrgM ["Gadgets","Games"]
AbndgwfG22k ["Music"]
Ddn4MGaS3N4 ["Music"]
Gm6XszMrw9Y ["Film","Animation"]
MNxwAU_xAMk ["Comedy"]
N0TR0Irx4Y0 ["Comedy"]
t5
select
videoid,
category_name
from
t4 lateral view explode(category) explode_category as category_name;
videoid category_name(已炸开)
7wt5FiZQrgM Gadgets
7wt5FiZQrgM Games
AbndgwfG22k Music
Ddn4MGaS3N4 Music
Gm6XszMrw9Y Film
Gm6XszMrw9Y Animation
MNxwAU_xAMk Comedy
N0TR0Irx4Y0 Comedy
O9mEKMz2Pvo Music
OMndH4egfSk Music
OUi9-jqq_i0 Music
RB-wUgnyGv0 Entertainment
RUCZJVJ_M8o Entertainment
S2n1_h3Bvt0 Entertainment
_BuRwH59oAo Comedy
ixsZy2425eY Entertainment
o4x-VW_rCSE Entertainment
sdUUx5FdySs Film
sdUUx5FdySs Animation
u2pW7PSyZhw Sports
result:
select
count(videoid) as hot,
category_name
from
t5
group by
category_name
order by
hot desc;
select category_name as category,
count(t5.videoid) as hot
from
(select videoid,
category_name
from
(select distinct(t2.videoid),
t3.category
from
(select explode(relatedid) as videoid
from
(select *
from youtube_orc
order by views desc limit 50) t1) t2
inner join youtube_orc t3 on t2.videoid = t3.videoid) t4 lateral view explode(category) t_catetory as category_name) t5
group by category_name
order by hot desc;
select
category_name as category,
count(videoId) as hot
from
(select
videoId,
category_name
from
(select
distinct videoId,
category
from
(select
video2Id
from
(select
*
from
youtube_orc
order by
views desc
limit 50) t1
lateral view explode(relatedId) explode_relatedId as video2Id) t2
join youtube_orc t3 on t3.videoId=t2.video2Id) t4
lateral view explode(category) explode_category as category_name) t5
group by
category_name
order by
hot desc;
category hot
Music 5
Entertainment 5
Comedy 3
Film 2
Animation 2
Sports 1
Games 1
Gadgets 1
====================================================
统计每个类别中的视频热度 Top10,以 Music 为例
create table youtube_category(videoid string, uploader string, age int, categoryid string, length int, views int, rate float, ratings int, comments int, relatedid array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as orc;
insert into table youtube_category
select videoid,
uploader,
age,
categoryid,
length,
views,
rate,
ratings,
comments,
relatedid
from youtube_orc lateral view explode(category) catetory as categoryid;
select videoid,
views
from youtube_category
where categoryid = "Music"
order by views desc limit 10;
====================================================
统计每个类别中视频流量 Top10,以 Music 为例
select videoid,
ratings
from youtube_category
where categoryid = "Music"
order by ratings desc limit 10;
====================================================
统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 的视频
select *
from youtube_user_orc
order by videos desc limit 10;
uploader videos friends
expertvillage 86228 5659
TourFactory 49078 0
myHotelVideo 33506 102
AlexanderRodchenko 24315 22
VHTStudios 20230 3
ephemeral8 19498 11
HSN 15371 309
rattanakorn 12637 677
Ruchaneewan 10059 774
futifu 9668 118
select
*
from
(select
*
from
youtube_user_orc
order by videos desc
limit 10) t1
join youtube_orc t2
on t1.uploader = t2.uploader
distribute by
t1.uploader
sort by
views desc
limit 20 ;
select
t2.videoid,
t2.views,
t2.ratings,
t1.videos,
t1.friends
from
(select
*
from
youtube_user_orc
order by videos desc
limit 10) t1
join youtube_orc t2
on t1.uploader = t2.uploader
order by views desc
limit 20 ;
====================================================
统计每个类别视频观看数 Top10
select
t1.*
from
(select
videoId,
categoryId,
views,
row_number () over (partition by categoryId order by views desc) rank
from
youtube_category) t1
where rank <= 10 ;
t1.videoid t1.categoryid t1.views t1.rank
LHyJH1yGKZY Animals 1085020 1
Qz_nZixWX6Q Animals 1027067 2
yg2enZsknZM Animals 2429 3
sdUUx5FdySs Animation 5840839 1
6B26asyGKDo Animation 5147533 2
JzqumbhfxRo Animation 3230774 3
ElrldD02if0 Animation 2337238 4
Gm6XszMrw9Y Animation 1227020 5
hBRjiLPNJAA Animation 927340 6
zRVts7TFw-Y Animation 607456 7
Jy09vLHkIes Animation 544237 8
bqZauhidT1w Animation 374550 9
A0eZ9qkUvJU Animation 351288 10
pdiuDXwgrjQ Autos 1013697 1
zgpbblz9wHw Autos 11322 2
-_CSo1gOd48 Blogs 13199833 1
D2kJZOfq7zk Blogs 11184051 2
PL4Uzun4CKA Blogs 1235410 3
videoid string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedid array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
--SDNkMu8Z w00dy911 630 People&Blogs 186 10181 3.49 494 257 rjnbgpPJUks qlC39j5JImU
create table youtube_orc(
videoid string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedid array<string>)
clustered by (uploader) into 8 buckets
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as orc;
create table youtube_user_ori(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as textfile;
create table youtube_user_orc(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as orc;
yarn jar ~/0707youtube-0.0.1-SNAPSHOT.jar \
> com.atguigu.youtube_syllabus.etl.VideoETLRunner \
> /youtube/video/2008/0222 \
> /youtube/output/video/2008/0222
load data inpath "/youtube/output/video/2008/0222" into table youtube_ori;
load data inpath "/youtube/user/2008/0903" into table youtube_user_ori;
insert into table youtube_orc select * from youtube_ori;
insert into table youtube_user_orc select * from youtube_user_ori;
====================================================
统计视频观看数 top10
bin/hive -e "select videoid,
views
from yt0706.youtube_orc
order by views desc limit 3;";
select videoid,
views
from youtube_orc
order by views desc limit 10;
====================================================
统计视频类别热度 top10
1) 即统计每个类别有多少个视频,显示出包含视频最多的前10 个类别。
select category_name,
count(t1.videoid) as hot
from
(select videoid,
category_name
from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 5;
select category_name as category,
count(t1.videoid) as hot
from
(select videoid,
category_name
from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 10;
category hot
Comedy 46
Entertainment 41
Music 23
Film 21
Animation 21
Blogs 20
====================================================
^^^^统计视频类别总播放数 top5
select category_name,
sum(t1.views) as hot
from
(select views,
category_name
from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 5;
category_name hot
Comedy 128836256
Entertainment 91643078
Music 67456436
Blogs 29017066
People 29017066
====================================================
统计出视频观看数最高的 20 个视频 的所属类别 以及类别包含这 top20 视频的个数
select category_name as category,
count(t2.videoid) as hot_with_views
from
(select videoid,
category_name
from
(select videoid,
category,
views
from youtube_orc
order by views desc limit 20) t1
lateral view explode(category) t_catetory as category_name) t2
group by category_name
order by hot_with_views desc;
(select videoid,
category,
views
from youtube_orc
order by views desc limit 20) t1
videoid category views
dmh0bheirng ["comedy"] 42513417
0xxi-hvprra ["comedy"] 20282464
1dmvu08zvpa ["entertainment"] 16087899
rb-wugnygv0 ["entertainment"] 15712924
qja5fazf1a8 ["music"] 15256922
(select videoid,
category_name
from t1 lateral view explode(category) t_catetory as category_name) t2
videoid category_name
dmh0bheirng comedy
0xxi-hvprra comedy
1dmvu08zvpa entertainment
rb-wugnygv0 entertainment
qja5fazf1a8 music
select category_name as category,
count(t2.videoid) as hot_with_views
from
t2
group by category_name
order by hot_with_views desc;
category hot_with_views
entertainment 6
comedy 6
music 5
people 2
blogs 2
una 1
====================================================
统计视频观看数 top50 所关联视频的 所属类别的热度排名
t1
select
relatedid
from
youtube_orc
order by
views desc
limit 50;
根据 views top50的 1列数据, 共50行
t2:
select
videoid
from
t1 lateral view explode(relatedid) explode_relatedid as videoid
videoid
yuo6yjlvxe8
vqpnwbo-r4e
目的:得到关联视频id的所有的数据信息,必须join操作
t4:
select
distinct videoid,
category
from
youtube_orc t3 join t2 on t3.videoid=t2.videoid;
videoid category(没有炸开)
7wt5FiZQrgM ["Gadgets","Games"]
AbndgwfG22k ["Music"]
Ddn4MGaS3N4 ["Music"]
Gm6XszMrw9Y ["Film","Animation"]
MNxwAU_xAMk ["Comedy"]
N0TR0Irx4Y0 ["Comedy"]
t5
select
videoid,
category_name
from
t4 lateral view explode(category) explode_category as category_name;
videoid category_name(已炸开)
7wt5FiZQrgM Gadgets
7wt5FiZQrgM Games
AbndgwfG22k Music
Ddn4MGaS3N4 Music
Gm6XszMrw9Y Film
Gm6XszMrw9Y Animation
MNxwAU_xAMk Comedy
N0TR0Irx4Y0 Comedy
O9mEKMz2Pvo Music
OMndH4egfSk Music
OUi9-jqq_i0 Music
RB-wUgnyGv0 Entertainment
RUCZJVJ_M8o Entertainment
S2n1_h3Bvt0 Entertainment
_BuRwH59oAo Comedy
ixsZy2425eY Entertainment
o4x-VW_rCSE Entertainment
sdUUx5FdySs Film
sdUUx5FdySs Animation
u2pW7PSyZhw Sports
result:
select
count(videoid) as hot,
category_name
from
t5
group by
category_name
order by
hot desc;
select category_name as category,
count(t5.videoid) as hot
from
(select videoid,
category_name
from
(select distinct(t2.videoid),
t3.category
from
(select explode(relatedid) as videoid
from
(select *
from youtube_orc
order by views desc limit 50) t1) t2
inner join youtube_orc t3 on t2.videoid = t3.videoid) t4 lateral view explode(category) t_catetory as category_name) t5
group by category_name
order by hot desc;
select
category_name as category,
count(videoId) as hot
from
(select
videoId,
category_name
from
(select
distinct videoId,
category
from
(select
video2Id
from
(select
*
from
youtube_orc
order by
views desc
limit 50) t1
lateral view explode(relatedId) explode_relatedId as video2Id) t2
join youtube_orc t3 on t3.videoId=t2.video2Id) t4
lateral view explode(category) explode_category as category_name) t5
group by
category_name
order by
hot desc;
category hot
Music 5
Entertainment 5
Comedy 3
Film 2
Animation 2
Sports 1
Games 1
Gadgets 1
====================================================
统计每个类别中的视频热度 Top10,以 Music 为例
create table youtube_category(videoid string, uploader string, age int, categoryid string, length int, views int, rate float, ratings int, comments int, relatedid array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as orc;
insert into table youtube_category
select videoid,
uploader,
age,
categoryid,
length,
views,
rate,
ratings,
comments,
relatedid
from youtube_orc lateral view explode(category) catetory as categoryid;
select videoid,
views
from youtube_category
where categoryid = "Music"
order by views desc limit 10;
====================================================
统计每个类别中视频流量 Top10,以 Music 为例
select videoid,
ratings
from youtube_category
where categoryid = "Music"
order by ratings desc limit 10;
====================================================
统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 的视频
select *
from youtube_user_orc
order by videos desc limit 10;
uploader videos friends
expertvillage 86228 5659
TourFactory 49078 0
myHotelVideo 33506 102
AlexanderRodchenko 24315 22
VHTStudios 20230 3
ephemeral8 19498 11
HSN 15371 309
rattanakorn 12637 677
Ruchaneewan 10059 774
futifu 9668 118
select
*
from
(select
*
from
youtube_user_orc
order by videos desc
limit 10) t1
join youtube_orc t2
on t1.uploader = t2.uploader
distribute by
t1.uploader
sort by
views desc
limit 20 ;
select
t2.videoid,
t2.views,
t2.ratings,
t1.videos,
t1.friends
from
(select
*
from
youtube_user_orc
order by videos desc
limit 10) t1
join youtube_orc t2
on t1.uploader = t2.uploader
order by views desc
limit 20 ;
====================================================
统计每个类别视频观看数 Top10
select
t1.*
from
(select
videoId,
categoryId,
views,
row_number () over (partition by categoryId order by views desc) rank
from
youtube_category) t1
where rank <= 10 ;
t1.videoid t1.categoryid t1.views t1.rank
LHyJH1yGKZY Animals 1085020 1
Qz_nZixWX6Q Animals 1027067 2
yg2enZsknZM Animals 2429 3
sdUUx5FdySs Animation 5840839 1
6B26asyGKDo Animation 5147533 2
JzqumbhfxRo Animation 3230774 3
ElrldD02if0 Animation 2337238 4
Gm6XszMrw9Y Animation 1227020 5
hBRjiLPNJAA Animation 927340 6
zRVts7TFw-Y Animation 607456 7
Jy09vLHkIes Animation 544237 8
bqZauhidT1w Animation 374550 9
A0eZ9qkUvJU Animation 351288 10
pdiuDXwgrjQ Autos 1013697 1
zgpbblz9wHw Autos 11322 2
-_CSo1gOd48 Blogs 13199833 1
D2kJZOfq7zk Blogs 11184051 2
PL4Uzun4CKA Blogs 1235410 3