yt0706_08sql

create table youtube_ori(
videoid string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedid array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;


--SDNkMu8Z w00dy911 630 People&Blogs 186 10181 3.49  494 257 rjnbgpPJUks qlC39j5JImU


create table youtube_orc(
     videoid string,
     uploader string,
     age int,
     category array<string>,
     length int,
     views int,
     rate float,
     ratings int,
     comments int,
     relatedid array<string>)
clustered by (uploader) into 8 buckets
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as orc;


create table youtube_user_ori(
     uploader string,
     videos int,
     friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as textfile;


create table youtube_user_orc(
     uploader string,
     videos int,
     friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as orc;


yarn jar ~/0707youtube-0.0.1-SNAPSHOT.jar \
> com.atguigu.youtube_syllabus.etl.VideoETLRunner \
> /youtube/video/2008/0222 \
> /youtube/output/video/2008/0222


load data inpath "/youtube/output/video/2008/0222" into table youtube_ori;


load data inpath "/youtube/user/2008/0903" into table youtube_user_ori;


insert into table youtube_orc select * from youtube_ori;


insert into table youtube_user_orc select * from youtube_user_ori;


====================================================


统计视频观看数 top10
bin/hive -e "select videoid,
       views
from yt0706.youtube_orc
order by views desc limit 3;";


select videoid,
       views
from youtube_orc
order by views desc limit 10;


====================================================


统计视频类别热度 top10


1)  即统计每个类别有多少个视频,显示出包含视频最多的前10 个类别。


select category_name,
       count(t1.videoid) as hot
from
  (select videoid,
          category_name
   from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 5;




select category_name as category,
       count(t1.videoid) as hot
from
  (select videoid,
          category_name
   from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 10;
category        hot
Comedy  46
Entertainment   41
Music    23
Film    21
Animation       21
Blogs  20


====================================================


^^^^统计视频类别总播放数 top5


select category_name,
       sum(t1.views) as hot
from
  (select views,
          category_name
   from youtube_orc lateral view explode(category) t_catetory as category_name) t1
group by t1.category_name
order by hot desc limit 5;
category_name   hot
Comedy  128836256
Entertainment   91643078
Music    67456436
Blogs    29017066
People  29017066
====================================================


统计出视频观看数最高的 20 个视频 的所属类别 以及类别包含这 top20 视频的个数


select category_name as category,
       count(t2.videoid) as hot_with_views
from
  (select videoid,
          category_name
   from
     (select videoid,
             category,
             views
      from youtube_orc
      order by views desc limit 20) t1
   lateral view explode(category) t_catetory as category_name) t2
group by category_name
order by hot_with_views desc;




(select videoid,
      category,
      views
from youtube_orc
order by views desc limit 20) t1


videoid category        views
dmh0bheirng     ["comedy"]      42513417
0xxi-hvprra     ["comedy"]      20282464
1dmvu08zvpa     ["entertainment"]       16087899
rb-wugnygv0     ["entertainment"]       15712924
qja5fazf1a8     ["music"]        15256922






(select videoid,
      category_name
from t1 lateral view explode(category) t_catetory as category_name) t2


videoid category_name
dmh0bheirng     comedy
0xxi-hvprra     comedy
1dmvu08zvpa     entertainment
rb-wugnygv0     entertainment
qja5fazf1a8     music




select category_name as category,
       count(t2.videoid) as hot_with_views
from
   t2
group by category_name
order by hot_with_views desc;


category        hot_with_views
entertainment   6
comedy  6
music    5
people  2
blogs    2
una      1




====================================================


统计视频观看数 top50   所关联视频的  所属类别的热度排名


t1
select
   relatedid
from
   youtube_orc
order by
   views desc
limit 50;
根据 views top50的 1列数据, 共50行




t2:
select
   videoid
from
   t1 lateral view explode(relatedid) explode_relatedid as videoid


videoid
yuo6yjlvxe8
vqpnwbo-r4e




目的:得到关联视频id的所有的数据信息,必须join操作
t4:
select
   distinct videoid,
   category
from
   youtube_orc t3 join t2 on t3.videoid=t2.videoid;


videoid category(没有炸开)
7wt5FiZQrgM     ["Gadgets","Games"]
AbndgwfG22k     ["Music"]
Ddn4MGaS3N4     ["Music"]
Gm6XszMrw9Y     ["Film","Animation"]
MNxwAU_xAMk     ["Comedy"]
N0TR0Irx4Y0     ["Comedy"]


t5
select
   videoid,
   category_name
from
   t4 lateral view explode(category) explode_category as category_name;


videoid category_name(已炸开)
7wt5FiZQrgM     Gadgets
7wt5FiZQrgM     Games
AbndgwfG22k     Music
Ddn4MGaS3N4     Music
Gm6XszMrw9Y     Film
Gm6XszMrw9Y     Animation
MNxwAU_xAMk     Comedy
N0TR0Irx4Y0     Comedy
O9mEKMz2Pvo     Music
OMndH4egfSk     Music
OUi9-jqq_i0     Music
RB-wUgnyGv0     Entertainment
RUCZJVJ_M8o     Entertainment
S2n1_h3Bvt0     Entertainment
_BuRwH59oAo     Comedy
ixsZy2425eY     Entertainment
o4x-VW_rCSE     Entertainment
sdUUx5FdySs     Film
sdUUx5FdySs     Animation
u2pW7PSyZhw     Sports




result:
select
   count(videoid) as hot,
   category_name
from
   t5
group by
   category_name
order by
   hot desc;




select category_name as category,
       count(t5.videoid) as hot
from
  (select videoid,
          category_name
   from
     (select distinct(t2.videoid),
             t3.category
      from
        (select explode(relatedid) as videoid
         from
           (select *
            from youtube_orc
            order by views desc limit 50) t1) t2
      inner join youtube_orc t3 on t2.videoid = t3.videoid) t4 lateral view explode(category) t_catetory as category_name) t5
group by category_name
order by hot desc;


select
   category_name as category,
   count(videoId) as hot
from
    (select
       videoId,
       category_name
    from
       (select
           distinct videoId,
           category
        from
            (select
               video2Id
            from
                (select
                   *
                from
                   youtube_orc
                order by
                   views desc
                limit 50) t1
             lateral view explode(relatedId) explode_relatedId as video2Id) t2
        join youtube_orc t3 on t3.videoId=t2.video2Id) t4
    lateral view explode(category) explode_category as category_name) t5
group by
   category_name
order by
   hot desc;


category        hot
Music   5
Entertainment   5
Comedy  3
Film    2
Animation       2
Sports  1
Games   1
Gadgets 1






====================================================


统计每个类别中的视频热度 Top10,以 Music 为例


create table youtube_category(videoid string, uploader string, age int, categoryid string, length int, views int, rate float, ratings int, comments int, relatedid array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as orc;


insert into table youtube_category
select videoid,
       uploader,
       age,
       categoryid,
       length,
       views,
       rate,
       ratings,
       comments,
       relatedid
from youtube_orc lateral view explode(category) catetory as categoryid;




select videoid,
       views
from youtube_category
where categoryid = "Music"
order by views desc limit 10;




====================================================


统计每个类别中视频流量 Top10,以 Music 为例


select videoid,
       ratings
from youtube_category
where categoryid = "Music"
order by ratings desc limit 10;




====================================================


统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 的视频


select *
from youtube_user_orc
order by videos desc limit 10;


uploader       videos friends
expertvillage   86228   5659
TourFactory     49078   0
myHotelVideo    33506   102
AlexanderRodchenko      24315   22
VHTStudios      20230   3
ephemeral8      19498   11
HSN     15371   309
rattanakorn     12637   677
Ruchaneewan     10059   774
futifu  9668    118


select
  *
from
  (select
    *
  from
    youtube_user_orc
  order by videos desc
  limit 10) t1
  join youtube_orc t2
    on t1.uploader = t2.uploader
distribute by
t1.uploader
sort by
views desc
limit 20 ;


select
  t2.videoid,
  t2.views,
  t2.ratings,
  t1.videos,
  t1.friends
from
  (select
    *
  from
    youtube_user_orc
  order by videos desc
  limit 10) t1
  join youtube_orc t2
    on t1.uploader = t2.uploader
order by views desc
limit 20 ;




====================================================


统计每个类别视频观看数 Top10


select
  t1.*
from
  (select
    videoId,
    categoryId,
    views,
    row_number () over (partition by categoryId order by views desc) rank
  from
    youtube_category) t1
where rank <= 10 ;


t1.videoid      t1.categoryid   t1.views        t1.rank
LHyJH1yGKZY     Animals 1085020 1
Qz_nZixWX6Q     Animals 1027067 2
yg2enZsknZM     Animals 2429    3
sdUUx5FdySs     Animation       5840839 1
6B26asyGKDo     Animation       5147533 2
JzqumbhfxRo     Animation       3230774 3
ElrldD02if0     Animation       2337238 4
Gm6XszMrw9Y     Animation       1227020 5
hBRjiLPNJAA     Animation       927340  6
zRVts7TFw-Y     Animation       607456  7
Jy09vLHkIes     Animation       544237  8
bqZauhidT1w     Animation       374550  9
A0eZ9qkUvJU     Animation       351288  10
pdiuDXwgrjQ     Autos   1013697 1
zgpbblz9wHw     Autos   11322   2
-_CSo1gOd48     Blogs   13199833        1
D2kJZOfq7zk     Blogs   11184051        2
PL4Uzun4CKA     Blogs   1235410 3
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值