Hive实战之谷粒影音

需求描述

统计硅谷影音视频网站的常规指标,各种TopN指标:

  • 统计视频观看数Top10
  • 统计视频类别热度Top10
  • 统计视频观看数Top20所属类别
  • 统计视频观看数Top50所关联视频的所属类别Rank
  • 统计每个类别中的视频热度Top10
  • 统计每个类别中视频流量Top10
  • 统计上传视频最多的用户Top10以及他们上传的视频
  • 统计每个类别视频观看数Top10

数据清洗

将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割。

核心代码

package com.atguigu.etl;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.junit.Test;
import org.junit.internal.runners.statements.Fail;

import java.io.IOException;


public class ETLMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
    /* 空格去掉,分隔符改成‘&’ */
    private Counter pass;
    private Counter fail;

    private StringBuilder sb = new StringBuilder();
    private Text result=new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        pass = context.getCounter("ETL","Pass");
        fail = context.getCounter("ETL", "Fail");
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //一行数据

        String line = value.toString();

        //数据切分
        String[] fields = line.split("\t");

        //判断字段个数
        if (fields.length>=9){
            //去掉第四个字段的空格
            fields[3] = fields[3].replace(" ","");
            sb.setLength(0);

            //拼接字段成一行
            for (int i = 0; i < fields.length; i++) {
                    if (i == fields.length-1){
                        sb.append(fields[i]);
                    }else if (i <= 8){
                        //如果拼的是前九个字段
                        sb.append(fields[i]).append("\t");
                    }else {
                        sb.append(fields[i]).append("&");
                    }


            }
            result.set(sb.toString());

            context.write(result, NullWritable.get());

            pass.increment(1);
        }else {
            fail.increment(1);//不要了
        }

    }
}

建立video表和user表

create external table video_ori(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
location '/guivideo/video/video_etl';
--user表
create external table user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited fields terminated by "\t" 
location '/guivideo/user';

简单查询一下:

  • video表
    在这里插入图片描述
  • user表
    在这里插入图片描述
    以上为外部表,需要再建立内部表
create table video_orc(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
--user_orc表
create table user_orc(
    uploader string,
    videos int,
    friends int)
stored as orc
tblproperties("orc.compress"="SNAPPY");

插入数据

--从外部表中插入数据
insert into table video_orc select * from video_ori;
insert into table user_orc select * from user_ori;

需求实现

1.统计视频观看数top10

SELECT
    videoid,
    views
FROM
    video_orc
ORDER BY
    views DESC
LIMIT 10;
  • 结果
    在这里插入图片描述

2. 统计视频类别热度Top10

思路:
(1)即统计每个类别有多少个视频,显示出包含视频最多的前10个类别。
(2)我们需要按照类别group by聚合,然后count组内的videoId个数即可。
(3)因为当前表结构为:一个视频对应一个或多个类别。所以如果要group by类别,需要先将类别进行列转行(展开),然后再进行count即可。
(4)最后按照热度排序,显示前10条。

SELECT
    cate,
    COUNT(videoid) n
FROM
    (
    SELECT
        videoid,
        cate
    FROM
        video_orc LATERAL VIEW explode(category) tbl as cate) t1
GROUP BY
    cate
ORDER BY
    n desc
limit 10;

3.统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数

思路:
(1)先找到观看数最高的20个视频所属条目的所有信息,降序排列
(2)把这20条信息中的category分裂出来(列转行)
(3)最后查询视频分类名称和该分类下有多少个Top20的视频

SELECT
    cate,
    COUNT(videoid) n
FROM
    (
    SELECT
        videoid,
        cate
    FROM
        (
        SELECT
            videoid,
            views,
            category
        FROM
            video_orc
        ORDER BY
            views DESC
        LIMIT 20 ) t1 LATERAL VIEW explode(category) tbl as cate ) t2
GROUP BY
    cate
ORDER BY
    n DESC;

4.统计视频观看数Top50所关联视频的所属类别排序

  1. 统计观看数前50的视频的关联视频
SELECT
    videoid,
    views,
    relatedid
FROM
    video_orc
ORDER BY
    views DESC
LIMIT 50;
  1. 炸开关联视频
SELECT
    explode(relatedid) videoid
FROM
    t1;
  1. 和原表Join获取关联视频的类别
SELECT
    DISTINCT t2.videoid,
    v.category
FROM
    t2
JOIN video_orc v on
    t2.videoid = v.videoid;
  1. 炸开类别
SELECT
    explode(category) cate
FROM
    t3;

最终代码:

SELECT
    DISTINCT t4.cate,
    t5.n
FROM
    (
    SELECT
        explode(category) cate
    FROM
        (
        SELECT
            DISTINCT t2.videoid,
            v.category
        FROM
            (
            SELECT
                explode(relatedid) videoid
            FROM
                (
                SELECT
                    videoid,
                    views,
                    relatedid
                FROM
                    video_orc
                ORDER BY
                    views DESC
                LIMIT 50 ) t1 ) t2
        JOIN video_orc v on
            t2.videoid = v.videoid ) t3 ) t4
JOIN (
    SELECT
        cate,
        COUNT(videoid) n
    FROM
        (
        SELECT
            videoid,
            cate
        FROM
            video_orc LATERAL VIEW explode(category) tbl as cate) g1
    GROUP BY
        cate ) t5 ON
    t4.cate = t5.cate
ORDER BY
    t5.n DESC;

5. 统计每个类别中的视频热度Top10,以Music为例

  1. 把视频表的类别炸开,生成中间表格video_category
CREATE
    TABLE
        video_category STORED AS orc TBLPROPERTIES("orc.compress"="SNAPPY") AS SELECT
            videoid,
            uploader,
            age,
            cate,
            length,
            views,
            rate,
            ratings,
            comments,
            relatedid
        FROM
            video_orc LATERAL VIEW explode(category) tbl as cate;
  1. 从video_category直接查询Music类的前10视频
SELECT
    videoid,
    views
FROM
    video_category
WHERE
    cate ="Music"
ORDER BY
    views DESC
LIMIT 10;

结果
在这里插入图片描述

6. 统计每个类别中视频流量Top10,以Music为例

  1. 从video_category直接查询Music类的流量前10视频
SELECT
    videoid,
    ratings
FROM
    video_category
WHERE
    cate ="Music"
ORDER BY
    ratings DESC
LIMIT 10;

7.统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频

理解一:前十用户每人前20

  1. 统计视频上传最多的用户Top10
SELECT
    uploader,
    videos
FROM
    user_orc
ORDER BY
    videos DESC
LIMIT 10;
  1. 和video_orc联立,找出这些用户上传的视频,并按照热度排名
SELECT
    t1.uploader,
    v.videoid,
    RANK() OVER(PARTITION BY t1.uploader ORDER BY v.views DESC) hot
FROM
    t1
LEFT JOIN video_orc v ON
    t1.uploader = v.uploader;
  1. 求每个人前20
SELECT
    t2.uploader,
    t2.videoid,
    t2.hot
FROM
    t2
WHERE
    hot <= 20;

最终代码:

SELECT
    t2.uploader,
    t2.videoid,
    t2.hot
FROM(
SELECT
    t1.uploader,
    v.videoid,
    RANK() OVER(PARTITION BY t1.uploader ORDER BY v.views DESC) hot
FROM(
SELECT
    uploader,
    videos
FROM
    user_orc
ORDER BY
    videos DESC
LIMIT 10)
    t1
LEFT JOIN video_orc v ON
    t1.uploader = v.uploader)
    t2
WHERE
    hot <= 20;

理解二:前十用户总榜前20

  1. 统计视频上传最多的用户Top10
SELECT
    uploader,
    videos
FROM
    user_orc
ORDER BY
    videos DESC
LIMIT 10;
  1. 观看数前20的视频
SELECT
    videoid,
    uploader,
    views
FROM
    video_orc
ORDER BY
    views DESC
LIMIT 20;
  1. 联立两表,看看有没有他们上传的
SELECT
    t1.uploader,
    t2.videoid
FROM
    t1
LEFT JOIN t2 ON
    t1.uploader = t2.uploader;

最终代码:

SELECT
    t1.uploader,
    t2.videoid
FROM(
SELECT
    uploader,
    videos
FROM
    user_orc
ORDER BY
    videos DESC
LIMIT 10)
    t1
LEFT JOIN (
SELECT
    videoid,
    uploader,
    views
FROM
    video_orc
ORDER BY
    views DESC
LIMIT 20
)t2 ON
    t1.uploader = t2.uploader;

结果:一个都没有…
在这里插入图片描述

8. 统计每个类别视频观看数Top10

  1. 从video_category表查出每个类别视频观看数排名
SELECT
    cate,
    videoid,
    views,
    RANK() OVER(PARTITION BY cate ORDER BY views DESC) hot
FROM
    video_category;
  1. 取每个类别的Top10
SELECT
    cate,
    videoid,
    views
FROM
    t1
WHERE
    hot <= 10;

最终代码

SELECT
    cate,
    videoid,
    views
FROM(
SELECT
    cate,
    videoid,
    views,
    RANK() OVER(PARTITION BY cate ORDER BY views DESC) hot
FROM
    video_category)
    t1
WHERE
    hot <= 10;

运行结果:
在这里插入图片描述

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值