13. Hive in Action: the Gulivideo (谷粒影音) Project


1. Project Data Structure
1.1 Video table

| Field | Meaning | Details |
| --- | --- | --- |
| video_id | unique video ID | an 11-character string |
| uploader | video uploader | username of the user who uploaded the video (string) |
| age | video age | integer number of days the video has been on the platform |
| category | video category | the category assigned to the video when it was uploaded |
| length | video length | video length as an integer |
| views | view count | number of times the video has been viewed |
| rate | video rating | rating out of 5 |
| ratings | traffic | video traffic, an integer |
| comments | comment count | integer number of comments on the video |
| related_ids | related video IDs | IDs of related videos, at most 20 |

1.2 User table

| Field | Meaning | Type |
| --- | --- | --- |
| uploader | uploader username | string |
| videos | number of uploaded videos | int |
| friends | number of friends | int |
2. Data Cleaning
  1. Looking at the raw data, a video can belong to several categories separated by "&" with a space on either side, and it can likewise have several related videos, separated by "\t".
  2. To make these multi-valued fields easier to handle during analysis, we first clean and restructure the data.
    • That is: keep "&" as the category separator but strip the surrounding spaces, and join the multiple related video IDs with "&" as well.

A sample record (fields as described in 1.1):

| video_id | uploader | age | category | length | views | rate | ratings | comments | related_ids |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SDNkMu8ZT68 | w00dy911 | 630 | People & Blogs | 186 | 10181 | 3.49 | 494 | 257 | rjnbgpPJUksr jnbgpPJUks |

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu</groupId>
    <artifactId>guli-video</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>
    </dependencies>

</project>
2.1 ETLMapper
/**
 * @Date 2020/8/7 13:37
 * @Version 10.21
 * @Author DuanChaojie
 */
public class ETLMapper extends Mapper<LongWritable, Text, NullWritable,Text> {

    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read the raw line
        String oriStr = value.toString();

        // 2. Run the line through the ETL filter
        String etlStr = ETLUtil.etlStr(oriStr);

        // 3. Drop records that failed the filter, otherwise write out the cleaned line
        if(etlStr == null){
            return;
        }

        v.set(etlStr);
        context.write(NullWritable.get(),v);
    }
}
2.2 ETLUtil
public class ETLUtil {
    public static String etlStr(String oriStr){
        StringBuffer sb = new StringBuffer();
        // 1. Filter out records with fewer than 9 fields, e.g.
        // SDNkMu8ZT68	w00dy911	630	People & Blogs	186	10181	3.49	494	257	rjnbgpPJUksr	jnbgpPJUks
        int len = 9;
        String[] fields = oriStr.split("\t");
        if(fields.length < len){
            return null;
        }
        // 2. Remove the spaces inside the category field ("People & Blogs" -> "People&Blogs")
        fields[3] = fields[3].replaceAll(" ","");

        // 3. Re-join: the first 9 fields separated by "\t", the related video IDs separated by "&"
        for (int i = 0; i < fields.length; i++) {
            if (i < len){
                sb.append(fields[i]).append("\t");
            }else{
                if (i == fields.length-1){
                    sb.append(fields[i]);
                }else{
                    sb.append(fields[i]).append("&");
                }
            }
        }

        return sb.toString();
    }
}
2.3 ETLDriver
public class ETLDriver implements Tool {

    Configuration conf = new Configuration();

    public int run(String[] args) throws Exception {
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the jar by the driver class
        job.setJarByClass(ETLDriver.class);

        // 3. Set the Mapper class
        job.setMapperClass(ETLMapper.class);

        // 4. Set the final output KV types
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // 5. Set the input and output paths
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        // 6. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);

        return result ? 0:1;
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    public Configuration getConf() {
        return conf;
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            int run = ToolRunner.run(conf, new ETLDriver(), args);
            System.out.println("run = " + run);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Package the project into a jar: guli-video-1.0-SNAPSHOT.jar.


hadoop fs -mkdir /gulivideo

hadoop fs -put user/ /gulivideo
hadoop fs -put video/ /gulivideo

# run the data-cleaning job
yarn jar /opt/module/data/gulivideo/jars/guli-video-1.0-SNAPSHOT.jar com.atguigu.mr.ETLDriver /gulivideo/video/2008/0222 /gulivideo/output

The cleaned data now sits under /gulivideo/output.


3. Project Setup
3.1 Create the tables

Create the tables gulivideo_ori and gulivideo_user_ori, stored as textfile:

create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;

Create the tables gulivideo_orc and gulivideo_user_orc, stored as ORC:

create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;

create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited fields terminated by "\t"
stored as orc;
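
To double-check that the ORC storage format actually took effect, an optional sanity check (nothing later depends on it) is to inspect the table metadata:

desc formatted gulivideo_orc;
-- the InputFormat/OutputFormat lines should mention OrcInputFormat / OrcOutputFormat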
3.2 Load the ETL output
-- load data into gulivideo_ori
load data inpath "/gulivideo/output/part-r-00000" into table gulivideo_ori;

-- load data into gulivideo_user_ori
load data inpath "/gulivideo/user/2008/0903" into table gulivideo_user_ori;

Insert the data into the ORC tables (the cleaned text files cannot simply be loaded into the ORC tables, since load data only moves files without converting their format, so we go through insert ... select):

insert into table gulivideo_orc select * from gulivideo_ori;

insert into table gulivideo_user_orc select * from gulivideo_user_ori;
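
After the two inserts it is worth a quick check that both ORC tables actually received rows; a minimal sanity check over the same tables:

select count(*) from gulivideo_orc;
select count(*) from gulivideo_user_orc;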
4. Business Analysis

Compute the common metrics and various Top-N metrics for the Gulivideo site:

– Top 10 videos by view count

– Top 10 video categories by popularity

– Categories of the Top 20 most-viewed videos, and how many of the Top 20 videos each category contains

– Category ranking of the videos related to the Top 50 most-viewed videos

– Top 10 users by number of uploaded videos, and the Top 20 most-viewed videos among their uploads

– Top 10 hottest videos in each category

– Top 10 videos by traffic in each category

– Top 10 videos by view count in each category

4.1 Top 10 videos by view count
select
    videoId,
    views,
    uploader
from
    gulivideo_orc
order by
    views desc
limit
    10;
    
-- Result:
videoid 		views   		uploader
dMH0bHeiRNg     42513417        judsonlaipply
0XxI-hvPRRA     20282464        smosh
1dmVU08zVpA     16087899        NBC
RB-wUgnyGv0     15712924        ChrisInScotland
QjA5faZF1A8     15256922        guitar90
-_CSo1gOd48     13199833        tasha
49IDp76kjPw     11970018        TexMachina
tYnn51C3X_w     11823701        CowSayingMoo
pv5zWaTEVkI     11672017        OkGo
D2kJZOfq7zk     11184051        mrWoot
4.2 Top 10 video categories by popularity
-- limit is used here only to make the intermediate result easier to inspect
-- use the UDTF explode() to flatten the category column
select 
    videoId,
    category_name
from
    gulivideo_orc
lateral view explode(category) tmp_tab as category_name
limit 10;   -- this query is referred to as t1 below

-- Result:
videoid category_name
o4x-VW_rCSE     Entertainment
P1OXAQHv09E     Comedy
N0TR0Irx4Y0     Comedy
seGhTWE98DU     Music
bNF_P281Uu4     Travel
bNF_P281Uu4     Places
CQO3K8BcyGM     Comedy
3gg5LOd_Zus     Entertainment
sdUUx5FdySs     Film
sdUUx5FdySs     Animation

-- group by category_name, count the videos in each category, then order by that count descending and take the top 10
select
    category_name,
    count(category_name) category_count
from (
select 
    videoId,
    category_name
from
    gulivideo_orc
lateral view explode(category) tmp_tab as category_name
)t1
group by
    category_name
order by
    category_count desc
limit 10;

-- Result (pairs such as Film/Animation, Gadgets/Games and People/Blogs share a count because
-- the ETL step splits an original category like "Film & Animation" into two categories):
category_name   category_count
Music   179049
Entertainment   127674
Comedy  87818
Animation       73293
Film    73293
Sports  67329
Gadgets 59817
Games   59817
Blogs   48890
People  48890
4.3 Categories of the Top 20 most-viewed videos and how many of the Top 20 each category contains
-- Top 20 videos by view count
select
    videoId,
    views,
    category
from
    gulivideo_orc
order by 
    views desc
limit 20;   -- referred to as t1 below

-- Result:
videoid views   category
dMH0bHeiRNg     42513417        ["Comedy"]
0XxI-hvPRRA     20282464        ["Comedy"]
1dmVU08zVpA     16087899        ["Entertainment"]
RB-wUgnyGv0     15712924        ["Entertainment"]
QjA5faZF1A8     15256922        ["Music"]
-_CSo1gOd48     13199833        ["People","Blogs"]
-- ...

-- explode the category column of t1

select
    videoId,
    category_name
from
    (select
    videoId,
    views,
    category
from
    gulivideo_orc
order by 
    views desc
limit 20)t1
lateral view explode(category) tmp_tab as category_name;   -- referred to as t2 below

-- Result:
videoid category_name
dMH0bHeiRNg     Comedy
0XxI-hvPRRA     Comedy
1dmVU08zVpA     Entertainment
-- ...

-- group t2 by category_name and count
select
    category_name,
    count(*) category_count
from
    (select
    videoId,
    category_name
from
    (select
    videoId,
    views,
    category
from
    gulivideo_orc
order by 
    views desc
limit 20)t1
lateral view explode(category) tmp_tab as category_name)t2
group by
    category_name
order by
    category_count desc;

-- Result:
category_name   category_count
Entertainment   6
Comedy  6
Music   5
People  2
Blogs   2
UNA     1
4.4 Category ranking of the videos related to the Top 50 most-viewed videos
select
    category
from
    (select
    related_id
from 
    (select
    relatedId,
    views
from 
    gulivideo_orc
order by
    views desc
limit 50)t1
lateral view explode(relatedId) tmp_tab as related_id 

group by 
    related_id)t2
join
    gulivideo_orc orc
on
    t2.related_id = orc.videoId;   -- referred to as t3 below
-- Result:
["Music"]
["Comedy"]
["Entertainment"]
["People","Blogs"]
["Comedy"]
["Comedy"]
["Comedy"]
["Entertainment"]
["Comedy"]

-- explode the category column of t3
select
    explode(category) category_name
from
    (select
    category
from
    (select
    related_id
from 
    (select
    relatedId,
    views
from 
    gulivideo_orc
order by
    views desc
limit 50)t1
lateral view explode(relatedId) tmp_tab as related_id 

group by 
    related_id)t2
join
    gulivideo_orc orc
on
    t2.related_id = orc.videoId)t3;   -- referred to as t4 below
-- Result:
Music
Music
UNA
Entertainment
Music
Music


-- group by category and count
select
    category_name,
    count(category_name) category_count
from
    (select
    explode(category) category_name
from
    (select
    category
from
    (select
    related_id
from 
    (select
    relatedId,
    views
from 
    gulivideo_orc
order by
    views desc
limit 50)t1
lateral view explode(relatedId) tmp_tab as related_id 

group by 
    related_id)t2
join
    gulivideo_orc orc
on
    t2.related_id = orc.videoId)t3)t4
group by 
    category_name
order by
    category_count desc;

-- Result:
category_name   category_count
Comedy  232
Entertainment   216
Music   195
Blogs   51
People  51
Film    47
Animation       47
News    22
Politics        22
Games   20
Gadgets 20
Sports  19
Howto   14
DIY     14
UNA     13
Places  12
Travel  12
Animals 11
Pets    11
Autos   4
Vehicles        4
4.5 Top 10 users by number of uploaded videos and the Top 20 most-viewed videos among their uploads
-- Top 10 users by number of uploaded videos
select
    uploader,
    videos
from
    gulivideo_user_orc
order by
    videos desc
limit 10;   -- referred to as t1 below
-- take every video uploaded by these 10 users, order by view count, and keep the top 20
select
    gulivideo_orc.videoId,
    gulivideo_orc.views
from
    (select
    uploader,
    videos
from
    gulivideo_user_orc
order by
    videos desc
limit 10)t1
join
    gulivideo_orc
on
    t1.uploader=gulivideo_orc.uploader
order by
    views desc
limit 20;

-- Result:
video.videoid   video.views
-IxHBW0YpZw     39059
BU-fT5XI_8I     29975
ADOcaBYbMl0     26270
yAqsULIDJFE     25511
vcm-t0TJXNg     25366
0KYGFawp14c     24659
j4DpuPvMLF4     22593
Msu4lZb2oeQ     18822
ZHZVj44rpjE     16304
foATQY3wovI     13576
-UnQ8rcBOQs     13450
crtNd46CDks     11639
D1leA0JKHhE     11553
NJu2oG1Wm98     11452
CapbXdyv4j4     10915
epr5erraEp4     10817
IyQoDgaLM7U     10597
tbZibBnusLQ     10402
_GnCHodc7mk     9422
hvEYlSlRitU     7123
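
If you also want to see which of the ten uploaders each of these videos belongs to, a small variant of the same query (a sketch; it only adds the uploader column) does the job:

select
    gulivideo_orc.videoId,
    gulivideo_orc.uploader,
    gulivideo_orc.views
from
    (select
    uploader,
    videos
from
    gulivideo_user_orc
order by
    videos desc
limit 10)t1
join
    gulivideo_orc
on
    t1.uploader=gulivideo_orc.uploader
order by
    views desc
limit 20;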
4.6 Top 10 hottest videos in each category (taking Music as an example)
  1. To compute the Top 10 hottest videos in the Music category we first have to find the Music
    category, which requires exploding the category field; so we create a table that stores the
    data with categoryId already exploded.

  2. Insert data into the exploded-category table.

    create table gulivideo_category(
    videoId string,
    uploader string,
    age int,
    categoryId string,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
    row format delimited
    fields terminated by "\t"
    collection items terminated by "&"
    stored as orc;
    
    insert into table gulivideo_category
    select
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
    from 
    gulivideo_orc lateral view explode(category) catetory as
    categoryId;
    
    select count(*) from gulivideo_category;
    -- Result: 1019206
    select count(*) from gulivideo_orc;
    -- Result: 743569
    -- (gulivideo_category has more rows because a video with N categories becomes N rows after explode)
    
  3. Compute the video popularity within a given category (Music as the example)

    -- assign each video a rank within its category, ordered by view count descending
    select
        categoryId,
        videoId,
        views,
        rank() over(partition by categoryId order by views desc) rk
    from
        gulivideo_category;
    
    -- keep only the top 10 of each category
    select
        categoryId,
        videoId,
        views
    from
        (select
        categoryId,
        videoId,
        views,
        rank() over(partition by categoryId order by views desc) rk
    from
        gulivideo_category)t1
    where
        rk<=10;
        
    -- Result:
    categoryid      videoid views
    Animals 2GWPOPSXGYI     3660009
    Animals xmsV9R8FsDA     3164582
    Animals 12PsUW-8ge4     3133523
    Animals OeNggIGSKH8     2457750
    Animals WofFb_eOxxA     2075728
    Animals AgEmZ39EtFk     1999469
    Animals a-gW3RbJd8U     1836870
    Animals 8CL2hetqpfg     1646808
    Animals QmroaYVD_so     1645984
    Animals Sg9x5mUjbH8     1527238
    -- ...
    Vehicles        RjrEQaG5jPM     2803140
    Vehicles        cv157ZIInUk     2773979
    Vehicles        Gyg9U1YaVk8     1832224
    Vehicles        6GNB7xT3rNE     1412497
    Vehicles        tth9krDtxII     1347317
    Vehicles        46LQd9dXFRU     1262173
    Vehicles        pdiuDXwgrjQ     1013697
    Vehicles        kY_cDpENQLE     956665
    Vehicles        YtxfbxGz1u4     942604
    Vehicles        aCamHfJwSGU     847442
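
Since the query above already ranks every category, the Music example from the task statement only needs an extra filter on categoryId; a minimal variant over the same gulivideo_category table:

select
    categoryId,
    videoId,
    views
from
    (select
    categoryId,
    videoId,
    views,
    rank() over(partition by categoryId order by views desc) rk
from
    gulivideo_category)t1
where
    rk<=10
    and categoryId="Music";

The remaining metric from the task list, Top 10 videos by traffic in each category, can be written the same way; a sketch that assumes "traffic" refers to the ratings column described in section 1.1:

select
    categoryId,
    videoId,
    ratings
from
    (select
    categoryId,
    videoId,
    ratings,
    rank() over(partition by categoryId order by ratings desc) rk
from
    gulivideo_category)t1
where
    rk<=10;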
    
