Hive Basics [Hive Project Practice]

I Data Structures

1 Video Table

Field       | Description                | Details
video id    | unique video id (String)   | an 11-character string
uploader    | video uploader (String)    | username of the uploading user
age         | video age (int)            | number of days the video has been on the platform, as an integer
category    | video categories (Array)   | the categories assigned to the video at upload
length      | video length (int)         | video length as an integer
views       | view count (int)           | number of times the video has been viewed
rate        | video rating (Double)      | rating out of 5
ratings     | traffic (int)              | video traffic, as an integer
comments    | comment count (int)        | number of comments on the video, as an integer
related ids | related video ids (Array)  | ids of related videos, at most 20

2 User Table

Field    | Description               | Details
uploader | uploader username         | string
videos   | number of uploaded videos | int
friends  | number of friends         | int

3 ETL on the Raw Data

Inspection of the raw data shows that a video can belong to several categories, separated by the '&' character with a space on each side, and that a video can also have several related videos, separated by '\t'. To make it easier to work with these multi-valued fields during analysis, we first clean and restructure the data: the categories stay separated by '&' but the surrounding spaces are removed, and the multiple related video ids are also joined with '&'.
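For illustration only, here is a hypothetical raw line and its cleaned form (tabs shown as \t, all values made up):

    vid001\tuser_a\t120\tPeople & Blogs\t230\t5000\t4.5\t300\t25\trel01\trel02\trel03
    vid001\tuser_a\t120\tPeople&Blogs\t230\t5000\t4.5\t300\t25\trel01&rel02&rel03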

On the HDFS server, create the folders video and user under the hikevideo directory and upload the raw data into them.
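A minimal sketch of the upload, assuming the raw video and user files sit in the current local directory (the local file names video.txt and user.txt are placeholders):

    hadoop fs -mkdir -p /hikevideo/video /hikevideo/user
    hadoop fs -put video.txt /hikevideo/video
    hadoop fs -put user.txt /hikevideo/user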

3.1 Add Dependencies

--Add the following dependencies to pom.xml
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>2.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!--        <dependency>-->
        <!--            <groupId>org.apache.hadoop</groupId>-->
        <!--            <artifactId>hadoop-client-runtime</artifactId>-->
        <!--            <version>3.1.3</version>-->
        <!--        </dependency>-->
    </dependencies>

3.2 Create the log4j2.xml Configuration File

--Create the log4j2.xml file
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="error" strict="true" name="XMLConfig">
    <Appenders>
        <!-- Appender type is Console; the name attribute is required -->
        <Appender type="Console" name="STDOUT">
            <!-- Use a PatternLayout;
            output looks like [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
            <Layout type="PatternLayout"
                    pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
        </Appender>

    </Appenders>

    <Loggers>
        <!-- additivity set to false -->
        <Logger name="test" level="info" additivity="false">
            <AppenderRef ref="STDOUT" />
        </Logger>

        <!-- root logger configuration -->
        <Root level="info">
            <AppenderRef ref="STDOUT" />
        </Root>
    </Loggers>

</Configuration>

3.3 ETLMapper

package com.hike.etl;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ETLMapper extends Mapper<LongWritable,Text,Text,NullWritable> {

    private Counter pass;
    private Counter fail;

    private StringBuilder sb = new StringBuilder();

    private Text result = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        pass = context.getCounter("ETL","pass");
        fail = context.getCounter("ETL","fail");
    }

    /**
     * Process one line of the log: remove the spaces in the fourth field,
     * change the separator of the trailing related-video fields to '&',
     * and drop records that do not have enough fields.
     * @param key   offset of the line
     * @param value one line of the log
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
        //Check that there are enough fields: every field except the trailing related-video field must be present
        if(fields.length >= 9){
            //Clean the data: remove the spaces in the fourth field (category)
            fields[3] = fields[3].replace(" ", "");

            //Rebuild the line, taking care with the separators of the trailing fields
            sb.setLength(0);    //clear the previous contents so each record is rebuilt from scratch
            for (int i = 0; i < fields.length; i++) {
                //the last field of the line gets no trailing separator
                if(i == fields.length - 1){
                    sb.append(fields[i]);
                }else if(i <= 8){
                    //the first 9 fields are followed by the \t separator
                    sb.append(fields[i]).append("\t");
                }else {
                    //related-video fields are separated by &
                    sb.append(fields[i]).append("&");
                }
            }
            //finally convert to Text and write out
            result.set(sb.toString());

            context.write(result,NullWritable.get());

            pass.increment(1);
        }else{
            //discard the record by simply not writing it out
            fail.increment(1);
        }
    }
}

3.4 ETLDriver

package com.hike.etl;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.io.Text;
import java.io.IOException;

public class ETLDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration entries = new Configuration();
        //Run the MapReduce job on the Tez engine
        entries.set("mapreduce.framework.name","yarn-tez");

        Job job = Job.getInstance(entries);

        job.setJarByClass(ETLDriver.class);

        job.setMapperClass(ETLMapper.class);
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}

3.5 Submit to the Cluster

yarn jar etltool-1.0-SNAPSHOT.jar com.hike.etl.ETLDriver /hikevideo/video /hikevideo/video_etl
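The jar here is the one built by mvn package for this project (the name etltool-1.0-SNAPSHOT.jar assumes the artifactId is etltool). After the job finishes, the pass/fail counters registered in ETLMapper appear under the ETL counter group in the job output; a quick sanity check of the cleaned data might look like this (part-m-00000 is the typical name of the first map-only output file):

    hadoop fs -ls /hikevideo/video_etl
    hadoop fs -cat /hikevideo/video_etl/part-m-00000 | head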

4 Preparation

Create two external tables over the user and video_etl directories to wrap the data. The external tables are not queried directly for analysis, because querying text-format data is relatively slow; the usual practice is to create external tables first and then load their data into internal tables.

The simplest ETL work is usually done at this load-into-table step.

External tables are generally used when referencing external data, because the data is shared: dropping an external table leaves the underlying data intact.

4.1 Create the External ori Tables

--Create the external table video_ori
create external table video_ori(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
location '/hikevideo/video_etl';
--Create the external table user_ori
create external table user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited fields terminated by "\t" 
location '/hikevideo/user';

4.2 Create the Internal ORC Tables

--Create video_orc
create table video_orc(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
--Create the internal table user_orc
create table user_orc(
    uploader string,
    videos int,
    friends int)
stored as orc
tblproperties("orc.compress"="SNAPPY");

4.3 Insert Data into the ORC Tables

If needed, additional data cleaning can be done while loading the data.

--Insert data from the external tables
insert into table video_orc select * from video_ori;
insert into table user_orc select * from user_ori;

5 Business Analysis

5.1 Top 10 Videos by View Count

--Use order by to sort globally on the views field and show only the first 10 rows
select 
    videoid, 
    views
from 
    video_orc 
order by 
    views desc
limit 10;

5.2 Top 10 Video Categories by Popularity

  • Define category popularity (here assumed to be the number of videos in the category)

  • category is an array, so its elements need to be split apart

    select 
        videoid,
        cate
    from
        video_orc lateral view explode(category) tbl as cate;
    
  • Building on the above, count how many videos each category has, sort, and take the top 10

    select
        cate,
        count(videoid) n
    from
        (select 
        videoid,
        cate
    from
        video_orc lateral view explode(category) tbl as cate) t1
    group by
        cate
    order by n desc limit 10;
    

5.3 Categories of the 20 Most-Viewed Videos and the Number of Top-20 Videos Each Category Contains

  • Get the top 20 videos and their categories

    select 
        videoid,
        views,
        category
    from
        video_orc
    order by
        views desc
    limit
        20;
    
  • Split the categories apart

    select
        videoid,
        cate
    from
        t1 lateral view explode(category) tbl as cate;
    
  • Count by category

    select
        cate,
        count(videoid) n
    from
        t2
    group by
        cate
    order by
        n desc;
    
  • Combine the three steps above

    select
        cate,
        count(videoid) n
    from
        (select
        videoid,
        cate
    from
        (select 
        videoid,
        views,
        category
    from
        video_orc
    order by
        views desc
    limit
        20)t1 lateral view explode(category) tbl as cate)t2
    group by
        cate
    order by
        n desc;
    

5.4 Categories of the Videos Related to the Top 50 Most-Viewed Videos, Ordered by Category Popularity

  • Get the related videos of the top 50 most-viewed videos

    select
        videoid,
        views,
        relatedid
    from
        video_orc
    order by
        views desc
    limit 50
    
  • Split the related video ids apart

    select
        explode(relatedid) videoid
    from 
        t1
    
  • Join with the original table to get the categories

    select
        distinct t2.videoid,
        v.category
    from
        t2
    join
        video_orc v
    on
        t2.videoid=v.videoid
    
  • Split the categories of the related videos apart

    select
        explode(category) cate
    from
        t3
    
  • Join with the category-popularity result and sort

    select
        distinct t4.cate,
        t5.n
    from
        t4
    join 
        t5
    on 
        t4.cate=t5.cate
    order by
        t5.n desc
    
  • Put it all together

    select
        distinct t4.cate,
        t5.n
    from
        (
        select
        explode(category) cate
    from
        (
        select
        distinct t2.videoid,
        v.category
    from
        (
        select
        explode(relatedid) videoid
    from 
        (
        select
        videoid,
        views,
        relatedid
    from
        video_orc
    order by
        views desc
    limit 50
        )t1
        )t2
    join
        video_orc v
    on
        t2.videoid=v.videoid
        )t3
        )t4
    join 
        (
        select
        cate,
        count(videoid) n
    from
        (select 
        videoid,
        cate
    from
        video_orc lateral view explode(category) tbl as cate) x1
    group by
        cate
        )t5
    on 
        t4.cate=t5.cate
    order by
        t5.n desc;
    

5.5 Top 10 Most Popular Videos in Each Category, Using Music as an Example

  • Split the categories of the video table apart into an intermediate table

    create table video_category
    stored as orc tblproperties("orc.compress"="SNAPPY") as
    select
        videoid,
        uploader,
        age,
        cate,
        length,
        views,
        ratings,
        comments,
        relatedid
    from
        video_orc
    lateral view explode(category) tbl as cate;
    
  • Query the top 10 Music videos from the intermediate table

    select
        videoid,
        views
    from
        video_category
    where
        cate="Music"
    order by
        views desc
    limit 10;
    

5.6 Top 10 Videos by Traffic in Each Category, Using Music as an Example

  • Query the top 10 Music videos by traffic (ratings) from video_category

    select
        videoid,
        ratings
    from
        video_category
    where
        cate="Music"
    order by
        ratings desc
    limit 10;
    

5.7 Top 10 Uploaders by Number of Videos, and Each One's 20 Most-Viewed Uploads

  • Find the top 10 users by number of uploaded videos

    select uploader,videos from user_orc order by videos desc limit 10;
    
  • Join with video_orc to find the videos these users uploaded

    select
        t1.uploader,
        v.videoid,
        rank() over(partition by t1.uploader order by v.views desc) hot
    from 
        t1
    join video_orc v on t1.uploader=v.uploader;
    
  • Take each user's top 20

    select
        t2.uploader,
        t2.videoid,
        t2.hot
    from
        (
        select
        t1.uploader,
        v.videoid,
        rank() over(partition by t1.uploader order by v.views desc) hot
    from 
        (
        select uploader,videos from user_orc order by videos desc limit 10
        )t1
    join video_orc v on t1.uploader=v.uploader
        )t2
    where
        hot <= 20;
    

5.8 Top 10 Uploaders by Number of Videos and Their Uploads Within the Overall Top 20 by Views

  • Find the top 10 users by number of uploaded videos

    select uploader,videos from user_orc order by videos desc limit 10;
    
  • Find the top 20 videos by view count overall

    select
        videoid,
        uploader,
        views
    from
        video_orc
    order by
        views desc
    limit 20;
    
  • Join the two result sets

    select
        t1.uploader,
        t2.videoid
    from
        (
        select uploader,videos from user_orc order by videos desc limit 10
        )t1 
    join
        (
        select
        videoid,
        uploader,
        views
    from
        video_orc
    order by
        views desc
    limit 20
        )t2
    on
        t1.uploader=t2.uploader;
    

5.9 Top 10 Videos by View Count in Each Category

  • From video_category, rank the videos in each category by view count

    select
        cate,
        videoid,
        views,
        rank() over(partition by cate order by views desc) hot
    from
        video_category
    
  • Take the top 10 of each category

    select 
        cate,
        videoid,
        views
    from
        (
        select
        cate,
        videoid,
        views,
        rank() over(partition by cate order by views desc) hot
        from
        video_category
        )t1
    where
        hot<=10;
    
