- 数据准备
user.txt
0.txt
字段以及字段名解析
user表
字段 备注 字段类型
uploader 上传者用户名 string
videos 上传视频数 int
friends 朋友数量 int
视频表:
字段 备注 详细描述
video id 视频唯一 id 11 位字符串
uploader 视频上传者 上传视频的用户名 String
age 视频年龄 视频上传日期和 2007 年 2 月
15 日之间的整数天(Youtube的独特设定)
category 视频类别 上传视频指定的视频分类
length 视频长度 整形数字标识的视频长度
views 观看次数 视频被浏览的次数
rate 视频评分 满分 5 分
ratings 流量 视频的流量,整型数字
comments 评论数 一个视频的整数评论数
related ids 相关视频 id 相关视频的 id,最多 20 个
- 数据清洗
- 通过mapreduce将数据清洗出来,通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频 id 也使用“&”进行分割。将数据放到hdfs指定的文件夹里面。
ETL数据清洗
ETLUtils.java
package ETLUtils;
/**
 * String-level ETL helpers for the raw YouTube video dump.
 */
public class ETLUtils {

    /**
     * Cleans one raw tab-separated video record.
     *
     * Input layout: 9 core fields (videoId .. comments) followed by a
     * variable number of related-video ids, all tab-separated.
     * Output: the 9 core fields tab-separated, spaces removed from the
     * category field, and the related ids joined with "&".
     *
     * @param str one raw line of the dump
     * @return the cleaned line, or {@code null} for malformed records
     *         with fewer than 9 fields
     */
    public static String getETCString(String str) {
        String[] fields = str.split("\t");
        // Validate BEFORE touching fields[3]: the original order accessed
        // fields[3] first and threw ArrayIndexOutOfBoundsException on
        // short lines instead of filtering them out.
        if (fields.length < 9) {
            return null;
        }
        // Strip spaces inside the category field, e.g. "A & B" -> "A&B".
        fields[3] = fields[3].replaceAll(" ", "");
        StringBuilder cleaned = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            cleaned.append(fields[i]);
            if (i < 9) {
                // Tab after each of the 9 core fields (indices 0..8).
                cleaned.append("\t");
            } else if (i != fields.length - 1) {
                // Related ids (index 9 onward) are joined with '&'.
                cleaned.append("&");
            }
        }
        return cleaned.toString();
    }
}
ETLMapper
package mapper;
import ETLUtils.ETLUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map-only cleaning step: feeds each raw line through the ETL utility
 * and emits the cleaned record (no key) — invalid records are dropped.
 */
public class ETLMapper extends Mapper<Object,Text,NullWritable,Text> {
    // Reused across calls to avoid allocating a Text per record.
    Text outValue = new Text();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        final String cleaned = ETLUtils.getETCString(value.toString());
        if (StringUtils.isBlank(cleaned)) {
            // getETCString returns null for malformed records; skip them.
            return;
        }
        outValue.set(cleaned);
        context.write(NullWritable.get(), outValue);
    }
}
ETLRunner
package runner;
import java.io.IOException;
import mapper.ETLMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for the map-only YouTube ETL job.
 * Usage: ETLRunner &lt;hdfs input dir&gt; &lt;hdfs output dir&gt;
 */
public class ETLRunner implements Tool {
    private Configuration conf = null;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {
        conf = this.getConf();
        // Stash the CLI paths on the configuration so the
        // init*Path helpers can read them back from the job.
        conf.set("inpath", args[0]);
        conf.set("outpath", args[1]);

        Job job = Job.getInstance(conf, "youtub_etl_video");
        job.setJarByClass(ETLRunner.class);
        job.setMapperClass(ETLMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Map-only job: cleaning needs no shuffle or reduce phase.
        job.setNumReduceTasks(0);

        this.initInputPath(job);
        this.initOutputPath(job);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Registers the input path, failing fast if it does not exist on HDFS. */
    private void initInputPath(Job job) throws IOException {
        Configuration jobConf = job.getConfiguration();
        String inpaths = jobConf.get("inpath");
        FileSystem fs = FileSystem.get(jobConf);
        Path inputDir = new Path(inpaths);
        if (!fs.exists(inputDir)) {
            throw new RuntimeException("HDFS目录不存在" + inpaths);
        }
        FileInputFormat.addInputPath(job, inputDir);
    }

    /** Registers the output path, deleting any previous run's output first. */
    private void initOutputPath(Job job) throws IOException {
        Configuration jobConf = job.getConfiguration();
        Path outputDir = new Path(jobConf.get("outpath"));
        FileSystem fs = FileSystem.get(jobConf);
        if (fs.exists(outputDir)) {
            // MapReduce refuses to start if the output dir exists.
            fs.delete(outputDir, true);
        }
        FileOutputFormat.setOutputPath(job, outputDir);
    }

    public static void main(String[] args) {
        try {
            int result = ToolRunner.run(new ETLRunner(), args);
            if (result == 0) {
                System.out.println("Success!");
            } else {
                System.out.println("Fail!");
            }
            System.exit(result);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
- 建立四张表打算用orc的表进行操作(操作效率比较高),但是orc类型的表只能通过insert的形式插入数据,所以需要建立两张ori的表再进行插入操作。
-- Raw (textfile) video table, loaded directly from the ETL output.
create table youtube_ori(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
-- Raw (textfile) user table. The leading prose "创建user ori表" was not
-- valid SQL and would break the script; it is now a comment.
create table youtube_user_ori(
    uploader string,
    videos int,
    friends int)
clustered by (uploader) into 24 buckets
row format delimited fields terminated by "\t"
stored as textfile;
-- ORC-backed video table (loaded via insert ... select from youtube_ori).
-- NOTE: the original used "//" as a comment delimiter, which Hive does not
-- support ("--" is the line-comment syntax).
create table youtube_orc(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
-- ORC-backed user table, bucketed the same way as youtube_user_ori.
create table youtube_user_orc(
    uploader string,
    videos int,
    friends int)
clustered by (uploader) into 24 buckets
row format delimited fields terminated by "\t"
stored as orc;
- 导入数据到表ori中
-- Load the MapReduce-cleaned video records (output of the ETL job above).
load data inpath '/output/part-m-00000' into table youtube_ori;
-- Load the raw user file.
load data inpath '/output/user.txt' into table youtube_user_ori;
- 将表导入到orc中
用 insert into table A select * from B 导入到 orc 表
- 数据分析
字段:
视频id 上传者 视频年龄 视频类别 视频长度 观看次数 视频评分 流量 评论数 相关视频
videoId,uploader,age,category,length,views,rate,ratings,comments,relatedId
- 统计视频观看数top10
-- Top 10 videos by view count.
create table viewstop10 as
select
    videoId, uploader, age, category, length,
    views, rate, ratings, comments, relatedId
from youtube_orc
order by views desc
limit 10;
- 统计类别热度top10
-- Top 10 categories by number of videos ("heat").
create table hotTop10 as
select
    t1.category_name as category,
    count(t1.videoId) as hot
from (
    -- one row per (video, category) pair
    select videoId, category_name
    from youtube_orc
    lateral view explode(category) t_category as category_name
) t1
group by t1.category_name
order by hot desc
limit 10;
- 统计出视频观看数最高的 20 个视频的所属类别以及类别包含这 Top20 视频的个数
-- For the 20 most-viewed videos: their categories, and how many of the
-- top-20 fall into each category.
create table top20views_countCategory as
select
    category_name,
    count(videoId) as vcount
from (
    select videoId, category
    from (
        select * from youtube_orc order by views desc limit 20
    ) t1
) t2
lateral view explode(category) t_category as category_name
group by category_name
order by vcount desc;
- 统计视频观看数 Top50 所关联视频的所属类别的热度排名
-- Category heat ranking of the videos RELATED to the top-50 most viewed.
-- The original query never used relatedId at all — it only exploded the
-- top-50 videos' own categories, which answers a different question.
-- Fix: explode relatedId, join each related id back to youtube_orc to get
-- its categories, then count per category.
select category_name, count(*) as hot
from (
    -- categories of every video referenced by a top-50 video's relatedId
    select v.category
    from (
        -- flatten the related-id lists of the 50 most-viewed videos
        select explode(relatedId) as related_id
        from (
            select relatedId from youtube_orc order by views desc limit 50
        ) t1
    ) t2
    join youtube_orc v on t2.related_id = v.videoId
) t3
lateral view explode(t3.category) t_category as category_name
group by category_name
order by hot desc;
- 统计每个类别中的视频热度 Top10,以 Music 为例
-- Flattened video table: one row per (video, category) pair, produced by
-- exploding youtube_orc.category. category array<string> is replaced by a
-- scalar categoryId so per-category rankings can filter/partition on it.
create table youtube_category(
videoId string,
uploader string,
age int,
categoryId string,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as orc;
-- Populate the flattened table: explode each video's category array into
-- one row per category value. The original statement was missing its
-- terminating ";" and would have collided with the next CREATE TABLE.
insert into table youtube_category
select
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from youtube_orc
lateral view explode(category) t_category as categoryId;
-- Top 10 "Music" videos by view count.
create table musicTop10 as
select
    videoId,
    categoryId,
    views
from youtube_category
where categoryId = "Music"
order by views desc
limit 10;
- 统计每个类别中视频流量 Top10,以 Music 为例
-- Top 10 "Music" videos by traffic (ratings).
create table ratingsTop10 as
select
    videoId,
    views,
    ratings
from youtube_category
where categoryId = "Music"
order by ratings desc
limit 10;
- 统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 的视频
-- For the 10 users with the most uploads: their 20 most-viewed videos.
-- The original "order by t2.views" was ascending, which returned the 20
-- LEAST-viewed videos; the requirement ("观看次数在前 20") needs desc.
select
    t2.videoId,
    t2.uploader,
    t2.views,
    t1.videos
from (
    select * from youtube_user_orc order by videos desc limit 10
) t1
join youtube_orc t2 on t1.uploader = t2.uploader
order by t2.views desc
limit 20;
- 统计每个类别视频观看数 Top10
-- Per-category view-count Top 10, via a ranking window per categoryId.
-- NOTE: the rank column keeps its original name "rant" because it becomes
-- a column of the created table.
create table categoryId_views as
select *
from (
    select
        videoId,
        categoryId,
        views,
        row_number() over(partition by categoryId order by views desc) rant
    from youtube_category
) t1
where rant <= 10;