Code -> GitHub
https://github.com/liufengji/hive_code.git
1. ETL the raw data
Looking at the raw data, you can see that a video may belong to several categories, and the categories are separated by the "&" character with a space on each side of it; likewise, a video may have several related videos, and those related-video IDs are separated by "\t". To make these multi-valued fields easy to work with during analysis, we first clean and restructure the data.
That is: join all categories with "&" and strip the surrounding spaces, and also join the multiple related-video IDs with "&".
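For illustration only (the record below is made up; only the field positions matter), a raw line of the form
v001 \t uploaderA \t 100 \t People & Blogs \t ...five more fixed fields... \t rel01 \t rel02 \t rel03
becomes, after cleaning,
v001 \t uploaderA \t 100 \t People&Blogs \t ...five more fixed fields... \t rel01&rel02&rel03
The first nine tab-separated fields are kept (with the spaces inside the category field at index 3 removed), and every field from the tenth onward is treated as a related-video ID and re-joined with "&".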
2. ETL: ETLUtil
package com.z.youtube.util;

public class ETLUtil {
    /**
     * Clean one raw record: keep the first 9 tab-separated fields as-is
     * (stripping the spaces inside the category field), and re-join all
     * remaining fields (the related-video IDs) with "&".
     * Returns null for records with fewer than 9 fields so they can be dropped.
     */
    public static String oriString2ETLString(String ori) {
        StringBuilder etlString = new StringBuilder();
        String[] splits = ori.split("\t");
        // Discard malformed records that do not even contain the 9 fixed fields
        if (splits.length < 9) return null;
        // Field 3 is the category list, e.g. "People & Blogs" -> "People&Blogs"
        splits[3] = splits[3].replace(" ", "");
        for (int i = 0; i < splits.length; i++) {
            if (i < 9) {
                // Fixed fields: keep the original tab separator
                if (i == splits.length - 1) {
                    etlString.append(splits[i]);
                } else {
                    etlString.append(splits[i]).append("\t");
                }
            } else {
                // Related-video IDs: join with "&" instead of "\t"
                if (i == splits.length - 1) {
                    etlString.append(splits[i]);
                } else {
                    etlString.append(splits[i]).append("&");
                }
            }
        }
        return etlString.toString();
    }
}
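Before wiring the helper into MapReduce, it can be sanity-checked with a tiny driver like the sketch below. ETLUtilDemo is a hypothetical class (assumed to sit in the same package as ETLUtil), and the sample record is made up:
public class ETLUtilDemo {
    public static void main(String[] args) {
        // Hypothetical record: 9 fixed fields followed by 3 related-video IDs
        String raw = "v001\tuploaderA\t100\tPeople & Blogs\t120\t1000\t4.5\t20\t5\trel01\trel02\trel03";
        // Prints the same 9 fields with the category spaces removed,
        // followed by "rel01&rel02&rel03" as the last field
        System.out.println(ETLUtil.oriString2ETLString(raw));
    }
}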
3. ETL: Mapper
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.z.youtube.util.ETLUtil;

public class VideoETLMapper extends Mapper<Object, Text, NullWritable, Text> {

    private Text text = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Clean the raw line; a null/blank result means the record was malformed
        String etlString = ETLUtil.oriString2ETLString(value.toString());
        if (StringUtils.isBlank(etlString)) {
            return;
        }
        // Map-only job: emit the cleaned line with a NullWritable key
        text.set(etlString);
        context.write(NullWritable.get(), text);
    }
}
4. ETL: Runner
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class VideoETLRunner implements Tool {

    private Configuration conf = null;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {
        conf = this.getConf();
        // args[0] = input directory on HDFS, args[1] = output directory on HDFS
        conf.set("inpath", args[0]);
        conf.set("outpath", args[1]);

        Job job = Job.getInstance(conf, "youtube-video-etl");
        job.setJarByClass(VideoETLRunner.class);

        job.setMapperClass(VideoETLMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Map-only job: the mapper output is written directly as the final output
        job.setNumReduceTasks(0);

        this.initJobInputPath(job);
        this.initJobOutputPath(job);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    private void initJobOutputPath(Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        String outPathString = conf.get("outpath");
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path(outPathString);
        // Delete any existing output directory so the job can be rerun
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
    }

    private void initJobInputPath(Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        String inPathString = conf.get("inpath");
        FileSystem fs = FileSystem.get(conf);
        Path inPath = new Path(inPathString);
        if (fs.exists(inPath)) {
            FileInputFormat.addInputPath(job, inPath);
        } else {
            throw new RuntimeException("Input directory does not exist on HDFS: " + inPathString);
        }
    }

    public static void main(String[] args) {
        try {
            int resultCode = ToolRunner.run(new VideoETLRunner(), args);
            if (resultCode == 0) {
                System.out.println("Success!");
            } else {
                System.out.println("Fail!");
            }
            System.exit(resultCode);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
5. Running the ETL job
[victor@hadoop102 hadoop]$ bin/yarn jar /opt/youtube.jar \
com.z.youtube.etl.VideoETLRunner \
/youtube/video/2008/0222 \
/youtube/output/video/2008/0222
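Once the job finishes, the cleaned output can be spot-checked directly from HDFS (assuming the output path used above; part-m-00000 is the usual file name for a map-only job):
[victor@hadoop102 hadoop]$ bin/hdfs dfs -cat /youtube/output/video/2008/0222/part-m-00000 | head -n 5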
Bonus: Maven build-and-package command hint (the local profile is assumed to be defined in the project's pom.xml):
mvn -P local clean package