Code -> GitHub
https://github.com/liufengji/hive_code.git
1. ETL the raw data
Looking at the raw data, you can see that a video may belong to several categories, and the categories are separated by the "&" character with a space on each side of it; likewise, a video may have several related videos, and those related-video IDs are separated by "\t". To make these multi-valued fields easy to work with during analysis, we first clean and restructure the data.
That is: join all categories with "&" and strip the surrounding spaces, and also join the multiple related-video IDs with "&".
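For illustration only (the record below is made up; only the field positions matter), a raw line of the form
v001 \t uploaderA \t 100 \t People & Blogs \t ...five more fixed fields... \t rel01 \t rel02 \t rel03
becomes, after cleaning,
v001 \t uploaderA \t 100 \t People&Blogs \t ...five more fixed fields... \t rel01&rel02&rel03
The first nine tab-separated fields are kept (with the spaces inside the category field at index 3 removed), and every field from the tenth onward is treated as a related-video ID and re-joined with "&".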
2. ETL: ETLUtil
package com.z.youtube.util;

public class ETLUtil {
    /**
     * Clean one raw record: keep the first 9 tab-separated fields as-is
     * (stripping the spaces inside the category field), and re-join all
     * remaining fields (the related-video IDs) with "&".
     * Returns null for records with fewer than 9 fields so they can be dropped.
     */
    public static String oriString2ETLString(String ori) {
        StringBuilder etlString = new StringBuilder();
        String[] splits = ori.split("\t");
        // Discard malformed records that do not even contain the 9 fixed fields
        if (splits.length < 9) return null;
        // Field 3 is the category list, e.g. "People & Blogs" -> "People&Blogs"
        splits[3] = splits[3].replace(" ", "");
        for (int i = 0; i < splits.length; i++) {
            if (i < 9) {
                // Fixed fields: keep the original tab separator
                if (i == splits.length - 1) {
                    etlString.append(splits[i]);
                } else {
                    etlString.append(splits[i]).append("\t");
                }
            } else {
                // Related-video IDs: join with "&" instead of "\t"
                if (i == splits.length - 1) {
                    etlString.append(splits[i]);
                } else {
                    etlString.append(splits[i]).append("&");
                }
            }
        }
        return etlString.toString();
    }
}
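Before wiring the helper into MapReduce, it can be sanity-checked with a tiny driver like the sketch below. ETLUtilDemo is a hypothetical class (assumed to sit in the same package as ETLUtil), and the sample record is made up:
public class ETLUtilDemo {
    public static void main(String[] args) {
        // Hypothetical record: 9 fixed fields followed by 3 related-video IDs
        String raw = "v001\tuploaderA\t100\tPeople & Blogs\t120\t1000\t4.5\t20\t5\trel01\trel02\trel03";
        // Prints the same 9 fields with the category spaces removed,
        // followed by "rel01&rel02&rel03" as the last field
        System.out.println(ETLUtil.oriString2ETLString(raw));
    }
}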
3. ETL: Mapper
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.z.youtube.util.ETLUtil;

public class VideoETLMapper extends Mapper<Object, Text, NullWritable, Text> {

    private Text text = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Clean the raw line; a null/blank result means the record was malformed
        String etlString = ETLUtil.oriString2ETLString(value.toString());
        if (StringUtils.isBlank(etlString)) {
            return;
        }
        // Map-only job: emit the cleaned line with a NullWritable key
        text.set(etlString);
        context.write(NullWritable.get(), text);
    }
}
4. ETL: Runner
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class VideoETLRunner implements Tool {

    private Configuration conf = null;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {
        conf = this.getConf();
        // args[0] = input directory on HDFS, args[1] = output directory on HDFS
        conf.set("inpath", args[0]);
        conf.set("outpath", args[1]);

        Job job = Job.getInstance(conf, "youtube-video-etl");
        job.setJarByClass(VideoETLRunner.class);

        job.setMapperClass(VideoETLMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Map-only job: the mapper output is written directly as the final output
        job.setNumReduceTasks(0);

        this.initJobInputPath(job);
        this.initJobOutputPath(job);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    private void initJobOutputPath(Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        String outPathString = conf.get("outpath");
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path(outPathString);
        // Delete any existing output directory so the job can be rerun
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
    }

    private void initJobInputPath(Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        String inPathString = conf.get("inpath");
        FileSystem fs = FileSystem.get(conf);
        Path inPath = new Path(inPathString);
        if (fs.exists(inPath)) {
            FileInputFormat.addInputPath(job, inPath);
        } else {
            throw new RuntimeException("Input directory does not exist on HDFS: " + inPathString);
        }
    }

    public static void main(String[] args) {
        try {
            int resultCode = ToolRunner.run(new VideoETLRunner(), args);
            if (resultCode == 0) {
                System.out.println("Success!");
            } else {
                System.out.println("Fail!");
            }
            System.exit(resultCode);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
5. Running the ETL job
[victor@hadoop102 hadoop]$ bin/yarn jar /opt/youtube.jar \
com.z.youtube.etl.VideoETLRunner \
/youtube/video/2008/0222 \
/youtube/output/video/2008/0222
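Once the job finishes, the cleaned output can be spot-checked directly from HDFS (assuming the output path used above; part-m-00000 is the usual file name for a map-only job):
[victor@hadoop102 hadoop]$ bin/hdfs dfs -cat /youtube/output/video/2008/0222/part-m-00000 | head -n 5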
Bonus: Maven build-and-package command hint (the local profile is assumed to be defined in the project's pom.xml):
mvn -P local clean package