1. Data cleansing and multi-directory output
package cn.yu.hive.dataclearing;
import cn.yu.hive.tools.Constant;
import cn.yu.hive.tools.Fileds;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.net.URISyntaxException;
/**
* @author liuyujing
* @create 2019-08-07 11:14
*/
public class DataClearingV {
    protected static class CleansingMap extends Mapper<LongWritable, Text, Text, NullWritable> {
        private MultipleOutputs<Text, NullWritable> mops;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            mops = new MultipleOutputs<Text, NullWritable>(context);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Close the multiple outputs, otherwise buffered records may be lost
            mops.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String string = value.toString();
            String[] str = string.split(Constant.IN_Split);
            if (!cleansing(str)) {
                // Abnormal records go to <output>/nunormal/nunormal-m-xxxxx
                mops.write(Constant.OUT_NUNORMAL, value, NullWritable.get(), Constant.OUT_NUNORMAL + "/" + Constant.OUT_NUNORMAL);
            } else {
                // Normal records go to <output>/normal/normal-m-xxxxx
                mops.write(Constant.OUT_NORMAl, value, NullWritable.get(), Constant.OUT_NORMAl + "/" + Constant.OUT_NORMAl);
            }
        }

        // A record is normal when it has 9 to 29 fields and the video id is exactly 11 characters long
        protected boolean cleansing(String[] str) {
            if (str.length < 9 || str.length > 29) {
                return false;
            }
            if (str[Fileds.video_id].length() != 11) {
                return false;
            }
            return true;
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration configuration = new Configuration();
        FileSystem filesystem = FileSystem.get(configuration);
        Job job = Job.getInstance(configuration);
        job.setJarByClass(DataClearingV.class);
        job.setMapperClass(DataClearingV.CleansingMap.class);
        // Map-only job: no reducers
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(Constant.FILE_INV));
        Path outpath = new Path(Constant.FILE_OUTV);
        // Delete the output directory if it already exists, otherwise the job fails
        if (filesystem.exists(outpath)) {
            filesystem.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);
        MultipleOutputs.addNamedOutput(job, Constant.OUT_NUNORMAL, TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, Constant.OUT_NORMAl, TextOutputFormat.class, Text.class, NullWritable.class);
        boolean b = job.waitForCompletion(true);
        // Exit code 0 on success, 1 on failure
        System.exit(b ? 0 : 1);
    }
}
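Since every record in this job goes out through MultipleOutputs, the default part-m-xxxxx files under FILE_OUTV are created but left empty. An optional sketch (not in the original code) that suppresses them with Hadoop's LazyOutputFormat, which only instantiates the real output format when a record is actually written:

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;

// in main(), after FileOutputFormat.setOutputPath(job, outpath):
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);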
2. Format conversion of the normal data
package cn.yu.hive.form;
import cn.yu.hive.tools.Constant;
import cn.yu.hive.tools.Fileds;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URISyntaxException;
/**
* @author liuyujing
* @create 2019-08-07 14:55
*/
public class FromV {
    protected static class Vmap extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(Constant.IN_Split);
            String video_id = split[Fileds.video_id];
            String uploader = split[Fileds.uploader];
            String age = split[Fileds.age];
            // Category values may contain spaces (e.g. "People & Blogs"); remove them
            String category = split[Fileds.category].replace(" ", "");
            String length = split[Fileds.length];
            String views = split[Fileds.views];
            String rate = split[Fileds.rate];
            String ratings = split[Fileds.ratings];
            String conments = split[Fileds.conments];
            String out = video_id + Constant.OUT_Split + uploader + Constant.OUT_Split + age + Constant.OUT_Split
                    + category + Constant.OUT_Split + length + Constant.OUT_Split + views + Constant.OUT_Split
                    + rate + Constant.OUT_Split + ratings + Constant.OUT_Split + conments;
            // Fields from index 9 onward are related video ids; join them with "&" into one field
            if (split.length > 9) {
                out = out + Constant.OUT_Split;
                for (int i = 9; i < split.length - 1; i++) {
                    out = out + split[i] + Constant.OUT_ARRAY;
                }
                out = out + split[split.length - 1];
            }
            context.write(new Text(out), NullWritable.get());
        }
    }
    public static void main(String[] args) throws URISyntaxException, IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        FileSystem fileSystem = FileSystem.get(configuration);
        Job job = Job.getInstance(configuration);
        job.setJarByClass(FromV.class);
        job.setMapperClass(FromV.Vmap.class);
        // Map-only job: no reducers
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Read only the normal data produced by DataClearingV
        FileInputFormat.setInputPaths(job, new Path(Constant.FILE_OUTV + "\\" + Constant.OUT_NORMAl));
        Path path = new Path(Constant.FILE_OUTV + "\\" + Constant.OUT_V);
        // Delete the output directory if it already exists
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);
        boolean b = job.waitForCompletion(true);
        // Exit code 0 on success, 1 on failure
        System.exit(b ? 0 : 1);
    }
}
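To make the conversion easier to see in isolation, here is a minimal, self-contained sketch of the same field logic applied to one record (the class name and the sample line are hypothetical, invented only for illustration; the real input comes from the cleansed normal files):

import java.util.Arrays;

// Standalone demo of the FromV field logic; not part of the job code
public class FromVDemo {
    public static void main(String[] args) {
        // Hypothetical cleansed record: 9 fixed fields plus two related video ids, tab-separated
        String line = "LKh7zAJ4nwo\tTheReceptionist\t653\tPeople & Blogs\t424"
                + "\t13021\t4.34\t1305\t744\tDjdA-5oKYFQ\tNxTDlnOuybo";
        String[] split = line.split("\t");
        StringBuilder out = new StringBuilder();
        // The first 9 fields stay tab-separated; spaces are stripped from category (index 3)
        for (int i = 0; i < 9; i++) {
            if (i > 0) out.append("\t");
            out.append(i == 3 ? split[i].replace(" ", "") : split[i]);
        }
        // Everything from index 9 onward is a related id, joined with "&" into one field
        if (split.length > 9) {
            out.append("\t").append(String.join("&", Arrays.copyOfRange(split, 9, split.length)));
        }
        // category becomes People&Blogs, related ids become DjdA-5oKYFQ&NxTDlnOuybo
        System.out.println(out);
    }
}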
3. Tool classes
package cn.yu.hive.tools;
/**
* @author liuyujing
* @create 2019-07-22 10:03
*/
public class Constant {
    /**
     * Input and output directory paths
     */
    public static final String FILE_IN = "E:\\text\\use\\in";
    public static final String FILE_OUT = "E:\\text\\use\\out";
    public static final String FILE_INV = "E:\\text\\video\\in";
    public static final String FILE_OUTV = "E:\\text\\video\\out";
    /**
     * Subdirectories under the output directory
     */
    public static final String OUT_NORMAl = "normal";
    public static final String OUT_NUNORMAL = "nunormal";
    public static final String OUT_V = "video";
    /**
     * Input and output field separators
     */
    public static final String IN_Split = "\t";
    public static final String OUT_Split = "\t";
    public static final String OUT_ARRAY = "&";
}
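Given these constants, and assuming local mode on Windows as the drive-letter paths suggest, a successful run of both jobs should leave roughly this layout under FILE_OUTV (a sketch; file counts depend on the number of map tasks):

E:\text\video\out\normal\normal-m-00000      records that passed cleansing
E:\text\video\out\nunormal\nunormal-m-00000  records that failed cleansing
E:\text\video\out\video\part-m-00000         format-converted output of FromV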
package cn.yu.hive.tools;
/**
* @author liuyujing
* @create 2019-08-07 11:16
*/
public class Fileds {
    // unique video id
    public static final int video_id = 0;
    // uploader of the video
    public static final int uploader = 1;
    // video age
    public static final int age = 2;
    // video category
    public static final int category = 3;
    // video length (viewing duration, seconds)
    public static final int length = 4;
    // number of views
    public static final int views = 5;
    // video rating
    public static final int rate = 6;
    // traffic
    public static final int ratings = 7;
    // number of comments
    public static final int conments = 8;
    // related video ids (from this index onward)
    public static final int related_ids = 9;
}
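As a quick sanity check that the index constants line up with a raw record, a minimal sketch that labels the nine fixed fields (the class name and sample line are hypothetical, reusing the record from the FromV demo above):

package cn.yu.hive.tools;

// Prints each fixed field of one sample record next to its name
public class FiledsDemo {
    public static void main(String[] args) {
        String sample = "LKh7zAJ4nwo\tTheReceptionist\t653\tPeople & Blogs\t424\t13021\t4.34\t1305\t744";
        String[] f = sample.split(Constant.IN_Split);
        System.out.println("video_id = " + f[Fileds.video_id]);
        System.out.println("uploader = " + f[Fileds.uploader]);
        System.out.println("age      = " + f[Fileds.age]);
        System.out.println("category = " + f[Fileds.category]);
        System.out.println("length   = " + f[Fileds.length]);
        System.out.println("views    = " + f[Fileds.views]);
        System.out.println("rate     = " + f[Fileds.rate]);
        System.out.println("ratings  = " + f[Fileds.ratings]);
        System.out.println("conments = " + f[Fileds.conments]);
    }
}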
Log configuration file log4j.properties
log4j.rootLogger=INFO,stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m %n
Maven dependencies in pom.xml
<packaging>jar</packaging>
<properties>
    <hadoop.version>2.6.1</hadoop.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn</artifactId>
        <version>${hadoop.version}</version>
        <type>pom</type>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>