0. Project Structure
Data processing flow diagram
1. DupDriver
package hadoop_test.data_duplicate_demo_02;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DupDriver {
    /* Input data to deduplicate (one IP address per line):
    192.168.234.21
    192.168.234.22
    192.168.234.21
    192.168.234.21
    192.168.234.23
    192.168.234.21
    192.168.234.21
    192.168.234.21
    192.168.234.25
    192.168.234.21
    192.168.234.21
    192.168.234.26
    192.168.234.21
    192.168.234.27
    192.168.234.21
    192.168.234.27
    192.168.234.21
    192.168.234.29
    192.168.234.21
    192.168.234.26
    192.168.234.21
    192.168.234.25
    192.168.234.25
    192.168.234.21
    192.168.234.22
    192.168.234.21
    */
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(DupDriver.class);
        job.setMapperClass(DupMapper.class);
        job.setReducerClass(DupReducer.class);

        job.setMapOutputKeyClass(Text.class);
        // The map output value is NullWritable; the reason is explained in DupMapper below
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("/hadoop_test/dup/dup.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/hadoop_test/dup/word_count_result"));

        job.waitForCompletion(true);
    }
}
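Because the reducer simply re-emits its key, the same class can also serve as a combiner, collapsing duplicate IPs on the map side before the shuffle. This is an optional optimization that is not part of the original driver; a minimal sketch of the one extra line that would go in main():

// Optional (not in the original driver): reuse DupReducer as a combiner so duplicate
// IPs within each map task are collapsed locally, reducing shuffle traffic.
// This works because the map output types (Text, NullWritable) match the reduce
// input and output types.
job.setCombinerClass(DupReducer.class);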
2. DupMapper
package hadoop_test.data_duplicate_demo_02;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class DupMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // value is one IP address per line, e.g. 192.168.70.49
        // Since the goal is deduplication rather than counting, there is no need to emit 1
        // as the value (as in WordCount); emitting NullWritable reduces I/O overhead in the shuffle.
        context.write(new Text(value), NullWritable.get());
    }
}
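The actual deduplication happens in the shuffle-and-sort phase: all identical keys emitted by the mappers are sorted and grouped, so reduce() is called exactly once per distinct IP. A minimal plain-Java sketch (hypothetical, not part of the job) of what that grouping accomplishes:

import java.util.List;
import java.util.TreeSet;

public class ShuffleGroupingSketch {
    public static void main(String[] args) {
        // Map output keys: one per input line, duplicates included.
        List<String> mapOutputKeys = List.of(
                "192.168.234.21", "192.168.234.22", "192.168.234.21", "192.168.234.23");
        // The shuffle sorts the keys and groups equal ones, so the reducer sees each
        // distinct key exactly once -- modeled here with a sorted set.
        TreeSet<String> groupedKeys = new TreeSet<>(mapOutputKeys);
        groupedKeys.forEach(System.out::println); // prints .21, .22, .23 once each
    }
}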
3. DupReducer
package hadoop_test.data_duplicate_demo_02;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class DupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // The shuffle has already grouped identical keys, so each distinct IP reaches
        // this method exactly once; emitting the key alone completes the deduplication.
        context.write(new Text(key), NullWritable.get());
    }
}
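With the default single reducer and the natural (lexicographic) ordering of Text keys, the output file should contain each IP from the sample input exactly once, in sorted order:

192.168.234.21
192.168.234.22
192.168.234.23
192.168.234.25
192.168.234.26
192.168.234.27
192.168.234.29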
For a line-by-line explanation of the code, see [Hadoop Learning Project] 1. wordcount + combine, which documents every line in detail.