package com.zhiyou.db23;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class DuplicateRemove {
//定义一个map类继承自Mapper类,在该类的map方法中实现map阶段数据处理的业务逻辑
public static class DuplicateRemoveMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
private Text outputKey = new Text();
private NullWritable outputValue =NullWritable.get();
private static String[] info;
@Ov
HADOOP去除重复
最新推荐文章于 2024-05-11 19:12:00 发布
本文介绍了一个使用Hadoop MapReduce实现去除重复数据的Java程序。通过自定义`DuplicateRemoveMapper`和`DuplicateRemoveReduce`类,分别处理Map阶段的数据解析和Reduce阶段的数据聚合。Mapper接收LongWritable和Text作为输入,输出Text和NullWritable,Reducer则将相同的Key进行聚合,最终输出不重复的Key。程序最后配置并启动Job,完成数据去重任务。
摘要由CSDN通过智能技术生成