数据是这样的文本文件,一行为一个单词
如: words.txt
aa
aaa
b
xxx
s
v
words2.txt:
aa
bb
aaa
bbb
ddddd
d
代码和单词计数没有太大差别:map阶段传入的key为该行在文件中的字节偏移量,value为这一行所对应的单词。只要将单词作为key传给下一步骤(shuffle阶段,此阶段默认会按key进行排序和分组;combiner需要在作业中显式设置才会执行),value可以随便指定,因为不会使用到。shuffle阶段按key排序分组后,相同的key只会触发一次reduce调用,自然就完成了去重工作;因此shuffle阶段把数据传给reduce阶段后,只需要将key输出即可。
代码为:
package deduplication;
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * MapReduce job that removes duplicate lines from its input files.
 *
 * <p>Each input line is emitted by the mapper as the output key with an empty
 * value. The framework groups identical keys before calling the reducer, so
 * the reducer only has to write each key once to produce the de-duplicated
 * result.
 *
 * <p>Usage: {@code Dedup [<input>... <output>]}. When no paths are supplied
 * the job falls back to the built-in sample input files and output directory.
 */
public class Dedup extends Configured implements Tool {

    /** Input files used when no paths are given on the command line. */
    private static final String[] DEFAULT_INPUTS = {
        "src/deduplication/words.txt",
        "src/deduplication/words2.txt"
    };

    /** Output directory used when no paths are given on the command line. */
    private static final String DEFAULT_OUTPUT = "rst2";

    /** Exit code returned when the command-line arguments are malformed. */
    private static final int EXIT_USAGE = 2;

    /**
     * Shared placeholder value. The value is never read downstream, so a
     * single instance is reused instead of allocating one per record.
     */
    private static final Text EMPTY = new Text("");

    /** Emits every input line as a key, paired with an empty value. */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        /**
         * Writes the line itself as the output key.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   the content of the current line
         * @param context task context used to emit the output pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, EMPTY);
        }
    }

    /** Writes each distinct key exactly once, ignoring the grouped values. */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {

        /**
         * Emits the key once per group.
         *
         * @param key     a distinct line from the input
         * @param value   the placeholder values grouped under the key (unused)
         * @param context task context used to emit the output pair
         */
        @Override
        protected void reduce(Text key, Iterable<Text> value, Context context)
                throws IOException, InterruptedException {
            context.write(key, EMPTY);
        }
    }

    /**
     * Command-line entry point; delegates to {@link ToolRunner} and exits
     * with the status returned by {@link #run(String[])}.
     */
    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new Dedup(), args);
        System.exit(ret);
    }

    /**
     * Configures and runs the de-duplication job.
     *
     * @param args either empty (use the default sample paths) or one or more
     *             input paths followed by the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments
     *         were malformed
     */
    @Override
    public int run(String[] args) throws Exception {
        String[] inputs;
        String output;
        if (args == null || args.length == 0) {
            // No paths supplied: keep the original hard-coded behavior.
            inputs = DEFAULT_INPUTS;
            output = DEFAULT_OUTPUT;
        } else if (args.length == 1) {
            // A single path is ambiguous (input or output?), so refuse it.
            System.err.println("Usage: Dedup [<input>... <output>]");
            return EXIT_USAGE;
        } else {
            // Last argument is the output path; everything before it is input.
            inputs = new String[args.length - 1];
            System.arraycopy(args, 0, inputs, 0, inputs.length);
            output = args[args.length - 1];
        }

        // Job.getInstance replaces the deprecated Job constructor.
        Job job = Job.getInstance(getConf(), "dep");
        job.setJarByClass(Dedup.class);

        job.setMapperClass(Map.class);
        // Reduce consumes and emits the same (Text, Text) pair types, so it
        // can also serve as the combiner and drop duplicates before shuffle.
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        Path[] inputPaths = new Path[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputPaths[i] = new Path(inputs[i]);
        }
        FileInputFormat.setInputPaths(job, inputPaths);
        FileOutputFormat.setOutputPath(job, new Path(output));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }
}
结果为:
aa
aaa
b
bb
bbb
d
ddddd
s
v
xxx