1. Override the map phase
The mapper emits each input line as the key with an empty Text as the value; the shuffle phase then groups all identical lines under a single key, which is what makes the deduplication work.
package hzy.com.WordDeduplication;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordDeduplicationMapper extends Mapper<Object, Text, Text, Text> {
    // Reusable empty value; the line itself carries all the information
    private Text text = new Text("");

    @Override
    protected void map(Object key, Text value,
            Mapper<Object, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // Emit the whole input line as the key and an empty Text as the value,
        // so identical lines end up grouped together during the shuffle
        context.write(value, text);
    }
}
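To sanity-check the mapper in isolation, a local unit test along the following lines can be used. This is a minimal sketch assuming MRUnit 1.x and JUnit 4 on the classpath (neither is part of this tutorial's setup); the test class name and the sample line "hello world" are illustrative.

package hzy.com.WordDeduplication;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WordDeduplicationMapperTest {
    @Test
    public void mapEmitsLineAsKey() throws Exception {
        // The mapper should pass the line through as the key, with an empty value
        MapDriver.newMapDriver(new WordDeduplicationMapper())
                .withInput(new LongWritable(0), new Text("hello world"))
                .withOutput(new Text("hello world"), new Text(""))
                .runTest();
    }
}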
2. Override the reduce phase
After the shuffle, each distinct line reaches the reducer exactly once as a key, so writing the key once per group is enough to remove the duplicates.
package hzy.com.WordDeduplication;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordDeduplicationReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values,
            Reducer<Text, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // All duplicates of a line arrive under the same key; ignore the
        // values and write the key once, which deduplicates the input
        context.write(key, new Text(""));
    }
}
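The reducer can be tested the same way. Again a minimal sketch under the same assumptions (MRUnit 1.x, JUnit 4; test class name and sample data are illustrative): two empty values stand in for a line that appeared twice in the input, and the expected output contains it only once.

package hzy.com.WordDeduplication;
import java.util.Arrays;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class WordDeduplicationReducerTest {
    @Test
    public void reduceCollapsesDuplicates() throws Exception {
        // A key with two values (a duplicated line) should be written exactly once
        ReduceDriver.newReduceDriver(new WordDeduplicationReducer())
                .withInput(new Text("hello world"),
                        Arrays.asList(new Text(""), new Text("")))
                .withOutput(new Text("hello world"), new Text(""))
                .runTest();
    }
}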
3. The main function
The driver wires the mapper and reducer together, points the job at the input and output paths on HDFS, and submits it to the cluster.
package hzy.com.WordDeduplication;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordDeduplicationApp {
    public static void main(String[] args) throws Exception {
        // 1. Connect to Hadoop: point the job at the HDFS NameNode
        Configuration cf = new Configuration();
        cf.set("fs.defaultFS", "hdfs://hadoop0:9000/");
        // 2. Create the Job and set the entry class
        Job job = Job.getInstance(cf);
        job.setJarByClass(WordDeduplicationApp.class);
        // 3. Read the input data from HDFS; both files are deduplicated together
        FileInputFormat.addInputPath(job, new Path("/data/source.txt"));
        FileInputFormat.addInputPath(job, new Path("/data/source2.txt"));
        // 4. Configure the map phase
        job.setMapperClass(WordDeduplicationMapper.class);
        // map output key type
        job.setMapOutputKeyClass(Text.class);
        // map output value type
        job.setMapOutputValueClass(Text.class);
        // Optional: the reducer can also serve as the combiner here, since its
        // logic is idempotent; this cuts shuffle traffic but is not required
        // job.setCombinerClass(WordDeduplicationReducer.class);
        // 5. Configure the reduce phase
        job.setReducerClass(WordDeduplicationReducer.class);
        // reduce output key type
        job.setOutputKeyClass(Text.class);
        // reduce output value type
        job.setOutputValueClass(Text.class);
        // 6. Write the result to HDFS; the output directory must not already
        // exist, or the job fails with a FileAlreadyExistsException
        FileOutputFormat.setOutputPath(job, new Path("/data/xt0"));
        // 7. Submit the job and wait for it to finish; exit code reflects success
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
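To try the job, package the three classes into a jar and submit it with the hadoop client (the jar name below is illustrative):

hadoop jar WordDeduplication.jar hzy.com.WordDeduplication.WordDeduplicationApp

Once the job completes, the deduplicated lines land in /data/xt0 and can be inspected with hdfs dfs -cat /data/xt0/part-r-00000, where part-r-00000 is the default name of the first reducer's output file.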