Key-value pair types
k: the word, v: the name of the file it appears in
How do we find out which file the current value (line) comes from?
// Get the input split this record belongs to
FileSplit fs = (FileSplit) context.getInputSplit();
// Get the name of the file the split comes from
String name = fs.getPath().getName();
package text;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TextMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the input split this record belongs to
        FileSplit fs = (FileSplit) context.getInputSplit();
        // Get the name of the file the split comes from
        String name = fs.getPath().getName();
        // Emit (word, file name) for every word on this line
        String[] word = value.toString().split(" ");
        for (String s : word) {
            context.write(new Text(s), new Text(name));
        }
    }
}
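For example, given a hypothetical input file a.txt containing the line "hello world", this mapper emits the pairs (hello, a.txt) and (world, a.txt); the framework then groups these pairs by word before the reducer runs.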
Improvement
In the code above, the code that looks up the file name is duplicated work: it runs once for every line that is read, even though the result is the same for the entire split. It really only needs to run once per split.
MapReduce provides a setup method, which is called once before the task starts processing records, and a cleanup method, which is called once after the task has finished.
Since one map task processes exactly one split, the code that looks up the split's file name can be moved into setup.
package text;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TextMapper extends Mapper<LongWritable, Text, Text, Text> {

    private String name = "";

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Called once per task: get the input split this task processes
        FileSplit fs = (FileSplit) context.getInputSplit();
        // Remember the name of the file the split comes from
        this.name = fs.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Emit (word, file name) for every word on this line
        String[] word = value.toString().split(" ");
        for (String s : word) {
            context.write(new Text(s), new Text(name));
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Called once after the task finishes; nothing to release here
        super.cleanup(context);
    }
}
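A further, optional tweak in the same spirit: the map method above still allocates two new Text objects for every word. These can be hoisted into reusable instance fields as well; a minimal sketch (the field names wordOut and fileOut are illustrative, not from the original code):

    private final Text wordOut = new Text();
    private final Text fileOut = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        // Set the file name once per task and reuse the same Text object
        fileOut.set(fs.getPath().getName());
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String s : value.toString().split(" ")) {
            wordOut.set(s);
            context.write(wordOut, fileOut);
        }
    }

Reusing Writable objects like this is safe because context.write serializes the key and value immediately.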
Reducer
package text;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;

public class TextReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String filenames = "[";
        // Deduplicate the file names this word appears in
        HashSet<String> path = new HashSet<>();
        for (Text t : values) {
            path.add(t.toString());
        }
        // Concatenate the names, then replace the trailing comma with "]"
        Iterator<String> iterator = path.iterator();
        while (iterator.hasNext()) {
            filenames = filenames + iterator.next() + ",";
        }
        String substring = filenames.substring(0, filenames.length() - 1) + "]";
        context.write(key, new Text(substring));
    }
}
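The manual concatenation plus trailing-comma trim can also be expressed with String.join (Java 8+); a minimal sketch of an equivalent reduce body that produces the same [file1,file2] format:

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Deduplicate the file names this word appears in
        HashSet<String> path = new HashSet<>();
        for (Text t : values) {
            path.add(t.toString());
        }
        // Join them as [a.txt,b.txt] without manual index arithmetic
        context.write(key, new Text("[" + String.join(",", path) + "]"));
    }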
Driver
package text;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TextDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job instance = Job.getInstance(new Configuration());
        instance.setJarByClass(TextDriver.class);

        instance.setMapperClass(TextMapper.class);
        instance.setReducerClass(TextReducer.class);

        // Both the map output and the final output are (Text, Text)
        instance.setOutputKeyClass(Text.class);
        instance.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(instance, new Path("hdfs://hadoop01:9000/txt/invert"));
        FileOutputFormat.setOutputPath(instance, new Path("hdfs://hadoop01:9000/result/aaa1"));

        instance.waitForCompletion(true);
    }
}
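If the HDFS paths should not be hard-coded, they can be taken from the command line instead; a minimal sketch, assuming args[0] and args[1] carry the input and output paths and that the process exit code should reflect job success:

        FileInputFormat.addInputPath(instance, new Path(args[0]));
        FileOutputFormat.setOutputPath(instance, new Path(args[1]));
        System.exit(instance.waitForCompletion(true) ? 0 : 1);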