数据:
文件a.txt
a a a b
b c d
文件b.txt
c c c d
d a b
文件c.txt
d a d c
c f b
需求:统计所有文件中每个单词出现的次数(每行按TAB切分单词),并分别计算该单词在各个文件中出现的次数
Map阶段
package day4_jobs_input.jobs.index1mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * Map side of the single-job inverted index: for every tab-separated word of
 * an input line, emits the pair (word, name of the file being read).
 *
 * @author Fantome
 * @date 2019/06/21
 */
public class IndexOneMap extends Mapper<LongWritable,Text, Text, Text> {
    // Output key/value objects, reused across map() calls instead of allocating per record.
    Text k=new Text();
    Text v=new Text();
    // Name of the file backing the current input split; filled in by setup().
    String fileName;

    /**
     * Records the name of the file this task's input split belongs to.
     * NOTE(review): the cast assumes the job's input format produces
     * {@link FileSplit} instances - confirm against the job configuration.
     *
     * @param context task context giving access to the input split
     * @throws IOException declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit split = (FileSplit)context.getInputSplit();
        fileName = split.getPath().getName();
    }

    /**
     * Splits the line on TAB and writes one (word, fileName) pair per word.
     * Empty tokens (blank lines, consecutive tabs) are skipped so that no
     * empty-string key ends up in the index.
     *
     * @param key input key of the record (not used here)
     * @param value one line of input text
     * @param context used to write the (word, fileName) pairs
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] splits = value.toString().split("\t");
        // The file name is the same for every word of the line, so set it once.
        v.set(fileName);
        for (String word:splits){
            if (word.isEmpty()){
                // "".split("\t") yields a single empty token; it is not a word.
                continue;
            }
            // k: word, v: fileName
            k.set(word);
            context.write(k,v);
        }
    }
}
Reduce阶段
package day4_jobs_input.jobs.index1mr;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Reduce side of the single-job inverted index: for one word, counts how many
 * times each file name was received and writes a single line of
 * {@code fileName->count} entries.
 *
 * @author Fantome
 * @date 2019/06/21
 */
public class IndexOneReduce extends Reducer<Text,Text,Text,Text> {
    // Output value object, reused across reduce() calls.
    Text v=new Text();

    /**
     * Aggregates the file names received for {@code key} into per-file counts.
     * The output value is the concatenation of {@code fileName->count}
     * followed by a TAB for every distinct file, in ascending file-name order.
     *
     * @param key the word
     * @param fileName the file names emitted for this word, one per occurrence
     * @param context used to write the (word, counts) pair
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<Text> fileName, Context context) throws IOException, InterruptedException {
        // Copy every received file name into a list as a String.
        List<String> lists=new ArrayList<>();
        for (Text x:fileName){
            lists.add(x.toString());
        }
        // Sort by natural String order so equal file names become adjacent.
        // (String::indexOf is not a valid comparator: for two different names
        // it returns -1 in both directions, so the list was never reliably
        // grouped and the counting below could split one file into several entries.)
        lists.sort(String::compareTo);

        StringBuilder put=new StringBuilder();
        int num=lists.size();
        int start=0;
        // Walk the sorted list run by run; each run of equal names is one file.
        while (start<num){
            String file=lists.get(start);
            int end=start+1;
            while (end<num && file.equals(lists.get(end))){
                end++;
            }
            // Emit in the fileName->count format; the trailing TAB is kept
            // to preserve the existing output layout.
            put.append(file).append("->").append(end-start).append("\t");
            start=end;
        }
        v.set(put.toString());
        context.write(key,v);
    }
}
结果
a c.txt->1 a.txt->3 b.txt->1
b a.txt->2 b.txt->1 c.txt->1
c b.txt->3 c.txt->2 a.txt->1
d a.txt->1 c.txt->2 b.txt->2
f c.txt->1