倒排索引基础知识可以参考博客:http://blog.csdn.net/hguisu/article/details/7962350。
需求:
a.txt内容:
hello tom
hello jerry
hello kitty
hello world
hello tom
b.txt内容:
hello jerry
hello tom
hello world
需要我们在hdfs中生成一个文件:
hello a.txt-->5 b.txt-->3
tom a.txt-->2 b.txt-->1
...
分析1:
-----------------mapper-----------------------------------------
context.write("hello-->a.txt","1");
context.write("hello-->a.txt","1");
context.write("hello-->a.txt","1");
context.write("hello-->a.txt","1");
context.write("hello-->a.txt","1");
<"hello-->a.txt",{1,1,1,1,1}>
context.write("hello-->b.txt","1");
context.write("hello-->b.txt","1");
context.write("hello-->b.txt","1");
<"hello-->b.txt",{1,1,1}>
---------------reducer------------------------------------------
context.write("hello","a.txt->5");
context.write("hello","b.txt->3");
----------------mapper------------------------------------------
context.write("hello","a.txt->5");
context.write("hello","b.txt->3");
然后放到迭代器中:
<"hello",{"a.txt->5", "b.txt->3"}>
-------------------reducer--------------------------------------
context.write("hello","a.txt->5 b.txt->3");
-------------------------------------------------------------------
hello"a.txt->5 b.txt->3"
tom"a.txt->2 b.txt->1"
**************************************************************************
分析2(代码示例是按照分析2编写的):
-----------------------Mapper--------------------------------------------------
<0,"hello tom">
....
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->b.txt",1);
context.write("hello->b.txt",1);
context.write("hello->b.txt",1);
----------------------combiner----------------------------------
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->b.txt",1>
<"hello->b.txt",1>
<"hello->b.txt",1>
context.write("hello","a.txt->5");
context.write("hello","b.txt->3");
-------------------------Reducer-------------------------------
<"hello",{"a.txt->5","b.txt->3"}>
context.write("hello","a.txt->5 b.txt->3");
-------------------------------------------------------
hello "a.txt->5 b.txt->3"
tom "a.txt->2 b.txt->1"
kitty "a.txt->1"
.......
代码示例(按照分析2编写):
package com.heres.hadoop.mr.ii;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.heres.hadoop.mr.ii.InverseIndex.IndexMapper.IndexCombiner;
/**
 * Builds an inverted index over the input files: for every word, the output
 * lists each source file together with the number of occurrences of the word
 * in that file, e.g. {@code hello   a.txt->5    b.txt->3}.
 *
 * <p>Job layout (analysis 2 in the notes above): the mapper emits
 * {@code <word->path, "1">}; the combiner sums the counts per (word, file)
 * pair and re-keys to {@code <word, path->count>}; the reducer concatenates
 * all per-file counts of a word into a single tab-separated line.
 */
public class InverseIndex {

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     */
    public static void main(String[] args) throws Exception {
        // Guard before indexing args — the original threw
        // ArrayIndexOutOfBoundsException on a missing argument.
        if (args.length < 2) {
            System.err.println("Usage: InverseIndex <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverseIndex.class);

        job.setMapperClass(IndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setCombinerClass(IndexCombiner.class);

        job.setReducerClass(IndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result as the process exit code (0 = success);
        // the original discarded the boolean returned by waitForCompletion,
        // so a failed job still exited 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits {@code <word->path, "1">} for every word of every input line.
     * The source file path is recovered from the {@link FileSplit} so the
     * combiner can count occurrences per (word, file) pair.
     */
    public static class IndexMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused across map() calls to avoid a per-record allocation.
        private final Text k = new Text();
        private final Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String path = inputSplit.getPath().toString();
            // Split on runs of whitespace; the original split(" ") produced
            // empty tokens (and bogus "->path" keys) whenever the line
            // contained consecutive spaces or tabs.
            String[] words = value.toString().split("\\s+");
            for (String w : words) {
                if (w.isEmpty()) {
                    continue; // leading whitespace yields one empty token
                }
                k.set(w + "->" + path);
                v.set("1");
                context.write(k, v);
            }
        }

        /**
         * Sums the "1" values for each {@code word->path} key and re-keys the
         * pair as {@code <word, path->count>} so the reducer groups by word.
         *
         * <p>Kept nested inside {@link IndexMapper} because this file imports
         * it as {@code InverseIndex.IndexMapper.IndexCombiner}.
         */
        public static class IndexCombiner extends Reducer<Text, Text, Text, Text> {
            private final Text k = new Text();
            private final Text v = new Text();

            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                // Split at the LAST "->": the path is always the final
                // component appended by the mapper, so this stays correct
                // even when the word itself contains "->" (the original
                // split("->") silently picked the wrong fragments then).
                String composite = key.toString();
                int sep = composite.lastIndexOf("->");
                String word = composite.substring(0, sep);
                String path = composite.substring(sep + 2);
                int counter = 0;
                for (Text t : values) {
                    counter += Integer.parseInt(t.toString());
                }
                k.set(word);
                v.set(path + "->" + counter);
                context.write(k, v);
            }
        }
    }

    /**
     * Concatenates all {@code path->count} values of a word into one
     * tab-separated line, e.g. {@code hello  a.txt->5   b.txt->3}.
     */
    public static class IndexReducer extends Reducer<Text, Text, Text, Text> {
        private final Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // StringBuilder instead of String += in a loop (quadratic
            // copying), and the separator goes BETWEEN entries so the line
            // no longer ends with a stray trailing tab.
            StringBuilder line = new StringBuilder();
            for (Text t : values) {
                if (line.length() > 0) {
                    line.append('\t');
                }
                line.append(t.toString());
            }
            v.set(line.toString());
            context.write(key, v);
        }
    }
}