介绍
倒排索引是将文章中的单词挑出来,排序,便于检索。利用map-reduce思想来实现,如下:
原始文本及内容:
doc1.txt:MapReduce is simple
doc2.txt:MapReduce is powerful is simple
doc3.txt:Hello MapReduce bye MapReduce
那么输出结果应该是这样子的:
MapReduce:doc1.txt:1;doc2.txt:1;doc3.txt:2;
is:doc1.txt:1;doc2.txt:2;
simple:doc1.txt:1;doc2.txt:1;
powerful:doc2.txt:1;
Hello:doc3.txt:1;
bye:doc3.txt:1;
其中冒号之前表示文档,之后表示在这个文档中出现的次数,分号分隔各个文档。例如:MapReduce:doc1.txt:1;doc2.txt:1;doc3.txt:2; 表示MapReduce在doc1.txt中出现一次,在doc2.txt中出现一次,在doc3.txt中出现两次。
明白了原理之后,看如何用MapReduce来实现。
原始文件作为输入,经过Map之后变成以下格式:
<MapReduce:doc1.txt, 1>
<is:doc1.txt, 1>
<simple:doc1.txt, 1>
<MapReduce:doc2.txt, 1>
<is:doc2.txt, 1>
<powerful:doc2.txt, 1>
<is:doc2.txt, 1>
<simple:doc2.txt, 1>
<Hello:doc3.txt, 1>
<MapReduce:doc3.txt, 1>
<bye:doc3.txt, 1>
<MapReduce:doc3.txt, 1>
经过Combiner之后变成以下格式:
<MapReduce:doc1.txt, 1>
<is:doc1.txt, 1>
<simple:doc1.txt, 1>
<MapReduce:doc2.txt, 1>
<is:doc2.txt, 2>
<powerful:doc2.txt, 1>
<simple:doc2.txt, 1>
<Hello:doc3.txt, 1>
<MapReduce:doc3.txt, 2>
<bye:doc3.txt, 1>
经过reduce之后变成以下内容:
<MapReduce, doc1.txt:1;doc2.txt:1;doc3.txt:2;>
<is, doc1.txt:1;doc2.txt:2;>
<simple, doc1.txt:1;doc2.txt:1;>
<powerful, doc2.txt:1;>
<Hello, doc3.txt:1;>
<bye, doc3.txt:1;>
可以考虑考虑为什么这么做。
源代码
Mapper类:
- package cn.kepu.littlefu;
- import java.io.IOException;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapred.JobConf;
- import org.apache.hadoop.mapred.MapReduceBase;
- import org.apache.hadoop.mapred.Mapper;
- import org.apache.hadoop.mapred.OutputCollector;
- import org.apache.hadoop.mapred.Reporter;
- @SuppressWarnings("deprecation")
- public class InverseIndexMapper extends MapReduceBaseimplements Mapper<Object, Text, Text, Text> {
- StringinputFile;
- public voidconfigure(JobConf job) {
- StringinputFileFull = job.get("map.input.file");
- inputFile =inputFileFull.substring(inputFileFull.lastIndexOf("/")+1);
- }
- @Override
- public void map(Object key, Text value,OutputCollector<Text, Text> output, Reporter reporter) throwsIOException{
- Stringline = value.toString();
- String[]word = line.split(" ");
- for(Strings : word){
- //output<word:doc1, 1>
- output.collect(newText(s+":"+inputFile), new Text("1"));
- }
- }
- }
package cn.kepu.littlefu;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
@SuppressWarnings("deprecation")
public class InverseIndexMapper extends MapReduceBaseimplements Mapper<Object, Text, Text, Text> {
StringinputFile;
public voidconfigure(JobConf job) {
StringinputFileFull = job.get("map.input.file");
inputFile =inputFileFull.substring(inputFileFull.lastIndexOf("/")+1);
}
@Override
public void map(Object key, Text value,OutputCollector<Text, Text> output, Reporter reporter) throwsIOException{
Stringline = value.toString();
String[]word = line.split(" ");
for(Strings : word){
//output<word:doc1, 1>
output.collect(newText(s+":"+inputFile), new Text("1"));
}
}
}
Combiner类:
- package cn.kepu.littlefu;
- import java.io.IOException;
- import java.util.Iterator;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapred.MapReduceBase;
- import org.apache.hadoop.mapred.OutputCollector;
- import org.apache.hadoop.mapred.Reducer;
- import org.apache.hadoop.mapred.Reporter;
- public class InverseIndexCombiner extendsMapReduceBase implements
- Reducer<Text, Text, Text, Text>{
- @Override
- public void reduce(Text key, Iterator<Text>values,
- OutputCollector<Text,Text> output, Reporter reporter)
- throwsIOException {
- //total
- int sum =0;
- while(values.hasNext()){
- sum+= Integer.parseInt(values.next().toString());
- }
- //outputposition
- int pos =key.toString().indexOf(":");
- //output<word,doc1:1>
- TextoutKey = new Text(key.toString().subSequence(0, pos).toString());
- TextoutValue = new Text(key.toString().substring(pos+1).toString()+":"+sum);
- System.out.print("combiner:<key:"+outKey.toString()+",value:"+outValue.toString()+">");
- output.collect(outKey, outValue);
- }
- }
package cn.kepu.littlefu;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class InverseIndexCombiner extendsMapReduceBase implements
Reducer<Text, Text, Text, Text>{
@Override
public void reduce(Text key, Iterator<Text>values,
OutputCollector<Text,Text> output, Reporter reporter)
throwsIOException {
//total
int sum =0;
while(values.hasNext()){
sum+= Integer.parseInt(values.next().toString());
}
//outputposition
int pos =key.toString().indexOf(":");
//output<word,doc1:1>
TextoutKey = new Text(key.toString().subSequence(0, pos).toString());
TextoutValue = new Text(key.toString().substring(pos+1).toString()+":"+sum);
System.out.print("combiner:<key:"+outKey.toString()+",value:"+outValue.toString()+">");
output.collect(outKey, outValue);
}
}
Reduce类:
- package cn.kepu.littlefu;
- import java.io.IOException;
- import java.util.Iterator;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapred.MapReduceBase;
- import org.apache.hadoop.mapred.OutputCollector;
- import org.apache.hadoop.mapred.Reducer;
- import org.apache.hadoop.mapred.Reporter;
- public class InverseIndexReducer extends MapReduceBaseimplements
- Reducer<Text,Text, Text, Text> {
- @Override
- public voidreduce(Text key, Iterator<Text> values,
- OutputCollector<Text,Text> output, Reporter reporter)
- throwsIOException {
- StringfileList = new String();
- while(values.hasNext()){
- fileList+= values.next().toString()+";";
- }
- //output<word,doc1:1;doc2:2;doc3:1;>
- output.collect(key,new Text(fileList));
- }
- }
package cn.kepu.littlefu;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class InverseIndexReducer extends MapReduceBaseimplements
Reducer<Text,Text, Text, Text> {
@Override
public voidreduce(Text key, Iterator<Text> values,
OutputCollector<Text,Text> output, Reporter reporter)
throwsIOException {
StringfileList = new String();
while(values.hasNext()){
fileList+= values.next().toString()+";";
}
//output<word,doc1:1;doc2:2;doc3:1;>
output.collect(key,new Text(fileList));
}
}
Main类:
- package cn.kepu.littlefu;
- import java.io.IOException;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapred.FileInputFormat;
- import org.apache.hadoop.mapred.FileOutputFormat;
- import org.apache.hadoop.mapred.JobClient;
- import org.apache.hadoop.mapred.JobConf;
- public class InverseIndexLuncher {
- public staticvoid main(String[] args) throws IOException{
- if(args.length != 2){
- System.err.println("Usage :InverseIndex <input path> <output path>");
- System.exit(-1);
- }
- JobConfconf = new JobConf(InverseIndexLuncher.class);
- conf.setJobName("inverseindex");
- FileInputFormat.addInputPath(conf,new Path(args[0]));
- FileOutputFormat.setOutputPath(conf,new Path(args[1]));
- conf.setMapperClass(InverseIndexMapper.class);
- conf.setCombinerClass(InverseIndexCombiner.class);
- conf.setReducerClass(InverseIndexReducer.class);
- conf.setMapOutputKeyClass(Text.class);
- conf.setMapOutputValueClass(Text.class);
- conf.setOutputKeyClass(Text.class);
- conf.setOutputValueClass(Text.class);
- JobClient.runJob(conf);
- }
- }
package cn.kepu.littlefu;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
public class InverseIndexLuncher {

    /**
     * Configures and launches the inverted-index job.
     *
     * <p>Usage: {@code InverseIndex <input path> <output path>}
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @throws IOException if job submission fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.err.println("Usage :InverseIndex <input path> <output path>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(InverseIndexLuncher.class);
        conf.setJobName("inverseindex");

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setMapperClass(InverseIndexMapper.class);
        // NOTE(review): the combiner rewrites keys, so this job relies on the
        // combiner running exactly once per key; Hadoop does not guarantee that.
        conf.setCombinerClass(InverseIndexCombiner.class);
        conf.setReducerClass(InverseIndexReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        // Submit the job and block until it completes.
        JobClient.runJob(conf);
    }
}
运行截图
参考:《实战Hadoop–开启通向云计算的捷径》P74-P83
转自http://blog.csdn.net/fufengrui/article/details/8169583?locationNum=14