倒排索引是文档检索系统中最常用的数据结构,被广泛用于全文搜索引擎。它主要用来存储某个单词(或词组)在一个文档或一组文档中出现位置的映射,即提供了一种根据内容来查找文档的方式。
以前不使用 Hadoop 时,实现倒排索引真是费劲,尤其是处理大量文本时,更是让人头疼。自从有了 Hadoop,再也不用担心处理大数据的问题了。废话就说到这,下面实现一个简单的倒排索引:
结构:
主类:InvertedIndex
主类中的三个静态内部类: InvertIndexMapper, InvertedIndexCombiner, InvertedIndexReduce
package com.mr.index;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 倒排索引:根据内容来查找文档,就像搜索引擎一样,通过关键字查询想要的内容
* map过程 combiner过程 reduce过程
* 单词:文档x------- 词频 单词----------文档x:词频 单词-------------文档1:词频;文档2:词频。。。
* */
public class InvertedIndex {
public static class InvertIndexMapper extends Mapper<Object,Text,Text,Text>{
private Text keyInfo=new Text(); //存储单词和URI的组合
private Text valueInfo=new Text(); //存储词频
private FileSplit split; //存储Split对象
public void map(Object key,Text value,Context context)
throws IOException,InterruptedException{
//获得<key,value>对所属FileSplit对象
split=(FileSplit)context.getInputSplit();
StringTokenizer stk=new StringTokenizer(value.toString());
while(stk.hasMoreTokens()){
keyInfo.set(stk.nextToken()+":"+split.getPath().getName().toString());
valueInfo.set("1");
context.write(keyInfo, valueInfo);
System.out.println(keyInfo+"---------"+valueInfo);
}
}
}
public static class InvertedIndexCombiner extends Reducer<Text,Text,Text,Text>{
private Text info=new Text();
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
//统计词频
int sum=0;
for(Text value:values){
sum+=Integer.parseInt(value.toString());
}
int pos=key.toString().indexOf(":");
//重新设置value值由文件名和词频组成
info.set(key.toString().substring(pos+1)+":"+String.valueOf(sum));
key.set(key.toString().substring(0, pos));
context.write(key, info);
}
}
public static class InvertedIndexReduce extends Reducer<Text,Text,Text,Text>{
private Text result=new Text();
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
//用于存储最后哦偶的结果
StringBuffer buf=new StringBuffer();
for(Text value:values){
buf.append(value.toString()+";");
}
result.set(buf.toString());
context.write(key, result);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job=Job.getInstance();
job.setJarByClass(InvertedIndex.class);
job.setJobName("InvertedIndex");
if(args.length!=2){
System.out.println("Usage:invertedidex<in><out>");
System.exit(2);
}
job.setMapperClass(InvertIndexMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setCombinerClass(InvertedIndexCombiner.class);
job.setReducerClass(InvertedIndexReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true)? 0 :1);
}
}
运行时配置输入文件和输出文件的路径即可