倒排索引是文档检索系统中最常用的数据结构,被广泛用于全文搜索引擎。它主要用来存储某个单词(或词组)在一个文档或一组文档中出现位置的映射,即提供了一种根据内容来查找文档的方式。
以前不使用 Hadoop 时,实现倒排索引真是费劲,尤其是处理大量文本时,更是让人头疼。自从有了 Hadoop,再也不用担心处理大数据的问题了。废话就说到这,下面实现一个简单的倒排索引:
结构:
主类:InvertedIndex
主类中的三个静态内部类: InvertIndexMapper, InvertedIndexCombiner, InvertedIndexReduce
package com.mr.index;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 倒排索引:根据内容来查找文档,就像搜索引擎一样,通过关键字查询想要的内容
* map过程 combiner过程 reduce过程
* 单词:文档x------- 词频 单词----------文档x:词频 单词-------------文档1:词频;文档2:词频。。。
* */
public class InvertedIndex {
public static class InvertIndexMapper extends Mapper<Object,Text,Text,Text>{
private Text keyInfo=new Text(); //存储单词和URI的组合
private Text valueInfo=new Text(); //存储词频
private FileSplit split; //存储Split对象
public void map(Object key,Text value,Context context)
throws IOException,InterruptedException{
//获得<key,value>对所属FileSplit对象
split=(FileSplit)context.getInputSplit();
StringTokenizer stk=new StringTokenizer(value.toString());
while(stk.hasMoreTokens()){
keyInfo.set(stk.nextToken()+":"+split.getPath().getName().toString());
valueInfo.set("1");
context.write(keyInfo, valueInfo);
System.out.println(keyInfo+"---------"+valueInfo);
}
}
}
public static class InvertedIndexCombiner extends Reducer<Text,Text,Text,Text>{
private Text info=new Text();
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
//统计词频
int sum=0;
for(Text value:values){
sum+=Integer.parseInt(value.toString());
}
int pos=key.toString().indexOf(":");
//重新设置value值由文件名和词频组成
info.set(key.toString().substring(pos+1)+":"+String.valueOf(sum));
key.set(key.toString().substring(0, pos));
context.write(key, info);
}
}
public static class InvertedIndexReduce extends Reducer<Text,Text,Text,Text>{
private Text result=new Text();
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
//用于存储最后哦偶的结果
StringBuffer buf=new StringBuffer();
for(Text value:values){
buf.append(value.toString()+";");
}
result.set(buf.toString());
context.write(key, result);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job=Job.getInstance();
job.setJarByClass(InvertedIndex.class);
job.setJobName("InvertedIndex");
if(args.length!=2){
System.out.println("Usage:invertedidex<in><out>");
System.exit(2);
}
job.setMapperClass(InvertIndexMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setCombinerClass(InvertedIndexCombiner.class);
job.setReducerClass(InvertedIndexReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true)? 0 :1);
}
}
运行时配置输入文件和输出文件的路径即可