Inverted index: instead of starting from a document and listing the words it contains, we go the other way around and map each word to the documents that contain it, which is why the structure is called an inverted index.
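As a quick illustration (the file names here are only for the example): given two documents a.txt = "Hello Hadoop" and b.txt = "Hello MapReduce", a forward index maps a.txt -> {Hello, Hadoop}, while the inverted index maps Hello -> {a.txt, b.txt}, Hadoop -> {a.txt}, MapReduce -> {b.txt}.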
--------------------------------
Map
Input: one line of text from a document
Output:
key: word + document URI
value: term frequency (a literal "1" for each occurrence)
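For example, for the test file 1.txt below, whose content is "Hello MapReduce Hello Hadoop" (writing the full file URI as file:.../1.txt for brevity), the map stage emits:
(Hello:file:.../1.txt, 1)
(MapReduce:file:.../1.txt, 1)
(Hello:file:.../1.txt, 1)
(Hadoop:file:.../1.txt, 1)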
-------------------------------
Combiner
Input:
key: word + document URI
value: term frequency
Output:
key: word
value: document URI + ":" + term frequency within that document (the map value-list for that key summed up)
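Continuing the example for 1.txt, the combiner receives (Hello:file:.../1.txt, [1, 1]), (MapReduce:file:.../1.txt, [1]) and (Hadoop:file:.../1.txt, [1]), and emits (Hello, file:.../1.txt:2), (MapReduce, file:.../1.txt:1) and (Hadoop, file:.../1.txt:1).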
---------------------------------------
Reduce
Input: the combiner output (key: word, value-list: the "document URI:term frequency" entries)
Output:
key: word
value: the list of "document URI:term frequency" entries, joined with ";"
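For the three test files below this yields, for instance, Hello -> file:.../1.txt:2;file:.../2.txt:4;file:.../3.txt:6; (the order of the entries depends on the order in which the values reach the reducer); the actual run is shown under part-r-00000 at the end.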
----------------------------------------------------------
Source code
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class InvertedIndex {

    public static class InvertedIndexMap extends Mapper<Object, Text, Text, Text> {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the FileSplit that this <key, value> pair belongs to
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreTokens()) {
                // The key is made up of (word:document URI)
                keyInfo.set(stk.nextToken() + ":" + split.getPath().toString());
                // Term frequency: one record per occurrence
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }
    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
        private Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Sum the occurrences of this word within a single document
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Reset the value to (document URI:term frequency)
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // Reset the key to the word alone
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }
    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build the document list: each value is already "document URI:term frequency"
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndexMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
        FileOutputFormat.setOutputPath(job, new Path("./out/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
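Once the class is packaged into a jar (say invertedindex.jar, the name is only illustrative), the job can be launched with the standard hadoop jar command, e.g. hadoop jar invertedindex.jar InvertedIndex. Note that the input directory ./in/invertedindex/ must exist and the output directory ./out/ must not, since FileOutputFormat refuses to write into an existing output directory.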
Test files:
1.txt:
Hello MapReduce Hello Hadoop
2.txt
Hello MapReduce Hello Hadoop
Hello MapReduce Hello Hadoop
3.txt
Hello MapReduce Hello Hadoop
Hello MapReduce Hello Hadoop
Hello MapReduce Hello Hadoop
Output file part-r-00000:
Hadoop file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/1.txt:1;file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/2.txt:2;file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/3.txt:3;
Hello file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/3.txt:6;file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/1.txt:2;file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/2.txt:4;
MapReduce file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/2.txt:2;file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/1.txt:1;file:/E:/KAI_FA/eclipse/workshop/InvertedIndex/in/invertedindex/3.txt:3;