An inverted index is one of the most commonly used data structures in document-retrieval systems. It is applied in search engines as a way to look up documents by their content; because this reverses the usual document-to-content mapping, it is called an inverted index. The figure below illustrates the idea clearly:
Analysis and design
1. Map phase: the map function first parses each input <key, value> pair and extracts the information the index needs: the word, the document URI, and the term frequency. The output key is (word:URI); the output value is the number of occurrences of that word.
2. Combine phase: after the map step, the combiner sums the values that share the same key, yielding the word's frequency within a single document.
3. Reduce phase: after the two steps above, the reducer only needs to assemble the values sharing the same key into the inverted-index file format; everything else is handled by the MapReduce framework itself. Below is a diagram (found on Baidu) that illustrates this:
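The three phases above can be sketched as a plain-Java simulation of the record flow, with no Hadoop dependency. This is only an illustrative sketch (the class and method names here are my own, not part of the job code below), and it uses bare file names in place of full HDFS URIs:

```java
import java.util.*;

public class InvertedIndexFlow {
    // Map phase: emit one ("word:URI", "1") pair per token
    static List<String[]> map(String uri, String line) {
        List<String[]> out = new ArrayList<>();
        for (String tok : line.split("\\s+"))
            out.add(new String[]{tok + ":" + uri, "1"});
        return out;
    }

    // Combine phase: sum counts per "word:URI", then re-key each
    // record to the word alone with value "URI:count"
    static List<String[]> combine(List<String[]> mapped) {
        Map<String, Integer> sums = new TreeMap<>();
        for (String[] kv : mapped)
            sums.merge(kv[0], Integer.parseInt(kv[1]), Integer::sum);
        List<String[]> out = new ArrayList<>();
        for (Map.Entry<String, Integer> e : sums.entrySet()) {
            int i = e.getKey().indexOf(':');
            out.add(new String[]{e.getKey().substring(0, i),
                                 e.getKey().substring(i + 1) + ":" + e.getValue()});
        }
        return out;
    }

    // Reduce phase: concatenate each word's postings into one list
    static Map<String, String> reduce(List<String[]> combined) {
        Map<String, String> out = new TreeMap<>();
        for (String[] kv : combined)
            out.merge(kv[0], kv[1] + ";", String::concat);
        return out;
    }

    public static void main(String[] args) {
        List<String[]> mapped = new ArrayList<>();
        mapped.addAll(map("1.txt", "hello hadoop hadoop"));
        mapped.addAll(map("2.txt", "hadoop is simple"));
        Map<String, String> index = reduce(combine(mapped));
        System.out.println(index.get("hadoop")); // 1.txt:2;2.txt:1;
    }
}
```

Note that the combiner can fold the per-document counting into the map side precisely because summing is associative; the reducer then only concatenates strings.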
Implementation: I won't cover setting up the environment, creating the project, or importing the jars here; you can refer to this page: http://hi.baidu.com/itapadwyxebcfud/item/f93409341f683967033edc00
package org.apache.hadoop.daopai;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex {

    public static class InvertedIndexMap extends Mapper<Object, Text, Text, Text> {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the FileSplit this <key, value> pair belongs to
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreElements()) {
                // The key is (word:URI)
                keyInfo.set(stk.nextToken() + ":" + split.getPath().toString());
                // The value is the term count: "1" per occurrence
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
        private Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Sum the occurrences of the same (word:URI) key
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Reset the value to (URI:frequency)
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // Reset the key to the word alone
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }

    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build the document list for this word
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndexMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
        FileOutputFormat.setOutputPath(job, new Path("./out/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
After importing the project into Eclipse, as shown in the figure: click "Create new directory" to create a directory as needed, then inside that directory click "Upload files to DFS" to add the sample documents, and click "Refresh"; the directory and files you want will appear. Before running, create a few documents locally (containing some words or letters, anything will do).
These are the input documents; I created mine on the desktop. The out directory and the files under it appear after the job runs and hold the result.
This is the job's output. I have marked parts of it: the green text is a word from the documents, the red text is the number of times that word appears in a given document, i.e. its term frequency, and the black text is my path.
MapReduce hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:4;hdfs://masters:9000/user/hadoop/in/invertedindex/2.txt:1;
and hdfs://masters:9000/user/hadoop/in/invertedindex/2.txt:1;hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:1;
apache hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:2;
bye hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:1;
conf hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:1;
hadoop hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:2;hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:4;hdfs://masters:9000/user/hadoop/in/invertedindex/2.txt:1;
hbase hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:1;
hdfs hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:2;
hello hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:1;
import hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:4;
io.IOException; hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:1;
is hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:1;hdfs://masters:9000/user/hadoop/in/invertedindex/2.txt:1;
java hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:2;
org hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:2;
powerful hdfs://masters:9000/user/hadoop/in/invertedindex/2.txt:1;hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:1;
simple hdfs://masters:9000/user/hadoop/in/invertedindex/1.txt:1;hdfs://masters:9000/user/hadoop/in/invertedindex/2.txt:1;
util.StringTokenizer; hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:1;
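One detail worth noting about this output format: the URIs themselves contain colons (hdfs://...), so when reading a posting back, the count is everything after the last colon, not the first. A small hypothetical helper (my own sketch, not part of the job above) that parses one output line could look like this:

```java
import java.util.*;

public class PostingParser {
    // Parse one output line of the form "word uri1:count1;uri2:count2;"
    // The URI contains ':' (hdfs://...), so each posting's count is
    // everything after its LAST colon.
    static Map<String, Integer> parse(String line) {
        String[] parts = line.split("\\s+", 2);
        Map<String, Integer> postings = new LinkedHashMap<>();
        for (String p : parts[1].split(";")) {
            if (p.isEmpty()) continue;
            int i = p.lastIndexOf(':');
            postings.put(p.substring(0, i), Integer.parseInt(p.substring(i + 1)));
        }
        return postings;
    }

    public static void main(String[] args) {
        String line = "apache hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt:2;";
        System.out.println(parse(line));
        // {hdfs://masters:9000/user/hadoop/in/invertedindex/4.txt=2}
    }
}
```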
The problem I ran into here was, once again, a path problem. I initially put the input documents under /user/root/test1/, and the job failed with an error suggesting the path hdfs://masters:9000/user/hadoop/in/invertedindex/ instead. After I corrected the path as the message suggested, the job ran successfully. If you have questions after reading, leave a comment and we can discuss and learn together.