Hadoop Study Notes 6: Inverted Index (InverseIndex)

Introduction

An inverted index extracts the words from a set of documents and sorts them so that, given a word, you can quickly look up which documents contain it. It can be implemented with the map-reduce model as follows.

The input files and their contents:

doc1.txt:MapReduce is simple

doc2.txt:MapReduce is powerful is simple

doc3.txt:Hello MapReduce bye MapReduce

The expected output then looks like this:

MapReduce:doc1.txt:1;doc2.txt:1;doc3.txt:2;

is:doc1.txt:1;doc2.txt:2;

simple:doc1.txt:1;doc2.txt:1;

powerful:doc2.txt:1;

Hello:doc3.txt:1;

bye:doc3.txt:1;

Before the first colon is the word; after it, each doc:count pair records how many times the word appears in that document, with semicolons separating the documents. For example, MapReduce:doc1.txt:1;doc2.txt:1;doc3.txt:2; means that MapReduce appears once in doc1.txt, once in doc2.txt, and twice in doc3.txt.

With the idea clear, let's look at how to implement it with MapReduce.

Taking the raw files as input, the Map phase turns them into pairs of the following form:

<MapReduce:doc1.txt, 1>

<is:doc1.txt, 1>

<simple:doc1.txt, 1>

<MapReduce:doc2.txt, 1>

<is:doc2.txt, 1>

<powerful:doc2.txt, 1>

<is:doc2.txt, 1>

<simple:doc2.txt, 1>

<Hello:doc3.txt, 1>

<MapReduce:doc3.txt, 1>

<bye:doc3.txt, 1>

<MapReduce:doc3.txt, 1>

After the Combiner they become:

<MapReduce:doc1.txt, 1>

<is:doc1.txt, 1>

<simple:doc1.txt, 1>

<MapReduce:doc2.txt, 1>

<is:doc2.txt, 2>

<powerful:doc2.txt, 1>

<simple:doc2.txt, 1>

<Hello:doc3.txt, 1>

<MapReduce:doc3.txt, 2>

<bye:doc3.txt, 1>

After the Reduce phase the result is:

<MapReduce, doc1.txt:1;doc2.txt:1;doc3.txt:2;>

<is, doc1.txt:1;doc2.txt:2;>

<simple, doc1.txt:1;doc2.txt:1;>

<Hello, doc3.txt:1;>

<powerful, doc2.txt:1;>

<bye, doc3.txt:1;>

 

It is worth pausing over why this works. Each map task processes a single input file, so when the combiner runs it sees every <word:doc, 1> pair for that file; it can therefore total the counts per document and rewrite the key from word:doc to the bare word, which is exactly the grouping the reducer needs.

 

Source code

The Mapper class:

 

package cn.kepu.littlefu;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

@SuppressWarnings("deprecation")
public class InverseIndexMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

    // name of the file this map task is reading, e.g. "doc1.txt"
    private String inputFile;

    public void configure(JobConf job) {
        String inputFileFull = job.get("map.input.file");
        inputFile = inputFileFull.substring(inputFileFull.lastIndexOf("/") + 1);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {

        String line = value.toString();
        String[] words = line.split(" ");

        for (String s : words) {
            // output <word:doc1.txt, 1>
            output.collect(new Text(s + ":" + inputFile), new Text("1"));
        }
    }
}
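
As an aside, not from the original post: with the old mapred API the file name can also be recovered from the input split inside map(), which removes the need for configure() and the map.input.file property. Below is a minimal sketch under the assumption that the job uses a file-based input format such as TextInputFormat (whose splits are FileSplits); the class name InverseIndexMapper2 is made up for illustration.

package cn.kepu.littlefu;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical variant of InverseIndexMapper: reads the file name from the
// input split instead of the "map.input.file" job property.
public class InverseIndexMapper2 extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // The cast assumes a file-based input format such as TextInputFormat.
        FileSplit split = (FileSplit) reporter.getInputSplit();
        String inputFile = split.getPath().getName();

        for (String s : value.toString().split(" ")) {
            // output <word:doc1.txt, 1>, same as the original mapper
            output.collect(new Text(s + ":" + inputFile), new Text("1"));
        }
    }
}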


 

The Combiner class:

 

package cn.kepu.littlefu;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class InverseIndexCombiner extends MapReduceBase implements
        Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterator<Text> values,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // total occurrences of this word in this document
        int sum = 0;
        while (values.hasNext()) {
            sum += Integer.parseInt(values.next().toString());
        }
        // position of the separator in "word:doc1.txt"
        int pos = key.toString().indexOf(":");
        // output <word, doc1.txt:1>
        Text outKey = new Text(key.toString().substring(0, pos));
        Text outValue = new Text(key.toString().substring(pos + 1) + ":" + sum);
        System.out.print("combiner:<key:" + outKey.toString() + ",value:" + outValue.toString() + ">");
        output.collect(outKey, outValue);
    }
}
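
A caveat worth flagging here, which is my observation rather than something from the book: Hadoop treats a combiner as an optional optimization that may run zero or more times, yet this job depends on it running, because the combiner is what rewrites the key from word:doc to the bare word. If the framework skipped it, the reducer would receive word:doc keys and produce the wrong output; if it ran a second time over already-combined records, indexOf(":") would no longer find the document part where it expects it. The pattern works on small inputs like this example, but a more defensive design would keep the map key format unchanged through the combiner and do the key rewrite in the reducer.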
 


 

The Reducer class:

 

package cn.kepu.littlefu;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class InverseIndexReducer extends MapReduceBase implements
        Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterator<Text> values,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {

        // concatenate every "doc:count" entry for this word
        StringBuilder fileList = new StringBuilder();
        while (values.hasNext()) {
            fileList.append(values.next().toString()).append(";");
        }
        // output <word, doc1:1;doc2:2;doc3:1;>
        output.collect(key, new Text(fileList.toString()));
    }
}
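
One small point, not made in the original post: Hadoop gives no guarantee about the order in which values arrive at reduce(), so the doc:count entries may be concatenated in any order. That is acceptable for this exercise; a real inverted index would normally sort its posting lists, for example via a secondary sort.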
 


 

The Main class:

package cn.kepu.littlefu;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class InverseIndexLuncher {

    public static void main(String[] args) throws IOException {

        if (args.length != 2) {
            System.err.println("Usage: InverseIndex <input path> <output path>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(InverseIndexLuncher.class);
        conf.setJobName("inverseindex");

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setMapperClass(InverseIndexMapper.class);
        conf.setCombinerClass(InverseIndexCombiner.class);
        conf.setReducerClass(InverseIndexReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        JobClient.runJob(conf);
    }
}
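
To actually run the job, a rough sketch (the jar name and paths below are placeholders, not from the original post): compile the four classes against the Hadoop jars, package them into something like inverseindex.jar, upload doc1.txt through doc3.txt into an HDFS input directory, and submit with hadoop jar inverseindex.jar cn.kepu.littlefu.InverseIndexLuncher <input path> <output path>. With the old API the result appears in files named part-00000 and so on under the output directory, which must not exist before the job starts.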
 


 

 

Run screenshots

(The screenshots from the original post are not reproduced here.)
Reference: 《实战Hadoop–开启通向云计算的捷径》, pp. 74-83

Reposted from http://blog.csdn.net/fufengrui/article/details/8169583?locationNum=14
