复杂倒排索引Java实现

最新推荐文章于 2024-06-22 14:46:59 发布

weixin_34038652

最新推荐文章于 2024-06-22 14:46:59 发布

阅读量231

点赞数

文章标签： java 大数据

原文链接：https://my.oschina.net/eager/blog/675675

版权

为什么80%的码农都做不了架构师？>>>

1、定义数据：在简单倒排索引示例数据的基础上稍作修改，以便验证结果的正确性。初始数据如图：

期望结果（PS：需要对outputValue按照文件名排序）

2、在学习之前首先要掌握Mapper与Reducer层输入输出类型关系，我们知道Mapper的输出类型和Reducer层输入类型要对应。

Combiner层相当于是Reducer层本地化，一般两个类中实现代码一致。例如：找出全国身高最高一位人士，可以先找出每一个省的身高最高的（Combiner层），然后再去帝都一起比较找出全国最高的（Reducer层）。

值得注意的是Combiner层的输入输出类型，或许你会以为是Combiner的输入类型和Mapper的输出类型一致，Combiner的输出和Reducer的输入一致即可。那么我觉得你完全可以试试，毕竟遇到坑并填上坑是我们学习路上不可少的。

实际上，Combiner的输入类型和输出类型都必须与Mapper的输出类型一致（也就是与Reducer的输入类型一致）。原因是Combiner的下一阶段是Reducer层，而且框架可能执行Combiner零次、一次或多次，所以无论Combiner是否运行，Reducer收到的数据类型都必须相同。

3、Hadoop自带一个默认的分区类HashPartitioner。Partitioner的作用是把Mapper（若定义了Combiner类则为Combiner）输出的Key-Value划分成若干分区，每一个Reduce Task处理其中一个分区。HashPartitioner根据key的hash值对Reduce Task的数量取模来决定分区，因此key相同的记录一定会被分发到同一个Reducer，不同的key则大致均匀地分布到各个Reduce Task上。

4、知识点都了解得差不多了，直接上代码！

/**
 * MapReduce job that builds an inverted index with per-document term counts.
 *
 * <p>For every word found in the input files the job writes one record whose key is the
 * word and whose value is the concatenation of {@code (fileName,count)} postings, ordered
 * by file name.
 *
 * <p>Data flow:
 * <ol>
 *   <li>The mapper emits {@code (word:fileName, 1)} for every token.</li>
 *   <li>The combiner (an optional optimisation) pre-sums the counts per {@code word:fileName}.</li>
 *   <li>{@link NewPartitioner} routes every key of the same word to the same reduce task.</li>
 *   <li>The reducer sums the counts and stitches consecutive keys of one word together.</li>
 * </ol>
 *
 * <p>No state is shared between tasks through static fields: map, combine and reduce tasks
 * normally run in different JVMs, and the framework gives no guarantee on how many times a
 * combiner is invoked, so the result must not depend on either.
 *
 * @author ZD
 */
public class ComplexIndex {
    /** Separates the word from the file name inside a composite map-output key. */
    private static final char KEY_SEPARATOR = ':';

    /**
     * Extracts the word part of a {@code word:fileName} key.
     *
     * <p>The last separator is used so that words which themselves contain {@code ':'} stay
     * intact. NOTE(review): this assumes input file names contain no {@code ':'} - confirm
     * for the file system in use.
     *
     * @param compositeKey key in {@code word:fileName} form
     * @return the word, or the whole key when it holds no separator
     */
    private static String wordOf(String compositeKey) {
        int sep = compositeKey.lastIndexOf(KEY_SEPARATOR);
        return sep < 0 ? compositeKey : compositeKey.substring(0, sep);
    }

    /**
     * Extracts the file-name part of a {@code word:fileName} key.
     *
     * @param compositeKey key in {@code word:fileName} form
     * @return the file name, or an empty string when the key holds no separator
     */
    private static String fileOf(String compositeKey) {
        int sep = compositeKey.lastIndexOf(KEY_SEPARATOR);
        return sep < 0 ? "" : compositeKey.substring(sep + 1);
    }

    /** Emits {@code (word:fileName, 1)} for every whitespace-separated token of a line. */
    private static class ComplexIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        /** Reused for every record to avoid allocating one Text per token. */
        private final Text outKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // The input split must be cast to FileSplit to reach the file name.
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            for (String word : value.toString().trim().split("\\s+")) {
                if (word.isEmpty()) {
                    // A blank line yields a single empty token; it is not a word.
                    continue;
                }
                outKey.set(word + KEY_SEPARATOR + fileName);
                context.write(outKey, ONE);
            }
        }
    }

    /**
     * Pre-aggregates the counts of one {@code word:fileName} key on the map side.
     *
     * <p>The values are summed (not merely counted) so that the result stays correct when
     * the combiner is applied more than once to the same key.
     */
    private static class ComplexIndexCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable total = new IntWritable();

        @Override
        protected void reduce(Text value, Iterable<IntWritable> datas, Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable data : datas) {
                sum += data.get();
            }
            total.set(sum);
            context.write(value, total);
        }
    }

    /**
     * Sums the counts of each {@code word:fileName} key and merges all keys of one word into
     * a single output record.
     *
     * <p>Keys reach a reduce task in sorted order and all keys sharing the prefix
     * {@code word:} are therefore adjacent, already ordered by file name. The reducer only
     * has to remember the word it is currently collecting and flush it when the word changes.
     */
    private static class ComplexIndexReducer extends Reducer<Text, IntWritable, Text, Text> {
        /** Word whose postings are being collected; {@code null} when nothing is pending. */
        private String currentWord;
        /** Postings collected so far for {@link #currentWord}. */
        private final StringBuilder postings = new StringBuilder();

        @Override
        protected void reduce(Text value, Iterable<IntWritable> datas, Reducer<Text, IntWritable, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String compositeKey = value.toString();
            String word = wordOf(compositeKey);
            if (currentWord != null && !currentWord.equals(word)) {
                flush(context);
            }
            currentWord = word;

            // Sum here as well: the combiner may not have run for some or all map outputs.
            int sum = 0;
            for (IntWritable data : datas) {
                sum += data.get();
            }
            postings.append('(').append(fileOf(compositeKey)).append(',').append(sum).append(')');
        }

        @Override
        protected void cleanup(Reducer<Text, IntWritable, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // The last word has no following key to trigger its flush.
            flush(context);
        }

        /** Writes the pending word with its postings, if any, and resets the buffer. */
        private void flush(Reducer<Text, IntWritable, Text, Text>.Context context)
                throws IOException, InterruptedException {
            if (currentWord == null) {
                return;
            }
            context.write(new Text(currentWord), new Text(postings.toString()));
            postings.setLength(0);
            currentWord = null;
        }
    }

    /**
     * Partitions {@code word:fileName} keys by their word part only, so every key of one
     * word is handled by the same reduce task. The key passed on to the reducer is unchanged.
     */
    public static class NewPartitioner extends HashPartitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numReduceTasks) {
            return super.getPartition(new Text(wordOf(key.toString())), value, numReduceTasks);
        }
    }

    /**
     * Configures and runs the job, reading from {@code /input/index} and writing to
     * {@code /complexIndex/}. The process exits with status 0 on success and 1 when the job
     * fails or cannot be set up.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int exitCode = 1;
        try {
            Configuration cfg = HadoopCfg.getConfigration();
            Job job = Job.getInstance(cfg);
            job.setJobName("ComplexIndex");
            job.setJarByClass(ComplexIndex.class);

            job.setMapperClass(ComplexIndexMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setCombinerClass(ComplexIndexCombiner.class);
            job.setPartitionerClass(NewPartitioner.class);
            job.setReducerClass(ComplexIndexReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            FileInputFormat.addInputPath(job, new Path("/input/index"));
            FileOutputFormat.setOutputPath(job, new Path("/complexIndex/"));
            exitCode = job.waitForCompletion(true) ? 0 : 1;
        } catch (Exception e) {
            // Report the failure and fall through to a non-zero exit status.
            e.printStackTrace();
        }
        System.exit(exitCode);
    }
}