一、MapReduce/Hadoop的TopN解决方案之键唯一的情况(点击打开链接)
二、针对键不唯一的情况,即文件中可能出现多次关键字
解决办法:先将不唯一键转换为唯一键,即使用MapReduce合并键相同的项,再使用(一)所述的唯一键TopN方案
即
package topN_hadoop1;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that parses "url,frequency" CSV lines and emits (url, frequency)
 * pairs so the reducer can sum frequencies per URL (collapsing duplicate keys
 * before the TopN pass).
 *
 * Malformed records — wrong field count or a non-numeric frequency — are
 * silently skipped rather than failing the task.
 */
public class AggregateByKeyMapper extends
        Mapper<Object, Text, Text, IntWritable> {

    // Reused writables to avoid per-record allocation (standard Hadoop idiom).
    private Text K2 = new Text();
    private IntWritable V2 = new IntWritable();

    /**
     * @param key   input offset (unused)
     * @param value one input line, expected as "url,frequency"
     */
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String valueAsString = value.toString().trim();
        String[] tokens = valueAsString.split(",");
        if (tokens.length != 2) {
            return; // skip records that do not have exactly two fields
        }
        String url = tokens[0];
        int frequency;
        try {
            frequency = Integer.parseInt(tokens[1].trim());
        } catch (NumberFormatException ignored) {
            // Non-numeric frequency: skip the record instead of killing the task.
            return;
        }
        K2.set(url);
        V2.set(frequency);
        context.write(K2, V2);
    }
}
package topN_hadoop1;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer (also used as combiner) that totals the frequencies observed for a
 * single URL key and emits one (url, totalFrequency) pair.
 */
public class AggregateByKeyReducer extends
        Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * @param key    the URL whose counts are being aggregated
     * @param values all partial frequencies recorded for this URL
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable partial : values) {
            total += partial.get();
        }
        context.write(key, new IntWritable(total));
    }
}
package topN_hadoop1;
import org.apache.log4j.Logger;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the "Aggregate By Key" job: reads "url,frequency" text lines,
 * sums frequencies per URL, and writes (Text, IntWritable) pairs as a
 * SequenceFile for consumption by the subsequent TopN job.
 *
 * Usage: AggregateByKeyDriver &lt;input-dir&gt; &lt;output-dir&gt;
 */
public class AggregateByKeyDriver extends Configured implements Tool {

    private static Logger THE_LOGGER = Logger.getLogger(AggregateByKeyDriver.class);

    /**
     * Configures and submits the MapReduce job.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on failure
     */
    public int run(String[] args) throws Exception {
        // Job.getInstance() replaces the deprecated new Job(Configuration) constructor.
        Job job = Job.getInstance(getConf());
        // Ensure the jar containing this driver is shipped to the cluster;
        // without this, tasks commonly fail with ClassNotFoundException.
        job.setJarByClass(AggregateByKeyDriver.class);
        HadoopUtil.addJarsToDistributedCache(job, "/lib/");
        job.setJobName("AggregateByKeyDriver");
        job.setInputFormatClass(TextInputFormat.class);
        // SequenceFile output so the TopN job can read typed (Text, IntWritable) records.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(AggregateByKeyMapper.class);
        job.setReducerClass(AggregateByKeyReducer.class);
        // Summation is associative and commutative, so the reducer doubles as a combiner.
        job.setCombinerClass(AggregateByKeyReducer.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean status = job.waitForCompletion(true);
        THE_LOGGER.info("run(): status="+status);
        return status ? 0 : 1;
    }

    /**
     * The main driver for "Aggregate By Key" program.
     * Invoke this method to submit the map/reduce job.
     * @throws Exception When there is communication problems with the job tracker.
     */
    public static void main(String[] args) throws Exception {
        // Make sure there are exactly 2 parameters
        if (args.length != 2) {
            THE_LOGGER.warn("usage AggregateByKeyDriver <input> <output>");
            System.exit(1);
        }
        THE_LOGGER.info("inputDir="+args[0]);
        THE_LOGGER.info("outputDir="+args[1]);
        int returnStatus = ToolRunner.run(new AggregateByKeyDriver(), args);
        System.exit(returnStatus);
    }
}