MapReduce/Hadoop TopN Solution: the Non-Unique Key Case

1. The unique-key case: MapReduce/Hadoop TopN solution with unique keys (covered in the previous post)



2. The non-unique key case: the same key may appear multiple times in the input file

Solution: first turn the non-unique keys into unique keys, i.e., use a MapReduce job to aggregate (sum) the records that share a key, and then apply the unique-key TopN solution from part (1).
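
For illustration, suppose the input contains the following url,frequency lines (hypothetical data, in the format the mapper below expects):

url1,10
url2,3
url1,5
url2,7

The aggregation job collapses them into one record per unique url, (url1, 15) and (url2, 10), which the TopN phase from part (1) can then process as unique keys.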

package topN_hadoop1;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

public class AggregateByKeyMapper extends
      Mapper<Object, Text, Text, IntWritable> {

   // reused output key/value objects to avoid allocating new ones per record
   private Text K2 = new Text();
   private IntWritable V2 = new IntWritable();

   @Override
   public void map(Object key, Text value, Context context)
         throws IOException, InterruptedException {
      // each input line has the form: url,frequency
      String valueAsString = value.toString().trim();
      String[] tokens = valueAsString.split(",");
      if (tokens.length != 2) {
         // skip malformed lines
         return;
      }
      String url = tokens[0];
      int frequency = Integer.parseInt(tokens[1]);
      // emit (url, frequency); the reducer sums the frequencies per url
      K2.set(url);
      V2.set(frequency);
      context.write(K2, V2);
   }
}


package topN_hadoop1;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class AggregateByKeyReducer extends
      Reducer<Text, IntWritable, Text, IntWritable> {

   @Override
   public void reduce(Text key, Iterable<IntWritable> values, Context context)
         throws IOException, InterruptedException {
      // sum all frequencies seen for this url, turning duplicate keys into a single key
      int sum = 0;
      for (IntWritable value : values) {
         sum += value.get();
      }
      context.write(key, new IntWritable(sum));
   }
}


package topN_hadoop1;

import org.apache.log4j.Logger;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AggregateByKeyDriver  extends Configured implements Tool {

   private static Logger THE_LOGGER = Logger.getLogger(AggregateByKeyDriver.class);

   public int run(String[] args) throws Exception {
      Job job = new Job(getConf());
      HadoopUtil.addJarsToDistributedCache(job, "/lib/");
      job.setJobName("AggregateByKeyDriver");

      // read plain-text "url,frequency" lines and write the aggregated
      // (url, total frequency) pairs as a SequenceFile for the TopN job to consume
      job.setInputFormatClass(TextInputFormat.class);
      job.setOutputFormatClass(SequenceFileOutputFormat.class);

      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);

      job.setMapperClass(AggregateByKeyMapper.class);
      job.setReducerClass(AggregateByKeyReducer.class);
      // summing is commutative and associative, so the reducer can also serve as a combiner
      job.setCombinerClass(AggregateByKeyReducer.class);

      // args[0] = input directory
      // args[1] = output directory
      FileInputFormat.setInputPaths(job, new Path(args[0]));
      FileOutputFormat.setOutputPath(job, new Path(args[1]));

      boolean status = job.waitForCompletion(true);
      THE_LOGGER.info("run(): status="+status);
      return status ? 0 : 1;
   }

   /**
   * The main driver for "Aggregate By Key" program.
   * Invoke this method to submit the map/reduce job.
   * @throws Exception When there are communication problems with the job tracker.
   */
   public static void main(String[] args) throws Exception {
      // Make sure there are exactly 2 parameters
      if (args.length != 2) {
         THE_LOGGER.warn("usage: AggregateByKeyDriver <input> <output>");
         System.exit(1);
      }

      THE_LOGGER.info("inputDir="+args[0]);
      THE_LOGGER.info("outputDir="+args[1]);
      int returnStatus = ToolRunner.run(new AggregateByKeyDriver(), args);
      System.exit(returnStatus);
   }

}
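
Once the aggregation job has produced unique (url, total frequency) pairs as a SequenceFile, the second job applies the unique-key TopN solution from part (1). Below is a minimal sketch of what that phase's mapper could look like; it assumes the job reads the SequenceFile output above via SequenceFileInputFormat and that N is passed in a configuration property named top.n (the class name TopNMapper and the property name are assumptions, not part of the original post). Each mapper keeps a local top N in a TreeMap and a single reducer merges the local lists into the global top N.

package topN_hadoop1;

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sketch of the TopN-phase mapper; part (1) may differ in details.
public class TopNMapper extends
      Mapper<Text, IntWritable, NullWritable, Text> {

   private int N = 10;
   // frequency -> url, ordered ascending; note that ties on frequency overwrite each other
   private final TreeMap<Integer, String> topN = new TreeMap<Integer, String>();

   @Override
   protected void setup(Context context) {
      // "top.n" is an assumed configuration property name
      N = context.getConfiguration().getInt("top.n", 10);
   }

   @Override
   public void map(Text key, IntWritable value, Context context) {
      topN.put(value.get(), key.toString());
      if (topN.size() > N) {
         // drop the smallest frequency so only the local top N remain
         topN.remove(topN.firstKey());
      }
   }

   @Override
   protected void cleanup(Context context)
         throws IOException, InterruptedException {
      // emit this mapper's local top N; a single reducer merges all local lists
      for (Map.Entry<Integer, String> entry : topN.entrySet()) {
         context.write(NullWritable.get(),
               new Text(entry.getKey() + "," + entry.getValue()));
      }
   }
}

The two jobs would then be chained, for example (jar name, TopNDriver class, and paths below are placeholders):

hadoop jar topn-example.jar topN_hadoop1.AggregateByKeyDriver /input/urls /tmp/aggregated
hadoop jar topn-example.jar topN_hadoop1.TopNDriver /tmp/aggregated /output/topN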



