Hadoop: Using a Combiner to Improve the Efficiency of Map/Reduce Programs

As is well known, the Hadoop framework uses a Mapper to turn input data into <key,value> pairs, shuffles those pairs between network nodes, and then uses a Reducer to process the data and produce the final output.

In the process above we can see at least two performance bottlenecks:

  1. If we have one billion records, the Mapper will generate one billion key/value pairs to be transferred across the network. But if we only want the maximum of the data, the Mapper clearly needs to emit nothing more than the largest value it has seen. This not only relieves the pressure on the network, it also greatly improves program efficiency.
  2. The country field of the patent data illustrates what data skew means. Such data is far from uniform or evenly distributed: because most patents come from the US, the bulk of the key/value pairs, from the Mapper output through the intermediate (shuffle) stage, eventually converge on a single Reducer, overwhelming it and drastically degrading the program's performance.

Hadoop addresses these bottlenecks with a Combiner step that sits between the Mapper and the Reducer. You can think of the Combiner as a helper for the Reducer: its job is to condense the Mapper's output, reducing both the network bandwidth used and the load placed on the Reducer. If we define a Combiner, the MapReduce framework may apply it to the intermediate data any number of times.

If the Reducer computes only a simple distributive function, such as maximum, minimum, or count, we can let the Reducer itself act as the Combiner, as sketched below. But many useful functions are not distributive. We will use computing an average as an example:
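For instance, here is a minimal sketch, not part of the original post, of a max-value Reducer that doubles as its own Combiner; the class name MaxReducer and the LongWritable value type are illustrative assumptions, and the new (org.apache.hadoop.mapreduce) API is assumed:

// Because max is distributive, one class can be registered as both Combiner
// and Reducer; each map task then forwards only one value per key.
public static class MaxReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long max = Long.MIN_VALUE;
        for (LongWritable v : values) {
            max = Math.max(max, v.get());   // keep only the largest value seen so far
        }
        context.write(key, new LongWritable(max));
    }
}

// In the driver, the registrations that make the Reducer double as the Combiner:
// job.setCombinerClass(MaxReducer.class);
// job.setReducerClass(MaxReducer.class);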

The Mapper emits the key/value pairs it processes; to compute the average on a single node, the Reducer takes the (already sorted and grouped) <key,value> pairs it receives, sums the values, and divides by their number.
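To see why averaging is not distributive, suppose one key's values are 1, 2, 3, 4, 5, split across two map tasks. A naive Combiner that averaged each part would produce avg(1,2) = 1.5 and avg(3,4,5) = 4, and averaging those partial results gives (1.5 + 4) / 2 = 2.75, while the true average is 15 / 5 = 3. The partial counts (2 and 3) must travel with the partial sums for the final division to come out right.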

Since the Reducer treats the number of <key,value> pairs it receives as the number of pairs in the input data, the main obstacle to using a Combiner here is the count. We can rewrite the MapReduce program to track the count explicitly: the Mapper emits each value together with a count of 1, the Combiner adds up both the values and the counts, and the Reducer divides the total sum by the total count.

The code is as follows:


package com;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AveragingWithCombiner extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable,Text,Text,Text> {
        static enum ClaimsCounters { MISSING, QUOTED };

        // Map method: emit <country, "numClaims,1"> so the count travels with the value
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String fields[] = value.toString().split(",", -20); // negative limit keeps trailing empty fields
            String country = fields[4];
            String numClaims = fields[8];
            if (numClaims.length() > 0 && !numClaims.startsWith("\"")) {
                context.write(new Text(country), new Text(numClaims + ",1"));
            }
        }
    }

    public static class Reduce extends Reducer<Text,Text,Text,DoubleWritable> {
        // Reduce method: accumulate sums and counts, then divide
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (Text value : values) {
                String fields[] = value.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            context.write(key, new DoubleWritable(sum / count));
        }
    }

    public static class Combine extends Reducer<Text,Text,Text,Text> {
        // Combine method: pre-aggregate to "sum,count"; its input and output
        // types must both match the Mapper's output types
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (Text value : values) {
                String fields[] = value.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            context.write(key, new Text(sum + "," + count));
        }
    }

    // run method: create and run the job
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(AveragingWithCombiner.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJobName("AveragingWithCombiner");
        job.setMapperClass(MapClass.class);
        job.setCombinerClass(Combine.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);           // map (and combiner) output types
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);              // final reduce output types
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;     // main() performs the System.exit
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new AveragingWithCombiner(), args);
        System.exit(res);
    }
}
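Assuming the class above is packaged into a jar (the jar name and paths below are hypothetical), the job is launched through ToolRunner with the input and output directories as its two arguments, for example:

hadoop jar averaging.jar com.AveragingWithCombiner /patents/input /patents/output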

--------------------------------------------------------------------------

My own program:

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class AnalyseLog {

    /**
     * MapReduceBase: a base class that implements the Mapper and Reducer
     * interfaces (its methods only satisfy the interfaces and do nothing).
     * WritableComparable: classes implementing it can be compared with one
     * another; every class used as a key should implement it.
     * Reporter can report the progress of the whole application; it is not
     * used in this example.
     */
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, Text>
    {
        /**
         * LongWritable, IntWritable and Text are Hadoop classes that wrap Java
         * types. They implement WritableComparable, so they can be serialized
         * for data exchange in a distributed environment; think of them as
         * replacements for long, int and String.
         */
        private Text outValue = new Text(); // output value object, distinct from the map() parameter

        private Text word = new Text();

        /**
         * The map method of the Mapper interface:
         * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)
         * maps a single input k/v pair to an intermediate k/v pair.
         * The output pair need not have the same types as the input pair, and
         * an input pair may map to zero or more output pairs.
         * OutputCollector collects the <k,v> pairs emitted by the Mapper and
         * Reducer; its collect(k, v) method adds a (k,v) pair to the output.
         */
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line, "\n");
          L: while (tokenizer.hasMoreTokens())
            {
                String domain = "";
                String req = "";
                String time = "";       // parsed below but not used further in this version
                String pvSiteId = "";
                String result = "";

                String sss = tokenizer.nextToken();
                String s[] = sss.split(" ");

                if (sss.indexOf("pv.gif") > -1) {

                    for (String str : s) {
                        if (str.indexOf("pv.gif") > -1) {
                            req = str;  // the request field carrying the tracking parameters
                            continue;
                        }
                        if (str.indexOf("http://") > -1) {
                            // skip the leading quote plus "http://" (8 characters),
                            // assuming the referer field is quoted in the log
                            domain = str.substring(8, str.length());
                            if (domain.indexOf("/") > -1) {
                                domain = domain.substring(0, domain.indexOf("/"));
                            } else if (domain.indexOf("\"") > -1) {
                                domain = domain.substring(0, domain.indexOf("\""));
                            }
                            break;
                        }
                    }

                    if (req.indexOf("&") < 0) {
                        continue L;     // no query parameters: skip this record
                    }
                    String reqSub = req.substring(req.indexOf("&"), req.length());
                    String[] reqSubSpli = reqSub.split("&");

                    for (String ss : reqSubSpli) {
                        if (ss.indexOf("time") > -1) {
                            time = ss.split("=")[1];
                        }
                        if (ss.indexOf("siteid") > -1) {
                            pvSiteId = ss.split("=")[1];
                        }
                    }

                    if (pvSiteId.equals("")) {
                        continue L;
                    }

                    result = pvSiteId + ";" + domain + "@" + "1";
                }

                if (pvSiteId.equals("")) {
                    continue L;
                }

                word.set(pvSiteId);
                outValue.set(result);
                output.collect(word, outValue);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements
            Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            int sum = 0;
            String re1 = "";

            while (values.hasNext())
            {
                String fields[] = values.next().toString().split("@");
                sum += Integer.parseInt(fields[1]); // accumulate the count after the "@"
                re1 = fields[0];                    // keep the "siteid;domain" part
            }

            // the output format "siteid;domain@count" matches the input format,
            // which is what allows this Reducer to double as the Combiner
            output.collect(key, new Text(re1 + "@" + sum));
        }
    }

    public static void main(String[] args) throws Exception
    {
        /**
         * JobConf: the map/reduce job configuration class; it describes the
         * map-reduce work to be executed to the Hadoop framework.
         * Constructors: JobConf(), JobConf(Class exampleClass), JobConf(Configuration conf), etc.
         */
        JobConf conf = new JobConf(AnalyseLog.class);
        conf.setJobName("AnalyseLog");                 // set a user-defined job name

        conf.setOutputKeyClass(Text.class);            // set the key class for the job's output
        conf.setOutputValueClass(Text.class);          // set the value class for the job's output

        conf.setMapperClass(Map.class);                // set the Mapper class
        // local reducer
        conf.setCombinerClass(Reduce.class);           // set the Combiner class
        conf.setReducerClass(Reduce.class);            // set the Reducer class

        conf.setInputFormat(TextInputFormat.class);    // set the InputFormat implementation
        conf.setOutputFormat(TextOutputFormat.class);  // set the OutputFormat implementation

        /**
         * InputFormat describes the input definition of a map-reduce job.
         * setInputPaths(): set an array of paths as the job's input list.
         * setOutputPath(): set the path for the job's output.
         */
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);                        // run the job
    }
}
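As a quick illustration (this log line is invented and matches only the fields the Mapper looks for), a record such as

192.168.0.1 - - [10/Mar/2011:10:00:00 +0800] "GET /pv.gif?x=1&siteid=42&time=1299722400 HTTP/1.1" 200 43 "http://www.example.com/page"

would make the Mapper emit <42, "42;www.example.com@1">. The Reduce class, also registered as the Combiner, sums the counts after the "@" per site id, so three such hits would come out as <42, "42;www.example.com@3">.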
