As is well known, the Hadoop framework uses Mappers to turn the data into <key, value> pairs, shuffles those pairs across the network between nodes, and then uses Reducers to process the data and produce the final output.
In the process above we can see at least two performance bottlenecks:
- If we have a billion records, the Mappers will generate a billion key/value pairs that have to travel across the network. But if all we are computing is the maximum value, each Mapper clearly only needs to emit the largest value it has seen (see the sketch after this list). That not only relieves the pressure on the network, it also improves the efficiency of the program considerably.
- The country field of the patent data set illustrates what data skew means. That data is nowhere near uniform or evenly distributed: because the majority of patents come from the US, most of the key/value pairs produced by the Mappers and moved during the shuffle end up converging on a single Reducer, overwhelming it and dramatically degrading the performance of the job.
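For the maximum-value case, a minimal sketch of that idea follows; it is not taken from the text above, and the class name MaxReducer and the Text/LongWritable key and value types are made up for illustration. Because "max" is distributive, the very same Reducer class can also be registered as the Combiner, so each map task's output shrinks to roughly one value per key before it is shuffled.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer that keeps only the maximum value per key.
public class MaxReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long max = Long.MIN_VALUE;
        for (LongWritable value : values) {
            max = Math.max(max, value.get());   // keep the largest value seen so far
        }
        context.write(key, new LongWritable(max));
    }
}

// Wiring inside the job setup:
//   job.setReducerClass(MaxReducer.class);
//   job.setCombinerClass(MaxReducer.class);   // reuse the reducer as a local combiner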
Hadoop addresses these bottlenecks with a Combiner step that sits between the Mapper and the Reducer. You can think of the Combiner as a helper for the Reducer: its purpose is to shrink the Mapper's output and thereby cut both the network bandwidth used and the load placed on the Reducer. If we define a Combiner, the MapReduce framework may apply it to the intermediate data several times.
If the Reducer only performs a simple distributive function such as maximum, minimum, or count, we can use the Reducer itself as the Combiner. Many useful functions, however, are not distributive. Below we use computing an average as an example:
The Mappers emit the key/value pairs they process; to compute the average on a single node, the Reducer sums the <key, value> pairs it receives and divides by their number.
Because the Reducer treats the number of <key, value> pairs it receives as the number of pairs in the original input, the main obstacle to inserting a Combiner here is that count. We can rewrite the MapReduce program to track the count explicitly.
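A quick check with numbers shows why: suppose one map task sees the claim counts 1 and 2 (average 1.5) and another sees 3, 4 and 5 (average 4). Averaging those two partial averages gives (1.5 + 4) / 2 = 2.75, but the true average of all five values is 15 / 5 = 3. If each partial result instead carries a (sum, count) pair, (3, 2) and (12, 3), the Reducer can merge them into (15, 5) and still recover the exact average of 3, which is what the program below does.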
The code is as follows:
package com;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AveragingWithCombiner extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {

        static enum ClaimsCounters { MISSING, QUOTED };

        // Map method: emit <country, "numClaims,1"> so the count travels along with the sum
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String fields[] = value.toString().split(",", -20);
            String country = fields[4];
            String numClaims = fields[8];
            if (numClaims.length() > 0 && !numClaims.startsWith("\"")) {
                context.write(new Text(country), new Text(numClaims + ",1"));
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {

        // Reduce method: accumulate the partial sums and counts, then emit the average
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (Text value : values) {
                String fields[] = value.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            context.write(key, new DoubleWritable(sum / count));
        }
    }

    public static class Combine extends Reducer<Text, Text, Text, Text> {

        // Combiner: same aggregation, but it re-emits "sum,count" so the Reducer can keep counting
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (Text value : values) {
                String fields[] = value.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            context.write(key, new Text(sum + "," + count));
        }
    }

    // run method: create and run the job
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(AveragingWithCombiner.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJobName("AveragingWithCombiner");
        job.setMapperClass(MapClass.class);
        job.setCombinerClass(Combine.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Map/combine output is <Text, Text>; the final reduce output is <Text, DoubleWritable>
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new AveragingWithCombiner(), args);
        System.exit(res);
    }
}
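Assuming the class above has been packaged into a jar (the jar name and the HDFS paths below are only placeholders), the job could be launched in the usual way; since it goes through ToolRunner, the standard generic options such as -D properties are also accepted:

    hadoop jar averaging.jar com.AveragingWithCombiner /user/hadoop/patents/input /user/hadoop/patents/output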
My own program:
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class AnalyseLog {

    /**
     * MapReduceBase: base class that implements the Mapper and Reducer interfaces
     * (its methods only satisfy the interfaces and do nothing).
     * Mapper interface: maps input key/value pairs to intermediate key/value pairs.
     * WritableComparable interface: classes that implement it can be compared with
     * each other; every class used as a key should implement it.
     * Reporter can be used to report the progress of the whole application; it is
     * not used in this example.
     */
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, Text>
    {
        /**
         * LongWritable, IntWritable and Text are Hadoop classes that wrap the
         * corresponding Java types. They implement WritableComparable, so they can
         * be serialized and exchanged in a distributed environment; think of them
         * as replacements for long, int and String respectively.
         */
        private Text outValue = new Text();

        private Text word = new Text();

        /**
         * The map method of the Mapper interface:
         * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)
         * maps a single input k/v pair to an intermediate k/v pair.
         * The output pair does not have to be of the same type as the input pair;
         * an input pair may map to zero or more output pairs.
         * OutputCollector collects the <k,v> pairs emitted by the Mapper and Reducer;
         * its collect(k, v) method adds a (k,v) pair to the output.
         */
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line, "\n");
            L: while (tokenizer.hasMoreTokens())
            {
                String domain = "";
                String req = "";
                String time = "";
                String pvSiteId = "";
                String result = "";

                String sss = tokenizer.nextToken();
                String s[] = sss.split(" ");

                // Only log lines that contain the pv.gif tracking request are counted
                if (sss.indexOf("pv.gif") > -1) {

                    for (String str : s) {
                        if (str.indexOf("pv.gif") > -1) {
                            req = str;
                            continue;
                        }
                        if (str.indexOf("http://") > -1) {
                            domain = str.substring(8, str.length());
                            if (domain.indexOf("/") > -1) {
                                domain = domain.substring(0, domain.indexOf("/"));
                            } else if (domain.indexOf("\"") > -1) {
                                domain = domain.substring(0, domain.indexOf("\""));
                            }
                            break;
                        }
                    }

                    // Parse the query string of the pv.gif request
                    String reqSub = req.substring(req.indexOf("&"), req.length());
                    String[] reqSubSpli = reqSub.split("&");

                    for (String ss : reqSubSpli) {
                        if (ss.indexOf("time") > -1) {
                            time = ss.split("=")[1];
                        }
                        if (ss.indexOf("siteid") > -1) {
                            pvSiteId = ss.split("=")[1];
                        }
                    }

                    // Skip records without a site id
                    if (pvSiteId.equals("")) {
                        continue L;
                    }

                    result = pvSiteId + ";" + domain + "@" + "1";
                }

                if (pvSiteId.equals("")) {
                    continue L;
                }

                word.set(pvSiteId);
                outValue.set(result);
                output.collect(word, outValue);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements
            Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            int sum = 0;
            String re = "";
            String re1 = "";

            // Each value looks like "siteId;domain@count"; sum the counts per key
            while (values.hasNext())
            {
                String fields[] = values.next().toString().split("@");
                sum += Integer.parseInt(fields[1]);
                re1 = fields[0];
            }

            re = re1 + "@" + sum;
            output.collect(key, new Text(re));
        }
    }

    public static void main(String[] args) throws Exception
    {
        /**
         * JobConf is the map/reduce job configuration class; it describes the
         * map-reduce work to be executed to the Hadoop framework.
         * Constructors include JobConf(), JobConf(Class exampleClass),
         * JobConf(Configuration conf), and others.
         */
        JobConf conf = new JobConf(AnalyseLog.class);
        conf.setJobName("AnalyseLog");                 // set a user-defined job name

        conf.setOutputKeyClass(Text.class);            // set the key class for the job output
        conf.setOutputValueClass(Text.class);          // set the value class for the job output

        conf.setMapperClass(Map.class);                // set the Mapper class for the job
        // Local "reducer": Reduce also serves as the Combiner, which works because
        // its input and output types are both <Text, Text>
        conf.setCombinerClass(Reduce.class);           // set the Combiner class for the job
        conf.setReducerClass(Reduce.class);            // set the Reducer class for the job

        conf.setInputFormat(TextInputFormat.class);    // set the InputFormat implementation for the job
        conf.setOutputFormat(TextOutputFormat.class);  // set the OutputFormat implementation for the job

        /**
         * InputFormat describes the input of a map-reduce job.
         * setInputPaths(): sets an array of paths as the input list of the job.
         * setOutputPath(): sets the path used as the output directory of the job.
         */
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);                        // run the job
    }

}