As is well known, the Hadoop framework uses Mappers to turn the data into <key, value> pairs, shuffles those pairs across the network between nodes, and then uses Reducers to process the data and produce the final output.
In the process above we can see at least two performance bottlenecks:
- If we have a billion records, the Mappers will generate a billion key/value pairs that have to travel across the network. But if all we are computing is the maximum value, each Mapper clearly only needs to emit the largest value it has seen (see the sketch after this list). That not only relieves the pressure on the network, it also improves the efficiency of the program considerably.
- The country field of the patent data set illustrates what data skew means. That data is nowhere near uniform or evenly distributed: because the majority of patents come from the US, most of the key/value pairs produced by the Mappers and moved during the shuffle end up converging on a single Reducer, overwhelming it and dramatically degrading the performance of the job.
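For the maximum-value case, a minimal sketch of that idea follows; it is not taken from the text above, and the class name MaxReducer and the Text/LongWritable key and value types are made up for illustration. Because "max" is distributive, the very same Reducer class can also be registered as the Combiner, so each map task's output shrinks to roughly one value per key before it is shuffled.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer that keeps only the maximum value per key.
public class MaxReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long max = Long.MIN_VALUE;
        for (LongWritable value : values) {
            max = Math.max(max, value.get());   // keep the largest value seen so far
        }
        context.write(key, new LongWritable(max));
    }
}

// Wiring inside the job setup:
//   job.setReducerClass(MaxReducer.class);
//   job.setCombinerClass(MaxReducer.class);   // reuse the reducer as a local combiner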
Hadoop addresses these bottlenecks with a Combiner step that sits between the Mapper and the Reducer. You can think of the Combiner as a helper for the Reducer: its purpose is to shrink the Mapper's output and thereby cut both the network bandwidth used and the load placed on the Reducer. If we define a Combiner, the MapReduce framework may apply it to the intermediate data several times.
If the Reducer only performs a simple distributive function such as maximum, minimum, or count, we can use the Reducer itself as the Combiner. Many useful functions, however, are not distributive. Below we use computing an average as an example:
The Mappers emit the key/value pairs they process; to compute the average on a single node, the Reducer sums the <key, value> pairs it receives and divides by their number.
Because the Reducer treats the number of <key, value> pairs it receives as the number of pairs in the original input, the main obstacle to inserting a Combiner here is that count. We can rewrite the MapReduce program to track the count explicitly.
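A quick check with numbers shows why: suppose one map task sees the claim counts 1 and 2 (average 1.5) and another sees 3, 4 and 5 (average 4). Averaging those two partial averages gives (1.5 + 4) / 2 = 2.75, but the true average of all five values is 15 / 5 = 3. If each partial result instead carries a (sum, count) pair, (3, 2) and (12, 3), the Reducer can merge them into (15, 5) and still recover the exact average of 3, which is what the program below does.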
The code is as follows:
package com;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AveragingWithCombiner extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {

        static enum ClaimsCounters { MISSING, QUOTED };

        // Map method: emit <country, "numClaims,1"> so the count travels along with the sum
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String fields[] = value.toString().split(",", -20);
            String country = fields[4];
            String numClaims = fields[8];
            if (numClaims.length() > 0 && !numClaims.startsWith("\"")) {
                context.write(new Text(country), new Text(numClaims + ",1"));
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {

        // Reduce method: accumulate the partial sums and counts, then emit the average
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (Text value : values) {
                String fields[] = value.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            context.write(key, new DoubleWritable(sum / count));
        }
    }

    public static class Combine extends Reducer<Text, Text, Text, Text> {

        // Combiner: same aggregation, but it re-emits "sum,count" so the Reducer can keep counting
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (Text value : values) {
                String fields[] = value.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            context.write(key, new Text(sum + "," + count));
        }
    }

    // run method: create and run the job
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(AveragingWithCombiner.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setJobName("AveragingWithCombiner");
        job.setMapperClass(MapClass.class);
        job.setCombinerClass(Combine.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Map/combine output is <Text, Text>; the final reduce output is <Text, DoubleWritable>
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new AveragingWithCombiner(), args);
        System.exit(res);
    }
}
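Assuming the class above has been packaged into a jar (the jar name and the HDFS paths below are only placeholders), the job could be launched in the usual way; since it goes through ToolRunner, the standard generic options such as -D properties are also accepted:

    hadoop jar averaging.jar com.AveragingWithCombiner /user/hadoop/patents/input /user/hadoop/patents/output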
My own program:
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class AnalyseLog {

    /**
     * MapReduceBase: base class that implements the Mapper and Reducer interfaces
     * (its methods only satisfy the interfaces and do nothing).
     * Mapper interface: maps input key/value pairs to intermediate key/value pairs.
     * WritableComparable interface: classes that implement it can be compared with
     * each other; every class used as a key should implement it.
     * Reporter can be used to report the progress of the whole application; it is
     * not used in this example.
     */
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, Text>
    {
        /**
         * LongWritable, IntWritable and Text are Hadoop classes that wrap the
         * corresponding Java types. They implement WritableComparable, so they can
         * be serialized and exchanged in a distributed environment; think of them
         * as replacements for long, int and String respectively.
         */
        private Text outValue = new Text();

        private Text word = new Text();

        /**
         * The map method of the Mapper interface:
         * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)
         * maps a single input k/v pair to an intermediate k/v pair.
         * The output pair does not have to be of the same type as the input pair;
         * an input pair may map to zero or more output pairs.
         * OutputCollector collects the <k,v> pairs emitted by the Mapper and Reducer;
         * its collect(k, v) method adds a (k,v) pair to the output.
         */
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line, "\n");
            L: while (tokenizer.hasMoreTokens())
            {
                String domain = "";
                String req = "";
                String time = "";
                String pvSiteId = "";
                String result = "";

                String sss = tokenizer.nextToken();
                String s[] = sss.split(" ");

                // Only log lines that contain the pv.gif tracking request are counted
                if (sss.indexOf("pv.gif") > -1) {

                    for (String str : s) {
                        if (str.indexOf("pv.gif") > -1) {
                            req = str;
                            continue;
                        }
                        if (str.indexOf("http://") > -1) {
                            domain = str.substring(8, str.length());
                            if (domain.indexOf("/") > -1) {
                                domain = domain.substring(0, domain.indexOf("/"));
                            } else if (domain.indexOf("\"") > -1) {
                                domain = domain.substring(0, domain.indexOf("\""));
                            }
                            break;
                        }
                    }

                    // Parse the query string of the pv.gif request
                    String reqSub = req.substring(req.indexOf("&"), req.length());
                    String[] reqSubSpli = reqSub.split("&");

                    for (String ss : reqSubSpli) {
                        if (ss.indexOf("time") > -1) {
                            time = ss.split("=")[1];
                        }
                        if (ss.indexOf("siteid") > -1) {
                            pvSiteId = ss.split("=")[1];
                        }
                    }

                    // Skip records without a site id
                    if (pvSiteId.equals("")) {
                        continue L;
                    }

                    result = pvSiteId + ";" + domain + "@" + "1";
                }

                if (pvSiteId.equals("")) {
                    continue L;
                }

                word.set(pvSiteId);
                outValue.set(result);
                output.collect(word, outValue);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements
            Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            int sum = 0;
            String re = "";
            String re1 = "";

            // Each value looks like "siteId;domain@count"; sum the counts per key
            while (values.hasNext())
            {
                String fields[] = values.next().toString().split("@");
                sum += Integer.parseInt(fields[1]);
                re1 = fields[0];
            }

            re = re1 + "@" + sum;
            output.collect(key, new Text(re));
        }
    }

    public static void main(String[] args) throws Exception
    {
        /**
         * JobConf is the map/reduce job configuration class; it describes the
         * map-reduce work to be executed to the Hadoop framework.
         * Constructors include JobConf(), JobConf(Class exampleClass),
         * JobConf(Configuration conf), and others.
         */
        JobConf conf = new JobConf(AnalyseLog.class);
        conf.setJobName("AnalyseLog");                 // set a user-defined job name

        conf.setOutputKeyClass(Text.class);            // set the key class for the job output
        conf.setOutputValueClass(Text.class);          // set the value class for the job output

        conf.setMapperClass(Map.class);                // set the Mapper class for the job
        // Local "reducer": Reduce also serves as the Combiner, which works because
        // its input and output types are both <Text, Text>
        conf.setCombinerClass(Reduce.class);           // set the Combiner class for the job
        conf.setReducerClass(Reduce.class);            // set the Reducer class for the job

        conf.setInputFormat(TextInputFormat.class);    // set the InputFormat implementation for the job
        conf.setOutputFormat(TextOutputFormat.class);  // set the OutputFormat implementation for the job

        /**
         * InputFormat describes the input of a map-reduce job.
         * setInputPaths(): sets an array of paths as the input list of the job.
         * setOutputPath(): sets the path used as the output directory of the job.
         */
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);                        // run the job
    }

}