在上一节的AverageByAttributeMapper.py和AverageByAttributeReducer.py中计算平均值,由mapper读取<属性,1>,在网络上进行洗牌,reducer计算每个键的平均值。可以看到至少有两个效率瓶颈:
- 如果有10亿条记录,mapper会生成10亿个键值对在网络上进行洗牌。实际上,在求最大值时,每个mapper对每个键只需输出最大的一个值;求平均值则可以重新定义算法,使每个mapper对每个键只有一条记录参与洗牌。
- 若使用国家作为键,会出现 数据倾斜,不是均匀分布,导致一个reducer中进入很多数据。
可以使用combiner先进行本地reduce,它能有效地减少mapper的输出以降低网络和reducer上的压力。另外注意,combiner在数据转换上需与reducer等价。就是说,如果去掉combiner,输出应该保持不变。
对于分配型函数,如最大值,最小值,求和等,可以使用reducer直接作为combiner。对于其他如平均值,进行部分转换即可。
如下AverageByAttribute.java求平均值:
不加combiner代码:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class AverageByAttribute extends Configured implements Tool {

    /**
     * Mapper: for every patent record emits {@code <country, "numClaims,1">}.
     * The trailing ",1" is a per-record count so the reducer can compute the
     * average as sum(claims) / sum(counts).
     */
    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        // Reuse Writable instances instead of allocating per record.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // Guard against short/malformed lines: the original indexed
            // fields[4] and fields[8] unconditionally and could throw
            // ArrayIndexOutOfBoundsException.
            if (fields.length <= 8) {
                return;
            }
            String country = fields[4];
            String numClaims = fields[8];
            // length > 0 skips countries with no declared claims;
            // startsWith("\"") skips the header row (quoted column names).
            if (numClaims.length() > 0 && !numClaims.startsWith("\"")) {
                // The Text value carries both the claim count and a record
                // count of 1, used in reduce to compute sum/count.
                outKey.set(country);
                outValue.set(numClaims + ",1");
                context.write(outKey, outValue);
            }
        }
    }

    /**
     * Reducer: sums the claim totals and record counts per country and
     * emits the average number of claims per patent.
     */
    public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0.0;
            int count = 0;
            for (Text element : values) {
                String[] fields = element.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            // count >= 1 here: reduce() is only invoked for keys that have values.
            context.write(key, new DoubleWritable(sum / count));
        }
    }

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration injected by ToolRunner (getConf()) instead of
        // creating a fresh one, so -D command-line options are honored.
        Job job = new Job(getConf(), "AverageByAttribute");
        // Required when running on a cluster so Hadoop can locate the jar.
        job.setJarByClass(AverageByAttribute.class);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        // Map output types (Text/Text) differ from the final output types
        // (Text/DoubleWritable), so both must be declared explicitly.
        // Previously only the map types were set as the job output types,
        // which did not match what the reducer emits.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Return the status instead of calling System.exit() here: the
        // original exit() made "return 0" unreachable and always reported
        // success to the shell; main() is responsible for exiting.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new AverageByAttribute(), args);
        System.exit(res);
    }
}
添加combiner后,代码如下:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//这个程序与AverageByAttribute作用完全相同,但我们采用了combiner来实现国家专利数的累加和计数,只在reducer中进行求平均值操作
//相当于将reducer的一部分工作放到了combiner中,所以combiner要实现reducer接口,这在分配型函数中非常常见!!
public class AverageCombiner extends Configured implements Tool {
public static class MapClass extends Mapper<LongWritable, Text, Text, Text>
{
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException
{
String [] lines = value.toString().split(",");
String country = lines[4];
String numClaims = lines[8];
//length > 0 可以排除未声明专利得国家,startswith则排除第一行属性名
if(numClaims.length() > 0 && !numClaims.startsWith("\""))
{
//这里作为v2的Text值分别存储了声明的专利数,和计数count,用于在reduce过程中num/count计算平均值
context.write(new Text(country), new Text(numClaims + ",1"));
}
}
}
public static class Combine extends Reducer<Text, Text, Text, Text>
{
public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException
{
double sum = 0.0;
int count = 0;
for(Text element: values)
{
String [] fields = element.toString().split(",");
sum += Double.parseDouble(fields[0]);
count += Integer.parseInt(fields[1]);
}
//由于","的存在,在sum+","+count中会自动调用两者得toString方法,最终得到一个字符串。
context.write(key, new Text(sum + "," + count));
}
}
public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable>
{
//注意,尽管添加了combiner,此处传入的values仍然需要是Iterable<>泛型,因为combiner只是对每一个map本地进行reduce
//输入仍是分片的,但是减少了很多需要洗牌的记录,开始没有加Iterable<>,发现结果只计算了一半。。。
public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException
{
double sum = 0.0;
int count = 0;
for(Text element: values)
{
String [] fields = element.toString().split(",");
sum += Double.parseDouble(fields[0]);
count += Integer.parseInt(fields[1]);
}
//由于","的存在,在sum+","+count中会自动调用两者得toString方法,最终得到一个字符串。
context.write(key, new DoubleWritable(sum/count));
}
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf,"AverageCombiner");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(MapClass.class);
job.setCombinerClass(Combine.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//成功结束时返回0,失败时返回1
System.exit(job.waitForCompletion(true)? 0: 1);
return 0;
}
public static void main(String [] args) throws Exception
{
int res = ToolRunner.run(new Configuration(), new AverageCombiner(), args);
System.exit(res);
}
}
比较可知,combiner几乎与reducer相同,reducer多计算了一个sum/count表示平均值