在上一节的AverageByAttributeMapper.py和AverageByAttributeReducer.py中计算平均值,由mapper读取<属性,1>,在网络上进行洗牌,reducer计算每个键的平均值。可以看到至少有两个效率瓶颈:
- 如果有10亿条记录,mapper会生成10亿个键值对在网络上进行洗牌。实际上,在求最大值时,每个mapper对每个键只需输出最大的一个值;求平均值则可以重新定义算法,使每个mapper对每个键只有一条记录参与洗牌。
- 若使用国家作为键,会出现 数据倾斜,不是均匀分布,导致一个reducer中进入很多数据。
可以使用combiner先进行本地reduce,它能有效地减少mapper的输出以降低网络和reducer上的压力。另外注意,combiner在数据转换上需与reducer等价。就是说,如果去掉combiner,输出应该保持不变。
对于分配型函数,如最大值,最小值,求和等,可以使用reducer直接作为combiner。对于其他如平均值,进行部分转换即可。
如下AverageByAttribute.java求平均值:
不加combiner代码:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class AverageByAttribute extends Configured implements Tool {

    /**
     * Mapper: for every patent record emits {@code <country, "numClaims,1">}.
     * The trailing ",1" is a per-record count so the reducer can compute the
     * average as sum(claims) / sum(counts).
     */
    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        // Reuse Writable instances instead of allocating per record.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // Guard against short/malformed lines: the original indexed
            // fields[4] and fields[8] unconditionally and could throw
            // ArrayIndexOutOfBoundsException.
            if (fields.length <= 8) {
                return;
            }
            String country = fields[4];
            String numClaims = fields[8];
            // length > 0 skips countries with no declared claims;
            // startsWith("\"") skips the header row (quoted column names).
            if (numClaims.length() > 0 && !numClaims.startsWith("\"")) {
                // The Text value carries both the claim count and a record
                // count of 1, used in reduce to compute sum/count.
                outKey.set(country);
                outValue.set(numClaims + ",1");
                context.write(outKey, outValue);
            }
        }
    }

    /**
     * Reducer: sums the claim totals and record counts per country and
     * emits the average number of claims per patent.
     */
    public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0.0;
            int count = 0;
            for (Text element : values) {
                String[] fields = element.toString().split(",");
                sum += Double.parseDouble(fields[0]);
                count += Integer.parseInt(fields[1]);
            }
            // count >= 1 here: reduce() is only invoked for keys that have values.
            context.write(key, new DoubleWritable(sum / count));
        }
    }

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration injected by ToolRunner (getConf()) instead of
        // creating a fresh one, so -D command-line options are honored.
        Job job = new Job(getConf(), "AverageByAttribute");
        // Required when running on a cluster so Hadoop can locate the jar.
        job.setJarByClass(AverageByAttribute.class);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        // Map output types (Text/Text) differ from the final output types
        // (Text/DoubleWritable), so both must be declared explicitly.
        // Previously only the map types were set as the job output types,
        // which did not match what the reducer emits.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Return the status instead of calling System.exit() here: the
        // original exit() made "return 0" unreachable and always reported
        // success to the shell; main() is responsible for exiting.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new AverageByAttribute(), args);
        System.exit(res);
    }
}
添加combiner后,代码如下:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//这个程序与AverageByAttribute作用完全相同,但我们采用了combiner来实现国家专利数的累加和计数,只在reducer中进行求平均值操作
//相当于将reducer的一部分工作放到了combiner中,所以combiner要实现reducer接口,这在分配型函数中非常常见!!
public class AverageCombiner extends Configured implements Tool {
public static class MapClass extends Mapper<LongWritable, Text, Text, Text>
{
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException
{
String [] lines = value.toString().split(",");
String country = lines[4];
String numClaims = lines[8];
//length > 0 可以排除未声明专利得国家,startswith则排除第一行属性名
if(numClaims.length() > 0 && !numClaims.startsWith("\""))
{
//这里作为v2的Text值分别存储了声明的专利数,和计数count,用于在reduce过程中num/count计算平均值
context.write(new Text(country), new Text(numClaims + ",1"));
}
}
}
public static class Combine extends Reducer<Text, Text, Text, Text>
{
public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException
{
double sum = 0.0;
int count = 0;
for(Text element: values)
{
String [] fields = element.toString().split(",");
sum += Double.parseDouble(fields[0]);
count += Integer.parseInt(fields[1]);
}
//由于","的存在,在sum+","+count中会自动调用两者得toString方法,最终得到一个字符串。
context.write(key, new Text(sum + "," + count));
}
}
public static class Reduce extends Reducer<Text, Text, Text, DoubleWritable>
{
//注意,尽管添加了combiner,此处传入的values仍然需要是Iterable<>泛型,因为combiner只是对每一个map本地进行reduce
//输入仍是分片的,但是减少了很多需要洗牌的记录,开始没有加Iterable<>,发现结果只计算了一半。。。
public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException
{
double sum = 0.0;
int count = 0;
for(Text element: values)
{
String [] fields = element.toString().split(",");
sum += Double.parseDouble(fields[0]);
count += Integer.parseInt(fields[1]);
}
//由于","的存在,在sum+","+count中会自动调用两者得toString方法,最终得到一个字符串。
context.write(key, new DoubleWritable(sum/count));
}
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf,"AverageCombiner");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(MapClass.class);
job.setCombinerClass(Combine.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//成功结束时返回0,失败时返回1
System.exit(job.waitForCompletion(true)? 0: 1);
return 0;
}
public static void main(String [] args) throws Exception
{
int res = ToolRunner.run(new Configuration(), new AverageCombiner(), args);
System.exit(res);
}
}
比较可知,combiner几乎与reducer相同,reducer多计算了一个sum/count表示平均值