Data skew: the bulk of the records flood into one or a few reduce tasks, leaving the remaining reducers largely idle.
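Why this happens: Hadoop's default HashPartitioner routes every occurrence of a key to the same reduce task, computed as (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks, so a hot key piles its entire volume onto one partition. A minimal local sketch of that assignment (the sample words are made up):

import org.apache.hadoop.io.Text;

public class HashSkewDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        // "hello" stands in for a hypothetical hot key.
        String[] words = {"hello", "hello", "hello", "world", "hi"};
        for (String w : words) {
            // Same formula Hadoop's HashPartitioner uses: every "hello"
            // lands on the same reduce task.
            int partition = (new Text(w).hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(w + " -> reduce " + partition);
        }
    }
}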
Solution 2 for data skew: a custom partitioner plus a second job. The first job scatters records across the reducers at random and produces partial counts; the second job re-aggregates those partial counts with ordinary hash partitioning. The walkthrough below uses word count as the example:
1. DataLeanMapper1

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper1: standard word-count mapper for the first job.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Called once per input line: split on spaces and emit (word, 1).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
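For example, a hypothetical input line "hello world hello" makes this mapper emit (hello,1), (world,1), (hello,1). Nothing skew-specific happens here; the scattering is done by the RandomPartitioner registered in step 4.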
2. DataLeanMapper2

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper2: mapper for the second job. With KeyValueTextInputFormat,
 * the key is a word and the value is a partial count from the first job.
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

    /**
     * Called once per input line: forward (word, partialCount) unchanged.
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}
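KeyValueTextInputFormat turns each line of the first job's output back into a (key, value) pair by splitting at the first tab character, which is exactly the separator the first job's reducer wrote between word and count. A minimal sketch of that split on a made-up intermediate line:

public class KvSplitDemo {
    public static void main(String[] args) {
        // A hypothetical line from the first job's output: word, tab, partial count.
        String line = "hello\t3";
        int sep = line.indexOf('\t');
        System.out.println("key   = " + line.substring(0, sep));  // hello
        System.out.println("value = " + line.substring(sep + 1)); // 3
    }
}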
3. DataLeanReducer1

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * DataLeanReducer1: sums the counts for each key; reused by both jobs.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
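Because this summing logic is commutative and associative, the same class could also serve as a combiner. The original driver does not register one, but adding the line below to either job in App would pre-aggregate map output locally and shrink the shuffled volume (an optional tweak, not part of the original code):

// Optional in App: combine partial sums on the map side before the shuffle.
job.setCombinerClass(DataLeanReducer1.class);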
4. RandomPartitioner (random partitioning)

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * RandomPartitioner: sends each record to a uniformly random reduce task,
 * ignoring the key entirely.
 */
public class RandomPartitioner extends Partitioner<Text, IntWritable> {

    private final Random r = new Random();

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        return r.nextInt(numPartitions);
    }
}
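The trade-off of ignoring the key: occurrences of the same word now land on different reducers, so the first job produces up to numReduceTasks partial counts per word, and the second job is needed to merge them. A quick local check (hypothetical hot key, no cluster required):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class RandomPartitionDemo {
    public static void main(String[] args) {
        RandomPartitioner p = new RandomPartitioner();
        Text hot = new Text("hello"); // hypothetical hot key
        // The same key can land on a different partition on every call,
        // which is exactly why the partial counts must be re-merged in job 2.
        for (int i = 0; i < 5; i++) {
            System.out.println("hello -> reduce " + p.getPartition(hot, new IntWritable(1), 3));
        }
    }
}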
5. App
package hadoop.lean.partitioner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * App: driver for the two-job data-skew solution with a custom partitioner.
 */
public class App {
    public static void main(String[] args) throws Exception {
        // Hard-coded paths for local testing: input, first-job output, final output.
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Remove stale output directories so the jobs do not fail on startup.
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        if (fs.exists(new Path(args[2]))) {
            fs.delete(new Path(args[2]), true);
        }

        // First job: scatter words randomly and produce partial counts.
        Job job = Job.getInstance(conf);
        job.setJobName("WordCount-1");
        job.setJarByClass(App.class);
        job.setMapperClass(DataLeanMapper1.class);
        job.setReducerClass(DataLeanReducer1.class);

        // Input and output paths for the first job.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Random partitioning spreads the hot keys across all reducers.
        job.setPartitionerClass(RandomPartitioner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        // Second job runs only if the first one succeeds.
        if (job.waitForCompletion(true)) {
            job = Job.getInstance(conf);
            job.setJobName("WordCount-2");
            job.setJarByClass(App.class);
            job.setMapperClass(DataLeanMapper2.class);
            job.setReducerClass(DataLeanReducer1.class);

            // The first job's output is the second job's input.
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // The first job wrote "word \t count" lines, so read them back
            // as key-value pairs rather than raw text.
            job.setInputFormatClass(KeyValueTextInputFormat.class);

            // Hash partitioning now brings all partial counts of a word
            // to the same reducer for the final sum.
            job.setPartitionerClass(HashPartitioner.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(3);
            job.waitForCompletion(true);
        }
    }
}
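To make the data flow concrete, here is a hypothetical trace (made-up input; the actual spread varies from run to run because the first job's partitions are chosen at random). With 1.txt containing the single line "hello hello hello world":

First job (RandomPartitioner, 3 reduce tasks) — partial counts scattered over the files in d:/java/mr/out1:
    part-r-00000: hello 2
    part-r-00001: hello 1
    part-r-00002: world 1

Second job (HashPartitioner over out1) — all partial counts of a word meet on one reducer and are summed into d:/java/mr/out2:
    hello 3
    world 1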