MapReduce Data Skew, Solution 2: Custom Partitioner + Second Job

Data skew: the bulk of the data rushes to one or a few reducers, so those tasks run long while the remaining reducers sit idle.

Solution 2 for data skew: a custom partitioner plus a second job.

The following walks through a word-count example:
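For context, the skew comes from Hadoop's default HashPartitioner, which always routes every record of a given key to the same reducer. Its stock implementation is essentially the following, so one very frequent word pins all of its records onto a single reduce task:

import org.apache.hadoop.mapreduce.Partitioner;

public class HashPartitioner<K, V> extends Partitioner<K, V> {
	//same key => same hash => same partition
	public int getPartition(K key, V value, int numReduceTasks) {
		return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
	}
}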

1. DataLeanMapper1

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * First-stage mapper: tokenizes each line and emits (word, 1).
 * Skew handling happens in RandomPartitioner, not here.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

	/**
	 * Called once per input line: split on spaces, emit (word, 1).
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

		String line = value.toString();
		String[] arr = line.split(" ");

		Text keyOut = new Text();
		IntWritable valueOut = new IntWritable(1);
		for(String word : arr){
			keyOut.set(word);
			context.write(keyOut,valueOut);
		}
	}
}
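For a hypothetical input line such as "hello hello world", this map emits (hello,1), (hello,1), (world,1). Under the default partitioner both hello records would land on the same reducer; the RandomPartitioner wired in by the driver below scatters them instead.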
2. DataLeanMapper2

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Second-stage mapper: the input is the first job's output, delivered by
 * KeyValueTextInputFormat as (word, partial count). Re-emit as (Text, IntWritable).
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

	@Override
	protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
		//value holds the partial count written by the first job
		context.write(key, new IntWritable(Integer.parseInt(value.toString())));
	}
}
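This mapper depends on how the first job writes its output: TextOutputFormat emits each record as word<TAB>count, and KeyValueTextInputFormat splits every line on the first tab, so the word arrives here as the key and the partial count, still a string, as the value; Integer.parseInt turns it back into a number.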
3. DataLeanReducer1

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Sums all counts for a key. Shared by both jobs: it produces partial
 * sums in job 1 and final totals in job 2.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable>{

	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
		int count = 0 ;
		for(IntWritable iw : values){
			count = count + iw.get() ;
		}
		context.write(key,new IntWritable(count));
	}
}
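The same reducer serves both jobs. In job 1 each reducer sees only a random share of a word's records and therefore emits a partial count; in job 2 hash partitioning guarantees that all partial counts of a word meet in a single reducer, which sums them into the final total.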
4. RandomPartitioner (random partitioning)

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Random partitioner: ignores the key and picks a reduce partition at
 * random, so records of a hot key are spread across all reducers.
 */
public class RandomPartitioner extends Partitioner<Text,IntWritable> {

	Random r = new Random();

	@Override
	public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
		return r.nextInt(numPartitions);
	}
}
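Because getPartition ignores the key entirely, records are spread uniformly across the reducers no matter how skewed the word frequencies are. The price is that one word's count is now scattered over up to numPartitions partial sums, which is exactly why the second, hash-partitioned job is needed to merge them.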
5. App

The driver chains two jobs: the first spreads records with RandomPartitioner, the second re-aggregates the partial counts with HashPartitioner.

package hadoop.lean.partitioner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * Data skew fix via a second job and a custom partitioner:
 * job 1 spreads records randomly, job 2 merges the partial counts.
 */
public class App {
	public static void main(String[] args) throws Exception {
		//hard-coded local paths for testing: input, intermediate out1, final out2
		args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};
		Configuration conf = new Configuration();

		FileSystem fs = FileSystem.get(conf);
		//remove old output dirs, otherwise the jobs fail on startup
		if(fs.exists(new Path(args[1]))){
			fs.delete(new Path(args[1]),true);
		}
		if(fs.exists(new Path(args[2]))){
			fs.delete(new Path(args[2]),true);
		}

		Job job = Job.getInstance(conf);

		job.setJobName("WordCount-1");
		job.setJarByClass(App.class);

		job.setMapperClass(DataLeanMapper1.class);
		job.setReducerClass(DataLeanReducer1.class);

		//input path
		FileInputFormat.addInputPath(job,new Path(args[0]));
		//output path
		FileOutputFormat.setOutputPath(job,new Path(args[1]));

		//first job: spread records randomly across reducers
		job.setPartitionerClass(RandomPartitioner.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		job.setNumReduceTasks(3);

		//run the first job; start the second only if it succeeds
		if(job.waitForCompletion(true)){
			job = Job.getInstance(conf);

			job.setJobName("WordCount-2");
			job.setJarByClass(App.class);

			job.setMapperClass(DataLeanMapper2.class);
			job.setReducerClass(DataLeanReducer1.class);

			//the first job's output is the second job's input
			FileInputFormat.addInputPath(job, new Path(args[1]));
			//final output path
			FileOutputFormat.setOutputPath(job, new Path(args[2]));
			//job 1 wrote "word<TAB>count" lines; KeyValueTextInputFormat splits them back into key and value
			job.setInputFormatClass(KeyValueTextInputFormat.class);

			//second job: hash partitioning so all partial counts of a word meet in one reducer
			job.setPartitionerClass(HashPartitioner.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);

			job.setNumReduceTasks(3);
			job.waitForCompletion(true);
		}
	}
}
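To see the two stages end to end, consider a hypothetical run (illustrative numbers only). Suppose 1.txt contains 10,000 occurrences of the word "hello". With three reducers, the first job's RandomPartitioner splits those records roughly evenly, so out1 holds partial counts such as:

hello	3340	(part-r-00000)
hello	3329	(part-r-00001)
hello	3331	(part-r-00002)

The second job reads these lines with KeyValueTextInputFormat and hash-partitions them, so all three partial counts reach the same reducer, which writes the final total to out2:

hello	10000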

