GitHub code download address:
1. Java project code
Test data:
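The original test-data file is not reproduced here. From the way the mapper parses each line (phone number, upload traffic, download traffic, separated by single spaces), a hypothetical sample input would look like this (all numbers made up for illustration):

13612345678 1116 954
13712345678 240 0
18612345678 3156 2936
18312345678 6960 690
15912345678 1938 2910

The prefixes 136, 137, 186 and 183 match the dictionary in AreaPartitioner below; the 159 line exercises the default partition for unknown prefixes.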
The implementation code is as follows:
package com.hadoop.minbo.mapreduce.partitioner;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
 * Custom Partitioner example: sum per-user traffic and split the results
 * into separate output files by phone-number prefix.
 */
public class TestPartitioner {

    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Read one line of the log
            String line = value.toString();
            // Split the line into its fields
            String[] splited = line.split(" ");
            // Extract the fields we need: phone number, upload traffic, download traffic
            String num = splited[0];
            String upPayLoad = splited[1];
            String downPayLoad = splited[2];
            // Pack upload and download traffic into a single value
            String str = upPayLoad + " " + downPayLoad;
            // Emit the phone number as the key and the traffic pair as the value
            context.write(new Text(num), new Text(str));
        }
    }
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            long payLoadSum = 0L; // total upload traffic for this user
            long downLoadSum = 0L; // total download traffic for this user
            long sum = 0L;
            for (Text v : values) {
                String[] splited = v.toString().split(" ");
                payLoadSum += Long.parseLong(splited[0]);
                downLoadSum += Long.parseLong(splited[1]);
            }
            sum = payLoadSum + downLoadSum;
            String result = "" + payLoadSum + " " + downLoadSum + " " + sum;
            context.write(key, new Text(result));
        }
    }
    public static String path1 = "input3";
    public static String path2 = "output3";

    public static void main(String[] args) throws Exception {
        // Settings for running on Windows
        System.setProperty("hadoop.home.dir", "F:\\hadoop\\hadoop-2.7.3"); // Hadoop installation path
        System.setProperty("HADOOP_USER_NAME", "hadoop"); // user name
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
        // Remove the output directory if it already exists
        if (fileSystem.exists(new Path(path2))) {
            fileSystem.delete(new Path(path2), true);
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(TestPartitioner.class);
        FileInputFormat.setInputPaths(job, new Path(path1));
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Register the custom partitioning logic
        job.setPartitionerClass(AreaPartitioner.class);
        // The number of reducers must cover every partition number the partitioner
        // can return (here 0-4). More reducers than partitions only produces empty
        // output files; fewer makes the job fail with an "Illegal partition" error.
        job.setNumReduceTasks(5);
        // Output types of the map tasks
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Output types of the reduce tasks
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(path2));
        job.waitForCompletion(true);
        // Print one of the output files (partition 0, the 186 prefix):
        FSDataInputStream fr = fileSystem.open(new Path(path2 + "/part-r-00000"));
        IOUtils.copyBytes(fr, System.out, 2048, true);
    }
}
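The main method above prints only part-r-00000. A minimal sketch of how the other partition files could be dumped as well, appended to the end of main() (it additionally needs an import of org.apache.hadoop.fs.FileStatus and Java 8 for the lambda):

        // List every reducer output file in the output directory and print it
        FileStatus[] outputs = fileSystem.listStatus(new Path(path2),
                p -> p.getName().startsWith("part-r-"));
        for (FileStatus status : outputs) {
            System.out.println("==== " + status.getPath().getName() + " ====");
            FSDataInputStream in = fileSystem.open(status.getPath());
            IOUtils.copyBytes(in, System.out, 2048, false); // false: keep System.out open
            in.close();
        }

The custom partitioner itself lives in a second source file: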
package com.hadoop.minbo.mapreduce.partitioner;
import java.util.HashMap;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Custom Partitioner: route each record to a reducer according to the
 * 3-digit prefix of the phone number in the key.
 * @param <KEY>
 * @param <VALUE>
 */
public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {

    private static HashMap<String, Integer> areaMap = new HashMap<String, Integer>();

    static {
        areaMap.put("186", 0);
        areaMap.put("136", 1);
        areaMap.put("137", 2);
        areaMap.put("183", 3);
    }

    @Override
    public int getPartition(KEY key, VALUE value, int numPartitions) {
        // Take the phone-number prefix from the key and look it up in the
        // prefix dictionary; prefixes not in the map fall into partition 4.
        Integer areaCode = areaMap.get(key.toString().substring(0, 3));
        return areaCode == null ? 4 : areaCode;
    }
}
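For comparison, if no partitioner is set, Hadoop falls back to HashPartitioner, which spreads keys over the reducers purely by hash code, so records would not be grouped by phone prefix. Its core logic is essentially:

    public int getPartition(K key, V value, int numReduceTasks) {
        // mask off the sign bit so the result of the modulo is never negative
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }

AreaPartitioner replaces this hash with an explicit prefix-to-partition dictionary, which is what makes the per-prefix output files possible.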
Run result:
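The actual console output is not reproduced here. With five reduce tasks, the output directory output3 should contain part-r-00000 through part-r-00004: partitions 0 to 3 hold the totals for the 186, 136, 137 and 183 prefixes respectively, and partition 4 collects every other phone number. Each output line has the form: phone number, total upload, total download, overall total.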