MapReduce Advanced: Combiner and Partitioner


Partitioning

Multiple partitions mean multiple output files.
Partition types

  • Range partitioning
  • List partitioning
  • Hash partitioning
  • Range-hash partitioning
  • Range-list partitioning

Hash partitioning (the idea behind Hive bucket tables) spreads records evenly and helps avoid hot spots.
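For contrast with the department-based partitioner built below, hash partitioning can be sketched as a Partitioner that simply hashes the key. The class name HashStylePartitioner is illustrative (it is not part of Hadoop); Hadoop's own default partitioner, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner, uses essentially the same expression.

package demo.partition;

import org.apache.hadoop.mapreduce.Partitioner;

//Hash-style partitioner: spread keys evenly across all reduce tasks
public class HashStylePartitioner<K, V> extends Partitioner<K, V> {

	@Override
	public int getPartition(K key, V value, int numParts) {
		// mask off the sign bit so the result is non-negative, then take the remainder
		return (key.hashCode() & Integer.MAX_VALUE) % numParts;
	}
}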
MapReduce example
  Use MapReduce to implement partitioning: route employees with the same department number to the same partition.

Key point: the partitions are created on the Mapper's output, i.e. the Partitioner operates on k2 and v2.

  1. Create a serializable (Writable) class, Employee
package demo.partition;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

//Employee entity class
public class Employee implements Writable{
	private int empno;
	private String ename;
	private String job;
	private int mgr;
	private String hiredata;//hire date
	private int sal;//monthly salary
	private int comm;//commission (bonus)
	private int deptno;//department number
	
	
	@Override
	public String toString() {
		return "["+this.empno+"\t"+this.ename+"\t"+this.sal+"\t"+this.deptno+"]";
	}

	@Override
	public void write(DataOutput output) throws IOException {
		// serialize: write the fields in a fixed order
		output.writeInt(this.empno);
		output.writeUTF(this.ename);
		output.writeUTF(this.job);
		output.writeInt(this.mgr);
		output.writeUTF(this.hiredata);
		output.writeInt(this.sal);
		output.writeInt(this.comm);
		output.writeInt(this.deptno);
	}
	
	@Override
	public void readFields(DataInput input) throws IOException {
		// deserialize: read the fields in the same order they were written
		this.empno = input.readInt();
		this.ename=input.readUTF();
		this.job = input.readUTF();
		this.mgr = input.readInt();
		this.hiredata = input.readUTF();
		this.sal = input.readInt();
		this.comm = input.readInt();
		this.deptno = input.readInt();
	}

	
	
	public int getEmpno() {
		return empno;
	}
	public void setEmpno(int empno) {
		this.empno = empno;
	}
	public String getEname() {
		return ename;
	}
	public void setEname(String ename) {
		this.ename = ename;
	}
	public String getJob() {
		return job;
	}
	public void setJob(String job) {
		this.job = job;
	}
	public int getMgr() {
		return mgr;
	}
	public void setMgr(int mgr) {
		this.mgr = mgr;
	}
	public String getHiredata() {
		return hiredata;
	}
	public void setHiredata(String hiredata) {
		this.hiredata = hiredata;
	}
	public int getSal() {
		return sal;
	}
	public void setSal(int sal) {
		this.sal = sal;
	}
	public int getComm() {
		return comm;
	}
	public void setComm(int comm) {
		this.comm = comm;
	}
	public int getDeptno() {
		return deptno;
	}
	public void setDeptno(int deptno) {
		this.deptno = deptno;
	}

}
  2. Create the Mapper program
package demo.partition;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class EmpPartitionMapper extends Mapper<LongWritable, Text, LongWritable, Employee> {

	
	@Override
	protected void map(LongWritable k1, Text v1, Context context)
			throws IOException, InterruptedException {
		// split the CSV line into fields
		String data=v1.toString();
		String[] words = data.split(",");
		Employee e = new Employee();
		e.setEmpno(Integer.parseInt(words[0]));
		e.setEname(words[1]);
		e.setJob(words[2]);
		try{
			e.setMgr(Integer.parseInt(words[3]));
		}catch(Exception ex){
			// mgr may be empty (the employee has no manager); default to 0
			e.setMgr(0);
		}
		e.setHiredata(words[4]);
		e.setSal(Integer.parseInt(words[5]));
		try{
			e.setComm(Integer.parseInt(words[6]));
		}catch(Exception ex){
			// comm may be empty (no commission); default to 0
			e.setComm(0);
		}
		e.setDeptno(Integer.parseInt(words[7]));

		context.write(new LongWritable(e.getDeptno()), e);
	}

}
  3. Create the Reducer program
package demo.partition;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;

//emit each employee under its department key; the class name matches the driver below
public class EmpPartitionReducer extends Reducer<LongWritable, Employee, LongWritable, Employee> {

	@Override
	protected void reduce(LongWritable k3, Iterable<Employee> v3,Context context)
					throws IOException, InterruptedException {
			for(Employee e:v3){
				context.write(k3, e);
				
			}
	}
	

}
  4. Create the Partitioner program
package demo.partition;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class EmpPartitioner extends Partitioner<LongWritable, Employee>{

	@Override
	public int getPartition(LongWritable k2, Employee v2, int numParts) {
			//define the partitioning rule
			//numParts is the number of reduce tasks configured on the job
		if(v2.getDeptno()==10){
			//department 10 goes to partition 1
			return 1%numParts;
		}else if(v2.getDeptno()==20){
			//department 20 goes to partition 2
			return 2%numParts;
		}else{
			//every other department goes to partition 3 % numParts
			return 3%numParts;
		}
	}

}
  5. Create the main (driver) program
package demo.partition;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class EmpPartitionMain {

	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(EmpPartitionMain.class);
		
		
		job.setMapperClass(EmpPartitionMapper.class);
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(Employee.class);
		
		//specify the partitioner for the job
		job.setPartitionerClass(EmpPartitioner.class);
		//specify the number of reduce tasks (one output file per partition)
		job.setNumReduceTasks(3);
		
		job.setReducerClass(EmpPartitionReducer.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(Employee.class);
		
		FileInputFormat.setInputPaths(job,new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.waitForCompletion(true);
		
		
	}

}
  6. Build the jar file, part.jar
  7. Run the jar on Linux
hadoop jar part.jar /input/emp.csv /output/w0922a

8. Results with the number of partitions set to 3. With numParts = 3, getPartition returns 1 for department 10, 2 for department 20, and 3 % 3 = 0 for everything else, so department 30 ends up in part-r-00000, department 10 in part-r-00001, and department 20 in part-r-00002:

[root@redhat temp]# hdfs dfs -ls /output/w0922B
Found 4 items
-rw-r--r--   1 root supergroup          0 2017-09-22 10:08 /output/w0922B/_SUCCESS
-rw-r--r--   1 root supergroup        144 2017-09-22 10:07 /output/w0922B/part-r-00000
-rw-r--r--   1 root supergroup         72 2017-09-22 10:08 /output/w0922B/part-r-00001
-rw-r--r--   1 root supergroup        118 2017-09-22 10:08 /output/w0922B/part-r-00002
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00000
30	[7654	MARTIN	1250	30]
30	[7900	JAMES	950	30]
30	[7698	BLAKE	2850	30]
30	[7521	WARD	1250	30]
30	[7844	TURNER	1500	30]
30	[7499	ALLEN	1600	30]
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00001
10	[7934	MILLER	1300	10]
10	[7839	KING	5000	10]
10	[7782	CLARK	2450	10]
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00002
20	[7788	SCOTT	3000	20]
20	[7566	JONES	2975	20]
20	[7876	ADAMS	1100	20]
20	[7902	FORD	3000	20]
20	[7369	SMITH	800	20]
  9. Extension: results when the number of partitions is set to 1, 2, and 6
With 1 partition (every return value mod 1 is 0, so all employees land in part-r-00000):
	[root@redhat temp]# hdfs dfs -cat /output/w0922f/part-r-00000
	10	[7934	MILLER	1300	10]
	10	[7839	KING	5000	10]
	10	[7782	CLARK	2450	10]
	20	[7876	ADAMS	1100	20]
	20	[7788	SCOTT	3000	20]
	20	[7369	SMITH	800	20]
	20	[7566	JONES	2975	20]
	20	[7902	FORD	3000	20]
	30	[7844	TURNER	1500	30]
	30	[7499	ALLEN	1600	30]
	30	[7698	BLAKE	2850	30]
	30	[7654	MARTIN	1250	30]
	30	[7521	WARD	1250	30]
	30	[7900	JAMES	950	30]
	
With 2 partitions (1 % 2 = 1 and 3 % 2 = 1, so departments 10 and 30 share part-r-00001, while department 20 maps to 2 % 2 = 0):

	[root@redhat temp]# hdfs dfs -ls /output/w0922de
	Found 3 items
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:36 /output/w0922de/_SUCCESS
	-rw-r--r--   1 root supergroup        118 2017-09-22 10:36 /output/w0922de/part-r-00000
	-rw-r--r--   1 root supergroup        216 2017-09-22 10:36 /output/w0922de/part-r-00001
	[root@redhat temp]# hdfs dfs -cat /output/w0922de/part-r-00000
	20	[7902	FORD	3000	20]
	20	[7788	SCOTT	3000	20]
	20	[7566	JONES	2975	20]
	20	[7876	ADAMS	1100	20]
	20	[7369	SMITH	800	20]
	[root@redhat temp]# hdfs dfs -cat /output/w0922de/part-r-00001
	10	[7934	MILLER	1300	10]
	10	[7839	KING	5000	10]
	10	[7782	CLARK	2450	10]
	30	[7698	BLAKE	2850	30]
	30	[7654	MARTIN	1250	30]
	30	[7900	JAMES	950	30]
	30	[7521	WARD	1250	30]
	30	[7499	ALLEN	1600	30]
	30	[7844	TURNER	1500	30]
	
With 6 partitions (only partitions 1, 2 and 3 receive data; partitions 0, 4 and 5 stay empty):

	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/_SUCCESS
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/part-r-00000
	-rw-r--r--   1 root supergroup         72 2017-09-22 10:19 /output/w0922d/part-r-00001
	-rw-r--r--   1 root supergroup        118 2017-09-22 10:19 /output/w0922d/part-r-00002
	-rw-r--r--   1 root supergroup        144 2017-09-22 10:19 /output/w0922d/part-r-00003
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/part-r-00004
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/part-r-00005
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00000
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00001
	10	[7934	MILLER	1300	10]
	10	[7839	KING	5000	10]
	10	[7782	CLARK	2450	10]
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00002
	20	[7876	ADAMS	1100	20]
	20	[7788	SCOTT	3000	20]
	20	[7369	SMITH	800	20]
	20	[7566	JONES	2975	20]
	20	[7902	FORD	3000	20]
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00003
	30	[7844	TURNER	1500	30]
	30	[7499	ALLEN	1600	30]
	30	[7698	BLAKE	2850	30]
	30	[7654	MARTIN	1250	30]
	30	[7521	WARD	1250	30]
	30	[7900	JAMES	950	30]

Combiner

(1) What is a Combiner?
  A Combiner runs a Reducer-style aggregation on the Map side before data is sent to the Reducer; it is essentially a special kind of Reducer.
(2) Benefit: it reduces the amount of data shipped from the Mappers to the Reducers, which improves performance.
(3) Example: rewrite the WordCount program using a Combiner (see the sketch at the end of this section).

(*) Caveat: use it with care. The classic counterexample is computing an average.

(4) Not every problem can use a Combiner; computing an average is one that cannot.

(5) Introducing a Combiner must not change the result of the original logic; if it would, restructure the computation so that it doesn't, or drop the Combiner.
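A quick worked example of the averaging pitfall: suppose one map task emits the salaries 1000 and 2000 and another map task emits 6000. The correct average is (1000 + 2000 + 6000) / 3 = 3000, but if a Combiner pre-averages each map task's output, the Reducer sees 1500 and 6000 and computes (1500 + 6000) / 2 = 3750. Having the Combiner emit the partial sum and the count instead, and dividing only once in the Reducer, would keep the result correct.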

If the Combiner's output key/value types do not match the map output types the Reducer expects (for example, a Combiner that emits a DoubleWritable average while the job declares LongWritable map output values), the job fails with an error such as:

Error: java.io.IOException: wrong value class: class org.apache.hadoop.io.DoubleWritable is not class org.apache.hadoop.io.LongWritable

To enable a Combiner, add one line to the driver (main) program, typically passing your own Reducer class (MyReducer below stands in for it):

job.setCombinerClass(MyReducer.class);

You can also write a dedicated Combiner class of your own instead of reusing the Reducer.
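A minimal sketch of point (3): a WordCount reducer that is safe to reuse as the Combiner because summing is associative and commutative. The class name WordCountReducer is illustrative, not taken from this post.

package demo.combiner;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//sums the counts for one word; running it on the map side and again on the reduce side gives the same totals
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	@Override
	protected void reduce(Text k3, Iterable<IntWritable> v3, Context context)
			throws IOException, InterruptedException {
		int total = 0;
		for (IntWritable v : v3) {
			total += v.get();
		}
		context.write(k3, new IntWritable(total));
	}
}

In the WordCount driver it would then be registered with job.setCombinerClass(WordCountReducer.class); alongside job.setReducerClass(WordCountReducer.class);.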

Shuffle

Shuffle is the core of MapReduce. The overall flow:

  1. An input data block (128 MB by default) is read in.
  2. It is turned into input splits that the Mappers can process.
  3. Each split is processed by map().
  4. The map output is written into an in-memory ring buffer (100 MB by default).
  5. When the buffer fills past 80%, a spill occurs: the in-memory data is written out to disk (buffer size and spill threshold are configurable; see the sketch after this list).
  6. The spilled data is partitioned and sorted (sort/spill).
  7. The spill files are merged into one large file.
  8. (Optional) A Combiner pass is run over the merged data.
  9. When the map task finishes, its output sits on local disk waiting to be fetched by the Reducers.
  10. The Reducers fetch, merge, and process the data.
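The buffer size and spill threshold mentioned in steps 4 and 5 correspond to standard Hadoop configuration properties. A small sketch of overriding them in a driver follows; the values shown are just the documented defaults, and ShuffleTuningSketch is an illustrative class name.

package demo.shuffle;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ShuffleTuningSketch {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// size of the in-memory map-side sort buffer, in MB (default 100)
		conf.setInt("mapreduce.task.io.sort.mb", 100);
		// fraction of the buffer that triggers a spill to disk (default 0.80)
		conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f);
		// how many spill files are merged at once (default 10)
		conf.setInt("mapreduce.task.io.sort.factor", 10);

		Job job = Job.getInstance(conf);
		// ... set mapper, reducer, and input/output paths as in the partition example above ...
	}
}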

There are many more detailed write-ups of the shuffle phase online.
