MapReduce Advanced: Combiner and Partitioner


Partitioning

Multiple partitions mean multiple output files.
Partition types

  • Range partitioning
  • List partitioning
  • Hash partitioning
  • Range-hash partitioning
  • Range-list partitioning

Hash partitioning (the idea behind Hive bucket tables) spreads records evenly and helps avoid hot spots.
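For contrast with the department-based partitioner built below, hash partitioning can be sketched as a Partitioner that simply hashes the key. The class name HashStylePartitioner is illustrative (it is not part of Hadoop); Hadoop's own default partitioner, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner, uses essentially the same expression.

package demo.partition;

import org.apache.hadoop.mapreduce.Partitioner;

//Hash-style partitioner: spread keys evenly across all reduce tasks
public class HashStylePartitioner<K, V> extends Partitioner<K, V> {

	@Override
	public int getPartition(K key, V value, int numParts) {
		// mask off the sign bit so the result is non-negative, then take the remainder
		return (key.hashCode() & Integer.MAX_VALUE) % numParts;
	}
}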
MapReduce example
  Use MapReduce to implement partitioning: route employees with the same department number to the same partition.

Key point: the partitions are created on the Mapper's output, i.e. the Partitioner operates on k2 and v2.

  1. Create a serializable (Writable) class, Employee
package demo.partition;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

//Employee entity class
public class Employee implements Writable{
	private int empno;
	private String ename;
	private String job;
	private int mgr;
	private String hiredata;//hire date
	private int sal;//monthly salary
	private int comm;//commission (bonus)
	private int deptno;//department number
	
	
	@Override
	public String toString() {
		return "["+this.empno+"\t"+this.ename+"\t"+this.sal+"\t"+this.deptno+"]";
	}

	@Override
	public void write(DataOutput output) throws IOException {
		// serialize: write the fields in a fixed order
		output.writeInt(this.empno);
		output.writeUTF(this.ename);
		output.writeUTF(this.job);
		output.writeInt(this.mgr);
		output.writeUTF(this.hiredata);
		output.writeInt(this.sal);
		output.writeInt(this.comm);
		output.writeInt(this.deptno);
	}
	
	@Override
	public void readFields(DataInput input) throws IOException {
		// deserialize: read the fields in the same order they were written
		this.empno = input.readInt();
		this.ename=input.readUTF();
		this.job = input.readUTF();
		this.mgr = input.readInt();
		this.hiredata = input.readUTF();
		this.sal = input.readInt();
		this.comm = input.readInt();
		this.deptno = input.readInt();
	}

	
	
	public int getEmpno() {
		return empno;
	}
	public void setEmpno(int empno) {
		this.empno = empno;
	}
	public String getEname() {
		return ename;
	}
	public void setEname(String ename) {
		this.ename = ename;
	}
	public String getJob() {
		return job;
	}
	public void setJob(String job) {
		this.job = job;
	}
	public int getMgr() {
		return mgr;
	}
	public void setMgr(int mgr) {
		this.mgr = mgr;
	}
	public String getHiredata() {
		return hiredata;
	}
	public void setHiredata(String hiredata) {
		this.hiredata = hiredata;
	}
	public int getSal() {
		return sal;
	}
	public void setSal(int sal) {
		this.sal = sal;
	}
	public int getComm() {
		return comm;
	}
	public void setComm(int comm) {
		this.comm = comm;
	}
	public int getDeptno() {
		return deptno;
	}
	public void setDeptno(int deptno) {
		this.deptno = deptno;
	}

}
  2. Create the Mapper program
package demo.partition;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class EmpPartitionMapper extends Mapper<LongWritable, Text, LongWritable, Employee> {

	
	@Override
	protected void map(LongWritable k1, Text v1, Context context)
			throws IOException, InterruptedException {
		// split the CSV line into fields
		String data=v1.toString();
		String[] words = data.split(",");
		Employee e = new Employee();
		e.setEmpno(Integer.parseInt(words[0]));
		e.setEname(words[1]);
		e.setJob(words[2]);
		try{
			e.setMgr(Integer.parseInt(words[3]));
		}catch(Exception ex){
			// mgr may be empty (the employee has no manager); default to 0
			e.setMgr(0);
		}
		e.setHiredata(words[4]);
		e.setSal(Integer.parseInt(words[5]));
		try{
			e.setComm(Integer.parseInt(words[6]));
		}catch(Exception ex){
			// comm may be empty (no commission); default to 0
			e.setComm(0);
		}
		e.setDeptno(Integer.parseInt(words[7]));

		context.write(new LongWritable(e.getDeptno()), e);
	}

}
  3. Create the Reducer program
package demo.partition;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;

//emit each employee under its department key; the class name matches the driver below
public class EmpPartitionReducer extends Reducer<LongWritable, Employee, LongWritable, Employee> {

	@Override
	protected void reduce(LongWritable k3, Iterable<Employee> v3,Context context)
					throws IOException, InterruptedException {
			for(Employee e:v3){
				context.write(k3, e);
				
			}
	}
	

}
  4. Create the Partitioner program
package demo.partition;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class EmpPartitioner extends Partitioner<LongWritable, Employee>{

	@Override
	public int getPartition(LongWritable k2, Employee v2, int numParts) {
			//define the partitioning rule
			//numParts is the number of reduce tasks configured on the job
		if(v2.getDeptno()==10){
			//department 10 goes to partition 1
			return 1%numParts;
		}else if(v2.getDeptno()==20){
			//department 20 goes to partition 2
			return 2%numParts;
		}else{
			//every other department goes to partition 3 % numParts
			return 3%numParts;
		}
	}

}
  5. Create the main (driver) program
package demo.partition;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class EmpPartitionMain {

	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(EmpPartitionMain.class);
		
		
		job.setMapperClass(EmpPartitionMapper.class);
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(Employee.class);
		
		//specify the partitioner for the job
		job.setPartitionerClass(EmpPartitioner.class);
		//specify the number of reduce tasks (one output file per partition)
		job.setNumReduceTasks(3);
		
		job.setReducerClass(EmpPartitionReducer.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(Employee.class);
		
		FileInputFormat.setInputPaths(job,new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.waitForCompletion(true);
		
		
	}

}
  6. Build the jar file, part.jar
  7. Run the jar on Linux
hadoop jar part.jar /input/emp.csv /output/w0922a

8. Results with the number of partitions set to 3. With numParts = 3, getPartition returns 1 for department 10, 2 for department 20, and 3 % 3 = 0 for everything else, so department 30 ends up in part-r-00000, department 10 in part-r-00001, and department 20 in part-r-00002:

[root@redhat temp]# hdfs dfs -ls /output/w0922B
Found 4 items
-rw-r--r--   1 root supergroup          0 2017-09-22 10:08 /output/w0922B/_SUCCESS
-rw-r--r--   1 root supergroup        144 2017-09-22 10:07 /output/w0922B/part-r-00000
-rw-r--r--   1 root supergroup         72 2017-09-22 10:08 /output/w0922B/part-r-00001
-rw-r--r--   1 root supergroup        118 2017-09-22 10:08 /output/w0922B/part-r-00002
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00000
30	[7654	MARTIN	1250	30]
30	[7900	JAMES	950	30]
30	[7698	BLAKE	2850	30]
30	[7521	WARD	1250	30]
30	[7844	TURNER	1500	30]
30	[7499	ALLEN	1600	30]
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00001
10	[7934	MILLER	1300	10]
10	[7839	KING	5000	10]
10	[7782	CLARK	2450	10]
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00002
20	[7788	SCOTT	3000	20]
20	[7566	JONES	2975	20]
20	[7876	ADAMS	1100	20]
20	[7902	FORD	3000	20]
20	[7369	SMITH	800	20]
  9. Extension: results when the number of partitions is set to 1, 2, and 6
With 1 partition (every return value mod 1 is 0, so all employees land in part-r-00000):
	[root@redhat temp]# hdfs dfs -cat /output/w0922f/part-r-00000
	10	[7934	MILLER	1300	10]
	10	[7839	KING	5000	10]
	10	[7782	CLARK	2450	10]
	20	[7876	ADAMS	1100	20]
	20	[7788	SCOTT	3000	20]
	20	[7369	SMITH	800	20]
	20	[7566	JONES	2975	20]
	20	[7902	FORD	3000	20]
	30	[7844	TURNER	1500	30]
	30	[7499	ALLEN	1600	30]
	30	[7698	BLAKE	2850	30]
	30	[7654	MARTIN	1250	30]
	30	[7521	WARD	1250	30]
	30	[7900	JAMES	950	30]
	
With 2 partitions (1 % 2 = 1 and 3 % 2 = 1, so departments 10 and 30 share part-r-00001, while department 20 maps to 2 % 2 = 0):

	[root@redhat temp]# hdfs dfs -ls /output/w0922de
	Found 3 items
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:36 /output/w0922de/_SUCCESS
	-rw-r--r--   1 root supergroup        118 2017-09-22 10:36 /output/w0922de/part-r-00000
	-rw-r--r--   1 root supergroup        216 2017-09-22 10:36 /output/w0922de/part-r-00001
	[root@redhat temp]# hdfs dfs -cat /output/w0922de/part-r-00000
	20	[7902	FORD	3000	20]
	20	[7788	SCOTT	3000	20]
	20	[7566	JONES	2975	20]
	20	[7876	ADAMS	1100	20]
	20	[7369	SMITH	800	20]
	[root@redhat temp]# hdfs dfs -cat /output/w0922de/part-r-00001
	10	[7934	MILLER	1300	10]
	10	[7839	KING	5000	10]
	10	[7782	CLARK	2450	10]
	30	[7698	BLAKE	2850	30]
	30	[7654	MARTIN	1250	30]
	30	[7900	JAMES	950	30]
	30	[7521	WARD	1250	30]
	30	[7499	ALLEN	1600	30]
	30	[7844	TURNER	1500	30]
	
With 6 partitions (only partitions 1, 2 and 3 receive data; partitions 0, 4 and 5 stay empty):

	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/_SUCCESS
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/part-r-00000
	-rw-r--r--   1 root supergroup         72 2017-09-22 10:19 /output/w0922d/part-r-00001
	-rw-r--r--   1 root supergroup        118 2017-09-22 10:19 /output/w0922d/part-r-00002
	-rw-r--r--   1 root supergroup        144 2017-09-22 10:19 /output/w0922d/part-r-00003
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/part-r-00004
	-rw-r--r--   1 root supergroup          0 2017-09-22 10:19 /output/w0922d/part-r-00005
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00000
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00001
	10	[7934	MILLER	1300	10]
	10	[7839	KING	5000	10]
	10	[7782	CLARK	2450	10]
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00002
	20	[7876	ADAMS	1100	20]
	20	[7788	SCOTT	3000	20]
	20	[7369	SMITH	800	20]
	20	[7566	JONES	2975	20]
	20	[7902	FORD	3000	20]
	[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00003
	30	[7844	TURNER	1500	30]
	30	[7499	ALLEN	1600	30]
	30	[7698	BLAKE	2850	30]
	30	[7654	MARTIN	1250	30]
	30	[7521	WARD	1250	30]
	30	[7900	JAMES	950	30]

Combiner

(1) What is a Combiner?
  A Combiner runs a Reducer-style aggregation on the Map side before data is sent to the Reducer; it is essentially a special kind of Reducer.
(2) Benefit: it reduces the amount of data shipped from the Mappers to the Reducers, which improves performance.
(3) Example: rewrite the WordCount program using a Combiner (see the sketch at the end of this section).

(*) Caveat: use it with care. The classic counterexample is computing an average.

(4) Not every problem can use a Combiner; computing an average is one that cannot.

(5) Introducing a Combiner must not change the result of the original logic; if it would, restructure the computation so that it doesn't, or drop the Combiner.
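A quick worked example of the averaging pitfall: suppose one map task emits the salaries 1000 and 2000 and another map task emits 6000. The correct average is (1000 + 2000 + 6000) / 3 = 3000, but if a Combiner pre-averages each map task's output, the Reducer sees 1500 and 6000 and computes (1500 + 6000) / 2 = 3750. Having the Combiner emit the partial sum and the count instead, and dividing only once in the Reducer, would keep the result correct.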

If the Combiner's output key/value types do not match the map output types the Reducer expects (for example, a Combiner that emits a DoubleWritable average while the job declares LongWritable map output values), the job fails with an error such as:

Error: java.io.IOException: wrong value class: class org.apache.hadoop.io.DoubleWritable is not class org.apache.hadoop.io.LongWritable

To enable a Combiner, add one line to the driver (main) program, typically passing your own Reducer class (MyReducer below stands in for it):

job.setCombinerClass(MyReducer.class);

You can also write a dedicated Combiner class of your own instead of reusing the Reducer.
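A minimal sketch of point (3): a WordCount reducer that is safe to reuse as the Combiner because summing is associative and commutative. The class name WordCountReducer is illustrative, not taken from this post.

package demo.combiner;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//sums the counts for one word; running it on the map side and again on the reduce side gives the same totals
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	@Override
	protected void reduce(Text k3, Iterable<IntWritable> v3, Context context)
			throws IOException, InterruptedException {
		int total = 0;
		for (IntWritable v : v3) {
			total += v.get();
		}
		context.write(k3, new IntWritable(total));
	}
}

In the WordCount driver it would then be registered with job.setCombinerClass(WordCountReducer.class); alongside job.setReducerClass(WordCountReducer.class);.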

Shuffle

Shuffle is the core of MapReduce. The overall flow:

  1. An input data block (128 MB by default) is read in.
  2. It is turned into input splits that the Mappers can process.
  3. Each split is processed by map().
  4. The map output is written into an in-memory ring buffer (100 MB by default).
  5. When the buffer fills past 80%, a spill occurs: the in-memory data is written out to disk (buffer size and spill threshold are configurable; see the sketch after this list).
  6. The spilled data is partitioned and sorted (sort/spill).
  7. The spill files are merged into one large file.
  8. (Optional) A Combiner pass is run over the merged data.
  9. When the map task finishes, its output sits on local disk waiting to be fetched by the Reducers.
  10. The Reducers fetch, merge, and process the data.
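The buffer size and spill threshold mentioned in steps 4 and 5 correspond to standard Hadoop configuration properties. A small sketch of overriding them in a driver follows; the values shown are just the documented defaults, and ShuffleTuningSketch is an illustrative class name.

package demo.shuffle;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ShuffleTuningSketch {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// size of the in-memory map-side sort buffer, in MB (default 100)
		conf.setInt("mapreduce.task.io.sort.mb", 100);
		// fraction of the buffer that triggers a spill to disk (default 0.80)
		conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f);
		// how many spill files are merged at once (default 10)
		conf.setInt("mapreduce.task.io.sort.factor", 10);

		Job job = Job.getInstance(conf);
		// ... set mapper, reducer, and input/output paths as in the partition example above ...
	}
}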

There are many more detailed write-ups of the shuffle phase online.
