MapReduce的高级特性 3、分区: Partition

MapReduce的高级特性

3、分区: Partition

*)MR默认只有一个分区(一个输出的文件);如果有多个分区,就有多个输出文件

      /output/0918/s8/part-r-00000

  /output/0918/s8/part-r-00001

  /output/0918/s8/part-r-00002

  

*)什么是分区?结合一下Oracle的表

*)使用MR实现一个分区:根据员工的部门号,将相同部门号的员工分到同一个分区

  注意:MR是根据Map的输出(k2   v2)进行分区

  

*)日志

17/09/20 21:01:04 INFO mapreduce.Job:  map 0% reduce 0%

17/09/20 21:01:12 INFO mapreduce.Job:  map 100% reduce 0%

17/09/20 21:01:34 INFO mapreduce.Job:  map 100% reduce 33%

17/09/20 21:01:36 INFO mapreduce.Job:  map 100% reduce 67%

17/09/20 21:01:37 INFO mapreduce.Job:  map 100% reduce 100%

*)问题:如何确定建立分区后,提高了性能? ------>  Hive分区表(Hive中SQL的执行计划)

======================================================================

package demo.partition;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;


import org.apache.hadoop.io.Writable;


// Employee record, one per input line. Sample line: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
// Implements Hadoop's Writable so instances can be shuffled between map and reduce.
public class Employee implements Writable{


private int empno;// employee number
private String ename;// employee name
private String job;// job title
private int mgr;// manager's employee number (0 when the record has none)
private String hiredate;// hire date, kept as the raw string from the input
private int sal;// monthly salary
private int comm;// commission (0 when the record has none)
private int deptno;// department number — used as the partitioning key

// Human-readable form used when the reducer writes this object to the output file.
@Override
public String toString() {
return "["+this.empno+"\t"+this.ename+"\t"+this.sal+"\t"+this.deptno+"]";
}


// Serialization: field order here MUST match readFields() exactly,
// since Writable streams carry no field names, only raw values.
@Override
public void write(DataOutput output) throws IOException {
// 序列化:把对象输出 -> serialize: write the object's fields to the stream
output.writeInt(this.empno);
output.writeUTF(this.ename);
output.writeUTF(this.job);
output.writeInt(this.mgr);
output.writeUTF(this.hiredate);
output.writeInt(this.sal);
output.writeInt(this.comm);
output.writeInt(this.deptno);
}

// Deserialization: reads fields back in the same order write() emitted them.
@Override
public void readFields(DataInput input) throws IOException {
// 反序列化:把对象读入 -> deserialize: read the object's fields from the stream
this.empno = input.readInt();
this.ename = input.readUTF();
this.job = input.readUTF();
this.mgr = input.readInt();
this.hiredate = input.readUTF();
this.sal = input.readInt();
this.comm = input.readInt();
this.deptno = input.readInt();
}


public int getEmpno() {
return empno;
}
public void setEmpno(int empno) {
this.empno = empno;
}
public String getEname() {
return ename;
}
public void setEname(String ename) {
this.ename = ename;
}
public String getJob() {
return job;
}
public void setJob(String job) {
this.job = job;
}
public int getMgr() {
return mgr;
}
public void setMgr(int mgr) {
this.mgr = mgr;
}
public String getHiredate() {
return hiredate;
}
public void setHiredate(String hiredate) {
this.hiredate = hiredate;
}
public int getSal() {
return sal;
}
public void setSal(int sal) {
this.sal = sal;
}
public int getComm() {
return comm;
}
public void setComm(int comm) {
this.comm = comm;
}
public int getDeptno() {
return deptno;
}
public void setDeptno(int deptno) {
this.deptno = deptno;
}



}

------------------------------------------------------------------------------------------------------------

package demo.partition;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Partitioning rule: route map output (k2 = deptno, v2 = Employee) to a
// reducer based on the employee's department number.
public class MyEmployeePartitioner extends Partitioner<LongWritable, Employee> {

    @Override
    public int getPartition(LongWritable k2, Employee v2, int numParts) {
        // numParts is the reducer count, configured via job.setNumReduceTasks().
        // Mapping: dept 10 -> partition 1, dept 20 -> partition 2, anything
        // else -> 3 % numParts (partition 0 when numParts == 3).
        switch (v2.getDeptno()) {
            case 10:
                return 1 % numParts;
            case 20:
                return 2 % numParts;
            default:
                return 3 % numParts;
        }
    }
}

-------------------------------------------------------------------------------------------

package demo.partition;


import java.io.IOException;


import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


// Mapper: parses one CSV employee record and emits (deptno, Employee) so the
// partitioner can group employees of the same department onto one reducer.
public class EmpPartionMapper extends Mapper<LongWritable, Text, LongWritable, Employee> {

    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // Input line layout: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
        String[] words = value1.toString().split(",");

        Employee e = new Employee();

        e.setEmpno(Integer.parseInt(words[0]));
        e.setEname(words[1]);
        e.setJob(words[2]);

        // mgr may be an empty field (e.g. the president has no manager);
        // default to 0 in that case. Catch only NumberFormatException so
        // genuinely malformed records still fail loudly instead of being
        // silently swallowed by a broad catch(Exception).
        try {
            e.setMgr(Integer.parseInt(words[3]));
        } catch (NumberFormatException ex) {
            e.setMgr(0);
        }

        e.setHiredate(words[4]);
        e.setSal(Integer.parseInt(words[5]));

        // comm may be an empty field (non-sales staff have no commission);
        // default to 0, again catching only the expected parse failure.
        try {
            e.setComm(Integer.parseInt(words[6]));
        } catch (NumberFormatException ex) {
            e.setComm(0);
        }

        e.setDeptno(Integer.parseInt(words[7]));

        // Partitioning is driven by the map output key, so emit deptno as k2.
        context.write(new LongWritable(e.getDeptno()), e);
    }
}

-----------------------------------------------------------------------------------------------

package demo.partition;


import java.io.IOException;


import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;




// Reducer: receives all employees of one department (k3 = deptno) and writes
// each one back out keyed by its employee number (k4 = empno, v4 = Employee),
// so every partition file on HDFS holds exactly one department's records.
public class EmpPartionReducer extends Reducer<LongWritable, Employee, LongWritable, Employee> {

    @Override
    protected void reduce(LongWritable k3, Iterable<Employee> v3, Context context)
            throws IOException, InterruptedException {
        for (Employee employee : v3) {
            context.write(new LongWritable(employee.getEmpno()), employee);
        }
    }
}

----------------------------------------------------------------------------------------------------

package demo.partition;


import java.io.IOException;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;




// Driver: configures and submits the partitioned-employee job.
// Usage: EmpPartitionMain <input path> <output path>
public class EmpPartitionMain {

    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: EmpPartitionMain <input path> <output path>");
            System.exit(2);
        }

        // A job = map + reduce.
        Job job = Job.getInstance(new Configuration());
        // Entry point of the job jar.
        job.setJarByClass(EmpPartitionMain.class);

        // Map phase and its output types (k2 = deptno, v2 = Employee).
        job.setMapperClass(EmpPartionMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Employee.class);

        // Custom partitioning rule; the reducer count is the partition count,
        // producing one output file per partition (part-r-00000 .. 00002).
        job.setPartitionerClass(MyEmployeePartitioner.class);
        job.setNumReduceTasks(3);

        // Reduce phase and final output types (k4 = empno, v4 = Employee).
        job.setReducerClass(EmpPartionReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Employee.class);

        // HDFS input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate job success/failure to the process exit code; the original
        // discarded waitForCompletion's result and always exited 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}










  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值