分区
多个分区,有多个文件
分区类型
- 范围分区
- 列表分区
- 散列分区
- 范围-散列
- 范围-列表
散列分区:(类似 Hive 桶表) 避免热点数据的产生
MapReduce实例
使用MR实现一个分区:根据员工的部门号,将相同部门号的员工分到同一个分区
要点:在Mapper的输出创建分区 即对k2 v2进行操作
- 创建一个序列化对象 Employee
package demo.partition;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * Employee record used as the MapReduce value type (v2 / v4).
 *
 * Implements Hadoop's {@link Writable} so instances can be serialized
 * between the Map and Reduce phases. The field order in {@link #write}
 * and {@link #readFields} must stay identical — Hadoop streams the
 * fields positionally, not by name.
 */
public class Employee implements Writable {

    private int empno;        // employee number
    private String ename;     // employee name
    private String job;       // job title
    private int mgr;          // manager's empno (0 when absent)
    private String hiredata;  // hire date (kept as raw string from CSV)
    private int sal;          // monthly salary
    private int comm;         // commission/bonus (0 when absent)
    private int deptno;       // department number — used as the partition key

    /** Compact one-line rendering used in the job output files. */
    @Override
    public String toString() {
        return "[" + this.empno + "\t" + this.ename + "\t" + this.sal + "\t" + this.deptno + "]";
    }

    /** Serialization: write fields in fixed positional order. */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeInt(this.empno);
        output.writeUTF(this.ename);
        output.writeUTF(this.job);
        output.writeInt(this.mgr);
        output.writeUTF(this.hiredata);
        output.writeInt(this.sal);
        output.writeInt(this.comm);
        output.writeInt(this.deptno);
    }

    /** Deserialization: read fields in the exact order written above. */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.empno = input.readInt();
        this.ename = input.readUTF();
        this.job = input.readUTF();
        this.mgr = input.readInt();
        this.hiredata = input.readUTF();
        this.sal = input.readInt();
        this.comm = input.readInt();
        this.deptno = input.readInt();
    }

    public int getEmpno() {
        return empno;
    }

    public void setEmpno(int empno) {
        this.empno = empno;
    }

    public String getEname() {
        return ename;
    }

    public void setEname(String ename) {
        this.ename = ename;
    }

    public String getJob() {
        return job;
    }

    public void setJob(String job) {
        this.job = job;
    }

    public int getMgr() {
        return mgr;
    }

    public void setMgr(int mgr) {
        this.mgr = mgr;
    }

    public String getHiredata() {
        return hiredata;
    }

    public void setHiredata(String hiredata) {
        this.hiredata = hiredata;
    }

    public int getSal() {
        return sal;
    }

    public void setSal(int sal) {
        this.sal = sal;
    }

    public int getComm() {
        return comm;
    }

    public void setComm(int comm) {
        this.comm = comm;
    }

    public int getDeptno() {
        return deptno;
    }

    public void setDeptno(int deptno) {
        this.deptno = deptno;
    }
}
- 创建一个Mapper程序
package demo.partition;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper: parses one CSV line of employee data and emits
 * (deptno, Employee), so the custom partitioner can route records
 * by department number.
 *
 * Expected input columns:
 *   empno,ename,job,mgr,hiredate,sal,comm,deptno
 */
public class EmpPartitionMapper extends Mapper<LongWritable, Text, LongWritable, Employee> {

    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        // Split the CSV line into its fields
        String[] words = v1.toString().split(",");

        Employee e = new Employee();
        e.setEmpno(Integer.parseInt(words[0]));
        e.setEname(words[1]);
        e.setJob(words[2]);
        // mgr may be empty (e.g. the president has no manager): default to 0.
        // Catch only NumberFormatException so genuinely malformed rows fail fast.
        try {
            e.setMgr(Integer.parseInt(words[3]));
        } catch (NumberFormatException ex) {
            e.setMgr(0);
        }
        e.setHiredata(words[4]);
        e.setSal(Integer.parseInt(words[5]));
        // comm may be empty for non-sales employees: default to 0
        try {
            e.setComm(Integer.parseInt(words[6]));
        } catch (NumberFormatException ex) {
            e.setComm(0);
        }
        e.setDeptno(Integer.parseInt(words[7]));

        // k2 = department number (partition key), v2 = full employee record
        context.write(new LongWritable(e.getDeptno()), e);
    }
}
- 创建一个Reducer程序
package demo.partition;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Identity reducer: emits every employee of a department unchanged.
 *
 * NOTE: the original notes had this class as {@code EmployeeReducer} in
 * package {@code demo.serializable.mr} (copied from an earlier example),
 * while the driver registers {@code EmpPartitionReducer} from
 * {@code demo.partition} — fixed here so the job actually compiles.
 */
public class EmpPartitionReducer extends Reducer<LongWritable, Employee, LongWritable, Employee> {

    @Override
    protected void reduce(LongWritable k3, Iterable<Employee> v3, Context context)
            throws IOException, InterruptedException {
        // Pass every record through; partitioning already grouped by deptno.
        for (Employee e : v3) {
            context.write(k3, e);
        }
    }
}
- 创建一个Partition程序
package demo.partition;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Custom partitioner: routes each (deptno, Employee) pair to a reducer
 * based on the department number.
 *
 * Department 10 goes to logical slot 1, department 20 to slot 2, and
 * everything else to slot 3; each slot is wrapped modulo the actual
 * reducer count so the job works for any setNumReduceTasks value.
 */
public class EmpPartitioner extends Partitioner<LongWritable, Employee> {

    @Override
    public int getPartition(LongWritable k2, Employee v2, int numParts) {
        // Pick the logical slot for this department, then wrap by numParts.
        final int slot;
        switch (v2.getDeptno()) {
            case 10:
                slot = 1;
                break;
            case 20:
                slot = 2;
                break;
            default:
                slot = 3;
                break;
        }
        return slot % numParts;
    }
}
- 创建一个Main程序
package demo.partition;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Job driver: wires the mapper, custom partitioner, and reducer together.
 *
 * Usage: hadoop jar part.jar &lt;input path&gt; &lt;output path&gt;
 * The output path must not already exist.
 */
public class EmpPartitionMain {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(EmpPartitionMain.class);

        // Map stage: CSV line -> (deptno, Employee)
        job.setMapperClass(EmpPartitionMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Employee.class);

        // Register the custom partition rule
        job.setPartitionerClass(EmpPartitioner.class);
        // One reducer per partition: three departments -> three output files
        job.setNumReduceTasks(3);

        // Reduce stage: identity pass-through
        job.setReducerClass(EmpPartitionReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Employee.class);

        // args[0] = input path, args[1] = output path
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result so the shell sees a non-zero exit on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- 生成jar包part.jar
- 在linux上执行jar包
hadoop jar part.jar /input/emp.csv /output/w0922a
7.执行结果 分区数为3时
[root@redhat temp]# hdfs dfs -ls /output/w0922B
Found 4 items
-rw-r--r-- 1 root supergroup 0 2017-09-22 10:08 /output/w0922B/_SUCCESS
-rw-r--r-- 1 root supergroup 144 2017-09-22 10:07 /output/w0922B/part-r-00000
-rw-r--r-- 1 root supergroup 72 2017-09-22 10:08 /output/w0922B/part-r-00001
-rw-r--r-- 1 root supergroup 118 2017-09-22 10:08 /output/w0922B/part-r-00002
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00000
30 [7654 MARTIN 1250 30]
30 [7900 JAMES 950 30]
30 [7698 BLAKE 2850 30]
30 [7521 WARD 1250 30]
30 [7844 TURNER 1500 30]
30 [7499 ALLEN 1600 30]
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00001
10 [7934 MILLER 1300 10]
10 [7839 KING 5000 10]
10 [7782 CLARK 2450 10]
[root@redhat temp]# hdfs dfs -cat /output/w0922B/part-r-00002
20 [7788 SCOTT 3000 20]
20 [7566 JONES 2975 20]
20 [7876 ADAMS 1100 20]
20 [7902 FORD 3000 20]
20 [7369 SMITH 800 20]
- 拓展 如果将分区数设置成1 ,2 ,6 的结果
分区数为1时
[root@redhat temp]# hdfs dfs -cat /output/w0922f/part-r-00000
10 [7934 MILLER 1300 10]
10 [7839 KING 5000 10]
10 [7782 CLARK 2450 10]
20 [7876 ADAMS 1100 20]
20 [7788 SCOTT 3000 20]
20 [7369 SMITH 800 20]
20 [7566 JONES 2975 20]
20 [7902 FORD 3000 20]
30 [7844 TURNER 1500 30]
30 [7499 ALLEN 1600 30]
30 [7698 BLAKE 2850 30]
30 [7654 MARTIN 1250 30]
30 [7521 WARD 1250 30]
30 [7900 JAMES 950 30]
分区数为2时
[root@redhat temp]# hdfs dfs -ls /output/w0922de
Found 3 items
-rw-r--r-- 1 root supergroup 0 2017-09-22 10:36 /output/w0922de/_SUCCESS
-rw-r--r-- 1 root supergroup 118 2017-09-22 10:36 /output/w0922de/part-r-00000
-rw-r--r-- 1 root supergroup 216 2017-09-22 10:36 /output/w0922de/part-r-00001
[root@redhat temp]# hdfs dfs -cat /output/w0922de/part-r-00000
20 [7902 FORD 3000 20]
20 [7788 SCOTT 3000 20]
20 [7566 JONES 2975 20]
20 [7876 ADAMS 1100 20]
20 [7369 SMITH 800 20]
[root@redhat temp]# hdfs dfs -cat /output/w0922de/part-r-00001
10 [7934 MILLER 1300 10]
10 [7839 KING 5000 10]
10 [7782 CLARK 2450 10]
30 [7698 BLAKE 2850 30]
30 [7654 MARTIN 1250 30]
30 [7900 JAMES 950 30]
30 [7521 WARD 1250 30]
30 [7499 ALLEN 1600 30]
30 [7844 TURNER 1500 30]
分区数为6时
-rw-r--r-- 1 root supergroup 0 2017-09-22 10:19 /output/w0922d/_SUCCESS
-rw-r--r-- 1 root supergroup 0 2017-09-22 10:19 /output/w0922d/part-r-00000
-rw-r--r-- 1 root supergroup 72 2017-09-22 10:19 /output/w0922d/part-r-00001
-rw-r--r-- 1 root supergroup 118 2017-09-22 10:19 /output/w0922d/part-r-00002
-rw-r--r-- 1 root supergroup 144 2017-09-22 10:19 /output/w0922d/part-r-00003
-rw-r--r-- 1 root supergroup 0 2017-09-22 10:19 /output/w0922d/part-r-00004
-rw-r--r-- 1 root supergroup 0 2017-09-22 10:19 /output/w0922d/part-r-00005
[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00000
[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00001
10 [7934 MILLER 1300 10]
10 [7839 KING 5000 10]
10 [7782 CLARK 2450 10]
[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00002
20 [7876 ADAMS 1100 20]
20 [7788 SCOTT 3000 20]
20 [7369 SMITH 800 20]
20 [7566 JONES 2975 20]
20 [7902 FORD 3000 20]
[root@redhat temp]# hdfs dfs -cat /output/w0922d/part-r-00003
30 [7844 TURNER 1500 30]
30 [7499 ALLEN 1600 30]
30 [7698 BLAKE 2850 30]
30 [7654 MARTIN 1250 30]
30 [7521 WARD 1250 30]
30 [7900 JAMES 950 30]
合并:Combiner
(1)什么是合并?
在Map端先进行一次Reducer的操作,Combiner是一种特殊的Reducer
(2)好处:减少Map输出到Reducer中的数据量,从而提高性能
(3)举例:使用Combiner重写WordCount程序
(*)注意事项: ----> 编程:求平均值 谨慎使用!!!!
(4)并不是所有的问题都可以使用Combiner: 求平均值
(5)引入了Combiner后,不能改变原来的逻辑;如果改变了,想个办法,让他不改变。
Error: java.io.IOException: wrong value class: class org.apache.hadoop.io.DoubleWritable is not class org.apache.hadoop.io.LongWritable
编程在主函数添加
job.setCombinerClass(Reducer.class);
Combiner可以自己进行修改编程
洗牌Shuffle
MapReduce的核心
- 数据块(128M)进入
- 转换为可被Mapper处理的切片
- 进入map处理
- 将map后的信息输入内存缓存区(100M)
- 当数据达到缓冲区容量的80%时,会发生溢写(Spill),将内存中的数据写到磁盘上
- 对溢写出的文件进行分区、排序(Partition、Sort)等操作
- 合并文件成一个大文件
- (非必须)将大文件进行Combiner操作
- Mapper结束后,输出结果存储在本地磁盘上,等待Reducer来拉取
- Reducer进行调度