1.要求
编写程序,将对员工数据按低薪、中薪、高薪进行分区存储。输出到三个文件。
特殊要求:
结合课堂学习的知识,职工信息采用一个独立的类存放,并且实现Hadoop序列化。
2.数据表emp.csv
7369,SMITH,CLERK,7902,1980/12/17,800,,20
7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
7521,WARD,SALESMAN,7698,1981/2/22,1250,500,30
7566,JONES,MANAGER,7839,1981/4/2,2975,,20
7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981/5/1,2850,,30
7782,CLARK,MANAGER,7839,1981/6/9,2450,,10
7788,SCOTT,ANALYST,7566,1987/4/19,3000,,20
7839,KING,PRESIDENT,,1981/11/17,5000,,10
7844,TURNER,SALESMAN,7698,1981/9/8,1500,0,30
7876,ADAMS,CLERK,7788,1987/5/23,1100,,20
7900,JAMES,CLERK,7698,1981/12/3,950,,30
7902,FORD,ANALYST,7566,1981/12/3,3000,,20
7934,MILLER,CLERK,7782,1982/1/23,1300,,10
将此文件保存到hdfs,作为输入文件。
假如薪资<1500,为低薪,
假如薪资>=1500,薪资<3000为中薪,
假如薪资>=3000,为高薪。
原理
3.代码
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Driver: configures and submits the salary-band partitioning MapReduce job.
 * Usage: hadoop jar mypro.jar MyemployeeMain &lt;input path&gt; &lt;output dir&gt;
 * args[0] = HDFS path of emp.csv, args[1] = HDFS output directory (must not exist).
 */
public class MyemployeeMain {
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: MyemployeeMain <input path> <output path>");
            System.exit(2);
        }
        // 1. Create the job and record the entry-point class (so Hadoop can locate the jar).
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MyemployeeMain.class);
        // 2. Mapper and its output types <k2, v2> = <empno, Employee>.
        job.setMapperClass(MyemployeeMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Employee.class);
        // Custom partitioner: routes each record by salary band (low/mid/high).
        job.setPartitionerClass(Myemployee.class);
        // Three reduce tasks -> three output files, one per salary band.
        job.setNumReduceTasks(3);
        // 3. Reducer and final output types <k4, v4> = <empno, Text>.
        job.setReducerClass(MyemployeeReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        // 4. Input and output paths from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 5. Run the job and propagate success/failure through the exit code
        //    (the original ignored the boolean and always exited 0).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
/**
 * Partitions employees into three salary bands:
 * partition 0 = low (sal &lt; 1500), 1 = mid (1500 &lt;= sal &lt; 3000), 2 = high (sal &gt;= 3000).
 * The original returned 1%n / 2%n / 3%n, which (with 3 reducers) put the
 * high-salary band in output file 0 — correct but needlessly confusing, and
 * its comment ("one partition per department") was copied from a different example.
 */
class Myemployee extends Partitioner<IntWritable, Employee> {
    @Override
    public int getPartition(IntWritable k2, Employee v2, int numPartitions) {
        int band;
        if (v2.getSal() < 1500) {
            band = 0; // low salary
        } else if (v2.getSal() < 3000) {
            band = 1; // mid salary (the extra >= 1500 check was redundant)
        } else {
            band = 2; // high salary
        }
        // Stay valid even if the job is configured with fewer than 3 reducers.
        return band % numPartitions;
    }
}
class Employee implements Writable {
//字段名 EMPNO, ENAME, JOB, MGR, HIREDATE, SAL, COMM, DEPTNO
//数据类型:Int,Char, Char , Int, Date , Int Int, Int
//数据: 7654, MARTIN, SALESMAN, 7698, 1981/9/28, 1250, 1400, 30
//由以上定义变量
private int empno;
private String ename;
private String job;
private int mgr;
private String hiredate;
private int sal;
private int comm;//奖金
private int deptno;
public void write(DataOutput out) throws IOException {
out.writeInt(this.empno);
out.writeUTF(this.ename);
out.writeUTF(this.job);
out.writeInt(this.mgr);
out.writeUTF(this.hiredate);
out.writeInt(this.sal);
out.writeInt(this.comm);
out.writeInt(this.deptno);
}
//反序列化方法:将可跨机器传输数据流(二进制串)转化为java对象的一种技术**
//反序列化方法:将可跨机器传输数据流(二进制串)转化为java对象的一种技术**
//反序列化方法:将可跨机器传输数据流(二进制串)转化为java对象的一种技术**
public void readFields(DataInput in) throws IOException {
this.empno = in.readInt();
this.ename = in.readUTF();
this.job = in.readUTF();
this.mgr = in.readInt();
this.hiredate = in.readUTF();
this.sal = in.readInt();
this.comm = in.readInt();
this.deptno = in.readInt();
}
//其他类通过set/get方法操作变量:Source-->Generator Getters and Setters
public int getEmpno() { return empno; }
public void setEmpno(int empno) {
this.empno = empno;
}
public String getEname() {
return ename;
}
public void setEname(String ename) {
this.ename = ename;
}
public String getJob() {
return job;
}
public void setJob(String job) {
this.job = job;
}
public int getMgr() {
return mgr;
}
public void setMgr(int mgr) {
this.mgr = mgr;
}
public String getHiredate() {
return hiredate;
}
public void setHiredate(String hiredate) {
this.hiredate = hiredate;
}
public int getSal() {
return sal;
}
public void setSal(int sal) {
this.sal = sal;
}
public int getComm() {
return comm;
}
public void setComm(int comm) {
this.comm = comm;
}
public int getDeptno() {
return deptno;
}
public void setDeptno(int deptno) {
this.deptno = deptno;
}
}
/**
 * Parses one CSV line into an Employee and emits &lt;empno, Employee&gt;.
 * (The original comment claimed the key was the department number; the code
 * actually keys by employee number — partitioning happens by salary in the
 * custom Partitioner, not by key.)
 */
class MyemployeeMapper extends Mapper<LongWritable, Text, IntWritable, Employee> {
    @Override
    protected void map(LongWritable k1, Text v1,
                       Context context)
            throws IOException, InterruptedException {
        // Input line, e.g.: 7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
        // limit -1 keeps trailing empty fields (an empty COMM would otherwise be dropped).
        String[] words = v1.toString().split(",", -1);
        if (words.length < 8) {
            return; // skip blank or malformed lines instead of failing the whole task
        }
        Employee emp = new Employee();
        emp.setEmpno(Integer.parseInt(words[0]));
        emp.setEname(words[1]);
        emp.setJob(words[2]);
        // MGR is empty for the president; use -1 as the "no manager" sentinel.
        // (Explicit emptiness check replaces the original try/catch control flow.)
        emp.setMgr(words[3].isEmpty() ? -1 : Integer.parseInt(words[3]));
        emp.setHiredate(words[4]);
        emp.setSal(Integer.parseInt(words[5]));
        // COMM is empty for non-salesmen; default to 0.
        emp.setComm(words[6].isEmpty() ? 0 : Integer.parseInt(words[6]));
        emp.setDeptno(Integer.parseInt(words[7]));
        // k2 = employee number; the salary-band Partitioner decides the output file.
        context.write(new IntWritable(emp.getEmpno()), emp);
    }
}
class MyemployeeReducer extends Reducer<IntWritable,Employee,IntWritable,Employee> {
protected void reduce(IntWritable k3, Iterable<Employee> v3,
Context context) throws IOException, InterruptedException {
for (Employee e : v3) {
context.write(k3, e);
}
}
}
4.打包jar包
之后我们需要右击项目找到Open Module Settings
找到Artifacts→JAR→From modules with dependencies…
找到主类MyemployeeMain(即包含main方法的驱动类)
在主页面找到Build → Build Artifacts→Build
之后我们可以发现目录树中出现了一个out目录,在其子目录中有一个
mypro.jar
文件,这个就是我们打包的文件
5.将打包的jar文件上传到hdfs
hdfs dfs -put 虚拟机本地jar包存储完整路径 hdfs文件系统存储的路径
6.利用Hadoop集群运行该jar包
hadoop jar 项目名.jar 输入目录 输出目录
7.演示
这是emp.csv文件,存有职工信息
写入命令 mapreduce开始工作
结果
8.总结
将程序打包并上传后,通过Partitioner分区,将文件按照薪资的水平分为低薪、中薪和高薪,在hdfs输出目录下生成三个分区文件,各自存放对应薪资档次的员工记录。