MapReduce之自定义分组
一、需求说明
- 说明: 分组是一种特殊的比较器,对key做比较,并进行归并,类似于合并同类项,也类似于SQL中的分组查询
- 要求: 通过自定义分组比较器实现将emp.csv中的数据按照部门号分成三个分组,并显示出每组的人员名称,
最终显示的结果格式如下所示:
<10,CLARK1;KING1;MILLER1>
<20,CLARK2;KING2;MILLER2>
<30,CLARK3;KING3;MILLER3>
二、测试数据
- 员工信息表:下载地址
- 表字段说明:
三、编程思路
- 思路:
1、需要自定义分组比较器
2、需要自定义Employee对象排序规则,并注意所使用排序字段应该与分组比较器规则相同
3、mapper和reducer都需要修改
四、实现步骤
-
在Idea或eclipse中创建maven项目
-
在pom.xml中添加hadoop依赖
<dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> <version>2.7.3</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-hdfs</artifactId> <version>2.7.3</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-common</artifactId> <version>2.7.3</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> <version>2.7.3</version> </dependency>
-
添加log4j.properties文件在资源目录下即resources,文件内容如下:
### 配置根 ### log4j.rootLogger = debug,console,fileAppender ## 配置输出到控制台 ### log4j.appender.console = org.apache.log4j.ConsoleAppender log4j.appender.console.Target = System.out log4j.appender.console.layout = org.apache.log4j.PatternLayout log4j.appender.console.layout.ConversionPattern = %d{ABSOLUTE} %5p %c:%L - %m%n ### 配置输出到文件 ### log4j.appender.fileAppender = org.apache.log4j.FileAppender log4j.appender.fileAppender.File = logs/logs.log log4j.appender.fileAppender.Append = false log4j.appender.fileAppender.Threshold = DEBUG log4j.appender.fileAppender.layout = org.apache.log4j.PatternLayout log4j.appender.fileAppender.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
-
编写序列化类Employee,编写排序规则
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; public class Employee implements WritableComparable<Employee> { //7369,SMITH,CLERK,7902,1980/12/17,800,,20 private IntWritable empNo; private Text empName; private Text empJob; private IntWritable leaderNo; private Text hireDate; private IntWritable empSalary; private Text empBonus; private IntWritable deptNo; public Employee() { this.empNo = new IntWritable(); this.empName = new Text(""); this.empJob = new Text(""); this.leaderNo = new IntWritable(); this.hireDate = new Text(""); this.empSalary =new IntWritable(); this.empBonus = new Text(""); this.deptNo = new IntWritable(); } public Employee(int empNo, String empName, String empJob, int leaderNo, String hireDate, int empSalary, String empBonus, int deptNo) { this.empNo = new IntWritable(empNo); this.empName = new Text(empName); this.empJob = new Text(empJob); this.leaderNo = new IntWritable(leaderNo); this.hireDate = new Text(hireDate); this.empSalary =new IntWritable(empSalary); this.empBonus = new Text(empBonus); this.deptNo = new IntWritable(deptNo); } @Override public void write(DataOutput out) throws IOException { //序列化 this.deptNo.write(out); this.empSalary.write(out); this.empNo.write(out); this.empName.write(out); this.empJob.write(out); this.leaderNo.write(out); this.hireDate.write(out); this.empBonus.write(out); } @Override public void readFields(DataInput in) throws IOException { this.deptNo.readFields(in); this.empSalary.readFields(in); this.empNo.readFields(in); this.empName.readFields(in); this.empJob.readFields(in); this.leaderNo.readFields(in); this.hireDate.readFields(in); this.empBonus.readFields(in); } @Override public String toString() { return "Employee{" + "empNo=" + empNo + ", empName=" + empName + ", empJob=" + empJob + ", leaderNo=" + leaderNo + ", hireDate=" + hireDate + ", empSalary=" 
+ empSalary + ", empBonus=" + empBonus + ", deptNo=" + deptNo + '}'; } public IntWritable getEmpNo() { return empNo; } public void setEmpNo(IntWritable empNo) { this.empNo = empNo; } public Text getEmpName() { return empName; } public void setEmpName(Text empName) { this.empName = empName; } public Text getEmpJob() { return empJob; } public void setEmpJob(Text empJob) { this.empJob = empJob; } public IntWritable getLeaderNo() { return leaderNo; } public void setLeaderNo(IntWritable leaderNo) { this.leaderNo = leaderNo; } public Text getHireDate() { return hireDate; } public void setHireDate(Text hireDate) { this.hireDate = hireDate; } public IntWritable getEmpSalary() { return empSalary; } public void setEmpSalary(IntWritable empSalary) { this.empSalary = empSalary; } public Text getEmpBonus() { return empBonus; } public void setEmpBonus(Text empBonus) { this.empBonus = empBonus; } public IntWritable getDeptNo() { return deptNo; } public void setDeptNo(IntWritable deptNo) { this.deptNo = deptNo; } /** * 自定义排序规则 * 按照部门号升序排,员工工资降序排序 * @param o * @return */ public int compareTo(Employee o) { if (this.deptNo.get() > o.getDeptNo().get()){ return 1; }else if(this.deptNo.get() < o.getDeptNo().get()){ return -1; } //说明:部门号是相同的情况,执行下面代码 //按照工资降序排 if (this.empSalary.get() > o.getEmpSalary().get()){ return -1; }else if(this.empSalary.get() < o.getEmpSalary().get()){ return 1; }else{ return 0; } } }
-
编写自定义分组类MyEmployeeGrouper
import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; public class MyEmployeeGrouper extends WritableComparator { public MyEmployeeGrouper() { //每一个实现都需要对比较的对象进行注册 super(Employee.class,true); } /** * 然后使用部门号进行比较 * @param a * @param b * @return */ @Override public int compare(WritableComparable a, WritableComparable b) { Employee employee1 = (Employee) a; Employee employee2 = (Employee) b; return employee1.getDeptNo().compareTo(employee2.getDeptNo()); } }
-
编写mapper类
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class EmpGroupMapper extends Mapper<LongWritable, Text, Employee, Text> { Employee employee = new Employee();//保证对象的hashCode一致 protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //数据格式:<0,7369,SMITH,CLERK,7902,1980/12/17,800,,20> System.out.println("====key:" + key + "value:" + value.toString() + " ====== "); //1、分词 String[] splits = value.toString().split(","); //2、创建Employee对象,并且赋值 employee.setDeptNo(new IntWritable(Integer.valueOf(splits[7]))); employee.setEmpSalary(new IntWritable(Integer.valueOf(splits[5]))); employee.setEmpName(new Text(splits[1])); employee.setEmpJob(new Text(splits[2])); //3、通过context写出去 context.write(employee,employee.getEmpName()); } }
-
编写reducer类
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; public class EmpGroupReducer extends Reducer<Employee, Text,IntWritable,Text> { @Override protected void reduce(Employee key, Iterable<Text> values, Context context) throws IOException, InterruptedException { //1、对数据进行处理:取出相同部门的员工名称集合 System.out.println("======key===========" + key.toString()); String names = ""; for (Text e: values) { names = e.toString() + ";" + names; System.out.println("======value===========" + e.toString()); } //2、将结果通过context写出去 context.write(key.getDeptNo(),new Text(names)); } }
-
编写Driver类
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.util.Random; public class EmpGroupJob { public static void main(String[] args) throws Exception { Configuration configuration = new Configuration(); Job job = Job.getInstance(configuration); job.setMapperClass(EmpGroupMapper.class); job.setMapOutputKeyClass(Employee.class); job.setMapOutputValueClass(Text.class); //设置分组 job.setGroupingComparatorClass(MyEmployeeGrouper.class); //设置reduce job.setReducerClass(EmpGroupReducer.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(job,new Path("F:\\NIIT\\hadoopOnWindow\\input\\emp.csv")); FileOutputFormat.setOutputPath(job,new Path(getOutputDir())); boolean result = job.waitForCompletion(true); if (result) System.out.println("运行成功"); else System.out.println("运行失败"); } //用于产生随机输出目录 public static String getOutputDir(){ String prefix = "F:\\NIIT\\hadoopOnWindow\\output\\"; long time = System.currentTimeMillis(); int random = new Random().nextInt(1000); return prefix + "result_" + time + "_" + random; } }
-
本地运行代码,测试下结果正确与否
-
日志信息:表示shuffle中Sort Phase阶段以及分组好了
-
最终结果:
-
五、打包上传到集群中运行
-
上传emp.csv到hdfs中的datas目录下
-
本地运行测试结果正确后,需要对Driver类输入输出部分代码进行修改,具体修改如下:
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1])); -
将程序打成jar包,需要在pom.xml中配置打包插件
<build> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId> maven-assembly-plugin </artifactId> <configuration> <!-- 使用Maven预配置的描述符--> <descriptorRefs> <descriptorRef>jar-with-dependencies</descriptorRef> </descriptorRefs> </configuration> <executions> <execution> <id>make-assembly</id> <!-- 绑定到package生命周期 --> <phase>package</phase> <goals> <!-- 只运行一次 --> <goal>single</goal> </goals> </execution> </executions> </plugin> </plugins> </build>
按照如下图所示进行操作
-
提交集群运行,执行如下命令:
hadoop jar packagedemo-1.0-SNAPSHOT.jar com.niit.mr.EmpGroupJob /datas/emp.csv /output/emp/
至此,所有的步骤已经完成,大家可以试试,祝大家好运~~~~