Requirement: for each employee, display the employee number, employee name, department number, and department name.
Analysis: the department table and the employee table share a common field, the department number (deptno), so the two tables can be joined on it. Each record is also tagged with a flag marking which table it came from.
In the map phase, the file name tells which table the current line belongs to, and the corresponding flag value is set.
In the reduce phase, records that share the same join key are combined (see the illustration after the sample data).
Department table (tab-separated: deptno, dname, loc):
10 ACCOUNTING NEWYORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
Employee table (tab-separated: empno, ename, job, mgr, hiredate, sal, deptno):
7369 SMITH CLERK 7902 1980-12-17 800.00 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 30
7521 WARD SALESMAN 7698 1981-2-22 1250.00 30
7566 JONES MANAGER 7839 1981-4-2 2975.00 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30
7782 CLARK MANAGER 7839 1981-6-9 2450.00 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20
7839 KING PRESIDENT 1981-11-17 5000.00 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.00 30
7876 ADAMS CLERK 7788 1987-5-23 1100.00 20
7900 JAMES CLERK 7698 1981-12-3 950.00 30
7902 FORD ANALYST 7566 1981-12-3 3000.00 20
7934 MILLER CLERK 7782 1982-1-23 1300.00 10
7988 ruoze CLERK 7732 1982-7-23 1900.00
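To make the flow concrete: every record is emitted with deptno as its key, so a single reduce call sees all records for one department. For deptno 10, for example, the reducer receives (in no guaranteed order) something like

flag=2  deptno=10  dname=ACCOUNTING
flag=1  deptno=10  empno=7782  ename=CLARK
flag=1  deptno=10  empno=7934  ename=MILLER
...

and emits one joined row per employee. Note that the last employee row (7988 ruoze) has no department number at all; the mapper's field-count check below drops such malformed rows.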
Code:
First, define a custom serializable class implementing Writable to carry the joined fields:
package com.ruozedata.bigdata.hadoop.mapreduce.join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Implements the Writable interface; carries the employee number, employee name,
// department number, and department name, plus a flag marking the source table.
public class Info implements Writable {
    private int empno;
    private String ename;
    private int deptno;
    private String dname;
    private int flag; // 1 = employee record, 2 = department record

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(empno);
        out.writeUTF(ename);
        out.writeInt(deptno);
        out.writeUTF(dname);
        out.writeInt(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read back in exactly the order they were written
        this.empno = in.readInt();
        this.ename = in.readUTF();
        this.deptno = in.readInt();
        this.dname = in.readUTF();
        this.flag = in.readInt();
    }

    public int getEmpno() {
        return empno;
    }

    public String getEname() {
        return ename;
    }

    public int getDeptno() {
        return deptno;
    }

    public String getDname() {
        return dname;
    }

    public int getFlag() {
        return flag;
    }

    public void setEmpno(int empno) {
        this.empno = empno;
    }

    public void setEname(String ename) {
        this.ename = ename;
    }

    public void setDeptno(int deptno) {
        this.deptno = deptno;
    }

    public void setDname(String dname) {
        this.dname = dname;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

    @Override
    public String toString() {
        // The final output row: empno, ename, deptno, dname (flag is internal only)
        return "Info{" +
                "empno=" + empno +
                ", ename='" + ename + '\'' +
                ", deptno=" + deptno +
                ", dname='" + dname + '\'' +
                '}';
    }
}
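The Writable contract can be sanity-checked without a cluster by round-tripping an Info instance through an in-memory byte stream. This small standalone class is not part of the job itself, just a quick local check:

package com.ruozedata.bigdata.hadoop.mapreduce.join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

// Quick sanity check for Info's serialization: write one instance,
// read it back, and confirm the fields survive the round trip.
public class InfoRoundTrip {
    public static void main(String[] args) throws Exception {
        Info in = new Info();
        in.setEmpno(7782);
        in.setEname("CLARK");
        in.setDeptno(10);
        in.setDname("ACCOUNTING");
        in.setFlag(1);

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        Info out = new Info();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(out); // should print the same empno/ename/deptno/dname
    }
}

Next comes the driver class with its mapper and reducer: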
package com.ruozedata.bigdata.hadoop.mapreduce.join;

import com.ruozedata.bigdata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReduceJoinApp {
    public static void main(String[] args) throws Exception {
        // Get the Job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        String input = "data/info/";
        String output = "out/";
        // Custom utility: delete the output directory if it already exists
        FileUtils.deleteTarget(output, configuration);
        // Set the jar by the driver class
        job.setJarByClass(ReduceJoinApp.class);
        // Set the custom Mapper and Reducer
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Set the map-phase output key and value types
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Info.class);
        // Set the reduce-phase output key and value types
        job.setOutputKeyClass(Info.class);
        job.setOutputValueClass(NullWritable.class);
        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, IntWritable, Info> {
        String name;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Grab the name of the file this split comes from
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            name = fileSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            if (name.contains("emp")) {
                // Employee file: only rows with all 7 fields carry a department
                // number; malformed rows such as 7988 ruoze are dropped here
                if (splits.length == 7) {
                    int empno = Integer.parseInt(splits[0].trim());
                    String ename = splits[1];
                    int deptno = Integer.parseInt(splits[6].trim());
                    Info info = new Info();
                    info.setEmpno(empno);
                    info.setEname(ename);
                    info.setDeptno(deptno);
                    info.setDname("");
                    info.setFlag(1);
                    context.write(new IntWritable(deptno), info);
                }
            } else {
                // Department file: deptno and dname
                Info info = new Info();
                int deptno = Integer.parseInt(splits[0].trim());
                info.setDeptno(deptno);
                info.setDname(splits[1].trim());
                info.setEmpno(0);
                info.setEname("");
                info.setFlag(2);
                context.write(new IntWritable(deptno), info);
            }
        }
    }

    public static class MyReducer extends Reducer<IntWritable, Info, Info, NullWritable> {
        @Override
        protected void reduce(IntWritable deptno, Iterable<Info> values, Context context) throws IOException, InterruptedException {
            List<Info> emps = new ArrayList<>();
            String dname = "";
            for (Info info : values) {
                if (info.getFlag() == 1) {
                    // Hadoop reuses the same Info instance on every iteration,
                    // so copy the fields into a fresh object before caching it
                    Info tmp = new Info();
                    tmp.setEmpno(info.getEmpno());
                    tmp.setEname(info.getEname());
                    tmp.setDeptno(info.getDeptno());
                    emps.add(tmp);
                } else {
                    dname = info.getDname();
                }
            }
            // Fill in the department name and emit one joined row per employee
            for (Info bean : emps) {
                bean.setDname(dname);
                context.write(bean, NullWritable.get());
            }
        }
    }
}
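FileUtils.deleteTarget above is a project-local helper whose source is not shown in this post; its job is simply to remove a stale output directory so the run does not abort with an "output directory already exists" error. A minimal sketch of what it might look like, assuming the package and signature used in the driver:

package com.ruozedata.bigdata.hadoop.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

// Hypothetical reconstruction: recursively delete the output path if it exists.
public class FileUtils {
    public static void deleteTarget(String output, Configuration configuration) throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);
        Path path = new Path(output);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true); // true = recursive delete
        }
    }
}

Since the input and output are relative local paths (data/info/ and out/), the job can be run straight from the IDE in Hadoop's local mode; the joined records then land in out/part-r-00000, one Info per line.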