Requirement: for each employee, display the employee number, employee name, department number, and department name.
Analysis: the department table and the employee table share a common field, the department number (deptno), so the two tables can be joined on it. Each record is also tagged with a flag marking which table it came from.
In the map phase, the file name tells which table the current line belongs to, and the corresponding flag value is set.
In the reduce phase, records that share the same join key are combined (see the illustration after the sample data).
Department table (tab-separated: deptno, dname, loc):
10 ACCOUNTING NEWYORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
Employee table (tab-separated: empno, ename, job, mgr, hiredate, sal, deptno):
7369 SMITH CLERK 7902 1980-12-17 800.00 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 30
7521 WARD SALESMAN 7698 1981-2-22 1250.00 30
7566 JONES MANAGER 7839 1981-4-2 2975.00 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30
7782 CLARK MANAGER 7839 1981-6-9 2450.00 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20
7839 KING PRESIDENT 1981-11-17 5000.00 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.00 30
7876 ADAMS CLERK 7788 1987-5-23 1100.00 20
7900 JAMES CLERK 7698 1981-12-3 950.00 30
7902 FORD ANALYST 7566 1981-12-3 3000.00 20
7934 MILLER CLERK 7782 1982-1-23 1300.00 10
7988 ruoze CLERK 7732 1982-7-23 1900.00
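To make the flow concrete: every record is emitted with deptno as its key, so a single reduce call sees all records for one department. For deptno 10, for example, the reducer receives (in no guaranteed order) something like

flag=2  deptno=10  dname=ACCOUNTING
flag=1  deptno=10  empno=7782  ename=CLARK
flag=1  deptno=10  empno=7934  ename=MILLER
...

and emits one joined row per employee. Note that the last employee row (7988 ruoze) has no department number at all; the mapper's field-count check below drops such malformed rows.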
Code:
First, define a custom serializable class implementing Writable to carry the joined fields:
package com.ruozedata.bigdata.hadoop.mapreduce.join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Implements the Writable interface; carries the employee number, employee name,
// department number, and department name, plus a flag marking the source table.
public class Info implements Writable {
    private int empno;
    private String ename;
    private int deptno;
    private String dname;
    private int flag; // 1 = employee record, 2 = department record

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(empno);
        out.writeUTF(ename);
        out.writeInt(deptno);
        out.writeUTF(dname);
        out.writeInt(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read back in exactly the order they were written
        this.empno = in.readInt();
        this.ename = in.readUTF();
        this.deptno = in.readInt();
        this.dname = in.readUTF();
        this.flag = in.readInt();
    }

    public int getEmpno() {
        return empno;
    }

    public String getEname() {
        return ename;
    }

    public int getDeptno() {
        return deptno;
    }

    public String getDname() {
        return dname;
    }

    public int getFlag() {
        return flag;
    }

    public void setEmpno(int empno) {
        this.empno = empno;
    }

    public void setEname(String ename) {
        this.ename = ename;
    }

    public void setDeptno(int deptno) {
        this.deptno = deptno;
    }

    public void setDname(String dname) {
        this.dname = dname;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

    @Override
    public String toString() {
        // The final output row: empno, ename, deptno, dname (flag is internal only)
        return "Info{" +
                "empno=" + empno +
                ", ename='" + ename + '\'' +
                ", deptno=" + deptno +
                ", dname='" + dname + '\'' +
                '}';
    }
}
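The Writable contract can be sanity-checked without a cluster by round-tripping an Info instance through an in-memory byte stream. This small standalone class is not part of the job itself, just a quick local check:

package com.ruozedata.bigdata.hadoop.mapreduce.join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

// Quick sanity check for Info's serialization: write one instance,
// read it back, and confirm the fields survive the round trip.
public class InfoRoundTrip {
    public static void main(String[] args) throws Exception {
        Info in = new Info();
        in.setEmpno(7782);
        in.setEname("CLARK");
        in.setDeptno(10);
        in.setDname("ACCOUNTING");
        in.setFlag(1);

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        Info out = new Info();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(out); // should print the same empno/ename/deptno/dname
    }
}

Next comes the driver class with its mapper and reducer: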
package com.ruozedata.bigdata.hadoop.mapreduce.join;

import com.ruozedata.bigdata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReduceJoinApp {
    public static void main(String[] args) throws Exception {
        // Get the Job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        String input = "data/info/";
        String output = "out/";
        // Custom utility: delete the output directory if it already exists
        FileUtils.deleteTarget(output, configuration);
        // Set the jar by the driver class
        job.setJarByClass(ReduceJoinApp.class);
        // Set the custom Mapper and Reducer
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Set the map-phase output key and value types
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Info.class);
        // Set the reduce-phase output key and value types
        job.setOutputKeyClass(Info.class);
        job.setOutputValueClass(NullWritable.class);
        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, IntWritable, Info> {
        String name;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Grab the name of the file this split comes from
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            name = fileSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            if (name.contains("emp")) {
                // Employee file: only rows with all 7 fields carry a department
                // number; malformed rows such as 7988 ruoze are dropped here
                if (splits.length == 7) {
                    int empno = Integer.parseInt(splits[0].trim());
                    String ename = splits[1];
                    int deptno = Integer.parseInt(splits[6].trim());
                    Info info = new Info();
                    info.setEmpno(empno);
                    info.setEname(ename);
                    info.setDeptno(deptno);
                    info.setDname("");
                    info.setFlag(1);
                    context.write(new IntWritable(deptno), info);
                }
            } else {
                // Department file: deptno and dname
                Info info = new Info();
                int deptno = Integer.parseInt(splits[0].trim());
                info.setDeptno(deptno);
                info.setDname(splits[1].trim());
                info.setEmpno(0);
                info.setEname("");
                info.setFlag(2);
                context.write(new IntWritable(deptno), info);
            }
        }
    }

    public static class MyReducer extends Reducer<IntWritable, Info, Info, NullWritable> {
        @Override
        protected void reduce(IntWritable deptno, Iterable<Info> values, Context context) throws IOException, InterruptedException {
            List<Info> emps = new ArrayList<>();
            String dname = "";
            for (Info info : values) {
                if (info.getFlag() == 1) {
                    // Hadoop reuses the same Info instance on every iteration,
                    // so copy the fields into a fresh object before caching it
                    Info tmp = new Info();
                    tmp.setEmpno(info.getEmpno());
                    tmp.setEname(info.getEname());
                    tmp.setDeptno(info.getDeptno());
                    emps.add(tmp);
                } else {
                    dname = info.getDname();
                }
            }
            // Fill in the department name and emit one joined row per employee
            for (Info bean : emps) {
                bean.setDname(dname);
                context.write(bean, NullWritable.get());
            }
        }
    }
}
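FileUtils.deleteTarget above is a project-local helper whose source is not shown in this post; its job is simply to remove a stale output directory so the run does not abort with an "output directory already exists" error. A minimal sketch of what it might look like, assuming the package and signature used in the driver:

package com.ruozedata.bigdata.hadoop.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

// Hypothetical reconstruction: recursively delete the output path if it exists.
public class FileUtils {
    public static void deleteTarget(String output, Configuration configuration) throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);
        Path path = new Path(output);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true); // true = recursive delete
        }
    }
}

Since the input and output are relative local paths (data/info/ and out/), the job can be run straight from the IDE in Hadoop's local mode; the joined records then land in out/part-r-00000, one Info per line.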