Join
Use case
Table EMP:
Name   Sex     Age  DepNo
zhang  male    20   1
li     female  25   2
wang   female  30   3
zhou   male    35   2

Table DEP:
DepNo  DepName
1      Sales
2      Dev
3      Mgt
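For reference, joining EMP with DEP on DepNo should produce the following rows (name, sex, age and department name; output order may vary):

zhang male 20 Sales
li female 25 Dev
wang female 30 Mgt
zhou male 35 Dev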
A reduce-side join is more general than a map-side join because it places no structural requirements on the input data; it is, however, less efficient, because all of the data has to pass through the shuffle.
Basic approach:
1) On the map side, read every input file and tag each output record with an identifier for the file (table) it came from.
2) In the reduce function, separate the incoming records according to that tag.
3) Join the records that share the same key and write the result straight to the output (see the grouping example below).
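With the sample data above, the mapper keys every tagged record by DepNo, so all records for one department meet in a single reduce call. For key 2, for example, the reducer receives the DEP record for Dev together with the two matching EMP records:

key = 2 -> (DEP) 2 Dev
           (EMP) li female 25 2
           (EMP) zhou male 35 2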
Define the EMP_DEP.java entity bean that holds the data from both tables:
package com.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Entity bean that carries the fields of both EMP and DEP records through the shuffle.
// The "table" field tags each record with the table it came from ("EMP" or "DEP").
public class EMP_DEP implements WritableComparable<EMP_DEP> {

    private String name = "";
    private String sex = "";
    private int age = 0;
    private int depNo = 0;
    private String depName = "";
    private String table = "";

    public EMP_DEP() {
    }

    // Copy constructor: needed because Hadoop reuses the value object it hands to the reducer.
    public EMP_DEP(EMP_DEP emp_dep) {
        this.name = emp_dep.getName();
        this.sex = emp_dep.getSex();
        this.age = emp_dep.getAge();
        this.depNo = emp_dep.getDepNo();
        this.depName = emp_dep.getDepName();
        this.table = emp_dep.getTable();
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public int getDepNo() {
        return depNo;
    }

    public void setDepNo(int depNo) {
        this.depNo = depNo;
    }

    public String getDepName() {
        return depName;
    }

    public void setDepName(String depName) {
        this.depName = depName;
    }

    public String getTable() {
        return table;
    }

    public void setTable(String table) {
        this.table = table;
    }

    // Deserialization must read the fields in exactly the same order as write() emits them.
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.sex = in.readUTF();
        this.age = in.readInt();
        this.depNo = in.readInt();
        this.depName = in.readUTF();
        this.table = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(sex);
        out.writeInt(age);
        out.writeInt(depNo);
        out.writeUTF(depName);
        out.writeUTF(table);
    }

    // EMP_DEP is only ever used as a map output value, never as a key,
    // so its ordering does not matter.
    @Override
    public int compareTo(EMP_DEP o) {
        return 0;
    }

    @Override
    public String toString() {
        return name + " " + sex + " " + age + " " + depName;
    }
}
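The Writable contract requires readFields() to reconstruct exactly what write() serialized, in the same field order. A minimal stand-alone round-trip check (not part of the job; the class name WritableRoundTrip and the use of plain java.io streams are only for illustration) could look like this:

package com.join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Hypothetical helper: serializes an EMP_DEP and reads it back to verify
// that write() and readFields() agree on field order and types.
public class WritableRoundTrip {
    public static void main(String[] args) throws IOException {
        EMP_DEP original = new EMP_DEP();
        original.setName("zhang");
        original.setSex("male");
        original.setAge(20);
        original.setDepNo(1);
        original.setDepName("Sales");
        original.setTable("EMP");

        // Serialize with write(), exactly as the MapReduce framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize with readFields() and compare the rendered output.
        EMP_DEP copy = new EMP_DEP();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println("original: " + original);
        System.out.println("copy    : " + copy);
    }
}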
Define the Mapper:
package com.join;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Reads lines from both input files and tags each record with its source table.
// EMP and DEP lines are told apart by their number of columns (4 vs 2).
// The map output key is DepNo, so matching records meet in the same reduce call.
public class ReduceSideMapper extends Mapper<LongWritable, Text, IntWritable, EMP_DEP> {

    private EMP_DEP emp_dep = new EMP_DEP();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] values = value.toString().split("\\s+");
        // An EMP line: Name Sex Age DepNo
        if (values.length == 4) {
            emp_dep.setName(values[0]);
            emp_dep.setSex(values[1]);
            emp_dep.setAge(Integer.valueOf(values[2]));
            emp_dep.setDepNo(Integer.valueOf(values[3]));
            emp_dep.setTable("EMP");
            context.write(new IntWritable(Integer.valueOf(values[3])), emp_dep);
        }
        // A DEP line: DepNo DepName
        if (values.length == 2) {
            emp_dep.setDepNo(Integer.valueOf(values[0]));
            emp_dep.setDepName(values[1]);
            emp_dep.setTable("DEP");
            context.write(new IntWritable(Integer.valueOf(values[0])), emp_dep);
        }
    }
}
Define the Reducer:
package com.join;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

// All records sharing a DepNo arrive in one reduce call. The single DEP record
// supplies the department name, which is then copied onto every buffered EMP record.
public class ReduceSideReducer extends
        Reducer<IntWritable, EMP_DEP, NullWritable, EMP_DEP> {

    @Override
    protected void reduce(IntWritable key, Iterable<EMP_DEP> value, Context context)
            throws IOException, InterruptedException {
        String depName = "";
        List<EMP_DEP> list = new LinkedList<EMP_DEP>();
        for (EMP_DEP val : value) {
            // DepNo is the primary key of DEP, so there is exactly one DEP record per key.
            if (val.getTable().equals("DEP")) {
                depName = val.getDepName();
            } else {
                // Copy the value: Hadoop reuses the object handed out by the iterator.
                list.add(new EMP_DEP(val));
            }
        }
        // Complete each buffered EMP record with its department name and emit the joined row.
        for (EMP_DEP v : list) {
            v.setDepName(depName);
            context.write(NullWritable.get(), v);
        }
    }
}
Define the driver class TestReduceSideJoin:
package com.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class TestReduceSideJoin {

    public static void main(String args[]) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: TestReduceSideJoin <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "Reduce side join");
        job.setJarByClass(TestReduceSideJoin.class);
        job.setMapperClass(ReduceSideMapper.class);
        job.setReducerClass(ReduceSideReducer.class);
        // The map output types differ from the final output types, so both must be declared.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(EMP_DEP.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(EMP_DEP.class);
        // The single input directory holds both the EMP and the DEP file;
        // the mapper tells the two record formats apart by column count.
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
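Assuming the EMP and DEP files have been uploaded into a single HDFS input directory and the classes packaged into a jar (the jar name and paths below are placeholders), the job can be launched in the usual way:

hadoop jar join.jar com.join.TestReduceSideJoin /user/hadoop/join/in /user/hadoop/join/out

The joined rows, rendered by EMP_DEP.toString(), then appear in the part-r-* files under the output directory.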