MapReduce joins

1. Reduce join

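In a reduce-side join, the mapper reads both tables, tags every record with the table it came from, and emits the join key (here the position id) as the map output key; all records that share a key then meet in the same reduce() call, where the deliver records are completed with the position name. The tab-separated field layout assumed by the mapper below is:

deliver_info file: userId \t positionId \t date
position file:     positionId \t positionName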
bean

package reduce_join;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// Bean carrying the fields of both tables plus a flag marking which table a record came from
public class DeliverBean implements Writable {
    private String userId;
    private String positionId;
    private String date;
    private String positionName;
    private String flag;
    public DeliverBean() {
    }
    public DeliverBean(String userId, String positionId, String date, String
            positionName, String flag) {
        this.userId = userId;
        this.positionId = positionId;
        this.date = date;
        this.positionName = positionName;
        this.flag = flag;
    }
    public String getUserId() {
        return userId;
    }
    public void setUserId(String userId) {
        this.userId = userId;
    }
    public String getPositionId() {
        return positionId;
    }
    public void setPositionId(String positionId) {
        this.positionId = positionId;
    }
    public String getDate() {
        return date;
    }
    public void setDate(String date) {
        this.date = date;
    }
    public String getPositionName() {
        return positionName;
    }
    public void setPositionName(String positionName) {
        this.positionName = positionName;
    }
    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(positionId);
        out.writeUTF(date);
        out.writeUTF(positionName);
        out.writeUTF(flag);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        this.userId = in.readUTF();
        this.positionId = in.readUTF();
        this.date = in.readUTF();
        this.positionName = in.readUTF();
        this.flag=in.readUTF();
    }
    @Override
    public String toString() {
        return "DeliverBean{" +
                "userId='" + userId + '\'' +
                ", positionId='" + positionId + '\'' +
                ", date='" + date + '\'' +
                ", positionName='" + positionName ;
    }
}

map

package reduce_join;


import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, DeliverBean> {
    String name;
    DeliverBean bean = new DeliverBean();
    Text k = new Text();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // 1. Get the input split for this map task
        FileSplit split = (FileSplit) context.getInputSplit();
        // 2. Get the input file name; it is used in map() to tell the two tables apart
        name = split.getPath().getName();
        System.out.println("name = " + name);
    }
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read one input line
        String line = value.toString();
        // 2. Handle the two files differently, based on the file name captured in setup()
        if (name.startsWith("deliver_info")) {
            // 2.1 Split the deliver record: userId \t positionId \t date
            String[] fields = line.split("\t");
            // 2.2 Fill the bean and tag it as coming from the deliver table
            bean.setUserId(fields[0]);
            bean.setPositionId(fields[1]);
            bean.setDate(fields[2]);
            bean.setPositionName("");
            bean.setFlag("deliver");
            k.set(fields[1]);            // join key = positionId
        } else {
            // 2.3 Split the position record: positionId \t positionName
            String[] fields = line.split("\t");
            // 2.4 Fill the bean and tag it as coming from the position table
            bean.setPositionId(fields[0]);
            bean.setPositionName(fields[1]);
            bean.setUserId("");
            bean.setDate("");
            bean.setFlag("position");
            k.set(fields[0]);            // join key = positionId
        }
        // 3. Emit <positionId, bean>
        context.write(k, bean);
    }
}

reduce

package reduce_join;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
public class ReduceJoinReducer extends Reducer<Text, DeliverBean, DeliverBean,NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<DeliverBean> values, Context context) throws IOException, InterruptedException {
        // 1. Collect all deliver records that share this positionId
        ArrayList<DeliverBean> deBeans = new ArrayList<>();
        // 2. Holder for the (single) position record of this positionId
        DeliverBean pBean = new DeliverBean();
        for (DeliverBean bean : values) {
            if ("deliver".equals(bean.getFlag())) {
                // Hadoop reuses the value object while iterating, so copy it before keeping a reference
                DeliverBean dBean = new DeliverBean();
                try {
                    BeanUtils.copyProperties(dBean, bean);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                deBeans.add(dBean);
            } else {
                try {
                    BeanUtils.copyProperties(pBean, bean);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        // 3. Join: fill each deliver record with the position name
        for (DeliverBean bean : deBeans) {
            bean.setPositionName(pBean.getPositionName());
            // 4. Write the joined record out
            context.write(bean, NullWritable.get());
        }
    }
}

driver

package reduce_join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException,ClassNotFoundException, InterruptedException {
        // Point the job at the local Hadoop installation (needed when running on Windows)
        System.setProperty("hadoop.home.dir", "D:\\dev_soft\\hadoop-2.9.2");
        // 1. Create the configuration object and get the Job instance
        final Configuration conf = new Configuration();
        final Job job = Job.getInstance(conf, "ReduceJoinDriver");
        // 2. Set the jar by the driver class
        job.setJarByClass(ReduceJoinDriver.class);
        // 3. Set the Mapper/Reducer classes
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReducer.class);
        // 4. Set the key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DeliverBean.class);
        // 5. Set the key/value types of the final output
        job.setOutputKeyClass(DeliverBean.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. Set the job input path
        FileInputFormat.setInputPaths(job, new Path(args[0])); // where the raw data is read from
        // 7. Set the job output path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // where the result is written to
        // 8. Submit the job and wait for completion
        final boolean flag = job.waitForCompletion(true);
        // JVM exit code: 0 on success, non-zero otherwise
        System.exit(flag ? 0 : 1);
    }
}

Package and run

hadoop jar  03_hadoop-1.0-SNAPSHOT.jar  reduce_join.ReduceJoinDriver  /reduce_join/input   /reduce_join/output

Result

Each output line is a joined DeliverBean with positionName filled in from the position table.

2. Map join

  • This approach suits joins where one of the tables is small: the small table is shipped to every map task, so each mapper can join the big-table records it reads against the small table locally and emit the final result directly. This greatly raises the parallelism of the join and speeds up processing. A standalone sketch of the join logic follows, and the complete Hadoop code is listed after it.
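
The Hadoop-specific parts are only job.addCacheFile(...) in the driver and setNumReduceTasks(0) to make the job map-only; the join itself is just an in-memory lookup. The following plain-Java sketch (no Hadoop, hypothetical local file names matching the layout above) shows the same per-record logic the mapper performs:

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

// Standalone illustration: join a big tab-separated file against a small one held entirely in memory
public class LocalMapJoinSketch {
    public static void main(String[] args) throws IOException {
        // Small table: positionId \t positionName
        Map<String, String> positions = new HashMap<>();
        try (BufferedReader small = Files.newBufferedReader(Paths.get("position.txt"), StandardCharsets.UTF_8)) {
            String line;
            while ((line = small.readLine()) != null) {
                String[] f = line.split("\t");
                positions.put(f[0], f[1]);
            }
        }
        // Big table: userId \t positionId \t date; append the looked-up position name to each line
        try (BufferedReader big = Files.newBufferedReader(Paths.get("deliver_info.txt"), StandardCharsets.UTF_8)) {
            String line;
            while ((line = big.readLine()) != null) {
                String[] f = line.split("\t");
                System.out.println(line + "\t" + positions.get(f[1]));
            }
        }
    }
}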

bean

package map_join;

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class DeliverBean implements Writable {
    private String userId;
    private String positionId;
    private String date;
    private String positionName;
    private String flag;
    public DeliverBean() {
    }
    public DeliverBean(String userId, String positionId, String date, String
            positionName, String flag) {
        this.userId = userId;
        this.positionId = positionId;
        this.date = date;
        this.positionName = positionName;
        this.flag = flag;
    }
    public String getUserId() {
        return userId;
    }
    public void setUserId(String userId) {
        this.userId = userId;
    }
    public String getPositionId() {
        return positionId;
    }
    public void setPositionId(String positionId) {
        this.positionId = positionId;
    }
    public String getDate() {
        return date;
    }
    public void setDate(String date) {
        this.date = date;
    }

    public String getPositionName() {
        return positionName;
    }
    public void setPositionName(String positionName) {
        this.positionName = positionName;
    }
    public String getFlag() {
        return flag;
    }
    public void setFlag(String flag) {
        this.flag = flag;
    }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(positionId);
        out.writeUTF(date);
        out.writeUTF(positionName);
        out.writeUTF(flag);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        this.userId = in.readUTF();
        this.positionId = in.readUTF();
        this.date = in.readUTF();
        this.positionName = in.readUTF();
        this.flag=in.readUTF();
    }
    @Override
    public String toString() {
        return "DeliverBean{" +
                "userId='" + userId + '\'' +
                ", positionId='" + positionId + '\'' +
                ", date='" + date + '\'' +
                ", positionName='" + positionName + '\'' +
                '}';
    }
}

driver

package map_join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class MapJoinDriver {
    public static void main(String[] args) throws IOException,ClassNotFoundException, InterruptedException, URISyntaxException {
        // 1. Create the configuration object and get the Job instance
        final Configuration conf = new Configuration();
        final Job job = Job.getInstance(conf, "MapJoinDriver");
        // 2. Set the jar by the driver class
        job.setJarByClass(MapJoinDriver.class);
        // 3. Set the Mapper class
        job.setMapperClass(MapJoinMapper.class);
        // 4. Set the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 5. Set the job input path
        FileInputFormat.setInputPaths(job, new Path(args[0])); // where the big table is read from
        // 6. Set the job output path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // where the result is written to
        // 7. Add the small table to the distributed cache
        job.addCacheFile(new URI("file:///E:/hdfs_test_dir/reduce_join_input/position.txt"));
        // Map-only job: the join is finished in the mapper, so no reducers are needed
        job.setNumReduceTasks(0);
        // 8. Submit the job and wait for completion
        final boolean flag = job.waitForCompletion(true);
        // JVM exit code: 0 on success, non-zero otherwise
        System.exit(flag ? 0 : 1);
    }
}

map


package map_join;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
public class MapJoinMapper extends Mapper<LongWritable, Text, Text,NullWritable> {
    Text k = new Text();
    Map<String, String> pMap = new HashMap<>();

    // Load the cached small table (position.txt) into memory before any map() calls
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // 1. Open the cached file: it is available under its base name in the task's working directory
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("position.txt"), "UTF-8"));
        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2. Split the line: positionId \t positionName
            String[] fields = line.split("\t");
            // 3. Cache the mapping in memory
            pMap.put(fields[0], fields[1]);
        }
        // 4. Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws
            IOException, InterruptedException {
        // 1. Read one line of the big table
        String line = value.toString();
        // 2. Split it
        String[] fields = line.split("\t");
        // 3. Get the position id (the join key)
        String pId = fields[1];
        // 4. Look up the position name in the cached small table
        String pName = pMap.get(pId);
        // 5. Append the position name to the original line
        k.set(line + "\t" + pName);
        // 6. Emit the joined line
        context.write(k, NullWritable.get());
    }
}