MapJoin in MapReduce


Preface

There is a product file and an order file. Each order record carries the id of a product from the product file, and the product file holds the product's (Chinese) name.
Requirement: produce an order summary file with three fields: product name, total quantity, and total amount.


1. Implementation Approach

Read the product file into an in-memory Map in the Mapper's setup() phase. Then, for every order record, look up the product name by product id in that Map (the map-side join), and finish with a WordCount-style aggregation keyed on the product name to get the totals. An illustrative file layout is sketched below.
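For reference, the code in the next section assumes both files are tab-separated, with the order file laid out as o_id, p_id, o_num, o_price and the product file as p_id, p_name. Purely illustrative rows (not the author's actual data) could look like:

order.txt:
1001    01    2    10
1002    02    3    20

products.txt:
01    ProductA
02    ProductB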

2. Code

1. The MapJoinOrderBean class

package com.hadoop.mapreduce.mapJoin;

import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @author LengQing
 * @date 2020/5/3 - 15:20
 * Bean that carries the joined order and product fields
 */
public class MapJoinOrderBean implements WritableComparable<MapJoinOrderBean> {
    private String o_id; // order table: order id
    private String p_id; // order table / product table: product id
    private Integer o_price; // order table: unit price
    private Integer o_num; // order table: quantity
    private String p_name; // product table: product name

    public MapJoinOrderBean() {
        super();
    }

    public MapJoinOrderBean(String o_id, String p_id, Integer o_price, Integer o_num, String p_name) {
        this.o_id = o_id;
        this.p_id = p_id;
        this.o_price = o_price;
        this.o_num = o_num;
        this.p_name = p_name;
    }
    @Override
    public String toString() {
        return o_id + '\t' + p_name + '\t' + o_num + '\t' + o_price;
    }

    public String getO_id() {
        return o_id;
    }

    public void setO_id(String o_id) {
        this.o_id = o_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public Integer getO_price() {
        return o_price;
    }

    public void setO_price(Integer o_price) {
        this.o_price = o_price;
    }

    public Integer getO_num() {
        return o_num;
    }

    public void setO_num(Integer o_num) {
        this.o_num = o_num;
    }

    public String getP_name() {
        return p_name;
    }

    public void setP_name(String p_name) {
        this.p_name = p_name;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(o_id);
        out.writeUTF(p_id);
        out.writeInt(o_price);
        out.writeInt(o_num);
        out.writeUTF(p_name);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        o_id = in.readUTF();
        p_id = in.readUTF();
        o_price = in.readInt();
        o_num = in.readInt();
        p_name = in.readUTF();
    }

    @Override
    public int compareTo(MapJoinOrderBean o) {
        // The bean is only used as a map output value, so no ordering is needed;
        // implementing Writable alone would also be sufficient here.
        return 0;
    }
}

2. The MapJoinMapper class

package com.hadoop.mapreduce.mapJoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

/**
 * @author LengQing
 * @date 2020/5/3 - 15:07
 * Map-side join: caches the product table in setup() and joins each order record by product id.
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, MapJoinOrderBean> {
    private Text outputKey = new Text();
    private MapJoinOrderBean orderBean = new MapJoinOrderBean();
    private Map<String, String> pdMap = new HashMap<>();


    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the product file from HDFS and cache it in memory (the map-side half of the join)
        String cacheFile = "hdfs://mycluster:9000/datas/products.txt";
        FileSystem fileSystem = FileSystem.get(URI.create(cacheFile), context.getConfiguration());
        FSDataInputStream fsDataInputStream = fileSystem.open(new Path(cacheFile));
        BufferedReader reader = new BufferedReader(new InputStreamReader(fsDataInputStream));

        // 1 Alternative: read the product file from the distributed cache instead of a hard-coded path
        //URI[] cacheFiles = context.getCacheFiles();
        //String path = cacheFiles[0].getPath().toString();
        //FileInputStream fileInputStream = new FileInputStream(path);
        //InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream, "UTF-8");
        //BufferedReader reader = new BufferedReader(inputStreamReader);
        String line;
        while ((line = reader.readLine()) != null) {
            if (StringUtils.isEmpty(line)) {
                continue; // skip blank lines
            }
            // 2 Split the product record: p_id \t p_name
            String[] fields = line.split("\t");
            // 3 Cache product id -> product name
            pdMap.put(fields[0], fields[1]);
        }
        // 4 Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Order record layout: o_id \t p_id \t o_num \t o_price
        String[] line = value.toString().split("\t");
        orderBean.setO_id(line[0]);
        orderBean.setP_id(line[1]);
        orderBean.setO_num(Integer.parseInt(line[2]));
        orderBean.setO_price(Integer.parseInt(line[3]));
        // Join: look up the product name by product id in the cached map
        // (assumes every order's product id is present in the product file)
        orderBean.setP_name(pdMap.get(line[1]));
        outputKey.set(pdMap.get(line[1]));
        context.write(this.outputKey, orderBean);
    }
}
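The commented-out lines in setup() point at the distributed-cache alternative: instead of opening a hard-coded HDFS path, the driver registers the product file with job.addCacheFile(...) and the framework localizes it next to each map task. A minimal sketch of that variant, with fully qualified names so it drops into the class as-is (file names and paths are illustrative, not the author's actual setup):

// Driver side, before submitting the job:
// job.addCacheFile(new java.net.URI("hdfs://mycluster:9000/datas/products.txt"));

// Mapper side: the cached file is symlinked into the task's working directory when the
// job runs on the cluster, so plain local-file IO on its base name is enough.
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    java.net.URI[] cacheFiles = context.getCacheFiles();
    String fileName = new org.apache.hadoop.fs.Path(cacheFiles[0].getPath()).getName();
    try (java.io.BufferedReader reader = new java.io.BufferedReader(
            new java.io.InputStreamReader(new java.io.FileInputStream(fileName),
                    java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            String[] fields = line.split("\t");
            pdMap.put(fields[0], fields[1]);
        }
    }
}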

3. The MapJoinReduce class

package com.hadoop.mapreduce.mapJoin;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author LengQing
 * @date 2020/5/3 - 15:08
 * Aggregates the total quantity and total amount for each product name.
 */
public class MapJoinReduce extends Reducer<Text, MapJoinOrderBean, Text, Text> {
    private Text outputValue = new Text();
    @Override
    protected void reduce(Text key, Iterable<MapJoinOrderBean> values, Context context) throws IOException, InterruptedException {
        int totalNum = 0;
        long totalAmount = 0;
        for (MapJoinOrderBean value : values) {
            totalNum += value.getO_num();
            // Total amount = unit price * quantity, summed over every order of this product
            totalAmount += (long) value.getO_price() * value.getO_num();
        }
        outputValue.set(totalNum + "\t" + totalAmount);
        context.write(key, this.outputValue);
    }
}
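For example (illustrative numbers, not the author's data): if one product has two orders, 2 units at unit price 10 and 3 units at unit price 20, the reducer emits a total quantity of 5 and a total amount of 2 * 10 + 3 * 20 = 80.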

4. The MapJoinDriver class

package com.hadoop.mapreduce.mapJoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * @author LengQing
 * @date 2020/5/3 - 14:29
 * Driver: outputs an order summary with product name, total quantity, and total amount.
 */
public class MapJoinDriver extends Configured implements Tool {
    private Configuration conf = new Configuration();
    @Override
    public int run(String[] args) throws Exception {
        // Create the job instance
        Job job = Job.getInstance(conf, "mapJoin");
        job.setJarByClass(MapJoinDriver.class);
        //job.addCacheFile(new URI("hdfs://mycluster:8020/datas/order/product.txt"));

        // 1 Input
        Path inputPath = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inputPath);

        // 2 Map
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MapJoinOrderBean.class);

        // 3 Shuffle
        //job.setGroupingComparatorClass(mapJoinGroup.class);

        // 4 Reduce
        job.setReducerClass(MapJoinReduce.class);
        // The reducer emits (Text productName, Text "totalNum\ttotalAmount")
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        //job.setNumReduceTasks(0);

        // 5 Output
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputPath);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        try {
            int status = ToolRunner.run(new MapJoinDriver(), args);
            System.exit(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}


3. Notes on Running

Mind where the cached product file (cacheFile) lives, and keep it small: every map task loads it into an in-memory HashMap, so a map-side join only suits a small dimension table.
When running on the cluster, pass two arguments: 1. the input path (order file) 2. the output path (see the example command below).
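A minimal sketch of the submit command, assuming the project is packaged as mapjoin.jar (the jar name and HDFS paths are illustrative, not the author's actual setup); note that the output directory must not already exist:

hadoop jar mapjoin.jar com.hadoop.mapreduce.mapJoin.MapJoinDriver /datas/orders /datas/output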

4. Sample Run Results

1. Input file order.txt (screenshot omitted)
2. Input file product.txt (screenshot omitted)
3. Exported result (screenshot omitted)

Summary

An interview question I was asked three years ago; the implementation is simple, but the technique is important.
