Background
This is the MapReduce counterpart of a JOIN in SQL: two tables are related by a key and have to be queried together.
Order table (order ID, date, product ID, quantity):
1001 20170710 4 2
1002 20170710 3 100
1003 20170710 2 40
1004 20170711 2 23
1005 20170823 4 55
1006 20170824 3 20
1007 20170825 2 3
1008 20170826 4 23
1009 20170912 2 10
1010 20170913 2 2
1011 20170914 3 14
1012 20170915 3 18
Product table (product ID, name, unit price):
1 chuizi 3999
2 huawei 3999
3 xiaomi 2999
4 apple 5999
The goal is to compute the total sales revenue for each product.
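For the sample data above, the job should output one line per product that actually appears in an order (chuizi is never ordered, so it is absent). For example, huawei sells 40 + 23 + 3 + 10 + 2 = 78 units at 3999 each, i.e. 311922. The expected result is:
apple	479920.0
huawei	311922.0
xiaomi	455848.0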
Code
Register the small file in the distributed cache, then load its rows into an in-memory Map.
Driver
When a job has to combine several related files, register the smaller one in the distributed cache so every mapper can read it locally.
package cn.tedu.join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
public class JoinDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinDriver.class);
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Order.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // When a job reads several related files, put the smaller one into the
        // distributed cache so every mapper can load it locally
        URI[] files = {URI.create("hdfs://hadoop01:9000/txt/union/product.txt")};
        job.setCacheFiles(files);
        // The large file is the normal job input
        FileInputFormat.addInputPath(job,
                new Path("hdfs://hadoop01:9000/txt/union/order.txt"));
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://hadoop01:9000/result/join"));
        job.waitForCompletion(true);
    }
}
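To submit the job, package the classes into a jar and run it from a node that can reach hadoop01 (the jar name join.jar is just a placeholder):
hadoop jar join.jar cn.tedu.join.JoinDriver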
Model class
Represents one row of the joined result set, like a row returned by a SQL query.
package cn.tedu.join;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Order implements Writable {

    private String orderId = "";
    private String date = "";
    private String proId = "";
    private int num;
    private String name = "";
    private double price;

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getProId() {
        return proId;
    }

    public void setProId(String proId) {
        this.proId = proId;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getPrice() {
        return price;
    }

    public void setPrice(double price) {
        this.price = price;
    }

    // Serialization: the write order here must match the read order in readFields
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeUTF(date);
        out.writeUTF(proId);
        out.writeInt(num);
        out.writeUTF(name);
        out.writeDouble(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.date = in.readUTF();
        this.proId = in.readUTF();
        this.num = in.readInt();
        this.name = in.readUTF();
        this.price = in.readDouble();
    }
}
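Since write() and readFields() must use exactly the same field order, a quick local round-trip check helps catch mismatches early. This is a minimal sketch using only the Order class above and plain java.io streams:

import java.io.*;

public class OrderRoundTrip {
    public static void main(String[] args) throws IOException {
        Order o = new Order();
        o.setProId("2");
        o.setName("huawei");
        o.setPrice(3999);
        // Serialize the object the same way Hadoop does between map and reduce
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        o.write(new DataOutputStream(bos));
        // Read it back into a fresh instance; the fields must come out unchanged
        Order copy = new Order();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy.getName() + " " + copy.getPrice()); // huawei 3999.0
    }
}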
Mapper
Hadoop calls setup() once for initialization before any map() calls.
A HashMap holds the small product table so each order row can be joined to its product via the foreign key proId.
package cn.tedu.join;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
public class JoinMapper extends Mapper<LongWritable, Text, Text, Order> {

    private Map<String, Order> map = new HashMap<>();

    // product.txt has to be available before any order line is processed,
    // so it is loaded in setup()
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Take the URI of product.txt back out of the distributed cache
        URI file = context.getCacheFiles()[0];
        // Connect to HDFS
        FileSystem fs = FileSystem.get(file, context.getConfiguration());
        // Open the file, which yields a byte input stream
        InputStream in = fs.open(new Path(file.toString()));
        // The data is line-oriented, so reading the raw byte stream would mean
        // finding the line breaks by hand; wrap it in a character stream that
        // can read whole lines instead
        // BufferedReader -> String, LineReader -> Text
        // Each line still needs parsing after it is read, so plain Strings are convenient
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        // Read the small table into the in-memory map, keyed by product id
        String line;
        while ((line = reader.readLine()) != null) {
            // e.g. 1 chuizi 3999
            String[] arr = line.split(" ");
            Order o = new Order();
            o.setProId(arr[0]);
            o.setName(arr[1]);
            o.setPrice(Double.parseDouble(arr[2]));
            map.put(o.getProId(), o);
        }
        // Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. 1003 20170710 2 40
        String[] arr = value.toString().split(" ");
        Order o = new Order();
        o.setOrderId(arr[0]);
        o.setDate(arr[1]);
        o.setProId(arr[2]);
        o.setNum(Integer.parseInt(arr[3]));
        // Join: look the product up by its id (assumes every proId exists in product.txt)
        Order product = map.get(o.getProId());
        o.setName(product.getName());
        o.setPrice(product.getPrice());
        context.write(new Text(o.getName()), o);
    }
}
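Because the join already happens on the map side, the reducer only ever uses num and price. A leaner variant (a sketch, not the original code) could emit the line total directly; the driver would then declare DoubleWritable as the map output value class and the reducer would just sum doubles:

// Alternative map() for JoinMapper (requires importing org.apache.hadoop.io.DoubleWritable
// and changing the Mapper output types to <LongWritable, Text, Text, DoubleWritable>)
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] arr = value.toString().split(" ");
    Order product = map.get(arr[2]); // arr[2] is the product id
    context.write(new Text(product.getName()),
            new DoubleWritable(Integer.parseInt(arr[3]) * product.getPrice()));
}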
Reducer
package cn.tedu.join;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class JoinReducer extends Reducer<Text, Order, Text, DoubleWritable> {

    @Override
    protected void reduce(Text key, Iterable<Order> values, Context context) throws IOException, InterruptedException {
        // Sum quantity * unit price over all orders of this product
        double sum = 0;
        for (Order val : values) {
            sum += val.getNum() * val.getPrice();
        }
        context.write(key, new DoubleWritable(sum));
    }
}