Two Ways to Implement Join Logic with MapReduce

Requirement
Order table order.txt

id date pid amount
1001 20150710 P0001 2
1002 20150710 P0001 3
1002 20150710 P0001 3
Product table product.txt

id pname category_id price
P0001 小米5 1001 2
P0002 锤子T1 1000 3
P0003 锤子 1002 3
Suppose the data volume is huge and both tables are stored as files on HDFS; a MapReduce program is needed to implement the following SQL query:

select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id
Reduce-side join implementation
Mechanism:
Use the join key as the map output key and tag each record from the two tables with the file it came from, so that all records sharing a join key are sent to the same reduce task, where the records are stitched together.
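
For the sample data above, every record is keyed by pid and tagged with a flag indicating its source table ("0" = order, "1" = product), so the reduce task that owns key P0001 would receive roughly:

P0001 -> order 1001, 20150710, amount 2 (flag "0")
P0001 -> order 1002, 20150710, amount 3 (flag "0")
P0001 -> order 1002, 20150710, amount 3 (flag "0")
P0001 -> product 小米5, category 1001, price 2 (flag "1")

The reducer buffers the order records, keeps the lone product record, and copies the product fields onto every order record before writing it out.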

RJoin.java
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RJoin {

static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
    InfoBean bean = new InfoBean();
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
        String pid = "";

        // Decide which table the record comes from by its input file name
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();
        if (name.startsWith("order")) {
            // order record: id, date, pid, amount; tagged with flag "0"
            pid = fields[2];
            bean.set(fields[0], fields[1], pid, Integer.parseInt(fields[3]), "", "", -1, "0");
        } else {
            // product record: id, pname, category_id, price; tagged with flag "1"
            pid = fields[0];
            bean.set("", "", pid, -1, fields[1], fields[2], Float.parseFloat(fields[3]), "1");
        }
        k.set(pid);
        context.write(k, bean);
    }
}


static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
    @Override
    protected void reduce(Text pid, Iterable<InfoBean> values, Context context) throws IOException, InterruptedException {
        InfoBean pdBean = new InfoBean();
        List<InfoBean> orderBeans = new ArrayList<InfoBean>();

        for (InfoBean bean : values) {
            if ("1".equals(bean.getFlag())) { //产品
                try {
                    BeanUtils.copyProperties(pdBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                InfoBean orderBean = new InfoBean();
                try {
                    BeanUtils.copyProperties(orderBean, bean);
                    orderBeans.add(orderBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Copy the product fields onto each buffered order record to form the final result
        for (InfoBean bean : orderBeans) {
            bean.setPname(pdBean.getPname());
            bean.setCategory_id(pdBean.getCategory_id());
            bean.setPrice(pdBean.getPrice());

            context.write(bean, NullWritable.get());
        }
    }
}

public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    // Specify the local path of this program's jar
    job.setJarByClass(RJoin.class);

    //System.setProperty("hadoop.home.dir", "D:\\hadoop-2.6.5");

    // Specify the Mapper/Reducer classes this job uses
    job.setMapperClass(RJoinMapper.class);
    job.setReducerClass(RJoinReducer.class);

    // Specify the KV types of the mapper output
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(InfoBean.class);

    // Specify the KV types of the final (reducer) output
    job.setOutputKeyClass(InfoBean.class);
    job.setOutputValueClass(NullWritable.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean res = job.waitForCompletion(true);
    System.exit(res ? 0 : 1);
}

}
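
The job can then be submitted with both files in one input directory (the jar name and paths below are hypothetical):

hadoop jar rjoin.jar RJoin /rjoin/input /rjoin/output

For the sample data every order references P0001, so each output line joins an order with 小米5, e.g.:

order_id=1001, date=20150710, pid=P0001, amount=2, pname=小米5, category_id=1001, price=2.0
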
Drawback
With this approach the join is performed in the reduce phase, so the reduce side bears almost all of the processing load while the map nodes do very little work; resource utilization is poor, and the reduce phase is highly prone to data skew. With the sample data above, for instance, every order references P0001, so a single reduce task would receive all of the order records.

Map-side join implementation
Principle
This approach applies when one of the joined tables is small.
The small table can be distributed to every map node, so each map task can join the large-table records it reads locally and emit the final result directly, which greatly increases the parallelism of the join and speeds up processing.

Implementation notes
– Load the small table ahead of time in the mapper, then join in map()
– In a real deployment, either load it once from a database or ship it with the distributed cache, as the code below does
MapSideJoin.java

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapSideJoin {

static class MapSideJoinMapper extends Mapper<LongWritable, Text, InfoBean, NullWritable> {
    Map<String, InfoBean> pdInfoMap = new HashMap<String, InfoBean>();

    InfoBean bean = new InfoBean();

    /**
     * As the parent Mapper's source shows, setup() is called once before the
     * map task starts processing data, so it is a good place for initialization.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // product.txt is available in the task's working directory because it
        // is shipped there via job.addCacheFile() in main()
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("product.txt")));
        String line;

        while (StringUtils.isNotEmpty(line = br.readLine())) {
            InfoBean pdBean = new InfoBean();
            String[] fields = line.split("\t");
            pdBean.set("", "", fields[0], -1, fields[1], fields[2], Float.parseFloat(fields[3]), "1");
            pdInfoMap.put(fields[0], pdBean);
        }
        br.close();
    }

    // Since the complete product table is already held in memory, the join can be done right in map()
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
        String pid = fields[2];
        InfoBean productBean = pdInfoMap.get(pid);
        bean.setOrder_id(fields[0]);
        bean.setDate(fields[1]);
        bean.setPid(pid);
        bean.setAmount(Integer.parseInt(fields[3]));
        bean.setPname(productBean.getPname());
        bean.setCategory_id(productBean.getCategory_id());
        bean.setPrice(productBean.getPrice());
        context.write(bean, NullWritable.get());
    }
}

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    // Specify the local path of this program's jar
    job.setJarByClass(MapSideJoin.class);

    //System.setProperty("hadoop.home.dir", "D:\\hadoop-2.6.5");

    // Specify the Mapper class this job uses
    job.setMapperClass(MapSideJoinMapper.class);

    // Specify the KV types of the mapper output
    job.setMapOutputKeyClass(InfoBean.class);
    job.setMapOutputValueClass(NullWritable.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    //FileInputFormat.setInputPaths(job, new Path("hdfs://mini1/mapsidejoin/input"));
    //FileOutputFormat.setOutputPath(job, new Path("hdfs://mini1/mapsidejoin/output"));

    // The distributed cache can ship a file to every map task node's working directory:
    /* job.addArchiveToClassPath(archive); */// add an archive to the classpath of the task nodes
    /* job.addFileToClassPath(file); */// add a plain file to the classpath of the task nodes
    /* job.addCacheArchive(uri); */// ship an archive to the task nodes' working directory
    /* job.addCacheFile(uri) */// ship a plain file to the task nodes' working directory

    // Cache the product table file into every task node's working directory
    job.addCacheFile(new URI("hdfs://mini1/mapsidejoin/cache/product.txt"));

    // Map-side join needs no reduce phase, so set the number of reduce tasks to 0
    job.setNumReduceTasks(0);

    boolean res = job.waitForCompletion(true);
    System.exit(res ? 0 : 1);
}

}
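
Before submitting, product.txt has to be uploaded to the HDFS path passed to addCacheFile(), and only the order file goes into the input directory. A hypothetical run (jar name assumed):

hadoop fs -put product.txt hdfs://mini1/mapsidejoin/cache/
hadoop fs -put order.txt /mapsidejoin/input/
hadoop jar mapsidejoin.jar MapSideJoin /mapsidejoin/input /mapsidejoin/output
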
InfoBean.java

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class InfoBean implements Writable {
private String order_id;
private String date;
private String pid;
private int amount;
private String pname;
private String category_id;
private float price;
// flag = "0" means this object wraps an order record
// flag = "1" means this object wraps a product record
private String flag;

public void set(String order_id, String date, String pid, int amount, String pname,
        String category_id, float price, String flag) {
    this.order_id = order_id;
    this.date = date;
    this.pid = pid;
    this.amount = amount;
    this.pname = pname;
    this.category_id = category_id;
    this.price = price;
    this.flag = flag;
}

public String getOrder_id() {
    return order_id;
}

public void setOrder_id(String order_id) {
    this.order_id = order_id;
}

public String getDate() {
    return date;
}

public void setDate(String date) {
    this.date = date;
}

public String getPid() {
    return pid;
}

public void setPid(String pid) {
    this.pid = pid;
}

public int getAmount() {
    return amount;
}

public void setAmount(int amount) {
    this.amount = amount;
}

public String getPname() {
    return pname;
}

public void setPname(String pname) {
    this.pname = pname;
}

public String getCategory_id() {
    return category_id;
}

public void setCategory_id(String category_id) {
    this.category_id = category_id;
}

public float getPrice() {
    return price;
}

public void setPrice(float price) {
    this.price = price;
}

public String getFlag() {
    return flag;
}

public void setFlag(String flag) {
    this.flag = flag;
}

@Override
public void readFields(DataInput in) throws IOException {
    // Fields must be read back in exactly the order write() serialized them
    this.order_id = in.readUTF();
    this.date = in.readUTF();
    this.pid = in.readUTF();
    this.amount = in.readInt();
    this.pname = in.readUTF();
    this.category_id = in.readUTF();
    this.price = in.readFloat();
    this.flag = in.readUTF();
}

@Override
public void write(DataOutput out) throws IOException {      
    out.writeUTF(order_id);
    out.writeUTF(date);
    out.writeUTF(pid);
    out.writeInt(amount);
    out.writeUTF(pname);
    out.writeUTF(category_id);
    out.writeFloat(price);
    out.writeUTF(flag);
}

@Override
public String toString() {
    return "order_id=" + order_id + ", date=" + date + ", pid=" + pid + ", amount=" + amount + ", pname="
            + pname + ", category_id=" + category_id + ", price=" + price;
}

}
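
Because both jobs depend on InfoBean's Writable implementation, a quick local round-trip check can catch field-order mistakes early. This is a minimal sketch (InfoBeanRoundTrip is a hypothetical helper, not part of the original code; it assumes InfoBean is on the classpath):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class InfoBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        InfoBean original = new InfoBean();
        original.set("1001", "20150710", "P0001", 2, "小米5", "1001", 2f, "0");

        // Serialize exactly as Hadoop would during the shuffle
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bos));

        // Deserialize into a fresh bean; readFields() must mirror write()
        InfoBean copy = new InfoBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));

        System.out.println(copy); // should print the same field values as 'original'
    }
}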
