reduce端join算法实现

一、reduce端join算法实现

1、需求:
订单数据表t_order:

| id | date | pid | amount |
|------|----------|-------|--------|
| 1001 | 20150710 | P0001 | 2 |
| 1002 | 20150710 | P0001 | 3 |
| 1002 | 20150710 | P0002 | 3 |

商品信息表t_product

| id | pname | category_id | price |
|-------|------|-------------|-------|
| P0001 | 小米 | 1000 | 2000 |
| P0002 | 锤子 | P0001 | 3000 |

假如数据量巨大,两表的数据是以文件的形式存储在HDFS中,需要用mapreduce程序来实现以下SQL查询运算:

select a.id,a.date,b.pname,b.category_id,b.price from t_order a join t_product b on a.pid = b.id

2、实现机制:
将关联条件(订单表的pid、商品表的id)作为map输出的key,把两表中满足join条件的数据连同其来源文件的信息一起发往同一个reduce task,在reduce中完成数据的拼接

orders.txt
(原图缺失)文件每行是一条订单记录,字段以逗号分隔,顺序为 id,date,pid,amount,例如:1001,20150710,P0001,2
product.txt
(原图缺失)文件每行是一条商品记录,字段以逗号分隔,顺序为 id,pname,category_id,price,例如:P0001,小米,1000,2000

第一步:定义OrderBean

/**
 * Record exchanged between the map and reduce phases of the join.
 *
 * <p>A single instance can hold the order columns ({@code id}, {@code date},
 * {@code pid}, {@code amount}), the product columns ({@code name},
 * {@code categoryId}, {@code price}), or both once the two sides are combined.
 *
 * <p>Serialization note: {@link #write(DataOutput)} converts every field to
 * text before writing it, so a field that is {@code null} travels as the
 * four-character string {@code "null"} and is read back as that string by
 * {@link #readFields(DataInput)}.
 */
public class OrderBean implements Writable {

    // ---- order columns ----
    private String id;
    private String date;
    private String pid;
    private String amount;

    // ---- product columns ----
    private String name;
    private String categoryId;
    private String price;

    /** Creates a bean with every field left {@code null}. */
    public OrderBean() {
    }

    /** Creates a bean with all seven fields supplied by the caller. */
    public OrderBean(String id, String date, String pid, String amount, String name, String categoryId, String price) {
        this.id = id;
        this.date = date;
        this.pid = pid;
        this.amount = amount;
        this.name = name;
        this.categoryId = categoryId;
        this.price = price;
    }

    /**
     * Writes the seven fields as UTF strings in declaration order.
     * {@code String.valueOf} turns a {@code null} field into the text "null".
     */
    @Override
    public void write(DataOutput out) throws IOException {
        String[] columns = {id, date, pid, amount, name, categoryId, price};
        for (String column : columns) {
            out.writeUTF(String.valueOf(column));
        }
    }

    /** Reads the seven fields back in the same order {@link #write(DataOutput)} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        date = in.readUTF();
        pid = in.readUTF();
        amount = in.readUTF();
        name = in.readUTF();
        categoryId = in.readUTF();
        price = in.readUTF();
    }

    /** Returns the seven fields joined by tab characters; a {@code null} field prints as "null". */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(id).append("\t")
            .append(date).append("\t")
            .append(pid).append("\t")
            .append(amount).append("\t")
            .append(name).append("\t")
            .append(categoryId).append("\t")
            .append(price);
        return line.toString();
    }

    public String getId() {
        return this.id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getDate() {
        return this.date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPid() {
        return this.pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public String getAmount() {
        return this.amount;
    }

    public void setAmount(String amount) {
        this.amount = amount;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCategoryId() {
        return this.categoryId;
    }

    public void setCategoryId(String categoryId) {
        this.categoryId = categoryId;
    }

    public String getPrice() {
        return this.price;
    }

    public void setPrice(String price) {
        this.price = price;
    }
}

第二步:定义map类

/**
 * Map side of the reduce-side join.
 *
 * <p>Each input line is split on commas. The name of the file the current
 * split belongs to decides how the line is interpreted: a file whose name
 * contains "orders" yields order records keyed by the product id (column 2);
 * any other file is treated as the product file and yields product records
 * keyed by their own id (column 0). Both kinds are emitted as
 * {@link OrderBean} values so that they meet under the same key.
 *
 * <p>Lines with fewer than four columns (for example a blank trailing line)
 * are skipped and counted under the "ReduceJoin" / "MALFORMED_LINES" counter
 * instead of failing the task with an {@code ArrayIndexOutOfBoundsException}.
 */
public class mappers extends Mapper<LongWritable, Text, Text, OrderBean> {
    /** Number of comma-separated columns both input layouts are indexed up to. */
    private static final int REQUIRED_COLUMNS = 4;

    // Reused for every record; context.write is called before the next record mutates them.
    private final OrderBean order = new OrderBean();
    private final Text joinKey = new Text();

    /**
     * Emits one (join key, bean) pair per well-formed input line.
     *
     * @param key     input key supplied by the input format (not used)
     * @param value   one line of the input file
     * @param context task context used to find the source file and to emit output
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split(",");
        if (split.length < REQUIRED_COLUMNS) {
            // Both branches below read up to split[3]; skip anything shorter.
            context.getCounter("ReduceJoin", "MALFORMED_LINES").increment(1);
            return;
        }

        // The source file name tells the two tables apart.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();

        if (name.contains("orders")) {
            // Order record: id, date, pid, amount.
            order.setId(split[0]);
            order.setDate(split[1]);
            order.setPid(split[2]);
            order.setAmount(split[3]);
            // Clear the product side so nothing from an earlier record leaks through.
            order.setName(null);
            order.setCategoryId(null);
            order.setPrice(null);
            joinKey.set(split[2]);
        } else {
            // Product record: id, name, category id, price.
            // The order side (including id) is cleared; the reducer recognises
            // a product record by its missing id.
            order.setId(null);
            order.setDate(null);
            order.setPid(null);
            order.setAmount(null);
            order.setName(split[1]);
            order.setCategoryId(split[2]);
            order.setPrice(split[3]);
            joinKey.set(split[0]);
        }
        context.write(joinKey, order);
    }
}

第三步:自定义reduce类

/**
 * Reduce side of the join: receives every order record and product record
 * that share one product id and emits one joined row per order.
 *
 * <p>A value is treated as an order record when its id is present, i.e. not
 * {@code null} and not the text "null" (which is how a missing id arrives
 * after {@link OrderBean} serialization). Every other value is treated as the
 * product record for this key.
 *
 * <p>The join is an inner join, matching the SQL this job implements: a key
 * that has orders but no product, or a product but no orders, produces no
 * output.
 */
public class reducer extends Reducer<Text, OrderBean, OrderBean, NullWritable> {

    /**
     * Joins the records grouped under one product id.
     *
     * @param key     the product id shared by all values
     * @param values  order and product records for that id, in no guaranteed order
     * @param context task context used to emit the joined rows
     */
    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
        // Orders are buffered because the product record may arrive after them.
        // NOTE: this holds all orders of one product in memory at once.
        java.util.List<OrderBean> orders = new java.util.ArrayList<OrderBean>();

        boolean productSeen = false;
        String productName = null;
        String productCategoryId = null;
        String productPrice = null;

        for (OrderBean value : values) {
            if (isOrderRecord(value)) {
                // Copy the fields into a fresh bean: Hadoop reuses the same
                // value instance for each element of the iterable, so keeping
                // a reference to it would leave every entry pointing at the last record.
                orders.add(new OrderBean(value.getId(), value.getDate(), value.getPid(),
                        value.getAmount(), null, null, null));
            } else {
                productSeen = true;
                productName = value.getName();
                productCategoryId = value.getCategoryId();
                productPrice = value.getPrice();
            }
        }

        if (!productSeen) {
            // Inner join: orders that reference an unknown product are dropped.
            return;
        }

        // One output row per order, each completed with the product columns.
        for (OrderBean order : orders) {
            order.setName(productName);
            order.setCategoryId(productCategoryId);
            order.setPrice(productPrice);
            context.write(order, NullWritable.get());
        }
    }

    /** Returns true when the record carries an order id, i.e. it came from the orders file. */
    private static boolean isOrderRecord(OrderBean record) {
        String id = record.getId();
        return id != null && !"null".equals(id);
    }
}

第四步:开发main方法入口

/**
 * Driver that configures and submits the reduce-side join job.
 *
 * <p>Usage: {@code OrderJoinMain [inputPath [outputPath]]}. When an argument
 * is omitted the corresponding default below is used, so running without
 * arguments behaves exactly as before.
 */
public class OrderJoinMain {
    /** Input directory used when no first argument is given. */
    private static final String DEFAULT_INPUT_PATH = "file:///E:\\input";
    /** Output directory used when no second argument is given. */
    private static final String DEFAULT_OUTPUT_PATH = "file:///E:\\output";

    /**
     * Builds the job, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]})
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        // NOTE(review): the job is named "MapJoin" although it performs a
        // reduce-side join; kept unchanged in case anything looks the job up by name.
        Job job = Job.getInstance(new Configuration(), "MapJoin");

        job.setJarByClass(OrderJoinMain.class);
        job.setMapperClass(mappers.class);
        job.setReducerClass(reducer.class);

        // Map output: join key -> record tagged by which columns are filled in.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);

        // Final output: the joined record as key, no value.
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        TextInputFormat.addInputPath(job, new Path(inputPath));
        TextOutputFormat.setOutputPath(job, new Path(outputPath));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

缺点:这种方式中,join的操作是在reduce阶段完成,reduce端的处理压力太大,map节点的运算负载则很低,资源利用率不高,且在reduce阶段极易产生数据倾斜

解决方案: map端join实现方式

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值