一、reduce端join算法实现
1、需求:
订单数据表t_order:
id | date | pid | amount |
---|---|---|---|
1001 | 20150710 | P0001 | 2 |
1002 | 20150710 | P0001 | 3 |
1002 | 20150710 | P0002 | 3 |
商品信息表t_product
id | pname | category_id | price |
---|---|---|---|
P0001 | 小米 | 1000 | 2000 |
P0002 | 锤子 | 1000 | 3000 |
假如数据量巨大,两表的数据以文件的形式存储在HDFS中,需要用mapreduce程序来实现以下SQL查询运算:
select a.id,a.date,b.pname,b.category_id,b.price from t_order a join t_product b on a.pid = b.id
2、实现机制:
将关联条件(商品id)作为map输出的key,把两表中满足join条件的数据连同其来源文件的信息一起发往同一个reduce task,然后在reduce中完成数据的串联
orders.txt
product.txt
第一步:定义OrderBean
/**
 * Writable value object holding the columns of one joined order/product row.
 *
 * <p>The first four fields ({@code id}, {@code date}, {@code pid}, {@code amount})
 * come from the order table; the remaining three ({@code name},
 * {@code categoryId}, {@code price}) come from the product table.
 *
 * <p>Serialization note: every field is written with {@code writeUTF} after being
 * converted to text, so a {@code null} field goes over the wire as the literal
 * string {@code "null"} and is read back as that string, not as {@code null}.
 */
public class OrderBean implements Writable {

    private static final char SEPARATOR = '\t';

    // Columns taken from the order table.
    private String id;
    private String date;
    private String pid;
    private String amount;

    // Columns taken from the product table.
    private String name;
    private String categoryId;
    private String price;

    /** No-arg constructor; leaves every field {@code null}. */
    public OrderBean() {
    }

    /** Creates a bean with all seven columns supplied up front. */
    public OrderBean(String id, String date, String pid, String amount, String name, String categoryId, String price) {
        this.id = id;
        this.date = date;
        this.pid = pid;
        this.amount = amount;
        this.name = name;
        this.categoryId = categoryId;
        this.price = price;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public String getAmount() {
        return amount;
    }

    public void setAmount(String amount) {
        this.amount = amount;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCategoryId() {
        return categoryId;
    }

    public void setCategoryId(String categoryId) {
        this.categoryId = categoryId;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    /**
     * Writes the seven fields in declaration order. {@code String.valueOf}
     * turns a {@code null} field into the text {@code "null"}, because
     * {@code writeUTF} cannot accept {@code null}.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(String.valueOf(id));
        out.writeUTF(String.valueOf(date));
        out.writeUTF(String.valueOf(pid));
        out.writeUTF(String.valueOf(amount));
        out.writeUTF(String.valueOf(name));
        out.writeUTF(String.valueOf(categoryId));
        out.writeUTF(String.valueOf(price));
    }

    /** Reads the seven fields back in exactly the order {@link #write} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        date = in.readUTF();
        pid = in.readUTF();
        amount = in.readUTF();
        name = in.readUTF();
        categoryId = in.readUTF();
        price = in.readUTF();
    }

    /** Renders the seven fields as one tab-separated line; {@code null} prints as "null". */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(id).append(SEPARATOR)
            .append(date).append(SEPARATOR)
            .append(pid).append(SEPARATOR)
            .append(amount).append(SEPARATOR)
            .append(name).append(SEPARATOR)
            .append(categoryId).append(SEPARATOR)
            .append(price);
        return line.toString();
    }
}
第二步:定义map类
/**
 * Map side of the reduce-side join.
 *
 * <p>Each input line is split on commas and tagged by the name of the file it
 * came from: lines from a file whose name contains "orders" are treated as
 * order rows and keyed by their product id (third column); every other line is
 * treated as a product row and keyed by its own id (first column). Both kinds
 * therefore reach the same reduce call for a given product id.
 */
public class mappers extends Mapper<LongWritable, Text, Text, OrderBean> {

    /** Both input layouts have four comma-separated columns. */
    private static final int EXPECTED_COLUMNS = 4;

    /**
     * @param key     offset of the line (unused)
     * @param value   one comma-separated input line
     * @param context used to find the source file name and to emit the record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split(",");

        // Blank or truncated lines (e.g. a trailing empty line) would otherwise
        // fail with ArrayIndexOutOfBoundsException and kill the task; count and skip them.
        if (split.length < EXPECTED_COLUMNS) {
            context.getCounter("join", "malformedRecords").increment(1);
            return;
        }

        // The file name is what distinguishes the two input tables.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();

        // A fresh bean per record, so fields belonging to the other table are
        // always null and can never carry over from a previously mapped line.
        OrderBean order = new OrderBean();
        if (name.contains("orders")) {
            // Order row: id, date, pid, amount
            order.setId(split[0]);
            order.setDate(split[1]);
            order.setPid(split[2]);
            order.setAmount(split[3]);
            context.write(new Text(split[2]), order);
        } else {
            // Product row: id, pname, category_id, price
            order.setName(split[1]);
            order.setCategoryId(split[2]);
            order.setPrice(split[3]);
            context.write(new Text(split[0]), order);
        }
    }
}
第三步:自定义reduce类
/**
 * Reduce side of the join: receives every order row and product row that share
 * one product id and emits the joined rows.
 *
 * <p>Implements inner-join semantics: one output row is written for each
 * (order, product) pair under the key. A key that has only orders or only
 * products produces no output.
 */
public class reducer extends Reducer<Text,OrderBean,OrderBean, NullWritable> {

    /**
     * @param key     the product id shared by all values
     * @param values  order rows and product rows for that product id, in no guaranteed order
     * @param context used to emit the joined rows
     */
    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
        // Fully qualified java.util names are used so this block needs no extra import.
        java.util.List<OrderBean> orders = new java.util.ArrayList<OrderBean>();
        java.util.List<OrderBean> products = new java.util.ArrayList<OrderBean>();

        for (OrderBean value : values) {
            // Hadoop reuses the same value instance across iterations, so each
            // record is copied into a new bean before it is buffered.
            if (isOrderRecord(value)) {
                orders.add(new OrderBean(value.getId(), value.getDate(), value.getPid(), value.getAmount(),
                        null, null, null));
            } else {
                products.add(new OrderBean(null, null, null, null,
                        value.getName(), value.getCategoryId(), value.getPrice()));
            }
        }

        // Emit one row per order/product pair. Several orders for the same
        // product must each produce their own row rather than overwrite each other.
        for (OrderBean order : orders) {
            for (OrderBean product : products) {
                OrderBean joined = new OrderBean(
                        order.getId(), order.getDate(), order.getPid(), order.getAmount(),
                        product.getName(), product.getCategoryId(), product.getPrice());
                context.write(joined, NullWritable.get());
            }
        }
    }

    /**
     * Tells order rows from product rows. Product rows never have an id set by
     * the mapper, and OrderBean serializes an unset field as the text "null".
     */
    private static boolean isOrderRecord(OrderBean value) {
        String id = value.getId();
        return id != null && !"null".equals(id);
    }
}
第四步:开发main方法入口
/**
 * Driver for the reduce-side join job: wires up the mapper, the reducer and
 * their key/value types, then runs the job and exits with its status.
 */
public class OrderJoinMain {

    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "file:///E:\\input";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "file:///E:\\output";

    /**
     * @param args optional: {@code args[0]} is the input path, {@code args[1]}
     *             the output path; each falls back to its default when omitted
     * @throws Exception if the job cannot be configured or run
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // NOTE(review): the job is named "MapJoin" although it performs a
        // reduce-side join; the name is left unchanged here.
        Job job = Job.getInstance(new Configuration(), "MapJoin");
        job.setJarByClass(OrderJoinMain.class);

        job.setMapperClass(mappers.class);
        job.setReducerClass(reducer.class);

        // Map output: product id -> tagged order/product bean.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);

        // Final output: the joined bean as key, no value.
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        TextInputFormat.addInputPath(job, new Path(inputPath));
        TextOutputFormat.setOutputPath(job, new Path(outputPath));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
缺点:这种方式中,join的操作是在reduce阶段完成,reduce端的处理压力太大,map节点的运算负载则很低,资源利用率不高,且在reduce阶段极易产生数据倾斜
解决方案: map端join实现方式