MapReduce之join算法案例实现

最新推荐文章于 2024-10-10 12:00:04 发布

peng_0129

最新推荐文章于 2024-10-10 12:00:04 发布

阅读量1k

点赞数

分类专栏： hadoop

本文链接：https://blog.csdn.net/peng_0129/article/details/80623132

版权

hadoop 专栏收录该内容

20 篇文章 0 订阅

订阅专栏

本文介绍了一个使用MapReduce实现大数据集Join的案例。通过将JOIN条件作为Map阶段的Key，将两个表的数据发送到同一个Reduce任务中，然后在Reduce阶段完成数据的合并。示例代码展示了如何处理订单数据表t_order和商品信息表t_product，并实现了基于文件存储的数据JOIN操作。

摘要由CSDN通过智能技术生成

1、需求：

订单数据表t_order：

id	date	pid	amount
1001	20150710	P0001	2
1002	20150710	P0001	3
1002	20150710	P0002	3

商品信息表t_product

id	pname	category_id	price
P0001	小米5	1000	2
P0002	锤子T1	1000	3

假如数据量巨大，两表的数据是以文件的形式存储在HDFS中，需要用mapreduce程序来实现以下SQL查询运算：

select a.id,a.date,b.pname,b.category_id,b.price from t_order a join t_product b on a.pid = b.id

2、实现机制：

通过将关联的条件作为map输出的key，将两表中满足join条件的数据连同其来源文件的信息一起发往同一个reducetask，在reduce中进行数据的串联

package cn.itcast.bigdata.mr.rjoin;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.itcast.bigdata.mr.flowsum.FlowBean;
import cn.itcast.bigdata.mr.flowsum.FlowCount;

/**
 * Reduce-side join of order records with product records on the product id.
 *
 * <p>The mapper tags every record with the table it came from (decided by the
 * input file name) and emits it under its product id. The reducer therefore
 * receives, for one product id, the product record together with all orders
 * that reference it, and writes one joined {@link InfoBean} per order.
 */
public class Rjoin {

    /** Flag stamped on records that come from the order file. */
    private static final String ORDER_FLAG = "0";

    /** Flag stamped on records that come from the product file. */
    private static final String PRODUCT_FLAG = "1";

    /** Column separator expected in both input files. */
    private static final String FIELD_SEPARATOR = ",";

    /**
     * Parses one input line into an {@link InfoBean} and emits it keyed by
     * product id.
     */
    static class RjoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {

        // Output objects are allocated once and overwritten for each input line.
        private final InfoBean bean = new InfoBean();
        private final Text text = new Text();

        /**
         * Emits {@code (pid, bean)} for one line of either input file.
         *
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line of text from the order or product file
         * @param context task context used to look up the split and to emit output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Blank lines (e.g. a trailing newline) carry no record; skipping
            // them avoids an ArrayIndexOutOfBoundsException below.
            if (line.trim().isEmpty()) {
                return;
            }

            // The name of the file behind the current split tells the two tables apart.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();

            String[] fields = line.split(FIELD_SEPARATOR);
            String pid;
            if (fileName.startsWith("order")) {
                // Order line: id, date, pid, amount
                pid = fields[2];
                bean.set(Integer.parseInt(fields[0]), fields[1], pid,
                        Integer.parseInt(fields[3]), "", 0, 0, ORDER_FLAG);
            } else {
                // Product line: pid, pname, category_id, price
                pid = fields[0];
                bean.set(0, "", pid, 0, fields[1],
                        Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), PRODUCT_FLAG);
            }

            // The join column is the map output key, so matching rows of both
            // tables reach the same reduce call.
            text.set(pid);
            context.write(text, bean);
        }
    }

    /**
     * Joins the product record of one product id with every order that
     * references it.
     */
    static class RjoinReduce extends Reducer<Text, InfoBean, InfoBean, NullWritable> {

        /**
         * Writes one joined bean per order of the given product id. Orders whose
         * product id has no product record are dropped (inner-join semantics).
         *
         * @param key     the product id shared by all values
         * @param values  the product record and the order records for that id
         * @param context task context used to emit the joined records
         * @throws IOException          if a bean cannot be copied or written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<InfoBean> values, Context context)
                throws IOException, InterruptedException {
            InfoBean pdBean = new InfoBean();
            boolean productFound = false;
            List<InfoBean> orderBeans = new ArrayList<InfoBean>();

            for (InfoBean bean : values) {
                // Each value is copied rather than stored directly: the framework
                // may hand back the same instance on every iteration.
                if (PRODUCT_FLAG.equals(bean.getFlag())) {
                    copy(pdBean, bean);
                    productFound = true;
                } else {
                    InfoBean odBean = new InfoBean();
                    copy(odBean, bean);
                    orderBeans.add(odBean);
                }
            }

            if (!productFound) {
                // No product row for this pid: an inner join yields nothing.
                return;
            }

            // Fill the product columns into every order and emit the joined row.
            for (InfoBean order : orderBeans) {
                order.setPname(pdBean.getPname());
                order.setCategory_id(pdBean.getCategory_id());
                order.setPrice(pdBean.getPrice());

                context.write(order, NullWritable.get());
            }
        }

        /**
         * Copies all bean properties from {@code source} to {@code target},
         * failing the task instead of silently continuing with a half-filled bean.
         */
        private static void copy(InfoBean target, InfoBean source) throws IOException {
            try {
                BeanUtils.copyProperties(target, source);
            } catch (IllegalAccessException e) {
                throw new IOException("Failed to copy InfoBean properties", e);
            } catch (InvocationTargetException e) {
                throw new IOException("Failed to copy InfoBean properties", e);
            }
        }
    }

    /**
     * Configures and submits the join job.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the
     *             output directory
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: Rjoin <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Locate the job jar through this class.
        job.setJarByClass(Rjoin.class);

        // Mapper and reducer implementations for this job.
        job.setMapperClass(RjoinMapper.class);
        job.setReducerClass(RjoinReduce.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);

        // Key/value types of the final output.
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Input directory holding both source files, and the output directory.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and block until it finishes.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }

}

package cn.itcast.bigdata.mr.rjoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class InfoBean implements Writable{

// Order-side columns (left at 0 / "" on product records).
private int id;
private String date;
// Product id: the join column, present on both record kinds.
private String pid;
private int amount;
// Product-side columns (left at "" / 0 on order records).
private String pname;
private int category_id;
private float price;
// Record origin: "0" for an order record, "1" for a product record.
private String flag;

// A flag of "0" marks a record from the order table, "1" a record from the product table.
/** Returns the origin flag of this record. */
public String getFlag() {
return flag;
}

/** Sets the origin flag of this record. */
public void setFlag(String flag) {
this.flag = flag;
}

/**
 * Creates an empty bean; fields are populated afterwards through
 * {@code set(...)}, the setters, or {@code readFields(...)}.
 */
public InfoBean() {}



/**
 * Overwrites every field of this bean in a single call.
 *
 * @param id          order id
 * @param date        order date
 * @param pid         product id (the join column)
 * @param amount      ordered quantity
 * @param pname       product name
 * @param category_id product category id
 * @param price       product price
 * @param flag        record origin: "0" for an order, "1" for a product
 */
public void set(int id, String date, String pid, int amount, String pname, int category_id, float price,String flag) {
    // Shared by both record kinds.
    this.pid = pid;
    this.flag = flag;

    // Order-side columns.
    this.id = id;
    this.date = date;
    this.amount = amount;

    // Product-side columns.
    this.pname = pname;
    this.category_id = category_id;
    this.price = price;
}

/** Returns the order id. */
public int getId() {
return id;
}

/** Sets the order id. */
public void setId(int id) {
this.id = id;
}

/** Returns the order date. */
public String getDate() {
return date;
}

/** Sets the order date. */
public void setDate(String date) {
this.date = date;
}

/** Returns the product id (the join column). */
public String getPid() {
return pid;
}

/** Sets the product id (the join column). */
public void setPid(String pid) {
this.pid = pid;
}

/** Returns the ordered quantity. */
public int getAmount() {
return amount;
}

/** Sets the ordered quantity. */
public void setAmount(int amount) {
this.amount = amount;
}

/** Returns the product name. */
public String getPname() {
return pname;
}

/** Sets the product name. */
public void setPname(String pname) {
this.pname = pname;
}

/** Returns the product category id. */
public int getCategory_id() {
return category_id;
}

/** Sets the product category id. */
public void setCategory_id(int category_id) {
this.category_id = category_id;
}


/** Returns the product price. */
public float getPrice() {
return price;
}

/** Sets the product price. */
public void setPrice(float price) {
this.price = price;
}

/**
 * Serializes this bean to {@code out}.
 *
 * <p>The origin flag is written as well: without it the flag is lost when the
 * bean is serialized between map and reduce, and the receiving side can no
 * longer tell order records from product records. The field order here must
 * match {@link #readFields(DataInput)} exactly.
 *
 * <p>NOTE(review): {@code writeUTF} rejects {@code null}, so all String fields
 * are expected to be non-null when this is called.
 *
 * @param out the stream to write the fields to
 * @throws IOException if writing to the stream fails
 */
@Override
public void write(DataOutput out) throws IOException {
    out.writeInt(id);
    out.writeUTF(date);
    out.writeUTF(pid);
    out.writeInt(amount);
    out.writeUTF(pname);
    out.writeInt(category_id);
    out.writeFloat(price);
    out.writeUTF(flag);
}

/**
 * Restores this bean from {@code in}, reading the fields in the same order
 * in which {@link #write(DataOutput)} wrote them, including the origin flag.
 *
 * @param in the stream to read the fields from
 * @throws IOException if reading from the stream fails
 */
@Override
public void readFields(DataInput in) throws IOException {
    this.id = in.readInt();
    this.date = in.readUTF();
    this.pid = in.readUTF();
    this.amount = in.readInt();
    this.pname = in.readUTF();
    this.category_id = in.readInt();
    this.price = in.readFloat();
    this.flag = in.readUTF();
}

@Override
public String toString() {
return "id=" + id + ", date=" + date + ", pid=" + pid + ", amount=" + amount + ", pname=" + pname
+ ", category_id=" + category_id + ", price=" + price + "";
}