假设有订单表t_order和t_product两张数据库表,现在需要进行关联查询。这样的sql语句很容易写
select a.id,a.date,b.name,b.category_id,b.price
from t_order a left out join t_product b
on a.pid = b.id
那么怎么样用mapreduce来实现呢?通过将关联的条件作为map输出的key(pid,商品id,订单表与商品表是多对一关系),将两表满足join条件的数据并携带数据所来源的文件信息,发往同一个reduce task,在reduce中进行数据的串联,最后写到一个文件中。
我们可以自定义一个Bean,里面封装了两张表的所有字段信息,最后写出到文件的时候将bean输出即可。
public class InfoBean implements Writable{
private String oid;//订单id
private String date;
private String pid;//商品id
private int amount;
private String pname;//商品名称
private int category_id;//商品类别
private int price;
//0:订单信息 1:商品信息
private int flag;
//序列化方法
public void write(DataOutput out) throws IOException {
out.writeUTF(oid);
out.writeUTF(date);
out.writeUTF(pid);
out.writeInt(amount);
out.writeUTF(pname);
out.writeInt(category_id);
out.writeInt(price);
out.writeInt(flag);
}
//反序列化方法
public void readFields(DataInput in) throws IOException {
this.oid=in.readUTF();
this.date=in.readUTF();
this.pid=in.readUTF();
this.amount=in.readInt();
this.pname=in.readUTF();
this.category_id=in.readInt();
this.price=in.readInt();
this.flag=in.readInt();
}
//初始化属性
public void setInfoBean(String oid, String date, String pid, int amount, String pname, int category_id, int price,
int flag) {
this.oid = oid;
this.date = date;
this.pid = pid;
this.amount = amount;
this.pname = pname;
this.category_id = category_id;
this.price = price;
this.flag = flag;
}
//重写toString方法,以便文件中展示
@Override
public String toString() {
return "oid=" + oid + ", date=" + date + ", pid=" + pid + ", amount=" + amount + ", pname=" + pname
+ ", category_id=" + category_id + ", price=" + price ;
}
//序列化必须有无参构造方法
public InfoBean() {
}
get、set
}
public class MapReduceJoin {
static class MapReduceJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean>{
InfoBean bean = new InfoBean();
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
//获取切片
FileSplit inputSplit = (FileSplit)context.getInputSplit();
//获取文件名,根据文件名进行不同的处理
//订单文件名order.txt,商品文件名product.txt
String name = inputSplit.getPath().getName();
String pid = "";
if(name.startsWith("order")){
String[] fields = line.split(",");
pid = fields[2];
//订单文件
bean.setInfoBean(fields[0], fields[1], pid,
Integer.parseInt(fields[3]), "", 0, 0, 0);
}else {
String[] fields = line.split(",");
pid = fields[0];
//商品文件
bean.setInfoBean("", "", pid,
0, fields[1], Integer.parseInt(fields[2]), Integer.parseInt(fields[3]), 1);
}
text.set(pid);
context.write(text, bean);
}
}
static class MapReduceJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable>{
//传来的一组信息中只会有一个商品bean可能有多个订单bean
@Override
protected void reduce(Text key, Iterable<InfoBean> beans,
Context context) throws IOException, InterruptedException {
//用来存储唯一的商品bean信息
InfoBean ProductBean = new InfoBean();
//用来存储多个订单信息的集合
List<InfoBean> orderList = new ArrayList<InfoBean>();
for (InfoBean bean : beans) {
//判断是订单还是商品信息
int flag = bean.getFlag();
if(flag == 1){
//商品信息,只能是一个
try {
BeanUtils.copyProperties(ProductBean, bean);
} catch (Exception e) {
e.printStackTrace();
}
}else {
//订单信息,可能是多个
InfoBean orderBean = new InfoBean();
try {
BeanUtils.copyProperties(orderBean, bean);
orderList.add(orderBean);
} catch (Exception e) {
e.printStackTrace();
}
}
}
for (InfoBean orderBean : orderList) {
//多个订单信息中设置好对应的商品信息
orderBean.setPname(ProductBean.getPname());
orderBean.setCategory_id(ProductBean.getCategory_id());
orderBean.setPrice(ProductBean.getPrice());
context.write(orderBean, NullWritable.get());
}
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MapReduceJoin.class);
//指定本业务job要使用的mapper,reducer业务类
job.setMapperClass(MapReduceJoinMapper.class);
job.setReducerClass(MapReduceJoinReducer.class);
//虽然指定了泛型,以防框架使用第三方的类型
//指定mapper输出数据的kv类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(InfoBean.class);
//指定最终输出的数据的kv类型
job.setOutputKeyClass(InfoBean.class);
job.setOutputValueClass(NullWritable.class);
//指定job输入原始文件所在位置
FileInputFormat.setInputPaths(job, new Path(args[0]));
//指定job输入原始文件所在位置
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//将job中配置的相关参数以及job所用的java类所在的jar包,提交给yarn去运行
boolean b = job.waitForCompletion(true);
System.exit(b?0:1);
}
}
测试:将工程打包上传到linux
创建输入目录
创建订单文件和商品文件编辑字段信息
将文件传到输入目录
执行程序
查看生产文件的内容
[root@mini1 ~]# hadoop fs -mkdir -p /mrjoin/input
[root@mini1 ~]# vi order.txt
1001,20170710,P0001,1
1002,20170710,P0001,3
1003,20170710,P0002,3
1003,20170710,P0002,4
[root@mini1 ~]# vi product.txt
P0001,xiaomi4,1000,2
P0002,iphone6s,1000,3
[root@mini1 ~]# hadoop fs -put order.txt product.txt /mrjoin/input/
[root@mini1 ~]# hadoop jar mrjoin.jar com.scu.hadoop.rjoin.MapReduceJoin /mrjoin/input /mrjoin/output
[root@mini1 ~]# hadoop fs -cat /mrjoin/output/part-r-00000
oid=1002, date=20170710, pid=P0001, amount=3, pname=xiaomi4, category_id=1000, price=2
oid=1001, date=20170710, pid=P0001, amount=1, pname=xiaomi4, category_id=1000, price=2
oid=1003, date=20170710, pid=P0002, amount=4, pname=iphone6s, category_id=1000, price=3
oid=1003, date=20170710, pid=P0002, amount=3, pname=iphone6s, category_id=1000, price=3