MapReduce Join关联
1、Reduce join(合并)
原理
Map端的主要工作:为来自不同表(文件)的key/value对打标签以区别不同来源的记录。然后用连接字段作为key,其余部分和新加的标志作为value,最后进行输出。
Reduce端的主要工作:在reduce端以连接字段作为key的分组已经完成,我们只需要在每一个分组当中将那些来源于不同文件的记录(在map阶段已经打标志)分开,最后进行合并就ok了。
2、Reduce join案例实操
(1)需求
订单数据表t_order
order.txt
1001 01 1
1002 02 2
1003 03 3
1004 01 4
1005 02 5
1006 03 6
商品信息表t_product
pd.txt
01 小米
02 华为
03 格力
sql:select * from t_order o join t_product p on o.pid=p.pid; 即等值内连接:对两表的笛卡尔积按join条件 t_order.pid=t_product.pid 进行过滤,只保留pid相同的记录组合。
将商品信息表中数据根据商品pid合并到订单数据表中。
最终数据形式
通过将关联条件作为map输出的key,将两表满足join条件的数据(包含数据来源于哪一个文件的标识),发往同一个reduce task,在reduce中进行数据的串联,如图所示:
reduce端表合并
(2)代码实现
a)创建商品和订单合并后的bean类
package com.bigdata.reducejoin;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
//1001 01 1
//01 小米
//02 华为
//03 格力
public class OrderBean implements Writable{
//订单id
private int oid;
//商品id
private int pid;
//商品数量
private int amount;
//商品名称
private String pname;
//bean的标记 0:标记该bean来自于order.txt 1:来自于pd.txt
private String flag ;
public OrderBean() {
super();
// TODO Auto-generated constructor stub
}
public OrderBean(int oid, int pid, int amount, String pname, String flag) {
super();
this.oid = oid;
this.pid = pid;
this.amount = amount;
this.pname = pname;
this.flag = flag;
}
public int getOid() {
return oid;
}
public void setOid(int oid) {
this.oid = oid;
}
public int getPid() {
return pid;
}
public void setPid(int pid) {
this.pid = pid;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
public void write(DataOutput out) throws IOException {
out.writeInt(oid);
out.writeInt(pid);
out.writeInt(amount);
//注意:字符串的序列化写法
out.writeUTF(pname);
out.writeUTF(flag);
}
public void readFields(DataInput in) throws IOException {
oid = in.readInt();
pid = in.readInt();
amount = in.readInt();
pname = in.readUTF();
flag = in.readUTF();
}
@Override
public String toString() {
return this.oid+"\t"+this.pname+"\t"+this.amount;
}
}
b)编写Mapper程序
package com.bigdata.reducejoin;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Map side of the reduce join. Each input line is turned into an
 * {@link OrderBean} tagged with its source file and emitted under the product
 * id (pid), so that records sharing a pid are grouped for the reducer.
 */
public class ReduceJoinMapper extends Mapper<LongWritable, Text, IntWritable, OrderBean> {

    // Output key/value holders, overwritten on every map() call.
    IntWritable k = new IntWritable();
    OrderBean v = new OrderBean();

    /**
     * Parses one tab-separated line from order.txt or pd.txt and writes it out
     * keyed by pid.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of text from the current split
     * @param context used to discover the source file name and to emit output
     * @throws IOException          if the line has too few fields or the write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // A blank line (for example a trailing newline at the end of the file)
        // carries no record; skip it instead of failing on split[...] below.
        if (line.trim().isEmpty()) {
            return;
        }

        // The file name of the current split decides how the line is parsed.
        FileSplit fs = (FileSplit) context.getInputSplit();
        String name = fs.getPath().getName();

        String[] split = line.split("\t");
        if (name.startsWith("order")) {
            // order.txt: oid <tab> pid <tab> amount, e.g. "1001 01 1"
            if (split.length < 3) {
                throw new IOException("Malformed order line in " + name + ": " + line);
            }
            v.setOid(Integer.parseInt(split[0]));
            v.setPid(Integer.parseInt(split[1]));
            v.setAmount(Integer.parseInt(split[2]));
            v.setPname("");
            v.setFlag("0"); // tag: order record
        } else {
            // pd.txt: pid <tab> pname
            if (split.length < 2) {
                throw new IOException("Malformed product line in " + name + ": " + line);
            }
            v.setPid(Integer.parseInt(split[0]));
            v.setPname(split[1]);
            v.setOid(0);
            v.setAmount(0);
            v.setFlag("1"); // tag: product record
        }

        // Emit <pid, bean>.
        k.set(v.getPid());
        context.write(k, v);
    }
}
c)编写Reducer程序
package com.bigdata.reducejoin;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reduce side of the join. For one pid it receives the order beans and the
 * product bean, copies the product name onto every order, and writes
 * {@code <order bean, pid>}.
 */
public class ReduceJoinReduce extends Reducer<IntWritable, OrderBean, OrderBean, IntWritable> {

    /**
     * Joins all orders of one pid with that pid's product name.
     *
     * <p>Example group for pid 01: {@code <01, 1001 01 1>}, {@code <01, 1004 01 4>},
     * {@code <01, 01 小米>}.
     *
     * @param key     the pid shared by every value in this group
     * @param values  order beans (flag "0") and product beans (any other flag)
     * @param context used to emit the joined records
     * @throws IOException          if a write fails
     * @throws InterruptedException if a write is interrupted
     */
    @Override
    protected void reduce(IntWritable key, Iterable<OrderBean> values, Context context)
            throws IOException, InterruptedException {
        List<OrderBean> orders = new ArrayList<OrderBean>();
        String pname = null;

        for (OrderBean bean : values) {
            if ("0".equals(bean.getFlag())) {
                // Hadoop reuses the value object while iterating, so each order
                // is copied field by field before being kept. A direct copy also
                // cannot fail, unlike the reflective copy it replaces, whose
                // errors were only printed and the record silently lost.
                orders.add(new OrderBean(bean.getOid(), bean.getPid(), bean.getAmount(),
                        bean.getPname(), bean.getFlag()));
            } else {
                // Product record: only its name is needed for the join.
                pname = bean.getPname();
            }
        }

        // Inner-join semantics: with no product record for this pid there is
        // nothing to join against, so no row is written (previously such orders
        // were written with the literal name "null").
        if (pname == null) {
            return;
        }

        for (OrderBean order : orders) {
            order.setPname(pname);
            context.write(order, key);
        }
    }
}
d)编写Driver程序
package com.bigdata.reducejoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Configures and submits the reduce-side join job.
 *
 * <p>Usage: {@code ReduceJoinDriver <input path> <output path>}
 */
public class ReduceJoinDriver {

    /**
     * Builds the job, removes a pre-existing output directory, runs the job and
     * prints whether it succeeded.
     *
     * @param args {@code args[0]} input path, {@code args[1]} output path
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            // Fail with a readable message instead of ArrayIndexOutOfBoundsException.
            System.err.println("Usage: ReduceJoinDriver <input path> <output path>");
            System.exit(2);
        }

        // 1. Create the configuration.
        Configuration conf = new Configuration();
        // 2. Create the job from the configuration.
        Job job = Job.getInstance(conf);
        // 3. Locate the job jar through this class.
        job.setJarByClass(ReduceJoinDriver.class);
        // 4. Mapper and reducer classes.
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReduce.class);
        // 5. Map output key/value types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(OrderBean.class);
        // 6. Final output key/value types.
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(IntWritable.class);

        // 7. Input and output paths. An existing output directory is deleted
        //    recursively so the job can be re-run.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // 8. Submit and wait for completion.
        boolean waitForCompletion = job.waitForCompletion(true);
        System.out.println(waitForCompletion);
        if (!waitForCompletion) {
            // Report failure to the calling shell or scheduler.
            System.exit(1);
        }
    }
}
运行程序查看结果(各列依次为:订单id、商品名称、数量、商品pid;同一pid内各行的先后顺序不保证):
1001 小米 1 1
1004 小米 4 1
1002 华为 2 2
1005 华为 5 2
1003 格力 3 3
1006 格力 6 3
缺点:这种方式中,合并的操作是在reduce阶段完成,reduce端的处理压力太大,map节点的运算负载则很低,资源利用率不高,且在reduce阶段极易产生数据倾斜(同一个reduce接收到的数据量很大)。
解决方案: map端实现数据合并
3、Map join(合并)
(1)使用场景
一张表十分小、一张表很大。
(2)解决方案
在map端缓存多张表,提前处理业务逻辑,这样增加map端业务,减少reduce端数据的压力,尽可能的减少数据倾斜。
(3)具体办法:采用distributedcache(分布式缓存)
a)在mapper的setup阶段,将文件读取到缓存集合中。
b)在驱动函数中加载缓存。
job.addCacheFile(new URI("file:/e:/mapjoincache/pd.txt"));// 缓存普通文件到task运行节点,如图所示:
map端表合并
4、Map join案例实操
(1)分析
适用于关联表中有小表的情形。
可以将小表分发到所有的map节点,这样,map节点就可以在本地对自己所读到的大表数据进行合并并输出最终结果,可以大大提高合并操作的并发度,加快处理速度。
(2)实现代码
a)先在Driver模块中添加缓存文件
package com.bigdata.mapjoin;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.bigdata.reducejoin.OrderBean;
/**
 * Configures and submits the map-side join job. The job has no reduce phase;
 * the small product table is shipped to every map task as a cache file.
 *
 * <p>Usage: {@code MapJoinDriver <input path> <output path> [cache file URI]}
 */
public class MapJoinDriver {

    /**
     * Builds the map-only job, registers the product file as a cache file,
     * removes a pre-existing output directory, runs the job and prints whether
     * it succeeded.
     *
     * @param args {@code args[0]} input path, {@code args[1]} output path,
     *             optional {@code args[2]} URI of the product file to cache
     *             (defaults to {@code file:///D:/test/mapjoin/pd/pd.txt})
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            // Fail with a readable message instead of ArrayIndexOutOfBoundsException.
            System.err.println("Usage: MapJoinDriver <input path> <output path> [cache file URI]");
            System.exit(2);
        }

        // 1. Create the configuration.
        Configuration conf = new Configuration();
        // 2. Create the job from the configuration.
        Job job = Job.getInstance(conf);
        // 3. Locate the job jar through this class.
        job.setJarByClass(MapJoinDriver.class);
        // 4. Mapper class (there is no reducer).
        job.setMapperClass(MapJoinMapper.class);
        // 5. Map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 6. Final output key/value types. With zero reducers the map output is
        //    the job output, so these are the same types the mapper emits.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // ---- map-join specific settings ----
        // Cache the small table on the task nodes. For HDFS use a URI such as
        // hdfs://hadoop101:9000/test/mapjoin/pd/pd.txt
        // NOTE(review): MapJoinMapper opens the file by the fixed name "pd.txt",
        // so a custom URI should still end in pd.txt.
        String cacheUri = args.length > 2 ? args[2] : "file:///D:/test/mapjoin/pd/pd.txt";
        job.addCacheFile(new URI(cacheUri));
        // No reduce phase, so the number of reduce tasks is 0.
        job.setNumReduceTasks(0);
        // ------------------------------------

        // 7. Input and output paths. An existing output directory is deleted
        //    recursively so the job can be re-run.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // 8. Submit and wait for completion.
        boolean waitForCompletion = job.waitForCompletion(true);
        System.out.println(waitForCompletion);
        if (!waitForCompletion) {
            // Report failure to the calling shell or scheduler.
            System.exit(1);
        }
    }
}
b)读取缓存的文件数据
package com.bigdata.mapjoin;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map-side join: loads the product table into memory in {@link #setup} and, for
 * every order line, appends the matching product name.
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** Product lookup table: pid (as it appears in the file) -> product name. */
    Map<String, String> pdMap = new HashMap<String, String>();

    /** Output key holder, overwritten on every map() call. */
    private final Text outKey = new Text();

    /**
     * Reads the tab-separated product file "pd.txt" (pid, pname) into
     * {@link #pdMap} before any record is mapped.
     *
     * @param context task context (unused)
     * @throws IOException          if the file cannot be read or a line has fewer than two fields
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        BufferedReader r = new BufferedReader(
                new InputStreamReader(new FileInputStream("pd.txt"), "UTF-8"));
        try {
            String line;
            // Read to end of file; a blank line is skipped rather than treated
            // as the end, so products listed after it are not lost.
            while ((line = r.readLine()) != null) {
                if (StringUtils.isBlank(line)) {
                    continue;
                }
                String[] split = line.split("\t");
                if (split.length < 2) {
                    throw new IOException("Malformed product line in pd.txt: " + line);
                }
                pdMap.put(split[0], split[1]);
            }
        } finally {
            // Close the reader even when reading or parsing fails.
            r.close();
        }
    }

    /**
     * Appends the product name to one order line and emits the result as the key.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one tab-separated order line: oid, pid, amount
     * @param context used to emit the joined line
     * @throws IOException          if the line has fewer than two fields or the write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        if (StringUtils.isBlank(line)) {
            return; // nothing to join on an empty line
        }

        String[] split = line.split("\t");
        if (split.length < 2) {
            throw new IOException("Malformed order line: " + line);
        }

        // Look the order's pid up in the cached product table.
        String pname = pdMap.get(split[1]);
        if (pname == null) {
            // Inner-join semantics: an order without a matching product is
            // dropped instead of being written with the literal text "null".
            return;
        }

        outKey.set(line + "\t" + pname);
        context.write(outKey, NullWritable.get());
    }
}
谢谢观看,有问题感谢指正