13 - MapReduce之Reduce join以及map join分布式缓存

爱上口袋的天空

已于 2024-04-21 09:42:25 修改

阅读量690

点赞数

分类专栏： # hadoop3.x 文章标签： hadoop

于 2019-08-07 21:51:50 首次发布

本文链接：https://blog.csdn.net/K_520_W/article/details/98517826

版权

hadoop3.x 专栏收录该内容

30 篇文章 2 订阅

订阅专栏

一：Reduce join

简介：
⑴原理：
  Map 端的主要工作：
为来自不同表(文件)的 key/value 对打标签以区别不同来源的记录然后用连接字段作为 key，
  其余部分和新加的标志作为 value，最后进行输出。
Reduce 端的主要工作：
在 reduce 端以连接字段作为 key 的分组已经完成，我们只需要在每一个分组当中将那些来
  源于不同文件的记录(在 map 阶段已经打标志)分开，最后进行合并就 ok 了。
⑵缺点：
这种方式的缺点很明显：所有记录都必须经过 shuffle 阶段从 map 端传输到 reduce 端，会产生大量的数据传输，效率很低。
案例：reduce 端表合并（数据倾斜）
⑴需求：
  订单数据表 t_order，在order.txt中：

商品信息表 t_product，在t_product.txt文件中。

将商品信息表中数据根据商品 pid 合并到订单数据表中。最终数据如下：

代码实现
⑴数据准备

⑵思路分析：
a：首先订单表和产品表是多对一的关系，因为每个pid在产品表中是唯一的，而在订单表中可能很多个订单都包含
这个产品。
b：我们需要定义一个bean对象，如下：

c：我们以pId作为Mapper阶段输出的key，其它内容为value
mapper中整合后的数据如下：

d：在reducer阶段，我们以key（pId）作为条件，一个pId在产品表中只有一条数据，那么就好办了，直接查出产品
表中对应的名称，将order表中的名称填充上即可输出。
⑶TableBean对象代码如下：

package com.kgf.mapreduce.reducerJoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Value object for the reduce-side join. One instance holds a single row taken
 * from either the order table or the product table; {@code tableType} records
 * which one.
 *
 * <p>String fields that are still {@code null} are serialized as the empty
 * string, because {@link DataOutput#writeUTF(String)} throws
 * {@link NullPointerException} when handed {@code null}. A bean created with
 * the no-arg constructor and only partially populated can therefore still be
 * written.
 *
 * @author KGF
 */
public class TableBean implements Writable {

	/** Order ID. */
	private String orderId;
	/** Product ID. */
	private String pId;
	/** Product quantity. */
	private int amount;
	/** Product name. */
	private String pName;
	/** Table type: "0" = order table, "1" = product table. */
	private String tableType;

	/**
	 * Creates a fully populated bean.
	 *
	 * @param orderId   order ID
	 * @param pId       product ID
	 * @param amount    product quantity
	 * @param pName     product name
	 * @param tableType "0" for an order row, "1" for a product row
	 */
	public TableBean(String orderId, String pId, int amount, String pName, String tableType) {
		super();
		this.orderId = orderId;
		this.pId = pId;
		this.amount = amount;
		this.pName = pName;
		this.tableType = tableType;
	}

	/** Creates an empty bean; all string fields start out {@code null}. */
	public TableBean() {
		super();
	}

	/**
	 * Deserializes the bean. Fields are read in exactly the order in which
	 * {@link #write(DataOutput)} emits them.
	 *
	 * @param di source to read from
	 * @throws IOException if reading fails
	 */
	@Override
	public void readFields(DataInput di) throws IOException {
		this.orderId = di.readUTF();
		this.pId = di.readUTF();
		this.amount = di.readInt();
		this.pName = di.readUTF();
		this.tableType = di.readUTF();
	}

	/**
	 * Serializes the bean. A {@code null} string field is written as "" so that
	 * {@code writeUTF} does not throw.
	 *
	 * @param dot sink to write to
	 * @throws IOException if writing fails
	 */
	@Override
	public void write(DataOutput dot) throws IOException {
		dot.writeUTF(nullToEmpty(orderId));
		dot.writeUTF(nullToEmpty(pId));
		dot.writeInt(amount);
		dot.writeUTF(nullToEmpty(pName));
		dot.writeUTF(nullToEmpty(tableType));
	}

	/** Returns {@code s}, or the empty string when {@code s} is {@code null}. */
	private static String nullToEmpty(String s) {
		return s == null ? "" : s;
	}

	/** Renders the joined row as {@code orderId<TAB>pName<TAB>amount}. */
	@Override
	public String toString() {
		return orderId+"\t"+ pName + "\t"+amount;
	}

	public String getOrderId() {
		return orderId;
	}

	public void setOrderId(String orderId) {
		this.orderId = orderId;
	}

	public String getpId() {
		return pId;
	}

	public void setpId(String pId) {
		this.pId = pId;
	}

	public int getAmount() {
		return amount;
	}

	public void setAmount(int amount) {
		this.amount = amount;
	}

	public String getpName() {
		return pName;
	}

	public void setpName(String pName) {
		this.pName = pName;
	}

	public String getTableType() {
		return tableType;
	}

	public void setTableType(String tableType) {
		this.tableType = tableType;
	}
}

⑷TableMapper类代码：

package com.kgf.mapreduce.reducerJoin;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Mapper for the reduce-side join. For every input line it:
 * <ol>
 *   <li>determines which table the line belongs to from the input file name;</li>
 *   <li>splits the line on tabs, keeps the needed columns and tags the record
 *       with its source table;</li>
 *   <li>emits the record keyed by product ID.</li>
 * </ol>
 *
 * <p>Blank lines are skipped. A line with too few columns fails the task with
 * an {@link IOException} that names the file and shows the offending line.
 *
 * @author KGF
 */
public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

	/** Output value, reused across calls; every field is overwritten before each write. */
	TableBean v = new TableBean();

	/** Output key (the product ID), reused across calls. */
	Text k = new Text();

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// 1: name of the file this split comes from
		FileSplit splitFile = (FileSplit) context.getInputSplit();
		String fileName = splitFile.getPath().getName();
		// 2: the current line
		String line = value.toString();
		// A blank line (e.g. a trailing newline in the data file) carries no record.
		if (line.trim().isEmpty()) {
			return;
		}
		// 3: split the line and fill the bean according to the source table
		String[] files = line.split("\t");
		if(fileName.startsWith("order")) {
			// order table: orderId, pId, amount
			requireFields(files, 3, fileName, line);
			v.setOrderId(files[0]);
			v.setpId(files[1]);
			v.setAmount(Integer.parseInt(files[2]));
			v.setpName("");
			v.setTableType("0");
			k.set(files[1]);
		}else {
			// product table: pId, pName
			requireFields(files, 2, fileName, line);
			v.setOrderId("");
			v.setpId(files[0]);
			v.setAmount(0);
			v.setpName(files[1]);
			v.setTableType("1");
			k.set(files[0]);
		}
		// 4: emit
		context.write(k,v);
	}

	/**
	 * Fails with a descriptive message when a row has fewer columns than needed,
	 * instead of letting an ArrayIndexOutOfBoundsException surface later.
	 */
	private static void requireFields(String[] fields, int expected, String fileName, String line)
			throws IOException {
		if (fields.length < expected) {
			throw new IOException("Malformed row in " + fileName + ": expected at least "
					+ expected + " tab-separated fields but got " + fields.length + ": [" + line + "]");
		}
	}

}

⑸TableReducer类代码：

package com.kgf.mapreduce.reducerJoin;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer for the reduce-side join. Each call receives every record that
 * shares one product ID, fills the product name into the order records and
 * writes them out.
 *
 * @author KGF
 */
public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

	@Override
	protected void reduce(Text key, Iterable<TableBean> values,Context context)
			throws IOException, InterruptedException {
		// Order records for this product ID.
		List<TableBean> orders = new ArrayList<TableBean>();
		// Name taken from the product record. Stays null when the group has no
		// product record, in which case TableBean.toString() renders it as "null".
		String productName = null;
		for (TableBean val : values) {
			if("0".equals(val.getTableType())) {
				// Order table. Store a copy, never val itself: as the original
				// author observed, keeping val leaves every list entry showing
				// the last record's data.
				orders.add(copyOf(val));
			}else {
				// Product table: only the name is needed for the join.
				productName = val.getpName();
			}
		}
		// Join: stamp the product name onto each order and emit it.
		for (TableBean order : orders) {
			order.setpName(productName);
			context.write(order, NullWritable.get());
		}
	}

	/**
	 * Returns an independent copy of {@code src}. A plain constructor call
	 * replaces the reflective bean copy, so there are no reflection exceptions
	 * to catch and no way for a record to be dropped silently.
	 */
	private static TableBean copyOf(TableBean src) {
		return new TableBean(src.getOrderId(), src.getpId(), src.getAmount(),
				src.getpName(), src.getTableType());
	}

}

⑹TableDriver类代码

package com.kgf.mapreduce.reducerJoin;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the reduce-side join job.
 */
public class TableDriver {

	/**
	 * Builds the job, runs it and exits with 0 on success or 1 on failure.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Job joinJob = Job.getInstance(new Configuration());
		joinJob.setJarByClass(TableDriver.class);

		// Map side: the class and the key/value types it emits.
		joinJob.setMapperClass(TableMapper.class);
		joinJob.setMapOutputKeyClass(Text.class);
		joinJob.setMapOutputValueClass(TableBean.class);

		// Reduce side: the class and the final key/value types.
		joinJob.setReducerClass(TableReducer.class);
		joinJob.setOutputKeyClass(TableBean.class);
		joinJob.setOutputValueClass(NullWritable.class);

		// Input and output locations come from the command line.
		Path inputDir = new Path(args[0]);
		FileInputFormat.setInputPaths(joinJob, inputDir);
		Path outputDir = new Path(args[1]);
		FileOutputFormat.setOutputPath(joinJob, outputDir);

		// Block until the job finishes, then report the outcome as the exit code.
		boolean succeeded = joinJob.waitForCompletion(true);
		int exitCode = succeeded ? 0 : 1;
		System.exit(exitCode);
	}

}

缺点
上面这种方式中，合并的操作是在 reduce 阶段完成，reduce 端的处理压力太大，map节点的运算负载则很低，资源利用率不高，且在 reduce 阶段极易产生数据倾斜。

解决方案：map 端实现数据合并之Map Join
⑴ 使用场景：
  Map Join适用于一张表十分小、一张表很大的场景。
⑵优点：
  在Map端缓存多张表，提前处理业务逻辑，这样增加Map端业务，减少Reduce端数据的压力，尽可能的减少数据倾斜。
⑶具体解决办法：
采用DistributedCache。
a:在Mapper的setup阶段，将文件读取到缓存集合中。
b:在驱动函数中加载缓存。
⑷DistributedCacheDriver类代码：

package com.kgf.mapreduce.mapperJoin;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the map-side join: registers the product table as a cache file
 * and runs a map-only job.
 */
public class DistributedCacheDriver {

	/** Cache file used when no third command-line argument is supplied. */
	private static final String DEFAULT_CACHE_URI = "file:///e:/t_product.txt";

	/**
	 * Builds the job, runs it and exits with 0 on success or 1 on failure.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output
	 *             path, and the optional {@code args[2]} is the URI of the
	 *             product file to cache (defaults to {@value #DEFAULT_CACHE_URI})
	 * @throws URISyntaxException if the cache file URI is not a valid URI
	 */
	public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
		// 1: create the job
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		// 2: locate the jar via this class
		job.setJarByClass(DistributedCacheDriver.class);
		// 3: mapper only
		job.setMapperClass(DistributedCacheMapper.class);
		// 4: final output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		// 5: input and output paths
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 6: register the cache file; an explicit third argument overrides the default
		String cacheUri = args.length > 2 ? args[2] : DEFAULT_CACHE_URI;
		job.addCacheFile(new URI(cacheUri));

		// 7: the join happens entirely in the mapper, so no reduce tasks are needed
		job.setNumReduceTasks(0);

		// 8: submit and wait
		boolean result = job.waitForCompletion(true);
		System.exit(result?0:1);
	}

}

⑸DistributedCacheMapper类：

package com.kgf.mapreduce.mapperJoin;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper for the map-side join. {@code setup} loads the cached product table
 * into memory; {@code map} then replaces the product ID in each order line
 * with the product name.
 */
public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

	/** Product ID to product name, filled from the cache file in {@code setup}. */
	Map<String, String> pdMap = new HashMap<String, String>();

	/**
	 * Loads the first registered cache file into {@link #pdMap}.
	 *
	 * <p>The whole file is read. Blank lines are skipped rather than treated as
	 * end of data, so rows that follow a blank line are still loaded.
	 *
	 * @throws IOException if no cache file is registered or the file cannot be read
	 */
	@Override
	protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context)
			throws IOException, InterruptedException {
		// 1: locate the cache file
		URI[] cacheFiles = context.getCacheFiles();
		if (cacheFiles == null || cacheFiles.length == 0) {
			throw new IOException("No cache file registered for the map-side join");
		}
		String path = cacheFiles[0].getPath().toString();
		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(path), "UTF-8"));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				if (StringUtils.isBlank(line)) {
					continue;
				}
				// product row: pId, pName
				String[] fields = line.split("\t");
				pdMap.put(fields[0],fields[1]);
			}
		} finally {
			// Close the stream even when reading or parsing fails.
			reader.close();
		}
	}

	/** Output key, reused across calls. */
	Text k = new Text();

	/**
	 * Rewrites one order line {@code orderId<TAB>pId<TAB>amount} as
	 * {@code orderId<TAB>pName<TAB>amount} and emits it. Blank lines are skipped.
	 */
	@Override
	protected void map(LongWritable key, Text value,
			Mapper<LongWritable, Text, Text, NullWritable>.Context context)
			throws IOException, InterruptedException {
		// 1: the current line
		String line = value.toString();
		// A blank line (e.g. a trailing newline) carries no order record.
		if (StringUtils.isBlank(line)) {
			return;
		}
		// 2: split
		String[] fields = line.split("\t");
		// 3: product ID
		String pid = fields[1];
		// 4: look up the product name
		// NOTE(review): an unknown pid yields null here, which the concatenation
		// below renders as the text "null" -- confirm that is the wanted output.
		String pName = pdMap.get(pid);
		// 5: substitute the name for the ID
		line = fields[0]+"\t"+pName+"\t"+fields[2];
		k.set(line);
		context.write(k, NullWritable.get());
	}

}

⑹这里我们不需要reducer类，注意：可以没有reducer,但是mapper必须有，效果如下：