适用于关联表中有小表的情形;
可以将小表分发到所有的map节点,这样,map节点就可以在本地对自己所读到的大表数据进行合并并输出最终结果,可以大大提高合并操作的并发度,加快处理速度。
核心思想
驱动文件;
-
加载缓存数据
// 加载缓存数据 job.addCacheFile(new URI("file:/f:/catch/pd.txt"));
-
map端join的逻辑不需要reduce阶段,设置reducetask数量为0
job.setNumReduceTasks(0);
mapper中
setup()方法中
- 获取缓存的文件
- 循环读取缓存文件中的一行
- 切割
- 缓存数据到集合
- 关流
map方法中
- 获取一行
- 截取
- 获取订单id
- 获取商品名称
- 拼接
- 写出
Mapper类代码
package com.zyd.tablePlus;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
public class DistributedMapper extends Mapper<LongWritable,Text,Text,NullWritable> {

    // Small-table cache built in setup(): product id -> product name (from pd.txt)
    private Map<String,String> pdMap = new HashMap<>();

    // Reused output key to avoid allocating a new Text per record
    private Text k = new Text();

    /**
     * Runs once per map task before any map() call: loads pd.txt into
     * {@link #pdMap} so each order record can be joined locally.
     *
     * @param context task context (unused here)
     * @throws IOException if pd.txt cannot be read
     */
    @Override
    protected void setup(Context context) throws IOException{
        // try-with-resources guarantees the reader is closed even when reading
        // throws (the original leaked the stream on exception). UTF-8 is given
        // explicitly instead of relying on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("f:/catch/pd.txt"),
                        StandardCharsets.UTF_8))) {
            String line;
            // Read to end-of-file. The original loop condition
            // StringUtils.isNotEmpty(...) stopped at the first blank line,
            // silently truncating the cache if pd.txt contained one.
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue; // skip blank lines instead of aborting the load
                }
                // Expected line shape: "<pid>\t<pname>", e.g. "01\t小米"
                String[] fields = line.split("\t");
                if (fields.length >= 2) {
                    pdMap.put(fields[0], fields[1]);
                }
            }
        }
    }

    /**
     * Joins one order.txt record with the cached product table and writes
     * "<order line>\t<product name>" as the key (value is NullWritable).
     *
     * @param key     byte offset of the line (unused)
     * @param value   one order.txt line, e.g. "1001\t01\t4"
     * @param context used to emit the joined record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Take the raw line: "<orderId>\t<pid>\t<amount>"
        String line = value.toString();
        // 2. Split on tabs to reach the product id column
        String[] fields = line.split("\t");
        if (fields.length < 2) {
            return; // malformed record: no product id to join on
        }
        // 3. Look up the product name; emit "" for an unknown pid
        //    (the original appended the literal string "null" in that case)
        String pdName = pdMap.getOrDefault(fields[1], "");
        // 4. Append the product name to the original line
        k.set(line + "\t" + pdName);
        // 5. Emit the joined record
        context.write(k, NullWritable.get());
    }
}
Driver类代码(驱动程序;map端join没有reduce阶段,因此没有Reducer)
package com.zyd.tablePlus;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class DistributedDriver {

    /**
     * Configures and submits the map-side-join job.
     *
     * @param args args[0] = input directory (order.txt), args[1] = output directory
     */
    public static void main (String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        // Create the job from a fresh Hadoop configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Tell Hadoop which jar carries this driver class
        job.setJarByClass(DistributedDriver.class);

        // The mapper performs the whole join, so no reduce phase is needed
        job.setMapperClass(DistributedMapper.class);
        job.setNumReduceTasks(0);

        // Declare the types of the final output records
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Wire up input and output locations from the command line
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Ship the small table (pd.txt) to every map task via the distributed cache
        job.addCacheFile(new URI("file:/f:/catch/pd.txt"));

        // Submit, wait, and report success/failure through the exit code
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}