MapJoin
适用场景:
一张大表和一张小表
小表的定义:在Hive中,是由hive.mapjoin.smalltable.filesize
参数决定的,该参数的默认值为10M。
特点:
在Map端完成Join,没有Shuffle的过程,因此效率比ReduceJoin更高,是Hive中默认的实现方式。
实现原理:
将小表数据放入内存中,遍历大表数据的时候从缓存中取出小表数据进行关联。
代码实现:
Git项目地址
主类MapJoinDriver
public class MapJoinDriver {
    /**
     * Entry point for the map-side join job: joins the large employee table
     * against the small department table, which is shipped to every mapper
     * via the distributed cache.
     *
     * @param args [0] employee table input path, [1] department table path
     *             (added to the distributed cache), [2] output path
     * @throws Exception if job configuration or submission fails
     */
    public static void main(String[] args) throws Exception {
        String inputEmpPath = args[0];
        String inputDeptPath = args[1];
        String outputPath = args[2];
        // Create the job
        Job job = Job.getInstance(getConfiguration());
        // Main class packaged into the job jar
        job.setJarByClass(MapJoinDriver.class);
        // Mapper implementation
        job.setMapperClass(MapJoinMapper.class);
        // Mapper (key, value) output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Ship the small (department) file to all tasks via the distributed cache
        job.addCacheFile(new URI(inputDeptPath));
        // Map-side join: no shuffle/reduce phase is needed
        job.setNumReduceTasks(0);
        // Recursively delete the output directory if it already exists
        Path outputDir = new Path(outputPath);
        outputDir.getFileSystem(getConfiguration()).delete(outputDir, true);
        // Only the large (employee) table flows through the mappers
        FileInputFormat.setInputPaths(job, new Path(inputEmpPath));
        FileOutputFormat.setOutputPath(job, outputDir);
        // Propagate the job result as the process exit status so callers and
        // schedulers can detect failure (the original discarded the boolean).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
MapJoinMapper
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Department table cached in memory: deptno -> deptname.
    private Map<Integer, String> deptMap = new HashMap<>();

    /**
     * Loads the department table from the distributed cache into {@code deptMap}.
     * Fails fast on read errors: the original swallowed the exception and
     * returned with an empty map, which silently dropped every joined row.
     *
     * @throws IOException if the cached file cannot be opened or read
     */
    @Override
    protected void setup(Context context) throws IOException {
        // The department file is the single file added to the cache by the driver.
        String path = context.getCacheFiles()[0].toString();
        FSDataInputStream fsDataInputStream = getDataInputStream(path);
        // Read as UTF-8 explicitly; the original used the platform default charset.
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(fsDataInputStream, "UTF-8"))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                // Expected layout: deptno \t deptname
                String[] words = line.split("\t");
                deptMap.put(Integer.valueOf(words[0]), words[1]);
            }
        }
    }

    /**
     * Opens the HDFS file at the given path.
     * Propagates failures to the caller instead of printing the stack trace
     * and returning null (the original's null return was a silent failure mode).
     *
     * @param path HDFS path of the cached department file
     * @return an open stream for the file
     * @throws IOException if the file system or file cannot be opened
     */
    private FSDataInputStream getDataInputStream(String path) throws IOException {
        try {
            FileSystem fileSystem = getFileSystem();
            return fileSystem.open(new Path(path));
        } catch (IOException e) {
            throw e;
        } catch (Exception e) {
            // getFileSystem() is declared elsewhere; wrap any other checked
            // failure so setup() still surfaces it as an IOException.
            throw new IOException("Failed to open cached file: " + path, e);
        }
    }

    /**
     * Joins one employee row against the in-memory department map.
     * Assumes the employee line has >= 8 tab-separated columns with the
     * department number in column 7 — TODO confirm against the input data.
     * Rows whose deptno has no department entry are dropped (inner join).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        Integer deptno = Integer.valueOf(fields[7]);
        String deptname = deptMap.get(deptno);
        if (deptname != null) {
            StringBuilder data = new StringBuilder();
            data.append(fields[0]).append("\t")
                .append(fields[1]).append("\t")
                .append(fields[5]).append("\t")
                .append(deptname);
            context.write(new Text(data.toString()), NullWritable.get());
        }
    }
}
ReduceJoin
适用场景:
两张大表
特点:
在Reduce端完成Join,存在Shuffle的流程。
实现原理:
在map时,对同一个key的数据来源进行标记;在reduce时,对同一个key的不同数据来源的数据进行关联输出。
代码实现:
Git项目地址
主类ReduceJoinDriver
public class ReduceJoinDriver {
    /**
     * Entry point for the reduce-side join job: both tables flow through the
     * mappers (tagged by source) and are joined per deptno in the reducer.
     *
     * @param args [0] employee table input path, [1] department table input
     *             path, [2] output path
     * @throws Exception if job configuration or submission fails
     */
    public static void main(String[] args) throws Exception {
        String inputEmpPath = args[0];
        String inputDeptPath = args[1];
        String outputPath = args[2];
        // Create the job
        Job job = Job.getInstance(getConfiguration());
        // Main class packaged into the job jar
        job.setJarByClass(ReduceJoinDriver.class);
        // Mapper implementation
        job.setMapperClass(ReduceJoinMapper.class);
        // Reducer implementation
        job.setReducerClass(ReduceJoinReduce.class);
        // Mapper (key, value) output types
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(JoinData.class);
        // Reducer (key, value) output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Recursively delete the output directory if it already exists.
        // (The original deleted the same path twice through two Path objects.)
        Path outputDir = new Path(outputPath);
        outputDir.getFileSystem(getConfiguration()).delete(outputDir, true);
        // Both tables are inputs; the single mapper tags rows by column count
        MultipleInputs.addInputPath(job, new Path(inputEmpPath), TextInputFormat.class);
        MultipleInputs.addInputPath(job, new Path(inputDeptPath), TextInputFormat.class);
        FileOutputFormat.setOutputPath(job, outputDir);
        // Propagate the job result as the process exit status so callers and
        // schedulers can detect failure (the original discarded the boolean).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
ReduceJoinMapper
public class ReduceJoinMapper extends Mapper<LongWritable, Text, LongWritable, JoinData> {

    /**
     * Tags each input line with its source table so the reducer can tell
     * them apart. Lines with more than 3 tab-separated columns are treated
     * as employee rows (keyed by the deptno in column 7, tag 1); shorter
     * lines are department rows (keyed by column 0, value column 1, tag 0).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] cols = value.toString().split("\t");
        boolean isEmployee = cols.length > 3;
        if (isEmployee) {
            // Presumably empno, ename and sal — the trailing tab is kept to
            // match the original output format exactly.
            String empData = cols[0] + "\t" + cols[1] + "\t" + cols[5] + "\t";
            context.write(new LongWritable(Long.parseLong(cols[7])), new JoinData(empData, 1));
        } else {
            // Department record: deptno -> deptname.
            context.write(new LongWritable(Long.parseLong(cols[0])), new JoinData(cols[1], 0));
        }
    }
}
ReduceJoinReduce
public class ReduceJoinReduce extends Reducer<LongWritable, JoinData, Text, NullWritable> {

    /**
     * Joins the employee rows and department rows that share a deptno key.
     * Both sides are buffered in memory, then every employee is paired with
     * every department name (cross product), emitted as "emp \t dept".
     */
    @Override
    protected void reduce(LongWritable key, Iterable<JoinData> values, Context context) throws IOException, InterruptedException {
        List<String> employees = new ArrayList<>();
        List<String> departments = new ArrayList<>();
        // Split the tagged records by source table (tag 1 = employee row).
        for (JoinData record : values) {
            if (record.getIsemp() == 1) {
                employees.add(record.getData());
            } else {
                departments.add(record.getData());
            }
        }
        // Cross product of the two sides for this deptno.
        for (String deptName : departments) {
            for (String empData : employees) {
                context.write(new Text(empData + "\t" + deptName), NullWritable.get());
            }
        }
    }
}