Map Join
A map-side join merges the datasets before the records reach the map function. It is far more efficient than a reduce-side join, because a reduce-side join pushes all of the data through the shuffle, which consumes a lot of resources.
order.txt
order011 u001
order012 u001
order033 u005
order034 u002
order055 u003
order066 u004
order077 u010
user.txt
u001,hangge,18,male,angelababy
u002,huihui,58,female,ruhua
u003,guanyu,16,male,chunge
u004,laoduan,38,male,angelababy
u005,nana,24,femal,huangbo
u006,xingge,18,male,laoduan
Final result (order077's uid u010 has no matching record in user.txt, hence the null):
u001,hangge,18,male,angelababy,order012
u001,hangge,18,male,angelababy,order011
u002,huihui,58,female,ruhua,order034
u003,guanyu,16,male,chunge,order055
u004,laoduan,38,male,angelababy,order066
u005,nana,24,femal,huangbo,order033
null,order077
Principle: upload the small file to the distributed cache so that every map task can access the complete contents of the small file, then join it with the splits of the large file to produce the final result.
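In code, the technique comes down to two lines of the full program shown below: the driver registers the small file with the distributed cache, and the mapper's setup() opens the localized copy, which the framework symlinks into the task's working directory under its file name.
// In the driver: register user.txt with the distributed cache
job.addCacheFile(new URI("hdfs://linux01:8020/user.txt"));
// In the mapper's setup(): read the localized copy by its file name
BufferedReader br = new BufferedReader(new FileReader("user.txt"));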
package hadoop06.com.doit.demo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

public class MapJoinDemo {

    public static class JoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        // Map that stores the contents of user.txt: key is the uid, value is the whole line
        private Map<String, String> userMap = new HashMap<>();
        private Text k2 = new Text();

        @Override
        protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
            // Read the local copy of user.txt. Because user.txt was added to the distributed cache,
            // it is localized to every machine that runs a map task and symlinked into the task's
            // working directory (next to the job resources), so it can be opened directly by name.
            BufferedReader br = new BufferedReader(new FileReader("user.txt"));
            String line = null;
            while ((line = br.readLine()) != null) {
                String uid = line.split(",")[0];
                // Store the uid and the full user record in the map
                userMap.put(uid, line);
            }
            br.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
            // One record from order.txt
            String line = value.toString();
            // Extract the uid of the order, e.g. u001
            String uid = line.split("\\s+")[1];
            // Look up the user record for this uid in the map
            String userInfo = userMap.get(uid);
            // Concatenate the user record and the order id, then write it out
            k2.set(userInfo + "," + line.split("\\s+")[0]);
            context.write(k2, NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration con = new Configuration();
        // Run the job on YARN
        con.set("mapreduce.framework.name", "yarn");
        // Read and write data on HDFS
        con.set("fs.defaultFS", "hdfs://linux01:8020");
        // Location of the ResourceManager
        con.set("yarn.resourcemanager.hostname", "linux01");
        // Cross-platform flag for submitting the MR job from Windows
        con.set("mapreduce.app-submission.cross-platform", "true");

        Job job = Job.getInstance(con, "mapjoin");
        // Put user.txt into the distributed cache
        job.addCacheFile(new URI("hdfs://linux01:8020/user.txt"));
        // Path of the job jar
        job.setJar("D:\\IdeaProjects\\test_hadoop\\target\\test_hadoop-1.0-SNAPSHOT.jar");
        // Set the Mapper
        job.setMapperClass(JoinMapper.class);
        // Output types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Input path on HDFS; may be a single file or a directory
        FileInputFormat.setInputPaths(job, new Path("/join/order.txt"));
        // Output directory; the job fails if it already exists
        FileOutputFormat.setOutputPath(job, new Path("/join/out"));
        // Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
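Two details of the mapper are worth hardening. First, setup() hard-codes the cache file name "user.txt"; the name can instead be resolved from context.getCacheFiles(), so the mapper keeps working if the cached path ever changes. Second, map() emits the literal string null when an order's uid has no matching user, which is where the null,order077 line in the result comes from; a guard can skip such records instead. Below is a minimal sketch of both ideas, written as drop-in replacements for the two methods of JoinMapper (it reuses the userMap and k2 fields and the imports already shown); note that skipping unmatched orders would remove the null,order077 line from the output.

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // Resolve the localized name of the first cached file instead of hard-coding "user.txt"
    URI[] cacheFiles = context.getCacheFiles();
    String localName = new Path(cacheFiles[0].getPath()).getName();
    try (BufferedReader br = new BufferedReader(new FileReader(localName))) {
        String line;
        while ((line = br.readLine()) != null) {
            userMap.put(line.split(",")[0], line);
        }
    }
}

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] fields = value.toString().split("\\s+"); // fields[0] = order id, fields[1] = uid
    String userInfo = userMap.get(fields[1]);
    if (userInfo == null) {
        return; // no matching user for this uid: skip the order instead of emitting "null,..."
    }
    k2.set(userInfo + "," + fields[0]);
    context.write(k2, NullWritable.get());
}

Since the join happens entirely on the map side, the driver can also call job.setNumReduceTasks(0) so that no shuffle or reduce phase runs at all and the map tasks write the output directly.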