1. Dependencies
<properties>
    <hadoop.version>2.7.3</hadoop.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- HDFS -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
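In Hadoop 2.x, hadoop-client should already pull in hadoop-hdfs and hadoop-common transitively, so the explicit HDFS entries are belt-and-braces. To confirm the artifacts actually resolved onto the classpath before packaging the job, a one-line sanity check can be run (the class name HadoopVersionCheck is hypothetical, not part of this project):

import org.apache.hadoop.util.VersionInfo;

public class HadoopVersionCheck {
    public static void main(String[] args) {
        // Should print 2.7.3 if the dependencies above resolved correctly
        System.out.println("Hadoop version: " + VersionInfo.getVersion());
    }
}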
2. Code
package cn.dd.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class JoinMapReduce {

    // Mapper: tag each record with its source file so the reducer can tell the two inputs apart
    public static class MyJoinMapper extends Mapper<Object, Text, Text, Text> {
        private Text outKey = new Text();
        private Text outValue = new Text();

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(" ");
            if (split.length < 2) return;
            // Determine which input file this record came from
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();
            // Emit the class id as the join key, with the value tagged by source file
            if (name.startsWith("user")) {
                if (split.length < 4) return; // user records need all four fields
                outKey.set(split[3]);
                outValue.set("user#" + split[0] + "," + split[1] + "," + split[2] + "," + split[3]);
                context.write(outKey, outValue);
            } else {
                outKey.set(split[0]);
                outValue.set("class#" + split[1]);
                context.write(outKey, outValue);
            }
        }
    }
    // Reducer: for each join key, cross the user records with the class records (left outer join)
    public static class MyJoinReduce extends Reducer<Text, Text, Text, NullWritable> {
        private Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            List<String> users = new ArrayList<>();
            List<String> classes = new ArrayList<>();
            for (Text text : values) {
                String value = text.toString();
                if (value.startsWith("user")) {
                    users.add(value.split("#")[1]);
                } else {
                    classes.add(value.split("#")[1]);
                }
            }
            if (classes.isEmpty()) {
                // No matching class record: pad the right side with null (left join semantics)
                for (String user : users) {
                    outValue.set(user + "," + null);
                    context.write(outValue, NullWritable.get());
                }
            } else {
                for (String user : users) {
                    for (String cls : classes) {
                        outValue.set(user + "," + cls);
                        context.write(outValue, NullWritable.get());
                    }
                }
            }
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Load the cluster configuration
        Configuration conf = new Configuration();
        // Create the job instance for this run
        Job job = Job.getInstance(conf, "leftjoin");
        // Main class for this job
        job.setJarByClass(JoinMapReduce.class);
        // Mapper class
        job.setMapperClass(MyJoinMapper.class);
        // Reducer class
        job.setReducerClass(MyJoinReduce.class);
        // The map output types differ from the final output types, so set them explicitly
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Final output key/value types of the job
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Input data path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Output path; it must not already exist
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and block until the job finishes; the submitting client exits only after completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
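The reduce side implements a left outer join keyed on class id: every user record is paired with every class record for that key, and a user whose class has no record is padded with null. As a minimal, hedged sketch of that branching (the class name JoinLogicDemo and the hard-coded sample value are hypothetical, not part of the job), the logic can be replayed on plain collections without a cluster:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class JoinLogicDemo {
    public static void main(String[] args) {
        // Values as the reducer would see them for key "c3":
        // one tagged user record and no class record, so null padding is expected
        List<String> values = Arrays.asList("user#chenhui,男,22,c3");
        List<String> users = new ArrayList<>();
        List<String> classes = new ArrayList<>();
        for (String value : values) {
            if (value.startsWith("user")) {
                users.add(value.split("#")[1]);
            } else {
                classes.add(value.split("#")[1]);
            }
        }
        if (classes.isEmpty()) {
            // Left join with no match: pad the right side with null
            for (String user : users) {
                System.out.println(user + "," + null); // prints: chenhui,男,22,c3,null
            }
        } else {
            for (String user : users) {
                for (String cls : classes) {
                    System.out.println(user + "," + cls);
                }
            }
        }
    }
}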
3. Running
# Package the jar, ship it to the Hadoop environment, and run:
yarn jar MapReduce-1.0-SNAPSHOT.jar cn.dd.mapreduce.JoinMapReduce /user/******/input /user/******/output
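Once the job completes, the reducer writes its result to part files under the output directory; assuming the default single reducer, it can be inspected with (path placeholders follow the command above):

hdfs dfs -cat /user/******/output/part-r-00000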
4. Input and Output
- Input 1: user.txt
xiaoming 男 23 c1
xiaohong 女 23 c1
小刚 男 24 c2
xiaoliang 男 23 c2
wangdong 男 30 c2
chenhui 男 22 c3
yanwei 女 35 c2
- Input 2: class.txt
c1 项目部
c2 产品部
- Output (chenhui's class c3 has no entry in class.txt, so the left join pads that row with null):
xiaohong,女,23,c1,项目部
xiaoming,男,23,c1,项目部
yanwei,女,35,c2,产品部
wangdong,男,30,c2,产品部
xiaoliang,男,23,c2,产品部
小刚,男,24,c2,产品部
chenhui,男,22,c3,null