笛卡尔积:
使用MR,求出员工姓名以及员工所属部门。
分析等值连接的处理过程:
EqualJoinMapper.java
package com.equaljoin;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map side of a reduce-side equi-join between department and employee records.
 *
 * <p>Each input line is split on commas. A line with exactly three fields is treated as a
 * department record and emitted as (field 0 parsed as int, "*" + field 1). A line with at
 * least eight fields is treated as an employee record and emitted as (field 7 parsed as int,
 * field 1). The "*" prefix is what {@code EqualJoinReducer} looks for to recognise the
 * department name among the values grouped under one key.
 *
 * <p>Lines that fit neither shape (for example blank lines) or whose key field is not an
 * integer are skipped and counted in the {@code EqualJoin/MALFORMED_RECORDS} job counter
 * instead of failing the task.
 */
public class EqualJoinMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

    /** Number of comma-separated fields that identifies a department record. */
    private static final int DEPT_FIELD_COUNT = 3;

    /** Index of the department-number field inside an employee record. */
    private static final int EMP_DEPTNO_INDEX = 7;

    /** Prefix that tags a value as a department name. */
    private static final String DEPT_MARKER = "*";

    private static final String COUNTER_GROUP = "EqualJoin";
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

    /**
     * Emits one (department number, tagged value) pair per well-formed input line.
     *
     * @param key1    input key supplied by the input format (not used here)
     * @param value1  one line of input: either a department record or an employee record
     * @param context Hadoop context used to emit output pairs and update counters
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // The line may describe a department or an employee.
        String data = value1.toString();
        String[] words = data.split(",");

        String joinKeyField;
        String payload;
        if (words.length == DEPT_FIELD_COUNT) {
            // Department record: department number, department name.
            joinKeyField = words[0];
            payload = DEPT_MARKER + words[1];
        } else if (words.length > EMP_DEPTNO_INDEX) {
            // Employee record: employee's department number, employee name.
            joinKeyField = words[EMP_DEPTNO_INDEX];
            payload = words[1];
        } else {
            // Blank or truncated line: indexing words[7] would throw, so skip it.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        int deptNo;
        try {
            deptNo = Integer.parseInt(joinKeyField.trim());
        } catch (NumberFormatException e) {
            // Non-numeric join key (e.g. a header row): skip rather than fail the task.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        context.write(new IntWritable(deptNo), new Text(payload));
    }
}
EqualJoinReducer.java
package com.equaljoin;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce side of the equi-join: for one department number, pairs the department name with
 * the names of the employees that share that key.
 *
 * <p>Values that start with "*" carry the department name (the marker is stripped); every
 * other value is an employee name. The output key is the department name and the output
 * value is the employee names, each followed by ";". Each newly seen name is placed in
 * front of the ones already collected.
 *
 * <p>If no department value is seen for a key, the output key is the empty string; if no
 * employee value is seen, the output value is the empty string.
 */
public class EqualJoinReducer extends Reducer<IntWritable, Text, Text, Text> {

    /** Prefix that tags a value as a department name. */
    private static final String DEPT_MARKER = "*";

    /**
     * Joins the department name and employee names grouped under one department number.
     *
     * @param key3    department number shared by all values in this group
     * @param values3 mix of one tagged department name and untagged employee names
     * @param context Hadoop context used to emit the joined pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(IntWritable key3, Iterable<Text> values3, Context context)
            throws IOException, InterruptedException {
        String dname = "";
        StringBuilder empNameList = new StringBuilder();

        for (Text value : values3) {
            String str = value.toString();
            // Only a leading marker identifies a department; a "*" elsewhere in an
            // employee name must not be mistaken for one.
            if (str.startsWith(DEPT_MARKER)) {
                dname = str.substring(DEPT_MARKER.length());
            } else {
                // Prepend "name;" so the result equals str + ";" + previous list.
                empNameList.insert(0, ';').insert(0, str);
            }
        }

        context.write(new Text(dname), new Text(empNameList.toString()));
    }
}
EqualJoinMain.java
package com.equaljoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits the equi-join MapReduce job.
 *
 * <p>Usage: {@code EqualJoinMain <input path> <output path>}.
 */
public class EqualJoinMain {

    /**
     * Builds the job from {@code EqualJoinMapper} and {@code EqualJoinReducer}, runs it, and
     * exits with status 0 if the job succeeded, 1 if it failed, or 2 if the arguments are
     * missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be created, submitted, or monitored
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: EqualJoinMain <input path> <output path>");
            System.exit(2);
        }

        // 1. Create the job.
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(EqualJoinMain.class); // entry point of the job

        // 2. Set the mapper and the types of the map output.
        job.setMapperClass(EqualJoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class); // type of k2
        job.setMapOutputValueClass(Text.class);      // type of v2

        // 3. Set the reducer and the types of the reduce output.
        job.setReducerClass(EqualJoinReducer.class);
        job.setOutputKeyClass(Text.class);   // type of k4
        job.setOutputValueClass(Text.class); // type of v4

        // 4. Set the input and output paths of the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 5. Run the job and report its outcome through the process exit code, so that
        //    calling scripts and schedulers can detect a failed job.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
运行结果: