MR的等值连接
利用MapReduce的特性——v3是具有相同k2的所有v2的集合——筛选出两张表之间的对应关系
1.程序流程
要求:得出相应部门的所有员工
根据两个表,部门表和员工表,两个表部门号相同
2.Java编程
1).Maper
要点:取出两个表的部门号作为k2;v2 为部门名称或员工姓名,其中部门名称加 “*”前缀用作区分
package demo.mutiltable;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MulitiltableMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
	// Map side of the equi-join: emits (deptNo, value) pairs. Department
	// names are prefixed with "*" so the reducer can tell them apart from
	// employee names sharing the same department-number key.
	@Override
	protected void map(LongWritable key1, Text value1, Context context)
			throws IOException, InterruptedException {
		// Split the CSV record into fields.
		String[] fields = value1.toString().split(",");
		if (fields.length == 3) {
			// Department record (deptNo, deptName, ...): tag the name with "*".
			long deptNo = Long.parseLong(fields[0]);
			context.write(new LongWritable(deptNo), new Text("*" + fields[1]));
		} else {
			// Employee record: the 8th column (index 7) holds the department number.
			long deptNo = Long.parseLong(fields[7]);
			context.write(new LongWritable(deptNo), new Text(fields[1]));
		}
	}
}
2).Reducer
要点:将v3的部门和员工分离输出,根据部门带“*”的特征
package demo.mutiltable;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MutiltableReducer extends Reducer<LongWritable, Text, Text, Text> {
	// Reduce side of the equi-join: for one department number, separates the
	// "*"-tagged department name from the employee names and emits
	// (deptName, "emp1;emp2;...").
	@Override
	protected void reduce(LongWritable k3, Iterable<Text> v3, Context context)
			throws IOException, InterruptedException {
		String deptName = "";
		String employees = "";
		for (Text value : v3) {
			String s = value.toString();
			if (s.contains("*")) {
				// Tagged value: the department name (strip the marker character).
				deptName = s.substring(1);
			} else {
				// Untagged value: an employee name; prepend it to the list.
				employees = s + ";" + employees;
			}
		}
		context.write(new Text(deptName), new Text(employees));
	}
}
3).Main
package demo.mutiltable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MutiltableMain {
	/**
	 * Configures and submits the multi-table (equi-join) MapReduce job.
	 *
	 * @param args args[0] = input path, args[1] = output path
	 */
	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(MutiltableMain.class);

		// Mapper emits (deptNo, "*"-tagged dept name or employee name).
		job.setMapperClass(MulitiltableMapper.class);
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(Text.class);

		// Reducer joins the department name with its employee list.
		job.setReducerClass(MutiltableReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job success/failure to the shell: the original ignored the
		// boolean returned by waitForCompletion, so the driver exited 0 even
		// when the job failed.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
执行结果
部门 员工
ACCOUNTING CLARK;KING;MILLER;
RESEARCH FORD;JONES;SMITH;SCOTT;ADAMS;
SALES JAMES;WARD;MARTIN;BLAKE;ALLEN;TURNER;
MR的自连接
与多表查询类似,将一个表的相关信息对应提取出k2
1.程序要求
要求:根据一张表 员工表,指定老板对应的管理的员工
2.Java编程
要点:员工表中有员工号和对应的老板号,根据信息,一张分成两个表
1.Mapper
将员工表的每条记录在Map端输出两次,拆成两个角色:
1. (自己的员工号, “*”+姓名) —— 作为老板表;
2. (自己的老板号, 姓名) —— 作为员工表。
package demp.selfjoin;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SelfJoinMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
	/**
	 * Map side of the self-join: emits each employee record twice so the
	 * reducer can match bosses with subordinates.
	 * 1) key = own employee number, value = "*" + name  (as potential boss)
	 * 2) key = manager number (column index 3), value = name  (as subordinate)
	 */
	@Override
	protected void map(LongWritable key1, Text value1, Context context)
			throws IOException, InterruptedException {
		// Sample record: 7698,BLAKE,MANAGER,7839,1981/5/1,2850,,30
		String[] words = value1.toString().split(",");

		// 1. Boss side: tagged with "*" for the reducer to recognize.
		context.write(new LongWritable(Long.parseLong(words[0])), new Text("*" + words[1]));

		// 2. Employee side. Parse BEFORE writing and catch only the parse
		// failures: the original wrapped context.write in catch(Exception),
		// which swallowed IOException/InterruptedException from the write
		// itself and misread any write failure as "top-level boss",
		// emitting a bogus (-1, name) record.
		long bossNo;
		try {
			bossNo = Long.parseLong(words[3]);
		} catch (NumberFormatException | ArrayIndexOutOfBoundsException ex) {
			// Empty or missing manager column: this employee is the top boss.
			bossNo = -1;
		}
		context.write(new LongWritable(bossNo), new Text(words[1]));
	}
}
2.Reducer
取出带*老板的姓名,和员工拼接
需要注意:会出现没有员工的老板以及没有上级的大老板,需通过判断两个字符串是否非空进行过滤
package demp.selfjoin;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class SelfJoinReducer extends Reducer<LongWritable, Text, Text, Text> {
	// Reduce side of the self-join: for one employee number, separates the
	// "*"-tagged boss name from the subordinate names and emits
	// (bossName, "(emp1;emp2;...)").
	@Override
	protected void reduce(LongWritable k3, Iterable<Text> v3, Context context)
			throws IOException, InterruptedException {
		String boss = "";
		String subordinates = "";
		for (Text value : v3) {
			String s = value.toString();
			if (s.contains("*")) {
				// Tagged record: this key's own name (drop the marker).
				boss = s.substring(1);
			} else {
				// Untagged record: a subordinate; prepend it to the list.
				subordinates = s + ";" + subordinates;
			}
		}
		// Emit only when both sides are present: filters out keys with no
		// subordinates and the sentinel key of the top-level boss.
		if (!boss.isEmpty() && !subordinates.isEmpty()) {
			context.write(new Text(boss), new Text("(" + subordinates + ")"));
		}
	}
}
3.Main
package demp.selfjoin;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SelfJoinMain {
	/**
	 * Configures and submits the self-join MapReduce job (boss -> employees).
	 *
	 * @param args args[0] = input path, args[1] = output path
	 */
	public static void main(String[] args) throws Exception {
		// One job = mapper + reducer.
		Job job = Job.getInstance(new Configuration());
		// Entry point for the job jar.
		job.setJarByClass(SelfJoinMain.class);

		// Mapper and its output (k2, v2) types.
		job.setMapperClass(SelfJoinMapper.class);
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(Text.class);

		// Reducer and its output (k4, v4) types.
		job.setReducerClass(SelfJoinReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Input and output paths.
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job success/failure to the shell: the original ignored the
		// boolean returned by waitForCompletion, so the driver exited 0 even
		// when the job failed.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}