1. Task Description
There are two files on HDFS. Student information file: hdfs://***.***.***:8020/user/train/joinjob/student.txt
Comma-separated; the first column is the student ID (unique per student), the second column is the student name.
- 2016001,Join
- 2016002,Abigail
- 2016003,Abby
- 2016004,Alexandra
- 2016005,Cathy
- 2016006,Katherine
Student score file: hdfs://***.***.***:8020/user/train/joinjob/student_score.txt
Comma-separated; the first column is the student ID (one student may have several score records), the second column is the course code, the third column is the score.
- 2016001,YY,60
- 2016001,SX,88
- 2016001,YW,91
- 2016002,SX,77
- 2016002,YW,33
- .............
The expected output file looks like this:
- 2016001,Join,YY,60
- 2016001,Join,SX,88
- 2016001,Join,YW,91
- 2016002,Abigail,SX,77
- 2016002,Abigail,YW,33
- 2016002,Abigail,YY,56
- 2016003,Abby,YY,34
- 2016003,Abby,SX,84
- 2016003,Abby,YW,69
- 2016004,Alexandra,YY,89
- 2016004,Alexandra,SX,84
- .......
2. Task Analysis
- This is a task of joining two datasets. A join can be done on the map side or on the reduce side. When one dataset is small enough to fit entirely in memory, a map-side join is usually chosen; when both datasets are large, a reduce-side join is used, and a Bloom filter can be applied there to improve efficiency. In this task we implement a map-side join.
- The joined records are written out directly, so no reduce phase is needed.
- Techniques used in this task: the distributed cache (DistributedCache) and reading local files inside a MapReduce task.
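The core of a map-side join, independent of Hadoop, is a plain hash join: build a hash table from the small dataset, then probe it for each record of the large one. A minimal sketch in plain Java (class and method names are illustrative; the sample rows are taken from the task description):

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HashJoinSketch {
    public static List<String> join(List<String> students, List<String> scores) {
        // Build phase: load the small dataset (id -> name) into memory.
        Map<String, String> idToName = new HashMap<>();
        for (String line : students) {
            String[] parts = line.split(",");
            idToName.put(parts[0], parts[1]);
        }
        // Probe phase: look up each score record's student id.
        List<String> joined = new ArrayList<>();
        for (String line : scores) {
            String[] parts = line.split(",");
            String name = idToName.get(parts[0]);
            joined.add(parts[0] + "," + name + "," + parts[1] + "," + parts[2]);
        }
        return joined;
    }

    public static void main(String[] args) {
        List<String> students = List.of("2016001,Join", "2016002,Abigail");
        List<String> scores = List.of("2016001,YY,60", "2016002,SX,77");
        for (String row : join(students, scores)) {
            System.out.println(row);
        }
        // Prints:
        // 2016001,Join,YY,60
        // 2016002,Abigail,SX,77
    }
}
```

In the Hadoop job below, the build phase happens once per task in `setup()` and the probe phase in `map()`.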
3. Implementation Approach
- Distribute the student information file to every task node via the distributed cache.
- Use the student score file as the map input.
- In the mapper's setup method, read the locally cached student information into a HashMap, keyed by student ID with the name as the value.
- In the map method, for each score record, look up the student's name in that HashMap by student ID.
- Wrap the result in a Text object and emit it.
4. Implementation Code
package join;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MapJoinDirectOutPutJob extends Configured implements Tool {

    private static final String INPATH_SCORE = "joinjob/student_score.txt";
    private static final String OUTPATH = "joinjob/output";

    public static class MapJoinDirectOutPutMapper
            extends Mapper<LongWritable, Text, NullWritable, Text> {

        // id -> name, loaded once per task from the distributed-cache copy of student.txt
        private final Map<String, String> idToName = new HashMap<String, String>();
        private final Text newValue = new Text();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            FileSystem fs = FileSystem.getLocal(conf);
            // "studentLink.txt" is the symlink created from the cache URI fragment in run().
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path("studentLink.txt"))));
            try {
                String current;
                while ((current = br.readLine()) != null) {
                    String[] words = current.split(",");
                    idToName.put(words[0], words[1]);
                }
            } finally {
                br.close();
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Score record "id,course,score": join the student's name in by id.
            String[] words = StringUtils.split(value.toString(), ',');
            String name = idToName.get(words[0]);
            newValue.set(words[0] + "," + name + "," + words[1] + "," + words[2]);
            context.write(NullWritable.get(), newValue);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "MapJoinDirectOutPutJob");
        job.setJarByClass(getClass());
        Configuration conf = job.getConfiguration();

        Path in = new Path(INPATH_SCORE);
        Path out = new Path(OUTPATH);
        // Remove output from a previous run so the job does not fail on an existing directory.
        FileSystem.get(conf).delete(out, true);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(MapJoinDirectOutPutMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Map-only job: the mapper's output is the final result.
        job.setNumReduceTasks(0);

        // The fragment after '#' names the local symlink that setup() reads.
        URI uri = new URI("hdfs://***.***.***.***:8020/user/train/joinjob/student.txt#studentLink.txt");
        job.addCacheFile(uri);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        int r = 0;
        try {
            r = ToolRunner.run(new Configuration(), new MapJoinDirectOutPutJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(r);
    }
}
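One detail worth highlighting: the `#studentLink.txt` fragment on the cache URI asks Hadoop to expose the cached HDFS file under that symlink name in each task's working directory, which is why the mapper can open `studentLink.txt` as a local path. The fragment is just the standard URI fragment component, as a small sketch shows (the host name below is a placeholder):

```java
import java.net.URI;

public class CacheUriFragment {
    public static void main(String[] args) throws Exception {
        // The part after '#' names the local symlink the task will see.
        URI uri = new URI("hdfs://namenode:8020/user/train/joinjob/student.txt#studentLink.txt");
        System.out.println(uri.getPath());     // /user/train/joinjob/student.txt
        System.out.println(uri.getFragment()); // studentLink.txt
    }
}
```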
From the "ITPUB Blog"; link: http://blog.itpub.net/30066956/viewspace-2120063/. Please credit the source when reprinting; otherwise legal liability may be pursued.