MapReduce学习之MapJoin案例实现
1.当前main方法所在的入口类
package com.shujia.mr.mapJoin;
import com.shujia.mr.reduceJoin.ReduceJoin;
import com.shujia.mr.reduceJoin.ReduceJoinMapper;
import com.shujia.mr.reduceJoin.ReduceJoinReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.FileNotFoundException;
import java.io.IOException;
public class MapJoin {
    /**
     * Driver for the map-side join job: joins students.txt against the
     * pre-aggregated score file, which is shipped to every map task via
     * the distributed cache (see {@code addCacheFile} below).
     *
     * @throws IOException            on HDFS access failure
     * @throws InterruptedException   if the job wait is interrupted
     * @throws ClassNotFoundException if a job class cannot be loaded
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "MapJoin");
        job.setJarByClass(MapJoin.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Map-side join needs no shuffle/reduce phase. Without this, the
        // default identity reducer runs and forces a full, useless shuffle.
        job.setNumReduceTasks(0);

        FileSystem fileSystem = FileSystem.get(job.getConfiguration());
        Path outPath = new Path("hadoop/out/mapJoin");
        Path studentInpath = new Path("hadoop/data/students.txt");
        // The small (score) table is distributed to each mapper as a cache file.
        job.addCacheFile(new Path("hadoop/out/count/part-r-00000").toUri());
        if (!fileSystem.exists(studentInpath)) {
            throw new FileNotFoundException(studentInpath + "不存在");
        }
        TextInputFormat.addInputPath(job, studentInpath);
        if (fileSystem.exists(outPath)) {
            System.out.println("路径存在,开始删除");
            fileSystem.delete(outPath, true);
        }
        TextOutputFormat.setOutputPath(job, outPath);
        // Propagate the job outcome: the original discarded the boolean,
        // so failed jobs still exited with status 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
2.Map端
package com.shujia.mr.mapJoin;
import jdk.nashorn.internal.runtime.regexp.joni.Config;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** Lookup table built in {@link #setup}: student id -> score read from the cache file. */
    HashMap<String, Integer> scoreHashMap;

    public MapJoinMapper() {
        this.scoreHashMap = new HashMap<>();
    }

    /**
     * Loads every distributed-cache file into {@link #scoreHashMap} exactly once.
     * Each cached line is split on a tab; assumes the format "id\tscore" — TODO confirm
     * against the producing job's output.
     *
     * Fixes vs. the original: the cache files were read twice into the same map
     * (a verbatim duplicated loop — pure wasted work), and neither the
     * FSDataInputStream nor the BufferedReader was ever closed (resource leak).
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        Configuration configuration = context.getConfiguration();
        FileSystem fileSystem = FileSystem.get(configuration);
        URI[] files = context.getCacheFiles();
        for (URI filePath : files) {
            // try-with-resources guarantees both streams are closed even if a
            // line fails to parse.
            try (FSDataInputStream open = fileSystem.open(new Path(filePath));
                 BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open))) {
                String oneScore;
                while ((oneScore = bufferedReader.readLine()) != null) {
                    String[] column = oneScore.split("\t");
                    scoreHashMap.put(column[0], Integer.valueOf(column[1]));
                }
            }
        }
        System.out.println("Score数据加载完成,已存储到HashMap中");
    }

    /**
     * Map-side join: for each 5-column (comma-separated) student row, appends
     * the cached score for that student id and emits the joined line as the key.
     * Rows that do not have exactly 5 columns are silently skipped.
     *
     * NOTE(review): an id absent from the cache yields a null Integer, so the
     * emitted line ends in the literal ",null" — preserved from the original;
     * confirm whether unmatched students should be dropped instead.
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        String oneStuInfo = value.toString();
        String[] columns = oneStuInfo.split(",");
        if (columns.length == 5) {
            String id = columns[0];
            Integer score = scoreHashMap.get(id);
            oneStuInfo += ("," + score);
            context.write(new Text(oneStuInfo), NullWritable.get());
        }
    }
}