MapJoin Case Study
1) Use case
Map Join is suited to scenarios where one table is very small and the other is very large.
2) Advantages
Consider: joining tables on the Reduce side easily leads to data skew. What can be done?
Cache the small table(s) on the Map side and apply the join logic there. This shifts the work to the Map side, relieves the data pressure on the Reduce side, and reduces data skew as much as possible.
3) Approach: use the DistributedCache
(1) In the Mapper's setup phase, read the cached file into an in-memory collection.
(2) In the Driver class, register the file with the distributed cache.
// Cache an ordinary file so it is shipped to the nodes where the tasks run.
job.addCacheFile(new URI("file:///e:/cache/pd.txt"));
// When running on a cluster, point to an HDFS path instead.
job.addCacheFile(new URI("hdfs://hadoop102:9820/cache/pd.txt"));
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Cache of pid -> pname, built from pd.txt in setup().
    private HashMap<String, String> pidMap = new HashMap<>();
    private Text outK = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the cache file that was registered in the Driver.
        URI[] cacheFiles = context.getCacheFiles();
        FileSystem fileSystem = FileSystem.get(context.getConfiguration());
        FSDataInputStream fis = fileSystem.open(new Path(cacheFiles[0]));

        // Wrap the byte stream in a character stream and read it line by line.
        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
        BufferedReader bufferedReader = new BufferedReader(isr);
        String line;
        String[] split;
        // Read until end of file (or a blank line), caching pid -> pname.
        while ((line = bufferedReader.readLine()) != null && !line.isEmpty()) {
            split = line.split("\t");
            pidMap.put(split[0], split[1]);
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Count processed lines with a custom counter.
        context.getCounter("line", "linenumber").increment(1);
        String line = value.toString();
        String[] split = line.split("\t");
        // Replace the pid in the order record with the product name looked up from the cache.
        outK.set(split[0] + "\t" + pidMap.get(split[1]) + "\t" + split[2]);
        context.write(outK, NullWritable.get());
    }
}
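Note that if an order line references a pid that is missing from pd.txt, pidMap.get(split[1]) returns null and the literal string "null" ends up in the output. A defensive variant of the lookup is sketched below; it is not part of the original case, and the "NULL" placeholder is only an assumption.
// Fall back to a placeholder product name when the pid is not in the cached table (placeholder is hypothetical).
String pname = pidMap.getOrDefault(split[1], "NULL");
outK.set(split[0] + "\t" + pname + "\t" + split[2]);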
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MapJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MapJoinDriver.class);
        job.setMapperClass(MapJoinMapper.class);
        // With no Reduce phase, the map output classes default to the job output classes set below.
        // job.setMapOutputKeyClass(Text.class);
        // job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0); // Map-only job: no Reduce phase.
        // Register the small table (passed as the third argument) with the distributed cache.
        job.addCacheFile(new URI(args[2]));
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "Job succeeded!" : "Job failed!");
    }
}
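For reference, a cluster submission might look like the following; the jar name and paths are only assumptions, but the three program arguments must be the input path, the output path, and the URI of the cached pd.txt, in that order:
hadoop jar mapjoin.jar com.example.MapJoinDriver /input/order /output/mapjoin hdfs://hadoop102:9820/cache/pd.txt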
ReduceJoin Case Study
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ReduceJoinDriver.class);
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("I:/input/inputtable2/"));
        FileOutputFormat.setOutputPath(job, new Path("I:/output/reduceOutput55/"));
        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "Job succeeded!" : "Job failed!");
    }
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    private String filename;
    private Text outK = new Text();
    private TableBean outV = new TableBean();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the input split so we know which file this mapper is reading.
        InputSplit inputSplit = context.getInputSplit();
        FileSplit fileSplit = (FileSplit) inputSplit;
        filename = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Emit pid as the key and a TableBean as the value.
        String line = value.toString();
        String[] split = line.split("\t");
        if (filename.contains("order")) {
            // Order record: id \t pid \t amount
            outK.set(split[1]);
            outV.setId(split[0]);
            outV.setPid(split[1]);
            outV.setPname("");
            outV.setAmount(Integer.parseInt(split[2]));
            outV.setFlag("order");
        } else {
            // Product record: pid \t pname
            outK.set(split[0]);
            outV.setId("");
            outV.setPid(split[0]);
            outV.setPname(split[1]);
            outV.setAmount(0);
            outV.setFlag("pd");
        }
        context.write(outK, outV);
    }
}
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

public class ReduceJoinReduce extends Reducer<Text, TableBean, TableBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
        ArrayList<TableBean> orderBeans = new ArrayList<>(); // holds the order records of this group
        TableBean pdBean = new TableBean(); // holds the pd record of this group
        // The values Iterable is special in Hadoop: to reduce memory overhead it reuses a single
        // object and refills it on every iteration, so to keep a record you must copy it out.
        for (TableBean value : values) {
            if ("order".equals(value.getFlag())) {
                // To collect the records into the list, a new TableBean must be created for each one.
                TableBean tmp = new TableBean();
                try {
                    BeanUtils.copyProperties(tmp, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                orderBeans.add(tmp);
            } else {
                try {
                    BeanUtils.copyProperties(pdBean, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        // Join: fill in the product name on every order record and emit it.
        for (TableBean orderBean : orderBeans) {
            orderBean.setPname(pdBean.getPname());
            context.write(orderBean, NullWritable.get());
        }
    }
}
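BeanUtils.copyProperties copies fields via reflection and forces a try/catch inside the loop; for a bean this small, a hand-written copy method on TableBean does the same job without reflection. A minimal sketch (the method name copyFrom is an assumption, not part of the original class):
// Hypothetical helper on TableBean: copy every field from another bean without reflection.
public void copyFrom(TableBean other) {
    this.setId(other.getId());
    this.setPid(other.getPid());
    this.setPname(other.getPname());
    this.setAmount(other.getAmount());
    this.setFlag(other.getFlag());
}
In the reducer, this would replace BeanUtils.copyProperties(tmp, value) with tmp.copyFrom(value).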
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TableBean implements Writable {
private String id;
private String pid;
private String pname;
private int amount;
private String flag;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override
public String toString() {
return id + "\t" + pname + "\t" + amount;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.getId());
out.writeUTF(this.getPid());
out.writeInt(this.getAmount());
out.writeUTF(this.getPname());
out.writeUTF(this.getFlag());
}
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readUTF();
this.pid = in.readUTF();
this.amount = in.readInt();
//If a field is serialized with writeInt(), it must be deserialized with readInt() as well, and the fields must be read in the same order they were written; otherwise an EOFException is thrown.
this.pname = in.readUTF();
this.flag = in.readUTF();
}
}
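To see why write() and readFields() must use matching methods in the same field order, a quick round-trip through in-memory streams can be used. This is a standalone sketch with made-up field values, not part of the case code:
import java.io.*;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        TableBean in = new TableBean();
        in.setId("1001"); in.setPid("01"); in.setPname("phone"); in.setAmount(1); in.setFlag("order");

        // Serialize with write(), then deserialize with readFields(); any mismatch in
        // method or field order shows up here as garbled fields or an EOFException.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bos));

        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(out); // expected: 1001	phone	1
    }
}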