多数据源在reduce侧做join操作,效率不会太高。我们首先会让所有的数据在网络上重排(shuffle),然后在reduce侧的联结(combine)过程中丢弃了大部分的数据。如果我们在mapper侧就去除不必要的数据,联结会更有效率。
map阶段执行联结主要障碍是一个mapper正在处理的记录要能访问到另外表的所有数据,这样就能保证map侧联结可以正常工作。
引入hadoop的DistributedCache。仔细观察发现,大部分两表做join操作时,都会是一张大表,一张小表。可以将小表的数据复制到每个执行map的节点上,这样就能访问到小表所有的数据。
缺点:如果小表的数据可观,会出现OOM现象。
具体实例如下:
自定义InputFormat,当然也可以不用定义,直接用KeyValueInputFormat。
DisCahceInputFormat.java
package com.hadoop.data.join.disCache;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

/**
 * InputFormat that reads each input line as a "key,value" record, splitting on
 * the FIRST comma only, and emits it as a Text/Text pair for the map-side join.
 *
 * NOTE(review): the class name keeps the original spelling ("Cahce") so that
 * existing callers (DataDriver) still compile; renaming would break them.
 */
public class DisCahceInputFormat extends FileInputFormat<Text, Text> {

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Each file is consumed whole by a single mapper; never split.
        return false;
    }

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit inputsplit,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new objPosRecordReader();
    }

    /**
     * Line-oriented reader: key = text before the first comma,
     * value = everything after it.
     */
    public static class objPosRecordReader extends RecordReader<Text, Text> {

        private LineReader in;
        private Text lineKey;
        private Text lineValue;
        private Text line;

        @Override
        public void close() throws IOException {
            // BUGFIX: guard against close() before initialize() (original NPE'd).
            if (in != null) {
                in.close();
            }
        }

        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return lineKey;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return lineValue;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            // Progress reporting is not implemented; safe because the split
            // is never subdivided (isSplitable == false).
            return 0;
        }

        /**
         * Opens the split's file and prepares the reusable key/value holders.
         */
        @Override
        public void initialize(InputSplit input, TaskAttemptContext context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) input;
            Configuration job = context.getConfiguration();
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream filein = fs.open(file);
            in = new LineReader(filein, job);
            line = new Text();
            lineKey = new Text();
            lineValue = new Text();
        }

        /**
         * Reads the next line; returns false at end of input.
         * Lines without a comma yield an empty value instead of crashing.
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            int linesize = in.readLine(line);
            if (linesize == 0) {
                return false;
            }
            String[] pieces = line.toString().split(",", 2);
            lineKey.set(pieces[0]);
            // BUGFIX: original indexed pieces[1] unconditionally and threw
            // ArrayIndexOutOfBoundsException on any line with no comma.
            lineValue.set(pieces.length > 1 ? pieces[1] : "");
            return true;
        }
    }
}
Mapper类
JoinMapper.java
package com.hadoop.data.join.disCache;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map-side join: loads the small table from the distributed cache into memory
 * during setup(), then joins each big-table record against it in map().
 * Records whose key is absent from the small table are dropped (inner join).
 */
public class JoinMapper extends Mapper<Text, Text, Text, Text> {

    // In-memory copy of the small table: join key -> remaining columns.
    // HashMap suffices: setup() and map() run on the same task thread.
    private final Map<String, String> joinData = new HashMap<String, String>();

    // Reused output holder; avoids allocating a Text per matched record.
    private final Text outValue = new Text();

    /**
     * Reads the cached small-table file (lines of "key,rest", split on the
     * first comma) into {@link #joinData}.
     *
     * @throws IOException if the cache file is missing or unreadable.
     *         BUGFIX: the original swallowed this error with System.err, so the
     *         job ran with an empty map and silently produced no output.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
        if (cacheFiles == null || cacheFiles.length == 0) {
            throw new IOException("Distributed cache file (small table) not found");
        }
        // BUGFIX: read with an explicit charset instead of FileReader's
        // platform default, so non-ASCII keys join consistently across nodes.
        BufferedReader joinReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(cacheFiles[0].toString()), StandardCharsets.UTF_8));
        try {
            String line;
            while ((line = joinReader.readLine()) != null) {
                String[] tokens = line.split(",", 2);
                // BUGFIX: skip malformed lines instead of crashing on tokens[1].
                if (tokens.length == 2) {
                    joinData.put(tokens[0], tokens[1]);
                }
            }
        } finally {
            joinReader.close();
        }
    }

    /**
     * Emits "bigValue,smallValue" under the shared key when the key exists in
     * the small table; otherwise the record is discarded.
     */
    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String joinValue = joinData.get(key.toString());
        if (StringUtils.isNotEmpty(joinValue)) {
            outValue.set(value.toString() + "," + joinValue);
            context.write(key, outValue);
        }
    }
}
驱动类
DataDriver.java
package com.hadoop.data.join.disCache;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the map-side join: registers the small table as a distributed
 * cache file, then runs a map-only job (zero reducers) over the big table.
 *
 * Args: &lt;cache path&gt; &lt;input path&gt; &lt;output path&gt;
 */
public class DataDriver {

    public void run(String[] args) throws Exception {
        if (args.length != 3) {
            // BUGFIX: usage now lists all three required arguments
            // (the original message omitted the cache path).
            System.err.println("Usage: DataDisCache <cache path> <input path> <output path>");
            System.exit(-1);
        }
        String pathCache = args[0];
        String pathIn = args[1];
        String pathOut = args[2];

        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);
        Path outPath = new Path(pathOut);
        if (hdfs.exists(outPath)) {
            // Remove a stale output directory so the job can run repeatedly.
            hdfs.delete(outPath, true);
        }

        // Ship the small table to every map node; "#Customers.txt" names the
        // local symlink. BUGFIX: the original built this URI twice and left
        // the first one unused.
        DistributedCache.addCacheFile(URI.create(pathCache + "#Customers.txt"), conf);

        Job job = new Job(conf, "Data join base on distributedCache");
        job.setJarByClass(DataDriver.class);
        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setInputFormatClass(DisCahceInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Map-only: the join happens entirely in the mappers.
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(pathIn));
        FileOutputFormat.setOutputPath(job, outPath);

        // BUGFIX: removed the println after System.exit — it was unreachable.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        new DataDriver().run(args);
    }
}
缺点:如果小表的数据可观,会出现OOM现象。