多数据源在reduce侧做join操作,效率不会太高。我们首先会让所有的数据在网络上重排(shuffle),然后在reduce侧的联结(combine)过程中丢弃了大部分的数据。如果我们在mapper侧就去除不必要的数据,联结会更有效率。
map阶段执行联结主要障碍是一个mapper正在处理的记录要能访问到另外表的所有数据,这样就能保证map侧联结可以正常工作。
引入hadoop的DistributedCache。仔细观察发现,大部分两表做join操作时,都会是一张大表,一张小表。可以将小表的数据复制到每个执行map的节点上,这样就能访问到小表所有的数据。
缺点:如果小表的数据可观,会出现OOM现象。
具体实例如下:
自定义InputFormat,当然也可以不用定义,直接用KeyValueInputFormat。
DisCahceInputFormat.java
package com.hadoop.data.join.disCache;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

/**
 * InputFormat that reads each input line as a "key,value" record, splitting on
 * the FIRST comma only, and emits it as a Text/Text pair for the map-side join.
 *
 * NOTE(review): the class name keeps the original spelling ("Cahce") so that
 * existing callers (DataDriver) still compile; renaming would break them.
 */
public class DisCahceInputFormat extends FileInputFormat<Text, Text> {

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Each file is consumed whole by a single mapper; never split.
        return false;
    }

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit inputsplit,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new objPosRecordReader();
    }

    /**
     * Line-oriented reader: key = text before the first comma,
     * value = everything after it.
     */
    public static class objPosRecordReader extends RecordReader<Text, Text> {

        private LineReader in;
        private Text lineKey;
        private Text lineValue;
        private Text line;

        @Override
        public void close() throws IOException {
            // BUGFIX: guard against close() before initialize() (original NPE'd).
            if (in != null) {
                in.close();
            }
        }

        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return lineKey;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return lineValue;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            // Progress reporting is not implemented; safe because the split
            // is never subdivided (isSplitable == false).
            return 0;
        }

        /**
         * Opens the split's file and prepares the reusable key/value holders.
         */
        @Override
        public void initialize(InputSplit input, TaskAttemptContext context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) input;
            Configuration job = context.getConfiguration();
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream filein = fs.open(file);
            in = new LineReader(filein, job);
            line = new Text();
            lineKey = new Text();
            lineValue = new Text();
        }

        /**
         * Reads the next line; returns false at end of input.
         * Lines without a comma yield an empty value instead of crashing.
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            int linesize = in.readLine(line);
            if (linesize == 0) {
                return false;
            }
            String[] pieces = line.toString().split(",", 2);
            lineKey.set(pieces[0]);
            // BUGFIX: original indexed pieces[1] unconditionally and threw
            // ArrayIndexOutOfBoundsException on any line with no comma.
            lineValue.set(pieces.length > 1 ? pieces[1] : "");
            return true;
        }
    }
}
Mapper类
JoinMapper.java
package com.hadoop.data.join.disCache;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map-side join: loads the small table from the distributed cache into memory
 * during setup(), then joins each big-table record against it in map().
 * Records whose key is absent from the small table are dropped (inner join).
 */
public class JoinMapper extends Mapper<Text, Text, Text, Text> {

    // In-memory copy of the small table: join key -> remaining columns.
    // HashMap suffices: setup() and map() run on the same task thread.
    private final Map<String, String> joinData = new HashMap<String, String>();

    // Reused output holder; avoids allocating a Text per matched record.
    private final Text outValue = new Text();

    /**
     * Reads the cached small-table file (lines of "key,rest", split on the
     * first comma) into {@link #joinData}.
     *
     * @throws IOException if the cache file is missing or unreadable.
     *         BUGFIX: the original swallowed this error with System.err, so the
     *         job ran with an empty map and silently produced no output.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
        if (cacheFiles == null || cacheFiles.length == 0) {
            throw new IOException("Distributed cache file (small table) not found");
        }
        // BUGFIX: read with an explicit charset instead of FileReader's
        // platform default, so non-ASCII keys join consistently across nodes.
        BufferedReader joinReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(cacheFiles[0].toString()), StandardCharsets.UTF_8));
        try {
            String line;
            while ((line = joinReader.readLine()) != null) {
                String[] tokens = line.split(",", 2);
                // BUGFIX: skip malformed lines instead of crashing on tokens[1].
                if (tokens.length == 2) {
                    joinData.put(tokens[0], tokens[1]);
                }
            }
        } finally {
            joinReader.close();
        }
    }

    /**
     * Emits "bigValue,smallValue" under the shared key when the key exists in
     * the small table; otherwise the record is discarded.
     */
    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String joinValue = joinData.get(key.toString());
        if (StringUtils.isNotEmpty(joinValue)) {
            outValue.set(value.toString() + "," + joinValue);
            context.write(key, outValue);
        }
    }
}
驱动类
DataDriver.java
package com.hadoop.data.join.disCache;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the map-side join: registers the small table as a distributed
 * cache file, then runs a map-only job (zero reducers) over the big table.
 *
 * Args: &lt;cache path&gt; &lt;input path&gt; &lt;output path&gt;
 */
public class DataDriver {

    public void run(String[] args) throws Exception {
        if (args.length != 3) {
            // BUGFIX: usage now lists all three required arguments
            // (the original message omitted the cache path).
            System.err.println("Usage: DataDisCache <cache path> <input path> <output path>");
            System.exit(-1);
        }
        String pathCache = args[0];
        String pathIn = args[1];
        String pathOut = args[2];

        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);
        Path outPath = new Path(pathOut);
        if (hdfs.exists(outPath)) {
            // Remove a stale output directory so the job can run repeatedly.
            hdfs.delete(outPath, true);
        }

        // Ship the small table to every map node; "#Customers.txt" names the
        // local symlink. BUGFIX: the original built this URI twice and left
        // the first one unused.
        DistributedCache.addCacheFile(URI.create(pathCache + "#Customers.txt"), conf);

        Job job = new Job(conf, "Data join base on distributedCache");
        job.setJarByClass(DataDriver.class);
        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setInputFormatClass(DisCahceInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Map-only: the join happens entirely in the mappers.
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(pathIn));
        FileOutputFormat.setOutputPath(job, outPath);

        // BUGFIX: removed the println after System.exit — it was unreachable.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        new DataDriver().run(args);
    }
}
缺点:如果小表的数据可观,会出现OOM现象。