first Codec
/**
 * Minimal demo class used in the article's introduction.
 *
 * <p>Fix: the original snippet was wrapped in stray Markdown bold markers
 * ("**" on the first and last lines), which made it non-compilable Java;
 * the markers have been removed.
 */
public class Friend {
    public static void main(String[] args) {
        // Prints the author's QQ-group line exactly as in the original.
        System.out.println("BigData加QQ群:947967114");
    }
}
在Hadoop中Merge是一个线程,hadoop定义了一个抽象类MergeThread
abstract class MergeThread<T,K,V> extends Thread,我们选取一些重点内容来查看:
private LinkedList<List&lt;T&gt;> pendingToBeMerged;//待合并队列(Hadoop 源码中元素类型是 List&lt;T&gt;,原文漏掉了泛型参数)
protected final MergeManagerImpl<K,V> manager;
//所属的MergeManagerImpl
private final int mergeFactor;
//这个merge最多可做几路的合并。
public void startMerge(Set&lt;T&gt; inputs)
//把数据源inputs挂入pendingToBeMerged队列
public void run()
每个MergeThread都有自己的pendingToBeMerged队列,这个队列是List的LinkedList,队列中的元素就是List,实际上就是一组输入数据源,想要某个MergeThread线程合并一组数据源,就把这组数据源挂入它的队列中。但是MergeThread是一个抽象类,需要实体类来实现,hadoop中有三个类对MergeThread进行了扩展。
他们都是定义在MergeManagerImpl中的内部类。分别是:
private class IntermediateMemoryToMemoryMerger extends MergeThread&lt;InMemoryMapOutput&lt;K,V&gt;, K,V&gt;
private class InMemoryMerger extends MergeThread<InMemoryMapOutput<K,V>, K,V>
private class OnDiskMerger extends MergeThread<CompressAwarePath,K,V>
在ReduceTask.run中创建了一个MergeManagerImpl对象,这个类的内部定义了这三个类。并且在创建MergeManagerImpl对象过程中至少创建了其中的两个,也就是InMemoryMerger和OnDiskMerger。
我们要看一下MergeManagerImpl的摘要:我们只看主体内容。
private static final float DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT
= 0.25f;
protected MapOutputFile mapOutputFile;
Set<InMemoryMapOutput<K, V>> inMemoryMergedMapOutputs =
new TreeSet<InMemoryMapOutput<K,V>>(new MapOutputComparator<K, V>());
private IntermediateMemoryToMemoryMerger memToMemMerger;
Set<InMemoryMapOutput<K, V>> inMemoryMapOutputs =
new TreeSet<InMemoryMapOutput<K,V>>(new MapOutputComparator<K, V>());
private final MergeThread<InMemoryMapOutput<K,V>, K,V> inMemoryMerger;
Set&lt;CompressAwarePath&gt; onDiskMapOutputs = new TreeSet&lt;CompressAwarePath&gt;();
private final OnDiskMerger onDiskMerger;
private final Class<? extends Reducer> combinerClass;
private final CombineOutputCollector<K,V> combineCollector;
private final CompressionCodec codec;
public MergeManagerImpl(TaskAttemptID reduceId, JobConf jobConf,
FileSystem localFS,
LocalDirAllocator localDirAllocator,
Reporter reporter,
CompressionCodec codec,
Class<? extends Reducer> combinerClass,
CombineOutputCollector<K,V> combineCollector,
Counters.Counter spilledRecordsCounter,
Counters.Counter reduceCombineInputCounter,
Counters.Counter mergedMapOutputsCounter,
ExceptionReporter exceptionReporter,
Progress mergePhase, MapOutputFile mapOutputFile) {
this.reduceId = reduceId;
this.jobConf = jobConf;
this.localDirAllocator = localDirAllocator;
this.exceptionReporter = exceptionReporter;
this.reporter = reporter;
this.codec = codec;
this.combinerClass = combinerClass;
this.combineCollector = combineCollector;
this.reduceCombineInputCounter = reduceCombineInputCounter;
this.spilledRecordsCounter = spilledRecordsCounter;
this.mergedMapOutputsCounter = mergedMapOutputsCounter;
this.mapOutputFile = mapOutputFile;
this.mapOutputFile.setConf(jobConf);
this.localFS = localFS;
this.rfs = ((LocalFileSystem)localFS).getRaw();
final float maxInMemCopyUse =
jobConf.getFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT,
MRJobConfig.DEFAULT_SHUFFLE_INPUT_BUFFER_PERCENT);
if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
throw new IllegalArgumentException("Invalid value for " +
MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT + ": " +
maxInMemCopyUse);
}
// Allow unit tests to fix Runtime memory
this.memoryLimit =
(long)(jobConf.getLong(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES,
Math.min(Runtime.getRuntime().maxMemory(), Integer.MAX_VALUE))
* maxInMemCopyUse);//shuffle可用内存的上限(最多能有多少字节的map输出驻留在内存中参与Merge)
this.ioSortFactor = jobConf.getInt(MRJobConfig.IO_SORT_FACTOR, 100);
//io.sort.factor:一次归并最多合并的文件(段)数,默认取100路;它是归并的扇入数,并不是100MB的内存空间
final float singleShuffleMemoryLimitPercent =
jobConf.getFloat(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT,
DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT);
if (singleShuffleMemoryLimitPercent <= 0.0f
|| singleShuffleMemoryLimitPercent > 1.0f) {
throw new IllegalArgumentException("Invalid value for "
+ MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT + ": "
+ singleShuffleMemoryLimitPercent);
}
usedMemory = 0L;
commitMemory = 0L;
this.maxSingleShuffleLimit =
(long)(memoryLimit * singleShuffleMemoryLimitPercent);
this.memToMemMergeOutputsThreshold =
jobConf.getInt(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, ioSortFactor);