1. Conditions that trigger unsafe shuffle and yield a SerializedShuffleHandle
Reference class: org.apache.spark.shuffle.sort.SortShuffleManager
def canUseSerializedShuffle(dependency: ShuffleDependency[_, _, _]): Boolean = {
  val shufId = dependency.shuffleId
  val numPartitions = dependency.partitioner.numPartitions
  if (!dependency.serializer.supportsRelocationOfSerializedObjects) {
    // 1. The serializer must support relocation of serialized objects.
    //    Currently UnsafeRowSerializer and KryoSerializer support relocation, so
    //    Spark SQL's UnsafeRow (written with UnsafeRowSerializer) usually qualifies.
    log.debug(s"Can't use serialized shuffle for shuffle $shufId because the serializer, " +
      s"${dependency.serializer.getClass.getName}, does not support object relocation")
    false
  } else if (dependency.mapSideCombine) {
    // 2. Map-side combine must not be required.
    log.debug(s"Can't use serialized shuffle for shuffle $shufId because we need to do " +
      s"map-side aggregation")
    false
  } else if (numPartitions > MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE) {
    // 3. The number of partitions must not exceed
    //    MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE (2^24 = 16777216).
    log.debug(s"Can't use serialized shuffle for shuffle $shufId because it has more than " +
      s"$MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE partitions")
    false
  } else {
    log.debug(s"Can use serialized shuffle for shuffle $shufId")
    true
  }
}
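From the user side, whether these checks pass is decided by the shuffle dependency the operator creates. The following sketch (hypothetical demo code, not from the book; exact behaviour depends on the Spark version and serializer configuration) contrasts a repartition shuffle, which has no map-side combine and so can take the serialized path when Kryo is used, with reduceByKey, whose mapSideCombine = true fails the second check.

import org.apache.spark.{SparkConf, SparkContext}

object SerializedShufflePathDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("serialized-shuffle-path-demo")
      .setMaster("local[2]")
      // KryoSerializer supports relocation of serialized objects (assumption: defaults kept)
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)
    val pairs = sc.parallelize(1 to 10000).map(i => (i % 100, i))

    // No aggregation, no map-side combine, far fewer than 2^24 partitions:
    // canUseSerializedShuffle should return true for this dependency.
    pairs.repartition(8).count()

    // reduceByKey sets mapSideCombine = true, so the second check above fails and
    // the ordinary SortShuffleWriter path is used instead.
    pairs.reduceByKey(_ + _).count()

    sc.stop()
  }
}

With debug logging enabled for SortShuffleManager, the "Can use serialized shuffle" / "Can't use serialized shuffle" messages from the method above show which path each shuffle actually took.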
2. The write method of UnsafeShuffleWriter
Reference class: org.apache.spark.shuffle.sort.UnsafeShuffleWriter
public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
  // Keep track of success so we know if we encountered an exception.
  // We do this rather than a standard try/catch/re-throw to handle generic throwables.
  boolean success = false;
  try {
    while (records.hasNext()) {
      // Insert each record into the ShuffleExternalSorter.
      insertRecordIntoSorter(records.next());
    }
    // Merge the spill files into the final output file.
    closeAndWriteOutput();
    success = true;
  } finally {
    if (sorter != null) {
      try {
        sorter.cleanupResources();
      } catch (Exception e) {
        // Only throw this error if we won't be masking another error.
        if (success) {
          throw e;
        } else {
          logger.error("In addition to a failure during writing, we failed during " +
            "cleanup.", e);
        }
      }
    }
  }
}
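The success flag exists only so that a failure inside cleanupResources does not mask the exception that actually broke the write. A minimal sketch of the same pattern with a hypothetical resource (not Spark code):

object CleanupPatternSketch {
  def writeAll(records: Iterator[String], write: String => Unit, cleanup: () => Unit): Unit = {
    var success = false
    try {
      records.foreach(write)
      success = true
    } finally {
      try {
        cleanup()
      } catch {
        case e: Exception =>
          // Rethrow the cleanup failure only if the main work succeeded; otherwise log it
          // so the original write exception stays visible.
          if (success) throw e
          else System.err.println(s"Cleanup also failed after a write failure: $e")
      }
    }
  }
}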
3. The insertRecordIntoSorter method of UnsafeShuffleWriter
Reference class: org.apache.spark.shuffle.sort.UnsafeShuffleWriter
void insertRecordIntoSorter(Product2<K, V> record) throws IOException {
  assert(sorter != null);
  final K key = record._1();
  // The Partitioner maps the key to a partitionId.
  final int partitionId = partitioner.getPartition(key);
  // Serialize the key and value into the temporary serBuffer via serOutputStream.
  serBuffer.reset();
  serOutputStream.writeKey(key, OBJECT_CLASS_TAG);
  serOutputStream.writeValue(record._2(), OBJECT_CLASS_TAG);
  serOutputStream.flush();
  final int serializedRecordSize = serBuffer.size();
  assert (serializedRecordSize > 0);
  // Hand the serialized bytes and the partitionId to the ShuffleExternalSorter.
  sorter.insertRecord(
    serBuffer.getBuf(), Platform.BYTE_ARRAY_OFFSET, serializedRecordSize, partitionId);
}
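The key point of this step is that the record is serialized exactly once, up front, and from then on moves through the shuffle as opaque bytes plus a partition id. A minimal sketch of the same idea using plain JDK streams (SerializeRecordSketch is hypothetical; Spark's serBuffer/serOutputStream are a SerializationStream over an exposed-buffer output stream):

import java.io.{ByteArrayOutputStream, ObjectOutputStream}

object SerializeRecordSketch {
  private val serBuffer = new ByteArrayOutputStream(1024 * 1024) // reusable buffer

  def serialize(key: String, value: String, numPartitions: Int): (Array[Byte], Int) = {
    // HashPartitioner-style non-negative modulo picks the partition id from the key.
    val partitionId = ((key.hashCode % numPartitions) + numPartitions) % numPartitions
    serBuffer.reset()
    val out = new ObjectOutputStream(serBuffer)
    out.writeObject(key)
    out.writeObject(value)
    out.flush()
    (serBuffer.toByteArray, partitionId) // opaque bytes handed on together with the partition id
  }
}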
4. The insertRecord method of ShuffleExternalSorter
Reference class: org.apache.spark.shuffle.sort.ShuffleExternalSorter
/**
 * Write a record to the shuffle sorter.
 * 1. The serialized (k, v) bytes are copied into a memory page (MemoryBlock).
 * 2. Sorting is delegated to inMemSorter, which keeps a LongArray of packed
 *    (pointer, partitionId) entries, one per record.
 */
public void insertRecord(Object recordBase, long recordOffset, int length, int partitionId)
    throws IOException {

  // for tests
  assert(inMemSorter != null);
  // If the number of records in inMemSorter has reached the
  // spark.shuffle.spill.numElementsForceSpillThreshold threshold, spill first:
  // write the in-memory data to disk so that memory can be freed.
  if (inMemSorter.numRecords() >= numElementsForSpillThreshold) {
    logger.info("Spilling data because number of spilledRecords crossed the threshold " +
      numElementsForSpillThreshold);
    spill();
  }
  // Check whether inMemSorter has room for the new record's pointer; if not,
  // grow the LongArray to twice its current size.
  growPointerArrayIfNecessary();
  final int uaoSize = UnsafeAlignedOffset.getUaoSize();
  // Need 4 or 8 bytes to store the record length.
  final int required = length + uaoSize;
  // Check whether the current memory page (currentPage) can hold the new record plus
  // its uaoSize length prefix; if not, acquire a new page.
  acquireNewPageIfNecessary(required);

  assert(currentPage != null);
  final Object base = currentPage.getBaseObject();
  // Encode the record address from currentPage and pageCursor (the offset in the page).
  final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor);
  // Write the length of the (k, v) bytes into the page (Platform.putInt under the hood).
  UnsafeAlignedOffset.putSize(base, pageCursor, length);
  pageCursor += uaoSize;
  // Copy the serialized (k, v) bytes themselves into the page with Platform.copyMemory,
  // so the final layout in the page is (len + k + v).
  Platform.copyMemory(recordBase, recordOffset, base, pageCursor, length);
  pageCursor += length;
  // Insert the record address together with the partitionId into inMemSorter.
  inMemSorter.insertRecord(recordAddress, partitionId);
}
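A minimal sketch of the page layout described above, using a plain ByteBuffer in place of a Tungsten MemoryBlock and the Platform API (PageSketch is hypothetical): each record is written as a length prefix followed by its serialized (k, v) bytes, while a separate pointer list records (offset, partitionId) for later sorting.

import java.nio.ByteBuffer

class PageSketch(pageSize: Int) {
  private val page = ByteBuffer.allocate(pageSize) // stands in for currentPage
  // one (offsetInPage, partitionId) entry per record, like inMemSorter's pointer array
  private val pointers = scala.collection.mutable.ArrayBuffer.empty[(Int, Int)]

  def insertRecord(recordBytes: Array[Byte], partitionId: Int): Unit = {
    require(page.remaining() >= 4 + recordBytes.length,
      "a real sorter would acquire a new page here")
    val recordOffset = page.position() // plays the role of pageCursor
    page.putInt(recordBytes.length)    // the length prefix (uaoSize bytes in Spark)
    page.put(recordBytes)              // the serialized (k, v) bytes: layout is (len + k + v)
    pointers += ((recordOffset, partitionId))
  }
}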
5. The insertRecord method of ShuffleInMemorySorter
Reference class: org.apache.spark.shuffle.sort.ShuffleInMemorySorter
public void insertRecord(long recordPointer, int partitionId) {
  if (!hasSpaceForAnotherRecord()) {
    throw new IllegalStateException("There is no space for new record");
  }
  // PackedRecordPointer.packPointer(recordPointer, partitionId) packs everything into one long:
  //   [24-bit partition number][13-bit page number][27-bit offset in page]
  array.set(pos, PackedRecordPointer.packPointer(recordPointer, partitionId));
  pos++;
}
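A minimal sketch of that bit layout (PackedPointerSketch is hypothetical and skips the address-compression Spark applies to the raw recordPointer): partition id in the top 24 bits, page number in the next 13, in-page offset in the low 27.

object PackedPointerSketch {
  // Pack as [24-bit partition | 13-bit page number | 27-bit offset in page].
  def pack(partitionId: Int, pageNumber: Int, offsetInPage: Long): Long = {
    require(partitionId < (1 << 24) && pageNumber < (1 << 13) && offsetInPage < (1L << 27))
    (partitionId.toLong << 40) | (pageNumber.toLong << 27) | offsetInPage
  }
  def partitionId(packed: Long): Int   = ((packed >>> 40) & 0xFFFFFF).toInt
  def pageNumber(packed: Long): Int    = ((packed >>> 27) & 0x1FFF).toInt
  def offsetInPage(packed: Long): Long = packed & ((1L << 27) - 1)
}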
6. The spill method of ShuffleExternalSorter
Reference class: org.apache.spark.shuffle.sort.ShuffleExternalSorter
/**
 * Sort and spill the current records in response to memory pressure.
 */
@Override
public long spill(long size, MemoryConsumer trigger) throws IOException {
  if (trigger != this || inMemSorter == null || inMemSorter.numRecords() == 0) {
    return 0L;
  }

  logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)",
    Thread.currentThread().getId(),
    Utils.bytesToString(getMemoryUsage()),
    spills.size(),
    spills.size() > 1 ? " times" : " time");

  // Sort the in-memory data and write the sorted records to a disk file.
  // inMemSorter sorts its pointer array by partitionId, using radix sort by default
  // (the alternative is TimSort, a heavily optimized merge sort).
  // Each element of the sorted iterator is a PackedRecordPointer-encoded address, from
  // which the original record can be located directly. The records were already
  // serialized when they were written into the memory pages, so the spill simply copies
  // byte arrays; within one spill file each partition's data is represented as a
  // FileSegment, and this metadata is kept in a SpillInfo.
  writeSortedFile(false);
  // Free the memory after the spill.
  final long spillSize = freeMemory();
  inMemSorter.reset();
  // Reset the in-memory sorter's pointer array only after freeing up the memory pages
  // holding the records. Otherwise, if the task is over allocated memory, then without
  // freeing the memory pages, we might not be able to get memory for the pointer array.
  taskContext.taskMetrics().incMemoryBytesSpilled(spillSize);
  return spillSize;
}
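Because the partition id sits in the highest bits of every packed pointer, simply sorting the pointer array groups records by partition; writeSortedFile then walks the sorted pointers once and tracks where each partition's bytes start and end. A minimal sketch of that grouping pass (SpillSketch is hypothetical and only counts records per partition instead of copying bytes to disk and building SpillInfo/FileSegment metadata):

object SpillSketch {
  private def partitionId(packed: Long): Int = ((packed >>> 40) & 0xFFFFFF).toInt

  // Returns (partitionId, recordCount) pairs describing contiguous segments of the
  // sorted pointer array, one per partition that has data.
  def partitionSegments(pointers: Array[Long]): Seq[(Int, Int)] = {
    // With partition ids below 2^23 the sign bit stays clear, so a plain signed sort
    // already groups the packed pointers by partition.
    java.util.Arrays.sort(pointers)
    val segments = scala.collection.mutable.ArrayBuffer.empty[(Int, Int)]
    var i = 0
    while (i < pointers.length) {
      val pid = partitionId(pointers(i))
      var count = 0
      while (i < pointers.length && partitionId(pointers(i)) == pid) {
        count += 1
        i += 1
      }
      segments += ((pid, count))
    }
    segments.toSeq
  }
}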
Reference: SparkSQL内核剖析 (朱峰, 黄明), 电子工业出版社 (Publishing House of Electronics Industry).