Spark RDD Internals
Three ways to create an RDD
- From an in-memory collection
- From a local file
- From a Hive table
Overview diagram (illustration)

Source-code walkthrough
0. Prerequisites
import org.apache.spark.{SparkConf, SparkContext}

// Spark configuration
// local = 1 thread, local[2] = 2 threads, local[*] = all available cores
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("RDD_Partition")
// Connection to the Spark cluster; the main entry point for Spark functionality.
val sc: SparkContext = new SparkContext(conf)
1. Creating an RDD from memory
① Determining the number of partitions
sc.makeRDD(List(1, 2, 3))
sc.makeRDD(List(1, 2, 3), numSlices) // total number of slices (partitions)

def makeRDD[T: ClassTag](
    seq: Seq[T],
    numSlices: Int = defaultParallelism): RDD[T]

// 1. numSlices omitted: defaultParallelism is used. That is spark.default.parallelism if set,
//    otherwise max(total cores, 2) in cluster mode and totalCores in local mode.
def defaultParallelism: Int = {
  assertNotStopped()
  taskScheduler.defaultParallelism // abstract on TaskScheduler
}
TaskSchedulerImpl.defaultParallelism(): Int = backend.defaultParallelism() // abstract on SchedulerBackend

SchedulerBackend.defaultParallelism()
  CoarseGrainedSchedulerBackend
    StandaloneSchedulerBackend
    YarnSchedulerBackend (abstract)
      YarnClusterSchedulerBackend
      YarnClientSchedulerBackend
  LocalSchedulerBackend

CoarseGrainedSchedulerBackend.defaultParallelism(): Int = {
  conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2)) // max of total cores and 2
}
LocalSchedulerBackend.defaultParallelism(): Int =
  scheduler.conf.getInt("spark.default.parallelism", totalCores)

def getInt(key: String, defaultValue: Int): Int = {
  getOption(key).map(_.toInt).getOrElse(defaultValue) // read from SparkConf; fall back to the default (here totalCores) if the key was never set
}
// Local mode:
//   totalCores == number of local cores requested via SparkConf.setMaster("local[*]")
// Cluster mode:
//   totalCoreCount == total cores across all registered executors
//   (receiveAndReply() { case RegisterExecutor => totalCoreCount.addAndGet(cores) })
// 2. Explicit numSlices: useful when the data set is small and you do not need every CPU core
//    (1 partition/slice = 1 task = 1 core/thread)
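To see the two cases side by side, here is a minimal sketch (the 8-core figure is only an assumed local machine; getNumPartitions reports the partition count):

```scala
// Reusing the sc defined in the prerequisites (local[*] on, say, an 8-core machine):
println(sc.makeRDD(List(1, 2, 3)).getNumPartitions)    // 8 -- defaultParallelism = totalCores in local mode
println(sc.makeRDD(List(1, 2, 3), 2).getNumPartitions) // 2 -- an explicit numSlices wins
// Setting spark.default.parallelism on the SparkConf would override both fallback values.
```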
② Data distribution
makeRDD() { parallelize(seq, numSlices) }
new ParallelCollectionRDD[T]() {
  ParallelCollectionRDD.slice() {
    def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] = {
      (0 until numSlices).iterator.map { i =>
        // Ex1, evenly divisible: length = 100, numSlices = 10 (i = 0..9)
        //   i=0: start = 0*100/10 = 0;  end = (0+1)*100/10 = 10
        //   i=1: start = 1*100/10 = 10; end = (1+1)*100/10 = 20
        // Ex2, not evenly divisible: length = 7, numSlices = 3 (i = 0..2)
        //   i=0: start = 0*7/3 = 0; end = (0+1)*7/3 = 2
        //   i=1: start = 1*7/3 = 2; end = (1+1)*7/3 = 4
        //   i=2: start = 2*7/3 = 4; end = (2+1)*7/3 = 7
        val start = ((i * length) / numSlices).toInt
        val end = (((i + 1) * length) / numSlices).toInt
        (start, end)
      }
    }
    seq match {
      ...
      case nr: NumericRange[_] =>
        // For ranges of Long, Double, BigInteger, etc.
        val slices = new ArrayBuffer[Seq[T]](numSlices)
        var r = nr
        for ((start, end) <- positions(nr.length, numSlices)) {
          val sliceSize = end - start
          slices += r.take(sliceSize).asInstanceOf[Seq[T]] // take the next sliceSize elements of the sequence
          r = r.drop(sliceSize)
        }
        slices
      ...
    }
  }
}
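The same (start, end) arithmetic can be observed from the driver with glom(), which gathers each partition into one array (a minimal sketch, reusing the sc from the prerequisites):

```scala
// 7 elements over 3 slices => positions gives (0,2), (2,4), (4,7), i.e. 2 + 2 + 3 elements
val rdd = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7), numSlices = 3)
rdd.glom().collect().foreach(part => println(part.mkString("[", ", ", "]")))
// Expected output:
// [1, 2]
// [3, 4]
// [5, 6, 7]
```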
2. Creating an RDD from a file
① Determining the number of partitions
sc.textFile("baseadlog.log")
sc.textFile("baseadlog.log", minPartitions) // minimum number of partitions; the final count can be larger than minPartitions

def textFile(
    path: String,
    minPartitions: Int = defaultMinPartitions): RDD[String]

// 1. minPartitions omitted: the smaller of defaultParallelism and 2
def defaultMinPartitions: Int = math.min(defaultParallelism, 2)
// How defaultParallelism is resolved:
//   cluster mode: max(total executor cores, 2) by default
//   local mode:   total local cores by default
//   in both cases, spark.default.parallelism takes precedence if it is set
def defaultParallelism: Int = {
assertNotStopped()
taskScheduler.defaultParallelism
}
TaskScheduler (implemented by TaskSchedulerImpl)
  YarnScheduler
    YarnClusterScheduler
TaskSchedulerImpl.defaultParallelism(): Int = backend.defaultParallelism()

SchedulerBackend
  CoarseGrainedSchedulerBackend {
    override def defaultParallelism(): Int =
      conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2))
  }
    StandaloneSchedulerBackend
    YarnSchedulerBackend (abstract)
      YarnClusterSchedulerBackend
      YarnClientSchedulerBackend
  LocalSchedulerBackend {
    override def defaultParallelism(): Int =
      scheduler.conf.getInt("spark.default.parallelism", totalCores)
  }
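Both values can be inspected directly on the SparkContext (a minimal sketch; the 8-core figure again assumes a local[*] master on an 8-core machine):

```scala
// In local mode defaultParallelism == totalCores, so on an 8-core machine:
println(sc.defaultParallelism)   // e.g. 8
println(sc.defaultMinPartitions) // min(8, 2) = 2
```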
hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
  minPartitions).map(pair => pair._2.toString).setName(path)

InputFormat
  FileInputFormat (abstract)
    TextInputFormat
    MapredParquetInputFormat    // Parquet columnar storage
    AvroAsTextInputFormat       // reads Avro (compressed) files as text
  HiveInputFormat               // Hive's generic input format; delegates to the table's actual InputFormat (see section 3)
    BucketizedHiveInputFormat
    CombineHiveInputFormat      // reads files of different formats within the same MR job
    HiveIndexedInputFormat      // index-based queries; a block-filter file selects the blocks to read (hive.index.blockfilter.file)
    HiveCompactIndexInputFormat // compact-index queries (hive.index.compact.file)
def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]], // upper type bound: any subclass of InputFormat[K, V]
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions) {
  new HadoopRDD(
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path) {
    ...
    override def getPartitions: Array[Partition] = {
      ...
      val inputFormat = getInputFormat(jobConf)
      val inputSplits = inputFormat.getSplits(jobConf, minPartitions) // minPartitions is only a lower bound: minPartitions <= number of splits
      ...
    }
    ...
  }
}
// 2. Explicit minPartitions
// org.apache.hadoop.mapreduce.lib.input.FileInputFormat -- the new-API FIF.
// Differences from the old-API FIF:
// getMinSplitSize reads "mapreduce.input.fileinputformat.split.minsize", default 1
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); // both default to 1
// getMaxSplitSize reads "mapreduce.input.fileinputformat.split.maxsize", default Long.MAX_VALUE
long maxSize = getMaxSplitSize(job);
// Split size = max(minSize (default 1), min(maxSize (default Long.MAX_VALUE), blockSize)),
// i.e. simply the block size by default. This is the new-API optimization: aligning one split
// with one block avoids a split spanning several blocks and the extra network IO that causes.
long splitSize = computeSplitSize(blockSize, minSize, maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
};
// The old API works against JobConf (a dedicated configuration object); the new API works against
// JobContext (a single entry point that also integrates with the rest of MapReduce).
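For reference, the new-API formula boils down to the following arithmetic (a minimal Scala sketch, not the Hadoop code itself; the 128 MB figure is just an assumed HDFS block size):

```scala
// New-API split size: max(minSize, min(maxSize, blockSize))
def newApiSplitSize(blockSize: Long, minSize: Long = 1L, maxSize: Long = Long.MaxValue): Long =
  math.max(minSize, math.min(maxSize, blockSize))

println(newApiSplitSize(128L << 20)) // 134217728 -- with the defaults, one split per 128 MB block
```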
// org.apache.hadoop.mapred.FileInputFormat -- the old-API FIF
// ② Split computation (each file is cut into its own partitions/splits); TextInputFormat <: FileInputFormat, which implements getSplits()
public InputSplit[] getSplits(JobConf job, int numSplits)throws IOException {
FileStatus[] files = listStatus(job);
//record job-level input metrics (number of input files)
//1. totalSize: total size of all input files
job.setLong(NUM_INPUT_FILES, files.length);
long totalSize = 0;
for (FileStatus file: files) {
if (file.isDirectory()) {
throw new IOException("Not a file: "+ file.getPath());
}
totalSize += file.getLen();
}
//2. goalSize: target size of a single split = totalSize / numSplits (numSplits is the minPartitions passed in: user value, 2 by default, treated as 1 when 0)
long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
//3. minSize: the larger of the configured "mapreduce.input.fileinputformat.split.minsize" and minSplitSize (= 1)
long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.
  FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
//private long minSplitSize = 1;
//protected void setMinSplitSize(long minSplitSize) { this.minSplitSize = minSplitSize; }
//setMinSplitSize is only called by a few specific InputFormats: SequenceFileIF, RCFileIF, VectorizedRCFileIF, VectorizedOrcIF
//mapreduce.lib.input.FileInputFormat is the base class of all FileInputFormats; it implements the generic getSplits
//(which subclasses may override) and holds the common configuration properties.
//build the list of splits/partitions
ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
NetworkTopology clusterMap = new NetworkTopology();
//each file is handled on its own ==> one file can produce several splits
for (FileStatus file: files) {
Path path = file.getPath();
long length = file.getLen();
if (length != 0) {
FileSystem fs = path.getFileSystem(job);
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations();
} else {
blkLocations = fs.getFileBlockLocations(file, 0, length);
}
if (isSplitable(fs, path)) {
//block size of this file
long blockSize = file.getBlockSize();
//split size = max(minSize (default 1), min(goalSize = totalSize/numSplits, blockSize))
long splitSize = computeSplitSize(goalSize, minSize, blockSize) {
    return Math.max(minSize, Math.min(goalSize, blockSize)); // minSize defaults to 1
};
//bytesRemaining tracks how much of the file has not yet been assigned to a split
long bytesRemaining = length;
//private static final double SPLIT_SLOP = 1.1; -- another full split is cut only while more than 110% of a splitSize remains
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
//rack awareness / DataNode locality: keep each split on the closest nodes to reduce network IO
String[] splitHosts = getSplitHosts(blkLocations,
length-bytesRemaining, splitSize, clusterMap);
splits.add(makeSplit(path, length-bytesRemaining, splitSize,
splitHosts));
bytesRemaining -= splitSize;
}
//whatever remains (at most 110% of splitSize) becomes the final split, so the last split can be anywhere between 0% and 110% of splitSize
if (bytesRemaining != 0) {
String[] splitHosts = getSplitHosts(blkLocations, length- bytesRemaining,
bytesRemaining, clusterMap);
splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
splitHosts));
}
} else {
String[] splitHosts = getSplitHosts(blkLocations,0,length,clusterMap);
splits.add(makeSplit(path, 0, length, splitHosts));
}
} else {
//Create empty hosts array for zero length files
splits.add(makeSplit(path, 0, length, new String[0]));
}
}
LOG.debug("Total # of splits: " + splits.size());
return splits.toArray(new FileSplit[splits.size()]);
}
Partition examples
//Note 1: the local filesystem's default block size is 32 MB
//Note 2: HDFS/MapReduce offsets start at 0 and records are read line by line
//Note 3: splits/partitions are computed per file; the total partition count is the sum over all files
Example 1: a.txt (10 bytes)
    1 CR LF   offsets [0,1,2]
    2 CR LF   offsets [3,4,5]
    3 CR LF   offsets [6,7,8]
    4         offset  [9]
10 bytes, minPartitions = 3, block size = 32 MB
splitSize = max(1, min(goalSize = 10 / 3 = 3 B, 32 MB)) = 3 B

Split check                     start + size   range     contents
(double) 10/3 = 3.3 > 1.1       0 + 3          [0,3]     1 CR LF 2 CR LF
(double)  7/3 = 2.3 > 1.1       3 + 3          [3,6]     3 CR LF
(double)  4/3 = 1.3 > 1.1       6 + 3          [6,9]     4
(double)  1/3 = 0.3 < 1.1       9 + 1          [9,10]    (empty)
Example 2: a.txt and b.txt
a.txt (10 bytes):
    1 CR LF   offsets [0,1,2]
    2 CR LF   offsets [3,4,5]
    3 CR LF   offsets [6,7,8]
    4         offset  [9]
b.txt (19 bytes):
    1 CR LF   offsets [0,1,2]
    2 CR LF   offsets [3,4,5]
    3 CR LF   offsets [6,7,8]
    4 CR LF   offsets [9,10,11]
    5 CR LF   offsets [12,13,14]
    6 CR LF   offsets [15,16,17]
    7         offset  [18]
29 bytes in total, minPartitions = 3, block size = 32 MB
splitSize = max(1, min(goalSize = 29 / 3 = 9 B, 32 MB)) = 9 B

a.txt -> 2 splits               start + size   range     contents
(double) 10/9 = 1.11 > 1.1      0 + 9          [0,9]     1 CR LF 2 CR LF 3 CR LF 4
(double)  1/9 = 0.11 < 1.1      9 + 1          [9,10]    (empty)

b.txt -> 3 splits               start + size   range     contents
(double) 19/9 = 2.11 > 1.1      0 + 9          [0,9]     1 CR LF 2 CR LF 3 CR LF 4 CR LF
(double) 10/9 = 1.11 > 1.1      9 + 9          [9,18]    5 CR LF 6 CR LF 7
(double)  1/9 = 0.11 < 1.1      18 + 1         [18,19]   (empty)

a.txt (2) + b.txt (3) = 5 partitions in total
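The example arithmetic can be double-checked with a small script (a minimal sketch of the old-API loop shown above, not the actual Hadoop code):

```scala
// Old-API split computation, per file: splitSize = max(minSize, min(goalSize, blockSize)),
// then carve off splitSize bytes while more than 110% of a splitSize remains.
def splitsPerFile(fileLens: Seq[Long], minPartitions: Int,
                  blockSize: Long = 32L << 20, minSize: Long = 1L): Seq[Int] = {
  val totalSize = fileLens.sum
  val goalSize  = totalSize / math.max(minPartitions, 1)
  val splitSize = math.max(minSize, math.min(goalSize, blockSize))
  fileLens.map { len =>
    var remaining = len
    var n = 0
    while (remaining.toDouble / splitSize > 1.1) { remaining -= splitSize; n += 1 }
    if (remaining != 0) n += 1 // the leftover (0%-110% of splitSize) becomes the last split
    n
  }
}

println(splitsPerFile(Seq(10L), 3))      // List(4)    -> Example 1
println(splitsPerFile(Seq(10L, 19L), 3)) // List(2, 3) -> Example 2, 5 partitions in total
```

Reading the same directory with sc.textFile(path, 3) should accordingly report 5 partitions via getNumPartitions.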
3. Creating an RDD from a Hive table
hive> set hive.input.format;
hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat
hive> show create table dws.dws_member;
...
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
...
//1. CombineHiveInputFormat
//② Split computation
//Method: CombineHiveInputFormat.getSplits
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
PerfLogger perfLogger = PerfLogger.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
init(job);
ArrayList<InputSplit> result = new ArrayList<InputSplit>();
Path[] paths = getInputPaths(job);
//Key point: record the combinable and non-combinable input paths separately
List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);
int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM,
(int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
int numPathPerThread = (int) Math.ceil((double) paths.length / numThreads);
LOG.info("Total number of paths: " + paths.length +
", launching " + numThreads + " threads to check non-combinable ones.");
ExecutorService executor = Executors.newFixedThreadPool(numThreads);
List<Future<Set<Integer>>> futureList = new ArrayList<Future<Set<Integer>>>(numThreads);
try {
for (int i = 0; i < numThreads; i++) {
int start = i * numPathPerThread;
int length = i != numThreads - 1 ? numPathPerThread : paths.length - start;
futureList.add(executor.submit(
new CheckNonCombinablePathCallable(paths, start, length, job)));
}
Set<Integer> nonCombinablePathIndices = new HashSet<Integer>();
for (Future<Set<Integer>> future : futureList) {
nonCombinablePathIndices.addAll(future.get());
}
for (int i = 0; i < paths.length; i++) {
if (nonCombinablePathIndices.contains(i)) {
nonCombinablePaths.add(paths[i]);
} else {
combinablePaths.add(paths[i]);
}
}
} catch (Exception e) {
LOG.error("Error checking non-combinable path", e);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
throw new IOException(e);
} finally {
executor.shutdownNow();
}
// Store the previous value for the path specification
String oldPaths = job.get(HiveConf.ConfVars.HADOOPMAPREDINPUTDIR.varname);
if (LOG.isDebugEnabled()) {
LOG.debug("The received input paths are: [" + oldPaths +
"] against the property "
+ HiveConf.ConfVars.HADOOPMAPREDINPUTDIR.varname);
}
//handle the non-combinable paths as ordinary splits
if (nonCombinablePaths.size() > 0) {
FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray
(new Path[nonCombinablePaths.size()]));
//super => HiveInputFormat (which calls the table's actual InputFormat.getSplits())
InputSplit[] splits = super.getSplits(job, numSplits);
for (InputSplit split : splits) {
result.add(split);
}
}
//handle the combinable paths; result => one split per partition value (directory)
if (combinablePaths.size() > 0) {
FileInputFormat.setInputPaths(job, combinablePaths.toArray
(new Path[combinablePaths.size()]));
Map<String, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null ?
this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
//Key point: only splits from the same table AND the same partition are combined;
//splits from different tables or different partitions are never merged together.
InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo) {
  //a complex method (lines 325-496 in the Hive source); its core call is the same as below:
  //super (HiveInputFormat).getSplits() => calls the table's actual STORED AS InputFormat.getSplits()
};
for (InputSplit split : splits) {
result.add(split);
}
}
// Restore the old path information back
// This is just to prevent incompatibilities with previous versions Hive
// if some application depends on the original value being set.
if (oldPaths != null) {
job.set(HiveConf.ConfVars.HADOOPMAPREDINPUTDIR.varname, oldPaths);
}
// clear work from ThreadLocal after splits generated in case of thread is reused in pool.
Utilities.clearWorkMapForConf(job);
LOG.info("Number of all splits " + result.size());
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return result.toArray(new InputSplit[result.size()]);
}
//2. HiveInputFormat
//② Split computation
//Method: HiveInputFormat.getSplits
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
PerfLogger perfLogger = PerfLogger.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
//1. setup
init(job);
Path[] dirs = getInputPaths(job);
JobConf newjob = new JobConf(job);
List<InputSplit> result = new ArrayList<InputSplit>();
List<Path> currentDirs = new ArrayList<Path>();
Class<? extends InputFormat> currentInputFormatClass = null;
TableDesc currentTable = null;
TableScanOperator currentTableScan = null;
boolean pushDownProjection = false;
//Buffers to hold filter pushdown information
StringBuilder readColumnsBuffer = new StringBuilder(newjob.
//hive.io.file.readcolumn.ids
get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));;
StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.
//hive.io.file.readcolumn.names
get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
//2. splits are computed per partition value, i.e. per directory ==> one partition value can yield several splits
// for each dir, get the InputFormat, and do getSplits.
for (Path dir : dirs) {
PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
TableDesc table = part.getTableDesc();
TableScanOperator tableScan = null;
List<String> aliases =
mrwork.getPathToAliases().get(dir.toUri().toString());
// Make filter pushdown information available to getSplits.
if ((aliases != null) && (aliases.size() == 1)) {
Operator op = mrwork.getAliasToWork().get(aliases.get(0));
if ((op != null) && (op instanceof TableScanOperator)) {
tableScan = (TableScanOperator) op;
//Reset buffers to store filter push down columns
readColumnsBuffer.setLength(0);
readColumnNamesBuffer.setLength(0);
// push down projections.
ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer,
tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
pushDownProjection = true;
// push down filters
pushFilters(newjob, tableScan);
}
}
if (!currentDirs.isEmpty() &&
inputFormatClass.equals(currentInputFormatClass) &&
table.equals(currentTable) &&
tableScan == currentTableScan) {
currentDirs.add(dir);
continue;
}
if (!currentDirs.isEmpty()) {
LOG.info("Generating splits");
//Core: addSplitsForGroup ultimately calls getSplits on the InputFormat class declared in the table's DDL ↓↓↓ see evidence 1
//requested split count == currentDirs.size() * (numSplits / dirs.length)
addSplitsForGroup(currentDirs, currentTableScan, newjob,
getInputFormatFromCache(currentInputFormatClass, job),
currentInputFormatClass, currentDirs.size()*(numSplits / dirs.length),
currentTable, result);
}
currentDirs.clear();
currentDirs.add(dir);
currentTableScan = tableScan;
currentTable = table;
currentInputFormatClass = inputFormatClass;
}
if (pushDownProjection) {
newjob.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
newjob.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColumnsBuffer.toString());
newjob.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColumnNamesBuffer.toString());
LOG.info(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR + "=" + readColumnsBuffer.toString());
LOG.info(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR + "=" + readColumnNamesBuffer.toString());
}
if (dirs.length != 0) {
LOG.info("Generating splits");
addSplitsForGroup(currentDirs, currentTableScan, newjob,
getInputFormatFromCache(currentInputFormatClass, job),
currentInputFormatClass, currentDirs.size()*(numSplits / dirs.length),
currentTable, result);
}
Utilities.clearWorkMapForConf(job);
LOG.info("number of splits " + result.size());
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return result.toArray(new HiveInputFormat.HiveInputSplit[result.size()]);
}
//Method: HiveInputFormat.addSplitsForGroup
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits,
TableDesc table, List<InputSplit> result) throws IOException {
Utilities.copyTablePropertiesToConf(table, conf);
if (tableScan != null) {
pushFilters(conf, tableScan);
}
FileInputFormat.setInputPaths(conf, dirs.toArray(new Path[dirs.size()]));
conf.setInputFormat(inputFormat.getClass());
int headerCount = 0;
int footerCount = 0;
if (table != null) {
headerCount = Utilities.getHeaderCount(table);
footerCount = Utilities.getFooterCount(table, conf);
if (headerCount != 0 || footerCount != 0) {
// Input file has header or footer, cannot be splitted.
conf.setLong(
ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"),
Long.MAX_VALUE);
}
}
//Evidence 1: getSplits is invoked on the Hive table's actual InputFormat class (check with: show create table tableName)
//STORED AS INPUTFORMAT is typically 'org.apache.hadoop.mapred.TextInputFormat'
InputSplit[] iss = inputFormat.getSplits(conf, splits);
for (InputSplit is : iss) {
result.add(new HiveInputFormat.HiveInputSplit(is, inputFormatClass.getName()));
}
}
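Tying this back to Spark: when a Hive table is read through the Hive InputFormat path, the number of splits returned by these getSplits calls shows up as the partition count of the underlying RDD. A minimal sketch (dws.dws_member is the example table above; disabling convertMetastoreParquet is an assumption needed to keep Spark on the MapredParquetInputFormat path instead of its native Parquet reader):

```scala
import org.apache.spark.sql.SparkSession

// Requires a Spark build with Hive support and access to the Hive metastore.
val spark = SparkSession.builder()
  .appName("HiveTablePartitions")
  .master("local[*]")
  // Assumption: force the Hive InputFormat path (MapredParquetInputFormat) rather than
  // letting Spark substitute its native Parquet reader.
  .config("spark.sql.hive.convertMetastoreParquet", "false")
  .enableHiveSupport()
  .getOrCreate()

val df = spark.sql("SELECT * FROM dws.dws_member")
// Partition count of the underlying RDD reflects the input splits produced above.
println(df.rdd.getNumPartitions)
spark.stop()
```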