【Spark 2.1.1 Internals】Spark Internals ② Creating RDDs: Partitions and Parallelism at the Source Level (2020_01_14)

Three ways to create an RDD

  1. From a collection in memory
  2. From a local file
  3. From a Hive table

Overview diagram

[flow diagram omitted]

Source-Code Walkthrough

0. Prerequisites
//Spark configuration
//local = 1 thread, local[2] = 2 threads, local[*] = all available cores
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("RDD_Partition")
//Connection to a Spark cluster; the main entry point for Spark functionality
val sc: SparkContext = new SparkContext(conf)

1. Creating an RDD from a Collection in Memory

	① Determining the number of partitions
	sc.makeRDD(List(1,2,3))
	sc.makeRDD(List(1,2,3), numSlices) //total number of slices

	makeRDD[T: ClassTag](
      seq: Seq[T],
      numSlices: Int = defaultParallelism)

      	//1. If numSlices is omitted, defaultParallelism is used: spark.default.parallelism if set, otherwise max(totalCores, 2) on a cluster or totalCores in local mode
		def defaultParallelism: Int = {
			assertNotStopped()
			taskScheduler.defaultParallelism //abstract method on TaskScheduler
		}

			TaskSchedulerImpl.defaultParallelism(): Int = backend.defaultParallelism() //abstract method on SchedulerBackend

				SchedulerBackend.defaultParallelism()
					CoarseGrainedSchedulerBackend
						StandaloneSchedulerBackend
						YarnSchedulerBackend(abstract)
							YarnClusterSchedulerBackend
							YarnClientSchedulerBackend
					LocalSchedulerBackend

					CoarseGrainedSchedulerBackend.defaultParallelism(): Int = {
						conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2)) //max of the total core count and 2
					}

					LocalSchedulerBackend.defaultParallelism(): Int =
    					scheduler.conf.getInt("spark.default.parallelism", totalCores)

    					def getInt(key: String, defaultValue: Int): Int = {
							getOption(key).map(_.toInt).getOrElse(defaultValue) //read the property from SparkConf; fall back to totalCores if it was never set
						}

							//Local
							totalCores == number of cores on the local machine (from SparkConf.setMaster("local[*]"))
							//Cluster
							totalCores == total cores registered across the cluster (receiveAndReply(){ case RegisterExecutor => totalCoreCount.addAndGet(cores) })

		//2. Explicit numSlices [useful when the data set is small and you do not need every CPU: pick the partition count yourself] (1 partition/slice = 1 task = 1 core/thread)
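
	The fall-through above is easy to observe directly. A minimal sketch, assuming Spark 2.1.x and the local[*] setup from the prerequisite section, so the first count equals the machine's core count; the commented spark.default.parallelism line is an optional override, not something this walkthrough sets:

	import org.apache.spark.{SparkConf, SparkContext}

	object ParallelismDemo {
	  def main(args: Array[String]): Unit = {
	    val conf = new SparkConf().setMaster("local[*]").setAppName("RDD_Partition")
	    // conf.set("spark.default.parallelism", "4")   // optional: pin the default instead of using totalCores
	    val sc = new SparkContext(conf)

	    // numSlices omitted -> LocalSchedulerBackend.defaultParallelism -> totalCores
	    println(sc.makeRDD(List(1, 2, 3)).getNumPartitions)

	    // numSlices given -> exactly that many partitions (1 partition/slice = 1 task = 1 core/thread)
	    println(sc.makeRDD(List(1, 2, 3), 2).getNumPartitions)

	    sc.stop()
	  }
	}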

	② Distributing the data across partitions
	makeRDD(){parallelize(seq, numSlices)}
		new ParallelCollectionRDD[T](){
			ParallelCollectionRDD.slice(){
				def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] = {
					(0 until numSlices).iterator.map {
						i =>
							//Ex1 (evenly divisible): length=100, numSlices=10 (i = 0..9)
							//i=0 : start=0*100/10 = 0;  end=(0+1)*100/10 = 10
							//i=1 : start=1*100/10 = 10; end=(1+1)*100/10 = 20
							//Ex2 (not evenly divisible): length=7, numSlices=3 (i = 0..2)
							//i=0 : start=0*7/3 = 0; end=(0+1)*7/3 = 2
							//i=1 : start=1*7/3 = 2; end=(1+1)*7/3 = 4
							//i=2 : start=2*7/3 = 4; end=(2+1)*7/3 = 7
							val start = ((i * length) / numSlices).toInt
							val end = (((i + 1) * length) / numSlices).toInt
							(start, end)
					}
				}
				Seq match {
					...
					case nr: NumericRange[_] =>
				        // For ranges of Long, Double, BigInteger, etc
				        val slices = new ArrayBuffer[Seq[T]](numSlices)
				        var r = nr
				        for ((start, end) <- positions(nr.length, numSlices)) {
				            val sliceSize = end - start
				            slices += r.take(sliceSize).asInstanceOf[Seq[T]] //take the next sliceSize elements of the numeric range
				            r = r.drop(sliceSize)
				        }
			        slices
			        ...
				}
			}
		}
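
	The boundary arithmetic of positions() can be checked from the driver with glom(), which collects each partition as an array. A minimal sketch, reusing the sc from the prerequisite section; with length = 7 and numSlices = 3 the boundaries are (0,2), (2,4), (4,7), matching Ex2 above:

	// verify the slice boundaries computed by ParallelCollectionRDD.positions
	val data = List(1, 2, 3, 4, 5, 6, 7)   // length = 7
	val rdd  = sc.makeRDD(data, 3)          // numSlices = 3

	// glom() materializes each partition as an Array, making the boundaries visible
	// expected output: [1,2]  [3,4]  [5,6,7]
	rdd.glom().collect().foreach(part => println(part.mkString("[", ",", "]")))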

2. Creating an RDD from a File

	① Determining the number of partitions
	sc.textFile("baseadlog.log")
    sc.textFile("baseadlog.log", minPartitions) //minimum number of partitions: the final count may be larger than minPartitions

    textFile(
      path: String,
      minPartitions: Int = defaultMinPartitions)

      	//1. If minPartitions is omitted, defaultMinPartitions = min(defaultParallelism, 2) is used
		def defaultMinPartitions: Int = math.min(defaultParallelism, 2)

		//How defaultParallelism is resolved:
		//cluster mode: defaults to max(total cores of the allocated executors, 2)
		//local mode: defaults to the total number of local cores
		//in both cases the spark.default.parallelism property takes precedence
		def defaultParallelism: Int = {
			assertNotStopped()
			taskScheduler.defaultParallelism
		}
			TaskScheduler (trait)
				TaskSchedulerImpl
					YarnScheduler
						YarnClusterScheduler
			TaskSchedulerImpl.defaultParallelism(): Int = backend.defaultParallelism()
				SchedulerBackend
					CoarseGrainedSchedulerBackend{
						override def defaultParallelism(): Int =
						    conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2))
					}
						StandaloneSchedulerBackend
						YarnSchedulerBackend(abstract)
							YarnClusterSchedulerBackend
							YarnClientSchedulerBackend
					LocalSchedulerBackend{
						override def defaultParallelism(): Int =
    						scheduler.conf.getInt("spark.default.parallelism", totalCores)
					}


		hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
      			   minPartitions).map(pair => pair._2.toString).setName(path)

      		InputFormat
      			FileInputFormat(abstract)
      				TextInputFormat
      				MapredParquetInputFormat //Parquet columnar storage
      				AvroAsTextInputFormat //reads Avro (compressed) data files
      			HiveInputFormat //Hive's generic input format: resolves each path's actual InputFormat and delegates to it
      				BucketizedHiveInputFormat
      				CombineHiveInputFormat //combines splits so files of different formats can be read in one MR job
      				HiveIndexedInputFormat //for index-backed queries; a block-filter file lists the blocks to scan (hive.index.blockfilter.file)
      					HiveCompactIndexInputFormat //(hive.index.compact.file)

      	hadoopFile[K, V](
						path: String,
						inputFormatClass: Class[_ <: InputFormat[K, V]], // upper type bound: any subclass of InputFormat[K, V]
						keyClass: Class[K],
						valueClass: Class[V],
						minPartitions: Int = defaultMinPartitions){
			new HadoopRDD(
					      this,
					      confBroadcast,
					      Some(setInputPathsFunc),
					      inputFormatClass,
					      keyClass,
					      valueClass,
					      minPartitions).setName(path){
					      	...
					      	override def getPartitions: Array[Partition] = {
						      	...
						      	val inputFormat = getInputFormat(jobConf)
						      	val inputSplits = inputFormat.getSplits(jobConf, minPartitions) //minPartitions is only a lower-bound hint; the split count can be larger
						      	...
						    }
						    ...
					      }
		}

		//2. Explicit minPartitions
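
	A minimal sketch exercising both entry points, assuming the sc from the prerequisite section and a local baseadlog.log; textFile is just hadoopFile with TextInputFormat plus a map to the line text, and because minPartitions is only a hint the resulting partition count can be larger than the value passed in:

	import org.apache.hadoop.io.{LongWritable, Text}
	import org.apache.hadoop.mapred.TextInputFormat

	// minPartitions omitted -> defaultMinPartitions = min(defaultParallelism, 2)
	val byDefault = sc.textFile("baseadlog.log")
	println(byDefault.getNumPartitions)

	// minPartitions supplied -> passed down to FileInputFormat.getSplits as numSplits
	val hinted = sc.textFile("baseadlog.log", 3)
	println(hinted.getNumPartitions)   // 3 or more, depending on the split math below

	// the lower-level equivalent: (byte offset, line) pairs before the map(pair => pair._2.toString)
	val pairs = sc.hadoopFile[LongWritable, Text, TextInputFormat]("baseadlog.log", 3)
	pairs.map { case (offset, line) => (offset.get, line.toString) }   // Writables are not serializable; convert before collecting
	     .take(3).foreach { case (o, l) => println(s"$o -> $l") }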

	//org.apache.hadoop.mapreduce.lib.input.FileInputFormat -- new-API FIF
	Differences from the old-API FIF:
	//getFormatMinSplitSize() == 1; getMinSplitSize() reads "mapreduce.input.fileinputformat.split.minsize", default 1
	long minSize = Math.max(getFormatMinSplitSize() /* = 1 */, getMinSplitSize(job) /* default 1 */);
	//getMaxSplitSize() reads "mapreduce.input.fileinputformat.split.maxsize", default Long.MAX_VALUE
	long maxSize = getMaxSplitSize(job);
		//split size = max(minSize /* default 1 */, min(maxSize /* default Long.MAX_VALUE, replacing the old goalSize = totalSize/minPartitions */, blockSize))
		//this is a new-API optimization: with the defaults a split is exactly one block, so a split never
		//spans multiple blocks and no extra network IO is needed to stitch one together
	    long splitSize = computeSplitSize(blockSize, minSize, maxSize) {
	    	return Math.max(minSize, Math.min(maxSize, blockSize));
	    };

	//the old API works on a JobConf (a dedicated config object); the new API works on a JobContext (a unified entry point that also interacts with the MR framework)
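
	To make the new-API rule concrete, here is the same formula as a standalone Scala function, a sketch assuming the default values named above (minSize = 1, maxSize = Long.MAX_VALUE):

	// new-API split size: max(minSize, min(maxSize, blockSize))
	def computeSplitSizeNewApi(
	    blockSize: Long,
	    minSize: Long = 1L,              // "mapreduce.input.fileinputformat.split.minsize", default 1
	    maxSize: Long = Long.MaxValue    // "mapreduce.input.fileinputformat.split.maxsize", default Long.MAX_VALUE
	): Long = math.max(minSize, math.min(maxSize, blockSize))

	// with the defaults a split is exactly one block (e.g. 128 MB on HDFS)
	assert(computeSplitSizeNewApi(blockSize = 128L * 1024 * 1024) == 128L * 1024 * 1024)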

	//org.apache.hadoop.mapred.FileInputFormat -- old-API FIF
	//② Split computation [each file is cut into its own splits/partitions]; TextInputFormat <: FileInputFormat, which provides getSplits()
	public InputSplit[] getSplits(JobConf job, int numSplits)throws IOException {
	    FileStatus[] files = listStatus(job);
	    
	    //record the input files as job metrics, then sum their sizes
	    //1. totalSize: total size of all input files
	    job.setLong(NUM_INPUT_FILES, files.length);
	    long totalSize = 0;
	    for (FileStatus file: files) {
	        if (file.isDirectory()) {
	        	throw new IOException("Not a file: "+ file.getPath());
	        }
	        totalSize += file.getLen();
	    }
	    //2. goalSize: target size of one split/partition = totalSize / numSplits (numSplits = minPartitions: 0 is treated as 1; default 2; otherwise the user-supplied value)
	    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
	    //3. minSize: minimum split size = max("mapreduce.input.fileinputformat.split.minsize" if configured, minSplitSize = 1)
	    long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.
	                            FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize); 
	      //private long minSplitSize = 1;
	      //protected void setMinSplitSize(long minSplitSize) { this.minSplitSize = minSplitSize; }
	      //setMinSplitSize is only called by a few specific InputFormats: SequenceFileIF, RCFileIF, VectorizedRCFileIF, VectorizedOrcIF
	      //mapreduce.lib.input.FileInputFormat is the base class of all FileInputFormats; it implements the generic getSplits (which subclasses may override) and holds the shared configuration keys

	    //build the list of splits/partitions
	    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
	    NetworkTopology clusterMap = new NetworkTopology();
	    //process each file on its own ==> one file can yield several splits
	    for (FileStatus file: files) {
			Path path = file.getPath();
			long length = file.getLen();
			if (length != 0) {
				FileSystem fs = path.getFileSystem(job);
				BlockLocation[] blkLocations;
				if (file instanceof LocatedFileStatus) {
				    blkLocations = ((LocatedFileStatus) file).getBlockLocations();
				} else {
				    blkLocations = fs.getFileBlockLocations(file, 0, length);
				}
				if (isSplitable(fs, path)) {
					//block size of this file
				    long blockSize = file.getBlockSize();
				    //compute the split size: max(minSize, min(goalSize, blockSize)), i.e. max(1 by default, min(totalSize/minPartitions, block size))
				    long splitSize = computeSplitSize(goalSize, minSize, blockSize) {
				    	return Math.max(minSize /* default 1 */, Math.min(goalSize, blockSize));
				    };

				    //bytesRemaining tracks how much of the file is still unassigned to a split
				    long bytesRemaining = length;
				    //private static final double SPLIT_SLOP = 1.1;   //only cut another full split while more than 110% of splitSize remains
				    while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
				    	//rack awareness / DataNode locality ==> reduce network IO by preferring the hosts that hold this split's blocks
				        String[] splitHosts = getSplitHosts(blkLocations,
				                                          length-bytesRemaining, splitSize, clusterMap);
				    	splits.add(makeSplit(path, length-bytesRemaining, splitSize,
				                             splitHosts));
				        bytesRemaining -= splitSize;
				    }
				    //whatever is left (at most 110% of splitSize) becomes the final split [so the last split can be anywhere between just above 0% and 110% of splitSize]
					if (bytesRemaining != 0) {
					    String[] splitHosts = getSplitHosts(blkLocations, length- bytesRemaining,
					                                        bytesRemaining, clusterMap);
					    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
					                         splitHosts));
					}
				} else {
				    String[] splitHosts = getSplitHosts(blkLocations,0,length,clusterMap);
				    splits.add(makeSplit(path, 0, length, splitHosts));
				}
			} else {
				//Create empty hosts array for zero length files
				splits.add(makeSplit(path, 0, length, new String[0]));
			}
	    }
	    LOG.debug("Total # of splits: " + splits.size());
	    return splits.toArray(new FileSplit[splits.size()]);
	}
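
	Condensing the loop above into a few lines makes it possible to predict textFile's per-file partition count on paper. A minimal sketch of the old-API arithmetic only (goalSize, splitSize and the 1.1 SPLIT_SLOP rule), not a reimplementation of getSplits:

	import scala.collection.mutable.ArrayBuffer

	// per-file (offset, length) splits following the old-API FileInputFormat logic above
	def oldApiSplits(fileLength: Long,
	                 numSplits: Int,        // the minPartitions hint handed down by textFile
	                 blockSize: Long,
	                 minSize: Long = 1L): Seq[(Long, Long)] = {
	  val SPLIT_SLOP = 1.1
	  val goalSize   = fileLength / (if (numSplits == 0) 1 else numSplits)
	  val splitSize  = math.max(minSize, math.min(goalSize, blockSize))

	  val splits = ArrayBuffer.empty[(Long, Long)]
	  var bytesRemaining = fileLength
	  while (bytesRemaining.toDouble / splitSize > SPLIT_SLOP) {
	    splits += ((fileLength - bytesRemaining, splitSize))
	    bytesRemaining -= splitSize
	  }
	  if (bytesRemaining != 0) splits += ((fileLength - bytesRemaining, bytesRemaining))
	  splits
	}

	// Example 1 below: 10-byte file, minPartitions = 3, 32 MB local block size
	// -> List((0,3), (3,3), (6,3), (9,1)), i.e. 4 splits
	println(oldApiSplits(fileLength = 10, numSplits = 3, blockSize = 32L * 1024 * 1024))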

	Partitioning examples
	//Note 1: the default block size on the local filesystem is 32 MB
	//Note 2: HDFS/MR byte offsets start at 0 and records are read line by line
	//Note 3: splits/partitions are computed per file; the total partition count is the sum over all files
	Example1: a.txt
	```1 CR LF [0,1,2]       
	   2 CR LF [3,4,5]
	   3 CR LF [6,7,8]
	   4``` [9]

	10 bytes && minPartitions = 3 && 32 MB block size && splitSize = 3 B (max(minSize = 1, min(goalSize = 10/3 = 3 B, blockSize = 32 MB)))
			SPLIT_SLOP check           start+len  range   lines read
			(double)10/3=3.3 > 1.1 =>  0+3        [0,3]   1 CR LF 2 CR LF
			(double) 7/3=2.3 > 1.1 =>  3+3        [3,6]   3 CR LF
			(double) 4/3=1.3 > 1.1 =>  6+3        [6,9]   4
			(double) 1/3=0.3 < 1.1 =>  9+1        [9,10]  (empty)

	Example 2: a.txt and b.txt
	a.txt (10 bytes, same as Example 1):
	```1 CR LF [0,1,2]
	   2 CR LF [3,4,5]
	   3 CR LF [6,7,8]
	   4``` [9]
	b.txt (19 bytes):
	```1 CR LF [0,1,2]
	   2 CR LF [3,4,5]
	   3 CR LF [6,7,8]
	   4 CR LF [9,10,11]
	   5 CR LF [12,13,14]
	   6 CR LF [15,16,17]
	   7``` [18]

	29 bytes total && minPartitions = 3 && 32 MB block size && splitSize = 9 B (max(minSize = 1, min(goalSize = 29/3 = 9 B, blockSize = 32 MB)))
		a.txt: 2 partitions              start+len  range    lines read
			(double)10/9=1.11 > 1.1 =>   0+9        [0,9]    1 CR LF 2 CR LF 3 CR LF 4
			(double) 1/9=0.11 < 1.1 =>   9+1        [9,10]   (empty)
		b.txt: 3 partitions              start+len  range    lines read
			(double)19/9=2.11 > 1.1 =>   0+9        [0,9]    1 CR LF 2 CR LF 3 CR LF 4 CR LF
			(double)10/9=1.11 > 1.1 =>   9+9        [9,18]   5 CR LF 6 CR LF 7
			(double) 1/9=0.11 < 1.1 =>   18+1       [18,19]  (empty)
		a.txt (2) + b.txt (3) = 5 partitions in total
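
	The two worked examples can be reproduced end to end. A minimal check, assuming a local filesystem (32 MB default block size, as noted above) and the sc from the prerequisite section; per the tables above the expected counts are 4 for a.txt alone and 5 for a.txt plus b.txt:

	import java.nio.charset.StandardCharsets
	import java.nio.file.{Files, Paths}

	// recreate the sample files (CR LF line endings, no trailing newline)
	Files.write(Paths.get("a.txt"),
	  "1\r\n2\r\n3\r\n4".getBytes(StandardCharsets.UTF_8))                 // 10 bytes
	Files.write(Paths.get("b.txt"),
	  "1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n7".getBytes(StandardCharsets.UTF_8))  // 19 bytes

	// Example 1: one file, minPartitions = 3 -> splits of 3, 3, 3, 1 bytes -> expected 4 partitions
	println(sc.textFile("a.txt", 3).getNumPartitions)

	// Example 2: both files, minPartitions = 3 -> 2 + 3 -> expected 5 partitions
	println(sc.textFile("a.txt,b.txt", 3).getNumPartitions)

	// see which lines each partition actually reads
	sc.textFile("a.txt", 3).glom().collect().foreach(p => println(p.mkString("[", ",", "]")))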

3. Creating an RDD from a Hive Table

	hive> set hive.input.format;
	hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat
	hive> show create table dws.dws_member;
	...
	STORED AS INPUTFORMAT 
  	'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
  	...
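
  	For completeness, this is how such a table becomes an RDD on the Spark side before any of the split code below runs. A minimal sketch, assuming Spark 2.1.x built with Hive support, a reachable metastore, and the dws.dws_member table shown above:

  	import org.apache.spark.sql.SparkSession

  	// a Hive-enabled SparkSession is the entry point instead of a bare SparkContext
  	val spark = SparkSession.builder()
  	  .master("local[*]")
  	  .appName("RDD_From_Hive")
  	  .enableHiveSupport()     // needs spark-hive on the classpath and hive-site.xml visible
  	  .getOrCreate()

  	// the partitioning of the scan is driven by the splits computed over the table's files
  	val memberRdd = spark.table("dws.dws_member").rdd
  	println(memberRdd.getNumPartitions)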

  	//1. CombineHiveInputFormat
  	//② Split computation
  	//Method: CombineHiveInputFormat.getSplits
  	public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = PerfLogger.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);

    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    Path[] paths = getInputPaths(job);

    //Key point: record the combinable and the non-combinable input paths separately
    List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
    List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);

    int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM,
        (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
    int numPathPerThread = (int) Math.ceil((double) paths.length / numThreads);
    LOG.info("Total number of paths: " + paths.length +
        ", launching " + numThreads + " threads to check non-combinable ones.");
    ExecutorService executor = Executors.newFixedThreadPool(numThreads);
    List<Future<Set<Integer>>> futureList = new ArrayList<Future<Set<Integer>>>(numThreads);
    try {
      for (int i = 0; i < numThreads; i++) {
        int start = i * numPathPerThread;
        int length = i != numThreads - 1 ? numPathPerThread : paths.length - start;
        futureList.add(executor.submit(
            new CheckNonCombinablePathCallable(paths, start, length, job)));
      }
      Set<Integer> nonCombinablePathIndices = new HashSet<Integer>();
      for (Future<Set<Integer>> future : futureList) {
        nonCombinablePathIndices.addAll(future.get());
      }
      for (int i = 0; i < paths.length; i++) {
        if (nonCombinablePathIndices.contains(i)) {
          nonCombinablePaths.add(paths[i]);
        } else {
          combinablePaths.add(paths[i]);
        }
      }
    } catch (Exception e) {
      LOG.error("Error checking non-combinable path", e);
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
      throw new IOException(e);
    } finally {
      executor.shutdownNow();
    }

    // Store the previous value for the path specification
    String oldPaths = job.get(HiveConf.ConfVars.HADOOPMAPREDINPUTDIR.varname);
    if (LOG.isDebugEnabled()) {
      LOG.debug("The received input paths are: [" + oldPaths +
          "] against the property "
          + HiveConf.ConfVars.HADOOPMAPREDINPUTDIR.varname);
    }

    //handle the non-combinable (regular) paths
    if (nonCombinablePaths.size() > 0) {
      FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray
          (new Path[nonCombinablePaths.size()]));

      //super => HiveInputFormat (which calls getSplits() on the table's actual InputFormat)
      InputSplit[] splits = super.getSplits(job, numSplits);
      for (InputSplit split : splits) {
        result.add(split);
      }
    }

    //handle the combinable paths; result => one combined split per partition value
    if (combinablePaths.size() > 0) {
      FileInputFormat.setInputPaths(job, combinablePaths.toArray
          (new Path[combinablePaths.size()]));
      Map<String, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null ?
          this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();

      //Key point!!! Only splits from the same table and the same partition are combined; splits from several tables or several partitions are never merged together
      //getCombineSplits is a long method (roughly lines 325-496 of the Hive source); its core call is the same
      //as below: super (HiveInputFormat).getSplits(), which invokes getSplits() on the table's actual STORED AS InputFormat
      InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
      for (InputSplit split : splits) {
        result.add(split);
      }
    }

    // Restore the old path information back
    // This is just to prevent incompatibilities with previous versions Hive
    // if some application depends on the original value being set.
    if (oldPaths != null) {
      job.set(HiveConf.ConfVars.HADOOPMAPREDINPUTDIR.varname, oldPaths);
    }

    // clear work from ThreadLocal after splits generated in case of thread is reused in pool.
    Utilities.clearWorkMapForConf(job);

    LOG.info("Number of all splits " + result.size());
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new InputSplit[result.size()]);
  }

  	//2. HiveInputFormat
	//② Split computation
	//Method: HiveInputFormat.getSplits
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        PerfLogger perfLogger = PerfLogger.getPerfLogger();
        perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
        //1. setup
        init(job);
        Path[] dirs = getInputPaths(job);
        JobConf newjob = new JobConf(job);
        List<InputSplit> result = new ArrayList<InputSplit>();

        List<Path> currentDirs = new ArrayList<Path>();
        Class<? extends InputFormat> currentInputFormatClass = null;
        TableDesc currentTable = null;
        TableScanOperator currentTableScan = null;

        boolean pushDownProjection = false;
        //Buffers to hold filter pushdown information
        StringBuilder readColumnsBuffer = new StringBuilder(newjob.
        		//hive.io.file.readcolumn.ids
                get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));;
        StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.
        		//hive.io.file.readcolumn.names
                get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));

        //2. splits are computed per partition value, i.e. per directory ==> one partition value can yield several splits
        // for each dir, get the InputFormat, and do getSplits.
        for (Path dir : dirs) {
            PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
            Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
            TableDesc table = part.getTableDesc();
            TableScanOperator tableScan = null;

            List<String> aliases =
                    mrwork.getPathToAliases().get(dir.toUri().toString());

            // Make filter pushdown information available to getSplits.
            if ((aliases != null) && (aliases.size() == 1)) {
                Operator op = mrwork.getAliasToWork().get(aliases.get(0));
                if ((op != null) && (op instanceof TableScanOperator)) {
                    tableScan = (TableScanOperator) op;
                    //Reset buffers to store filter push down columns
                    readColumnsBuffer.setLength(0);
                    readColumnNamesBuffer.setLength(0);
                    // push down projections.
                    ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer,
                            tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
                    pushDownProjection = true;
                    // push down filters
                    pushFilters(newjob, tableScan);
                }
            }

            if (!currentDirs.isEmpty() &&
                    inputFormatClass.equals(currentInputFormatClass) &&
                    table.equals(currentTable) &&
                    tableScan == currentTableScan) {
                currentDirs.add(dir);
                continue;
            }

            if (!currentDirs.isEmpty()) {
                LOG.info("Generating splits");
                //Core!!! addSplitsForGroup ends up calling getSplits on the InputFormat class the Hive table was defined with (see Evidence 1 below)
                //split-count hint == currentDirs.size()*(numSplits / dirs.length)
                addSplitsForGroup(currentDirs, currentTableScan, newjob,
                        getInputFormatFromCache(currentInputFormatClass, job),
                        currentInputFormatClass, currentDirs.size()*(numSplits / dirs.length),
                        currentTable, result);
            }

            currentDirs.clear();
            currentDirs.add(dir);
            currentTableScan = tableScan;
            currentTable = table;
            currentInputFormatClass = inputFormatClass;
        }
        if (pushDownProjection) {
            newjob.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
            newjob.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColumnsBuffer.toString());
            newjob.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColumnNamesBuffer.toString());
            LOG.info(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR + "=" + readColumnsBuffer.toString());
            LOG.info(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR + "=" + readColumnNamesBuffer.toString());
        }

        if (dirs.length != 0) {
            LOG.info("Generating splits");
            addSplitsForGroup(currentDirs, currentTableScan, newjob,
                    getInputFormatFromCache(currentInputFormatClass, job),
                    currentInputFormatClass, currentDirs.size()*(numSplits / dirs.length),
                    currentTable, result);
        }

        Utilities.clearWorkMapForConf(job);
        LOG.info("number of splits " + result.size());
        perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
        return result.toArray(new HiveInputFormat.HiveInputSplit[result.size()]);
    }
    	//Method: HiveInputFormat.addSplitsForGroup
    	private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
                                   InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits,
                                   TableDesc table, List<InputSplit> result) throws IOException {

	        Utilities.copyTablePropertiesToConf(table, conf);

	        if (tableScan != null) {
	            pushFilters(conf, tableScan);
	        }

	        FileInputFormat.setInputPaths(conf, dirs.toArray(new Path[dirs.size()]));
	        conf.setInputFormat(inputFormat.getClass());

	        int headerCount = 0;
	        int footerCount = 0;
	        if (table != null) {
	            headerCount = Utilities.getHeaderCount(table);
	            footerCount = Utilities.getFooterCount(table, conf);
	            if (headerCount != 0 || footerCount != 0) {
	                // Input file has header or footer, cannot be splitted.
	                conf.setLong(
	                        ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"),
	                        Long.MAX_VALUE);
	            }
	        }
	        //Evidence 1: calls getSplits on the Hive table's actual InputFormat class [show create table tableName]
	        //STORED AS INPUTFORMAT is commonly 'org.apache.hadoop.mapred.TextInputFormat'
	        InputSplit[] iss = inputFormat.getSplits(conf, splits);
	        for (InputSplit is : iss) {
	            result.add(new HiveInputFormat.HiveInputSplit(is, inputFormatClass.getName()));
	        }
    	}
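
    	The header/footer branch above (pushing the minimum split size up to Long.MAX_VALUE so a file can never be split) can be mimicked from the Spark side. A minimal sketch, assuming the sc from the prerequisite section and the a.txt/b.txt files from the earlier examples; the key is the same one read as minSize in the old-API getSplits shown earlier:

    	// inflate the minimum split size: splitSize = max(minSize, min(goalSize, blockSize)) becomes Long.MaxValue,
    	// so the SPLIT_SLOP loop never runs and each file ends up as a single split
    	sc.hadoopConfiguration.setLong(
    	  "mapreduce.input.fileinputformat.split.minsize", Long.MaxValue)

    	val oneSplitPerFile = sc.textFile("a.txt,b.txt", 3)
    	println(oneSplitPerFile.getNumPartitions)   // expected: 2 (one partition per file)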