def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
  assertNotStopped()

  // This is a hack to enforce loading hdfs-site.xml.
  // See SPARK-11227 for details.
  FileSystem.getLocal(hadoopConfiguration)

  // A Hadoop configuration can be about 10 KiB, which is pretty big, so broadcast it.
  val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
  val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
  new HadoopRDD(            **// read the external file and create a HadoopRDD**
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path)
}
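For context, SparkContext.textFile is the usual way this method gets called. Paraphrased from the Spark source, it plugs in Hadoop's TextInputFormat with LongWritable/Text as the key/value types and then keeps only the line text:

def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
  assertNotStopped()
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    minPartitions).map(pair => pair._2.toString).setName(path)
}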
override def getPartitions: Array[Partition] = {
  val jobConf = getJobConf()
  // add the credentials here as this can be called before SparkContext initialized
  SparkHadoopUtil.get.addCredentials(jobConf)
  try {
    **// For an InputFormat we mainly care about two things: ① how it splits the input, ② how it reads the file data.**
    **// getSplits is what does the splitting; step into it to see exactly how.**
    val allInputSplits = getInputFormat(jobConf).**getSplits**(jobConf, minPartitions)
    val inputSplits = if (ignoreEmptySplits) {
      allInputSplits.filter(_.getLength > 0)
    } else {
      allInputSplits
    }
    if (inputSplits.length == 1 && inputSplits(0).isInstanceOf[FileSplit]) {
      val fileSplit = inputSplits(0).asInstanceOf[FileSplit]
      val path = fileSplit.getPath
      if (fileSplit.getLength > conf.get(IO_WARNING_LARGEFILETHRESHOLD)) {
        val codecFactory = new CompressionCodecFactory(jobConf)
        if (Utils.isFileSplittable(path, codecFactory)) {
          logWarning(s"Loading one large file ${path.toString} with only one partition, " +
            s"we can increase partition numbers for improving performance.")
        } else {
          logWarning(s"Loading one large unsplittable file ${path.toString} with only one " +
            s"partition, because the file is compressed by unsplittable compression codec.")
        }
      }
    }
    val array = new Array[Partition](inputSplits.size)
    for (i <- 0 until inputSplits.size) {
      array(i) = new HadoopPartition(id, i, inputSplits(i))
    }
    array
  } catch {
    case e: InvalidInputException if ignoreMissingFiles =>
      logWarning(s"${jobConf.get(FileInputFormat.INPUT_DIR)} doesn't exist and no" +
        s" partitions returned from this path.", e)
      Array.empty[Partition]
  }
}
**// We are now inside the InputFormat interface; the splitting rules used here come from Hadoop itself.**
package org.apache.hadoop.mapred;

InputSplit[] getSplits(JobConf job, int numSplits) throws IOException;
**// Ctrl+H shows the implementation classes; FileInputFormat is the one with the concrete methods we need.**
**// In MapReduce we studied the TextInputFormat implementation and mainly care about:**
**// ① getRecordReader(), which reads the data and returns a RecordReader<LongWritable, Text>, one line at a time;**
**// ② getSplits(), which does the splitting. getSplits is implemented in the parent class FileInputFormat, not in TextInputFormat itself.**
**// In FileInputFormat, Ctrl+F12 locates the getSplits method.**
**// job wraps the currently submitted task, including its configuration files.**
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  StopWatch sw = new StopWatch().start();
  FileStatus[] files = listStatus(job);    **// listStatus lists everything under the input path, so files holds every file in the input directory**

  // Save the number of input files for metrics/loadgen
  job.setLong(NUM_INPUT_FILES, files.length);
  long totalSize = 0;                      // compute total size  **// total size of all input files**
  for (FileStatus file: files) {           // check we have valid files  **// iterate over the files**
    if (file.isDirectory()) {
      throw new IOException("Not a file: " + file.getPath());
    }
    totalSize += file.getLen();            **// accumulate the file sizes**
  }

  **// goalSize: how many bytes each split should hold; numSplits is the requested number of splits**
  long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
  long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.
    FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);  **// both parameters default to 1, so minSize is 1**

  **// what follows is the split planning**
  // generate splits
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);  **// splits: the list that holds the split plan**
  NetworkTopology clusterMap = new NetworkTopology();
  for (FileStatus file: files) {           **// handle each file in the directory separately**
    Path path = file.getPath();            **// the path where the file is stored**
    long length = file.getLen();           **// the file size; length is used below**
    if (length != 0) {
      FileSystem fs = path.getFileSystem(job);  **// get the FileSystem, the object used to operate on files in HDFS**
      BlockLocation[] blkLocations;        **// the block locations**
      if (file instanceof LocatedFileStatus) {
        blkLocations = ((LocatedFileStatus) file).getBlockLocations();  **// fetch the block locations, i.e. the file's metadata**
      } else {
        blkLocations = fs.getFileBlockLocations(file, 0, length);
      }
      if (isSplitable(fs, path)) {         **// is this file splittable?**
        long blockSize = file.getBlockSize();  **// block size, e.g. 33554432 = 32 MB in local mode**
        long splitSize = computeSplitSize(goalSize, minSize, blockSize);  **// splitSize: the split size; step into computeSplitSize() to see how it is computed**

        **// with the split size in hand, the file is now carved into splits**
        long bytesRemaining = length;      **// bytesRemaining: bytes not yet planned; length is the file size from above**
        while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {  **// SPLIT_SLOP is 1.1: only cut another split while the remainder is more than 1.1 x splitSize**
          String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,
              length-bytesRemaining, splitSize, clusterMap);  **// on HDFS there can be several host addresses**
          splits.add(makeSplit(path, length-bytesRemaining, splitSize,
              splitHosts[0], splitHosts[1]));  **// add the planned split to the list**
          bytesRemaining -= splitSize;     **// subtract what has just been planned**
        }

        if (bytesRemaining != 0) {         **// if anything is left over**
          String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length
              - bytesRemaining, bytesRemaining, clusterMap);
          splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
              splitHosts[0], splitHosts[1]));  **// the remainder goes into the plan as one last split**
        }
      } else {
        String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0, length, clusterMap);
        splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
      }
    } else {
      // Create empty hosts array for zero length files
      splits.add(makeSplit(path, 0, length, new String[0]));
    }
  }
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Total # of splits generated by getSplits: " + splits.size()
        + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
  }
  return splits.toArray(new FileSplit[splits.size()]);  **// when splitting is done, the plan is returned as an array**
}
protected long computeSplitSize(long goalSize, long minSize,
                                long blockSize) {
  return Math.max(minSize, Math.min(goalSize, blockSize));
}
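To make the formula and the 1.1 slop factor concrete, here is a minimal Scala sketch (not Spark or Hadoop source; the object and method names are invented for illustration) that replays the planning loop above, assuming local mode where the block size is 32 MB and minSize is 1:

object SplitPlanSketch {
  private val SPLIT_SLOP = 1.1   // same 10% slack used by FileInputFormat

  // Same formula as computeSplitSize above.
  def computeSplitSize(goalSize: Long, minSize: Long, blockSize: Long): Long =
    math.max(minSize, math.min(goalSize, blockSize))

  /** Returns the planned (offset, length) pairs for one file. */
  def plan(fileLength: Long, numSplits: Int, blockSize: Long, minSize: Long = 1L): Seq[(Long, Long)] = {
    val goalSize = fileLength / (if (numSplits == 0) 1 else numSplits)
    val splitSize = computeSplitSize(goalSize, minSize, blockSize)
    val splits = scala.collection.mutable.ArrayBuffer.empty[(Long, Long)]
    var bytesRemaining = fileLength
    // Keep cutting full-size splits while more than 1.1 x splitSize is left.
    while (bytesRemaining.toDouble / splitSize > SPLIT_SLOP) {
      splits += ((fileLength - bytesRemaining, splitSize))
      bytesRemaining -= splitSize
    }
    if (bytesRemaining != 0) splits += ((fileLength - bytesRemaining, bytesRemaining))
    splits.toSeq
  }

  def main(args: Array[String]): Unit = {
    val mb = 1024L * 1024
    // 100 MB file, numSplits = 2, 32 MB block size:
    // goalSize = 50 MB, splitSize = min(50 MB, 32 MB) = 32 MB -> splits of 32, 32, 32 and 4 MB.
    println(plan(100 * mb, numSplits = 2, blockSize = 32 * mb))
  }
}

So a 100 MB file requested with two splits actually gets four: goalSize = 50 MB is capped at the 32 MB block size, the loop keeps cutting 32 MB splits while more than 1.1 x 32 MB remains, and the final 4 MB becomes the last split.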
override def getPartitions: Array[Partition] = {
  val jobConf = getJobConf()
  // add the credentials here as this can be called before SparkContext initialized
  SparkHadoopUtil.get.addCredentials(jobConf)
  try {
    **// val inputFormat = getInputFormat(jobConf)**
    **// allInputSplits is the split plan produced by getSplits above**
    val allInputSplits = getInputFormat(jobConf).getSplits(jobConf, minPartitions)
    val inputSplits = if (ignoreEmptySplits) {
      allInputSplits.filter(_.getLength > 0)
    } else {
      allInputSplits
    }
    if (inputSplits.length == 1 && inputSplits(0).isInstanceOf[FileSplit]) {
      val fileSplit = inputSplits(0).asInstanceOf[FileSplit]
      val path = fileSplit.getPath
      if (fileSplit.getLength > conf.get(IO_WARNING_LARGEFILETHRESHOLD)) {
        val codecFactory = new CompressionCodecFactory(jobConf)
        if (Utils.isFileSplittable(path, codecFactory)) {
          logWarning(s"Loading one large file ${path.toString} with only one partition, " +
            s"we can increase partition numbers for improving performance.")
        } else {
          logWarning(s"Loading one large unsplittable file ${path.toString} with only one " +
            s"partition, because the file is compressed by unsplittable compression codec.")
        }
      }
    }
    val array = new Array[Partition](inputSplits.size)
    for (i <- 0 until inputSplits.size) {
      array(i) = new HadoopPartition(id, i, inputSplits(i))
    }
    array
  } catch {
    case e: InvalidInputException if ignoreMissingFiles =>
      logWarning(s"${jobConf.get(FileInputFormat.INPUT_DIR)} doesn't exist and no" +
        s" partitions returned from this path.", e)
      Array.empty[Partition]
  }
}
**// To keep any single code block from getting too long, the walkthrough continues in a new block.**
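A hedged usage example of what this means in practice ("data.txt" is a hypothetical local file holding the 7 bytes "1\n2\n3\n4"): minPartitions is only a lower-bound hint, the actual partition count comes from the split plan built above.

val rdd = sc.textFile("data.txt", 2)   // minPartitions = 2 is only a hint
// Under the planning rules above: goalSize = 7 / 2 = 3 bytes, splitSize = 3,
// and the 7 bytes are carved into splits of 3 + 3 + 1 bytes => 3 splits.
println(rdd.getNumPartitions)          // 3, not 2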
**// compute: now that the split plan exists, each partition's data is read through this method.**
override def **compute**(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
  val iter = new NextIterator[(K, V)] {

    private val split = theSplit.asInstanceOf[HadoopPartition]
    logInfo("Input split: " + split.inputSplit)
    private val jobConf = getJobConf()

    private val inputMetrics = context.taskMetrics().inputMetrics
    private val existingBytesRead = inputMetrics.bytesRead

    // Sets InputFileBlockHolder for the file block's information
    split.inputSplit.value match {
      case fs: FileSplit =>
        InputFileBlockHolder.set(fs.getPath.toString, fs.getStart, fs.getLength)
      case _ =>
        InputFileBlockHolder.unset()
    }

    // Find a function that will return the FileSystem bytes read by this thread. Do this before
    // creating RecordReader, because RecordReader's constructor might read some bytes
    private val getBytesReadCallback: Option[() => Long] = split.inputSplit.value match {
      case _: FileSplit | _: CombineFileSplit =>
        Some(SparkHadoopUtil.get.getFSBytesReadOnThreadCallback())
      case _ => None
    }

    // We get our input bytes from thread-local Hadoop FileSystem statistics.
    // If we do a coalesce, however, we are likely to compute multiple partitions in the same
    // task and in the same thread, in which case we need to avoid override values written by
    // previous partitions (SPARK-13071).
    private def updateBytesRead(): Unit = {
      getBytesReadCallback.foreach { getBytesRead =>
        inputMetrics.setBytesRead(existingBytesRead + getBytesRead())
      }
    }

    private var reader: RecordReader[K, V] = null
    private val inputFormat = getInputFormat(jobConf)
    HadoopRDD.addLocalConfiguration(
      new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(createTime),
      context.stageId, theSplit.index, context.attemptNumber, jobConf)

    reader =
      try {
        **// getRecordReader produces the reader that pulls the records out of this partition's split**
        inputFormat.**getRecordReader**(split.inputSplit.value, jobConf, Reporter.NULL)
      } catch {
        case e: FileNotFoundException if ignoreMissingFiles =>
          logWarning(s"Skipped missing file: ${split.inputSplit}", e)
          finished = true
          null
        // Throw FileNotFoundException even if `ignoreCorruptFiles` is true
        case e: FileNotFoundException if !ignoreMissingFiles => throw e
        case e: IOException if ignoreCorruptFiles =>
          logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e)
          finished = true
          null
      }

    // Register an on-task-completion callback to close the input stream.
    context.addTaskCompletionListener[Unit] { context =>
      // Update the bytes read before closing is to make sure lingering bytesRead statistics in
      // this thread get correctly added.
      updateBytesRead()
      closeIfNeeded()
    }

    private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey()
    private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue()

    override def getNext(): (K, V) = {
      try {
        finished = !reader.next(key, value)
      } catch {
        case e: FileNotFoundException if ignoreMissingFiles =>
          logWarning(s"Skipped missing file: ${split.inputSplit}", e)
          finished = true
        // Throw FileNotFoundException even if `ignoreCorruptFiles` is true
        case e: FileNotFoundException if !ignoreMissingFiles => throw e
        case e: IOException if ignoreCorruptFiles =>
          logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e)
          finished = true
      }
      if (!finished) {
        inputMetrics.incRecordsRead(1)
      }
      if (inputMetrics.recordsRead % SparkHadoopUtil.UPDATE_INPUT_METRICS_INTERVAL_RECORDS == 0) {
        updateBytesRead()
      }
      (key, value)
    }

    override def close(): Unit = {
      if (reader != null) {
        InputFileBlockHolder.unset()
        try {
          reader.close()
        } catch {
          case e: Exception =>
            if (!ShutdownHookManager.inShutdown()) {
              logWarning("Exception in RecordReader.close()", e)
            }
        } finally {
          reader = null
        }
        if (getBytesReadCallback.isDefined) {
          updateBytesRead()
        } else if (split.inputSplit.value.isInstanceOf[FileSplit] ||
                   split.inputSplit.value.isInstanceOf[CombineFileSplit]) {
          // If we can't get the bytes read from the FS stats, fall back to the split size,
          // which may be inaccurate.
          try {
            inputMetrics.incBytesRead(split.inputSplit.value.getLength)
          } catch {
            case e: java.io.IOException =>
              logWarning("Unable to get input size to set InputMetrics for task", e)
          }
        }
      }
    }
  }
  new InterruptibleIterator[(K, V)](context, iter)
}
public LineRecordReader(Configuration job, FileSplit split,   **// split is the planned split handed to this task; records are still read line by line**
    byte[] recordDelimiter) throws IOException {
  this.maxLineLength = job.getInt(org.apache.hadoop.mapreduce.lib.input.
    LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();          **// start offset of the split**
  end = start + split.getLength();   **// end offset = start + split length**
  final Path file = split.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  codec = compressionCodecs.getCodec(file);

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  if (isCompressedInput()) {
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job, recordDelimiter);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, recordDelimiter);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new UncompressedSplitLineReader(
        fileIn, job, recordDelimiter, split.getLength());
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
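The last few statements are why splits can cut through the middle of a line without losing or duplicating data: a reader whose split does not begin at offset 0 throws away its first (possibly partial) line, and every reader keeps reading until it has consumed the line that crosses its own end. A minimal, simplified Scala sketch of that rule (not Hadoop source; the object and method names are invented):

object SplitBoundarySketch {
  /** Lines belonging to the split [start, start + length) of `data`. */
  def readSplit(data: String, start: Int, length: Int): Seq[String] = {
    val end = start + length
    // Rule 1: a split that does not begin at offset 0 skips its first (possibly partial)
    // line -- the previous split is responsible for it.
    var pos = if (start == 0) 0 else {
      val nl = data.indexOf('\n', start)
      if (nl < 0) data.length else nl + 1
    }
    val lines = scala.collection.mutable.ArrayBuffer.empty[String]
    // Rule 2: keep reading whole lines while the read position is still inside the split,
    // so the line that straddles the split boundary is read by the earlier split.
    while (pos <= end && pos < data.length) {
      val nl = data.indexOf('\n', pos)
      val lineEnd = if (nl < 0) data.length else nl
      lines += data.substring(pos, lineEnd)
      pos = lineEnd + 1
    }
    lines.toSeq
  }

  def main(args: Array[String]): Unit = {
    val data = "hello\nspark\nok\n"        // 15 bytes, three lines
    println(readSplit(data, 0, 8))         // List(hello, spark) -- reads one line past its own end
    println(readSplit(data, 8, 7))         // List(ok)           -- skips the partial "ark"
  }
}

Together the two splits return exactly the three lines, even though the 8-byte boundary falls in the middle of "spark".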