def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
  assertNotStopped()

  // This is a hack to enforce loading hdfs-site.xml.
  // See SPARK-11227 for details.
  FileSystem.getLocal(hadoopConfiguration)

  // A Hadoop configuration can be about 10 KiB, which is pretty big, so broadcast it.
  val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
  val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
  new HadoopRDD(            **// read the external file and create a HadoopRDD**
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path)
}
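For context, SparkContext.textFile is the usual way this method gets called. Paraphrased from the Spark source, it plugs in Hadoop's TextInputFormat with LongWritable/Text as the key/value types and then keeps only the line text:

def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
  assertNotStopped()
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    minPartitions).map(pair => pair._2.toString).setName(path)
}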
override def getPartitions: Array[Partition] = {
  val jobConf = getJobConf()
  // add the credentials here as this can be called before SparkContext initialized
  SparkHadoopUtil.get.addCredentials(jobConf)
  try {
    **// For an InputFormat we mainly care about two things: ① how it splits the input, ② how it reads the file data.**
    **// getSplits is what does the splitting; step into it to see exactly how.**
    val allInputSplits = getInputFormat(jobConf).**getSplits**(jobConf, minPartitions)
    val inputSplits = if (ignoreEmptySplits) {
      allInputSplits.filter(_.getLength > 0)
    } else {
      allInputSplits
    }
    if (inputSplits.length == 1 && inputSplits(0).isInstanceOf[FileSplit]) {
      val fileSplit = inputSplits(0).asInstanceOf[FileSplit]
      val path = fileSplit.getPath
      if (fileSplit.getLength > conf.get(IO_WARNING_LARGEFILETHRESHOLD)) {
        val codecFactory = new CompressionCodecFactory(jobConf)
        if (Utils.isFileSplittable(path, codecFactory)) {
          logWarning(s"Loading one large file ${path.toString} with only one partition, " +
            s"we can increase partition numbers for improving performance.")
        } else {
          logWarning(s"Loading one large unsplittable file ${path.toString} with only one " +
            s"partition, because the file is compressed by unsplittable compression codec.")
        }
      }
    }
    val array = new Array[Partition](inputSplits.size)
    for (i <- 0 until inputSplits.size) {
      array(i) = new HadoopPartition(id, i, inputSplits(i))
    }
    array
  } catch {
    case e: InvalidInputException if ignoreMissingFiles =>
      logWarning(s"${jobConf.get(FileInputFormat.INPUT_DIR)} doesn't exist and no" +
        s" partitions returned from this path.", e)
      Array.empty[Partition]
  }
}
**// We are now inside the InputFormat interface; the splitting rules used here come from Hadoop itself.**
package org.apache.hadoop.mapred;

InputSplit[] getSplits(JobConf job, int numSplits) throws IOException;
**// Ctrl+H shows the implementation classes; FileInputFormat is the one with the concrete methods we need.**
**// In MapReduce we studied the TextInputFormat implementation and mainly care about:**
**// ① getRecordReader(), which reads the data and returns a RecordReader<LongWritable, Text>, one line at a time;**
**// ② getSplits(), which does the splitting. getSplits is implemented in the parent class FileInputFormat, not in TextInputFormat itself.**
**// In FileInputFormat, Ctrl+F12 locates the getSplits method.**
**// job wraps the currently submitted task, including its configuration files.**
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  StopWatch sw = new StopWatch().start();
  FileStatus[] files = listStatus(job);    **// listStatus lists everything under the input path, so files holds every file in the input directory**

  // Save the number of input files for metrics/loadgen
  job.setLong(NUM_INPUT_FILES, files.length);
  long totalSize = 0;                      // compute total size  **// total size of all input files**
  for (FileStatus file: files) {           // check we have valid files  **// iterate over the files**
    if (file.isDirectory()) {
      throw new IOException("Not a file: " + file.getPath());
    }
    totalSize += file.getLen();            **// accumulate the file sizes**
  }

  **// goalSize: how many bytes each split should hold; numSplits is the requested number of splits**
  long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
  long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.
    FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);  **// both parameters default to 1, so minSize is 1**

  **// what follows is the split planning**
  // generate splits
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);  **// splits: the list that holds the split plan**
  NetworkTopology clusterMap = new NetworkTopology();
  for (FileStatus file: files) {           **// handle each file in the directory separately**
    Path path = file.getPath();            **// the path where the file is stored**
    long length = file.getLen();           **// the file size; length is used below**
    if (length != 0) {
      FileSystem fs = path.getFileSystem(job);  **// get the FileSystem, the object used to operate on files in HDFS**
      BlockLocation[] blkLocations;        **// the block locations**
      if (file instanceof LocatedFileStatus) {
        blkLocations = ((LocatedFileStatus) file).getBlockLocations();  **// fetch the block locations, i.e. the file's metadata**
      } else {
        blkLocations = fs.getFileBlockLocations(file, 0, length);
      }
      if (isSplitable(fs, path)) {         **// is this file splittable?**
        long blockSize = file.getBlockSize();  **// block size, e.g. 33554432 = 32 MB in local mode**
        long splitSize = computeSplitSize(goalSize, minSize, blockSize);  **// splitSize: the split size; step into computeSplitSize() to see how it is computed**

        **// with the split size in hand, the file is now carved into splits**
        long bytesRemaining = length;      **// bytesRemaining: bytes not yet planned; length is the file size from above**
        while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {  **// SPLIT_SLOP is 1.1: only cut another split while the remainder is more than 1.1 x splitSize**
          String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,
              length-bytesRemaining, splitSize, clusterMap);  **// on HDFS there can be several host addresses**
          splits.add(makeSplit(path, length-bytesRemaining, splitSize,
              splitHosts[0], splitHosts[1]));  **// add the planned split to the list**
          bytesRemaining -= splitSize;     **// subtract what has just been planned**
        }

        if (bytesRemaining != 0) {         **// if anything is left over**
          String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length
              - bytesRemaining, bytesRemaining, clusterMap);
          splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
              splitHosts[0], splitHosts[1]));  **// the remainder goes into the plan as one last split**
        }
      } else {
        String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0, length, clusterMap);
        splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
      }
    } else {
      // Create empty hosts array for zero length files
      splits.add(makeSplit(path, 0, length, new String[0]));
    }
  }
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Total # of splits generated by getSplits: " + splits.size()
        + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
  }
  return splits.toArray(new FileSplit[splits.size()]);  **// when splitting is done, the plan is returned as an array**
}
protected long computeSplitSize(long goalSize, long minSize,
                                long blockSize) {
  return Math.max(minSize, Math.min(goalSize, blockSize));
}
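To make the formula and the 1.1 slop factor concrete, here is a minimal Scala sketch (not Spark or Hadoop source; the object and method names are invented for illustration) that replays the planning loop above, assuming local mode where the block size is 32 MB and minSize is 1:

object SplitPlanSketch {
  private val SPLIT_SLOP = 1.1   // same 10% slack used by FileInputFormat

  // Same formula as computeSplitSize above.
  def computeSplitSize(goalSize: Long, minSize: Long, blockSize: Long): Long =
    math.max(minSize, math.min(goalSize, blockSize))

  /** Returns the planned (offset, length) pairs for one file. */
  def plan(fileLength: Long, numSplits: Int, blockSize: Long, minSize: Long = 1L): Seq[(Long, Long)] = {
    val goalSize = fileLength / (if (numSplits == 0) 1 else numSplits)
    val splitSize = computeSplitSize(goalSize, minSize, blockSize)
    val splits = scala.collection.mutable.ArrayBuffer.empty[(Long, Long)]
    var bytesRemaining = fileLength
    // Keep cutting full-size splits while more than 1.1 x splitSize is left.
    while (bytesRemaining.toDouble / splitSize > SPLIT_SLOP) {
      splits += ((fileLength - bytesRemaining, splitSize))
      bytesRemaining -= splitSize
    }
    if (bytesRemaining != 0) splits += ((fileLength - bytesRemaining, bytesRemaining))
    splits.toSeq
  }

  def main(args: Array[String]): Unit = {
    val mb = 1024L * 1024
    // 100 MB file, numSplits = 2, 32 MB block size:
    // goalSize = 50 MB, splitSize = min(50 MB, 32 MB) = 32 MB -> splits of 32, 32, 32 and 4 MB.
    println(plan(100 * mb, numSplits = 2, blockSize = 32 * mb))
  }
}

So a 100 MB file requested with two splits actually gets four: goalSize = 50 MB is capped at the 32 MB block size, the loop keeps cutting 32 MB splits while more than 1.1 x 32 MB remains, and the final 4 MB becomes the last split.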
override def getPartitions: Array[Partition] = {
  val jobConf = getJobConf()
  // add the credentials here as this can be called before SparkContext initialized
  SparkHadoopUtil.get.addCredentials(jobConf)
  try {
    **// val inputFormat = getInputFormat(jobConf)**
    **// allInputSplits is the split plan produced by getSplits above**
    val allInputSplits = getInputFormat(jobConf).getSplits(jobConf, minPartitions)
    val inputSplits = if (ignoreEmptySplits) {
      allInputSplits.filter(_.getLength > 0)
    } else {
      allInputSplits
    }
    if (inputSplits.length == 1 && inputSplits(0).isInstanceOf[FileSplit]) {
      val fileSplit = inputSplits(0).asInstanceOf[FileSplit]
      val path = fileSplit.getPath
      if (fileSplit.getLength > conf.get(IO_WARNING_LARGEFILETHRESHOLD)) {
        val codecFactory = new CompressionCodecFactory(jobConf)
        if (Utils.isFileSplittable(path, codecFactory)) {
          logWarning(s"Loading one large file ${path.toString} with only one partition, " +
            s"we can increase partition numbers for improving performance.")
        } else {
          logWarning(s"Loading one large unsplittable file ${path.toString} with only one " +
            s"partition, because the file is compressed by unsplittable compression codec.")
        }
      }
    }
    val array = new Array[Partition](inputSplits.size)
    for (i <- 0 until inputSplits.size) {
      array(i) = new HadoopPartition(id, i, inputSplits(i))
    }
    array
  } catch {
    case e: InvalidInputException if ignoreMissingFiles =>
      logWarning(s"${jobConf.get(FileInputFormat.INPUT_DIR)} doesn't exist and no" +
        s" partitions returned from this path.", e)
      Array.empty[Partition]
  }
}
**// To keep any single code block from getting too long, the walkthrough continues in a new block.**
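A hedged usage example of what this means in practice ("data.txt" is a hypothetical local file holding the 7 bytes "1\n2\n3\n4"): minPartitions is only a lower-bound hint, the actual partition count comes from the split plan built above.

val rdd = sc.textFile("data.txt", 2)   // minPartitions = 2 is only a hint
// Under the planning rules above: goalSize = 7 / 2 = 3 bytes, splitSize = 3,
// and the 7 bytes are carved into splits of 3 + 3 + 1 bytes => 3 splits.
println(rdd.getNumPartitions)          // 3, not 2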
**// compute: now that the split plan exists, each partition's data is read through this method.**
override def **compute**(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
  val iter = new NextIterator[(K, V)] {

    private val split = theSplit.asInstanceOf[HadoopPartition]
    logInfo("Input split: " + split.inputSplit)
    private val jobConf = getJobConf()

    private val inputMetrics = context.taskMetrics().inputMetrics
    private val existingBytesRead = inputMetrics.bytesRead

    // Sets InputFileBlockHolder for the file block's information
    split.inputSplit.value match {
      case fs: FileSplit =>
        InputFileBlockHolder.set(fs.getPath.toString, fs.getStart, fs.getLength)
      case _ =>
        InputFileBlockHolder.unset()
    }

    // Find a function that will return the FileSystem bytes read by this thread. Do this before
    // creating RecordReader, because RecordReader's constructor might read some bytes
    private val getBytesReadCallback: Option[() => Long] = split.inputSplit.value match {
      case _: FileSplit | _: CombineFileSplit =>
        Some(SparkHadoopUtil.get.getFSBytesReadOnThreadCallback())
      case _ => None
    }

    // We get our input bytes from thread-local Hadoop FileSystem statistics.
    // If we do a coalesce, however, we are likely to compute multiple partitions in the same
    // task and in the same thread, in which case we need to avoid override values written by
    // previous partitions (SPARK-13071).
    private def updateBytesRead(): Unit = {
      getBytesReadCallback.foreach { getBytesRead =>
        inputMetrics.setBytesRead(existingBytesRead + getBytesRead())
      }
    }

    private var reader: RecordReader[K, V] = null
    private val inputFormat = getInputFormat(jobConf)
    HadoopRDD.addLocalConfiguration(
      new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(createTime),
      context.stageId, theSplit.index, context.attemptNumber, jobConf)

    reader =
      try {
        **// getRecordReader produces the reader that pulls the records out of this partition's split**
        inputFormat.**getRecordReader**(split.inputSplit.value, jobConf, Reporter.NULL)
      } catch {
        case e: FileNotFoundException if ignoreMissingFiles =>
          logWarning(s"Skipped missing file: ${split.inputSplit}", e)
          finished = true
          null
        // Throw FileNotFoundException even if `ignoreCorruptFiles` is true
        case e: FileNotFoundException if !ignoreMissingFiles => throw e
        case e: IOException if ignoreCorruptFiles =>
          logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e)
          finished = true
          null
      }

    // Register an on-task-completion callback to close the input stream.
    context.addTaskCompletionListener[Unit] { context =>
      // Update the bytes read before closing is to make sure lingering bytesRead statistics in
      // this thread get correctly added.
      updateBytesRead()
      closeIfNeeded()
    }

    private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey()
    private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue()

    override def getNext(): (K, V) = {
      try {
        finished = !reader.next(key, value)
      } catch {
        case e: FileNotFoundException if ignoreMissingFiles =>
          logWarning(s"Skipped missing file: ${split.inputSplit}", e)
          finished = true
        // Throw FileNotFoundException even if `ignoreCorruptFiles` is true
        case e: FileNotFoundException if !ignoreMissingFiles => throw e
        case e: IOException if ignoreCorruptFiles =>
          logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e)
          finished = true
      }
      if (!finished) {
        inputMetrics.incRecordsRead(1)
      }
      if (inputMetrics.recordsRead % SparkHadoopUtil.UPDATE_INPUT_METRICS_INTERVAL_RECORDS == 0) {
        updateBytesRead()
      }
      (key, value)
    }

    override def close(): Unit = {
      if (reader != null) {
        InputFileBlockHolder.unset()
        try {
          reader.close()
        } catch {
          case e: Exception =>
            if (!ShutdownHookManager.inShutdown()) {
              logWarning("Exception in RecordReader.close()", e)
            }
        } finally {
          reader = null
        }
        if (getBytesReadCallback.isDefined) {
          updateBytesRead()
        } else if (split.inputSplit.value.isInstanceOf[FileSplit] ||
                   split.inputSplit.value.isInstanceOf[CombineFileSplit]) {
          // If we can't get the bytes read from the FS stats, fall back to the split size,
          // which may be inaccurate.
          try {
            inputMetrics.incBytesRead(split.inputSplit.value.getLength)
          } catch {
            case e: java.io.IOException =>
              logWarning("Unable to get input size to set InputMetrics for task", e)
          }
        }
      }
    }
  }
  new InterruptibleIterator[(K, V)](context, iter)
}
public LineRecordReader(Configuration job, FileSplit split,   **// split is the planned split handed to this task; records are still read line by line**
    byte[] recordDelimiter) throws IOException {
  this.maxLineLength = job.getInt(org.apache.hadoop.mapreduce.lib.input.
    LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();          **// start offset of the split**
  end = start + split.getLength();   **// end offset = start + split length**
  final Path file = split.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  codec = compressionCodecs.getCodec(file);

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  if (isCompressedInput()) {
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job, recordDelimiter);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, recordDelimiter);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new UncompressedSplitLineReader(
        fileIn, job, recordDelimiter, split.getLength());
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
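The last few statements are why splits can cut through the middle of a line without losing or duplicating data: a reader whose split does not begin at offset 0 throws away its first (possibly partial) line, and every reader keeps reading until it has consumed the line that crosses its own end. A minimal, simplified Scala sketch of that rule (not Hadoop source; the object and method names are invented):

object SplitBoundarySketch {
  /** Lines belonging to the split [start, start + length) of `data`. */
  def readSplit(data: String, start: Int, length: Int): Seq[String] = {
    val end = start + length
    // Rule 1: a split that does not begin at offset 0 skips its first (possibly partial)
    // line -- the previous split is responsible for it.
    var pos = if (start == 0) 0 else {
      val nl = data.indexOf('\n', start)
      if (nl < 0) data.length else nl + 1
    }
    val lines = scala.collection.mutable.ArrayBuffer.empty[String]
    // Rule 2: keep reading whole lines while the read position is still inside the split,
    // so the line that straddles the split boundary is read by the earlier split.
    while (pos <= end && pos < data.length) {
      val nl = data.indexOf('\n', pos)
      val lineEnd = if (nl < 0) data.length else nl
      lines += data.substring(pos, lineEnd)
      pos = lineEnd + 1
    }
    lines.toSeq
  }

  def main(args: Array[String]): Unit = {
    val data = "hello\nspark\nok\n"        // 15 bytes, three lines
    println(readSplit(data, 0, 8))         // List(hello, spark) -- reads one line past its own end
    println(readSplit(data, 8, 7))         // List(ok)           -- skips the partial "ark"
  }
}

Together the two splits return exactly the three lines, even though the 8-byte boundary falls in the middle of "spark".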