There is not much to say about how to use this tool, so instead let me briefly walk through how it works internally.
MapReduce
This tool launches a MapReduce job whose tasks issue read and write requests from the various YARN nodes (acting as HDFS clients). The run method in TestDFSIO.java is what kicks off the MapReduce job.
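A word on how run gets called: TestDFSIO implements Hadoop's Tool interface, so main simply hands the arguments to ToolRunner. A simplified sketch of that entry point (details vary slightly across Hadoop versions):

public static void main(String[] args) {
  TestDFSIO bench = new TestDFSIO();
  int res = -1;
  try {
    // ToolRunner strips generic Hadoop options, then invokes bench.run(args)
    res = ToolRunner.run(bench, args);
  } catch (Exception e) {
    System.err.print(StringUtils.stringifyException(e));
    res = -2;
  }
  if (res == -1)
    System.err.print(USAGE);
  System.exit(res);
}

run itself then parses the arguments and dispatches the chosen test: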
@Override // Tool
public int run(String[] args) throws IOException {
  TestType testType = null;
  int bufferSize = DEFAULT_BUFFER_SIZE;
  long nrBytes = 1*MEGA;
  int nrFiles = 1;
  long skipSize = 0;
  String resFileName = DEFAULT_RES_FILE_NAME;
  String compressionClass = null;
  boolean isSequential = false;
  String version = TestDFSIO.class.getSimpleName() + ".1.7";

  LOG.info(version);
  if (args.length == 0) {
    System.err.println("Missing arguments.");
    return -1;
  }

  for (int i = 0; i < args.length; i++) {  // parse command line
    if (args[i].startsWith("-read")) {
      testType = TestType.TEST_TYPE_READ;
    } else if (args[i].equals("-write")) {
      testType = TestType.TEST_TYPE_WRITE;
    } else if (args[i].equals("-append")) {
      testType = TestType.TEST_TYPE_APPEND;
    } else if (args[i].equals("-random")) {
      if (testType != TestType.TEST_TYPE_READ) return -1;
      testType = TestType.TEST_TYPE_READ_RANDOM;
    } else if (args[i].equals("-backward")) {
      if (testType != TestType.TEST_TYPE_READ) return -1;
      testType = TestType.TEST_TYPE_READ_BACKWARD;
    } else if (args[i].equals("-skip")) {
      if (testType != TestType.TEST_TYPE_READ) return -1;
      testType = TestType.TEST_TYPE_READ_SKIP;
    } else if (args[i].equals("-clean")) {
      testType = TestType.TEST_TYPE_CLEANUP;
    } else if (args[i].startsWith("-seq")) {
      isSequential = true;
    } else if (args[i].startsWith("-compression")) {
      compressionClass = args[++i];
    } else if (args[i].equals("-nrFiles")) {
      nrFiles = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-fileSize") || args[i].equals("-size")) {
      nrBytes = parseSize(args[++i]);
    } else if (args[i].equals("-skipSize")) {
      skipSize = parseSize(args[++i]);
    } else if (args[i].equals("-bufferSize")) {
      bufferSize = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-resFile")) {
      resFileName = args[++i];
    } else {
      System.err.println("Illegal argument: " + args[i]);
      return -1;
    }
  }
  if (testType == null)
    return -1;
  if (testType == TestType.TEST_TYPE_READ_BACKWARD)
    skipSize = -bufferSize;
  else if (testType == TestType.TEST_TYPE_READ_SKIP && skipSize == 0)
    skipSize = bufferSize;

  LOG.info("nrFiles = " + nrFiles);
  LOG.info("nrBytes (MB) = " + toMB(nrBytes));
  LOG.info("bufferSize = " + bufferSize);
  if (skipSize > 0)
    LOG.info("skipSize = " + skipSize);
  LOG.info("baseDir = " + getBaseDir(config));

  if (compressionClass != null) {
    config.set("test.io.compression.class", compressionClass);
    LOG.info("compressionClass = " + compressionClass);
  }

  config.setInt("test.io.file.buffer.size", bufferSize);
  config.setLong("test.io.skip.size", skipSize);
  config.setBoolean(DFSConfigKeys.DFS_SUPPORT_APPEND_KEY, true);
  FileSystem fs = FileSystem.get(config);

  if (isSequential) {
    long tStart = System.currentTimeMillis();
    sequentialTest(fs, testType, nrBytes, nrFiles);
    long execTime = System.currentTimeMillis() - tStart;
    String resultLine = "Seq Test exec time sec: " + (float)execTime / 1000;
    LOG.info(resultLine);
    return 0;
  }
  if (testType == TestType.TEST_TYPE_CLEANUP) {
    cleanup(fs);
    return 0;
  }
  createControlFile(fs, nrBytes, nrFiles);
  long tStart = System.currentTimeMillis();
  switch(testType) {
  case TEST_TYPE_WRITE:
    writeTest(fs);
    break;
  case TEST_TYPE_READ:
    readTest(fs);
    break;
  case TEST_TYPE_APPEND:
    appendTest(fs);
    break;
  case TEST_TYPE_READ_RANDOM:
  case TEST_TYPE_READ_BACKWARD:
  case TEST_TYPE_READ_SKIP:
    randomReadTest(fs);
  }
  long execTime = System.currentTimeMillis() - tStart;
  analyzeResult(fs, testType, execTime, resFileName);
  return 0;
}
In short, run does two things:
1. Parse the command-line arguments.
2. Decide, based on the test type, which Mapper to run (WriteMapper, ReadMapper, AppendMapper); the chosen test method then submits the job through runIOTest:
private void runIOTest(
      Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass,
      Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
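For example, writeTest is essentially a thin wrapper around runIOTest; simplified from the same source (minor details may differ across Hadoop versions):

private void writeTest(FileSystem fs) throws IOException {
  Path writeDir = getWriteDir(config);
  // Remove data from previous runs, then launch the write job with WriteMapper.
  fs.delete(getDataDir(config), true);
  fs.delete(writeDir, true);
  runIOTest(WriteMapper.class, writeDir);
}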
ControlFile
Now a question arises: the MapReduce job has to know what each map task should do. You can think of it abstractly like this: MapReduce always reads an input file laid out line by line, where each line carries the information that one map task needs in order to execute.
@SuppressWarnings("deprecation")
private void createControlFile(FileSystem fs,
                               long nrBytes, // in bytes
                               int nrFiles
                              ) throws IOException {
  LOG.info("creating control file: "+nrBytes+" bytes, "+nrFiles+" files");

  Path controlDir = getControlDir(config);
  fs.delete(controlDir, true);

  for(int i=0; i < nrFiles; i++) {
    String name = getFileName(i);
    Path controlFile = new Path(controlDir, "in_file_" + name);
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, config, controlFile,
                                         Text.class, LongWritable.class,
                                         CompressionType.NONE);
      writer.append(new Text(name), new LongWritable(nrBytes));
    } catch(Exception e) {
      throw new IOException(e.getLocalizedMessage());
    } finally {
      if (writer != null)
        writer.close();
      writer = null;
    }
  }
  LOG.info("created control files for: "+nrFiles+" files");
}
As you can see, this code writes control files into HDFS before the MapReduce job runs, and these files drive the job's execution: each record stores a file name and a file size (in bytes).
writer.append(new Text(name), new LongWritable(nrBytes));
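To see what actually lands in a control file, here is a minimal hypothetical sketch (not part of TestDFSIO; it assumes fs, config, and controlFile are set up as in the code above) that reads a control file back and prints its record:

// Illustration only: dump the (file name, size) record of one control file.
SequenceFile.Reader reader = new SequenceFile.Reader(fs, controlFile, config);
try {
  Text name = new Text();
  LongWritable size = new LongWritable();
  while (reader.next(name, size)) {
    System.out.println(name + " -> " + size.get() + " bytes");
  }
} finally {
  reader.close();
}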
This way, however many records the control files contain, that is how many times map runs. Taking the write test as an example, each invocation executes the following map method (defined in the shared IOMapperBase):
public void map(Text key,
                LongWritable value,
                OutputCollector<Text, Text> output,
                Reporter reporter) throws IOException {
  String name = key.toString();
  long longValue = value.get();

  reporter.setStatus("starting " + name + " ::host = " + hostName);

  this.stream = getIOStream(name);
  T statValue = null;
  long tStart = System.currentTimeMillis();
  try {
    statValue = doIO(reporter, name, longValue);
  } finally {
    if(stream != null) stream.close();
  }
  long tEnd = System.currentTimeMillis();
  long execTime = tEnd - tStart;
  collectStats(output, name, execTime, statValue);

  reporter.setStatus("finished " + name + " ::host = " + hostName);
}
The key is the file name, so during a write the map task knows which file to create. The value can be understood as the size of the file to be written. With these two, getIOStream can create the file from name, and doIO knows from longValue the total number of bytes to write; since the data is written one buffer at a time, the number of buffer writes can be computed from it.
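For the write path, these two hooks are implemented by WriteMapper; roughly as in the Hadoop source (simplified, and version-dependent in minor details):

@Override // IOMapperBase
public Closeable getIOStream(String name) throws IOException {
  // Create the target file in the data directory with the configured buffer size.
  OutputStream out =
      fs.create(new Path(getDataDir(getConf()), name), true, bufferSize);
  if (compressionCodec != null)
    out = compressionCodec.createOutputStream(out);
  return out;
}

@Override // IOMapperBase
public Long doIO(Reporter reporter,
                 String name,
                 long totalSize // in bytes
               ) throws IOException {
  OutputStream out = (OutputStream)this.stream;
  // Write totalSize bytes, one buffer at a time.
  for (long nrRemaining = totalSize; nrRemaining > 0; nrRemaining -= bufferSize) {
    int curSize = (bufferSize < nrRemaining) ? bufferSize : (int)nrRemaining;
    out.write(buffer, 0, curSize);
    reporter.setStatus("writing " + name + "@" +
                       (totalSize - nrRemaining) + "/" + totalSize
                       + " ::host = " + hostName);
  }
  return Long.valueOf(totalSize);
}

The loop makes the earlier claim concrete: totalSize bytes are emitted in bufferSize chunks, with the final partial chunk truncated to whatever remains.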