There is not much to say about how to use this tool, so instead let me briefly walk through how it works internally.
MapReduce
This tool launches a MapReduce job whose tasks issue read and write requests from the various YARN nodes (acting as HDFS clients). The run method in TestDFSIO.java is what kicks off the MapReduce job.
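A word on how run gets called: TestDFSIO implements Hadoop's Tool interface, so main simply hands the arguments to ToolRunner. A simplified sketch of that entry point (details vary slightly across Hadoop versions):

public static void main(String[] args) {
  TestDFSIO bench = new TestDFSIO();
  int res = -1;
  try {
    // ToolRunner strips generic Hadoop options, then invokes bench.run(args)
    res = ToolRunner.run(bench, args);
  } catch (Exception e) {
    System.err.print(StringUtils.stringifyException(e));
    res = -2;
  }
  if (res == -1)
    System.err.print(USAGE);
  System.exit(res);
}

run itself then parses the arguments and dispatches the chosen test: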
@Override // Tool
public int run(String[] args) throws IOException {
  TestType testType = null;
  int bufferSize = DEFAULT_BUFFER_SIZE;
  long nrBytes = 1*MEGA;
  int nrFiles = 1;
  long skipSize = 0;
  String resFileName = DEFAULT_RES_FILE_NAME;
  String compressionClass = null;
  boolean isSequential = false;
  String version = TestDFSIO.class.getSimpleName() + ".1.7";

  LOG.info(version);
  if (args.length == 0) {
    System.err.println("Missing arguments.");
    return -1;
  }

  for (int i = 0; i < args.length; i++) {  // parse command line
    if (args[i].startsWith("-read")) {
      testType = TestType.TEST_TYPE_READ;
    } else if (args[i].equals("-write")) {
      testType = TestType.TEST_TYPE_WRITE;
    } else if (args[i].equals("-append")) {
      testType = TestType.TEST_TYPE_APPEND;
    } else if (args[i].equals("-random")) {
      if (testType != TestType.TEST_TYPE_READ) return -1;
      testType = TestType.TEST_TYPE_READ_RANDOM;
    } else if (args[i].equals("-backward")) {
      if (testType != TestType.TEST_TYPE_READ) return -1;
      testType = TestType.TEST_TYPE_READ_BACKWARD;
    } else if (args[i].equals("-skip")) {
      if (testType != TestType.TEST_TYPE_READ) return -1;
      testType = TestType.TEST_TYPE_READ_SKIP;
    } else if (args[i].equals("-clean")) {
      testType = TestType.TEST_TYPE_CLEANUP;
    } else if (args[i].startsWith("-seq")) {
      isSequential = true;
    } else if (args[i].startsWith("-compression")) {
      compressionClass = args[++i];
    } else if (args[i].equals("-nrFiles")) {
      nrFiles = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-fileSize") || args[i].equals("-size")) {
      nrBytes = parseSize(args[++i]);
    } else if (args[i].equals("-skipSize")) {
      skipSize = parseSize(args[++i]);
    } else if (args[i].equals("-bufferSize")) {
      bufferSize = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-resFile")) {
      resFileName = args[++i];
    } else {
      System.err.println("Illegal argument: " + args[i]);
      return -1;
    }
  }
  if (testType == null)
    return -1;
  if (testType == TestType.TEST_TYPE_READ_BACKWARD)
    skipSize = -bufferSize;
  else if (testType == TestType.TEST_TYPE_READ_SKIP && skipSize == 0)
    skipSize = bufferSize;

  LOG.info("nrFiles = " + nrFiles);
  LOG.info("nrBytes (MB) = " + toMB(nrBytes));
  LOG.info("bufferSize = " + bufferSize);
  if (skipSize > 0)
    LOG.info("skipSize = " + skipSize);
  LOG.info("baseDir = " + getBaseDir(config));

  if (compressionClass != null) {
    config.set("test.io.compression.class", compressionClass);
    LOG.info("compressionClass = " + compressionClass);
  }

  config.setInt("test.io.file.buffer.size", bufferSize);
  config.setLong("test.io.skip.size", skipSize);
  config.setBoolean(DFSConfigKeys.DFS_SUPPORT_APPEND_KEY, true);
  FileSystem fs = FileSystem.get(config);

  if (isSequential) {
    long tStart = System.currentTimeMillis();
    sequentialTest(fs, testType, nrBytes, nrFiles);
    long execTime = System.currentTimeMillis() - tStart;
    String resultLine = "Seq Test exec time sec: " + (float)execTime / 1000;
    LOG.info(resultLine);
    return 0;
  }
  if (testType == TestType.TEST_TYPE_CLEANUP) {
    cleanup(fs);
    return 0;
  }
  createControlFile(fs, nrBytes, nrFiles);
  long tStart = System.currentTimeMillis();
  switch(testType) {
  case TEST_TYPE_WRITE:
    writeTest(fs);
    break;
  case TEST_TYPE_READ:
    readTest(fs);
    break;
  case TEST_TYPE_APPEND:
    appendTest(fs);
    break;
  case TEST_TYPE_READ_RANDOM:
  case TEST_TYPE_READ_BACKWARD:
  case TEST_TYPE_READ_SKIP:
    randomReadTest(fs);
  }
  long execTime = System.currentTimeMillis() - tStart;
  analyzeResult(fs, testType, execTime, resFileName);
  return 0;
}
In short, run does two things:
1. Parse the command-line arguments.
2. Decide, based on the test type, which Mapper to run (WriteMapper, ReadMapper, AppendMapper); the chosen test method then submits the job through runIOTest:
private void runIOTest(
      Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass,
      Path outputDir) throws IOException {
  JobConf job = new JobConf(config, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, getControlDir(config));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
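For example, writeTest is essentially a thin wrapper around runIOTest; simplified from the same source (minor details may differ across Hadoop versions):

private void writeTest(FileSystem fs) throws IOException {
  Path writeDir = getWriteDir(config);
  // Remove data from previous runs, then launch the write job with WriteMapper.
  fs.delete(getDataDir(config), true);
  fs.delete(writeDir, true);
  runIOTest(WriteMapper.class, writeDir);
}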
ControlFile
Now a question arises: the MapReduce job has to know what each map task should do. You can think of it abstractly like this: MapReduce always reads an input file laid out line by line, where each line carries the information that one map task needs in order to execute.
@SuppressWarnings("deprecation")
private void createControlFile(FileSystem fs,
                               long nrBytes, // in bytes
                               int nrFiles
                              ) throws IOException {
  LOG.info("creating control file: "+nrBytes+" bytes, "+nrFiles+" files");

  Path controlDir = getControlDir(config);
  fs.delete(controlDir, true);

  for(int i=0; i < nrFiles; i++) {
    String name = getFileName(i);
    Path controlFile = new Path(controlDir, "in_file_" + name);
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, config, controlFile,
                                         Text.class, LongWritable.class,
                                         CompressionType.NONE);
      writer.append(new Text(name), new LongWritable(nrBytes));
    } catch(Exception e) {
      throw new IOException(e.getLocalizedMessage());
    } finally {
      if (writer != null)
        writer.close();
      writer = null;
    }
  }
  LOG.info("created control files for: "+nrFiles+" files");
}
As you can see, this code writes control files into HDFS before the MapReduce job runs, and these files drive the job's execution: each record stores a file name and a file size (in bytes).
writer.append(new Text(name), new LongWritable(nrBytes));
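To see what actually lands in a control file, here is a minimal hypothetical sketch (not part of TestDFSIO; it assumes fs, config, and controlFile are set up as in the code above) that reads a control file back and prints its record:

// Illustration only: dump the (file name, size) record of one control file.
SequenceFile.Reader reader = new SequenceFile.Reader(fs, controlFile, config);
try {
  Text name = new Text();
  LongWritable size = new LongWritable();
  while (reader.next(name, size)) {
    System.out.println(name + " -> " + size.get() + " bytes");
  }
} finally {
  reader.close();
}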
This way, however many records the control files contain, that is how many times map runs. Taking the write test as an example, each invocation executes the following map method (defined in the shared IOMapperBase):
public void map(Text key,
                LongWritable value,
                OutputCollector<Text, Text> output,
                Reporter reporter) throws IOException {
  String name = key.toString();
  long longValue = value.get();

  reporter.setStatus("starting " + name + " ::host = " + hostName);

  this.stream = getIOStream(name);
  T statValue = null;
  long tStart = System.currentTimeMillis();
  try {
    statValue = doIO(reporter, name, longValue);
  } finally {
    if(stream != null) stream.close();
  }
  long tEnd = System.currentTimeMillis();
  long execTime = tEnd - tStart;
  collectStats(output, name, execTime, statValue);

  reporter.setStatus("finished " + name + " ::host = " + hostName);
}
The key is the file name, so during a write the map task knows which file to create. The value can be understood as the size of the file to be written. With these two, getIOStream can create the file from name, and doIO knows from longValue the total number of bytes to write; since the data is written one buffer at a time, the number of buffer writes can be computed from it.
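For the write path, these two hooks are implemented by WriteMapper; roughly as in the Hadoop source (simplified, and version-dependent in minor details):

@Override // IOMapperBase
public Closeable getIOStream(String name) throws IOException {
  // Create the target file in the data directory with the configured buffer size.
  OutputStream out =
      fs.create(new Path(getDataDir(getConf()), name), true, bufferSize);
  if (compressionCodec != null)
    out = compressionCodec.createOutputStream(out);
  return out;
}

@Override // IOMapperBase
public Long doIO(Reporter reporter,
                 String name,
                 long totalSize // in bytes
               ) throws IOException {
  OutputStream out = (OutputStream)this.stream;
  // Write totalSize bytes, one buffer at a time.
  for (long nrRemaining = totalSize; nrRemaining > 0; nrRemaining -= bufferSize) {
    int curSize = (bufferSize < nrRemaining) ? bufferSize : (int)nrRemaining;
    out.write(buffer, 0, curSize);
    reporter.setStatus("writing " + name + "@" +
                       (totalSize - nrRemaining) + "/" + totalSize
                       + " ::host = " + hostName);
  }
  return Long.valueOf(totalSize);
}

The loop makes the earlier claim concrete: totalSize bytes are emitted in bufferSize chunks, with the final partial chunk truncated to whatever remains.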