import org.apache.hadoop.fs.FileSystem; //導入方法依賴的package包/類
/**
* Write the list of distributed cache files in the decreasing order of
* file sizes into the sequence file. This file will be input to the job
* {@link GenerateDistCacheData}.
* Also validates if -generate option is missing and distributed cache files
* are missing.
* @return exit code
* @throws IOException
*/
private int writeDistCacheFilesList()
throws IOException {
// Sort the distributed cache files in the decreasing order of file sizes.
List dcFiles = new ArrayList(distCacheFiles.entrySet());
Collections.sort(dcFiles, new Comparator() {
public int compare(Object dc1, Object dc2) {
return ((Comparable) ((Map.Entry) (dc2)).getValue())
.compareTo(((Map.Entry) (dc1)).getValue());
}
});
// write the sorted distributed cache files to the sequence file
FileSystem fs = FileSystem.get(conf);
Path distCacheFilesList = new Path(distCachePath, "_distCacheFiles.txt");
conf.set(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST,
distCacheFilesList.toString());
SequenceFile.Writer src_writer = SequenceFile.createWriter(fs, conf,
distCacheFilesList, LongWritable.class, BytesWritable.class,
SequenceFile.CompressionType.NONE);
// Total number of unique distributed cache files
int fileCount = dcFiles.size();
long byteCount = 0;// Total size of all distributed cache files
long bytesSync = 0;// Bytes after previous sync;used to add sync marker
for (Iterator it = dcFiles.iterator(); it.hasNext();) {
Map.Entry entry = (Map.Entry)it.next();
LongWritable fileSize =
new LongWritable(Long.parseLong(entry.getValue().toString()));
BytesWritable filePath =
new BytesWritable(
entry.getKey().toString().getBytes(charsetUTF8));
byteCount += fileSize.get();
bytesSync += fileSize.get();
if (bytesSync > AVG_BYTES_PER_MAP) {
src_writer.sync();
bytesSync = fileSize.get();
}
src_writer.append(fileSize, filePath);
}
if (src_writer != null) {
src_writer.close();
}
// Set delete on exit for 'dist cache files list' as it is not needed later.
fs.deleteOnExit(distCacheFilesList);
conf.setInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, fileCount);
conf.setLong(GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, byteCount);
LOG.info("Number of HDFS based distributed cache files to be generated is "
+ fileCount + ". Total size of HDFS based distributed cache files "
+ "to be generated is " + byteCount);
if (!shouldGenerateDistCacheData() && fileCount > 0) {
LOG.error("Missing " + fileCount + " distributed cache files under the "
+ " directory\n" + distCachePath + "\nthat are needed for gridmix"
+ " to emulate distributed cache load. Either use -generate\noption"
+ " to generate distributed cache data along with input data OR "
+ "disable\ndistributed cache emulation by configuring '"
+ DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+ "' to false.");
return Gridmix.MISSING_DIST_CACHE_FILES_ERROR;
}
return 0;
}