Hive 的 OutputCommitter
// Excerpt from Hive's ExecDriver (job submission side). Before the MapReduce
// job is launched, Hive installs a no-op output format and committer through
// the shim layer, effectively disabling Hadoop's OutputCommitter machinery
// for Hive jobs.
public class ExecDriver extends Task<MapredWork> implements Serializable {
// NOTE(review): elided excerpt — the surrounding submission logic and the
// method's return statement are not shown here; `job` is presumably a
// JobConf field of ExecDriver (confirm against full source).
public int execute(DriverContext driverContext) {
// Delegates to the version-specific shim (e.g. Hadoop20Shims below) to set
// NullOutputFormat + NullOutputCommitter on the job configuration.
ShimLoader.getHadoopShims().setNullOutputFormat(job);
}
}
// Hadoop 0.20 shim: supplies a committer whose every hook is a no-op, for
// jobs (like Hive's) that manage their own output and want the framework's
// commit protocol out of the way.
public class Hadoop20Shims implements HadoopShims {

/** An {@link OutputCommitter} that performs no output handling at all. */
public static class NullOutputCommitter extends OutputCommitter {
@Override
public void setupJob(JobContext jobContext) { }

@Override
public void cleanupJob(JobContext jobContext) { }

@Override
public void setupTask(TaskAttemptContext taskContext) { }

@Override
public boolean needsTaskCommit(TaskAttemptContext taskContext) {
// Nothing is ever written through this committer, so the framework
// never needs to call commitTask.
return false;
}

@Override
public void commitTask(TaskAttemptContext taskContext) { }

@Override
public void abortTask(TaskAttemptContext taskContext) { }
}

/**
 * Configures {@code conf} to produce no framework-managed output: a
 * {@code NullOutputFormat}, the no-op committer above, and — where the
 * cluster supports it — skipping the job setup/cleanup tasks entirely.
 *
 * @param conf the job configuration to modify in place
 */
public void setNullOutputFormat(JobConf conf) {
// Skipping job setup/cleanup was introduced in hadoop-21 (MAPREDUCE-463)
// but can be backported, so the flag is disabled for all versions >= 0.19;
// clusters that do not understand the key simply ignore it.
conf.setBoolean("mapred.committer.job.setup.cleanup.needed", false);
conf.setOutputFormat(NullOutputFormat.class);
conf.setOutputCommitter(NullOutputCommitter.class);
}
}
// Excerpt from Hadoop's JobConf: the pair of accessors through which a job's
// OutputCommitter implementation is stored in and recovered from the
// configuration.
public class JobConf extends Configuration {

/**
 * Sets the {@link OutputCommitter} implementation for the map-reduce job.
 *
 * @param theClass the committer class, recorded under the
 *                 {@code mapred.output.committer.class} key
 */
public void setOutputCommitter(Class<? extends OutputCommitter> theClass) {
setClass("mapred.output.committer.class", theClass, OutputCommitter.class);
}

/**
 * Instantiates the configured {@link OutputCommitter} for this job,
 * falling back to {@link FileOutputCommitter} when none was set
 * explicitly.
 *
 * @return a new instance of the job's output committer
 */
public OutputCommitter getOutputCommitter() {
Class<? extends OutputCommitter> committerClass =
getClass("mapred.output.committer.class", FileOutputCommitter.class,
OutputCommitter.class);
return (OutputCommitter) ReflectionUtils.newInstance(committerClass, this);
}
}
OutputCommitter用于控制Job的输出,在MapTask和ReduceTask中都会被调用。
《Hadoop OutputFormat浅析》一文（http://hi.baidu.com/_kouu/blog/item/dd2f08fd25da09e0fc037f15.html）对此讲解得很清晰，但 Hive 的作业并没有使用这一机制。
Hive的输出都由自己来控制,具体在FileSinkOperator类中体现(reduce或者map的时候,MR job里面调用),其中jobClose方法是在MapReduce作业运行结束后调用(client端调用)。
// Second ExecDriver excerpt: the post-job-completion path. After the
// MapReduce job finishes, Hive walks the operator tree and invokes each
// operator's jobClose hook client-side — this is where FileSinkOperator
// finalizes output, replacing the role an OutputCommitter would have played.
public class ExecDriver extends Task<MapredWork> implements Serializable {
// NOTE(review): elided excerpt — `rj` (the running job handle), `work`,
// `job`, `success` and `dpPaths` are declared outside this snippet, and the
// method's return statement is not shown; confirm against the full source.
public int execute(DriverContext driverContext) {
if (rj != null) {
// Collects information the operators report back during jobClose,
// e.g. dynamic-partition directories produced by the job.
JobCloseFeedBack feedBack = new JobCloseFeedBack();
if (work.getAliasToWork() != null) {
// Fire jobClose on every map-side operator tree (one per table alias).
for (Operator<? extends Serializable> op : work.getAliasToWork().values()) {
op.jobClose(job, success, feedBack);
// Harvest any dynamic-partition paths the operator reported.
ArrayList<Object> dirs = feedBack.get(JobCloseFeedBack.FeedBackType.DYNAMIC_PARTITIONS);
if (dirs != null) {
for (Object o: dirs) {
if (o instanceof String) {
dpPaths.add((String)o);
}
}
}
}
}
// The reduce-side operator tree gets the same close notification.
if (work.getReducer() != null) {
work.getReducer().jobClose(job, success, feedBack);
}
}
}
}