读取RCFile时,需要重写InputFormat和RecordReader这两个类;调用时还需要依赖hive-exec-*.jar,主要用到其中的org.apache.hadoop.hive.ql.io.RCFile.*。可以使用如下方式调用:MultipleInputs.addInputPath(job, input, RCFileInputFormat.class)。
下面给出InputFormat和RecordReader的重写示例,以及一个读取RCFile文件并以Tab(制表符)作为分隔符的示例。
RCFileInputFormat.java
import java.io.IOException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/**
 * File-based input format whose record reader is an {@code RCFileRecordReader}.
 *
 * <p>Everything except record-reader creation is inherited unchanged from
 * {@link FileInputFormat}.
 *
 * @param <K> key type handed to the mapper; must be a {@link LongWritable} or a subclass
 * @param <V> value type handed to the mapper; must be a {@link BytesRefArrayWritable}
 *     or a subclass
 */
public class RCFileInputFormat<K extends LongWritable, V extends BytesRefArrayWritable>
        extends FileInputFormat<K, V> {

    /** Creates an input format instance; no state is initialised here. */
    public RCFileInputFormat() {
    }

    /**
     * Builds a new record reader for the given split.
     *
     * <p>Neither argument is inspected here; the reader is returned as constructed,
     * without this method initialising it.
     *
     * @param arg0 the input split the reader will be used for (not read by this method)
     * @param arg1 the task attempt context (not read by this method)
     * @return a freshly constructed {@code RCFileRecordReader}
     * @throws IOException declared by the overridden method; not thrown directly here
     * @throws InterruptedException declared by the overridden method; not thrown directly here
     */
    @Override
    public org.apache.hadoop.mapreduce.RecordReader<K, V> createRecordReader(
            org.apache.hadoop.mapreduce.InputSplit arg0, TaskAttemptContext arg1)
            throws IOException, InterruptedException {
        // The reader is instantiated as a raw type, so the warning suppression is
        // confined to this single declaration instead of the whole method.
        @SuppressWarnings({ "unchecked", "rawtypes" })
        final org.apache.hadoop.mapreduce.RecordReader<K, V> reader = new RCFileRecordReader();
        return reader;
    }
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org