// Read an HBase table as an RDD[(ImmutableBytesWritable, Result)]; with the
// stock TableInputFormat, one Spark partition is created per HBase region.
sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
注:
- 默认partition个数等于hbase表region个数
- 可通过重写TableInputFormat的getStartEndKeys方法实现自定义分区
例:
/**
 * A {@link TableInputFormat} that overrides {@code getStartEndKeys()} so the
 * number of input splits (and hence Spark partitions) is controlled by the
 * "hbase.mapreduce.scan.split.num" configuration key instead of defaulting to
 * one split per HBase region.
 *
 * NOTE(review): assumes SCAN_ROW_START and SCAN_ROW_STOP are both set in the
 * configuration — {@code Bytes.toBytes((String) null)} would throw an NPE
 * otherwise. Confirm all callers set both keys.
 */
public class CustomTableInputFormat extends TableInputFormat {

    /** Configuration key holding the desired number of splits (default "1"). */
    public static final String SCAN_SPLIT_NUM = "hbase.mapreduce.scan.split.num";

    @Override
    protected Pair<byte[][], byte[][]> getStartEndKeys() {
        Configuration conf = this.getConf();
        // Fixed: read our own constant — the original referenced
        // CrawlTableInputFormat.SCAN_SPLIT_NUM, a different class (copy-paste
        // leftover). Also parse the primitive directly instead of boxing via
        // Integer.valueOf.
        int splitNum = Integer.parseInt(conf.get(SCAN_SPLIT_NUM, "1"));
        if (splitNum < 1) {
            splitNum = 1;
        }
        // splitNum - 1 intermediate points plus the inclusive endpoints yields
        // splitNum + 1 boundary keys in total.
        byte[][] splits = Bytes.split(
                Bytes.toBytes(conf.get(TableInputFormat.SCAN_ROW_START)),
                Bytes.toBytes(conf.get(TableInputFormat.SCAN_ROW_STOP)),
                true,
                splitNum - 1
        );
        // N boundary keys delimit N - 1 key ranges. The original sized both
        // arrays at splits.length, leaving a trailing null entry in each —
        // which yields a bogus extra split and NPEs downstream. Size the
        // arrays at the range count instead.
        int rangeCount = splits.length - 1;
        byte[][] startKeys = new byte[rangeCount][];
        byte[][] endKeys = new byte[rangeCount][];
        for (int i = 0; i < rangeCount; i++) {
            startKeys[i] = splits[i];
            endKeys[i] = splits[i + 1];
        }
        return new Pair<>(startKeys, endKeys);
    }
}
//调用
...
// Layer an HBase configuration over Spark's Hadoop configuration, request 100
// splits from the custom input format defined above, and build the RDD with it.
val conf = HBaseConfiguration.create(sc.hadoopConfiguration)
conf.set(CustomTableInputFormat.SCAN_SPLIT_NUM, "100")
sc.newAPIHadoopRDD(conf, classOf[CustomTableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
...