在使用 saveAsNewAPIHadoopDataset 将数据写入 HBase 时,发生如下错误:
java.lang.IllegalArgumentException: Can not create a Path from a null string
at org.apache.hadoop.fs.Path.checkPathArg(Path.java:123)
at org.apache.hadoop.fs.Path.<init>(Path.java:135)
at org.apache.hadoop.fs.Path.<init>(Path.java:89)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.absPathStagingDir(HadoopMapReduceCommitProtocol.scala:58)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:132)
at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:101)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1084)
写入 HBase 的源代码如下(Scala 版,Spark 2.2):
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Description: Put data into HBase through a Hadoop (new API) output job.
 *
 * Author : Adore Chen
 * Created: 2017-12-22
 */
object SparkMapJob {

  /**
   * Writes the integers 1 to 100000 into the HBase table "test_table", one row per
   * integer, storing each value in column family "cf", qualifier "ctr".
   *
   * Original author's measurement: inserting 100,000 rows cost 21035 ms.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkPutByMap")
    val context = new SparkContext(conf)

    // All work after the context is created lives inside try/finally, so the
    // context is stopped even when the HBase/Hadoop job setup itself throws.
    try {
      val hbaseConf = HBaseConfiguration.create()
      hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "test_table")
      // IMPORTANT: this attribute must be set to avoid
      // "Can not create a Path from a null string" on job commit (see SPARK-21549).
      hbaseConf.set("mapreduce.output.fileoutputformat.outputdir", "/tmp")

      val job = Job.getInstance(hbaseConf)
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
      job.setOutputKeyClass(classOf[ImmutableBytesWritable])
      job.setOutputValueClass(classOf[Put])

      // column family
      val family = Bytes.toBytes("cf")
      // column qualifier: counter --> ctr
      val column = Bytes.toBytes("ctr")

      context
        .makeRDD(1 to 100000)
        .map { value =>
          // The row key and the cell value are both the byte encoding of the integer.
          val put = new Put(Bytes.toBytes(value))
          put.addImmutable(family, column, Bytes.toBytes(value))
          // The output key is left empty; presumably TableOutputFormat writes based
          // on the Put alone -- confirm against the HBase version in use.
          (new ImmutableBytesWritable(), put)
        }
        .saveAsNewAPIHadoopDataset(job.getConfiguration)
    } finally {
      context.stop()
    }
  }
}
这是 Spark 的一个 bug,具体信息请查看:
https://issues.apache.org/jira/browse/SPARK-21549
解决方案:
//IMPORTANT: must set the attribute to solve the problem (can't create path from null string)
hbaseConf.set("mapreduce.output.fileoutputformat.outputdir", "/tmp")
参考信息: