可以将csv文件上传至hdfs,然后使用spark读取csv,创建rdd,再使用phoenix的spark驱动,批量保存数据到hbase中。
举例
第一种方式:saveRDDs
import org.apache.spark.SparkContext
import org.apache.phoenix.spark._
val sc = new SparkContext("local", "phoenix-test")
val dataSet = List((1L, "1", 1), (2L, "2", 2), (3L, "3", 3))
sc
.parallelize(dataSet)
.saveToPhoenix(
"OUTPUT_TEST_TABLE",
Seq("ID","COL1","COL2"),
zkUrl = Some("phoenix-server:2181")
)
第二种方式:save Dataframes
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.phoenix.spark._
// Load INPUT_TABLE
val sc = new SparkContext("local", "phoenix-test")
val sqlContext = new SQLContext(sc)
val df = sqlContext.load("org.apache.phoenix.spark", Map("table" -> "INPUT_TABLE",
"zkUrl" -> hbaseConnectionString))
// Save to OUTPUT_TABLE
df.save("org.apache.phoenix.spark", SaveMode.Overwrite, Map("table" -> "OUTPUT_TABLE",
"zkUrl" -> hbaseConnectionString))