First, the pitfalls I ran into.
The table has to be created in Phoenix: only a table created through Phoenix carries the mapping Phoenix needs, because Spark inserts through Phoenix's JDBC layer. The data goes in via Phoenix rather than straight into HBase, so a table created only in the HBase shell will not be found (see the sqlline check after the CREATE TABLE below).
I do the insert with Spark SQL. Spark SQL works with DataFrames, which is exactly what the Phoenix-Spark connector writes out, so Phoenix is a natural fit here.
Starting Phoenix
In the Phoenix bin directory:
./sqlline.py
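If sqlline does not find the cluster on its own, the ZooKeeper quorum can be passed as an argument; a minimal invocation, assuming the quorum runs on the host master used throughout this post:

./sqlline.py master:2181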
Creating the table in Phoenix
CREATE TABLE record (
  hospitalid        CHAR(80) PRIMARY KEY,
  hcount            BIGINT,
  havgcost          DOUBLE,
  havgreimburse     DOUBLE,
  havgreproportion  DOUBLE,
  havgday           DOUBLE,
  havgfinproportion DOUBLE
)
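Before touching Spark, it is worth confirming from sqlline that Phoenix really sees the table. A quick check, using a made-up row ('h001' and its values are purely for verification):

!tables
UPSERT INTO record VALUES ('h001', 1, 100.0, 50.0, 0.5, 3.0, 0.8);
SELECT * FROM record;
DELETE FROM record WHERE hospitalid = 'h001';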
Code

import org.apache.spark.sql.SaveMode

val insertTable: String = "record"
val DB_PHOENIX_DRIVER = "org.apache.phoenix.jdbc.PhoenixDriver"
val DB_PHOENIX_URL = "jdbc:phoenix:master"

spark.sql(
  """SELECT hospitalid,
    |  SUM(CASE flag WHEN 1 THEN 1 ELSE 0 END) hcount,
    |  SUM(allcost) / COUNT(l.recordid) havgcost,
    |  SUM(recost) / SUM(CASE flag WHEN 1 THEN 1 ELSE 0 END) havgreimburse,
    |  SUM(recost / allcost) / COUNT(l.recordid) havgreproportion,
    |  SUM(datediff(starttime, endtime)) / COUNT(l.recordid) havgday,
    |  SUM(CASE isrecovery WHEN 1 THEN 1 ELSE 0 END) / COUNT(l.recordid) havgfinproportion
    |FROM record l, reimburse r
    |WHERE l.recordid = r.recordid AND flag = 1
    |GROUP BY hospitalid""".stripMargin)
  .write.format("org.apache.phoenix.spark") // write through the Phoenix-Spark connector
  .mode(SaveMode.Overwrite)                 // save mode
  .option("driver", DB_PHOENIX_DRIVER)      // Phoenix JDBC driver class
  .option("table", insertTable)             // target Phoenix table
  .option("zkUrl", DB_PHOENIX_URL)          // ZooKeeper address
  .save()
It took me quite a while to work this out.
The other approach goes through makeRDD and saveAsHadoopDataset, which is more cumbersome: you first have to turn the DataFrame into an RDD of HBase Put objects (a sketch of that conversion follows the example below).
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.{SparkConf, SparkContext}

def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf().setAppName("INSERTHBase").setMaster("local[*]")
  val sc = new SparkContext(sparkConf)
  val conf = HBaseConfiguration.create()
  // Set the ZooKeeper quorum. Putting hbase-site.xml on the classpath also works,
  // but setting it in code is more explicit.
  conf.set("hbase.zookeeper.quorum", "master")
  // ZooKeeper client port; 2181 is the default.
  conf.set("hbase.zookeeper.property.clientPort", "2181")
  // The table (with column family "cf") must already exist in HBase.
  val tablename = "test1"
  // Initialize the JobConf. TableOutputFormat must be the one from the
  // org.apache.hadoop.hbase.mapred package (the old MapReduce API)!
  val jobConf = new JobConf(conf)
  jobConf.setOutputFormat(classOf[TableOutputFormat])
  jobConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)

  val indataRDD = sc.makeRDD(Array("1,jack,15", "2,Lily,16", "3,mike,16"))
  val rdd = indataRDD.map(_.split(',')).map { arr =>
    /* One Put is one row; its constructor takes the row key.
     * Every value written must go through org.apache.hadoop.hbase.util.Bytes.toBytes.
     * Put.addColumn (Put.add in old HBase versions) takes three arguments:
     * column family, column qualifier, value.
     */
    val put = new Put(Bytes.toBytes(arr(0).toInt))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(arr(2).toInt))
    // Pair each Put with an ImmutableBytesWritable so the result is
    // RDD[(ImmutableBytesWritable, Put)], which saveAsHadoopDataset requires.
    (new ImmutableBytesWritable, put)
  }
  rdd.saveAsHadoopDataset(jobConf)
  sc.stop()
}
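To come back to the point about converting a DataFrame: the hard-coded makeRDD above can be replaced by mapping the DataFrame's rows into (ImmutableBytesWritable, Put) pairs. A minimal sketch, assuming a SparkSession named spark alongside the SparkContext above, and a hypothetical DataFrame whose columns mirror the test1 schema:

// Hypothetical input DataFrame with the same shape as the makeRDD example.
val df = spark.createDataFrame(Seq((1, "jack", 15), (2, "Lily", 16)))
  .toDF("id", "name", "age")
val hbaseReady = df.rdd.map { row =>
  val put = new Put(Bytes.toBytes(row.getAs[Int]("id")))
  put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(row.getAs[String]("name")))
  put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(row.getAs[Int]("age")))
  (new ImmutableBytesWritable, put) // the (key, Put) shape saveAsHadoopDataset expects
}
hbaseReady.saveAsHadoopDataset(jobConf) // reuses the jobConf configured above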