Reading HBase data with Spark:
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil

object HbaseTest {
  def main(args: Array[String]): Unit = {
    val sess = SparkSession.builder().appName("wangjk").master("local[2]")
      .config("spark.testing.memory", "2147480000").getOrCreate()
    val sc = sess.sparkContext
    val tablename = "Air:airDay"

    val conf = HBaseConfiguration.create()
    // Set the ZooKeeper quorum. You could instead put hbase-site.xml on the
    // classpath, but setting it in code like this is recommended.
    conf.set("hbase.zookeeper.quorum", "192.168.0.112:2181,192.168.0.114:2181,192.168.0.116:2181")
    // ZooKeeper client port (default 2181)
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set(TableInputFormat.INPUT_TABLE, tablename)

    val startRowkey = "0,110000,20180220"
    val endRowkey = "0,110000,20180302"
    // If the start rowkey equals the end rowkey, this becomes a lookup of a single row.
    // Assemble the Scan
    val scan = new Scan(Bytes.toBytes(startRowkey), Bytes.toBytes(endRowkey))
    scan.setCacheBlocks(false)
    /* scan.addFamily(Bytes.toBytes("ks"))
       scan.addColumn(Bytes.toBytes("ks"), Bytes.toBytes("data")) */

    // Serialize the Scan into a Base64 string so it can travel through the Configuration
    val proto = ProtobufUtil.toScan(scan)
    val scanToString = Base64.encodeBytes(proto.toByteArray())
    conf.set(TableInputFormat.SCAN, scanToString)

    // Read the table and turn it into an RDD of (row key, Result) pairs
    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    val count = hBaseRDD.count()
    println(count)

    hBaseRDD.foreach { case (_, result) =>
      // The row key
      val key = Bytes.toString(result.getRow)
      // Fetch cells by column family and qualifier
      val citycode = Bytes.toString(result.getValue("f1".getBytes, "citycode".getBytes))
      val daytime = Bytes.toInt(result.getValue("f1".getBytes, "daytime".getBytes))
      println("Row key:" + key + " citycode:" + citycode + " daytime:" + daytime)
    }
  }
}
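Since the example already creates a SparkSession, the (key, Result) pairs can also be turned into a DataFrame for SQL-style processing. A minimal sketch, assuming the sess and hBaseRDD values from the code above; AirDay is a hypothetical case class and must be defined at the top level (not inside main) so Spark can derive an encoder for it:

// Hypothetical row type; assumes daytime was stored as a 4-byte int via Bytes.toBytes(Int)
case class AirDay(rowkey: String, citycode: String, daytime: Int)

// ...inside main, after hBaseRDD has been created:
import sess.implicits._
val airDF = hBaseRDD.map { case (_, result) =>
  AirDay(
    Bytes.toString(result.getRow),
    Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("citycode"))),
    Bytes.toInt(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("daytime"))))
}.toDF()
airDF.show()

This assumes every row actually contains both columns; getValue returns null for a missing cell, so production code would check for that first.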
Writing data to HBase with Spark:
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.hbase.io.ImmutableBytesWritable

object HbaseWrite {
  def main(args: Array[String]): Unit = {
    val sess = SparkSession.builder().appName("wangjk").master("local[2]")
      .config("spark.testing.memory", "2147480000").getOrCreate()
    val sc = sess.sparkContext
    val tablename = "default:airTest"

    val conf = HBaseConfiguration.create()
    // Set the ZooKeeper quorum. You could instead put hbase-site.xml on the
    // classpath, but setting it in code like this is recommended.
    conf.set("hbase.zookeeper.quorum", "192.168.0.112:2181,192.168.0.114:2181,192.168.0.116:2181")
    // ZooKeeper client port (default 2181)
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    // Initialize the JobConf. Note that TableOutputFormat must be the one from
    // the org.apache.hadoop.hbase.mapred package (the old MapReduce API)!
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    val indataRDD = sc.makeRDD(Array("1,jack,15", "2,Lily,16", "3,mike,16"))
    val rdd = indataRDD.map(_.split(',')).map { arr =>
      /* One Put object is one row; the row key is passed to the constructor.
       * Every value written must be converted with org.apache.hadoop.hbase.util.Bytes.toBytes.
       * Put.addColumn takes three arguments: column family, qualifier, value. */
      val put = new Put(Bytes.toBytes(arr(0).toInt))
      put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
      put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("age"), Bytes.toBytes(arr(2).toInt))
      // The RDD must have type RDD[(ImmutableBytesWritable, Put)] so that
      // saveAsHadoopDataset can be called on it
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsHadoopDataset(jobConf)
  }
}
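The same RDD can also be written through the new MapReduce API, in which case the TableOutputFormat from the org.apache.hadoop.hbase.mapreduce package is the right one and saveAsNewAPIHadoopDataset is used instead of saveAsHadoopDataset. A minimal sketch, reusing conf, tablename and rdd from the example above; the rename to NewTableOutputFormat is only to keep the two classes apart:

import org.apache.hadoop.hbase.mapreduce.{TableOutputFormat => NewTableOutputFormat}
import org.apache.hadoop.mapreduce.Job

// Build a new-API Job carrying the same HBase connection settings
val job = Job.getInstance(conf)
job.getConfiguration.set(NewTableOutputFormat.OUTPUT_TABLE, tablename)
job.setOutputFormatClass(classOf[NewTableOutputFormat[ImmutableBytesWritable]])

// rdd is the RDD[(ImmutableBytesWritable, Put)] built above
rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)

Either path writes through the HBase client in the end; the old-API version above is simply the one this example was written against.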