If the raw data already lives in HBase and you want to run batch computations on it with Spark, you can use Spark's Hadoop API to read from and write to HBase directly.
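Both examples below are plain Scala programs. As a rough guide (the exact artifacts and versions depend on your cluster, so the values below are assumptions), the build needs spark-core plus the HBase client and MapReduce-integration jars on the classpath, for example in sbt:

// build.sbt sketch -- group/artifact names are the standard ones, versions are assumptions
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"   % "2.4.8" % Provided,
  "org.apache.hbase" %  "hbase-client" % "1.4.13",
  // TableInputFormat / TableOutputFormat live in hbase-server on HBase 1.x
  // (on HBase 2.x they moved to the separate hbase-mapreduce module)
  "org.apache.hbase" %  "hbase-server" % "1.4.13"
)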
Reading data from HBase
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark._
object HbaseSparkRead {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
    val sc = new SparkContext(sparkConf)
    // HBase connection info
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "ht05")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set(TableInputFormat.INPUT_TABLE, "spark_hbase")
    // read the table into an RDD of (ImmutableBytesWritable, Result)
    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    //val count = hBaseRDD.count()
    //println(count)
    hBaseRDD.foreach { case (_, result) =>
      // row key
      val key = Bytes.toString(result.getRow)
      // fetch a cell value by column family and qualifier
      val name = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("name")))
      println("Row key:" + key + " Name:" + name)
    }
    // save the HBase row keys to a text file on HDFS
    hBaseRDD.map(x => Bytes.toString(x._2.getRow)).saveAsTextFile("hdfs://ht05:9000/test1")
  }
}
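A full-table scan is often more than you need. TableInputFormat also reads a few scan-related configuration keys, so you can narrow the read to a row range or to specific columns before the RDD is created. A minimal sketch, reusing the imports and SparkContext from the example above; the table name, row-key bounds, and column here are placeholders:

// sketch: restrict what newAPIHadoopRDD scans (placeholder table/rows/columns)
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "ht05")
conf.set("hbase.zookeeper.property.clientPort", "2181")
conf.set(TableInputFormat.INPUT_TABLE, "spark_hbase")
conf.set(TableInputFormat.SCAN_ROW_START, "row_000") // first row key (inclusive)
conf.set(TableInputFormat.SCAN_ROW_STOP, "row_999")  // last row key (exclusive)
conf.set(TableInputFormat.SCAN_COLUMNS, "cf:name")   // only this family:qualifier
val narrowRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
  classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
  classOf[org.apache.hadoop.hbase.client.Result])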
Writing data to HBase
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark._
object HbaseSparkWrite {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
    val sc = new SparkContext(sparkConf)
    // read the row keys to write from a text file on HDFS
    val dataRdd = sc.textFile("hdfs://ht05:9000/zhaow/hotle0.txt")
    // HBase connection info
    sc.hadoopConfiguration.set("hbase.zookeeper.quorum", "ht05")
    sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, "spark_test0")
    // declared lazy: when running inside spark-shell the Job must be created lazily,
    // because the shell evaluates and prints every statement as soon as it is entered
    lazy val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    val rdd = dataRdd.filter(_.length > 0).map { line =>
      val rowkey: String = line
      val put = new Put(Bytes.toBytes(rowkey))
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(rowkey))
      // TableOutputFormat ignores the key, so an empty ImmutableBytesWritable is enough
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
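The write example above only stores the row key itself. If each input line carries several fields, the same pattern extends naturally: build one Put per line and add one column per field. A sketch, assuming hypothetical comma-separated lines of the form rowkey,name,price; the field layout, qualifiers, and the cf family are placeholders, and it reuses dataRdd and job from the example above:

// sketch: one Put per line, several columns per Put (field layout is assumed)
val rddMultiCol = dataRdd
  .filter(_.split(",").length >= 3)                  // skip malformed lines
  .map { line =>
    val fields = line.split(",")                     // e.g. "r001,hotelA,128"
    val put = new Put(Bytes.toBytes(fields(0)))      // fields(0) is the row key
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"),  Bytes.toBytes(fields(1)))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("price"), Bytes.toBytes(fields(2)))
    (new ImmutableBytesWritable, put)
  }
rddMultiCol.saveAsNewAPIHadoopDataset(job.getConfiguration)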