Scala version 2.11.8, Spark version 2.1.0
<!--*************************************************************************-->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<!--*************************************************************************-->
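If you build with sbt instead of Maven, the equivalent declarations would look roughly like the following (a sketch assuming the same versions as above; not part of the original post):

scalaVersion := "2.11.8"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % "2.1.0",  // %% appends the _2.11 suffix
  "org.apache.spark" %% "spark-streaming" % "2.1.0",
  "org.apache.spark" %% "spark-sql"       % "2.1.0",
  "org.apache.hbase" %  "hbase-server"    % "1.1.2"
)

The driver that launches the scan threads is shown next.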
import java.util.concurrent.{Executors, Future}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer

object ThreadReadHbase {
  // Create an elastic (cached) thread pool
  val pool = Executors.newCachedThreadPool()
  // For an explanation of these configuration parameters, see https://blog.csdn.net/u010916338/article/details/80949525
  val sparkConf = new SparkConf().setAppName("ThreadReadHbase").setMaster("yarn")
    .set("spark.executor.instances", "5") // number of executors
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "2g")
    .set("spark.driver.cores", "4")
    .set("spark.executor.cores", "4")
    .set("spark.streaming.blockInterval", "50")
    .set("spark.default.parallelism", "10")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // Kryo serialization; HBase stores everything as raw bytes
  val sc = new SparkContext(sparkConf)

  // Builds the RDD for the given time window
  def getRDDbyTimestamp(): RDD[String] = {
    // userids (the leading segment of the rowkey)
    val userArr = Array("981112", "991111")
    // Mutable buffer that collects each thread's return value; Future is used because the threads return results
    val results = new ArrayBuffer[Future[RDD[(ImmutableBytesWritable, Result)]]]()
    // Start one thread per user, each scanning one day of data
    for (user <- userArr) {
      // "18070500" means 2018-07-05 00:00 and is the second rowkey segment, right after the userid.
      // Only one SparkContext may exist globally, so it is passed in.
      val hbaseDataScaner = new HbaseDataScaner(user + "18070500", user + "18070524", sc)
      pool.synchronized {
        val rdd: Future[RDD[(ImmutableBytesWritable, Result)]] = pool.submit(hbaseDataScaner)
        results += rdd
      }
    }
    // Holds the merged RDD
    var rdds: RDD[(ImmutableBytesWritable, Result)] = null
    // Take the first RDD
    if (results.size > 0) {
      rdds = results(0).get()
    }
    // Union the second and subsequent RDDs onto it
    if (results.size >= 1) {
      results.remove(0)
      for (x <- results) {
        println("merging " + x.get())
        rdds = rdds.union(x.get())
      }
    }
    // Print the RDD. Note: use collect() with care -- it pulls every record back to the driver,
    // which is fine for small data sets but will not scale.
    rdds.collect().foreach { case (_, result) =>
      // row key
      val key = Bytes.toString(result.getRow)
      // read a column by column family and qualifier
      val os_info = Bytes.toString(result.getValue("f1".getBytes, "c_name".getBytes))
      println("Row key:" + key + " Name:" + os_info)
    }
    // Reassemble the RDD elements as required by the business logic
    val rdd = rdds.map(result => (
      // row key
      Bytes.toString(result._2.getRow) + "|" +
      // columns, read by column family and qualifier
      Bytes.toString(result._2.getValue("f1".getBytes, "c_name".getBytes)) + "|" +
      Bytes.toString(result._2.getValue("f1".getBytes, "c_code".getBytes)) + "|" +
      Bytes.toString(result._2.getValue("f1".getBytes, "c_content".getBytes)) + "|" +
      Bytes.toString(result._2.getValue("f1".getBytes, "c_dept".getBytes)) + "|" +
      Bytes.toString(result._2.getValue("f1".getBytes, "c_log_type".getBytes)) + "|" +
      Bytes.toString(result._2.getValue("f1".getBytes, "c_result".getBytes)) + "|" +
      Bytes.toString(result._2.getValue("f1".getBytes, "c_sn".getBytes)) + "|" +
      Bytes.toString(result._2.getValue("f1".getBytes, "n_log_time".getBytes))
    ))
    // Save the RDD to HDFS
    rdd.saveAsTextFile("/rdd/hbase")
    // Return the RDD for subsequent analysis
    rdd
  }
  def main(args: Array[String]): Unit = {
    val startTime = System.currentTimeMillis()
    val hbaseRDD = ThreadReadHbase.getRDDbyTimestamp()
    println("Row count: " + hbaseRDD.count())
    val endTime = System.currentTimeMillis()
    println("Elapsed time: " + (endTime - startTime) / 1000 / 60 + " minutes")
    // Finally, shut down the thread pool and stop the SparkContext
    pool.shutdown()
    sc.stop()
  }
}
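Since getRDDbyTimestamp() returns the pipe-delimited RDD for subsequent analysis, a follow-up step might look like the sketch below. The field positions follow the map above; the aggregation itself is an illustrative assumption, not part of the original job:

    // Fields by index: 0 rowkey, 1 c_name, 2 c_code, 3 c_content, 4 c_dept, 5 c_log_type, 6 c_result, 7 c_sn, 8 n_log_time
    val hbaseRDD = ThreadReadHbase.getRDDbyTimestamp()
    val countsByResult = hbaseRDD
      .map(_.split("\\|"))
      .filter(_.length >= 7)           // skip malformed records
      .map(fields => (fields(6), 1))   // key by c_result
      .reduceByKey(_ + _)
    countsByResult.collect().foreach { case (res, cnt) => println(res + " -> " + cnt) }

The scanner task submitted to the thread pool is defined below.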
import java.util.concurrent.Callable
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

// A task with a return value must implement the Callable interface; in Scala this is done with the
// extends keyword. It is a class (not an object) because one instance is created per scan.
class HbaseDataScaner(startRowkey: String, endRowkey: String, sc: SparkContext) extends Callable[RDD[(ImmutableBytesWritable, Result)]] {
  val tablename = "usersaction"
  val conf = HBaseConfiguration.create()
  // ZooKeeper quorum; this could also come from an hbase-site.xml on the classpath,
  // but setting it in code keeps the example self-contained
  conf.set("hbase.zookeeper.quorum", "192.168.35.206,192.168.35.207,192.168.35.208")
  // ZooKeeper client port, 2181 by default
  conf.set("hbase.zookeeper.property.clientPort", "2181")
  conf.set(TableInputFormat.INPUT_TABLE, tablename)
  // Range scan over the rowkeys
  val scan = new Scan(Bytes.toBytes(startRowkey), Bytes.toBytes(endRowkey))
  // Enable block caching for the scan
  scan.setCacheBlocks(true)
  // Serialize the Scan and Base64-encode it so it can be handed to TableInputFormat through the Configuration
  val proto = ProtobufUtil.toScan(scan)
  val scan_str = Base64.encodeBytes(proto.toByteArray)
  conf.set(TableInputFormat.SCAN, scan_str)
  val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

  // The return type must be RDD[(ImmutableBytesWritable, Result)]
  override def call(): RDD[(ImmutableBytesWritable, Result)] = {
    hbaseRDD
  }
}
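If the downstream job only needs a handful of columns, the Scan can be narrowed before ProtobufUtil.toScan(scan) is called, so the region servers ship less data. The narrowing below is an optional addition (not part of the original scanner) using the standard HBase 1.x Scan API:

  // Restrict the scan to the f1 columns the driver actually reads
  scan.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("c_name"))
  scan.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("c_code"))
  scan.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("c_result"))
  // ...or pull in the whole column family at once
  // scan.addFamily(Bytes.toBytes("f1"))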