For the deployment environment, refer to the other chapters. The code below bulk-loads one text file per day of 2018 into the HBase table yykj_pro:rm2018, writing one column per pass; hedged sketches for creating the target table and verifying the load follow the listing. The complete code is as follows:
import java.util.UUID

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SparkSession
object HbaseOpe {

  def bulkLoadToHbase(): Unit = {
    val spark = SparkSession.builder().appName("HbaseBulkLoad").getOrCreate()
    val sc = spark.sparkContext
    val tbname = "yykj_pro:rm2018"
    // Column qualifier -> index of the corresponding field in each CSV line (field 0 is the row key).
    val ckey: Map[String, Int] = Map("humidity" -> 1, "prcp" -> 2, "sunlit" -> 3,
      "tavg" -> 4, "tmax" -> 5, "tmin" -> 6, "wind" -> 7)
    // One input file per day of year, named 2018035.txt .. 2018365.txt.
    for (i <- 35 to 365) {
      val tm = f"2018$i%03d" // zero-padded day of year, e.g. 2018035
      val txtpath = "/user/datas/%s.txt".format(tm)
      val txtRdd = sc.textFile(txtpath)
      // The input text format is (row key followed by the 7 column values):
      // 1870,0,0,0,0,0,0,0
      // 1871,0,0,0,0,0,0,0
      // 1872,0,0,0,0,0,0,0
      // 1873,0,0,0,0,0,0,0
      // 1874,0,0,0,0,0,0,0
      // ...
      // 8067,0,0,0,0,0,0,0
      // 8068,0,0,0,0,0,0,0
      // 8069,0,0,0,0,0,0,0
      // 8070,0,0,0,0,0,0,0
      // 8071,0,0,0,0,0,0,0
      // Several attempts to write multiple columns in a single bulkLoad failed,
      // so each column is written in its own pass of the inner loop below.
      // Note the sortBy: rows written via bulkLoad must be sorted by row key,
      // otherwise the load fails with "Added a key not lexically larger than previous".
      val sortRdd = txtRdd.map(_.split(",")).sortBy(arr => arr(0))
      for (c <- ckey) {
        val outRdd = sortRdd.map(arr => {
          val rowKey = Bytes.toBytes(arr(0))
          val immutableRowKey = new ImmutableBytesWritable(rowKey)
          val kv = new KeyValue(
            rowKey,
            Bytes.toBytes("dynamic"),  // column family
            Bytes.toBytes(c._1),       // column qualifier, e.g. "humidity"
            tm.toLong,                 // the date code doubles as the cell timestamp
            Bytes.toBytes(arr(c._2))   // cell value: the field at this column's index in the line
          )
          (immutableRowKey, kv)
        })
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.mapreduce.hfileoutputformat.table.name", tbname)
        val tableName = TableName.valueOf(tbname)
        val conn = ConnectionFactory.createConnection(hbaseConf)
        val table = conn.getTable(tableName)
        val regionLocator = conn.getRegionLocator(tableName)
        // Write the sorted (rowkey, KeyValue) pairs out as HFiles under a unique temp directory.
        val hFileOutput = "/tmp/hbase/" + UUID.randomUUID().toString
        outRdd.saveAsNewAPIHadoopFile(hFileOutput,
          classOf[ImmutableBytesWritable],
          classOf[KeyValue],
          classOf[HFileOutputFormat2],
          hbaseConf
        )
        // Load the generated HFiles into HBase.
        val bulkLoader = new LoadIncrementalHFiles(hbaseConf)
        bulkLoader.doBulkLoad(new Path(hFileOutput), conn.getAdmin, table, regionLocator)
        // Release the connection opened for this column before the next pass.
        table.close()
        conn.close()
      }
    }
  }
}
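The bulk load assumes that the namespace yykj_pro and the table rm2018 with its dynamic column family already exist. As a reference, here is a minimal sketch of creating them with the HBase 2.x Admin API (consistent with the org.apache.hadoop.hbase.tool.LoadIncrementalHFiles import above); the object name CreateRmTable is only for illustration, adjust to your cluster's client version:

import org.apache.hadoop.hbase.{HBaseConfiguration, NamespaceDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}
import org.apache.hadoop.hbase.util.Bytes

object CreateRmTable {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = conn.getAdmin
    try {
      // Create the yykj_pro namespace if it is not there yet.
      if (!admin.listNamespaceDescriptors().exists(_.getName == "yykj_pro")) {
        admin.createNamespace(NamespaceDescriptor.create("yykj_pro").build())
      }
      // Create yykj_pro:rm2018 with the single column family "dynamic" used by the bulk load.
      val tableName = TableName.valueOf("yykj_pro:rm2018")
      if (!admin.tableExists(tableName)) {
        admin.createTable(
          TableDescriptorBuilder.newBuilder(tableName)
            .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("dynamic")).build())
            .build())
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}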
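After the job finishes, a quick way to confirm the load is to read one cell back with a Get. This is only a sketch: the row key 1870 and the qualifier tavg are taken from the sample data above, and the object name CheckRm2018 is hypothetical.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object CheckRm2018 {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = conn.getTable(TableName.valueOf("yykj_pro:rm2018"))
    try {
      // Fetch one cell: row 1870, family "dynamic", qualifier "tavg".
      val result = table.get(new Get(Bytes.toBytes("1870")))
      val tavg = Bytes.toString(result.getValue(Bytes.toBytes("dynamic"), Bytes.toBytes("tavg")))
      println(s"tavg(1870) = $tavg")
    } finally {
      table.close()
      conn.close()
    }
  }
}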