1. Put-based implementation in Scala:
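The job below reads the invoice detail columns from Hive, drops rows whose key fields are missing, builds each rowkey from the seller's enterprise code, a two-digit year, the fpbm derived from the invoice code and number, and the detail sequence number, and then writes every row to HBase as a Put through TableOutputFormat.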
package com.nbdpt.work4_hive2hbase2019
import com.nbdpt.util.BaseUtil
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
object SparkReadHiveToHbasePut2019 {
  Logger.getLogger("org").setLevel(Level.INFO)

  def main(args: Array[String]): Unit = {
    val hiveTable = args(0)
    val hbaseTable = args(1)
    val partitions = args(2)
    val ss = SparkSession.builder().appName("hiveToHbasePut")
      .enableHiveSupport()
      .getOrCreate()
    // Pull the invoice detail columns from Hive.
    val hiveDF: DataFrame = ss.sql(
      s"""
         |select
         |xf_nsrsbh,gf_nsrsbh,fpdm,fphm,fpmxxh,flbm,kprq,kpyf,spmc,jldw,je,se,slv,zfbz,sjly,ext
         |from ${hiveTable}
         |""".stripMargin)
    val columnf = "fpmx"
    val columnNames = Array("xf_nsrsbh", "gf_nsrsbh", "fpdm", "fphm", "fpmxxh", "flbm", "kprq", "kpyf", "spmc", "jldw", "je", "se", "slv", "zfbz", "sjly", "ext")
    // columnNames is small and is captured by the task closures directly;
    // the broadcast handle is discarded here, so this call has no effect.
    ss.sparkContext.broadcast(columnNames)
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, hbaseTable)
    // ZooKeeper quorum entries must be comma-separated without spaces.
    hbaseConf.set("hbase.zookeeper.quorum", "masterhost1,masterhost2,masterhost3")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    val job = Job.getInstance(hbaseConf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setOutputValueClass(classOf[Put])
    mkOutPutRdd(hiveDF, columnNames, columnf, partitions)
      .saveAsNewAPIHadoopDataset(job.getConfiguration)
    ss.close()
  }
  def mkOutPutRdd(hiveDF: DataFrame, columnNames: Array[String], columnf: String, partitions: String) = {
    // Drop rows whose key fields (xf_nsrsbh, fpdm, fphm, fpmxxh, kpyf) are missing.
    val rdd1 = hiveDF.rdd
      .filter(row => row.length > 0 && !nullDecide(row(0)))
      .filter(row => !nullDecide(row(2)))
      .filter(row => !nullDecide(row(3)))
      .filter(row => !nullDecide(row(4)))
      .filter(row => !nullDecide(row(7)) && row(7).toString.length > 4)
      .coalesce(Integer.valueOf(partitions), true)
      .persist(StorageLevel.MEMORY_AND_DISK_SER)
    val rdd2 = rdd1.map(row => {
      // Overwrite the sjly (data source) column, index 14, with the literal "hive".
      val buffer = Row.unapplySeq(row).get.map(_.asInstanceOf[String]).toBuffer
      buffer.update(14, "hive")
      val schema: StructType = row.schema
      val newRow: Row = new GenericRowWithSchema(buffer.toArray, schema)
      newRow
    })
      .map(row => {
        val xfqybm = getQybm(String.valueOf(row(0)).trim)
        val fpdm = String.valueOf(row(2)).trim
        val fphm = String.valueOf(row(3)).trim
        val id = String.valueOf(row(4)).trim
        // Two-digit year taken from kpyf, e.g. "2019..." -> "19".
        val yy = String.valueOf(row(7)).trim.substring(2, 4)
        val rowKey = mkRowKey(xfqybm, fpdm, fphm, yy, id)
        val put = new Put(Bytes.toBytes(rowKey))
        columnNames.foreach(col => {
          put.addColumn(Bytes.toBytes(columnf), Bytes.toBytes(col), Bytes.toBytes(nullHandle(row.getAs(col))))
        })
        // TableOutputFormat ignores the key, so an empty ImmutableBytesWritable suffices.
        (new ImmutableBytesWritable, put)
      })
      // Rowkeys containing "-" carry the "-1" sentinel (e.g. a failed enterprise-code lookup); drop them.
      .filter(tuple => !Bytes.toString(tuple._2.getRow).contains("-"))
      .persist(StorageLevel.MEMORY_AND_DISK_SER)
    rdd1.unpersist()
    rdd2
  }
  // Look up the seller's enterprise code (qybm) in nbdpt:qy_xydm_bm.
  // Note: this is called once per record and opens a fresh HBase connection
  // each time, which is expensive at scale; see the mapPartitions sketch below.
  def getQybm(xf_nsrsbh: String) = {
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val tableName = "nbdpt:qy_xydm_bm"
    val table = conn.getTable(TableName.valueOf(tableName))
    val get = new Get(Bytes.toBytes(xf_nsrsbh))
    val result = table.get(get)
    val xfqybm = Bytes.toString(result.getValue(Bytes.toBytes("qyxx"), Bytes.toBytes("qybm")))
    conn.close()
    nullHandle(xfqybm)
  }

  // Rowkey layout: enterprise code + two-digit year + fpbm (derived from invoice code and number) + detail id.
  def mkRowKey(xfqybm: String, fpdm: String, fphm: String, yy: String, id: String): String = {
    val fpbm = BaseUtil.getFpbm(fpdm, fphm)
    val rowKey = xfqybm + yy + fpbm + id
    rowKey
  }

  // Map null/empty/"null" values to the sentinel "-1".
  def nullHandle(str: String): String = {
    if (str == null || str == "" || str == "null") {
      "-1"
    } else {
      str
    }
  }

  // True when the value counts as missing.
  def nullDecide(str: Any): Boolean = {
    str == null || str == "" || str == "null"
  }
}
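A note on getQybm: it runs once per record inside map, and each call opens and closes its own HBase connection, which becomes the bottleneck at any real volume. A minimal per-partition variant is sketched below (resolveQybm is a hypothetical helper, reusing the nbdpt:qy_xydm_bm table and the qyxx:qybm column from the code above):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

// Sketch: one HBase connection per partition instead of one per record.
def resolveQybm(nsrsbhRdd: RDD[String]): RDD[(String, String)] = {
  nsrsbhRdd.mapPartitions { iter =>
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = conn.getTable(TableName.valueOf("nbdpt:qy_xydm_bm"))
    // Materialize the lookups before closing the connection (iterators are lazy).
    val resolved = iter.map { nsrsbh =>
      val result = table.get(new Get(Bytes.toBytes(nsrsbh)))
      val qybm = Bytes.toString(result.getValue(Bytes.toBytes("qyxx"), Bytes.toBytes("qybm")))
      (nsrsbh, if (qybm == null || qybm.isEmpty) "-1" else qybm)
    }.toList
    table.close()
    conn.close()
    resolved.iterator
  }
}

Batching the lookups per partition with table.get(java.util.List[Get]) would cut the round trips further.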
2. Bulk-load-based implementation in Scala:
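Rather than funneling Puts through the region servers' normal write path, this variant sorts the cells, writes them out as HFiles with HFileOutputFormat2, and then hands the files to the table with LoadIncrementalHFiles.doBulkLoad; for large one-off loads this is usually much faster and puts far less pressure on the cluster.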
package com.nbdpt.work4_hive2hbase2019
import com.nbdpt.util.BaseUtil
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import scala.collection.mutable.ArrayBuffer
object SparkReadHiveToHbaseLoad2019 {
  Logger.getLogger("org").setLevel(Level.INFO)

  def main(args: Array[String]): Unit = {
    val hiveTable = args(0)
    val hbaseTable = args(1)
    val hfilePath = args(2)
    val partitions = args(3)
    val conf = new SparkConf()
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // Pass the SparkConf to the builder, otherwise the Kryo setting is silently dropped.
    val ss = SparkSession.builder().config(conf).appName("hiveToHbase")
      .enableHiveSupport()
      .getOrCreate()
    val hiveDF: DataFrame = ss.sql(
      s"""
         |select
         |xf_nsrsbh,gf_nsrsbh,fpdm,fphm,fpmxxh,flbm,kprq,kpyf,spmc,jldw,je,se,slv,zfbz,sjly,ext
         |from ${hiveTable}
         |""".stripMargin)
    val columnf = "fpmx"
    val columnNames = Array("xf_nsrsbh", "gf_nsrsbh", "fpdm", "fphm", "fpmxxh", "flbm", "kprq", "kpyf", "spmc", "jldw", "je", "se", "slv", "zfbz", "sjly", "ext")
    ss.sparkContext.broadcast(columnNames)
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.mapreduce.hfileoutputformat.table.name", hbaseTable)
    // Allow many HFiles per region/family, since Spark writes one file per partition.
    hbaseConf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 5000)
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val regionLocator = conn.getRegionLocator(TableName.valueOf(hbaseTable))
    val table = conn.getTable(TableName.valueOf(hbaseTable))
    val job = Job.getInstance(hbaseConf)
    job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass(classOf[KeyValue])
    HFileOutputFormat2.configureIncrementalLoadMap(job, table.getDescriptor)
    // Use the job's configuration here so the settings applied by
    // configureIncrementalLoadMap actually reach the output format.
    mkOutputRdd(hiveDF, columnf, columnNames, partitions)
      .saveAsNewAPIHadoopFile(hfilePath,
        classOf[ImmutableBytesWritable],
        classOf[KeyValue],
        classOf[HFileOutputFormat2],
        job.getConfiguration)
    // Hand the generated HFiles over to the region servers.
    val load = new LoadIncrementalHFiles(hbaseConf)
    load.doBulkLoad(new Path(hfilePath), conn.getAdmin, table, regionLocator)
    table.close()
    conn.close()
    ss.close()
  }
  def mkOutputRdd(hiveDF: DataFrame, columnf: String, columnNames: Array[String], partitions: String): RDD[(ImmutableBytesWritable, KeyValue)] = {
    // Same filtering as the Put version: drop rows with missing key fields.
    val rdd1 = hiveDF.rdd
      .filter(row => row.length > 0 && !nullDecide(row(0)))
      .filter(row => !nullDecide(row(2)))
      .filter(row => !nullDecide(row(3)))
      .filter(row => !nullDecide(row(4)))
      .filter(row => !nullDecide(row(7)) && row(7).toString.length > 4)
      .coalesce(Integer.valueOf(partitions), true)
      .persist(StorageLevel.MEMORY_AND_DISK_SER)
    val rdd2 = rdd1.map(row => {
      val buffer = Row.unapplySeq(row).get.map(_.asInstanceOf[String]).toBuffer
      buffer.update(14, "hive")
      val schema: StructType = row.schema
      val newRow: Row = new GenericRowWithSchema(buffer.toArray, schema)
      newRow
    }).flatMap(row => {
      val xfqybm = getQybm(row(0).toString.trim)
      val fpdm = row(2).toString.trim
      val fphm = row(3).toString.trim
      val id = row(4).toString.trim
      val yy = row(7).toString.trim.substring(2, 4)
      // Emit one (rowkey, (family, qualifier, value)) tuple per column.
      val values = ArrayBuffer[(String, (String, String, String))]()
      columnNames.foreach(col => {
        values.append((mkRowKey(xfqybm, fpdm, fphm, yy, id), (columnf, col, nullHandle(row.getAs[String](col)))))
      })
      values
    }).filter(x => !x._1.contains("-"))
      .persist(StorageLevel.MEMORY_AND_DISK_SER)
    rdd1.unpersist()
    // HFiles require cells sorted by rowkey, then family, then qualifier.
    val rdd3 = rdd2.sortBy(x => (x._1, x._2._1, x._2._2))
      .map(cell => {
        val rowKey = Bytes.toBytes(cell._1)
        val family = Bytes.toBytes(cell._2._1)
        val qualifier = Bytes.toBytes(cell._2._2)
        val value = Bytes.toBytes(cell._2._3)
        (new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, family, qualifier, value))
      }).persist(StorageLevel.MEMORY_AND_DISK_SER)
    rdd2.unpersist()
    rdd3
  }
  def getQybm(xf_nsrsbh: String) = {
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val tableName = "nbdpt:qy_xydm_bm"
    val table = conn.getTable(TableName.valueOf(tableName))
    val get = new Get(Bytes.toBytes(xf_nsrsbh))
    val result = table.get(get)
    val xfqybm = Bytes.toString(result.getValue(Bytes.toBytes("qyxx"), Bytes.toBytes("qybm")))
    conn.close()
    nullHandle(xfqybm)
  }

  def mkRowKey(xfqybm: String, fpdm: String, fphm: String, yy: String, id: String): String = {
    val fpbm = BaseUtil.getFpbm(fpdm, fphm)
    val rowKey = xfqybm + yy + fpbm + id
    rowKey
  }

  def nullHandle(str: String): String = {
    if (str == null || str == "" || str == "null") {
      "-1"
    } else {
      str
    }
  }

  def nullDecide(str: Any): Boolean = {
    str == null || str == "" || str == "null"
  }
}
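One detail worth calling out: sortBy gives a total order across partitions, so each HFile covers a contiguous key range, but file boundaries are not aligned to region boundaries. LoadIncrementalHFiles splits any file that straddles a region at load time, which is why hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily is raised above. For a pre-split table that split work can be avoided by partitioning on the region start keys; a sketch (RegionPartitioner is a hypothetical helper, with startKeys taken from regionLocator.getStartKeys):

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner

// Route each cell to the region whose [startKey, nextStartKey) range holds its rowkey.
class RegionPartitioner(startKeys: Array[Array[Byte]]) extends Partitioner {
  override def numPartitions: Int = startKeys.length
  override def getPartition(key: Any): Int = {
    // Keys are (rowkey, family, qualifier) tuples; see the re-keying below.
    val rowKey = Bytes.toBytes(key.asInstanceOf[(String, String, String)]._1)
    // Last region whose start key is <= rowKey; region 0 has the empty start key.
    math.max(startKeys.lastIndexWhere(sk => Bytes.compareTo(rowKey, sk) >= 0), 0)
  }
}

// Usage sketch: re-key rdd2, then partition by region and sort within each partition.
// rdd2.map { case (rk, (cf, q, v)) => ((rk, cf, q), v) }
//   .repartitionAndSortWithinPartitions(new RegionPartitioner(regionLocator.getStartKeys))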
3. spark-submit script:
nohup spark-submit \
--master yarn \
--deploy-mode cluster \
--class com.nbdpt.work4_hive2hbase2019.SparkReadHiveToHbaseLoad2019 \
--driver-memory 4G \
--executor-memory 14G \
--executor-cores 6 \
--num-executors 50 \
--conf spark.default.parallelism=900 \
--conf spark.speculation=true \
--conf spark.speculation.interval=100 \
--conf spark.speculation.quantile=0.75 \
--conf spark.speculation.multiplier=1.5 \
--conf spark.storage.memoryFraction=0.2 \
--conf spark.shuffle.memoryFraction=0.4 \
--conf spark.shuffle.service.enabled=true \
com.cbp.test-1.0-SNAPSHOT.jar \
"test_2016" \
"test_2016" \
"./test1" \
"500" \
> ./log.file 2>&1 &
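The four positional arguments after the jar map to args(0) through args(3) of SparkReadHiveToHbaseLoad2019: the Hive source table, the HBase target table, the HFile output directory, and the coalesce partition count. Note that a relative path like "./test1" typically resolves against the submitting user's HDFS home directory, so an explicit absolute HDFS path is clearer. The Put variant is submitted the same way with --class com.nbdpt.work4_hive2hbase2019.SparkReadHiveToHbasePut2019 and only three arguments (Hive table, HBase table, partition count), since it writes no HFiles.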