Code
package com.zhonghong.mission.hbasetoes
import java.text.SimpleDateFormat
import java.util.Date
import com.alibaba.fastjson.JSONObject
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark
object HBaseToES {
private final val logger: Logger = Logger.getLogger(HBaseToES.getClass)
logger.setLevel(Level.WARN)
var hbaseSourceTB: String = _
var esIndexName: String = _
var es: String = _
var batchNum: String = _
var startKey: String = _
var endKey: String = _
def main(args: Array[String]): Unit = {
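// Expected arguments: source HBase table, target ES index (index/type), ES node list, bulk batch size, start row key, end row key.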
if (args.length >= 6) {
hbaseSourceTB = args(0)
esIndexName = args(1)
es = args(2)
batchNum = args(3)
startKey = args(4)
endKey = args(5)
logger.warn("hbaseSourceTB----->" + hbaseSourceTB)
logger.warn("esIndexName----->" + esIndexName)
logger.warn("es----->" + es)
logger.warn("batchNum----->" + batchNum)
} else {
logger.error("Usage: HBaseToES <hbaseSourceTB> <esIndexName> <es> <batchNum> <startKey> <endKey>")
System.exit(-1)
}
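// Elasticsearch-Hadoop connector settings: auto-create the target index and let the connector talk to every data node (wan.only = false).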
val conf = new SparkConf().setAppName("process:" + this.getClass.getSimpleName)
.set("es.index.auto.create", "true")
.set("es.nodes", es)
.set("es.nodes.wan.only", "false")
.set("es.mapping.date.rich", "false")
start(conf, hbaseSourceTB, esIndexName, batchNum, startKey, endKey)
}
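// Scans the HBase table into an RDD, converts each row into a JSON document, and bulk-writes the documents to Elasticsearch.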
def start(conf: SparkConf, hbaseSourceTB: String, indexName: String, batchNum: String, startKey: String, endKey: String): Unit = {
val sc = new SparkContext(conf)
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://nameservice1")
sc.hadoopConfiguration.set("dfs.nameservices", "nameservice1")
val hBaseConf: Configuration = HBaseConfiguration.create()
hBaseConf.addResource("hbase-site.xml")
hBaseConf.set("hbase.zookeeper.quorum", "ip,ip,ip")
hBaseConf.set(TableInputFormat.INPUT_TABLE, hbaseSourceTB)
hBaseConf.setInt("hbase.rpc.timeout", 200000)
hBaseConf.setInt("hbase.client.operation.timeout", 200000)
hBaseConf.setInt("hbase.client.scanner.timeout.period", 3600000)
hBaseConf.setInt("hbase.client.retries.number", 6)
hBaseConf.setInt("zookeeper.recovery.retry", 3)
hBaseConf.setInt("zookeeper.recovery.retry.intervalmill", 200)
val scan = new Scan()
scan.setMaxVersions(1)
if (StringUtils.isNotBlank(startKey) && StringUtils.isNotBlank(endKey)) {
// Bytes.toBytes encodes as UTF-8; String.getBytes would use the platform default charset.
scan.setStartRow(Bytes.toBytes(startKey))
scan.setStopRow(Bytes.toBytes(endKey))
}
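// TableInputFormat picks the Scan up from the job config as a Base64-encoded protobuf string.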
val scanStr = Base64.encodeBytes(ProtobufUtil.toScan(scan).toByteArray)
hBaseConf.set(TableInputFormat.SCAN, scanStr)
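// Each RDD element is a (rowkey, Result) pair read through TableInputFormat.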
val articleRDD = sc.newAPIHadoopRDD(
hBaseConf,
classOf[TableInputFormat],
classOf[ImmutableBytesWritable],
classOf[Result]
)
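// Map every HBase row to a JSON string: the row key becomes indexId, plus an updateTime stamp and all non-empty columns.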
val resultRDD = articleRDD.repartition(10).map { case (_, result) =>
val dataJson = new JSONObject()
val indexId = Bytes.toString(result.getRow)
// SimpleDateFormat is not thread-safe, so a fresh instance is created per record.
val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val updateTime = format.format(new Date())
dataJson.put("indexId", indexId)
dataJson.put("updateTime", updateTime)
result.rawCells().foreach { cell =>
// cloneQualifier/cloneValue copy just the qualifier and value bytes out of the cell's shared backing array.
val columnBytes: Array[Byte] = CellUtil.cloneQualifier(cell)
val valueBytes: Array[Byte] = CellUtil.cloneValue(cell)
// Skip cells with an empty qualifier or an empty value.
if (columnBytes != null && columnBytes.nonEmpty && valueBytes != null && valueBytes.nonEmpty) {
dataJson.put(Bytes.toString(columnBytes), Bytes.toString(valueBytes))
}
}
dataJson.toJSONString
}
saveES(resultRDD, indexName, batchNum)
sc.stop()
}
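// Bulk-writes the JSON RDD to Elasticsearch, upserting documents keyed by the indexId field.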
def saveES(rdd: RDD[String], indexName: String, batchNum: String): Unit = {
try {
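// Connector tuning for bulk loads: upsert by indexId, generous HTTP timeout, and bounded batches with retries.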
val props = Map("es.write.operation" -> "upsert",
"es.mapping.id" -> "indexId",
"es.http.timeout" -> "3m",
"es.http.retries" -> "30",
"es.batch.size.bytes" -> "10mb",
"es.batch.size.entries" -> batchNum,
"es.batch.write.retry.count" -> "30",
"es.batch.write.retry.wait" -> "120s"
)
EsSpark.saveJsonToEs(rdd, indexName, props)
} catch {
case e: Exception =>
logger.error("Failed to write to Elasticsearch: " + e.getMessage, e)
}
}
}
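To sanity-check the load, the same connector can read documents back from the target index. The sketch below is illustrative only: ESIndexCheck, the "ip:9200" node address, and the "index/type" resource are placeholders standing in for the es and esIndexName arguments above.
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark
object ESIndexCheck {
def main(args: Array[String]): Unit = {
// Placeholder ES nodes; use the same "es" argument passed to HBaseToES.
val conf = new SparkConf().setAppName("ESIndexCheck")
.set("es.nodes", "ip:9200")
.set("es.nodes.wan.only", "false")
val sc = new SparkContext(conf)
// esJsonRDD returns (documentId, jsonSource) pairs; "index/type" is a placeholder for esIndexName.
val docs = EsSpark.esJsonRDD(sc, "index/type")
println("indexed documents: " + docs.count())
docs.take(5).foreach { case (id, source) => println(id + " -> " + source) }
sc.stop()
}
}
Reading through esJsonRDD keeps the check cheap: the documents come back as raw JSON strings, so no mapping or case classes are needed.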
Dependencies
<properties>
<spark.version>2.1.1</spark.version>
<elasticsearch.version>6.3.0</elasticsearch.version>
</properties>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-spark-20_2.11</artifactId>
<version>${elasticsearch.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>