Spark: operating HBase through Phoenix
phoenix.properties
phoenix.mutate.maxSize = 500000
phoenix.mutate.batchSizeBytes = 1073741824000
phoenix.schema.isNamespaceMappingEnabled = true
phoenix.insert.batchSize = 10000
phoenix.jdbc.driver = org.apache.phoenix.jdbc.PhoenixDriver
skipNormalizingIdentifier = true
phoenix.jdbc.url = jdbc:phoenix:zk01:2181
hbase.zookeeper.quorum = zk01,zk02,zk03:2181
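This file is read below with ConfigFactory.load("phoenix.properties"), which resolves resources from the classpath, so it should sit in src/main/resources (or an equivalent resource directory).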
PhoenixUtil.scala
package cn.huorong.utils
import com.alibaba.fastjson.JSONObject
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.hadoop.conf.Configuration
import org.apache.phoenix.query.QueryServices
import org.apache.spark.TaskContext
import org.apache.spark.sql.types._
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet, ResultSetMetaData}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import java.util.Properties
import scala.collection.mutable.ListBuffer
object PhoenixUtil {
def main(args: Array[String]): Unit = {
val list: List[JSONObject] = queryPhoenixList("SELECT * FROM TEST.SAMPLE_TASK_SCAN limit 100")
list.foreach(println)
}
private val config: Config = ConfigFactory.load("phoenix.properties")
private val SKIP_NORMALIZING_IDENTIFIER: String = config.getString("skipNormalizingIdentifier")
private val MUTATE_MAXSIZE: String = config.getString("phoenix.mutate.maxSize")
private val PHOENIX_JDBC_URL:String = config.getString("phoenix.jdbc.url")
private val PHOENIX_JDBC_DRIVER:String = config.getString("phoenix.jdbc.driver")
private val MUTATE_BATCHSIZEBYTES: String = config.getString("phoenix.mutate.batchSizeBytes")
private val ISNAMESPACE_MAPPING_ENABLED: String = config.getString("phoenix.schema.isNamespaceMappingEnabled")
private val INSERT_BATCHSIZE: String = config.getString("phoenix.insert.batchSize")
private val ZOOKEEPER_QUORUM: String = config.getString("hbase.zookeeper.quorum")
//pre-set connection properties for operating Phoenix over JDBC
private val setPhoenixProperties: Properties = {
val connectionProperties = new Properties()
connectionProperties.setProperty(QueryServices.MAX_MUTATION_SIZE_ATTRIB, MUTATE_MAXSIZE) //max rows buffered per batch before commit/rollback, default 500000
connectionProperties.setProperty(QueryServices.MUTATE_BATCH_SIZE_BYTES_ATTRIB, MUTATE_BATCHSIZEBYTES) //max size in bytes of a single batch of mutations
connectionProperties.setProperty(QueryServices.IS_NAMESPACE_MAPPING_ENABLED, ISNAMESPACE_MAPPING_ENABLED) //enable mapping between Phoenix schemas and HBase namespaces
connectionProperties.setProperty(QueryServices.ZOOKEEPER_QUORUM_ATTRIB, ZOOKEEPER_QUORUM) //ZooKeeper quorum
connectionProperties.setProperty("skipNormalizingIdentifier", SKIP_NORMALIZING_IDENTIFIER) //skip identifier normalization
connectionProperties
}
/***
* @Author: lzx
* @Description: Read a Phoenix table into a Spark DataFrame
* @Date: 2022/4/19 18:42:02
* @Param session: active SparkSession
* @Param tableSchema: schema (database) name
* @Param table: table name
* @Param columns: columns to select
* @Param predicate: optional WHERE-clause filter, e.g. "TID = 123"
* @return: org.apache.spark.sql.DataFrame
**/
def queryPhoenixDataframe(session:SparkSession, tableSchema:String, table: String,
columns: Seq[String], predicate: Option[String] = None):DataFrame={
import org.apache.phoenix.spark._
val conf: Configuration = new Configuration
conf.set("phoenix.schema.isNamespaceMappingEnabled", ISNAMESPACE_MAPPING_ENABLED)
val dataFrame: DataFrame = session.sqlContext.phoenixTableAsDataFrame(
s"$tableSchema.$table",
columns,
predicate,
conf = conf,
zkUrl = Some(ZOOKEEPER_QUORUM)
)
dataFrame
}
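/* Usage sketch (the column names and filter are hypothetical, not from the table above):
*   val df = PhoenixUtil.queryPhoenixDataframe(spark, "TEST", "SAMPLE_TASK_SCAN",
*     columns = Seq("TID", "STATUS"), predicate = Some("STATUS = 'DONE'"))
*   df.show()
*/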
/***
* @Author: lzx
* @Description: Run a SQL query over JDBC and return the rows as JSON objects
* @Date: 2022/4/27
* @Param sql: SQL statement to execute
* @return: scala.collection.immutable.List<com.alibaba.fastjson.JSONObject>
**/
def queryPhoenixList(sql: String, properties: Properties = setPhoenixProperties): List[JSONObject] = {
val rsList: ListBuffer[JSONObject] = new ListBuffer[JSONObject]
//register the driver
Class.forName(PHOENIX_JDBC_DRIVER)
//open a connection
val conn: Connection = DriverManager.getConnection(PHOENIX_JDBC_URL, properties)
try {
//create the statement and execute the query
val ps: PreparedStatement = conn.prepareStatement(sql)
val rs: ResultSet = ps.executeQuery()
val rsMetaData: ResultSetMetaData = rs.getMetaData
//walk the result set, building one JSON object per row, e.g. {"user_id":"zs","if_consumed":"1"}
while (rs.next()) {
val rowJsonObj = new JSONObject()
for (i <- 1 to rsMetaData.getColumnCount) {
rowJsonObj.put(rsMetaData.getColumnName(i), rs.getObject(i))
}
rsList.append(rowJsonObj)
}
//release resources
rs.close()
ps.close()
} finally {
conn.close()
}
rsList.toList
}
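/* Usage sketch: the WHERE clause is passed verbatim inside the SQL string, e.g.
*   val rows = PhoenixUtil.queryPhoenixList("SELECT * FROM TEST.SAMPLE_TASK_SCAN WHERE TID = 123")
* (TID is a hypothetical column; see main() above for a full-table example.)
*/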
/***
* @Author: lzx
* @Description: Core logic for batch-inserting a DataFrame into Phoenix over JDBC
* @Date: 2022/4/26
* @Param dataFrame: source DataFrame
* @Param tableSchema: schema (database) name
* @Param table: target Phoenix table
* @Param insertFlag: insert mode flag (Phoenix only supports UPSERT, so currently unused)
* @Param batch: batch size (rows) per commit
* @Param phoenixJdbcUrl: e.g. "jdbc:phoenix:node118,node119,node120:2181"
* @Param pro: connection properties for the Phoenix write
* @return: void
**/
def jdbcBatchInsert(dataFrame: DataFrame,
tableSchema:String,
table: String,
insertFlag:String,
batch: Int = INSERT_BATCHSIZE.toInt,
phoenixJdbcUrl: String = PHOENIX_JDBC_URL,
pro: Properties=setPhoenixProperties
): Unit = {
//extract field names from the DataFrame schema and build the UPSERT statement
val fields: Array[String] = dataFrame.schema.fieldNames
val schema: Array[StructField] = dataFrame.schema.toArray
val numFields: Int = fields.length
val fieldsSql: String = fields.map(str => "\"".concat(str).concat("\"")).mkString("(", ",", ")")
val charSql: String = fields.map(_ => "?").mkString(",")
val setters: Array[JDBCValueSetter] = schema.map(f => makeSetter(f.dataType))
//Phoenix only supports UPSERT semantics, so both values of insertFlag build the same statement
val insertSql: String = s"UPSERT INTO $tableSchema.$table $fieldsSql VALUES ($charSql)"
// System.out.println("insert sql: " + insertSql)
val start: Long = System.currentTimeMillis()
dataFrame.rdd.foreachPartition(partition => {
//register the driver once per partition
Class.forName(PHOENIX_JDBC_DRIVER)
val connection: Connection = DriverManager.getConnection(phoenixJdbcUrl, pro) //open one connection per partition
try {
connection.setAutoCommit(false)
val pstmt: PreparedStatement = connection.prepareStatement(insertSql)
var count = 0
var cnt = 0
partition.foreach(row => {
for (i <- 0 until numFields) {
if (row.isNullAt(i)) {
System.err.println("null field in row, setting a typed NULL on the PreparedStatement")
pstmt.setNull(i + 1, getJdbcType(schema(i).dataType))
} else {
setters(i).apply(pstmt, row, i)
}
}
pstmt.addBatch()
count += 1
if (count % batch == 0) { //commit a full batch
pstmt.executeBatch()
connection.commit()
cnt += 1
// println(s"partition ${TaskContext.get.partitionId}: commit #$cnt, $count rows")
}
})
//commit the remainder (rows that did not fill a whole batch)
pstmt.executeBatch()
connection.commit()
// println(s"partition ${TaskContext.get.partitionId}: $cnt commits, $count rows total")
} finally {
connection.close()
}
})
val end = System.currentTimeMillis()
// println(s"insert into table $table took ${(end - start) / 1000} s")
}
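/* Usage sketch, assuming a DataFrame whose field names match the target table's column names:
*   PhoenixUtil.jdbcBatchInsert(df, "TEST", "SAMPLE_TASK_SCAN", insertFlag = "Ignore")
* Column names are double-quoted in the generated UPSERT, so DataFrame field case must match
* the Phoenix column case exactly.
*/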
private type JDBCValueSetter = (PreparedStatement, Row, Int) => Unit
/**
* Maps a Spark SQL DataType to a PreparedStatement setter.
* Null handling is done by the caller via row.isNullAt, so most setters assume a non-null value.
* @param dataType
* @return
*/
def makeSetter(dataType: DataType): JDBCValueSetter = dataType match {
case IntegerType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
if (row.isNullAt(pos)) {
stmt.setNull(pos + 1, java.sql.Types.INTEGER)
} else {
stmt.setInt(pos + 1, row.getInt(pos))
}
case LongType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setLong(pos + 1, row.getLong(pos))
case DoubleType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setDouble(pos + 1, row.getDouble(pos))
case FloatType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setFloat(pos + 1, row.getFloat(pos))
case ShortType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setInt(pos + 1, row.getShort(pos))
case ByteType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setInt(pos + 1, row.getByte(pos))
case BooleanType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setBoolean(pos + 1, row.getBoolean(pos))
case StringType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setString(pos + 1, row.getString(pos))
case BinaryType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setBytes(pos + 1, row.getAs[Array[Byte]](pos))
case TimestampType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setTimestamp(pos + 1, row.getAs[java.sql.Timestamp](pos))
case DateType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setDate(pos + 1, row.getAs[java.sql.Date](pos))
case t: DecimalType =>
(stmt: PreparedStatement, row: Row, pos: Int) =>
stmt.setBigDecimal(pos + 1, row.getDecimal(pos))
/* case ArrayType(et, _) =>
// remove type length parameters from end of type name
val typeName = getJdbcType(et, dialect).databaseTypeDefinition
.toLowerCase(Locale.ROOT).split("\\(")(0)
(stmt: PreparedStatement, row: Row, pos: Int) =>
val array = conn.createArrayOf(
typeName,
row.getSeq[AnyRef](pos).toArray)
stmt.setArray(pos + 1, array)*/
case _ =>
(_: PreparedStatement, _: Row, pos: Int) =>
throw new IllegalArgumentException(
s"Can't translate non-null value for field $pos")
}
/**
* Maps a Spark SQL DataType to a java.sql.Types constant; add further types as needed
*
* @param dt
* @return
*/
private def getJdbcType(dt: DataType): Int = {
dt match {
case IntegerType => java.sql.Types.INTEGER
case LongType => java.sql.Types.BIGINT
case DoubleType => java.sql.Types.DOUBLE
case StringType => java.sql.Types.VARCHAR
case _ => java.sql.Types.VARCHAR
}
}
}
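Putting the two paths together, a minimal end-to-end driver might look like the sketch below. This is a sketch against the utilities above: the local master, the TID/STATUS columns, the PENDING filter, and the SAMPLE_TASK_SCAN_BAK target table are all assumptions for illustration.
package cn.huorong
import cn.huorong.utils.PhoenixUtil
import org.apache.spark.sql.{DataFrame, SparkSession}
object PhoenixDemo {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession.builder()
.appName("phoenix-demo")
.master("local[*]") //assumption: local run for testing
.getOrCreate()
//read: pull a filtered slice of the table into a DataFrame
val df: DataFrame = PhoenixUtil.queryPhoenixDataframe(
spark, "TEST", "SAMPLE_TASK_SCAN",
columns = Seq("TID", "STATUS"), //hypothetical columns
predicate = Some("STATUS = 'PENDING'") //hypothetical filter
)
//write: upsert the rows into a (hypothetical) mirror table
PhoenixUtil.jdbcBatchInsert(df, "TEST", "SAMPLE_TASK_SCAN_BAK", insertFlag = "Ignore")
spark.stop()
}
}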