This module wraps Spark access to Hive, HBase, HDFS, MySQL, Elasticsearch, Kudu, ClickHouse and other data sources, and picks the appropriate source at runtime via the strategy design pattern.
The Kudu and ClickHouse sources need third-party dependencies, and HBase needs a custom data source implementation. The relevant Maven dependencies are listed below (a Kudu/ClickHouse read sketch follows the dependency list).
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-high-level-client</artifactId>
</dependency>
<!--<dependency>-->
<!--    <groupId>org.elasticsearch.plugin</groupId>-->
<!--    <artifactId>x-pack-sql-jdbc</artifactId>-->
<!--</dependency>-->
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-spark-20_2.11</artifactId>
</dependency>
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-client</artifactId>
</dependency>
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-spark2_2.11</artifactId>
</dependency>
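Kudu and ClickHouse are not yet wired into the strategy switch shown later in this post. As a rough sketch only: reads could go through the kudu-spark2 connector and a ClickHouse JDBC driver. The master address, JDBC URL, driver class and table names below are placeholders, not values from the original project.

import org.apache.spark.sql.{DataFrame, SparkSession}

object ExtraSourceSketch {

  // Hypothetical example: read a Kudu table through the kudu-spark2 connector.
  def readKudu(spark: SparkSession): DataFrame = {
    spark.read
      .format("kudu")                                          // recent kudu-spark2 releases register this short name; older ones need "org.apache.kudu.spark.kudu"
      .option("kudu.master", "bigdata-cdh01.itcast.cn:7051")   // placeholder Kudu master address
      .option("kudu.table", "impala::default.tbl_users")       // placeholder table name
      .load()
  }

  // Hypothetical example: read a ClickHouse table through its JDBC driver (legacy clickhouse-jdbc artifact).
  def readClickHouse(spark: SparkSession): DataFrame = {
    spark.read
      .format("jdbc")
      .option("driver", "ru.yandex.clickhouse.ClickHouseDriver")
      .option("url", "jdbc:clickhouse://bigdata-cdh01.itcast.cn:8123/default") // placeholder URL (HTTP port)
      .option("dbtable", "tbl_users")
      .load()
  }
}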
1. Usage
import com.lcy.models.datasource.{CommonDataSource, DataSourceConstants}
import com.lcy.models.utils.SparkUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}

object DataSourceTest {

  val ZK_HOST_HBASE = "hbase.zookeeper.quorum"
  val ZK_PORT_HBASE = "hbase.zookeeper.property.clientPort"
  val HBASE_FAMILY = "family.name"
  val HBASE_ROW_KEY = "row.key.name"
  val HBASE_SELECT_FIELDS = "fields.selecting"
  val HBASE_TABLE = "hbase.table"
  val HBASE_SELECT_WHERE_CONDITIONS = "where.conditions"

  def main(args: Array[String]): Unit = {
    // mysqlTest()
    // hbaseTest()
    // hiveTest()
    hdfsTest()
  }
  def hdfsTest(): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[4]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    // 1. Parameter map
    val paramsMap: Map[String, String] = Map(
      "inType" -> "hdfs",
      "inPath" -> "hdfs://bigdata-cdh01.itcast.cn:8020/datas/words.csv",
      "sperator" -> "\t",
      "selectFieldNames" -> "id,name"
    )
    // 2. Load the data
    val dataframe: DataFrame = CommonDataSource.readHdfs(spark, paramsMap)
    dataframe.show()

    // 3. Write a small DataFrame back to HDFS
    val paramsMapWrite: Map[String, String] = Map(
      "inType" -> "hdfs",
      "inPath" -> "hdfs://bigdata-cdh01.itcast.cn:8020/datas/words2.csv",
      "sperator" -> "\t",
      "selectFieldNames" -> "id,name"
    )
    val rdd: RDD[Row] = spark.sparkContext.makeRDD(Seq[Row](Row.fromSeq(Seq("1", "lcy222222"))))
    val frame: DataFrame = spark.createDataFrame(rdd, StructType(Array(
      StructField("id3", StringType),
      StructField("name3", StringType)
    )))
    CommonDataSource.writeHdfs(paramsMapWrite, frame)

    spark.stop()
  }
  def hiveTest(): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[4]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("hive.metastore.uris", "thrift://bigdata-cdh01.itcast.cn:9083")
      .enableHiveSupport()
      .getOrCreate()

    // 1. Parameter map
    val paramsMap: Map[String, String] = Map(
      "inType" -> "hive",
      "hiveTable" -> "tags_dat.tbl_users",
      "selectFieldNames" -> "username,password"
    )
    // 2. Load the data
    val dataframe: DataFrame = CommonDataSource.readHive(spark, paramsMap)
    dataframe.printSchema()
    dataframe.show(20, truncate = false)

    spark.sql("use tags_dat")
    spark.sql("select * from tbl_users2").show()

    // 3. Write to Hive
    CommonDataSource.appendHive("tags_dat.tbl_users3", dataframe)
    CommonDataSource.appendAndEnableDynamicPartitionHive("tags_dat.tbl_users3", dataframe)

    spark.stop()
  }
  def hbaseTest(): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkUtils.sparkSession(SparkUtils.sparkConf(this.getClass.getCanonicalName))

    // 2. Read from HBase through the custom data source
    val frame: DataFrame = spark.read
      .format("com.lcy.models.datasource")
      .option(ZK_HOST_HBASE, "bigdata-cdh01.cn")
      .option(ZK_PORT_HBASE, 2181)
      .option(HBASE_TABLE, "tbl_users")
      .option(HBASE_FAMILY, "detail")
      .option(HBASE_SELECT_FIELDS, "id,gender")
      .option(HBASE_SELECT_WHERE_CONDITIONS, "id[ge]50")
      .load()
    frame.show()

    // 3. Write back to HBase
    frame.write.format("hbase")
      .mode(SaveMode.Append)
      .option(ZK_HOST_HBASE, "bigdata-cdh01.cn")
      .option(ZK_PORT_HBASE, 2181)
      .option(HBASE_TABLE, "tbl_users_2")
      .option(HBASE_FAMILY, "detail")
      .option(HBASE_ROW_KEY, "id")
      .save()

    spark.stop()
  }
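The HBase read above uses the fully qualified package name while the write uses the short name "hbase"; both rely on the custom HBase data source mentioned at the top, whose implementation is not included in this post. For orientation only, a Spark 2.x DataSource V1 skeleton that registers that short name could look like the following; the class layout and method bodies are my assumptions, not the author's actual implementation.

package com.lcy.models.datasource

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

// Skeleton only: registers the "hbase" short name so that
// spark.read.format("hbase") / df.write.format("hbase") resolve to this class.
class DefaultSource extends RelationProvider
  with CreatableRelationProvider with DataSourceRegister {

  override def shortName(): String = "hbase"

  // Read path: build a BaseRelation that scans HBase using the passed options
  // (zkHosts, zkPort, hbaseTable, family, selectFields, filterConditions).
  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    // The real implementation would return an HBase-backed relation here.
    throw new UnsupportedOperationException("HBase scan relation goes here")
  }

  // Write path: put each row into HBase, using the row.key.name column as the row key.
  override def createRelation(sqlContext: SQLContext,
                              mode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {
    // The real implementation would write `data` to HBase and return a relation.
    throw new UnsupportedOperationException("HBase put logic goes here")
  }
}

Spark resolves .format("com.lcy.models.datasource") by appending ".DefaultSource" to the name, and the short name "hbase" only resolves if the class is also listed in META-INF/services/org.apache.spark.sql.sources.DataSourceRegister.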
  def mysqlTest(): Unit = {
    // 1. Parameter map
    val paramsMap: Map[String, String] = Map(
      "inType" -> "mysql",
      "driver" -> "com.mysql.jdbc.Driver",
      "url" -> "jdbc:mysql://bigdata-cdh01.itcast.cn:3306/?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC",
      "user" -> "root",
      "password" -> "123456",
      "sql" -> "select * from profile_tags.test_spark_ds",
      "table" -> "profile_tags.test_spark_ds"
    )
    val spark: SparkSession = SparkUtils.sparkSession(SparkUtils.sparkConf("xxxx"))

    // 2. Read from MySQL
    val frame: DataFrame = CommonDataSource.readMysql(spark, paramsMap)
    frame.show()
    println("=====================================================")

    // 3. Write to MySQL
    val value: RDD[Row] = spark.sparkContext.makeRDD(Seq(Row.fromSeq(Seq("1", "name"))))
    val structType: StructType = StructType(
      Array(
        StructField("id", StringType),
        StructField("name", StringType)
      )
    )
    val df: DataFrame = spark.createDataFrame(value, structType)
    CommonDataSource.write(DataSourceConstants.mysql, paramsMap, df)
    df.printSchema()
    df.show(20, truncate = false)
  }
}
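mysqlTest and hbaseTest use a SparkUtils helper that is not reproduced in this post. A minimal sketch, assuming it only builds a local SparkSession with the Kryo serializer (the real helper may do more), could be:

package com.lcy.models.utils

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Minimal stand-in for the SparkUtils helper used above (an assumption, not the original class).
object SparkUtils {

  // Build a SparkConf for local testing with Kryo serialization.
  def sparkConf(appName: String): SparkConf = {
    new SparkConf()
      .setAppName(appName)
      .setMaster("local[4]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  }

  // Build (or reuse) a SparkSession from the given configuration.
  def sparkSession(conf: SparkConf): SparkSession = {
    SparkSession.builder().config(conf).getOrCreate()
  }
}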
2. Core class
package com.lcy.models.datasource

import com.lcy.models.datasource.DataSourceConstants._
import com.lcy.models.meta.{HBaseMeta, HdfsMeta, HiveMeta, MySQLMeta}
import org.apache.spark.sql.{DataFrame, DataFrameReader, SaveMode, SparkSession}

object CommonDataSource {

  def readMysql(spark: SparkSession, props: Map[String, String]): DataFrame = {
    read(spark, DataSourceConstants.mysql, props)
  }

  def readHBase(spark: SparkSession, props: Map[String, String]): DataFrame = {
    read(spark, DataSourceConstants.hbase, props)
  }

  def readHive(spark: SparkSession, props: Map[String, String]): DataFrame = {
    read(spark, DataSourceConstants.hive, props)
  }

  def readHdfs(spark: SparkSession, props: Map[String, String]): DataFrame = {
    read(spark, DataSourceConstants.hdfs, props)
  }

  def writeMysql(props: Map[String, String], dataFrame: DataFrame): Unit = {
    write(DataSourceConstants.mysql, props, dataFrame)
  }

  def writeHBase(props: Map[String, String], dataFrame: DataFrame): Unit = {
    write(DataSourceConstants.hbase, props, dataFrame)
  }

  def writeHive(props: Map[String, String], dataFrame: DataFrame): Unit = {
    write(DataSourceConstants.hive, props, dataFrame)
  }

  def appendHive(hiveTable: String, dataFrame: DataFrame): Unit = {
    val props = Map("hiveTable" -> hiveTable, "saveMode" -> "append")
    write(DataSourceConstants.hive, props, dataFrame)
  }

  def appendAndEnableDynamicPartitionHive(hiveTable: String, dataFrame: DataFrame): Unit = {
    dataFrame.sparkSession.sql("set hive.exec.dynamic.partition=true")
    dataFrame.sparkSession.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    val props = Map("hiveTable" -> hiveTable, "saveMode" -> "append")
    write(DataSourceConstants.hive, props, dataFrame)
  }

  def writeHdfs(props: Map[String, String], dataFrame: DataFrame): Unit = {
    write(DataSourceConstants.hdfs, props, dataFrame)
  }
  def read(spark: SparkSession, sourceType: String, props: Map[String, String]): DataFrame = {
    sourceType match {
      case DataSourceConstants.mysql => {
        // Parse the map into a MySQLMeta object
        val mysqlMeta = MySQLMeta.getMySQLMeta(props)
        // Load the business data from the MySQL table
        spark.read
          .format("jdbc")
          .option("driver", mysqlMeta.driver)
          .option("url", mysqlMeta.url)
          .option("user", mysqlMeta.user)
          .option("password", mysqlMeta.password)
          .option("dbtable", mysqlMeta.sql)
          .load()
      }
      case DataSourceConstants.hbase => {
        // Parse the map into an HBaseMeta object
        val hbaseMeta = HBaseMeta.getHBaseMeta(props)
        // Load the business data through the custom HBase data source
        spark.read
          .format("hbase")
          .option("zkHosts", hbaseMeta.zkHosts)
          .option("zkPort", hbaseMeta.zkPort)
          .option("hbaseTable", hbaseMeta.hbaseTable)
          .option("family", hbaseMeta.family)
          .option("selectFields", hbaseMeta.selectFieldNames)
          .option("filterConditions", hbaseMeta.filterConditions)
          .load()
      }
      case DataSourceConstants.hdfs => {
        // Parse the map into an HdfsMeta object
        val hdfsMeta: HdfsMeta = HdfsMeta.getHdfsMeta(props)
        // Load data from HDFS; CSV is the default file type
        val reader: DataFrameReader = spark.read
          .option("sep", hdfsMeta.sperator)
          .option("header", "true")
          .option("inferSchema", "true")
        hdfsMeta.fileType match {
          case "csv" =>
            reader.csv(hdfsMeta.inPath)
              .select(hdfsMeta.selectFieldNames: _*)
          case "text" =>
            reader.text(hdfsMeta.filePath)
          case "parquet" =>
            reader.parquet(hdfsMeta.filePath)
          case "json" =>
            reader.json(hdfsMeta.filePath)
          case other =>
            // Fail fast on unsupported file types
            throw new RuntimeException(s"Unsupported HDFS file type: $other")
        }
      }
      case DataSourceConstants.hive => {
        // Parse the map into a HiveMeta object
        val hiveMeta: HiveMeta = HiveMeta.getHiveMeta(props)
        // Load from the Hive table. NOTE: the SparkSession must have been created with Hive support enabled.
        spark.read
          .table(hiveMeta.hiveTable)
          // def select(cols: Column*): DataFrame — selectFieldNames: _* expands the array into varargs
          .select(hiveMeta.selectFieldNames: _*)
          //.filter(hiveMeta.whereCondition)
      }
      case other =>
        throw new RuntimeException(s"Unsupported data source type: $other")
    }
  }
  def write(sourceType: String, props: Map[String, String], df: DataFrame): Unit = {
    sourceType match {
      case DataSourceConstants.mysql => {
        val mysqlMeta = MySQLMeta.getMySQLMeta(props)
        df.write
          .format("jdbc")
          .mode(SaveMode.Overwrite)
          .option("driver", mysqlMeta.driver)
          .option("url", mysqlMeta.url)
          .option("user", mysqlMeta.user)
          .option("password", mysqlMeta.password)
          .option("dbtable", mysqlMeta.table)
          .save()
      }
      case DataSourceConstants.hbase => {
        // Parse the map into an HBaseMeta object
        val hbaseMeta = HBaseMeta.getHBaseMeta(props)
        df.write.format("hbase")
          .mode(SaveMode.Append)
          .option(ZK_HOST_HBASE, hbaseMeta.zkHosts)
          .option(ZK_PORT_HBASE, hbaseMeta.zkPort)
          .option(HBASE_TABLE, hbaseMeta.hbaseTable)
          .option(HBASE_FAMILY, hbaseMeta.family)
          .option(HBASE_ROW_KEY, hbaseMeta.rowKey)
          .save()
      }
      case DataSourceConstants.hdfs => {
        // Parse the map into an HdfsMeta object
        val hdfsMeta: HdfsMeta = HdfsMeta.getHdfsMeta(props)
        hdfsMeta.fileType match {
          case "csv" => df.write.csv(hdfsMeta.inPath)
          case "text" => df.write.text(hdfsMeta.filePath)
          case "parquet" => df.write.parquet(hdfsMeta.filePath)
          case "json" => df.write.json(hdfsMeta.filePath)
          case other =>
            // Fail fast on unsupported file types
            throw new RuntimeException(s"Unsupported HDFS file type: $other")
        }
      }
      case DataSourceConstants.hive => {
        val hiveMeta: HiveMeta = HiveMeta.getHiveMeta(props)
        df.write
          .format("hive")
          .mode(hiveMeta.saveMode)
          .saveAsTable(hiveMeta.hiveTable)
      }
      case other =>
        throw new RuntimeException(s"Unsupported data source type: $other")
    }
  }
}
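Elasticsearch is declared in DataSourceConstants and the elasticsearch-spark dependency is on the classpath, but it is not handled in the match above yet. The sketch below shows what the extra branches could look like, using the standard elasticsearch-spark SQL source; the esNodes/esPort/esResource keys and the host name are hypothetical, not part of the original props.

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object ElasticsearchSketch {

  // Hypothetical read branch for the strategy switch, via the elasticsearch-spark SQL source.
  def readEs(spark: SparkSession, props: Map[String, String]): DataFrame = {
    spark.read
      .format("org.elasticsearch.spark.sql")
      .option("es.nodes", props.getOrElse("esNodes", "bigdata-cdh01.itcast.cn")) // placeholder host
      .option("es.port", props.getOrElse("esPort", "9200"))
      .load(props("esResource")) // index (or index/type, depending on the ES version)
  }

  // Hypothetical write branch: append the DataFrame to an index.
  def writeEs(props: Map[String, String], df: DataFrame): Unit = {
    df.write
      .format("org.elasticsearch.spark.sql")
      .mode(SaveMode.Append)
      .option("es.nodes", props.getOrElse("esNodes", "bigdata-cdh01.itcast.cn"))
      .option("es.port", props.getOrElse("esPort", "9200"))
      .save(props("esResource"))
  }
}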
3. Helper classes
HBaseMeta
package com.lcy.models.meta

import com.lcy.models.utils.DateUtils

/**
 * HBase metadata holder. The expected rule fields look like:
 * inType=hbase
 * zkHosts=bigdata-cdh01.cn
 * zkPort=2181
 * hbaseTable=tbl_tag_users
 * family=detail
 * selectFieldNames=id,gender
 * whereCondition=modified#day#30
 */
case class HBaseMeta(
  zkHosts: String,
  zkPort: String,
  hbaseTable: String,
  family: String,
  selectFieldNames: String,
  rowKey: String,
  filterConditions: String
)

object HBaseMeta {
  /**
   * Parse the rule map into an HBaseMeta instance.
   * @param ruleMap rule map
   */
  def getHBaseMeta(ruleMap: Map[String, String]): HBaseMeta = {
    // TODO: in production, validate that every required field is present and abort with a clear message if not; omitted here.
    // Derive the filter expression dynamically from the whereCondition rule.
    // TODO: whereCondition=modified#day#30 -> filterConditions=modified[ge]20200907,modified[le]20201006
    // a. Read the whereCondition value from the tag rule
    val whereCondition: String = ruleMap.getOrElse("whereCondition", null)
    // b. If a where condition is present, derive the date range; otherwise keep null
    val filterConditions: String = if (null != whereCondition) {
      // step1. Split on the separator: field=modified, unit=day, amount=30
      val Array(field, unit, amount) = whereCondition.split("#")
      // step2. Get the current date and yesterday's date
      val nowDate: String = DateUtils.getNow()
      val yesterdayDate: String = DateUtils.dateCalculate(nowDate, -1)
      // step3. Compute the date N days/months/years ago
      val agoDate: String = unit match {
        case "day" => DateUtils.dateCalculate(nowDate, -(1 * amount.toInt))
        case "month" => DateUtils.dateCalculate(nowDate, -(30 * amount.toInt))
        case "year" => DateUtils.dateCalculate(nowDate, -(365 * amount.toInt))
      }
      // c. Assemble the filter expression, e.g. modified[le]20201006,modified[ge]20200907
      s"$field[le]$yesterdayDate,$field[ge]$agoDate"
    } else null
    if (null != filterConditions) println(s">>>>>>>>> filterConditions: $filterConditions >>>>>>>>>>>")
    // Build the HBaseMeta object
    HBaseMeta(
      ruleMap("zkHosts"),
      ruleMap("zkPort"),
      ruleMap("hbaseTable"),
      ruleMap("family"),
      ruleMap("selectFieldNames"),
      ruleMap.getOrElse("rowKey", ""), // rowKey is only needed for writes
      filterConditions
    )
  }
}
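getHBaseMeta calls a DateUtils helper that is not included in this post. A minimal sketch, assuming dates are formatted as yyyyMMdd and dateCalculate shifts a date by a number of days (the real helper may differ), could be:

package com.lcy.models.utils

import java.time.LocalDate
import java.time.format.DateTimeFormatter

// Minimal stand-in for the DateUtils helper used by HBaseMeta (an assumption, not the original class).
object DateUtils {

  private val formatter: DateTimeFormatter = DateTimeFormatter.ofPattern("yyyyMMdd")

  // Current date as yyyyMMdd, matching the 20200907-style values in the comments above.
  def getNow(): String = LocalDate.now().format(formatter)

  // Shift the given yyyyMMdd date by `amount` days (negative values go back in time).
  def dateCalculate(date: String, amount: Int): String = {
    LocalDate.parse(date, formatter).plusDays(amount.toLong).format(formatter)
  }
}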
HdfsMeta
package com.lcy.models.meta

import org.apache.spark.sql.Column

/**
 * Metadata for reading from HDFS; the default format is CSV with a header row.
 * inType=hdfs
 * inPath=/apps/datas/tbl_logs
 * sperator=\t
 * selectFieldNames=global_user_id,loc_url,log_time
 */
case class HdfsMeta(
  inPath: String,
  sperator: String,
  selectFieldNames: Array[Column],
  fileType: String = "csv",
  filePath: String = ""
)

object HdfsMeta {
  /**
   * Parse the rule map into an HdfsMeta instance.
   * @param ruleMap rule map
   * @return
   */
  def getHdfsMeta(ruleMap: Map[String, String]): HdfsMeta = {
    // Turn the selected field names into Column objects
    import org.apache.spark.sql.functions.col
    val fieldColumns: Array[Column] = ruleMap("selectFieldNames")
      .split(",")
      .map { field => col(field) }
    // Build and return the HdfsMeta object
    HdfsMeta(
      ruleMap("inPath"),
      ruleMap("sperator"),
      fieldColumns,
      ruleMap.getOrElse("fileType", "csv"),
      ruleMap.getOrElse("filePath", "")
    )
  }
}
HiveMeta
package com.lcy.models.meta

import org.apache.spark.sql.{Column, SaveMode}

/**
 * Metadata for loading data from a Hive table; the SparkSession must be created with Hive support enabled.
 * inType=hive
 * hiveTable=tags_dat.tbl_logs
 * selectFieldNames=global_user_id,loc_url,log_time
 * ## partition field and value range
 * whereCondition=log_time#day#30
 */
case class HiveMeta(
  hiveTable: String,
  saveMode: String,
  selectFieldNames: Array[Column],
  whereCondition: String
)

object HiveMeta {
  /**
   * Parse the rule map into a HiveMeta instance.
   * @param ruleMap rule map
   * @return
   */
  def getHiveMeta(ruleMap: Map[String, String]): HiveMeta = {
    // Building the WHERE clause from the partition rule is omitted here
    // val whereCondition = ...
    // Turn the selected field names into Column objects
    import org.apache.spark.sql.functions.col
    val fieldColumns: Array[Column] = ruleMap.getOrElse("selectFieldNames", "")
      .split(",")
      .map { field => col(field) }
    // Build and return the HiveMeta object
    HiveMeta(
      ruleMap("hiveTable"),
      ruleMap.getOrElse("saveMode", SaveMode.ErrorIfExists.name()),
      fieldColumns,
      null
    )
  }
}
MySQLMeta
package com.lcy.models.meta

/**
 * RDBMS metadata holder. The expected rule fields look like:
 * inType=mysql
 * driver=com.mysql.jdbc.Driver
 * url=jdbc:mysql://bigdata-cdh01.cn:3306/?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC
 * user=root
 * password=123456
 * sql=SELECT id, gender FROM tags_dat.tbl_users
 */
case class MySQLMeta(
  driver: String,
  url: String,
  user: String,
  password: String,
  sql: String,
  table: String
)

object MySQLMeta {
  /**
   * Parse the rule map into a MySQLMeta instance.
   * @param ruleMap rule map
   * @return
   */
  def getMySQLMeta(ruleMap: Map[String, String]): MySQLMeta = {
    // Wrap the SQL statement as an aliased subquery so it can be passed as the JDBC dbtable option
    val sqlStr: String = s"( ${ruleMap.getOrElse("sql", "")} ) AS tmp"
    // Build the MySQLMeta object
    MySQLMeta(
      ruleMap("driver"),
      ruleMap("url"),
      ruleMap("user"),
      ruleMap("password"),
      sqlStr,
      ruleMap("table")
    )
  }
}
DataSourceConstants
package com.lcy.models.datasource

class DataSourceConstants {
  val mysql = "mysql"
  val hbase = "hbase"
  val hive = "hive"
  val hdfs = "hdfs"
  val elasticsearch = "elasticsearch"

  val ZK_HOST_HBASE = "hbase.zookeeper.quorum"
  val ZK_PORT_HBASE = "hbase.zookeeper.property.clientPort"
  val HBASE_FAMILY = "family.name"
  val HBASE_ROW_KEY = "row.key.name"
  val HBASE_SELECT_FIELDS = "fields.selecting"
  val HBASE_TABLE = "hbase.table"
  val HBASE_SELECT_WHERE_CONDITIONS = "where.conditions"
}

object DataSourceConstants extends DataSourceConstants