代码:
/**
 * Reads a Hive table into a DataFrame through the HiveServer2 JDBC endpoint.
 *
 * @param db           Hive database name (trimmed defensively).
 * @param tablename    Hive table name (trimmed defensively — callers in this
 *                     file have passed a name with a stray leading space).
 * @param sparksession active SparkSession used to build the JDBC reader.
 * @return a lazily-evaluated DataFrame backed by the JDBC source.
 */
def readDfdatafromHive(db: String, tablename: String, sparksession: SparkSession): DataFrame = {
  // NOTE(review): host/port and credentials are hard-coded; move them to
  // external configuration instead of committing a password in source.
  sparksession.read
    .format("jdbc")
    .option("driver", "org.apache.hive.jdbc.HiveDriver")
    .option("url", "jdbc:hive2://hdp100:1111/")
    .option("user", "liuz")
    .option("password", "4630850lz")
    .option("fetchsize", "2000")
    // Trim both parts so accidental whitespace cannot corrupt the
    // qualified "db.table" identifier sent to HiveServer2.
    .option("dbtable", s"${db.trim}.${tablename.trim}")
    .load()
}
// Fixed: the original table-name argument carried a leading space
// (" dws_runstrength_opendoortimes_dt"), which yields an invalid
// qualified name in the JDBC "dbtable" option.
val opendoorTimes_DF = DBHelper.readDfdatafromHive("dws", "dws_runstrength_opendoortimes_dt", sparkSession)
结果:抛出类型转换异常。实际原因是:查询返回的每一行的值都和列名一样——列名被默认方言用双引号包裹后,被 HiveQL 当成了字符串字面量,于是非字符串列在转换时报错。
原因:
Spark 内置的 JdbcDialects 注册表中没有针对 Hive 的 Dialect,所以需要参考源码自己定义并注册一个。
// Excerpt from Spark's JdbcDialects object: the built-in dialect registry.
// Note that no Hive dialect appears below — a jdbc:hive2 URL therefore falls
// through to the default dialect. NOTE(review): the default dialect
// presumably quotes identifiers with double quotes, which HiveQL reads as
// string literals — confirm against the Spark version in use.
private[this] var dialects = List[JdbcDialect]()
registerDialect(MySQLDialect)
registerDialect(PostgresDialect)
registerDialect(DB2Dialect)
registerDialect(MsSqlServerDialect)
registerDialect(DerbyDialect)
registerDialect(OracleDialect)
registerDialect(TeradataDialect)
解决办法:
我们看一看mysql是怎么定义的:
// Excerpt from Spark source: MySQLDialect, quoted here verbatim as the
// template we follow when writing our own Hive dialect below.
private case object MySQLDialect extends JdbcDialect {
// Claims any JDBC URL whose scheme is jdbc:mysql (case-insensitive).
override def canHandle(url : String): Boolean =
url.toLowerCase(Locale.ROOT).startsWith("jdbc:mysql")
// Maps MySQL-specific JDBC type reports onto Catalyst types; returning
// None defers to Spark's default mapping.
override def getCatalystType(
sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
// This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
// byte arrays instead of longs.
md.putLong("binarylong", 1)
Option(LongType)
} else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
Option(BooleanType)
} else None
}
// MySQL quotes identifiers with back-ticks, not the ANSI double quote.
override def quoteIdentifier(colName: String): String = {
s"`$colName`"
}
// Cheap existence probe: fails fast if the table is missing.
override def getTableExistsQuery(table: String): String = {
s"SELECT 1 FROM $table LIMIT 1"
}
// TRUNCATE TABLE does not cascade in MySQL.
override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
可以继续看看presto的代码,此处略,方法的作用梳理一下:
def canHandle(url: String): Boolean 这里需要判断我们的jdbc协议是否满足某个条件,不同的协议条件不一样
override def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder)这个是用来处理数据类型的,避免数据类型不一致
override def quoteIdentifier(colName: String) 这个是处理列名引用(quoting)的,因为不同数据库的标识符引用符号不一样(如 MySQL/Hive 用反引号,ANSI 用双引号),可以自定义
最后我们定义的 HiveSqlDialect 代码:
/**
 * Custom JDBC dialect for HiveServer2, registered so Spark stops quoting
 * identifiers with double quotes (which HiveQL reads as string literals).
 */
object HiveSqlDialect extends JdbcDialect {

  /** Claim any jdbc:hive2 URL, case-insensitively. */
  override def canHandle(url: String): Boolean = {
    val normalized = url.toLowerCase(Locale.ROOT)
    normalized.startsWith("jdbc:hive2")
  }

  /** Back-quote every dotted segment, e.g. db.col -> `db`.`col`. */
  override def quoteIdentifier(colName: String): String = {
    val quotedParts = colName.split('.').map(segment => "`" + segment + "`")
    quotedParts.mkString(".")
  }
}
/** One-shot hook that installs [[HiveSqlDialect]] into Spark's registry. */
object RegisterHiveSqlDialect {
  /** Must run before any jdbc:hive2 read so the dialect is consulted. */
  def register(): Unit = JdbcDialects.registerDialect(HiveSqlDialect)
}
/** Entry point demonstrating the Hive-over-JDBC read with the custom dialect. */
object IndicatorCalculateService {
  def main(args: Array[String]): Unit = {
    // Impersonate the Hadoop user before any HDFS/Hive access.
    System.setProperty("HADOOP_USER_NAME", "liuz")
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("MantanceNeedAnalysis")
      .enableHiveSupport()
      .getOrCreate()
    // Register the custom dialect BEFORE the JDBC read; otherwise Spark's
    // default dialect double-quotes column names and Hive returns the
    // literal names instead of values.
    RegisterHiveSqlDialect.register()
    // Fixed: removed the stray leading space from the table name argument.
    val opendoorTimes_DF = DBHelper.readDfdatafromHive("dws", "dws_runstrength_opendoortimes_dt", sparkSession)
    opendoorTimes_DF.show()
  }
} // Fixed: the original snippet was missing this closing brace for the object.
结果: