但凡能这样写
val prop = new java.util.Properties
prop.put("url", "jdbc:mysql://10.37.140.228:3306/dam?characterEncoding=UTF-8")
prop.put("dbtable", "(select * from tableA where meta_create_time = str_to_date('2019-01-20','%Y-%m-%d') ) as tempViewName")
prop.put("user", "fabu")
prop.put("password", "73R4_h8td6fE")
prop.put("fetchsize","100000")
var df2 = sparkSession.read.jdbc( prop.getProperty("url"),prop.getProperty("dbtable"), prop)
都不要用ResultSet转DataFrame,当然了,上面那样写是单分区,容易OOM,最好根据字段多分区读取,这里不再写了,去百度吧,搜索“spark jdbc 优化”一大堆相应的文章
言归正传,因为一些特殊的原因,必须要把ResultSet转DataFrame,直接贴代码吧
def createStructField(name:String,colType:String):StructField={
colType match {
case "java.lang.String" =>{ return StructField(name,StringType,true)}
case "java.lang.Integer" =>{return StructField(name,IntegerType,true)}
case "java.lang.Long" =>{return StructField(name,LongType,true)}
case "java.lang.Boolean" =>{return StructField(name,BooleanType,true)}
case "java.lang.Double" =>{return StructField(name,DoubleType,true)}
case "java.lang.Float" =>{return StructField(name,FloatType,true)}
case "java.sql.Date" =>{return StructField(name,DateType,true)}
case "java.sql.Time" =>{return StructField(name,TimestampType,true)}
case "java.sql.Timestamp" =>{return StructField(name,TimestampType,true)}
case "java.math.BigDecimal" =>{return StructField(name,DecimalType(10,0),true)}
}
}
/**
* 把查出的ResultSet转换成DataFrame
*/
def createResultSetToDF(rs:ResultSet):DataFrame={
val rsmd = rs.getMetaData
val columnTypeList = new ArrayList[String]
val rowSchemaList = new ArrayList[StructField]
for(i <- 1 to rsmd.getColumnCount){
var temp = rsmd.getColumnClassName(i)
temp=temp.substring(temp.lastIndexOf(".")+1)
if("Integer".equals(temp)){
temp="Int";
}
columnTypeList.add(temp)
rowSchemaList.add(createStructField(rsmd.getColumnName(i),rsmd.getColumnClassName(i)))
}
val rowSchema = StructType(Seq(rowSchemaList:_*))
//ResultSet反射类对象
val rsClass = rs.getClass
var count=1;
var resultList = new ArrayList[Row]
var totalDF = session.createDataFrame(new ArrayList[Row], rowSchema)
while (rs.next()) {
count=count+1;
val temp = new ArrayList[Object]
for(i <- 0 to columnTypeList.size()-1){
val method = rsClass.getMethod("get"+columnTypeList.get(i),"aa".getClass)
temp.add(method.invoke(rs, rsmd.getColumnName(i+1)))
}
resultList.add(Row(temp:_*))
if(count%100000==0){
val tempDF = session.createDataFrame(resultList, rowSchema)
totalDF=totalDF.union(tempDF).distinct()
resultList.clear()
}
}
val tempDF = session.createDataFrame(resultList, rowSchema)
totalDF=totalDF.union(tempDF)
return totalDF
}
Scala语言写的