/**
 * Reads a CSV text file in an arbitrary character encoding into a DataFrame
 * whose columns are all of string type.
 *
 * Each line is decoded with `code`, trimmed, and split on commas that are not
 * inside a double-quoted cell. Cell values are otherwise left untouched: this
 * method does not strip surrounding quotes and does not pad or truncate rows
 * whose field count differs from that of the first line.
 *
 * @param spark        active Spark session used to read the file and build the DataFrame
 * @param headerSchema the exact string "TRUE" means the first line holds the column
 *                     names; that line (and any line identical to it) is then excluded
 *                     from the data. Any other value keeps every line as data and names
 *                     the columns `_c0`, `_c1`, ...
 * @param code         name of the character encoding of the CSV file
 * @param file         absolute path of the CSV file
 * @return a DataFrame with one nullable string column per field of the first line;
 *         an empty DataFrame with an empty schema when the file contains no lines
 */
def readCSV(spark:SparkSession,headerSchema:String,code:String,file:String): org.apache.spark.sql.DataFrame = {
  val rddArr: RDD[Array[String]] = spark.sparkContext
    .hadoopFile(file, classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
    // Decode only the first getLength bytes of the Text buffer, using the requested charset.
    .map(pair => new String(pair._2.getBytes, 0, pair._2.getLength, code))
    // Split only on commas followed by an even number of quotes up to the end of the
    // line, so commas inside a quoted cell do not break the cell apart; the -1 limit
    // keeps trailing empty cells.
    .map(_.trim.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)", -1))

  rddArr.take(1).headOption match {
    case None =>
      // Empty input: return an empty DataFrame instead of failing on first().
      spark.createDataFrame(spark.sparkContext.emptyRDD[Row], StructType(Seq.empty[StructField]))

    case Some(firstFields) =>
      val useHeader = "TRUE".equals(headerSchema)
      val headerFields = firstFields.toSeq

      val columnNames: Seq[String] =
        if (useHeader) headerFields
        else firstFields.indices.map(i => s"_c$i")
      val schema = StructType(columnNames.map(StructField(_, DataTypes.StringType)))

      // Drop header lines only when the first line really is a header, and compare
      // field by field: comparing concatenated fields would treat ["ab","c"] and
      // ["a","bc"] as the same row.
      val dataRows =
        if (useHeader) rddArr.filter(_.toSeq != headerFields)
        else rddArr

      // Row.fromSeq spreads the fields over the columns; Row(fields) would wrap the
      // whole array as a single column and break createDataFrame with this schema.
      spark.createDataFrame(dataRows.map(fields => Row.fromSeq(fields.toSeq)), schema)
  }
}