Background: we have a number of large .xls files. Reading them with spark-excel OOMs because the files are too big, and the library's streaming option, maxRowsInMemory, only supports .xlsx files. Searching for a way to stream .xls with POI turned up HSSFEventFactory, which lets an HSSFListener process records one at a time, so I wrote a simple Spark source for reading .xls. The full code is given below.
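For reference, this is roughly how the streaming option is used for .xlsx (a minimal sketch, assuming the com.crealytics spark-excel data source is on the classpath and a SparkSession named spark; the path is a placeholder). It is exactly this path that has no .xls equivalent:

// Sketch: streaming read of an .xlsx via spark-excel; maxRowsInMemory has no effect on .xls.
val xlsxDf = spark.read
  .format("com.crealytics.spark.excel")
  .option("header", "true")
  .option("maxRowsInMemory", "1000") // stream rows instead of materializing the whole sheet
  .load("/data/big-file.xlsx")       // placeholder path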
spark.read.format("xls").option("path", logPath).load() runs, but large .xls files still OOM. Digging into it, the cause is this: the SSTRecord stores the deduplicated set of every string in the workbook, and each LabelSSTRecord stores only the index of its string within the SSTRecord. So even when the .xls file is processed record by record, hitting the SSTRecord still exhausts memory.
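The dependency is visible even in a POI-only sketch, separate from the Spark source below (a minimal example; the file path comes from the command line): the single SSTRecord arrives in full before any cell records, so the listener has to keep the whole string table resident just to resolve LabelSSTRecord indices.

import java.io.FileInputStream
import org.apache.poi.hssf.eventusermodel.{HSSFEventFactory, HSSFListener, HSSFRequest}
import org.apache.poi.hssf.record.{LabelSSTRecord, Record, SSTRecord}
import org.apache.poi.poifs.filesystem.POIFSFileSystem

object SstDemo {
  def main(args: Array[String]): Unit = {
    val fs = new POIFSFileSystem(new FileInputStream(args(0)))
    var sst: SSTRecord = null                  // the entire shared-string table ends up here
    val request = new HSSFRequest()
    request.addListenerForAllRecords(new HSSFListener {
      override def processRecord(record: Record): Unit = record match {
        case s: SSTRecord      => sst = s      // one record carrying every unique string
        case l: LabelSSTRecord => println(sst.getString(l.getSSTIndex)) // index lookup only
        case _                 =>
      }
    })
    new HSSFEventFactory().processWorkbookEvents(request, fs)
    fs.close()
  }
}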
Conclusion: the attempt did not succeed. With no other workable implementation found, the fallback is a Python script that converts the .xls files to CSV ahead of time.
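On the Spark side the fallback then reduces to a plain CSV read (a sketch; the SparkSession val and the output directory of the conversion step are placeholders):

// The .xls files have already been converted to CSV by a separate preprocessing script;
// Spark only ever sees the CSV output directory.
val df = spark.read
  .option("header", "true")
  .csv("/data/converted-from-xls/") // placeholder: wherever the conversion script writes to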
package cn.keytop.source.xls
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.poi.hssf.eventusermodel._
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord
import org.apache.poi.hssf.record._
import org.apache.poi.hssf.usermodel.HSSFDataFormatter
import org.apache.poi.poifs.filesystem.POIFSFileSystem
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import scala.collection.mutable.ArrayBuffer
/**
* @author: 王建成
* @since: 2025/4/18 13:46
* @description: requirement: implement a Spark source plugin for reading large .xls files
*/
class XLSReader {
def read(pathStr: String, sqlContext: SQLContext): org.apache.spark.sql.DataFrame = {
val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
val fsPath = new Path(pathStr)
val fs = fsPath.getFileSystem(hadoopConf)
// Collect all .xls files under the given path
val allFiles: Array[Path] = {
if (fs.isDirectory(fsPath)) {
fs.listStatus(fsPath)
.filter(f => f.isFile && f.getPath.getName.toLowerCase.endsWith(".xls"))
.map(_.getPath)
} else {
Array(fsPath)
}
}
// Read each file into a DataFrame, then union them
val dfs = allFiles.map { filePath =>
println(s"Reading XLS file: $filePath")
readSingleXLS(filePath, fs, sqlContext)
}
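// Note: union assumes every file shares the same header row and therefore the same schema.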
dfs.reduceOption(_.union(_)).getOrElse {
// No .xls files under the directory: return an empty DataFrame
sqlContext.createDataFrame(sqlContext.sparkContext.emptyRDD[Row], StructType(Nil))
}
}
private def readSingleXLS(path: Path, fs: FileSystem, sqlContext: SQLContext): DataFrame = {
val inputStream = fs.open(path)
val fsPOI = new POIFSFileSystem(inputStream)
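// Every parsed row is buffered here on the driver and only parallelized at the end.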
val rowsBuffer = ArrayBuffer[ArrayBuffer[String]]()
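// Holds the workbook's single SSTRecord, i.e. the full deduplicated string table.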
var sstRecord: SSTRecord = null
var headers: ArrayBuffer[String] = ArrayBuffer()
var currentRow = ArrayBuffer[String]()
var currentRowNum = -1
val listener = new HSSFListener {
val formatter = new HSSFDataFormatter() // currently unused; numeric cells are emitted as raw values below
override def processRecord(record: Record): Unit = {
record match {
case sst: SSTRecord =>
sstRecord = sst
case label: LabelSSTRecord =>
val value = sstRecord.getString(label.getSSTIndex).toString
ensureSize(currentRow, label.getColumn + 1, "")
currentRow(label.getColumn) = value
currentRowNum = label.getRow
case number: NumberRecord =>
val value = number.getValue.toString
ensureSize(currentRow, number.getColumn + 1, "")
currentRow(number.getColumn) = value
currentRowNum = number.getRow
case _: LastCellOfRowDummyRecord =>
if (currentRow.nonEmpty) {
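// Treat the first row (row index 0) as the header; every later row is data.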
if (currentRowNum == 0 && headers.isEmpty) {
headers = currentRow.clone()
} else {
rowsBuffer += currentRow.clone()
}
}
currentRow.clear()
currentRowNum = -1
case _ =>
}
}
def ensureSize(buffer: ArrayBuffer[String], size: Int, default: String): Unit = {
while (buffer.size < size) {
buffer += default
}
}
}
val factory = new HSSFEventFactory()
val request = new HSSFRequest()
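// MissingRecordAwareHSSFListener synthesizes dummy records (e.g. LastCellOfRowDummyRecord)
// for missing cells and rows; FormatTrackingHSSFListener keeps track of number formats.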
val listener1 = new MissingRecordAwareHSSFListener(listener)
val listener2 = new FormatTrackingHSSFListener(listener1)
request.addListenerForAllRecords(listener2)
factory.processWorkbookEvents(request, fsPOI)
// Release the POIFS and the underlying HDFS stream once the workbook has been processed.
fsPOI.close()
inputStream.close()
val schema = StructType(headers.map(name => StructField(name, StringType, nullable = true)))
// Pad short rows so that every Row carries exactly as many fields as the schema.
val numCols = headers.size
val rows = rowsBuffer.map(r => Row.fromSeq(r.padTo(numCols, "")))
sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(rows), schema)
}
}
package cn.keytop.source.xls
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}
import java.io.Serializable
/**
* @author: 王建成
* @since: 2025/4/18 13:46
* @description: requirement: implement a Spark source plugin for reading large .xls files
*/
class DefaultSource extends RelationProvider with DataSourceRegister with Serializable {
override def shortName(): String = "xls"
override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
val path = parameters.getOrElse("path", throw new IllegalArgumentException("Missing path"))
val reader = new XLSReader()
val df = reader.read(path, sqlContext)
// Capture the outer SQLContext in a val: inside the anonymous relation the name
// `sqlContext` would resolve to the member being overridden and recurse forever.
val outerSqlContext = sqlContext
new BaseRelation with TableScan {
  override def sqlContext: SQLContext = outerSqlContext
override def schema: StructType = df.schema
override def buildScan(): RDD[Row] = df.rdd
}
}
}
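For spark.read.format("xls") to resolve the short name, the provider class cn.keytop.source.xls.DefaultSource also has to be listed in a META-INF/services/org.apache.spark.sql.sources.DataSourceRegister file on the classpath (Spark's standard DataSourceRegister lookup). A usage sketch, assuming a SparkSession named spark and a placeholder path:

// Works for small .xls files; large ones still OOM for the SSTRecord reason described above.
val df = spark.read
  .format("xls")                         // resolved via DataSourceRegister / shortName()
  .option("path", "/data/report.xls")    // placeholder path
  .load()
df.printSchema()
df.show(10, truncate = false)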