什么是SAX?
SAX(simple API for XML)是一种XML解析的替代方法。相比于DOM,SAX是一种速度更快,更有效的方法。它逐行扫描文档,一边扫描一边解析。
java导入excel表格数据方法:
java提供了对excel对象进行操作的api,即POI。POI提供API给Java程序对Microsoft Office格式档案读和写的功能。
对于excel文件,其中
HSSF提供读写Microsoft Excel XLS格式档案的功能。
XSSF提供读写Microsoft Excel OOXML XLSX格式档案的功能。
当Excel数据量比较小的时候可以直接通过poi导入Excel。
但是当数据量过大时,poi生成WorkBook过程中会直接造成超内存,这时候可以通过sax解析Excel的xml格式。
sax是如何解析Excel的xml文件?
首先将Excel文件后缀名改成.zip后,可以看到文件目录结构,
打开目录:\xl\worksheets下的sheet1.xml
.xml文件内容如下:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:xdr="http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing"
xmlns:x14="http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:etc="http://www.wps.cn/officeDocument/2017/etCustomData">
<sheetPr/>
<dimension ref="A1:AL1"/>
<sheetViews>
<sheetView tabSelected="1" topLeftCell="S1" workbookViewId="0">
<selection activeCell="AI17" sqref="AI17"/>
</sheetView>
</sheetViews>
<sheetFormatPr defaultColWidth="9" defaultRowHeight="13.5"/>
<sheetData>
<row r="1" spans="1:38">
<c r="A1" s="1" t="s">
<v>0</v>
</c>
<c r="B1" s="1" t="s">
<v>1</v>
</c>
<c r="C1" s="1" t="s">
<v>2</v>
</c>
</row>
</sheetData>
<pageMargins left="0.75" right="0.75" top="1" bottom="1" header="0.511805555555556" footer="0.511805555555556"/>
<headerFooter/>
</worksheet>
<row>标签对应每一行数据;
<c>标签对应每一格数据,r="A1"表示位置;
<v>对应值
xml文件是由一系列节点构成,如<sheetData></sheetData>是一组节点,sax就是通过分析这一系列节点完成解析。sax主要通过继承DefaultHandle类,重写其中的方法,如下几个重要的方法:
public void startDocument () {
//开始解析文档
}
public void endDocument () {
//文档解析结束
}
public void startElement (String uri, String localName, String qName, Attributes attributes) {
//开始解析节点 如<>
}
public void characters (char[] ch, int start, int length) {
//保存节点内容 在调用startElement 方法后会调用
}
public void endElement (String uri, String localName, String qName) {
//结束解析节点 如遇到</>
}
sax解析excel文件代码如下:
package demo.readExcel;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public class ReadExcelUtils {
private int headCount = 1;
private List<String> head = new ArrayList<String>();
private List<List<String>> rowLists = new ArrayList<List<String>>();
public String excelPath;
public ReadExcelUtils(String excelPath) {
this.excelPath = excelPath;
try {
processSAXReadSheet();
} catch (IOException e) {
e.printStackTrace();
} catch (OpenXML4JException e) {
e.printStackTrace();
} catch (SAXException e) {
e.printStackTrace();
}
}
// 返回表头信息
public List<String> getHead() {
return head;
}
// 返回数据
public List<List<String>> getList() {
return rowLists;
}
public void processSAXReadSheet() throws IOException, OpenXML4JException, SAXException {
OPCPackage pkg = OPCPackage.open(excelPath);
XSSFReader xssfReader = new XSSFReader(pkg);
SharedStringsTable sst = xssfReader.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = xssfReader.getSheetsData();
//循环读取sheets
while (sheets.hasNext()) {
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}
}
private XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
// 利用XMLReaderFactory工厂类,创建XMLReader对象。
// System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");
XMLReader parser = XMLReaderFactory.createXMLReader();
// XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
ContentHandler handler = new SheetHandler(sst);
parser.setContentHandler(handler);
return parser;
}
/**
* SAX 解析excel
*/
private class SheetHandler extends DefaultHandler {
private SharedStringsTable sst;
private String cellContent;
private boolean isNewRow;
private boolean isString;
private int rowIndex = 0;
private List<String> rowContent = new ArrayList<String>();
// cell位置,如A8
private String preRef = null;
private String ref = null;
private String maxRef = null;
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
}
/*
* cell为空的两种情况: 1. cell中原来有数据,把数据清空后,cell为空,xml为:<c r="B2" />
*
* 2.cell原本就为空,xml为:不存在此节点 , 如下不存在<c r="B1"></c>节点 <c r="A1" s="1"
* t="s"> <v>0</v> </c> <c r="C1" s="1" t="s"> <v>2</v> </c>
*/
public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
if (name.equals("row")) {
rowIndex++;
isNewRow = true;
}
// c : cell
else if (name.equals("c")) {
if (isNewRow == true) {
preRef = ""+(char)('A' - 1) + (rowLists.size() + 1);
//preRef = attributes.getValue("r");
} else {
preRef = ref;
}
ref = attributes.getValue("r");
String cellType = attributes.getValue("t");
if (cellType == null) {
isString = false;
} else {
isString = true;
}
// 清空cellContent
cellContent = "";
}
}
public void characters(char[] ch, int start, int length) throws SAXException {
cellContent += new String(ch, start, length);
}
public void endElement(String uri, String localName, String name) throws SAXException {
if (name.equals("row")) {
if (rowIndex == headCount) {
head = rowContent;
maxRef = ref;
} else if (rowIndex > headCount) {
if (rowContent != null) {
// 处理空单元格
while ((maxRef.charAt(0) - ref.charAt(0)) > 0) {
rowContent.add(null);
ref = (char) (ref.charAt(0) + 1) + ref.substring(1, ref.length());
}
rowLists.add(rowContent);
}
}
rowContent = null;
} else if (name.equals("c")) {
//新的row
if (isNewRow == true) {
rowContent = new ArrayList<String>();
isNewRow = false;
}
// 处理空单元格
while ((ref.charAt(0) - preRef.charAt(0)) > 1) {
rowContent.add(null);
preRef = (char) (preRef.charAt(0) + 1) + preRef.substring(1, preRef.length());
}
//cellContent为String
if (isString) {
int idx = Integer.parseInt(cellContent);
cellContent = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
} else { ///cellContent为int
// cell为空的第一种情况
if (cellContent.equals("")) {
cellContent = null;
} else {
}
}
rowContent.add(cellContent);
}
}
}
}
在整个过程中,我遇到了以下几个问题:
第一:cell为空的两种情况: 1. cell中原来有数据,把数据清空后,cell为空,xml为:<c r="B2" />
2.cell原本就为空,xml为:不存在此节点 , 如下不存在<c r="B1"></c>节点
<c r="A1" s="1" t="s">
<v>0</v>
</c>
<c r="C1" s="1" t="s">
<v>2</v>
</c>