在一次意外中,鼠标在一个excel文件上,点击了右键
发现这货竟然是一个压缩文件,解压后发现包含以下文件(仅限参考)
里面大多数都是xml格式(有点惊喜)。下图为excel文件使用excel打开时的视图,以下称“界面视图”
我们再回到解压的出来的文件中,看一下以下几个文件
/xl/workbook.xml
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main" xmlns:xr="http://schemas.microsoft.com/office/spreadsheetml/2014/revision" xmlns:xr6="http://schemas.microsoft.com/office/spreadsheetml/2016/revision6" xmlns:xr10="http://schemas.microsoft.com/office/spreadsheetml/2016/revision10" xmlns:xr2="http://schemas.microsoft.com/office/spreadsheetml/2015/revision2" mc:Ignorable="x15 xr xr6 xr10 xr2">
<fileVersion appName="xl" lastEdited="7" lowestEdited="7" rupBuild="24827"/>
<workbookPr defaultThemeVersion="166925"/>
<mc:AlternateContent xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">
<mc:Choice Requires="x15">
<x15ac:absPath xmlns:x15ac="http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac" url="C:\Users\86158\Desktop\"/>
</mc:Choice>
</mc:AlternateContent>
<xr:revisionPtr revIDLastSave="0" documentId="8_{86FF098E-6026-4F90-B3D3-D884D85F062F}" xr6:coauthVersionLast="47" xr6:coauthVersionMax="47" xr10:uidLastSave="{00000000-0000-0000-0000-000000000000}"/>
<bookViews>
<workbookView xWindow="368" yWindow="368" windowWidth="14399" windowHeight="8272" xr2:uid="{BD78F4D0-AB62-4C9E-8966-BD5A07F42F49}"/>
</bookViews>
<sheets>
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
</sheets>
<calcPr calcId="191029"/>
<extLst>
<ext xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main" uri="{140A7094-0E35-4892-8432-C4D2E57EDEB5}">
<x15:workbookPr chartTrackingRefBase="1"/>
</ext>
<ext xmlns:xcalcf="http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures" uri="{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}">
<xcalcf:calcFeatures>
<xcalcf:feature name="microsoft.com:RD"/>
<xcalcf:feature name="microsoft.com:FV"/>
<xcalcf:feature name="microsoft.com:LET_WF"/>
<xcalcf:feature name="microsoft.com:LAMBDA_WF"/>
</xcalcf:calcFeatures>
</ext>
</extLst>
</workbook>
/xl/_rels/workbook.xml.rels
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme" Target="theme/theme1.xml"/>
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
<Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" Target="sharedStrings.xml"/>
</Relationships>
/xl/sharedStrings.xml
<sst
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
count="5" uniqueCount="5">
<si>
<t>name</t>
<phoneticPr fontId="1" type="noConversion" />
</si>
<si>
<t>age</t>
<phoneticPr fontId="1" type="noConversion" />
</si>
<si>
<t>张三</t>
<phoneticPr fontId="1" type="noConversion" />
</si>
<si>
<t>李四</t>
<phoneticPr fontId="1" type="noConversion" />
</si>
<si>
<t>王五</t>
<phoneticPr fontId="1" type="noConversion" />
</si>
</sst>
/xl/worksheets/sheet1.xml
<worksheet
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"
xmlns:xr="http://schemas.microsoft.com/office/spreadsheetml/2014/revision"
xmlns:xr2="http://schemas.microsoft.com/office/spreadsheetml/2015/revision2"
xmlns:xr3="http://schemas.microsoft.com/office/spreadsheetml/2016/revision3"
mc:Ignorable="x14ac xr xr2 xr3"
xr:uid="{F1B0648D-2180-4D99-96E4-2E5516BEAD45}">
<dimension ref="A1:B4" />
<sheetViews>
<sheetView tabSelected="1" workbookViewId="0">
<selection activeCell="B4" sqref="B4" />
</sheetView>
</sheetViews>
<sheetFormatPr defaultRowHeight="13.9"
x14ac:dyDescent="0.4" />
<sheetData>
<row r="1" spans="1:2" x14ac:dyDescent="0.4">
<c r="A1" t="s">
<v>0</v>
</c>
<c r="B1" t="s">
<v>1</v>
</c>
</row>
<row r="2" spans="1:2" x14ac:dyDescent="0.4">
<c r="A2" t="s">
<v>2</v>
</c>
<c r="B2">
<v>23</v>
</c>
</row>
<row r="3" spans="1:2" x14ac:dyDescent="0.4">
<c r="A3" t="s">
<v>3</v>
</c>
<c r="B3">
<v>21</v>
</c>
</row>
<row r="4" spans="1:2" x14ac:dyDescent="0.4">
<c r="A4" t="s">
<v>4</v>
</c>
<c r="B4">
<v>12</v>
</c>
</row>
</sheetData>
<phoneticPr fontId="1" type="noConversion" />
<pageMargins left="0.7" right="0.7" top="0.75"
bottom="0.75" header="0.3" footer="0.3" />
<pageSetup paperSize="9" orientation="portrait" r:id="rId1" />
</worksheet>
从上面文件中提取部分片段:
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
<sheetData>
<row r="1" spans="1:2" x14ac:dyDescent="0.4">
<c r="A1" t="s">
<v>0</v>
</c>
<c r="B1" t="s">
<v>1</v>
</c>
</row>
<row r="2" spans="1:2" x14ac:dyDescent="0.4">
<c r="A2" t="s">
<v>2</v>
</c>
<c r="B2">
<v>23</v>
</c>
</row>
<row r="3" spans="1:2" x14ac:dyDescent="0.4">
<c r="A3" t="s">
<v>3</v>
</c>
<c r="B3">
<v>21</v>
</c>
</row>
<row r="4" spans="1:2" x14ac:dyDescent="0.4">
<c r="A4" t="s">
<v>4</v>
</c>
<c r="B4">
<v>12</v>
</c>
</row>
</sheetData>
从上面先下几个假设:
1. 从sheet的名字(Sheet1)可以找到存储了表格数据的文件(/xl/worksheets/sheet1.xml)
2. /xl/worksheets/sheet1.xml 文件中的row 是行,c 是 column(列), v是值, c中的 t="s"描述的是字符串类型,则v代表的是序号
3. 字符串类型的值,都存在于 /xl/sharedStrings.xml 文件中。
如果以上假设成立,那么我们现在对excel的读取就变成了在有限的几个xml文件中,读取我们需要的数据。
步骤如下:
1. 读取 /xl/workbook.xml 文件,得到sheet名与rid的关系
2. 读取 /xl/_rels/workbook.xml.rels 文件,得到rid与sheet.xml的文件路径的关系
3. 读取/xl/sharedStrings.xml文件,得到字符串与序号的关系
4. 读取/xl/worksheets/sheet*.xml文件,得到行数据,当读到一个新的行,则创建新的数据容器(示例中为列表,读者可尝试使用类反射相关内容,实现把行数据读到实体中)并放入待返回的数据集中,然后读取行中的列,填充到数据容器对应的位置,如果列的类型为字符串,即有属性t="s",则把序号进行转换得到真实值,然后放到数据容器中。
代码如下:
package org.yafox.excel;
import java.io.File;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
/**
* @author lxq
*
*/
public class Reader {
/**
* 对应 xl/sharedStrings.xml 解释每个sst/si/t的值,并按顺序放入
*
* @param zipFile
* @param saxParser
* @return
* @throws Exception
*/
private List<String> readSharedStrings(ZipFile zipFile, SAXParser saxParser) throws Exception {
ZipEntry entry = zipFile.getEntry("xl/sharedStrings.xml");
ShareStringsHandler dh = new ShareStringsHandler();
saxParser.parse(zipFile.getInputStream(entry), dh);
return dh.getSharedStrings();
}
/**
* 对应 xl/_rels/workbook.xml.rels 结构 key对应为Id value对应为Target
*
* @param zipFile
* @param saxParser
* @return
* @throws Exception
*/
private Map<String, String> readRelationship(ZipFile zipFile, SAXParser saxParser) throws Exception {
ZipEntry entry = zipFile.getEntry("xl/_rels/workbook.xml.rels");
RelationshipHandler dh = new RelationshipHandler();
saxParser.parse(zipFile.getInputStream(entry), dh);
return dh.getRelationship();
}
/**
* 读取 workbook.xml workbook/sheets/sheet
*
* @param zipFile
* @param saxParser
* @return
* @throws Exception
*/
private Map<String, String> readNameRidMapping(ZipFile zipFile, SAXParser saxParser) throws Exception {
ZipEntry entry = zipFile.getEntry("xl/workbook.xml");
SheetHandler dh = new SheetHandler();
saxParser.parse(zipFile.getInputStream(entry), dh);
return dh.getNameRidMapping();
}
public List<List<String>> readData(File excelFile, String sheetName) throws Exception {
ZipFile zipFile = null;
try {
zipFile = new ZipFile(excelFile);
SAXParserFactory saxFactory = SAXParserFactory.newInstance();
SAXParser saxParser = saxFactory.newSAXParser();
Map<String, String> nameRidMapping = readNameRidMapping(zipFile, saxParser);
Map<String, String> relationship = readRelationship(zipFile, saxParser);
List<String> sharedStrings = readSharedStrings(zipFile, saxParser);
String sheetId = nameRidMapping.get(sheetName);
String path = relationship.get(sheetId);
ZipEntry entry = zipFile.getEntry("xl/" + path);
RowHandler dh = new RowHandler(sharedStrings);
saxParser.parse(zipFile.getInputStream(entry), dh);
return dh.getDatas();
} finally {
zipFile.close();
}
}
}
package org.yafox.excel;
import java.util.HashMap;
import java.util.Map;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* @author lxq
*
*/
public class RelationshipHandler extends DefaultHandler {
/**
* 对应 xl/_rels/workbook.xml.rels 结构
* key对应为Id
* value对应为Target
*/
private Map<String, String> relationship = new HashMap<String, String>();
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (qName.equalsIgnoreCase("Relationship")) {
String id = attributes.getValue("Id");
String target = attributes.getValue("Target");
relationship.put(id, target);
}
}
public Map<String, String> getRelationship() {
return relationship;
}
}
package org.yafox.excel;
import java.util.ArrayList;
import java.util.List;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* @author lxq
*
*/
public class ShareStringsHandler extends DefaultHandler {
/**
* 对应 xl/sharedStrings.xml
* 解释每个sst/si/t的值,并按顺序放入
*/
private List<String> sharedStrings = new ArrayList<String>();
private boolean capture = false;
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (qName.equalsIgnoreCase("t")) {
this.capture = true;
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if (qName.equalsIgnoreCase("t")) {
this.capture = false;
}
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
// 如果处于捕获状态,则取t里的值
if (this.capture) {
String str = new String(ch, start, length);
this.sharedStrings.add(str.trim());
}
}
public List<String> getSharedStrings() {
return sharedStrings;
}
}
package org.yafox.excel;
import java.util.HashMap;
import java.util.Map;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* @author lxq
*
*/
public class SheetHandler extends DefaultHandler {
private Map<String, String> nameRidMapping = new HashMap<String, String>();
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if ("sheet".equals(qName)) {
String name = attributes.getValue("name");
String rid = attributes.getValue("r:id");
nameRidMapping.put(name, rid);
}
}
public Map<String, String> getNameRidMapping() {
return nameRidMapping;
}
}
package org.yafox.excel;
import java.util.ArrayList;
import java.util.List;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* @author lxq
*
*/
public class RowHandler extends DefaultHandler {
private List<String> sharedStrings = new ArrayList<String>();
private List<List<String>> datas = new ArrayList<List<String>>();
private List<String> rowData = null;
private boolean needTranslate = false;
private List<String> paths = new ArrayList<String>();
public RowHandler(List<String> sharedStrings) {
super();
this.sharedStrings = sharedStrings;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
this.paths.add(qName);
if (paths.size() == 3 && paths.get(0).equals("worksheet") && paths.get(1).equals("sheetData") && paths.get(2).equals("row")) {
rowData = new ArrayList<String>();
this.datas.add(rowData);
} else if (paths.size() == 4 && paths.get(0).equals("worksheet") && paths.get(1).equals("sheetData") && paths.get(2).equals("row") && paths.get(3).equals("c")) {
this.needTranslate = "s".equals(attributes.getValue("t"));
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
this.paths.remove(paths.size() - 1);
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (capturable()) {
String value = new String(ch, start, length).trim();
if (this.needTranslate) {
int idx = Integer.parseInt(value);
value = this.sharedStrings.get(idx);
}
rowData.add(value);
}
}
private boolean capturable() {
if (paths.size() != 5) {
return false;
}
return paths.get(0).equals("worksheet") && paths.get(1).equals("sheetData") && paths.get(2).equals("row") && paths.get(3).equals("c") && paths.get(4).equals("v");
}
public List<List<String>> getDatas() {
return datas;
}
}
以上文件格式是 .xlsx文件的结构,至于.xls的,应该不是上面的存储格式。
最近,留给大家思考一个问题,如何做一个高效的excel导出工具或类库呢?