点击上方“阿拉奇学Java”,选择“置顶或者星标”
优质文章第一时间送达!
链接: www.cnblogs.com/rongdi/p/11872810.html
推荐阅读 | Java 的 JSP 已经被淘汰了吗?
推荐阅读 | 知乎高赞:本科生如何才能进入腾讯、阿里等一流互联网大厂?
package com.example.utils;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.File;
import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Streaming reader for large {@code .xlsx} workbooks, built on POI's XSSF event API and SAX.
 *
 * <p>Each sheet is parsed as XML. Cell values are collected into one list per row and handed to
 * {@link #optRows(int, int, List, List)} when the closing {@code row} tag is reached. Parsing
 * state lives in instance fields, so one instance must not parse on several threads at once.
 */
public abstract class BigDataParseExcelUtil extends DefaultHandler {
    /** SAX features switched off on the sheet parser so external entities are not resolved. */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
    };

    /** Shared strings table of the workbook that is currently being parsed. */
    private ReadOnlySharedStringsTable sst;
    /** Text collected by {@link #characters(char[], int, int)} since the last opening tag. */
    private String lastContents = "";
    /** True while the current cell's {@code v} element holds a shared-strings index. */
    private boolean nextIsString;
    /** Zero-based counter of the sheets parsed so far by this instance. */
    private int sheetIndex = -1;
    /** Values of the row that is currently being read. */
    private final List<String> rowlist = new ArrayList<String>();
    /** Current row counter; incremented after each closing {@code row} tag. */
    private int curRow = 0;
    /** One-based column index of the current cell. */
    private int curCol = 0;
    /** One-based column index of the previous cell that carried a value. */
    private int preCol = 0;
    /** Row treated as the title row (normally 0); its width defines {@link #rowsize}. */
    private int titleRow = 0;
    /** Number of columns in the title row; later rows are padded to this width. */
    private int rowsize = 0;
    /** Accumulator passed to every {@link #optRows} call; this class never fills it itself. */
    private final List<Object> excelList = new ArrayList<Object>();

    /**
     * Callback invoked once per row, after the closing {@code row} tag.
     *
     * <p>An exception thrown by the implementation is printed and parsing continues with the
     * next row.
     *
     * @param sheetIndex zero-based counter of the sheet being parsed
     * @param curRow     row counter within the sheet
     * @param rowlist    cell values of the row; the list is cleared and reused after the call
     * @param excelList  list shared across all calls on this instance
     */
    public abstract void optRows(int sheetIndex, int curRow,
            List<String> rowlist, List excelList) throws SQLException, Exception;

    /**
     * Parses a single sheet of the workbook stored at {@code filename}.
     *
     * @param filename path of the workbook
     * @param sheetId  number appended to {@code "rId"} to form the relationship id of the sheet
     * @throws Exception if the workbook cannot be opened or parsed
     */
    public void processOneSheet(String filename, int sheetId) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            XSSFReader reader = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(new ReadOnlySharedStringsTable(pkg));
            try (InputStream sheet = reader.getSheet("rId" + sheetId)) {
                // Restart row counting, as the process(...) overloads do for every sheet.
                curRow = 0;
                sheetIndex++;
                parser.parse(new InputSource(sheet));
            }
        } finally {
            // revert() releases the package without writing anything back to the file.
            pkg.revert();
        }
    }

    /** Appends the reported characters to the text of the element being read. */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        lastContents += new String(ch, start, length);
    }

    /**
     * Parses every sheet of the workbook read from {@code inputStream}.
     *
     * @param inputStream workbook content
     * @throws Exception if the workbook cannot be opened or parsed
     */
    public void process(InputStream inputStream) throws Exception {
        parseAllSheets(OPCPackage.open(inputStream));
    }

    /**
     * Parses every sheet of the workbook stored in {@code file}.
     *
     * @param file workbook file
     * @throws Exception if the workbook cannot be opened or parsed
     */
    public void process(File file) throws Exception {
        parseAllSheets(OPCPackage.open(file));
    }

    /** Feeds all sheets of {@code pkg} to the SAX parser, then releases the package. */
    private void parseAllSheets(OPCPackage pkg) throws Exception {
        try {
            XSSFReader reader = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(new ReadOnlySharedStringsTable(pkg));
            Iterator<InputStream> sheets = reader.getSheetsData();
            while (sheets.hasNext()) {
                curRow = 0;
                sheetIndex++;
                try (InputStream sheet = sheets.next()) {
                    parser.parse(new InputSource(sheet));
                }
            }
        } finally {
            // revert() releases the package without writing anything back to its source.
            pkg.revert();
        }
    }

    /**
     * Creates an XML reader that reports sheet events to this handler.
     *
     * @param sst shared strings table used to resolve string cells
     * @return the configured reader
     * @throws SAXException if no reader can be created
     */
    public XMLReader fetchSheetParser(ReadOnlySharedStringsTable sst)
            throws SAXException {
        XMLReader parser = XMLReaderFactory.createXMLReader();
        disableExternalEntities(parser);
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    /** Best-effort hardening: workbooks may come from untrusted uploads. */
    private static void disableExternalEntities(XMLReader parser) {
        for (String feature : EXTERNAL_ENTITY_FEATURES) {
            try {
                parser.setFeature(feature, false);
            } catch (SAXException ignored) {
                // The parser does not let this feature be changed; keep its default.
            }
        }
    }

    /** Records column and type of each cell ({@code c}) and resets the text buffer. */
    @Override
    public void startElement(String uri, String localName, String name,
            Attributes attributes) throws SAXException {
        if ("c".equals(name)) {
            String reference = attributes.getValue("r");
            // Without a cell reference, treat the cell as following the previous one.
            curCol = (reference == null) ? curCol + 1 : getRowIndex(reference);
            // Type "s" means the v element carries an index into the shared strings table.
            nextIsString = "s".equals(attributes.getValue("t"));
        }
        lastContents = "";
    }

    /**
     * Stores a cell value when {@code v} closes and dispatches the row when {@code row} closes.
     */
    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException {
        if ("v".equals(name)) {
            String value = nextIsString ? resolveSharedString(lastContents) : lastContents;
            nextIsString = false;
            value = value.trim();
            if (value.isEmpty()) {
                value = " ";
            }
            // Insert blanks for the columns skipped since the previous value.
            int gap = curCol - preCol;
            for (int i = 1; i < gap; i++) {
                rowlist.add(preCol, "");
            }
            preCol = curCol;
            rowlist.add(curCol - 1, value);
        } else if ("row".equals(name)) {
            // Rows after the title row are padded to the title row's width.
            if (curRow > titleRow) {
                while (rowlist.size() < rowsize) {
                    rowlist.add("");
                }
            }
            try {
                optRows(sheetIndex, curRow, rowlist, excelList);
            } catch (Exception e) {
                // Kept as best-effort: a failing row is reported and the next row is read.
                e.printStackTrace();
            }
            if (curRow == titleRow) {
                rowsize = rowlist.size();
            }
            rowlist.clear();
            curRow++;
            curCol = 0;
            preCol = 0;
        }
    }

    /**
     * Looks up the shared string whose index is {@code rawIndex}.
     *
     * @return the shared string, or {@code rawIndex} itself when it cannot be resolved
     */
    private String resolveSharedString(String rawIndex) {
        try {
            int idx = Integer.parseInt(rawIndex.trim());
            String text = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
            return (text == null) ? rawIndex : text;
        } catch (RuntimeException ignored) {
            // Not a usable index: keep the raw cell text rather than aborting the import.
            return rawIndex;
        }
    }

    /**
     * Converts the column letters of a cell reference into a one-based column index.
     *
     * <p>Only the characters {@code A}-{@code Z} are used; for example {@code "AB45"} yields
     * {@code 1 * 26 + 2 = 28}.
     *
     * @param rowStr cell reference such as {@code "AB45"}
     * @return the one-based column index, or 0 when {@code rowStr} is null or has no letters
     */
    public int getRowIndex(String rowStr) {
        if (rowStr == null) {
            return 0;
        }
        int index = 0;
        for (int i = 0; i < rowStr.length(); i++) {
            char ch = rowStr.charAt(i);
            if (ch >= 'A' && ch <= 'Z') {
                index = index * 26 + (ch - 'A' + 1);
            }
        }
        return index;
    }
}
package com.example.service;
import com.example.utils.BigDataParseExcelUtil;
import org.springframework.stereotype.Service;
import java.io.InputStream;
import java.sql.SQLException;
import java.util.List;
/**
 * Service that runs an uploaded workbook through {@link BigDataParseExcelUtil}.
 *
 * @author rongdi
 */
@Service
public class ExcelService {
    /**
     * Parses the workbook read from the stream and prints every row to standard output.
     *
     * @param inputStream workbook content
     * @throws Exception if the parser fails
     */
    public void import1(InputStream inputStream) throws Exception {
        new RowPrinter().process(inputStream);
    }

    /** Row handler that prints each row's values. */
    private static final class RowPrinter extends BigDataParseExcelUtil {
        @Override
        public void optRows(int sheetIndex, int curRow, List<String> rowlist, List excelList)
                throws SQLException {
            System.out.println(rowlist);
        }
    }
}
package com.example.controller;
import com.example.service.ExcelService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.multipart.MultipartFile;
/**
 * Web endpoint for Excel imports.
 *
 * @author rongdi
 */
@Controller
public class ExcelController {
    @Autowired
    private ExcelService excelService;

    /**
     * Handles {@code /excel/import1}: hands the upload's stream to the service and answers
     * with the literal body {@code "ok"}.
     *
     * @param multipartFile upload bound to the request parameter {@code file}
     * @return {@code "ok"} once the service call has returned
     * @throws Exception if the upload cannot be opened or the import fails
     */
    @RequestMapping("/excel/import1")
    @ResponseBody
    public String import1(@RequestParam("file") MultipartFile multipartFile) throws Exception {
        // The stream is opened here, so it is closed here as well, even when the import fails.
        try (java.io.InputStream upload = multipartFile.getInputStream()) {
            excelService.import1(upload);
        }
        return "ok";
    }
}
使用postman等工具,导入上面说的20M的文件22.xlsx,报错如下:
那我们优化一下不使用inputStream,直接使用一个File传入看看
/**
 * Parses the workbook stored in {@code file} and prints every row to standard output.
 *
 * @param file workbook file
 * @throws Exception if the parser fails
 */
public void import2(File file) throws Exception {
    /** Row handler that prints each row's values. */
    class RowPrinter extends BigDataParseExcelUtil {
        @Override
        public void optRows(int sheetIndex, int curRow, List<String> rowlist, List excelList)
                throws SQLException {
            System.out.println(rowlist);
        }
    }
    new RowPrinter().process(file);
}
/**
 * Handles {@code /excel/import2}: copies the upload into a temporary file, imports from that
 * file and removes it afterwards.
 *
 * @param multipartFile upload bound to the request parameter {@code file}
 * @return {@code "ok"} once the import has returned
 * @throws Exception if the upload cannot be copied or the import fails
 */
@RequestMapping("/excel/import2")
@ResponseBody
public String import2(@RequestParam("file") MultipartFile multipartFile) throws Exception {
    // NOTE(review): this is a static, JVM-wide POI setting; a negative minimum inflate ratio
    // presumably disables the zip-bomb check for every workbook — confirm this is acceptable
    // for untrusted uploads.
    ZipSecureFile.setMinInflateRatio(-1.0d);
    File tmp = Files.createTempFile("tmp-", ".xlsx").toFile();
    try {
        try (java.io.InputStream upload = multipartFile.getInputStream()) {
            Files.copy(upload, tmp.toPath(), StandardCopyOption.REPLACE_EXISTING);
        }
        excelService.import2(tmp);
    } finally {
        // Best-effort cleanup: if the file is still held open, leave it to JVM exit.
        if (!tmp.delete()) {
            tmp.deleteOnExit();
        }
    }
    return "ok";
}
我们是不是可以直接把往strings里添加字符串和获取字符串的那两处方法替换掉,不再使用strings这个集合存储所有字符串?但是excel的设计是使用一个sharedStrings.xml存放公共的字符串,而不是像csv格式那样,每次读一行取一行数据就好了。那么这个sharedStrings.xml中的数据总要解析出来,总要有个地方存储里面的数据,不然怎么结合sheet.xml的格式获取到每一行的数据呢?所以这里就很尴尬了:总不能解析sharedStrings.xml时不保存,等到每次需要获取字符串的时候再去重新解析一遍这个xml吧?以本文的xml来看,要重复解析25W次,效率极低。现在问题可以简化成:我们需要把sharedStrings.xml解析出的所有字符串放在一个地方,还要方便读取,由于怕内存溢出,肯定不能全部放在内存中。那么这里就有一些选择,比如把解析出的字符串按加入strings集合的顺序放入数据库、文件、外部存储或者缓存(限制内存大小,多余的写入文件)中,使用的时候再按照索引位置idx一一取出。本文先使用临时文件来存放这些数据,因为不想搞得那么复杂:导入任务不管在多复杂的系统中,最终执行的都会是一个单节点,在单节点中使用本机资源这种就近资源是最方便的。下面先直接复制源码,然后修改上述两个地方。
package com.example.utils;
import org.apache.poi.ooxml.util.SAXHelper;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.ss.usermodel.RichTextString;
import org.apache.poi.util.Removal;
import org.apache.poi.xssf.model.SharedStrings;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.io.PushbackInputStream;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
public class ReadOnlySharedStringsTable extends DefaultHandler implements SharedStrings {
protected final boolean includePhoneticRuns;
/**
* An integer representing the total count of strings in the workbook. This count does not
* include any numbers, it counts only the total of text strings in the workbook.
*/
protected int count;
/**
* An integer representing the total count of unique strings in the Shared String Table.
* A string is unique even if it is a copy of another string, but has different formatting applied
* at the character level.
*/
protected int uniqueCount;
/**
* The shared strings table.
*/
private List<String> strings;
private File tmp = null;
FileOutputStream fos = null;
private int counts;
private Map<Integer,String> map = new LinkedHashMap<Integer,String>();
public ReadOnlySharedStringsTable(OPCPackage pkg)
throws IOException, SAXException {
this(pkg, true);
}
public ReadOnlySharedStringsTable(OPCPackage pkg, boolean includePhoneticRuns)
throws IOException, SAXException {
this.includePhoneticRuns = includePhoneticRuns;
ArrayList<PackagePart> parts =
pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
// Some workbooks have no shared strings table.
if (parts.size() > 0) {
PackagePart sstPart = parts.get(0);
readFrom(sstPart.getInputStream());
}
}
/**
* Like POIXMLDocumentPart constructor
*
* Calls {@link #ReadOnlySharedStringsTable(PackagePart, boolean)}, with a
* value of <code>true</code> to include phonetic runs.
*
* @since POI 3.14-Beta1
*/
public ReadOnlySharedStringsTable(PackagePart part) throws IOException, SAXException {
this(part, true);
}
/**
* Like POIXMLDocumentPart constructor
*
* @since POI 3.14-Beta3
*/
public ReadOnlySharedStringsTable(PackagePart part, boolean includePhoneticRuns)
throws IOException, SAXException {
this.includePhoneticRuns = includePhoneticRuns;
readFrom(part.getInputStream());
}
/**
* Read this shared strings table from an XML file.
*
* @param is The input stream containing the XML document.
* @throws IOException if an error occurs while reading.
* @throws SAXException if parsing the XML data fails.
*/
public void readFrom(InputStream is) throws IOException, SAXException {
// test if the file is empty, otherwise parse it
PushbackInputStream pis = new PushbackInputStream(is, 1);
int emptyTest = pis.read();
if (emptyTest > -1) {
pis.unread(emptyTest);
InputSource sheetSource = new InputSource(pis);
try {
XMLReader sheetParser = SAXHelper.newXMLReader();
sheetParser.setContentHandler(this);
sheetParser.parse(sheetSource);
} catch(ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
}
/**
* Return an integer representing the total count of strings in the workbook. This count does not
* include any numbers, it counts only the total of text strings in the workbook.
*
* @return the total count of strings in the workbook
*/
@Override
public int getCount() {
return this.count;
}
/**
* Returns an integer representing the total count of unique strings in the Shared String Table.
* A string is unique even if it is a copy of another string, but has different formatting applied
* at the character level.
*
* @return the total count of unique strings in the workbook
*/
@Override
public int getUniqueCount() {
return this.uniqueCount;
}
/**
* Return the string at a given index.
* Formatting is ignored.
*
* @param idx index of item to return.
* @return the item at the specified position in this Shared String table.
* @deprecated use <code>getItemAt</code> instead
*/
@Removal(version = "4.2")
@Deprecated
public String getEntryAt(int idx) {
/**
* 这里就是修改部分了,直接从按行存储的临时文件读取需要的字符串
*/
String value = map.get(idx + 1);
if(value == null) {
return readString(idx,1000,this.uniqueCount);
} else {
return value;
}
}
/**
* 从指定位置读取size个字符串,这里是使用局部性原理,每次读取size个字符串,
* 以免每次需要读取文件,性能极低
* @return
*/
private String readString(int idx,int size,int numbers) {
map.clear();
int currNumber = idx + 1;
if (currNumber < 0 || currNumber > numbers) {
return null;
}
try {
FileReader in = new FileReader(tmp);
LineNumberReader reader = new LineNumberReader(in);
try {
String line = "";
for(int i = 1;i <= numbers;i ++) {
line = reader.readLine();
if(i >= currNumber && i < currNumber + size) {
map.put(i, line);
}
}
} finally {
reader.close();
in.close();
}
} catch (Exception e) {
System.out.println(e.getMessage());
}
return map.get(idx + 1);
}
/**
* Returns all the strings.
* Formatting is ignored.
*
* @return a list with all the strings
* @deprecated use <code>getItemAt</code> instead
*/
@Removal(version = "4.2")
@Deprecated
public List<String> getItems() {
return strings;
}
@Override
public RichTextString getItemAt(int idx) {
return new XSSFRichTextString(getEntryAt(idx));
}
ContentHandler methods
private StringBuilder characters;
private boolean tIsOpen;
private boolean inRPh;
@Override
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
if (uri != null && ! uri.equals(NS_SPREADSHEETML)) {
return;
}
if ("sst".equals(localName)) {
String count = attributes.getValue("count");
if(count != null) this.count = Integer.parseInt(count);
String uniqueCount = attributes.getValue("uniqueCount");
if(uniqueCount != null) this.uniqueCount = Integer.parseInt(uniqueCount);
try {
tmp = Files.createTempFile("tmp-", ".xlsx").toFile();
} catch (IOException e) {
e.printStackTrace();
}
// this.strings = new ArrayList<>(this.uniqueCount);
characters = new StringBuilder(64);
try {
fos = new FileOutputStream(tmp,true);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
} else if ("si".equals(localName)) {
characters.setLength(0);
} else if ("t".equals(localName)) {
tIsOpen = true;
} else if ("rPh".equals(localName)) {
inRPh = true;
//append space...this assumes that rPh always comes after regular <t>
if (includePhoneticRuns && characters.length() > 0) {
characters.append(" ");
}
}
}
@Override
public void endElement(String uri, String localName, String name) throws SAXException {
if (uri != null && ! uri.equals(NS_SPREADSHEETML)) {
return;
}
if ("si".equals(localName)) {
// strings.add(characters.toString().intern());
try {
/**
* 这里就是修改的一部分,这里直接把字符串按行存入临时文件
*/
counts ++;
fos.write((characters.toString() + "\n").getBytes());
if(counts == this.uniqueCount) {
fos.close();
}
} catch (IOException e) {
e.printStackTrace();
}
} else if ("t".equals(localName)) {
tIsOpen = false;
} else if ("rPh".equals(localName)) {
inRPh = false;
}
}
/**
* Captures characters only if a t(ext) element is open.
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (tIsOpen) {
if (inRPh && includePhoneticRuns) {
characters.append(ch, start, length);
} else if (! inRPh){
characters.append(ch, start, length);
}
}
}
}
package com.example.advanceevent;
import com.example.utils.FileUtils;
import org.ehcache.Cache;
import org.ehcache.CacheManager;
import org.ehcache.config.CacheConfiguration;
import org.ehcache.config.builders.CacheConfigurationBuilder;
import org.ehcache.config.builders.CacheManagerBuilder;
import org.ehcache.config.builders.ResourcePoolsBuilder;
import org.ehcache.config.units.MemoryUnit;
import org.ehcache.core.Ehcache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.HashMap;
import java.util.UUID;
/**
 * Two-level string store addressed by insertion index.
 *
 * <p>Strings are grouped into batches of {@link #BATCH_SIZE}. Full batches are written to a
 * disk-backed Ehcache; on lookup a batch is fetched from there and kept in a heap-backed
 * Ehcache whose size limit is given to the constructor. Both cache managers are static and
 * shared by all instances; each instance registers its caches under a random alias and must
 * call {@link #destroy()} to remove them again.
 *
 * @author rongdi
 */
public class ReadCache {
    private static final Logger LOGGER = LoggerFactory.getLogger(ReadCache.class);
    /** Number of strings stored under one cache key. */
    private static final int BATCH_SIZE = 1000;
    /** Initial map capacity chosen so that {@link #BATCH_SIZE} entries fit without resizing. */
    private static final int BATCH_CAPACITY = 1334;

    private int index = 0;
    private HashMap<Integer, String> dataMap = new HashMap<>(BATCH_CAPACITY);
    private static CacheManager fileCacheManager;
    private static CacheConfiguration<Integer, HashMap> fileCacheConfiguration;
    private static CacheManager activeCacheManager;
    private CacheConfiguration<Integer, HashMap> activeCacheConfiguration;
    private Cache<Integer, HashMap> fileCache;
    private Cache<Integer, HashMap> activeCache;
    private String cacheAlias;
    private int cacheMiss = 0;

    /**
     * @param maxCacheActivateSize heap limit of the in-memory cache, in megabytes
     */
    public ReadCache(int maxCacheActivateSize) {
        this.activeCacheConfiguration = CacheConfigurationBuilder
                .newCacheConfigurationBuilder(Integer.class, HashMap.class,
                        ResourcePoolsBuilder.newResourcePoolsBuilder()
                                .heap((long) maxCacheActivateSize, MemoryUnit.MB))
                .withSizeOfMaxObjectGraph(1000000L)
                .withSizeOfMaxObjectSize((long) maxCacheActivateSize, MemoryUnit.MB)
                .build();
        init();
    }

    /** Registers this instance's two caches under a fresh random alias. */
    private void init() {
        this.cacheAlias = UUID.randomUUID().toString();
        this.fileCache = fileCacheManager.createCache(this.cacheAlias, fileCacheConfiguration);
        this.activeCache =
                activeCacheManager.createCache(this.cacheAlias, this.activeCacheConfiguration);
    }

    /**
     * Appends a string; its index is the number of strings appended before it.
     *
     * @param value the string to store
     */
    public void put(String value) {
        this.dataMap.put(this.index, value);
        if ((this.index + 1) % BATCH_SIZE == 0) {
            // The batch is full: hand it to the disk cache and start a new one.
            this.fileCache.put(this.index / BATCH_SIZE, this.dataMap);
            this.dataMap = new HashMap<>(BATCH_CAPACITY);
        }
        ++this.index;
        if (LOGGER.isDebugEnabled() && this.index % 1000000 == 0) {
            LOGGER.debug("Already put :{}", this.index);
        }
    }

    /**
     * Returns the string stored at {@code key}.
     *
     * @param key index assigned by {@link #put(String)}
     * @return the string, or null when {@code key} is null, negative, or no batch holds it
     */
    @SuppressWarnings("unchecked")
    public String get(Integer key) {
        if (key == null || key < 0) {
            return null;
        }
        int route = key / BATCH_SIZE;
        HashMap<Integer, String> batch = (HashMap<Integer, String>) this.activeCache.get(route);
        if (batch == null) {
            batch = (HashMap<Integer, String>) this.fileCache.get(route);
            if (batch == null) {
                // No such batch: do not pass null on to the heap cache or dereference it.
                return null;
            }
            this.activeCache.put(route, batch);
            this.cacheMiss++;
            if (LOGGER.isDebugEnabled() && this.cacheMiss % 1000 == 1) {
                LOGGER.debug("Cache misses count:{}", this.cacheMiss);
            }
        }
        return batch.get(key);
    }

    /** Writes the last, partially filled batch to the disk cache. */
    public void putFinished() {
        if (this.dataMap != null && !this.dataMap.isEmpty()) {
            this.fileCache.put(this.index / BATCH_SIZE, this.dataMap);
        }
    }

    /** Removes this instance's caches from both shared cache managers. */
    public void destroy() {
        fileCacheManager.removeCache(this.cacheAlias);
        activeCacheManager.removeCache(this.cacheAlias);
    }

    static {
        File cacheFile = FileUtils.createCacheTmpFile();
        fileCacheManager = CacheManagerBuilder.newCacheManagerBuilder()
                .with(CacheManagerBuilder.persistence(cacheFile))
                .build(true);
        activeCacheManager = CacheManagerBuilder.newCacheManagerBuilder().build(true);
        fileCacheConfiguration = CacheConfigurationBuilder
                .newCacheConfigurationBuilder(Integer.class, HashMap.class,
                        ResourcePoolsBuilder.newResourcePoolsBuilder().disk(10L, MemoryUnit.GB))
                .withSizeOfMaxObjectGraph(1000000L)
                .withSizeOfMaxObjectSize(10L, MemoryUnit.GB)
                .build();
    }
}
package com.example.advanceevent;
import org.apache.poi.ooxml.util.SAXHelper;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.ss.usermodel.RichTextString;
import org.apache.poi.util.Removal;
import org.apache.poi.xssf.model.SharedStrings;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.ArrayList;
import java.util.List;
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
public class ReadOnlySharedStringsTable extends DefaultHandler implements SharedStrings {
protected final boolean includePhoneticRuns;
/**
* An integer representing the total count of strings in the workbook. This count does not
* include any numbers, it counts only the total of text strings in the workbook.
*/
protected int count;
/**
* An integer representing the total count of unique strings in the Shared String Table.
* A string is unique even if it is a copy of another string, but has different formatting applied
* at the character level.
*/
protected int uniqueCount;
/**
* 缓存
*/
ReadCache readCache = new ReadCache(100);
private int counts;
public ReadOnlySharedStringsTable(OPCPackage pkg)
throws IOException, SAXException {
this(pkg, true);
}
public ReadOnlySharedStringsTable(OPCPackage pkg, boolean includePhoneticRuns)
throws IOException, SAXException {
this.includePhoneticRuns = includePhoneticRuns;
ArrayList<PackagePart> parts =
pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
// Some workbooks have no shared strings table.
if (parts.size() > 0) {
PackagePart sstPart = parts.get(0);
readFrom(sstPart.getInputStream());
}
}
/**
* Like POIXMLDocumentPart constructor
*
* Calls {@link #ReadOnlySharedStringsTable(PackagePart, boolean)}, with a
* value of <code>true</code> to include phonetic runs.
*
* @since POI 3.14-Beta1
*/
public ReadOnlySharedStringsTable(PackagePart part) throws IOException, SAXException {
this(part, true);
}
/**
* Like POIXMLDocumentPart constructor
*
* @since POI 3.14-Beta3
*/
public ReadOnlySharedStringsTable(PackagePart part, boolean includePhoneticRuns)
throws IOException, SAXException {
this.includePhoneticRuns = includePhoneticRuns;
readFrom(part.getInputStream());
}
/**
* Read this shared strings table from an XML file.
*
* @param is The input stream containing the XML document.
* @throws IOException if an error occurs while reading.
* @throws SAXException if parsing the XML data fails.
*/
public void readFrom(InputStream is) throws IOException, SAXException {
// test if the file is empty, otherwise parse it
PushbackInputStream pis = new PushbackInputStream(is, 1);
int emptyTest = pis.read();
if (emptyTest > -1) {
pis.unread(emptyTest);
InputSource sheetSource = new InputSource(pis);
try {
XMLReader sheetParser = SAXHelper.newXMLReader();
sheetParser.setContentHandler(this);
sheetParser.parse(sheetSource);
} catch(ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
}
/**
* Return an integer representing the total count of strings in the workbook. This count does not
* include any numbers, it counts only the total of text strings in the workbook.
*
* @return the total count of strings in the workbook
*/
@Override
public int getCount() {
return this.count;
}
/**
* Returns an integer representing the total count of unique strings in the Shared String Table.
* A string is unique even if it is a copy of another string, but has different formatting applied
* at the character level.
*
* @return the total count of unique strings in the workbook
*/
@Override
public int getUniqueCount() {
return this.uniqueCount;
}
/**
* Return the string at a given index.
* Formatting is ignored.
*
* @param idx index of item to return.
* @return the item at the specified position in this Shared String table.
* @deprecated use <code>getItemAt</code> instead
*/
@Removal(version = "4.2")
@Deprecated
public String getEntryAt(int idx) {
/**
* 这里就是修改部分了,直接从按行存储的临时文件读取需要的字符串
*/
return readCache.get(idx);
}
/**
* Returns all the strings.
* Formatting is ignored.
*
* @return a list with all the strings
* @deprecated use <code>getItemAt</code> instead
*/
@Removal(version = "4.2")
@Deprecated
public List<String> getItems() {
return null;
}
@Override
public RichTextString getItemAt(int idx) {
return new XSSFRichTextString(getEntryAt(idx));
}
ContentHandler methods
private StringBuilder characters;
private boolean tIsOpen;
private boolean inRPh;
@Override
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
if (uri != null && ! uri.equals(NS_SPREADSHEETML)) {
return;
}
if ("sst".equals(localName)) {
String count = attributes.getValue("count");
if(count != null) this.count = Integer.parseInt(count);
String uniqueCount = attributes.getValue("uniqueCount");
if(uniqueCount != null) this.uniqueCount = Integer.parseInt(uniqueCount);
// this.strings = new ArrayList<>(this.uniqueCount);
characters = new StringBuilder(64);
} else if ("si".equals(localName)) {
characters.setLength(0);
} else if ("t".equals(localName)) {
tIsOpen = true;
} else if ("rPh".equals(localName)) {
inRPh = true;
//append space...this assumes that rPh always comes after regular <t>
if (includePhoneticRuns && characters.length() > 0) {
characters.append(" ");
}
}
}
@Override
public void endElement(String uri, String localName, String name) throws SAXException {
if (uri != null && ! uri.equals(NS_SPREADSHEETML)) {
return;
}
if ("si".equals(localName)) {
// strings.add(characters.toString().intern());
readCache.put(characters.toString());
/**
* 这里就是修改的一部分,这里直接把字符串按行存入临时文件
*/
counts ++;
if(counts == this.uniqueCount) {
readCache.putFinished();
}
} else if ("t".equals(localName)) {
tIsOpen = false;
} else if ("rPh".equals(localName)) {
inRPh = false;
}
}
/**
* Captures characters only if a t(ext) element is open.
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (tIsOpen) {
if (inRPh && includePhoneticRuns) {
characters.append(ch, start, length);
} else if (! inRPh){
characters.append(ch, start, length);
}
}
}
}
至此代码效率有了相当大的提高,而且内存溢出问题也得到解决。详细测试代码:https://github.com/rongdi/poi-example.git
看到这里啦,说明你对这篇文章感兴趣,帮忙转发一下或者点击文章右下角“在看”,感谢啦!关注公众号,回复「进群」即可进入无广告技术交流群,同时送上250本电子书+学习视频作为见面礼!
有你想看的
精彩
Java 的 JSP 已经被淘汰了吗?
知乎高赞:本科生如何才能进入腾讯、阿里等一流互联网大厂?
ArrayList集合为什么不能使用foreach增加、删除、修改元素
有一种幸福,叫娶个女项目经理做老婆
互联网公司忽悠员工的黑话
面试字节跳动,被怼的体无完肤!
别在 Java 代码里乱打日志了,这才是正确的打日志姿势
有了这套模板,女朋友再也不用担心我刷不动 LeetCode 了
支付宝架构师眼中的高并发架构
20M文件从30秒压缩到1秒,我是如何做到的(附源码)?
39 个奇葩代码注释,看完笑哭了。