工具类:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiConsumer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BlankRecord;
import org.apache.poi.hssf.record.BoolErrRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFDataFormatter;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ooxml.util.SAXHelper;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.SharedStrings;
import org.apache.poi.xssf.model.Styles;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
/**
* excel大量数据读取工具类2
* @author zql
* @createTime 2020-12-06 16:08:30
* @version 1.1
* @modifyLog 1.1 优化代码
* @api http://poi.apache.org/components/spreadsheet/how-to.html
*
*/
public class ExcelBigDataReadUtil2 {
/**
* 2003版后缀
*/
private String suffix2003 = ".xls";
/**
* 2007版后缀
*/
private String suffix2007 = ".xlsx";
/**
* 读取excel文件
* @author zql
* @createTime 2020-12-06 16:09:20
*
* @param filePath 文件路径
* @param sheetNum 第几个sheetNum工作薄,从1开始,为null的时候表示全部读取
* @param biconsumer 消费者函数,第一个参数为行数据,第二个参数为sheet名称
* @throws Exception
*/
public void readExcel(String filePath, Integer sheetNum, BiConsumer<Map<String, String>, String> biconsumer) throws Exception {
File file = new File(filePath);
if (Objects.nonNull(sheetNum) && sheetNum.intValue() <= 0) {
sheetNum = 1;
}
if (filePath.endsWith(suffix2003)) {
XlsReader xlsR = new XlsReader();
xlsR.process(file, sheetNum, biconsumer);
} else if (filePath.endsWith(suffix2007)) {
XlsxReader xlsxR = new XlsxReader();
xlsxR.process(file, sheetNum, biconsumer);
} else {
throw new Exception("Only excel files with XLS or XLSX suffixes are allowed to be read!");
}
}
/**
* 用于excel2003版本的读取
* @author zql
* @createTime 2020-12-06 16:10:20
* @version 1.0
*
*/
public class XlsReader implements HSSFListener {
private int minColums = -1;
/**
* 上一行row的序号
*/
private int lastRowNumber;
/**
* 上一单元格的序号
*/
private int lastColumnNumber;
/**
* 是否输出formula,还是它对应的值
*/
private boolean outputFormulaValues = true;
/**
* 用于转换formulas
*/
private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
/**
* excel2003工作簿
*/
private HSSFWorkbook stubWorkbook;
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
private final HSSFDataFormatter formatter = new HSSFDataFormatter();
/**
* 表索引
*/
private int sheetIndex = 0;
private BoundSheetRecord[] boundSheetRecords;
private List<BoundSheetRecord> boundSheetRecordList = new ArrayList<BoundSheetRecord>();
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
/**
* 当前行
*/
private int curRow = 0;
/**
* 存储一行记录所有单元格的Map容器
*/
private Map<String,String> cellMap = new HashMap<String,String>();
/**
* 存储标题行所有单元格的Map容器
*/
private Map<Integer,String> titles = new HashMap<Integer,String>();
/**
* 判断整行是否为空行的标记
*/
private boolean flag = false;
/**
* 数据处理方法
*/
private BiConsumer<Map<String, String>,String> biconsumer;
private String sheetName;
/**
* 指定的sheet,默认值为-1,当为-1时,表示传入的值为null,将读取所有的sheet
*/
private int sheetNum = -1;
/**
* 处理数据
* @author zql
* @createTime 2020-12-06 16:10:46
*
* @param file 文件
* @param sheetNum 第几个sheetNum工作薄,从1开始,为null的时候表示全部读取,小于或等于0时默认为第1个工作薄
* @param biconsumer
* @throws Exception
*/
public void process(File file, Integer sheetNum, BiConsumer<Map<String, String>, String> biconsumer) throws Exception {
if (Objects.nonNull(sheetNum) && sheetNum <= 0) {
sheetNum = 1;
}
if (Objects.nonNull(sheetNum)) {
this.sheetNum = sheetNum;
}
this.biconsumer = biconsumer;
// 使用指定的输入文件创建新的文件输入流
FileInputStream fin = new FileInputStream(file);
POIFSFileSystem poifs = new POIFSFileSystem(fin);
InputStream din = poifs.createDocumentInputStream("Workbook");
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest req = new HSSFRequest();
if (outputFormulaValues) {
req.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
req.addListenerForAllRecords(workbookBuildingListener);
}
factory.processEvents(req, din);
fin.close();
din.close();
poifs.close();
}
/**
* HSSFListener 监听方法,处理Record(处理每个单元格)
*
* @param record
*/
@Override
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
String value = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecordList.add((BoundSheetRecord) record);
break;
// 开始处理每个sheet
case BOFRecord.sid:
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
// 如果有需要,则建立子工作簿
if (Objects.nonNull(workbookBuildingListener) && Objects.isNull(stubWorkbook)) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
if (Objects.isNull(boundSheetRecords)) {
boundSheetRecords = BoundSheetRecord.orderByBofPosition(boundSheetRecordList);
}
sheetName = boundSheetRecords[sheetIndex].getSheetname();
sheetIndex++;
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
// 单元格为空白
case BlankRecord.sid:
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
if (curRow == 0) {
titles.put(thisColumn, thisStr);
} else {
cellMap.put(titles.get(thisColumn), thisStr);
}
break;
// 单元格为布尔类型
case BoolErrRecord.sid:
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = String.valueOf(berec.getBooleanValue());
if (curRow == 0) {
titles.put(thisColumn, thisStr);
} else {
cellMap.put(titles.get(thisColumn), thisStr);
}
// 如果里面某个单元格含有值,则标识该行不为空行
checkRowIsNull(thisStr);
break;
// 单元格为公式类型
case FormulaRecord.sid:
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression())
+ '"';
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
if (curRow == 0) {
titles.put(thisColumn, thisStr);
} else {
cellMap.put(titles.get(thisColumn), thisStr);
}
// 如果里面某个单元格含有值,则标识该行不为空行
checkRowIsNull(thisStr);
break;
// 单元格中公式的字符串
case StringRecord.sid:
if (outputNextStringRecord) {
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
value = lrec.getValue().trim();
if (curRow == 0) {
value = value.length() == 0 ? "空标题" + thisColumn : value;
titles.put(thisColumn, value);
} else {
value = value.length() == 0 ? "" : value;
cellMap.put(titles.get(thisColumn), value);
}
// 如果里面某个单元格含有值,则标识该行不为空行
checkRowIsNull(value);
break;
// 单元格为字符串类型
case LabelSSTRecord.sid:
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (Objects.isNull(sstRecord)) {
if (curRow == 0) {
titles.put(thisColumn, "空标题" + thisColumn);
} else {
cellMap.put(titles.get(thisColumn), "");
}
} else {
value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
if (curRow == 0) {
value = value.length() == 0 ? "空标题" + thisColumn : value;
titles.put(thisColumn, value);
} else {
value = value.length() == 0 ? "" : value;
cellMap.put(titles.get(thisColumn), value);
}
// 如果里面某个单元格含有值,则标识该行不为空行
checkRowIsNull(value);
}
break;
// 单元格为数字类型
case NumberRecord.sid:
NumberRecord numrec = (NumberRecord) record;
curRow = thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
// 第一种方式,这个被写死,采用的m/d/yy h:mm格式,不符合要求
/// value = formatListener.formatNumberDateCell(numrec).trim();
// 第二种方式,参照formatNumberDateCell里面的实现方法编写
Double valueDouble = ((NumberRecord) numrec).getValue();
String formatString = formatListener.getFormatString(numrec);
if (formatString.contains("m/d/yy")) {
formatString = "yyyy-MM-dd hh:mm:ss";
}
int formatIndex = formatListener.getFormatIndex(numrec);
value = formatter.formatRawCellContents(valueDouble, formatIndex, formatString).trim();
if (curRow == 0) {
value = value.length() == 0 ? "空标题" + thisColumn : value;
titles.put(thisColumn, value);
} else {
value = value.length() == 0 ? "" : value;
// 向容器加入列值
cellMap.put(titles.get(thisColumn), value);
}
// 如果里面某个单元格含有值,则标识该行不为空行
checkRowIsNull(value);
break;
default:
break;
}
// 遇到新行的操作
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
// 空值的操作
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
if (curRow == 0) {
titles.put(thisColumn, "空标题" + thisColumn);
} else {
cellMap.put(titles.get(thisColumn), "");
}
}
// 更新行和列的值
if (thisRow > -1) {
lastRowNumber = thisRow;
}
if (thisColumn > -1) {
lastColumnNumber = thisColumn;
}
// 行结束时的操作
if (record instanceof LastCellOfRowDummyRecord) {
if (minColums > 0) {
// 列值重新置空
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
// 该行不为空行且该行不是第一行,并且sheet索引等于指定sheet或指定sheet为默认值-1(即sheetNum==-1表示读取全部sheet),发送(第一行为列名,不需要)
if (flag && curRow != 0 && (sheetIndex == sheetNum || sheetNum == -1)) {
this.biconsumer.accept(cellMap, sheetName);
}
flag = false;
}
}
/**
* 如果里面某个单元格含有值,则标识该行不为空行
* @author zql
* @createTime 2020-12-06 16:18:35
*
* @param value
*/
public void checkRowIsNull(String value) {
if (Objects.nonNull(value) && value.length() != 0) {
flag = true;
}
}
}
/**
* 用于excel2007版本的读取
* @author zql
* @createTime 2020-12-06 16:20:10
* @version 1.0
*
*/
public class XlsxReader extends DefaultHandler{
private OPCPackage xlsxPackage;
/**
* 解析Excel,并关闭流
* @author zql
* @createTime 2020-12-06 16:20:23
*
* @param file
* @param sheetNum
* @param biconsumer
*/
public void process(File file,Integer sheetNum,BiConsumer<Map<String, String>,String> biconsumer) {
try {
this.xlsxPackage = OPCPackage.open(file, PackageAccess.READ);
// 只读字符表
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);
// xssf读取
XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
// 样式表
StylesTable styles = xssfReader.getStylesTable();
// 读取Excel
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
int sheetIndex = 0;
while (iter.hasNext()) {
// 指定sheetNum不为空时只读取指定的sheet
if (Objects.nonNull(sheetNum) && sheetIndex != (sheetNum - 1)) {
iter.next();
sheetIndex++;
continue;
}
try (InputStream stream = iter.next()) {
processSheet(styles, strings, new BoundSheetRecords(biconsumer, iter.getSheetName()),
stream);
}
sheetIndex++;
}
} catch (IOException | SAXException | OpenXML4JException e) {
System.out.println(e.getMessage());
} finally {
// 关闭解析流
try {
this.xlsxPackage.close();
} catch (IOException e) {
System.out.println(e.getMessage());
}
}
}
/**
* 解析一个表的内容使用指定的样式和共享字符串表
* @author zql
* @createTime 2020-12-06 16:20:58
*
* @param styles 可以通过表中的单元格引用的样式表
* @param strings 可以通过表中的单元格引用的字符串表
* @param sheetHandler 数据处理接口
* @param sheetInputStream 这条流读取了文件数据
* @throws IOException 来自解析器的IO异常,可能来自字节流或字符流
* @throws SAXException 如果解析XML数据失败
*/
private void processSheet(Styles styles, SharedStrings strings, SheetContentsHandler sheetHandler,
InputStream sheetInputStream) throws IOException, SAXException {
// 数据格式化对象
DataFormatter formatter = new DataFormatter();
// short date 数据读取
formatter.addFormat("m/d/yy", new SimpleDateFormat("yyyy/MM/dd"));
InputSource sheetSource = new InputSource(sheetInputStream);
try {
// xml读取类
XMLReader sheetParser = SAXHelper.newXMLReader();
// xml处理方法
ContentHandler handler = new XSSFSheetXMLHandler(styles, null, strings, sheetHandler, formatter, false);
sheetParser.setContentHandler(handler);
// 解析Excel
sheetParser.parse(sheetSource);
} catch (ParserConfigurationException e) {
throw new RuntimeException("The sax parser failed to parse.The error message:" + e.getMessage());
}
}
/**
* 数据处理
* @author zql
* @createTime 2020-12-06 16:22:12
* @version 1.0
*
*/
private class BoundSheetRecords implements SheetContentsHandler {
/**
* sheet名称
*/
private String sheetName;
/**
* 判空列索引,用于处理空单元格
*/
private int colIndex = 0;
/**
* 当前行号
*/
private int currentRow = -1;
/**
* 当前列号
*/
private int currentCol = -1;
/**
* 当前行数据
*/
private Map<String, String> rowData;
/**
* sheet表格中的标题Map集合,key为列索引,value为列值(该标题默认为第一行)
*/
private Map<Integer,String> titles = new HashMap<Integer,String>();
/**
* 标题集合长度
*/
private int tLength;
/**
* 数据处理方法
*/
private BiConsumer<Map<String, String>,String> biconsumer;
/**
* sheet页处理
*
* @param biconsumer 消费者函数,第一个参数为行数据,第二个参数为sheet名称
* @param sheetName sheet名称
*/
public BoundSheetRecords(BiConsumer<Map<String, String>, String> biconsumer, String sheetName) {
this.biconsumer = biconsumer;
this.sheetName = sheetName;
}
@Override
public void startRow(int rowNum) {
currentRow = rowNum;
currentCol = -1;
// 重置数据
rowData = new HashMap<>();
}
@Override
public void endRow(int rowNum) {
// 重置列索引
colIndex = 0;
// 不处理第一行
if (rowNum == 0) {
tLength = titles.size();
return;
}
// 不处理空行
if (rowData.isEmpty()) {
return;
}
// 处理数据
biconsumer.accept(rowData,sheetName);
}
@Override
public void cell(String cellReference, String formattedValue, XSSFComment comment) {
// 如果为空则生成一个当前位置的单元格对象
if (Objects.isNull(cellReference)) {
cellReference = new CellAddress(currentRow, currentCol).formatAsString();
}
int curColumnIndex = (new CellReference(cellReference)).getCol();
// 保存默认的第一行标题
if (currentRow == 0) {
// 列为空的标题
for (; colIndex < curColumnIndex; colIndex++) {
titles.put(colIndex, "空标题" + colIndex);
}
titles.put(curColumnIndex, formattedValue);
// 判空列索引加一,避免下一轮误判空
colIndex++;
return;
}
// 不读取大于标题行的列
if (curColumnIndex > tLength) {
return;
}
// 列为空的单元格
for (; colIndex < curColumnIndex; colIndex++) {
rowData.put(titles.get(colIndex), "");
}
// 当前列等于读取的列
currentCol = curColumnIndex;
// 设置值
rowData.put(titles.get(colIndex), formattedValue);
// 判空列索引加一,避免下一轮误判空,要放在设置值之前
colIndex++;
}
}
}
}
测试类:
import org.junit.Test;
import java.util.Map;
import java.util.function.BiConsumer;
/**
* excel大量数据读取工具测试类2
* @author zql
* @createTime 2020-12-06 16:08:30
* @version 1.1
* @modifyLog 1.1 优化代码
* @api http://poi.apache.org/components/spreadsheet/how-to.html
*
*/
public class ExcelBigDataReadUtil2Test {
private ExcelBigDataReadUtil2 e = new ExcelBigDataReadUtil2();
@Test
public void readExcel2003() throws Exception {
String filePath = "E:\\excel\\测试数据_60000条.xls";
BiConsumer<Map<String, String>, String> biconsumer = (row, sheetName) -> {
StringBuffer sb = new StringBuffer();
sb.append("xls-sheetName:").append(sheetName).append(",");
sb.append("{");
for (Map.Entry<String, String> cell : row.entrySet()) {
sb.append(cell.getKey());
sb.append(":");
sb.append(cell.getValue());
sb.append("|");
}
String line = sb.toString();
if (line.endsWith("|")) {
line = line.substring(0, line.lastIndexOf("|"));
}
// 去除最后一个分隔符
line += "}";
System.out.println(line);
};
e.readExcel(filePath, 1, biconsumer);
}
@Test
public void readExcel2007() throws Exception {
String filePath = "E:\\excel\\测试数据_100万条.xlsx";
BiConsumer<Map<String, String>, String> biconsumer2 = (row, sheetName) -> {
StringBuffer sb = new StringBuffer();
sb.append("xlsx-sheetName:").append(sheetName).append(",");
sb.append("{");
for (Map.Entry<String, String> cell : row.entrySet()) {
sb.append(cell.getKey());
sb.append(":");
sb.append(cell.getValue());
sb.append("|");
}
String line = sb.toString();
if (line.endsWith("|")) {
line = line.substring(0, line.lastIndexOf("|"));
}
// 去除最后一个分隔符
line += "}";
System.out.println(line);
};
e.readExcel(filePath, null, biconsumer2);
}
}
普通项目需要引入的包
poi-4.0.1.jar
poi-ooxml-4.0.1.jar
poi-ooxml-schemas-4.0.1.jar
commons-codec-1.11.jar
commons-collections4-4.3.jar
commons-math3-3.6.1.jar
xmlbeans-3.0.2.jar
commons-compress-1.18.jar
curvesapi-1.06.jar
<!-- poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.1</version>
</dependency>
参考链接:
- https://www.cnblogs.com/swordfall/p/8298386.html
- https://blog.csdn.net/weixin_38761297/article/details/90232607