最近在写一个获取 Word 以及 Excel 内容并进行过滤的功能,在CSDN上查了好久,好多都是关于 Excel 的,Word 的好多人建议用C写,但是最终还是被我解决掉了------本人很菜,如有可以优化的地方,可以评论区留言,请见谅,话不多说,上代码!
目录
注:以下代码是我在CSDN上查到的,然后自己做了整合
1. 代码中所用到的依赖
1.1 项目管理工具用的是Gradle
compile group: 'org.apache.poi', name: 'poi', version: '4.0.1'
compile group: 'org.apache.poi', name: 'poi-scratchpad', version: '4.0.1'
compile group: 'org.apache.poi', name: 'poi-ooxml', version: '4.0.1'
compile group: 'net.sf.jmimemagic', name: 'jmimemagic', version: '0.1.5'
compile group: 'org.apache.ant', name: 'ant', version: '1.10.9'
compile group: 'dom4j', name: 'dom4j', version: '1.6.1'
2. 以下就是用到的代码片段
注:代码会比较长,请见谅,而且因为我对代码做了修改,因此我会将我在CSDN上用到的资料贴在文章的最后
2.1 ExcelXlsReader:用于解决Excel2003版本大数据量问题
import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.*;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import java.io.InputStream;
import java.util.*;
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:19
* @Desc 用于解决Excel2003版本大数据量问题
*/
public class ExcelXlsReader implements HSSFListener {
Boolean filter = false;
List<String> keyword;
private int minColums = -1;
private POIFSFileSystem fs;
/**
* 总行数
*/
private int totalRows = 0;
/**
* 上一行row的序号
*/
private int lastRowNumber;
/**
* 上一单元格的序号
*/
private int lastColumnNumber;
/**
* 是否输出formula,还是它对应的值
*/
private boolean outputFormulaValues = true;
/**
* 用于转换formulas
*/
private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
//excel2003工作簿
private HSSFWorkbook stubWorkbook;
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
private final HSSFDataFormatter formatter = new HSSFDataFormatter();
//表索引
private int sheetIndex = 0;
private BoundSheetRecord[] orderedBSRs;
private ArrayList boundSheetRecords = new ArrayList();
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
//当前行
private int curRow = 0;
//存储一行记录所有单元格的容器
private List<String> cellList = new ArrayList<String>();
/**
* 判断整行是否为空行的标记
*/
private boolean flag = false;
private String sheetName;
public List<String> getKeyword() {
return keyword;
}
public void setKeyword(List<String> keyword) {
this.keyword = keyword;
}
public Boolean getFilter() {
return filter;
}
public void setFilter(Boolean filter) {
this.filter = filter;
}
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:20
* @Desc 遍历excel下所有的sheet
* @Params [java.io.InputStream, java.util.List<java.lang.String>]
* @Return int
*/
public int process(InputStream inputStream, List<String> keyWord) throws Exception {
setKeyword(keyWord);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
this.fs = new POIFSFileSystem(inputStream);
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
return totalRows; //返回该excel文件的总行数,不包括首列和空行
}
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:21
* @Desc HSSFListener 监听方法,处理Record,处理每个单元格
* @Params [org.apache.poi.hssf.record.Record]
* @Return void
*/
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
String value = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add(record);
break;
case BOFRecord.sid: //开始处理每个sheet
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
//如果有需要,则建立子工作簿
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
}
sheetName = orderedBSRs[sheetIndex].getSheetname();
sheetIndex++;
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid: //单元格为空白
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
cellList.add(thisColumn, thisStr);
break;
case BoolErrRecord.sid: //单元格为布尔类型
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = berec.getBooleanValue() + "";
cellList.add(thisColumn, thisStr);
checkRowIsNull(thisStr); //如果里面某个单元格含有值,则标识该行不为空行
break;
case FormulaRecord.sid://单元格为公式类型
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
cellList.add(thisColumn, thisStr);
checkRowIsNull(thisStr); //如果里面某个单元格含有值,则标识该行不为空行
break;
case StringRecord.sid: //单元格中公式的字符串
if (outputNextStringRecord) {
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
value = lrec.getValue().trim();
value = value.equals("") ? "" : value;
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
break;
case LabelSSTRecord.sid: //单元格为字符串类型
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
cellList.add(thisColumn, "");
} else {
value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
value = value.equals("") ? "" : value;
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
}
break;
case NumberRecord.sid: //单元格为数字类型
NumberRecord numrec = (NumberRecord) record;
curRow = thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
//第一种方式
//value = formatListener.formatNumberDateCell(numrec).trim();//这个被写死,采用的m/d/yy h:mm格式,不符合要求
//第二种方式,参照formatNumberDateCell里面的实现方法编写
Double valueDouble = ((NumberRecord) numrec).getValue();
String formatString = formatListener.getFormatString(numrec);
if (formatString.contains("m/d/yy")) {
formatString = "yyyy-MM-dd hh:mm:ss";
}
int formatIndex = formatListener.getFormatIndex(numrec);
value = formatter.formatRawCellContents(valueDouble, formatIndex, formatString).trim();
value = value.equals("") ? "" : value;
//向容器加入列值
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果里面某个单元格含有值,则标识该行不为空行
break;
default:
break;
}
//遇到新行的操作
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
//空值的操作
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
cellList.add(thisColumn, "");
}
//更新行和列的值
if (thisRow > -1)
lastRowNumber = thisRow;
if (thisColumn > -1)
lastColumnNumber = thisColumn;
//行结束时的操作
if (record instanceof LastCellOfRowDummyRecord) {
if (minColums > 0) {
//列值重新置空
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
if (flag && curRow != 0) { //该行不为空行且该行不是第一行,发送(第一行为列名,不需要)
for (String key : getKeyword()) {
if (cellList.indexOf(key) != -1) {
setFilter(true);
}
}
totalRows++;
}
//清空容器
cellList.clear();
flag = false;
}
}
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:22
* @Desc 如果里面某个单元格含有值,则标识该行不为空行
* @Params [java.lang.String]
* @Return void
*/
public void checkRowIsNull(String value) {
if (value != null && !"".equals(value)) {
flag = true;
}
}
}
2.2 ExcelXlsxReader :解析大数据量Excel07及10的工具类
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.*;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.InputStream;
import java.util.*;
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:22
* @Desc 解析大数据量Excel07及10的工具类
*/
public class ExcelXlsxReader {
private SheetHandler sheetHandler;
Boolean filter = false;
public Boolean getFilter() {
return filter;
}
public void setFilter(Boolean filter) {
this.filter = filter;
}
public void setSheetHandler(SheetHandler sheetHandler) {
this.sheetHandler = sheetHandler;
}
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:23
* @Desc excel07及10处理多个sheet
* @Params [java.io.InputStream, java.util.List<java.lang.String>]
* @Return void
*/
public void processAllSheets(InputStream is, List<String> keyword) throws Exception {
OPCPackage pkg = null;
InputStream sheet = null;
try {
pkg = OPCPackage.open(is);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
sheetHandler.setKeyword(keyword);
Iterator<InputStream> sheets = r.getSheetsData();
while (sheets.hasNext()) {
sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
if (sheetHandler.getFilter()) {
setFilter(true);
}
}
} catch (Exception e) {
e.printStackTrace();
throw e;
} finally {
if (pkg != null) {
pkg.close();
}
if (sheet != null) {
sheet.close();
}
}
}
public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
XMLReader parser =
XMLReaderFactory.createXMLReader(
"com.sun.org.apache.xerces.internal.parsers.SAXParser"
);
setSheetHandler(new SheetHandler(sst));
ContentHandler handler = (ContentHandler) sheetHandler;
parser.setContentHandler(handler);
return parser;
}
}
2.3 SheetHandler:从SheetHandler类中处理从excle获取的数据
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import java.util.List;
public class SheetHandler extends DefaultHandler {
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:29
* @Desc SheetHandler类中处理从excle获取的数据,
* 官方文档中 SheetHandler以内部类形式,
* 为保证更新代码减少内部类class文件忘记打包,
* 改为一般java类
*/
List<String> keyword;
Boolean filter = false;
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private String cellPosition;
public List<String> getKeyword() {
return keyword;
}
public void setKeyword(List<String> keyword) {
this.keyword = keyword;
}
public Boolean getFilter() {
return filter;
}
public void setFilter(Boolean filter) {
this.filter = filter;
}
public SheetHandler(SharedStringsTable sst) {
this.sst = sst;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
if (name.equals("c")) {
cellPosition = attributes.getValue("r");
String cellType = attributes.getValue("t");
if (cellType != null && cellType.equals("s")) {
nextIsString = true;
} else {
nextIsString = false;
}
}
// 清除缓存内容
lastContents = "";
}
public void endElement(String uri, String localName, String name) throws SAXException {
if (nextIsString) {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
nextIsString = false;
}
if (name.equals("v")) {
//数据读取结束后,将单元格坐标,内容存入map中
//不保存第一行数据 && !"1".equals(cellPosition.substring(1))
if (!(cellPosition.length() == 2) || (cellPosition.length() == 2)) {
//获取内容,直接过滤
for (String key : getKeyword()) {
if (lastContents.indexOf(key) != -1) {
setFilter(true);
}
}
}
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
lastContents += new String(ch, start, length);
}
}
2.4 过滤Excel和Word(末尾的这个注释要耐心读完)
/**
* @Author ThorLau
* @CreateTime 2021/1/26 15:24
* @Desc 过滤2003、2007、2010的word
* @Params [com.jdwa.jfilesync.file.IFileSystem, java.lang.String, java.util.List<java.lang.String>]
* @Return boolean
*/
public void SearchContentKeyByWorld(InputStream inputStream, String filePath, List<String> keyword) throws Throwable {
WordExtractor wordExtractor = null;
XWPFDocument docx = null;
String[] paragraphs = null;
try {
if (filePath.endsWith("doc") || filePath.endsWith("DOC")) {
wordExtractor = new WordExtractor(inputStream);
paragraphs = wordExtractor.getParagraphText();
for (String paragraph : paragraphs) {
for (String key : keyword) {
if (paragraph.indexOf(key) != -1) {
filter = true;
}
}
}
} else if (filePath.endsWith("docx") || filePath.endsWith("DOCX")) {
docx = new XWPFDocument(inputStream);
List<XWPFParagraph> list = docx.getParagraphs();
for (XWPFParagraph paragraph : list) {
for (String key : keyword) {
if (paragraph.getText().indexOf(key) != -1) {
filter = true;
}
}
}
}
} catch (Throwable e) {
System.out.println("[Search ContentKey By Excel Or World] error : {}" + e);
throw e;
}
}
/**
* @Author ThorLau
* @CreateTime 2021/1/27 16:37
* @Desc 过滤2003、2007、2010的表格
* @Params [java.io.InputStream, java.lang.String, java.util.List<java.lang.String>]
* @Return void
*/
public void SearchContentKeyByExcel(InputStream inputStream, String filePath, List<String> keyWord) throws Throwable {
ExcelXlsReader excelXls = new ExcelXlsReader();
ExcelXlsxReader example = new ExcelXlsxReader();
try {
if (filePath.endsWith("xls") || filePath.endsWith("XLS")) {
excelXls.process(inputStream, keyWord);
if (excelXls.getFilter()) {
filter = true;
}
} else if (filePath.endsWith("xlsx") || filePath.endsWith("XLSX")) {
example.processAllSheets(inputStream, keyWord);
if (example.getFilter()) {
filter = true;
}
}
} catch (Throwable e) {
System.out.println("有问题!!!");
throw e;
}
}
注:过滤Excel的时候直接获取上面写好的类就行了,过滤Word的时候直接用的.getParagraphText()方法,因为以前用的.getText()方法,所以内存溢出了,但是还有一种情况,就是InputStream 过大会导致在读.docx,也就是07、10版本的时候,因为要用XWPFDocument去解析这个文件,因此会内存溢出。但是,还有一种解决办法因为07以后都可以看作是一种压缩形式,可以用解压软件解压出来(如下图所示)。因此可以用ZipFile读到它然后获取document.xml去解析xml就可以获取到内容了,具体的代码CSDN上可以搜得到
3. 参考资料
https://blog.csdn.net/weixin_44259233/article/details/109613292
注:这个是对 Excel 03、07、10都生效的一篇个人觉得比较nice的文章