解决:JAVA读取 2003、2007、2010 Word 和 Excel 的内容时内存溢出的问题

最近在写一个获取 Word 以及 Excel 内容并进行过滤的功能,在CSDN上查了好久,好多都是关于 Excel 的,Word 的好多人建议用C写,但是最终还是被我解决掉了------本人很菜,如有可以优化的地方,可以评论区留言,请见谅,话不多说,上代码!


注:以下代码是我在CSDN上查到的,然后自己做了整合

1. 代码中所用到的依赖

1.1 项目管理工具用的是Gradle

	compile group: 'org.apache.poi', name: 'poi', version: '4.0.1'
    compile group: 'org.apache.poi', name: 'poi-scratchpad', version: '4.0.1'
    compile group: 'org.apache.poi', name: 'poi-ooxml', version: '4.0.1'
    compile group: 'net.sf.jmimemagic', name: 'jmimemagic', version: '0.1.5'
    compile group: 'org.apache.ant', name: 'ant', version: '1.10.9'
    compile group: 'dom4j', name: 'dom4j', version: '1.6.1'

2. 以下就是用到的代码片段

注:代码会比较长,请见谅,而且因为我对代码做了修改,因此我会将我在CSDN上用到的资料贴在文章的最后

2.1 ExcelXlsReader:用于解决Excel2003版本大数据量问题

import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.*;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import java.io.InputStream;
import java.util.*;

/**
 * @Author ThorLau
 * @CreateTime 2021/1/26 15:19
 * @Desc 用于解决Excel2003版本大数据量问题
 */
public class ExcelXlsReader implements HSSFListener {
    Boolean filter = false;
    List<String> keyword;

    private int minColums = -1;

    private POIFSFileSystem fs;

    /**
     * 总行数
     */
    private int totalRows = 0;

    /**
     * 上一行row的序号
     */
    private int lastRowNumber;

    /**
     * 上一单元格的序号
     */
    private int lastColumnNumber;

    /**
     * 是否输出formula,还是它对应的值
     */
    private boolean outputFormulaValues = true;

    /**
     * 用于转换formulas
     */
    private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;

    //excel2003工作簿
    private HSSFWorkbook stubWorkbook;

    private SSTRecord sstRecord;

    private FormatTrackingHSSFListener formatListener;

    private final HSSFDataFormatter formatter = new HSSFDataFormatter();

    //表索引
    private int sheetIndex = 0;

    private BoundSheetRecord[] orderedBSRs;

    private ArrayList boundSheetRecords = new ArrayList();

    private int nextRow;

    private int nextColumn;

    private boolean outputNextStringRecord;

    //当前行
    private int curRow = 0;

    //存储一行记录所有单元格的容器
    private List<String> cellList = new ArrayList<String>();

    /**
     * 判断整行是否为空行的标记
     */
    private boolean flag = false;

    private String sheetName;

    public List<String> getKeyword() {
        return keyword;
    }

    public void setKeyword(List<String> keyword) {
        this.keyword = keyword;
    }

    public Boolean getFilter() {
        return filter;
    }

    public void setFilter(Boolean filter) {
        this.filter = filter;
    }

    /**
     * @Author ThorLau
     * @CreateTime 2021/1/26 15:20
     * @Desc 遍历excel下所有的sheet
     * @Params [java.io.InputStream, java.util.List<java.lang.String>]
     * @Return int
     */
    public int process(InputStream inputStream, List<String> keyWord) throws Exception {
        setKeyword(keyWord);
        HSSFEventFactory factory = new HSSFEventFactory();
        HSSFRequest request = new HSSFRequest();

        this.fs = new POIFSFileSystem(inputStream);
        MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
        formatListener = new FormatTrackingHSSFListener(listener);
        if (outputFormulaValues) {
            request.addListenerForAllRecords(formatListener);
        } else {
            workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
            request.addListenerForAllRecords(workbookBuildingListener);
        }
        factory.processWorkbookEvents(request, fs);

        return totalRows; //返回该excel文件的总行数,不包括首列和空行
    }

    /**
     * @Author ThorLau
     * @CreateTime 2021/1/26 15:21
     * @Desc HSSFListener 监听方法,处理Record,处理每个单元格
     * @Params [org.apache.poi.hssf.record.Record]
     * @Return void
     */
    public void processRecord(Record record) {
        int thisRow = -1;
        int thisColumn = -1;
        String thisStr = null;
        String value = null;
        switch (record.getSid()) {
            case BoundSheetRecord.sid:
                boundSheetRecords.add(record);
                break;
            case BOFRecord.sid: //开始处理每个sheet
                BOFRecord br = (BOFRecord) record;
                if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
                    //如果有需要,则建立子工作簿
                    if (workbookBuildingListener != null && stubWorkbook == null) {
                        stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
                    }

                    if (orderedBSRs == null) {
                        orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
                    }
                    sheetName = orderedBSRs[sheetIndex].getSheetname();
                    sheetIndex++;
                }
                break;
            case SSTRecord.sid:
                sstRecord = (SSTRecord) record;
                break;
            case BlankRecord.sid: //单元格为空白
                BlankRecord brec = (BlankRecord) record;
                thisRow = brec.getRow();
                thisColumn = brec.getColumn();
                thisStr = "";
                cellList.add(thisColumn, thisStr);
                break;
            case BoolErrRecord.sid: //单元格为布尔类型
                BoolErrRecord berec = (BoolErrRecord) record;
                thisRow = berec.getRow();
                thisColumn = berec.getColumn();
                thisStr = berec.getBooleanValue() + "";
                cellList.add(thisColumn, thisStr);
                checkRowIsNull(thisStr);  //如果里面某个单元格含有值,则标识该行不为空行
                break;
            case FormulaRecord.sid://单元格为公式类型
                FormulaRecord frec = (FormulaRecord) record;
                thisRow = frec.getRow();
                thisColumn = frec.getColumn();
                if (outputFormulaValues) {
                    if (Double.isNaN(frec.getValue())) {
                        outputNextStringRecord = true;
                        nextRow = frec.getRow();
                        nextColumn = frec.getColumn();
                    } else {
                        thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
                    }
                } else {
                    thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
                }
                cellList.add(thisColumn, thisStr);
                checkRowIsNull(thisStr);  //如果里面某个单元格含有值,则标识该行不为空行
                break;
            case StringRecord.sid: //单元格中公式的字符串
                if (outputNextStringRecord) {
                    StringRecord srec = (StringRecord) record;
                    thisStr = srec.getString();
                    thisRow = nextRow;
                    thisColumn = nextColumn;
                    outputNextStringRecord = false;
                }
                break;
            case LabelRecord.sid:
                LabelRecord lrec = (LabelRecord) record;
                curRow = thisRow = lrec.getRow();
                thisColumn = lrec.getColumn();
                value = lrec.getValue().trim();
                value = value.equals("") ? "" : value;
                cellList.add(thisColumn, value);
                checkRowIsNull(value);  //如果里面某个单元格含有值,则标识该行不为空行
                break;
            case LabelSSTRecord.sid: //单元格为字符串类型
                LabelSSTRecord lsrec = (LabelSSTRecord) record;
                curRow = thisRow = lsrec.getRow();
                thisColumn = lsrec.getColumn();
                if (sstRecord == null) {
                    cellList.add(thisColumn, "");
                } else {
                    value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
                    value = value.equals("") ? "" : value;
                    cellList.add(thisColumn, value);
                    checkRowIsNull(value);  //如果里面某个单元格含有值,则标识该行不为空行
                }
                break;
            case NumberRecord.sid: //单元格为数字类型
                NumberRecord numrec = (NumberRecord) record;
                curRow = thisRow = numrec.getRow();
                thisColumn = numrec.getColumn();

                //第一种方式
                //value = formatListener.formatNumberDateCell(numrec).trim();//这个被写死,采用的m/d/yy h:mm格式,不符合要求

                //第二种方式,参照formatNumberDateCell里面的实现方法编写
                Double valueDouble = ((NumberRecord) numrec).getValue();
                String formatString = formatListener.getFormatString(numrec);
                if (formatString.contains("m/d/yy")) {
                    formatString = "yyyy-MM-dd hh:mm:ss";
                }
                int formatIndex = formatListener.getFormatIndex(numrec);
                value = formatter.formatRawCellContents(valueDouble, formatIndex, formatString).trim();

                value = value.equals("") ? "" : value;
                //向容器加入列值
                cellList.add(thisColumn, value);
                checkRowIsNull(value);  //如果里面某个单元格含有值,则标识该行不为空行
                break;
            default:
                break;
        }

        //遇到新行的操作
        if (thisRow != -1 && thisRow != lastRowNumber) {
            lastColumnNumber = -1;
        }

        //空值的操作
        if (record instanceof MissingCellDummyRecord) {
            MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
            curRow = thisRow = mc.getRow();
            thisColumn = mc.getColumn();
            cellList.add(thisColumn, "");
        }

        //更新行和列的值
        if (thisRow > -1)
            lastRowNumber = thisRow;
        if (thisColumn > -1)
            lastColumnNumber = thisColumn;

        //行结束时的操作
        if (record instanceof LastCellOfRowDummyRecord) {
            if (minColums > 0) {
                //列值重新置空
                if (lastColumnNumber == -1) {
                    lastColumnNumber = 0;
                }
            }
            lastColumnNumber = -1;

            if (flag && curRow != 0) { //该行不为空行且该行不是第一行,发送(第一行为列名,不需要)
                for (String key : getKeyword()) {
                    if (cellList.indexOf(key) != -1) {
                        setFilter(true);
                    }
                }
                totalRows++;
            }
            //清空容器
            cellList.clear();
            flag = false;
        }
    }

    /**
     * @Author ThorLau
     * @CreateTime 2021/1/26 15:22
     * @Desc 如果里面某个单元格含有值,则标识该行不为空行
     * @Params [java.lang.String]
     * @Return void
     */
    public void checkRowIsNull(String value) {
        if (value != null && !"".equals(value)) {
            flag = true;
        }
    }
}

2.2 ExcelXlsxReader :解析大数据量Excel07及10的工具类

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.*;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.InputStream;
import java.util.*;

/**
 * @Author ThorLau
 * @CreateTime 2021/1/26 15:22
 * @Desc 解析大数据量Excel07及10的工具类
 */
public class ExcelXlsxReader {
    private SheetHandler sheetHandler;
    Boolean filter = false;

    public Boolean getFilter() {
        return filter;
    }

    public void setFilter(Boolean filter) {
        this.filter = filter;
    }

    public void setSheetHandler(SheetHandler sheetHandler) {
        this.sheetHandler = sheetHandler;
    }

    /**
     * @Author ThorLau
     * @CreateTime 2021/1/26 15:23
     * @Desc excel07及10处理多个sheet
     * @Params [java.io.InputStream, java.util.List<java.lang.String>]
     * @Return void
     */
    public void processAllSheets(InputStream is, List<String> keyword) throws Exception {
        OPCPackage pkg = null;
        InputStream sheet = null;
        try {
            pkg = OPCPackage.open(is);
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            XMLReader parser = fetchSheetParser(sst);
            sheetHandler.setKeyword(keyword);
            Iterator<InputStream> sheets = r.getSheetsData();
            while (sheets.hasNext()) {
                sheet = sheets.next();
                InputSource sheetSource = new InputSource(sheet);
                parser.parse(sheetSource);
                if (sheetHandler.getFilter()) {
                    setFilter(true);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            throw e;
        } finally {
            if (pkg != null) {
                pkg.close();
            }
            if (sheet != null) {
                sheet.close();
            }
        }
    }

    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser =
                XMLReaderFactory.createXMLReader(
                        "com.sun.org.apache.xerces.internal.parsers.SAXParser"
                );
        setSheetHandler(new SheetHandler(sst));
        ContentHandler handler = (ContentHandler) sheetHandler;
        parser.setContentHandler(handler);
        return parser;
    }
}

2.3 SheetHandler:从SheetHandler类中处理从excle获取的数据

import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;

import java.util.List;

public class SheetHandler extends DefaultHandler {
    /**
     * @Author ThorLau
     * @CreateTime 2021/1/26 15:29
     * @Desc SheetHandler类中处理从excle获取的数据,
     * 官方文档中 SheetHandler以内部类形式,
     * 为保证更新代码减少内部类class文件忘记打包,
     * 改为一般java类
     */
    List<String> keyword;
    Boolean filter = false;

    private SharedStringsTable sst;
    private String lastContents;
    private boolean nextIsString;
    private String cellPosition;

    public List<String> getKeyword() {
        return keyword;
    }

    public void setKeyword(List<String> keyword) {
        this.keyword = keyword;
    }

    public Boolean getFilter() {
        return filter;
    }

    public void setFilter(Boolean filter) {
        this.filter = filter;
    }

    public SheetHandler(SharedStringsTable sst) {
        this.sst = sst;
    }

    public void startElement(String uri, String localName, String name,
                             Attributes attributes) throws SAXException {
        if (name.equals("c")) {
            cellPosition = attributes.getValue("r");
            String cellType = attributes.getValue("t");
            if (cellType != null && cellType.equals("s")) {
                nextIsString = true;
            } else {
                nextIsString = false;
            }
        }
        // 清除缓存内容
        lastContents = "";
    }

    public void endElement(String uri, String localName, String name) throws SAXException {
        if (nextIsString) {
            int idx = Integer.parseInt(lastContents);
            lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
            nextIsString = false;
        }
        if (name.equals("v")) {
            //数据读取结束后,将单元格坐标,内容存入map中
            //不保存第一行数据  && !"1".equals(cellPosition.substring(1))
            if (!(cellPosition.length() == 2) || (cellPosition.length() == 2)) {
                //获取内容,直接过滤
                for (String key : getKeyword()) {
                    if (lastContents.indexOf(key) != -1) {
                        setFilter(true);
                    }
                }
            }
        }
    }

    public void characters(char[] ch, int start, int length)
            throws SAXException {
        lastContents += new String(ch, start, length);
    }
}

2.4 过滤Excel和Word(末尾的这个注释要耐心读完)

	/**
     * @Author ThorLau
     * @CreateTime 2021/1/26 15:24
     * @Desc 过滤2003、2007、2010的word
     * @Params [com.jdwa.jfilesync.file.IFileSystem, java.lang.String, java.util.List<java.lang.String>]
     * @Return boolean
     */
    public void SearchContentKeyByWorld(InputStream inputStream, String filePath, List<String> keyword) throws Throwable {
        WordExtractor wordExtractor = null;
        XWPFDocument docx = null;
        String[] paragraphs = null;
        try {
            if (filePath.endsWith("doc") || filePath.endsWith("DOC")) {
                wordExtractor = new WordExtractor(inputStream);
                paragraphs = wordExtractor.getParagraphText();
                for (String paragraph : paragraphs) {
                    for (String key : keyword) {
                        if (paragraph.indexOf(key) != -1) {
                            filter = true;
                        }
                    }
                }
            } else if (filePath.endsWith("docx") || filePath.endsWith("DOCX")) {
                docx = new XWPFDocument(inputStream);
                List<XWPFParagraph> list = docx.getParagraphs();
                for (XWPFParagraph paragraph : list) {
                    for (String key : keyword) {
                        if (paragraph.getText().indexOf(key) != -1) {
                            filter = true;
                        }
                    }
                }
            }
        } catch (Throwable e) {
            System.out.println("[Search ContentKey By Excel Or World] error : {}" + e);
            throw e;
        }
    }

	/**
     * @Author ThorLau
     * @CreateTime 2021/1/27 16:37
     * @Desc 过滤2003、2007、2010的表格
     * @Params [java.io.InputStream, java.lang.String, java.util.List<java.lang.String>]
     * @Return void
     */
    public void SearchContentKeyByExcel(InputStream inputStream, String filePath, List<String> keyWord) throws Throwable {
        ExcelXlsReader excelXls = new ExcelXlsReader();
        ExcelXlsxReader example = new ExcelXlsxReader();
        try {
            if (filePath.endsWith("xls") || filePath.endsWith("XLS")) {
                excelXls.process(inputStream, keyWord);
                if (excelXls.getFilter()) {
                    filter = true;
                }
            } else if (filePath.endsWith("xlsx") || filePath.endsWith("XLSX")) {
                example.processAllSheets(inputStream, keyWord);
                if (example.getFilter()) {
                    filter = true;
                }
            }
        } catch (Throwable e) {
            System.out.println("有问题!!!");
            throw e;
        }
    }

注:过滤Excel的时候直接获取上面写好的类就行了,过滤Word的时候直接用的.getParagraphText()方法,因为以前用的.getText()方法,所以内存溢出了,但是还有一种情况,就是InputStream 过大会导致在读.docx,也就是07、10版本的时候,因为要用XWPFDocument去解析这个文件,因此会内存溢出。但是,还有一种解决办法因为07以后都可以看作是一种压缩形式,可以用解压软件解压出来(如下图所示)。因此可以用ZipFile读到它然后获取document.xml去解析xml就可以获取到内容了,具体的代码CSDN上可以搜得到
这是解压之后的效果,点击打卡word文件夹

打开Document.xml这里面显示的就是Word的文本内容

3. 参考资料

https://blog.csdn.net/weixin_44259233/article/details/109613292
注:这个是对 Excel 03、07、10都生效的一篇个人觉得比较nice的文章

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值