poi通过事件模式sax解析大excel文件,防止内存溢出

1.解析类

package com.cetc.ExcelPoi;

import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;

import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Created by shea on 2018/10/12.
 */
public class BigExcelParse extends DefaultHandler {
    enum xssfDataType {
        BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER,
    }
    /**
     * Table with styles
     */
    private StylesTable stylesTable;

    //取SST 的索引对应的值
    private SharedStringsTable sharedStringsTable;
    /**
     * 最大列数
     */
    private final int minColumnCount;

    //单元格内容是SST 的索引
    private boolean vIsOpen;

    private xssfDataType nextDataType;

    private short formatIndex;
    private String formatString;
    private final DataFormatter formatter;

    private int thisColumn = -1;
    // The last column printed to the output stream
    private int lastColumnNumber = -1;

    // Gathers characters as they are seen.
    private StringBuffer value;
    private String[] record;//封装每一行的结果
    private List<String[]> rows = new ArrayList<String[]>();//封装结果
    private boolean isCellNull = false;
    private SimpleDateFormat sdf = null;
    private static DecimalFormat df = new DecimalFormat("###########");


    //构造器
    public BigExcelParse(StylesTable styles,
                              SharedStringsTable strings, int cols) {
        this.stylesTable = styles;
        this.sharedStringsTable = strings;
        this.minColumnCount = cols;
        this.value = new StringBuffer();
        this.nextDataType = xssfDataType.NUMBER;
        this.formatter = new DataFormatter();
        record = new String[this.minColumnCount];
        rows.clear();// 每次读取都清空行集合
    }

    public void startElement(String uri, String localName, String name,
                             Attributes attributes) throws SAXException {

        if ("inlineStr".equals(name) || "v".equals(name)) {
            vIsOpen = true;
            // Clear contents cache
            value.setLength(0);
        }
        // c => cell
        else if ("c".equals(name)) {
            // Get the cell reference
            String r = attributes.getValue("r");
            int firstDigit = -1;
            for (int c = 0; c < r.length(); ++c) {
                if (Character.isDigit(r.charAt(c))) {
                    firstDigit = c;
                    break;
                }
            }
            thisColumn = nameToColumn(r.substring(0, firstDigit));

            // Set up defaults.
            this.nextDataType = xssfDataType.NUMBER;
            this.formatIndex = -1;
            this.formatString = null;
            String cellType = attributes.getValue("t");
            String cellStyleStr = attributes.getValue("s");
            if ("b".equals(cellType))
                nextDataType = xssfDataType.BOOL;
            else if ("e".equals(cellType))
                nextDataType = xssfDataType.ERROR;
            else if ("inlineStr".equals(cellType))
                nextDataType = xssfDataType.INLINESTR;
            else if ("s".equals(cellType))
                nextDataType = xssfDataType.SSTINDEX;
            else if ("str".equals(cellType))
                nextDataType = xssfDataType.FORMULA;
            else if (cellStyleStr != null) {
                // It's a number, but almost certainly one
                // with a special style or format
                int styleIndex = Integer.parseInt(cellStyleStr);
                XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
                this.formatIndex = style.getDataFormat();
                this.formatString = style.getDataFormatString();
                if (this.formatString == null)
                    this.formatString = BuiltinFormats
                            .getBuiltinFormat(this.formatIndex);
            }
        }

    }

    public void endElement(String uri, String localName, String name)
            throws SAXException {

        String thisStr = null;

        // v => contents of a cell
        //单元格内容标签结束,characters方法会被调用处理内容
        if ("v".equals(name)) {
            // Process the value contents as required.
            // Do now, as characters() may be called more than once
            switch (nextDataType) {

                case BOOL:
                    char first = value.charAt(0);
                    thisStr = first == '0' ? "FALSE" : "TRUE";
                    break;

                case ERROR:
                    thisStr = "\"ERROR:" + value.toString() + '"';
                    break;

                case FORMULA:
                    // A formula could result in a string value,
                    // so always add double-quote characters.
                    thisStr = value.toString();
                    break;

                case INLINESTR:
                    // TODO: have seen an example of this, so it's untested.
                    XSSFRichTextString rtsi = new XSSFRichTextString(
                            value.toString());
                    thisStr =rtsi.toString();
                    break;

                case SSTINDEX:
                    String sstIndex = value.toString();
                    try {
                        int idx = Integer.parseInt(sstIndex);
                        XSSFRichTextString rtss = new XSSFRichTextString(
                                sharedStringsTable.getEntryAt(idx));
                        thisStr = rtss.toString();
                    } catch (NumberFormatException ex) {
                        System.out.println("Failed to parse SST index '" + sstIndex
                                + "': " + ex.toString());
                    }
                    break;

                case NUMBER:
                    String n = value.toString();
                    // 判断是否是日期格式
                    if (formatIndex == 14 || formatIndex == 31 || formatIndex == 57 || formatIndex == 58
                            || (176<=formatIndex && formatIndex<=178) || (182<=formatIndex && formatIndex<=196)
                            || (210<=formatIndex && formatIndex<=213) || (208==formatIndex ) ) {// 日期
                        sdf = new SimpleDateFormat("yyyy-MM-dd");
                        Date date = org.apache.poi.ss.usermodel.DateUtil.getJavaDate(Double.parseDouble(n));
                        thisStr=sdf.format(date);
                    } else if (formatIndex == 20 || formatIndex == 32 || formatIndex==183 || (200<=formatIndex && formatIndex<=209)) {//时间
                        sdf = new SimpleDateFormat("HH:mm");
                        Date date = org.apache.poi.ss.usermodel.DateUtil.getJavaDate(Double.parseDouble(n));
                        thisStr = sdf.format(date);
                    }
                    else{
                         if(n.contains("E")){//科学计数法
                             String[] split = n.split("\\+");
                             String e = split[0].replaceAll("E|e", "");
                             thisStr=e.replace(".","");
                        }else {
                             thisStr = n;
                         }
                    }
                    break;
                default:
                    thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
                    break;
            }
            if (lastColumnNumber == -1) {
                lastColumnNumber = 0;
            }
            //判断单元格的值是否为空
            if (thisStr == null || "".equals(isCellNull)) {
                isCellNull = true;// 设置单元格是否为空值
            }
            record[thisColumn] = thisStr;
            // Update column
            if (thisColumn > -1)
                lastColumnNumber = thisColumn;
            //行结束,存储一行数据
        } else if ("row".equals(name)) {
            // Print out any missing commas if needed
            if (minColumnCount > 0) {
                // Columns are 0 based
                if (lastColumnNumber == -1) {
                    lastColumnNumber = 0;
                }
                // 判断是否空行
                if(record!=null &&record.length!=0){
                    rows.add(record.clone());
                    isCellNull = false;
                    for (int i = 0; i < record.length; i++) {
                        record[i] = null;
                    }
                }
            }
            lastColumnNumber = -1;
        }

    }

    public List<String[]> getRows() {
        return rows;
    }

    public void setRows(List<String[]> rows) {
        this.rows = rows;
    }

    public void characters(char[] ch, int start, int length)
            throws SAXException {
        if (vIsOpen)
            value.append(ch, start, length);
    }

    private int nameToColumn(String name) {
        int column = -1;
        for (int i = 0; i < name.length(); ++i) {
            int c = name.charAt(i);
            column = (column + 1) * 26 + c - 'A';
        }
        return column;
    }
}

2.调用类

package com.cetc.ExcelPoi;

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.*;
import java.util.HashMap;
import java.util.List;

/**
 * Created by shea on 2018/10/12.
 */
public class TestBigExcelParse {
    private OPCPackage xlsxPackage;
    private int minColumns;
    private PrintStream output;
    private String sheetName;
    //构造器
    public TestBigExcelParse(OPCPackage pkg, PrintStream output,
                               String sheetName, int minColumns) {
        this.xlsxPackage = pkg;
        this.output = output;
        this.minColumns = minColumns;
        this.sheetName = sheetName;
    }

    /**
     *
     * @param path----文件路径
     * @param sheetName---解析的sheet名称
     * @param minColumns---excel的最大列数
     * @return
     * @throws Exception
     */
    public static HashMap<String, Object> parseSheet(String path,String sheetName,int minColumns)throws Exception{
        //文件地址
        OPCPackage pkg = OPCPackage.open(path,PackageAccess.READ);
        XSSFReader r = new XSSFReader( pkg );
        //解析的sheet名称
        // InputStream rId1 = r.getSheet("rId1");//第一个sheet表
        XSSFReader.SheetIterator sheetsData = (XSSFReader.SheetIterator)r.getSheetsData();
        SharedStringsTable sst = r.getSharedStringsTable();
        StylesTable styles = r.getStylesTable();
        XMLReader parser = XMLReaderFactory.createXMLReader();
        BigExcelParse handler = new BigExcelParse(styles, sst, minColumns);
        parser.setContentHandler(handler);
        //遍历---获取指定的sheet名称
        HashMap<String, Object> map = new HashMap<String, Object>();
        while (sheetsData.hasNext()){
            InputStream in = sheetsData.next();
            if(sheetName.equals(sheetsData.getSheetName())){
                InputSource inputSource = new InputSource(in);
                parser.parse(inputSource);
                List<String[]> rows = handler.getRows();//返回所有的封装结果
                map.put("success", true);
                map.put("msg","解析完成!");
                map.put("rows",rows);
                in.close();
            }
        }
        if(map.isEmpty()){
            map.put("success", false);
            map.put("msg","解析失败,没有找到相应的sheet表!");
        }
        return map;
    }

    /**
     * 测试方法调用
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception{
        HashMap<String,Object> res = parseSheet("C:\\Users\\shea\\Desktop\\测试2.xlsx", "Sheet1", 285);
        System.out.println((String)res.get("msg"));
        List<String[]> rows = (List<String[]>)res.get("rows");
        BufferedWriter out = new BufferedWriter(new FileWriter("C:\\Users\\shea\\Desktop\\测试_bigExcel.txt"));
        for (String[] row:rows
             ) {
            for (String cell:row
                 ) {
                out.write(cell+"\t");
            }
           out.newLine();
        }
        out.flush();
        out.close();
    }
}

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
在使用POI读取Excel文件时,如果遇到大量的空行,可能会导致内存溢出的问题。这是因为POI在读取Excel文件时会将整个文件的内容全部加载到内存中,而空行并没有实际的数据内容,但仍然会占用内存空间。 解决这个问题的方法有以下几种: 1. 使用逐行读取的方式:通过POI提供的API,可以逐行读取Excel文件的内容,而不是将整个文件加载到内存中。这样可以有效地避免空行占用过多的内存空间。 2. 添加筛选条件:在读取Excel文件时,可以添加筛选条件,只读取有效的数据行,而忽略空行。可以通过判断某一行是否为空行的方式,进行过滤。 3. 设置最大行数限制:可以设置最大行数的限制,当达到设定的最大行数时,停止继续读取Excel文件。这样可以避免读取过多的空行,从而减少内存占用。 4. 对大文件进行分块处理:如果Excel文件太大,无法完全加载到内存中,可以将文件进行分块处理,每次读取一部分内容,然后进行处理,这样可以避免一次性读取整个文件导致内存溢出。 需要注意的是,以上方法仅仅是针对空行导致的内存溢出问题,如果Excel文件本身非常大,仍然可能会出现内存溢出的情况。为了避免此类问题,可以考虑采取其他的解决方案,例如使用数据库进行存储和查询,或者使用分布式处理来处理大规模的数据。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值