读海量excel数据(无OOM)

1.声明:代码来自这位朋友

添加链接描述

2.读取excel分为2003版本(a.xls)和2007版本(a.xlsx)

3.所有的类文件如下(共4个)

在这里插入图片描述

4.依赖的包如下(共3个)

在这里插入图片描述
在这里插入图片描述

5.Excel2003Reader.java

package com.aisino.myapplication;

import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BlankRecord;
import org.apache.poi.hssf.record.BoolErrRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
     * 抽象Excel2003读取器,通过实现HSSFListener监听器,采用事件驱动模式解析excel2003 
     * 中的内容,遇到特定事件才会触发,大大减少了内存的使用。 
     * 
     */  
    public  class Excel2003Reader implements HSSFListener {
        private int minColumns = -1;  
        private POIFSFileSystem fs;
        private int lastRowNumber;  
        private int lastColumnNumber;  
      
        /** Should we output the formula, or the value it has? */  
        private boolean outputFormulaValues = true;  
      
        /** For parsing Formulas */  
        private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
        //excel2003工作薄  
        private HSSFWorkbook stubWorkbook;
      
        // Records we pick up as we process  
        private SSTRecord sstRecord;
        private FormatTrackingHSSFListener formatListener;
      
        //表索引  
        private int sheetIndex = -1;  
        private BoundSheetRecord[] orderedBSRs;
        @SuppressWarnings("unchecked")  
        private ArrayList boundSheetRecords = new ArrayList();
      
        // For handling formulas with string results  
        private int nextRow;  
        private int nextColumn;  
        private boolean outputNextStringRecord;  
        //当前行  
        private int curRow = 0;  
        //存储行记录的容器  
        private List<String> rowlist = new ArrayList<String>();;
        @SuppressWarnings( "unused")  
        private String sheetName;  
          
        private IRowReader rowReader;  
      
          
        public void setRowReader(IRowReader rowReader){  
            this.rowReader = rowReader;  
        }  
          
        /** 
         * 遍历excel下所有的sheet 
         * @throws IOException 
         */  
        public void process(String fileName) throws IOException {
            this.fs = new POIFSFileSystem(new FileInputStream(fileName));
            MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(
                    this);  
            formatListener = new FormatTrackingHSSFListener(listener);  
            HSSFEventFactory factory = new HSSFEventFactory();
            HSSFRequest request = new HSSFRequest();
            if (outputFormulaValues) {  
                request.addListenerForAllRecords(formatListener);  
            } else {  
                workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(
                        formatListener);  
                request.addListenerForAllRecords(workbookBuildingListener);  
            }  
            factory.processWorkbookEvents(request, fs);  
        }  
          
        /** 
         * HSSFListener 监听方法,处理 Record 
         */  
        @SuppressWarnings("unchecked")  
        public void processRecord(Record record) {
            int thisRow = -1;  
            int thisColumn = -1;  
            String thisStr = null;  
            String value = null;  
            switch (record.getSid()) {  
                case BoundSheetRecord.sid:  
                    boundSheetRecords.add(record);  
                    break;  
                case BOFRecord.sid:
                    BOFRecord br = (BOFRecord) record;  
                    if (br.getType() == BOFRecord.TYPE_WORKSHEET) {  
                        // 如果有需要,则建立子工作薄  
                        if (workbookBuildingListener != null && stubWorkbook == null) {  
                            stubWorkbook = workbookBuildingListener  
                                    .getStubHSSFWorkbook();  
                        }  
                          
                        sheetIndex++;  
                        if (orderedBSRs == null) {  
                            orderedBSRs = BoundSheetRecord  
                                    .orderByBofPosition(boundSheetRecords);  
                        }  
                        sheetName = orderedBSRs[sheetIndex].getSheetname();  
                    }  
                    break;  
          
                case SSTRecord.sid:  
                    sstRecord = (SSTRecord) record;  
                    break;  
          
                case BlankRecord.sid:
                    BlankRecord brec = (BlankRecord) record;  
                    thisRow = brec.getRow();  
                    thisColumn = brec.getColumn();  
                    thisStr = "";  
                    rowlist.add(thisColumn, thisStr);  
                    break;  
                case BoolErrRecord.sid: //单元格为布尔类型
                    BoolErrRecord berec = (BoolErrRecord) record;  
                    thisRow = berec.getRow();  
                    thisColumn = berec.getColumn();  
                    thisStr = berec.getBooleanValue()+"";  
                    rowlist.add(thisColumn, thisStr);  
                    break;  
          
                case FormulaRecord.sid: //单元格为公式类型
                    FormulaRecord frec = (FormulaRecord) record;  
                    thisRow = frec.getRow();  
                    thisColumn = frec.getColumn();  
                    if (outputFormulaValues) {  
                        if (Double.isNaN(frec.getValue())) {  
                            // Formula result is a string  
                            // This is stored in the next record  
                            outputNextStringRecord = true;  
                            nextRow = frec.getRow();  
                            nextColumn = frec.getColumn();  
                        } else {  
                            thisStr = formatListener.formatNumberDateCell(frec);  
                        }  
                    } else {  
                        thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,
                                frec.getParsedExpression()) + '"';  
                    }  
                    rowlist.add(thisColumn,thisStr);  
                    break;  
                case StringRecord.sid://单元格中公式的字符串
                    if (outputNextStringRecord) {  
                        // String for formula  
                        StringRecord srec = (StringRecord) record;  
                        thisStr = srec.getString();  
                        thisRow = nextRow;  
                        thisColumn = nextColumn;  
                        outputNextStringRecord = false;  
                    }  
                    break;  
                case LabelRecord.sid:
                    LabelRecord lrec = (LabelRecord) record;  
                    curRow = thisRow = lrec.getRow();  
                    thisColumn = lrec.getColumn();  
                    value = lrec.getValue().trim();  
                    value = value.equals("")?" ":value;  
                    this.rowlist.add(thisColumn, value);  
                    break;  
                case LabelSSTRecord.sid:  //单元格为字符串类型
                    LabelSSTRecord lsrec = (LabelSSTRecord) record;  
                    curRow = thisRow = lsrec.getRow();  
                    thisColumn = lsrec.getColumn();  
                    if (sstRecord == null) {  
                        rowlist.add(thisColumn, " ");  
                    } else {  
                        value =  sstRecord  
                        .getString(lsrec.getSSTIndex()).toString().trim();  
                        value = value.equals("")?" ":value;  
                        rowlist.add(thisColumn,value);  
                    }  
                    break;  
                case NumberRecord.sid:  //单元格为数字类型
                    NumberRecord numrec = (NumberRecord) record;  
                    curRow = thisRow = numrec.getRow();  
                    thisColumn = numrec.getColumn();  
                    value = formatListener.formatNumberDateCell(numrec).trim();  
                    value = value.equals("")?" ":value;  
                    // 向容器加入列值  
                    rowlist.add(thisColumn, value);  
                    break;  
                default:  
                    break;  
            }  
      
            // 遇到新行的操作  
            if (thisRow != -1 && thisRow != lastRowNumber) {  
                lastColumnNumber = -1;  
            }  
      
            // 空值的操作  
            if (record instanceof MissingCellDummyRecord) {
                MissingCellDummyRecord mc = (MissingCellDummyRecord) record;  
                curRow = thisRow = mc.getRow();  
                thisColumn = mc.getColumn();  
                rowlist.add(thisColumn," ");  
            }  
      
            // 更新行和列的值  
            if (thisRow > -1)  
                lastRowNumber = thisRow;  
            if (thisColumn > -1)  
                lastColumnNumber = thisColumn;  
      
            // 行结束时的操作  
            if (record instanceof LastCellOfRowDummyRecord) {
                if (minColumns > 0) {  
                    // 列值重新置空  
                    if (lastColumnNumber == -1) {  
                        lastColumnNumber = 0;  
                    }  
                }  
                lastColumnNumber = -1;  
                    // 每行结束时, 调用getRows() 方法  
                rowReader.getRows(sheetIndex,curRow, rowlist);  
                  
                // 清空容器  
                rowlist.clear();  
            }  
        }  
          
    }

6.Excel2007Reader.java

package com.aisino.myapplication;

import android.util.Log;

import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.InputStream;
import java.math.BigDecimal;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

/**
 * 抽象Excel2007读取器,excel2007的底层数据结构是xml文件,采用SAX的事件驱动的方法解析
 * xml,需要继承DefaultHandler,在遇到文件内容时,事件会触发,这种做法可以大大降低
 * 内存的耗费,特别使用于大数据量的文件。
 * 注意Excel2007Reader类中引用的Attributes在org.xml.sal包下,Date在java.util包下
 */
public class Excel2007Reader extends DefaultHandler {
 private static final String TAG = Excel2007Reader.class.getSimpleName();
 //共享字符串表
 private SharedStringsTable sst;
 //上一次的内容
 private String lastContents;
 private boolean nextIsString;

 private int sheetIndex = -1;
 private List<String> rowlist = new ArrayList<String>();
 //当前行
 private int curRow = 0;
 //当前列
 private int curCol = 0;
 //日期标志
 private boolean dateFlag;
 //数字标志
 private boolean numberFlag;
 
 private boolean isTElement;
 
 private IRowReader rowReader;
 
 public void setRowReader(IRowReader rowReader){
  this.rowReader = rowReader;
 }
 
 /**只遍历一个电子表格,其中sheetId为要遍历的sheet索引,从1开始,1-3
  * @param filename
  * @param sheetId
  * @throws Exception
  */
 public void processOneSheet(String filename,int sheetId) throws Exception {
  OPCPackage pkg = OPCPackage.open(filename);
  XSSFReader r = new XSSFReader(pkg);
  SharedStringsTable sst = r.getSharedStringsTable();
  XMLReader parser = fetchSheetParser(sst);
  
  // 根据 rId# 或 rSheet# 查找sheet
  InputStream sheet2 = r.getSheet("rId"+sheetId);
  sheetIndex++;
  InputSource sheetSource = new InputSource(sheet2);
  parser.parse(sheetSource);
  sheet2.close();
 }

 /**
  * 遍历工作簿中所有的电子表格
  * @param filename
  * @throws Exception
  */
 public void process(String filename) throws Exception {
  OPCPackage pkg = OPCPackage.open(filename);
  XSSFReader r = new XSSFReader(pkg);
  SharedStringsTable sst = r.getSharedStringsTable();
  XMLReader parser = fetchSheetParser(sst);
  Iterator<InputStream> sheets = r.getSheetsData();
  while (sheets.hasNext()) {
   curRow = 0;
   sheetIndex++;
   InputStream sheet = sheets.next();
   InputSource sheetSource = new InputSource(sheet);
   parser.parse(sheetSource);
   sheet.close();
  }
 }

 public XMLReader fetchSheetParser(SharedStringsTable sst)
   throws SAXException {
  XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
  this.sst = sst;
  parser.setContentHandler(this);
  return parser;
 }

 public void startElement(String uri, String localName, String name,
   Attributes attributes) throws SAXException {
  
  // c => 单元格
  if ("c".equals(name)) {
   // 如果下一个元素是 SST 的索引,则将nextIsString标记为true
   String cellType = attributes.getValue("t");
   if ("s".equals(cellType)) {
    nextIsString = true;
   } else {
    nextIsString = false;
   }
   //日期格式
   String cellDateType = attributes.getValue("s");
   if ("1".equals(cellDateType)){
    dateFlag = true;
   } else {
    dateFlag = false;
   }
   String cellNumberType = attributes.getValue("s");
   if("2".equals(cellNumberType)){
    numberFlag = true;
   } else {
    numberFlag = false;
   }
   
  }
  //当元素为t时
  if("t".equals(name)){
   isTElement = true;
  } else {
   isTElement = false;
  }
  
  // 置空
  lastContents = "";
 }

 public void endElement(String uri, String localName, String name)
   throws SAXException {
  
  // 根据SST的索引值的到单元格的真正要存储的字符串
  // 这时characters()方法可能会被调用多次
  if (nextIsString) {
   try {
    int idx = Integer.parseInt(lastContents);
    lastContents = new XSSFRichTextString(sst.getEntryAt(idx))
      .toString();
   } catch (Exception e) {

   }
  }
  //t元素也包含字符串
  if(isTElement){
   String value = lastContents.trim();
   rowlist.add(curCol, value);
   curCol++;
   isTElement = false;
   // v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引
   // 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
  } else if ("v".equals(name)) {
   String value = lastContents.trim();
   value = value.equals("")?" ":value;
   //日期格式处理
   if(dateFlag){
     Date date = HSSFDateUtil.getJavaDate(Double.valueOf(value));
     SimpleDateFormat dateFormat = new SimpleDateFormat(
                 "dd/MM/yyyy");
     value = dateFormat.format(date);
   }
   //数字类型处理
   if(numberFlag){
    BigDecimal bd = new BigDecimal(value);
    value = bd.setScale(3,BigDecimal.ROUND_UP).toString();
   }
   rowlist.add(curCol, value);
   curCol++;
  }else {
   //如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法
   if (name.equals("row")) {
    rowReader.getRows(sheetIndex,curRow,rowlist);
    rowlist.clear();
    curRow++;
    curCol = 0;
   }
  }
  
 }

 public void characters(char[] ch, int start, int length)
   throws SAXException {
  //得到单元格内容的值
  lastContents += new String(ch, start, length);
 }
}

7.IRowReader.java

package com.aisino.myapplication;

import java.util.List;

public interface IRowReader {

    /**
     * Receives the cell values of one row; implementations put their business logic here.
     *
     * <p>The readers in this file clear the list right after this call returns, so
     * copy it if the values must be kept.
     *
     * @param sheetIndex sheet counter maintained by the calling reader
     * @param curRow     row number maintained by the calling reader
     * @param rowlist    cell values of the row
     */
    void getRows(int sheetIndex, int curRow, List<String> rowlist);
}


8.RowReader.java

package com.aisino.myapplication;

import java.util.ArrayList;
import java.util.List;

public class RowReader implements IRowReader{


 /* 业务逻辑实现方法
  * @see com.eprosun.util.excel.IRowReader#getRows(int, int, java.util.List)
  */
 private List<String> lst = new ArrayList<>();
 public void getRows(int sheetIndex, int curRow, List<String> rowlist) {
  // TODO Auto-generated method stub
  System.out.print(curRow+" ");
  for (int i = 0; i < rowlist.size(); i++) {
   System.out.println("rowReader:"+rowlist.get(i) + "  ");
  }
  System.out.println("totalMemory="+Runtime.getRuntime().totalMemory()/1024/1024);
  System.out.println();
 }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
目前处理Excel的开源javaAPI主要有两种,一是Jxl(Java Excel API),Jxl只支持Excel2003以下的版本。另外一种是Apache的Jakarta POI,相比于Jxl,POI对微软办公文档的支持更加强大,但是它使用复杂,上手慢。POI可支持更高的Excel版本2007。对Excel的读取,POI有两种模式,一是用户模式,这种方式同Jxl的使用很类似,使用简单,都是将文件一次性读入内存,文件小的时候,没有什么问题,当文件大的时候,就会出现OutOfMemory的内存溢出问题。第二种是事件驱动模式,拿Excel2007来说,其内容采用XML的格式来存储,所以处理excel就是解析XML,而目前使用事件驱动模式解析XML的API是SAX(Simple API for XML),这种模型在读取XML文档时,并没有将整个文档读入内存,而是按顺序将整个文档解析完,在解析过程中,会主动产生事件交给程序中相应的处理函数来处理当前内容。因此这种方式对系统资源要求不高,可以处理海量数据。笔者曾经做过测试,这种方法处理一千万条,每条五列的数据花费大约11分钟。可见处理海量数据的文件事件驱动是一个很好的方式。而本文中用到的AbstractExcel2003Reader、AbstractExcel2007Reader对Excel的读取都是采用这种POI的事件驱动模式。至于Excel的写操作,对较高版本的Excel2007,POI提供了很好的支持,主要流程是第一步构建工作薄和电子表格对象,第二步在一个流中构建文本文件,第三步使用流中产生的数据替换模板中的电子表格。这种方式也可以处理海量数据文件。AbstractExcel2007Writer就是使用这种方式进行写操作。对于写入较低版本的Excel2003,POI使用了用户模式来处理,就是将整个文档加载进内存,如果数据量大的话就会出现内存溢出的问题,Excel2003Writer就是使用这种方式。据笔者的测试,如果数据量大于3万条,每条8列的话,就会报OutOfMemory的错误。Excel2003中每个电子表格的记录数必须在65536以下,否则就会发生异常。目前还没有好的解决方案,建议对于海量数据写入操作,尽量使用Excel2007。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值