java解析20万Excel经历

    Excel表中,有少量数据时使用Workbook解析没有问题,但在遇到海量数据时就会崩溃,查阅资料结合别人的代码自己也写了一个实例。

先看结论再看代码

    1.优点:灵活,如有多个不同的Excel表,可以写多个ExcelXXXReader,例如我有三个表,Coupon,Qa,Order,那我就对应ExcelCouponReader,ExcelQaReader,ExcelOrderReader,再新增类型表格,只需要对应新增Reader就可以了。

    2 缺点:解析速度慢,对比别人写的工具类,100W,1000W的数据,只需要10S,20S,抱歉我20个字段,ThinkPad,20W的数据需要80s。

    3 想省事的话,第一第二段代码直接粘贴复制,将其中的实体类替换成自己的就可以了。

代码部分:工具分为两个部分,一个是对读取Excel表格,另一个是将Excel中的字段一一解析到对应的实体类中。

 

第一部分,读取:

import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * package 
 * Description:用于解析大数据量的EXCEL文件工具类,可以解析少量的emoji表情的,如有昵称,建议将该字段删除
 * Date:
 * Author: 
 **/
public abstract  class XxlsAbstractReader extends DefaultHandler  {

    private static final Logger logger = Logger.getLogger(XxlsAbstractReader.class);

    private SharedStringsTable sst;
    private String lastContents;
    private boolean nextIsString;

    private int sheetIndex = -1;
    private List<Object> rowlist = new ArrayList<Object>();
    private int curRow = 0; // 当前行
    private int curCol = 0; // 当前列索引
    private int preCol = 0; // 上一列列索引
    private int titleRow = 0; // 标题行,一般情况下为0
    private int rowsize = 0; // 列数

    // excel记录行操作方法,以sheet索引,行索引和行元素列表为参数,对sheet的一行元素进行操作,元素为String类型
    public abstract void optRows(int sheetIndex, int curRow, List<Object> rowlist) throws SQLException;

    // 只遍历一个sheet,其中sheetId为要遍历的sheet索引,从1开始,1-3

    /**
     * @param filename
     * @param sheetId sheetId为要遍历的sheet索引,从1开始,1-3
     * @throws Exception
     */
    public void processOneSheet(String filename, int sheetId) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        XSSFReader r = new XSSFReader(pkg);
        SharedStringsTable sst = r.getSharedStringsTable();

        XMLReader parser = fetchSheetParser(sst);

        // rId2 found by processing the Workbook
        // 根据 rId# 或 rSheet# 查找sheet
        InputStream sheet2 = r.getSheet("rId" + sheetId);
        sheetIndex++;
        InputSource sheetSource = new InputSource(sheet2);
        parser.parse(sheetSource);
        sheet2.close();
    }

    /**
     * 遍历 excel 文件
     */
    public void process(String filename) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        XSSFReader r = new XSSFReader(pkg);
        SharedStringsTable sst = r.getSharedStringsTable();

        XMLReader parser = fetchSheetParser(sst);

        Iterator<InputStream> sheets = r.getSheetsData();
        while (sheets.hasNext()) {
            curRow = 0;
            sheetIndex++;
            InputStream sheet = sheets.next();
            InputSource sheetSource = new InputSource(sheet);
            parser.parse(sheetSource);
            sheet.close();
        }
    }

    /**
     * 遍历 excel 文件
     */
    public void process(InputStream in) throws Exception {
        OPCPackage pkg = OPCPackage.open(in);
        XSSFReader r = new XSSFReader(pkg);
        SharedStringsTable sst = r.getSharedStringsTable();

        XMLReader parser = fetchSheetParser(sst);

        Iterator<InputStream> sheets = r.getSheetsData();
        while (sheets.hasNext()) {
            curRow = 0;
            sheetIndex++;
            InputStream sheet = sheets.next();
            InputSource sheetSource = new InputSource(sheet);
            parser.parse(sheetSource);
            sheet.close();
        }
    }

    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser = XMLReaderFactory.createXMLReader();
        // .createXMLReader("org.apache.xerces.parsers.SAXParser");
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        // c => 单元格
        if ("c".equals(name)) {
            // 如果下一个元素是 SST 的索引,则将nextIsString标记为true
            String cellType = attributes.getValue("t");
            String rowStr = attributes.getValue("r");
            curCol = this.getRowIndex(rowStr);
            if (cellType != null && "s".equals(cellType)) {
                nextIsString = true;
            } else {
                nextIsString = false;
            }
        }
        // 置空
        lastContents = "";
    }

    public void endElement(String uri, String localName, String name) throws SAXException {
        // 根据SST的索引值的到单元格的真正要存储的字符串
        // 这时characters()方法可能会被调用多次
        if (nextIsString) {
            try {
                int idx = Integer.parseInt(lastContents);
                lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
            } catch (Exception e) {

            }
        }
        // v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引
        // 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
        if ("v".equals(name)) {
            String value = lastContents.trim();
            value = "".equals(value) ? " " : value;
            int cols = curCol - preCol;
            if (cols > 1) {
                for (int i = 0; i < cols - 1; i++) {
                    rowlist.add(preCol, "");
                }
            }
            preCol = curCol;
            rowlist.add(curCol - 1, value);
        } else {
            // 如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法
            if ("row".equals(name)) {
                int tmpCols = rowlist.size();
                if (tmpCols > 0) {
                    if (curRow > this.titleRow && tmpCols < this.rowsize) {
                        for (int i = 0; i < this.rowsize - tmpCols; i++) {
                            rowlist.add(rowlist.size(), "");
                        }
                    }
                    try {
                        optRows(sheetIndex, curRow, rowlist);
                    } catch (SQLException e) {
                        logger.error("endElement error", e);
                    }
                } else {// 跳过空白行
                    logger.info("jump blank row ,curRow:" + (curRow + 1));
                }
                if (curRow == this.titleRow) {
                    this.rowsize = rowlist.size();
                    if (this.rowsize == 0) {
                        throw new RuntimeException("excel 未被编辑或者内容为空");
                    }
                }

                rowlist.clear();
                curRow++;
                curCol = 0;
                preCol = 0;
            }
        }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
        // 得到单元格内容的值
        lastContents += new String(ch, start, length);
    }

    // 得到列索引,每一列c元素的r属性构成为字母加数字的形式,字母组合为列索引,数字组合为行索引,
    // 如AB45,表示为第(A-A+1)*26+(B-A+1)*26列,45行
    public int getRowIndex(String rowStr) {
        rowStr = rowStr.replaceAll("[^A-Z]", "");
        byte[] rowAbc = rowStr.getBytes();
        int len = rowAbc.length;
        float num = 0;
        for (int i = 0; i < len; i++) {
            num += (rowAbc[i] - 'A' + 1) * Math.pow(26, len - i - 1);
        }
        return (int) num;
    }

    public int getTitleRow() {
        return titleRow;
    }

    public void setTitleRow(int titleRow) {
        this.titleRow = titleRow;
    }

    public int getSheetIndex() {
        return sheetIndex;
    }


}

第二部分

import com.XXX.common.context.util.UuidUtil;
import com.XXX.customized.mmt.uploadexcel.model.ExcelCoupon;
import org.apache.commons.lang.time.DateUtils;

import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Paceage:
 * Description:使用工具类对EXCEL文件进行解析并映射到实体类中
 * Date:
 * Author: 
 **/
public class ExcelCouponReader extends XxlsAbstractReader {
    private List<ExcelCoupon> voList = new ArrayList<>();

    public ExcelCouponReader() {
    }

    @Override
    public void optRows(int sheetIndex, int curRow, List<Object> rowlist) throws SQLException {
        if (curRow == 0) {
            return;
        }
        
        // 将要解析的实体类
        ExcelCoupon excelCoupon = new ExcelCoupon();
        excelCoupon.setNo(String.valueOf(rowlist.get(0)));
        excelCoupon.setOpenid(rowlist.get(1).toString());
        excelCoupon.setWID(rowlist.get(2).toString());
        excelCoupon.setCouponCode(rowlist.get(3).toString());
        excelCoupon.setStatus(rowlist.get(4).toString());
        excelCoupon.setReceivedTime(numToString(rowlist.get(5).toString()));
        excelCoupon.setRedeemedTime(numToString(rowlist.get(6).toString()));
        excelCoupon.setShopName(rowlist.get(7).toString());
        excelCoupon.setTitle(rowlist.get(8).toString());
        excelCoupon.setDetail(rowlist.get(9).toString());
        excelCoupon.setUseTips(rowlist.get(10).toString());
        excelCoupon.setStartTime(numToString(rowlist.get(11).toString()));
        excelCoupon.setEndTime(numToString(rowlist.get(12).toString()));
        excelCoupon.setCardId(rowlist.get(13).toString());
        excelCoupon.setLanguage(rowlist.get(14).toString());
        excelCoupon.setUid(uid);

        voList.add(excelCoupon);
    }

    public List<ExcelCoupon> getExcelCouponList() {
        return voList;
    }

    /**
     * 日期类型字段会被解析成1900至今的毫秒数,自己写了一个工具,将毫秒数转换为日期,小时和分钟会丢失精度
     * @param dataString
     * @return
     */
    public static String numToString(String dataString) {
        if ("" != dataString && null != dataString) {
            String str = null;
            if (!dataString.contains(".")) {
                Calendar calendar = new GregorianCalendar(1900,0,-1);
                Date d1 = calendar.getTime();
                Date d2 = DateUtils.addDays(d1,Integer.valueOf(dataString));
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm");
                str = sdf.format(d2);
            }else {
                String string = dataString.substring(0,dataString.lastIndexOf("."));
                Calendar calendar = new GregorianCalendar(1900,0,-1);
                Date d1 = calendar.getTime();
                Date d2 = DateUtils.addDays(d1,Integer.valueOf(string));
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm");
                str = sdf.format(d2);
            }
            return str;
        }
        return "";
    }
}

使用:

public static void main(String[] args) {
    try {
        File file = new File("D:test.xlsx");
        InputStream in = new FileInputStream(file);
        ExcelCouponReader excelCouponReader = new ExcelCouponReader();
        excelCouponReader.process(in);
        //解析到对应的实体类
        List<ExcelCoupon> excelCouponList = excelCouponReader.getExcelCouponList();
    } catch (Exception e) {
        e.printStackTrace();
    }

}
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值