excel超级解析之Poi-SAX解析工具

最新推荐文章于 2023-03-28 09:56:22 发布

为梦想。

最新推荐文章于 2023-03-28 09:56:22 发布

阅读量1.6k

点赞数

分类专栏：问题记录

本文链接：https://blog.csdn.net/yhmliunan/article/details/108964774

版权

问题记录专栏收录该内容

3 篇文章 1 订阅

订阅专栏

导入 Excel 文件时，如果需要解析的数据量很大，用传统方式（把整个工作簿加载进内存后再逐行读取）会非常缓慢，还容易内存溢出；xlsx 格式的 excel 文件尤其如此，可能有上百万行的数据量。
xlsx 文件本质上是一个包含若干 xml 的 zip 包。Poi 的 Sax（事件模式）解析不会把整个工作簿加载进内存，而是以流的方式依次读取 sheet 的 xml 节点并通过回调处理，因此速度快、内存占用小。

一、区分xls和xlsx的文件并执行相应的方法

 /**
      * Parses an uploaded Excel attachment into a list of row maps.
      *
      * <p>The parser is chosen from the file-name extension, compared
      * case-insensitively: {@code .xls} is read with {@code ExcelUtil} and
      * {@code .xlsx} is streamed with {@code BigDataParseExcel}. Any other name,
      * including a missing name or one without an extension, yields an empty list.
      *
      * @param attachFileInfo attachment entity supplying the stored file path and the file name
      * @return the rows produced by the selected parser; empty when the extension is not supported
      * @throws Exception if the selected parser fails to read the file
      */
     private List<Map<String, String>> fileParsing(AttachFileInfo attachFileInfo) throws Exception
     {
          String rootPath = MessageConvertFactory.getMessage(AppConf.uploadFileRootPath);
          String filePath = attachFileInfo.getFilePath();
          String fileName = attachFileInfo.getFileName();
          List<Map<String, String>> rowlists = new ArrayList<Map<String, String>>();

          // lastIndexOf returns -1 when there is no dot; substring(-1) would throw,
          // so a name without an extension is treated as "unsupported" instead.
          int dot = (fileName == null) ? -1 : fileName.lastIndexOf('.');
          if (dot < 0) {
              return rowlists;
          }
          String ext = fileName.substring(dot);

          if (".xls".equalsIgnoreCase(ext)) {
              ExcelUtil excel = new ExcelUtil(rootPath + filePath);
              rowlists = excel.readExcelContent();
          }
          else if (".xlsx".equalsIgnoreCase(ext)) { // xlsx can be streamed with the POI SAX event API
              BigDataParseExcel bigDataParseExcel = new BigDataParseExcel();
              bigDataParseExcel.process(rootPath + filePath);
              rowlists = bigDataParseExcel.getRowlists();
          }
          return rowlists;
     }

二、xlsx文件支持poi的Sax解析方式

import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**
* XSSF and SAX (Event API)
* 超大excel解析工具  表头自行配置
* yhm
*/
public class BigDataParseExcel extends DefaultHandler
{
    private SharedStringsTable sst;
    private String lastContents;
    private boolean nextIsString;
    private int sheetIndex = -1;
    private List<String> rowlist = new ArrayList<String>();
    private List<String> titlelist = new ArrayList<String>();
    private Map<String,String> map = new HashMap<String,String>();
    private List<Map<String,String>> rowlists = new ArrayList<Map<String,String>>();
    private int curRow = 0; //当前行
    private int curCol = 0; //当前列索引
    private int preCol = 0; //上一列列索引
    private int titleRow = 0; //标题行，一般情况下为0
    private int rowsize = 0; //列数
    //excel记录行操作方法，以sheet索引，行索引和行元素列表为参数，对sheet的一行元素进行操作，元素为String类型

    public void setRowlists(List<Map<String,String>> rowlists)
    {
        this.rowlists = rowlists;
    }

    public List<Map<String,String>> getRowlists()
    {
        return rowlists;
    }

    //组装
    public void optRows(int sheetIndex, int curRow, List<String> rowlist) throws SQLException
    {
        for (int s = 0;s < rowlist.size(); s++)
        {
            map.put(titlelist.get(s), rowlist.get(s));
        }
        rowlists.add(map);
    }
    
    //只遍历一个sheet，其中sheetId为要遍历的sheet索引，从1开始，1-3
    /**
     *
     * @param filename
     * @param sheetId sheetId为要遍历的sheet索引，从1开始，1-3
     * @throws Exception
     */
    public void processOneSheet(String filename,int sheetId) throws Exception
    {
        OPCPackage pkg = OPCPackage.open(filename);
        XSSFReader r = new XSSFReader(pkg);
        SharedStringsTable sst = r.getSharedStringsTable();
        XMLReader parser = fetchSheetParser(sst);
        // rId2 found by processing the Workbook
        // 根据 rId# 或 rSheet# 查找sheet
        InputStream sheet2 = r.getSheet("rId" + sheetId);
        sheetIndex++;
        InputSource sheetSource = new InputSource(sheet2);
        parser.parse(sheetSource);
        sheet2.close();
    }
    /**
     * 1、遍历 excel 文件
     */
    public void process(String filename) throws Exception
    {
        OPCPackage pkg = OPCPackage.open(filename);
        XSSFReader r = new XSSFReader(pkg);//打开表，解析它，并返回一个用于处理对象。
        SharedStringsTable sst = r.getSharedStringsTable();
        XMLReader parser = fetchSheetParser(sst);
        Iterator<InputStream> sheets = r.getSheetsData();
        while (sheets.hasNext())
        {
            curRow = 0;
            sheetIndex++;
            InputStream sheet = sheets.next();
            InputSource sheetSource = new InputSource(sheet);
            parser.parse(sheetSource);
            sheet.close();
        }
    }
    
    /**
     * 解析成xml
     * @param sst
     * @return
     * @throws SAXException
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst)
            throws SAXException
    {
        XMLReader parser = XMLReaderFactory.createXMLReader();
        //.createXMLReader("org.apache.xerces.parsers.SAXParser");
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }
    
    /**
     * 起始元素解析（DefaultHandler）
     */
    public void startElement(String uri, String localName, String name,
            Attributes attributes) throws SAXException
    {
        // c => 单元格
        if (name.equals("c"))
        {
            // 如果下一个元素是 SST 的索引，则将nextIsString标记为true
            String cellType = attributes.getValue("t");
            String rowStr = attributes.getValue("r");
            curCol = this.getRowIndex(rowStr);
            if (cellType != null && cellType.equals("s"))
            {
                nextIsString = true;
            } else
            {
                nextIsString = false;
            }
        }
        // 置空
        lastContents = "";
    }
    
    /**
     * 结尾元素解析（DefaultHandler）
     */
    public void endElement(String uri, String localName, String name)
            throws SAXException
    {
        // 根据SST的索引值的到单元格的真正要存储的字符串
        // 这时characters()方法可能会被调用多次
        if (nextIsString)
        {
            try
            {
                int idx = Integer.parseInt(lastContents);
                lastContents = new XSSFRichTextString(sst.getEntryAt(idx))
                .toString();
            } catch (Exception e)
            {
            }
        }
        // v => 单元格的值，如果单元格是字符串则v标签的值为该字符串在SST中的索引
        // 将单元格内容加入rowlist中，在这之前先去掉字符串前后的空白符
        if (name.equals("v"))
        {
            String value = lastContents.trim();
            value = value.equals("") ? " " : value;
            int cols = curCol - preCol;
            if (cols > 1)
            {
                for (int i = 0;i < cols - 1;i++)
                {
                    rowlist.add(preCol,"");
                }
            }
            preCol = curCol;
            rowlist.add(curCol - 1, value);
        } else
        {
            rowlistLast(name);
        }
    }
    
    private void rowlistLast(String name)
    {
          //如果标签名称为 row ，这说明已到行尾，调用 optRows() 方法
        if (name.equals("row"))
        {
            int tmpCols = rowlist.size();
            if (curRow > this.titleRow && tmpCols < this.rowsize)
            {
                for (int i = 0;i < this.rowsize - tmpCols;i++)
                {
                    rowlist.add(rowlist.size(), "");
                }
            }
            try
            {
                if (curRow == 1)
                {
                    titlelist = rowlist;
                } else if (curRow > 1)
                {
                    optRows(sheetIndex,curRow,rowlist);
                }
            } catch (SQLException e)
            {
                e.printStackTrace();
            }
            if (curRow == this.titleRow)
            {
                this.rowsize = rowlist.size();
            }
            rowlist = new ArrayList<String>();
            curRow++;
            curCol = 0;
            preCol = 0;
        }
    }
    
    public void characters(char[] ch, int start, int length)
            throws SAXException
    {
        //得到单元格内容的值
        lastContents += new String(ch, start, length);
    }
    
    //得到列索引，每一列c元素的r属性构成为字母加数字的形式，字母组合为列索引，数字组合为行索引，
    //如AB45,表示为第（A-A+1）*26+（B-A+1）*26列，45行
    public int getRowIndex(String rowStr)
    {
        rowStr = rowStr.replaceAll("[^A-Z]", "");
        byte[] rowAbc = rowStr.getBytes();
        int len = rowAbc.length;
        float num = 0;
        for (int i = 0;i < len;i++)
        {
            num += (rowAbc[i] - 'A' + 1) * Math.pow(26,len - i - 1 );
        }
        return (int) num;
    }
    
    public int getTitleRow()
    {
        return titleRow;
    }
    
    public void setTitleRow(int titleRow)
    {
        this.titleRow = titleRow;
    }
}

三、xls解析


import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.http.HttpServletResponse;

import net.sf.json.JSONArray;

import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.HSSFColor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.CellRangeAddress;
import org.apache.poi.xssf.streaming.SXSSFCell;
import org.apache.poi.xssf.streaming.SXSSFRow;
import org.apache.poi.xssf.streaming.SXSSFSheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.bw.biz.enums.DeliveryPlaceEnum;
import com.bw.biz.enums.ProjectTypeEnum;
import com.bw.material.util.CommonUtils;
import com.bw.wzjh.model.WzjhMaterialTech;

/**
 * Reader for {@code .xls} workbooks that loads the file with {@code HSSFWorkbook}.
 *
 * <p>Only the first sheet is read. In {@link #readExcelContent()} the row at
 * index 1 supplies the map keys and every later row becomes one map; the row
 * at index 0 is not part of the result.
 *
 * @author yhm
 */
public class ExcelUtil
{
    private static final Logger logger = LoggerFactory.getLogger(ExcelUtil.class);

    /** Index of the row whose cells are used as map keys. */
    private static final int KEY_ROW_INDEX = 1;

    /** 2^53: up to this magnitude every whole double converts to a long without loss. */
    private static final double MAX_EXACT_WHOLE = 9007199254740992d;

    private Workbook wb;

    /**
     * Loads the workbook at {@code filepath}.
     *
     * <p>A {@code null} path, a missing file or an I/O error leaves the instance
     * without a workbook; I/O problems are logged rather than thrown, and the
     * read methods then fail with {@link IllegalStateException}.
     *
     * @param filepath path of the xls file, may be null
     */
    public ExcelUtil(String filepath)
    {
        if (filepath == null)
        {
            return;
        }
        InputStream is = null;
        try
        {
            is = new FileInputStream(filepath);
            wb = new HSSFWorkbook(is);
        }
        catch (FileNotFoundException e)
        {
            logger.error("FileNotFoundException", e);
        }
        catch (IOException e)
        {
            logger.error("IOException", e);
        }
        finally
        {
            // The workbook has been read into memory by now; do not keep the file handle open.
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException e)
                {
                    logger.error("Failed to close workbook input stream", e);
                }
            }
        }
    }

    /**
     * Reads the cells of the first row of the first sheet.
     *
     * <p>Formula cells yield their formula text, missing cells yield an empty
     * string, and all other cells yield their value as text.
     *
     * @return one entry per column of the first row; empty when the sheet has no first row
     * @throws IllegalStateException if no workbook was loaded
     */
    public String[] readExcelTitle() throws Exception
    {
        Row header = requireWorkbook().getSheetAt(0).getRow(0);
        if (header == null)
        {
            return new String[0];
        }
        // getLastCellNum is index based (last index + 1, or -1 for a row without cells),
        // which is what a loop over getCell(i) needs.
        int colNum = Math.max(header.getLastCellNum(), 0);
        logger.debug("colNum:{}", colNum);
        String[] title = new String[colNum];
        for (int i = 0; i < colNum; i++)
        {
            Cell cell = header.getCell(i);
            if (cell != null && cell.getCellType() == Cell.CELL_TYPE_FORMULA)
            {
                title[i] = cell.getCellFormula();
            }
            else
            {
                title[i] = getCellFormatValue(cell).toString();
            }
        }
        return title;
    }

    /**
     * Reads the data rows of the first sheet.
     *
     * <p>The cells of the row at index 1 are the keys. Each following row is
     * returned as a map from key to cell text, covering as many columns as the
     * key row has. Rows for which the sheet holds no record are skipped.
     *
     * @return one map per data row; empty when the key row does not exist
     * @throws IllegalStateException if no workbook was loaded
     */
    public List<Map<String, String>> readExcelContent() throws Exception
    {
        List<Map<String, String>> rowlists = new ArrayList<Map<String,String>>();
        Sheet sheet = requireWorkbook().getSheetAt(0);
        Row keyRow = sheet.getRow(KEY_ROW_INDEX);
        if (keyRow == null)
        {
            return rowlists;
        }
        int colNum = Math.max(keyRow.getLastCellNum(), 0);
        List<String> keys = readRow(keyRow, colNum);

        int lastRow = sheet.getLastRowNum();
        for (int i = KEY_ROW_INDEX + 1; i <= lastRow; i++)
        {
            Row dataRow = sheet.getRow(i);
            if (dataRow == null)
            {
                continue;
            }
            List<String> values = readRow(dataRow, colNum);
            Map<String,String> rowMap = new HashMap<String,String>();
            for (int s = 0; s < colNum; s++)
            {
                rowMap.put(keys.get(s), values.get(s));
            }
            rowlists.add(rowMap);
        }
        return rowlists;
    }

    /** Returns the loaded workbook or fails with a message that names the cause. */
    private Workbook requireWorkbook()
    {
        if (wb == null)
        {
            throw new IllegalStateException(
                    "Workbook was not loaded; check the file path passed to the constructor");
        }
        return wb;
    }

    /** Reads the first {@code colNum} cells of {@code row} as text. */
    private List<String> readRow(Row row, int colNum)
    {
        List<String> cells = new ArrayList<String>(colNum);
        for (int j = 0; j < colNum; j++)
        {
            cells.add(getCellFormatValue(row.getCell(j)).toString());
        }
        return cells;
    }

    /**
     * Extracts the value of a cell according to its type.
     *
     * <ul>
     *   <li>numeric: a {@code Long} when the number is whole, otherwise a {@code Double};</li>
     *   <li>formula: the cached string result, a {@code yyyy-MM-dd} date when the cell
     *       is date formatted, otherwise the numeric result as text;</li>
     *   <li>string: the cell text;</li>
     *   <li>anything else, a null cell, or a cell that cannot be read: an empty string.</li>
     * </ul>
     *
     * @param cell the cell to read, may be null
     * @return the cell value, never null
     */
    private Object getCellFormatValue(Cell cell)
    {
        if (cell == null)
        {
            return "";
        }
        try
        {
            int type = cell.getCellType();
            if (type == Cell.CELL_TYPE_NUMERIC)
            {
                return wholeOrDecimal(cell.getNumericCellValue());
            }
            if (type == Cell.CELL_TYPE_FORMULA)
            {
                // A formula with a text result has no numeric value to ask for.
                if (cell.getCachedFormulaResultType() == Cell.CELL_TYPE_STRING)
                {
                    return cell.getRichStringCellValue().getString();
                }
                if (DateUtil.isCellDateFormatted(cell))
                {
                    Date date = cell.getDateCellValue();
                    return new SimpleDateFormat("yyyy-MM-dd").format(date);
                }
                return String.valueOf(cell.getNumericCellValue());
            }
            if (type == Cell.CELL_TYPE_STRING)
            {
                return cell.getRichStringCellValue().getString();
            }
        }
        catch (Exception e)
        {
            logger.error("Failed to read cell value", e);
        }
        return "";
    }

    /**
     * Drops the trailing {@code .0} of whole numbers. A long is used so that
     * values above the int range, such as 11-digit phone numbers, are not clamped.
     */
    private static Object wholeOrDecimal(double value)
    {
        if (value % 1 == 0 && Math.abs(value) <= MAX_EXACT_WHOLE)
        {
            return Long.valueOf((long) value);
        }
        return Double.valueOf(value);
    }

}