读取大数据量excel

本文介绍了一个用于解析Excel 2007文件的Java类,通过使用Apache POI库来读取XLSX格式的工作簿内容,并提供详细的单元格数据处理逻辑。
package excel;


import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;


import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;


public class Excel2007Reader extends DefaultHandler {
/**
* 共享字符串表
*/
private SharedStringsTable sst;


/**
* 上一次的内容
*/
private String lastContents;


/**
* 字符串标识
*/
private boolean nextIsString;


/**
* 工作表索引
*/
private int sheetIndex = -1;


/**
* 行集合
*/
private List<String> rowlist = new ArrayList<String>();


/**
* 当前行
*/
private int curRow = 0;


/**
* 当前列
*/
private int curCol = 0;


/**
* T元素标识
*/
private boolean isTElement;


/**
* 异常信息,如果为空则表示没有异常
*/
private String exceptionMessage;


/**
* 单元格数据类型,默认为字符串类型
*/
private CellDataType nextDataType = CellDataType.SSTINDEX;


private final DataFormatter formatter = new DataFormatter();


private short formatIndex;


private String formatString;
/** 封装第一行的标题信息,防止空单元格 */
private List<String> title = new ArrayList<String>();


/**
* 单元格
*/
private StylesTable stylesTable;


/**
* 取第一个sheet里内容
*
* @param filename
* @param sheetId
* @throws Exception
*/
public void processOneSheet(String filename, int sheetId) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader xssfReader = new XSSFReader(pkg);
stylesTable = xssfReader.getStylesTable();
SharedStringsTable sst = xssfReader.getSharedStringsTable();
XMLReader parser = this.fetchSheetParser(sst);
InputStream sheet = xssfReader.getSheet("rId" + sheetId);
sheetIndex++;
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}


/**
* 遍历工作簿中所有的电子表格
*
* @param filename
* @throws IOException
* @throws OpenXML4JException
* @throws SAXException
* @throws Exception
*/
public void process(String filename) throws IOException,OpenXML4JException, SAXException {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader xssfReader = new XSSFReader(pkg);
stylesTable = xssfReader.getStylesTable();
SharedStringsTable sst = xssfReader.getSharedStringsTable();
XMLReader parser = this.fetchSheetParser(sst);
Iterator<InputStream> sheets = xssfReader.getSheetsData();
while (sheets.hasNext()) {
curRow = 0;
sheetIndex++;
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}
}


public XMLReader fetchSheetParser(SharedStringsTable sst)
throws SAXException {
XMLReader parser = XMLReaderFactory.createXMLReader();
// XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
this.sst = sst;
parser.setContentHandler(this);
return parser;
}


/**
* 模板中有多少列就设置多少列
*
* @author Administrator
*
*/
enum TitleItem {
A("A"), B("B"), C("C"), D("D"), E("E"), F("F"), G("G"), H("H"), I("I"), J("J");
private String key;


TitleItem(String key) {
this.key = key;
}


public static TitleItem getObj(String key) {
for (TitleItem obj : values()) {
if (obj.getKey().equals(key)) {
return obj;
}
}
return null;
}


public String getKey() {
return key;
}


public void setKey(String key) {
this.key = key;
}


}


@Override
public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
// 封装title
String type = attributes.getValue("r");
if (type != null && !type.equals("")) {
type = type.substring(0, 1);
if (TitleItem.getObj(type) != null) {
title.add(type);
}
}


// c => 单元格
if ("c".equals(name)) {
// 设定单元格类型
this.setNextDataType(attributes);
}


// 当元素为t时
if ("t".equals(name)) {
isTElement = true;
} else {
isTElement = false;
}
// 置空
lastContents = "";
}


/**
* 单元格中的数据可能的数据类型
*/
enum CellDataType {
BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
}


/**
* 处理数据类型
*
* @param attributes
*/
public void setNextDataType(Attributes attributes) {
nextDataType = CellDataType.NUMBER;
formatIndex = -1;
formatString = null;
String cellType = attributes.getValue("t");
String cellStyleStr = attributes.getValue("s");


if ("b".equals(cellType)) {
nextDataType = CellDataType.BOOL;
} else if ("e".equals(cellType)) {
nextDataType = CellDataType.ERROR;
} else if ("inlineStr".equals(cellType)) {
nextDataType = CellDataType.INLINESTR;
} else if ("s".equals(cellType)) {
nextDataType = CellDataType.SSTINDEX;
} else if ("str".equals(cellType)) {
nextDataType = CellDataType.FORMULA;
}


if (cellStyleStr != null) {
int styleIndex = Integer.parseInt(cellStyleStr);
XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
formatIndex = style.getDataFormat();
formatString = style.getDataFormatString();


if ("m/d/yy" == formatString) {
nextDataType = CellDataType.DATE;
formatString = "yyyy-MM-dd";
// formatString = "yyyy-MM-dd hh:mm:ss.SSS";
}


if (formatString == null) {
nextDataType = CellDataType.NULL;
formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
}
}
}


/**
* 对解析出来的数据进行类型处理
*
* @param value
* 单元格的值(这时候是一串数字)
* @param thisStr
* 一个空字符串
* @return
*/
public String getDataValue(String value, String thisStr) {
switch (nextDataType) {
// 这几个的顺序不能随便交换,交换了很可能会导致数据错误
case BOOL:
char first = value.charAt(0);
thisStr = first == '0' ? "FALSE" : "TRUE";
break;
case ERROR:
thisStr = "\"ERROR:" + value.toString() + '"';
break;
case FORMULA:
thisStr = '"' + value.toString() + '"';
break;
case INLINESTR:
XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());


thisStr = rtsi.toString();
rtsi = null;
break;
case SSTINDEX:
String sstIndex = value.toString();
try {
int idx = Integer.parseInt(sstIndex);
XSSFRichTextString rtss = new XSSFRichTextString(sst.getEntryAt(idx));
thisStr = rtss.toString();
rtss = null;
} catch (NumberFormatException ex) {
thisStr = value.toString();
}
break;
case NUMBER:
if (formatString != null) {
thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
} else {
thisStr = value;
}


thisStr = thisStr.replace("_", "").trim();
break;
case DATE:
thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);


// 对日期字符串作特殊处理
thisStr = thisStr.replace(" ", "T");
break;
default:
thisStr = " ";


break;
}


return thisStr;
}


@Override
public void endElement(String uri, String localName, String name) throws SAXException {


// 根据SST的索引值的到单元格的真正要存储的字符串
// 这时characters()方法可能会被调用多次
if (nextIsString) {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
}


// t元素也包含字符串
if (isTElement) {
// 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
String value = lastContents.trim();
rowlist.add(curCol, value);
curCol++;
isTElement = false;
} else if ("v".equals(name)) {
// v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引
String value = this.getDataValue(lastContents.trim(), "");


rowlist.add(curCol, value);
curCol++;
} else {
// 如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法
if (name.equals("row")) {
try {
// 一行信息结束时进行业务逻辑处理
optRow(sheetIndex, curRow, rowlist, title);
// 重置当前行标题信息
title = new ArrayList<String>();
} catch (Exception e) {
e.printStackTrace();
}
rowlist.clear();
curRow++;
curCol = 0;
}
}
}


/**
* 处理每一行数据,可以根据业务需要封装成业务实体(每一行都保证有全部单元格的内容,空单元格内容为“”)
*
* @param sheetIndex
* @param curRow
* @param rowList
* @param title
*/
public void optRow(int sheetIndex, int curRow, List<String> rowList,List<String> title) {
System.out.println("第 " + (curRow + 1) + " 行数据如下:");
List<String> list = changeList(rowList, title);
for (String string : list) {
System.out.println(string);
}
/*
* System.out.println("---------------------------------------"); for
* (String key : title) { System.out.println(key); }
* System.out.println("第 "+(curRow+1)+" 行数据如下:"); for (int i = 0; i <
* rowlist.size(); i++) { System.out.print("'" + rowlist.get(i) + "',");
* } System.out.println("\n---------------------------------------");
*/
}


/**
* 将为空的单元内容读取出来(上面方法无法取到空单元格,所以人工处理一下)
*
* @param rowList
* @param title
*/
public List<String> changeList(List<String> rowList, List<String> title) {
TitleItem[] titles = TitleItem.values();
// 共有多少个字段
int count = titles.length;
// 实际有多少个字段
int realCount = title.size();
List<String> result = rowList;
// 有空单元格
if (count != realCount) {
result = new ArrayList<String>();
for (int i = 0; i < count; i++) {
boolean flag = false;
String value = "";
for (int j = 0; j < realCount; j++) {
// 和单元格标题匹配,不存在内容为“”
flag = title.get(j).equals(titles[i].getKey()) ? true : false;
if (flag) {
value = rowlist.get(j);
break;
}
}
result.add(value);
}
}
return result;
}


@Override
public void characters(char[] ch, int start, int length) throws SAXException {
// 得到单元格内容的值
lastContents += new String(ch, start, length);
}


/**
* @return the exceptionMessage
*/
public String getExceptionMessage() {
return exceptionMessage;
}
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值