很抱歉,现在才注意到代码竟然有span这些东西,当时我是没有的,并且现在我编辑代码没有,少的一个EXCEL03类,只是处理03版本的EXCEL,03的EXCEL是没有大批量数据的,除非分页,现在补进来,并且补入maven包
===分隔符===
很久很久没写博客了,工作了感觉纯理论水平竟然直线下滑,现在重新开始了
这个版本的代码是网上的,但是有BUG,空格之类的BUG,测出来的时候一脸懵逼,然后修改成为了一个可以处理2007,100W的数据量是没问题的,我测试过,不会有BUG的实现了,其实最好去POI的官网去,官网上面有很多INSTANCE,方便记忆我也把网址贴出来https://poi.apache.org/spreadsheet/index.html
其实这样子直接读取并不太好,可以考虑将EXCEL转换为CSV,或者文件切割,然后处理,这样子大数据更容易处理
Maven包
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
Excel处理
import java.util.HashMap;
import java.util.Map;
/**
* Created by hasee on 2017/9/7.
*/
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public class ExceUtil {
private static Logger logger = LoggerFactory.getLogger(ExceUtil.class);
private StylesTable stylesTable =null;
private static enum CellDataType{
BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
}
/**
* 处理一个sheet
* @param filename
* @throws Exception
*/
public List<List<String>> processOneSheet(String filename) throws Exception {
try{
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
stylesTable = r.getStylesTable();
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
// Seems to either be rId# or rSheet#
//InputStream sheet2 = r.getSheet("rId1");
Iterator<InputStream> sheets = r.getSheetsData();
InputStream sheet2 =sheets.next();
InputSource sheetSource = new InputSource(sheet2);
parser.parse(sheetSource);
sheet2.close();
SheetHandler tmp=(SheetHandler) parser.getContentHandler();
pkg.close();
return tmp.getSheetData();
}catch(Exception e){
e.printStackTrace();
}
return null;
}
/**
* 处理所有sheet
* @param filename
* @throws Exception
*/
public void processAllSheets(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = r.getSheetsData();
while(sheets.hasNext()) {
logger.info("Processing new sheet:\n");
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
logger.info("");
}
}
/**
* 获取解析器
* @param sst
* @return
* @throws SAXException
*/
public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
XMLReader parser =
XMLReaderFactory.createXMLReader(
"org.apache.xerces.parsers.SAXParser"
);
ContentHandler handler = new SheetHandler(sst);
parser.setContentHandler(handler);
return parser;
}
/**
* 自定义解析处理器
* See org.xml.sax.helpers.DefaultHandler javadocs
*/
private class SheetHandler extends DefaultHandler {
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private List<String> rowlist = new ArrayList<String>();
private List<List<String>> sheetData = new ArrayList<List<String>>();
//增加的前一个value与其绑定
private Map<String,String> mapVal =new HashMap<String,String>();
private String preValue=null;
private int curRow = 0;
private int curCol = 0;
//定义前一个元素和当前元素的位置,用来计算其中空的单元格数量,如A6和A8等
private String preRef = null, ref = null;
//定义该文档一行最大的单元格数,用来补全一行最后可能缺失的单元格
private String maxRef = null;
private CellDataType nextDataType = CellDataType.SSTINDEX;
private final DataFormatter formatter = new DataFormatter();
private short formatIndex;
private String formatString;
private String tmpString;
//用一个enum表示单元格可能的数据类型
/*enum CellDataType{
BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
}*/
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
}
public List<List<String>> getSheetData(){
return sheetData;
}
/**
* 解析一个element的开始时触发事件
*/
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => cell
if(name.equals("c")) {
//前一个单元格的位置
if(preRef == null){
preRef = attributes.getValue("r");
}else{
preRef = ref;
}
//当前单元格的位置
ref = attributes.getValue("r");
this.setNextDataType(attributes);
// Figure out if the value is an index in the SST
String cellType = attributes.getValue("t");
if(cellType != null && cellType.equals("s")) {
nextIsString = true;
} else {
nextIsString = false;
}
}
// Clear contents cache
lastContents = "";
}
//判断是不是另起一行
private Boolean isStart=true;
/**
* 解析一个element元素结束时触发事件
*/
public void endElement(String uri, String localName, String name)
throws SAXException {
// Process the last contents as required.
// Do now, as characters() may be called more than once
if(nextIsString) {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
nextIsString = false;
}
// v => contents of a cell
// Output after we've seen the string contents
if (name.equals("v")) {
String value = this.getDataValue(lastContents.trim(), "");
if(preValue!=null){
preRef=mapVal.get(preValue);
}
mapVal.put(value,ref);
//补全首列为空的情况
if(isStart){
int len=countStart(ref);
for(int i=0;i<len;i++){
rowlist.add(curCol, "");
curCol++;
}
isStart=false;
}
//补全单元格之间的空单元格
if(!ref.equals(preRef)){
//test=false;
int len = countNullCell(ref, preRef);
for(int i=0;i<len;i++){
rowlist.add(curCol, "");
curCol++;
}
}
preValue=value;
rowlist.add(curCol, value);
curCol++;
}else {
//如果标签名称为 row,这说明已到行尾,调用 optRows() 方法
if (name.equals("row")) {
String value = "";
//默认第一行为表头,以该行单元格数目为最大数目
if(curRow == 0){
maxRef = ref;
}
//补全一行尾部可能缺失的单元格
if(maxRef != null){
int len = countNullCell(maxRef, ref);
for(int i=0;i<=len;i++){
rowlist.add(curCol, "");
curCol++;
}
}
//拼接一行的数据
for(int i=0;i<rowlist.size();i++){
if(rowlist.get(i).contains(",")){
value += "\""+rowlist.get(i)+"\",";
}else{
value += rowlist.get(i)+",";
}
}
//加换行符
value += "\n";
/*try {
writer.write(value);
} catch (IOException e) {
e.printStackTrace();
}*/
curRow++;
//一行的末尾重置一些数据
//自然增加,后期可以在大小看是否能改善下空间储存效率
List<String> tmpList=new ArrayList<String>(rowlist);
sheetData.add(tmpList);
rowlist.clear();
curCol = 0;
preRef = null;
ref = null;
//换行数据,所以之前绑定的就可以清空
mapVal.clear();
preValue=null;
isStart=true;
}
}
}
/**
* 根据element属性设置数据类型
* @param attributes
*/
public void setNextDataType(Attributes attributes){
nextDataType = CellDataType.NUMBER;
formatIndex = -1;
formatString = null;
String cellType = attributes.getValue("t");
String cellStyleStr = attributes.getValue("s");
if ("b".equals(cellType)){
nextDataType = CellDataType.BOOL;
}else if ("e".equals(cellType)){
nextDataType = CellDataType.ERROR;
}else if ("inlineStr".equals(cellType)){
nextDataType = CellDataType.INLINESTR;
}else if ("s".equals(cellType)){
nextDataType = CellDataType.SSTINDEX;
}else if ("str".equals(cellType)){
nextDataType = CellDataType.FORMULA;
}
if (cellStyleStr != null){
int styleIndex = Integer.parseInt(cellStyleStr);
XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
formatIndex = style.getDataFormat();
formatString = style.getDataFormatString();
if ("m/d/yy" == formatString){
nextDataType = CellDataType.DATE;
//full format is "yyyy-MM-dd hh:mm:ss.SSS";
formatString = "yyyy-MM-dd";
}
if (formatString == null){
nextDataType = CellDataType.NULL;
formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
}
}
}
/**
* 根据数据类型获取数据
* @param value
* @param thisStr
* @return
*/
public String getDataValue(String value, String thisStr)
{
switch (nextDataType)
{
//这几个的顺序不能随便交换,交换了很可能会导致数据错误
case BOOL:
char first = value.charAt(0);
thisStr = first == '0' ? "FALSE" : "TRUE";
break;
case ERROR:
thisStr = "\"ERROR:" + value.toString() + '"';
break;
case FORMULA:
thisStr = '"' + value.toString() + '"';
break;
case INLINESTR:
XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
thisStr = rtsi.toString();
rtsi = null;
break;
case SSTINDEX:
String sstIndex = value.toString();
thisStr = value.toString();
break;
case NUMBER:
if (formatString != null){
thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
}else{
thisStr = value;
}
thisStr = thisStr.replace("_", "").trim();
break;
case DATE:
try{
thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);
}catch(NumberFormatException ex){
thisStr = value.toString();
}
thisStr = thisStr.replace(" ", "");
break;
default:
thisStr = "";
break;
}
return thisStr;
}
/**
* 获取element的文本数据
*/
public void characters(char[] ch, int start, int length)
throws SAXException {
lastContents += new String(ch, start, length);
}
/**
* 计算两个单元格之间的单元格数目(同一行)
* @param ref
* @param preRef
* @return
*/
public int countNullCell(String ref, String preRef){
//excel2007最大行数是1048576,最大列数是16384,最后一列列名是XFD
String xfd = ref.replaceAll("\\d+", "");
String xfd_1 = preRef.replaceAll("\\d+", "");
xfd = fillChar(xfd, 3, '@', true);
xfd_1 = fillChar(xfd_1, 3, '@', true);
char[] letter = xfd.toCharArray();
char[] letter_1 = xfd_1.toCharArray();
int res = (letter[0]-letter_1[0])*26*26 + (letter[1]-letter_1[1])*26 + (letter[2]-letter_1[2]);
return res-1;
}
/**
* 计算首个单元格为空的情况
* @param ref
* @param preRef
* @return
*/
public int countStart(String ref){
//excel2007最大行数是1048576,最大列数是16384,最后一列列名是XFD
//
String A="A";
ref=ref.substring(0, 1);
char[] end = ref.toCharArray();
char[] start=A.toCharArray();
int len=(int)end[0]-(int)start[0];
return len;
}
/**
* 字符串的填充
* @param str
* @param len
* @param let
* @param isPre
* @return
*/
String fillChar(String str, int len, char let, boolean isPre){
int len_1 = str.length();
if(len_1 <len){
if(isPre){
for(int i=0;i<(len-len_1);i++){
str = let+str;
}
}else{
for(int i=0;i<(len-len_1);i++){
str = str+let;
}
}
}
return str;
}
}
BufferedWriter writer = null;
public static void main(String[] args) throws Exception {
ExceUtil example = new ExceUtil();
String filename = "C:\\Users\\lifi\\Desktop\\100W.xlsx";
List<List<String>> test=example.getExcel(filename);
for(int i=0;i<test.size();i++){
System.out.println(test.get(i));
}
}
// 是否是2003的excel,返回true是2003
public static boolean is2003Excel(String filePath) {
return filePath.matches("^.+\\.(?i)(xls)$");
}
public List<List<String>> getExcel(String filePath){
try{
List<List<String>> tmp;
if(is2003Excel(filePath)){
ExcelUtil03 excelUtil03=new ExcelUtil03();
tmp=excelUtil03.readExcel(filePath);
}else{
ExceUtil example = new ExceUtil();
tmp=example.processOneSheet(filePath);
}
return tmp;
}catch(Exception e){
logger.info(e.getMessage());
}
return null;
}
public List<List<String>> getExcel(String fileName,BufferedWriter writerCome){
try{
writer=writerCome;
ExceUtil example = new ExceUtil();
example.processOneSheet(fileName);
}catch(Exception e){
logger.error(e.getMessage());
}
return null;
}
}
Excel03处理
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ExcelUtil03 {
private static Logger logger = LoggerFactory.getLogger(ExcelUtil03.class);
// 错误信息
private String errorInfo;
// 错误信息
private static int readSheet = 0;
private static boolean readSheetNum = false;
// 验证excel文件
public boolean validateExcel(String filePath) {
// 检查文件名是否为空或者是否是Excel格式的文件
if (filePath == null || !(is2003Excel(filePath) || is2007Excel(filePath))) {
errorInfo = "文件名不是excel格式";
return false;
}
// 检查文件是否存在
File file = new File(filePath);
if (file == null || !file.exists()) {
errorInfo = "excel文件不存在";
return false;
}
return true;
}
// 根据文件名读取excel文件
public List<List<String>> readExcel(String filePath) {
List<List<String>> dataList = new ArrayList<List<String>>();
InputStream is = null;
try {
// 验证文件是否合法
if (!validateExcel(filePath)) {
logger.error(errorInfo);
return null;
}
// 判断文件的类型,是2003还是2007
boolean is2003Excel = true;
if (is2007Excel(filePath)) {
is2003Excel = false;
}
// 调用本类提供的根据流读取的方法
File file = new File(filePath);
is = new FileInputStream(file);
dataList = readFile(is, is2003Excel);
is.close();
} catch (Exception ex) {
ex.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
is = null;
e.printStackTrace();
}
}
}
// 返回最后读取的结果
return dataList;
}
// 根据流读取Excel文件
public List<List<String>> readFile(InputStream inputStream,
boolean is2003Excel) {
List<List<String>> dataLists = null;
try {
// 根据版本选择创建Workbook的方式
Workbook wb = null;
if (is2003Excel) {
wb = new HSSFWorkbook(inputStream);
} else {
wb = new XSSFWorkbook(inputStream);
}
// sheet循环
int sheetNum = sheetCirculation(wb);
List<List<String>> dataList = new ArrayList<List<String>>();
if (readSheetNum) {
dataLists = read(dataList, wb, readSheet);
} else {
for (int i = 0; i < sheetNum; i++) {
// Sheet sheet = wb.getSheetAt(i);
// 显示sheet名称
// System.out.println(sheet.getSheetName());
dataLists = read(dataList, wb, i);
}
}
} catch (IOException e) {
e.printStackTrace();
}
return dataLists;
}
// 读取数据
private List<List<String>> read(List<List<String>> dataList, Workbook wb,
int sheets) {
// 总行数
int totalRows = 0;
// 总列数
int totalCells = 0;
// 第一个shell页
Sheet sheet = wb.getSheetAt(sheets);
// Excel的行数
totalRows = sheet.getPhysicalNumberOfRows();
// Excel的列数
if (totalRows >= 1 && sheet.getRow(0) != null) {
totalCells = sheet.getRow(0).getPhysicalNumberOfCells();
}
// 遍历Excel的行
for (int r = 0; r < totalRows; r++) {
Row row = sheet.getRow(r);
if (row == null) {
continue;
}
List<String> rowLst = new ArrayList<String>();
// 遍历Excel的列
for (int c = 0; c < totalCells; c++) {
Cell cell = row.getCell(c);
String cellValue = "";
if (null != cell) {
// 以下是判断数据的类型
switch (cell.getCellType()) {
case HSSFCell.CELL_TYPE_NUMERIC: {// 数字
// cellValue = cell.getNumericCellValue() + "";
if (HSSFDateUtil.isCellDateFormatted(cell)) {
double d = cell.getNumericCellValue();
Date date = HSSFDateUtil.getJavaDate(d);
SimpleDateFormat dformat = new SimpleDateFormat("yyyy-MM-dd");
cellValue = dformat.format(date);
} else {
NumberFormat nf = NumberFormat.getInstance();
nf.setGroupingUsed(false);// true时的格式:1,234,567,890
cellValue = nf.format(cell.getNumericCellValue());// 数值类型的数据为double,所以需要转换一下
}
break;
}
case HSSFCell.CELL_TYPE_STRING: {// 字符串
cellValue = cell.getStringCellValue();
break;
}
case HSSFCell.CELL_TYPE_BOOLEAN:{ // Boolean
cellValue = cell.getBooleanCellValue() + "";
break;
}
case HSSFCell.CELL_TYPE_FORMULA: {// 公式
try {
BigDecimal gongshi = new BigDecimal(cell.getNumericCellValue());
double gongshi1 = gongshi.setScale(2, BigDecimal.ROUND_HALF_UP).doubleValue();
cellValue = String.valueOf(gongshi1);
} catch (IllegalStateException e) {
cellValue = String.valueOf(cell.getRichStringCellValue())+"";
}
break;
}
case HSSFCell.CELL_TYPE_BLANK: {// 空值
cellValue = "";
break;
}
case HSSFCell.CELL_TYPE_ERROR: {// 故障
cellValue = "非法字符";
break;
}
default: {
cellValue = "未知类型";
break;
}
}
}
rowLst.add(cellValue);
}
// 保存第r行的第c列
dataList.add(rowLst);
}
return dataList;
}
private int sheetCirculation(Workbook wb) {
int sheetCount = -1;
sheetCount = wb.getNumberOfSheets();
return sheetCount;
}
// 是否是2003的excel,返回true是2003
public static boolean is2003Excel(String filePath) {
return filePath.matches("^.+\\.(?i)(xls)$");
}
// 是否是2007的excel,返回true是2007
public static boolean is2007Excel(String filePath) {
return filePath.matches("^.+\\.(?i)(xlsx)$");
}
// 构造方法
public ExcelUtil03() {
}
// 得到错误信息
public String getErrorInfo() {
return errorInfo;
}
public static void main(String[] args){
ExcelUtil03 excelUtil03=new ExcelUtil03();
List<List<String>> test=excelUtil03.readExcel("C:\\Users\\lifi\\Desktop\\数据核对模板.xlsx");
}
}