参考资料:http://blog.sina.com.cn/s/blog_628d4dd101017quk.html http://www.iteye.com/topic/624967
poi是广泛应用的excel文件的读写架包.可以进行excel文件的数据读取分析或者excel文件的创建写入。相应的,也就可以比对2个excel文件内容是否是完全一致的.
首先我们先来个简单一点的:
假设现在有2个2003版本的xls文件file1和file2.我们需要确定2个文件的内容是一模一样的:
示例代码1:
public static void compare(File file1,File file2) throws FileNotFoundException, IOException{
POIFSFileSystem fs1 = new POIFSFileSystem(new FileInputStream(file1));
POIFSFileSystem fs2 = new POIFSFileSystem(new FileInputStream(file2));
HSSFWorkbook wb1 = new HSSFWorkbook(fs1);
HSSFWorkbook wb2 = new HSSFWorkbook(fs2);
//TODO multisheet function
HSSFSheet sheet1 = wb1.getSheetAt(0);
HSSFSheet sheet2 = wb2.getSheetAt(0);
int rowNum1 = sheet1.getLastRowNum();
int colNum1 = sheet1.getRow(0).getLastCellNum();
int rowNum2 = sheet2.getLastRowNum();
int colNum2 = sheet2.getRow(0).getLastCellNum();
if(rowNum1!=rowNum2||colNum1!=colNum2){
System.out.println("col num different!");
}
HSSFRow row1 = null;
HSSFRow row2 = null;
for(int i=0;i<rowNum1;i++){
row1 = null;
row2 = null;
row1 = sheet1.getRow(i);
row2 = sheet2.getRow(i);
if(row1.getLastCellNum()!=row2.getLastCellNum()){
System.out.println("col num different:row"+i);
}
HSSFCell cell1 = null;
HSSFCell cell2 = null;
for(int j=0;j<row1.getLastCellNum();j++){
cell1 = null;
cell2 = null;
cell1 = row1.getCell(j);
cell2 = row2.getCell(j);
if(cell1.getCellType()!=cell2.getCellType()){
}else{
if(cell1.getCellType()==HSSFCell.CELL_TYPE_BOOLEAN){
if(cell1.getBooleanCellValue()!=cell2.getBooleanCellValue()){
System.out.println("HSSFCell.CELL_TYPE_BOOLEAN value different:row"+i+",cell:"+j+",cell1:"+cell1.getBooleanCellValue()+",cell2:"+cell2.getBooleanCellValue());
}
}
if(cell1.getCellType()==HSSFCell.CELL_TYPE_NUMERIC){
if(cell1.getNumericCellValue()!=cell2.getNumericCellValue()){
System.out.println("HSSFCell.CELL_TYPE_NUMERIC value different:row"+i+",cell:"+j+",cell1:"+cell1.getNumericCellValue()+",cell2:"+cell2.getNumericCellValue());
}
}
if(cell1.getCellType()==HSSFCell.CELL_TYPE_STRING){
if(!cell1.getStringCellValue().equals(cell2.getStringCellValue())){
System.out.println("HSSFCell.CELL_TYPE_STRING value different:row"+i+",cell:"+j+",cell1:"+cell1.getStringCellValue()+",cell2:"+cell2.getStringCellValue());
}
}
if(cell1.getCellType()==HSSFCell.CELL_TYPE_FORMULA){
if(!cell1.getCellFormula().equals(cell2.getCellFormula())){
System.out.println("HSSFCell.CELL_TYPE_BOOLEAN value different:row"+i+",cell:"+j+",cell1:"+cell1.getCellFormula()+",cell2:"+cell2.getCellFormula());
}
}
}
}
}
}
然后当我们拿到2个2007版本及以上的xlsx文件file1和file2的时候,我们就会发现使用示例代码1来做这个工作,程序抛出异常:
Exception in thread "main" org.apache.poi.poifs.filesystem.OfficeXmlFileException: The supplied data appears to be in the Office 2007+ XML. You are calling the part of POI that deals with OLE2 Office Documents. You need to call a different part of POI to process this data (eg XSSF instead of HSSF)
at org.apache.poi.poifs.storage.HeaderBlock.<init>(HeaderBlock.java:131)
at org.apache.poi.poifs.storage.HeaderBlock.<init>(HeaderBlock.java:104)
at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:138)
at CompareTxt.compare(CompareTxt.java:36)
at CompareTxt.main(CompareTxt.java:268)
这是因为xlsx文件不能使用HSSFWorkbook,HSSFSheet,HSSFRow以及HSSFCell来操作读写了,需要改为
XSSFWorkbook,XSSFSheet,XSSFRow以及XSSFCell这一套~~嗯嗯~~
示例代码2如下:
public static void compareFor2010(File file1,File file2) throws FileNotFoundException, IOException, InvalidFormatException{
XSSFWorkbook wb1 = new XSSFWorkbook(new FileInputStream(file1)) ;
XSSFWorkbook wb2 = new XSSFWorkbook(new FileInputStream(file2)) ;
//TODO multi sheet function
Sheet sheet1 = wb1.getSheetAt(0);
Sheet sheet2 = wb2.getSheetAt(0);
int rowNum1 = sheet1.getLastRowNum();
int colNum1 = sheet1.getRow(0).getLastCellNum();
int rowNum2 = sheet2.getLastRowNum();
int colNum2 = sheet2.getRow(0).getLastCellNum();
if(rowNum1!=rowNum2||colNum1!=colNum2){
System.out.println("col num different!");
}
Row row1 = null;
Row row2 = null;
for(int i=0;i<=rowNum1;i++){
if (i%1000 == 0) {
System.out.println("row come to:"+i);
}
row1 = null;
row2 = null;
row1 = sheet1.getRow(i);
row2 = sheet2.getRow(i);
if(row1.getLastCellNum()!=row2.getLastCellNum()){
System.out.println("col num different:row"+i);
}
Cell cell1 = null;
Cell cell2 = null;
for(int j=0;j<row1.getLastCellNum();j++){
cell1 = null;
cell2 = null;
cell1 = row1.getCell(j);
cell2 = row2.getCell(j);
if(cell1.getCellType()!=cell2.getCellType()){
}else{
if(cell1.getCellType()==HSSFCell.CELL_TYPE_BOOLEAN){
if(cell1.getBooleanCellValue()!=cell2.getBooleanCellValue()){
System.out.println("HSSFCell.CELL_TYPE_BOOLEAN value different:row"+i+",cell:"+j+",cell1:"+cell1.getBooleanCellValue()+",cell2:"+cell2.getBooleanCellValue());
}
}
if(cell1.getCellType()==HSSFCell.CELL_TYPE_NUMERIC){
if(cell1.getNumericCellValue()!=cell2.getNumericCellValue()){
System.out.println("HSSFCell.CELL_TYPE_NUMERIC value different:row"+i+",cell:"+j+",cell1:"+cell1.getNumericCellValue()+",cell2:"+cell2.getNumericCellValue());
}
}
if(cell1.getCellType()==HSSFCell.CELL_TYPE_STRING){
if(!cell1.getStringCellValue().equals(cell2.getStringCellValue())){
System.out.println("HSSFCell.CELL_TYPE_STRING value different:row"+i+",cell:"+j+",cell1:"+cell1.getStringCellValue()+",cell2:"+cell2.getStringCellValue());
}
}
if(cell1.getCellType()==HSSFCell.CELL_TYPE_FORMULA){
if(!cell1.getCellFormula().equals(cell2.getCellFormula())){
System.out.println("HSSFCell.CELL_TYPE_BOOLEAN value different:row"+i+",cell:"+j+",cell1:"+cell1.getCellFormula()+",cell2:"+cell2.getCellFormula());
}
}
}
}
}
}
其他代码不变,就是上面提到的几个实现类替换了即可。
但是,当我们拿到了2个超级大的xlsx文件对象的时候,我们就悲剧的发现示例代码2程序工作的时候继续抛出了一个新的异常给我们:
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at org.apache.xmlbeans.impl.store.Cur$CurLoadContext.attr(Cur.java:3039)
at org.apache.xmlbeans.impl.store.Cur$CurLoadContext.attr(Cur.java:3060)
at org.apache.xmlbeans.impl.store.Locale$SaxHandler.startElement(Locale.java:3250)
at org.apache.xmlbeans.impl.piccolo.xml.Piccolo.reportStartTag(Piccolo.java:1082)
at org.apache.xmlbeans.impl.piccolo.xml.PiccoloLexer.parseAttributesNS(PiccoloLexer.java:1822)
at org.apache.xmlbeans.impl.piccolo.xml.PiccoloLexer.parseOpenTagNS(PiccoloLexer.java:1521)
at org.apache.xmlbeans.impl.piccolo.xml.PiccoloLexer.parseTagNS(PiccoloLexer.java:1362)
at org.apache.xmlbeans.impl.piccolo.xml.PiccoloLexer.parseXMLNS(PiccoloLexer.java:1293)
at org.apache.xmlbeans.impl.piccolo.xml.PiccoloLexer.parseXML(PiccoloLexer.java:1261)
at org.apache.xmlbeans.impl.piccolo.xml.PiccoloLexer.yylex(PiccoloLexer.java:4808)
at org.apache.xmlbeans.impl.piccolo.xml.Piccolo.yylex(Piccolo.java:1290)
at org.apache.xmlbeans.impl.piccolo.xml.Piccolo.yyparse(Piccolo.java:1400)
at org.apache.xmlbeans.impl.piccolo.xml.Piccolo.parse(Piccolo.java:714)
at org.apache.xmlbeans.impl.store.Locale$SaxLoader.load(Locale.java:3439)
at org.apache.xmlbeans.impl.store.Locale.parseToXmlObject(Locale.java:1270)
at org.apache.xmlbeans.impl.store.Locale.parseToXmlObject(Locale.java:1257)
at org.apache.xmlbeans.impl.schema.SchemaTypeLoaderBase.parse(SchemaTypeLoaderBase.java:345)
at org.openxmlformats.schemas.spreadsheetml.x2006.main.WorksheetDocument$Factory.parse(Unknown Source)
at org.apache.poi.xssf.usermodel.XSSFSheet.read(XSSFSheet.java:137)
at org.apache.poi.xssf.usermodel.XSSFSheet.onDocumentRead(XSSFSheet.java:129)
at org.apache.poi.xssf.usermodel.XSSFWorkbook.onDocumentRead(XSSFWorkbook.java:269)
at org.apache.poi.POIXMLDocument.load(POIXMLDocument.java:159)
at org.apache.poi.xssf.usermodel.XSSFWorkbook.<init>(XSSFWorkbook.java:190)
at CompareTxt.compareFor2010(CompareTxt.java:100)
at CompareTxt.main(CompareTxt.java:258)
这是因为创建XSSFWorkbook对象的时候是吧整个文件加载到内存,我们可怜的256M或则512M的eclipse workspace叼爆了。。。。
所以这就需要下面的示例代码3:
package excel2007compare;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public abstract class XxlsAbstract extends DefaultHandler {
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private int sheetIndex = -1;
private List<String> rowlist = new ArrayList<String>();
private int curRow = 0;
private int curCol = 0;
private String savedFileName;//保存的文件名
private FileOutputStream out;
// excel记录行操作方法,以行索引和行元素列表为参数,对一行元素进行操作,元素为String类型
// public abstract void optRows(int curRow, List<String> rowlist) throws
// SQLException ;
// excel记录行操作方法,以sheet索引,行索引和行元素列表为参数,对sheet的一行元素进行操作,元素为String类型
public abstract void optRows(int sheetIndex, int curRow,
List<String> rowlist) throws SQLException;
public abstract void optRowsAndSave(int sheetIndex, int curRow,
List<String> rowlist,FileOutputStream out) throws Exception;
// 只遍历一个sheet,其中sheetId为要遍历的sheet索引,从1开始,1-3
public void processOneSheet(String filename, int sheetId) throws Exception {
out = new FileOutputStream(savedFileName);
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
// rId2 found by processing the Workbook
// 根据 rId# 或 rSheet# 查找sheet
InputStream sheet2 = r.getSheet("rId" + sheetId);
sheetIndex++;
InputSource sheetSource = new InputSource(sheet2);
parser.parse(sheetSource);
sheet2.close();
out.flush();
out.close();
}
/**
* 遍历 excel 文件
*/
public void process(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = r.getSheetsData();
while (sheets.hasNext()) {
curRow = 0;
sheetIndex++;
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}
}
public XMLReader fetchSheetParser(SharedStringsTable sst)
throws SAXException {
XMLReader parser = XMLReaderFactory
.createXMLReader("org.apache.xerces.parsers.SAXParser");
this.sst = sst;
parser.setContentHandler(this);
return parser;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => 单元格
if (name.equals("c")) {
// 如果下一个元素是 SST 的索引,则将nextIsString标记为true
String cellType = attributes.getValue("t");
if (cellType != null && cellType.equals("s")) {
nextIsString = true;
} else {
nextIsString = false;
}
}
// 置空
lastContents = "";
}
public void endElement(String uri, String localName, String name)
throws SAXException {
// 根据SST的索引值的到单元格的真正要存储的字符串
// 这时characters()方法可能会被调用多次
if (nextIsString) {
try {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx))
.toString();
} catch (Exception e) {
}
}
// v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引
// 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
if (name.equals("v")) {
String value = lastContents.trim();
value = value.equals("") ? " " : value;
rowlist.add(curCol, value);
curCol++;
} else {
// 如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法
if (name.equals("row")) {
try {
optRowsAndSave(sheetIndex, curRow, rowlist,getOut());
} catch (Exception e) {
e.printStackTrace();
}
rowlist.clear();
curRow++;
curCol = 0;
}
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
// 得到单元格内容的值
lastContents += new String(ch, start, length);
}
public void setSavedFileName(String savedFileName) {
this.savedFileName = savedFileName;
}
public String getSavedFileName() {
return savedFileName;
}
public void setOut(FileOutputStream out) {
this.out = out;
}
public FileOutputStream getOut() {
return out;
}
}
package excel2007compare;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.sql.SQLException;
import java.util.List;
public class CompareBigExcel2007 extends XxlsAbstract {
@Override
public void optRows(int sheetIndex,int curRow, List<String> rowlist) throws SQLException {
for (int i = 0; i < rowlist.size(); i++) {
System.out.print("'" + rowlist.get(i) + "',");
}
System.out.println();
}
public static void main(String[] args) throws Exception {
CompareBigExcel2007 howto = new CompareBigExcel2007();
howto.setSavedFileName("C:\\2222.txt");
howto.processOneSheet("C:\\2222.xlsx",1);
}
@Override
public void optRowsAndSave(int sheetIndex, int curRow,
List<String> rowlist, FileOutputStream out) throws Exception {
for (int i = 0; i < rowlist.size(); i++) {
out.write((rowlist.get(i)+",").getBytes());
}
out.write(System.getProperty("line.separator").getBytes());
}
}
通过上面2个类我们可以将excel文件单元格的内容读取并重新到txt文件,下面就是txt文件内容的比对:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class CompareExcel {
public static void compare(File file1,File file2) throws FileNotFoundException, IOException{
boolean diff = false;
FileInputStream in1 = new FileInputStream(file1);
FileInputStream in2 = new FileInputStream(file2);
int i=1;int j=1;
byte[] bytes2 = new byte[5000];
byte[] bytes1 = new byte[5000];
while(i>=0){
i=in1.read(bytes1);
j=in2.read(bytes2);
if(i>=0){
System.out.println("i:"+i+",j:"+j);
if(!new String(bytes1,0,i).equals(new String(bytes2,0,j))){
System.out.println("bytes1.toString():"+new String(bytes1,0,i));
System.out.println("bytes2.toString():"+new String(bytes2,0,j));
diff = true;
}
}
}
System.out.print("diff:"+diff);
}
public static void test11(){
}
public static void main(String[] args) throws FileNotFoundException, IOException {
compare(new File("C:\\1111.txt"),new File("C:\\2222.txt"));
}
}
基于问什么不直接流读取excel文件进行比对而是先转化为txt文件在进行比对? 这个当然不是因为闲着蛋疼,而是因为excel文件是有格式的(其实相邻单元格之间会有逗号),这就会是的你的流读取出现乱码,进而得到错误的结果。而转化为txt文件就是为了得到纯洁的文件内容~~嗯嗯~~那么你是纯洁的嘛?
以上~