这段代码是我在做 Lucene 全文检索时写的。郁闷的是,我们的环境用的是 JDK 1.4,而解析 Office 2007 文档必须用到 POI 3.5 以上的版本,POI 3.5 又必须运行在 JDK 1.5 以上的版本,
只好另外寻求其他方法了。
package
org.gaoyoubo.resolve;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
 * Extracts plain text from txt, doc, docx, pdf, xls and xlsx files so the
 * text can be handed to a full-text indexer.
 *
 * Every resolveXxx method is best-effort: on failure it reports the problem
 * on System.err and returns whatever text was collected so far (possibly "").
 */
public class Resolve {

    /**
     * Returns the extension of an existing file.
     *
     * @param path path of the file
     * @return the text after the last '.' of the file name, or "" when the
     *         path is null/empty, the file does not exist, or the file name
     *         contains no '.'
     */
    public String getExt(String path) {
        String ext = "";
        if (path != null && !"".equals(path)) {
            File file = new File(path);
            if (file.exists()) {
                // Only inspect the last path component so that a dot in a
                // directory name is never mistaken for an extension.
                String name = file.getName();
                int dot = name.lastIndexOf('.');
                if (dot >= 0) {
                    ext = name.substring(dot + 1);
                }
            }
        }
        return ext;
    }

    /**
     * Extracts the text of a file by dispatching on its extension.
     *
     * @param path path of the file
     * @return the extracted text, or "" when the extension is missing, is not
     *         listed in Const.FILE_TYPE_LIST, or extraction failed
     */
    public String execute(String path) {
        String content = "";
        String ext = getExt(path);
        if (ext != null && !"".equals(ext)) {
            if (Const.FILE_TYPE_LIST.contains(ext)) {
                if (ext.equals("txt")) {
                    content = resolveText(path);
                } else if (ext.equals("doc")) {
                    content = resolveWord2003(path);
                } else if (ext.equals("docx")) {
                    content = resolveWord2007(path);
                } else if (ext.equals("pdf")) {
                    content = resolvePdf(path);
                } else if (ext.equals("xls")) {
                    content = resolveExcel2003(path);
                } else if (ext.equals("xlsx")) {
                    content = resolveExcel2007(path);
                }
            }
        } else {
            System.err.println( " 无法解析文件: " + path + " ! " );
        }
        return content;
    }

    /**
     * Extracts the text of a Word 2007 (.docx) document.
     *
     * @param path path of the file
     * @return the document text, or "" on failure
     */
    public String resolveWord2007(String path) {
        String content = "";
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(path);
            POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
            content = ex.getText();
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            if (opcPackage != null) {
                // revert() releases the package without writing anything
                // back to the file; we only ever read from it.
                opcPackage.revert();
            }
        }
        return content;
    }

    /**
     * Extracts the text of a Word 97-2003 (.doc) document.
     *
     * @param path path of the file
     * @return the document text, or "" on failure
     */
    public String resolveWord2003(String path) {
        String content = "";
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(path));
            WordExtractor ex = new WordExtractor(fis);
            content = ex.getText();
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            closeQuietly(fis);
        }
        return content;
    }

    /**
     * Extracts the cell values of an Excel 97-2003 (.xls) workbook.
     * Cells are separated by a space and rows by a newline so that adjacent
     * values are not fused into a single token.
     *
     * @param path path of the file
     * @return the text of all sheets; partial or empty on failure
     */
    public String resolveExcel2003(String path) {
        StringBuffer content = new StringBuffer("");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(path));
            HSSFWorkbook workbook = new HSSFWorkbook(fis);
            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
                HSSFSheet sheet = workbook.getSheetAt(i);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the 0-based index of the last row, so
                // the bound has to be inclusive.
                for (int j = 0; j <= sheet.getLastRowNum(); j++) {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        // Rows that were never written are returned as null.
                        continue;
                    }
                    // getLastCellNum() is already "last index + 1".
                    for (int k = 0; k < row.getLastCellNum(); k++) {
                        HSSFCell cell = row.getCell(k);
                        if (cell == null) {
                            continue;
                        }
                        appendCell(content, cell);
                    }
                    content.append('\n');
                }
            }
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            closeQuietly(fis);
        }
        return content.toString();
    }

    /**
     * Extracts the cell values of an Excel 2007 (.xlsx) workbook.
     * Cells are separated by a space and rows by a newline so that adjacent
     * values are not fused into a single token.
     *
     * @param path path of the file
     * @return the text of all sheets; partial or empty on failure
     */
    public String resolveExcel2007(String path) {
        StringBuffer content = new StringBuffer("");
        OPCPackage pkg = null;
        try {
            // Open the package ourselves so it can be released afterwards.
            pkg = POIXMLDocument.openPackage(path);
            XSSFWorkbook wb = new XSSFWorkbook(pkg);
            for (int i = 0; i < wb.getNumberOfSheets(); i++) {
                XSSFSheet sheet = wb.getSheetAt(i);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the 0-based index of the last row, so
                // the bound has to be inclusive.
                for (int j = 0; j <= sheet.getLastRowNum(); j++) {
                    XSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is already "last index + 1".
                    for (int k = 0; k < row.getLastCellNum(); k++) {
                        XSSFCell cell = row.getCell(k);
                        if (cell == null) {
                            continue;
                        }
                        appendCell(content, cell);
                    }
                    content.append('\n');
                }
            }
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            if (pkg != null) {
                // Close without saving; the workbook was only read.
                pkg.revert();
            }
        }
        return content.toString();
    }

    /**
     * Extracts the text of a PDF document.
     *
     * @param path path of the file
     * @return the document text, or "" on failure
     */
    public String resolvePdf(String path) {
        String content = "";
        PDDocument doc = null;
        try {
            doc = PDDocument.load(new File(path));
            PDFTextStripper ts = new PDFTextStripper();
            // Take the text as a String directly; routing it through bytes
            // in the platform charset can lose characters that charset
            // cannot represent.
            content = ts.getText(doc);
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return content;
    }

    /**
     * Reads a plain text file using the platform default charset.
     * Each line is followed by a single '\n' in the result.
     *
     * @param path path of the file
     * @return the file content; partial or empty on failure
     */
    public String resolveText(String path) {
        StringBuffer content = new StringBuffer("");
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(new File(path)));
            String line;
            // readLine() returns null at end of stream; probing with read()
            // first would swallow the first character of every line.
            while ((line = br.readLine()) != null) {
                content.append(line).append('\n');
            }
        } catch (Exception e) {
            reportFailure( " 读取文件: " , path, e);
        } finally {
            closeQuietly(br);
        }
        return content.toString();
    }

    /** Appends the value of an .xls cell followed by a separating space. */
    private void appendCell(StringBuffer content, HSSFCell cell) {
        if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
            content.append(cell.getNumericCellValue());
        } else if (cell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
            content.append(cell.getBooleanCellValue());
        } else {
            content.append(cell.getStringCellValue());
        }
        content.append(' ');
    }

    /** Appends the value of an .xlsx cell followed by a separating space. */
    private void appendCell(StringBuffer content, XSSFCell cell) {
        if (cell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
            content.append(cell.getBooleanCellValue());
        } else if (cell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
            content.append(cell.getNumericCellValue());
        } else {
            content.append(cell.getStringCellValue());
        }
        content.append(' ');
    }

    /** Prints the failure message for a file, followed by its cause. */
    private void reportFailure(String prefix, String path, Exception e) {
        System.err.println(prefix + path + " 失败! " );
        System.err.println(e);
    }

    /** Closes the stream if it was opened, ignoring any error from close(). */
    private void closeQuietly(FileInputStream in) {
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }

    /** Closes the reader if it was opened, ignoring any error from close(). */
    private void closeQuietly(BufferedReader reader) {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
}
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
 * Extracts plain text from txt, doc, docx, pdf, xls and xlsx files so the
 * text can be handed to a full-text indexer.
 *
 * Every resolveXxx method is best-effort: on failure it reports the problem
 * on System.err and returns whatever text was collected so far (possibly "").
 */
public class Resolve {

    /**
     * Returns the extension of an existing file.
     *
     * @param path path of the file
     * @return the text after the last '.' of the file name, or "" when the
     *         path is null/empty, the file does not exist, or the file name
     *         contains no '.'
     */
    public String getExt(String path) {
        String ext = "";
        if (path != null && !"".equals(path)) {
            File file = new File(path);
            if (file.exists()) {
                // Only inspect the last path component so that a dot in a
                // directory name is never mistaken for an extension.
                String name = file.getName();
                int dot = name.lastIndexOf('.');
                if (dot >= 0) {
                    ext = name.substring(dot + 1);
                }
            }
        }
        return ext;
    }

    /**
     * Extracts the text of a file by dispatching on its extension.
     *
     * @param path path of the file
     * @return the extracted text, or "" when the extension is missing, is not
     *         listed in Const.FILE_TYPE_LIST, or extraction failed
     */
    public String execute(String path) {
        String content = "";
        String ext = getExt(path);
        if (ext != null && !"".equals(ext)) {
            if (Const.FILE_TYPE_LIST.contains(ext)) {
                if (ext.equals("txt")) {
                    content = resolveText(path);
                } else if (ext.equals("doc")) {
                    content = resolveWord2003(path);
                } else if (ext.equals("docx")) {
                    content = resolveWord2007(path);
                } else if (ext.equals("pdf")) {
                    content = resolvePdf(path);
                } else if (ext.equals("xls")) {
                    content = resolveExcel2003(path);
                } else if (ext.equals("xlsx")) {
                    content = resolveExcel2007(path);
                }
            }
        } else {
            System.err.println( " 无法解析文件: " + path + " ! " );
        }
        return content;
    }

    /**
     * Extracts the text of a Word 2007 (.docx) document.
     *
     * @param path path of the file
     * @return the document text, or "" on failure
     */
    public String resolveWord2007(String path) {
        String content = "";
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(path);
            POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
            content = ex.getText();
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            if (opcPackage != null) {
                // revert() releases the package without writing anything
                // back to the file; we only ever read from it.
                opcPackage.revert();
            }
        }
        return content;
    }

    /**
     * Extracts the text of a Word 97-2003 (.doc) document.
     *
     * @param path path of the file
     * @return the document text, or "" on failure
     */
    public String resolveWord2003(String path) {
        String content = "";
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(path));
            WordExtractor ex = new WordExtractor(fis);
            content = ex.getText();
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            closeQuietly(fis);
        }
        return content;
    }

    /**
     * Extracts the cell values of an Excel 97-2003 (.xls) workbook.
     * Cells are separated by a space and rows by a newline so that adjacent
     * values are not fused into a single token.
     *
     * @param path path of the file
     * @return the text of all sheets; partial or empty on failure
     */
    public String resolveExcel2003(String path) {
        StringBuffer content = new StringBuffer("");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(path));
            HSSFWorkbook workbook = new HSSFWorkbook(fis);
            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
                HSSFSheet sheet = workbook.getSheetAt(i);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the 0-based index of the last row, so
                // the bound has to be inclusive.
                for (int j = 0; j <= sheet.getLastRowNum(); j++) {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        // Rows that were never written are returned as null.
                        continue;
                    }
                    // getLastCellNum() is already "last index + 1".
                    for (int k = 0; k < row.getLastCellNum(); k++) {
                        HSSFCell cell = row.getCell(k);
                        if (cell == null) {
                            continue;
                        }
                        appendCell(content, cell);
                    }
                    content.append('\n');
                }
            }
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            closeQuietly(fis);
        }
        return content.toString();
    }

    /**
     * Extracts the cell values of an Excel 2007 (.xlsx) workbook.
     * Cells are separated by a space and rows by a newline so that adjacent
     * values are not fused into a single token.
     *
     * @param path path of the file
     * @return the text of all sheets; partial or empty on failure
     */
    public String resolveExcel2007(String path) {
        StringBuffer content = new StringBuffer("");
        OPCPackage pkg = null;
        try {
            // Open the package ourselves so it can be released afterwards.
            pkg = POIXMLDocument.openPackage(path);
            XSSFWorkbook wb = new XSSFWorkbook(pkg);
            for (int i = 0; i < wb.getNumberOfSheets(); i++) {
                XSSFSheet sheet = wb.getSheetAt(i);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the 0-based index of the last row, so
                // the bound has to be inclusive.
                for (int j = 0; j <= sheet.getLastRowNum(); j++) {
                    XSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is already "last index + 1".
                    for (int k = 0; k < row.getLastCellNum(); k++) {
                        XSSFCell cell = row.getCell(k);
                        if (cell == null) {
                            continue;
                        }
                        appendCell(content, cell);
                    }
                    content.append('\n');
                }
            }
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            if (pkg != null) {
                // Close without saving; the workbook was only read.
                pkg.revert();
            }
        }
        return content.toString();
    }

    /**
     * Extracts the text of a PDF document.
     *
     * @param path path of the file
     * @return the document text, or "" on failure
     */
    public String resolvePdf(String path) {
        String content = "";
        PDDocument doc = null;
        try {
            doc = PDDocument.load(new File(path));
            PDFTextStripper ts = new PDFTextStripper();
            // Take the text as a String directly; routing it through bytes
            // in the platform charset can lose characters that charset
            // cannot represent.
            content = ts.getText(doc);
        } catch (Exception e) {
            reportFailure( " 解析文件: " , path, e);
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return content;
    }

    /**
     * Reads a plain text file using the platform default charset.
     * Each line is followed by a single '\n' in the result.
     *
     * @param path path of the file
     * @return the file content; partial or empty on failure
     */
    public String resolveText(String path) {
        StringBuffer content = new StringBuffer("");
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(new File(path)));
            String line;
            // readLine() returns null at end of stream; probing with read()
            // first would swallow the first character of every line.
            while ((line = br.readLine()) != null) {
                content.append(line).append('\n');
            }
        } catch (Exception e) {
            reportFailure( " 读取文件: " , path, e);
        } finally {
            closeQuietly(br);
        }
        return content.toString();
    }

    /** Appends the value of an .xls cell followed by a separating space. */
    private void appendCell(StringBuffer content, HSSFCell cell) {
        if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
            content.append(cell.getNumericCellValue());
        } else if (cell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
            content.append(cell.getBooleanCellValue());
        } else {
            content.append(cell.getStringCellValue());
        }
        content.append(' ');
    }

    /** Appends the value of an .xlsx cell followed by a separating space. */
    private void appendCell(StringBuffer content, XSSFCell cell) {
        if (cell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
            content.append(cell.getBooleanCellValue());
        } else if (cell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
            content.append(cell.getNumericCellValue());
        } else {
            content.append(cell.getStringCellValue());
        }
        content.append(' ');
    }

    /** Prints the failure message for a file, followed by its cause. */
    private void reportFailure(String prefix, String path, Exception e) {
        System.err.println(prefix + path + " 失败! " );
        System.err.println(e);
    }

    /** Closes the stream if it was opened, ignoring any error from close(). */
    private void closeQuietly(FileInputStream in) {
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }

    /** Closes the reader if it was opened, ignoring any error from close(). */
    private void closeQuietly(BufferedReader reader) {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
}