package org.css.resource.businesssoft.searchengine.quwenjiansuo;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

/**
 * Static helpers that extract plain text from Word, Excel and PowerPoint files via Apache POI.
 *
 * @author lizh
 */
public classCovertFile {/*** 从word 2003文档中提取纯文本
*@paramis
*@return*@throwsIOException*/
public static String extractTextFromDOC(InputStream is) throwsIOException {
WordExtractor ex= new WordExtractor(is); //is是WORD文件的InputStream
returnex.getText();
}/*** 从word 2007文档中提取纯文本
*@paramfileName
*@return
*/
public staticString extractTextFromDOC2007(String fileName) {try{
OPCPackage opcPackage=POIXMLDocument.openPackage(fileName);
POIXMLTextExtractor ex= newXWPFWordExtractor(opcPackage);returnex.getText();
}catch(Exception e) {return "";
}
}/*** 从excel 2003文档中提取纯文本
*@paramis
*@return*@throwsIOException*/
private static String extractTextFromXLS(InputStream is) throwsIOException {
StringBuffer content= newStringBuffer();
HSSFWorkbook workbook= new HSSFWorkbook(is); //创建对Excel工作簿文件的引用
for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {if (null !=workbook.getSheetAt(numSheets)) {
HSSFSheet aSheet= workbook.getSheetAt(numSheets); //获得一个sheet
for (int rowNumOfSheet = 0; rowNumOfSheet <=aSheet
.getLastRowNum(); rowNumOfSheet++) {if (null !=aSheet.getRow(rowNumOfSheet)) {
HSSFRow aRow= aSheet.getRow(rowNumOfSheet); //获得一行
for (short cellNumOfRow = 0; cellNumOfRow <=aRow
.getLastCellNum(); cellNumOfRow++) {if (null !=aRow.getCell(cellNumOfRow)) {
HSSFCell aCell= aRow.getCell(cellNumOfRow); //获得列值
if (aCell.getCellType() ==HSSFCell.CELL_TYPE_NUMERIC) {
content.append(aCell.getNumericCellValue());
}else if (aCell.getCellType() ==HSSFCell.CELL_TYPE_BOOLEAN) {
content.append(aCell.getBooleanCellValue());
}else{
content.append(aCell.getStringCellValue());
}
}
}
}
}
}
}returncontent.toString();
}/*** 从excel 2007文档中提取纯文本
*@paramfileName
*@return*@throwsException*/
private staticString extractTextFromXLS2007(String fileName)throwsException {
StringBuffer content= newStringBuffer();//构造 XSSFWorkbook 对象,strPath 传入文件路径
XSSFWorkbook xwb = newXSSFWorkbook(fileName);//循环工作表Sheet
for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
XSSFSheet xSheet=xwb.getSheetAt(numSheet);if (xSheet == null) {continue;
}//循环行Row
for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
XSSFRow xRow=xSheet.getRow(rowNum);if (xRow == null) {continue;
}//循环列Cell
for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
XSSFCell xCell=xRow.getCell(cellNum);if (xCell == null) {continue;
}if (xCell.getCellType() ==XSSFCell.CELL_TYPE_BOOLEAN) {
content.append(xCell.getBooleanCellValue());
}else if (xCell.getCellType() ==XSSFCell.CELL_TYPE_NUMERIC) {
content.append(xCell.getNumericCellValue());
}else{
content.append(xCell.getStringCellValue());
}
}
}
}returncontent.toString();
}/*** 从excel 2007文档中提取纯文本
*@paramfileName
*@return
*/
public staticString getXLS2007(String fileName){
String doc= "";try{
doc=extractTextFromXLS2007(fileName);returndoc;
}catch(Exception e){return "";
}
}/*** 从ppt 2003、2007文档中提取纯文本
*@paramfileName
*@return
*/
public staticString getPPTX(String fileName){
String doc= "";try{
File inputFile= newFile(fileName);
POITextExtractor extractor=ExtractorFactory.createExtractor(inputFile);
doc=extractor.getText();returndoc;
}catch(Exception e){return "";
}
}public static voidmain(String[] args) {try{//String wordFile = "D:/松山血战.docx";//String wordText2007 = CovertFile.extractTextFromDOC2007(wordFile);//System.out.println("wordText2007=======" + wordText2007);//
//InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");//String excelText = CovertFile.extractTextFromXLS(is);//System.out.println("text2003==========" + excelText);//String excelFile = "D:/zh.xlsx";//String excelText2007 = CovertFile.extractTextFromXLS2007(excelFile);//System.out.println("excelText2007==========" + excelText2007);
String pptFile= "D:/zz3.ppt";
String pptx=CovertFile.getPPTX(pptFile);
System.out.println("pptx==========" +pptx);
}catch(Exception e) {
e.printStackTrace();
}
}
}