java 解析文件 大全

公司有个项目让java从各种文档中抽取正文.
于是费了很多经历来写起初..去网上找demo
一下是摘抄一个哥们的例子
package org.css.resource.businesssoft.searchengine.quwenjiansuo;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
/**
*
* @author lizh
*
*/
public class CovertFile {

/**
* 从word 2003文档中提取纯文本
* @param is
* @return
* @throws IOException
*/
public static String extractTextFromDOC(InputStream is) throws IOException {
WordExtractor ex = new WordExtractor(is); // is是WORD文件的InputStream
return ex.getText();
}

/**
* 从word 2007文档中提取纯文本
* @param fileName
* @return
*/
public static String extractTextFromDOC2007(String fileName) {
try {
OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
return ex.getText();
} catch (Exception e) {
return "";
}
}

/**
* 从excel 2003文档中提取纯文本
* @param is
* @return
* @throws IOException
*/
private static String extractTextFromXLS(InputStream is) throws IOException {
StringBuffer content = new StringBuffer();
HSSFWorkbook workbook = new HSSFWorkbook(is); // 创建对Excel工作簿文件的引用

for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
if (null != workbook.getSheetAt(numSheets)) {
HSSFSheet aSheet = workbook.getSheetAt(numSheets); // 获得一个sheet

for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
.getLastRowNum(); rowNumOfSheet++) {
if (null != aSheet.getRow(rowNumOfSheet)) {
HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一行

for (short cellNumOfRow = 0; cellNumOfRow <= aRow
.getLastCellNum(); cellNumOfRow++) {
if (null != aRow.getCell(cellNumOfRow)) {
HSSFCell aCell = aRow.getCell(cellNumOfRow); // 获得列值

if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
content.append(aCell.getNumericCellValue());
} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
content.append(aCell.getBooleanCellValue());
} else {
content.append(aCell.getStringCellValue());
}
}
}
}
}
}
}
return content.toString();
}

/**
* 从excel 2007文档中提取纯文本
* @param fileName
* @return
* @throws Exception
*/
private static String extractTextFromXLS2007(String fileName)
throws Exception {
StringBuffer content = new StringBuffer();

// 构造 XSSFWorkbook 对象,strPath 传入文件路径
XSSFWorkbook xwb = new XSSFWorkbook(fileName);

// 循环工作表Sheet
for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
XSSFSheet xSheet = xwb.getSheetAt(numSheet);
if (xSheet == null) {
continue;
}

// 循环行Row
for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
XSSFRow xRow = xSheet.getRow(rowNum);
if (xRow == null) {
continue;
}

// 循环列Cell
for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
XSSFCell xCell = xRow.getCell(cellNum);
if (xCell == null) {
continue;
}

if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
content.append(xCell.getBooleanCellValue());
} else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
content.append(xCell.getNumericCellValue());
} else {
content.append(xCell.getStringCellValue());
}
}
}
}

return content.toString();
}

/**
* 从excel 2007文档中提取纯文本
* @param fileName
* @return
*/
public static String getXLS2007(String fileName){
String doc = "";
try{
doc = extractTextFromXLS2007(fileName);
return doc;
}catch(Exception e){
return "";
}
}

/**
* 从ppt 2003、2007文档中提取纯文本
* @param fileName
* @return
*/
public static String getPPTX(String fileName){
String doc = "";
try{
File inputFile = new File(fileName);
POITextExtractor extractor = ExtractorFactory.createExtractor(inputFile);
doc = extractor.getText();
return doc;
}catch(Exception e){
return "";
}
}


public static void main(String[] args) {
try {
// String wordFile = "D:/松山血战.docx";
// String wordText2007 = CovertFile.extractTextFromDOC2007(wordFile);
// System.out.println("wordText2007=======" + wordText2007);
//
// InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");
// String excelText = CovertFile.extractTextFromXLS(is);
// System.out.println("text2003==========" + excelText);

// String excelFile = "D:/zh.xlsx";
// String excelText2007 = CovertFile.extractTextFromXLS2007(excelFile);
// System.out.println("excelText2007==========" + excelText2007);

String pptFile = "D:/zz3.ppt";
String pptx = CovertFile.getPPTX(pptFile);
System.out.println("pptx==========" + pptx);

} catch (Exception e) {
e.printStackTrace();
}
}

}



最后突然发现其实只用两行代码就能搞定
office 2003 - office 2007

POITextExtractor extractor = ExtractorFactory.createExtractor(f);
return extractor.getText();


于是我泪流满面....白忙乎了..顺路奉上解析pdf的吧
package com.lingjoin.extractors;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.Date;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import com.lingjoin.paser.LingJoinFile;

/**
* PDF解析器
*
* @author Ansj
*
*/
public class PDFExtractor extends AbstractExtractor {

private String getContent(LingJoinFile f) {
// TODO Auto-generated method stub
PDDocument doc = null ;
try {
doc = PDDocument.load(f);
PDFTextStripper stripper = new PDFTextStripper();
/**
* 设置文件的信息
*/
this.setLingJoinFileInfo(f, doc
.getDocumentInformation());
return stripper.getText(doc);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
if (doc != null) {
try {
doc.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
return "";
}

private BufferedReader getContentReader(LingJoinFile f) {
return new BufferedReader(new StringReader(this.getContent(f)));
}

/**
*
* 项目名称:FilePaser
* 类描述: 设置文件的信息
* 创建人:ANSJ
* 创建时间:2010-4-14 下午04:27:57
* 修改备注:
* @version
*/
private void setLingJoinFileInfo(LingJoinFile f, PDDocumentInformation info) {
if (info.getAuthor() != null) {
f.setlAuthor(info.getAuthor());
}
// try {
// if (info.getModificationDate() != null) {
// Date date = info.getModificationDate().getTime();
// f.setlModificationDate(date.getTime());
// }
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
//设置标题
// if (info.getTitle() != null) {
// f.setlTitle(info.getTitle());
// }
}

public void paserFileToReader(LingJoinFile f) throws Exception {
f.setlContentReader(this.getContentReader(f)) ;

}

public void paserFileToString(LingJoinFile f) throws Exception {
// TODO Auto-generated method stub
f.setlContent(this.getContent(f)) ;
}

public PDFExtractor(Integer typeFlag) {
// TODO Auto-generated constructor stub
this.typeFlag = typeFlag ;
}

private Integer typeFlag = null ;

public Integer getTypeFlag() {
// TODO Auto-generated method stub
return typeFlag;
}
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值