之前做爬虫时,写了一个解析 Word、Excel、PDF、PPT 并获取文本内容的工具类,主要用到 POI 和 PDFBox。代码如下:
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import com.jrtech.riskcontrol.modules.crawl.service.IFileParseService;
@Service
public class FileParseService implements IFileParseService{
private static Logger logger = LoggerFactory.getLogger(FileParseService.class);
@Override
public String parseWord2003(InputStream inputStream) throws IOException{
try {
logger.info("parse Word 2003 begin");
WordExtractor ex = new WordExtractor(inputStream);
return ex.getText();
} finally{
if (inputStream!=null) {
inputStream.close();
}
}
}
@Override
public String parseWord2007(InputStream inputStream) throws Exception{
try {
logger.info("parse Word 2007 begin");
OPCPackage opcPackage = OPCPackage.open(inputStream);
POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
return ex.getText();
} finally{
if (inputStream!=null) {
inputStream.close();
}
}
}
@Override
public String parsePPT2003(InputStream inputStream) throws IOException {
try {
logger.info("parse PPT 2003 begin");
StringBuffer content = new StringBuffer("");
SlideShow ss = new SlideShow(new HSLFSlideShow(inputStream));
Slide[] slides = ss.getSlides();
for (int i = 0; i < slides.length; i++) {
TextRun[] t = slides[i].getTextRuns();
for (int j = 0; j < t.length; j++) {
content.append(t[j].getText());
}
content.append(slides[i].getTitle());
}
return content.toString();
} finally {
if (inputStream!=null) {
inputStream.close();
}
}
}
@Override
public String parsePPT2007(InputStream inputStream) throws IOException{
try {
logger.info("parse PPT 2007 begin");
XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
XSLFSlide[] slides = xmlSlideShow.getSlides();
StringBuilder sb = new StringBuilder();
for (XSLFSlide slide : slides) {
CTSlide rawSlide = slide.getXmlObject();
CTGroupShape gs = rawSlide.getCSld().getSpTree();
List<CTShape> shapes = gs.getSpList();
for (CTShape shape : shapes) {
CTTextBody tb = shape.getTxBody();
if (null == tb){
continue;
}
List<CTTextParagraph> paras = tb.getPList();
for (CTTextParagraph textParagraph : paras) {
List<CTRegularTextRun> textRuns = textParagraph.getRList();
for (CTRegularTextRun textRun : textRuns) {
sb.append(textRun.getT());
}
}
}
}
return sb.toString();
}finally{
if (inputStream!=null) {
inputStream.close();
}
}
}
@Override
public String parsePDF(InputStream inputStream) throws IOException{
try {
logger.info("parse PDF begin");
PDFTextStripper stripper = new PDFTextStripper();
PDFParser parser = new PDFParser(inputStream);
parser.parse();
COSDocument cosDoc = parser.getDocument();
String pdfText = stripper.getText(new PDDocument(cosDoc));
return pdfText;
}finally{
if (inputStream!=null) {
inputStream.close();
}
}
}
@Override
public String parseExecl(InputStream inputStream) throws IOException{
try {
logger.info("parse Execl begin");
StringBuffer buff = new StringBuffer("");
Workbook wb = null;
// 创建对Excel工作簿文件的引用
if (POIFSFileSystem.hasPOIFSHeader(inputStream)) {
wb = new HSSFWorkbook(inputStream);
}
if (POIXMLDocument.hasOOXMLHeader(inputStream)) {
wb = new XSSFWorkbook(inputStream);
}
// 创建对工作表的引用。
int sheetSize = wb.getNumberOfSheets();
for (int sheetNum = 0; sheetNum < sheetSize; sheetNum++) {
if (null != wb.getSheetAt(sheetNum)) {
Sheet sheet = null;
if(wb instanceof HSSFWorkbook){
sheet = (HSSFSheet) wb.getSheetAt(sheetNum);
}else {
sheet = (XSSFSheet) wb.getSheetAt(sheetNum);
}
int rowSize = sheet.getLastRowNum();
for (int rowNum = 0; rowNum <= rowSize; rowNum++) {
if (null != sheet.getRow(rowNum)) {
Row row = sheet.getRow(rowNum);
int cellSize = row.getLastCellNum();
for (int cellNum = 0; cellNum <= cellSize; cellNum++) {
if (null != row.getCell(cellNum)) {
Cell cell = row.getCell(cellNum);
int typeId = cell.getCellType();
switch (typeId) {
case HSSFCell.CELL_TYPE_NUMERIC:
if (HSSFDateUtil.isCellDateFormatted(cell)) {
buff.append(cell.getDateCellValue());
}else {
buff.append(cell.getNumericCellValue());
}
break;
case HSSFCell.CELL_TYPE_STRING:
buff.append(cell.getStringCellValue());
break;
}
}
}
}
}
}
}
return buff.toString();
}finally{
if (inputStream != null) {
inputStream.close();
}
}
}
}