调用示例:
File powerPointFile = new File("D:\\temp.ppt");
// Read all text from the PowerPoint document and return it as a single string;
// "," is passed as the shape separator and ";" as the slide separator.
System.out.println(PowerPointFileUtil.extractTextFromPowerPointFile(powerPointFile , "," , ";"));
工具类源码:
/**
* BasePowerPointFileUtil.java
* Copyright ® 2017 窦海宁
* All right reserved
*/
package org.aiyu.core.common.util.file.office;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.sl.usermodel.AutoShape;
import org.apache.poi.sl.usermodel.Shape;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.sl.usermodel.SlideShow;
/**
 * <p>Base class for the PowerPoint file utilities.
 *
 * <p>Walks a POI {@code SlideShow} and collects the text of its shapes into
 * nested lists (slide show -> slides -> shape values) that subclasses can
 * turn into a single string.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class BasePowerPointFileUtil {

    /**
     * <p>Reads every slide of a slide show.
     *
     * @param slideShow the slide show to read; may be {@code null}
     *
     * @return a list holding, for each slide in order, the list produced by
     *         {@link #readSlide(Slide)}; {@code null} when {@code slideShow} is {@code null}
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static List readSlideShow(SlideShow slideShow) {
        if (slideShow == null) {
            return null;
        }
        List perSlideValues = new ArrayList();
        for (Object slide : slideShow.getSlides()) {
            perSlideValues.add(readSlide((Slide) slide));
        }
        return perSlideValues;
    }

    /**
     * <p>Reads every shape of a single slide.
     *
     * @param slide the slide to read; may be {@code null}
     *
     * @return a list holding, for each shape in order, the value produced by
     *         {@link #readShape(Shape)} (entries may be {@code null});
     *         {@code null} when {@code slide} is {@code null}
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static List readSlide(Slide slide) {
        if (slide == null) {
            return null;
        }
        List shapeValues = new ArrayList();
        for (Object shape : slide.getShapes()) {
            shapeValues.add(readShape((Shape) shape));
        }
        return shapeValues;
    }

    /**
     * <p>Reads the text of a single shape.
     *
     * <p>Only shapes that are an {@code AutoShape} are read; any other shape
     * yields {@code null}. If reading the text throws, the stack trace is
     * printed and {@code null} is returned.
     *
     * @param shape a shape taken from a slide; may be {@code null}
     *
     * @return the shape's text, or {@code null} when no text could be read
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static Object readShape(Shape shape) {
        // instanceof is false for null, so this guard also covers a null shape.
        if (!(shape instanceof AutoShape)) {
            return null;
        }
        try {
            return ((AutoShape) shape).getText();
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
    }
}
PowerPoint2003版本工具类:
/**
* PowerPoint2003FileUtil.java
* Copyright ® 2010 窦海宁
* All right reserved
*/
package org.aiyu.core.common.util.file.office;
import java.io.File;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.sl.usermodel.SlideShow;
/**
 * <p>PowerPoint 2003 file utility.
 *
 * <p>Extracts text from a PowerPoint document by opening it with POI's
 * {@code HSLFSlideShow}.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPoint2003FileUtil extends BasePowerPointFileUtil {

    /**
     * <p>Extracts the text of a PowerPoint document as one string.
     *
     * <p>Texts of the shapes on one slide are joined with {@code shapeSeparator};
     * shapes without a text value are skipped and never produce a separator.
     * Slides are joined with {@code slideSeparator}. Any exception raised while
     * reading is printed to standard error and whatever text was collected so
     * far is returned. The slide show is always closed before returning.
     *
     * @param powerPointFile the PowerPoint file
     * @param shapeSeparator separator placed between shape texts of one slide
     * @param slideSeparator separator placed between slides
     *
     * @return the extracted text, trimmed; {@code null} when any argument is
     *         {@code null}, the file is not a regular file, or the result is blank
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {
        StringBuilder returnValue = new StringBuilder();
        if (powerPointFile != null && slideSeparator != null && shapeSeparator != null && powerPointFile.isFile()) {
            SlideShow slideShow = null;
            try {
                slideShow = new HSLFSlideShow(new HSLFSlideShowImpl(powerPointFile.getCanonicalPath()));
                Iterator slideIterator = PowerPoint2003FileUtil.readSlideShow(slideShow).iterator();
                // Iterate over the slides
                while (slideIterator.hasNext()) {
                    Iterator shapeIterator = ((List) slideIterator.next()).iterator();
                    // Tracks whether a text was already written for this slide, so the
                    // separator goes only BETWEEN texts (no trailing separator when the
                    // remaining shapes carry no text).
                    boolean textWritten = false;
                    // Iterate over the shapes
                    while (shapeIterator.hasNext()) {
                        Object shapeValue = shapeIterator.next();
                        if (shapeValue != null) {
                            if (textWritten) {
                                returnValue.append(shapeSeparator);
                            }
                            returnValue.append((String) shapeValue);
                            textWritten = true;
                        }
                    }
                    if (slideIterator.hasNext()) {
                        returnValue.append(slideSeparator);
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                // Release the underlying document; the original code never closed it.
                if (slideShow != null) {
                    try {
                        slideShow.close();
                    } catch (Exception closeEx) {
                        closeEx.printStackTrace();
                    }
                }
            }
        }
        return StringUtils.trimToNull(returnValue.toString());
    }
}
PowerPoint2007版本工具类:
/**
* PowerPoint2007FileUtil.java
* Copyright ® 2017 窦海宁
* All right reserved
*/
package org.aiyu.core.common.util.file.office;
import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
/**
 * <p>PowerPoint 2007 file utility.
 *
 * <p>Extracts text from a PowerPoint document by opening it with POI's
 * {@code XMLSlideShow}.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPoint2007FileUtil extends BasePowerPointFileUtil {

    /**
     * <p>Extracts the text of a PowerPoint document as one string.
     *
     * <p>Texts of the shapes on one slide are joined with {@code shapeSeparator};
     * shapes without a text value are skipped and never produce a separator.
     * Slides are joined with {@code slideSeparator}. Any exception raised while
     * reading is printed to standard error and whatever text was collected so
     * far is returned. Both the slide show and the file stream are always
     * closed before returning.
     *
     * @param powerPointFile the PowerPoint file
     * @param shapeSeparator separator placed between shape texts of one slide
     * @param slideSeparator separator placed between slides
     *
     * @return the extracted text, trimmed; {@code null} when any argument is
     *         {@code null}, the file is not a regular file, or the result is blank
     *
     * @modify 窦海宁, 2017-01-18
     */
    protected static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {
        StringBuilder returnValue = new StringBuilder();
        if (powerPointFile != null && slideSeparator != null && shapeSeparator != null && powerPointFile.isFile()) {
            FileInputStream inputStream = null;
            XMLSlideShow slideShow = null;
            try {
                inputStream = new FileInputStream(powerPointFile);
                slideShow = new XMLSlideShow(inputStream);
                Iterator slideIterator = PowerPoint2007FileUtil.readSlideShow(slideShow).iterator();
                // Iterate over the slides
                while (slideIterator.hasNext()) {
                    Iterator shapeIterator = ((List) slideIterator.next()).iterator();
                    // Tracks whether a text was already written for this slide, so the
                    // separator goes only BETWEEN texts (no trailing separator when the
                    // remaining shapes carry no text).
                    boolean textWritten = false;
                    // Iterate over the shapes
                    while (shapeIterator.hasNext()) {
                        Object shapeValue = shapeIterator.next();
                        if (shapeValue != null) {
                            if (textWritten) {
                                returnValue.append(shapeSeparator);
                            }
                            returnValue.append((String) shapeValue);
                            textWritten = true;
                        }
                    }
                    if (slideIterator.hasNext()) {
                        returnValue.append(slideSeparator);
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                // The original code closed neither the document nor the stream.
                if (slideShow != null) {
                    try {
                        slideShow.close();
                    } catch (Exception closeEx) {
                        closeEx.printStackTrace();
                    }
                }
                if (inputStream != null) {
                    try {
                        inputStream.close();
                    } catch (Exception closeEx) {
                        closeEx.printStackTrace();
                    }
                }
            }
        }
        return StringUtils.trimToNull(returnValue.toString());
    }
}
统一调用工具类:
/**
* PowerPointFileUtil.java
* Copyright ® 2017 窦海宁
* All right reserved
*/
package org.aiyu.core.common.util.file.office;
import java.io.File;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
/**
 * <p>PowerPoint file utility (single entry point).
 *
 * <p>Extracts text from a PowerPoint document, choosing the format-specific
 * reader from the file name extension.
 *
 * @author 窦海宁, chong0660@sina.com
 * @since AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPointFileUtil extends BasePowerPointFileUtil {

    /**
     * <p>Extracts the text of a PowerPoint document as one string.
     *
     * <p>The extension of the file name is compared case-insensitively:
     * {@code ppt} is handled by {@code PowerPoint2003FileUtil} and
     * {@code pptx} by {@code PowerPoint2007FileUtil}. Any other extension
     * is not processed.
     *
     * @param powerPointFile the PowerPoint file
     * @param shapeSeparator separator placed between shape texts of one slide
     * @param slideSeparator separator placed between slides
     *
     * @return the text returned by the selected reader; {@code null} when the
     *         file is {@code null}, does not exist, or has an unsupported extension
     *
     * @modify 窦海宁, 2017-02-06
     */
    public static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {
        String resultText = null;
        if (powerPointFile != null && powerPointFile.exists()) {
            String extension = FilenameUtils.getExtension(powerPointFile.getName());
            if (StringUtils.equalsIgnoreCase("ppt" , extension)) {
                // Office 2003 format
                resultText = PowerPoint2003FileUtil.extractTextFromPowerPointFile(powerPointFile , shapeSeparator , slideSeparator);
            } else if (StringUtils.equalsIgnoreCase("pptx" , extension)) {
                // Office 2007 format: must go to the 2007 reader (this branch
                // previously called the 2003 reader by mistake).
                resultText = PowerPoint2007FileUtil.extractTextFromPowerPointFile(powerPointFile , shapeSeparator , slideSeparator);
            }
            // Any other extension is unsupported; resultText stays null.
        }
        return resultText;
    }
}
统一调用工具类通过文件扩展名(ppt 与 pptx,不区分大小写)判断文件版本,暂时没有想到更好的办法;本工具类基于 POI 3.15 实现,目标机器无须安装 Office 软件即可读取 PowerPoint 文件中的文本内容。