程序用到了poi、jacob、openoffice(通过jodconverter调用)、icepdf、pdfbox、iText等包,可以自行在网上搜索下载。
PS:使用openoffice将doc转成pdf(再由icepdf将pdf转成图片)时,前提是本地已安装openoffice软件,并且已按下面的步骤开启其服务:
1.安装完openoffice后,进入其默认的安装目录
cd C:\Program Files (x86)\OpenOffice 4\program
执行
soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard
2.查看服务是否启动成功
2.1查看端口对应的pid
netstat -ano|findstr "8100"
2.2查看pid对应的服务程序名
tasklist|findstr "pid值"
不多说,直接上代码:
package com.officefileparser.utils;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.model.Picture;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.PictureData;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.icepdf.core.pobjects.Document;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.util.GraphicsRenderingHints;
import com.artofsolving.jodconverter.DefaultDocumentFormatRegistry;
import com.artofsolving.jodconverter.DocumentConverter;
import com.artofsolving.jodconverter.DocumentFormat;
import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
public class OfficeFileParserUtils {
/**
 * Reads a Word {@code .doc} file with POI and prints each paragraph to
 * standard output, one {@code println} call per paragraph.
 *
 * @param docPath path of the doc file to read
 * @throws IOException if the file cannot be opened or parsed
 */
public static void reaDoc(String docPath) throws IOException{
    FileInputStream in = new FileInputStream(docPath);
    try {
        WordExtractor extractor = new WordExtractor(in);
        // Each array element holds the text of one paragraph.
        String[] paraTexts = extractor.getParagraphText();
        for (int i = 0; i < paraTexts.length; i++) {
            System.out.println(paraTexts[i]);
        }
    } finally {
        // Release the file handle even when parsing fails.
        in.close();
    }
}
/**
 * Writes a string into a new POIFS (OLE2) container file: the bytes of
 * {@code fileCon} are stored as the entry named "WordDocument" and the
 * container is written to {@code destFile}.
 *
 * An {@link IOException} raised while building or writing the file is
 * printed to standard error and not rethrown.
 *
 * @param destFile path of the file to create
 * @param fileCon text to store
 */
public void exportDoc(String destFile,String fileCon){
    FileOutputStream ostream = null;
    try {
        // NOTE(review): getBytes() encodes with the platform default charset;
        // confirm whether a fixed charset is wanted before changing it.
        ByteArrayInputStream bais = new ByteArrayInputStream(fileCon.getBytes());
        POIFSFileSystem fs = new POIFSFileSystem();
        DirectoryEntry directory = fs.getRoot();
        directory.createDocument("WordDocument", bais);
        ostream = new FileOutputStream(destFile);
        fs.writeFilesystem(ostream);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the output file on every path, including a failed write.
        if (ostream != null) {
            try {
                ostream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Extracts the text of pages 1 through 10 of a PDF with PDFBox.
 *
 * Equivalent to {@code getPdfText(pdfPath, 1, 10)}.
 *
 * @param pdfPath path of the PDF file
 * @return the extracted text, or an empty string when the file does not
 *         exist or extraction fails
 * @throws Exception if closing the document fails
 */
public static String getPdfText(String pdfPath) throws Exception {
    return getPdfText(pdfPath, 1, 10);
}

/**
 * Extracts the text of a page range of a PDF with PDFBox.
 *
 * A missing file is reported on standard output. Any failure while loading
 * or stripping is printed to standard error and yields an empty string.
 *
 * @param pdfPath path of the PDF file
 * @param startPage first page passed to the text stripper
 * @param endPage last page passed to the text stripper
 * @return the extracted text, or an empty string when the file does not
 *         exist or extraction fails
 * @throws Exception if closing the document fails
 */
public static String getPdfText(String pdfPath, int startPage, int endPage) throws Exception {
    File file = new File(pdfPath);
    if (!file.exists()) {
        System.out.println(file.getAbsolutePath()+" 不存在!!!");
        return "";
    }
    System.out.println(file);
    PDDocument document = null;
    try {
        // A load failure is handled by the catch below instead of being
        // swallowed and resurfacing later as a NullPointerException.
        document = PDDocument.load(file);
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(false);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);
        return stripper.getText(document);
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
/**
 * Tells whether a file's name ends with the {@code .ppt} extension.
 *
 * Only the name is inspected; the file is not opened. The extension is the
 * text from the last dot onwards, compared without regard to case, so
 * {@code a.b.ppt} and {@code A.PPT} both match while {@code a.pptx} does not.
 * The detected extension is printed to standard output.
 *
 * @param file file whose name is checked
 * @return {@code true} when the extension is {@code .ppt}
 */
public static boolean checkFile(File file) {
    String filename = file.getName();
    // Use the LAST dot so names containing several dots are handled correctly.
    int dot = filename.lastIndexOf('.');
    if (dot == -1) {
        return false;
    }
    String suffixname = filename.substring(dot);
    System.out.println(suffixname);
    return suffixname.equalsIgnoreCase(".ppt");
}
/**
 * Extracts the text of a PowerPoint file with POI, prints it to standard
 * output and returns it.
 *
 * @param PPTPath path of the ppt file
 * @return the text produced by the extractor
 * @throws IOException if the file cannot be read
 */
public static String readPPTAll(String PPTPath) throws IOException{
    PowerPointExtractor powerPointExtractor = new PowerPointExtractor(PPTPath);
    // Extract once and reuse the result instead of running extraction twice.
    String text = powerPointExtractor.getText();
    System.out.println(text);
    return text;
}
/**
 * Reads a PowerPoint file slide by slide with POI, prints each slide's
 * number, title and text, and then extracts the embedded pictures into
 * {@code outPath}.
 *
 * When a slide has no title, the text of its first text run (line breaks
 * removed) is printed as the title instead.
 *
 * @param pptPath path of the ppt file
 * @param outPath directory that receives the extracted pictures; it is
 *                created when missing
 * @return map from the zero-based slide index to that slide's text: the
 *         text of every text run with line breaks removed, joined by
 *         {@code '\n'}
 * @throws IOException if the file cannot be read or a picture cannot be written
 */
public static HashMap<Integer, String> readPPTOneByOne(String pptPath,String outPath) throws IOException{
    HashMap<Integer, String> contentPageMap = new HashMap<Integer, String>();
    InputStream inputStream = new FileInputStream(pptPath);
    try {
        SlideShow slideShow = new SlideShow(new HSLFSlideShow(inputStream));
        // One array element per slide.
        Slide[] slides = slideShow.getSlides();
        for (int i = 0; i < slides.length; i++) {
            int pageNum = slides[i].getSlideNumber();
            TextRun[] textRuns = slides[i].getTextRuns();
            System.out.println("第"+pageNum+"张");
            String title = slides[i].getTitle();
            // Fall back to the first text run, guarding against slides
            // that contain no text at all.
            if (title == null && textRuns.length > 0) {
                title = textRuns[0].getText().replaceAll("[\n\r]", "");
            }
            System.out.println("标题:"+title);
            // Collect every text run; storing each run under the same key
            // would keep only the last one.
            StringBuilder slideText = new StringBuilder();
            for (int j = 0; j < textRuns.length; j++) {
                String text = textRuns[j].getText().replaceAll("[\n\r]", "");
                if (j > 0) {
                    slideText.append('\n');
                }
                slideText.append(text);
                System.out.print(text);
                System.out.println();
            }
            contentPageMap.put(i, slideText.toString());
        }
    } finally {
        inputStream.close();
    }
    File outDir = new File(outPath);
    if (!outDir.exists()) {
        // mkdirs also creates missing parent directories.
        outDir.mkdirs();
    }
    extractImageOfPPT(pptPath,outPath);
    return contentPageMap;
}
/**
 * Writes every JPEG, PNG, WMF, EMF and PICT picture stored in a PowerPoint
 * file to {@code outPath} as {@code pict_<index><ext>}, where the index is
 * the picture's position in the presentation's picture data array.
 * Pictures of any other type are skipped.
 *
 * @param pptPath path of the ppt file
 * @param outPath directory that receives the pictures
 * @throws IOException if the presentation cannot be read or a picture
 *                     cannot be written
 */
private static void extractImageOfPPT(String pptPath,String outPath) throws IOException {
    SlideShow ppt = new SlideShow(new HSLFSlideShow(pptPath));
    PictureData[] pdata = ppt.getPictureData();
    for (int i = 0; i < pdata.length; i++) {
        PictureData pict = pdata[i];
        String ext;
        switch (pict.getType()) {
        case Picture.JPEG:
            ext = ".jpg";
            break;
        case Picture.PNG:
            ext = ".png";
            break;
        case Picture.WMF:
            ext = ".wmf";
            break;
        case Picture.EMF:
            ext = ".emf";
            break;
        case Picture.PICT:
            ext = ".pict";
            break;
        default:
            continue;
        }
        String fileName = "pict_" + i + ext;
        // Resolve the name inside outPath so a directory given without a
        // trailing separator still receives the file. An empty outPath
        // keeps the plain relative name.
        File target = outPath.length() == 0 ? new File(fileName) : new File(outPath, fileName);
        FileOutputStream out = new FileOutputStream(target);
        try {
            out.write(pict.getData());
        } finally {
            out.close();
        }
    }
}
/**
 * Converts a document to PDF through an OpenOffice service listening on
 * port 8100.
 *
 * @param docPath path of the document to convert
 * @param pdfPath path of the PDF file to produce
 * @throws ConnectException if the OpenOffice service cannot be reached
 */
public static void doc2Pdf(String docPath, String pdfPath) throws ConnectException {
    File inputFile = new File(docPath);
    File outputFile = new File(pdfPath);
    DefaultDocumentFormatRegistry formatReg = new DefaultDocumentFormatRegistry();
    // NOTE(review): the input is always declared as "odt", whatever the
    // extension of docPath is; confirm this is intended for non-text inputs.
    DocumentFormat txt = formatReg.getFormatByFileExtension("odt");
    DocumentFormat pdf = formatReg.getFormatByFileExtension("pdf");
    OpenOfficeConnection connection = new SocketOpenOfficeConnection(8100);
    connection.connect();
    try {
        DocumentConverter converter = new OpenOfficeDocumentConverter(connection);
        converter.convert(inputFile, txt, outputFile, pdf);
    } finally {
        // Always release the connection, including when conversion fails.
        connection.disconnect();
    }
}
/**
 * Converts a document to images in two steps: the document is first
 * converted to a PDF placed next to it (same directory and base name, with
 * a {@code .pdf} extension), then that PDF is converted to images.
 *
 * The intermediate PDF is left on disk. Any failure is printed to standard
 * error and not rethrown.
 *
 * @param docPath path of the document to convert
 * @param imgDirPath directory that receives the images
 */
public static void doc2Imags(String docPath, String imgDirPath){
    String parentDir = FilenameUtils.getFullPath(docPath);
    String baseName = FilenameUtils.getBaseName(docPath);
    String pdfPath = String.format("%s%s.pdf", parentDir, baseName);
    try {
        System.out.println(parentDir+" "+baseName);
        doc2Pdf(docPath, pdfPath);
        pdf2Imgs(pdfPath, imgDirPath);
    } catch (Exception e) {
        // Connection failures and all other failures are reported the same way.
        e.printStackTrace();
    }
}
/**
 * Renders every page of a PDF to a JPEG file with ICEpdf and returns the
 * file names.
 *
 * Page {@code i} (zero-based) is written to {@code <imgDirPath>/<i>.jpg} at
 * a scale factor of 5 with no rotation. The target directory is created
 * when missing.
 *
 * @param pdfPath path of the PDF file
 * @param imgDirPath directory that receives the images
 * @return the names of the written image files in page order, or
 *         {@code null} when an image could not be written
 * @throws Exception if the PDF cannot be opened or rendered
 */
public static List<String> pdf2Imgs(String pdfPath, String imgDirPath) throws Exception {
    float scale = 5f;
    float rotation = 0f;
    List<String> imgNames = new ArrayList<String>();
    Document document = new Document();
    try {
        document.setFile(pdfPath);
        int pageNum = document.getNumberOfPages();
        File imgDir = new File(imgDirPath);
        if (!imgDir.exists()) {
            imgDir.mkdirs();
        }
        for (int i = 0; i < pageNum; i++) {
            BufferedImage image = (BufferedImage) document.getPageImage(i, GraphicsRenderingHints.SCREEN,
                    Page.BOUNDARY_CROPBOX, rotation, scale);
            try {
                File file = new File(imgDirPath + File.separator + i + ".jpg");
                // ImageIO.write returns false when no suitable writer is
                // found; treat that as a write failure rather than listing
                // a file that was never written.
                if (!ImageIO.write(image, "jpg", file)) {
                    throw new IOException("No image writer could encode " + file);
                }
                imgNames.add(file.getName());
            } catch (IOException e) {
                e.printStackTrace();
                return null;
            } finally {
                image.flush();
            }
        }
        return imgNames;
    } finally {
        // Release the document on every exit path, including the early
        // return above and rendering exceptions.
        document.dispose();
    }
}
/**
 * Exports a PowerPoint file as images by driving PowerPoint through JACOB.
 *
 * The presentation is opened, saved with {@code SaveCopyAs} using format
 * code 17 (JPG, per the original author's note), then closed, and
 * PowerPoint is asked to quit. Closing and quitting also happen when
 * opening or saving fails.
 *
 * @param inputFile path of the ppt file
 * @param ImgFilePath target path handed to {@code SaveCopyAs}
 * @throws IllegalStateException if PowerPoint cannot be started
 */
public static synchronized void ppt2Img(String inputFile,String ImgFilePath){
    System.out.println("ppt2PDF==========进入");
    ActiveXComponent app;
    try {
        app = new ActiveXComponent("PowerPoint.Application");
    } catch (Exception e) {
        // Fail with the real cause instead of continuing with a null handle.
        throw new IllegalStateException("Unable to start PowerPoint.Application", e);
    }
    System.out.println("--------------------");
    Dispatch ppt = null;
    try {
        app.setProperty("Visible", true);
        Dispatch ppts = app.getProperty("Presentations").toDispatch();
        System.out.println("ppt2PDF==========准备打开ppt文档");
        System.out.println(new File(inputFile).exists());
        ppt = Dispatch.call(ppts,
                "Open",
                inputFile,
                true, // ReadOnly
                true, // Untitled
                true  // WithWindow
                ).toDispatch();
        System.out.println("ppt2PDF==========准备转换ppt文档");
        Dispatch.call(ppt,"SaveCopyAs",ImgFilePath,17);
    } finally {
        try {
            if (ppt != null) {
                System.out.println("ppt2PDF==========准备关闭ppt文档");
                Dispatch.call(ppt, "Close");
            }
        } finally {
            // Quit even when Close fails so no PowerPoint process is left behind.
            app.invoke("Quit");
        }
    }
}
}