package com.qyw.utils;
import com.aspose.words.HtmlSaveOptions;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.StringBuilderWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToHtmlUtils;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.fit.pdfdom.PDFDomTree;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.pobjects.graphics.text.PageText;
import org.icepdf.core.util.GraphicsRenderingHints;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.zwobble.mammoth.DocumentConverter;
import org.zwobble.mammoth.Result;
import javax.imageio.ImageIO;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Description: utility class for office-suite documents (Word and PDF).
 * @version 1.0
 */
public class OfficeUtils {
/** SLF4J logger shared by the methods of this class. */
private static final Logger log = LoggerFactory.getLogger(OfficeUtils.class);

/** Private constructor: instances cannot be created from outside this class. */
private OfficeUtils() {
}

/**
 * File extensions (without the leading dot) recognised as Word documents.
 * The set is unmodifiable so that callers cannot alter this shared constant.
 */
public static final Set<String> WORD_FILE_EXTENSION =
        Collections.unmodifiableSet(new HashSet<>(Arrays.asList("doc", "docx")));

/** File extension (without the leading dot) recognised as a PDF document. */
public static final String PDF_FILE_EXTENSION = "pdf";
/**
 * Checks whether the given stream can be opened as a binary Word ({@code .doc}) document.
 *
 * <p>The check works by constructing a POI {@link WordExtractor} over the stream. The stream
 * is read by this probe, so callers should not expect to reuse it afterwards.
 *
 * @param inputStream stream holding the candidate document
 * @return {@code true} if a {@code WordExtractor} could be constructed from the stream,
 *         {@code false} if construction failed with any exception
 */
public static boolean isDocFile(InputStream inputStream) {
    WordExtractor extractor;
    try {
        extractor = new WordExtractor(inputStream);
    } catch (Exception e) {
        // Any failure to open the stream as a .doc document means "not a doc file".
        return false;
    }
    // The extractor was only needed as a probe; release it instead of leaking it.
    // Closing quietly keeps a close failure from changing the probe's answer.
    IOUtils.closeQuietly(extractor);
    return true;
}
/**
 * Extracts the raw text of a binary Word ({@code .doc}) document using POI's
 * {@link WordExtractor}.
 *
 * <p>Both the extractor and the supplied stream are closed quietly before this method
 * returns, whether or not extraction succeeded.
 *
 * @param inputStream stream holding the {@code .doc} document; closed by this method
 * @return the text reported by {@code WordExtractor.getText()}
 * @throws Exception if the extractor cannot be created or the text cannot be read
 */
public static String extractDocRawText(InputStream inputStream) throws Exception {
    WordExtractor extractor = null;
    try {
        extractor = new WordExtractor(inputStream);
        return extractor.getText();
    } finally {
        IOUtils.closeQuietly(extractor);
        IOUtils.closeQuietly(inputStream);
    }
}
/**
 * Extracts the raw text of a {@code .docx} document using the mammoth library.
 *
 * <p>The supplied stream is closed quietly before this method returns, whether or not
 * extraction succeeded.
 *
 * @param inputStream stream holding the {@code .docx} document; closed by this method
 * @return the value of mammoth's raw-text extraction result
 * @throws IOException if mammoth fails to read the document
 */
public static String extractDocxRawText(InputStream inputStream) throws IOException {
    try {
        Result<String> extraction = new DocumentConverter().extractRawText(inputStream);
        return extraction.getValue();
    } finally {
        IOUtils.closeQuietly(inputStream);
    }
}
/**
 * Extracts the raw text of a {@code .docx} document using POI's {@link XWPFWordExtractor}.
 *
 * <p>The extractor and the supplied stream are closed quietly before this method returns.
 * If the package was opened but the extractor could not be created, the package is
 * reverted (closed without saving) so it does not leak.
 *
 * @param inputStream stream holding the {@code .docx} document; closed by this method
 * @return the text reported by {@code XWPFWordExtractor.getText()}
 * @throws Exception if the package cannot be opened or the text cannot be extracted
 */
public static String extractDocxRawTextByPoi(InputStream inputStream) throws Exception {
    OPCPackage opcPackage = null;
    XWPFWordExtractor extractor = null;
    try {
        opcPackage = OPCPackage.open(inputStream);
        extractor = new XWPFWordExtractor(opcPackage);
        return extractor.getText();
    } finally {
        if (extractor != null) {
            IOUtils.closeQuietly(extractor);
        } else if (opcPackage != null) {
            // The extractor was never created, so nothing else will release the package.
            opcPackage.revert();
        }
        IOUtils.closeQuietly(inputStream);
    }
}
/**
 * Converts a binary Word ({@code .doc}) document to an HTML string.
 *
 * <p>The document is loaded with {@link WordToHtmlUtils#loadDoc(InputStream)}, rendered into
 * a DOM by {@link WordToHtmlConverter}, and serialized as indented HTML encoded in UTF-8.
 * The bytes are decoded as UTF-8 as well, so the result does not depend on the platform
 * default charset.
 *
 * @param inputStream stream holding the {@code .doc} document
 * @return the serialized HTML markup
 * @throws Exception if the document cannot be loaded, converted or serialized
 */
public static String convertDocToHtml(InputStream inputStream) throws Exception {
    HWPFDocumentCore wordDocument = null;
    try {
        wordDocument = WordToHtmlUtils.loadDoc(inputStream);
        WordToHtmlConverter converter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        converter.processDocument(wordDocument);
        Document htmlDocument = converter.getDocument();

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // The serializer was told to emit UTF-8, so decode with the same charset.
        return new String(out.toByteArray(), StandardCharsets.UTF_8);
    } finally {
        // Release the loaded Word document; a close failure must not mask the result.
        IOUtils.closeQuietly(wordDocument);
    }
}
/**
 * Converts a {@code .docx} document to an HTML fragment using the xdocreport
 * {@link XHTMLConverter}.
 *
 * <p>Unused styles are ignored, header and footer pages are omitted, and images are embedded
 * through a {@link Base64EmbedImgManager}. The {@link XWPFDocument} opened for the conversion
 * is closed before this method returns. The supplied stream itself is not explicitly closed
 * here.
 *
 * @param inputStream stream holding the {@code .docx} document
 * @return the generated HTML fragment
 * @throws IOException if the document cannot be read or converted
 */
public static String convertDocxToHtml(InputStream inputStream) throws IOException {
    XWPFDocument docxDocument = new XWPFDocument(inputStream);
    try {
        XHTMLOptions options = XHTMLOptions.create();
        options.setIgnoreStylesIfUnused(true);
        options.setFragment(true);
        options.setOmitHeaderFooterPages(true);
        // Embed images into the markup instead of writing them to separate files.
        options.setImageManager(new Base64EmbedImgManager());

        // Render the HTML into memory.
        ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
        // NOTE(review): decodes with the platform default charset, as before; confirm which
        // encoding XHTMLConverter writes before pinning an explicit charset here.
        return htmlStream.toString();
    } finally {
        // Release the package behind the document; previously it was never closed.
        IOUtils.closeQuietly(docxDocument);
    }
}
/**
 * Converts a Word document to HTML using Aspose.Words (the original author notes that both
 * doc and docx are supported).
 *
 * <p>Images referenced by the document are written to the directory named by the
 * {@code java.io.tmpdir} system property. Failures are logged and reported by returning an
 * empty string rather than by throwing.
 *
 * <p>The method is static because this class has a private constructor, which left the
 * former instance method without any way to be called from outside.
 *
 * @param inputByte raw bytes of the Word document; {@code null} yields an empty string
 * @return the generated HTML, or an empty string if the input is {@code null} or the
 *         conversion fails
 */
public static String word2Html(byte[] inputByte) {
    if (inputByte == null) {
        log.warn("aspose Word2Html skipped: input bytes are null");
        return "";
    }
    String html = "";
    try {
        com.aspose.words.Document document =
                new com.aspose.words.Document(new ByteArrayInputStream(inputByte));
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        HtmlSaveOptions htmlSaveOptions = new HtmlSaveOptions();
        htmlSaveOptions.setImagesFolder(System.getProperty("java.io.tmpdir"));
        document.save(outStream, htmlSaveOptions);
        // Decode as UTF-8 (HtmlSaveOptions' documented default export encoding) instead of
        // the platform default charset, which garbles non-ASCII text on non-UTF-8 systems.
        html = new String(outStream.toByteArray(), StandardCharsets.UTF_8);
    } catch (Exception e) {
        log.error("aspose Word2Html error", e);
    }
    return html;
}
/**
 * Converts a PDF document to HTML text using pdf2dom's {@link PDFDomTree}.
 *
 * <p>The PDF is loaded with PDFBox and rendered into an in-memory writer; both are closed
 * before this method returns.
 *
 * @param inputStream stream holding the PDF document
 * @return the markup written by {@code PDFDomTree.writeText}
 * @throws Exception if the PDF cannot be loaded or rendered
 */
public static String convertPdftohtml(InputStream inputStream) throws Exception {
    try (PDDocument pdf = PDDocument.load(inputStream)) {
        try (StringBuilderWriter htmlWriter = new StringBuilderWriter(500)) {
            PDFDomTree domTree = new PDFDomTree();
            domTree.writeText(pdf, htmlWriter);
            return htmlWriter.toString();
        }
    }
}
/**
 * Extracts the text of every page of a PDF document using ICEpdf.
 *
 * <p>Each page's text is trimmed and appended directly after the previous page's text, with
 * no separator. Pages with no page text, or whose page lines are {@code null}, are skipped.
 * The ICEpdf document is disposed and the supplied stream is closed quietly before this
 * method returns.
 *
 * @param inputStream stream holding the PDF document; closed by this method
 * @return the concatenated text of all pages
 * @throws Exception if the PDF cannot be loaded or its text cannot be read
 */
public static String extractPdfText(InputStream inputStream) throws Exception {
    StringBuilder sb = new StringBuilder(500);
    org.icepdf.core.pobjects.Document document = null;
    try {
        document = new org.icepdf.core.pobjects.Document();
        document.setInputStream(inputStream, null);
        int numberOfPages = document.getNumberOfPages();
        for (int i = 0; i < numberOfPages; i++) {
            PageText pageText = document.getPageText(i);
            if (pageText != null && pageText.getPageLines() != null) {
                sb.append(pageText.toString().trim());
            }
        }
    } finally {
        try {
            // Release the resources held by the ICEpdf document; previously never disposed.
            if (document != null) {
                document.dispose();
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    }
    return sb.toString();
}
/**
 * Renders the pages of the given PDF files to PNG images using ICEpdf.
 *
 * <p>Files whose extension is not {@code pdf} (case-insensitive) are skipped. Pages are
 * rendered at a zoom factor of 2.5 with no rotation, using the crop box as page boundary.
 * A single-page PDF named {@code name.pdf} produces {@code name.png}; a multi-page PDF
 * produces {@code name_1.png}, {@code name_2.png}, ... so that pages no longer overwrite
 * each other. A page whose image cannot be written is logged and skipped.
 *
 * <p>The method is static because this class has a private constructor, which left the
 * former instance method without any way to be called from outside.
 *
 * @param pdfFiles    PDF files to render
 * @param toDirectory directory that receives the images; created if it does not exist
 * @return the image files that were written successfully
 * @throws Exception if the output directory cannot be created, or a PDF cannot be opened
 *                   or rendered
 */
public static List<File> pdfToImg(List<File> pdfFiles, String toDirectory) throws Exception {
    List<File> imageFiles = new ArrayList<>();
    File outputDir = new File(toDirectory);
    // Re-check isDirectory() after mkdirs() in case another thread created it meanwhile.
    if (!outputDir.isDirectory() && !outputDir.mkdirs() && !outputDir.isDirectory()) {
        throw new IOException("Unable to create image output directory: " + toDirectory);
    }

    final float scale = 2.5f;    // zoom factor
    final float rotation = 0f;   // rotation angle

    for (File pdfFile : pdfFiles) {
        String pdfName = pdfFile.getName();
        if (!PDF_FILE_EXTENSION.equalsIgnoreCase(FilenameUtils.getExtension(pdfName))) {
            continue;
        }
        String baseName = FilenameUtils.getBaseName(pdfName);

        // Fully qualified: the simple name Document is already taken by org.w3c.dom.Document.
        org.icepdf.core.pobjects.Document document = new org.icepdf.core.pobjects.Document();
        try (InputStream pdfStream = new FileInputStream(pdfFile)) {
            try {
                document.setInputStream(pdfStream, null);
                int pageCount = document.getNumberOfPages();
                for (int i = 0; i < pageCount; i++) {
                    BufferedImage image = (BufferedImage) document.getPageImage(
                            i, GraphicsRenderingHints.SCREEN, Page.BOUNDARY_CROPBOX, rotation, scale);
                    // Give every page of a multi-page document its own file name.
                    String pageSuffix = pageCount > 1 ? "_" + (i + 1) : "";
                    File target = new File(outputDir, baseName + pageSuffix + ".png");
                    try {
                        ImageIO.write(image, "png", target);
                        imageFiles.add(target);
                    } catch (IOException e) {
                        // Best effort: keep rendering the remaining pages.
                        log.error("Failed to write page image {}", target, e);
                    } finally {
                        image.flush();
                    }
                }
            } finally {
                // Dispose before the surrounding try-with-resources closes the stream.
                document.dispose();
            }
        }
    }
    return imageFiles;
}
}
/*
Maven (pom.xml) repository and dependencies used by this class:
<repositories>
<repository>
<id>AsposeJavaAPI</id>
<name>Aspose Java API</name>
<url>https://repository.aspose.com/repo/</url>
</repository>
</repositories>
<dependency>
<groupId>org.icepdf.os</groupId>
<artifactId>icepdf-core</artifactId>
<version>6.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.17</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.17</version>
</dependency>
<dependency>
<groupId>net.sf.cssbox</groupId>
<artifactId>pdf2dom</artifactId>
<version>1.8</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>com.aspose</groupId>
<artifactId>aspose-words</artifactId>
<version>20.3</version>
<classifier>jdk17</classifier>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document.docx</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
*/