Word 转 HTML
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
/**
 * Converts uploaded Microsoft Word documents to HTML markup.
 *
 * <p>{@link #Word2007ToHtml(MultipartFile)} handles {@code .docx} uploads and
 * {@link #Word2003ToHtml(MultipartFile)} handles {@code .doc} uploads. Both methods log an
 * error and return {@code null} when the upload is empty or its file name does not carry the
 * expected extension.
 */
public class Word2HtmlUtil {
    /** Logger used to report rejected uploads. */
    private static final Logger logger = LogManager.getLogger(Word2HtmlUtil.class);

    /**
     * Converts an uploaded {@code .docx} document to XHTML.
     *
     * @param file the uploaded document; its original file name must end in {@code .docx}
     *             (compared case-insensitively)
     * @return the generated markup, or {@code null} if the upload is empty, has no file name,
     *         or has a different extension
     * @throws IOException if the upload cannot be read or the conversion fails
     */
    public static String Word2007ToHtml(MultipartFile file) throws IOException {
        if (file.isEmpty() || file.getSize() <= 0) {
            logger.error("Sorry File does not Exists!");
            return null;
        }
        if (!hasExtension(file.getOriginalFilename(), ".docx")) {
            logger.error("Enter only MS Office 2007+ files");
            return null;
        }
        // try-with-resources: the upload stream is released even when parsing/conversion throws.
        try (InputStream in = file.getInputStream()) {
            XWPFDocument document = new XWPFDocument(in);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(document, baos, null);
            // NOTE(review): decoded with the platform default charset, exactly as before.
            // Confirm which encoding XHTMLConverter writes to the stream before pinning one.
            return baos.toString();
        }
    }

    /**
     * Converts an uploaded {@code .doc} document to HTML.
     *
     * @param file the uploaded document; its original file name must end in {@code .doc}
     *             (compared case-insensitively)
     * @return the generated markup, or {@code null} if the upload is empty, has no file name,
     *         or has a different extension
     * @throws IOException                  if the upload cannot be read
     * @throws ParserConfigurationException if no DOM builder can be created for the output
     * @throws TransformerException         if the DOM cannot be serialized to HTML
     */
    public static String Word2003ToHtml(MultipartFile file)
            throws IOException, ParserConfigurationException, TransformerException {
        if (file.isEmpty() || file.getSize() <= 0) {
            logger.error("Sorry File does not Exists!");
            return null;
        }
        if (!hasExtension(file.getOriginalFilename(), ".doc")) {
            logger.error("Enter only MS Office 2003 files");
            return null;
        }
        // try-with-resources: the upload stream is released even when parsing/conversion throws.
        try (InputStream input = file.getInputStream()) {
            HWPFDocument wordDocument = new HWPFDocument(input);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Walk the Word document and build the HTML DOM.
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            // Serialize the DOM into an in-memory buffer.
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(baos));

            // The bytes were written as UTF-8 above, so they must be decoded as UTF-8;
            // the platform default charset would garble non-ASCII text on non-UTF-8 systems.
            return baos.toString("UTF-8");
        }
    }

    /**
     * Tells whether {@code fileName} ends with {@code extension}, ignoring case.
     *
     * @param fileName  the name to inspect; may be {@code null}
     * @param extension the suffix to look for, including the leading dot
     * @return {@code true} only if {@code fileName} is non-null and ends with {@code extension}
     */
    private static boolean hasExtension(String fileName, String extension) {
        if (fileName == null || fileName.length() < extension.length()) {
            return false;
        }
        int offset = fileName.length() - extension.length();
        return fileName.regionMatches(true, offset, extension, 0, extension.length());
    }
}
用到的依赖如下。项目框架里原本使用的是 poi-ooxml 3.15,会导致版本冲突、无法正常使用,改为 3.13 后可行。
<!-- Apache POI OOXML module; pinned to 3.13 here. -->
<!-- NOTE(review): poi-ooxml is 3.13 while poi-scratchpad below is 3.15. Confirm this version mix is intentional. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.13</version>
</dependency>
<!-- Apache POI scratchpad module; presumably supplies the org.apache.poi.hwpf classes used for .doc files (verify). -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
读取 Word 文件内容
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.*;
/**
 * Extracts the plain text of Microsoft Word files.
 *
 * <p>Both methods are best-effort: any failure while opening or parsing the file is reported
 * via {@link Throwable#printStackTrace()} and an empty string is returned instead.
 */
public class DocUtil {
    /**
     * Reads the text content of a {@code .doc} file.
     *
     * @param file the file to read
     * @return the extracted text, or an empty string if the file could not be read or parsed
     * @throws IOException declared for compatibility with existing callers; failures are
     *                     caught inside the method and reported as an empty result
     */
    public static String doc2String(File file) throws IOException {
        // try-with-resources closes the stream even when the extractor throws.
        try (FileInputStream fis = new FileInputStream(file)) {
            WordExtractor extractor = new WordExtractor(fis);
            String text = extractor.getText();
            return text != null ? text : "";
        } catch (Exception e) {
            // Deliberate best-effort: report the problem and fall back to an empty result.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads the text content of a {@code .docx} file.
     *
     * @param file the file to read
     * @return the extracted text, or an empty string if the file could not be read or parsed
     * @throws IOException declared for compatibility with existing callers; failures are
     *                     caught inside the method and reported as an empty result
     */
    public static String docx2String(File file) throws IOException {
        // try-with-resources closes the stream even when parsing or extraction throws.
        try (FileInputStream fis = new FileInputStream(file)) {
            XWPFDocument xdoc = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
            String text = extractor.getText();
            return text != null ? text : "";
        } catch (Exception e) {
            // Deliberate best-effort: report the problem and fall back to an empty result.
            e.printStackTrace();
            return "";
        }
    }
}
用到的依赖
<!-- xdocreport XWPF-to-XHTML converter; presumably the source of org.apache.poi.xwpf.converter.xhtml.XHTMLConverter (verify). -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.6</version>
</dependency>
<!-- OOXML schema classes, version 1.4. NOTE(review): confirm this version matches the POI release in use. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.4</version>
</dependency>