将Doc或者Docx文档处理成html的代码逻辑
下面是maven的配置代码:
<!-- 文档处理所需的jar的依赖 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-examples</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.core</artifactId>
<version>1.0.4</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.1</version>
</dependency>
<!-- 文档处理所需的jar的依赖 -->
将word处理成html的代码:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.w3c.dom.Document;
import com.sun.org.apache.xalan.internal.xsltc.compiler.Template;
import cn.com.hbny.docdetection.entity.ResourcesWord;
import cn.com.hbny.docdetection.server.ExtendedServerConfig;
import cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType;
/**
 * @brief ReadWordUtils.java Utility class for processing Word documents: it reads
 *        text and simple statistics from .doc/.docx files and converts them to HTML.
 * @attention
 * @author toto
 * @date 2017-03-03
 * @note begin modify by 涂作权 2017-03-03 initial creation
 */
public final class ReadWordUtils {
// log4j logger for this utility class.
private static Logger logger = Logger.getLogger(ReadWordUtils.class);
// Charset name used when writing generated HTML to disk.
protected static final String CHARSET_UTF8 = "UTF-8";
// Mutable static scratch field holding an image-folder path. Being static it is
// shared by every caller of this class.
// NOTE(review): any use of it is not safe under concurrent conversions.
private static String tempImagePath = "";
/**
 * Reads a .docx file and collects its text together with simple statistics.
 *
 * <p>Every non-empty body paragraph is counted and appended to the content. The
 * sentence count is the number of pieces obtained by splitting each paragraph on
 * the Chinese full stop "。". The text of every table cell is appended after the
 * paragraphs. The word count is the length of the trimmed, combined content.
 *
 * @param path location of the .docx file to read
 * @return a ResourcesWord carrying the content, paragraph count, sentence count
 *         and character count
 * @throws Exception if the file cannot be opened or parsed
 */
public static ResourcesWord readDocx(String path) throws Exception {
    int paragNum = 0; // number of non-empty paragraphs
    int sentenceNum = 0; // number of sentences
    StringBuilder content = new StringBuilder();

    InputStream is = new FileInputStream(path);
    try {
        XWPFDocument doc = new XWPFDocument(is);

        for (XWPFParagraph para : doc.getParagraphs()) {
            String text = para.getText();
            if (!StringUtils.isEmpty(text)) {
                paragNum++;
                sentenceNum += text.replace("\r\n", "").trim().split("。").length;
                content.append(text);
            }
        }

        // Table text is appended to the content but does not contribute to the
        // paragraph or sentence counters.
        for (XWPFTable table : doc.getTables()) {
            for (XWPFTableRow row : table.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    content.append(cell.getText());
                }
            }
        }
    } finally {
        // Release the file handle even when parsing fails.
        close(is);
    }

    String fullText = content.toString();
    ResourcesWord resourcesWord = new ResourcesWord();
    resourcesWord.setContent(fullText);
    resourcesWord.setParagNum(paragNum);
    resourcesWord.setSentenceNum(sentenceNum);
    // Character count of the whole content, ignoring leading/trailing whitespace.
    resourcesWord.setWordNum(fullText.trim().length());
    return resourcesWord;
}
/**
 * Reads a binary .doc file and collects its text together with simple statistics.
 *
 * <p>Each extracted paragraph is counted, its trimmed text is appended to the
 * content and its trimmed length is added to the character count. The sentence
 * count is the number of pieces obtained by splitting each paragraph on the
 * Chinese full stop "。".
 *
 * <p>Reading is best-effort: if the file cannot be opened or parsed the failure
 * is logged and an empty ResourcesWord is returned.
 *
 * @param path location of the .doc file to read
 * @return a ResourcesWord carrying the content and counters; its fields are left
 *         unset when reading fails
 * @throws IOException declared for interface compatibility
 */
public static ResourcesWord readDoc(String path) throws IOException {
    int paragNum = 0; // number of paragraphs
    int sentenceNum = 0; // number of sentences
    int wordNum = 0; // number of characters
    ResourcesWord resourcesWord = new ResourcesWord();
    StringBuilder content = new StringBuilder();
    InputStream is = null;
    try {
        is = new FileInputStream(new File(path));
        WordExtractor ex = new WordExtractor(is);
        String[] paragraph = ex.getParagraphText();
        for (int i = 0; i < paragraph.length; i++) {
            paragNum++;
            if (logger.isDebugEnabled()) {
                logger.debug("Paragraph " + (i + 1) + " : " + paragraph[i]);
            }
            sentenceNum += paragraph[i].replace("\r\n", "").trim().split("。").length;
            String trimmed = paragraph[i].trim();
            wordNum += trimmed.length();
            content.append(trimmed);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("段落:" + paragNum);
            logger.debug("句子:" + sentenceNum);
            logger.debug("字体:" + wordNum);
        }
        resourcesWord.setContent(content.toString());
        resourcesWord.setParagNum(paragNum);
        resourcesWord.setSentenceNum(sentenceNum);
        resourcesWord.setWordNum(wordNum);
    } catch (Exception e) {
        // Keep the original best-effort contract, but record the failure in the log.
        logger.error("Failed to read doc file: " + path, e);
    } finally {
        // Release the file handle on both the success and the failure path.
        close(is);
    }
    return resourcesWord;
}
/**
 * \brief Converts a binary .doc file to HTML and returns the relative path of the output.
 *
 * <p>Pictures of the document are written into a folder next to the HTML file,
 * named after the output path with its ".html" suffix removed (or with "_files"
 * appended when the output path contains no ".html"). The generated
 * {@code img src} values point to {@code <folder name>/<picture name>}.
 *
 * @param filePath :the .doc document to convert
 * @param outPutFilePath :where the generated HTML file is written
 * @return the output path relative to the configured preview prefix
 * @author toto
 * @throws TransformerException if the HTML DOM cannot be serialized
 * @throws IOException if the source cannot be read or a picture cannot be written
 * @throws ParserConfigurationException if no DOM document builder can be created
 * @date 2017-02-27
 * @note begin modify by 涂作权 2017-02-27 initial creation
 */
public static String doc2Html(
        String filePath,
        final String outPutFilePath)
        throws TransformerException, IOException, ParserConfigurationException {
    // Load the document and release the file handle straight away.
    HWPFDocument wordDocument;
    InputStream docIn = new FileInputStream(filePath);
    try {
        wordDocument = new HWPFDocument(docIn);
    } finally {
        close(docIn);
    }

    // Resolve the picture folder once, locally, instead of passing it through a
    // shared static field that concurrent conversions would overwrite.
    int suffixAt = outPutFilePath.lastIndexOf(".html");
    final File imageFolder = new File(
            suffixAt >= 0 ? outPutFilePath.substring(0, suffixAt) : outPutFilePath + "_files");
    final String imageFolderName = imageFolder.getName();

    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
            DocumentBuilderFactory
                    .newInstance()
                    .newDocumentBuilder()
                    .newDocument());
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
            // The returned value becomes the img src, so use the URL separator
            // rather than the platform file separator.
            return imageFolderName + "/" + suggestedName;
        }
    });
    wordToHtmlConverter.processDocument(wordDocument);

    // Save the pictures.
    List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
    if (pics != null && !pics.isEmpty()) {
        if (!imageFolder.exists() && !imageFolder.mkdirs()) {
            logger.error("Unable to create image folder: " + imageFolder.getPath());
        }
        for (Picture pic : pics) {
            OutputStream picOut = null;
            try {
                picOut = new FileOutputStream(new File(imageFolder, pic.suggestFullFileName()));
                pic.writeImageContent(picOut);
            } catch (FileNotFoundException e) {
                // A picture that cannot be created is skipped; the conversion goes on.
                logger.error("Unable to create picture file in " + imageFolder.getPath(), e);
            } finally {
                IOUtils.closeQuietly(picOut);
            }
        }
    }

    // Serialize the generated DOM as UTF-8 HTML.
    Document htmlDocument = wordToHtmlConverter.getDocument();
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

    // Decode with the same charset the serializer used; the platform default
    // (e.g. GBK) would corrupt non-ASCII text.
    writeFile(new String(out.toByteArray(), CHARSET_UTF8), outPutFilePath);
    return gainRelativePathByOutputPath(outPutFilePath);
}
/**
 * Converts a .docx Word document to an HTML file.
 *
 * <p>Embedded images are extracted into the folder reported by
 * {@code gainTempImagePath}, and the generated {@code img src} values are
 * relative to that folder's name rather than absolute file paths.
 *
 * @param filePath location of the source .docx file
 * @param outPutFilePath path of the HTML file to write
 * @return the output path relative to the configured preview prefix
 * @throws TransformerException declared for interface compatibility
 * @throws IOException if the source cannot be read, the image folder cannot be
 *         determined, or the HTML cannot be written
 * @throws ParserConfigurationException declared for interface compatibility
 */
public static String docx2Html(String filePath,
        final String outPutFilePath)
        throws TransformerException, IOException, ParserConfigurationException {
    InputStream in = new FileInputStream(filePath);
    OutputStream out = null;
    try {
        XWPFDocument wordDocument = new XWPFDocument(in);
        XHTMLOptions options = XHTMLOptions.create().indent(4);

        // Export the images.
        Map<String, String> imageInfoMap = gainTempImagePath(outPutFilePath);
        if (imageInfoMap == null) {
            throw new IOException(
                    "Unable to determine the image folder for output path: " + outPutFilePath);
        }
        File imageFolder = new File(imageInfoMap.get("imageStoredPath"));
        options.setExtractor(new FileImageExtractor(imageFolder));

        // A FileURIResolver would put absolute paths into the HTML; the
        // BasicURIResolver keeps the img src relative to the image folder name.
        options.URIResolver(new BasicURIResolver(imageInfoMap.get("imageFolder")));

        File outFile = new File(outPutFilePath);
        File parentFolder = outFile.getParentFile();
        if (parentFolder != null) {
            parentFolder.mkdirs();
        }
        out = new FileOutputStream(outFile);
        XHTMLConverter.getInstance().convert(wordDocument, out, options);
    } finally {
        // Both streams were previously left open; release them on every path.
        IOUtils.closeQuietly(out);
        close(in);
    }
    return gainRelativePathByOutputPath(outPutFilePath);
}
/**
 * \brief Writes the given content to the file at {@code path} using UTF-8.
 *
 * <p>Missing parent directories are created. Writing is best-effort: a blank
 * path or an I/O failure is logged and the method returns normally.
 *
 * @param docContent :the document content to write
 * @param path :the final storage path of the file
 * @author toto
 * @date 2017-02-27
 * @note begin modify by 涂作权 2017-02-27 changed the output file name
 */
public static void writeFile(String docContent, String path) {
    if (StringUtils.isBlank(path)) {
        logger.warn("writeFile called with a blank path; nothing was written");
        return;
    }
    try {
        // Creates missing parent directories, copes with a path that has no
        // parent, and always closes the underlying stream.
        FileUtils.writeStringToFile(new File(path), docContent, CHARSET_UTF8);
    } catch (IOException ioe) {
        logger.error("Failed to write file: " + path, ioe);
    }
}
/**
 * Closes the given input stream, tolerating a null argument.
 *
 * <p>An IOException raised while closing is printed and not propagated.
 *
 * @param is the stream to close; may be null
 */
private static void close(InputStream is) {
    if (is == null) {
        return;
    }
    try {
        is.close();
    } catch (IOException closeFailure) {
        closeFailure.printStackTrace();
    }
}
/**
 * \brief Derives the image storage folder from the HTML output path and creates it.
 *
 * <p>The folder is the output path with its trailing ".html" removed. The
 * returned map has two entries: "imageStoredPath" is the folder's path and
 * "imageFolder" is the folder's bare name.
 *
 * @param outPutFilePath :the HTML output path
 * @return the map described above, or null when the output path is null or has
 *         no usable ".html" suffix
 * @author toto
 * @date 2017-02-28
 */
private static Map<String, String> gainTempImagePath(String outPutFilePath) {
    // lastIndexOf: only the file's own suffix counts, not an earlier ".html"
    // that happens to appear inside a directory name.
    int suffixAt = outPutFilePath == null ? -1 : outPutFilePath.lastIndexOf(".html");
    if (suffixAt <= 0) {
        logger.error("Cannot derive an image folder from output path: " + outPutFilePath);
        return null;
    }

    // Kept local: this method no longer touches the shared static scratch field.
    File imageFolder = new File(outPutFilePath.substring(0, suffixAt));
    if (!imageFolder.exists()) {
        try {
            FileUtils.forceMkdir(imageFolder);
        } catch (IOException e) {
            logger.error("Unable to create image folder: " + imageFolder.getPath(), e);
        }
    }

    Map<String, String> imageInfoMap = new HashMap<String, String>();
    imageInfoMap.put("imageStoredPath", imageFolder.getPath());
    // The bare folder name, obtained directly instead of by string replacement
    // of the parent path.
    imageInfoMap.put("imageFolder", imageFolder.getName());
    return imageInfoMap;
}
/**
 * Returns the part of the output path that follows the configured
 * DOCS_PREVIEW_PREFIX value.
 *
 * <p>The prefix is matched as literal text. When the prefix is not configured or
 * does not occur in the path, the path is returned unchanged.
 *
 * @param outPutFilePath the full output path of a generated file
 * @return the text after the first occurrence of the preview prefix, or the
 *         unchanged path when there is no such occurrence
 */
private static String gainRelativePathByOutputPath(String outPutFilePath) {
    // Storage path prefix used for previewing.
    String docsPreviewPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_PREVIEW_PREFIX");
    if (outPutFilePath == null || docsPreviewPath == null || docsPreviewPath.length() == 0) {
        return outPutFilePath;
    }
    // indexOf/substring instead of split: split would interpret the prefix as a
    // regular expression and fail with an index error when it does not match.
    int prefixAt = outPutFilePath.indexOf(docsPreviewPath);
    if (prefixAt < 0) {
        logger.warn("Preview prefix '" + docsPreviewPath + "' not found in " + outPutFilePath);
        return outPutFilePath;
    }
    return outPutFilePath.substring(prefixAt + docsPreviewPath.length());
}
/**
 * \brief Replaces every match of a regular expression in a string.
 *
 * @param orgStr :the original string to process
 * @param regEx :the regular expression to search for
 * @param targetStr :the replacement inserted for each match
 * @return the string with all matches replaced, or null when {@code orgStr} is
 *         null, empty or consists only of whitespace
 * @author toto
 * @date 2017-03-04
 * @note begin modify by 涂作权 initial creation 2017-03-04
 */
public static String replaceStr(String orgStr,String regEx,String targetStr){
    boolean nothingToProcess = orgStr == null || "".equals(orgStr.trim());
    if (nothingToProcess) {
        return null;
    }
    return Pattern.compile(regEx).matcher(orgStr).replaceAll(targetStr);
}
public static void main(String[] args) throws Exception {
// String uploadFile = ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH");
// String docsTempPath = ExtendedServerConfig.getInstance().getStringProperty("DOCS_TEMP_PATH");
// String docsOutputPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_OUTPUT_PATH");
// System.out.println("uploadFile = " + uploadFile + " " + docsTempPath + " " + docsOutputPath);
//
// Testtest.readWord("E://111.doc");
// Testtest.readDoc();
// System.out.println(content);
// ResourcesWord readDocx = ReadWordUtils.readDoc(uploadFile + "/大学生创新创业项目申报书.doc");
// logger.info(readDocx.getContent());
// logger.info(readDocx.getParagNum());
//
// new ReadWordUtils().doc2Html(uploadFile + "/大学生创新创业项目申报书.doc" , docsOutputPath + "/大学生创新创业项目申报书.html");
//new ReadWordUtils().docx2Html(uploadFile + "/大学生创新创业项目申报书副本.docx" , docsOutputPath + "/大学生创新创业项目申报书副本.html");
String newStr = replaceStr("afdas//\\as dfasd a//asd\\\\\\asd\\/", "[\\\\]","/");
newStr = replaceStr(newStr, "(/){1,}", "/");
newStr = replaceStr(newStr, "[ ]", "");
System.out.println(newStr);
}
}
下面是调用案例:
import java.io.File;
import org.apache.log4j.Logger;
import org.springframework.stereotype.Service;
import cn.com.hbny.docdetection.mongodb.beans.DocInfo;
import cn.com.hbny.docdetection.server.ExtendedServerConfig;
import cn.com.hbny.docdetection.service.base.impl.BaseServiceImpl;
import cn.com.hbny.docdetection.service.docInfoHandler.DocInfoHandlerService;
import cn.com.hbny.docdetection.utils.Pinyin4jUtils;
import cn.com.hbny.docdetection.utils.ReadWordUtils;
import cn.com.hbny.docdetection.utils.UUIDGenerator;
import cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType;
/**
 * @brief DocInfoHandlerServiceImpl.java Service implementation that handles the
 *        documents submitted for document detection.
 * @attention
 * @author toto
 * @date 2017-03-02
 * @note begin modify by 涂作权 2017-03-02 initial creation
 */
@Service(value = "docInfoHandlerService")
public class DocInfoHandlerServiceImpl extends BaseServiceImpl implements DocInfoHandlerService {
// log4j logger for this service.
private static Logger logger = Logger.getLogger(DocInfoHandlerServiceImpl.class);
/**
 * Converts one uploaded Word document to HTML and describes it as a DocInfo.
 *
 * <p>The HTML output path is built from the output folder plus the document's
 * path relative to the upload folder, with the extension replaced by ".html".
 * It is then converted to lower-case pinyin and normalized: backslashes become
 * "/", repeated "/" collapse to one, and spaces are removed.
 *
 * @param docLibrayId :id of the document library the document belongs to
 * @param originalDocPath :location of the original document
 * @param uploadPath :upload root; its length is cut off the front of
 *        originalDocPath to obtain the relative path
 * @param outPutFolderPath :root folder that receives the generated HTML
 * @param docsPreviewPrefix :preview prefix; not used by this method
 * @return the populated DocInfo, or null when the file does not exist, is
 *         neither a .doc nor a .docx file, or the conversion fails
 */
public DocInfo handlerSingleDocInfo(
        String docLibrayId,
        String originalDocPath,
        String uploadPath,
        String outPutFolderPath,
        String docsPreviewPrefix) {
    try {
        File file = new File(originalDocPath);
        // Nothing to do when the source file is missing.
        if (!file.exists()) {
            logger.warn("Document does not exist: " + originalDocPath);
            return null;
        }

        // Decide the format from the extension, ignoring case, and reject
        // anything that is neither .doc nor .docx.
        String fileName = file.getName();
        String lowerCaseName = fileName.toLowerCase();
        boolean isDoc = lowerCaseName.endsWith(".doc");
        if (!isDoc && !lowerCaseName.endsWith(".docx")) {
            logger.warn("Unsupported document type: " + originalDocPath);
            return null;
        }

        DocInfo docInfo = new DocInfo();
        docInfo.setId(UUIDGenerator.generate());
        docInfo.setDocLibrayId(docLibrayId);
        // File name without its extension.
        docInfo.setOriginalFileName(fileName.substring(0, fileName.lastIndexOf('.')));

        // The part of the path that follows the upload folder.
        String fileRelativePath = originalDocPath.substring(uploadPath.length());
        docInfo.setOriginalDocPath(fileRelativePath);

        // Swap only the trailing extension for ".html"; a ".doc" that appears
        // earlier in the path must stay untouched.
        String relativeHtmlPath =
                fileRelativePath.substring(0, fileRelativePath.lastIndexOf('.')) + ".html";
        String outPutFilePath = Pinyin4jUtils.toPinYin(
                outPutFolderPath + relativeHtmlPath,
                PinyinType.LOWERCASE);
        outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[\\\\]","/");
        outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "(/){1,}", "/");
        outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[ ]", "");

        // Convert and remember where the generated HTML ended up.
        String filePathAfterHandled = isDoc
                ? ReadWordUtils.doc2Html(originalDocPath, outPutFilePath)
                : ReadWordUtils.docx2Html(originalDocPath, outPutFilePath);
        docInfo.setHtmlDocPath(filePathAfterHandled);

        // Hand the populated result back; it was previously discarded.
        return docInfo;
    } catch (Exception e) {
        logger.error("Failed to handle document: " + originalDocPath, e);
    }
    return null;
}
/**
 * Manual smoke run: reads the upload folder, output folder and preview prefix
 * from the server configuration and converts one sample .docx document.
 * Replace the sample path to try another document.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String uploadRoot = ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH");
    String outputRoot = ExtendedServerConfig.getInstance().getStringProperty("DOCS_OUTPUT_PATH");
    String previewPrefix = ExtendedServerConfig.getInstance().getStringProperty("DOCS_PREVIEW_PREFIX");

    String sampleDocument = uploadRoot + "/双创项目申报书20170301/专题产品需求规格说明书.docx";
    DocInfoHandlerServiceImpl service = new DocInfoHandlerServiceImpl();
    service.handlerSingleDocInfo(
            UUIDGenerator.generate(),
            sampleDocument,
            uploadRoot,
            outputRoot,
            previewPrefix);
}
}
下面是所有用到的参数配置:
#上传的文件的存储位置的配置,统一的最后面不要加斜杠
UPLOAD_PATH=D:/installed/apache-tomcat-7.0.47/webapps/upload
##处理后的文档输出位置,统一的最后面不要加斜杠
DOCS_OUTPUT_PATH=D:/installed/apache-tomcat-7.0.47/webapps/docs-output-path
##文档预览路径,注意最后面不要加斜杠
DOCS_PREVIEW_PREFIX=/docs-output-path
##处理文档时,生成的一些图片的临时存储路径,最后面不要加斜杠
DOCS_TEMP_PATH=D:/installed/apache-tomcat-7.0.47/webapps/temp