Word2Html(doc docx)

最新推荐文章于 2023-06-16 10:48:45 发布

weixin_34006965

最新推荐文章于 2023-06-16 10:48:45 发布

阅读量1k

点赞数

原文链接：https://my.oschina.net/xiaoshoubingliang/blog/697355

版权

2019独角兽企业重金招聘Python工程师标准>>>

package com.htfg.dto;

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.ParagraphAlignment;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.w3c.dom.Document;

/**
 * Utility methods that convert Word documents to HTML files.
 *
 * <p>Binary {@code .doc} files are converted with {@code WordToHtmlConverter}; OOXML
 * {@code .docx} files are converted with {@code XHTMLConverter}. Images found in a document
 * are written below {@code <grandparent directory of the document>/temps/imgs}.
 *
 * <p>Following the existing convention of this class, the methods that do not declare
 * checked exceptions report failures with {@code printStackTrace()} and carry on.
 */
public class DocumentTransformUtil {

    /** File extension of binary Word documents. */
    private static final String DOC_EXTENSION = ".doc";

    /** File extension of OOXML Word documents. */
    private static final String DOCX_EXTENSION = ".docx";

    /** File extension given to every generated HTML file. */
    private static final String HTML_EXTENSION = ".html";

    /** Encoding used both when serializing the DOM and when writing the HTML file. */
    private static final String HTML_ENCODING = "utf-8";

    /** Base URL under which images extracted from {@code .docx} files are referenced. */
    private static final String DOCX_IMAGE_BASE_URL =
            "http://localhost:8080/SearchBySolr/res/temps/imgs";

    /**
     * Converts a binary {@code .doc} file to HTML.
     *
     * <p>Pictures are saved to {@code <grandparent>/temps/imgs/<document name>/} and are
     * referenced from the HTML as {@code ../temps/imgs/<document name>/<picture name>}.
     * Any failure is printed to standard error; nothing is thrown to the caller.
     *
     * @param docFile the {@code .doc} file to read
     * @param htmlFile the HTML file to create or overwrite
     */
    public static void doc2Html(final File docFile, File htmlFile) {
        final String imageDirName = stripExtension(docFile.getName(), DOC_EXTENSION);
        final File imgDir = new File(imageRoot(docFile), imageDirName);
        // mkdirs() also creates missing "temps/imgs" parents, which a plain mkdir() cannot do.
        if (!imgDir.isDirectory() && !imgDir.mkdirs()) {
            System.err.println("Could not create image directory " + imgDir);
        }

        FileInputStream docStream = null;
        try {
            docStream = new FileInputStream(docFile);
            HWPFDocument wordDocument = new HWPFDocument(docStream);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType,
                        String suggestedName, float widthInches, float heightInches) {
                    // Only the link is produced here; the image files are written by savePictures().
                    return "../temps/imgs/" + imageDirName + "/" + suggestedName;
                }
            });
            wordToHtmlConverter.processDocument(wordDocument);
            savePictures(wordDocument, imgDir);

            Document htmlDocument = wordToHtmlConverter.getDocument();
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, HTML_ENCODING);
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

            // Decode with the charset the serializer encoded with; relying on the platform
            // default charset corrupts non-ASCII text on systems whose default is not UTF-8.
            writeFile(out.toString(HTML_ENCODING), htmlFile);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            close(docStream);
        }
    }

    /**
     * Converts an OOXML {@code .docx} file to XHTML.
     *
     * <p>Before conversion every paragraph of the in-memory document is left-aligned with a
     * left indentation of 2, no hanging indentation and word wrap enabled. Images are
     * extracted below {@code <grandparent>/temps/imgs} and referenced through
     * {@link #DOCX_IMAGE_BASE_URL}.
     *
     * @param docFile the {@code .docx} file to read
     * @param htmlFile the HTML file to create or overwrite
     * @throws TransformerException declared for backward compatibility with existing callers
     * @throws IOException if the document cannot be read or the HTML file cannot be written
     * @throws ParserConfigurationException declared for backward compatibility with existing callers
     */
    public static void docx2Html(final File docFile, File htmlFile)
            throws TransformerException, IOException, ParserConfigurationException {
        File imageFolder = imageRoot(docFile);
        OPCPackage pack = POIXMLDocument.openPackage(docFile.getPath());
        OutputStream out = null;
        try {
            XWPFDocument document = new XWPFDocument(pack);

            XHTMLOptions options = XHTMLOptions.create().indent(4);
            options.URIResolver(new BasicURIResolver(DOCX_IMAGE_BASE_URL));
            options.setExtractor(new FileImageExtractor(imageFolder));

            // Normalize paragraph formatting: left aligned, no hanging indentation.
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                paragraph.setIndentationLeft(2);
                paragraph.setIndentationHanging(0);
                paragraph.setAlignment(ParagraphAlignment.LEFT);
                paragraph.setWordWrap(true);
            }

            out = new FileOutputStream(htmlFile);
            XHTMLConverter.getInstance().convert(document, out, options);
        } finally {
            close(out);
            // revert() releases the package without saving. close() is avoided on purpose:
            // on a package opened for writing it would save back to the source .docx.
            pack.revert();
        }
    }

    /**
     * Converts every {@code .doc} file directly inside a directory to HTML.
     *
     * <p>Sub-directories and files with another extension are skipped. Nothing happens when
     * {@code docsDir} cannot be listed.
     *
     * @param docsDir directory that holds the {@code .doc} files
     * @param htmlsDir directory that receives the generated HTML files
     */
    public static void docs2Htmls(File docsDir, File htmlsDir) {
        for (File file : listEntries(docsDir)) {
            if (hasExtension(file, DOC_EXTENSION)) {
                doc2Html(file, htmlTarget(htmlsDir, file, DOC_EXTENSION));
            }
        }
    }

    /**
     * Converts every {@code .docx} file directly inside a directory to HTML.
     *
     * <p>Sub-directories and files with another extension are skipped. A document that fails
     * to convert is reported on standard error and does not stop the remaining conversions.
     *
     * @param docsDir directory that holds the {@code .docx} files
     * @param htmlsDir directory that receives the generated HTML files
     */
    public static void docxs2Htmls(File docsDir, File htmlsDir) {
        for (File file : listEntries(docsDir)) {
            if (!hasExtension(file, DOCX_EXTENSION)) {
                continue;
            }
            try {
                docx2Html(file, htmlTarget(htmlsDir, file, DOCX_EXTENSION));
            } catch (Exception e) {
                // Same best-effort policy as doc2Html: report and move on to the next file.
                e.printStackTrace();
            }
        }
    }

    /** Writes every picture embedded in the document into {@code imgDir}. */
    private static void savePictures(HWPFDocument wordDocument, File imgDir) throws IOException {
        List<Picture> pictures = wordDocument.getPicturesTable().getAllPictures();
        if (pictures == null) {
            return;
        }
        for (Picture picture : pictures) {
            FileOutputStream imageOut = null;
            try {
                imageOut = new FileOutputStream(new File(imgDir, picture.suggestFullFileName()));
                picture.writeImageContent(imageOut);
            } catch (FileNotFoundException e) {
                // A picture that cannot be created must not abort the whole conversion.
                e.printStackTrace();
            } finally {
                close(imageOut);
            }
        }
    }

    /**
     * Returns {@code <grandparent of docFile>/temps/imgs}. When the document has no
     * grandparent directory, its parent directory is used as the base instead.
     */
    private static File imageRoot(File docFile) {
        // Resolve against the absolute path so a bare relative name still has a parent.
        File parent = docFile.getAbsoluteFile().getParentFile();
        File base = parent == null ? null : parent.getParentFile();
        if (base == null) {
            base = parent;
        }
        return new File(new File(base, "temps"), "imgs");
    }

    /** Returns the HTML file inside {@code htmlsDir} that corresponds to {@code source}. */
    private static File htmlTarget(File htmlsDir, File source, String extension) {
        return new File(htmlsDir, stripExtension(source.getName(), extension) + HTML_EXTENSION);
    }

    /** Returns the entries of {@code dir}, or an empty array when it cannot be listed. */
    private static File[] listEntries(File dir) {
        File[] entries = dir == null ? null : dir.listFiles();
        return entries == null ? new File[0] : entries;
    }

    /** Tells whether {@code file} is a regular file whose name ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(File file, String extension) {
        String name = file.getName();
        return file.isFile()
                && name.regionMatches(true, name.length() - extension.length(),
                        extension, 0, extension.length());
    }

    /**
     * Removes a trailing {@code extension} (compared ignoring case) from {@code fileName}.
     * The name is returned unchanged when it does not end with the extension or consists
     * of nothing but the extension.
     */
    private static String stripExtension(String fileName, String extension) {
        int cut = fileName.length() - extension.length();
        if (cut > 0 && fileName.regionMatches(true, cut, extension, 0, extension.length())) {
            return fileName.substring(0, cut);
        }
        return fileName;
    }

    /**
     * Writes the document content to the HTML file using {@link #HTML_ENCODING}.
     * I/O failures are printed to standard error.
     *
     * @param content the HTML text to write
     * @param htmlFile the HTML file to create or overwrite
     */
    private static void writeFile(String content, File htmlFile) {
        FileOutputStream fos = null;
        BufferedWriter writer = null;
        try {
            fos = new FileOutputStream(htmlFile);
            writer = new BufferedWriter(new OutputStreamWriter(fos, HTML_ENCODING));
            writer.write(content);
            // Flush inside the try so that a failed write is reported as a write failure.
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(writer);
            // Covers the case where the writer was never created; a second close is harmless.
            close(fos);
        }
    }

    /** Closes the resource if it is non-null, printing (not propagating) any failure. */
    private static void close(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}