Word转HTML

使用 Apache POI 将 .doc 文档转换为 HTML（doc2html）

/**
	 * 
	 * @param file wrod文档file
	 * @param path 图片路径
	 * @throws TransformerException
	 * @throws FileNotFoundException
	 * @throws IOException
	 * @throws ParserConfigurationException
	 */
 public void doc2html(File file,String path) throws TransformerException, FileNotFoundException, IOException, ParserConfigurationException {
	   ByteArrayOutputStream out = new ByteArrayOutputStream();
	   HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(file));
		WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
				DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
		wordToHtmlConverter.setPicturesManager(new PicturesManager() {
			public String savePicture(byte[] content, PictureType pictureType, String suggestedName,
					float widthInches, float heightInches) {
				return suggestedName;
			}
		});
		wordToHtmlConverter.processDocument(wordDocument);
       //获取图片
		List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
		Document htmlDocument = wordToHtmlConverter.getDocument();
		DOMSource domSource = new DOMSource(htmlDocument);
		StreamResult streamResult = new StreamResult(out);
		TransformerFactory tf = TransformerFactory.newInstance();
		Transformer serializer = tf.newTransformer();
		serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
		serializer.setOutputProperty(OutputKeys.INDENT, "yes");
		serializer.setOutputProperty(OutputKeys.METHOD, "html");
		serializer.transform(domSource, streamResult);
		out.close();
		String htmlContent = new String(out.toByteArray());
		for (int i = 0; i < pics.size(); i++) {
			Picture pic = (Picture) pics.get(i);
			//替换图片路径
			String filename = pic.suggestFullFileName();
			String substring = filename.substring(filename.lastIndexOf(".") + 1);
			htmlContent = htmlContent.replaceAll(filename,
					"image"+"/"+"image"+ (i + 1) + "." + substring + "");
			byte[] temp = pic.getContent();
            //保存html文本
			   FileWriter htmlWriter =new  FileWriter(path+"index.html");
			   htmlWriter.write(htmlContent);
			   htmlWriter.close();
            //保存图片
			   OutputStream out1 = new FileOutputStream(path+"image\\image"+(i+1)+"."+substring);
			   out1.write(temp);
			   out1.close();
		}
   }

使用 docx4j 将 .docx 文档转换为 HTML（docx2html）


 /**
      * 
      * @param file wrod文档file
      * @param path 图片路径
      * @throws Exception
      */
public void docx2html(File file,String path) throws Exception   {
     ByteArrayOutputStream out = new ByteArrayOutputStream();
  WordprocessingMLPackage wordMLPackage = new WordprocessingMLPackage();
     wordMLPackage = WordprocessingMLPackage.load(file);
      // 创建html配置项
  	HTMLSettings htmlSettings = Docx4J.createHTMLSettings();
      //保存图片
      List<String> lists = saveDocxImg(wordMLPackage,path);
  	htmlSettings.setWmlPackage(wordMLPackage);
      //配置文本图片给路径
  	htmlSettings.setImageDirPath("iamge1");
  	htmlSettings.setImageIncludeUUID(false);
  	String userCSS = "html, body, div, span, h1, h2, h3, h4, h5, h6, p, a, img,  ol, ul, li, table, caption, tbody, tfoot, thead, tr, th, td "
  			+ "{ margin: 0; padding: 0; border: 0;}" + "body {line-height: 1;} ";
  	htmlSettings.setUserCSS(userCSS);
  	Docx4jProperties.setProperty("docx4j.Convert.Out.HTML.OutputMethodXML", true);
  	//转换html
      Docx4J.toHTML(htmlSettings, out, Docx4J.FLAG_EXPORT_PREFER_XSL);
  	out.close();
  	File flie = new File(path+"\\image1");
  	File[] files = flie.listFiles();
  	String htmlContent = new String(out.toByteArray());//htm内容文本
       // 替换图片名称
  	for (int i = 0; i < files.length; i++) {
  		String substring = files[i].getName().substring(files[i].getName().lastIndexOf(".") + 1);
  		htmlContent = htmlContent.replaceAll(files[i].getName(),
  				"image1/" + "image" + (i + 1) + "." + substring);
  	}
  	 FileWriter htmlWriter =new  FileWriter(path+"index1.html");//保存html
  	   htmlWriter.write(htmlContent);
  	   htmlWriter.close();
 }
 public List<String> saveDocxImg(WordprocessingMLPackage wordMLPackage,String path) throws Exception {
  	List<String> paths = new ArrayList<>();
  	for (Entry<PartName, Part> entry : wordMLPackage.getParts().getParts().entrySet()) {
  		if (entry.getValue() instanceof BinaryPartAbstractImage) {
  			BinaryPartAbstractImage binImg = (BinaryPartAbstractImage) entry.getValue();
  			// 图片minetype
  			String imgContentType = binImg.getContentType();
  			PartName pt = binImg.getPartName();
  			String fileName = null;
  			if (pt.getName().indexOf("word/media/") != -1) {
  				fileName = pt.getName().substring(pt.getName().indexOf("word/media/") + "word/media/".length());
  			}
  			byte[] temp = ((BinaryPart) entry.getValue()).getBytes();
  			 OutputStream out1 = new FileOutputStream(path+"image1\\"+fileName);
  			   out1.write(temp);
  			   out1.close();
  		}
  	}
  	return paths;
  }

main

public static void main(String[] args) throws Exception
{
    Word2html w=new Word2html();
     // w.doc2html(new File("C:\\Users\\chenqiao\\Desktop\\阅卷\\测试v(1).doc"),"C:\\Users\\chenqiao\\Desktop\\阅卷\\html\\"); 
   w.docx2html(new File("C:\\Users\\chenqiao\\Desktop\\阅卷\\阅卷.docx"),"C:\\Users\\chenqiao\\Desktop\\阅卷\\html\\");
}
发布了7 篇原创文章 · 获赞 0 · 访问量 56
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览