Java 通过 Apache 开源框架 POI 读取 Word 2003 和 Word 2007 文档并转换成 HTML 的 demo 实例

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

public class test {
	
	public static synchronized String getOnlyId(){
		long curL = System.currentTimeMillis();
		if(curL>curL){
			cacheInt = 0;
		}else{
			cacheInt += 1;
		}
		return String.valueOf(curL)+String.valueOf(cacheInt);
	}	
	
	private static int cacheInt = 0;
	
	private static long curL = 0;	
	
	private static String projectUrl = "http://192.168.1.1:8080/test/";
	
	private static String projectPath = "D:/test/piccache/";
	
	/**
	 * 处理word2003
	 * @param inFile
	 * @return
	 */
	public static String doWord(File inFile) {
		ByteArrayOutputStream out = new ByteArrayOutputStream();  
		String randomName = "PIC"+getOnlyId();
		//转换后html中图片src的链接
		final String baseUrl = projectUrl+"wordpic/"+randomName+"/";
		//转换后图片存放的位置
		String dir = projectPath+"/wordpic/"+randomName+"/";
		File dirF = new File(dir);
		if(!dirF.exists()||!dirF.isDirectory()){
			dirF.mkdir();
		}
		try{
			HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(inFile));
	        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());  
	        wordToHtmlConverter.setPicturesManager( new PicturesManager()         
	         {  
	             public String savePicture( byte[] content,  
	                     PictureType pictureType, String suggestedName,  
	                     float widthInches, float heightInches )  
	             {  
	                 return baseUrl+suggestedName;  
	             }  
	         } );  
	        wordToHtmlConverter.processDocument(wordDocument);  
	        List<Picture> pics=wordDocument.getPicturesTable().getAllPictures();  
	        if(pics!=null){  
	            for(int i=0;i<pics.size();i++){  
	                Picture pic = (Picture)pics.get(i);  
	                try {  
	                    pic.writeImageContent(new FileOutputStream(dir + pic.suggestFullFileName()));  
	                } catch (FileNotFoundException e) {  
	                    e.printStackTrace();  
	                }    
	            }  
	        }  
	        Document htmlDocument = wordToHtmlConverter.getDocument();  
	        
	        DOMSource domSource = new DOMSource(htmlDocument);  
	        StreamResult streamResult = new StreamResult(out);  
	  
	        TransformerFactory tf = TransformerFactory.newInstance();  
	        Transformer serializer = tf.newTransformer();  
	        serializer.setOutputProperty(OutputKeys.ENCODING, "GB2312");  
	        serializer.setOutputProperty(OutputKeys.INDENT, "yes");  
	        serializer.setOutputProperty(OutputKeys.METHOD, "html");  
	        serializer.transform(domSource, streamResult);  
	        out.close();  
		}catch(Exception e){
			e.printStackTrace();
		}
        return new String(out.toByteArray());
	}  
	
	/**
	 * 处理word2007
	 * @param inFile
	 * @return
	 */
	public static String doWord2007(File inFile) {
		ByteArrayOutputStream out = new ByteArrayOutputStream();  
		String randomName = "PIC"+getOnlyId();
		//转换后html中图片src的链接
		final String baseUrl = projectUrl+"wordpic/"+randomName+"/";
		//转换后图片存放的位置
		String dir = projectPath+"/wordpic/"+randomName+"/";
		File dirF = new File(dir);
		if(!dirF.exists()||!dirF.isDirectory()){
			dirF.mkdir();
		}
		try{
			XWPFDocument wordDocument = new XWPFDocument(new FileInputStream(inFile));
			XHTMLOptions options = XHTMLOptions.create().URIResolver(new BasicURIResolver(baseUrl));
			File imageFolderFile = new File(dir);
			options.setExtractor(new FileImageExtractor(imageFolderFile));
			XHTMLConverter.getInstance().convert(wordDocument, out, options);
	        out.close();  
		}catch(Exception e){
			e.printStackTrace();
		}
        return new String(out.toByteArray());
	}
	
	
	public static void main(String[] args) {
		File word2003 = new File("d:/test/员工思想工作总结(李海博).doc");
		File word2007 = new File("d:/test/发改委oa系统功能说明书.docx");
		
		/** 打印出word2003转换后的html内容*/
		System.out.println(doWord(word2003));
		
		/** 打印出word2007转换后的html内容*/
		System.out.println(doWord2007(word2007));
	}

}


  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
要将Word文档转换为PDF,可以使用Java中的Apache POI和iText库。Apache POI是一个Java库,用于读取和写入Microsoft Office格式的文档,包括Word文档。而iText是一个开源Java库,用于创建和处理PDF文件。 以下是将Word文档转换为PDF的简单步骤: 1. 使用Apache POI读取Word文档,并提取其文本内容。 2. 使用iText将提取出的内容写入PDF文档。 以下是一个简单的Java示例代码: ``` import java.io.*; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import com.itextpdf.text.Document; import com.itextpdf.text.Element; import com.itextpdf.text.Paragraph; import com.itextpdf.text.pdf.PdfWriter; import com.itextpdf.text.html.simpleparser.HTMLWorker; import com.itextpdf.text.html.simpleparser.StyleSheet; public class WordToPdfConverter { public static void main(String[] args) { try { // 读取Word文档 FileInputStream fis = new FileInputStream("input.docx"); XWPFDocument document = new XWPFDocument(fis); XWPFWordExtractor extractor = new XWPFWordExtractor(document); String content = extractor.getText(); // 将HTML格式的文档转换为PDF Document pdfDoc = new Document(); PdfWriter.getInstance(pdfDoc, new FileOutputStream("output.pdf")); pdfDoc.open(); StringReader strReader = new StringReader(content); HTMLWorker htmlWorker = new HTMLWorker(pdfDoc); StyleSheet styles = new StyleSheet(); htmlWorker.setStyleSheet(styles); htmlWorker.parse(strReader); pdfDoc.close(); System.out.println("转换成功!"); } catch (Exception e) { e.printStackTrace(); } } } ``` 在这个例子中,我们首先使用Apache POI读取Word文档,并通过XWPFWordExtractor提取其纯文本内容(注意:这一步得到的是纯文本而不是HTML,原文档的格式和图片不会保留)。接下来,我们使用iText创建一个PDF文档,并使用HTMLWorker解析这段文本并将其写入PDF文档中。最后,我们将PDF文档保存在文件系统中。 请注意,这只是一个简单的示例代码,可能需要根据实际情况进行修改和优化。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值