原文: http://blog.csdn.net/fyqcdbdx/article/details/7630122
1. 下载POI工具并引用
2. 读取整个doc文档,获得该文档的所有字符串。
3. 从该字符串中得到标题,把该标题构成一个HTML格式的字符串,如<html><head><title>测试文档</title></head><body>。
4. 从该文档中判断是否有表格,如有,把每个表格的开始偏移量,结束偏移量记录下来,同时根据每个表格的行,列读取表格的内容,并构造出表格的HTML字符串。
5. 从该字符串的第一个字符开始逐个字符循环,得到字符的字体,字号大小,直到下一个字符的字体,字号不一样时,把这些字符内容构造成一个HTML格式的字符串。
6. 如果碰到字符为回车符,制表符,把回车符,制表符构造成HTML格式的字符串。
7. 如果碰到字符为图片,读取图片,把图片放在指定路径,再把这一路径的信息构造成HTML字符串,如<img src='c://test//1.jpg'/>。
8. 当读取字符串的位置等于表格的开始偏移量时,插入前面已构造出的表格HTML字符串,同时跳到表格的结束偏移量,继续往下循环读取字符。
9. 由于以上读取是按字符串逐个读取,并且根据字符的变化同时构造出HTML字符串,所以当字符串读取完毕后,即构造出一个完整的HTML字符串。
10. 举例
Word文件
HTML文件
11. 源代码
WordExcelToHtml.java
- package com;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.OutputStream;
- import java.io.OutputStreamWriter;
- import org.apache.poi.hwpf.HWPFDocument;
- import org.apache.poi.hwpf.model.PicturesTable;
- import org.apache.poi.hwpf.usermodel.CharacterRun;
- import org.apache.poi.hwpf.usermodel.Picture;
- import org.apache.poi.hwpf.usermodel.Range;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.hwpf.usermodel.Paragraph;
- import org.apache.poi.hwpf.usermodel.Table;
- import org.apache.poi.hwpf.usermodel.TableCell;
- import org.apache.poi.hwpf.usermodel.TableIterator;
- import org.apache.poi.hwpf.usermodel.TableRow;
- public class WordExcelToHtml {
- /**
- * ASCII code of the carriage return character
- */
- private static final short ENTER_ASCII = 13;
- /**
- * ASCII code of the space character
- */
- private static final short SPACE_ASCII = 32;
- /**
- * ASCII code of the horizontal tab character
- */
- private static final short TABULATION_ASCII = 9;
- public static String htmlText = ""; // HTML output accumulated by getWordAndStyle and readPicture
- public static String htmlTextTbl = ""; // HTML of the table currently being built by readTable
- public static int counter=0; // index of the most recently recorded table; readTable restarts it at -1
- public static int beginPosi=0; // start offset of the table currently being read
- public static int endPosi=0; // end offset of the table currently being read
- public static int beginArray[]; // start offset of each table, indexed in iteration order
- public static int endArray[]; // end offset of each table, same index as beginArray
- public static String htmlTextArray[]; // pre-rendered HTML of each table, same index as beginArray
- public static boolean tblExist=false; // set to true by readTable once a table has been found
- public static final String inputFile="c://bb.doc"; // document path that main passes to getWordAndStyle
- /**
-  * Program entry point: converts the document at {@link #inputFile} to HTML.
-  * Any exception raised by the conversion is printed as a stack trace.
-  *
-  * @param argv command-line arguments (not used)
-  */
- public static void main(String argv[]) {
-     try {
-         getWordAndStyle(inputFile);
-     } catch (Exception conversionFailure) {
-         // Top-level boundary: report the failure and let the program end.
-         conversionFailure.printStackTrace();
-     }
- }
- /**
- * 读取每个文字样式
- *
- * @param fileName
- * @throws Exception
- */
- public static void getWordAndStyle(String fileName) throws Exception {
- FileInputStream in = new FileInputStream(new File(fileName));
- HWPFDocument doc = new HWPFDocument(in);
- Range rangetbl = doc.getRange();//得到文档的读取范围
- TableIterator it = new TableIterator(rangetbl);
- int num=100;
- beginArray=new int[num];
- endArray=new int[num];
- htmlTextArray=new String[num];
- // 取得文档中字符的总数
- int length = doc.characterLength();
- // 创建图片容器
- PicturesTable pTable = doc.getPicturesTable();
- htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";
- // 创建临时字符串,好加以判断一串字符是否存在相同格式
- if(it.hasNext())
- {
- readTable(it,rangetbl);
- }
- int cur=0;
- String tempString = "";
- for (int i = 0; i < length - 1; i++) {
- // 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
- Range range = new Range(i, i + 1, doc);
- CharacterRun cr = range.getCharacterRun(0);
- //beginArray=new int[num];
- //endArray=new int[num];
- //htmlTextArray=new String[num];
- if(tblExist)
- {
- if(i==beginArray[cur])
- {
- htmlText+=tempString+htmlTextArray[cur];
- tempString="";
- i=endArray[cur]-1;
- cur++;
- continue;
- }
- }
- if (pTable.hasPicture(cr)) {
- htmlText += tempString ;
- // 读写图片
- readPicture(pTable, cr);
- tempString = "";
- }
- else {
- Range range2 = new Range(i + 1, i + 2, doc);
- // 第二个字符
- CharacterRun cr2 = range2.getCharacterRun(0);
- char c = cr.text().charAt(0);
- System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);
- // 判断是否为回车符
- if (c == ENTER_ASCII)
- {
- tempString += "<br/>";
- }
- // 判断是否为空格符
- else if (c == SPACE_ASCII)
- tempString += " ";
- // 判断是否为水平制表符
- else if (c == TABULATION_ASCII)
- tempString += " ";
- // 比较前后2个字符是否具有相同的格式
- boolean flag = compareCharStyle(cr, cr2);
- if (flag)
- tempString += cr.text();
- else {
- String fontStyle = "<span style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
- if (cr.isBold())
- fontStyle += "font-weight:bold;";
- if (cr.isItalic())
- fontStyle += "font-style:italic;";
- htmlText += fontStyle + "" mce_style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
- if (cr.isBold())
- fontStyle += "font-weight:bold;";
- if (cr.isItalic())
- fontStyle += "font-style:italic;";
- htmlText += fontStyle + "">" + tempString + cr.text() + "</span>";
- tempString = "";
- }
- }
- }
- htmlText += tempString+"</body></html>";
- writeFile(htmlText);
- }
- /**
-  * Reads every table reachable through the iterator and pre-renders each one as HTML.
-  * <p>
-  * For the table at iteration index n, its start offset is stored in
-  * {@code beginArray[n]}, its end offset in {@code endArray[n]} and its HTML in
-  * {@code htmlTextArray[n]}. {@code counter} ends up as the index of the last table
-  * (or -1 if there was none) and {@code tblExist} is set once a table is found.
-  * Each Word cell becomes exactly one {@code <td>}; when a cell holds several
-  * paragraphs they are separated by {@code <br/>}.
-  *
-  * @param it iterator over the tables of the document
-  * @param rangetbl the document range (not used by this method)
-  * @throws Exception declared for callers; propagates any failure raised while reading the tables
-  */
- public static void readTable(TableIterator it, Range rangetbl) throws Exception {
-     htmlTextTbl = "";
-     counter = -1;
-     // Iterate over the tables of the document.
-     while (it.hasNext()) {
-         tblExist = true;
-         htmlTextTbl = "";
-         Table tb = (Table) it.next();
-         beginPosi = tb.getStartOffset();
-         endPosi = tb.getEndOffset();
-         System.out.println("............"+beginPosi+"...."+endPosi);
-         counter = counter + 1;
-         // The arrays are allocated with a fixed size by the caller; grow them on demand.
-         if (counter >= beginArray.length || counter >= endArray.length || counter >= htmlTextArray.length) {
-             growTableArrays(counter + 1);
-         }
-         beginArray[counter] = beginPosi;
-         endArray[counter] = endPosi;
-         htmlTextTbl += "<table border>";
-         // Rows, starting at 0.
-         for (int i = 0; i < tb.numRows(); i++) {
-             TableRow tr = tb.getRow(i);
-             htmlTextTbl += "<tr>";
-             // Cells of the row, starting at 0.
-             for (int j = 0; j < tr.numCells(); j++) {
-                 TableCell td = tr.getCell(j);
-                 int cellWidth = td.getWidth();
-                 // Collect all paragraphs of the cell into a single <td>.
-                 String cellHtml = "";
-                 for (int k = 0; k < td.numParagraphs(); k++) {
-                     Paragraph para = td.getParagraph(k);
-                     String s = para.text().toString().trim();
-                     // Compare by content; == on strings only compares references.
-                     if (s.isEmpty()) {
-                         // NOTE(review): possibly meant to be &nbsp; in the original - confirm.
-                         s = " ";
-                     }
-                     System.out.println(s);
-                     if (k > 0) {
-                         cellHtml += "<br/>";
-                     }
-                     cellHtml += s;
-                     System.out.println(i+":"+j+":"+cellWidth+":"+s);
-                 }
-                 if (cellHtml.isEmpty()) {
-                     cellHtml = " ";
-                 }
-                 htmlTextTbl += "<td width="+cellWidth+ ">"+cellHtml+"</td>";
-             }
-             htmlTextTbl += "</tr>";
-         }
-         htmlTextTbl += "</table>";
-         htmlTextArray[counter] = htmlTextTbl;
-     }
- }
- /**
-  * Enlarges beginArray, endArray and htmlTextArray so that each can hold at least
-  * minCapacity entries, keeping the entries already stored.
-  */
- private static void growTableArrays(int minCapacity) {
-     int newSize = Math.max(minCapacity, beginArray.length * 2);
-     int[] grownBegin = new int[newSize];
-     System.arraycopy(beginArray, 0, grownBegin, 0, Math.min(beginArray.length, newSize));
-     beginArray = grownBegin;
-     int[] grownEnd = new int[newSize];
-     System.arraycopy(endArray, 0, grownEnd, 0, Math.min(endArray.length, newSize));
-     endArray = grownEnd;
-     String[] grownHtml = new String[newSize];
-     System.arraycopy(htmlTextArray, 0, grownHtml, 0, Math.min(htmlTextArray.length, newSize));
-     htmlTextArray = grownHtml;
- }
- /**
-  * Exports the picture attached to a character run and appends an {@code <img>} tag
-  * pointing at the exported file to {@code htmlText}.
-  * <p>
-  * The image is written under {@code c://test} using the file name suggested by POI.
-  *
-  * @param pTable picture table of the document
-  * @param cr character run that carries the picture
-  * @throws Exception if the picture cannot be extracted or the image file cannot be written
-  */
- public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
-     // Extract the picture.
-     Picture pic = pTable.extractPicture(cr, false);
-     if (pic == null) {
-         // Nothing could be extracted for this run; emit no tag rather than fail.
-         return;
-     }
-     // File name suggested by POI.
-     String afileName = pic.suggestFullFileName();
-     OutputStream out = new FileOutputStream(new File("c://test" + File.separator + afileName));
-     try {
-         pic.writeImageContent(out);
-     } finally {
-         // Always release the file handle, even when writing fails.
-         out.close();
-     }
-     htmlText += "<img src=\"c://test//" + afileName + "\"/>";
- }
- /**
-  * Tells whether two character runs share the same formatting.
-  * Bold, italic, font name and font size are compared, in that order.
-  *
-  * @param cr1 first character run
-  * @param cr2 second character run
-  * @return true when all four attributes are equal, false otherwise
-  */
- public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2)
- {
-     if (cr1.isBold() != cr2.isBold()) {
-         return false;
-     }
-     if (cr1.isItalic() != cr2.isItalic()) {
-         return false;
-     }
-     if (!cr1.getFontName().equals(cr2.getFontName())) {
-         return false;
-     }
-     return cr1.getFontSize() == cr2.getFontSize();
- }
- /**
-  * Writes the given text to {@code c://abc.html}, replacing any existing content.
-  * <p>
-  * Failures while opening, writing or closing the file are printed as stack traces;
-  * none of them is propagated to the caller.
-  * <p>
-  * NOTE(review): the writer uses the platform default charset and the generated HTML
-  * declares no charset - confirm this is what browsers opening the file should see.
-  *
-  * @param s text to write
-  */
- public static void writeFile(String s) {
-     FileOutputStream fos = null;
-     BufferedWriter bw = null;
-     try {
-         File file = new File("c://abc.html");
-         fos = new FileOutputStream(file);
-         bw = new BufferedWriter(new OutputStreamWriter(fos));
-         bw.write(s);
-     } catch (FileNotFoundException fnfe) {
-         fnfe.printStackTrace();
-     } catch (IOException ioe) {
-         ioe.printStackTrace();
-     } finally {
-         // Closing the writer flushes buffered text, so a failure here can mean an
-         // incomplete file and must not go unnoticed.
-         if (bw != null) {
-             try {
-                 bw.close();
-             } catch (IOException ie) {
-                 ie.printStackTrace();
-             }
-         }
-         // Closed separately so the file handle is released even if bw.close() failed.
-         if (fos != null) {
-             try {
-                 fos.close();
-             } catch (IOException ie) {
-                 ie.printStackTrace();
-             }
-         }
-     }
- }