apache poi解析word(doc)文档成xml及导出成html

//遇到解析word文档的需求,文档格式不定,在网上看了好多文章,大多是简单应用api解析内容,要不就是需要windows环境,还有个在线编辑的第三方的,不太符合本人需求,目前实现了doc的,docx的后续再说吧。
//本来是想把word解析成xml形式的字符串,就参考了apache poi中WordToHtmlConverter的部分源码。因为它做了很多样式的解析,本人用不到这些,解析过多反而累赘,所以参考源码+dom4j实现此功能,不过后来就顺便实现了下导出html(只有一两个样式)
//注意修改代码中word文档的地址和导出图片的存放地址(getImgUrl)

private static Document document = null;
    static{
            document = DocumentHelper.createDocument();
    }  

public static void main(String[] args) {
        File file = new File("C:/Users/css/Desktop/1.doc"); //注意文档地址
        if(file.exists()){
            HWPFDocument doc;
            try {
                doc = new HWPFDocument(new FileInputStream(file));
                
                Range range = doc.getRange();  
                wordToHtml(doc);
//                printRange(doc);       
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
/**
     * word解析并导出html文件
     * @param doc
     */  
private static void wordToHtml(HWPFDocument doc){
        Element htmlElement = DocumentHelper.createElement("html");
        document.setRootElement(htmlElement);
        
        Element headElement = DocumentHelper.createElement("head");
        Element charSetElement = DocumentHelper.createElement("meta");
        charSetElement.addAttribute("http-equiv""Content-Type");
        charSetElement.addAttribute("content""text/html; charset=UTF-8");
        headElement.add(charSetElement);
        htmlElement.add(headElement);
        
        Element bodyElement = DocumentHelper.createElement("body");
        Element contentElement = formatRange(doc); //解析word
        bodyElement.add(contentElement);
        htmlElement.add(bodyElement);
        
        String docString = document.asXML();
        writeFile(docString, "C:/Users/css/Desktop/12345.html");
        System.out.println(docString);
    } 
/**
     * 导出文件
     * @param content
     * @param path
     */
    private static void writeFile(String content, String path) {  
        FileOutputStream fos = null;  
        BufferedWriter bw = null;  
        try {  
            File file = new File(path);  
            fos = new FileOutputStream(file);  
            bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));  
            bw.write(content);  
        } catch (FileNotFoundException fnfe) {  
            fnfe.printStackTrace();  
        } catch (IOException ioe) {  
            ioe.printStackTrace();  
        } finally {  
            try {  
                if (bw != null)  
                    bw.close();  
                if (fos != null)  
                    fos.close();  
            } catch (IOException ie) {  
            }  
        }  
    }  
    /**
     * 解析word文档(包括table表格、图片、text文本内容)
     * @param doc
     * @return
     */
    private static Element formatRange(HWPFDocument doc){
        Element contentEle = DocumentHelper.createElement("div");
        
        Range range = doc.getRange();
        PicturesTable pt = doc.getPicturesTable();
        
        int pnum = range.numParagraphs();
        Paragraph paragraph = null;
        String text = null;
        
        Element divElement = null;
        CharacterRun run = null;
        int numCharacterRuns = 0;
        StringBuilder styleStr = null;
        
        for(int i=0;i<pnum;i++){
            paragraph = range.getParagraph(i);//段落
            styleStr = new StringBuilder();
            if(paragraph.isInTable()){//是否为table
                Table table = range.getTable(paragraph);
                divElement = formatTable(table);//解析table内容
                i += table.numParagraphs();//跳过table
                i--;
            }else if(paragraph.isInList()){ //目前解析的word模板没用到也就没做
                System.out.println("handle paragrah list----------------");
            }else{
                text = paragraph.text();
                if(text != null && !"".equals(text)){
                    run = paragraph.getCharacterRun(0);
                    numCharacterRuns = paragraph.numCharacterRuns();
                    if(run != null && numCharacterRuns>0){
                        if(run.text().charAt(0)==0x01 && pt.hasPicture(run)){//图片
                            divElement = formatImg(pt.extractPicture(run, true));//解析图片,创建img节点,并导出图片
                        }else{//文本内容
                            divElement = DocumentHelper.createElement("div");
                            divElement.addText(text);
                            if(run.isBold()){//加粗样式
                                styleStr.append("font-weight:bold;");
                            }
                            if(styleStr.length()>0){
                                divElement.addAttribute("style", styleStr.toString());
                            }
                        }
                    }
                }
            }
            if(divElement != null){//添加段落节点
                contentEle.add(divElement);
            }
        }
        return contentEle;
    }

    /**
     * 解析table表格
     * @param table
     * @return
     */
    private static Element formatTable(Table table){
        Element tableElement = DocumentHelper.createElement("table");
        Element theadElement = DocumentHelper.createElement("thead");
        Element tbodyElement = DocumentHelper.createElement("tbody");
        
        int[] tableCellEdges = buildTableCellEdgesArray(table); //单元格边界
        
        int rownum = table.numRows();
        int maxColumns = Integer.MIN_VALUE;
        for ( int r = 0; r < rownum; r++ ){
            maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
        }
        Element rowElement = null;
        TableRow row = null;
        int cellnum = 0;
        Element cellElement = null;
        TableCell cell = null;
        int rowSpan = 0;
        int colSpan = 0;
        
        tableElement.addAttribute("border""1");
        tableElement.addAttribute("style""border-spacing:0");
        
        for(int i=0;i<rownum;i++){
            row = table.getRow(i);
            if(row != null){
                rowElement = DocumentHelper.createElement("tr");
                cellnum = row.numCells();
                int currentEdgeIndex = 0;
                for(int j=0;j<cellnum;j++){
                    cell = row.getCell(j);
                    if(cell.isVerticallyMerged() && !cell.isFirstVerticallyMerged())
                    {
                        currentEdgeIndex += getNumberColumnsSpanned(tableCellEdges, currentEdgeIndex, cell );
                        continue;
                    }
                    if(cell != null){
                        if(row.isTableHeader()){
                            cellElement = DocumentHelper.createElement("th");
                        }else{
                            cellElement = DocumentHelper.createElement("td");
                        }
                        colSpan = getNumberColumnsSpanned( tableCellEdges, currentEdgeIndex, cell );//取得列合并数
                        currentEdgeIndex += colSpan;
                        if ( colSpan == 0 ){
                            continue;
                        }
                        if ( colSpan != 1 ){
                            cellElement.addAttribute( "colspan", String.valueOf( colSpan ) );
                        }
                        rowSpan = getNumberRowsSpanned( table, tableCellEdges, i, j, cell );//取得行合并数
                        if ( rowSpan > 1 ){
                            cellElement.addAttribute( "rowspan", String.valueOf( rowSpan ) );
                        }
                        cellElement.addText(cell.text());
                    }
                    if(cellElement != null){
                        rowElement.add(cellElement);
                    }
                }
            }
            if(row.isTableHeader()){
                theadElement.add(rowElement);
            }else{
                tbodyElement.add(rowElement);
            }
        }
        if(theadElement.hasContent()){
            tableElement.add(theadElement);
        }
        if(tbodyElement.hasContent()){
            tableElement.add(tbodyElement);
        }
        return tableElement;
    }
/**
     * 解析table单元格边界
     * @param table
     * @return
     */
    private static int[] buildTableCellEdgesArray( Table table )
    {
        Set<Integer> edges = new TreeSet<Integer>();
        for ( int r = 0; r < table.numRows(); r++ )
        {
            TableRow tableRow = table.getRow( r );
            for ( int c = 0; c < tableRow.numCells(); c++ )
            {
                TableCell tableCell = tableRow.getCell( c );
                edges.add( Integer.valueOf( tableCell.getLeftEdge() ) );
                edges.add( Integer.valueOf( tableCell.getLeftEdge()
                        + tableCell.getWidth() ) );
            }
        }
        Integer[] sorted = edges.toArray( new Integer[edges.size()] );
        int[] result = new int[sorted.length];
        for ( int i = 0; i < sorted.length; i++ )
        {
            result[i] = sorted[i].intValue();
        }
        return result;
    }

/**
     * 解析table列合并数
     * @param tableCellEdges
     * @param currentEdgeIndex
     * @param tableCell
     * @return
     */
    private static int getNumberColumnsSpanned( int[] tableCellEdges,
            int currentEdgeIndex, TableCell tableCell )
    {
        int nextEdgeIndex = currentEdgeIndex;
        int colSpan = 0;
        int cellRightEdge = tableCell.getLeftEdge() + tableCell.getWidth();
        while ( tableCellEdges[nextEdgeIndex] < cellRightEdge )
        {
            colSpan++;
            nextEdgeIndex++;
        }
        return colSpan;
    }
/**
     * 解析table行合并数
     * @param table
     * @param tableCellEdges
     * @param currentRowIndex
     * @param currentColumnIndex
     * @param tableCell
     * @return
     */
    private static int getNumberRowsSpanned( Table table,
            final int[] tableCellEdges, int currentRowIndex,
            int currentColumnIndex, TableCell tableCell )
    {
        if ( !tableCell.isFirstVerticallyMerged() )
            return 1;
        final int numRows = table.numRows();
        int count = 1;
        for ( int r1 = currentRowIndex + 1; r1 < numRows; r1++ )
        {
            TableRow nextRow = table.getRow( r1 );
            if ( currentColumnIndex >= nextRow.numCells() )
                break;
            // we need to skip row if he don't have cells at all
            boolean hasCells = false;
            int currentEdgeIndex = 0;
            for ( int c = 0; c < nextRow.numCells(); c++ )
            {
                TableCell nextTableCell = nextRow.getCell( c );
                if ( !nextTableCell.isVerticallyMerged()
                        || nextTableCell.isFirstVerticallyMerged() )
                {
                    int colSpan = getNumberColumnsSpanned( tableCellEdges,
                            currentEdgeIndex, nextTableCell );
                    currentEdgeIndex += colSpan;
                    if ( colSpan != 0 )
                    {
                        hasCells = true;
                        break;
                    }
                }
                else
                {
                    currentEdgeIndex += getNumberColumnsSpanned(
                            tableCellEdges, currentEdgeIndex, nextTableCell );
                }
            }
            if ( !hasCells )
                continue;
            TableCell nextCell = nextRow.getCell( currentColumnIndex );
            if ( !nextCell.isVerticallyMerged()
                    || nextCell.isFirstVerticallyMerged() )
                break;
            count++;
        }
        return count;
    } 
/**
     * 解析图片,创建img节点,并导出图片
     * @param picture
     * @return
     */
    private static Element formatImg(Picture picture){
        Element imgElement = null;
        if(picture != null){
            String imgUrl = getImgUrl(picture.suggestFullFileName());
            exportImg(picture, imgUrl);
            imgElement = DocumentHelper.createElement("img");
            imgElement.addAttribute("src", imgUrl);
        }
        return imgElement;
    }
/**
     * 获取图片保存位置
     * @param suggestedName
     * @return
     */
    private static String getImgUrl(String suggestedName){
        return "C:/Users/css/Desktop/"+suggestedName; //注意图片导出地址
    }
/**
     * 导出图片
     * @param picture
     * @param expUrl
     */
    private static void exportImg(Picture picture,String expUrl){
        if(picture != null && expUrl != null && !"".equals(expUrl))
        try {
            picture.writeImageContent(new FileOutputStream(expUrl));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }  
  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值