Word文档解析

package com.etcc.document;

import com.etcc.document.vo.*;
import org.apache.commons.lang3.*;
import org.apache.poi.xwpf.usermodel.*;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import org.slf4j.*;

import java.io.*;
import java.math.*;
import java.util.*;


public class DocxParseUtil {

    // Shared SLF4J logger for this utility; final so the reference can never be reassigned.
    private static final Logger logger = LoggerFactory.getLogger(DocxParseUtil.class);

    /**
     * Parses a .docx stream into a flat, ordered list of content items.
     *
     * <p>Body elements are visited in document order. Paragraphs are handed to
     * {@code parseParagraph} and kept only when they yield text or pictures;
     * tables are rendered to an HTML string by {@code tableToHtml}. Every kept
     * item receives a 1-based {@code sort} value equal to its position in the
     * returned list.
     *
     * <p>This method does not explicitly close {@code docInputStream}; the
     * document object built from it is closed before returning.
     *
     * @param docInputStream stream positioned at the start of a .docx file
     * @return the parsed items, or {@code null} if the document cannot be read
     */
    public static List<PlanContentItem> parseDoc(FileInputStream docInputStream){
        XWPFDocument document;
        try {
            document = new XWPFDocument(docInputStream);
        } catch (IOException e) {
            // The throwable is passed as the last argument so SLF4J prints the stack trace.
            logger.error("读入文档异常", e);
            return null;
        }

        try {
            return collectItems(document);
        } finally {
            // Everything needed has been copied into the result, so the document can be released.
            try {
                document.close();
            } catch (IOException e) {
                logger.warn("关闭文档异常", e);
            }
        }
    }

    /** Walks the document body in order and converts paragraphs and tables into content items. */
    private static List<PlanContentItem> collectItems(XWPFDocument document) {
        List<PlanContentItem> planContentItems = new ArrayList<>();
        XWPFStyles styles = document.getStyles();

        // 1-based position of the next item that will be added to the result
        int index = 1;

        for (IBodyElement element : document.getBodyElements()) {
            switch (element.getElementType()) {
                case PARAGRAPH: // text and/or embedded pictures
                    if (element instanceof XWPFParagraph) {
                        PlanContentItem item = new PlanContentItem();
                        parseParagraph(item, (XWPFParagraph) element, styles);

                        // Paragraphs with neither text nor pictures are dropped.
                        if (StringUtils.isNotEmpty(item.getContent()) || item.getPics() != null) {
                            item.setSort(index);
                            planContentItems.add(item);
                            index++;
                        }
                    }
                    break;
                case TABLE: // rendered as an HTML string
                    if (element instanceof XWPFTable) {
                        String tableHtml = tableToHtml((XWPFTable) element);
                        // tableToHtml returns null for a table without rows; skip it
                        // instead of adding an item whose content is null.
                        if (tableHtml != null) {
                            PlanContentItem item = new PlanContentItem();
                            item.setSort(index);
                            item.setDirectory(0);
                            item.setLevel(0);
                            item.setContent(tableHtml);
                            planContentItems.add(item);
                            index++;
                        }
                    }
                    break;
                case CONTENTCONTROL: // content controls are not parsed
                default:
                    break;
            }
        }

        return planContentItems;
    }

    /**
     * Fills {@code contentItem} from a single paragraph.
     *
     * <p>If the paragraph carries an outline level (looked up on the paragraph
     * itself, then on its style, then on that style's base style) and has text,
     * it is stored as a directory entry ({@code directory = 1}) with the level
     * computed by {@code parseOutline}. Otherwise the text of all runs is
     * concatenated, embedded pictures are collected, and the item is stored as
     * plain content ({@code directory = 0}, {@code level = 0}).
     *
     * @param contentItem item to populate
     * @param paragraph   paragraph being parsed
     * @param styles      the document's style table; may be {@code null}
     */
    private static void parseParagraph(PlanContentItem contentItem, XWPFParagraph paragraph, XWPFStyles styles){
        ParagraphItem paragraphItem = new ParagraphItem();
        CTDecimalNumber outlineLvl = resolveOutlineLevel(paragraph, styles);
        if (outlineLvl != null) {
            parseOutline(paragraphItem, paragraph, outlineLvl);
        }

        if (Boolean.TRUE.equals(paragraphItem.getIsOutline())) {
            contentItem.setContent(paragraphItem.getContent());
            contentItem.setLevel(paragraphItem.getLevel());
            contentItem.setDirectory(1);
            return;
        }

        // Not a heading: gather run text and embedded pictures.
        StringBuilder contentBuffer = new StringBuilder();
        List<PicItem> pics = new ArrayList<>();
        for (XWPFRun run : paragraph.getRuns()) {
            String text = run.text();

            List<XWPFPicture> pictures = run.getEmbeddedPictures();
            if (pictures != null) {
                for (XWPFPicture picture : pictures) {
                    XWPFPictureData pictureData = picture.getPictureData();
                    if (pictureData == null) {
                        // No picture data could be resolved for this picture; nothing to copy.
                        continue;
                    }
                    logger.debug("图片类型{}", pictureData.getPictureType());
                    pics.add(new PicItem(pictureData.getFileName(), pictureData.getData()));
                }
            }

            if (StringUtils.isNotEmpty(text)) {
                contentBuffer.append(text);
            }
        }

        contentItem.setContent(contentBuffer.toString());
        contentItem.setDirectory(0);
        contentItem.setLevel(0);

        // pics is left unset when there are none, so callers can test it against null.
        if (!pics.isEmpty()) {
            contentItem.setPics(pics);
        }
    }

    /**
     * Returns the first usable outline level found on the paragraph, its style,
     * or that style's direct base style, or {@code null} when none is set.
     */
    private static CTDecimalNumber resolveOutlineLevel(XWPFParagraph paragraph, XWPFStyles styles) {
        // 1. level set directly on the paragraph
        CTDecimalNumber direct = null;
        if (paragraph.getCTP() != null && paragraph.getCTP().getPPr() != null) {
            direct = paragraph.getCTP().getPPr().getOutlineLvl();
        }
        if (hasLevelValue(direct)) {
            return direct;
        }

        // 2. level set on the paragraph's style
        XWPFStyle style = lookupStyle(styles, paragraph.getStyle());
        CTDecimalNumber fromStyle = styleOutlineLevel(style);
        if (hasLevelValue(fromStyle)) {
            return fromStyle;
        }

        // 3. level set on the style that the paragraph's style is based on (one step only)
        if (style != null && style.getCTStyle() != null && style.getCTStyle().getBasedOn() != null) {
            XWPFStyle baseStyle = lookupStyle(styles, style.getCTStyle().getBasedOn().getVal());
            CTDecimalNumber fromBase = styleOutlineLevel(baseStyle);
            if (hasLevelValue(fromBase)) {
                return fromBase;
            }
        }
        return null;
    }

    /** Looks up a style by id, tolerating a missing style table or a missing id. */
    private static XWPFStyle lookupStyle(XWPFStyles styles, String styleId) {
        if (styles == null || styleId == null) {
            return null;
        }
        return styles.getStyle(styleId);
    }

    /** Reads the outline level declared in a style's paragraph properties, or {@code null}. */
    private static CTDecimalNumber styleOutlineLevel(XWPFStyle style) {
        if (style == null || style.getCTStyle() == null || style.getCTStyle().getPPr() == null) {
            return null;
        }
        return style.getCTStyle().getPPr().getOutlineLvl();
    }

    /** True when the outline-level element exists and actually carries a value. */
    private static boolean hasLevelValue(CTDecimalNumber number) {
        return number != null && number.getVal() != null;
    }

    /**
     * Records on {@code paragraphItem} whether the paragraph is an outline
     * (heading) entry.
     *
     * <p>The paragraph text is always stored as the item's content. When an
     * outline level value is present and the text is non-empty, the item is
     * flagged as an outline with level {@code number + 1}; otherwise it is
     * flagged as a non-outline with level 0.
     *
     * @param paragraphItem holder that receives content, outline flag and level
     * @param paragraph     paragraph whose text is read
     * @param number        outline level element; may be {@code null}
     */
    private static void parseOutline(ParagraphItem paragraphItem, XWPFParagraph paragraph, CTDecimalNumber number) {
        String text = paragraph.getParagraphText();
        paragraphItem.setContent(text);

        boolean hasLevel = number != null && number.getVal() != null;
        if (hasLevel && StringUtils.isNotEmpty(text)) {
            paragraphItem.setIsOutline(true);
            // The stored level is one greater than the raw outline value.
            paragraphItem.setLevel(number.getVal().intValue() + 1);
        } else {
            paragraphItem.setIsOutline(false);
            paragraphItem.setLevel(0);
        }
    }
   /**
     * Renders a Word table as an HTML {@code <table>} string.
     *
     * <p>Cells of the first row are emitted as {@code <th>}, all others as
     * {@code <td>}. A cell that continues a vertical merge is skipped; the cell
     * that starts the merge gets a {@code rowspan}. A grid span becomes
     * {@code colspan}. Cell text is HTML-escaped because it comes straight from
     * the document.
     *
     * @param table table to render
     * @return the HTML markup, or {@code null} when the table has no rows
     */
    private static String tableToHtml(XWPFTable table){
        List<XWPFTableRow> rows = table.getRows();
        if (rows == null || rows.isEmpty()) {
            return null;
        }

        StringBuilder html = new StringBuilder("<table border=\"1\" cellspacing=\"0\" >");
        for (int i = 0; i < rows.size(); i++) {
            // The first row is treated as the header row.
            String tag = (i == 0) ? "th" : "td";
            List<XWPFTableCell> tableCells = rows.get(i).getTableCells();

            html.append("<tr>");
            for (int j = 0; j < tableCells.size(); j++) {
                XWPFTableCell cell = tableCells.get(j);

                // A cell is not required to carry a properties element.
                CTTcPr tcPr = cell.getCTTc().getTcPr();
                int width = cell.getWidth();

                CTVMerge vMerge = (tcPr == null) ? null : tcPr.getVMerge();
                int rowspan = 0;
                if (vMerge != null) {
                    if (isMergeContinuation(vMerge)) {
                        // Covered by the rowspan of the cell that started the merge.
                        continue;
                    }
                    rowspan = getRowspan(table, i + 1, j);
                }
                CTDecimalNumber gridSpan = (tcPr == null) ? null : tcPr.getGridSpan();

                html.append('<').append(tag);
                if (gridSpan != null && gridSpan.getVal() != null) {
                    html.append(" colspan='").append(gridSpan.getVal()).append("'");
                }
                if (rowspan > 1) {
                    html.append(" rowspan='").append(rowspan).append("'");
                }
                html.append(" width=").append(width);
                html.append(">");
                html.append(escapeHtml(cell.getText()));
                html.append("</").append(tag).append('>');
            }
            html.append("</tr>");
        }
        html.append("</table>");
        return html.toString();
    }

    /**
     * Counts how many rows a vertically merged cell spans.
     *
     * <p>NOTE(review): {@code col} is used as a cell position within each row,
     * not as a grid column, so rows whose cells have differing grid spans may
     * be matched against the wrong cell — confirm against real documents.
     *
     * @param table table containing the merged cell
     * @param row   index of the row directly below the cell that starts the merge
     * @param col   cell position within the row
     * @return 1 for the starting cell plus one for every consecutive
     *         continuation cell found from {@code row} downwards
     */
    public static int getRowspan(XWPFTable table, int row, int col) {
        int sum = 1;
        for (int r = row; isMergedIntoCellAbove(table, r, col); r++) {
            sum++;
        }
        return sum;
    }

    /** True when the cell at (row, col) exists and continues a vertical merge from the row above. */
    private static boolean isMergedIntoCellAbove(XWPFTable table, int row, int col) {
        // getRow / getCell return null for positions outside the table or row.
        XWPFTableRow tableRow = table.getRow(row);
        if (tableRow == null) {
            return false;
        }
        XWPFTableCell cell = tableRow.getCell(col);
        if (cell == null) {
            return false;
        }
        CTTcPr tcPr = cell.getCTTc().getTcPr();
        if (tcPr == null) {
            return false;
        }
        CTVMerge vMerge = tcPr.getVMerge();
        return vMerge != null && isMergeContinuation(vMerge);
    }

    /**
     * True when a vMerge element marks a continuation cell. A vMerge element
     * without a {@code val} attribute means "continue", in which case
     * {@code getVal()} yields {@code null}.
     */
    private static boolean isMergeContinuation(CTVMerge vMerge) {
        STMerge.Enum val = vMerge.getVal();
        return val == null || val.intValue() == STMerge.INT_CONTINUE;
    }

    /** Escapes the characters that are significant in HTML text and attribute values. */
    private static String escapeHtml(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int k = 0; k < text.length(); k++) {
            char c = text.charAt(k);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }




    /**
     * Arranges the flat item list produced by {@code parseDoc} into a tree and
     * returns its root.
     *
     * <p>Non-directory items are first assigned a level one deeper than the
     * most recent directory (heading) item. A synthetic root node ("正文",
     * level 0, sort 0) is then created, and the root and every directory item
     * pick their direct children from the items that follow them via
     * {@code setSubList}.
     *
     * <p>The levels of the items in {@code planContentItems} are modified in place.
     *
     * @param planContentItems items in document order; {@code null} is treated as empty
     * @return the synthetic root node of the tree
     */
    public static PlanContentItem getDocxTree(List<PlanContentItem> planContentItems) {
        List<PlanContentItem> items = (planContentItems == null)
                ? Collections.<PlanContentItem>emptyList()
                : planContentItems;

        // Level of the most recent heading. Starting at the root's level (0) makes
        // content that precedes the first heading a direct child of the root
        // instead of leaving it unattached.
        int curLevel = 0;
        for (PlanContentItem item : items) {
            if (item.getDirectory() == 1) {
                curLevel = item.getLevel();
            } else {
                item.setLevel(curLevel + 1);
            }
        }

        PlanContentItem root = new PlanContentItem();
        root.setContent("正文");
        root.setLevel(0);
        root.setSort(0);
        root.setDirectory(1);
        root.setSubList(items);

        // Each heading selects its children from the items that follow it. The
        // list position is used directly rather than the item's sort value, so a
        // missing or non-contiguous sort cannot shift the window.
        int size = items.size();
        for (int i = 0; i < size; i++) {
            PlanContentItem item = items.get(i);
            if (item.getDirectory() == 1) {
                item.setSubList(items.subList(i + 1, size));
            }
        }
        return root;
    }




    /**
     * Manual entry point: parses one .docx file and logs how many items were found.
     *
     * @param args optional; {@code args[0]} is the path of the file to parse.
     *             When absent, a hard-coded sample path is used.
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0)
                ? args[0]
                : "C:\\Users\\Lenovo\\Downloads\\中国系统项目成本管理办法.docx";

        // try-with-resources closes the file handle even when parsing fails
        try (FileInputStream in = new FileInputStream(new File(filePath))) {
            List<PlanContentItem> planContentItems = parseDoc(in);
            if (planContentItems == null) {
                logger.error("Failed to parse {}", filePath);
            } else {
                logger.info("Parsed {} items from {}", planContentItems.size(), filePath);
            }
        } catch (IOException e) {
            // Covers a missing file (FileNotFoundException) as well as a failure on close.
            logger.error("Could not read {}", filePath, e);
        }
    }
}
/**
 * One parsed piece of a document: a heading (directory entry), a block of
 * body text with optional pictures, or a table rendered as HTML. Also acts as
 * a tree node; children are stored by the base class.
 */
@Data
public class PlanContentItem extends BaseTreeEntity<PlanContentItem> implements DomSearch<PlanContentItem> {
    /** Identifier. */
    private String id;
    /** Text or HTML content. */
    private String content;
    /** Directory flag: 0 makes {@link #setSubList} a no-op. */
    private int directory;
    /** Sequence number. */
    private Integer sort;
    /** Level within the tree. */
    private int level;
    /** Picture information. */
    private List<PicItem> pics;

    /** Returns the children held by the base class. */
    @Override
    public List<PlanContentItem> getSubList() {
        return super.getChildren();
    }

    /**
     * Adopts direct children from {@code tList}.
     *
     * <p>Does nothing when this item's directory flag is 0. Otherwise the list
     * is scanned from the start: every item exactly one level deeper than this
     * one is added as a child and has its parent id set to this item's sort
     * value; deeper items are passed over; scanning stops at the first item
     * whose level is not greater than this item's level.
     *
     * @param tList candidate items, in order
     */
    @Override
    public void setSubList(List<PlanContentItem> tList) {
        if (getDirectory() == 0) {
            return;
        }

        final int ownLevel = getLevel();
        final int childLevel = ownLevel + 1;
        for (PlanContentItem candidate : tList) {
            int candidateLevel = candidate.getLevel();
            if (candidateLevel <= ownLevel) {
                // Reached a sibling or an ancestor's sibling: this subtree is finished.
                break;
            }
            if (candidateLevel == childLevel) {
                super.addChildren(candidate);
                candidate.setParentId(this.getSort());
            }
        }
    }
}

 // Tree-structure interface: settles which nodes are the children of a node.

/**
 * Contract for tree nodes that expose and populate a sub-list of nodes.
 *
 * @param <T> node type
 */
public interface DomSearch<T extends BaseTreeEntity> extends Serializable {

    /** Returns this node's sub-list. */
    List<T> getSubList();

    /**
     * Supplies the list from which this node's sub-list is populated; how
     * entries are selected is left to the implementation.
     */
    void setSubList(List<T> subList);
}

 //树结构

@Data
public class BaseTreeEntity<T extends BaseTreeEntity> {

    private int parentId;
    private List<T> children = new ArrayList<T>();

    public void addChildren(T node){
        children.add(node);
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

大巨魔战将

如果对您有帮助,请打赏1分钱

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值