package com.etcc.document;
import com.etcc.document.vo.*;
import org.apache.commons.lang3.*;
import org.apache.poi.xwpf.usermodel.*;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import org.slf4j.*;
import java.io.*;
import java.math.*;
import java.util.*;
public class DocxParseUtil {
private static Logger logger = LoggerFactory.getLogger(DocxParseUtil.class);
/**
 * Parses a .docx stream into a flat list of content items in document order.
 *
 * <p>Each paragraph is handed to {@code parseParagraph}; it is kept only when it ends up with
 * non-empty content or with pictures. Each table becomes one item whose content is the HTML
 * returned by {@code tableToHtml}. Kept items receive consecutive {@code sort} values starting
 * at 1. All other body element types are ignored.
 *
 * <p>The document opened here is closed before returning. This method does not explicitly
 * close {@code docInputStream}; that remains the caller's job.
 *
 * @param docInputStream stream to read the .docx document from
 * @return the parsed items, or {@code null} when the document cannot be read
 */
public static List<PlanContentItem> parseDoc(FileInputStream docInputStream){
    XWPFDocument document;
    try {
        document = new XWPFDocument(docInputStream);
    } catch (IOException e) {
        // The exception is the trailing argument so SLF4J logs its stack trace;
        // the former "{}" placeholder was never filled in.
        logger.error("读入文档异常", e);
        return null;
    }
    try {
        return collectContentItems(document);
    } finally {
        try {
            document.close();
        } catch (IOException e) {
            // Parsing already finished; a failed close must not discard the result.
            logger.warn("关闭文档异常", e);
        }
    }
}

/** Walks the body elements of an opened document and builds the item list. */
private static List<PlanContentItem> collectContentItems(XWPFDocument document) {
    // Style table, needed to look up outline levels inherited from paragraph styles.
    XWPFStyles styles = document.getStyles();
    List<PlanContentItem> planContentItems = new ArrayList<>();
    int index = 1;
    for (IBodyElement element : document.getBodyElements()) {
        PlanContentItem item = new PlanContentItem();
        switch (element.getElementType()) {
            case PARAGRAPH: // text and/or pictures
                if (element instanceof XWPFParagraph) {
                    parseParagraph(item, (XWPFParagraph) element, styles);
                    if (StringUtils.isNotEmpty(item.getContent()) || item.getPics() != null) {
                        item.setSort(index++);
                        planContentItems.add(item);
                    }
                }
                break;
            case TABLE: // rendered as an HTML fragment
                if (element instanceof XWPFTable) {
                    item.setSort(index++);
                    // directory is an int flag on PlanContentItem (compared with == 1 elsewhere).
                    item.setDirectory(0);
                    item.setLevel(0);
                    item.setContent(tableToHtml((XWPFTable) element));
                    planContentItems.add(item);
                }
                break;
            default: // content controls and anything else are skipped
                break;
        }
    }
    return planContentItems;
}
/**
 * Fills {@code contentItem} from one paragraph.
 *
 * <p>The outline level is looked up, in order, on the paragraph itself, on the paragraph's
 * style, and on the style that style is based on (one step only). When {@code parseOutline}
 * accepts the paragraph as an outline entry, the item becomes a directory entry
 * ({@code directory = 1}) carrying the paragraph text and the level. Otherwise the item is
 * body content ({@code directory = 0}, {@code level = 0}) holding the concatenated run text
 * and, if any were found, the embedded pictures.
 *
 * @param contentItem item to populate
 * @param paragraph   paragraph being parsed
 * @param styles      style table of the document; may be {@code null}
 */
private static void parseParagraph(PlanContentItem contentItem, XWPFParagraph paragraph, XWPFStyles styles){
    ParagraphItem paragraphItem = new ParagraphItem();
    parseOutline(paragraphItem, paragraph, resolveOutlineLevel(paragraph, styles));

    if (Boolean.TRUE.equals(paragraphItem.getIsOutline())) {
        contentItem.setContent(paragraphItem.getContent());
        contentItem.setLevel(paragraphItem.getLevel());
        // directory is an int flag on PlanContentItem: 1 marks an outline (heading) entry.
        contentItem.setDirectory(1);
        return;
    }

    StringBuilder content = new StringBuilder();
    List<PicItem> pics = new ArrayList<>();
    for (XWPFRun run : paragraph.getRuns()) {
        List<XWPFPicture> pictures = run.getEmbeddedPictures();
        if (pictures != null) {
            for (XWPFPicture picture : pictures) {
                XWPFPictureData pictureData = picture.getPictureData();
                if (pictureData == null) {
                    // No binary data behind this picture; nothing to extract.
                    continue;
                }
                logger.debug("图片类型{}", pictureData.getPictureType());
                pics.add(new PicItem(pictureData.getFileName(), pictureData.getData()));
            }
        }
        String text = run.text();
        if (StringUtils.isNotEmpty(text)) {
            content.append(text);
        }
    }

    contentItem.setContent(content.toString());
    contentItem.setDirectory(0);
    contentItem.setLevel(0);
    // pics stays unset when there are none; callers test getPics() != null.
    if (!pics.isEmpty()) {
        contentItem.setPics(pics);
    }
}

/**
 * Returns the first usable outline level found on the paragraph, its style, or that
 * style's base style, or {@code null} when none of them defines one.
 */
private static CTDecimalNumber resolveOutlineLevel(XWPFParagraph paragraph, XWPFStyles styles) {
    if (paragraph.getCTP().getPPr() != null) {
        CTDecimalNumber direct = usableLevel(paragraph.getCTP().getPPr().getOutlineLvl());
        if (direct != null) {
            return direct;
        }
    }
    if (styles == null || paragraph.getStyle() == null) {
        return null;
    }
    XWPFStyle style = styles.getStyle(paragraph.getStyle());
    CTDecimalNumber fromStyle = styleOutlineLevel(style);
    if (fromStyle != null) {
        return fromStyle;
    }
    if (style == null || style.getCTStyle() == null || style.getCTStyle().getBasedOn() == null) {
        return null;
    }
    String baseStyleId = style.getCTStyle().getBasedOn().getVal();
    return baseStyleId == null ? null : styleOutlineLevel(styles.getStyle(baseStyleId));
}

/** Returns the outline level declared directly on a style, or {@code null}. */
private static CTDecimalNumber styleOutlineLevel(XWPFStyle style) {
    if (style == null || style.getCTStyle() == null || style.getCTStyle().getPPr() == null) {
        return null;
    }
    return usableLevel(style.getCTStyle().getPPr().getOutlineLvl());
}

/** Returns {@code number} only when it actually carries a value, otherwise {@code null}. */
private static CTDecimalNumber usableLevel(CTDecimalNumber number) {
    return (number != null && number.getVal() != null) ? number : null;
}
/**
 * Records on {@code paragraphItem} whether the paragraph is an outline (heading) entry.
 *
 * <p>The paragraph text is always stored as the item's content. The paragraph counts as an
 * outline entry only when an outline level with a value is supplied and the text is
 * non-empty; its level is then the supplied value plus one. In every other case the item is
 * marked as not an outline entry with level 0.
 *
 * <p>NOTE(review): WordprocessingML appears to use outline level 9 for "no level / body
 * text"; this method treats every supplied value as a heading level. Confirm whether 9
 * should be excluded.
 *
 * @param paragraphItem receiver of content, outline flag and level
 * @param paragraph     paragraph whose text is read
 * @param number        outline level element, or {@code null} when none is defined
 */
private static void parseOutline(ParagraphItem paragraphItem, XWPFParagraph paragraph, CTDecimalNumber number) {
    String text = paragraph.getParagraphText();
    paragraphItem.setContent(text);

    boolean hasLevel = number != null && number.getVal() != null;
    if (hasLevel && StringUtils.isNotEmpty(text)) {
        paragraphItem.setIsOutline(true);
        paragraphItem.setLevel(number.getVal().intValue() + 1);
    } else {
        paragraphItem.setIsOutline(false);
        paragraphItem.setLevel(0);
    }
}
/**
 * Renders a table as an HTML {@code <table>} fragment.
 *
 * <p>Cells of the first row are emitted as {@code <th>}, all others as {@code <td>}. A cell's
 * grid span becomes {@code colspan}; a cell that starts a vertical merge gets a
 * {@code rowspan} computed by {@link #getRowspan}, and cells that continue a vertical merge
 * are omitted. Every emitted cell carries a {@code width} attribute taken from
 * {@code cell.getWidth()}. Cell text is HTML-escaped.
 *
 * @param table table to render
 * @return the HTML fragment, or {@code null} when the table has no rows
 */
private static String tableToHtml(XWPFTable table){
    List<XWPFTableRow> rows = table.getRows();
    if (rows == null || rows.isEmpty()) {
        return null;
    }
    StringBuilder html = new StringBuilder("<table border=\"1\" cellspacing=\"0\" >");
    for (int i = 0; i < rows.size(); i++) {
        // The first row is treated as the header row.
        String tag = (i == 0) ? "th" : "td";
        List<XWPFTableCell> cells = rows.get(i).getTableCells();
        html.append("<tr>");
        for (int j = 0; j < cells.size(); j++) {
            XWPFTableCell cell = cells.get(j);
            if (isMergeContinuation(cell)) {
                // Covered by the rowspan of the cell that started the merge.
                continue;
            }
            CTVMerge vMerge = verticalMerge(cell);
            int rowspan = 0;
            if (vMerge != null && vMerge.getVal() == STMerge.RESTART) {
                rowspan = getRowspan(table, i + 1, j);
            }
            CTTcPr tcPr = cell.getCTTc().getTcPr();
            CTDecimalNumber gridSpan = (tcPr == null) ? null : tcPr.getGridSpan();

            html.append('<').append(tag);
            if (gridSpan != null) {
                html.append(" colspan='").append(gridSpan.getVal()).append("'");
            }
            if (rowspan > 1) {
                html.append(" rowspan='").append(rowspan).append("'");
            }
            html.append(" width=").append(cell.getWidth());
            html.append('>');
            html.append(escapeHtml(cell.getText()));
            html.append("</").append(tag).append('>');
        }
        html.append("</tr>");
    }
    html.append("</table>");
    return html.toString();
}

/**
 * Counts the rows spanned by a vertical merge.
 *
 * <p>Starting at {@code row} (the row right below the cell that starts the merge), counts
 * consecutive cells at cell index {@code col} that continue a vertical merge. Counting stops
 * at the first cell that does not continue the merge, or when the row or cell does not
 * exist.
 *
 * @param table table containing the merge
 * @param row   index of the first row to inspect
 * @param col   cell index within each row
 * @return 1 plus the number of continuation cells found
 */
public static int getRowspan(XWPFTable table, int row, int col) {
    int span = 1;
    for (int r = row; isMergeContinuation(cellAt(table, r, col)); r++) {
        span++;
    }
    return span;
}

/** Returns the cell at the given position, or {@code null} when the row or cell is missing. */
private static XWPFTableCell cellAt(XWPFTable table, int row, int col) {
    XWPFTableRow tableRow = table.getRow(row);
    return (tableRow == null) ? null : tableRow.getCell(col);
}

/** Returns the vertical-merge element of a cell, or {@code null} when it has none. */
private static CTVMerge verticalMerge(XWPFTableCell cell) {
    if (cell == null || cell.getCTTc().getTcPr() == null) {
        return null;
    }
    return cell.getCTTc().getTcPr().getVMerge();
}

/**
 * Tells whether a cell continues a vertical merge started in a row above. A vMerge element
 * without a value is treated as a continuation, which is how continuation cells are
 * normally written.
 */
private static boolean isMergeContinuation(XWPFTableCell cell) {
    CTVMerge vMerge = verticalMerge(cell);
    return vMerge != null && (vMerge.getVal() == null || vMerge.getVal() == STMerge.CONTINUE);
}

/** Escapes the characters that would otherwise be interpreted as HTML markup. */
private static String escapeHtml(String text) {
    if (text == null) {
        return "";
    }
    StringBuilder escaped = new StringBuilder(text.length());
    for (int k = 0; k < text.length(); k++) {
        char c = text.charAt(k);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            default:
                escaped.append(c);
                break;
        }
    }
    return escaped.toString();
}
/**
 * Links a flat, document-ordered item list into a tree, in place.
 *
 * <p>First pass: every non-directory item gets the level of the most recent directory item
 * plus one. Second pass: a synthetic root (level 0, sort 0) and every directory item are
 * offered the items that follow them via {@code setSubList}, which decides which of those
 * become children. The root is not returned; the result is the children and parent ids set
 * on the list's items.
 *
 * <p>NOTE(review): non-directory items that precede the first directory item get level 2
 * (the initial current level is 1), so the level-0 root does not adopt them. Confirm
 * whether that is intended.
 *
 * @param planContentItems items in document order; {@code null} is ignored
 */
public static void getDocxTree(List<PlanContentItem> planContentItems) {
    if (planContentItems == null) {
        return;
    }

    int curLevel = 1;
    for (PlanContentItem item : planContentItems) {
        if (item.getDirectory() == 1) {
            curLevel = item.getLevel();
        } else {
            item.setLevel(curLevel + 1);
        }
    }

    PlanContentItem root = new PlanContentItem();
    root.setContent("正文");
    root.setLevel(0);
    root.setSort(0);
    root.setDirectory(1);
    root.setSubList(planContentItems);

    int size = planContentItems.size();
    for (int i = 0; i < size; i++) {
        PlanContentItem item = planContentItems.get(i);
        if (item.getDirectory() == 1) {
            // Slice by list position rather than by sort, so the result does not depend on
            // sort being exactly position + 1 (or on sort being non-null).
            item.setSubList(planContentItems.subList(i + 1, size));
        }
    }

    logger.debug("docx tree built: {} items, {} directly under the root",
            size, root.getSubList().size());
}
/**
 * Manual smoke test: parses one .docx file and logs how many items were produced.
 *
 * @param args optional; {@code args[0]} is the path of the file to parse. Without it the
 *             built-in sample path is used.
 */
public static void main(String[] args) {
    String defaultPath = "C:\\Users\\Lenovo\\Downloads\\中国系统项目成本管理办法.docx";
    String filePath = (args != null && args.length > 0) ? args[0] : defaultPath;

    // try-with-resources so the file handle is released even when parsing fails.
    try (FileInputStream in = new FileInputStream(new File(filePath))) {
        List<PlanContentItem> planContentItems = parseDoc(in);
        int count = (planContentItems == null) ? 0 : planContentItems.size();
        logger.info("parsed {} content items from {}", count, filePath);
    } catch (IOException e) {
        logger.error("cannot read {}", filePath, e);
    }
}
}
/**
 * One parsed piece of document content: an outline (directory) entry, a body paragraph, or a
 * table. It is also a tree node whose children are selected by level in
 * {@link #setSubList(List)}.
 */
@Data
public class PlanContentItem extends BaseTreeEntity<PlanContentItem> implements DomSearch<PlanContentItem> {
    // Identifier
    private String id;
    // Content text (or HTML fragment)
    private String content;
    // Directory flag: 0 means "not a directory entry"; other code in this project tests for 1
    private int directory;
    // Sequence number
    private Integer sort;
    // Level in the tree
    private int level;
    // Pictures attached to this item
    private List<PicItem> pics;

    /** Returns the children collected so far. */
    @Override
    public List<PlanContentItem> getSubList() {
        return super.getChildren();
    }

    /**
     * Adopts children from the items that follow this one.
     *
     * <p>Does nothing when this item is not a directory entry or when {@code tList} is
     * {@code null}. Otherwise walks {@code tList} in order, stops at the first item whose level
     * is not deeper than this item's, and adds every item exactly one level deeper as a child.
     * Children are appended; existing children are kept.
     *
     * @param tList candidate items, in document order, starting right after this item
     */
    @Override
    public void setSubList(List<PlanContentItem> tList) {
        if (tList == null || getDirectory() == 0) {
            return;
        }
        int ownLevel = getLevel();
        Integer ownSort = getSort();
        for (PlanContentItem candidate : tList) {
            if (candidate.getLevel() <= ownLevel) {
                // A sibling or an ancestor's sibling: this item's subtree ends here.
                break;
            }
            if (candidate.getLevel() == ownLevel + 1) {
                super.addChildren(candidate);
                // sort is nullable while parentId is a primitive; skip instead of failing on unboxing.
                if (ownSort != null) {
                    candidate.setParentId(ownSort);
                }
            }
        }
    }
}
// Tree-structure interface; settles the question of which nodes are a node's children.
// NOTE(review): the bound uses the raw type BaseTreeEntity; tightening it may affect
// implementers not visible here, so it is left unchanged.
public interface DomSearch<T extends BaseTreeEntity> extends Serializable {
// Returns this node's sub-list.
List<T> getSubList();
// Supplies the list from which this node's sub-list is taken.
void setSubList(List<T> subList);
}
/**
 * Base class for tree nodes: holds the parent's id and the list of child nodes.
 *
 * <p>It is serializable so that {@code parentId} and {@code children} are written along with
 * serializable subclasses; fields of a non-serializable superclass are not serialized.
 *
 * <p>NOTE(review): the type bound uses the raw type {@code BaseTreeEntity}; it is left as is
 * because subclasses outside this file may rely on it.
 *
 * @param <T> type of the child nodes
 */
@Data
public class BaseTreeEntity<T extends BaseTreeEntity> implements java.io.Serializable {
    // Id of the parent node
    private int parentId;
    // Child nodes, in insertion order
    private List<T> children = new ArrayList<>();

    /**
     * Appends one child node.
     *
     * @param node child to add
     */
    public void addChildren(T node){
        // The generated setter allows children to be replaced with null; recover from that.
        if (children == null) {
            children = new ArrayList<>();
        }
        children.add(node);
    }
}