关于word合并、分割、转换问题
最近项目中有个这样的需求:要把 word 按章节(大纲)拆分成多个 word,然后再将这些 word 合并成一个整体。看到这个需求,其实也是头疼,100 个程序员中应该 100 位都没有做过类似的开发,最多的就是用 poi 读取、生成 word。那么接下来,我会将最近研究的 word 切割、转换、合并以笔记的形式记录,也算作为积累吧!
word 分割 :
word 分割,使用的是 poi,这个还真有点难,如果不是同事给过帮助,分割这块还是很难搞定;不过合并用的是第三方插件,很快就搞定了,上代码吧:
package com.sysware.soft603.util.backUp;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import com.sysware.fai.entity.BookMarkInfo;
public class POItest1 {
/**
 * Command-line entry point: runs the document split job.
 *
 * @param args command-line arguments; not read
 * @throws IOException declared by the signature; the call below handles its
 *         own I/O failures
 */
public static void main(String[] args) throws IOException {
    POItest1.splitWord();
}
/**
 * Splits the source .docx into one file per outline section and rebuilds the
 * XML index describing the resulting hierarchy.
 *
 * <p>A section starts at every paragraph whose style name contains
 * {@code "heading"} followed by a number (the outline level) and runs up to
 * the next such paragraph. Everything before the first heading becomes a
 * level-1 section named "首页". Each section is written to
 * {@code d:\poi\<uuid>.docx}; the id, parent id, title and level of every
 * section are collected and handed to {@link #CreateXml(List)}.
 *
 * <p>If the source document cannot be opened the failure is reported and the
 * method returns without touching the index.
 */
private static void splitWord() {
    final String path = "d:\\GJB 1362A-2007 军工产品定型程序和要求.docx";
    // Directory that receives the generated section files.
    final String outputDir = "d:\\poi\\";
    List<BookMarkInfo> bookMarkInfos = new ArrayList<BookMarkInfo>();

    // Parallel lists, one entry per section: index of the first body element,
    // title, outline level and generated file id.
    List<Integer> sectionStarts = new ArrayList<Integer>();
    List<String> sectionNames = new ArrayList<String>();
    List<Integer> sectionLevels = new ArrayList<Integer>();
    List<String> sectionIds = new ArrayList<String>();
    int bodyElementCount;

    try (InputStream is = new FileInputStream(path);
            XWPFDocument doc = new XWPFDocument(is)) {
        List<IBodyElement> bodyElements = doc.getBodyElements();
        XWPFStyles styles = doc.getStyles();
        bodyElementCount = bodyElements.size();

        // Everything in front of the first heading forms the cover section.
        if (bodyElementCount > 0) {
            sectionStarts.add(0);
            sectionNames.add("首页");
            sectionLevels.add(1);
            sectionIds.add(java.util.UUID.randomUUID().toString());
        }

        for (int i = 0; i < bodyElementCount; i++) {
            IBodyElement bodyElement = bodyElements.get(i);
            if (bodyElement.getElementType() != BodyElementType.PARAGRAPH) {
                continue;
            }
            XWPFParagraph para = (XWPFParagraph) bodyElement;
            Integer level;
            String title;
            try {
                level = headingLevel(styles, para);
                title = (level == null) ? null : para.getParagraphText();
            } catch (RuntimeException e) {
                // Best effort: an unreadable paragraph is simply not a section start.
                e.printStackTrace();
                continue;
            }
            if (level == null) {
                continue;
            }
            // Append to all four lists together so they can never get out of step.
            sectionStarts.add(i);
            sectionNames.add(title);
            sectionLevels.add(level);
            sectionIds.add(java.util.UUID.randomUUID().toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }

    // Parent of a section = nearest preceding section with a smaller level;
    // level-1 sections get the placeholder " ".
    List<String> parentIds = new ArrayList<String>();
    for (int i = 0; i < sectionStarts.size(); i++) {
        int currentLevel = sectionLevels.get(i);
        if (currentLevel > 1) {
            int k = i;
            while (k >= 0 && sectionLevels.get(k) >= currentLevel) {
                k--;
            }
            parentIds.add(k >= 0 ? sectionIds.get(k) : " ");
        } else {
            parentIds.add(" ");
        }
    }

    System.out.println("al_duanLuo" + sectionStarts);
    System.out.println("al2_name=" + sectionNames);
    System.out.println("al3_jiBie=" + sectionLevels);
    System.out.println("al4_parentId=" + parentIds);

    // Exclusive end of the last section. Using size() rather than size() - 1
    // keeps the final body element of the document in the last section.
    sectionStarts.add(bodyElementCount);
    int lastIndex = bodyElementCount - 1;

    try {
        for (int k = 0; k < sectionStarts.size() - 1; k++) {
            int start = sectionStarts.get(k);
            int end = sectionStarts.get(k + 1);
            // Reload the source for every section and trim it down to [start, end).
            try (InputStream in = new FileInputStream(path);
                    XWPFDocument section = new XWPFDocument(in)) {
                // Drop the multilevel-list numbering from the heading style so the
                // split file does not restart numbering; skipped for the cover.
                if (k != 0) {
                    removeHeadingNumbering(section, start);
                }
                // Remove from the back first so the remaining indices stay valid.
                for (int u = lastIndex; u >= end; u--) {
                    section.removeBodyElement(u);
                }
                for (int l = start - 1; l >= 0; l--) {
                    section.removeBodyElement(l);
                }
                try (OutputStream out = new FileOutputStream(outputDir
                        + sectionIds.get(k) + ".docx")) {
                    BookMarkInfo bookMarkInfo = new BookMarkInfo();
                    bookMarkInfo.setId(sectionIds.get(k));
                    bookMarkInfo.setPid(parentIds.get(k));
                    bookMarkInfo.setName(sectionNames.get(k));
                    bookMarkInfo.setLevel(sectionLevels.get(k));
                    bookMarkInfos.add(bookMarkInfo);
                    try {
                        section.write(out);
                    } catch (IOException e) {
                        // A failed write is reported; the remaining sections are still processed.
                        e.printStackTrace();
                    }
                }
            }
        }
    } catch (IOException | RuntimeException e) {
        // Any other failure stops the split; the index is built from what was collected.
        e.printStackTrace();
    }
    System.out.println("over");
    CreateXml(bookMarkInfos);
}

/**
 * Returns the outline level of a heading paragraph.
 *
 * @param styles the document's style table, may be {@code null}
 * @param para   the paragraph to inspect
 * @return the number following {@code "heading"} in the paragraph's style
 *         name, or {@code null} when the paragraph has no resolvable style,
 *         the style name does not contain {@code "heading"}, or no number
 *         follows it
 */
private static Integer headingLevel(XWPFStyles styles, XWPFParagraph para) {
    if (styles == null || para.getStyle() == null) {
        return null;
    }
    org.apache.poi.xwpf.usermodel.XWPFStyle style = styles.getStyle(para.getStyle());
    if (style == null || style.getCTStyle() == null
            || style.getCTStyle().getName() == null) {
        return null;
    }
    String styleName = style.getCTStyle().getName().getVal();
    if (styleName == null || !styleName.contains("heading")) {
        return null;
    }
    String[] parts = styleName.split("heading");
    if (parts.length < 2) {
        return null;
    }
    try {
        return Integer.parseInt(parts[1].trim());
    } catch (NumberFormatException e) {
        return null;
    }
}

/**
 * Unsets the numbering properties on the style used by the heading paragraph
 * at the given body-element index; does nothing when the element is not a
 * paragraph or its style carries no numbering.
 */
private static void removeHeadingNumbering(XWPFDocument section, int headingIndex) {
    IBodyElement element = section.getBodyElements().get(headingIndex);
    if (!(element instanceof XWPFParagraph) || section.getStyles() == null) {
        return;
    }
    String styleId = ((XWPFParagraph) element).getStyleID();
    if (styleId == null) {
        return;
    }
    org.apache.poi.xwpf.usermodel.XWPFStyle style = section.getStyles().getStyle(styleId);
    if (style == null || style.getCTStyle() == null || !style.getCTStyle().isSetPPr()) {
        return;
    }
    if (style.getCTStyle().getPPr().isSetNumPr()) {
        style.getCTStyle().getPPr().unsetNumPr();
    }
}
/**
 * Rewrites the index file {@code d:/MyXml.xml} from the given sections.
 *
 * <p>The existing file is parsed, every child element of its root is removed,
 * and one {@code item} element is added per level-1 entry; deeper levels are
 * attached through {@link #GetXElement}. Level-1 entries are numbered 1, 2, ...
 * in front of their name, except that an entry at list position 0 does not
 * advance the counter and is left unnumbered while the counter is still 0.
 * A parse failure is printed and the file is left untouched.
 *
 * @param bookMarkInfos the sections, in document order
 */
public static void CreateXml(List<BookMarkInfo> bookMarkInfos) {
    final String target = "d:/MyXml.xml";
    final org.dom4j.Document document;
    try {
        document = new SAXReader().read(new File(target));
    } catch (org.dom4j.DocumentException e) {
        e.printStackTrace();
        return;
    }

    // Clear out whatever items the previous run left under the root.
    Element root = document.getRootElement();
    List<Element> previousItems = root.elements();
    for (Element previous : previousItems) {
        root.remove(previous);
    }

    // Running number of the current top-level chapter.
    int chapterNo = 0;
    for (int idx = 0; idx < bookMarkInfos.size(); idx++) {
        BookMarkInfo info = bookMarkInfos.get(idx);
        if (info.getLevel() != 1) {
            continue;
        }
        // The very first entry does not consume a number.
        if (idx != 0) {
            chapterNo++;
        }
        String label = (chapterNo == 0) ? info.getName() : chapterNo + info.getName();
        Element item = root.addElement("item");
        item.addAttribute("id", info.getId());
        item.addAttribute("name", label);
        item.addAttribute("filename", info.getId() + ".docx");
        GetXElement(item, info, bookMarkInfos, chapterNo + "", 0);
    }
    saveXml(target, document);
}
/**
 * Writes the document to {@code target} as pretty-printed UTF-8 XML with
 * CRLF line separators.
 *
 * <p>The file stream and writer are closed even when writing fails; an I/O
 * failure is printed and not propagated.
 *
 * @param target   path of the file to (over)write
 * @param document the dom4j document to serialize
 */
private static void saveXml(String target, org.dom4j.Document document) {
    OutputFormat outputFormat = OutputFormat.createPrettyPrint();
    outputFormat.setLineSeparator("\r\n");
    try (FileOutputStream fileStream = new FileOutputStream(target);
            OutputStreamWriter writer = new OutputStreamWriter(fileStream, "UTF-8")) {
        XMLWriter xmlWriter = new XMLWriter(writer, outputFormat);
        xmlWriter.write(document);
        // Push everything to the underlying writer before it is closed.
        xmlWriter.flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Recursively appends the direct children of {@code item} to {@code parent}.
 *
 * <p>An entry is a direct child when its level is exactly one deeper than
 * {@code item}'s and its parent id equals {@code item}'s id. Ids are compared
 * by value, not by reference. Each child is numbered
 * {@code <j>.<n>} (n = 1, 2, ... in list order) in front of its name.
 *
 * @param parent        XML element that receives the child {@code item} elements
 * @param item          the entry whose children are being added
 * @param bookMarkInfos all entries, in document order
 * @param j             numbering prefix of {@code item}
 * @param k             ignored; child numbering always restarts at 1
 */
private static void GetXElement(Element parent, BookMarkInfo item,
        List<BookMarkInfo> bookMarkInfos, String j, int k) {
    int childNo = 0;
    for (BookMarkInfo candidate : bookMarkInfos) {
        boolean directChild = candidate.getLevel() == item.getLevel() + 1
                && java.util.Objects.equals(candidate.getPid(), item.getId());
        if (!directChild) {
            continue;
        }
        childNo++;
        String number = j + "." + childNo;
        Element element = parent.addElement("item");
        element.addAttribute("id", candidate.getId());
        element.addAttribute("name", number + candidate.getName());
        element.addAttribute("filename", candidate.getId() + ".docx");
        GetXElement(element, candidate, bookMarkInfos, number, childNo);
    }
}
}
<?xml version="1.0" encoding="UTF-8"?> <item id="1" name="1" filename="a.docx" author="杨稳" publishtime="2017-02-03" keywords="关键词" abstract="摘要" journal="" issn="" Implementation="2017-04-01" groupname="国标" phase="研制阶段" content="内容分类"> <!-- id=编号(标准规范),name=名称,filename=文件名称,author=作者,publishtime="发表时间",keywords="关键词",abstract="摘要",journal="所属期刊",issn=ISSN,Implementation="实施日期",groupname="国标",phase="研制阶段",content="内容分类"--> <item id="1.1" name="1.1" filename="a.docx"/> <item id="1.2" name="1.2" filename="a.docx"> <item id="1.2.1" name="1.2.1" filename="a.docx"/> <item id="1.2.2" name="1.2.2" filename="a.docx"/> </item> <item id="1.3" name="1.3" filename="a.docx"> <item id="1.3.1" name="1.3.1" filename="a.docx"/> </item> <item id="1.4" name="1.4" filename="a.docx"/> </item>
最终生成的 xml 主要是为了后续往数据库保存。最终生成的 xml 是:
<?xml version="1.0" encoding="UTF-8"?> <item id="1" name="1" filename="a.docx" author="杨稳" publishtime="2017-02-03" keywords="关键词" abstract="摘要" journal="" issn="" Implementation="2017-04-01" groupname="国标" phase="研制阶段" content="内容分类"> <!-- id=编号(标准规范),name=名称,filename=文件名称,author=作者,publishtime="发表时间",keywords="关键词",abstract="摘要",journal="所属期刊",issn=ISSN,Implementation="实施日期",groupname="国标",phase="研制阶段",content="内容分类"--> <item id="cdc88f18-a024-4e88-9ba1-3d5c5a7a0c58" name="首页" filename="cdc88f18-a024-4e88-9ba1-3d5c5a7a0c58.docx"