Docx简介
- 以Microsoft Office的doc/docx 为主要处理对象
- Word2003和之前都是doc,文档格式不公开
- Word2007和之后都是docx,文档格式公开,遵循XML路线
- docx 为主要研究对象
– 文字样式
– 表格
– 图片
– 公式
Java对docx文件操作
- 常见功能
– docx解析
– docx生成(完全生成,模板加部分生成:套打)
- 处理的第三方库
– Jacob,COM4J (基于Windows平台)
– POI, docx4j (纯Java的第三方库)
– 一些其他平台的第三方库(OpenOffice/LibreOffice SDK,Linux平台)
– Aspose(收费)
– 一些开源的OpenXML的包
POI
Apache POI
- poi.apache.org
- 可处理docx,xlsx,pptx,visio等office套件
- 纯Java工具包,无需第三方依赖
- 主要类
– XWPFDocument 整个文档对象
– XWPFParagraph 段落(以回车来定义的,并非文本上的段落)
– XWPFRun 一个片段(字体样式相同的一段)
– XWPFPicture 图片
– XWPFTable 表格
读取docx文件 文本
先把docx读取为整个文档对象,再分为段落,段落再分为片段;遍历每个片段进行判断,输出文本或其他内容
/**
 * Program entry point: runs the docx text-reading demo.
 *
 * @param args command-line arguments (not used)
 * @throws Exception propagated unchanged from {@code readDocx()}
 */
public static void main(String[] args) throws Exception
{
    readDocx();
}
/**
 * Opens {@code test.docx} (relative to the working directory) and prints a
 * line-oriented summary of its body elements to standard output.
 *
 * <p>For each body element, in document order:
 * <ul>
 *   <li>a table prints {@code "table"} followed by its owning part;</li>
 *   <li>a paragraph prints its two first-line indentation values, then one line
 *       per run: the run's character spacing and text when it has text, the
 *       embedded picture count when it has pictures, or otherwise the embedded
 *       object count (plus a note when the run's XML contains {@code instrText});</li>
 *   <li>any other element type is reported and skipped.</li>
 * </ul>
 *
 * @throws Exception if the file cannot be opened or the document cannot be parsed
 */
public static void readDocx() throws Exception {
    // try-with-resources closes the document and the stream even when parsing
    // or printing throws; the original only closed the stream on success.
    try (InputStream is = new FileInputStream("test.docx");
         XWPFDocument xwpf = new XWPFDocument(is)) {
        for (IBodyElement ib : xwpf.getBodyElements()) { // walk every body element
            BodyElementType bet = ib.getElementType();
            if (bet == BodyElementType.TABLE) {
                // table
                System.out.println("table" + ib.getPart());
                continue;
            }
            if (bet != BodyElementType.PARAGRAPH) {
                // Neither table nor paragraph (e.g. a content control): casting it
                // to XWPFParagraph would throw ClassCastException, so report and skip.
                System.out.println("unsupported element:" + bet);
                continue;
            }

            // paragraph
            XWPFParagraph para = (XWPFParagraph) ib;
            System.out.println("It is a new paragraph....The indention is "
                    + para.getFirstLineIndent() + "," + para.getIndentationFirstLine());

            List<XWPFRun> res = para.getRuns(); // a paragraph is split into runs
            if (res.isEmpty()) {
                System.out.println("empty line");
            }
            for (XWPFRun re : res) { // walk every run
                String text = re.text(); // fetch once instead of re-evaluating per check
                if (text != null && !text.isEmpty()) {
                    System.out.println("===" + re.getCharacterSpacing() + text);
                } else if (!re.getEmbeddedPictures().isEmpty()) {
                    // run without text that carries picture(s)
                    System.out.println("image***" + re.getEmbeddedPictures().size());
                } else {
                    System.out.println("objects:" + re.getCTR().getObjectList().size());
                    // The run XML carrying field-instruction text is treated here as
                    // an equation field; contains() also matches at index 0.
                    if (re.getCTR().xmlText().contains("instrText")) {
                        System.out.println("there is an equation field");
                    }
                }
            }
        }
    }
}
//输出
It is a new paragraph....The indention is -1,-1
===0我们是中国人,
===0
===0C
===0hinese!
It is a new paragraph....The indention is 420,420
===0Hello ECNU! Hello China!
It is a new paragraph....The indention is -1,-1
empty line
docx中图片的读取操作
public static void imageRead() throws IOException, InvalidFormatException {
File docFile = new File("simple.docx");
XWPFDocument doc = new XWPFDocument(OPCPackage.openOrCreate(docFile));
int i = 0;
for (XWPFParagraph p : doc.getParagraphs()) {
for (XWPFRun run : p.getRuns()) {
System.out.println("a new run");
for (XWPFPicture pic : run.getEmbeddedPictures()) {
//有图片就进入for循环,没有会返回空
System.out.println(pic.getCTPicture().xmlText());
//image EMU(English Metric Unit)
System.out.println(pic.getCTPicture().getSpPr().getXfrm().getExt().getCx());
System.out.println(pic.getCTPicture().getSpPr().getXfrm()