POI word 内容提取 错误描述:
使用 POI 解析 Word 文档时抛出如下错误:
Strict OOXML isn't currently supported, please see bug #57699
简单来说,就是 POI 不支持解析 Strict OOXML 格式的文档,参考:https://blog.51cto.com/u_15127653/4192965
网上找了一堆方案,都没有针对 Word 的;唯一一个看起来能用的 ooxml-strict-converter,在我这里还用不了😭
没办法,只能对照着 ooxml-strict-converter 的源码自己改一改了。
最终修改后的结果暂时能用,内容如下:
ooxml-strict-mappings.properties 文件
http://purl.oclc.org/ooxml/drawingml/chart=http://schemas.openxmlformats.org/drawingml/2006/chart
http://purl.oclc.org/ooxml/drawingml/chartDrawing=http://schemas.openxmlformats.org/drawingml/2006/chartDrawing
http://purl.oclc.org/ooxml/drawingml/diagram=http://schemas.openxmlformats.org/drawingml/2006/diagram
http://purl.oclc.org/ooxml/drawingml/lockedCanvas=http://schemas.openxmlformats.org/drawingml/2006/lockedCanvas
http://purl.oclc.org/ooxml/drawingml/main=http://schemas.openxmlformats.org/drawingml/2006/main
http://purl.oclc.org/ooxml/drawingml/picture=http://schemas.openxmlformats.org/drawingml/2006/picture
http://purl.oclc.org/ooxml/drawingml/spreadsheetDrawing=http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing
http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing=http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing
http://purl.oclc.org/ooxml/officeDocument/bibliography=http://schemas.openxmlformats.org/officeDocument/2006/bibliography
http://purl.oclc.org/ooxml/officeDocument/characteristics=http://schemas.openxmlformats.org/officeDocument/2006/characteristics
http://purl.oclc.org/ooxml/officeDocument/customProperties=http://schemas.openxmlformats.org/officeDocument/2006/custom-properties
http://purl.oclc.org/ooxml/officeDocument/customXml=http://schemas.openxmlformats.org/officeDocument/2006/customXml
http://purl.oclc.org/ooxml/officeDocument/docPropsVTypes=http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes
http://purl.oclc.org/ooxml/officeDocument/extendedProperties=http://schemas.openxmlformats.org/officeDocument/2006/extended-properties
http://purl.oclc.org/ooxml/officeDocument/math=http://schemas.openxmlformats.org/officeDocument/2006/math
http://purl.oclc.org/ooxml/officeDocument/relationships=http://schemas.openxmlformats.org/officeDocument/2006/relationships
http://purl.oclc.org/ooxml/presentationml/main=http://schemas.openxmlformats.org/presentationml/2006/main
http://purl.oclc.org/ooxml/schemaLibrary/main=http://schemas.openxmlformats.org/schemaLibrary/2006/main
http://purl.oclc.org/ooxml/spreadsheetml/main=http://schemas.openxmlformats.org/spreadsheetml/2006/main
http://purl.oclc.org/ooxml/wordprocessingml/main=http://schemas.openxmlformats.org/wordprocessingml/2006/main
http://purl.oclc.org/ooxml/officeDocument/relationships/extendedProperties=http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties
http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument=http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument
OoXmlStrictConverter.java 文件
import java.io.BufferedReader;
import java.io.FilterInputStream;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.Namespace;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.springframework.util.StringUtils;
/**
 * Rewrites an OOXML package (a ZIP of XML parts) so that namespace URIs found in a
 * caller-supplied mapping table are replaced by their mapped counterparts.
 *
 * <p>Intended use: turn a "Strict Open XML" document into one that uses the Transitional
 * namespaces, so that libraries which only understand the latter can read it. Every
 * {@code .xml}, {@code .vml} and {@code .rels} entry is streamed through StAX and rewritten;
 * every other entry is copied byte for byte.
 *
 * <p>Typical usage:
 * <pre>
 *   Properties mappings = OoXmlStrictConverter.readMappings();
 *   OoXmlStrictConverter.transform(strictIn, transitionalOut, mappings);
 * </pre>
 */
public class OoXmlStrictConverter {

    /** Classpath location of the default strict-to-transitional mapping table. */
    private static final String MAPPINGS_RESOURCE = "/ooxml-strict-mappings.properties";

    /** Local name of the root-element attribute that marks a part as strict. */
    private static final String CONFORMANCE = "conformance";

    private static final XMLEventFactory XEF = XMLEventFactory.newInstance();
    private static final XMLInputFactory XIF = newHardenedInputFactory();
    private static final XMLOutputFactory XOF = XMLOutputFactory.newInstance();

    /**
     * Creates the StAX input factory with DTD processing and external entity resolution
     * switched off, so a crafted document cannot trigger XXE while being converted.
     */
    private static XMLInputFactory newHardenedInputFactory() {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        return factory;
    }

    /**
     * Reads a ZIP package from {@code inFile} and writes the converted package to
     * {@code outFile}. Both streams are closed when this method returns (closing the
     * ZIP output is what finalizes the archive).
     *
     * @param inFile   ZIP-encoded source package
     * @param outFile  destination for the converted ZIP package
     * @param mappings namespace/relationship URI replacements: key = URI to replace,
     *                 value = replacement
     * @throws IOException if an entry cannot be read, written, or parsed as XML; the
     *                     message names the offending entry and the cause is preserved
     * @throws Exception   declared for backward compatibility with existing callers
     */
    public static void transform(InputStream inFile, OutputStream outFile, Properties mappings) throws Exception {
        try (ZipInputStream zis = new ZipInputStream(inFile);
             ZipOutputStream zos = new ZipOutputStream(outFile)) {
            ZipEntry ze;
            while ((ze = zis.getNextEntry()) != null) {
                zos.putNextEntry(new ZipEntry(ze.getName()));
                // The StAX reader/writer close whatever they wrap; these shields keep the
                // shared ZIP streams open for the next entry.
                InputStream entryIn = new NonClosingInputStream(zis);
                OutputStream entryOut = new NonClosingOutputStream(zos);
                if (isXml(ze.getName())) {
                    try {
                        rewriteXml(entryIn, entryOut, mappings);
                    } catch (XMLStreamException | RuntimeException e) {
                        throw new IOException("Problem parsing " + ze.getName(), e);
                    }
                } else {
                    copy(entryIn, entryOut);
                }
                zis.closeEntry();
                zos.closeEntry();
            }
        }
    }

    /**
     * Streams one XML part from {@code in} to {@code out}, rewriting element names,
     * attributes and namespace declarations according to {@code mappings}.
     */
    private static void rewriteXml(InputStream in, OutputStream out, Properties mappings)
            throws XMLStreamException {
        XMLEventReader reader = XIF.createXMLEventReader(in);
        try {
            XMLEventWriter writer = XOF.createXMLEventWriter(out);
            int depth = 0; // 0 while the next start tag is the document root
            while (reader.hasNext()) {
                XMLEvent event = reader.nextEvent();
                if (event.isStartElement()) {
                    StartElement se = event.asStartElement();
                    event = XEF.createStartElement(
                            updateQName(se.getName(), mappings),
                            processAttributes(se.getAttributes(), mappings,
                                    se.getName().getNamespaceURI(), depth == 0),
                            processNamespaces(se.getNamespaces(), mappings));
                    depth++;
                } else if (event.isEndElement()) {
                    EndElement ee = event.asEndElement();
                    event = XEF.createEndElement(
                            updateQName(ee.getName(), mappings),
                            processNamespaces(ee.getNamespaces(), mappings));
                    depth--;
                }
                writer.add(event);
            }
            // Closing the writer flushes buffered output into the current ZIP entry.
            writer.close();
        } finally {
            reader.close();
        }
    }

    /** Returns true when the entry name ends in {@code .xml}, {@code .vml} or {@code .rels} (case-insensitive). */
    private static boolean isXml(final String fileName) {
        if (!hasText(fileName)) {
            return false;
        }
        int pos = fileName.lastIndexOf('.');
        if (pos == -1) {
            return false;
        }
        // Locale.ROOT keeps the comparison independent of the JVM's default locale.
        String ext = fileName.substring(pos + 1).toLowerCase(Locale.ROOT);
        return ext.equals("xml") || ext.equals("vml") || ext.equals("rels");
    }

    /**
     * Rewrites the attributes of one element: attribute names get their namespace mapped,
     * attribute values that exactly match a mapping key are replaced by the mapped value,
     * and the conformance marker is dropped from a root element whose namespace is mapped.
     *
     * @param iter                attributes of the element being rewritten
     * @param mappings            URI replacement table
     * @param elementNamespaceUri namespace URI of the owning element, before mapping
     * @param rootElement         whether the owning element is the document root
     * @return iterator over the rewritten attributes
     */
    private static Iterator<Attribute> processAttributes(final Iterator<Attribute> iter,
            final Properties mappings, final String elementNamespaceUri, final boolean rootElement) {
        final boolean rootInMappedNamespace = rootElement && mappings.containsKey(elementNamespaceUri);
        List<Attribute> list = new ArrayList<>();
        while (iter.hasNext()) {
            Attribute att = iter.next();
            if (rootInMappedNamespace && isConformanceAttribute(att.getName(), mappings)) {
                // Drop it: the rewritten part no longer uses the strict namespaces.
                continue;
            }
            String value = att.getValue();
            String mappedValue = mappings.getProperty(value);
            list.add(XEF.createAttribute(updateQName(att.getName(), mappings),
                    hasText(mappedValue) ? mappedValue : value));
        }
        return Collections.unmodifiableList(list).iterator();
    }

    /**
     * Recognizes the conformance marker both in its unqualified form
     * ({@code conformance="strict"}) and when it is qualified with a namespace that is
     * itself being mapped (for example {@code w:conformance="strict"}).
     */
    private static boolean isConformanceAttribute(QName name, Properties mappings) {
        if (!CONFORMANCE.equals(name.getLocalPart())) {
            return false;
        }
        String ns = name.getNamespaceURI();
        return ns == null || ns.isEmpty() || mappings.containsKey(ns);
    }

    /** Rewrites namespace declarations whose URI has a mapping; all others pass through unchanged. */
    private static Iterator<Namespace> processNamespaces(final Iterator<Namespace> iter,
            final Properties mappings) {
        List<Namespace> list = new ArrayList<>();
        while (iter.hasNext()) {
            Namespace ns = iter.next();
            String namespaceUri = ns.getNamespaceURI();
            if (hasText(namespaceUri)) {
                String mappedUri = mappings.getProperty(namespaceUri);
                if (mappedUri != null) {
                    ns = hasText(ns.getPrefix())
                            ? XEF.createNamespace(ns.getPrefix(), mappedUri)
                            : XEF.createNamespace(mappedUri); // default namespace declaration
                }
            }
            list.add(ns);
        }
        return Collections.unmodifiableList(list).iterator();
    }

    /** Returns {@code qn} with its namespace URI mapped (prefix and local part kept), or {@code qn} itself if unmapped. */
    private static QName updateQName(QName qn, Properties mappings) {
        String namespaceUri = qn.getNamespaceURI();
        if (hasText(namespaceUri)) {
            String mappedUri = mappings.getProperty(namespaceUri);
            if (mappedUri != null) {
                qn = hasText(qn.getPrefix())
                        ? new QName(mappedUri, qn.getLocalPart(), qn.getPrefix())
                        : new QName(mappedUri, qn.getLocalPart());
            }
        }
        return qn;
    }

    /**
     * Loads the default mapping table from the classpath resource
     * {@code /ooxml-strict-mappings.properties}.
     *
     * @return the parsed mappings
     * @throws IOException if the resource is missing or cannot be read
     */
    public static Properties readMappings() throws IOException {
        try (InputStream is = OoXmlStrictConverter.class.getResourceAsStream(MAPPINGS_RESOURCE)) {
            if (is == null) {
                throw new IOException("Mappings resource not found on classpath: " + MAPPINGS_RESOURCE);
            }
            return readMappings(is);
        }
    }

    /**
     * Parses a mapping table from {@code in}, one {@code key=value} pair per line (UTF-8).
     *
     * <p>The file is parsed by hand rather than with {@link Properties#load(InputStream)}
     * because the keys are URLs and {@code Properties.load} would treat the {@code ':'}
     * in {@code http://} as the key/value separator. Only the first {@code '='} on a
     * line separates key from value, so values may themselves contain {@code '='}.
     * Blank lines, lines starting with {@code '#'} and lines with an empty key are skipped;
     * a line without {@code '='} maps its content to the empty string.
     *
     * @param in stream to read; it is not closed by this method
     * @return the parsed mappings
     * @throws IOException if reading fails
     */
    public static Properties readMappings(InputStream in) throws IOException {
        Properties props = new Properties();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
        String line;
        while ((line = reader.readLine()) != null) {
            String trimmed = line.trim();
            if (trimmed.isEmpty() || trimmed.charAt(0) == '#') {
                continue;
            }
            int eq = trimmed.indexOf('=');
            String key = (eq < 0 ? trimmed : trimmed.substring(0, eq)).trim();
            String value = eq < 0 ? "" : trimmed.substring(eq + 1).trim();
            if (!key.isEmpty()) {
                props.setProperty(key, value);
            }
        }
        return props;
    }

    /** Copies all remaining bytes from {@code inp} to {@code out}; neither stream is closed. */
    private static void copy(InputStream inp, OutputStream out) throws IOException {
        byte[] buff = new byte[4096];
        int count;
        while ((count = inp.read(buff)) != -1) {
            if (count > 0) {
                out.write(buff, 0, count);
            }
        }
    }

    /** Returns true when {@code s} is non-null and contains at least one non-whitespace character. */
    private static boolean hasText(String s) {
        if (s == null) {
            return false;
        }
        for (int i = 0; i < s.length(); i++) {
            if (!Character.isWhitespace(s.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /** Input wrapper whose {@code close()} leaves the underlying stream open. */
    private static final class NonClosingInputStream extends FilterInputStream {
        NonClosingInputStream(InputStream in) {
            super(in);
        }

        @Override
        public void close() {
            // Intentionally empty: the enclosing ZIP stream is closed by transform().
        }
    }

    /** Output wrapper whose {@code close()} only flushes, leaving the underlying stream open. */
    private static final class NonClosingOutputStream extends FilterOutputStream {
        NonClosingOutputStream(OutputStream out) {
            super(out);
        }

        @Override
        public void write(byte[] b, int off, int len) throws IOException {
            // FilterOutputStream's default forwards one byte at a time; delegate in bulk.
            out.write(b, off, len);
        }

        @Override
        public void close() throws IOException {
            flush();
        }
    }
}