POI Word 内容提取报错:Strict OOXML isn't currently supported, please see bug #57699

POI word 内容提取 错误描述:
下面展示一些 内联代码片

Strict OOXML isn't currently supported, please see bug #57699

简单来说,就是 POI 不支持解析 Strict OOXML 格式的文档,参考:https://blog.51cto.com/u_15127653/4192965
网上找了一堆方案,都没有针对 Word 的;唯一一个看起来能用的 ooxml-strict-converter,我这里还用不了😭
没办法,只能参照 [ooxml-strict-converter] 的实现对应着改一改了。
最终修改结果暂时能用

ooxml-strict-mappings.properties 文件

http://purl.oclc.org/ooxml/drawingml/chart=http://schemas.openxmlformats.org/drawingml/2006/chart
http://purl.oclc.org/ooxml/drawingml/chartDrawing=http://schemas.openxmlformats.org/drawingml/2006/chartDrawing
http://purl.oclc.org/ooxml/drawingml/diagram=http://schemas.openxmlformats.org/drawingml/2006/diagram
http://purl.oclc.org/ooxml/drawingml/lockedCanvas=http://schemas.openxmlformats.org/drawingml/2006/lockedCanvas
http://purl.oclc.org/ooxml/drawingml/main=http://schemas.openxmlformats.org/drawingml/2006/main
http://purl.oclc.org/ooxml/drawingml/picture=http://schemas.openxmlformats.org/drawingml/2006/picture
http://purl.oclc.org/ooxml/drawingml/spreadsheetDrawing=http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing
http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing=http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing
http://purl.oclc.org/ooxml/officeDocument/bibliography=http://schemas.openxmlformats.org/officeDocument/2006/bibliography
http://purl.oclc.org/ooxml/officeDocument/characteristics=http://schemas.openxmlformats.org/officeDocument/2006/characteristics
http://purl.oclc.org/ooxml/officeDocument/customProperties=http://schemas.openxmlformats.org/officeDocument/2006/custom-properties
http://purl.oclc.org/ooxml/officeDocument/customXml=http://schemas.openxmlformats.org/officeDocument/2006/customXml
http://purl.oclc.org/ooxml/officeDocument/docPropsVTypes=http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes
http://purl.oclc.org/ooxml/officeDocument/extendedProperties=http://schemas.openxmlformats.org/officeDocument/2006/extended-properties
http://purl.oclc.org/ooxml/officeDocument/math=http://schemas.openxmlformats.org/officeDocument/2006/math
http://purl.oclc.org/ooxml/officeDocument/relationships=http://schemas.openxmlformats.org/officeDocument/2006/relationships
http://purl.oclc.org/ooxml/presentationml/main=http://schemas.openxmlformats.org/presentationml/2006/main
http://purl.oclc.org/ooxml/schemaLibrary/main=http://schemas.openxmlformats.org/schemaLibrary/2006/main
http://purl.oclc.org/ooxml/spreadsheetml/main=http://schemas.openxmlformats.org/spreadsheetml/2006/main
http://purl.oclc.org/ooxml/wordprocessingml/main=http://schemas.openxmlformats.org/wordprocessingml/2006/main
http://purl.oclc.org/ooxml/officeDocument/relationships/extendedProperties=http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties
http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument=http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument

OoXmlStrictConverter.java 文件

import java.io.BufferedReader;
import java.io.FilterInputStream;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Properties;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.Namespace;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.springframework.util.StringUtils;

/**
 * Converts an OOXML package written in the "Strict" flavour into one that uses the
 * "Transitional" namespace URIs, by streaming every ZIP entry of the input into a new ZIP
 * and rewriting namespace URIs of XML parts according to a mapping table.
 *
 * <p>Entries whose name ends in {@code .xml}, {@code .vml} or {@code .rels} are parsed with
 * StAX and rewritten; all other entries are copied byte for byte.
 *
 * <p>Typical use: {@code transform(in, out, readMappings())}. The output factory is used
 * with its default settings; no namespace-repairing property is set by this class.
 */
public class OoXmlStrictConverter {

    /** Classpath location of the strict-to-transitional URI table read by {@link #readMappings()}. */
    private static final String MAPPINGS_RESOURCE = "/ooxml-strict-mappings.properties";

    /** Encoding used for every rewritten XML part and for reading the mapping resource. */
    private static final String UTF_8 = "UTF-8";

    private static final XMLEventFactory XEF = XMLEventFactory.newInstance();
    private static final XMLInputFactory XIF = newHardenedInputFactory();
    private static final XMLOutputFactory XOF = XMLOutputFactory.newInstance();

    /** Name of the root-element attribute that marks a part as Strict. */
    private static final QName CONFORMANCE = new QName("conformance");

    /**
     * Creates the shared input factory with DTD processing and external entity resolution
     * switched off, because package parts come from untrusted documents.
     */
    private static XMLInputFactory newHardenedInputFactory() {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        return factory;
    }

    /**
     * Reads a ZIP package from {@code inFile}, rewrites its XML parts and writes the result
     * as a new ZIP package to {@code outFile}.
     *
     * <p>Both streams are closed when this method returns, normally or exceptionally.
     *
     * @param inFile   ZIP-formatted source package
     * @param outFile  destination for the converted package
     * @param mappings strict namespace URI to replacement URI; also applied to attribute values
     * @throws IOException if an XML part cannot be parsed or written (the cause is attached),
     *                     or if reading/writing the ZIP streams fails
     * @throws Exception   declared for backward compatibility with existing callers
     */
    public static void transform(InputStream inFile, OutputStream outFile, Properties mappings) throws Exception {
        System.out.println("transforming " + inFile + " to " + outFile);
        try (ZipInputStream zis = new ZipInputStream(inFile);
                ZipOutputStream zos = new ZipOutputStream(outFile)) {
            // The StAX reader/writer are closed per entry; these wrappers keep that from
            // closing the underlying ZIP streams, which must stay open for the next entry.
            InputStream entryIn = new NonClosingInputStream(zis);
            OutputStream entryOut = new NonClosingOutputStream(zos);

            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                String entryName = entry.getName();
                zos.putNextEntry(new ZipEntry(entryName));
                if (isXml(entryName)) {
                    try {
                        rewriteXml(entryIn, entryOut, mappings);
                    } catch (javax.xml.stream.XMLStreamException | RuntimeException e) {
                        throw new IOException("Problem parsing " + entryName, e);
                    }
                } else {
                    copy(entryIn, entryOut);
                }
                zis.closeEntry();
                zos.closeEntry();
            }
        }
    }

    /**
     * Streams one XML part from {@code in} to {@code out}, replacing mapped namespace URIs on
     * element names, attribute names, attribute values and namespace declarations.
     */
    private static void rewriteXml(InputStream in, OutputStream out, Properties mappings)
            throws javax.xml.stream.XMLStreamException {
        XMLEventReader reader = XIF.createXMLEventReader(in);
        // Request UTF-8 explicitly so the bytes written do not depend on the writer's default encoding.
        XMLEventWriter writer = XOF.createXMLEventWriter(out, UTF_8);
        int depth = 0;
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (event.isStartElement()) {
                StartElement start = event.asStartElement();
                boolean isRoot = depth == 0;
                event = XEF.createStartElement(
                        updateQName(start.getName(), mappings),
                        processAttributes(start.getAttributes(), mappings,
                                start.getName().getNamespaceURI(), isRoot),
                        processNamespaces(start.getNamespaces(), mappings));
                depth++;
            } else if (event.isEndElement()) {
                EndElement end = event.asEndElement();
                event = XEF.createEndElement(
                        updateQName(end.getName(), mappings),
                        processNamespaces(end.getNamespaces(), mappings));
                depth--;
            }
            writer.add(event);
        }
        reader.close();
        writer.close();
    }

    /** Returns true when the entry name has an {@code xml}, {@code vml} or {@code rels} extension. */
    private static boolean isXml(final String fileName) {
        if (!hasText(fileName)) {
            return false;
        }
        int dot = fileName.lastIndexOf('.');
        if (dot == -1) {
            return false;
        }
        String ext = fileName.substring(dot + 1).toLowerCase();
        return ext.equals("xml") || ext.equals("vml") || ext.equals("rels");
    }

    /**
     * Rewrites the attributes of one element.
     *
     * <p>Attribute names get their namespace URI mapped; attribute values that exactly match a
     * mapping key are replaced by the mapped value. On the root element of a part whose
     * namespace is a mapped (strict) one, the {@code conformance} attribute is dropped.
     */
    private static Iterator<Attribute> processAttributes(final Iterator<Attribute> iter,
            final Properties mappings, final String elementNamespaceUri, final boolean rootElement) {
        ArrayList<Attribute> list = new ArrayList<>();
        boolean strictRoot = rootElement && elementNamespaceUri != null
                && mappings.containsKey(elementNamespaceUri);
        while (iter.hasNext()) {
            Attribute att = iter.next();
            if (strictRoot && isConformanceAttribute(att.getName(), elementNamespaceUri)) {
                continue; // the converted part is no longer Strict, so the marker is dropped
            }
            QName mappedName = updateQName(att.getName(), mappings);
            String value = att.getValue();
            String mappedValue = mappings.getProperty(value);
            list.add(XEF.createAttribute(mappedName, hasText(mappedValue) ? mappedValue : value));
        }
        return Collections.unmodifiableList(list).iterator();
    }

    /**
     * Matches the {@code conformance} attribute both unqualified and qualified with the
     * owning element's namespace (e.g. {@code w:conformance}).
     */
    private static boolean isConformanceAttribute(QName name, String elementNamespaceUri) {
        if (!CONFORMANCE.getLocalPart().equals(name.getLocalPart())) {
            return false;
        }
        String uri = name.getNamespaceURI();
        return uri == null || uri.isEmpty() || uri.equals(elementNamespaceUri);
    }

    /** Rewrites namespace declarations, keeping each prefix and swapping in the mapped URI. */
    private static Iterator<Namespace> processNamespaces(final Iterator<Namespace> iter,
            final Properties mappings) {
        ArrayList<Namespace> list = new ArrayList<>();
        while (iter.hasNext()) {
            Namespace ns = iter.next();
            String namespaceUri = ns.getNamespaceURI();
            if (hasText(namespaceUri)) {
                String mappedUri = mappings.getProperty(namespaceUri);
                if (mappedUri != null) {
                    ns = hasText(ns.getPrefix())
                            ? XEF.createNamespace(ns.getPrefix(), mappedUri)
                            : XEF.createNamespace(mappedUri);
                }
            }
            list.add(ns);
        }
        return Collections.unmodifiableList(list).iterator();
    }

    /** Returns {@code qn} with its namespace URI replaced when a mapping exists, else {@code qn}. */
    private static QName updateQName(QName qn, Properties mappings) {
        String namespaceUri = qn.getNamespaceURI();
        if (!hasText(namespaceUri)) {
            return qn;
        }
        String mappedUri = mappings.getProperty(namespaceUri);
        if (mappedUri == null) {
            return qn;
        }
        return hasText(qn.getPrefix())
                ? new QName(mappedUri, qn.getLocalPart(), qn.getPrefix())
                : new QName(mappedUri, qn.getLocalPart());
    }

    /**
     * Loads the URI mapping table from the classpath resource
     * {@code /ooxml-strict-mappings.properties}.
     *
     * <p>The file is parsed by hand, one {@code key=value} pair per line split at the first
     * {@code '='}, rather than with {@link Properties#load}: the keys are URLs, and the
     * properties format would treat the {@code ':'} in them as the key/value separator.
     * Blank lines and lines starting with {@code '#'} are ignored; a line without
     * {@code '='} maps its text to the empty string.
     *
     * @return the loaded mappings
     * @throws IOException if the resource is missing or cannot be read
     */
    public static Properties readMappings() throws IOException {
        Properties props = new Properties();
        try (InputStream is = OoXmlStrictConverter.class.getResourceAsStream(MAPPINGS_RESOURCE)) {
            if (is == null) {
                throw new IOException("Mapping resource not found on classpath: " + MAPPINGS_RESOURCE);
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8));
            String line;
            while ((line = reader.readLine()) != null) {
                String entry = line.trim();
                if (entry.isEmpty() || entry.startsWith("#")) {
                    continue;
                }
                int eq = entry.indexOf('=');
                String key = (eq == -1) ? entry : entry.substring(0, eq).trim();
                String value = (eq == -1) ? "" : entry.substring(eq + 1).trim();
                if (!key.isEmpty()) {
                    props.setProperty(key, value);
                }
            }
        }
        return props;
    }

    /** Copies all remaining bytes of {@code inp} to {@code out}; neither stream is closed. */
    private static void copy(InputStream inp, OutputStream out) throws IOException {
        byte[] buff = new byte[4096];
        int count;
        while ((count = inp.read(buff)) != -1) {
            if (count > 0) {
                out.write(buff, 0, count);
            }
        }
    }

    /** Returns true when {@code text} is non-null and contains at least one non-whitespace char. */
    private static boolean hasText(String text) {
        if (text == null) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /** Input wrapper whose {@code close()} does nothing, so the ZIP stream stays open. */
    private static final class NonClosingInputStream extends FilterInputStream {
        NonClosingInputStream(InputStream in) {
            super(in);
        }

        @Override
        public void close() throws IOException {
            // intentionally empty: the ZIP stream is closed by transform()
        }
    }

    /** Output wrapper whose {@code close()} does nothing, so the ZIP stream stays open. */
    private static final class NonClosingOutputStream extends FilterOutputStream {
        NonClosingOutputStream(OutputStream out) {
            super(out);
        }

        @Override
        public void write(byte[] b, int off, int len) throws IOException {
            // FilterOutputStream's default writes one byte at a time; pass the chunk through.
            out.write(b, off, len);
        }

        @Override
        public void close() throws IOException {
            // intentionally empty: the ZIP stream is closed by transform()
        }
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值