有些PDF文件采用XFA表单格式,其表单数据以XML形式存储,这类PDF需要通过XFA方式读取。下面是基于iText 7的Java实现:
package com.xxx.xxx.util.pdf;
import com.itextpdf.forms.PdfAcroForm;
import com.itextpdf.forms.xfa.XfaForm;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import java.io.File;
import java.io.FileOutputStream;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Utility for extracting the XFA form data of a PDF file into an XML file.
 *
 * @author xxx
 */
public class ReadXFAUtil {

    /** Local name of the child of the XFA datasets node that holds the form data. */
    private static final String DATA_NODE_NAME = "data";

    /**
     * Reads the XFA datasets of a PDF and writes them to a file as indented UTF-8 XML.
     *
     * <p>The first child of the datasets node whose local name is {@code data} is written.
     * When the datasets node has no such child, the whole datasets node is written instead.
     *
     * @param src  path of the PDF file to read
     * @param dest path of the XML file to create (an existing file is overwritten)
     * @throws IllegalArgumentException if the PDF contains no XFA datasets
     * @throws Exception if the PDF cannot be read or the XML cannot be written
     */
    public static void manipulatePdf(String src, String dest) throws Exception {
        // try-with-resources guarantees the document is closed even when extraction fails.
        try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(src))) {
            Node dataNode = findDataNode(pdfDoc, src);

            // The output file is only opened once there is something to write.
            try (FileOutputStream os = new FileOutputStream(dest)) {
                Transformer transformer = TransformerFactory.newInstance().newTransformer();
                transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                transformer.setOutputProperty(OutputKeys.INDENT, "yes");
                transformer.transform(new DOMSource(dataNode), new StreamResult(os));
            }
        }
    }

    /**
     * Locates the node to export: the {@code data} child of the XFA datasets node, or the
     * datasets node itself when it has no such child.
     */
    private static Node findDataNode(PdfDocument pdfDoc, String src) {
        // The document is opened read-only, so never ask iText to create a missing AcroForm.
        PdfAcroForm form = PdfAcroForm.getAcroForm(pdfDoc, false);
        XfaForm xfa = (form == null) ? null : form.getXfaForm();
        Node datasets = (xfa == null) ? null : xfa.getDatasetsNode();
        if (datasets == null) {
            throw new IllegalArgumentException("PDF contains no XFA datasets: " + src);
        }

        NodeList children = datasets.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (DATA_NODE_NAME.equals(child.getLocalName())) {
                return child;
            }
        }
        return datasets;
    }

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the source PDF path and {@code args[1]} the
     *             destination XML path; the built-in sample paths are used for any that
     *             are omitted
     * @throws Exception if the conversion fails
     */
    public static void main(String[] args) throws Exception {
        String src = args.length > 0 ? args[0] : "e:/4000-2021-6增值税申报表.pdf";
        String dest = args.length > 1 ? args[1] : "e:/1.xml";
        ReadXFAUtil.manipulatePdf(src, dest);
    }
}
如有问题,请私信。
xObP8s/gudi/zrPMoaJKU7K5u7e+s7/Os8yhokpBVkHP4LnYv86zzMjn0OjSqtKyv8nS1MGqz7VRUaGjDQoNCtf31d8gUVEgNDA0NTQwMjI5