如何将PDF文件按页分割并解析每页的二维码内容
1.创建文件页存储对象
/**
 * Holds the outcome of processing one page of a split PDF: the page number, the text
 * extracted for that page, and whether the extraction succeeded.
 *
 * NOTE(review): the fields are public even though Lombok already generates accessors for
 * them; confirm whether direct field access is relied on before tightening visibility.
 */
@Getter
@Setter
@ToString
public class DivisionPDFResult {
/**
* Page number
*/
public int pageIndex;
/**
* Page content as a string
*/
public String text;
/**
* Whether the page was processed successfully
*/
public Boolean isOk;
}
2.主方法
/**
 * Splits a PDF by page and decodes the QR code found on each page.
 *
 * <p>For every page the embedded images are tried first; when none of them yields a QR code,
 * the page itself is rendered to an image and decoded. A page whose QR code cannot be decoded
 * is still reported, with {@code isOk == false} and an empty text, so one bad page does not
 * stop the remaining pages from being processed.
 *
 * @param multipartFile the uploaded PDF file
 * @return one result per processed page, in page order with 1-based page indexes; contains
 *         only the pages handled so far (possibly none) when the document cannot be read
 */
public static List<DivisionPDFResult> divisionPDFByPage(MultipartFile multipartFile) {
    List<DivisionPDFResult> result = new ArrayList<>();
    // try-with-resources closes both the upload stream and the document, even on failure.
    try (java.io.InputStream input = multipartFile.getInputStream();
            PDDocument document = PDDocument.load(input)) {
        PDPageTree pages = document.getDocumentCatalog().getPages();
        int pageCount = pages.getCount();
        logger.info("+++++++++++++获取到媒体个数+++++++++++++++++++++++++++++++++++++++++++++++++" + pageCount);
        // Neither object depends on the current page, so build them once.
        QRCodeReader reader = new QRCodeReader();
        PDFRenderer renderer = new PDFRenderer(document);
        for (int i = 0; i < pageCount; i++) {
            DivisionPDFResult pageResult = new DivisionPDFResult();
            pageResult.setPageIndex(i + 1);
            pageResult.setIsOk(true);
            String text = StringUtil.EMPTY;
            // A decode failure only marks this page as failed; the loop keeps going.
            try {
                text = decodeEmbeddedQrCode(pages.get(i).getResources(), reader);
                if (StringUtil.isBlank(text)) {
                    // No embedded image held a QR code: render the CURRENT page (not page 0)
                    // to an image and try to decode that instead.
                    BufferedImage rendered = renderer.renderImageWithDPI(i, 296);
                    text = reader.decode(toBinaryBitmap(rendered)).getText();
                }
            } catch (Exception e) {
                logger.warn("QR code decode failed on page " + (i + 1), e);
                pageResult.setIsOk(false);
                text = StringUtil.EMPTY;
            }
            pageResult.setText(text);
            result.add(pageResult);
        }
    } catch (Exception e) {
        logger.error("divisionPDFByPage error", e);
    }
    return result;
}

/**
 * Returns the text of the first embedded image that decodes as a QR code, or an empty string
 * when the page has no resources or none of its images is a readable QR code.
 */
private static String decodeEmbeddedQrCode(PDResources resources, QRCodeReader reader) {
    if (resources == null) {
        return StringUtil.EMPTY;
    }
    for (COSName name : resources.getXObjectNames()) {
        try {
            PDXObject xObject = resources.getXObject(name);
            if (xObject instanceof PDImageXObject) {
                BufferedImage image = ((PDImageXObject) xObject).getImage();
                return reader.decode(toBinaryBitmap(image)).getText();
            }
        } catch (Exception ignored) {
            // Expected for images that are not QR codes (or cannot be read): keep looking at
            // the remaining images so the caller can still fall back to rendering the page.
        }
    }
    return StringUtil.EMPTY;
}

/** Wraps an image in the binarized bitmap form that the ZXing reader consumes. */
private static BinaryBitmap toBinaryBitmap(BufferedImage image) {
    LuminanceSource source = new BufferedImageLuminanceSource(image);
    return new BinaryBitmap(new HybridBinarizer(source));
}
3.遍历处理解析结果
List<DivisionPDFResult> fileLoadList = PDFUtils.divisionPDFByPage(multipartFile);
// NOTE(review): the original statement ended in a stray ")" and looks like a truncated
// logging call; the JSON string is currently unused — confirm the intended use.
JSON.toJSONString(fileLoadList);
// Keep only the pages that were parsed successfully. The list is filtered in place because
// the result of a stream filter would otherwise be discarded; Boolean.TRUE.equals also
// tolerates a null isOk.
fileLoadList.removeIf(x -> !Boolean.TRUE.equals(x.getIsOk()));
if (fileLoadList.isEmpty()) {
    return mediaInvoiceList;
}
fileLoadList.forEach(x -> {
    MediaInvoice mediaInvoice = null;
    // The file here is laid out as a table, so split(",") yields the content of each cell.
    String[] content = x.getText().split(",");
    if (content.length > 0) {
        ...
        // data processing
    }
});
解析OFD文件:
/**
 * Extracts the invoice XML from an uploaded OFD file through the native {@code FPParser}.
 *
 * @param file the uploaded OFD file
 * @return the XML returned by the parser, or an empty string when the file cannot be read
 *         or the parser fails to open it
 */
public String invoiceXml(MultipartFile file) {
    String xml = "";
    long docHandler = 0;
    long startMillis = System.currentTimeMillis();
    try {
        logger.info("ofd解析xml开始------");
        docHandler = FPParser.openFile(file.getBytes());
        if (docHandler == 0) {
            // A zero handle means the document was not opened; there is nothing to read.
            logger.warn("invoiceXml openFile failed");
            return xml;
        }
        xml = FPParser.getData(docHandler);
        logger.info("ofd解析xml结束 {} 耗时{}------", xml, System.currentTimeMillis() - startMillis);
    } catch (IOException e) {
        logger.error("invoiceXml error", e);
    } finally {
        // Only release a handle that was actually obtained; never pass the invalid
        // zero handle to native code.
        if (docHandler != 0) {
            FPParser.closeFile(docHandler);
        }
    }
    return xml;
}
public class FPParser {
public static native long openFile(byte[] doc_src);
public static native String getData(long doc_handler);
public static native int closeFile(long doc_handler);
}
// Data-processing method
String xml = invoiceOfdUtilsService.invoiceXml(multipartFile);
if (StringUtils.isEmpty(xml)) {
    return null;
}
// TODO: convert the XML into a MediaInvoice object
MediaInvoice mediaInvoice = new MediaInvoice();
// NOTE(review): the XML is derived from an uploaded file; confirm that the dom4j version in
// use parses with DTDs/external entities disabled (XXE).
Document doc = DocumentHelper.parseText(xml);
Element root = doc.getRootElement();
List<Element> elements = root.elements();
for (Element element : elements) {
    // getName() takes no arguments: it returns the element's own name (the field name),
    // which is printed followed by the element's trimmed text.
    System.out.println(element.getName() + element.getTextTrim());
}