一开始,也是百思不得其解,相关的资料也是少之又少。
刚开始的思路也是先读取PDF表格线条的坐标,再根据坐标定位其中的文字信息,从而读取出来。
但这种做法实现起来比较复杂,写到一半就写不下去了,后来偶然发现了一个工具包:tabula
贴上官网地址:tabula.technology
GitHub地址:https://github.com/tabulapdf/tabula
思路差不多,关键是解决了问题。
本文使用该工具包,并以支付宝流水和微信流水的PDF文件作为实战案例,有需要的可以直接拿走。只要支付宝、微信官方没有变更文档格式,截至目前,代码基本可以直接使用。
1.引入依赖包
<!-- tabula: the PDF table extraction library used by the reader class below -->
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.5</version>
<exclusions>
<!-- Excludes the slf4j-simple artifact that would otherwise come in transitively;
     presumably to avoid clashing with the application's own SLF4J binding (confirm for your project) -->
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
2.读取PDF表格数据的工具类
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Reads ruled tables out of a PDF file with tabula and exposes every data row either as a
 * {@code Map} keyed by header name or as a bean of type {@code T}.
 * <p>
 * The columns are described by the {@link TableTitle} annotations on the fields of the bean
 * class. In addition, a range of text lines from one page (the "key info") is captured on every
 * read and is available through {@link #getKeyInfo()}.
 *
 * @param <T> bean type that one table row is mapped to
 */
public class PDFTableReader<T> {
    /** Line-wrap character that is removed from (or replaced in) the text of a table cell. */
    private final static String LINE_WRAP = "\r";
    /** Regex matching any line terminator in the stripped page text, whatever the platform. */
    private final static String TEXT_LINE_BREAK = "\\r\\n|\\r|\\n";
    /**
     * Buffer collecting the rows produced by every read performed through this instance.
     */
    private final List<Map<String, String>> dataBuff = new ArrayList<>();
    private final transient File pdfFile;
    /**
     * Column descriptors, one per (non-synthetic) declared field of the bean class.
     */
    private final PDFTitle[] pdfTitles;
    /**
     * Index of the header row; defaults to the first row.
     */
    private int titleRowIndex = 0;
    /**
     * Key information text captured by the last read.
     */
    private String keyInfo;
    /**
     * Index of the page holding the key information, starting at 0.
     */
    private int keyInfoPageNumber;
    /**
     * Index of the first key information line, starting at 0.
     */
    private int keyInfoLineStartIndex;
    /**
     * Index of the last key information line (inclusive), starting at 0.
     */
    private int keyInfoLineEndIndex;
    /** Bean class that table rows are mapped to. */
    private final Class<T> beanClass;

    /**
     * Collects the column descriptors from the {@link TableTitle} annotations of the bean class.
     *
     * @param pdfFile   PDF file to read
     * @param beanClass bean class describing the table columns
     * @throws RuntimeException if {@code beanClass} is null or one of its declared fields
     *                          carries no {@link TableTitle} annotation
     */
    private PDFTableReader(File pdfFile, Class<T> beanClass) {
        // Validate before touching the class so the failure is the intended message, not an NPE.
        if (beanClass == null) {
            throw new RuntimeException("必须指定读取表格数据的实体类");
        }
        this.pdfFile = pdfFile;
        this.beanClass = beanClass;
        List<PDFTitle> titles = new ArrayList<>();
        for (Field field : beanClass.getDeclaredFields()) {
            // Synthetic fields are generated by the compiler or by instrumentation tools and can
            // never carry the annotation; they must not make the reader unusable.
            if (field.isSynthetic()) {
                continue;
            }
            TableTitle title = field.getAnnotation(TableTitle.class);
            if (title == null) {
                throw new RuntimeException(field.getName() + " attribute miss annotation TableTitle");
            }
            titles.add(new PDFTitle(title.cellIndex(), title.value(), title.trimSpace()));
        }
        this.pdfTitles = titles.toArray(new PDFTitle[0]);
    }

    /**
     * Creates a reader for the given PDF file and bean class.
     *
     * @param pdfFile   PDF file to read
     * @param beanClass bean class whose fields are annotated with {@link TableTitle}
     * @return a new reader
     * @throws RuntimeException if {@code beanClass} is null or has a field without {@link TableTitle}
     */
    public static <T> PDFTableReader<T> load(File pdfFile, Class<T> beanClass) {
        return new PDFTableReader<T>(pdfFile, beanClass);
    }

    /**
     * Sets the index of the header row of the PDF table.
     *
     * @param titleRowIndex index of the header row, starting at 0
     * @return this reader, for chaining
     */
    public PDFTableReader<T> titleRowIndex(int titleRowIndex) {
        this.titleRowIndex = titleRowIndex;
        return this;
    }

    /**
     * Sets where the key information is located in the PDF.
     *
     * @param keyInfoPageNumber     index of the page holding the key information, starting at 0
     * @param keyInfoLineStartIndex index of the first key information line, starting at 0
     * @param keyInfoLineEndIndex   index of the last key information line (inclusive), starting at 0
     * @return this reader, for chaining
     */
    public PDFTableReader<T> describe(int keyInfoPageNumber, int keyInfoLineStartIndex, int keyInfoLineEndIndex) {
        this.keyInfoPageNumber = keyInfoPageNumber;
        this.keyInfoLineStartIndex = keyInfoLineStartIndex;
        this.keyInfoLineEndIndex = keyInfoLineEndIndex;
        return this;
    }

    /**
     * Reads the PDF and returns every data row as a map from header name to cell text.
     *
     * @return the data rows in document order
     * @throws RuntimeException if the file cannot be read or the configured indexes do not fit
     */
    public List<Map<String, String>> readMaps() {
        return this.doRead();
    }

    /**
     * Reads the PDF and returns every data row as a new instance of the bean class; each field
     * annotated with {@link TableTitle} receives the cell text stored under its header name.
     *
     * @return the data rows in document order
     * @throws RuntimeException if the file cannot be read, the configured indexes do not fit, or
     *                          a bean cannot be instantiated or populated
     */
    public List<T> readBeans() {
        List<Map<String, String>> maps = this.doRead();
        List<T> beans = new ArrayList<>(maps.size());
        // The declared fields are the same for every row, so resolve them once.
        Field[] fields = this.beanClass.getDeclaredFields();
        try {
            for (Map<String, String> map : maps) {
                // Class.newInstance() is deprecated; go through the no-arg constructor instead.
                T bean = this.beanClass.getDeclaredConstructor().newInstance();
                for (Field field : fields) {
                    TableTitle title = field.getAnnotation(TableTitle.class);
                    if (title == null) {
                        continue;
                    }
                    field.setAccessible(true);
                    field.set(bean, map.get(title.value()));
                }
                beans.add(bean);
            }
            return beans;
        } catch (ReflectiveOperationException | IllegalArgumentException e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Loads the document, captures the key information and extracts the table rows.
     * <p>
     * Rows are counted across all tables of all pages; the rows up to and including
     * {@code titleRowIndex} are skipped exactly once, at the start of the document.
     *
     * @return the data rows read by this call (they are also appended to the data buffer)
     */
    @SuppressWarnings("rawtypes") // tabula's Table#getRows() exposes raw RectangularTextContainer
    private List<Map<String, String>> doRead() {
        try (PDDocument document = PDDocument.load(pdfFile)) {
            // Key information first.
            this.doReadKeyInfo(document);
            // Then the tables.
            List<Map<String, String>> maps = new ArrayList<>();
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            PageIterator pi = new ObjectExtractor(document).extract();
            int rowNum = 0; // running row counter over the whole document
            while (pi.hasNext()) {
                Page page = pi.next();
                List<Table> tables = sea.extract(page);
                for (Table table : tables) {
                    List<List<RectangularTextContainer>> rows = table.getRows();
                    // Only the table that still has to supply the header rows must be tall
                    // enough; once the header has been skipped, a short table (for example a
                    // last page holding a few rows) is plain data and must not abort the read.
                    if (rowNum <= titleRowIndex && rows.size() <= titleRowIndex) {
                        throw new RuntimeException("标题行不正确");
                    }
                    for (List<RectangularTextContainer> row : rows) {
                        if (rowNum > titleRowIndex) {
                            maps.add(this.toCellMap(row));
                        }
                        rowNum++;
                    }
                }
            }
            this.dataBuff.addAll(maps);
            return maps;
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts one table row into a map from header name to cell text. Cell {@code k} is matched
     * with the {@code k}-th column descriptor and ignored when that descriptor declares another
     * cell index; cells beyond the last descriptor are ignored as well.
     */
    @SuppressWarnings("rawtypes") // see doRead()
    private Map<String, String> toCellMap(List<RectangularTextContainer> row) {
        Map<String, String> cellMap = new HashMap<>(row.size());
        // A row may hold more cells than the bean has fields; never index past the descriptors.
        int columns = Math.min(row.size(), pdfTitles.length);
        for (int k = 0; k < columns; k++) {
            PDFTitle pdfTitle = pdfTitles[k];
            if (pdfTitle.getCellIndex() != k) {
                continue;
            }
            String cellText = row.get(k).getText();
            cellText = cellText == null ? "" : cellText.trim();
            // Wrapped lines are either glued together or separated by a single space.
            String replacement = pdfTitle.isTrimSpace() ? "" : " ";
            cellMap.put(pdfTitle.getName(), cellText.replace(LINE_WRAP, replacement));
        }
        return cellMap;
    }

    /**
     * Extracts the text of the key information page, position-sorted, and stores the trimmed,
     * non-blank lines of the configured range, concatenated, as the key information.
     *
     * @throws RuntimeException if the configured line range lies outside the lines of the page
     */
    private void doReadKeyInfo(PDDocument document) throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true); // read the text line by line (the default is false)
        // PDFTextStripper counts pages from 1 while keyInfoPageNumber counts from 0. Restricting
        // both bounds to the same page keeps the line indexes relative to that page only.
        int pdfPage = keyInfoPageNumber + 1;
        stripper.setStartPage(pdfPage);
        stripper.setEndPage(pdfPage);
        // The stripper separates lines with the platform line separator, so split on any line
        // terminator instead of assuming "\r\n".
        String[] split = stripper.getText(document).split(TEXT_LINE_BREAK);
        if (keyInfoLineStartIndex < 0
                || keyInfoLineStartIndex >= split.length
                || keyInfoLineEndIndex >= split.length) {
            throw new RuntimeException("关键信息所在列已超出读取页码行的范围");
        }
        StringBuilder r = new StringBuilder();
        for (int i = keyInfoLineStartIndex; i <= keyInfoLineEndIndex; i++) {
            String line = split[i].trim();
            if (!line.isEmpty()) {
                r.append(line);
            }
        }
        this.keyInfo = r.toString();
    }

    /**
     * Returns the internal buffer holding the rows of every read performed so far.
     */
    public List<Map<String, String>> getDataBuff() {
        return dataBuff;
    }

    /**
     * Returns the key information captured by the last read, or null if nothing was read yet.
     */
    public String getKeyInfo() {
        return keyInfo;
    }

    /** Column descriptor derived from one {@link TableTitle} annotation. */
    @Getter
    @AllArgsConstructor
    static class PDFTitle {
        /** Index of the column, starting at 0. */
        private final int cellIndex;
        /** Header name of the column. */
        private final String name;
        /** Whether line wraps in the cell are removed instead of replaced by a space. */
        private final boolean trimSpace;
    }
}
import java.lang.annotation.*;
/**
 * Describes the PDF table header that a bean field is bound to.
 * <p>
 * The annotation is retained at runtime and can only be placed on fields.
 */
@Documented
@Target(ElementType.FIELD)
@Retention(RetentionPolicy.RUNTIME)
public @interface TableTitle {
/**
 * Header name of the column.
 */
String value();
/**
 * Index of the header column, starting at 0.
 */
int cellIndex();
/**
 * Whether whitespace is removed from the cell content; defaults to {@code true}.
 * PDFTableReader uses this flag to decide whether the line-wrap character inside a cell is
 * removed ({@code true}) or replaced by a single space ({@code false}).
 */
boolean trimSpace() default true;
}
3.实战
公用代码
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import java.util.List;
/**
 * Wrapper around a parsed statement: the holder's details, the statement period and the
 * detail rows. Getters, the all-args constructor and {@code toString} are generated by Lombok.
 *
 * @param <NT> type of a single statement detail row
 */
@ToString
@Getter
@AllArgsConstructor
public class FlowWrapper<NT> {
private String realName;// holder's real name
// NOTE(review): "NUmber" looks like a casing typo; the Lombok-generated getter name derives
// from the field name, so renaming it would change the public accessor.
private String idCardNUmber;// ID document number
private String account;// account
private String startDate;// start date of the statement period
private String endDate;// end date of the statement period
private List<NT> details;// statement detail rows
}
正则字段匹配工具
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Helpers for pulling regular-expression matches out of a piece of text.
 */
public class RegExUtil {
    /**
     * Finds the first substring of {@code line} matched by {@code regex}.
     *
     * @param regex regular expression to search for
     * @param line  text to search; may be null
     * @return the first match, or null when {@code line} is null or contains no match
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
     */
    public static String find(String regex, String line) {
        // Nothing can match in absent text; report "no match" instead of failing with an NPE.
        if (line == null) {
            return null;
        }
        Matcher matcher = Pattern.compile(regex).matcher(line);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Finds every substring of {@code line} matched by {@code regex}, in order of appearance.
     *
     * @param regex regular expression to search for
     * @param line  text to search; may be null
     * @return all matches; an empty array when {@code line} is null or contains no match
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
     */
    public static String[] findMulti(String regex, String line) {
        if (line == null) {
            return new String[0];
        }
        Matcher matcher = Pattern.compile(regex).matcher(line);
        List<String> result = new ArrayList<>();
        while (matcher.find()) {
            result.add(matcher.group());
        }
        return result.toArray(new String[0]);
    }
}
3.1支付宝流水PDF文件读取实例
实体
import cn.cosmosx.base.pdf.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
/**
 * One row of the Alipay statement table. Every field is bound to a table column through
 * {@code @TableTitle} (header name plus 0-based column index); all values are kept as text.
 */
@ToString
@Getter
@Setter
public class AlipayFlow {
// Income / expense marker
@TableTitle(value = "收/支", cellIndex = 0)
private String outInType;
// Counterparty of the transaction
@TableTitle(value = "交易对方", cellIndex = 1)
private String counterpart;
// Description of the goods
@TableTitle(value = "商品说明", cellIndex = 2)
private String goodsDesc;
// Receiving / paying method
@TableTitle(value = "收/付款方式", cellIndex = 3)
private String payOrInMethod;
// Amount
@TableTitle(value = "金额", cellIndex = 4)
private String amount;
// Transaction order number
@TableTitle(value = "交易订单号", cellIndex = 5)
private String tradeOrder;
// Merchant order number
@TableTitle(value = "商家订单号", cellIndex = 6)
private String merchantOrder;
// Transaction time; trimSpace = false is the only non-default setting in this bean
@TableTitle(value = "交易时间", cellIndex = 7, trimSpace = false)
private String tradeDate;
}
读取工具
import cn.cosmosx.base.pdf.PDFTableReader;
import cn.cosmosx.base.pdf.RegExUtil;
import cn.cosmosx.base.pdf.bean.AlipayFlow;
import java.io.File;
import java.util.List;
/**
 * Reads an Alipay statement PDF into a {@code FlowWrapper} of {@code AlipayFlow} rows.
 */
public class AlipayFlowUtil {
    // Regex reminder: (?<=X) is a look-behind (match what follows X),
    // (?=X) is a look-ahead (match what precedes X).
    // Also note that '|' inside [...] is a literal character, not an alternation.

    /** Non-blank run between a colon (ASCII or full-width) and an opening parenthesis. */
    private static final String NAME_REG = "(?<=[:|:])\\S+(?=[\\(|(])";
    /** Non-blank run after "(" + up to 10 non-blank characters + colon, up to a closing parenthesis. */
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[:|:]))\\S+(?=[\\)|)])";
    /** Non-blank run after "账号" or "微信号:" and before the character "中". */
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7:))\\S+(?=[\\u4e2d])";
    /** Timestamp of the form yyyy-M-d H:m:s. */
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Parses the statement at the given path. The key information is taken from lines 3 to 6 of
     * the first page, and the table header is expected on row index 2.
     *
     * @param pdfFilePath path of the Alipay statement PDF
     * @return the holder details, the statement period (null dates when fewer than two
     *         timestamps are found) and the detail rows
     */
    public static FlowWrapper<AlipayFlow> read(String pdfFilePath) {
        File source = new File(pdfFilePath);
        PDFTableReader<AlipayFlow> tableReader = PDFTableReader.load(source, AlipayFlow.class);
        tableReader.describe(0, 3, 6);
        tableReader.titleRowIndex(2);

        List<AlipayFlow> details = tableReader.readBeans();
        String header = tableReader.getKeyInfo();

        // The period is only reported when both of its bounds were found.
        String[] timestamps = RegExUtil.findMulti(DATE_REG, header);
        boolean hasPeriod = timestamps.length > 1;

        return new FlowWrapper<>(
                RegExUtil.find(NAME_REG, header),
                RegExUtil.find(ID_REG, header),
                RegExUtil.find(ACCOUNT_REG, header),
                hasPeriod ? timestamps[0] : null,
                hasPeriod ? timestamps[1] : null,
                details);
    }
}
3.2微信流水PDF文件读取实例
实体
import cn.cosmosx.base.pdf.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
/**
 * One row of the WeChat statement table. Every field is bound to a table column through
 * {@code @TableTitle} (header name plus 0-based column index); all values are kept as text.
 */
@ToString
@Getter
@Setter
public class WxFlow {
// Transaction order number
@TableTitle(value = "交易订单号", cellIndex = 0)
private String tradeOrder;
// Transaction time; trimSpace = false is the only non-default setting in this bean
@TableTitle(value = "交易时间", cellIndex = 1, trimSpace = false)
private String tradeDate;
// Transaction type
@TableTitle(value = "交易类型", cellIndex = 2)
private String tradeType;
// Income / expense / other marker
@TableTitle(value = "收/支/其他", cellIndex = 3)
private String outInType;
// Transaction method
@TableTitle(value = "交易方式", cellIndex = 4)
private String tradeMethod;
// Amount (yuan)
@TableTitle(value = "金额(元)", cellIndex = 5)
private String amount;
// Counterparty of the transaction
@TableTitle(value = "交易对方", cellIndex = 6)
private String counterpart;
// Merchant order number
@TableTitle(value = "商家单号", cellIndex = 7)
private String merchantOrder;
}
读取工具
import cn.cosmosx.base.pdf.PDFTableReader;
import cn.cosmosx.base.pdf.RegExUtil;
import cn.cosmosx.base.pdf.bean.WxFlow;
import java.io.File;
import java.util.List;
/**
 * Reads a WeChat statement PDF into a {@code FlowWrapper} of {@code WxFlow} rows.
 */
public class WxFlowUtil {
    // Regex reminder: (?<=X) is a look-behind (match what follows X),
    // (?=X) is a look-ahead (match what precedes X).
    // Also note that '|' inside [...] is a literal character, not an alternation.

    /** Non-blank run between a colon (ASCII or full-width) and an opening parenthesis. */
    private static final String NAME_REG = "(?<=[:|:])\\S+(?=[\\(|(])";
    /** Non-blank run after "(" + up to 10 non-blank characters + colon, up to a closing parenthesis. */
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[:|:]))\\S+(?=[\\)|)])";
    /** Non-blank run after "账号" or "微信号:" and before the character "中". */
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7:))\\S+(?=[\\u4e2d])";
    /** Timestamp of the form yyyy-M-d H:m:s. */
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Parses the statement at the given path. The key information is taken from lines 2 to 4 of
     * the first page, and the table header is expected on row index 2.
     *
     * @param pdfFilePath path of the WeChat statement PDF
     * @return the holder details, the statement period (null dates when fewer than two
     *         timestamps are found) and the detail rows
     */
    public static FlowWrapper<WxFlow> read(String pdfFilePath) {
        File source = new File(pdfFilePath);
        PDFTableReader<WxFlow> tableReader = PDFTableReader.load(source, WxFlow.class);
        tableReader.describe(0, 2, 4);
        tableReader.titleRowIndex(2);

        List<WxFlow> details = tableReader.readBeans();
        String header = tableReader.getKeyInfo();

        // The period is only reported when both of its bounds were found.
        String[] timestamps = RegExUtil.findMulti(DATE_REG, header);
        boolean hasPeriod = timestamps.length > 1;

        return new FlowWrapper<>(
                RegExUtil.find(NAME_REG, header),
                RegExUtil.find(ID_REG, header),
                RegExUtil.find(ACCOUNT_REG, header),
                hasPeriod ? timestamps[0] : null,
                hasPeriod ? timestamps[1] : null,
                details);
    }
}
至此,已全部完成!
若有更好的方式,欢迎大家推荐。