本文介绍如何读取 PDF 中的表格数据,以微信、支付宝的流水证明为例展开,代码开箱即用。
1.依赖
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.5</version>
</dependency>
2.工具代码
(1)几个bean
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import java.util.List;
/**
 * Wrapper for one parsed statement: the account-holder details taken from
 * the document header plus the list of detail rows.
 *
 * @param <T> type of a single detail row
 */
@Getter
@ToString
@AllArgsConstructor
public class FlowWrapper<T> {

    /** Real name of the account holder. */
    private String realName;

    /**
     * Identity document number. The unusual capitalisation is kept on purpose:
     * the Lombok-generated accessor {@code getIdCardNUmber()} derives from it.
     */
    private String idCardNUmber;

    /** Account identifier. */
    private String account;

    /** Start date of the statement period. */
    private String startDate;

    /** End date of the statement period. */
    private String endDate;

    /** Detail rows of the statement. */
    private List<T> details;
}
import com.gynsh.utils.cashflow.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
/**
 * One data row of the Alipay statement table. Each field is bound to a table
 * column through {@link TableTitle}; the fields are declared in column order.
 */
@Getter
@Setter
@ToString
public class AlipayFlow {

    /** Column 0: income / expense marker. */
    @TableTitle(cellIndex = 0, value = "收/支")
    private String outInType;

    /** Column 1: counterparty. */
    @TableTitle(cellIndex = 1, value = "交易对方")
    private String counterpart;

    /** Column 2: goods description. */
    @TableTitle(cellIndex = 2, value = "商品说明")
    private String goodsDesc;

    /** Column 3: payment / receipt method. */
    @TableTitle(cellIndex = 3, value = "收/付款方式")
    private String payOrInMethod;

    /** Column 4: amount, kept as the raw cell text. */
    @TableTitle(cellIndex = 4, value = "金额")
    private String amount;

    /** Column 5: transaction order number. */
    @TableTitle(cellIndex = 5, value = "交易订单号")
    private String tradeOrder;

    /** Column 6: merchant order number. */
    @TableTitle(cellIndex = 6, value = "商家订单号")
    private String merchantOrder;

    /** Column 7: transaction time; line breaks in the cell become spaces. */
    @TableTitle(cellIndex = 7, value = "交易时间", trimSpace = false)
    private String tradeDate;
}
import com.gynsh.utils.cashflow.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
/**
 * One data row of the WeChat Pay statement table. Each field is bound to a
 * table column through {@link TableTitle}; the fields are declared in column
 * order.
 */
@Getter
@Setter
@ToString
public class WxFlow {

    /** Column 0: transaction order number. */
    @TableTitle(cellIndex = 0, value = "交易订单号")
    private String tradeOrder;

    /** Column 1: transaction time; line breaks in the cell become spaces. */
    @TableTitle(cellIndex = 1, value = "交易时间", trimSpace = false)
    private String tradeDate;

    /** Column 2: transaction type. */
    @TableTitle(cellIndex = 2, value = "交易类型")
    private String tradeType;

    /** Column 3: income / expense / other marker. */
    @TableTitle(cellIndex = 3, value = "收/支/其他")
    private String outInType;

    /** Column 4: transaction method. */
    @TableTitle(cellIndex = 4, value = "交易方式")
    private String tradeMethod;

    /** Column 5: amount, kept as the raw cell text. */
    @TableTitle(cellIndex = 5, value = "金额(元)")
    private String amount;

    /** Column 6: counterparty. */
    @TableTitle(cellIndex = 6, value = "交易对方")
    private String counterpart;

    /** Column 7: merchant order number. */
    @TableTitle(cellIndex = 7, value = "商家单号")
    private String merchantOrder;
}
(2)工具
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Reads the tables of a PDF file with tabula and turns every data row into a
 * {@code Map} (column title to cell text) or into a bean whose fields carry
 * {@link TableTitle} annotations. A range of text lines from one page (the
 * "key info") is captured as well on every read.
 *
 * @param <T> bean type a table row is mapped to
 */
final class PDFTableReader<T> {

    /** Line-break character that is stripped from (or replaced in) cell text. */
    private static final String LINE_WRAP = "\r";

    /**
     * Matches any line terminator in the text returned by the stripper, so the
     * key-info line indexes do not depend on the platform line separator.
     */
    private static final String LINE_BREAK_REGEX = "\\r\\n|\\r|\\n";

    /** Rows collected so far; every read appends its rows to this list. */
    private final List<Map<String, String>> dataBuff = new ArrayList<>();

    private final File pdfFile;

    /** Column descriptors derived from the bean's {@link TableTitle} annotations. */
    private final PDFTitle[] pdfTitles;

    /** Index of the header row, counted from 0; defaults to the first row. */
    private int titleRowIndex = 0;

    /** Key-info text captured by the most recent read. */
    private String keyInfo;

    /** Zero-based index of the page holding the key info. */
    private int keyInfoPageNumber;

    /** Zero-based index of the first key-info line on that page. */
    private int keyInfoLineStartIndex;

    /** Zero-based index of the last key-info line on that page (inclusive). */
    private int keyInfoLineEndIndex;

    /** Bean class the table rows are mapped to. */
    private final Class<T> beanClass;

    private PDFTableReader(File pdfFile, Class<T> beanClass) {
        if (beanClass == null) {
            throw new RuntimeException("必须指定读取表格数据的实体类");
        }
        this.pdfFile = pdfFile;
        this.beanClass = beanClass;

        Field[] fields = beanClass.getDeclaredFields();
        this.pdfTitles = new PDFTitle[fields.length];
        for (int i = 0; i < fields.length; i++) {
            Field field = fields[i];
            TableTitle title = field.getAnnotation(TableTitle.class);
            if (title == null) {
                throw new RuntimeException(field.getName() + " attribute miss annotation TableTitle");
            }
            pdfTitles[i] = new PDFTitle(title.cellIndex(), title.value(), title.trimSpace());
        }
    }

    /**
     * Creates a reader for the given file and row type.
     *
     * @param pdfFile   PDF file to read
     * @param beanClass bean class whose fields are all annotated with {@link TableTitle}
     * @return a new reader
     * @throws RuntimeException if {@code beanClass} is null or one of its fields lacks the annotation
     */
    public static <T> PDFTableReader<T> load(File pdfFile, Class<T> beanClass) {
        return new PDFTableReader<T>(pdfFile, beanClass);
    }

    /**
     * Sets the row index of the table header.
     *
     * @param titleRowIndex zero-based index of the header row
     * @return this reader, for chaining
     */
    public PDFTableReader<T> titleRowIndex(int titleRowIndex) {
        this.titleRowIndex = titleRowIndex;
        return this;
    }

    /**
     * Sets where the key info is located.
     *
     * @param keyInfoPageNumber     zero-based index of the page holding the key info
     * @param keyInfoLineStartIndex zero-based index of the first key-info line
     * @param keyInfoLineEndIndex   zero-based index of the last key-info line (inclusive)
     * @return this reader, for chaining
     */
    public PDFTableReader<T> describe(int keyInfoPageNumber, int keyInfoLineStartIndex, int keyInfoLineEndIndex) {
        this.keyInfoPageNumber = keyInfoPageNumber;
        this.keyInfoLineStartIndex = keyInfoLineStartIndex;
        this.keyInfoLineEndIndex = keyInfoLineEndIndex;
        return this;
    }

    /**
     * Reads the PDF and returns every data row as a map of column title to cell text.
     *
     * @return the rows read by this call
     * @throws RuntimeException if the file cannot be read or the header row is out of range
     */
    public List<Map<String, String>> readMaps() {
        return this.doRead();
    }

    /**
     * Reads the PDF and returns every data row as a bean. Each annotated field
     * receives the cell text stored under its {@link TableTitle#value()}.
     *
     * @return the beans created by this call
     * @throws RuntimeException if reading fails or a bean cannot be instantiated or populated
     */
    public List<T> readBeans() {
        List<Map<String, String>> maps = this.doRead();
        List<T> beans = new ArrayList<>(maps.size());
        Field[] fields = beanClass.getDeclaredFields();
        try {
            for (Map<String, String> map : maps) {
                T bean = beanClass.getDeclaredConstructor().newInstance();
                for (Field field : fields) {
                    TableTitle title = field.getAnnotation(TableTitle.class);
                    if (title == null) {
                        continue;
                    }
                    field.setAccessible(true);
                    field.set(bean, map.get(title.value()));
                }
                beans.add(bean);
            }
            return beans;
        } catch (ReflectiveOperationException e) {
            // Keep the cause so the failing constructor/field can be diagnosed.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /** Loads the document, captures the key info and collects all data rows. */
    private List<Map<String, String>> doRead() {
        try (PDDocument document = PDDocument.load(pdfFile)) {
            this.doReadKeyInfo(document);

            List<Map<String, String>> maps = new ArrayList<>();
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            PageIterator pi = new ObjectExtractor(document).extract();
            // Counts rows across all tables; rows up to the header row are skipped once.
            int rowNum = 0;
            while (pi.hasNext()) {
                Page page = pi.next();
                List<Table> tables = sea.extract(page);
                for (Table table : tables) {
                    List<List<RectangularTextContainer>> rows = table.getRows();
                    // Only a table that still has to contain the header is validated;
                    // a short table on a later page is legitimate data.
                    if (rowNum <= titleRowIndex && rows.size() <= titleRowIndex) {
                        throw new RuntimeException("标题行不正确");
                    }
                    for (List<RectangularTextContainer> row : rows) {
                        if (rowNum <= titleRowIndex) {
                            rowNum++;
                            continue;
                        }
                        maps.add(toCellMap(row));
                        rowNum++;
                    }
                }
            }
            this.dataBuff.addAll(maps);
            return maps;
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /** Converts one table row into a map of column title to cleaned cell text. */
    private Map<String, String> toCellMap(List<RectangularTextContainer> row) {
        Map<String, String> cellMap = new HashMap<>(row.size());
        for (int k = 0; k < row.size(); k++) {
            PDFTitle pdfTitle = findTitle(k);
            // Columns without a matching descriptor are ignored.
            if (pdfTitle == null) {
                continue;
            }
            RectangularTextContainer<?> cell = row.get(k);
            String cellText = cell.getText();
            cellText = cellText == null ? "" : cellText.trim();
            String lineBreakReplacement = pdfTitle.isTrimSpace() ? "" : " ";
            cellMap.put(pdfTitle.getName(), cellText.replace(LINE_WRAP, lineBreakReplacement));
        }
        return cellMap;
    }

    /** Returns the descriptor declared for the given column, or null if there is none. */
    private PDFTitle findTitle(int cellIndex) {
        for (PDFTitle title : pdfTitles) {
            if (title.getCellIndex() == cellIndex) {
                return title;
            }
        }
        return null;
    }

    /** Extracts the configured line range of the key-info page into {@link #keyInfo}. */
    private void doReadKeyInfo(PDDocument document) throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();
        // Sort by position so the text comes out line by line (default is false).
        stripper.setSortByPosition(true);
        // PDFTextStripper page numbers are 1-based; the configured index is 0-based.
        int pageNumber = keyInfoPageNumber + 1;
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);

        String[] lines = stripper.getText(document).split(LINE_BREAK_REGEX);
        if (keyInfoLineStartIndex < 0
                || keyInfoLineStartIndex >= lines.length
                || keyInfoLineEndIndex >= lines.length) {
            throw new RuntimeException("关键信息所在列已超出读取页码行的范围");
        }
        StringBuilder joined = new StringBuilder();
        for (int i = keyInfoLineStartIndex; i <= keyInfoLineEndIndex; i++) {
            String line = lines[i].trim();
            if (!line.isEmpty()) {
                joined.append(line);
            }
        }
        this.keyInfo = joined.toString();
    }

    /**
     * Returns the internal row buffer (not a copy).
     *
     * @return rows accumulated by all reads performed on this reader
     */
    public List<Map<String, String>> getDataBuff() {
        return dataBuff;
    }

    /**
     * Returns the key-info text captured by the most recent read.
     *
     * @return the key info, or null if nothing has been read yet
     */
    public String getKeyInfo() {
        return keyInfo;
    }

    /** Descriptor of one table column. */
    @Getter
    @AllArgsConstructor
    static class PDFTitle {
        /** Zero-based column index of the title. */
        private final int cellIndex;
        /** Title text, used as the map key. */
        private final String name;
        /** True to drop line breaks in the cell text, false to turn them into spaces. */
        private final boolean trimSpace;
    }
}
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small helpers for pulling regular-expression matches out of a line of text.
 */
final class RegExUtil {

    private RegExUtil() {
        // Static utility class; not meant to be instantiated.
    }

    /**
     * Returns the first match of {@code regex} in {@code line}.
     *
     * @param regex regular expression to search for
     * @param line  text to search; may be null
     * @return the first matching substring, or null if there is no match or {@code line} is null
     */
    public static String find(String regex, String line) {
        if (line == null) {
            return null;
        }
        Matcher matcher = Pattern.compile(regex).matcher(line);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Returns every match of {@code regex} in {@code line}, in order of appearance.
     *
     * @param regex regular expression to search for
     * @param line  text to search; may be null
     * @return the matching substrings; empty if there is no match or {@code line} is null
     */
    public static String[] findMulti(String regex, String line) {
        if (line == null) {
            return new String[0];
        }
        Matcher matcher = Pattern.compile(regex).matcher(line);
        List<String> result = new ArrayList<>();
        while (matcher.find()) {
            result.add(matcher.group());
        }
        return result.toArray(new String[0]);
    }
}
import java.lang.annotation.*;
/**
 * Describes the PDF table column a bean field is bound to.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
@Documented
public @interface TableTitle {

    /**
     * Title text of the column.
     */
    String value();

    /**
     * Zero-based index of the column.
     */
    int cellIndex();

    /**
     * Whether whitespace should be removed from the cell content; on by default.
     */
    boolean trimSpace() default true;
}
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * Direction of a statement row. Each constant carries the text it is
 * identified by.
 */
@AllArgsConstructor
@Getter
public enum Type {

    /** Income. */
    IN("收入"),

    /** Expense. */
    OUT("支出"),

    /** Anything else. */
    OTHER("其他");

    /** Text associated with this constant. */
    private final String value;
}
import com.gynsh.utils.cashflow.bean.AlipayFlow;
import com.gynsh.utils.cashflow.bean.FlowWrapper;
import java.io.File;
import java.util.List;
/**
 * Reads an Alipay statement PDF into a {@link FlowWrapper} of {@link AlipayFlow} rows.
 */
public class AlipayFlowUtil {

    // Regex note: "(?<=X)" matches text that follows X, "(?=X)" matches text that precedes X.
    private static final String NAME_REG = "(?<=[:|:])\\S+(?=[\\(|(])";
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[:|:]))\\S+(?=[\\)|)])";
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7:))\\S+(?=[\\u4e2d])";
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Reads the statement stored at the given path.
     *
     * @param pdfFilePath path of the PDF file
     * @return the parsed statement
     */
    public static FlowWrapper<AlipayFlow> read(String pdfFilePath) {
        File pdf = new File(pdfFilePath);
        return read(pdf);
    }

    /**
     * Reads the given statement file. The holder details are extracted with
     * regular expressions from lines 3 to 6 of page 0; table rows after row
     * index 2 are mapped to {@link AlipayFlow}.
     *
     * @param file PDF file
     * @return the parsed statement; start and end date are null unless at least two dates are found
     */
    public static FlowWrapper<AlipayFlow> read(File file) {
        PDFTableReader<AlipayFlow> pdf = PDFTableReader.load(file, AlipayFlow.class);
        pdf.describe(0, 3, 6);
        pdf.titleRowIndex(2);

        // The key info is only available once the rows have been read.
        List<AlipayFlow> rows = pdf.readBeans();
        String header = pdf.getKeyInfo();

        String[] dates = RegExUtil.findMulti(DATE_REG, header);
        boolean hasRange = dates.length >= 2;

        return new FlowWrapper<>(
                RegExUtil.find(NAME_REG, header),
                RegExUtil.find(ID_REG, header),
                RegExUtil.find(ACCOUNT_REG, header),
                hasRange ? dates[0] : null,
                hasRange ? dates[1] : null,
                rows);
    }
}
import com.gynsh.utils.cashflow.bean.FlowWrapper;
import com.gynsh.utils.cashflow.bean.WxFlow;
import java.io.File;
import java.util.List;
/**
 * Reads a WeChat Pay statement PDF into a {@link FlowWrapper} of {@link WxFlow} rows.
 */
public class WxFlowUtil {

    // Regex note: "(?<=X)" matches text that follows X, "(?=X)" matches text that precedes X.
    private static final String NAME_REG = "(?<=[:|:])\\S+(?=[\\(|(])";
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[:|:]))\\S+(?=[\\)|)])";
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7:))\\S+(?=[\\u4e2d])";
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Reads the statement stored at the given path.
     *
     * @param pdfFilePath path of the PDF file
     * @return the parsed statement
     */
    public static FlowWrapper<WxFlow> read(String pdfFilePath) {
        File pdf = new File(pdfFilePath);
        return read(pdf);
    }

    /**
     * Reads the given statement file. The holder details are extracted with
     * regular expressions from lines 2 to 4 of page 0; table rows after row
     * index 2 are mapped to {@link WxFlow}.
     *
     * @param file PDF file
     * @return the parsed statement; start and end date are null unless at least two dates are found
     */
    public static FlowWrapper<WxFlow> read(File file) {
        PDFTableReader<WxFlow> pdf = PDFTableReader.load(file, WxFlow.class);
        pdf.describe(0, 2, 4);
        pdf.titleRowIndex(2);

        // The key info is only available once the rows have been read.
        List<WxFlow> rows = pdf.readBeans();
        String header = pdf.getKeyInfo();

        String[] dates = RegExUtil.findMulti(DATE_REG, header);
        boolean hasRange = dates.length >= 2;

        return new FlowWrapper<>(
                RegExUtil.find(NAME_REG, header),
                RegExUtil.find(ID_REG, header),
                RegExUtil.find(ACCOUNT_REG, header),
                hasRange ? dates[0] : null,
                hasRange ? dates[1] : null,
                rows);
    }
}
3.使用示例
import com.gynsh.utils.cashflow.AlipayFlowUtil;
import com.gynsh.utils.cashflow.Type;
import com.gynsh.utils.cashflow.WxFlowUtil;
import com.gynsh.utils.cashflow.bean.AlipayFlow;
import com.gynsh.utils.cashflow.bean.FlowWrapper;
import com.gynsh.utils.cashflow.bean.WxFlow;
import lombok.extern.slf4j.Slf4j;
/**
 * Usage example: reads one Alipay and one WeChat Pay statement and logs the
 * holder details together with the totals per {@link Type}.
 */
@Slf4j
public class CashFlowTest {

    public static void main(String[] args) {
        String path = "C:\\Users\\geng\\Desktop\\流水证明_20221215_141552.pdf";
        String path2 = "C:\\Users\\geng\\Desktop\\微信支付交易明细证明(20220919-20221219).pdf";
        printAlipayCashFlow(path);
        // Plain separator between the two reports; not an error condition.
        log.info("===============================================================");
        printWxCashFlow(path2);
    }

    /** Reads the Alipay statement at {@code path} and logs its summary. */
    static void printAlipayCashFlow(String path) {
        FlowWrapper<AlipayFlow> wrapper = AlipayFlowUtil.read(path);
        log.info("姓名:{}", wrapper.getRealName());
        log.info("账户:{}", wrapper.getAccount());
        log.info("证件号码:{}", wrapper.getIdCardNUmber());
        log.info("起始日期:{}", wrapper.getStartDate());
        log.info("结束日期:{}", wrapper.getEndDate());
        log.info("流水合计(收入):{}", sumAlipayFlow(wrapper, Type.IN));
        log.info("流水合计(支出):{}", sumAlipayFlow(wrapper, Type.OUT));
        log.info("流水合计(其他):{}", sumAlipayFlow(wrapper, Type.OTHER));
    }

    /** Reads the WeChat Pay statement at {@code path} and logs its summary. */
    static void printWxCashFlow(String path) {
        FlowWrapper<WxFlow> wrapper = WxFlowUtil.read(path);
        log.info("姓名:{}", wrapper.getRealName());
        log.info("账户:{}", wrapper.getAccount());
        log.info("证件号码:{}", wrapper.getIdCardNUmber());
        log.info("起始日期:{}", wrapper.getStartDate());
        log.info("结束日期:{}", wrapper.getEndDate());
        log.info("流水合计(收入):{}", sumWxFlow(wrapper, Type.IN));
        log.info("流水合计(支出):{}", sumWxFlow(wrapper, Type.OUT));
        log.info("流水合计(其他):{}", sumWxFlow(wrapper, Type.OTHER));
    }

    /**
     * Totals the amounts of the Alipay rows whose income/expense marker equals
     * the text of {@code type}.
     *
     * @param wrapper parsed statement
     * @param type    direction to total
     * @return the total, or 0 if no row matches
     * @throws NumberFormatException if a matching row has a non-numeric amount
     */
    static double sumAlipayFlow(FlowWrapper<AlipayFlow> wrapper, Type type) {
        return sumAmounts(wrapper.getDetails().stream()
                .filter(f -> type.getValue().equals(f.getOutInType()))
                .map(AlipayFlow::getAmount));
    }

    /**
     * Totals the amounts of the WeChat Pay rows whose income/expense marker
     * equals the text of {@code type}.
     *
     * @param wrapper parsed statement
     * @param type    direction to total
     * @return the total, or 0 if no row matches
     * @throws NumberFormatException if a matching row has a non-numeric amount
     */
    static double sumWxFlow(FlowWrapper<WxFlow> wrapper, Type type) {
        return sumAmounts(wrapper.getDetails().stream()
                .filter(f -> type.getValue().equals(f.getOutInType()))
                .map(WxFlow::getAmount));
    }

    /**
     * Adds up decimal amount strings. The sum is carried out in BigDecimal so
     * that binary floating-point rounding errors do not accumulate; only the
     * final total is converted to double.
     */
    private static double sumAmounts(java.util.stream.Stream<String> amounts) {
        return amounts
                .map(amount -> new java.math.BigDecimal(amount.trim()))
                .reduce(java.math.BigDecimal.ZERO, java.math.BigDecimal::add)
                .doubleValue();
    }
}