一文读取微信支付宝PDF文件数据

最新推荐文章于 2024-06-13 13:58:29 发布

流沙QS

最新推荐文章于 2024-06-13 13:58:29 发布

阅读量240

点赞数 4

分类专栏：Java专栏 | 文章标签：微信 pdf java

本文链接：https://blog.csdn.net/gengzhy/article/details/136560882

版权

Java专栏 专栏收录该内容

46 篇文章 1 订阅

订阅专栏

本文主要介绍如何读取PDF文件中的表格数据，以微信、支付宝导出的流水PDF为例展开，代码开箱即用。

1.依赖

<dependency>
    <groupId>technology.tabula</groupId>
    <artifactId>tabula</artifactId>
    <version>1.0.5</version>
</dependency>

2.工具代码

（1）几个bean


import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;

import java.util.List;

/**
 * Wrapper for one parsed cash-flow statement: the account holder's key information
 * plus the list of detail records.
 * <p>
 * The constructor is generated by Lombok's {@code @AllArgsConstructor}, whose parameter
 * order follows the field declaration order below, so the fields must not be reordered.
 *
 * @param <NT> type of a single detail record stored in {@code details}
 */
@ToString
@Getter
@AllArgsConstructor
public class FlowWrapper<NT> {
    private String realName;// real name of the account holder
    private String idCardNUmber;// ID document number (the spelling is part of the generated getter name)
    private String account;// account
    private String startDate;// start date of the statement period
    private String endDate;// end date of the statement period
    private List<NT> details;// statement detail records
}

import com.gynsh.utils.cashflow.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * One data row of the Alipay statement table.
 * <p>
 * Each field is bound to a table column through {@code @TableTitle}: {@code value} is the
 * header text (used as the map key when rows are read) and {@code cellIndex} is the
 * 0-based column position.
 */
@ToString
@Getter
@Setter
public class AlipayFlow {
    // Header "收/支": income / expense
    @TableTitle(value = "收/支", cellIndex = 0)
    private String outInType;
    // Header "交易对方": counterparty
    @TableTitle(value = "交易对方", cellIndex = 1)
    private String counterpart;
    // Header "商品说明": goods description
    @TableTitle(value = "商品说明", cellIndex = 2)
    private String goodsDesc;
    // Header "收/付款方式": receipt / payment method
    @TableTitle(value = "收/付款方式", cellIndex = 3)
    private String payOrInMethod;
    // Header "金额": amount
    @TableTitle(value = "金额", cellIndex = 4)
    private String amount;
    // Header "交易订单号": transaction order number
    @TableTitle(value = "交易订单号", cellIndex = 5)
    private String tradeOrder;
    // Header "商家订单号": merchant order number
    @TableTitle(value = "商家订单号", cellIndex = 6)
    private String merchantOrder;
    // Header "交易时间": transaction time. trimSpace = false makes the reader replace an
    // in-cell line break with a space instead of removing it (presumably so that the date
    // and time parts stay separated - confirm against a real PDF).
    @TableTitle(value = "交易时间", cellIndex = 7, trimSpace = false)
    private String tradeDate;
}

import com.gynsh.utils.cashflow.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * One data row of the WeChat Pay statement table.
 * <p>
 * Each field is bound to a table column through {@code @TableTitle}: {@code value} is the
 * header text (used as the map key when rows are read) and {@code cellIndex} is the
 * 0-based column position.
 */
@ToString
@Getter
@Setter
public class WxFlow {
    // Header "交易订单号": transaction order number
    @TableTitle(value = "交易订单号", cellIndex = 0)
    private String tradeOrder;
    // Header "交易时间": transaction time. trimSpace = false makes the reader replace an
    // in-cell line break with a space instead of removing it (presumably so that the date
    // and time parts stay separated - confirm against a real PDF).
    @TableTitle(value = "交易时间", cellIndex = 1, trimSpace = false)
    private String tradeDate;
    // Header "交易类型": transaction type
    @TableTitle(value = "交易类型", cellIndex = 2)
    private String tradeType;
    // Header "收/支/其他": income / expense / other
    @TableTitle(value = "收/支/其他", cellIndex = 3)
    private String outInType;
    // Header "交易方式": transaction method
    @TableTitle(value = "交易方式", cellIndex = 4)
    private String tradeMethod;
    // Header "金额(元)": amount in yuan
    @TableTitle(value = "金额(元)", cellIndex = 5)
    private String amount;
    // Header "交易对方": counterparty
    @TableTitle(value = "交易对方", cellIndex = 6)
    private String counterpart;
    // Header "商家单号": merchant order number
    @TableTitle(value = "商家单号", cellIndex = 7)
    private String merchantOrder;
}

（2）工具

import lombok.AllArgsConstructor;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Reads the ruled tables of a PDF file with tabula and converts every data row into a
 * {@code Map} keyed by column header text, or into a bean of type {@code T}.
 * <p>
 * The bean class describes the table layout: every declared field must carry a
 * {@link TableTitle} giving the header text and the 0-based column index.
 * In addition to the table, a range of text lines from one page (the "key info",
 * configured with {@link #describe(int, int, int)}) is captured while reading and exposed
 * through {@link #getKeyInfo()}.
 * <p>
 * Typical use:
 * {@code PDFTableReader.load(file, Bean.class).describe(0, 3, 6).titleRowIndex(2).readBeans()}.
 *
 * @param <T> bean type one table row is mapped to
 */
final class PDFTableReader<T> {
    // Line-break marker that is stripped from (or replaced in) the text of a table cell
    private static final String LINE_WRAP = "\r";
    /**
     * Buffer of all rows read so far; every read appends its rows to this list.
     */
    private final List<Map<String, String>> dataBuff = new ArrayList<>();
    private final File pdfFile;
    /**
     * Column descriptions taken from the bean's {@link TableTitle} annotations, keyed by
     * column index so that a cell can be resolved regardless of field declaration order.
     */
    private final Map<Integer, PDFTitle> titlesByCellIndex;

    /**
     * Index of the last title row, counted over the rows of the whole document.
     * Defaults to the first row (0).
     */
    private int titleRowIndex = 0;

    /**
     * Key-info text collected by the most recent read.
     */
    private String keyInfo;
    /**
     * 0-based index of the page that holds the key info.
     */
    private int keyInfoPageNumber;
    /**
     * 0-based index of the first key-info text line on that page.
     */
    private int keyInfoLineStartIndex;
    /**
     * 0-based index of the last key-info text line on that page (inclusive).
     */
    private int keyInfoLineEndIndex;
    // Bean class the table rows are mapped to
    private final Class<T> beanClass;

    /**
     * Collects the column descriptions from the bean class.
     *
     * @throws RuntimeException if {@code beanClass} is null or one of its declared fields
     *                          has no {@link TableTitle} annotation
     */
    private PDFTableReader(File pdfFile, Class<T> beanClass) {
        if (beanClass == null) {
            throw new RuntimeException("必须指定读取表格数据的实体类");
        }
        this.pdfFile = pdfFile;
        this.beanClass = beanClass;
        Field[] fields = beanClass.getDeclaredFields();
        this.titlesByCellIndex = new HashMap<>(fields.length);
        for (Field field : fields) {
            TableTitle title = field.getAnnotation(TableTitle.class);
            if (title == null) {
                throw new RuntimeException(field.getName() + " attribute miss annotation TableTitle");
            }
            // If two fields declare the same cellIndex, the later-declared one wins.
            titlesByCellIndex.put(title.cellIndex(),
                    new PDFTitle(title.cellIndex(), title.value(), title.trimSpace()));
        }
    }

    /**
     * Creates a reader for the given PDF file and bean class.
     *
     * @param pdfFile   PDF file to read
     * @param beanClass bean class whose fields are annotated with {@link TableTitle}
     * @return a new reader; configure it with {@link #describe} / {@link #titleRowIndex}
     */
    public static <T> PDFTableReader<T> load(File pdfFile, Class<T> beanClass) {
        return new PDFTableReader<T>(pdfFile, beanClass);
    }

    /**
     * Sets the index of the table's title row. All rows up to and including this index
     * are skipped once, at the start of the document.
     *
     * @param titleRowIndex - 0-based index of the header title row
     * @return this reader, for chaining
     */
    public PDFTableReader<T> titleRowIndex(int titleRowIndex) {
        this.titleRowIndex = titleRowIndex;
        return this;
    }

    /**
     * Configures where the key-info text is located.
     *
     * @param keyInfoPageNumber     - 0-based index of the page holding the key info
     * @param keyInfoLineStartIndex - 0-based index of the first key-info line
     * @param keyInfoLineEndIndex   - 0-based index of the last key-info line (inclusive)
     * @return this reader, for chaining
     */
    public PDFTableReader<T> describe(int keyInfoPageNumber, int keyInfoLineStartIndex, int keyInfoLineEndIndex) {
        this.keyInfoPageNumber = keyInfoPageNumber;
        this.keyInfoLineStartIndex = keyInfoLineStartIndex;
        this.keyInfoLineEndIndex = keyInfoLineEndIndex;
        return this;
    }

    /**
     * Reads the PDF and returns one map per data row, keyed by column header text.
     *
     * @throws RuntimeException if the file cannot be read or the configured title row /
     *                          key-info lines do not exist
     */
    public List<Map<String, String>> readMaps() {
        return this.doRead();
    }

    /**
     * Reads the PDF and returns one bean per data row. Every field annotated with
     * {@link TableTitle} receives the cell text of its column (or null when the row has
     * no such column).
     *
     * @throws RuntimeException if reading fails or a bean cannot be created / populated
     */
    public List<T> readBeans() {
        List<Map<String, String>> maps = this.doRead();
        List<T> beans = new ArrayList<>(maps.size());
        Field[] fields = this.beanClass.getDeclaredFields();
        try {
            for (Map<String, String> map : maps) {
                // Class.newInstance() is deprecated; go through the no-arg constructor instead.
                T bean = this.beanClass.getDeclaredConstructor().newInstance();
                for (Field field : fields) {
                    TableTitle title = field.getAnnotation(TableTitle.class);
                    if (title == null) {
                        continue;
                    }
                    field.setAccessible(true);
                    field.set(bean, map.get(title.value()));
                }
                beans.add(bean);
            }
            return beans;
        } catch (ReflectiveOperationException e) {
            // Keep the cause so the failing constructor/field can be diagnosed.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Opens the PDF, captures the key info, then walks every table of every page and
     * converts each data row into a header-to-text map. The rows are also appended to
     * {@link #dataBuff}.
     */
    private List<Map<String, String>> doRead() {
        try (PDDocument document = PDDocument.load(pdfFile)) {
            // Key info first
            this.doReadKeyInfo(document);
            // Then the tables
            List<Map<String, String>> maps = new ArrayList<>();
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            PageIterator pi = new ObjectExtractor(document).extract();
            int rowNum = 0;// row counter over the whole document
            while (pi.hasNext()) {
                Page page = pi.next();
                List<Table> tables = sea.extract(page);
                for (Table table : tables) {
                    List<List<RectangularTextContainer>> rows = table.getRows();
                    // Title rows are skipped only once (rowNum runs across all tables), so
                    // only the first table has to be tall enough to contain them. A short
                    // table on a later page is ordinary data and must not be rejected.
                    if (rowNum == 0 && rows.size() <= titleRowIndex) {
                        throw new RuntimeException("标题行不正确");
                    }
                    for (List<RectangularTextContainer> row : rows) {
                        if (rowNum <= titleRowIndex) {
                            rowNum++;
                            continue;
                        }
                        maps.add(toCellMap(row));
                        rowNum++;
                    }
                }
            }
            this.dataBuff.addAll(maps);
            return maps;
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts one table row into a map of header text to cell text. Cells whose column
     * index is not described by any {@link TableTitle} are ignored.
     */
    private Map<String, String> toCellMap(List<RectangularTextContainer> row) {
        Map<String, String> cellMap = new HashMap<>(row.size());
        for (int k = 0; k < row.size(); k++) {
            // Lookup by column index: safe when the row has more cells than the bean has fields
            PDFTitle pdfTitle = titlesByCellIndex.get(k);
            if (pdfTitle == null) {
                continue;
            }
            RectangularTextContainer<?> cell = row.get(k);
            String cellText = cell.getText();
            cellText = cellText == null ? "" : cellText.trim();
            // In-cell line breaks are either removed or turned into a single space
            String lineBreakReplacement = pdfTitle.isTrimSpace() ? "" : " ";
            cellMap.put(pdfTitle.getName(), cellText.replace(LINE_WRAP, lineBreakReplacement));
        }
        return cellMap;
    }

    /**
     * Extracts the text of the key-info page, picks the configured line range and stores
     * the trimmed, concatenated non-blank lines in {@link #keyInfo}.
     *
     * @throws RuntimeException if the configured line range lies outside the page's lines
     */
    private void doReadKeyInfo(PDDocument document) throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true); // true: text is emitted in reading (row) order; default is false
        // PDFTextStripper page numbers are 1-based while keyInfoPageNumber is 0-based;
        // start and end are both inclusive, so they are equal to read exactly one page.
        stripper.setStartPage(keyInfoPageNumber + 1);
        stripper.setEndPage(keyInfoPageNumber + 1);
        String result = stripper.getText(document);
        // The stripper uses the platform line separator; accept \r\n, \r and \n alike so the
        // line indices mean the same on every operating system.
        String[] split = result.split("\\r\\n|\\r|\\n");
        if (keyInfoLineStartIndex < 0
                || keyInfoLineStartIndex >= split.length
                || keyInfoLineEndIndex >= split.length) {
            throw new RuntimeException("关键信息所在列已超出读取页码行的范围");
        }
        StringBuilder r = new StringBuilder();
        for (int i = keyInfoLineStartIndex; i <= keyInfoLineEndIndex; i++) {
            String line = split[i];
            if (line == null || line.trim().equals("")) {
                continue;
            }
            r.append(line.trim());
        }
        this.keyInfo = r.toString();
    }

    /**
     * Returns the internal row buffer (the live list, not a copy). Each call to
     * {@link #readMaps()} or {@link #readBeans()} appends its rows to it.
     */
    public List<Map<String, String>> getDataBuff() {
        return dataBuff;
    }

    /**
     * Returns the key-info text captured by the most recent read, or null if nothing has
     * been read yet.
     */
    public String getKeyInfo() {
        return keyInfo;
    }

    // Description of one PDF table column
    @Getter
    @AllArgsConstructor
    static class PDFTitle {
        // 0-based index of the column the title belongs to
        private int cellIndex;
        // Header text
        private String name;
        // true: remove in-cell line breaks; false: replace them with a space
        private boolean trimSpace;
    }
}

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small regular-expression helpers for pulling matches out of a text line.
 */
final class RegExUtil {
    /**
     * Finds the first match of a regular expression.
     *
     * @param regex regular expression to search for
     * @param line  text to search in
     * @return the first matching substring, or null when nothing matches
     */
    public static String find(String regex, String line) {
        Matcher m = Pattern.compile(regex).matcher(line);
        return m.find() ? m.group() : null;
    }

    /**
     * Finds every match of a regular expression, in order of appearance.
     *
     * @param regex regular expression to search for
     * @param line  text to search in
     * @return all matching substrings; an empty array when nothing matches
     */
    public static String[] findMulti(String regex, String line) {
        List<String> hits = new ArrayList<>();
        for (Matcher m = Pattern.compile(regex).matcher(line); m.find(); ) {
            hits.add(m.group());
        }
        return hits.toArray(new String[0]);
    }
}

import java.lang.annotation.*;

/**
 * Describes the PDF table header (column) that a bean field is bound to.
 */
@Documented
@Target(ElementType.FIELD)
@Retention(RetentionPolicy.RUNTIME)
public @interface TableTitle {
    /**
     * Header text of the column.
     */
    String value();

    /**
     * Column index of the header, starting at 0.
     */
    int cellIndex();

    /**
     * Whether whitespace is removed from the cell content. In PDFTableReader this controls
     * in-cell line breaks: true removes them, false replaces each one with a single space.
     */
    boolean trimSpace() default true;
}

import lombok.AllArgsConstructor;
import lombok.Getter;

/**
 * Direction of a statement record. {@code value} is the Chinese label that is compared
 * with the record's income/expense column text when records are filtered by type.
 */
@Getter
@AllArgsConstructor
public enum Type {
    IN("收入"), // income
    OUT("支出"), // expense
    OTHER("其他"), // other
    ;
    // Label text as it appears in the statement table
    private final String value;
}

import com.gynsh.utils.cashflow.bean.AlipayFlow;
import com.gynsh.utils.cashflow.bean.FlowWrapper;

import java.io.File;
import java.util.List;

/**
 * Reads an Alipay statement PDF: the table rows become {@link AlipayFlow} beans and the
 * holder's name, ID number, account and statement period are extracted from the key-info
 * text with regular expressions.
 */
public class AlipayFlowUtil {
    // Regex note: "(?<=X)" matches what comes right after X; "(?=X)" matches what comes right before X
    private static final String NAME_REG = "(?<=[：|:])\\S+(?=[\\(|（])";
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[：|:]))\\S+(?=[\\)|）])";
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7：))\\S+(?=[\\u4e2d])";
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Reads the statement located at the given path.
     *
     * @param pdfFilePath path of the PDF file
     * @return the parsed statement
     */
    public static FlowWrapper<AlipayFlow> read(String pdfFilePath) {
        File pdf = new File(pdfFilePath);
        return read(pdf);
    }

    /**
     * Reads the given statement file. Key info is taken from text lines 3 to 6 of the
     * first page, and the table's title row has index 2.
     *
     * @param file the PDF file
     * @return the parsed statement; name, ID number and account are null when their
     * pattern does not match, and both dates are null unless at least two timestamps
     * are found
     */
    public static FlowWrapper<AlipayFlow> read(File file) {
        PDFTableReader<AlipayFlow> tableReader = PDFTableReader.load(file, AlipayFlow.class);
        tableReader.describe(0, 3, 6);
        tableReader.titleRowIndex(2);

        // The rows must be read first: the key-info text is only captured during the read.
        List<AlipayFlow> details = tableReader.readBeans();
        String header = tableReader.getKeyInfo();

        String holderName = RegExUtil.find(NAME_REG, header);
        String holderId = RegExUtil.find(ID_REG, header);
        String holderAccount = RegExUtil.find(ACCOUNT_REG, header);

        // The first two timestamps found are the start and the end of the period.
        String[] timestamps = RegExUtil.findMulti(DATE_REG, header);
        boolean hasPeriod = timestamps.length > 1;
        String periodStart = hasPeriod ? timestamps[0] : null;
        String periodEnd = hasPeriod ? timestamps[1] : null;

        return new FlowWrapper<>(holderName, holderId, holderAccount, periodStart, periodEnd, details);
    }
}

import com.gynsh.utils.cashflow.bean.FlowWrapper;
import com.gynsh.utils.cashflow.bean.WxFlow;

import java.io.File;
import java.util.List;

/**
 * Reads a WeChat Pay statement PDF: the table rows become {@link WxFlow} beans and the
 * holder's name, ID number, account and statement period are extracted from the key-info
 * text with regular expressions.
 */
public class WxFlowUtil {
    // Regex note: "(?<=X)" matches what comes right after X; "(?=X)" matches what comes right before X
    private static final String NAME_REG = "(?<=[：|:])\\S+(?=[\\(|（])";
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[：|:]))\\S+(?=[\\)|）])";
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7：))\\S+(?=[\\u4e2d])";
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Reads the statement located at the given path.
     *
     * @param pdfFilePath path of the PDF file
     * @return the parsed statement
     */
    public static FlowWrapper<WxFlow> read(String pdfFilePath) {
        File pdf = new File(pdfFilePath);
        return read(pdf);
    }

    /**
     * Reads the given statement file. Key info is taken from text lines 2 to 4 of the
     * first page, and the table's title row has index 2.
     *
     * @param file the PDF file
     * @return the parsed statement; name, ID number and account are null when their
     * pattern does not match, and both dates are null unless at least two timestamps
     * are found
     */
    public static FlowWrapper<WxFlow> read(File file) {
        PDFTableReader<WxFlow> tableReader = PDFTableReader.load(file, WxFlow.class);
        tableReader.describe(0, 2, 4);
        tableReader.titleRowIndex(2);

        // The rows must be read first: the key-info text is only captured during the read.
        List<WxFlow> details = tableReader.readBeans();
        String header = tableReader.getKeyInfo();

        String holderName = RegExUtil.find(NAME_REG, header);
        String holderId = RegExUtil.find(ID_REG, header);
        String holderAccount = RegExUtil.find(ACCOUNT_REG, header);

        // The first two timestamps found are the start and the end of the period.
        String[] timestamps = RegExUtil.findMulti(DATE_REG, header);
        boolean hasPeriod = timestamps.length > 1;
        String periodStart = hasPeriod ? timestamps[0] : null;
        String periodEnd = hasPeriod ? timestamps[1] : null;

        return new FlowWrapper<>(holderName, holderId, holderAccount, periodStart, periodEnd, details);
    }
}

3.使用示例

import com.gynsh.utils.cashflow.AlipayFlowUtil;
import com.gynsh.utils.cashflow.Type;
import com.gynsh.utils.cashflow.WxFlowUtil;
import com.gynsh.utils.cashflow.bean.AlipayFlow;
import com.gynsh.utils.cashflow.bean.FlowWrapper;
import com.gynsh.utils.cashflow.bean.WxFlow;
import lombok.extern.slf4j.Slf4j;

/**
 * Usage example: reads one Alipay and one WeChat Pay statement PDF and logs the holder
 * information together with the income / expense / other totals.
 */
@Slf4j
public class CashFlowTest {

    /**
     * Entry point of the demo; the two PDF paths are hard-coded sample locations.
     */
    public static void main(String[] args) {
        String alipayPdf = "C:\\Users\\geng\\Desktop\\流水证明_20221215_141552.pdf";
        String wxPdf = "C:\\Users\\geng\\Desktop\\微信支付交易明细证明(20220919-20221219).pdf";

        printAlipayCashFlow(alipayPdf);
        log.error("===============================================================");
        printWxCashFlow(wxPdf);
    }

    /**
     * Reads the Alipay statement at the given path and logs its key info and totals.
     */
    static void printAlipayCashFlow(String path) {
        FlowWrapper<AlipayFlow> statement = AlipayFlowUtil.read(path);
        log.info("姓名：{}", statement.getRealName());
        log.info("账户：{}", statement.getAccount());
        log.info("证件号码：{}", statement.getIdCardNUmber());
        log.info("起始日期：{}", statement.getStartDate());
        log.info("结束日期：{}", statement.getEndDate());
        log.info("流水合计（收入）：{}", sumAlipayFlow(statement, Type.IN));
        log.info("流水合计（支出）：{}", sumAlipayFlow(statement, Type.OUT));
        log.info("流水合计（其他）：{}", sumAlipayFlow(statement, Type.OTHER));
    }

    /**
     * Reads the WeChat Pay statement at the given path and logs its key info and totals.
     */
    static void printWxCashFlow(String path) {
        FlowWrapper<WxFlow> statement = WxFlowUtil.read(path);
        log.info("姓名：{}", statement.getRealName());
        log.info("账户：{}", statement.getAccount());
        log.info("证件号码：{}", statement.getIdCardNUmber());
        log.info("起始日期：{}", statement.getStartDate());
        log.info("结束日期：{}", statement.getEndDate());
        log.info("流水合计（收入）：{}", sumWxFlow(statement, Type.IN));
        log.info("流水合计（支出）：{}", sumWxFlow(statement, Type.OUT));
        log.info("流水合计（其他）：{}", sumWxFlow(statement, Type.OTHER));
    }

    /**
     * Sums the amounts of all Alipay records whose income/expense text equals the label
     * of the given type.
     */
    static double sumAlipayFlow(FlowWrapper<AlipayFlow> wrapper, Type type) {
        return wrapper.getDetails()
                .stream()
                .filter(record -> type.getValue().equals(record.getOutInType()))
                .map(AlipayFlow::getAmount)
                .mapToDouble(Double::parseDouble)
                .sum();
    }

    /**
     * Sums the amounts of all WeChat Pay records whose income/expense text equals the
     * label of the given type.
     */
    static double sumWxFlow(FlowWrapper<WxFlow> wrapper, Type type) {
        return wrapper.getDetails()
                .stream()
                .filter(record -> type.getValue().equals(record.getOutInType()))
                .map(WxFlow::getAmount)
                .mapToDouble(Double::parseDouble)
                .sum();
    }

}