如何读取PDF中的表格数据（交易流水读取实战）？

最新推荐文章于 2025-06-03 20:57:14 发布

流沙QS

最新推荐文章于 2025-06-03 20:57:14 发布

阅读量3.1k

点赞数 10

分类专栏：Java专栏、工具　文章标签：pdf

本文链接：https://blog.csdn.net/gengzhy/article/details/128386973

版权

Java专栏同时被 2 个专栏收录

53 篇文章

订阅专栏

工具

24 篇文章

订阅专栏

一开始，也是百思不得其解，相关的资料也是少之又少。

刚开始的思路也是先读取PDF表格线条的坐标，再根据坐标定位其中的文字信息，从而读取出来。

但这种做法实现起来代码较为复杂，写到一半就写不下去了。后来偶然发现了一个工具包：tabula。

贴上官网地址：tabula.technology

GitHub地址：https://github.com/tabulapdf/tabula

思路差不多，关键是解决了问题。

本文使用该工具包，并以支付宝流水和微信流水的PDF文件作为实战，需要的可以直接拿走。只要支付宝、微信官方没有变更文档格式，截至目前，基本可以直接使用。

1.引入依赖包

<!-- tabula: the library used below to extract table cells from PDF pages -->
<dependency>
	<groupId>technology.tabula</groupId>
	<artifactId>tabula</artifactId>
	<version>1.0.5</version>
	<exclusions>
		<!-- Keep org.slf4j:slf4j-simple off the classpath; presumably to avoid
		     clashing with the project's own SLF4J binding (confirm for your build) -->
		<exclusion>
			<artifactId>slf4j-simple</artifactId>
			<groupId>org.slf4j</groupId>
		</exclusion>
	</exclusions>
</dependency>

2.读取PDF表格数据的工具类

import lombok.AllArgsConstructor;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Reads table rows out of a PDF file and exposes them either as maps keyed by
 * header name or as beans of type {@code T}.
 *
 * <p>Every declared field of the bean class must carry a {@link TableTitle}
 * annotation; the annotation's {@code cellIndex} selects the table column and
 * its {@code value} is used as the map key. Besides the table rows, a span of
 * plain-text lines ("key info") is captured from one page and made available
 * through {@link #getKeyInfo()}.
 *
 * @param <T> bean type that table rows are mapped to
 */
public class PDFTableReader<T> {
    /** Line-break character that is removed from (or replaced in) cell text. */
    private static final String LINE_WRAP = "\r";
    /**
     * Buffer of all rows read so far; every read appends to it.
     */
    private final List<Map<String, String>> dataBuff = new ArrayList<>();
    private final transient File pdfFile;
    /**
     * Column descriptors, one per declared field of the bean class.
     */
    private final PDFTitle[] pdfTitles;

    /**
     * Row index of the header row (0-based); defaults to the first row.
     */
    private int titleRowIndex = 0;

    /**
     * Key-info text collected by the last read.
     */
    private String keyInfo;
    /**
     * Index of the page holding the key info, 0-based.
     */
    private int keyInfoPageNumber;
    /**
     * Index of the first key-info line on that page, 0-based.
     */
    private int keyInfoLineStartIndex;
    /**
     * Index of the last key-info line on that page, 0-based and inclusive.
     */
    private int keyInfoLineEndIndex;
    /** Bean class that table rows are mapped to. */
    private final Class<T> beanClass;

    /**
     * Builds the column descriptors from the {@link TableTitle} annotations of
     * the bean class.
     *
     * @throws RuntimeException if {@code beanClass} is null or one of its
     *                          declared fields lacks a {@link TableTitle}
     */
    private PDFTableReader(File pdfFile, Class<T> beanClass) {
        if (beanClass == null) {
            throw new RuntimeException("必须指定读取表格数据的实体类");
        }
        this.pdfFile = pdfFile;
        this.beanClass = beanClass;
        Field[] fields = beanClass.getDeclaredFields();
        pdfTitles = new PDFTitle[fields.length];
        for (int i = 0; i < fields.length; i++) {
            Field field = fields[i];
            TableTitle title = field.getAnnotation(TableTitle.class);
            if (title == null) {
                throw new RuntimeException(field.getName() + " attribute miss annotation TableTitle");
            }
            pdfTitles[i] = new PDFTitle(title.cellIndex(), title.value(), title.trimSpace());
        }
    }

    /**
     * Creates a reader for the given PDF file and bean class.
     *
     * @param pdfFile   PDF file to read
     * @param beanClass bean class whose fields are annotated with {@link TableTitle}
     * @return a new reader
     */
    public static <T> PDFTableReader<T> load(File pdfFile, Class<T> beanClass) {
        return new PDFTableReader<>(pdfFile, beanClass);
    }

    /**
     * Sets the row index of the table header.
     *
     * @param titleRowIndex index of the header row, 0-based
     * @return this reader, for chaining
     */
    public PDFTableReader<T> titleRowIndex(int titleRowIndex) {
        this.titleRowIndex = titleRowIndex;
        return this;
    }

    /**
     * Sets where the key-info text is located.
     *
     * @param keyInfoPageNumber     index of the page holding the key info, 0-based
     * @param keyInfoLineStartIndex index of the first key-info line, 0-based
     * @param keyInfoLineEndIndex   index of the last key-info line, 0-based, inclusive
     * @return this reader, for chaining
     */
    public PDFTableReader<T> describe(int keyInfoPageNumber, int keyInfoLineStartIndex, int keyInfoLineEndIndex) {
        this.keyInfoPageNumber = keyInfoPageNumber;
        this.keyInfoLineStartIndex = keyInfoLineStartIndex;
        this.keyInfoLineEndIndex = keyInfoLineEndIndex;
        return this;
    }

    /**
     * Reads the PDF and returns one map per data row, keyed by header name.
     */
    public List<Map<String, String>> readMaps() {
        return this.doRead();
    }

    /**
     * Reads the PDF and returns one bean per data row. Each annotated field is
     * assigned the cell text stored under its {@link TableTitle#value()}.
     *
     * @throws RuntimeException if the file cannot be read or a bean cannot be
     *                          instantiated or populated; the cause is preserved
     */
    public List<T> readBeans() {
        List<Map<String, String>> maps = this.doRead();
        List<T> beans = new ArrayList<>(maps.size());
        // The field list does not change between rows, so look it up once.
        Field[] fields = this.beanClass.getDeclaredFields();
        try {
            for (Map<String, String> map : maps) {
                T bean = this.beanClass.getDeclaredConstructor().newInstance();
                for (Field field : fields) {
                    TableTitle title = field.getAnnotation(TableTitle.class);
                    if (title == null) {
                        continue;
                    }
                    field.setAccessible(true);
                    field.set(bean, map.get(title.value()));
                }
                beans.add(bean);
            }
            return beans;
        } catch (Exception e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Opens the PDF, captures the key info, then walks every table on every
     * page and converts each row after the header row into a map. The rows are
     * also appended to the internal buffer.
     */
    private List<Map<String, String>> doRead() {
        try (PDDocument document = PDDocument.load(pdfFile)) {
            // Read the key-info text first.
            this.doReadKeyInfo(document);
            // Then read the tables.
            List<Map<String, String>> maps = new ArrayList<>();
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            PageIterator pi = new ObjectExtractor(document).extract();
            int rowNum = 0; // rows seen so far, counted across all tables and pages
            while (pi.hasNext()) {
                Page page = pi.next();
                List<Table> tables = sea.extract(page);
                for (Table table : tables) {
                    List<List<RectangularTextContainer>> rows = table.getRows();
                    // The header is skipped by the document-wide counter, so only the
                    // first table has to be tall enough to contain it; later tables
                    // may legitimately hold just a few data rows.
                    if (rowNum == 0 && rows.size() <= titleRowIndex) {
                        throw new RuntimeException("标题行不正确");
                    }
                    for (List<RectangularTextContainer> row : rows) {
                        if (rowNum > titleRowIndex) {
                            maps.add(this.toCellMap(row));
                        }
                        rowNum++;
                    }
                }
            }
            this.dataBuff.addAll(maps);
            return maps;
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts one table row into a map from header name to cell text. Cells
     * whose column index has no descriptor are ignored.
     */
    private Map<String, String> toCellMap(List<RectangularTextContainer> row) {
        Map<String, String> cellMap = new HashMap<>(row.size());
        for (int k = 0; k < row.size(); k++) {
            PDFTitle pdfTitle = this.findTitle(k);
            if (pdfTitle == null) {
                continue;
            }
            String cellText = row.get(k).getText();
            cellText = cellText == null ? "" : cellText.trim();
            // Line breaks inside a cell are dropped, or turned into a space when
            // the column asked to keep its spacing (trimSpace = false).
            String lineBreakReplacement = pdfTitle.isTrimSpace() ? "" : " ";
            cellMap.put(pdfTitle.getName(), cellText.replace(LINE_WRAP, lineBreakReplacement));
        }
        return cellMap;
    }

    /**
     * Returns the descriptor declared for the given column index, or null when
     * no field maps to that column. Looking up by index (instead of by array
     * position) keeps the mapping independent of field declaration order and
     * of rows that have more cells than the bean has fields.
     */
    private PDFTitle findTitle(int columnIndex) {
        for (PDFTitle candidate : pdfTitles) {
            if (candidate.getCellIndex() == columnIndex) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Extracts the text of the configured page and stores the non-blank,
     * trimmed lines between the configured start and end index (inclusive),
     * concatenated without separators, as the key info.
     *
     * @throws RuntimeException if a configured line index lies outside the page's lines
     */
    private void doReadKeyInfo(PDDocument document) throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true); // emit the text in positional (line-by-line) order
        // PDFTextStripper counts pages from 1, whereas keyInfoPageNumber is 0-based.
        int pageNo = keyInfoPageNumber + 1;
        stripper.setStartPage(pageNo);
        stripper.setEndPage(pageNo);
        String result = stripper.getText(document);
        // Accept any line terminator so the split does not depend on the platform's line separator.
        String[] split = result.split("\\r\\n|\\r|\\n");
        if (keyInfoLineStartIndex < 0
                || keyInfoLineStartIndex >= split.length
                || keyInfoLineEndIndex >= split.length) {
            throw new RuntimeException("关键信息所在列已超出读取页码行的范围");
        }
        StringBuilder r = new StringBuilder();
        for (int i = keyInfoLineStartIndex; i <= keyInfoLineEndIndex; i++) {
            String line = split[i].trim();
            if (!line.isEmpty()) {
                r.append(line);
            }
        }
        this.keyInfo = r.toString();
    }

    /**
     * Returns the internal row buffer (the live list, not a copy).
     */
    public List<Map<String, String>> getDataBuff() {
        return dataBuff;
    }

    /**
     * Returns the key-info text captured by the last read, or null if nothing has been read yet.
     */
    public String getKeyInfo() {
        return keyInfo;
    }

    /** Descriptor of one table column. */
    @Getter
    @AllArgsConstructor
    static class PDFTitle {
        /** Column index of the header, 0-based. */
        private final int cellIndex;
        /** Header name. */
        private final String name;
        /** Whether line breaks in the cell are removed rather than replaced by a space. */
        private final boolean trimSpace;
    }
}

import java.lang.annotation.*;

/**
 * Describes the PDF table header that a bean field is bound to.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
@Documented
public @interface TableTitle {
    /**
     * Name of the table header.
     */
    String value();

    /**
     * Column index of the header, counted from 0.
     */
    int cellIndex();

    /**
     * Whether spaces should be removed from the cell content; {@code true} unless overridden.
     */
    boolean trimSpace() default true;
}

3.实战

公用代码

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;

import java.util.List;

/**
 * Wrapper around a transaction statement: the holder's details, the covered
 * date range and the list of detail rows.
 *
 * @param <NT> type of a single detail row
 */
@Getter
@ToString
@AllArgsConstructor
public class FlowWrapper<NT> {
    /** Holder's real name. */
    private String realName;
    /** ID document number (field spelling kept as-is; generated accessor names derive from it). */
    private String idCardNUmber;
    /** Account identifier. */
    private String account;
    /** Start date of the statement. */
    private String startDate;
    /** End date of the statement. */
    private String endDate;
    /** Statement detail rows. */
    private List<NT> details;
}

正则字段匹配工具

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helpers for pulling regular-expression matches out of a single string.
 */
public class RegExUtil {
    /**
     * Returns the first substring of {@code line} that matches {@code regex}.
     *
     * @param regex regular expression to search for
     * @param line  text to search; may be null
     * @return the first match, or null when {@code line} is null or nothing matches
     */
    public static String find(String regex, String line) {
        // A missing input simply has no match; avoid the NPE from Pattern.matcher(null).
        if (line == null) {
            return null;
        }
        Matcher matcher = Pattern.compile(regex).matcher(line);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Returns every substring of {@code line} that matches {@code regex}, in order of appearance.
     *
     * @param regex regular expression to search for
     * @param line  text to search; may be null
     * @return the matches; an empty array when {@code line} is null or nothing matches
     */
    public static String[] findMulti(String regex, String line) {
        List<String> result = new ArrayList<>();
        if (line != null) {
            Matcher matcher = Pattern.compile(regex).matcher(line);
            while (matcher.find()) {
                result.add(matcher.group());
            }
        }
        return result.toArray(new String[0]);
    }
}

3.1支付宝流水PDF文件读取实例

实体

import cn.cosmosx.base.pdf.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * One row of an Alipay statement table. Each field is bound to a table column
 * through {@link TableTitle}; the declaration order follows the column order.
 */
@Getter
@Setter
@ToString
public class AlipayFlow {
    /** Column 0: income / expense marker. */
    @TableTitle(cellIndex = 0, value = "收/支")
    private String outInType;

    /** Column 1: counterparty. */
    @TableTitle(cellIndex = 1, value = "交易对方")
    private String counterpart;

    /** Column 2: goods description. */
    @TableTitle(cellIndex = 2, value = "商品说明")
    private String goodsDesc;

    /** Column 3: payment / receipt method. */
    @TableTitle(cellIndex = 3, value = "收/付款方式")
    private String payOrInMethod;

    /** Column 4: amount. */
    @TableTitle(cellIndex = 4, value = "金额")
    private String amount;

    /** Column 5: trade order number. */
    @TableTitle(cellIndex = 5, value = "交易订单号")
    private String tradeOrder;

    /** Column 6: merchant order number. */
    @TableTitle(cellIndex = 6, value = "商家订单号")
    private String merchantOrder;

    /** Column 7: trade time; trimSpace is switched off for this column. */
    @TableTitle(cellIndex = 7, value = "交易时间", trimSpace = false)
    private String tradeDate;
}

读取工具

import cn.cosmosx.base.pdf.PDFTableReader;
import cn.cosmosx.base.pdf.RegExUtil;
import cn.cosmosx.base.pdf.bean.AlipayFlow;

import java.io.File;
import java.util.List;

/**
 * Reads an Alipay statement PDF into a {@code FlowWrapper} of {@code AlipayFlow} rows.
 */
public class AlipayFlowUtil {
    // Regex note: "(?<=RegEx)" matches the text that follows RegEx; "(?=RegEx)" matches the text that precedes RegEx.
    private static final String NAME_REG = "(?<=[：|:])\\S+(?=[\\(|（])";
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[：|:]))\\S+(?=[\\)|）])";
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7：))\\S+(?=[\\u4e2d])";
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Parses the statement at the given path.
     *
     * <p>The key info is taken from page 0, lines 3 through 6, and the table
     * header is expected at row index 2. Name, ID number and account are
     * extracted from the key info with the patterns above; the start and end
     * dates are the first two timestamps found there, or both null when fewer
     * than two are present.
     *
     * @param pdfFilePath path of the PDF file
     * @return the parsed statement
     */
    public static FlowWrapper<AlipayFlow> read(String pdfFilePath) {
        File pdf = new File(pdfFilePath);
        PDFTableReader<AlipayFlow> tableReader = PDFTableReader.load(pdf, AlipayFlow.class);
        tableReader.describe(0, 3, 6);
        tableReader.titleRowIndex(2);

        List<AlipayFlow> rows = tableReader.readBeans();
        String info = tableReader.getKeyInfo();

        String holderName = RegExUtil.find(NAME_REG, info);
        String idNumber = RegExUtil.find(ID_REG, info);
        String accountNo = RegExUtil.find(ACCOUNT_REG, info);

        String[] timestamps = RegExUtil.findMulti(DATE_REG, info);
        boolean hasRange = timestamps.length > 1;
        String from = hasRange ? timestamps[0] : null;
        String to = hasRange ? timestamps[1] : null;

        return new FlowWrapper<>(holderName, idNumber, accountNo, from, to, rows);
    }
}

3.2微信流水PDF文件读取实例

实体

import cn.cosmosx.base.pdf.TableTitle;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * One row of a WeChat statement table. Each field is bound to a table column
 * through {@link TableTitle}; the declaration order follows the column order.
 */
@Getter
@Setter
@ToString
public class WxFlow {
    /** Column 0: trade order number. */
    @TableTitle(cellIndex = 0, value = "交易订单号")
    private String tradeOrder;

    /** Column 1: trade time; trimSpace is switched off for this column. */
    @TableTitle(cellIndex = 1, value = "交易时间", trimSpace = false)
    private String tradeDate;

    /** Column 2: trade type. */
    @TableTitle(cellIndex = 2, value = "交易类型")
    private String tradeType;

    /** Column 3: income / expense / other marker. */
    @TableTitle(cellIndex = 3, value = "收/支/其他")
    private String outInType;

    /** Column 4: trade method. */
    @TableTitle(cellIndex = 4, value = "交易方式")
    private String tradeMethod;

    /** Column 5: amount (yuan). */
    @TableTitle(cellIndex = 5, value = "金额(元)")
    private String amount;

    /** Column 6: counterparty. */
    @TableTitle(cellIndex = 6, value = "交易对方")
    private String counterpart;

    /** Column 7: merchant order number. */
    @TableTitle(cellIndex = 7, value = "商家单号")
    private String merchantOrder;
}

读取工具

import cn.cosmosx.base.pdf.PDFTableReader;
import cn.cosmosx.base.pdf.RegExUtil;
import cn.cosmosx.base.pdf.bean.WxFlow;

import java.io.File;
import java.util.List;

/**
 * Reads a WeChat statement PDF into a {@code FlowWrapper} of {@code WxFlow} rows.
 */
public class WxFlowUtil {
    // Regex note: "(?<=RegEx)" matches the text that follows RegEx; "(?=RegEx)" matches the text that precedes RegEx.
    private static final String NAME_REG = "(?<=[：|:])\\S+(?=[\\(|（])";
    private static final String ID_REG = "(?<=(\\(\\S{1,10}[：|:]))\\S+(?=[\\)|）])";
    private static final String ACCOUNT_REG = "(?<=(\\u8d26\\u53f7|\\u5fae\\u4fe1\\u53f7：))\\S+(?=[\\u4e2d])";
    private static final String DATE_REG = "(\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})";

    /**
     * Parses the statement at the given path.
     *
     * <p>The key info is taken from page 0, lines 2 through 4, and the table
     * header is expected at row index 2. Name, ID number and account are
     * extracted from the key info with the patterns above; the start and end
     * dates are the first two timestamps found there, or both null when fewer
     * than two are present.
     *
     * @param pdfFilePath path of the PDF file
     * @return the parsed statement
     */
    public static FlowWrapper<WxFlow> read(String pdfFilePath) {
        File pdf = new File(pdfFilePath);
        PDFTableReader<WxFlow> tableReader = PDFTableReader.load(pdf, WxFlow.class);
        tableReader.describe(0, 2, 4);
        tableReader.titleRowIndex(2);

        List<WxFlow> rows = tableReader.readBeans();
        String info = tableReader.getKeyInfo();

        String holderName = RegExUtil.find(NAME_REG, info);
        String idNumber = RegExUtil.find(ID_REG, info);
        String accountNo = RegExUtil.find(ACCOUNT_REG, info);

        String[] timestamps = RegExUtil.findMulti(DATE_REG, info);
        boolean hasRange = timestamps.length > 1;
        String from = hasRange ? timestamps[0] : null;
        String to = hasRange ? timestamps[1] : null;

        return new FlowWrapper<>(holderName, idNumber, accountNo, from, to, rows);
    }
}

至此，已全部完成！

若有更好的方式，欢迎大家推荐。