pdf 简单表格数据读取

文件依赖

<dependency>
  <!-- tabula: the library used below to read table data out of the PDF -->
  <groupId>technology.tabula</groupId>
  <artifactId>tabula</artifactId>
  <version>1.0.3</version>
  <exclusions>
    <!-- Do not pull in the slf4j-simple binding through tabula.
         NOTE(review): presumably to avoid clashing with the project's own SLF4J binding; confirm. -->
    <exclusion>
      <artifactId>slf4j-simple</artifactId>
      <groupId>org.slf4j</groupId>
    </exclusion>
  </exclusions>
</dependency>

此外还需要引入 fastjson(用于将 tabula 返回的数据解析为 JSON)以及 commons-lang3(代码中用到了 StringUtils)。

与表格各列对应的实体类(每个字段对应一列)

/**
 * Mutable value holder for the cells of one table row.
 *
 * <p>All five values are kept as plain strings and are exposed through
 * conventional getter/setter pairs. {@link #toString()} renders every field
 * in single quotes, which is the format printed by the parser's output.
 */
public class Entry {
    /** Value stored under the name {@code num}. */
    String num;
    /** First data value. */
    String data1;
    /** Second data value. */
    String data2;
    /** Third data value. */
    String data3;
    /** Fourth data value. */
    String data4;

    /** @return the current {@code num} value, possibly {@code null} */
    public String getNum() {
        return this.num;
    }

    /** @param num the new {@code num} value */
    public void setNum(String num) {
        this.num = num;
    }

    /** @return the current {@code data1} value, possibly {@code null} */
    public String getData1() {
        return this.data1;
    }

    /** @param data1 the new {@code data1} value */
    public void setData1(String data1) {
        this.data1 = data1;
    }

    /** @return the current {@code data2} value, possibly {@code null} */
    public String getData2() {
        return this.data2;
    }

    /** @param data2 the new {@code data2} value */
    public void setData2(String data2) {
        this.data2 = data2;
    }

    /** @return the current {@code data3} value, possibly {@code null} */
    public String getData3() {
        return this.data3;
    }

    /** @param data3 the new {@code data3} value */
    public void setData3(String data3) {
        this.data3 = data3;
    }

    /** @return the current {@code data4} value, possibly {@code null} */
    public String getData4() {
        return this.data4;
    }

    /** @param data4 the new {@code data4} value */
    public void setData4(String data4) {
        this.data4 = data4;
    }

    /**
     * Renders all fields as {@code Entry{num='..', data1='..', ...}}.
     * Unset fields appear as the text {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "Entry{num='%s', data1='%s', data2='%s', data3='%s', data4='%s'}",
                num, data1, data2, data3, data4);
    }
}

实际操作方法

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import technology.tabula.CommandLineApp;

import java.util.ArrayList;
import java.util.List;


/**
 * Reads a simple table from a PDF with tabula and maps each data row to an {@link Entry}.
 *
 * <p>tabula is driven through its command-line front end ({@link CommandLineApp}) and
 * asked for JSON output. That JSON is an array of table objects, each with a
 * {@code "data"} array of rows, where a row is an array of cell objects carrying the
 * cell content under {@code "text"}.
 */
public class PdfTableParser {

    /**
     * Minimum number of cells a row must have to be mapped: cell 0 is only inspected
     * for the header marker, cells 1..5 populate the {@link Entry}.
     */
    private static final int REQUIRED_CELLS = 6;

    /**
     * Extracts the table from the hard-coded PDF, prints the raw JSON returned by tabula
     * and then prints one line per parsed {@link Entry}.
     *
     * @param args ignored; the tabula arguments are fixed inside the method
     * @throws ParseException if the tabula command-line arguments cannot be parsed or
     *                        tabula rejects them during extraction
     */
    public static void main(String[] args) throws ParseException {
        // -f  output format, CSV by default (the value must be upper case)
        // -p  page to extract; "all" means every page
        // the bare argument is the path of the PDF file
        // -l  force lattice-mode extraction (this is the option that matters here)
        String[] argsa = new String[]{"-f=JSON", "-p=2", "F:\\文字文稿1_20241113090441.pdf", "-l"};
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(CommandLineApp.buildOptions(), argsa);

        // tabula appends its JSON result to the supplied Appendable.
        StringBuilder stringBuilder = new StringBuilder();
        new CommandLineApp(stringBuilder, cmd).extractTables(cmd);
        String dataString = stringBuilder.toString();
        System.out.println("打印返回数据:  "+ dataString);

        List<Entry> listInfo = parseEntries(dataString);
        listInfo.forEach(System.out::println);
    }

    /**
     * Converts tabula's JSON output into entries, skipping header rows, rows that are
     * too short to map, and rows whose {@code num} cell is blank.
     *
     * <p>Blank rows are filtered while building the list. Removing them afterwards from
     * inside an ascending index loop skips the element that slides into the freed slot,
     * so consecutive blank rows would survive.
     *
     * @param dataString JSON text produced by tabula; may be blank
     * @return the parsed entries in document order, never {@code null}
     */
    private static List<Entry> parseEntries(String dataString) {
        List<Entry> entries = new ArrayList<>();
        if (StringUtils.isBlank(dataString)) {
            return entries;
        }
        JSONArray tables = JSONArray.parseArray(dataString);
        if (tables == null) {
            return entries;
        }
        for (int t = 0; t < tables.size(); t++) {
            JSONObject table = tables.getJSONObject(t);
            JSONArray rows = table == null ? null : table.getJSONArray("data");
            if (rows == null) {
                continue;
            }
            for (int r = 0; r < rows.size(); r++) {
                JSONArray cells = rows.getJSONArray(r);
                // Guard the fixed cell indexes used below against short rows.
                if (cells == null || cells.size() < REQUIRED_CELLS) {
                    continue;
                }
                // A first cell containing the header marker identifies the header row.
                if (cellText(cells, 0).contains("序号")) {
                    continue;
                }
                String num = cellText(cells, 1);
                if (StringUtils.isBlank(num)) {
                    continue;
                }
                Entry info = new Entry();
                info.setNum(num);
                info.setData1(cellText(cells, 2));
                info.setData2(cellText(cells, 3));
                info.setData3(cellText(cells, 4));
                info.setData4(cellText(cells, 5));
                entries.add(info);
            }
        }
        return entries;
    }

    /**
     * Returns the text of one cell with carriage returns removed.
     *
     * @param cells row to read from
     * @param index position of the cell inside {@code cells}
     * @return the cleaned cell text, or an empty string when the cell or its
     *         {@code "text"} value is missing
     */
    private static String cellText(JSONArray cells, int index) {
        JSONObject cell = cells.getJSONObject(index);
        String text = cell == null ? null : cell.getString("text");
        // Literal replacement; no regex is needed to strip a single character.
        return text == null ? "" : text.replace("\r", "");
    }
}

文件内容

在这里插入图片描述

输出结果

[main] INFO org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB - To get higher rendering speed on JDK8 or later,
[main] INFO org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB -   use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
[main] INFO org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB -   or call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")
打印返回数据:  [{"extraction_method":"lattice","top":87.67198,"left":84.46285,"width":426.4116516113281,"height":110.67039489746094,"right":510.8745,"bottom":198.34238,"data":[[{"top":87.67198,"left":84.46285,"width":71.13715362548828,"height":16.268020629882812,"text":"序号"},{"top":87.67198,"left":155.6,"width":71.00001525878906,"height":16.268020629882812,"text":"测试"},{"top":87.67198,"left":226.60002,"width":70.99998474121094,"height":16.268020629882812,"text":"内容 1"},{"top":87.67198,"left":297.6,"width":71.0,"height":16.268020629882812,"text":"内容 2"},{"top":87.67198,"left":368.6,"width":71.04998779296875,"height":16.268020629882812,"text":"内容三"},{"top":87.67198,"left":439.65,"width":71.22451782226562,"height":16.268020629882812,"text":"内容 4"}],[{"top":103.94,"left":84.46285,"width":71.13715362548828,"height":47.29998779296875,"text":"1"},{"top":103.94,"left":155.6,"width":71.00001525878906,"height":47.29998779296875,"text":"123\rasas\r789"},{"top":103.94,"left":226.60002,"width":70.99998474121094,"height":47.29998779296875,"text":"qwe\rqwe\rqwe"},{"top":103.94,"left":297.6,"width":71.0,"height":47.29998779296875,"text":"asd\rasd\rasd"},{"top":103.94,"left":368.6,"width":71.04998779296875,"height":47.29998779296875,"text":"dfg\rdfg\rdfg"},{"top":103.94,"left":439.65,"width":71.22451782226562,"height":47.29998779296875,"text":"123\r456\r789"}],[{"top":151.23999,"left":84.46285,"width":71.13715362548828,"height":47.102386474609375,"text":"2"},{"top":151.23999,"left":155.6,"width":71.00001525878906,"height":47.102386474609375,"text":"qwfqfw\rqwfq\rqwqw"},{"top":151.23999,"left":226.60002,"width":70.99998474121094,"height":47.102386474609375,"text":"sassaas\rsaas\rsasas"},{"top":151.23999,"left":297.6,"width":71.0,"height":47.102386474609375,"text":"zxc\rzxc\rzxc"},{"top":151.23999,"left":368.6,"width":71.04998779296875,"height":47.102386474609375,"text":"cvb\rcvb\rcvb"},{"top":151.23999,"left":439.65,"width":71.22451782226562,"height":47.102386474609375,"text":"rty\rrty\rrt
y"}]]}]
Entry{num='123asas789', data1='qweqweqwe', data2='asdasdasd', data3='dfgdfgdfg', data4='123456789'}
Entry{num='qwfqfwqwfqqwqw', data1='sassaassaassasas', data2='zxczxczxc', data3='cvbcvbcvb', data4='rtyrtyrty'}

Process finished with exit code 0

参考文章

参考链接

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值