PDF文件内容读取_pdfreader maven-CSDN博客

本文链接：https://blog.csdn.net/hellokitty136/article/details/122671135

PDF文件内容读取

一、PDF表格读取

使用 tabula 读取 PDF 中表格的内容及其坐标，并使用 OpenPDF 读取签名域的位置。

1、maven依赖

 <!-- openPdf：用于读取 PDF 中的签名域 -->
        <dependency>
            <groupId>com.github.librepdf</groupId>
            <artifactId>openpdf</artifactId>
            <version>1.3.26</version>
        </dependency>
        <dependency>
            <groupId>technology.tabula</groupId>
            <artifactId>tabula</artifactId>
            <version>1.0.3</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-simple</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>

2、代码

import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.AcroFields;
import com.lowagie.text.pdf.PdfReader;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.ParseException;
import technology.tabula.CommandLineApp;

/**
     * Downloads the resource at the given URL into a new local file and returns its path.
     *
     * <p>The response body is read fully into memory via {@code readInputStream} and then
     * written to {@code savePath + File.separator + <random UUID> + ".pdf"}. Any failure
     * (bad URL, network error, write error) is logged and reported as {@code null}.
     *
     * @param path URL of the remote file
     * @return absolute path of the downloaded copy, or {@code null} if the download or the
     *         write failed
     * @throws IOException kept in the signature for existing callers; failures inside this
     *         method are logged and surface as a {@code null} return instead
     */
    private String getLocalPath(String path) throws IOException {
        HttpURLConnection conn = null;
        File file = null;
        try {
            URL url = new URL(path);
            conn = (HttpURLConnection) url.openConnection();
            // Connect timeout.
            conn.setConnectTimeout(5 * 1000);
            // Read timeout: without it a server that stops sending blocks this thread forever.
            conn.setReadTimeout(30 * 1000);
            // Send a browser-like User-Agent so servers that block crawlers do not answer 403.
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

            byte[] getData;
            // try-with-resources closes each stream independently, so a failing close on one
            // can no longer leak the other.
            try (InputStream inputStream = conn.getInputStream()) {
                getData = readInputStream(inputStream);
            }

            file = new File(savePath + File.separator + UUID.randomUUID() + ".pdf");
            try (FileOutputStream fos = new FileOutputStream(file)) {
                fos.write(getData);
            }
            return file.getAbsolutePath();
        } catch (Exception e) {
            log.error("CaseFileSignatureVerifyTask-文件下载失败", e);
            // Do not leave a partially written file behind when the write or close failed.
            if (file != null && file.exists() && !file.delete()) {
                file.deleteOnExit();
            }
            return null;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

/**
 * Reads the given stream until end-of-stream and returns everything read as a byte array.
 * The stream itself is not closed here.
 *
 * @param inputStream stream to drain
 * @return all bytes read from the stream
 * @throws IOException if reading from the stream fails
 */
private byte[] readInputStream(InputStream inputStream) throws IOException {
        try (ByteArrayOutputStream collected = new ByteArrayOutputStream()) {
            byte[] chunk = new byte[1024];
            for (int read = inputStream.read(chunk); read != -1; read = inputStream.read(chunk)) {
                collected.write(chunk, 0, read);
            }
            return collected.toByteArray();
        }
    }



/**
 * Extracts the tables of a PDF with tabula and returns the vertical extent of the
 * recipient-signature cell.
 *
 * <p>Per the original author's note, tabula opens its input through {@code new File(...)},
 * so the PDF is first downloaded to a local file via {@code getLocalPath}; that file is
 * removed again before this method returns.
 *
 * @param pdf location of the PDF, passed to {@code getLocalPath}
 * @return {@code {top, top + height}} of the first cell whose text contains
 *         "收件人签名" or "签名或盖章", or {@code null} if no such cell is found
 * @throws ParseException if tabula's command line cannot be parsed
 * @throws IOException    declared for I/O failures in the called helpers
 */
private double[] getRecipientTableCoordinate(String pdf) throws ParseException, IOException {
        String localPath = this.getLocalPath(pdf);
        if (StringUtil.isBlank(localPath)) {
            throw BizException.serverError("文件路径获取失败");
        }
        try {
            String[] args = new String[]{"-f=JSON", "-p=all", localPath};
            CommandLineParser parser = new DefaultParser();
            CommandLine cmd = parser.parse(CommandLineApp.buildOptions(), args);
            StringBuilder stringBuilder = new StringBuilder();
            new CommandLineApp(stringBuilder, cmd).extractTables(cmd);

            // Nothing usable was produced (e.g. no output at all): there is no cell to find.
            if (stringBuilder.length() < 2) {
                return null;
            }
            // Drop the first and last character (presumably the enclosing JSON array brackets).
            // NOTE(review): this only leaves a single JSON object when exactly one table was
            // extracted; with several tables the remainder is "{...},{...}" — confirm how
            // JsonUtil.jsonToMapObject behaves in that case.
            String tableJson = stringBuilder.substring(1, stringBuilder.length() - 1);
            if (tableJson.trim().isEmpty()) {
                return null;
            }
            Map<String, Object> stringStringMap = JsonUtil.jsonToMapObject(tableJson);
            Object rawData = stringStringMap == null ? null : stringStringMap.get("data");
            if (!(rawData instanceof List)) {
                return null;
            }
            // Shape taken from how the original code consumed it: rows of cells, each cell a map.
            @SuppressWarnings("unchecked")
            List<List<Map<String, Object>>> data = (List<List<Map<String, Object>>>) rawData;
            for (List<Map<String, Object>> list : data) {
                if (list == null) {
                    continue;
                }
                for (Map<String, Object> map : list) {
                    if (map == null) {
                        continue;
                    }
                    Object text = map.get("text");
                    if (!(text instanceof String) || StringUtils.isBlank((String) text)) {
                        continue;
                    }
                    String cellText = (String) text;
                    if (cellText.contains("收件人签名") || cellText.contains("签名或盖章")) {
                        Object top = map.get("top");
                        Object height = map.get("height");
                        // Go through Number: a JSON parser may hand back Integer or BigDecimal
                        // for whole values, which a direct (double) cast would reject.
                        if (!(top instanceof Number) || !(height instanceof Number)) {
                            continue;
                        }
                        double[] coordinate = new double[2];
                        coordinate[0] = ((Number) top).doubleValue();
                        coordinate[1] = coordinate[0] + ((Number) height).doubleValue();
                        return coordinate;
                    }
                }
            }
        } finally {
            // Always remove the temporary download; fall back to delete-on-exit if the
            // immediate delete fails.
            File file = new File(localPath);
            if (file.exists() && !file.delete()) {
                file.deleteOnExit();
            }
        }
        return null;
    }

 /**
     * Reads the signed signature fields of a PDF with OpenPDF and derives the vertical
     * position of each field measured from the top of its page.
     *
     * <p>NOTE(review): as written this method always returns {@code false}; the computed
     * coordinates are not compared against anything. It presumably was meant to be matched
     * against the recipient-signature cell range — confirm the intended check.
     *
     * @param pdf location of the PDF, passed to the {@code PdfReader} constructor
     * @return currently always {@code false}
     * @throws Exception if the PDF cannot be opened or read
     */
    private boolean hasRecipientSigned(String pdf) throws Exception {
        if (StringUtils.isBlank(pdf)) {
            return false;
        }
        try (PdfReader reader = new PdfReader(pdf)) {
            AcroFields fields = reader.getAcroFields();

            List<String> signatures = fields.getSignedFieldNames();

            System.out.println("签名数目: " + signatures.size());
            for (String signature : signatures) {
                float[] fieldPositions = fields.getFieldPositions(signature);
                // getFieldPositions is documented to return groups of five floats
                // (page, llx, lly, urx, ury) or null; skip fields without a full group
                // instead of failing on the index access below.
                if (fieldPositions == null || fieldPositions.length < 5) {
                    continue;
                }
                // Only the first group (first widget of the field) is inspected.
                Rectangle pageSize = reader.getPageSize((int) fieldPositions[0]);
                float height = pageSize.getHeight();
                // Flip the y axis: subtracting from the page height gives the distance
                // from the top edge of the page.
                float bottomY = height - fieldPositions[2];
                float topY = height - fieldPositions[4];
                // NOTE(review): bottomY / topY are computed but not used yet.
            }
        }
        return false;
    }

二、PDF内容读取

读取普通 PDF 的文本内容（使用 iText 5 的 PdfTextExtractor 逐页提取）。

1、maven依赖

<dependency>
	    <groupId>com.itextpdf</groupId>
	    <artifactId>itextpdf</artifactId>
	    <version>5.5.11</version>
	 </dependency>
	
	 <dependency>
	    <groupId>com.itextpdf</groupId>
	    <artifactId>itext-asian</artifactId>
	    <version>5.2.0</version>
	 </dependency>

2、代码

 /**
     * Extracts the text of every page of a PDF and prints it to standard output as a single
     * line, prefixed with "pageContent:" and with all line feeds removed.
     *
     * <p>Presumably uses the iText 5 {@code PdfReader} / {@code PdfTextExtractor} from the
     * dependency listed for this section — confirm against the file's imports.
     *
     * @param pdf location of the PDF, passed to the {@code PdfReader} constructor
     * @throws IOException if the PDF cannot be opened or a page cannot be read
     */
 private static void extract(String pdf) throws IOException {
        // A PdfReader can be created from a file path (as here) or from an input stream.
        PdfReader reader = new PdfReader(pdf);
        try {
            int pageNum = reader.getNumberOfPages();
            StringBuilder pageContent = new StringBuilder();
            // Page numbers start at 1.
            for (int i = 1; i <= pageNum; i++) {
                pageContent.append(PdfTextExtractor.getTextFromPage(reader, i));
            }
            // Full text of the document.
            System.out.println("pageContent:" + pageContent.toString().replace("\n", ""));
        } finally {
            // Release the underlying file handle even when extraction fails.
            reader.close();
        }
    }