PDF文件解析

最新推荐文章于 2024-02-24 18:16:10 发布

快乐的CRUD

最新推荐文章于 2024-02-24 18:16:10 发布

阅读量114

点赞数

文章标签： pdf java 服务器

本文链接：https://blog.csdn.net/yuanyuan1758/article/details/134574892

版权

1.表格类的PDF文件，可以通过tabula解析

maven 依赖

<!-- tabula: the library used by the table-extraction snippet below -->
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.3</version>
<exclusions>
<!-- exclude org.slf4j:slf4j-simple from the dependencies pulled in through tabula -->
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>

片段代码

/**
 * Extracts the tables of an in-memory PDF with tabula and returns them as text.
 *
 * <p>tabula is configured through its command-line options: {@code -f=JSON},
 * {@code -p=all} and {@code -l}. The PDF bytes are handed straight to
 * {@link CommandLineAppExt#extractFileTables(byte[])}; they are not encoded into the
 * argument array, which avoids holding an extra Base64 copy of the whole document.
 *
 * <p>NOTE(review): this assumes the {@code CommandLineApp} constructor only reads
 * options and does not require the positional file argument - confirm against the
 * tabula version in use.
 *
 * @param bytes raw content of the PDF file; {@code null} or empty yields an empty string
 * @return whatever tabula wrote to the output buffer; empty (or partial) when
 *         extraction failed with a {@code ParseException}
 */
public String getContent(byte[] bytes) {
    StringBuilder sb = new StringBuilder();
    if (bytes == null || bytes.length == 0) {
        // Nothing to parse: same result as a failed extraction, without the stack trace.
        return sb.toString();
    }
    String[] args = new String[]{"-f=JSON", "-p=all", "-l"};
    CommandLineParser parser = new DefaultParser();
    try {
        CommandLine cmd = parser.parse(CommandLineApp.buildOptions(), args);
        CommandLineAppExt commandLineApp = new CommandLineAppExt(sb, cmd);
        commandLineApp.extractFileTables(bytes);
    } catch (ParseException e) {
        // Best effort is kept on purpose: callers receive what was written so far.
        // TODO(review): route this through the project's logger instead of stderr.
        e.printStackTrace();
    }
    return sb.toString();
}

/**
 * Variant of tabula's {@code CommandLineApp} that reads the PDF from a {@code byte[]}
 * held in memory instead of from a file.
 *
 * <p>NOTE(review): the methods below use {@code password}, {@code pageAreas},
 * {@code tableExtractor}, {@code defaultOutput}, {@code getPageIterator(...)} and
 * {@code writeTables(...)} of the superclass. Verify that these members are accessible
 * from a subclass in the tabula version in use.
 */
public class CommandLineAppExt extends CommandLineApp {

    /**
     * Creates the extractor by delegating to the superclass constructor.
     *
     * <p>NOTE(review): assumes {@code CommandLineApp} declares a matching
     * {@code (Appendable, CommandLine)} constructor - confirm.
     *
     * @param defaultOutput sink that receives the extracted tables
     * @param line          parsed command-line options that configure the extraction
     * @throws ParseException if the superclass rejects the supplied options
     */
    public CommandLineAppExt(Appendable defaultOutput, CommandLine line) throws ParseException {
        super(defaultOutput, line);
    }

    /**
     * Extracts tables from the PDF passed as the single positional argument.
     *
     * <p>Unlike a file name, the argument is expected to be the Base64-encoded content
     * of the PDF.
     *
     * @param line parsed command line; must carry exactly one positional argument
     * @throws ParseException if the argument count is not one, or the PDF cannot be read
     */
    @Override
    public void extractTables(CommandLine line) throws ParseException {
        if (line.getArgs().length != 1) {
            throw new ParseException("Need exactly one filename\nTry --help for help");
        }
        byte[] decode = Base64.decodeBase64(line.getArgs()[0]);
        this.extractFileTables(decode);
    }

    /**
     * Extracts tables from the given PDF content and writes them to the default output.
     *
     * @param bytes raw content of the PDF file
     * @throws ParseException if the PDF cannot be read or the result cannot be written
     */
    public void extractFileTables(byte[] bytes) throws ParseException {
        this.extractFile(bytes, this.defaultOutput);
    }

    /**
     * Loads the PDF, collects the tables of every selected page and writes them out.
     *
     * @param bytes   raw content of the PDF file
     * @param outFile sink that receives the serialized tables
     * @throws ParseException wrapping any {@code IOException} raised while loading,
     *                        extracting or writing; the original exception is kept as cause
     */
    private void extractFile(byte[] bytes, Appendable outFile) throws ParseException {
        PDDocument pdfDocument = null;

        try {
            pdfDocument = this.password == null ? PDDocument.load(bytes) : PDDocument.load(bytes, this.password);
            PageIterator pageIterator = this.getPageIterator(pdfDocument);
            // Raw type kept: the element type returned by the extractor is not named in this file.
            ArrayList tables = new ArrayList();

            while (pageIterator.hasNext()) {
                Page page = pageIterator.next();
                if (this.pageAreas == null) {
                    // No area restriction: extract from the whole page.
                    tables.addAll(this.tableExtractor.extractTables(page));
                    continue;
                }
                for (Pair<Integer, Rectangle> areaPair : this.pageAreas) {
                    Rectangle area = areaPair.getRight();
                    if (areaPair.getLeft() == 0) {
                        // Mode 0: the coordinates are percentages, so scale them by the page size.
                        area = new Rectangle((float) ((double) (area.getTop() / 100.0F) * page.getHeight()), (float) ((double) (area.getLeft() / 100.0F) * page.getWidth()), (float) (area.getWidth() / 100.0D * page.getWidth()), (float) (area.getHeight() / 100.0D * page.getHeight()));
                    }
                    tables.addAll(this.tableExtractor.extractTables(page.getArea(area)));
                }
            }

            this.writeTables(tables, outFile);
        } catch (IOException e) {
            // ParseException only takes a message, so attach the cause explicitly.
            ParseException wrapped = new ParseException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        } finally {
            try {
                if (pdfDocument != null) {
                    pdfDocument.close();
                }
            } catch (IOException e) {
                // A failed close must not hide the extraction result; report on stderr.
                System.err.println("Error in closing pdf document" + e);
            }
        }
    }

}

说明：CommandLineApp 类本身只支持传入 File 类型的文件，不支持传入 byte[] 类型的数据；因此这里让 CommandLineAppExt 类继承 CommandLineApp 类并做了改造，使其可以直接接收 byte[] 类型的参数。

2. 如果 PDF 文件不是表格，而是平铺的（非表格的）普通文本文档，可以用 spire 解析

maven 依赖

<!-- Spire.PDF (artifact spire.pdf.free): used by the text-extraction snippet below -->
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf.free</artifactId>
<version>3.9.0</version>
</dependency>

<!-- commons-cli: presumably supplies CommandLine / DefaultParser / ParseException
     used by the tabula snippet in section 1 - confirm -->
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>

片段代码

/**
 * Extracts the text of every page of an in-memory PDF and concatenates it.
 *
 * <p>The document is closed even when text extraction fails, so a broken page does
 * not leave the loaded document open.
 *
 * @param bytes raw content of the PDF file
 * @return the text of all pages, appended in page order
 */
public String getContent(byte[] bytes) {
    PdfDocument doc = new PdfDocument();
    doc.loadFromBytes(bytes);
    try {
        // StringBuilder is enough here: the buffer never leaves this method.
        StringBuilder sb = new StringBuilder();
        int pageCount = doc.getPages().getCount();
        for (int i = 0; i < pageCount; i++) {
            // Fetch the page object for each page.
            PdfPageBase page = doc.getPages().get(i);
            sb.append(page.extractText(true));
        }
        return sb.toString();
    } finally {
        doc.close();
    }
}