【Java】使用 POI + PDFBox 提取 Office、PDF 文件的文本内容

引入 Maven 依赖

<!-- poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.16</version>
</dependency>

<!-- pdf -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>

提取内容:根据文件扩展名选择对应的解析方式(txt、Word、Excel、PowerPoint、PDF),返回提取到的纯文本。

/**
 * Extracts the plain text of a file, choosing the parser from the file extension.
 *
 * <p>Supported extensions (matched case-insensitively): {@code .txt}, {@code .doc}/{@code .docx},
 * {@code .xls}/{@code .xlsx}, {@code .ppt}/{@code .pptx} and {@code .pdf}. For Office files the
 * legacy binary parser is tried first and the OOXML parser second, so a file whose extension
 * does not match its real format is still read.
 *
 * <p>This is a best-effort method: I/O and parse failures are printed to standard error and
 * result in an empty (or partial) string rather than an exception.
 *
 * @param file the file to read; must not be {@code null}
 * @return the extracted text, or an empty string when the extension is unsupported or the file
 *     could not be parsed
 */
private static String read(File file) {
    StringBuilder builder = new StringBuilder();
    // Lower-case once so that "REPORT.PDF" is handled exactly like "report.pdf".
    String name = file.getName().toLowerCase(java.util.Locale.ROOT);

    if (name.endsWith(".txt")) {
        builder.append(readPlainText(file));
    } else if (name.endsWith(".pdf")) {
        builder.append(readPdf(file));
    } else {
        POITextExtractor extractor = null;
        if (name.endsWith(".doc") || name.endsWith(".docx")) {
            extractor = openWordExtractor(file);
        } else if (name.endsWith(".xls") || name.endsWith(".xlsx")) {
            extractor = openExcelExtractor(file);
        } else if (name.endsWith(".ppt") || name.endsWith(".pptx")) {
            extractor = openSlideExtractor(file);
        }
        if (extractor != null) {
            try {
                builder.append(extractor.getText());
            } finally {
                // Release the extractor even when getText() fails part-way through.
                try {
                    extractor.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if releasing the extractor fails.
                }
            }
        }
    }
    return builder.toString();
}

/** Reads a whole text file and decodes it with the platform default charset. */
private static String readPlainText(File file) {
    try {
        // Decode the complete byte array in one step: decoding fixed-size chunks separately
        // corrupts any multi-byte character that straddles a chunk boundary.
        byte[] content = java.nio.file.Files.readAllBytes(file.toPath());
        return new String(content, java.nio.charset.Charset.defaultCharset());
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }
}

/** Extracts the text of a PDF, closing the document even if text stripping fails. */
private static String readPdf(File file) {
    try (PDDocument document = PDDocument.load(file)) {
        return new PDFTextStripper().getText(document);
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }
}

/** Opens a Word extractor (binary .doc first, then .docx); returns null if both fail. */
private static POITextExtractor openWordExtractor(File file) {
    try (FileInputStream in = new FileInputStream(file)) {
        return new WordExtractor(new HWPFDocument(in));
    } catch (Exception legacyFailure) {
        try (FileInputStream in = new FileInputStream(file)) {
            return new XWPFWordExtractor(new XWPFDocument(in));
        } catch (Exception ooxmlFailure) {
            reportUnreadable(legacyFailure, ooxmlFailure);
            return null;
        }
    }
}

/** Opens an Excel extractor (binary .xls first, then .xlsx); returns null if both fail. */
private static POITextExtractor openExcelExtractor(File file) {
    try {
        POIFSFileSystem fileSystem = new POIFSFileSystem(file);
        try {
            return new ExcelExtractor(new HSSFWorkbook(fileSystem));
        } catch (Exception e) {
            // The workbook never took ownership, so release the open file here.
            fileSystem.close();
            throw e;
        }
    } catch (Exception legacyFailure) {
        try {
            return new XSSFExcelExtractor(new XSSFWorkbook(file));
        } catch (Exception ooxmlFailure) {
            reportUnreadable(legacyFailure, ooxmlFailure);
            return null;
        }
    }
}

/** Opens a PowerPoint extractor (binary .ppt first, then .pptx); returns null if both fail. */
private static POITextExtractor openSlideExtractor(File file) {
    try (FileInputStream in = new FileInputStream(file)) {
        return new PowerPointExtractor(in);
    } catch (Exception legacyFailure) {
        try {
            OPCPackage pkg = OPCPackage.open(file);
            try {
                return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
            } catch (Exception e) {
                // revert() closes the package without writing anything back to the file.
                pkg.revert();
                throw e;
            }
        } catch (Exception ooxmlFailure) {
            reportUnreadable(legacyFailure, ooxmlFailure);
            return null;
        }
    }
}

/** Prints the final parse failure, attaching the first attempt's failure as suppressed. */
private static void reportUnreadable(Exception legacyFailure, Exception ooxmlFailure) {
    ooxmlFailure.addSuppressed(legacyFailure);
    ooxmlFailure.printStackTrace();
}

 

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值