Java 读取 PDF 文件内容

方式一

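1、引入 Maven 依赖（注意：下文代码使用的 org.apache.pdfbox.Loader 是 PDFBox 3.x 才提供的类；若使用 2.0.x，请改用 PDDocument.load(inputStream)）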

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.25</version>
</dependency>

2、工具类代码

//import com.itextpdf.text.pdf.PdfReader;
//import com.itextpdf.text.pdf.parser.PdfTextExtractor;
//import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;

import com.google.common.collect.Lists;
import com.jiayou.peis.entity.ImageObject;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * PDF处理
 *
 */
public class PdfUtils {
    //    /**
//     * 使用itextpdf提取PDF文本(解析不靠谱)
//     *
//     * @param inputStream
//     * @return
//     * @throws IOException
//     */
//    @Deprecated
//    public static String toText(InputStream inputStream) throws IOException {
//        try {
//            StringBuilder buf = new StringBuilder();
//            PdfReader reader = new PdfReader(inputStream);
//            int pageNum = reader.getNumberOfPages();
//            for(int i=1;i<=pageNum;i++){
//                // 读取第i页的文档内容
//                buf.append(PdfTextExtractor.getTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
//            }
            return buf.toString();
//            return StrUtils.removeReturnChar(buf.toString());
//        } finally {
//            CloseUtils.closeQuietly(inputStream);
//        }
//    }
    public static String text(byte[] data) throws IOException {
        return PdfUtils.text(data, true);
    }
    public static String text(byte[] data, boolean sortByPosition) throws IOException {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static String text(File file, boolean sortByPosition) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    public static String text(File file) throws IOException {
        return PdfUtils.text(file, true);
    }
    public static String text(InputStream inputStream) throws IOException {
        return text(inputStream, true);
    }
    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static String text(InputStream inputStream, boolean sortByPosition) throws IOException {
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            PDFTextStripper textStripper = new PDFTextStripper();
            // Get total page count of the PDF document
            int numberOfPages = document.getNumberOfPages();
            //set the first page to be extracted
            textStripper.setStartPage(1);
            // set the last page to be extracted
            textStripper.setEndPage(numberOfPages);
            // 获取文本内容
            textStripper.setSortByPosition(sortByPosition);
            textStripper.setShouldSeparateByBeads(true);
            return StrUtils.removeReturnChar(textStripper.getText(document));
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(File file) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.images(inputStream);
    }

    public static List<ImageObject> images(byte[] data) throws IOException {
        ByteArrayInputStream inputStream = null;
        try {
            inputStream = new ByteArrayInputStream(data);
            return PdfUtils.images(inputStream);
        } finally {
            CloseUtils.closeQuietly(inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF图片列表
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(InputStream inputStream) throws IOException {
        List<ImageObject> imageList = Lists.newArrayList();
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            // get resources for a page
            PDResources pdResources = document.getPage(0).getResources();
            int i = 0;
            for (COSName csName : pdResources.getXObjectNames()) {
//                System.out.println(i+":"+csName);
                PDXObject pdxObject = pdResources.getXObject(csName);
                if (pdxObject instanceof PDImageXObject) {
//                    i++;
                    PDStream pdStream = pdxObject.getStream();
                    PDImageXObject image = new PDImageXObject(pdStream, pdResources);
                    String imageSuffix = imageSuffix(image);
                    // image storage location and image name
                    BufferedImage bufferedImage = image.getImage();
                    ImageObject object = new ImageObject();
                    object.setIndex(i++);
                    object.setImage(bufferedImage);
                    object.setSuffix(imageSuffix);
                    imageList.add(object);
                }
            }
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
        return imageList;
    }

    /**
     * 获取图片后缀
     *
     * @param pdImage
     * @return
     * @throws IOException
     */
    private static String imageSuffix(PDImageXObject pdImage) throws IOException {
        String suffix = pdImage.getSuffix();
        if (suffix == null || "jb2".equals(suffix)) {
            suffix = "png";
        } else if ("jpx".equals(suffix)) {
            // use jp2 suffix for file because jpx not known by windows
            suffix = "jp2";
        }

        if (hasMasks(pdImage)) {
            // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
            suffix = "png";
        }
        return suffix;
    }

    private static boolean hasMasks(PDImage pdImage) throws IOException {
        if (pdImage instanceof PDImageXObject) {
            PDImageXObject ximg = (PDImageXObject) pdImage;
            return ximg.getMask() != null || ximg.getSoftMask() != null;
        }
        return false;
    }

    /**
     * 保存图片到指定文件夹
     *
     * @param imageList
     * @param dir
     * @param prefixName
     * @throws IOException
     */
    public static void saveImage(List<ImageObject> imageList, String dir, String prefixName) throws IOException {
        File imgDir = new File(dir);
        FileUtils.forceMkdir(imgDir);
        for(ImageObject image:imageList){
            File imgFile = new File(dir, prefixName+"_"+image.getIndex()+"."+image.getSuffix());
            ImageIO.write(image.getImage(), image.getSuffix(), imgFile);
        }
    }
}

3、相关代码

CloseUtils

import java.io.Closeable;
import java.util.zip.ZipInputStream;

/**
 * Best-effort helpers for releasing I/O resources without propagating close failures.
 */
public class CloseUtils {

    /**
     * Closes every non-null resource in argument order. An exception thrown by one
     * {@code close()} call is ignored so the remaining resources are still closed.
     *
     * @param closeables resources to close; the array and its elements may be {@code null}
     */
    public static void closeQuietly(Closeable... closeables) {
        if (closeables == null) {
            return;
        }
        for (int i = 0; i < closeables.length; i++) {
            Closeable resource = closeables[i];
            if (resource == null) {
                continue;
            }
            try {
                resource.close();
            } catch (Exception ignored) {
                // Intentionally swallowed: closing is best-effort.
            }
        }
    }

    /**
     * Calls {@code closeEntry()} on every non-null zip stream in argument order, ignoring
     * any exception raised by an individual call.
     *
     * @param closeables zip streams whose current entry should be closed; the array and
     *                   its elements may be {@code null}
     */
    public static void closeEntryQuietly(ZipInputStream... closeables) {
        if (closeables == null) {
            return;
        }
        for (int i = 0; i < closeables.length; i++) {
            ZipInputStream zipStream = closeables[i];
            if (zipStream == null) {
                continue;
            }
            try {
                zipStream.closeEntry();
            } catch (Exception ignored) {
                // Intentionally swallowed: closing is best-effort.
            }
        }
    }
}

StrUtils

import cn.hutool.core.util.StrUtil;
import com.google.common.collect.Lists;
import org.apache.commons.lang3.StringUtils;

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

/**
 * String helpers for slicing text: line/token access and marker-delimited substrings.
 */
public class StrUtils {

    /**
     * Splits text into lines, treating each run of consecutive {@code \n} characters as a
     * single separator.
     *
     * @param text text to split
     * @return a new list of lines, or {@code null} when {@code text} is null or empty
     */
    public static List<String> lines(String text){
        if (text == null || text.isEmpty()) {
            return null;
        }
        return Lists.newArrayList(text.split("\n+"));
    }

    /**
     * Splits text on runs of whitespace and returns the token at the given position.
     *
     * @param text      text to split
     * @param lineIndex zero-based token index
     * @return the token, or {@code null} when {@code text} is null/empty or the index is
     *         negative or past the last token
     */
    public static String splitAt(String text, int lineIndex){
        if (text == null || text.isEmpty()) {
            return null;
        }
        return elementAt(text.split("\\s+"), lineIndex);
    }

    /**
     * Returns one line of the text, where lines are separated by {@code \n} or {@code \r\n}.
     *
     * @param text      text to split
     * @param lineIndex zero-based line index
     * @return the line, or {@code null} when {@code text} is null/empty or the index is
     *         negative or past the last line
     */
    public static String lineAt(String text, int lineIndex){
        if (text == null || text.isEmpty()) {
            return null;
        }
        return elementAt(text.split("\r?\n"), lineIndex);
    }

    /**
     * Extracts a substring using the strategy named by {@code subType}, with {@code null}
     * as the fallback value.
     *
     * @param text      text to search
     * @param startFlag start marker
     * @param closeFlag end marker
     * @param subType   strategy name, see
     *                  {@link #substringBetween(String, String, String, String, String)}
     * @return the extracted substring, or {@code null}
     */
    public static String substringBetween(String text, String startFlag, String closeFlag, String subType){
        return substringBetween(text, startFlag, closeFlag, subType, null);
    }

    /**
     * Extracts a substring using the strategy named by {@code subType}.
     *
     * <p>Supported names: {@code substringBetween}, {@code substringBetweenLast},
     * {@code substringBetweenLastBefore}, {@code substringBetweenBeforeLast} (all need both
     * markers); {@code substringAfter}, {@code substringAfterLast} (need {@code startFlag});
     * {@code substringBefore}, {@code substringBeforeLast} (need {@code closeFlag}).
     *
     * <p>{@code defaultVal} is returned when {@code text} is empty, the strategy name is
     * unknown, or a marker required by the strategy is empty. When the strategy runs, its
     * own result is returned unchanged, which may be {@code null} for the "between" variants.
     *
     * @param text       text to search
     * @param startFlag  start marker
     * @param closeFlag  end marker
     * @param subType    strategy name
     * @param defaultVal fallback value
     * @return the extracted substring or the fallback as described above
     */
    public static String substringBetween(String text, String startFlag, String closeFlag, String subType, String defaultVal){
        if (StringUtils.isEmpty(text) || subType == null) {
            return defaultVal;
        }
        boolean hasStart = StringUtils.isNotEmpty(startFlag);
        boolean hasClose = StringUtils.isNotEmpty(closeFlag);
        switch (subType) {
            case "substringBetween":
                return hasStart && hasClose
                        ? StrUtils.substringBetween(text, startFlag, closeFlag) : defaultVal;
            case "substringBetweenLast":
                return hasStart && hasClose
                        ? StrUtils.substringBetweenLast(text, startFlag, closeFlag) : defaultVal;
            case "substringBetweenLastBefore":
                return hasStart && hasClose
                        ? StrUtils.substringBetweenLastBefore(text, startFlag, closeFlag) : defaultVal;
            case "substringBetweenBeforeLast":
                return hasStart && hasClose
                        ? StrUtils.substringBetweenBeforeLast(text, startFlag, closeFlag) : defaultVal;
            case "substringAfter":
                return hasStart ? StringUtils.substringAfter(text, startFlag) : defaultVal;
            case "substringAfterLast":
                return hasStart ? StringUtils.substringAfterLast(text, startFlag) : defaultVal;
            case "substringBefore":
                return hasClose ? StringUtils.substringBefore(text, closeFlag) : defaultVal;
            case "substringBeforeLast":
                return hasClose ? StringUtils.substringBeforeLast(text, closeFlag) : defaultVal;
            default:
                return defaultVal;
        }
    }

    /**
     * Returns the text between the first occurrence of {@code startFlag} and the first
     * occurrence of {@code closeFlag}, both located from the beginning of the text.
     *
     * @param text      text to search
     * @param startFlag start marker (first occurrence from the front)
     * @param closeFlag end marker (first occurrence from the front)
     * @return the enclosed text, or {@code null} when an argument is null, the text is
     *         empty, a marker is missing, or the end marker does not lie at or after the
     *         end of the start marker
     */
    public static String substringBetween(String text, String startFlag, String closeFlag){
        if (text == null || text.isEmpty() || startFlag == null || closeFlag == null) {
            return null;
        }
        return slice(text, startFlag, text.indexOf(startFlag), text.indexOf(closeFlag));
    }

    /**
     * Returns everything after the first occurrence of {@code startFlag}.
     *
     * @param text      text to search
     * @param startFlag start marker (first occurrence from the front)
     * @return the trailing text, or {@code null} when an argument is null, the text is
     *         empty, or the marker is missing
     */
    public static String substringBetween(String text, String startFlag){
        if (text == null || text.isEmpty() || startFlag == null) {
            return null;
        }
        int startIndex = text.indexOf(startFlag);
        if (startIndex < 0) {
            return null;
        }
        return text.substring(startIndex + startFlag.length());
    }

    /**
     * Returns the text between the last occurrence of {@code startFlag} and the last
     * occurrence of {@code closeFlag}.
     *
     * @param text      text to search
     * @param startFlag start marker (last occurrence)
     * @param closeFlag end marker (last occurrence)
     * @return the enclosed text, or {@code null} when an argument is null, the text is
     *         empty, a marker is missing, or the end marker does not lie at or after the
     *         end of the start marker
     */
    public static String substringBetweenLast(String text, String startFlag, String closeFlag){
        if (text == null || text.isEmpty() || startFlag == null || closeFlag == null) {
            return null;
        }
        return slice(text, startFlag, text.lastIndexOf(startFlag), text.lastIndexOf(closeFlag));
    }

    /**
     * Returns the text between the last occurrence of {@code startFlag} and the first
     * occurrence of {@code closeFlag}.
     *
     * @param text      text to search
     * @param startFlag start marker (last occurrence)
     * @param closeFlag end marker (first occurrence from the front)
     * @return the enclosed text, or {@code null} when an argument is null, the text is
     *         empty, a marker is missing, or the end marker does not lie at or after the
     *         end of the start marker
     */
    public static String substringBetweenLastBefore(String text, String startFlag, String closeFlag){
        if (text == null || text.isEmpty() || startFlag == null || closeFlag == null) {
            return null;
        }
        return slice(text, startFlag, text.lastIndexOf(startFlag), text.indexOf(closeFlag));
    }

    /**
     * Returns the text between the first occurrence of {@code startFlag} and the last
     * occurrence of {@code closeFlag}.
     *
     * @param text      text to search
     * @param startFlag start marker (first occurrence from the front)
     * @param closeFlag end marker (last occurrence)
     * @return the enclosed text, or {@code null} when an argument is null, the text is
     *         empty, a marker is missing, or the end marker does not lie at or after the
     *         end of the start marker
     */
    public static String substringBetweenBeforeLast(String text, String startFlag, String closeFlag){
        if (text == null || text.isEmpty() || startFlag == null || closeFlag == null) {
            return null;
        }
        return slice(text, startFlag, text.indexOf(startFlag), text.lastIndexOf(closeFlag));
    }

    /**
     * Converts {@code null} to the empty string.
     *
     * @param val value to normalise
     * @return {@code val}, or {@code ""} when it is {@code null}
     */
    public static String nullToEmpty(String val){
        return val == null ? "" : val;
    }

    /**
     * Removes every carriage return ({@code \r}) character.
     *
     * @param val value to clean
     * @return the cleaned value, or {@code ""} when {@code val} is {@code null}
     */
    public static String removeReturnChar(String val){
        return val == null ? "" : val.replace("\r", "");
    }

    /**
     * Returns the last element that is not blank.
     *
     * @param array candidate values
     * @return the last non-blank element, or {@code null} when the array is null or has none
     */
    public static String getLastNotNullText(String[] array){
        if (array == null) {
            return null;
        }
        for (int i = array.length - 1; i >= 0; i--) {
            if (StrUtil.isNotBlank(array[i])) {
                return array[i];
            }
        }
        return null;
    }

    /**
     * Returns the first element that is not blank.
     *
     * @param array candidate values
     * @return the first non-blank element, or {@code null} when the array is null or has none
     */
    public static String getFirstNotNullText(String[] array){
        if (array == null) {
            return null;
        }
        for (String candidate : array) {
            if (StrUtil.isNotBlank(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /** Returns {@code parts[index]}, or {@code null} when the index is out of range. */
    private static String elementAt(String[] parts, int index) {
        return index >= 0 && index < parts.length ? parts[index] : null;
    }

    /**
     * Cuts the text between the end of the start marker and the start of the end marker.
     * Returns {@code null} when either index is negative or the end marker begins before
     * the start marker ends, which would otherwise make {@code substring} throw.
     */
    private static String slice(String text, String startFlag, int startIndex, int endIndex) {
        if (startIndex < 0 || endIndex < 0) {
            return null;
        }
        int from = startIndex + startFlag.length();
        if (endIndex < from) {
            return null;
        }
        return text.substring(from, endIndex);
    }

}

方式二

https://blog.csdn.net/ThinkPet/article/details/131256428

  • 1
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值