使用 org.apache.pdfbox 2.x 将 PDF 转为 Doc、图片和字符串

场景:

本来打算做一个 PDF 简历解析功能,但搜索后发现这类需求一般要借助 Python 的语义分析或人工智能技术,所以退而求其次,做了一个 PDF 转 doc 的副产品。需要说明的是,这里的“转 doc”只是提取 PDF 中的纯文本并以 .doc 为扩展名输出,不会保留原有的排版和图片。

注意:本代码基于 pdfbox 2.x 版本,在 3.x 上无法直接使用(例如 3.x 中加载文档改用 Loader.loadPDF,不再提供 PDDocument.load)。pdfbox 的 API 一直在变动,网上搜到的代码常常因为版本不符而无法运行,最好参照与所用版本对应的文档和示例。

代码

依赖

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.22</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.3.5</version>
        </dependency>

 

工具类

package com.pdftoword.demo.utils;

import cn.hutool.core.io.IoUtil;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import javax.imageio.ImageIO;
import javax.servlet.http.HttpServletResponse;
import java.awt.image.BufferedImage;
import java.io.*;
import java.net.URLEncoder;


/**
 * Converts uploaded PDF files into plain text, a text-only {@code .doc}
 * download, or PNG page renderings, using the pdfbox 2.x API.
 *
 * <p>The class keeps no per-request state; every conversion works purely on
 * the streams of the request it was given, and nothing is written to the
 * local file system.
 */
@Service
public class PdfUtils {

    /** Fully qualified to avoid touching the file's import block. */
    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(PdfUtils.class.getName());

    /** Content type sent with the {@code .doc} download. */
    private static final String DOC_CONTENT_TYPE = "application/msword;charset=utf-8";

    /** Content type sent with the {@code .png} download. */
    private static final String PNG_CONTENT_TYPE = "image/png";

    /**
     * Extracts the text of the uploaded PDF and sends it to the client as a
     * UTF-8 encoded attachment with a {@code .doc} file name.
     *
     * <p>The text is extracted completely into memory before any header is
     * set, so a PDF that cannot be parsed never produces a half-written
     * download. On an I/O failure the error is logged and, if the response is
     * not yet committed, the status is set to 500.
     *
     * @param file     the uploaded PDF
     * @param response the servlet response that receives the attachment
     */
    public void convertWord(MultipartFile file,HttpServletResponse response){
        try {
            byte[] textBytes;
            try (InputStream in = file.getInputStream();
                 PDDocument doc = PDDocument.load(in);
                 ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                 Writer writer = new OutputStreamWriter(buffer, "UTF-8")) {
                newStripper(doc).writeText(doc, writer);
                // Push any characters still held by the encoder into the buffer.
                writer.flush();
                textBytes = buffer.toByteArray();
            }
            setAttachmentHeaders(response, DOC_CONTENT_TYPE, "文件", ".doc");
            // The 'true' flag closes the response stream once the bytes are written.
            IoUtil.write(response.getOutputStream(), true, textBytes);
        } catch (IOException e) {
            reportFailure(response, "Failed to convert PDF to doc", e);
        }
    }

    /**
     * Extracts the text of the uploaded PDF as a single string in which lines
     * are joined by {@code ","} instead of the platform line separator.
     *
     * @param file the uploaded PDF
     * @return the extracted text, or {@code null} if the PDF could not be
     *         read (the failure is logged)
     */
    public String convertText(MultipartFile file){
        String text = null;
        try (InputStream in = file.getInputStream();
             PDDocument doc = PDDocument.load(in)) {
            PDFTextStripper stripper = newStripper(doc);
            stripper.setLineSeparator(",");
            text = stripper.getText(doc);
        } catch (IOException e) {
            LOG.log(java.util.logging.Level.SEVERE, "Failed to extract text from PDF", e);
        }
        return text;
    }

    /**
     * Renders every page of the uploaded PDF as a PNG and writes the encoded
     * pages, in page order, to the response as an attachment.
     *
     * <p>NOTE(review): each page is encoded as a separate PNG appended to the
     * same response body. A client expecting one image will presumably decode
     * only the first page; confirm whether multi-page PDFs should instead be
     * zipped, stitched, or limited to a requested page.
     *
     * <p>On an I/O failure the error is logged and, if the response is not
     * yet committed, the status is set to 500.
     *
     * @param file     the uploaded PDF
     * @param response the servlet response that receives the image data
     */
    public void toImage(MultipartFile file, HttpServletResponse response){
        try (InputStream in = file.getInputStream();
             PDDocument doc = PDDocument.load(in)) {
            setAttachmentHeaders(response, PNG_CONTENT_TYPE, "图片", ".png");
            PDFRenderer renderer = new PDFRenderer(doc);
            OutputStream out = response.getOutputStream();
            int pageCount = doc.getNumberOfPages();
            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
                BufferedImage pageImage = renderer.renderImage(pageIndex);
                ImageIO.write(pageImage, "PNG", out);
            }
        } catch (IOException e) {
            reportFailure(response, "Failed to render PDF pages to PNG", e);
        }
    }

    /** Creates a position-sorted text stripper covering every page of {@code doc}. */
    private static PDFTextStripper newStripper(PDDocument doc) throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        stripper.setStartPage(1);
        stripper.setEndPage(doc.getNumberOfPages());
        return stripper;
    }

    /** Sets the content type and an attachment disposition with a URL-encoded file name. */
    private static void setAttachmentHeaders(HttpServletResponse response, String contentType,
                                             String baseName, String extension) throws IOException {
        response.setContentType(contentType);
        String encodedName = URLEncoder.encode(baseName, "UTF-8");
        response.setHeader("Access-Control-Expose-Headers", "Content-Disposition");
        response.setHeader("Content-Disposition", "attachment;filename=" + encodedName + extension);
    }

    /** Logs the failure and, when nothing has been sent yet, answers with HTTP 500. */
    private static void reportFailure(HttpServletResponse response, String message, IOException cause) {
        LOG.log(java.util.logging.Level.SEVERE, message, cause);
        if (!response.isCommitted()) {
            // Drop the attachment headers so the client does not save an empty file.
            response.reset();
            response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
        }
    }
}

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值