场景:
本来打算做一个PDF简历解析的功能,但是百度搜索了一下,发现这类需求一般都需要借助Python语义分析或者人工智能一类的技术,所以先做了一个PDF转DOC的副产品。
注意:本代码基于pdfbox 2.x版本,对于新出的3.x版本应该是无效的。pdfbox的API一直在变动更新,网上搜罗到的代码很多时候都不生效,最好还是找与自己所用版本对应的代码。
代码
依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.22</version>
</dependency>
<!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.3.5</version>
</dependency>
工具类
package com.pdftoword.demo.utils;
import cn.hutool.core.io.IoUtil;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import javax.imageio.ImageIO;
import javax.servlet.http.HttpServletResponse;
import java.awt.image.BufferedImage;
import java.io.*;
import java.net.URLEncoder;
@Service
public class PdfUtils {

    /** MIME type sent with the generated {@code .doc} download. */
    private static final String DOC_CONTENT_TYPE = "application/msword;charset=utf-8";

    /** MIME type sent with the generated {@code .png} download. */
    private static final String PNG_CONTENT_TYPE = "image/png";

    /**
     * Converts an uploaded PDF to a {@code .doc} download.
     *
     * <p>The text of every page is extracted (sorted by position) and streamed to the
     * client as a UTF-8 encoded attachment. The text goes straight from memory to the
     * response; no intermediate file is written, so concurrent requests cannot
     * overwrite each other's output.
     *
     * @param file     the uploaded PDF
     * @param response the servlet response that receives the attachment
     * @throws UncheckedIOException if the PDF cannot be read or the response cannot be written
     */
    public void convertWord(MultipartFile file, HttpServletResponse response) {
        try {
            // Extract before touching the response so that an unreadable PDF fails
            // before any download header or body byte has been sent.
            byte[] body = extractText(file, null)
                    .getBytes(java.nio.charset.StandardCharsets.UTF_8);
            prepareDownload(response, DOC_CONTENT_TYPE, "文件", ".doc");
            OutputStream out = response.getOutputStream();
            out.write(body);
            out.flush();
        } catch (IOException e) {
            // A void method has no other way to report failure; swallowing the error
            // would hand the client an empty file that looks like a success.
            throw new UncheckedIOException("Failed to convert PDF to doc", e);
        }
    }

    /**
     * Extracts the text of an uploaded PDF as a single string.
     *
     * <p>Text is sorted by position and {@code ","} is used as the line separator.
     *
     * @param file the uploaded PDF
     * @return the extracted text, or {@code null} if the PDF could not be read
     */
    public String convertText(MultipartFile file) {
        try {
            return extractText(file, ",");
        } catch (IOException e) {
            // The null-on-failure contract is kept for existing callers.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Renders an uploaded PDF to PNG and sends it as a download.
     *
     * <p>NOTE(review): every page is encoded as its own PNG and the encodings are
     * written back-to-back into the same response. For a multi-page PDF the resulting
     * {@code .png} therefore holds several concatenated PNG streams, and image viewers
     * will probably show only the first page. Confirm whether callers need a single
     * stitched image or an archive instead.
     *
     * @param file     the uploaded PDF
     * @param response the servlet response that receives the attachment
     * @throws UncheckedIOException if the PDF cannot be read or the response cannot be written
     */
    public void toImage(MultipartFile file, HttpServletResponse response) {
        try (InputStream in = file.getInputStream();
             PDDocument doc = PDDocument.load(in)) {
            prepareDownload(response, PNG_CONTENT_TYPE, "图片", ".png");
            PDFRenderer renderer = new PDFRenderer(doc);
            OutputStream out = response.getOutputStream();
            int pageCount = doc.getNumberOfPages();
            for (int i = 0; i < pageCount; i++) {
                BufferedImage pageImage = renderer.renderImage(i);
                ImageIO.write(pageImage, "PNG", out);
            }
            out.flush();
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to convert PDF to image", e);
        }
    }

    /**
     * Loads the uploaded PDF and returns the text of all pages, sorted by position.
     * The upload stream and the document are closed before returning, also on failure.
     *
     * @param lineSeparator separator to place between lines, or {@code null} to keep
     *                      the stripper's default
     */
    private static String extractText(MultipartFile file, String lineSeparator)
            throws IOException {
        try (InputStream in = file.getInputStream();
             PDDocument doc = PDDocument.load(in)) {
            // The stripper's default page range already covers the whole document.
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            if (lineSeparator != null) {
                stripper.setLineSeparator(lineSeparator);
            }
            return stripper.getText(doc);
        }
    }

    /**
     * Sets the content type and attachment headers for a download named
     * {@code baseName + extension}; the base name is URL-encoded as UTF-8.
     */
    private static void prepareDownload(HttpServletResponse response, String contentType,
                                        String baseName, String extension)
            throws UnsupportedEncodingException {
        response.setContentType(contentType);
        // Lets cross-origin browser code read the Content-Disposition header.
        response.setHeader("Access-Control-Expose-Headers", "Content-Disposition");
        response.setHeader("Content-Disposition",
                "attachment;filename=" + URLEncoder.encode(baseName, "UTF-8") + extension);
    }
}