业务说明:由于业务相关要求,对接第三方PDF文件转JPG之后上传服务器,同时还进行ocr识别。
看了一下CSDN上面的一些帖子,大部分都无法处理非标准的PDF文件,比如用打印机扫描的PDF文件可能是歪着的,就会转换图片失败。
前提:使用Ghostscript转换。需要先安装gs软件,然后将gs可执行文件的路径(例如 D:/gs/bin/gs9.23/bin/gswin32c)写入配置文件的 gs.path 配置项中,供后面的代码调用。
依赖包:
<dependency>
<groupId>org.jodconverter</groupId>
<artifactId>jodconverter-core</artifactId>
<version>4.1.0</version>
</dependency>
导入:
import com.github.tobato.fastdfs.domain.StorePath;
import com.github.tobato.fastdfs.service.FastFileStorageClient;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfReader;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.concurrent.CountDownLatch;
1、核心方法 pdf2Image。logParam和caseOrderNo随便传值就好,id是业务上将图片上传服务器时使用的参数。
private static final String GS_TEMP_FOLDER_NAME = "gstemp"; // temporary directory that holds the generated pdf and image files
@Value("${gs.path}")
private String gsPath; // path of the gs executable, injected from the gs.path configuration property
/**
 * Converts a PDF supplied as raw bytes into JPG images, one per page.
 *
 * <p>The bytes are written to a uniquely named temporary file below the
 * {@code gstemp} folder, and that file is then handed to the file-based
 * overload, which performs the actual rendering.
 *
 * @param pdfBytes    content of the PDF document; must not be null or empty
 * @param logParam    free-form text appended to log messages for tracing
 * @param caseOrderNo business order number, passed through unchanged
 * @param id          business identifier, passed through unchanged
 * @return the list produced by the file-based overload (which may be
 *         {@code null} when that overload reports a Ghostscript failure)
 * @throws IllegalArgumentException if {@code pdfBytes} is null or empty
 * @throws Exception if the temporary file cannot be created or written,
 *         or if the conversion itself fails
 */
public List<UploadFileDto> pdf2Image(byte[] pdfBytes, String logParam,String caseOrderNo,String id) throws Exception {
    // Reject unusable input before any temporary file is created on disk.
    if (pdfBytes == null || pdfBytes.length == 0) {
        throw new IllegalArgumentException("pdfBytes must not be null or empty. logParam = " + logParam);
    }
    // 1. Create the temporary file; two random path segments keep concurrent conversions apart.
    String path = GS_TEMP_FOLDER_NAME + File.separator + UUID.randomUUID() + File.separator + UUID.randomUUID();
    File pdfFile = FileUtil.createFile(path, logParam);
    if (pdfFile == null) {
        throw new Exception("创建临时目录失败!");
    }
    // 2. Write the PDF bytes into the very file that step 3 reads back.
    try {
        FileUtils.writeByteArrayToFile(pdfFile, pdfBytes);
    } catch (IOException e) {
        log.error("将pdf转成图片出现IOException. logParam = " + logParam, e);
        // Keep the original cause so callers can see why the write failed.
        throw new Exception("将pdf转成图片出现异常!!", e);
    }
    // 3. Render the PDF to images and return the result.
    return pdf2Image(pdfFile, logParam,caseOrderNo,id);
}
--------------------------------------------------------
/**
 * Renders each page of {@code pdfFile} to a JPG by invoking the Ghostscript
 * executable configured in {@code gsPath}, one process per page, and collects
 * the image bytes.
 *
 * <p>The folder containing {@code pdfFile} is used as the working directory
 * for the generated images and is deleted before this method returns,
 * regardless of whether the conversion succeeded.
 *
 * @param pdfFile     the PDF file to convert
 * @param logParam    free-form text appended to log messages for tracing
 * @param caseOrderNo business order number (not used by the conversion itself)
 * @param id          business identifier (not used by the conversion itself)
 * @return one {@code UploadFileDto} per successfully rendered page, with
 *         {@code sort} set to the 1-based page number; {@code null} if
 *         Ghostscript exits with a non-zero code for any page. A page that
 *         fails with an exception is logged and left out of the list.
 * @throws Exception if the PDF cannot be read, has no pages, or the thread is
 *         interrupted while waiting for Ghostscript
 */
private List<UploadFileDto> pdf2Image(File pdfFile, String logParam, String caseOrderNo,String id) throws Exception {
    List<UploadFileDto> imgList = new ArrayList<>();
    String imgPath = pdfFile.getParent();
    try {
        // 1. Open the PDF. The input stream is closed explicitly so that no
        //    handle on the file is left open when the folder is deleted below.
        PdfReader pdfReader;
        try (InputStream pdfStream = new FileInputStream(pdfFile)) {
            pdfReader = new PdfReader(pdfStream);
        } catch (Exception e) {
            log.error("将pdf转成图片出现异常. logParam = " + logParam, e);
            throw new Exception("将pdf转成图片出现异常!", e);
        }

        int pageCount;
        String[] scaleArgs;
        try {
            pageCount = pdfReader.getNumberOfPages();
            if (pageCount == 0) {
                throw new Exception("将pdf转成图片出现异常!");
            }
            // The size of the first page decides the render settings for all pages.
            Rectangle firstPage = pdfReader.getPageSize(1);
            scaleArgs = selectScaleArgs(firstPage.getWidth(), firstPage.getHeight());
        } finally {
            pdfReader.close();
        }
        log.info("获取到pdf总页数为... pageCount = {}, logParam = {}", pageCount, logParam);

        // 2. Render the pages one at a time, sequentially.
        for (int page = 1; page <= pageCount; page++) {
            File tempImage = new File(imgPath + File.separator + UUID.randomUUID().toString() + ".jpg");
            try {
                String[] gsArgs = buildGsCommand(pdfFile, tempImage, page, scaleArgs[0], scaleArgs[1]);
                log.info("pdf2Image gsArgs is: {}", JSONObject.toJSONString(gsArgs));
                Process proc = new ProcessBuilder(gsArgs).redirectErrorStream(true).start();
                // Drain the merged stdout/stderr until EOF so the process cannot
                // block on a full pipe, and log whatever Ghostscript printed.
                try (InputStream gsOutput = proc.getInputStream()) {
                    List<String> output = IOUtils.readLines(gsOutput);
                    output.forEach((String line) -> log.info(line));
                }
                int exitValue = proc.waitFor();
                if (exitValue != 0) {
                    log.error("pdf转为图片转换失败 gs进程返回错误码为: {}, logParam = {}", exitValue, logParam);
                    return null;
                }
                // Load the rendered image and wrap it for the caller.
                byte[] imgByte = FileUtils.readFileToByteArray(tempImage);
                UploadFileDto fileDto = new UploadFileDto();
                fileDto.setFileType(FileTypeEnum.JPG.getType());
                fileDto.setFileStream(imgByte);
                fileDto.setSort(page);
                imgList.add(fileDto);
                // Project-specific processing of tempImage can be added here.
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of silently swallowing it.
                Thread.currentThread().interrupt();
                log.error("pdf转为图片时,出现InterruptedException...logParam = " + logParam, e);
                throw new Exception("将pdf转成图片出现异常!", e);
            } catch (Exception e) {
                // Best effort: a failing page is logged and skipped.
                log.error("pdf转为图片转换出现特殊异常...logParam = " + logParam, e);
            }
        }
        return imgList;
    } finally {
        // Remove the temporary folder and its files on every exit path.
        FileUtil.deleteFileAndFolder(imgPath);
    }
}

/**
 * Chooses the Ghostscript resolution and down-scale arguments from the page
 * dimensions reported by the PDF reader; larger pages get a lower resolution
 * and/or a larger down-scale factor.
 *
 * @param width  width of the first page
 * @param height height of the first page
 * @return a two-element array: {@code [0]} the {@code -r...} argument,
 *         {@code [1]} the {@code -dDownScaleFactor=...} argument
 */
private static String[] selectScaleArgs(float width, float height) {
    if (width < 1487 && height < 2105) {
        return new String[] {"-r300", "-dDownScaleFactor=1"};
    }
    if (width < 2451 && height < 3508) {
        return new String[] {"-r300", "-dDownScaleFactor=2"};
    }
    if (width * 0.5 <= 2479 && height * 0.5 <= 3508) {
        return new String[] {"-r150", "-dDownScaleFactor=2"};
    }
    if (width * 0.4 <= 2479 && height * 0.4 <= 3508) {
        return new String[] {"-r120", "-dDownScaleFactor=3"};
    }
    return new String[] {"-r72", "-dDownScaleFactor=3"};
}

/**
 * Builds the Ghostscript command line that renders a single page to a JPG file.
 *
 * @param pdfFile         the PDF to read
 * @param outputImage     the JPG file Ghostscript should write
 * @param page            1-based page number to render
 * @param resolution      the {@code -r...} argument
 * @param downScaleFactor the {@code -dDownScaleFactor=...} argument
 * @return the command and its arguments, with the PDF path as the last element
 */
private String[] buildGsCommand(File pdfFile, File outputImage, int page, String resolution, String downScaleFactor) {
    return new String[] {
        gsPath, "-dNOPAUSE", "-dBATCH", "-dSAFER", "-sDEVICE=jpeg", "-sDisplayHandle=0",
        resolution, downScaleFactor,
        // Restrict the run to exactly one page.
        "-dFirstPage=" + page,
        "-dLastPage=" + page,
        "-sOutputFile=" + outputImage.getAbsolutePath(),
        // The input PDF path must come last.
        pdfFile.getAbsolutePath()
    };
}
上述代码有些地方解释起来比较麻烦,这里先记录下来供大家使用。