引入依赖:
<!-- Apache PDFBox: supplies the org.apache.pdfbox.pdmodel classes (PDDocument, PDPage) imported by the test code below. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.25</version>
</dependency>
测试代码:
import io.choerodon.core.iam.ResourceLevel;
import io.choerodon.swagger.annotation.Permission;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
/**
 * Demo controller that splits an uploaded PDF into one single-page PDF per page
 * and writes each generated file into {@link #OUTPUT_DIR}.
 *
 * @author liqin 2024/4/15 14:53
 */
@RestController("TestOcrController.v1")
@RequestMapping("/v1/{organizationId}/ocr-test")
public class TestOcrController {

    /** Directory the generated single-page files are written to. */
    private static final String OUTPUT_DIR = "C:\\Users\\Administrator\\Desktop\\fileTest";

    /** Base name used when the upload carries no usable original filename. */
    private static final String DEFAULT_BASE_NAME = "upload";

    /** Extension used for generated files when the upload name has none. */
    private static final String DEFAULT_EXTENSION = "pdf";

    /**
     * Splits the uploaded PDF page by page and stores every page as its own PDF file
     * named {@code <base>-<pageNumber>.<extension>} inside {@link #OUTPUT_DIR}.
     *
     * @param file the uploaded PDF
     * @throws IllegalStateException if the upload cannot be read or a page cannot be written;
     *                               the original {@link IOException} is kept as the cause
     */
    @Permission(level = ResourceLevel.ORGANIZATION)
    @PostMapping
    public void ocrMain(@RequestParam("file") MultipartFile file) {
        // The original filename is client-controlled: it may be null and may carry
        // directory components, so reduce it to a bare file name before using it in a path.
        String fileName = stripDirectories(file.getOriginalFilename());
        if (fileName.isEmpty()) {
            fileName = DEFAULT_BASE_NAME;
        }
        System.out.println("文件名:" + fileName);
        System.out.println("contentType:" + file.getContentType());

        int dotIndex = fileName.lastIndexOf('.');
        String baseName = dotIndex == -1 ? fileName : fileName.substring(0, dotIndex);
        String fileType = dotIndex == -1
                ? ""
                : fileName.substring(dotIndex + 1).toLowerCase(Locale.ROOT);
        System.out.println("文件类型:" + fileType);

        // Every generated file is a PDF, so fall back to "pdf" when the upload had no extension.
        String outputExtension = fileType.isEmpty() ? DEFAULT_EXTENSION : fileType;

        // PDDocument must be closed, otherwise a later attempt to delete the PDF file
        // fails because the file is reported as still in use; try-with-resources guarantees it.
        try (InputStream is = file.getInputStream();
             PDDocument doc = PDDocument.load(is)) {
            int pageCount = doc.getNumberOfPages();
            System.out.println("pageCount:" + pageCount);

            for (int i = 0; i < pageCount; i++) {
                // Split the PDF by page; each page is emitted as a PDF again =================
                byte[] pageBytes = extractPage(doc, i);

                // Wrap the byte[] in a MockMultipartFile to obtain a MultipartFile.
                String newOriginalFilename = baseName + "-" + (i + 1) + "." + outputExtension;
                MockMultipartFile multipartFile = new MockMultipartFile(
                        "file",
                        newOriginalFilename,
                        "application/pdf",
                        pageBytes);

                // Handle the multipartFile, e.g. save it to disk or upload it.
                multipartFile.transferTo(new File(OUTPUT_DIR, newOriginalFilename));

                // Alternative: render every PDF page to an image ==============================
                // (needs additional imports that this file does not declare: PDFRenderer,
                // ImageType, BufferedImage, ImageIO, Path, Paths, Files)
                // NOTE(review): JPEG has no alpha channel, so ImageType.RGB is probably
                // required instead of ARGB for ImageIO to write a JPG -- confirm before enabling.
                // ByteArrayOutputStream opStream = new ByteArrayOutputStream();
                // PDFRenderer renderer = new PDFRenderer(doc);
                // BufferedImage image = renderer.renderImageWithDPI(i, 200, ImageType.ARGB);
                // ImageIO.write(image, "JPG", opStream);
                // String newOriginalFilename = filePrefixName+"-"+(i+1)+".jpg";
                // MultipartFile multipartFile = new MockMultipartFile("file",
                // newOriginalFilename,
                // "image/jpg", opStream.toByteArray());
                // System.out.println("新文件大小:{}"+ multipartFile.getSize());
                //
                // // Handle the multipartFile, e.g. save it to disk or upload it
                // // Target file path
                // String path = "C:\\Users\\Administrator\\Desktop\\fileTest"+ File.separator +newOriginalFilename;
                // System.out.println("地址:{}"+ path);
                // Path filePath = Paths.get(path);
                // // Write the MultipartFile content to the target file
                // Files.copy(multipartFile.getInputStream(), filePath);
            }
        } catch (IOException e) {
            // Surface the failure to the caller instead of printing it and reporting success.
            throw new IllegalStateException("Failed to split PDF '" + fileName + "'", e);
        }
    }

    /**
     * Serializes a single page of {@code source} as a stand-alone PDF.
     *
     * @param source    the opened source document; it must stay open while this method runs
     * @param pageIndex zero-based index of the page to extract
     * @return the bytes of a PDF containing only that page
     * @throws IOException if the single-page document cannot be saved or closed
     */
    private static byte[] extractPage(PDDocument source, int pageIndex) throws IOException {
        try (PDDocument singlePage = new PDDocument();
             ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
            PDPage page = source.getPage(pageIndex);
            singlePage.addPage(page);
            singlePage.save(buffer);
            return buffer.toByteArray();
        }
    }

    /**
     * Removes any directory part ({@code /} or {@code \} separated) from a client-supplied file name.
     *
     * @param originalFilename the name reported by the upload, possibly {@code null}
     * @return the trailing file-name component, or an empty string when the input is {@code null}
     */
    private static String stripDirectories(String originalFilename) {
        if (originalFilename == null) {
            return "";
        }
        int lastSeparator = Math.max(
                originalFilename.lastIndexOf('/'),
                originalFilename.lastIndexOf('\\'));
        return originalFilename.substring(lastSeparator + 1);
    }
}