1. 最好用的是 e-iceblue 公司的 Spire.PDF for Java,免费版就够用了
可以自动提取PDF中的文字和图片
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf.free</artifactId>
<version>2.2.2</version>
</dependency>
2.org.apache.pdfbox
将PDF的每一页转成一张图片
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.6</version>
</dependency>
3.net.sourceforge.tess4j
最基本的OCR识别工具,可以体验驯宠的乐趣
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>3.4.8</version>
</dependency>
/**
 * Stores each uploaded file under the "图片识别" upload directory and runs image
 * recognition on it.
 *
 * <p>For a PDF (extension compared case-insensitively) the text of every page is
 * extracted with Spire.PDF and printed to stdout, and every embedded image is written
 * out as {@code <name>_<n>.png} and sent to {@code postHttp}. Any other file is sent to
 * {@code postHttp} directly.
 *
 * <p>Result map keys: {@code "message"} is {@code "success"} or {@code "error"} for the
 * most recently handled file; {@code "foundInfo"} holds the value returned by the most
 * recent {@code postHttp} call (earlier results are overwritten, as before).
 *
 * @param files   uploaded parts named "file"; may be {@code null} because the parameter is optional
 * @param request the current request (not used by this method)
 * @return the result map described above; empty when nothing was uploaded
 * @throws IllegalStateException propagated from {@code MultipartFile.transferTo}
 * @throws IOException           propagated from helpers called outside the internal try/catch blocks
 */
@ResponseBody
@RequestMapping(value = "imageUpload", method = RequestMethod.POST)
public Map<String, Object> springUpload(@RequestParam(value="file", required=false)List<MultipartFile> files,
HttpServletRequest request) throws IllegalStateException, IOException//, TesseractException
{
    Map<String, Object> msgmap = new HashMap<String, Object>();
    // The "file" part is declared required=false, so the list itself can be null.
    if (files == null) {
        return msgmap;
    }
    // Target directory (with trailing separator) for the upload and any extracted images.
    String uploadDir = AddressUtil.getUpload_address() + "\\" + "图片识别" + "\\";
    for (MultipartFile file : files) {
        if (file == null) {
            continue;
        }
        String originalName = file.getOriginalFilename();
        if (originalName == null) {
            msgmap.put("message", "error");
            continue;
        }
        // The original filename is client-controlled and may carry directory parts
        // (e.g. "..\\..\\x.jsp"); keep only the last path segment so the file cannot
        // escape the upload directory.
        int lastSeparator = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
        String fileName = originalName.substring(lastSeparator + 1);
        if (fileName.isEmpty() || ".".equals(fileName) || "..".equals(fileName)) {
            msgmap.put("message", "error");
            continue;
        }
        String filePath = uploadDir + fileName;
        // Presumably removes a previous upload with the same name and prepares the
        // target path; both helpers are defined outside this block.
        CommonUtils.delFile(uploadDir, fileName);
        CommonUtils.fileMake(filePath);
        // Persist the upload.
        try {
            file.transferTo(new File(filePath));
            msgmap.put("message", "success");
        } catch (IOException e) {
            msgmap.put("message", "error");
            e.printStackTrace();
            // Nothing was stored, so there is nothing to recognise for this file.
            continue;
        }
        // Extension of the uploaded file (the whole name when it contains no dot).
        String suffix = fileName.substring(fileName.lastIndexOf(".") + 1);
        if ("PDF".equalsIgnoreCase(suffix)) {
            // Alternative considered by the original author: render each page to a PNG
            // with PDFBox (PDFRenderer at 144 DPI) and OCR the page images instead.
            try {
                PdfDocument doc = new PdfDocument();
                try {
                    doc.loadFromFile(filePath);
                    // Accumulates the text of all pages.
                    StringBuilder sb = new StringBuilder();
                    // Document-wide counter so images from different pages never share
                    // an output file name.
                    int imageIndex = 1;
                    for (int i = 0; i < doc.getPages().getCount(); i++) {
                        PdfPageBase page = doc.getPages().get(i);
                        sb.append(page.extractText(true));
                        // Extract once per page; the result is reused for the null check
                        // and the iteration.
                        BufferedImage[] images = page.extractImages();
                        if (images == null) {
                            continue;
                        }
                        for (BufferedImage image : images) {
                            if (image != null) {
                                String imageName = fileName + "_" + imageIndex + ".png";
                                ImageIO.write(image, "PNG", new File(uploadDir, imageName));
                                // Recognise exactly the file that was just written.
                                msgmap.put("foundInfo", postHttp(uploadDir + imageName));
                            }
                            imageIndex++;
                        }
                    }
                    // NOTE(review): extracted text is only printed, not returned to the caller.
                    System.out.println(sb);
                } finally {
                    // Release the document even when extraction fails part-way.
                    doc.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else {
            // Alternative considered by the original author: local OCR via tess4j
            // (language "chi_sim"); recognition quality was low without trained data.
            msgmap.put("foundInfo", postHttp(filePath));
        }
    }
    return msgmap;
}