Java集成Tesseract
目前项目中需要用到图文识别(OCR),查了很多博客,发现大多数方案都是基于 Tesseract 的。今天把它实现了出来,有以下几点需要注意:
首先,确认自己安装的 Tesseract 是正确的。
查看自己的 Tesseract 环境:
tesseract -v
通过上面的命令确认 Tesseract 是否安装成功;如果还没有安装,可以从下面的链接下载,其中还包含 chi_sim 语言包。
链接:https://pan.baidu.com/s/1PAGr3qsBL8AtqlyA0LecUg
提取码:9mp6
chi_sim 是 Tesseract 的语言包,将其放到 tessdata 目录中即可。
以上步骤完成之后,环境基本就配置好了,下面是一个测试 Demo:
package com.example.ocr.controller.OCR;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Thin wrapper around the Tesseract command-line tool: runs the executable on an image
 * file and returns the recognized text.
 *
 * <p>The executable is resolved as {@code <tessPath>/tesseract} and is always invoked with
 * the {@code chi_sim} language. Tesseract writes its result to {@code output.txt} in the
 * image's directory; because that name is fixed, concurrent calls on images that live in
 * the same directory would overwrite each other's result file.
 */
public class OCRHelper {
    private static final String LANG_OPTION = "-l";
    private static final String LANGUAGE = "chi_sim";
    private static final String EOL = System.getProperty("line.separator");
    private static final String DEFAULT_TESS_PATH = "D://ocr//Tesseract-OCR";
    /** Base name (without extension) handed to Tesseract for its result file. */
    private static final String OUTPUT_BASE_NAME = "output";

    /** Installation directory of Tesseract-OCR. */
    private final String tessPath;

    /** Creates a helper that uses the default installation directory. */
    public OCRHelper() {
        this(DEFAULT_TESS_PATH);
    }

    /**
     * Creates a helper that uses the given installation directory.
     *
     * @param tessPath directory that contains the {@code tesseract} executable
     * @throws NullPointerException if {@code tessPath} is {@code null}
     */
    public OCRHelper(String tessPath) {
        if (tessPath == null) {
            throw new NullPointerException("tessPath");
        }
        this.tessPath = tessPath;
    }

    /**
     * Runs Tesseract on the given image and returns the recognized text with all
     * whitespace removed.
     *
     * @param imageFile the image to recognize
     * @return the recognized text, stripped of every whitespace character
     * @throws NullPointerException if {@code imageFile} is {@code null}
     * @throws IOException if the process cannot be started or the result cannot be read
     * @throws InterruptedException if the calling thread is interrupted while waiting
     * @throws RuntimeException if Tesseract exits with a non-zero status
     */
    public String recognizeText(File imageFile) throws Exception {
        if (imageFile == null) {
            throw new NullPointerException("imageFile");
        }

        // Tesseract runs inside the image's directory, so relative names are enough
        // for both the input image and the result file.
        File workDir = imageFile.getParentFile();
        File resultFile = new File(workDir, OUTPUT_BASE_NAME + ".txt");

        List<String> cmd = new ArrayList<String>();
        // File handles the separator, so this works on Windows and on Unix-like systems.
        cmd.add(new File(tessPath, "tesseract").getPath());
        cmd.add(imageFile.getName());
        cmd.add(OUTPUT_BASE_NAME);
        cmd.add(LANG_OPTION);
        cmd.add(LANGUAGE);

        ProcessBuilder pb = new ProcessBuilder(cmd);
        pb.directory(workDir);
        pb.redirectErrorStream(true);

        Process process = pb.start();
        try {
            String console;
            int exitCode;
            try {
                // Drain the merged stdout/stderr BEFORE waiting: if nobody reads the pipe
                // the child can block on a full buffer and waitFor() would never return.
                // Console text is decoded with the platform default charset.
                console = readAll(process.getInputStream(), null);
                exitCode = process.waitFor();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw e;
            } finally {
                // Harmless after a normal exit; stops the child if we are bailing out early.
                process.destroy();
            }

            if (exitCode != 0) {
                throw new RuntimeException(describeFailure(exitCode, console));
            }
            // Removing all whitespace also removes the line breaks between lines.
            return readAll(new FileInputStream(resultFile), "UTF-8").replaceAll("\\s+", "");
        } finally {
            // Never leave the temporary result behind, on success or on failure.
            resultFile.delete();
        }
    }

    /** Reads the whole stream as text (platform charset when {@code charsetName} is null) and closes it. */
    private static String readAll(InputStream in, String charsetName) throws IOException {
        StringBuilder text = new StringBuilder();
        try (InputStream stream = in;
             BufferedReader reader = new BufferedReader(charsetName == null
                     ? new InputStreamReader(stream)
                     : new InputStreamReader(stream, charsetName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append(EOL);
            }
        }
        return text.toString();
    }

    /** Builds the exception message for a non-zero exit code, appending Tesseract's console output. */
    private static String describeFailure(int exitCode, String console) {
        String msg;
        switch (exitCode) {
            case 1:
                msg = "错误 Errors accessing files. There may be spaces in your image's filename.";
                break;
            case 29:
                msg = "Cannot recognize the image or its selected region.";
                break;
            case 31:
                msg = "Unsupported image format.";
                break;
            default:
                msg = "Errors occurred.";
        }
        String details = console.trim();
        return msg + " (exit code " + exitCode + ")" + (details.isEmpty() ? "" : ": " + details);
    }
}
测试类:
package com.example.ocr.controller.OCR;
import java.io.File;
import java.io.IOException;
import java.util.Scanner;
/**
 * Command-line demo: reads an image path from standard input, runs it through
 * {@link OCRHelper} and prints the recognized text.
 */
public class Test {
    /**
     * Prompts for an image path, recognizes it and prints the result to standard output.
     * Problems are reported on standard error instead of being thrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("输入你识别的图片");
        Scanner scanner = new Scanner(System.in);

        // Read whole lines rather than a single token: next() stops at the first
        // whitespace and would truncate a path that contains spaces. Blank lines are
        // skipped so that stray newlines before the path are tolerated.
        String imagePath = "";
        while (imagePath.isEmpty() && scanner.hasNextLine()) {
            imagePath = scanner.nextLine().trim();
        }
        if (imagePath.isEmpty()) {
            System.err.println("No image path was provided.");
            return;
        }

        // The image that is to be recognized.
        File imageFile = new File(imagePath);
        if (!imageFile.isFile()) {
            // Fail early with a clear message instead of an opaque Tesseract exit code.
            System.err.println("Image file not found: " + imageFile.getAbsolutePath());
            return;
        }

        try {
            String recognizeText = new OCRHelper().recognizeText(imageFile);
            System.out.print(recognizeText + "\t");
        } catch (IOException e) {
            System.err.println("Failed to run Tesseract or read its output: " + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.err.println("Recognition failed: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
代码已经简化过,只要按照上面的配置一步一步来,复制过去就能直接使用。