1.下载安装tesseract-ocr-setup-4.00.00dev.exe
http://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-setup-4.00.00dev.exe
2.下载简体中文语言包chi_sim.traineddata,并将其放入Tesseract安装目录下的tessdata文件夹
https://github.com/tesseract-ocr/tessdata/find/master
3.图片锐化与二值化处理
package com.daorigin.AI.ocr;
import java.awt.image.BufferedImage;
import java.awt.image.ConvolveOp;
import java.awt.image.Kernel;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import javax.imageio.ImageIO;
/**
 * Image pre-processing helpers (sharpening and binarization) intended to be
 * applied to a scanned picture before it is handed to an OCR engine.
 */
public class ImgSharper {

    /**
     * Demo entry point: binarizes a hard-coded sample image on drive E:.
     * The input name suggests it is the output of a previous
     * {@link #getSharperPicture} run on the original scan (TODO confirm).
     */
    public static void main(String[] args) {
        try {
            binaryImage("E://民事起诉状2.JPG", "E://民事起诉状3.JPG", "jpg");
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both.
            e.printStackTrace();
        }
    }

    /**
     * Sharpens an image with a 3x3 convolution kernel and writes the result.
     *
     * @param in   path of the image to read
     * @param out  path of the image file to write
     * @param type informal format name passed to ImageIO (e.g. "jpg", "png")
     * @throws FileNotFoundException if {@code in} cannot be opened
     * @throws IOException if the input cannot be decoded, or the output
     *                     cannot be encoded or written
     */
    public static void getSharperPicture(String in, String out, String type)
            throws FileNotFoundException, IOException {
        BufferedImage originalPic = readImage(in);
        BufferedImage newPic = new BufferedImage(
                originalPic.getWidth(), originalPic.getHeight(), BufferedImage.TYPE_3BYTE_BGR);

        // Centre weight 9, neighbours -1: the weights sum to 1, so flat areas
        // keep their brightness while local contrast (edges) is amplified.
        float[] data = {
            -1.0f, -1.0f, -1.0f,
            -1.0f,  9.0f, -1.0f,
            -1.0f, -1.0f, -1.0f
        };
        Kernel kernel = new Kernel(3, 3, data);
        // EDGE_NO_OP: border pixels are copied to the destination unmodified.
        ConvolveOp co = new ConvolveOp(kernel, ConvolveOp.EDGE_NO_OP, null);
        co.filter(originalPic, newPic);

        writeImage(newPic, type, out);
    }

    /**
     * Converts an image to pure black and white and writes the result.
     *
     * @param in   path of the image to read
     * @param out  path of the image file to write
     * @param type informal format name passed to ImageIO (e.g. "jpg", "png")
     * @throws IOException if the input cannot be opened or decoded, or the
     *                     output cannot be encoded or written
     */
    public static void binaryImage(String in, String out, String type) throws IOException {
        BufferedImage image = readImage(in);
        int width = image.getWidth();
        int height = image.getHeight();

        // The whole trick is the target type: copying pixels into a
        // TYPE_BYTE_BINARY image lets setRGB pick black or white for each one.
        BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY);
        for (int x = 0; x < width; x++) {
            for (int y = 0; y < height; y++) {
                grayImage.setRGB(x, y, image.getRGB(x, y));
            }
        }

        writeImage(grayImage, type, out);
    }

    /**
     * Reads an image from a file, always closing the underlying stream.
     * ImageIO.read returns null when no reader understands the data; that is
     * reported as an IOException instead of a later NullPointerException.
     */
    private static BufferedImage readImage(String in) throws FileNotFoundException, IOException {
        FileInputStream stream = new FileInputStream(in);
        try {
            BufferedImage image = ImageIO.read(stream);
            if (image == null) {
                throw new IOException("No registered image reader can decode: " + in);
            }
            return image;
        } finally {
            // ImageIO.read(InputStream) does not close the stream itself.
            stream.close();
        }
    }

    /**
     * Writes an image to a file, failing loudly when ImageIO has no writer
     * for the requested format instead of silently producing nothing.
     */
    private static void writeImage(BufferedImage image, String type, String out) throws IOException {
        boolean written = ImageIO.write(image, type, new File(out));
        if (!written) {
            throw new IOException(
                    "No image writer for format '" + type + "' accepts this image: " + out);
        }
    }
}
4.程序调用
package com.daorigin.AI.ocr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.jdesktop.swingx.util.OS;
public class OCRHelper {
private final String LANG_OPTION = "-l";
private final String EOL = System.getProperty("line.separator");
/**
* 文件位置我防止在,项目同一路径
*/
private String tessPath = new File("C:\\Program Files (x86)\\Tesseract-OCR").getAbsolutePath();
/**
* @param imageFile
* 传入的图像文件
* @param imageFormat
* 传入的图像格式
* @return 识别后的字符串
*/
public String recognizeText(File imageFile) throws Exception {
/**
* 设置输出文件的保存的文件目录
*/
File outputFile = new File(imageFile.getParentFile(), "output");
StringBuffer strB = new StringBuffer();
List<String> cmd = new ArrayList<String>();
if (OS.isWindowsXP()) {
cmd.add(tessPath + "\\tesseract");
} else if (OS.isLinux()) {
cmd.add("tesseract");
} else {
cmd.add(tessPath + "\\tesseract");
}
cmd.add("");
cmd.add(outputFile.getName());
cmd.add(LANG_OPTION);
cmd.add("chi_sim");
// cmd.add("eng");
ProcessBuilder pb = new ProcessBuilder();
/**
* Sets this process builder's working directory.
*/
pb.directory(imageFile.getParentFile());
cmd.set(1, imageFile.getName());
pb.command(cmd);
pb.redirectErrorStream(true);
Process process = pb.start();
// tesseract.exe 1.jpg 1 -l chi_sim
// Runtime.getRuntime().exec("tesseract.exe 1.jpg 1 -l chi_sim");
/**
* the exit value of the process. By convention, 0 indicates normal
* termination.
*/
// System.out.println(cmd.toString());
int w = process.waitFor();
if (w == 0)// 0代表正常退出
{
BufferedReader in = new BufferedReader(
new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath() + ".txt"), "UTF-8"));
String str;
while ((str = in.readLine()) != null) {
strB.append(str).append(EOL);
}
in.close();
} else {
String msg;
switch (w) {
case 1:
msg = "Errors accessing files. There may be spaces in your image's filename.";
break;
case 29:
msg = "Cannot recognize the image or its selected region.";
break;
case 31:
msg = "Unsupported image format.";
break;
default:
msg = "Errors occurred.";
}
throw new RuntimeException(msg);
}
new File(outputFile.getAbsolutePath() + ".txt").delete();
return strB.toString().replaceAll("\\s*", "");
}
}
package com.daorigin.AI.ocr;
import java.io.File;
/**
 * End-to-end demo: binarizes a sample scan, runs OCR on the binarized copy
 * and prints the recognized text.
 */
public class Test {
    /**
     * Processes a hard-coded image on drive E:. Any failure is reported by
     * printing its stack trace.
     */
    public static void main(String[] args) {
        try {
            // Single definition of the intermediate file shared by both steps.
            String binarizedPath = "E:/民事起诉状第二页0.JPG";
            // binaryImage is static, so it is called on the class; no instance is needed.
            ImgSharper.binaryImage("E:/民事起诉状第二页.JPG", binarizedPath, "jpg");
            String recognizeText = new OCRHelper().recognizeText(new File(binarizedPath));
            System.out.println(recognizeText + "\t");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
经过锐化和二值化处理的图片,识别准确率基本可以达到95%以上。