Java 文字识别程序的关键是找到一个可以调用的 OCR 引擎。Tesseract-OCR 就是这样一个 OCR 引擎:它于 1985 年至 1995 年间由 HP 实验室开发,现在由 Google 维护。Tesseract-OCR 3.0 发布后开始支持中文。不过 Tesseract-OCR 3.0 本身不提供图形化界面的客户端,第三方编写的 FreeOCR 图形化客户端目前还不支持导入新的 3.0 traineddata。但这标志着,现在已经有自由的中文 OCR 软件了。
在 Java 中使用 Tesseract-OCR 3.01 的步骤如下:
1. 下载并安装 tesseract-ocr-setup-3.01-1.exe(3.0 及以上版本才支持中文识别)。
2. 在安装向导中选择需要下载的语言包。
3. 到网上搜索并下载 Java 图像处理所需的 2 个 jar 包:jai_imageio-1.1-alpha.jar、swingx-1.6.1.jar。
4.java程序清单:
imageiohelper 类:
1 import java.awt.image.bufferedimage;
2 import java.io.file;
3 import java.io.ioexception;
4 import java.util.iterator;
5 import java.util.locale;
6
7 import javax.imageio.iioimage;
8 import javax.imageio.imageio;
9 import javax.imageio.imagereader;
10 import javax.imageio.imagewriteparam;
11 import javax.imageio.imagewriter;
12 import javax.imageio.metadata.iiometadata;
13 import javax.imageio.stream.imageinputstream;
14 import javax.imageio.stream.imageoutputstream;
15
16 import com.sun.media.imageio.plugins.tiff.tiffimagewriteparam;
17
18 public class imageiohelper {
19
20 public static file createimage(file imagefile, string imageformat) {
21 file tempfile = null;
22 try {
23 iterator readers = imageio.getimagereadersbyformatname(imageformat);
24 imagereader reader = readers.next();
25
26 imageinputstream iis = imageio.createimageinputstream(imagefile);
27 reader.setinput(iis);
28 //read the stream metadata
29 iiometadata streammetadata = reader.getstreammetadata();
30
31 //set up the writeparam
32 tiffimagewriteparam tiffwriteparam = new tiffimagewriteparam(locale.chinese);
33 tiffwriteparam.setcompressionmode(imagewriteparam.mode_disabled);
34
35 //get tif writer and set output to file
36 iterator writers = imageio.getimagewritersbyformatname("tiff");
37 imagewriter writer = writers.next();
38
39 bufferedimage bi = reader.read(0);
40 iioimage image = new iioimage(bi,null,reader.getimagemetadata(0));
41 tempfile = tempimagefile(imagefile);
42 imageoutputstream ios = imageio.createimageoutputstream(tempfile);
43 writer.setoutput(ios);
44 writer.write(streammetadata, image, tiffwriteparam);
45 ios.close();
46
47 writer.dispose();
48 reader.dispose();
49
50 } catch (ioexception e) {
51 e.printstacktrace();
52 }
53 return tempfile;
54 }
55
56 private static file tempimagefile(file imagefile) {
57 string path = imagefile.getpath();
58 stringbuffer strb = new stringbuffer(path);
59 strb.insert(path.lastindexof('.'),0);
60 return new file(strb.tostring().replacefirst("(?<=//.)(//w+)$", "tif"));
61 }
62
63 }
ocr 类:
1 package com.hhp.util;
2
3 import java.io.bufferedreader;
4 import java.io.file;
5 import java.io.fileinputstream;
6 import java.io.inputstreamreader;
7 import java.util.arraylist;
8 import java.util.list;
9 import org.jdesktop.swingx.util.os;
10
11 public class ocr {
12 private final string lang_option = "-l"; //英文字母小写l,并非数字1
13 private final string eol = system.getproperty("line.separator");
14 private string tesspath = "c://program files (x86)//tesseract-ocr";
15 //private string tesspath = new file("tesseract").getabsolutepath();
16
17 public string recognizetext(file imagefile,string imageformat)throws exception{
18 file tempimage = imageiohelper.createimage(imagefile,imageformat);
19 file outputfile = new file(imagefile.getparentfile(),"output");
20 stringbuffer strb = new stringbuffer();
21 list cmd = new arraylist();
22 if(os.iswindowsxp()){
23 cmd.add(tesspath+"//tesseract");
24 }else if(os.islinux()){
25 cmd.add("tesseract");
26 }else{
27 cmd.add(tesspath+"//tesseract");
28 }
29 cmd.add("");
30 cmd.add(outputfile.getname());
31 cmd.add(lang_option);
32 cmd.add("chi_sim");
33 //cmd.add("eng");
34
35 processbuilder pb = new processbuilder();
36 pb.directory(imagefile.getparentfile());
37
38 cmd.set(1, tempimage.getname());
39 pb.command(cmd);
40 pb.redirecterrorstream(true);
41
42 process process = pb.start();
43 //tesseract.exe 1.jpg 1 -l chi_sim
44 int w = process.waitfor();
45
46 //删除临时正在工作文件
47 tempimage.delete();
48
49 if(w==0){
50 bufferedreader in = new bufferedreader(new inputstreamreader(new fileinputstream(outputfile.getabsolutepath()+".txt"),"utf-8"));
51
52 string str;
53 while((str = in.readline())!=null){
54 strb.append(str).append(eol);
55 }
56 in.close();
57 }else{
58 string msg;
59 switch(w){
60 case 1:
61 msg = "errors accessing files.there may be spaces in your image's filename.";
62 break;
63 case 29:
64 msg = "cannot recongnize the image or its selected region.";
65 break;
66 case 31:
67 msg = "unsupported image format.";
68 break;
69 default:
70 msg = "errors occurred.";
71 }
72 tempimage.delete();
73 throw new runtimeexception(msg);
74 }
75 new file(outputfile.getabsolutepath()+".txt").delete();
76 return strb.tostring();
77 }
78 }
测试类 ocrtest:
1 import java.io.file;
2 import java.io.ioexception;
3
4 import com.hhp.util.ocr;
5
6 public class ocrtest {
7
8 public static void main(string[] args) {
9 string path = "c://temp//ocrcode//4.png";
10 system.out.println("orc test begin......");
11 try {
12 string valcode = new ocr().recognizetext(new file(path), "png");
13 system.out.println(valcode);
14 } catch (ioexception e) {
15 e.printstacktrace();
16 } catch (exception e) {
17 e.printstacktrace();
18 }
19 system.out.println("orc test end......");
20 }
21
22 }
经过测试,Tesseract-OCR 3.01 的文字识别率很高,对于网站中常见的验证码,识别率也很高。
微信扫码,欢迎关注微信公众账号,更多精彩~
手机扫码加入qq群,欢迎你~
希望与广大网友互动?
点此进行留言吧!