java+7+up+71_识别率很高的java文字识别技术

Java文字识别程序的关键是找到一个可以调用的OCR引擎。Tesseract-OCR就是这样一个OCR引擎:它于1985年至1995年间由HP实验室开发,现在由Google维护。Tesseract-OCR 3.0发布后开始支持中文。不过Tesseract-OCR 3.0本身不是图形化界面的客户端,而别人编写的FreeOCR图形化客户端还不支持导入新的3.0 traineddata。但这标志着,现在有自由的中文OCR软件了。

java中使用tesseract-ocr3.01的步骤如下:

1.下载安装tesseract-ocr-setup-3.01-1.exe(3.0以上版本才增加了中文识别)

2.在安装向导中可以选择需要下载的语言包。

3.到网上搜索下载java图形处理所需的2个包:jai_imageio-1.1-alpha.jar,swingx-1.6.1.jar

4.java程序清单:

imageiohelper 类:

1 import java.awt.image.bufferedimage;

2 import java.io.file;

3 import java.io.ioexception;

4 import java.util.iterator;

5 import java.util.locale;

6

7 import javax.imageio.iioimage;

8 import javax.imageio.imageio;

9 import javax.imageio.imagereader;

10 import javax.imageio.imagewriteparam;

11 import javax.imageio.imagewriter;

12 import javax.imageio.metadata.iiometadata;

13 import javax.imageio.stream.imageinputstream;

14 import javax.imageio.stream.imageoutputstream;

15

16 import com.sun.media.imageio.plugins.tiff.tiffimagewriteparam;

17

/**
 * Helper that re-encodes an image file as an uncompressed TIFF file placed
 * next to the original.
 */
public class ImageIOHelper {

    /**
     * Reads the first image of {@code imageFile} with an ImageIO reader for
     * {@code imageFormat} and writes it, uncompressed, to a sibling TIFF file
     * (see {@link #tempImageFile(File)} for the naming rule).
     *
     * @param imageFile   the image to convert
     * @param imageFormat ImageIO format name of {@code imageFile}, e.g. {@code "png"}
     * @return the TIFF file that was written, or {@code null} if an
     *         {@link IOException} occurred (its stack trace is printed)
     * @throws java.util.NoSuchElementException if no ImageIO reader is registered
     *         for {@code imageFormat}, or no writer is registered for {@code "tiff"}
     */
    public static File createImage(File imageFile, String imageFormat) {
        Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
        if (!readers.hasNext()) {
            throw new java.util.NoSuchElementException(
                    "no ImageIO reader for format: " + imageFormat);
        }
        ImageReader reader = readers.next();

        Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
        if (!writers.hasNext()) {
            reader.dispose();
            throw new java.util.NoSuchElementException("no ImageIO writer for format: tiff");
        }
        ImageWriter writer = writers.next();

        File tempFile = tempImageFile(imageFile);
        // Only remove the target on failure if this call actually started writing it.
        boolean outputOpened = false;
        try (ImageInputStream iis = ImageIO.createImageInputStream(imageFile)) {
            // ImageIO signals "could not open" by returning null rather than throwing.
            if (iis == null) {
                throw new IOException("cannot open image for reading: " + imageFile);
            }
            reader.setInput(iis);

            // Read the stream metadata and the first image together with its metadata.
            IIOMetadata streamMetadata = reader.getStreamMetadata();
            BufferedImage bi = reader.read(0);
            IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

            // Ask the writer for its own parameter object so that any installed
            // TIFF writer plugin can be used, then switch compression off.
            ImageWriteParam tiffWriteParam = writer.getDefaultWriteParam();
            tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

            try (ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile)) {
                if (ios == null) {
                    throw new IOException("cannot open file for writing: " + tempFile);
                }
                outputOpened = true;
                writer.setOutput(ios);
                writer.write(streamMetadata, image, tiffWriteParam);
            }
            return tempFile;
        } catch (IOException e) {
            e.printStackTrace();
            if (outputOpened) {
                // Do not leave a half-written TIFF behind.
                tempFile.delete();
            }
            return null;
        } finally {
            writer.dispose();
            reader.dispose();
        }
    }

    /**
     * Derives the TIFF file name for {@code imageFile}: same directory, the
     * file name without its extension, followed by {@code "0.tif"}
     * (for example {@code 4.png} becomes {@code 40.tif}). A name without any
     * extension simply gets {@code "0.tif"} appended.
     */
    private static File tempImageFile(File imageFile) {
        String name = imageFile.getName();
        int dot = name.lastIndexOf('.');
        String stem = dot < 0 ? name : name.substring(0, dot);
        return new File(imageFile.getParentFile(), stem + "0.tif");
    }

}

ocr 类:

1 package com.hhp.util;

2

3 import java.io.bufferedreader;

4 import java.io.file;

5 import java.io.fileinputstream;

6 import java.io.inputstreamreader;

7 import java.util.arraylist;

8 import java.util.list;

9 import org.jdesktop.swingx.util.os;

10

11 public class ocr {

12 private final string lang_option = "-l"; //英文字母小写l,并非数字1

13 private final string eol = system.getproperty("line.separator");

14 private string tesspath = "c://program files (x86)//tesseract-ocr";

15 //private string tesspath = new file("tesseract").getabsolutepath();

16

17 public string recognizetext(file imagefile,string imageformat)throws exception{

18 file tempimage = imageiohelper.createimage(imagefile,imageformat);

19 file outputfile = new file(imagefile.getparentfile(),"output");

20 stringbuffer strb = new stringbuffer();

21 list cmd = new arraylist();

22 if(os.iswindowsxp()){

23 cmd.add(tesspath+"//tesseract");

24 }else if(os.islinux()){

25 cmd.add("tesseract");

26 }else{

27 cmd.add(tesspath+"//tesseract");

28 }

29 cmd.add("");

30 cmd.add(outputfile.getname());

31 cmd.add(lang_option);

32 cmd.add("chi_sim");

33 //cmd.add("eng");

34

35 processbuilder pb = new processbuilder();

36 pb.directory(imagefile.getparentfile());

37

38 cmd.set(1, tempimage.getname());

39 pb.command(cmd);

40 pb.redirecterrorstream(true);

41

42 process process = pb.start();

43 //tesseract.exe 1.jpg 1 -l chi_sim

44 int w = process.waitfor();

45

46 //删除临时正在工作文件

47 tempimage.delete();

48

49 if(w==0){

50 bufferedreader in = new bufferedreader(new inputstreamreader(new fileinputstream(outputfile.getabsolutepath()+".txt"),"utf-8"));

51

52 string str;

53 while((str = in.readline())!=null){

54 strb.append(str).append(eol);

55 }

56 in.close();

57 }else{

58 string msg;

59 switch(w){

60 case 1:

61 msg = "errors accessing files.there may be spaces in your image's filename.";

62 break;

63 case 29:

64 msg = "cannot recongnize the image or its selected region.";

65 break;

66 case 31:

67 msg = "unsupported image format.";

68 break;

69 default:

70 msg = "errors occurred.";

71 }

72 tempimage.delete();

73 throw new runtimeexception(msg);

74 }

75 new file(outputfile.getabsolutepath()+".txt").delete();

76 return strb.tostring();

77 }

78 }

测试类 OCRTest:

1 import java.io.file;

2 import java.io.ioexception;

3

4 import com.hhp.util.ocr;

5

6 public class ocrtest {

7

8 public static void main(string[] args) {

9 string path = "c://temp//ocrcode//4.png";

10 system.out.println("orc test begin......");

11 try {

12 string valcode = new ocr().recognizetext(new file(path), "png");

13 system.out.println(valcode);

14 } catch (ioexception e) {

15 e.printstacktrace();

16 } catch (exception e) {

17 e.printstacktrace();

18 }

19 system.out.println("orc test end......");

20 }

21

22 }

经过测试,tesseract-ocr 3.01的文字识别率很高,对于网站中常见的验证码识别率也很高。

微信扫码,欢迎关注微信公众账号,更多精彩~

手机扫码加入qq群,欢迎你~

希望与广大网友互动??

点此进行留言吧!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值