package com.org.algorithm.OO;

import org.apache.commons.lang.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
 * Runs the external {@code tesseract} command on a fixed PNG file and returns
 * the recognised text with all spaces removed.
 */
public class test {
    /** Executable (plus optional leading arguments) used to start the OCR engine. */
    static String ocrCommand = "tesseract";

    /** Whitespace-separated language arguments appended to the command line. */
    static String ocrLangData = "-l chi_sim";

    /**
     * Runs OCR on {@code /home/if/ocr_test/p.png} and reads the text file the
     * command writes next to the image (same path, {@code .txt} extension).
     *
     * <p>Lines of the output file are concatenated without separators, and every
     * space character is stripped from the result.
     *
     * @return the recognised text, or an empty string when the command fails,
     *         exits with a non-zero status, or its output cannot be read
     */
    public static String getCaptureText() {
        String result = "";
        String imgPath = "/home/if/ocr_test/p.png";
        try {
            String outPath = imgPath.substring(0, imgPath.lastIndexOf("."));

            // Build an argument list instead of one command string so the two
            // paths are always passed as single arguments, even if they contain
            // spaces. The configurable fields are still split on whitespace,
            // exactly as Runtime.exec(String) used to do.
            List<String> command = new ArrayList<String>();
            addTokens(command, ocrCommand);
            command.add(imgPath);
            command.add(outPath);
            addTokens(command, ocrLangData);
            System.out.println(ocrCommand + " " + imgPath + " " + outPath + " " + ocrLangData);

            ProcessBuilder builder = new ProcessBuilder(command);
            // Merge stderr into stdout so a single drain loop is enough.
            builder.redirectErrorStream(true);
            Process ps = builder.start();
            // Consume the child's output; otherwise a full pipe buffer can make
            // the child block and waitFor() never return.
            drain(ps.getInputStream());

            int exitCode = ps.waitFor();
            if (exitCode != 0) {
                // Do not read the .txt file: it may be missing or left over
                // from an earlier run.
                System.err.println(ocrCommand + " exited with code " + exitCode);
                return result;
            }

            // Read the output file. It is decoded explicitly as UTF-8 (the
            // encoding Tesseract writes) rather than the platform default,
            // which would corrupt Chinese text on non-UTF-8 systems.
            File file = new File(outPath + ".txt");
            StringBuilder sb = new StringBuilder();
            BufferedReader bufReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
            try {
                String line;
                while ((line = bufReader.readLine()) != null) {
                    sb.append(line);
                }
            } finally {
                bufReader.close();
            }

            // Text result.
            result = sb.toString();
            if (StringUtils.isNotBlank(result)) {
                result = result.replace(" ", "");
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException e) {
            // Captcha recognition failed; fall through and return "".
            e.printStackTrace();
        } catch (RuntimeException e) {
            // Kept so the method never throws, matching its previous contract.
            e.printStackTrace();
        }
        return result;
    }

    /** Appends every whitespace-separated token of {@code text} to {@code target}. */
    private static void addTokens(List<String> target, String text) {
        StringTokenizer tokens = new StringTokenizer(text);
        while (tokens.hasMoreTokens()) {
            target.add(tokens.nextToken());
        }
    }

    /** Reads {@code in} to end-of-stream, discarding the bytes, then closes it. */
    private static void drain(InputStream in) throws IOException {
        try {
            byte[] buffer = new byte[4096];
            while (in.read(buffer) != -1) {
                // discard
            }
        } finally {
            in.close();
        }
    }

    public static void main(String[] args) {
        System.out.println(getCaptureText());
    }
}
总结:最近想去参加一个比赛,题目是把图片中的文字提取出来。主办方给的数据都是 png 图片,并且带有水印。
1. 我在过程中发现,英文识别远远比中文快,因为英文的字符基数只有 26(a-z),而中文的字符基数要大得多。我参加的比赛只要求提取图片前后部分的文字信息,所以不用做全文本分析。另外在过程中还发现,单句中文文本的识别效果高于全文本识别。
2. 去除其他干扰:我的 png 图片背景有色点,还有水印。我后期的想法是先通过工具直接对图片做规范化处理,这样可以提高识别率。
// Same OCR task performed through the ITesseract API instead of an external command.
String path = "/home/if/ocr_test/p.png";
String trainPath = "/home/if/桌面/tessdata";
File imageFile = new File(path);

ITesseract instance = new Tesseract();
instance.setDatapath(trainPath);
// The default language is English (letters and digits); to recognise Chinese
// (digits + Chinese characters) a language pack must be specified.
instance.setLanguage("chi_sim");

try {
    System.out.println(instance.doOCR(imageFile));
} catch (TesseractException e) {
    System.out.println(e.getMessage());
}
参考
http://blog.csdn.net/wangyongxia921/article/details/52813453