java程序集成Tesseract-OCR识别图片信息

最新推荐文章于 2024-08-24 14:11:28 发布

喳喳叽叽

最新推荐文章于 2024-08-24 14:11:28 发布

阅读量3.4k

点赞数 1

分类专栏： Java 文章标签：图片识别 tesseract OCR java编程 pdfbox

本文链接：https://blog.csdn.net/sinat_19965097/article/details/78144535

版权

Java 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

java程序集成Tesseract-OCR识别图片信息

最近老大说要实现一个功能：识别图片中的信息，然后与数据库中的相关信息进行匹配。他跟我提了一下OCR技术，叫我去看看。没搞过啊，只能赶紧上网搜索资料，资料还挺多。OCR（Optical Character Recognition，光学字符识别）比较常用的是谷歌的一个OCR引擎Tesseract-OCR。从网上下载并解压后，先用cmd命令测试：定位到安装文件夹下，执行命令 tesseract 001.jpg 123 -l chi_sim（识别001.jpg，结果输出到123.txt，语言为简体中文），效果还不错。tesseract-ocr 3.0可以支持中文，于是就想用程序来调用这个命令。

cmd测试效果图

执行ocr命令：
cmd执行命令

需要解析的图片：
ocr的图片

解析识别出来的文字：
这里写图片描述

只要保证图片的分辨率，识别结果还是挺准确的。

然后是通过java程序来执行命令，这个也是比较简单的，但是也是遇到了两个问题：

问题1
tesseract-ocr不支持pdf格式的解析，对tif格式图片的解析支持也不是很好。我要实现的是对pdf进行解析，这就难免要先把pdf转换成图片。pdf分为文档形式和图片形式（扫描版），而且有很多版本，本以为是很简单的事情，但是尝试了很多方法和各种扩展jar包，都不能全面兼容，最后终于找到了pdfbox-app-2.0.7.jar+jbig2-imageio-1.6.5.jar，才把问题解决掉。
问题2
tesseract-ocr解析效率特别慢，解析一张a4纸大小的图片需要花费二三十秒，简直无法忍受。但是tesseract-ocr这东西毕竟太高深没法碰，只能从图片入手：之前测试的只有一行字的小图片，解析只需要一秒钟，所以就按这种形式，将自己需要的图片信息截取下来，然后通过多线程调用tesseract-ocr进行解析，效果还不错，下面附源代码：

OcrEngine.java

package com.jj.test2;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.Vector;

import javax.imageio.ImageIO;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
 * Entry point of the OCR demo: renders the first page of a PDF to a JPEG in the temp
 * directory and lets worker threads run one shared {@code OcrOperate} over that image.
 *
 * <p>The temp directory and the Tesseract install directory are hard-coded Windows paths
 * that {@link #parseFile(File)} assigns on every call.
 */
public class OcrEngine{

    /** File extension accepted by {@link #parseFile(File)} (compared case-insensitively). */
    private static final String TYPE_PDF = "pdf";
    /** ImageIO format name used when writing the rendered page. */
    private static final String TYPE_JPG = "jpg";
    /** Number of worker threads started for one parse. */
    private static final int WORKER_COUNT = 4;

    private static String tesspath ;  // Tesseract install directory
    private static String tempath ;   // directory for intermediate files

    /**
     * Demo driver: parses a fixed sample PDF and prints every recognised key/value pair.
     *
     * @param args ignored
     */
    public static void main(String args[]){
        try {
            Map<String,String> result = parseFile(new File("d:\\testdata\\001.pdf"));
            for(Map.Entry<String,String> entry : result.entrySet()){
                System.out.print("Key = "+entry.getKey()+",value="+entry.getValue());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts the given PDF to a JPEG, runs the OCR workers on it and returns what they
     * collected. Elapsed-time checkpoints are printed to standard output.
     *
     * @param file the PDF to parse; only the {@code pdf} extension is accepted
     * @return the map filled by the shared {@code OcrOperate}
     * @throws Exception declared for compatibility; every failure raised inside this method
     *         is rethrown as a {@link RuntimeException} whose cause is the original error
     */
    public static Map<String,String> parseFile(File file) throws Exception{
        long start = System.currentTimeMillis();
        tempath = "d:\\temp\\";
        tesspath = "d:\\application\\Tesseract-OCR\\";

        try{
            // mkdirs() also creates missing parents; re-check isDirectory() in case another
            // process created the directory between the test and the call.
            File tempDir = new File(tempath);
            if(!tempDir.isDirectory() && !tempDir.mkdirs() && !tempDir.isDirectory()){
                throw new RuntimeException("OCR异常-无法创建临时目录");
            }
            if(!new File(tesspath+File.separator+"tesseract.exe").exists()){
                throw new RuntimeException("OCR异常-不存在OCR引擎");
            }
            if(file == null || !file.exists()){
                throw new RuntimeException("OCR异常-图片文件不存在");
            }
            String filetype = file.getName().substring(file.getName().lastIndexOf(".")+1);
            System.out.println("=========>runtime1:"+(System.currentTimeMillis()-start));
            // Other input formats could be added here; ignore case so "X.PDF" is accepted.
            if(TYPE_PDF.equalsIgnoreCase(filetype)){
                file = pdf2Jpg(file);
            }else{
                throw new RuntimeException("OCR异常-不支持的文件格式");
            }
            System.out.println("=========>runtime3:"+(System.currentTimeMillis()-start));

            // All workers share one OcrOperate instance.
            OcrOperate oop = new OcrOperate(file, tempath);
            Thread[] workers = new Thread[WORKER_COUNT];
            for (int i = 0; i < workers.length; i++) {
                workers[i] = new Thread(oop);
                workers[i].start();
            }
            joinAll(workers);
            System.out.println("=========>runtime4:"+(System.currentTimeMillis()-start));
            return oop.getResult();

        } catch (Exception e){
            throw new RuntimeException("OCR异常-解析文件错误",e);
        }
    }

    /**
     * Waits until every worker has terminated. An interrupt does not cut the wait short,
     * because the result map must not be read while workers may still write to it; the
     * interrupt status is restored once all workers are done.
     */
    private static void joinAll(Thread[] workers){
        boolean interrupted = false;
        for (Thread worker : workers) {
            boolean joined = false;
            while(!joined){
                try {
                    worker.join();
                    joined = true;
                } catch (InterruptedException e) {
                    interrupted = true;
                }
            }
        }
        if(interrupted){
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Renders page 0 of the PDF at 300 DPI as an RGB image and writes it as a JPEG with a
     * random name into the temp directory. Elapsed-time checkpoints are printed.
     *
     * @param infile the PDF file to render
     * @return the JPEG file that was written
     * @throws InvalidPasswordException if the PDF cannot be opened without a password
     * @throws IOException if loading, rendering or writing fails, or no JPEG writer exists
     */
    private static File pdf2Jpg(File infile) throws InvalidPasswordException, IOException {
        long start = System.currentTimeMillis();
        PDDocument document = null;
        try {
            document = PDDocument.load(infile, (String)null);
            File outfile = new File(tempath+File.separator+UUID.randomUUID()+".jpg");

            // Register ImageIO plugins found on the class path (the article relies on the
            // jbig2-imageio jar) before rendering.
            ImageIO.scanForPlugins();
            PDFRenderer renderer = new PDFRenderer(document);
            System.out.println("=========>runtime1.1:"+(System.currentTimeMillis()-start));
            // Arguments: page index, resolution in DPI, colour model of the result.
            BufferedImage image = renderer.renderImageWithDPI(0,300,ImageType.RGB);
            System.out.println("=========>runtime1.2:"+(System.currentTimeMillis()-start));

            FileOutputStream out = new FileOutputStream(outfile);
            try {
                // write() returns false instead of throwing when no writer is available.
                if(!ImageIO.write(image, TYPE_JPG, out)){
                    throw new IOException("OCR异常-没有可用的jpg图片写入器");
                }
            } finally {
                // Close the stream even when encoding fails.
                out.close();
            }
            System.out.println("=========>runtime1.3:"+(System.currentTimeMillis()-start));
            return outfile;
        } finally{
            if(document!=null){
                document.close();
            }
        }
    }
}

OcrOperate.java

package com.jj.test2;

import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import javax.imageio.ImageIO;
import javax.imageio.ImageReadParam;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;

import org.jdesktop.swingx.util.OS;

/**
 * Crops fixed regions out of a JPEG page image and runs the Tesseract command line tool on
 * each crop, storing selected lines of the recognised text in a result map.
 *
 * <p>One instance may be shared by several threads: the queue of pending regions and the
 * result map are both guarded by synchronization. Intermediate files (cropped images and
 * Tesseract's text output) are left in the temp directory.
 */
public class OcrOperate implements Runnable{

    /** Tesseract install directory used by the two-argument constructor. */
    private static final String DEFAULT_TESSPATH = "d:\\application\\Tesseract-OCR\\";
    /** ImageIO format name of the page image and of the crops. */
    private static final String TYPE_JPG = "jpg";
    /** Upper bound on polls while waiting for Tesseract (120 x 100 ms). */
    private static final int WAIT_ATTEMPTS = 120;
    /** Pause between two polls, in milliseconds. */
    private static final long WAIT_INTERVAL_MILLIS = 100L;

    private final String tesspath;  // Tesseract install directory
    private final String tempath;   // directory for intermediate files
    private final File file;        // page image the regions are cut from
    private final Map<String, String> result = new HashMap<String, String>(); // guarded by itself
    private final LinkedList<int[]> list = new LinkedList<int[]>();           // guarded by itself

    /**
     * Creates an instance that uses the default Tesseract install directory.
     *
     * @param file    JPEG page image to crop
     * @param tempath directory that receives the intermediate files
     */
    public OcrOperate(File file,String tempath){
        this(file, tempath, DEFAULT_TESSPATH);
    }

    /**
     * Creates an instance with an explicit Tesseract install directory.
     *
     * @param file     JPEG page image to crop
     * @param tempath  directory that receives the intermediate files
     * @param tesspath directory containing the {@code tesseract} executable
     */
    public OcrOperate(File file,String tempath,String tesspath){
        this.file = file;
        this.tempath = tempath;
        this.tesspath = tesspath;
        // Regions as {x, y, width, height}. The positions must be tuned by hand for the
        // documents being processed; all regions have the same size.
        for(int i=0;i<4;i++){
            this.list.add(new int[]{230,950+i*130,1800,120});
        }
    }

    /**
     * Processes pending regions until the queue is empty or the thread is interrupted.
     * A failure in one region is reported on standard error and does not stop the others.
     */
    @Override
    public void run() {
        int[] region;
        while(!Thread.currentThread().isInterrupted() && (region = nextRegion()) != null){
            try {
                recognizeText(cutImage(file, region));
            } catch (IOException e) {
                e.printStackTrace();
            } catch (RuntimeException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the live result map. Read it only after every thread running this instance
     * has terminated.
     *
     * @return recognised values keyed by {@code question1}, {@code question11},
     *         {@code question2}, {@code question3} and {@code question4}
     */
    public Map<String, String> getResult(){
        return result;
    }

    /** Atomically removes and returns the next pending region, or null when none is left. */
    private int[] nextRegion(){
        synchronized (list) {
            return list.poll();
        }
    }

    /**
     * Cuts one region out of the page image and writes it as a new JPEG with a random name
     * into the temp directory. Only the requested region is decoded, which keeps the image
     * handed to Tesseract small.
     *
     * @param file JPEG image to read
     * @param arr  region as {x, y, width, height}
     * @return the cropped JPEG file
     * @throws IOException if the image cannot be read or written, or no JPEG codec exists
     */
    private File cutImage(File file,int[] arr)throws IOException {
        java.util.Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(TYPE_JPG);
        if(!readers.hasNext()){
            throw new IOException("OCR异常-没有可用的jpg图片读取器");
        }
        ImageReader reader = readers.next();
        FileInputStream is = null;
        ImageInputStream iis = null;
        try {
            is = new FileInputStream(file);
            iis = ImageIO.createImageInputStream(is);
            if(iis == null){
                throw new IOException("OCR异常-无法创建图片输入流");
            }
            reader.setInput(iis, true);

            ImageReadParam param = reader.getDefaultReadParam();
            param.setSourceRegion(new Rectangle(arr[0],arr[1],arr[2],arr[3]));
            BufferedImage bi = reader.read(0,param);

            File outFile = new File(tempath+File.separator+UUID.randomUUID()+".jpg");
            // write() returns false instead of throwing when no writer is available.
            if(!ImageIO.write(bi,TYPE_JPG,outFile)){
                throw new IOException("OCR异常-没有可用的jpg图片写入器");
            }
            return outFile;
        } finally{
            // Release the reader, then close the wrapper before the underlying stream; the
            // nested finally closes the file stream even if closing the wrapper fails.
            reader.dispose();
            try {
                if (iis != null) {
                    iis.close();
                }
            } finally {
                if (is != null) {
                    is.close();
                }
            }
        }
    }

    /**
     * Runs Tesseract on the given image, waits for the process to exit (about 12 seconds
     * at most), then reads the produced text file line by line and stores recognised
     * values in the result map.
     *
     * @param file image to recognise; Tesseract runs with this file's directory as its
     *             working directory
     * @throws IOException if the process cannot be started or the output cannot be read
     * @throws RuntimeException if Tesseract does not finish in time or leaves no text file
     */
    private void recognizeText(File file) throws IOException{
        Process process = null;
        BufferedReader br = null;

        // Tesseract appends ".txt" to the output base name it is given.
        File outBase = new File(tempath,UUID.randomUUID()+"");
        File outfile = new File(outBase.getAbsoluteFile()+".txt");

        List<String> cmd = new ArrayList<String>();
        cmd.add(tesspath + File.separator + "tesseract");
        cmd.add(file.getName());
        cmd.add(outBase.getName());
        cmd.add("-l");
        // Lower case so the name matches chi_sim.traineddata on case-sensitive file systems.
        cmd.add("chi_sim");

        try {
            ProcessBuilder pb = new ProcessBuilder();
            // Input and output names above are relative to this working directory.
            pb.directory(file.getParentFile());
            pb.command(cmd);
            pb.redirectErrorStream(true);
            // NOTE(review): the process output is never consumed; this assumes Tesseract
            // prints too little to fill the pipe buffer - confirm for the version in use.
            process = pb.start();
            System.out.println("=========>cmd:"+cmd.toString());

            // Wait for the process itself, not merely for the file to appear: the file can
            // exist before Tesseract has finished writing it.
            if (!waitForExit(process) || !outfile.exists()){
                throw new RuntimeException("OCR异常-解析超时，未识别出文本文件");
            }

            br = new BufferedReader(new InputStreamReader( new FileInputStream(outfile), "UTF-8"));
            String str;
            while ((str = br.readLine()) != null){
                // Strip whitespace and ASCII punctuation before matching.
                str = str.replaceAll("\\s*\\p{Punct}*", "");
                System.out.println("=======>str:"+str);
                storeLine(str);
            }
        } finally {
            if(process!=null){
                process.destroy();
            }
            if(br!=null){
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Polls the process until it has exited or the time budget is used up.
     *
     * @return true if the process exited; false on timeout or when the thread was
     *         interrupted (the interrupt status is restored)
     */
    private static boolean waitForExit(Process process){
        int remaining = WAIT_ATTEMPTS;
        while(true){
            try {
                process.exitValue();
                return true;
            } catch (IllegalThreadStateException stillRunning) {
                // exitValue() throws while the process is still alive; keep polling.
            }
            if(remaining-- <= 0){
                return false;
            }
            try {
                Thread.sleep(WAIT_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return false;
            }
        }
    }

    /**
     * Extracts the value from one cleaned-up line and stores it under the matching key.
     * The character offsets are tuned to the expected document layout; a line that is too
     * short for its offset, or that lacks the expected marker, is skipped.
     */
    private void storeLine(String str){
        synchronized (result) {
            if(str.contains("问题一")){
                int marker = str.indexOf("发文序号");
                if(marker >= 7){
                    result.put("question1", str.substring(7,marker));
                    result.put("question11", str.substring(marker+4));
                }else{
                    System.out.println("=======>skipped:"+str);
                }
            }else if(str.contains("问题二")){
                putTail("question2", str, 8);
            }else if(str.contains("问题三")){
                putTail("question3", str, 6);
            }else if(str.contains("问题四")){
                result.put("question4", str);
            }
        }
    }

    /** Stores the text after the first {@code offset} characters; callers hold the result lock. */
    private void putTail(String key,String str,int offset){
        if(str.length() >= offset){
            result.put(key, str.substring(offset));
        }else{
            System.out.println("=======>skipped:"+str);
        }
    }

}

OcrEngine.java是主类，包括pdf到图片的转换和线程的启动；OcrOperate.java实现图片的切割和ocr解析图片。感觉还是很弱智，后面再改善。另外，ocr的解析时间是变短了，但是pdf转jpg的时间还是很长，这主要是因为转换时要求的分辨率太高了，可以自行调整，或者寻找另一种效率更高、兼容性更好的扩展jar包来转换pdf。