tessdata的Ocr识别

最新推荐文章于 2024-07-24 15:36:57 发布

cqwuliu

最新推荐文章于 2024-07-24 15:36:57 发布

阅读量548

点赞数

分类专栏：智能识别图片 File处理工具 jAVA工具

本文链接：https://blog.csdn.net/weixin_44873668/article/details/114327044

版权

jAVA工具同时被 3 个专栏收录

28 篇文章 0 订阅

订阅专栏

File处理工具

13 篇文章 0 订阅

订阅专栏

智能识别图片

3 篇文章 0 订阅

订阅专栏

话不多说，直接上代码。

中文语言库可以自行搜索下载，GitHub 上也有很多可供下载，需要的文件是：
chi_sim.traineddata

package com.doing.utils;


import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Slf4j
public class OcrLocalTess4j {

    /** Matches any run of whitespace (spaces, tabs, line breaks); compiled once and reused. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Marker that starts the number to extract from the recognized text (extend as needed). */
    private static final String NUMBER_PREFIX = "5000";

    /** Number of characters, marker included, that are cut out of the recognized text. */
    private static final int NUMBER_LENGTH = 11;

    /**
     * Runs Tesseract OCR (language {@code chi_sim}) on an in-memory image and extracts the
     * 11-character token that starts with {@code "5000"}.
     *
     * <p>The image is written to a uniquely named temporary file inside the directory configured
     * under {@code tempfilejpgPath}; the file is deleted again before the method returns. The
     * Tesseract training data is read from the directory configured under {@code languagePath}.
     *
     * @author Mr伍
     * @param bytes content of the image file; must be non-null and non-empty
     * @return the 11 characters of the whitespace-stripped OCR text beginning at the first
     *     occurrence of {@code "5000"}
     * @throws IllegalArgumentException if {@code bytes} is null or empty
     * @throws IllegalStateException if OCR fails, or the recognized text does not contain a
     *     complete token starting with {@code "5000"}
     * @throws IOException if the temporary directory or file cannot be created or written
     */
    public static String dowl_Ocr(byte[] bytes) throws RuntimeException, IOException {
        if (bytes == null || bytes.length == 0) {
            throw new IllegalArgumentException("bytes must contain the image data");
        }

        // Directory for the temporary image file.
        String filePath = DefaultConfig.getString("tempfilejpgPath");
        // Directory holding the Tesseract language data.
        String xlLib = DefaultConfig.getString("languagePath");

        File dir = new File(filePath);
        // Re-check isDirectory() after mkdirs(): another thread may have created it meanwhile.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create temp directory: " + dir);
        }

        // A unique name per call keeps concurrent invocations from overwriting each other's image.
        File file = File.createTempFile("tem", ".jpg", dir);
        try {
            try (BufferedOutputStream out =
                    new BufferedOutputStream(new java.io.FileOutputStream(file))) {
                out.write(bytes);
            }

            String strNoBlank = WHITESPACE.matcher(recognize(file, xlLib)).replaceAll("");
            log.info("result:{}", strNoBlank);

            String number = extractNumber(strNoBlank);
            log.info("识别出的数字{}", number);
            return number;
        } finally {
            if (!file.delete()) {
                log.warn("Could not delete temporary OCR image {}", file);
            }
        }
    }

    /** Runs Tesseract on {@code image} and returns the raw recognized text (never null). */
    private static String recognize(File image, String dataPath) {
        ITesseract instance = new Tesseract();
        // Location of the training data.
        instance.setDatapath(dataPath);
        // chi_sim = Simplified Chinese, eng = English.
        instance.setLanguage("chi_sim");
        try {
            // doOCR also has overloads for other input types such as BufferedImage.
            String result = instance.doOCR(image);
            return result == null ? "" : result;
        } catch (TesseractException e) {
            throw new IllegalStateException("OCR failed for " + image, e);
        }
    }

    /** Cuts the fixed-length token starting at the first {@code "5000"} out of {@code text}. */
    private static String extractNumber(String text) {
        int start = text.indexOf(NUMBER_PREFIX);
        if (start < 0) {
            throw new IllegalStateException(
                    "Recognized text contains no \"" + NUMBER_PREFIX + "\": " + text);
        }
        int end = start + NUMBER_LENGTH;
        if (end > text.length()) {
            throw new IllegalStateException(
                    "Recognized text is too short after \"" + NUMBER_PREFIX + "\": " + text);
        }
        return text.substring(start, end);
    }

}


<dependency>
            <!-- Tess4J: supplies the net.sourceforge.tess4j classes (ITesseract, Tesseract,
                 TesseractException) used by OcrLocalTess4j. -->
            <groupId>net.sourceforge.tess4j</groupId>
            <artifactId>tess4j</artifactId>
            <version>4.4.0</version>
            <!-- Exclusions: drop the log4j-over-slf4j and logback-classic transitive
                 dependencies. NOTE(review): presumably to avoid clashing with the host
                 project's own logging setup; confirm before removing. -->
            <exclusions>
                <exclusion>
                    <artifactId>log4j-over-slf4j</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>logback-classic</artifactId>
                    <groupId>ch.qos.logback</groupId>
                </exclusion>
            </exclusions>
        </dependency>