MacOS安装使用Tesseract-OCR进行图文识别

1、引入依赖(自行安装文件服务器,通过minio下载图片进行ocr识别)

        <!--tess4J ocr图像识别-->
        <dependency>
            <groupId>net.sourceforge.tess4j</groupId>
            <artifactId>tess4j</artifactId>
            <version>4.4.1</version>
        </dependency>

 <!-- MinIO -->
        <dependency>
            <groupId>io.minio</groupId>
            <artifactId>minio</artifactId>
            <version>8.2.2</version>
        </dependency>
        <!-- Hutool -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.8.18</version>
        </dependency>

2、创建client

package com.example.demo.config;

import lombok.Getter;
import lombok.Setter;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

import java.awt.image.BufferedImage;

/**
 * Spring-managed wrapper around tess4j that runs OCR on an in-memory image.
 *
 * <p>Both fields are populated from configuration properties under the
 * {@code tess4j} prefix.
 */
@Getter
@Setter
@Component
@ConfigurationProperties(prefix = "tess4j")
public class Tess4jClient {

    /** Path handed to {@code ITesseract#setDatapath} (the trained-data directory). */
    private String dataPath;
    /** Language code handed to {@code ITesseract#setLanguage}. */
    private String language;

    /**
     * Recognizes the text contained in the given image.
     *
     * @param image the image to run OCR on
     * @return the recognized text with every CR and LF character replaced by
     *         {@code "-"} and every space removed, so the result is a single line
     * @throws TesseractException if the OCR engine fails
     */
    public String doOCR(BufferedImage image) throws TesseractException {
        // A fresh engine instance is configured for each call.
        final ITesseract engine = new Tesseract();
        engine.setDatapath(dataPath);
        engine.setLanguage(language);

        final String rawText = engine.doOCR(image);

        // Collapse the output onto one line: line breaks become "-", spaces are dropped.
        final String singleLine = rawText.replaceAll("\\r|\\n", "-");
        return singleLine.replaceAll(" ", "");
    }
}

3、导入中文字体库

4、配置application.yml;

tess4j:
  datapath: /Users/jianchenghou/uploadjar/tessdata # 步骤3下载位置
  language: chi_sim

5、工具类

package com.example.demo.util;

import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson2.JSONObject;
import com.example.demo.config.MinioProperties;
import io.minio.*;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;

/**
 * @Author houjiancheng
 * @Date 2022-08-15 14:31
 * @Version V1.0
 */
@Slf4j
@Component
public class MinioUtils {

    @Autowired
    private MinioClient minioClient;
    @Autowired
    private MinioProperties minioProp;

    private static String[] fileArr= {".jpg",".png",".zip",".doc",".docx",".xls",".xlsx",".pdf"};
    //支持上传jpg/png/zip/doc/docx/xls/xlsx/pdf文件,且不超过100MB

    private static final Long FILE_MAX_SIZE =1024*1024*10L;


    /**
     * @Author houjiancheng
     * @Description  判断桶存在,不存在创建桶
     * @Date 10:49 2022-08-16
     * @Param [bucketName]
     **/
    @SneakyThrows
    public void createBucket(String bucketName) {
        if (!minioClient.bucketExists(BucketExistsArgs.builder().bucket(bucketName).build())) {
            minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build());
        }

    }

    @SneakyThrows
    public InputStream getObjectInputStream(String objectName,String bucketName){
        GetObjectArgs getObjectArgs = GetObjectArgs.builder()
                .bucket(bucketName)
                .object(objectName)
                .build();
        return minioClient.getObject(getObjectArgs);
    }


    public JSONObject uploadFile(MultipartFile file,String  bucket) throws Exception {
        JSONObject res = new JSONObject();
        res.put("code", 0);
        // 判断上传文件是否为空
        if (null == file || 0 == file.getSize()) {
            res.put("msg", "上传文件不能为空");
            return res;
        }
        InputStream is=null;
        try {
            // 判断存储桶是否存在
            createBucket(bucket);
            // 文件名
            String originalFilename = file.getOriginalFilename();
            // 新的文件名 = 存储桶名称_时间戳.后缀名
            String fileName = DateUtil.getDateStr() + "/" + IdUtil.getId() + originalFilename.substring(originalFilename.lastIndexOf("."));
            // 开始上传
            is=file.getInputStream();
            PutObjectArgs putObjectArgs = PutObjectArgs.builder()
                    .bucket(bucket)
                    .object(fileName)
                    .contentType(file.getContentType())
                    .stream(is, is.available(), -1)
                    .build();
            minioClient.putObject(putObjectArgs);

            res.put("code", 1);
            res.put("msg",  bucket + "/" + fileName);
            res.put("bucket", bucket);
            res.put("fileName", fileName);
            return res;
        }  catch (Exception e) {

            log.error("上传文件失败:{}", e.getMessage());
        }finally {
            if (is != null) {
                is.close();
            }
        }
        res.put("msg", "上传失败");
        return res;
    }

    public void downLoad(String fileName,String realFileName, HttpServletResponse response, HttpServletRequest request,String bucketName) {
        InputStream is=null;
        OutputStream os =null;
        try {
            is=getObjectInputStream(fileName, bucketName);
            if(is!=null){
                byte buf[] = new byte[1024];
                int length = 0;
                String codedfilename = "";
                String agent = request.getHeader("USER-AGENT");
                System.out.println("agent:" + agent);
                if ((null != agent && -1 != agent.indexOf("MSIE")) || (null != agent && -1 != agent.indexOf("Trident"))) {
                    String name = URLEncoder.encode(realFileName, "UTF8");
                    codedfilename = name;
                } else if (null != agent && -1 != agent.indexOf("Mozilla")) {
                    codedfilename = new String(realFileName.getBytes("UTF-8"), "iso-8859-1");
                } else {
                    codedfilename = new String(realFileName.getBytes("UTF-8"), "iso-8859-1");
                }
                response.reset();
                response.setHeader("Content-Disposition", "attachment;filename=" + URLEncoder.encode(realFileName.substring(realFileName.lastIndexOf("/") + 1), "UTF-8"));
                response.setContentType("application/octet-stream");
                response.setCharacterEncoding("UTF-8");
                os = response.getOutputStream();
                // 输出文件
                while ((length = is.read(buf)) > 0) {
                    os.write(buf, 0, length);
                }
                // 关闭输出流
                os.close();

            }else{
                log.error("下载失败");
            }
        }catch (Exception e){

            log.error("错误:"+e.getMessage());
        }finally {
            if(is!=null){
                try {
                    is.close();
                } catch (IOException e) {

                }
            }
            if(os!=null){
                try {
                    os.close();
                } catch (IOException e) {

                }
            }
        }
    }

    public void deleteObject(String objectName,String bucketName) {
        try {
            RemoveObjectArgs removeObjectArgs = RemoveObjectArgs.builder()
                    .bucket(StrUtil.isBlank(bucketName)?minioProp.getBucketName():bucketName)
                    .object(objectName)
                    .build();
            minioClient.removeObject(removeObjectArgs);
        }catch (Exception e){
            log.error("错误:"+e.getMessage());
        }
    }



    public static Map<String, Object> checkFileType(MultipartFile file, Long maxSize){
        Map<String, Object> re = new HashMap<String, Object>();
        Long size = file.getSize();

        if(maxSize != null){
            if(size >= maxSize){
                re.put("code","F");
                re.put("info","文件大小超出限制");
                return re;
            }
        }else{
            maxSize=FILE_MAX_SIZE;
            if(size >= maxSize){
                re.put("code","F");
                re.put("info","文件大小超出限制");
                return re;
            }
        }
        //文件真实名字
        String oldName = file.getOriginalFilename();

        String suf = oldName.substring(oldName.lastIndexOf("."));
        suf = suf.toLowerCase();
/*
        for(String str : wenjian){
            if(str.contains(suf)){
                //文件大小
                long fileSize = file.getSize();
                if(fileSize>(100*1024*1024)){
                    re.put("code","F");
                    re.put("info","请上传文件小于100M的文件");
                    return re;
                }
            }
        }
        for(String str : tupian){
            if(str.contains(suf)){
                //图片
                long fileSize = file.getSize();
                if(fileSize>(5*1024*1024)){
                    re.put("code","F");
                    re.put("info","请上传图片小于5M的图片");
                    return re;
                }
            }
        }
*/

        boolean flag = false;
        for(String str : fileArr){
            if(str.contains(suf)){
                flag = true;
            }
        }
        if(flag){
            re.put("code","T");
            return re;
        }else{
            re.put("code","F");
            re.put("info","请上传jpg/png/zip/docx/xlsx/pdf格式的文件");
            return re;
        }
    }

}

6、测试代码Controller,通过提供一个下载图片的地址,进行文字识别;

package com.example.demo.rest;

import com.example.demo.config.Tess4jClient;
import com.example.demo.service.MinioService;
import com.example.demo.service.TestService;
import io.minio.BucketExistsArgs;
import io.minio.MakeBucketArgs;
import io.minio.MinioClient;
import io.minio.UploadObjectArgs;
import io.minio.errors.MinioException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * Test endpoint: downloads an image from MinIO and returns the text recognized in it.
 */
@RestController
@RequestMapping("/ocr")
public class OcrRestController {

    @Autowired
    private MinioService minioService;
    @Autowired
    private Tess4jClient tess4jClient;


    /**
     * Runs OCR on an image stored in MinIO.
     *
     * @param path object path handed to {@code MinioService#downLoadFileByByte}
     * @return the recognized text, or an empty string when the object cannot be
     *         downloaded, is not a decodable image, or recognition fails
     */
    @PostMapping("/test")
    public String test(String path) {
        try {
            byte[] bytes = minioService.downLoadFileByByte(path);
            if (bytes == null || bytes.length == 0) {
                // Nothing was downloaded, so there is nothing to recognize.
                return "";
            }
            // Decode the downloaded bytes into a BufferedImage.
            BufferedImage imageFile = ImageIO.read(new ByteArrayInputStream(bytes));
            if (imageFile == null) {
                // ImageIO.read returns null when no registered reader can decode the
                // data; do not hand a null image to the OCR engine.
                return "";
            }
            // The recognized text could additionally be run through a sensitive-word filter.
            return tess4jClient.doOCR(imageFile);
        } catch (Exception e) {
            // NOTE(review): no logger is in scope in this class; consider replacing
            // printStackTrace with the project's logging facade.
            e.printStackTrace();
            return "";
        }
    }


}

7、测试代码Service

package com.example.demo.service;


import com.alibaba.fastjson2.JSONObject;
import com.example.demo.config.MinioProperties;
import com.example.demo.entity.Attach;
import com.example.demo.fileutil.*;
import com.example.demo.mapper.FileMapper;
import com.example.demo.rest.MkyUserRestController;
import com.example.demo.util.DateUtil;
import com.example.demo.util.IdUtil;
import com.example.demo.util.MinioUtils;
import io.minio.GetObjectArgs;
import io.minio.MinioClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.ClassUtils;
import org.springframework.web.multipart.MultipartFile;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.*;
import java.net.URLEncoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/**
 * Service layer for MinIO-backed file upload and download.
 */
@Service
@Transactional
public class MinioService {

    // Logger is bound to this class so log lines are attributed correctly.
    private static final Logger log = LoggerFactory.getLogger(MinioService.class);


    @Autowired
    private FileMapper fileMapper;
    @Autowired
    private MinioUtils minioUtils;
    @Autowired
    private MinioClient minioClient;
    @Autowired
    private MinioProperties minioProperties;



    /**
     * Validates a file and uploads it to the given bucket.
     *
     * @param file   the uploaded file
     * @param bucket target bucket
     * @return the validation result map when validation fails; otherwise a map whose
     *         {@code info} entry is the JSON string returned by
     *         {@code MinioUtils#uploadFile}; on an unexpected error a map with
     *         {@code code} = {@code "F"} and {@code info} = {@code "操作失败"}
     */
    public Map<String, Object> upload(MultipartFile file, String bucket) {
        Map<String, Object> fileMap = new HashMap<>();
        try {
            Map<String, Object> checkFileMap = MinioUtils.checkFileType(file, null);
            if ("F".equals(checkFileMap.get("code"))) {
                return checkFileMap;
            }
            JSONObject jsonObject = minioUtils.uploadFile(file, bucket);
            log.info("upload result: {}", jsonObject);
            fileMap.put("info", jsonObject.toString());
        } catch (Exception e) {
            log.error("upload failed", e);
            fileMap.put("code", "F");
            fileMap.put("info", "操作失败");
        }
        return fileMap;
    }


    /**
     * Streams an object from the configured bucket to the HTTP response.
     *
     * @param req  current request
     * @param res  response to write the file to
     * @param path object key inside the bucket
     * @param name file name offered to the client
     */
    public void downloadFile(HttpServletRequest req, HttpServletResponse res, String path, String name) {
        minioUtils.downLoad(path, name, res, req, minioProperties.getBucketName());
    }


    /**
     * Reads an object from the configured bucket fully into memory.
     *
     * @param pathUrl object key inside the configured bucket
     * @return the object's bytes, or an empty array when the object cannot be opened
     *         or read (the failure is logged)
     */
    public byte[] downLoadFileByByte(String pathUrl) {
        // try-with-resources releases the MinIO stream in every case. A failed open
        // or read ends the method rather than continuing to loop on a broken stream.
        try (InputStream inputStream = minioClient.getObject(
                     GetObjectArgs.builder()
                             .bucket(minioProperties.getBucketName())
                             .object(pathUrl)
                             .build());
             ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
            byte[] buff = new byte[8192];
            int rc;
            // read() signals end of stream with -1.
            while ((rc = inputStream.read(buff)) != -1) {
                byteArrayOutputStream.write(buff, 0, rc);
            }
            return byteArrayOutputStream.toByteArray();
        } catch (Exception e) {
            log.error("minio down file error.  pathUrl:{}", pathUrl, e);
            // Never return partially read data.
            return new byte[0];
        }
    }
}

8、保证minio正常,进行测试

上传图片:

测试:

9、注意事项:

1)初次启动报错

Could not initialize class net.sourceforge.tess4j.TessAPI

  • 经过查询,原因是本机尚未安装 tesseract 命令。如何安装呢?打开终端执行:

 

brew install tesseract

然后会提示,找不到brew命令

然后就是安装brew命令:

/bin/zsh -c "$(curl -fsSL https://gitee.com/cunkai/HomebrewCN/raw/master/Homebrew.sh)"

确认安装成功:

brew -v

可以更新国内的镜像提升下载速度

2)安装tesseract,终端执行brew install tesseract,下载很慢可以更新国内镜像

# 替换brew.git:
$ cd "$(brew --repo)"
# 清华大学:
$ git remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/brew.git
 
# 替换homebrew-core.git:
$ cd "$(brew --repo)/Library/Taps/homebrew/homebrew-core"
# 清华大学:
$ git remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/homebrew-core.git
 
# 替换homebrew-bottles:
# 清华大学:
$ echo 'export HOMEBREW_BOTTLE_DOMAIN=https://mirrors.tuna.tsinghua.edu.cn/homebrew-bottles' >> ~/.bash_profile
$ source ~/.bash_profile
 
# 应用生效:
$ brew update

修改完之后,再次执行brew install tesseract仍然会失败;

如果直接访问路径,浏览器能够打开说明只是连接超时了,单独安装即可

执行brew install (失败的内容)libtool.rb

但是遗憾对于github的资源,始终无法下载成功!!!!折磨了我一天,各种修改方法都未能生效,只能自己搞了个野路子;

改用复制的镜像:

https://hub.nuaa.cf/

并且替换host文件,让github.com转发到克隆hub.nuaa.cf,就跳过了403的问题;

vim /etc/hosts

最后验证:tesseract -v

  • 9
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
安装pytesseract库和tesseract-ocr引擎的步骤如下: 1. 首先,确保你已经安装了Python和pip。 2. 打开命令行终端,并执行以下命令来安装pytesseract库: ```shell pip install pytesseract ``` 3. 安装完成后,你还需要安装tesseract-ocr引擎。根据你的操作系统,可以按照以下步骤进行安装: - Windows: - 访问 https://github.com/UB-Mannheim/tesseract/wiki 下载最新的tesseract-ocr安装程序。 - 运行安装程序,并按照提示完成安装。 - 在安装过程中,记住tesseract-ocr的安装路径。 - macOS: - 打开终端,并执行以下命令来安装tesseract-ocr: ```shell brew install tesseract ``` - Linux(Ubuntu): - 打开终端,并执行以下命令来安装tesseract-ocr: ```shell sudo apt-get install tesseract-ocr ``` 4. 安装完成后,你还需要配置pytesseract库以使用正确的tesseract-ocr引擎路径。根据你的操作系统,可以按照以下步骤进行配置: - Windows: - 打开pytesseract库的安装目录,例如:C:\Users\huxiu\AppData\Local\Programs\Python\Python35\Lib\site-packages\pytesseract。 - 打开pytesseract.py文件,并找到以下行: ```python tesseract_cmd = 'tesseract' ``` - 将该行修改为你的tesseract-ocr引擎的路径,例如: ```python tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe' ``` - macOS和Linux: - 不需要进行额外的配置,pytesseract库会自动找到正确的tesseract-ocr引擎路径。 5. 现在,你已经成功安装了pytesseract库和tesseract-ocr引擎。你可以在Python代码中导入pytesseract库,并使用它来进行OCR(光学字符识别)操作。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值