1、引入依赖(需自行安装文件服务器,通过minio下载图片进行ocr识别)
<!-- Tess4J: OCR image recognition (Java wrapper used by Tess4jClient) -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.4.1</version>
</dependency>
<!-- MinIO Java client: the file server the images are downloaded from -->
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.2.2</version>
</dependency>
<!-- Hutool utility library (StrUtil is used by MinioUtils) -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.18</version>
</dependency>
2、创建client
package com.example.demo.config;
import lombok.Getter;
import lombok.Setter;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.awt.image.BufferedImage;
/**
 * Thin wrapper around Tess4J whose settings are bound from the
 * {@code tess4j.*} configuration properties.
 */
@Getter
@Setter
@Component
@ConfigurationProperties(prefix = "tess4j")
public class Tess4jClient {

    /** Directory handed to {@code ITesseract#setDatapath}. */
    private String dataPath;

    /** Language code handed to {@code ITesseract#setLanguage}. */
    private String language;

    /**
     * Runs OCR on the given image and flattens the recognised text into one line.
     *
     * <p>A new {@link Tesseract} instance is created for every call. In the
     * result, each carriage return and each line feed is replaced by
     * {@code "-"} and every space character is removed.
     *
     * @param image the image to recognise
     * @return the flattened recognition result
     * @throws TesseractException if the OCR engine reports a failure
     */
    public String doOCR(BufferedImage image) throws TesseractException {
        ITesseract engine = new Tesseract();
        engine.setDatapath(dataPath);
        engine.setLanguage(language);

        String rawText = engine.doOCR(image);
        String singleLine = rawText.replaceAll("\\r|\\n", "-");
        return singleLine.replace(" ", "");
    }
}
3、导入中文字体库
4、配置application.yml;
tess4j:
  # tessdata directory downloaded in step 3.
  # A YAML comment must be separated from the value by whitespace; written as
  # "tessdata#..." the note would become part of the path.
  datapath: /Users/jianchenghou/uploadjar/tessdata
  # Tesseract language code (chi_sim = Simplified Chinese)
  language: chi_sim
5、工具类
package com.example.demo.util;
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson2.JSONObject;
import com.example.demo.config.MinioProperties;
import io.minio.*;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
/**
* @Author houjiancheng
* @Date 2022-08-15 14:31
* @Version V1.0
*/
@Slf4j
@Component
public class MinioUtils {
@Autowired
private MinioClient minioClient;
@Autowired
private MinioProperties minioProp;
private static String[] fileArr= {".jpg",".png",".zip",".doc",".docx",".xls",".xlsx",".pdf"};
//支持上传jpg/png/zip/doc/docx/xls/xlsx/pdf文件,且不超过100MB
private static final Long FILE_MAX_SIZE =1024*1024*10L;
/**
* @Author houjiancheng
* @Description 判断桶存在,不存在创建桶
* @Date 10:49 2022-08-16
* @Param [bucketName]
**/
@SneakyThrows
public void createBucket(String bucketName) {
if (!minioClient.bucketExists(BucketExistsArgs.builder().bucket(bucketName).build())) {
minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build());
}
}
@SneakyThrows
public InputStream getObjectInputStream(String objectName,String bucketName){
GetObjectArgs getObjectArgs = GetObjectArgs.builder()
.bucket(bucketName)
.object(objectName)
.build();
return minioClient.getObject(getObjectArgs);
}
public JSONObject uploadFile(MultipartFile file,String bucket) throws Exception {
JSONObject res = new JSONObject();
res.put("code", 0);
// 判断上传文件是否为空
if (null == file || 0 == file.getSize()) {
res.put("msg", "上传文件不能为空");
return res;
}
InputStream is=null;
try {
// 判断存储桶是否存在
createBucket(bucket);
// 文件名
String originalFilename = file.getOriginalFilename();
// 新的文件名 = 存储桶名称_时间戳.后缀名
String fileName = DateUtil.getDateStr() + "/" + IdUtil.getId() + originalFilename.substring(originalFilename.lastIndexOf("."));
// 开始上传
is=file.getInputStream();
PutObjectArgs putObjectArgs = PutObjectArgs.builder()
.bucket(bucket)
.object(fileName)
.contentType(file.getContentType())
.stream(is, is.available(), -1)
.build();
minioClient.putObject(putObjectArgs);
res.put("code", 1);
res.put("msg", bucket + "/" + fileName);
res.put("bucket", bucket);
res.put("fileName", fileName);
return res;
} catch (Exception e) {
log.error("上传文件失败:{}", e.getMessage());
}finally {
if (is != null) {
is.close();
}
}
res.put("msg", "上传失败");
return res;
}
public void downLoad(String fileName,String realFileName, HttpServletResponse response, HttpServletRequest request,String bucketName) {
InputStream is=null;
OutputStream os =null;
try {
is=getObjectInputStream(fileName, bucketName);
if(is!=null){
byte buf[] = new byte[1024];
int length = 0;
String codedfilename = "";
String agent = request.getHeader("USER-AGENT");
System.out.println("agent:" + agent);
if ((null != agent && -1 != agent.indexOf("MSIE")) || (null != agent && -1 != agent.indexOf("Trident"))) {
String name = URLEncoder.encode(realFileName, "UTF8");
codedfilename = name;
} else if (null != agent && -1 != agent.indexOf("Mozilla")) {
codedfilename = new String(realFileName.getBytes("UTF-8"), "iso-8859-1");
} else {
codedfilename = new String(realFileName.getBytes("UTF-8"), "iso-8859-1");
}
response.reset();
response.setHeader("Content-Disposition", "attachment;filename=" + URLEncoder.encode(realFileName.substring(realFileName.lastIndexOf("/") + 1), "UTF-8"));
response.setContentType("application/octet-stream");
response.setCharacterEncoding("UTF-8");
os = response.getOutputStream();
// 输出文件
while ((length = is.read(buf)) > 0) {
os.write(buf, 0, length);
}
// 关闭输出流
os.close();
}else{
log.error("下载失败");
}
}catch (Exception e){
log.error("错误:"+e.getMessage());
}finally {
if(is!=null){
try {
is.close();
} catch (IOException e) {
}
}
if(os!=null){
try {
os.close();
} catch (IOException e) {
}
}
}
}
public void deleteObject(String objectName,String bucketName) {
try {
RemoveObjectArgs removeObjectArgs = RemoveObjectArgs.builder()
.bucket(StrUtil.isBlank(bucketName)?minioProp.getBucketName():bucketName)
.object(objectName)
.build();
minioClient.removeObject(removeObjectArgs);
}catch (Exception e){
log.error("错误:"+e.getMessage());
}
}
public static Map<String, Object> checkFileType(MultipartFile file, Long maxSize){
Map<String, Object> re = new HashMap<String, Object>();
Long size = file.getSize();
if(maxSize != null){
if(size >= maxSize){
re.put("code","F");
re.put("info","文件大小超出限制");
return re;
}
}else{
maxSize=FILE_MAX_SIZE;
if(size >= maxSize){
re.put("code","F");
re.put("info","文件大小超出限制");
return re;
}
}
//文件真实名字
String oldName = file.getOriginalFilename();
String suf = oldName.substring(oldName.lastIndexOf("."));
suf = suf.toLowerCase();
/*
for(String str : wenjian){
if(str.contains(suf)){
//文件大小
long fileSize = file.getSize();
if(fileSize>(100*1024*1024)){
re.put("code","F");
re.put("info","请上传文件小于100M的文件");
return re;
}
}
}
for(String str : tupian){
if(str.contains(suf)){
//图片
long fileSize = file.getSize();
if(fileSize>(5*1024*1024)){
re.put("code","F");
re.put("info","请上传图片小于5M的图片");
return re;
}
}
}
*/
boolean flag = false;
for(String str : fileArr){
if(str.contains(suf)){
flag = true;
}
}
if(flag){
re.put("code","T");
return re;
}else{
re.put("code","F");
re.put("info","请上传jpg/png/zip/docx/xlsx/pdf格式的文件");
return re;
}
}
}
6、测试代码Controller,通过提供一个下载图片的地址,进行文字识别;
package com.example.demo.rest;
import com.example.demo.config.Tess4jClient;
import com.example.demo.service.MinioService;
import com.example.demo.service.TestService;
import io.minio.BucketExistsArgs;
import io.minio.MakeBucketArgs;
import io.minio.MinioClient;
import io.minio.UploadObjectArgs;
import io.minio.errors.MinioException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Random;
/**
 * Test endpoint: downloads an image from MinIO and returns the text
 * recognised in it.
 */
@RestController
@RequestMapping("/ocr")
public class OcrRestController {

    @Autowired
    private MinioService minioService;

    @Autowired
    private Tess4jClient tess4jClient;

    /**
     * Runs OCR on the object identified by {@code path}.
     *
     * @param path value passed to {@code MinioService#downLoadFileByByte}
     * @return the recognised text, or an empty string when the object could
     *         not be downloaded, is not a decodable image, or OCR failed
     */
    @PostMapping("/test")
    public String test(String path) {
        String result = "";
        try {
            byte[] bytes = minioService.downLoadFileByByte(path);
            if (bytes == null || bytes.length == 0) {
                return result;
            }
            // Convert the byte[] into a BufferedImage.
            BufferedImage imageFile = ImageIO.read(new ByteArrayInputStream(bytes));
            if (imageFile == null) {
                // ImageIO.read returns null when no registered reader can decode the
                // data; do not hand a null image to the OCR engine.
                return result;
            }
            result = tess4jClient.doOCR(imageFile);
            // A sensitive-word filter could be applied to the recognised text here:
            // boolean isSensitive = sensitiveScan(result);
        } catch (Exception e) {
            // No logger is in scope in this class, so the original reporting is kept.
            e.printStackTrace();
        }
        return result;
    }
}
7、测试代码Service
package com.example.demo.service;
import com.alibaba.fastjson2.JSONObject;
import com.example.demo.config.MinioProperties;
import com.example.demo.entity.Attach;
import com.example.demo.fileutil.*;
import com.example.demo.mapper.FileMapper;
import com.example.demo.rest.MkyUserRestController;
import com.example.demo.util.DateUtil;
import com.example.demo.util.IdUtil;
import com.example.demo.util.MinioUtils;
import io.minio.GetObjectArgs;
import io.minio.MinioClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.ClassUtils;
import org.springframework.web.multipart.MultipartFile;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.*;
import java.net.URLEncoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
/**
 * Service layer over {@link MinioUtils} / {@link MinioClient}: validated
 * upload, HTTP download, and download of an object into memory.
 */
@Service
@Transactional
public class MinioService {

    // Logger bound to this class (it previously used MkyUserRestController's category).
    private static final Logger log = LoggerFactory.getLogger(MinioService.class);

    @Autowired
    private FileMapper fileMapper;

    @Autowired
    private MinioUtils minioUtils;

    @Autowired
    private MinioClient minioClient;

    @Autowired
    private MinioProperties minioProperties;

    /**
     * Validates the file with {@code MinioUtils.checkFileType(file, null)} and,
     * when accepted, uploads it into {@code bucket}.
     *
     * @param file   the uploaded file
     * @param bucket target bucket
     * @return the validation result map when validation fails; otherwise a map
     *         whose {@code info} entry is the JSON string returned by
     *         {@code MinioUtils#uploadFile}; on an unexpected exception a map
     *         with {@code code = "F"} and {@code info = "操作失败"}
     */
    public Map<String, Object> upload(MultipartFile file, String bucket) {
        Map<String, Object> fileMap = new HashMap<>();
        try {
            Map<String, Object> checkFileMap = MinioUtils.checkFileType(file, null);
            if ("F".equals(checkFileMap.get("code"))) {
                return checkFileMap;
            }
            JSONObject jsonObject = minioUtils.uploadFile(file, bucket);
            log.debug("upload result: {}", jsonObject);
            fileMap.put("info", jsonObject.toString());
        } catch (Exception e) {
            // Keep the cause in the log instead of dropping it silently.
            log.error("upload failed", e);
            fileMap.put("code", "F");
            fileMap.put("info", "操作失败");
        }
        return fileMap;
    }

    /**
     * Streams an object from the configured default bucket to the HTTP response.
     *
     * @param req  current request
     * @param res  response to write to
     * @param path object key
     * @param name filename presented to the client
     */
    public void downloadFile(HttpServletRequest req, HttpServletResponse res, String path, String name) {
        minioUtils.downLoad(path, name, res, req, minioProperties.getBucketName());
    }

    /**
     * Reads a whole object from the configured default bucket into memory.
     *
     * <p>The MinIO stream is always closed. Previously it was never closed, a
     * failed {@code getObject} led to a {@code NullPointerException}, and a
     * failing {@code read} made the loop spin forever while appending stale bytes.
     *
     * @param pathUrl object key inside the default bucket
     * @return the object's content
     * @throws IllegalStateException if the object cannot be opened or read
     *                               (the original exception is the cause)
     */
    public byte[] downLoadFileByByte(String pathUrl) {
        try (InputStream inputStream = minioClient.getObject(
                     GetObjectArgs.builder().bucket(minioProperties.getBucketName()).object(pathUrl).build());
             ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
            byte[] buff = new byte[8192];
            int rc;
            while ((rc = inputStream.read(buff)) != -1) {
                byteArrayOutputStream.write(buff, 0, rc);
            }
            return byteArrayOutputStream.toByteArray();
        } catch (Exception e) {
            log.error("minio down file error. pathUrl:{}", pathUrl, e);
            throw new IllegalStateException("minio down file error. pathUrl:" + pathUrl, e);
        }
    }
}
8、保证minio正常,进行测试
上传图片:
测试:
9、注意事项:
1)初次启动报错
Could not initialize class net.sourceforge.tess4j.TessAPI
- 经过查询,原因是本机尚未安装 tesseract 命令。如何安装呢?打开终端执行:
brew install tesseract
然后会提示,找不到brew命令
然后就是安装brew命令:
/bin/zsh -c "$(curl -fsSL https://gitee.com/cunkai/HomebrewCN/raw/master/Homebrew.sh)"
确认安装成功:
brew -v
可以更新国内的镜像提升下载速度
2)安装tesseract,终端执行brew install tesseract,下载很慢可以更新国内镜像
# Point brew.git at a mirror:
$ cd "$(brew --repo)"
# Tsinghua University (TUNA) mirror:
$ git remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/brew.git
# Point homebrew-core.git at a mirror:
$ cd "$(brew --repo)/Library/Taps/homebrew/homebrew-core"
# Tsinghua University (TUNA) mirror:
$ git remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/homebrew-core.git
# Point homebrew-bottles at a mirror:
# Tsinghua University (TUNA) mirror:
$ echo 'export HOMEBREW_BOTTLE_DOMAIN=https://mirrors.tuna.tsinghua.edu.cn/homebrew-bottles' >> ~/.bash_profile
$ source ~/.bash_profile
# Apply the changes:
$ brew update
修改完之后,再次执行brew install tesseract仍然会失败;
如果直接访问路径,浏览器能够打开说明只是连接超时了,单独安装即可
执行brew install (失败的内容)libtool.rb
但是遗憾对于github的资源,始终无法下载成功!!!!折磨了我一天,各种修改方法都未能生效,只能自己搞了个野路子;
改用复制的镜像:
https://hub.nuaa.cf/
并且修改 hosts 文件,把 github.com 指向镜像站 hub.nuaa.cf 的 IP 地址,就绕过了 403 的问题;
vim /etc/hosts
最后验证:tesseract -v