1.下载中文语言包
https://tesseract-ocr.github.io/tessdoc/Data-Files
下载 chi_sim.traineddata
2.maven依赖
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
<!-- Tess4J dependency; presumably the source of the ITesseract/Tesseract classes used by the test code in step 3 (verify against the library docs). -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.5.5</version>
</dependency>
3.测试代码
将第一步下载的chi_sim.traineddata文件复制到D:\50TesseractOCR\tessdata下
从图片中提取文字:
/**
 * Runs OCR with the {@code chi_sim} language data on a fixed sample image and
 * prints the recognized text to standard output.
 *
 * <p>If the image file does not exist, a message is printed and the method
 * returns without invoking the OCR engine.
 *
 * @param args command-line arguments (unused)
 * @throws IOException kept in the signature for backward compatibility
 * @throws TesseractException propagated from {@code doOCR}
 */
public static void main(String[] args) throws IOException, TesseractException {
    String imagePath = "D:\\50TesseractOCR\\20211013153341.png";
    // Locate the image to be read
    File imageFile = new File(imagePath);
    if (!imageFile.exists()) {
        System.out.println("找不到图片");
        return;
    }
    // The file itself is handed to doOCR below, so it is not decoded into a
    // separate in-memory image here (the previous decoded copy was never used).
    // Create the Tesseract instance
    ITesseract tesseract = new Tesseract();
    // Set the directory that contains the trained data files
    tesseract.setDatapath("D:\\50TesseractOCR\\tessdata");
    // Set the recognition language
    tesseract.setLanguage("chi_sim");
    // Run the recognition and print the result
    String result = tesseract.doOCR(imageFile);
    System.out.println(result);
}