依赖
<!-- OCR: image text recognition (Tess4J) -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.1.1</version>
</dependency>
demo
/**
 * Demo entry point: runs OCR over every {@code png} file found directly inside the
 * configured image directory and prints each recognized text to standard error.
 *
 * <p>Any exception raised while processing is printed and ends the run; files whose
 * bytes cannot be decoded into an image are skipped individually.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    Tess4jClient tess4jClient = new Tess4jClient();
    // Non-recursive listing, restricted to the "png" extension.
    Collection<File> files = FileUtils.listFiles(new File("图片路径"), new String[]{"png"}, false);
    StringBuilder stringBuilder = new StringBuilder();
    try {
        for (File file : files) {
            String path = file.getPath();
            byte[] bytes = tess4jClient.downLoadFile(path);
            // Decode the raw bytes into a BufferedImage.
            BufferedImage imageFile = ImageIO.read(new ByteArrayInputStream(bytes));
            if (imageFile == null) {
                // ImageIO.read returns null when no registered reader can decode the data;
                // skip this file rather than handing null to doOCR and aborting the batch.
                System.err.println("Skipping undecodable image: " + path);
                continue;
            }
            // Recognize the text contained in the image.
            String result = tess4jClient.doOCR(imageFile);
            stringBuilder.append(result).append("\r\n");
            System.err.println(result);
        }
        // Combined text of all processed images, one CRLF-terminated entry per image.
        // The demo does not consume it further; it is the hook for post-processing.
        String replace = stringBuilder.toString();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Runs OCR on an in-memory image and returns the recognized text.
 *
 * <p>A new engine instance is created and configured on every call, using the
 * {@code dataPath} and {@code language} values of this object.
 *
 * @param image the image to recognize
 * @return the text produced by the OCR engine
 * @throws TesseractException if the engine reports a recognition failure
 */
public String doOCR(BufferedImage image) throws TesseractException {
    ITesseract ocrEngine = new Tesseract();
    // Location of the trained data files (the original note refers to the Chinese data set).
    ocrEngine.setDatapath(dataPath);
    // Language to recognize (the original note says Chinese).
    ocrEngine.setLanguage(language);
    return ocrEngine.doOCR(image);
}
/**
 * Reads the entire file at {@code path} into memory.
 *
 * @param path file-system path of the file to read
 * @return the complete contents of the file; an empty array for an empty file
 * @throws FileNotFoundException if the file cannot be opened for reading
 * @throws java.io.UncheckedIOException if reading or closing the file fails after it was opened
 */
public byte[] downLoadFile(String path) throws FileNotFoundException {
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    // try-with-resources releases the file handle on every exit path.
    try (InputStream inputStream = new FileInputStream(path)) {
        byte[] buff = new byte[8192];
        int rc;
        while ((rc = inputStream.read(buff)) != -1) {
            byteArrayOutputStream.write(buff, 0, rc);
        }
    } catch (FileNotFoundException e) {
        // Keep the declared contract: a file that cannot be opened is reported as-is.
        throw e;
    } catch (IOException e) {
        // Carrying on after a failed read would append stale buffer contents again and
        // could loop forever. The signature cannot declare IOException without breaking
        // existing callers, so the failure is surfaced unchecked with its cause attached.
        throw new java.io.UncheckedIOException("Failed to read file: " + path, e);
    }
    return byteArrayOutputStream.toByteArray();
}