下面的程序可以批量识别文件夹中的图片,只需要修改相应的文件夹位置即可。已有很多文章讲述如何安装tesseract,这里不再赘述,直接放代码。
from PIL import Image
import pytesseract
import string, re, os
import codecs
def imgtostr(imgpath):
    """Recognize all text in an image and return it on a single line.

    Args:
        imgpath: Path of the image file to open.

    Returns:
        The string produced by ``pytesseract.image_to_string`` using the
        ``eng+chi_sim`` language setting, with every newline replaced by
        a space.
    """
    # The context manager closes the image file even if recognition raises,
    # which matters when many files are processed in one run.
    with Image.open(imgpath, 'r') as image:
        text = pytesseract.image_to_string(image, lang='eng+chi_sim')
    # Flatten the result so each image yields one line of text.
    return text.replace("\n", " ")
# Create word.txt, or truncate it if it already exists; writefile() appends
# to this file afterwards.
with open('word.txt', 'w'):
    pass
def writefile(image_path, strstr):
    """Append text to word.txt, followed by a blank line.

    Args:
        image_path: Not used by this function; kept so existing callers
            keep working.
        strstr: The text to append. It is passed to ``writelines``, so a
            single string or an iterable of strings is accepted.
    """
    with open('word.txt', 'a+', encoding="utf-8") as out:
        out.writelines(strstr)
        # Two newlines separate this entry from the next one.
        out.write("\n\n")
if __name__ == '__main__':
    # Script entry point: run OCR on every file found under image_path,
    # append each result to word.txt via writefile(), and echo it to stdout.
    print("\n")
    image_path = './test_images'
    print("\n....start........\n")
    for foldName, _subfolders, filenames in os.walk(image_path):
        for fe in filenames:
            # os.walk also descends into subdirectories, so build the path
            # from the directory being visited rather than from the root;
            # otherwise files in subfolders resolve to paths that do not exist.
            grpaimg = os.path.join(foldName, fe)
            print(grpaimg)
            textddd = imgtostr(grpaimg)
            writefile(image_path, grpaimg + ":\n" + textddd)
            print(textddd, end="\n\n")
    print("......end........")