tesseract train python_file

# tesseract-trainer
This is a set of two tools used to generate OCR training files for Tesseract. It is particularly designed for image files with small numbers of characters. It will help you create box files, assuming the name of the image file reflects the text contained in the image.

To run the tesseract trainer, point it at a directory containing a set of image files and a set of box files with corresponding file names. For example, the directory might contain:
- asdf.png
- asdf.box
- qwerty.png
- qwerty.box

The file names correspond to the characters that each image contains.

from PIL import Image
import subprocess
import os
import numpy

#Steps to take before running:
#Set TESSDATA_PREFIX to correct directory
#Put image and box files together in the same directory
#Label each corresponding file with the same filenames

class TesseractTrainer():
    """Drive the Tesseract command-line training tools over one directory.

    The directory is expected to hold image files (jpg/jpeg/png) and ``.box``
    files that share the images' base names. ``runAll`` executes every step
    in order; the individual steps can also be called on their own.

    All external tools are started with ``cwd=self.directory`` instead of
    changing the process-wide working directory, so a failing step never
    leaves the caller in a different directory and relative ``directory``
    values work.
    """

    def __init__(self, directory="/Users/ryan/Documents/tesseract-trainer/images",
                 languageName="eng", fontName="captchaFont"):
        """Store the training settings.

        Args:
            directory: Folder holding the image and box files; every
                generated file is written here as well.
            languageName: Prefix used for the renamed training files and
                for the final tessdata files.
            fontName: Font name used in the training file names and written
                to ``font_properties``.
        """
        self.languageName = languageName
        self.fontName = fontName
        self.directory = directory
        # Cached listings as strings of " <name>" items; None until a scan
        # of the directory finds at least one matching file.
        self.trainingList = None
        self.boxList = None

    def runAll(self):
        """Run the complete training pipeline, one step after another."""
        self.createFontFile()
        self.cleanImages()
        self.renameFiles()
        self.extractUnicode()
        self.runShapeClustering()
        self.runMfTraining()
        self.runCnTraining()
        self.createTessData()

    def cleanImages(self):
        """Threshold every jpg/jpeg/png in the directory and save it as .tiff.

        The source image is left in place; the result is written next to it
        as ``<base name>.tiff``.
        """
        print("CLEANING IMAGES...")
        for fileName in sorted(os.listdir(self.directory)):
            if not fileName.endswith(("jpg", "jpeg", "png")):
                continue
            root, _ = os.path.splitext(fileName)
            # The context manager closes the source file handle once the
            # thresholded copy has been written.
            with Image.open(os.path.join(self.directory, fileName)) as image:
                # Values below 250 become 0, everything else becomes 255.
                cleaned = image.point(lambda x: 0 if x<250 else 255)
                cleaned.save(os.path.join(self.directory, root + ".tiff"))

    def renameFiles(self):
        """Rename each box/tiff pair to ``<language>.<font>.exp<N>``.

        For every ``.box`` file the ``.tiff`` with the same base name is
        looked up, both are renamed, and ``createTrainingFile`` is run on
        the new prefix. Files are processed in sorted order so the numbering
        is deterministic. A box file without a matching ``.tiff`` is skipped
        with a warning instead of aborting half-way through the renames.

        Returns:
            A string with one ``" <new box file name>"`` item per renamed
            pair (empty when nothing was renamed).
        """
        boxString = ""
        index = 0
        for fileName in sorted(os.listdir(self.directory)):
            if not fileName.endswith(".box"):
                continue
            root, _ = os.path.splitext(fileName)
            oldTiff = os.path.join(self.directory, root + ".tiff")
            if not os.path.isfile(oldTiff):
                print("WARNING: no .tiff image for " + fileName + ", skipping")
                continue

            prefix = self.languageName + "." + self.fontName + ".exp" + str(index)
            os.rename(oldTiff, os.path.join(self.directory, prefix + ".tiff"))
            os.rename(os.path.join(self.directory, fileName),
                      os.path.join(self.directory, prefix + ".box"))
            boxString += " " + prefix + ".box"
            self.createTrainingFile(prefix)
            index += 1

        return boxString

    def createTrainingFile(self, prefix):
        """Run ``tesseract ... box.train`` for one tiff/box pair.

        Called by ``renameFiles``. If tesseract reports "Empty page!!" on
        stderr, the command is retried once with ``-psm 7``.

        Args:
            prefix: Base name (without extension) shared by the tiff and
                box file inside ``self.directory``.
        """
        print("CREATING TRAINING DATA...")
        command = ["tesseract", prefix + ".tiff", prefix, "nobatch", "box.train"]
        process = subprocess.Popen(command, cwd=self.directory,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Only stderr is inspected; undecodable bytes must not abort training.
        errorOutput = process.communicate()[1].decode("utf-8", errors="replace")
        if "Empty page!!" in errorOutput:
            # NOTE(review): newer Tesseract releases spell this flag
            # "--psm" -- confirm against the installed version.
            subprocess.call(
                ["tesseract", "-psm", "7", prefix + ".tiff", prefix, "nobatch", "box.train"],
                cwd=self.directory)

    def extractUnicode(self):
        """Run ``unicharset_extractor`` over every box file in the directory.

        Raises:
            FileNotFoundError: If the directory contains no ``.box`` files.
        """
        print("EXTRACTING UNICODE...")
        boxFiles = self._filesWithSuffix(".box")
        if not boxFiles:
            raise FileNotFoundError("No .box files found in " + self.directory)
        # Keep the string cache populated, as earlier versions did.
        self.getBoxFileList()
        self._runTool(["unicharset_extractor"] + boxFiles)

    def createFontFile(self):
        """Write ``font_properties`` containing the font name and five zeros."""
        # NOTE(review): the five zeros are presumably the italic/bold/fixed/
        # serif/fraktur flags of Tesseract's font_properties format -- confirm.
        with open(os.path.join(self.directory, "font_properties"), 'w') as fout:
            fout.write(self.fontName + " 0 0 0 0 0")

    def runShapeClustering(self):
        """Run ``shapeclustering -F font_properties -U unicharset <.tr files>``.

        Raises:
            FileNotFoundError: If the directory contains no ``.tr`` files.
        """
        print("RUNNING SHAPE CLUSTERING...")
        self._runTool(self._trainingCommand("shapeclustering"))

    def runMfTraining(self):
        """Run ``mftraining -F font_properties -U unicharset <.tr files>``.

        Raises:
            FileNotFoundError: If the directory contains no ``.tr`` files.
        """
        print("RUNNING MF CLUSTERING...")
        self._runTool(self._trainingCommand("mftraining"))

    def runCnTraining(self):
        """Run ``cntraining -F font_properties -U unicharset <.tr files>``.

        Raises:
            FileNotFoundError: If the directory contains no ``.tr`` files.
        """
        print("RUNNING CN TRAINING...")
        self._runTool(self._trainingCommand("cntraining"))

    def createTessData(self):
        """Prefix the generated files with the language and combine them.

        Renames ``unicharset``, ``shapetable``, ``inttemp``, ``normproto``
        and ``pffmtable`` to ``<language>.<name>`` and then runs
        ``combine_tessdata <language>.``.
        """
        print("CREATING TESS DATA...")
        for name in ("unicharset", "shapetable", "inttemp", "normproto", "pffmtable"):
            os.rename(os.path.join(self.directory, name),
                      os.path.join(self.directory, self.languageName + "." + name))
        self._runTool(["combine_tessdata", self.languageName + "."])

    def getBoxFileList(self):
        """Return the cached ``" a.box b.box"``-style list of box files.

        Returns:
            The names of all ``.box`` files in the directory, each preceded
            by a single space, or ``None`` when there are none. A non-empty
            result is cached; ``None`` is not, so a later call scans again.
        """
        if self.boxList is not None:
            return self.boxList
        names = self._filesWithSuffix(".box")
        self.boxList = "".join(" " + name for name in names) if names else None
        return self.boxList

    def getTrainingFileList(self):
        """Return the cached ``" a.tr b.tr"``-style list of training files.

        Returns:
            The names of all ``.tr`` files in the directory, each preceded
            by a single space, or ``None`` when there are none. A non-empty
            result is cached; ``None`` is not, so a later call scans again.
        """
        if self.trainingList is not None:
            return self.trainingList
        names = self._filesWithSuffix(".tr")
        self.trainingList = "".join(" " + name for name in names) if names else None
        return self.trainingList

    def _filesWithSuffix(self, suffix):
        """Return the sorted names in the directory that end with *suffix*."""
        return sorted(name for name in os.listdir(self.directory)
                      if name.endswith(suffix))

    def _trainingCommand(self, tool):
        """Build ``[tool, -F, font_properties, -U, unicharset, *.tr]``.

        The file names are kept as separate list items (rather than split
        out of the space-joined cache) so names containing spaces survive.
        """
        trainingFiles = self._filesWithSuffix(".tr")
        if not trainingFiles:
            raise FileNotFoundError("No .tr training files found in " + self.directory)
        # Keep the string cache populated, as earlier versions did.
        self.getTrainingFileList()
        return [tool, "-F", "font_properties", "-U", "unicharset"] + trainingFiles

    def _runTool(self, command):
        """Run *command* inside the training directory and return its exit status."""
        status = subprocess.call(command, cwd=self.directory)
        if status != 0:
            print("WARNING: " + command[0] + " exited with status " + str(status))
        return status



# Script entry point: build a trainer with its default settings and run the
# whole pipeline. These statements are unguarded, so they also execute when
# this file is imported as a module.
trainer = TesseractTrainer()
trainer.runAll()



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值