        <p style="margin-left:0pt;"><span style="color:#ff0000;">论文:</span></p>

The OCRopus Open Source OCR System

Transfer Learning for OCRopus Model Training on Early Printed Books
























  1. from __future__ import print_function
  2. import numpy as np
  3. import cv2
  4. import time
  5. class Pre_Process(object):
  6. def __init__(self):
  7. self.zoom= 0.5
  8. self.perc= 50
  9. self.range= 20
  10. self.bignore= 0.2
  11. self.maxskew= 5
  12. self.skewsteps= 1
  13. self.escale= 1.0
  14. self.lo= 0.05
  15. self.hi= 0.9
  16. def normalize_raw_image(self,raw):
  17. ''' perform image normalization '''
  18. image = raw - np.amin(raw)
  19. if np.amax(image) == np.amin(image):
  20. return image
  21. image /= np.amax(image)
  22. return image
  23. def estimate_local_whitelevel(self,image, bignore=0.2,zoom=0.5, perc=80, range=20):
  24. '''flatten it by estimating the local whitelevel
  25. zoom for page background estimation, smaller=faster, default: %(default)s
  26. percentage for filters, default: %(default)s
  27. range for filters, default: %(default)s
  28. '''
  29. d0, d1 = image.shape
  30. o0, o1 = int(bignore * d0), int(bignore * d1)
  31. est = image[o0:d0 - o0, o1:d1 - o1]
  32. image_black=np.sum(est < 0.05)
  33. image_white=np.sum(est > 0.95)
  34. extreme = (image_black+image_white) * 1.0 / np.prod(est.shape)
  35. if np.mean(est)< 0.4:
  36. print( np.mean(est),np.median(est))
  37. image = 1 - image
  38. if extreme > 0.95:
  39. flat = image
  40. else:
  41. m=cv2.blur(image,(range,range))
  42. w, h = np.minimum(np.array(image.shape), np.array(m.shape))
  43. flat = np.clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)
  44. return flat
  45. def estimate_skew_angle(self,image, angles):
  46. estimates = []
  47. for a in angles:
  48. matrix = cv2.getRotationMatrix2D((int(image.shape[ 1] / 2), int(image.shape[ 0] / 2)), a, 1)
  49. rotate_image = cv2.warpAffine(image, matrix, (image.shape[ 1], image.shape[ 0]))
  50. v = np.mean(rotate_image, axis= 1)
  51. v = np.var(v)
  52. estimates.append((v, a))
  53. _, a = max(estimates)
  54. return a
  55. def estimate_skew(self,flat,maxskew=2, skewsteps=1):
  56. ''' estimate skew angle and rotate'''
  57. flat = np.amax(flat) - flat
  58. flat -= np.amin(flat)
  59. ma = maxskew
  60. ms = int( 2 * maxskew * skewsteps)
  61. angle = self.estimate_skew_angle(flat, np.linspace(-ma, ma, ms + 1))
  62. matrix = cv2.getRotationMatrix2D((int(flat.shape[ 1] / 2), int(flat.shape[ 0] / 2)), angle, 1)
  63. flat= cv2.warpAffine(flat, matrix, (flat.shape[ 1], flat.shape[ 0]))
  64. flat = np.amax(flat) - flat
  65. return flat, angle
  66. def estimate_thresholds(self,flat, bignore=0.2, escale=1, lo=0.05, hi=0.9):
  67. '''# estimate low and high thresholds
  68. ignore this much of the border for threshold estimation, default: %(default)s
  69. scale for estimating a mask over the text region, default: %(default)s
  70. lo percentile for black estimation, default: %(default)s
  71. hi percentile for white estimation, default: %(default)s
  72. '''
  73. d0, d1 = flat.shape
  74. o0, o1 = int(bignore * d0), int(bignore * d1)
  75. est = flat[o0:d0 - o0, o1:d1 - o1]
  76. if escale > 0:
  77. # by default, we use only regions that contain
  78. # significant variance; this makes the percentile
  79. # based low and high estimates more reliable
  80. v = est -cv2.GaussianBlur(est, ( 3, 3), escale * 20)
  81. v=cv2.GaussianBlur(v ** 2, ( 3, 3), escale * 20)** 0.5
  82. v = (v > 0.3 * np.amax(v))
  83. v=np.asarray(v,np.uint8)
  84. v=cv2.cvtColor(v, cv2.COLOR_GRAY2RGB)
  85. kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(int(escale * 50),int(escale * 50)))
  86. v = cv2.dilate(v, kernel, 1)
  87. v=cv2.cvtColor(v, cv2.COLOR_RGB2GRAY)
  88. v = (v > 0.3 * np.amax(v))
  89. est = est[v]
  90. if len(est)!= 0:
  91. est=np.sort(est)
  92. lo = est[int(lo*len(est))]
  93. hi = est[int(hi*len(est))]
  94. # rescale the image to get the gray scale image
  95. flat -= lo
  96. flat /= (hi - lo)
  97. flat = np.clip(flat, 0, 1)
  98. return flat
  99. def process(self,img):
  100. # perform image normalization(30ms)
  101. image = self.normalize_raw_image(img)
  102. # check whether the image is already effectively binarized(70ms)
  103. flat = self.estimate_local_whitelevel(image,self.bignore, self.zoom, self.perc, self.range)
  104. # estimate skew angle and rotate(100ms)
  105. flat, angle = self.estimate_skew(flat, self.maxskew, self.skewsteps)
  106. # estimate low and high thresholds(200ms)
  107. flat = self.estimate_thresholds(flat, self.bignore, self.escale, self.lo, self.hi)
  108. flat=np.asarray(flat* 255,np.uint8)
  109. return flat
  110. if __name__== "__main__":
  111. pp=Pre_Process()
  112. image=cv2.imread( "0020_0022.png", 0)
  113. image=image/ 255
  114. for i in range( 1):
  115. start = time.time()
  116. flat=pp.process(image)
  117. print( "time:",time.time()-start)
  118. cv2.imwrite( "gray.jpg", flat)
  119. cv2.imwrite( "binary.jpg", 255*(flat> 128))





  1. Sequential(
  2. ( 0): Conv2d( 1, 8, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  3. ( 1): BatchNorm2d( 8, eps=1e- 05, momentum= 0.1, affine= True)
  4. ( 2): ReLU()
  5. ( 3): LSTM2(
  6. (hlstm): RowwiseLSTM(
  7. (lstm): LSTM( 8, 4, bidirectional= 1)
  8. )
  9. (vlstm): RowwiseLSTM(
  10. (lstm): LSTM( 8, 4, bidirectional= 1)
  11. )
  12. )
  13. ( 4): Conv2d( 8, 1, kernel_size=( 1, 1), stride=( 1, 1))
  14. ( 5): Sigmoid()
  15. )


  1. import ocrobin
  2. import cv2
  3. import numpy as np
  4. import time
  5. bm = ocrobin. Binarizer( "bin-000000046-005393.pt")
  6. bm.model
  7. image = np.mean(cv2.imread( "0020_0022.png")[:, :, : 3], 2)
  8. start=time.time()
  9. binary = bm.binarize(image)
  10. print( "time:",time.time()-start)
  11. print(np. max(binary),np. min(binary))
  12. gray=( 1-binary)* 255
  13. binary=(binary< 0.5)* 255
  14. cv2.imwrite( "gray.png",gray)
  15. cv2.imwrite( "bin.png",binary)




  1. Sequential(
  2. ( 0): CheckSizes [( 1, 128), ( 1, 512), ( 256, 256), ( 256, 256)]
  3. ( 1): Conv2d( 1, 8, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  4. ( 2): BatchNorm2d( 8, eps= 1e-05, momentum= 0.1, affine= True)
  5. ( 3): ReLU()
  6. ( 4): MaxPool2d(kernel_size=( 2, 2), stride=( 2, 2), dilation=( 1, 1), ceil_mode= False)
  7. ( 5): Conv2d( 8, 16, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  8. ( 6): BatchNorm2d( 16, eps= 1e-05, momentum= 0.1, affine= True)
  9. ( 7): ReLU()
  10. ( 8): MaxPool2d(kernel_size=( 2, 2), stride=( 2, 2), dilation=( 1, 1), ceil_mode= False)
  11. ( 9): Conv2d( 16, 32, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  12. ( 10): BatchNorm2d( 32, eps= 1e-05, momentum= 0.1, affine= True)
  13. ( 11): ReLU()
  14. ( 12): MaxPool2d(kernel_size=( 2, 2), stride=( 2, 2), dilation=( 1, 1), ceil_mode= False)
  15. ( 13): Conv2d( 32, 64, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  16. ( 14): BatchNorm2d( 64, eps= 1e-05, momentum= 0.1, affine= True)
  17. ( 15): ReLU()
  18. ( 16): Img2FlatSum
  19. ( 17): Linear(in_features= 64, out_features= 64, bias= True)
  20. ( 18): BatchNorm1d( 64, eps= 1e-05, momentum= 0.1, affine= True)
  21. ( 19): ReLU()
  22. ( 20): Linear(in_features= 64, out_features= 4, bias= True)
  23. ( 21): Sigmoid()
  24. ( 22): CheckSizes [( 1, 128), ( 4, 4)]
  25. )


  1. Sequential(
  2. ( 0): CheckSizes [( 1, 128), ( 1, 512), ( 256, 256), ( 256, 256)]
  3. ( 1): Conv2d( 1, 8, kernel_size=( 5, 5), stride=( 1, 1), padding=( 2, 2))
  4. ( 2): BatchNorm2d( 8, eps= 1e-05, momentum= 0.1, affine= True)
  5. ( 3): ReLU()
  6. ( 4): Spectrum
  7. ( 5): Conv2d( 8, 4, kernel_size=( 5, 5), stride=( 1, 1), padding=( 2, 2))
  8. ( 6): BatchNorm2d( 4, eps= 1e-05, momentum= 0.1, affine= True)
  9. ( 7): ReLU()
  10. ( 8): Reshape(( 0, [ 1, 2, 3]))
  11. ( 9): Linear(in_features= 262144, out_features= 128, bias= True)
  12. ( 10): BatchNorm1d( 128, eps= 1e-05, momentum= 0.1, affine= True)
  13. ( 11): ReLU()
  14. ( 12): Linear(in_features= 128, out_features= 30, bias= True)
  15. ( 13): Sigmoid()
  16. ( 14): CheckSizes [( 1, 128), ( 30, 30)]
  17. )


该模块也是ocropus3中的模块,主要负责训练数据的增强处理。包含了页面旋转(page rotation),随机几何变换(random geometric transformations),随机扭曲(random distortions),直纹面畸变(ruled surface distortions),模糊(blur),阈值化(thresholding),噪声(noise),多尺度噪声(multiscale noise),随机斑点(random blobs),纤维噪声(fibrous noise),前景背景选择(foreground/background selection)等。





  1. Sequential(
  2. ( 0): Conv2d( 1, 16, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  3. ( 1): BatchNorm2d( 16, eps=1e- 05, momentum= 0.1, affine= True)
  4. ( 2): ReLU()
  5. ( 3): MaxPool2d(kernel_size=( 2, 2), stride=( 2, 2), dilation=( 1, 1), ceil_mode= False)
  6. ( 4): Conv2d( 16, 32, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  7. ( 5): BatchNorm2d( 32, eps=1e- 05, momentum= 0.1, affine= True)
  8. ( 6): ReLU()
  9. ( 7): MaxPool2d(kernel_size=( 2, 2), stride=( 2, 2), dilation=( 1, 1), ceil_mode= False)
  10. ( 8): Conv2d( 32, 64, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  11. ( 9): BatchNorm2d( 64, eps=1e- 05, momentum= 0.1, affine= True)
  12. ( 10): ReLU()
  13. ( 11): LSTM2(
  14. (hlstm): RowwiseLSTM(
  15. (lstm): LSTM( 64, 32, bidirectional= 1)
  16. )
  17. (vlstm): RowwiseLSTM(
  18. (lstm): LSTM( 64, 32, bidirectional= 1)
  19. )
  20. )
  21. ( 12): Conv2d( 64, 32, kernel_size=( 1, 1), stride=( 1, 1))
  22. ( 13): BatchNorm2d( 32, eps=1e- 05, momentum= 0.1, affine= True)
  23. ( 14): ReLU()
  24. ( 15): LSTM2(
  25. (hlstm): RowwiseLSTM(
  26. (lstm): LSTM( 32, 32, bidirectional= 1)
  27. )
  28. (vlstm): RowwiseLSTM(
  29. (lstm): LSTM( 64, 32, bidirectional= 1)
  30. )
  31. )
  32. ( 16): Conv2d( 64, 1, kernel_size=( 1, 1), stride=( 1, 1))
  33. ( 17): Sigmoid()
  34. )





  1. from __future__ import print_function
  2. import os
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. from scipy.ndimage import interpolation,filters
  6. def scale_to_h(img,target_height,order=1,dtype=np.dtype('f'),cval= 0):
  7. h,w = img.shape
  8. scale = target_height* 1.0/h
  9. target_width = int(scale*w)
  10. output = interpolation.affine_transform( 1.0*img,np.eye( 2)/scale,order=order,
  11. output_shape=(target_height,target_width),
  12. mode= 'constant',cval=cval)
  13. output = np.array(output,dtype=dtype)
  14. return output
  15. class CenterNormalizer:
  16. def __init__(self,target_height=48,params=(4,1.0,0.3)):
  17. self.debug = int(os.getenv( "debug_center") or "0")
  18. self.target_height = target_height
  19. self.range, self.smoothness, self.extra = params
  20. def setHeight(self,target_height):
  21. self.target_height = target_height
  22. def measure(self,line):
  23. h,w = line.shape
  24. h=float(h)
  25. w=float(w)
  26. smoothed = filters.gaussian_filter(line,(h* 0. 5,h* self.smoothness),mode= 'constant')
  27. smoothed += 0. 001*filters.uniform_filter(smoothed,(h* 0. 5,w),mode= 'constant')
  28. self.shape = (h,w)
  29. a = np.argmax(smoothed,axis= 0)
  30. a = filters.gaussian_filter(a,h* self.extra)
  31. self.center = np.array(a, 'i')
  32. deltas = np.abs(np.arange(h)[ :,np.newaxis]- self.center[np.newaxis, :])
  33. self.mad = np.mean(deltas[line!= 0])
  34. self.r = int( 1+ self.range* self.mad)
  35. if self. debug:
  36. plt.figure( "center")
  37. plt.imshow(line,cmap=plt.cm.gray)
  38. plt.plot( self.center)
  39. plt.ginput( 1, 1000)
  40. def dewarp(self,img,cval=0,dtype=np.dtype('f')):
  41. print(img.shape== self.shape)
  42. assert img.shape== self.shape
  43. h,w = img.shape
  44. # The actual image img is embedded into a larger image by
  45. # adding vertical space on top and at the bottom (padding)
  46. hpadding = self.r # this is large enough
  47. padded = np.vstack([cval*np.ones((hpadding,w)),img,cval*np.ones((hpadding,w))])
  48. center = self.center + hpadding
  49. dewarped = [padded[center[i]- self. r:center[i]+ self.r,i] for i in range(w)]
  50. dewarped = np.array(dewarped,dtype=dtype).T
  51. return dewarped
  52. def normalize(self,img,order=1,dtype=np.dtype('f'),cval= 0):
  53. dewarped = self.dewarp(img,cval=cval,dtype=dtype)
  54. h,w = dewarped.shape
  55. scaled = scale_to_h(dewarped, self.target_height,order=order,dtype=dtype,cval=cval)
  56. return scaled
  57. if __name_ _== "__main__":
  58. cn=CenterNormalizer()
  59. import cv2
  60. image=cv2.imread( "20180727122251.png", 0)
  61. image=(image> 128)* 255
  62. image= 255-image
  63. image=np.float32(image)
  64. cn.measure(image)
  65. scaled=cn.normalize(image)
  66. print(np.max(scaled),np.min(scaled))
  67. cv2.imwrite( "scaled.png", 255-scaled)





  1. from __future__ import print_function
  2. import random as pyrandom
  3. import glob
  4. import sys
  5. import os
  6. import re
  7. import codecs
  8. import traceback
  9. import argparse
  10. import numpy as np
  11. import matplotlib.pyplot as plt
  12. from PIL import Image
  13. from PIL import ImageFont,ImageDraw
  14. from scipy.ndimage import filters,measurements,interpolation
  15. from scipy.misc import imsave
  16. replacements = [
  17. ( u'[_~#]', u"~"), # OCR control characters
  18. ( u'"', u"''"), # typewriter double quote
  19. ( u"`", u"'"), # grave accent
  20. ( u'[“”]', u"''"), # fancy quotes
  21. ( u"´", u"'"), # acute accent
  22. ( u"[‘’]", u"'"), # left single quotation mark
  23. ( u"[“”]", u"''"), # right double quotation mark
  24. ( u"“", u"''"), # German quotes
  25. ( u"„", u",,"), # German quotes
  26. ( u"…", u"..."), # ellipsis
  27. ( u"′", u"'"), # prime
  28. ( u"″", u"''"), # double prime
  29. ( u"‴", u"'''"), # triple prime
  30. ( u"〃", u"''"), # ditto mark
  31. ( u"µ", u"μ"), # replace micro unit with greek character
  32. ( u"[–—]", u"-"), # variant length hyphens
  33. ( u"fl", u"fl"), # expand Unicode ligatures
  34. ( u"fi", u"fi"),
  35. ( u"ff", u"ff"),
  36. ( u"ffi", u"ffi"),
  37. ( u"ffl", u"ffl"),
  38. ]
  39. import unicodedata
  40. def normalize_text(s):
  41. """Apply standard Unicode normalizations for OCR.
  42. This eliminates common ambiguities and weird unicode
  43. characters."""
  44. #s = unicode(s)
  45. s = unicodedata.normalize( 'NFC',s)
  46. s = re.sub( r'\s+(?u)', ' ',s)
  47. s = re.sub( r'\n(?u)', '',s)
  48. s = re.sub( r'^\s+(?u)', '',s)
  49. s = re.sub( r'\s+$(?u)', '',s)
  50. for m,r in replacements:
  51. s = re.sub((m),(r),s)
  52. #s = re.sub(unicode(m),unicode(r),s)
  53. return s
  54. parser = argparse.ArgumentParser(description = "Generate text line training data")
  55. parser.add_argument( '-o', '--base',default= 'linegen',help= 'output directory, default: %(default)s')
  56. parser.add_argument( '-r', '--distort',type=float,default= 1.0)
  57. parser.add_argument( '-R', '--dsigma',type=float,default= 20.0)
  58. parser.add_argument( '-f', '--fonts',default= "tests/DejaVuSans.ttf")
  59. parser.add_argument( '-F', '--fontlist',default= None)
  60. parser.add_argument( '-t', '--texts',default= "tests/tomsawyer.txt")
  61. parser.add_argument( '-T', '--textlist',default= None)
  62. parser.add_argument( '-m', '--maxlines',default= 200,type=int,
  63. help= 'max # lines for each directory, default: %(default)s')
  64. parser.add_argument( '-e', '--degradations',default= "lo",
  65. help= "lo, med, or hi; or give a file, default: %(default)s")
  66. parser.add_argument( '-j', '--jitter',default= 0.5)
  67. parser.add_argument( '-s', '--sizes',default= "40-70")
  68. parser.add_argument( '-d', '--display',action= "store_true")
  69. parser.add_argument( '--numdir',action= "store_true")
  70. parser.add_argument( '-C', '--cleanup',default= '[_~#]')
  71. parser.add_argument( '-D', '--debug_show',default= None,
  72. help= "select a class for stepping through")
  73. args = parser.parse_args()
  74. if "-" in args.sizes:
  75. lo,hi = args.sizes.split( "-")
  76. sizes = range(int(lo),int(hi)+ 1)
  77. else:
  78. sizes = [int(x) for x in args.sizes.split( ",")]
  79. if args.degradations== "lo":
  80. # sigma +/- threshold +/-
  81. deglist = """
  82. 0.5 0.0 0.5 0.0
  83. """
  84. elif args.degradations== "med":
  85. deglist = """
  86. 0.5 0.0 0.5 0.05
  87. 1.0 0.3 0.4 0.05
  88. 1.0 0.3 0.5 0.05
  89. 1.0 0.3 0.6 0.05
  90. """
  91. elif args.degradations== "hi":
  92. deglist = """
  93. 0.5 0.0 0.5 0.0
  94. 1.0 0.3 0.4 0.1
  95. 1.0 0.3 0.5 0.1
  96. 1.0 0.3 0.6 0.1
  97. 1.3 0.3 0.4 0.1
  98. 1.3 0.3 0.5 0.1
  99. 1.3 0.3 0.6 0.1
  100. """
  101. elif args.degradations is not None:
  102. with open(args.degradations) as stream:
  103. deglist = stream.read()
  104. degradations = []
  105. for deg in deglist.split( "\n"):
  106. deg = deg.strip()
  107. if deg== "": continue
  108. deg = [float(x) for x in deg.split()]
  109. degradations.append(deg)
  110. if args.fonts is not None:
  111. fonts = []
  112. for pat in args.fonts.split( ':'):
  113. if pat== "": continue
  114. fonts += sorted(glob.glob(pat))
  115. elif args.fontlist is not None:
  116. with open(args.fontlist) as fh:
  117. lines = (line.strip() for line in fh)
  118. fonts = [line for line in lines if line]
  119. else:
  120. print( "use -f or -F arguments to specify fonts")
  121. sys.exit( 1)
  122. assert len(fonts)> 0, "no fonts?"
  123. print( "fonts", fonts)
  124. if args.texts is not None:
  125. texts = []
  126. for pat in args.texts.split( ':'):
  127. print(pat)
  128. if pat== "": continue
  129. texts += sorted(glob.glob(pat))
  130. elif args.textlist is not None:
  131. texts = re.split( r'\s*\n\s*',open(args.textlist).read())
  132. else:
  133. print( "use -t or -T arguments to specify texts")
  134. sys.exit( 1)
  135. assert len(texts)> 0, "no texts?"
  136. lines = []
  137. for text in texts:
  138. print( "# reading", text)
  139. with codecs.open(text, 'r', 'utf-8') as stream:
  140. for line in stream.readlines():
  141. line = line.strip()
  142. line = re.sub(args.cleanup, '',line)
  143. if len(line)< 1: continue
  144. lines.append(line)
  145. print( "got", len(lines), "lines")
  146. assert len(lines)> 0
  147. lines = list(set(lines))
  148. print( "got", len(lines), "unique lines")
  149. def rgeometry(image,eps=0.03,delta=0.3):
  150. m = np.array([[ 1+eps*np.random.randn(), 0.0],[eps*np.random.randn(), 1.0+eps*np.random.randn()]])
  151. w,h = image.shape
  152. c = np.array([w/ 2.0,h/ 2])
  153. d = c-np.dot(m,c)+np.array([np.random.randn()*delta,np.random.randn()*delta])
  154. return interpolation.affine_transform(image,m,offset=d,order= 1,mode= 'constant',cval=image[ 0, 0])
  155. def rdistort(image,distort=3.0,dsigma=10.0,cval=0):
  156. h,w = image.shape
  157. hs = np.random.randn(h,w)
  158. ws = np.random.randn(h,w)
  159. hs = filters.gaussian_filter(hs,dsigma)
  160. ws = filters.gaussian_filter(ws,dsigma)
  161. hs *= distort/np.amax(hs)
  162. ws *= distort/np.amax(ws)
  163. def f(p):
  164. return (p[ 0]+hs[p[ 0],p[ 1]],p[ 1]+ws[p[ 0],p[ 1]])
  165. return interpolation.geometric_transform(image,f,output_shape=(h,w),
  166. order= 1,mode= 'constant',cval=cval)
  167. if args.debug_show:
  168. plt.ion()
  169. plt.gray()
  170. base = args.base
  171. print( "base", base)
  172. if os.path.exists(base)== False:
  173. os.mkdir(base)
  174. def crop(image,pad=1):
  175. [[r,c]] = measurements.find_objects(np.array(image== 0, 'i'))
  176. r0 = r.start
  177. r1 = r.stop
  178. c0 = c.start
  179. c1 = c.stop
  180. image = image[r0-pad:r1+pad,c0-pad:c1+pad]
  181. return image
  182. last_font = None
  183. last_size = None
  184. last_fontfile = None
  185. def genline(text,fontfile=None,size=36,sigma=0.5,threshold=0.5):
  186. global image,draw,last_font,last_fontfile
  187. if last_fontfile!=fontfile or last_size!=size:
  188. last_font = ImageFont.truetype(fontfile,size)
  189. last_fontfile = fontfile
  190. font = last_font
  191. image = Image.new( "L",( 6000, 200))
  192. draw = ImageDraw.Draw(image)
  193. draw.rectangle(( 0, 0, 6000, 6000),fill= "white")
  194. # print("\t", size, font)
  195. draw.text(( 250, 20),text,fill= "black",font=font)
  196. a = np.asarray(image, 'f')
  197. a = a* 1.0/np.amax(a)
  198. if sigma> 0.0:
  199. a = filters.gaussian_filter(a,sigma)
  200. a += np.clip(np.random.randn(*a.shape)* 0.2, -0.25, 0.25)
  201. a = rgeometry(a)
  202. a = np.array(a>threshold, 'f')
  203. a = crop(a,pad= 3)
  204. # FIXME add grid warping here
  205. # clf(); ion(); gray(); imshow(a); ginput(1,0.1)
  206. del draw
  207. del image
  208. return a
  209. lines_per_size = args.maxlines//len(sizes)
  210. for pageno,font in enumerate(fonts):
  211. if args.numdir:
  212. pagedir = "%s/%04d"%(base,pageno+ 1)
  213. else:
  214. fbase = re.sub( r'^[./]*', '',font)
  215. fbase = re.sub( r'[.][^/]*$', '',fbase)
  216. fbase = re.sub( r'[/]', '_',fbase)
  217. pagedir = "%s/%s"%(base,fbase)
  218. if os.path.exists(pagedir)== False:
  219. os.mkdir(pagedir)
  220. print( "===", pagedir, font)
  221. lineno = 0
  222. while lineno<args.maxlines:
  223. (sigma,ssigma,threshold,sthreshold) = pyrandom.choice(degradations)
  224. sigma += ( 2*np.random.rand() -1)*ssigma
  225. threshold += ( 2*np.random.rand() -1)*sthreshold
  226. line = pyrandom.choice(lines)
  227. size = pyrandom.choice(sizes)
  228. with open(pagedir+ ".info", "w") as stream:
  229. stream.write( "%s\n"%font)
  230. try:
  231. image = genline(text=line,fontfile=font,
  232. size=size,sigma=sigma,threshold=threshold)
  233. except:
  234. traceback.print_exc()
  235. continue
  236. if np.amin(image.shape)< 10: continue
  237. if np.amax(image)< 0.5: continue
  238. if args.distort> 0:
  239. image = rdistort(image,args.distort,args.dsigma,cval=np.amax(image))
  240. if args.display:
  241. plt.gray()
  242. plt.clf()
  243. plt.imshow(image)
  244. plt.ginput( 1, 0.1)
  245. fname = pagedir+ "/01%04d"%lineno
  246. imsave(fname+ ".bin.png",image)
  247. gt = normalize_text(line)
  248. with codecs.open(fname+ ".gt.txt", "w", 'utf-8') as stream:
  249. stream.write(gt+ "\n")
  250. print( "%5.2f %5.2f %3d\t%s" % (sigma, threshold, size, line))
  251. lineno += 1



  1. 纯python
  2. 支持任何深度学习框架
  3. 支持非常大的数据集
  4. 支持数据流
  5. 支持map-reduce和分布式数据增强
  6. 支持tar,tfrecords等多种格式


该模块为训练模块,训练过程使用cpu训练。模型为多层感知机MLP,使用的是CTC loss。训练速度很快。




在ocropus3中,识别模块为卷积网络模块。损失还是CTC LOSS。


  1. Sequential(
  2. ( 0): Reorder BHWD->BDHW
  3. ( 1): CheckSizes [( 0, 900), ( 1, 1), ( 48, 48), ( 0, 9000)]
  4. ( 2): Conv2d( 1, 100, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  5. ( 3): BatchNorm2d( 100, eps= 1e-05, momentum= 0.1, affine= True)
  6. ( 4): ReLU()
  7. ( 5): MaxPool2d(kernel_size=( 2, 1), stride=( 2, 1), dilation=( 1, 1), ceil_mode= False)
  8. ( 6): Conv2d( 100, 200, kernel_size=( 3, 3), stride=( 1, 1), padding=( 1, 1))
  9. ( 7): BatchNorm2d( 200, eps= 1e-05, momentum= 0.1, affine= True)
  10. ( 8): ReLU()
  11. ( 9): Reshape(( 0, [ 1, 2], 3))
  12. ( 10): CheckSizes [( 0, 900), ( 0, 5000), ( 0, 9000)]
  13. ( 11): LSTM1:LSTM( 4800, 200, bidirectional= 1)
  14. ( 12): Conv1d( 400, 97, kernel_size=( 1,), stride=( 1,))
  15. ( 13): Reorder BDW->BWD
  16. ( 14): CheckSizes [( 0, 900), ( 0, 9000), ( 97, 97)]
  17. )








  1. ocropy网络的前向后向全部python实现,没有第三方神经网络框架的依赖,支持自己训练,需要python2版本。
  2. ocropy2,ocropus3有pytorch依赖
  3. ocropus3将各个模块都独立出来,耦合性更小




