基于python/opencv/tesseract使用传统方法的,表格图片版面分析以及印刷体汉字识别(持续更新,学习备份用)
import cv2
import numpy as np
import pytesseract
image = cv2.imread('img-625101042_1.jpeg', 1)
print(image.shape)
#二值化
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, -5)
#ret,binary = cv2.threshold(~gray, 127, 255, cv2.THRESH_BINARY)
#cv2.imshow("cell", binary)
#cv2.waitKey(0)
rows,cols=binary.shape
scale = 40
#识别横线
kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(cols//scale,1))
eroded = cv2.erode(binary,kernel,iterations = 1)
#cv2.imshow("Eroded Image",eroded)
dilatedcol