对表格图片进行 OCR(光学字符识别),并将其转换为结构化数据,是一个较为复杂但很有价值的任务。本文的示例使用 OpenCV 完成图像处理、使用 Tesseract 完成文字识别;TensorFlow 等深度学习框架可以在此基础上用于替换其中的检测或识别环节。这个过程通常包括以下几个步骤:
- 图像预处理
- 表格检测
- 单元格识别
- 文字识别
- 数据结构化
下面是一个示例代码,展示如何使用 OpenCV 和 Tesseract(通过 pytesseract 调用)完成这些步骤;示例代码本身并未调用 TensorFlow。
1. 安装必要的库
首先,安装 TensorFlow 和其他所需库(后文代码还会用到 pandas;另外 pytesseract 只是一个封装,需要在系统中另行安装 Tesseract OCR 引擎本体):
pip install tensorflow opencv-python pytesseract pandas
2. 图像预处理
我们需要对图像进行预处理,以便更好地检测表格和单元格。
import cv2
import numpy as np
def preprocess_image(image_path):
    """Load an image and return a cleaned-up, inverted binary mask.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A single-channel array in which pixels at or below the threshold
        (150) are set to 255 and all brighter pixels are set to 0, after a
        morphological closing with a 2x2 kernel.

    Raises:
        OSError: If the file is missing or cannot be decoded as an image.
    """
    # Read the image as grayscale.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None rather than raising;
    # fail here with a clear message instead of deep inside cv2.threshold.
    if image is None:
        raise OSError("Cannot read image: {!r}".format(image_path))
    # Binarize (inverted: dark ink becomes white foreground).
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    # Morphological closing to fill small gaps in the foreground.
    kernel = np.ones((2, 2), np.uint8)
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    return morphed
3. 表格检测
使用 OpenCV 检测表格轮廓。
def detect_table(image):
    """Return the external contours that are large enough to be tables.

    Args:
        image: Binary image handed to ``cv2.findContours``.

    Returns:
        A list of the external contours whose area is greater than 1000.
    """
    found, _hierarchy = cv2.findContours(
        image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    # Keep only contours whose enclosed area exceeds the 1000 cut-off.
    return [candidate for candidate in found if cv2.contourArea(candidate) > 1000]
4. 单元格识别
将检测到的表格进一步分割成单元格。
def detect_cells(image, table_contour):
    """Split one detected table into cells and return their bounding boxes.

    Args:
        image: Binary image (foreground = 255) that contains the table.
        table_contour: Contour of the table, as returned by
            ``cv2.findContours``.

    Returns:
        A list of ``(x, y, w, h)`` tuples, one per cell, expressed in the
        coordinate system of ``image`` so they can be used to crop it
        directly.

    Note:
        Cells are found as the regions enclosed by ruling lines, so a table
        without a closed border may yield merged regions.
    """
    x, y, w, h = cv2.boundingRect(table_contour)
    table_roi = image[y:y+h, x:x+w]

    # Extract horizontal and vertical ruling lines with long, thin kernels.
    # The length is clamped to 1 because w // 15 (or h // 15) is 0 for ROIs
    # smaller than 15 px, which is not a usable kernel size.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, w // 15), 1)
    )
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, h // 15))
    )
    horizontal_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, vertical_kernel)
    grid = cv2.add(horizontal_lines, vertical_lines)

    # The ruling lines form one connected shape, so the external contour of
    # `grid` is just the table outline. Inverting the mask turns every cell
    # interior into its own blob, which is what we want to outline.
    cell_mask = cv2.bitwise_not(grid)
    contours, _ = cv2.findContours(
        cell_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Bounding boxes are relative to table_roi; shift them by the ROI origin
    # so callers can crop the cells out of the full image.
    cell_contours = []
    for contour in contours:
        cell_x, cell_y, cell_w, cell_h = cv2.boundingRect(contour)
        cell_contours.append((x + cell_x, y + cell_y, cell_w, cell_h))
    return cell_contours
5. 文字识别
使用 Tesseract 进行 OCR 识别。
import pytesseract
def recognize_text(image, cell_contours):
    """Run Tesseract on every cell and return the recognized strings.

    Args:
        image: Image array from which the cells are cropped.
        cell_contours: Iterable of ``(x, y, w, h)`` cell boxes.

    Returns:
        A list with one whitespace-stripped string per cell, in the same
        order as ``cell_contours``.
    """
    # '--psm 7' asks Tesseract to treat each crop as a single line of text.
    return [
        pytesseract.image_to_string(
            image[top:top + height, left:left + width], config='--psm 7'
        ).strip()
        for (left, top, width, height) in cell_contours
    ]
6. 数据结构化
将识别出的文字整理为结构化数据(如 DataFrame)。
import pandas as pd
def _group_positions(positions, extents):
    """Map pixel coordinates to consecutive row/column indices.

    Coordinates that lie within half the median cell extent of the first
    coordinate of a group are treated as the same row/column, which absorbs
    the few pixels of jitter between boxes of neighbouring cells.
    """
    ordered_extents = sorted(extents)
    tolerance = max(1, ordered_extents[len(ordered_extents) // 2] // 2)
    index_of = {}
    group, anchor = -1, None
    for position in sorted(set(positions)):
        if anchor is None or position - anchor > tolerance:
            group += 1
            anchor = position
        index_of[position] = group
    return index_of


def structure_data(texts, cell_contours):
    """Arrange recognized cell texts into a table-shaped DataFrame.

    Args:
        texts: Recognized strings, aligned one-to-one with ``cell_contours``.
        cell_contours: List of ``(x, y, w, h)`` cell boxes in pixels.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct cell row and one
        column per distinct cell column; positions without a cell hold ''.
        An empty DataFrame is returned when there are no cells.
    """
    if not cell_contours:
        return pd.DataFrame()

    # Pixel coordinates are not table indices: group the top edges into
    # rows and the left edges into columns, then index by group number.
    row_of = _group_positions(
        [y for (_, y, _, _) in cell_contours],
        [h for (_, _, _, h) in cell_contours],
    )
    col_of = _group_positions(
        [x for (x, _, _, _) in cell_contours],
        [w for (_, _, w, _) in cell_contours],
    )
    rows = max(row_of.values()) + 1
    cols = max(col_of.values()) + 1

    data = [['' for _ in range(cols)] for _ in range(rows)]
    for text, (x, y, w, h) in zip(texts, cell_contours):
        data[row_of[y]][col_of[x]] = text
    return pd.DataFrame(data)
image_path = 'path/to/your/table_image.png'
preprocessed_image = preprocess_image(image_path)
# The preprocessed mask is inverted (white ink on black) for contour
# detection, but Tesseract 4+ expects dark text on a light background,
# so OCR is run on the re-inverted image.
ocr_image = cv2.bitwise_not(preprocessed_image)
table_contours = detect_table(preprocessed_image)
for table_contour in table_contours:
    cell_contours = detect_cells(preprocessed_image, table_contour)
    if not cell_contours:
        # Nothing to recognize or tabulate for this region.
        continue
    texts = recognize_text(ocr_image, cell_contours)
    structured_data = structure_data(texts, cell_contours)
    print(structured_data)
完整代码
import cv2
import numpy as np
import pytesseract
import pandas as pd
def preprocess_image(image_path):
    """Read ``image_path`` as grayscale and return an inverted binary mask.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded image (pixels at or below 150 become 255, brighter
        pixels become 0) after a morphological closing with a 2x2 kernel.

    Raises:
        OSError: If the file is missing or cannot be decoded as an image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None on failure instead of raising, so check it
    # here rather than letting cv2.threshold fail with an opaque error.
    if image is None:
        raise OSError("Cannot read image: {!r}".format(image_path))
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    kernel = np.ones((2, 2), np.uint8)
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    return morphed
def detect_table(image):
    """Collect the external contours whose area is greater than 1000.

    Args:
        image: Binary image handed to ``cv2.findContours``.

    Returns:
        A list of contours, in the order ``cv2.findContours`` reported them.
    """
    outlines, _hierarchy = cv2.findContours(
        image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    large_outlines = []
    for outline in outlines:
        # Contours at or below the area cut-off are dropped.
        if cv2.contourArea(outline) > 1000:
            large_outlines.append(outline)
    return large_outlines
def detect_cells(image, table_contour):
    """Find the cells of one table and return their bounding boxes.

    Args:
        image: Binary image (foreground = 255) that contains the table.
        table_contour: Contour of the table, as returned by
            ``cv2.findContours``.

    Returns:
        A list of ``(x, y, w, h)`` tuples, one per cell, in the coordinate
        system of ``image`` so they can be used to crop it directly.

    Note:
        Cells are the regions enclosed by ruling lines; a table without a
        closed border may yield merged regions.
    """
    x, y, w, h = cv2.boundingRect(table_contour)
    table_roi = image[y:y+h, x:x+w]

    # Kernel lengths are clamped to 1: w // 15 or h // 15 is 0 for ROIs
    # smaller than 15 px, which is not a usable kernel size.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, w // 15), 1)
    )
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, h // 15))
    )
    horizontal_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, vertical_kernel)
    grid = cv2.add(horizontal_lines, vertical_lines)

    # The grid lines are one connected component, so its external contour
    # is only the table outline. Invert the mask so that each cell interior
    # becomes a separate blob before extracting contours.
    cell_mask = cv2.bitwise_not(grid)
    contours, _ = cv2.findContours(
        cell_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Translate ROI-relative boxes back into full-image coordinates.
    cell_contours = []
    for contour in contours:
        cell_x, cell_y, cell_w, cell_h = cv2.boundingRect(contour)
        cell_contours.append((x + cell_x, y + cell_y, cell_w, cell_h))
    return cell_contours
def recognize_text(image, cell_contours):
    """OCR each cell region of ``image`` with Tesseract.

    Args:
        image: Image array from which the cells are cropped.
        cell_contours: Iterable of ``(x, y, w, h)`` cell boxes.

    Returns:
        A list of whitespace-stripped strings, one per cell and in the same
        order as ``cell_contours``.
    """
    def _read_cell(box):
        # Crop the cell and read it as a single text line ('--psm 7').
        (left, top, width, height) = box
        crop = image[top:top + height, left:left + width]
        raw = pytesseract.image_to_string(crop, config='--psm 7')
        return raw.strip()

    return list(map(_read_cell, cell_contours))
def _group_positions(positions, extents):
    """Map pixel coordinates to consecutive row/column indices.

    Coordinates that lie within half the median cell extent of the first
    coordinate of a group are treated as the same row/column, which absorbs
    the few pixels of jitter between boxes of neighbouring cells.
    """
    ordered_extents = sorted(extents)
    tolerance = max(1, ordered_extents[len(ordered_extents) // 2] // 2)
    index_of = {}
    group, anchor = -1, None
    for position in sorted(set(positions)):
        if anchor is None or position - anchor > tolerance:
            group += 1
            anchor = position
        index_of[position] = group
    return index_of


def structure_data(texts, cell_contours):
    """Arrange recognized cell texts into a table-shaped DataFrame.

    Args:
        texts: Recognized strings, aligned one-to-one with ``cell_contours``.
        cell_contours: List of ``(x, y, w, h)`` cell boxes in pixels.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct cell row and one
        column per distinct cell column; positions without a cell hold ''.
        An empty DataFrame is returned when there are no cells.
    """
    if not cell_contours:
        return pd.DataFrame()

    # Using raw pixel coordinates as indices would create a frame as large
    # as the image; group top/left edges into rows/columns instead.
    row_of = _group_positions(
        [y for (_, y, _, _) in cell_contours],
        [h for (_, _, _, h) in cell_contours],
    )
    col_of = _group_positions(
        [x for (x, _, _, _) in cell_contours],
        [w for (_, _, w, _) in cell_contours],
    )
    rows = max(row_of.values()) + 1
    cols = max(col_of.values()) + 1

    data = [['' for _ in range(cols)] for _ in range(rows)]
    for text, (x, y, w, h) in zip(texts, cell_contours):
        data[row_of[y]][col_of[x]] = text
    return pd.DataFrame(data)
image_path = 'path/to/your/table_image.png'
preprocessed_image = preprocess_image(image_path)
# The preprocessed mask is inverted (white ink on black) for contour
# detection, but Tesseract 4+ expects dark text on a light background,
# so OCR is run on the re-inverted image.
ocr_image = cv2.bitwise_not(preprocessed_image)
table_contours = detect_table(preprocessed_image)
for table_contour in table_contours:
    cell_contours = detect_cells(preprocessed_image, table_contour)
    if not cell_contours:
        # Nothing to recognize or tabulate for this region.
        continue
    texts = recognize_text(ocr_image, cell_contours)
    structured_data = structure_data(texts, cell_contours)
    print(structured_data)
综上
通过上述步骤,我们使用 OpenCV 和 Tesseract 构建了一个简单的表格 OCR 流程,将表格图片转换为结构化数据。这个流程可以进一步优化,例如改进预处理步骤、使用更高级的表格检测算法(如基于 TensorFlow 的深度学习模型),以及增强 OCR 识别的准确性。希望这个示例对你有所帮助!