如何使用TensorFlow进行OCR识别将表格图片转换为结构化数据

myCOTB

于 2024-07-04 13:00:44 发布

阅读量324

点赞数 3

文章标签： tensorflow ocr 人工智能

本文链接：https://blog.csdn.net/mycotb/article/details/140176746

版权

在这里插入图片描述
使用 TensorFlow 进行 OCR（光学字符识别）识别，并将表格图片转换为结构化数据是一个较为复杂但很有价值的任务。这个过程通常包括以下几个步骤：

图像预处理
表格检测
单元格识别
文字识别
数据结构化

下面是一个示例代码，展示如何完成这些步骤。需要注意的是，示例中的图像处理由 OpenCV 完成，文字识别由 Tesseract（通过 pytesseract）完成，代码并未直接调用 TensorFlow。

1. 安装必要的库

首先，安装 TensorFlow 和其他所需库（示例代码还用到了 pandas；此外 pytesseract 只是一个封装，需要在系统中另行安装 Tesseract OCR 引擎）：

pip install tensorflow opencv-python pytesseract pandas

2. 图像预处理

我们需要对图像进行预处理，以便更好地检测表格和单元格。

import cv2
import numpy as np

def preprocess_image(image_path):
    """Load an image and turn it into a cleaned-up inverse binary mask.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A single-channel uint8 array in which pixels darker than or equal
        to the threshold (ink, ruling lines) are 255 and everything else
        is 0.

    Raises:
        OSError: If the file is missing or cannot be decoded as an image.
    """
    # Read the image as a single grayscale channel.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a readable message rather than inside threshold().
        raise OSError(f"Unable to read image: {image_path!r}")

    # Inverse binarisation: values <= 150 become 255, brighter ones 0.
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)

    # Morphological closing with a small kernel fills pinholes and tiny
    # breaks in strokes and lines.
    kernel = np.ones((2, 2), np.uint8)
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    return morphed

3. 表格检测

使用 OpenCV 检测表格轮廓。

def detect_table(image, min_area=1000):
    """Return the outer contours that are large enough to be a table.

    Args:
        image: Binary image with the foreground in white.
        min_area: Contours whose area is not strictly greater than this
            value (in square pixels) are discarded. The default keeps the
            previously hard-coded cut-off of 1000.

    Returns:
        A list of contours, in the order reported by cv2.findContours.
    """
    # Only outermost contours are requested, so shapes nested inside a
    # table outline are not reported separately.
    contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    return [contour for contour in contours if cv2.contourArea(contour) > min_area]

4. 单元格识别

将检测到的表格进一步分割成单元格。

def detect_cells(image, table_contour):
    """Split one detected table into the bounding boxes of its cells.

    Args:
        image: Binary page image with ruling lines and text in white.
        table_contour: Contour of the table, as returned by detect_table.

    Returns:
        A list of (x, y, w, h) boxes expressed in the coordinate system of
        ``image`` (not of the table crop), so they can be used to slice
        ``image`` directly. Empty when no ruling lines are found.
    """
    x, y, w, h = cv2.boundingRect(table_contour)
    table_roi = image[y:y+h, x:x+w]

    # Long, thin kernels keep only horizontal / vertical ruling lines.
    # The size is clamped to 1 because w // 15 (or h // 15) is 0 for small
    # regions and a zero-sized structuring element is invalid.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 15), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, h // 15)))

    horizontal_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, vertical_kernel)

    grid = cv2.add(horizontal_lines, vertical_lines)
    if not grid.any():
        # No ruling lines survived: this region has no cells to report.
        return []

    # The ruling lines form one connected shape, so asking for the external
    # contours of the grid itself yields only the table frame. The cells are
    # the areas enclosed by the lines, i.e. the blobs of the inverted grid.
    cell_mask = cv2.bitwise_not(grid)
    contours, _ = cv2.findContours(cell_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    cell_contours = []
    for contour in contours:
        cx, cy, cw, ch = cv2.boundingRect(contour)
        # Shift from table-crop coordinates back to full-image coordinates.
        cell_contours.append((x + cx, y + cy, cw, ch))

    return cell_contours

5. 文字识别

使用 Tesseract 进行 OCR 识别。

import pytesseract

def recognize_text(image, cell_contours):
    """Run Tesseract on every cell box and collect the recognised strings.

    Args:
        image: Image array from which the cells are cropped.
        cell_contours: Iterable of (x, y, w, h) boxes indexing into ``image``.

    Returns:
        A list with one whitespace-stripped string per box, in input order.
    """
    def _read_cell(box):
        left, top, width, height = box
        crop = image[top:top + height, left:left + width]
        # Page segmentation mode 7 treats the crop as a single line of text.
        raw = pytesseract.image_to_string(crop, config='--psm 7')
        return raw.strip()

    return [_read_cell(box) for box in cell_contours]

6. 数据结构化

将识别出的文字整理为结构化数据（如 DataFrame）。

import pandas as pd

def _cluster_positions(positions, tolerance):
    """Map each pixel coordinate to the index of the row/column it falls in.

    Coordinates are visited in ascending order; a new index is started
    whenever a coordinate lies more than ``tolerance`` pixels past the first
    coordinate of the current group.
    """
    index_of = {}
    group = -1
    anchor = None
    for pos in sorted(set(positions)):
        if anchor is None or pos - anchor > tolerance:
            group += 1
            anchor = pos
        index_of[pos] = group
    return index_of


def structure_data(texts, cell_contours):
    """Arrange recognised cell texts into a table-shaped DataFrame.

    The pixel positions of the boxes are grouped into logical rows and
    columns; using the raw pixel values as indices would create a frame
    as large as the image with the texts scattered inside it.

    Args:
        texts: Recognised strings, one per box.
        cell_contours: (x, y, w, h) boxes in the same order as ``texts``.

    Returns:
        A DataFrame with one row per table row and one column per table
        column; positions without a box hold ''. An empty DataFrame is
        returned when there are no boxes.
    """
    boxes = list(cell_contours)
    if not boxes:
        return pd.DataFrame()

    # Boxes in the same row/column differ by a few pixels at most, while
    # neighbouring rows/columns are at least one cell apart, so half of the
    # smallest cell size separates the two cases.
    row_tolerance = min(h for (x, y, w, h) in boxes) // 2
    col_tolerance = min(w for (x, y, w, h) in boxes) // 2
    row_of = _cluster_positions((y for (x, y, w, h) in boxes), row_tolerance)
    col_of = _cluster_positions((x for (x, y, w, h) in boxes), col_tolerance)

    rows = max(row_of.values()) + 1
    cols = max(col_of.values()) + 1
    data = [['' for _ in range(cols)] for _ in range(rows)]

    for text, (x, y, w, h) in zip(texts, boxes):
        data[row_of[y]][col_of[x]] = text

    return pd.DataFrame(data)

# Run the whole pipeline on one image; replace the placeholder path.
image_path = 'path/to/your/table_image.png'
preprocessed_image = preprocess_image(image_path)
table_contours = detect_table(preprocessed_image)

for table_contour in table_contours:
    cell_contours = detect_cells(preprocessed_image, table_contour)
    if not cell_contours:
        # A large contour without any detected cells is not a table; there
        # is nothing to recognise or tabulate, so skip it.
        continue
    texts = recognize_text(preprocessed_image, cell_contours)
    structured_data = structure_data(texts, cell_contours)
    
    print(structured_data)

完整代码

import cv2
import numpy as np
import pytesseract
import pandas as pd

def preprocess_image(image_path):
    """Load an image and turn it into a cleaned-up inverse binary mask.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A single-channel uint8 array in which pixels darker than or equal
        to the threshold (ink, ruling lines) are 255 and everything else
        is 0.

    Raises:
        OSError: If the file is missing or cannot be decoded as an image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a readable message rather than inside threshold().
        raise OSError(f"Unable to read image: {image_path!r}")
    # Inverse binarisation: values <= 150 become 255, brighter ones 0.
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    # Closing with a small kernel fills pinholes and tiny breaks in strokes.
    kernel = np.ones((2, 2), np.uint8)
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    return morphed

def detect_table(image, min_area=1000):
    """Return the outer contours that are large enough to be a table.

    Args:
        image: Binary image with the foreground in white.
        min_area: Contours whose area is not strictly greater than this
            value (in square pixels) are discarded. The default keeps the
            previously hard-coded cut-off of 1000.

    Returns:
        A list of contours, in the order reported by cv2.findContours.
    """
    # Only outermost contours are requested, so shapes nested inside a
    # table outline are not reported separately.
    contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    table_contours = [c for c in contours if cv2.contourArea(c) > min_area]
    return table_contours

def detect_cells(image, table_contour):
    """Split one detected table into the bounding boxes of its cells.

    Args:
        image: Binary page image with ruling lines and text in white.
        table_contour: Contour of the table, as returned by detect_table.

    Returns:
        A list of (x, y, w, h) boxes expressed in the coordinate system of
        ``image`` (not of the table crop), so they can be used to slice
        ``image`` directly. Empty when no ruling lines are found.
    """
    x, y, w, h = cv2.boundingRect(table_contour)
    table_roi = image[y:y+h, x:x+w]
    # Long, thin kernels keep only horizontal / vertical ruling lines.
    # The size is clamped to 1 because w // 15 (or h // 15) is 0 for small
    # regions and a zero-sized structuring element is invalid.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 15), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, h // 15)))
    horizontal_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, vertical_kernel)
    grid = cv2.add(horizontal_lines, vertical_lines)
    if not grid.any():
        # No ruling lines survived: this region has no cells to report.
        return []
    # The ruling lines form one connected shape, so asking for the external
    # contours of the grid itself yields only the table frame. The cells are
    # the areas enclosed by the lines, i.e. the blobs of the inverted grid.
    cell_mask = cv2.bitwise_not(grid)
    contours, _ = cv2.findContours(cell_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cell_contours = []
    for contour in contours:
        cx, cy, cw, ch = cv2.boundingRect(contour)
        # Shift from table-crop coordinates back to full-image coordinates.
        cell_contours.append((x + cx, y + cy, cw, ch))
    return cell_contours

def recognize_text(image, cell_contours):
    """Run Tesseract on every cell box and collect the recognised strings.

    Args:
        image: Image array from which the cells are cropped.
        cell_contours: Iterable of (x, y, w, h) boxes indexing into ``image``.

    Returns:
        A list with one whitespace-stripped string per box, in input order.
    """
    def _read_cell(box):
        left, top, width, height = box
        crop = image[top:top + height, left:left + width]
        # Page segmentation mode 7 treats the crop as a single line of text.
        raw = pytesseract.image_to_string(crop, config='--psm 7')
        return raw.strip()

    return [_read_cell(box) for box in cell_contours]

def _cluster_positions(positions, tolerance):
    """Map each pixel coordinate to the index of the row/column it falls in.

    Coordinates are visited in ascending order; a new index is started
    whenever a coordinate lies more than ``tolerance`` pixels past the first
    coordinate of the current group.
    """
    index_of = {}
    group = -1
    anchor = None
    for pos in sorted(set(positions)):
        if anchor is None or pos - anchor > tolerance:
            group += 1
            anchor = pos
        index_of[pos] = group
    return index_of


def structure_data(texts, cell_contours):
    """Arrange recognised cell texts into a table-shaped DataFrame.

    The pixel positions of the boxes are grouped into logical rows and
    columns; using the raw pixel values as indices would create a frame
    as large as the image with the texts scattered inside it.

    Args:
        texts: Recognised strings, one per box.
        cell_contours: (x, y, w, h) boxes in the same order as ``texts``.

    Returns:
        A DataFrame with one row per table row and one column per table
        column; positions without a box hold ''. An empty DataFrame is
        returned when there are no boxes.
    """
    boxes = list(cell_contours)
    if not boxes:
        return pd.DataFrame()
    # Boxes in the same row/column differ by a few pixels at most, while
    # neighbouring rows/columns are at least one cell apart, so half of the
    # smallest cell size separates the two cases.
    row_tolerance = min(h for (x, y, w, h) in boxes) // 2
    col_tolerance = min(w for (x, y, w, h) in boxes) // 2
    row_of = _cluster_positions((y for (x, y, w, h) in boxes), row_tolerance)
    col_of = _cluster_positions((x for (x, y, w, h) in boxes), col_tolerance)
    rows = max(row_of.values()) + 1
    cols = max(col_of.values()) + 1
    data = [['' for _ in range(cols)] for _ in range(rows)]
    for text, (x, y, w, h) in zip(texts, boxes):
        data[row_of[y]][col_of[x]] = text
    return pd.DataFrame(data)

# Run the whole pipeline on one image; replace the placeholder path.
image_path = 'path/to/your/table_image.png'
preprocessed_image = preprocess_image(image_path)
table_contours = detect_table(preprocessed_image)

for table_contour in table_contours:
    cell_contours = detect_cells(preprocessed_image, table_contour)
    if not cell_contours:
        # A large contour without any detected cells is not a table; there
        # is nothing to recognise or tabulate, so skip it.
        continue
    texts = recognize_text(preprocessed_image, cell_contours)
    structured_data = structure_data(texts, cell_contours)
    print(structured_data)

综上

通过上述步骤，我们使用 OpenCV 和 Tesseract 构建了一个简单的表格 OCR 流程，将表格图片转换为结构化数据。这个流程可以进一步优化，例如改进预处理步骤、使用更高级的表格检测算法（如基于 TensorFlow 的深度学习模型），以及增强 OCR 识别的准确性。希望这个示例对你有所帮助！

myCOTB

关注

3
点赞
踩
10

收藏

觉得还不错? 一键收藏
0
评论
如何使用TensorFlow进行OCR识别将表格图片转换为结构化数据

使用 TensorFlow 进行 OCR（光学字符识别）识别，并将表格图片转换为结构化数据是一个较为复杂但很有价值的任务。下面是一个示例代码，展示如何使用 TensorFlow 完成这些步骤。
复制链接

扫一扫