如何使用TensorFlow进行OCR识别将表格图片转换为结构化数据

在这里插入图片描述
使用 TensorFlow 进行 OCR(光学字符识别)识别,并将表格图片转换为结构化数据是一个较为复杂但很有价值的任务。这个过程通常包括以下几个步骤:

  1. 图像预处理
  2. 表格检测
  3. 单元格识别
  4. 文字识别
  5. 数据结构化

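下面是一个示例代码,展示如何完成这些步骤。需要说明的是:本示例实际使用 OpenCV 完成图像预处理、表格检测和单元格分割,使用 Tesseract(通过 pytesseract)完成文字识别,代码中并没有直接调用 TensorFlow;如需基于 TensorFlow 的方案,可以将其中的表格检测或文字识别步骤替换为相应的深度学习模型。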

1. 安装必要的库

首先,安装 TensorFlow 和其他所需库(示例代码还用到了 pandas;pytesseract 只是 Python 封装,还需要在系统中单独安装 Tesseract OCR 引擎):

pip install tensorflow opencv-python pytesseract pandas

2. 图像预处理

我们需要对图像进行预处理,以便更好地检测表格和单元格。

import cv2
import numpy as np

def preprocess_image(image_path):
    """Load an image as grayscale and return a cleaned-up inverted binary image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A single-channel binary image in which pixels that were at or below
        the threshold (ink, ruling lines) are 255 and everything else is 0,
        after a small morphological closing.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file (missing path,
            unreadable file or unsupported format).
    """
    # Read the image directly as grayscale.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface later as an obscure error inside cv2.threshold.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    # Inverted binarization: dark content becomes white (255) on black, which
    # is what the contour detection and morphology steps operate on.
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)

    # A tiny closing fills pinholes and bridges one-pixel gaps in strokes.
    kernel = np.ones((2, 2), np.uint8)
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    return morphed

3. 表格检测

使用 OpenCV 检测表格轮廓。

def detect_table(image):
    """Return the outer contours of `image` that are large enough to be tables.

    Args:
        image: Binary image to search for contours.

    Returns:
        A list of the external contours whose area is greater than 1000.
    """
    # Only outermost outlines are requested; nested contours are ignored.
    outlines, _hierarchy = cv2.findContours(
        image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Small blobs (noise, isolated characters) are dropped by the area filter.
    return [outline for outline in outlines if cv2.contourArea(outline) > 1000]

4. 单元格识别

将检测到的表格进一步分割成单元格。

def detect_cells(image, table_contour):
    """Split one detected table into cell bounding boxes.

    Ruling lines are extracted with morphological opening, merged into a grid,
    and the holes enclosed by that grid are taken as the cells.

    Args:
        image: Binary image (lines/text are non-zero) containing the table.
        table_contour: Contour of the table, as returned by cv2.findContours.

    Returns:
        A list of (x, y, w, h) rectangles, expressed in the coordinate system
        of `image` (not of the table crop), so they can be used to slice
        `image` directly. Empty if no contour is found in the grid.
    """
    x, y, w, h = cv2.boundingRect(table_contour)
    table_roi = image[y:y+h, x:x+w]

    # Long, thin kernels keep only horizontal / vertical runs of pixels.
    # Clamp to 1 because a zero-sized kernel (tables narrower or shorter than
    # 15 px) is rejected by OpenCV.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 15), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, h // 15)))

    horizontal_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, vertical_kernel)

    grid = cv2.add(horizontal_lines, vertical_lines)

    # RETR_CCOMP yields a two-level hierarchy: outer boundaries and the holes
    # inside them. For a connected grid the outer boundary is the whole table,
    # while the holes are the individual cells.
    contours, hierarchy = cv2.findContours(grid, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    if hierarchy is None:
        return []

    # hierarchy[0][i][3] is the parent index; -1 means "no parent" (outer).
    holes = [c for c, info in zip(contours, hierarchy[0]) if info[3] != -1]

    # Without any enclosed hole (e.g. only horizontal rules) fall back to the
    # outer contours, which is what RETR_EXTERNAL would have produced.
    selected = holes if holes else contours

    # Rectangles are measured inside the crop; shift them by the table origin
    # so they address the same pixels in the full image.
    cell_contours = []
    for contour in selected:
        cx, cy, cw, ch = cv2.boundingRect(contour)
        cell_contours.append((cx + x, cy + y, cw, ch))

    return cell_contours

5. 文字识别

使用 Tesseract 进行 OCR 识别。

import pytesseract

def recognize_text(image, cell_contours):
    """Run Tesseract on every cell rectangle and collect the stripped text.

    Args:
        image: Image from which the cells are cropped.
        cell_contours: Iterable of (x, y, w, h) rectangles.

    Returns:
        A list with one whitespace-stripped string per rectangle, in the same
        order as `cell_contours`.
    """
    # '--psm 7' asks Tesseract to treat each crop as a single line of text.
    return [
        pytesseract.image_to_string(
            image[top:top + height, left:left + width], config='--psm 7'
        ).strip()
        for (left, top, width, height) in cell_contours
    ]

6. 数据结构化

将识别出的文字整理为结构化数据(如 DataFrame)。

import pandas as pd

def _cluster_positions(positions, tolerance):
    """Map pixel positions to consecutive indices, merging near-equal ones."""
    index_of = {}
    current = -1
    anchor = None
    for position in sorted(set(positions)):
        # Start a new row/column once we move beyond the tolerance band of
        # the first position in the current group.
        if anchor is None or position - anchor > tolerance:
            current += 1
            anchor = position
        index_of[position] = current
    return index_of


def structure_data(texts, cell_contours, tolerance=10):
    """Arrange recognized cell texts into a table-shaped DataFrame.

    Cell rectangles carry pixel coordinates, so they cannot be used as
    row/column indices directly. The top (y) and left (x) edges are clustered
    into logical rows and columns instead; edges that differ by at most
    `tolerance` pixels are treated as the same row or column.

    Args:
        texts: Recognized strings, aligned with `cell_contours`.
        cell_contours: Sequence of (x, y, w, h) cell rectangles.
        tolerance: Maximum pixel jitter between cells of one row/column.

    Returns:
        A pandas DataFrame with one row per detected table row and one column
        per detected table column; positions without a cell hold ''. An empty
        DataFrame is returned when there are no cells.
    """
    cells = list(cell_contours)
    if not cells:
        return pd.DataFrame()

    row_of = _cluster_positions([y for (_, y, _, _) in cells], tolerance)
    col_of = _cluster_positions([x for (x, _, _, _) in cells], tolerance)

    rows = max(row_of.values()) + 1
    cols = max(col_of.values()) + 1
    data = [['' for _ in range(cols)] for _ in range(rows)]

    for text, (x, y, _, _) in zip(texts, cells):
        data[row_of[y]][col_of[x]] = text

    return pd.DataFrame(data)

# Example driver: run the whole pipeline on a single image file.
# The path below is a placeholder and must point at a real table image.
image_path = 'path/to/your/table_image.png'
preprocessed_image = preprocess_image(image_path)
table_contours = detect_table(preprocessed_image)

# Every contour returned by detect_table is processed as a separate table:
# split into cells, OCR each cell, then assemble and print a DataFrame.
for table_contour in table_contours:
    cell_contours = detect_cells(preprocessed_image, table_contour)
    texts = recognize_text(preprocessed_image, cell_contours)
    structured_data = structure_data(texts, cell_contours)
    
    print(structured_data)

完整代码

import cv2
import numpy as np
import pytesseract
import pandas as pd

def preprocess_image(image_path):
    """Read `image_path` as grayscale and binarize it for table detection.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The inverted binary image (dark content becomes 255, background 0)
        after a 2x2 morphological closing.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file (missing path,
            unreadable file or unsupported format).
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # imread returns None on failure rather than raising; fail early with a
    # message that names the offending path.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path!r}")
    # Pixels at or below 150 become 255, all others 0.
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    # Closing with a small kernel fills tiny gaps in lines and strokes.
    kernel = np.ones((2, 2), np.uint8)
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    return morphed

def detect_table(image):
    """Collect the external contours of `image` with an area above 1000.

    Args:
        image: Binary image to search for contours.

    Returns:
        A list of contours considered large enough to be a table.
    """
    found, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    large_enough = []
    for candidate in found:
        # Skip anything too small to be a table (noise, stray glyphs).
        if cv2.contourArea(candidate) > 1000:
            large_enough.append(candidate)
    return large_enough

def detect_cells(image, table_contour):
    """Return the cell rectangles of one table, in full-image coordinates.

    Horizontal and vertical ruling lines are isolated with morphological
    opening and combined into a grid; the holes enclosed by the grid are the
    cells.

    Args:
        image: Binary image (lines/text are non-zero) containing the table.
        table_contour: Contour of the table, as returned by cv2.findContours.

    Returns:
        A list of (x, y, w, h) rectangles that can be used to slice `image`
        directly. Empty if no contour is found in the grid.
    """
    x, y, w, h = cv2.boundingRect(table_contour)
    table_roi = image[y:y+h, x:x+w]
    # Kernel sizes are clamped to 1: OpenCV rejects a zero-sized structuring
    # element, which w // 15 or h // 15 would produce for very small tables.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 15), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, h // 15)))
    horizontal_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(table_roi, cv2.MORPH_OPEN, vertical_kernel)
    grid = cv2.add(horizontal_lines, vertical_lines)
    # With RETR_EXTERNAL a connected grid yields only the table outline.
    # RETR_CCOMP additionally reports the holes inside it, i.e. the cells.
    contours, hierarchy = cv2.findContours(grid, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    if hierarchy is None:
        return []
    # A contour whose parent index (4th hierarchy field) is not -1 is a hole.
    holes = [c for c, info in zip(contours, hierarchy[0]) if info[3] != -1]
    # No enclosed hole: keep the outer contours, as RETR_EXTERNAL would.
    selected = holes if holes else contours
    # Shift from crop coordinates back to full-image coordinates, because
    # callers slice the full image with these rectangles.
    cell_contours = []
    for contour in selected:
        cx, cy, cw, ch = cv2.boundingRect(contour)
        cell_contours.append((cx + x, cy + y, cw, ch))
    return cell_contours

def recognize_text(image, cell_contours):
    """OCR each cell rectangle of `image` with Tesseract.

    Args:
        image: Image from which the cells are cropped.
        cell_contours: Iterable of (x, y, w, h) rectangles.

    Returns:
        A list of stripped strings, one per rectangle, in input order.
    """
    recognized = []
    for rect in cell_contours:
        left, top, width, height = rect
        crop = image[top:top + height, left:left + width]
        # Page segmentation mode 7: treat the crop as a single text line.
        raw = pytesseract.image_to_string(crop, config='--psm 7')
        recognized.append(raw.strip())
    return recognized

def _cluster_positions(positions, tolerance):
    """Map pixel positions to consecutive indices, merging near-equal ones."""
    index_of = {}
    current = -1
    anchor = None
    for position in sorted(set(positions)):
        # A position further than `tolerance` from the group's first value
        # opens the next row/column.
        if anchor is None or position - anchor > tolerance:
            current += 1
            anchor = position
        index_of[position] = current
    return index_of


def structure_data(texts, cell_contours, tolerance=10):
    """Build a DataFrame laid out like the table from per-cell texts.

    The rectangles hold pixel coordinates, which are not usable as table
    indices. Their top (y) and left (x) edges are grouped into logical rows
    and columns; edges within `tolerance` pixels of each other share an index.

    Args:
        texts: Recognized strings, aligned with `cell_contours`.
        cell_contours: Sequence of (x, y, w, h) cell rectangles.
        tolerance: Maximum pixel jitter between cells of one row/column.

    Returns:
        A pandas DataFrame with one row per table row and one column per
        table column; positions without a cell hold ''. An empty DataFrame
        is returned when there are no cells.
    """
    cells = list(cell_contours)
    if not cells:
        return pd.DataFrame()
    row_of = _cluster_positions([y for (_, y, _, _) in cells], tolerance)
    col_of = _cluster_positions([x for (x, _, _, _) in cells], tolerance)
    rows = max(row_of.values()) + 1
    cols = max(col_of.values()) + 1
    data = [['' for _ in range(cols)] for _ in range(rows)]
    for text, (x, y, _, _) in zip(texts, cells):
        data[row_of[y]][col_of[x]] = text
    return pd.DataFrame(data)

# Example driver: run the whole pipeline on a single image file.
# The path below is a placeholder and must point at a real table image.
image_path = 'path/to/your/table_image.png'
preprocessed_image = preprocess_image(image_path)
table_contours = detect_table(preprocessed_image)

# Every contour returned by detect_table is processed as a separate table:
# split into cells, OCR each cell, then assemble and print a DataFrame.
for table_contour in table_contours:
    cell_contours = detect_cells(preprocessed_image, table_contour)
    texts = recognize_text(preprocessed_image, cell_contours)
    structured_data = structure_data(texts, cell_contours)
    print(structured_data)

综上

通过上述步骤,我们使用 OpenCV 和 Tesseract 构建了一个简单的表格 OCR 流程,将表格图片转换为结构化数据(示例代码本身并未用到 TensorFlow)。这个流程可以进一步优化,例如改进预处理步骤、使用更高级的表格检测算法(如基于 TensorFlow 的深度学习模型),以及增强 OCR 识别的准确性。希望这个示例对你有所帮助!

  • 3
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值