最短编辑距离和最短编辑路径

comli_cn

已于 2024-03-05 21:19:17 修改

阅读量882

点赞数 1

分类专栏：算法 NLP 文章标签：矩阵 leetcode 线性代数

于 2022-05-02 18:45:43 首次发布

本文链接：https://blog.csdn.net/comli_cn/article/details/124544712

版权

字符串处理 动态规划 编辑距离 算法实现 回溯

关键词由CSDN通过智能技术生成

算法同时被 2 个专栏收录

10 篇文章 2 订阅

订阅专栏

NLP

8 篇文章 0 订阅

订阅专栏

1. 题目

求出两个字符串的最短编辑距离（把源字符串变成目标字符串所需的最少插入、删除、替换操作次数，每次操作代价为 1），并给出对应的最短编辑路径。

2. 代码

def min_ed(source_str, target_str, source_len, target_len):
    """Build the edit-distance (Levenshtein) table for two strings.

    ``matrix[i][j]`` is the minimum number of single-character insertions,
    deletions and substitutions (each costing 1) needed to turn the first
    ``i`` characters of ``source_str`` into the first ``j`` characters of
    ``target_str``.

    Args:
        source_str: The string being edited.
        target_str: The string to be produced.
        source_len: Number of leading characters of ``source_str`` to use.
        target_len: Number of leading characters of ``target_str`` to use.

    Returns:
        A list of ``source_len + 1`` rows, each with ``target_len + 1``
        integers. The overall distance is ``matrix[source_len][target_len]``.
    """
    matrix = [[0] * (target_len + 1) for _ in range(source_len + 1)]

    # Boundary conditions: reaching an empty target takes i deletions,
    # starting from an empty source takes j insertions.
    for i in range(source_len + 1):
        matrix[i][0] = i
    for j in range(target_len + 1):
        matrix[0][j] = j

    # Fill the table row by row.
    for i in range(1, source_len + 1):
        for j in range(1, target_len + 1):
            # The diagonal step is free only when the two characters match.
            # Deleting or inserting always costs 1, even on a match; taking
            # the bare minimum of the three neighbours there would
            # under-count (e.g. "aa" -> "a" would come out as 0).
            diagonal_cost = 0 if source_str[i - 1] == target_str[j - 1] else 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1,  # delete source_str[i - 1]
                matrix[i][j - 1] + 1,  # insert target_str[j - 1]
                matrix[i - 1][j - 1] + diagonal_cost,  # match / substitute
            )
    return matrix

def min_ed_path(path_matrix, source_str, target_str, source_len, target_len):
    """Print one minimum-cost edit script turning the source into the target.

    The table is walked backwards from ``path_matrix[source_len][target_len]``
    until both indices reach 0, so operations are reported from the end of
    the strings towards their start. After each reported operation the two
    "back strings" are printed. They accumulate, in reverse order, the
    characters kept or produced so far: an insertion or substitution appends
    the target character to both, a match appends the matched character to
    both, and a deletion appends nothing.

    Args:
        path_matrix: Edit-distance table where ``path_matrix[i][j]``
            corresponds to ``source_str[i - 1]`` and ``target_str[j - 1]``.
        source_str: The string being edited.
        target_str: The string to be produced.
        source_len: Row index at which the backtrace starts.
        target_len: Column index at which the backtrace starts.

    Returns:
        None. All results are printed.
    """
    source_back_str = ""
    tar_back_str = ""
    i, j = source_len, target_len
    # Continue until BOTH indices are 0. Once one string is exhausted, the
    # rest of the other still has to be inserted or deleted, and stopping
    # early would silently drop those operations.
    while i > 0 or j > 0:
        current = path_matrix[i][j]
        # Tie-break priority: left (insert) > up (delete) > diagonal.
        if j > 0 and (i == 0 or path_matrix[i][j - 1] + 1 == current):
            # Coming from the left cell costs 1: target_str[j - 1] is
            # inserted. In row 0 the source is exhausted, so insertion is
            # the only possible move.
            print("源字符串 插入字符 %s, 操作代价为1" % (target_str[j - 1]))
            source_back_str += target_str[j - 1]
            tar_back_str += target_str[j - 1]
            print("source back string = %s" % (source_back_str))
            print("target back string = %s" % (tar_back_str))
            j -= 1
        elif i > 0 and (j == 0 or path_matrix[i - 1][j] + 1 == current):
            # Coming from the cell above costs 1: source_str[i - 1] is
            # deleted. In column 0 the target is exhausted, so deletion is
            # the only possible move. Nothing is appended to the back strings.
            print("源字符串 删除字符 %s, 操作代价为1" % (source_str[i - 1]))
            print("source back string = %s" % (source_back_str))
            print("target back string = %s" % (tar_back_str))
            i -= 1
        elif path_matrix[i - 1][j - 1] + 1 == current:
            # Diagonal step that costs 1: source_str[i - 1] is replaced by
            # target_str[j - 1].
            print("源字符串 中的 %s 替换为 目标字符串 的 %s, 操作代价为1" % (source_str[i - 1], target_str[j - 1]))
            source_back_str += target_str[j - 1]
            tar_back_str += target_str[j - 1]
            print("source back string = %s" % (source_back_str))
            print("target back string = %s" % (tar_back_str))
            i -= 1
            j -= 1
        elif path_matrix[i - 1][j - 1] == current:
            # Diagonal step that costs 0: the two characters are kept as a
            # match.
            print("源字符串 中的 %s 匹配 目标字符串 的 %s, 操作代价为0" % (source_str[i - 1], target_str[j - 1]))
            source_back_str += source_str[i - 1]
            tar_back_str += target_str[j - 1]
            print("source back string = %s" % (source_back_str))
            print("target back string = %s" % (tar_back_str))
            i -= 1
            j -= 1
        else:
            # No neighbour explains this cell, so path_matrix is not a
            # consistent edit-distance table. Step diagonally without
            # reporting anything so the walk still terminates.
            i -= 1
            j -= 1

def main():
    """Run the demo: build the table for "YPCL" -> "YCPLS" and print the path.

    The edit path is printed first, then the full distance table.
    """
    # Reminder: table cell [i][j] lines up with source_str[i - 1] and
    # target_str[j - 1].
    source_str, target_str = "YPCL", "YCPLS"
    n_source, n_target = len(source_str), len(target_str)

    table = min_ed(source_str, target_str, n_source, n_target)
    min_ed_path(table, source_str, target_str, n_source, n_target)
    print("final matrix = ", table)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

3. 结果

源字符串 插入字符 S, 操作代价为1
source back string = S
target back string = S
源字符串 中的 L 匹配 目标字符串 的 L, 操作代价为0
source back string = SL
target back string = SL
源字符串 插入字符 P, 操作代价为1
source back string = SLP
target back string = SLP
源字符串 中的 C 匹配 目标字符串 的 C, 操作代价为0
source back string = SLPC
target back string = SLPC
源字符串 删除字符 P, 操作代价为1
source back string = SLPC
target back string = SLPC
源字符串 中的 Y 匹配 目标字符串 的 Y, 操作代价为0
source back string = SLPCY
target back string = SLPCY
final matrix =  [[0, 1, 2, 3, 4, 5], [1, 0, 1, 2, 3, 4], [2, 1, 1, 1, 2, 3], [3, 2, 1, 2, 2, 3], [4, 3, 2, 2, 2, 3]]