动态规划 - 最小编辑距离

最小编辑距离(Minimum Edit Distance)

参考:https://web.stanford.edu/class/cs124/lec/med.pdf

目标(objective):给定两个字符串 $s_{1}$、$s_{2}$ 及编辑操作,求将字符串 $s_{1}$ 转换为 $s_{2}$ 所需的最小操作代价。

编辑操作:

  • 插入(insert),插入一个新字符,代价(cost)$o_{\text{i}}$

  • 删除(delete),删除一个字符,代价(cost)$o_{\text{d}}$

  • 替换(substitute),将一个字符替换为另一个字符,代价(cost)$o_{\text{s}}$

在这里插入图片描述

1 递归(Recursion)

计算时间复杂度(time complexity):$\mathcal{O}(3^{n})$

在这里插入图片描述
在这里插入图片描述

import numpy as np
# Per-operation costs read by edit_distance_recur.
COST_INSERT=1  # cost of inserting one character
COST_DELETE=1  # cost of deleting one character
COST_REPLACE=2  # cost of replacing one character with a different one

def edit_distance_recur(str1, str2):
    """
    Recursively compute the minimum cost of turning str1 into str2.

    Both strings are consumed from their last character towards the first.
    The per-operation costs come from the module-level COST_INSERT,
    COST_DELETE and COST_REPLACE constants.

    Returns a tuple (distance, op_sequence). op_sequence is a dict with the
    keys "insert", "delete" and "replace"; each maps to the list of edits of
    that kind that were chosen.
    """
    # Nothing left in str1: whatever remains of str2 has to be inserted.
    if not str1:
        return len(str2) * COST_INSERT, {
            "insert": list(str2),
            "delete": [],
            "replace": [],
        }

    # Nothing left in str2: whatever remains of str1 has to be deleted.
    if not str2:
        return len(str1) * COST_DELETE, {
            "insert": [],
            "delete": list(str1),
            "replace": [],
        }

    rest1, last1 = str1[:-1], str1[-1]
    rest2, last2 = str2[:-1], str2[-1]

    # Equal trailing characters cost nothing: drop both and keep going.
    if last1 == last2:
        return edit_distance_recur(rest1, rest2)

    # (operation name, sub-problem to solve, cost of the final step)
    options = (
        ("insert", (str1, rest2), COST_INSERT),
        ("delete", (rest1, str2), COST_DELETE),
        ("replace", (rest1, rest2), COST_REPLACE),
    )

    best_cost = best_ops = best_kind = None
    for kind, (sub1, sub2), step_cost in options:
        sub_cost, sub_ops = edit_distance_recur(sub1, sub2)
        total = sub_cost + step_cost
        # Strict "<" keeps the earliest option when several are tied.
        if best_cost is None or total < best_cost:
            best_cost, best_ops, best_kind = total, sub_ops, kind

    # Record the final step in the bucket of the winning operation.
    if best_kind == "insert":
        entry = last2
    elif best_kind == "delete":
        entry = last1
    else:
        entry = last1 + " with " + last2
    best_ops[best_kind].append(entry)

    return best_cost, best_ops
# Example: convert "intention" into "execution"; the result is shown below.
edit_distance_recur("intention", "execution")
(8,
 {'insert': ['x', 'e', 'c', 'u'],
  'delete': ['i', 'n', 't', 'n'],
  'replace': []})

2 动态规划(Dynamic Programming)

动态规划(Dynamic programming):将问题分解为子问题(subproblem)求解;构造计算表(a tabular computation)$D(m, n)$

计算时间复杂度为 $\mathcal{O}(mn)$;计算空间复杂度为 $\mathcal{O}(mn)$

自下而上(bottom-up)

在这里插入图片描述
在这里插入图片描述

def edit_distance_dp(str1, str2, cost_insert=1, cost_delete=1, cost_replace=1):
    """
    Compute the minimum cost of converting str1 into str2 (bottom-up DP).

    distance_table[i, j] holds the cheapest cost of converting the first i
    characters of str1 into the first j characters of str2.

    Args:
        str1: source string.
        str2: target string.
        cost_insert: cost of inserting one character.
        cost_delete: cost of deleting one character.
        cost_replace: cost of replacing one character with a different one;
            equal characters are kept at no cost.

    Returns:
        The minimum edit cost, read from the bottom-right table entry
        (a NumPy float).
    """
    len_str1 = len(str1)
    len_str2 = len(str2)

    # initialization: the borders must be scaled by the actual costs.
    # Reaching the empty target takes i deletions; building the first j
    # target characters from nothing takes j insertions.
    distance_table = np.zeros(shape=(len_str1 + 1, len_str2 + 1))
    distance_table[:, 0] = np.arange(len_str1 + 1) * cost_delete
    distance_table[0, :] = np.arange(len_str2 + 1) * cost_insert

    for idx1, char1 in enumerate(str1, start=1):
        for idx2, char2 in enumerate(str2, start=1):
            # insert char2 after converting str1[:idx1] into str2[:idx2 - 1]
            distance_insert = distance_table[idx1, idx2 - 1] + cost_insert
            # delete char1 after converting str1[:idx1 - 1] into str2[:idx2]
            distance_delete = distance_table[idx1 - 1, idx2] + cost_delete
            # replace or remain: equal characters are free
            distance_replace = distance_table[idx1 - 1, idx2 - 1]
            if char1 != char2:
                distance_replace += cost_replace

            distance_table[idx1, idx2] = min(
                distance_insert, distance_delete, distance_replace
            )

    return distance_table[-1, -1]
# Example: with cost_replace=2 a replacement costs as much as one delete
# plus one insert (both default to 1).
edit_distance_dp("intention", "execution", cost_replace=2)
8.0

3 回溯(Backtrace)

回溯(Backtrace)计算时间、空间复杂度均为 $\mathcal{O}(m + n)$

在这里插入图片描述
在这里插入图片描述
从 $(0, 0)$ 到 $(M, N)$ 的任意非减路径均为一条可行对齐路径;最优路径由最优子路径组成

在这里插入图片描述

def edit_distance_dp_backtrace(str1, str2, cost_insert=1, cost_delete=1, cost_replace=1):
    """
    Compute the minimum cost of converting str1 into str2 and one optimal
    sequence of edit operations (bottom-up DP followed by a backtrace).

    While the distance table is filled, the move that produced each cell is
    stored in a back-pointer table. The backtrace then follows those pointers
    from the bottom-right cell to the origin, so the cost of the returned
    path always equals the returned distance.

    Args:
        str1: source string.
        str2: target string.
        cost_insert: cost of inserting one character.
        cost_delete: cost of deleting one character.
        cost_replace: cost of replacing one character with a different one;
            equal characters are kept at no cost.

    Returns:
        A tuple (distance, path). distance is the bottom-right table entry
        (a NumPy float). path is a list of strings in left-to-right order,
        each one of "insert X", "delete X", "replace X with Y" or "remaind"
        (the character is kept).
    """
    len_str1 = len(str1)
    len_str2 = len(str2)

    # Codes for the move that leads into a cell.
    move_diagonal, move_insert, move_delete = 0, 1, 2

    # initialization: the borders must be scaled by the actual costs.
    distance_table = np.zeros(shape=(len_str1 + 1, len_str2 + 1))
    distance_table[:, 0] = np.arange(len_str1 + 1) * cost_delete
    distance_table[0, :] = np.arange(len_str2 + 1) * cost_insert

    # The first column is reachable only by deletions and the first row only
    # by insertions; cell (0, 0) is where the backtrace stops.
    back_pointer = np.zeros(shape=(len_str1 + 1, len_str2 + 1), dtype=int)
    back_pointer[1:, 0] = move_delete
    back_pointer[0, 1:] = move_insert

    for idx1 in range(1, len_str1 + 1):
        for idx2 in range(1, len_str2 + 1):
            same_char = str1[idx1 - 1] == str2[idx2 - 1]
            # Listed in the order of the move codes above, so the argmin
            # index is the move code. Ties prefer the diagonal move.
            candidates = [
                distance_table[idx1 - 1, idx2 - 1] + (0 if same_char else cost_replace),
                distance_table[idx1, idx2 - 1] + cost_insert,
                distance_table[idx1 - 1, idx2] + cost_delete,
            ]
            move = int(np.argmin(candidates))
            distance_table[idx1, idx2] = candidates[move]
            back_pointer[idx1, idx2] = move

    # backtrace: follow the recorded moves from (len_str1, len_str2) to (0, 0)
    path = []
    idx1 = len_str1
    idx2 = len_str2
    while idx1 > 0 or idx2 > 0:
        move = back_pointer[idx1, idx2]
        if move == move_insert:
            op = "{} {}".format("insert", str2[idx2 - 1])
            idx2 -= 1
        elif move == move_delete:
            op = "{} {}".format("delete", str1[idx1 - 1])
            idx1 -= 1
        else:
            if str1[idx1 - 1] == str2[idx2 - 1]:
                # Label spelling kept as-is for callers that match on it.
                op = "remaind"
            else:
                op = "{} {} with {}".format("replace", str1[idx1 - 1], str2[idx2 - 1])
            idx1 -= 1
            idx2 -= 1
        path.append(op)

    # Operations were collected back to front.
    path.reverse()

    return distance_table[-1, -1], path
# Example: run the backtrace variant on the same pair of words and print
# the distance together with the recovered edit path.
min_edit_distance, optimal_path = edit_distance_dp_backtrace("intention", "execution", cost_replace=2)
print("min edit distance: ", min_edit_distance)
print("optimal path: ", optimal_path)
min edit distance:  8.0
optimal path:  ['insert e', 'replace i with x', 'replace n with e', 'replace t with c', 'delete e', 'replace n with u', 'remaind', 'remaind', 'remaind', 'remaind']

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值