编辑距离(levenshtein distance)C/C++ 实现
Levenshtein distance 距离简介
Levenshtein distance 的由来
Levenshtein distance 原本叫 edit distance ,用于衡量两个序列(字符串)之间的差异。1965年,苏联数学家 Vladimir Levenshtein 提出这一问题的解法,因此命名为 Levenshtein distance 。
编辑距离的内涵
编辑距离从三个方面进行度量:
- 替换
- 插入
- 删除
如下表所示:
操作 | 距离 | 解析 |
---|---|---|
替换 | cat → rat | “c” 被 “r” 替换了 |
插入 | cat → coat | “c” 和 “a” 之间插入了 “o” |
删除 | cat → at | “a” 之间的 “c” 被删除了 |
编辑距离示例
Saturday 与 Sunday 的编辑距离:
- Sunday (原本)
- Saunday ( “S” 和 “u” 之间增加 “a” )
- Satunday ( “a” 和 “u” 之间增加 “t” )
- Saturday ( “n” 改为 “r” )
故 Saturday 与 Sunday 的编辑距离为 3 。
编辑距离 C 语言实现
代码参考维基百科:https://en.wikipedia.org/wiki/Levenshtein_distance
最小时间复杂度
#include <stdio.h>
#include <string.h>
#define MIN3(a,b,c) ((a)<(b)?((a)<(c)?(a):(c)):((b)<(c)?(b):(c)))
int cmp_levenshtein( const char *s1, const char *s2 );
int main( void )
{
char str1[] = "Today is Saturday.";
char str2[] = "Tomorrow is Sunday.";
int d = cmp_levenshtein( str1, str2 );
printf( "edit distance: %d\n", d );
return 0;
}
int cmp_levenshtein( const char *s1, const char *s2 )
{
int row = strlen(s1); /* s1 的长度 */
int col = strlen(s2); /* s2 的长度 */
int mat[row][col]; /* C99 - variable-length array */
for( int i=0; i<row; ++i ) { /* 数组的行 */
for( int j=0; j<col; ++j ) { /* 数组的列 */
if( i == 0 ) {
mat[i][j] = j; /* 初始化第1行为 [ 0 1 2 ... ] */
}
else if( j == 0 ) {
mat[i][j] = i; /* 初始化第1列为 [ 0 1 2 ... ] */
}
else {
int cost = ( s1[i-1] == s2[j-1] ) ? 0 : 1; /* 记录s1[i-1]与s2[j-1]是否相等 */
mat[i][j] = MIN3( mat[i-1][j ] + 1, /* 取三者的最小值 */
mat[i ][j-1] + 1,
mat[i-1][j-1] + cost);
}
}
}
return mat[row-1][col-1];
}
最小空间复杂度
#include <stdlib.h>
#include <string.h>
#define MIN3(a,b,c) ((a)<(b)?((a)<(c)?(a):(c)):((b)<(c)?(b):(c)))
#define SWAP(a,b) do { \
typeof(a) tmp = (a); \
(a) = (b); \
(b) = tmp; \
}while(0)
int cmp_levenshtein( const char *s1, const char *s2 )
{
// create two work vectors of integer distances
int m = strlen(s1);
int n = strlen(s2);
int *v1 = (int*)malloc( (n+1)*sizeof(int) );
if( v1==NULL ) {
return -1;
}
int *v2 = (int*)malloc( (n+1)*sizeof(int) );
if( v2==NULL ) {
free( v1 );
return -1;
}
// initialize v1 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
// the distance is just the number of characters to delete from s2
for( int i=0; i<=n; ++i ) {
v1[i] = i;
}
// calculate v2 (current row distances) from the previous row v1
for( int i=0; i<m; ++i ) {
// first element of v2 is A[i+1][0]
// edit distance is delete (i+1) chars from s to match empty s2
v2[0] = i+1;
// use formula to fill in the rest of the row
for( int j=0; j<n; ++j ) {
// calculating costs for A[i+1][j+1]
int deletionCost = v1[j+1] + 1;
int insertionCost = v2[j] + 1;
int substitutionCost = v1[j];
if( s1[i] != s2[j] ) {
++ substitutionCost;
}
v2[j+1] = MIN3( deletionCost, insertionCost, substitutionCost );
}
// copy v2 (current row) to v1 (previous row) for next iteration
SWAP( v1, v2 );
}
// after the last swap, the results of v2 are now in v1
int retval = v1[n];
free(v1);
free(v2);
return retval;
}
编辑距离 C++ 语言实现
#include <string>
#include <iostream>
class levenshtein
{
public:
static int compare( const std::string& s1, const std::string& s2 )
{
// create two work vectors of integer distances
const int m = s1.size();
const int n = s2.size();
int* v1 = new int[n+1];
int* v2 = new int[n+1];
// initialize v1 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
// the distance is just the number of characters to delete from s2
for( int i = 0; i <= n; ++i ) {
v1[i] = i;
}
// calculate v2 (current row distances) from the previous row v1
for( int i = 0; i < m; ++i ) {
// first element of v2 is A[i+1][0]
// edit distance is delete (i+1) chars from s to match empty s2
v2[0] = i+1;
// use formula to fill in the rest of the row
for( int j = 0; j < n; ++j ) {
// calculating costs for A[i+1][j+1]
int deletionCost = v1[j+1] + 1;
int insertionCost = v2[j] + 1;
int substitutionCost = v1[j];
if( s1[i] != s2[j] ) {
++ substitutionCost;
}
v2[j+1] = min3( deletionCost, insertionCost, substitutionCost );
}
// copy v2 (current row) to v1 (previous row) for next iteration
swap( v1, v2 );
}
// after the last swap, the results of v2 are now in v1
int retval = v1[n];
delete []v1;
delete []v2;
return retval;
}
private:
static int min3( int a, int b, int c )
{
if ( a < b ) {
return a < c ? a : c ;
}
else {
return b < c ? b : c ;
}
}
static void swap( int*& a, int*& b )
{
int* tmp = a;
a = b;
b = tmp;
}
};
int main( void )
{
const std::string s1 = "Today is Saturday.";
const std::string s2 = "Tomorrow is Sunday.";
int d = levenshtein::compare( s1, s2 );
std::cout << "edit distance: " << d << std::endl;
return 0;
}