编辑距离(Edit Distance),又称Levenshtein距离,是指两个字串之间,由一个转成另一个所需的最少编辑操作次数。许可的编辑操作包括将一个
字符替换成另一个字符,插入一个字符,删除一个字符。一般来说,编辑距离越小,两个串的相似度越大。
例如将kitten转成sitting:
kitten->sitten (k→s)
sitten->sittin (e→i)
sittin->sitting (插入g)
算法思想:
比如要计算cafe和coffee的编辑距离。cafe→caffe→coffe→coffee
先创建一个6×8的表(cafe长度为4,coffee长度为6,各加2)
| c | o | f | f | e | e | |
c | |||||||
a | |||||||
f | |||||||
e | 表1 |
接着,在如下位置填入数字(表2):
| c | o | f | f | e | e | |
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
c | 1 | ||||||
a | 2 | ||||||
f | 3 | ||||||
e | 4 | 表2 |
从3,3格开始,开始计算。取以下三个值的最小值:
-
如果最上方的字符等于最左方的字符,则为左上方的数字。否则为左上方的数字+1。(对于3,3格来说为0)
-
左方数字+1(对于3,3格来说为2)
-
上方数字+1(对于3,3格来说为2)
| c | o | f | f | e | e | |
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
c | 1 | 0 | 1 | 2 | 3 | 4 | 5 |
a | 2 | 1 | 1 | 2 | 3 | 4 | 5 |
f | 3 | 2 | 2 | 1 | 2 | 3 | 4 |
e | 4 | 3 | 3 | 2 | 2 | 2 | 3 |
c++
// Leveinshtein.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <stdio.h>
#include<string.h>
// Global input buffers: main() reads the two words into them with scanf, and
// Levenshtein() compares their contents character by character.
char s1[1000],s2[1000];
// Returns the smallest of the three integers a, b and c.
int min(int a, int b, int c){
    int smallest = a;
    if (b < smallest) {
        smallest = b;
    }
    if (c < smallest) {
        smallest = c;
    }
    return smallest;
}
// Computes the edit distance between the first len1 characters of the global
// buffer s1 and the first len2 characters of the global buffer s2, then prints
// it to stdout. Uses a full (len1+1) x (len2+1) dynamic-programming table.
void Levenshtein (int len1, int len2){
    const int rows = len1 + 1;
    const int cols = len2 + 1;

    // dist[r][c] holds the distance between s1[0..r) and s2[0..c).
    int **dist = new int*[rows];
    for (int r = 0; r < rows; ++r) {
        dist[r] = new int[cols];
        dist[r][0] = r;              // r deletions turn s1[0..r) into ""
    }
    for (int c = 0; c < cols; ++c) {
        dist[0][c] = c;              // c insertions turn "" into s2[0..c)
    }

    for (int r = 1; r < rows; ++r) {
        const char from = s1[r - 1];
        for (int c = 1; c < cols; ++c) {
            int viaSubstitution = dist[r - 1][c - 1];
            if (from != s2[c - 1]) {
                ++viaSubstitution;   // characters differ: pay for a replacement
            }
            const int viaDeletion = dist[r - 1][c] + 1;   // cell above
            const int viaInsertion = dist[r][c - 1] + 1;  // cell to the left
            dist[r][c] = min(viaDeletion, viaInsertion, viaSubstitution);
        }
    }

    printf("距离为:%d\n", dist[len1][len2]);

    for (int r = 0; r < rows; ++r) {
        delete[] dist[r];
    }
    delete[] dist;
}
// Reads whitespace-separated word pairs from stdin until input is exhausted
// and prints the edit distance of each pair via Levenshtein().
int main(int argc, char* argv[])
{
    // The %999s field widths cap each word at 999 characters plus the
    // terminating NUL, so an over-long token cannot overflow the 1000-byte
    // global buffers s1 and s2.
    // Requiring exactly 2 conversions (instead of "!= EOF") also stops the
    // loop when only one word of a pair could be read, rather than comparing
    // it against stale contents of s2.
    while(scanf("%999s%999s",s1,s2) == 2)
    {
        Levenshtein(strlen(s1),strlen(s2));
    }
    return 0;
}
java
/**
 * Small demo of the Levenshtein (edit) distance computed with a
 * dynamic-programming table stored in a flat array.
 */
public class App {
    /**
     * Returns the minimum number of single-character insertions, deletions
     * and substitutions needed to turn {@code str1} into {@code str2}.
     *
     * @param str1 source string
     * @param str2 target string
     * @return the edit distance between the two strings
     */
    public static int Levenshtein(String str1, String str2) {
        final int width = str1.length() + 1;
        final int height = str2.length() + 1;

        // Flattened table: the cell for prefix lengths (x of str1, y of str2)
        // is stored at index y * width + x.
        final int[] table = new int[width * height];
        for (int x = 0; x < width; x++) {
            table[x] = x;
        }
        for (int y = 0; y < height; y++) {
            table[y * width] = y;
        }

        for (int x = 1; x < width; x++) {
            final char current = str1.charAt(x - 1);
            for (int y = 1; y < height; y++) {
                final int here = y * width + x;
                final int previousRow = here - width;
                final int step = (current == str2.charAt(y - 1)) ? 0 : 1;

                final int diagonal = table[previousRow - 1] + step;
                final int neighbour = Math.min(table[previousRow], table[here - 1]) + 1;
                table[here] = Math.min(neighbour, diagonal);
            }
        }
        // The last cell corresponds to the two complete strings.
        return table[table.length - 1];
    }

    /** Prints the edit distance between "cafe" and "coffee". */
    public static void main(String[] args) {
        final String from = "cafe";
        final String to = "coffee";
        System.out.println(Levenshtein(from, to));
    }
}
python:
#coding=utf-8
from __future__ import division
def normal_leven(str1, str2):
    """Return the Levenshtein (edit) distance between str1 and str2.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn str1 into str2. It is
    computed with the classic dynamic-programming table, stored here as a
    flat list.

    Args:
        str1: Source sequence (any indexable sequence with a length).
        str2: Target sequence.

    Returns:
        The edit distance as a non-negative int.
    """
    len_str1 = len(str1) + 1
    len_str2 = len(str2) + 1

    # Flattened table: the cell for prefixes str1[:i] and str2[:j] is stored
    # at index j * len_str1 + i.
    matrix = [0] * (len_str1 * len_str2)

    # Border cells: turning a prefix into the empty string (or the reverse)
    # costs exactly the prefix length.
    for i in range(len_str1):
        matrix[i] = i
    for j in range(len_str2):
        matrix[j * len_str1] = j

    for i in range(1, len_str1):
        for j in range(1, len_str2):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            here = j * len_str1 + i
            matrix[here] = min(
                matrix[here - len_str1] + 1,         # consume one char of str2
                matrix[here - 1] + 1,                # consume one char of str1
                matrix[here - len_str1 - 1] + cost,  # match or substitute
            )

    # The last cell corresponds to the two complete strings.
    return matrix[-1]
if __name__ == '__main__':
    # Demo on the word pair used in the worked example (cafe -> coffee),
    # whose edit distance is 3.
    str1 = u'cafe'
    str2 = u'coffee'
    distance = normal_leven(str1, str2)
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(distance)
| c | o | f | f | e | e | |
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
c | 1 | 0 | 1 | 2 | 3 | 4 | 5 |
a | 2 | 1 | 1 | 2 | 3 | 4 | 5 |
f | 3 | 2 | 2 | 1 | 2 | 3 | 4 |
e | 4 | 3 | 3 | 2 | 2 | 2 | 3 |