用 C++ 实现编辑距离(Levenshtein distance)算法,通过字错误率(WER,数值越低表示两句话越相似)衡量两句话的相似度:
代码如下:
核心算法如下:
/// Computes the word error rate (WER, in percent) of `hypothesis` against
/// `reference` from a token-level Levenshtein alignment.
///
/// WER = 100 * (insertions + substitutions + deletions) / reference length.
/// An insertion is a hypothesis token with no counterpart in the reference;
/// a deletion is a reference token missing from the hypothesis.
///
/// Side effect: prints three lines to std::cout — the reference length, then
/// "correct wrong insertions deletions substitutions", then the WER.
///
/// @param hypothesis Tokens of the text being evaluated (not modified).
/// @param reference  Tokens of the ground-truth text (not modified).
/// @return WER in percent. When `reference` is empty the denominator is
///         clamped to 1 so the result is finite: 0 if `hypothesis` is also
///         empty, otherwise 100 per inserted token.
float levenshtein_distance(std::vector<std::string>& hypothesis, std::vector<std::string>& reference)
{
    // Backtrace codes stored per cell. kMatch is also the value of the
    // border row/column, which the backtrace handles explicitly.
    enum EditOp { kMatch = 0, kSubstitution = 1, kInsertion = 2, kDeletion = 3 };

    const int len_hyp = static_cast<int>(hypothesis.size());
    const int len_ref = static_cast<int>(reference.size());

    std::vector<std::vector<int>> cost(len_hyp + 1, std::vector<int>(len_ref + 1, 0));
    std::vector<std::vector<int>> ops(len_hyp + 1, std::vector<int>(len_ref + 1, kMatch));

    // Aligning against an empty sequence costs one edit per token.
    for (int i = 0; i <= len_hyp; ++i)
    {
        cost[i][0] = i;
    }
    for (int j = 0; j <= len_ref; ++j)
    {
        cost[0][j] = j;
    }

    for (int i = 1; i <= len_hyp; ++i)
    {
        for (int j = 1; j <= len_ref; ++j)
        {
            if (hypothesis[i - 1] == reference[j - 1])
            {
                cost[i][j] = cost[i - 1][j - 1];
                continue;  // ops[i][j] stays kMatch
            }

            const int substitution = cost[i - 1][j - 1] + 1;
            const int insertion = cost[i - 1][j] + 1;
            const int deletion = cost[i][j - 1] + 1;

            // Strict '<' keeps the tie-break order: substitution, then
            // insertion, then deletion.
            int best = substitution;
            int op = kSubstitution;
            if (insertion < best)
            {
                best = insertion;
                op = kInsertion;
            }
            if (deletion < best)
            {
                best = deletion;
                op = kDeletion;
            }
            cost[i][j] = best;
            ops[i][j] = op;
        }
    }

    // Walk back from the bottom-right corner, counting each operation.
    int correct = 0;
    int insertions = 0;
    int deletions = 0;
    int substitutions = 0;
    int i = len_hyp;
    int j = len_ref;
    while (i > 0 || j > 0)
    {
        if (i == 0)
        {
            // Hypothesis exhausted: remaining reference tokens were deleted.
            ++deletions;
            --j;
        }
        else if (j == 0)
        {
            // Reference exhausted: remaining hypothesis tokens were inserted.
            ++insertions;
            --i;
        }
        else
        {
            switch (ops[i][j])
            {
            case kSubstitution:
                ++substitutions;
                --i;
                --j;
                break;
            case kInsertion:
                ++insertions;
                --i;
                break;
            case kDeletion:
                ++deletions;
                --j;
                break;
            default:  // kMatch
                ++correct;
                --i;
                --j;
                break;
            }
        }
    }

    const int wrong = cost[len_hyp][len_ref];
    // Guard against division by zero for an empty reference.
    const int denominator = len_ref > 0 ? len_ref : 1;
    const float wer = 100 * ((static_cast<float>(insertions) + substitutions + deletions) / denominator);

    std::cout << len_ref << '\n';
    std::cout << correct << " " << wrong << " " << insertions << " " << deletions << " " << substitutions << " " << '\n';
    std::cout << wer << '\n';
    return wer;
}
主函数:
// Splits `text` into one string per UTF-8 encoded character, taking the
// sequence length from the lead byte. Stray continuation bytes become
// single-byte tokens, and a truncated trailing sequence is kept as-is
// (substr clamps at the end of the string).
// NOTE(review): assumes the string is UTF-8 encoded; a source file saved as
// GBK would need a 2-byte splitter instead — confirm the build's charset.
static std::vector<std::string> split_utf8_chars(const std::string& text)
{
    std::vector<std::string> chars;
    std::string::size_type pos = 0;
    while (pos < text.size())
    {
        const unsigned char lead = static_cast<unsigned char>(text[pos]);
        std::string::size_type len = 1;
        if (lead >= 0xF0)
        {
            len = 4;
        }
        else if (lead >= 0xE0)
        {
            len = 3;
        }
        else if (lead >= 0xC0)
        {
            len = 2;
        }
        chars.push_back(text.substr(pos, len));
        pos += len;
    }
    return chars;
}

// Demo driver: tokenises two sentences character by character, prints the
// tokens and their counts, then prints the WER of s2 measured against s1.
int main()
{
    // Texts to compare: s1 is used as the reference, s2 as the hypothesis.
    const std::string s1 = "娃娃鱼的生活习性特殊不易人工喂养然而游客却不知这些只图稀奇好玩随意捕捉";
    const std::string s2 = "完万余的生活习性撤出不易人工喂养然而游客绝不止这些指出嬉戏好玩随意捕捉";

    // Both sentences go through the same splitter so their tokens are
    // comparable; the earlier 2-byte vs 3-byte split made them disagree.
    std::vector<std::string> vec1 = split_utf8_chars(s1);
    std::vector<std::string> vec2 = split_utf8_chars(s2);

    std::cout << s1.size() << std::endl;
    for (const std::string& token : vec1)
    {
        std::cout << token;
    }
    std::cout << vec1.size() << std::endl;
    std::cout << std::endl;
    for (const std::string& token : vec2)
    {
        std::cout << token;
    }
    std::cout << vec2.size() << std::endl;

    const float wer = levenshtein_distance(vec2, vec1);
    // Print the WER of the two sentences.
    std::cout << wer << std::endl;
    return 0;
}
参考博客:
https://blog.csdn.net/baobao3456810/article/details/107381052/