C++ 实现编辑距离算法,通过词错误率(WER)衡量两句话的相似度:

核心算法代码如下:

// Computes the word error rate (WER, in percent) of `hypothesis` against
// `reference` from a token-level Levenshtein alignment.
//
// Parameters:
//   hypothesis - tokens of the text being evaluated (read only, not modified).
//   reference  - tokens of the ground-truth text (read only, not modified).
//
// Returns 100 * (insertions + substitutions + deletions) / max(1, |reference|).
// The denominator is clamped to one token so that an empty reference cannot
// cause a division by zero: two empty inputs yield 0, and an empty reference
// with a non-empty hypothesis yields 100 per inserted token.
//
// Side effects: prints three lines to std::cout - the reference length, then
// "correct wrong insertions deletions substitutions", then the WER.
float levenshtein_distance(std::vector<std::string>& hypothesis, std::vector<std::string>& reference)
{
	// Edit operation stored per cell; consumed by the backtrace below.
	constexpr int kMatch = 0;
	constexpr int kSubstitution = 1;
	constexpr int kInsertion = 2;   // token present only in the hypothesis
	constexpr int kDeletion = 3;    // token present only in the reference

	const int len_hyp = static_cast<int>(hypothesis.size());
	const int len_ref = static_cast<int>(reference.size());

	std::vector<std::vector<int>> cost_matrix(len_hyp + 1, std::vector<int>(len_ref + 1, 0));
	std::vector<std::vector<int>> ops_matrix(len_hyp + 1, std::vector<int>(len_ref + 1, kMatch));

	// Borders: turning a prefix into the empty sequence (or the reverse) takes
	// only insertions / only deletions. Recording the operation here keeps the
	// backtrace free of special cases and negative indices.
	for (int i = 1; i <= len_hyp; ++i)
	{
		cost_matrix[i][0] = i;
		ops_matrix[i][0] = kInsertion;
	}
	for (int j = 1; j <= len_ref; ++j)
	{
		cost_matrix[0][j] = j;
		ops_matrix[0][j] = kDeletion;
	}

	for (int i = 1; i <= len_hyp; ++i)
	{
		for (int j = 1; j <= len_ref; ++j)
		{
			if (hypothesis[i - 1] == reference[j - 1])
			{
				// Equal tokens cost nothing; the cell keeps kMatch.
				cost_matrix[i][j] = cost_matrix[i - 1][j - 1];
				continue;
			}

			const int substitution = cost_matrix[i - 1][j - 1] + 1;
			const int insertion = cost_matrix[i - 1][j] + 1;
			const int deletion = cost_matrix[i][j - 1] + 1;

			// Ties are resolved in the order substitution, insertion, deletion,
			// hence the strict comparisons.
			int best_cost = substitution;
			int best_op = kSubstitution;
			if (insertion < best_cost)
			{
				best_cost = insertion;
				best_op = kInsertion;
			}
			if (deletion < best_cost)
			{
				best_cost = deletion;
				best_op = kDeletion;
			}

			cost_matrix[i][j] = best_cost;
			ops_matrix[i][j] = best_op;
		}
	}

	// Walk the alignment back from the bottom-right corner and count each
	// kind of operation along the chosen path.
	int correct = 0;
	int insertions = 0;
	int deletions = 0;
	int substitutions = 0;

	int i = len_hyp;
	int j = len_ref;
	while (i > 0 || j > 0)
	{
		switch (ops_matrix[i][j])
		{
		case kSubstitution:
			++substitutions;
			--i;
			--j;
			break;
		case kInsertion:
			++insertions;
			--i;
			break;
		case kDeletion:
			++deletions;
			--j;
			break;
		default:
			// kMatch only occurs on interior cells, so both indices are > 0.
			++correct;
			--i;
			--j;
			break;
		}
	}

	// Total edit distance, reported as the "wrong" count.
	const int wrong = cost_matrix[len_hyp][len_ref];

	const int errors = insertions + substitutions + deletions;
	// Guard against an empty reference (see the function comment).
	const int denominator = std::max(1, len_ref);
	const float wer = 100 * (static_cast<float>(errors) / denominator);

	std::cout << len_ref << '\n';
	std::cout << correct << " " << wrong << " " << insertions << " " << deletions << " " << substitutions << " " << '\n';
	std::cout << wer << '\n';

	return wer;
}

主函数:

 // Demo entry point: splits a reference sentence and a recognised sentence
 // into one token per character and prints the word error rate between them.
 int main()
 {
 	// Splits text into one token per UTF-8 code point, using the lead byte to
 	// determine the sequence length. Both sentences must be tokenised the same
 	// way, otherwise no token of one can ever equal a token of the other.
 	// NOTE(review): assumes the string literals below are UTF-8 encoded at run
 	// time (e.g. MSVC needs /utf-8) - confirm for the target toolchain.
 	const auto split_utf8 = [](const std::string& text)
 	{
 		std::vector<std::string> tokens;
 		std::size_t pos = 0;
 		while (pos < text.size())
 		{
 			const unsigned char lead = static_cast<unsigned char>(text[pos]);
 			std::size_t len = 1;  // ASCII, or a stray byte consumed on its own
 			if ((lead & 0xE0) == 0xC0)
 				len = 2;
 			else if ((lead & 0xF0) == 0xE0)
 				len = 3;
 			else if ((lead & 0xF8) == 0xF0)
 				len = 4;
 			// substr clamps the count, so a truncated tail cannot overrun.
 			tokens.push_back(text.substr(pos, len));
 			pos += len;
 		}
 		return tokens;
 	};

 	// Texts to compare: s1 is the reference, s2 the hypothesis.
 	const std::string s1 = "娃娃鱼的生活习性特殊不易人工喂养然而游客却不知这些只图稀奇好玩随意捕捉";
 	const std::string s2 = "完万余的生活习性撤出不易人工喂养然而游客绝不止这些指出嬉戏好玩随意捕捉";

 	std::cout << s1.size() << '\n';

 	std::vector<std::string> vec1 = split_utf8(s1);
 	for (const std::string& token : vec1)
 	{
 		std::cout << token;
 	}
 	std::cout << vec1.size() << '\n';
 	std::cout << '\n';

 	std::vector<std::string> vec2 = split_utf8(s2);
 	for (const std::string& token : vec2)
 	{
 		std::cout << token;
 	}
 	std::cout << vec2.size() << '\n';

 	// Print the WER of the two sentences (hypothesis first, reference second).
 	const float wer = levenshtein_distance(vec2, vec1);
 	std::cout << wer << '\n';
 	return 0;
 }

参考博客:
https://blog.csdn.net/baobao3456810/article/details/107381052/

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值