最近在项目中被要求对上传的文本进行高精度查重。在网上搜索后发现,按理论实现的编辑距离算法大多使用二维数组,对几千字的文本查重时,构建该数组会直接导致内存溢出。这里把网上大神用 C 语言实现的查重算法转换为 C#:不建立二维数组,只用两个一维数组滚动保存相邻两行的结果,分别循环两个字符串进行比对。记录下算法,以便日后使用。
/// <summary>
/// Computes the similarity of two strings from their weighted Levenshtein (edit) distance.
/// This is the most precise comparison, and also the most time-consuming one.
/// </summary>
/// <remarks>
/// Only two rows of the distance table are kept in memory (each of length
/// <c>s2.Length + 1</c>) instead of the full two-dimensional table, so texts of several
/// thousand characters can be compared without exhausting memory.
/// The result is <c>1 - distance / Math.Max(s1.Length, s2.Length)</c>. With the default
/// weights of 1 it lies between 0 and 1; with larger weights it can drop below 0.
/// Two empty strings are considered identical and yield 1.
/// </remarks>
/// <param name="s1">First text (the one being transformed).</param>
/// <param name="s2">Second text (the transformation target).</param>
/// <param name="pro">Optional progress callback; receives the completed percentage (1-100) each time it changes.</param>
/// <param name="cost_ins">Weight of inserting one character.</param>
/// <param name="cost_rep">Weight of replacing one character.</param>
/// <param name="cost_del">Weight of deleting one character.</param>
/// <returns>The similarity of the two texts.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s1"/> or <paramref name="s2"/> is null.</exception>
public static decimal GetLevenshteinSimilarity(string s1, string s2, Action<int> pro = null, int cost_ins = 1, int cost_rep = 1, int cost_del = 1)
{
    if (s1 == null)
    {
        throw new ArgumentNullException(nameof(s1));
    }
    if (s2 == null)
    {
        throw new ArgumentNullException(nameof(s2));
    }

    var l1 = s1.Length;
    var l2 = s2.Length;
    var longest = Math.Max(l1, l2);

    // Two empty strings are identical; this also guards the division below.
    if (longest == 0)
    {
        return 1m;
    }
    // One side empty: the distance is pure insertions (or pure deletions).
    // Return a similarity here as well, not the raw distance.
    if (l1 == 0)
    {
        return 1 - (decimal)l2 * cost_ins / longest;
    }
    if (l2 == 0)
    {
        return 1 - (decimal)l1 * cost_del / longest;
    }

    // previous[j] = cost of turning the already processed prefix of s1 into s2[0..j).
    // Row 0 (empty prefix of s1) consists of insertions only.
    var previous = new int[l2 + 1];
    var current = new int[l2 + 1];
    for (var j = 0; j <= l2; j++)
    {
        previous[j] = j * cost_ins;
    }

    var lastPercent = -1;
    for (var i1 = 0; i1 < l1; i1++)
    {
        var ch = s1[i1];

        // Turning a non-empty prefix of s1 into the empty string: one more deletion.
        current[0] = previous[0] + cost_del;

        for (var i2 = 0; i2 < l2; i2++)
        {
            // Match / replace (diagonal cell).
            var best = previous[i2] + (ch == s2[i2] ? 0 : cost_rep);

            // Delete s1[i1] (cell above).
            var viaDelete = previous[i2 + 1] + cost_del;
            if (viaDelete < best)
            {
                best = viaDelete;
            }

            // Insert s2[i2] (cell to the left).
            var viaInsert = current[i2] + cost_ins;
            if (viaInsert < best)
            {
                best = viaInsert;
            }

            current[i2 + 1] = best;
        }

        // The row just computed becomes the "previous" row of the next iteration.
        var tmp = previous;
        previous = current;
        current = tmp;

        if (pro != null)
        {
            // i1 + 1 rows are done; long arithmetic avoids overflow on very long inputs.
            var percent = (int)((i1 + 1L) * 100 / l1);
            if (percent != lastPercent)
            {
                lastPercent = percent;
                pro(percent);
            }
        }
    }

    // After the final swap the last computed row lives in "previous".
    return 1 - (decimal)previous[l2] / longest;
}