C#:编辑距离计算及更新维基百科

开发工具:

Visual Studio v2010

.NET Framework 4 Client Profile

维基百科相关主题:

http://en.wikipedia.org/wiki/Levenshtein_distance

http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance(更新)

源代码:

EditDistance.cs

using System;
using System.Collections.Generic;

namespace Splash
{
    /// <summary>
    /// Edit-distance helpers: Levenshtein, optimal string alignment (restricted
    /// Damerau-Levenshtein) and unrestricted Damerau-Levenshtein distance.
    /// </summary>
    /// <remarks>
    /// All methods compare UTF-16 code units (<see cref="Char"/> values). Case-insensitive
    /// comparison lower-cases both inputs with the invariant culture so that the result does
    /// not depend on the current thread culture.
    /// </remarks>
    public static class SpellHelper
    {
        /// <summary>
        /// Computes the Levenshtein distance (insertions, deletions, substitutions).
        /// </summary>
        /// <param name="source">Source string; null is treated like an empty string.</param>
        /// <param name="target">Target string; null is treated like an empty string.</param>
        /// <param name="similarity">
        /// Output: (maxLength - distance) / maxLength, a value from 0 to 1. It is 1 when both
        /// inputs are null/empty and 0 when exactly one of them is. As the original author
        /// notes, this similarity measure is not necessarily a meaningful one.
        /// </param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>http://en.wikipedia.org/wiki/Levenshtein_distance</remarks>
        public static Int32 LevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 trivialDistance;
            if (TryGetTrivialDistance(source, target, out trivialDistance, out similarity))
            {
                return trivialDistance;
            }

            String src = NormalizeCase(source, isCaseSensitive);
            String dst = NormalizeCase(target, isCaseSensitive);
            Int32 m = src.Length;
            Int32 n = dst.Length;
            Int32[,] h = CreateDistanceMatrix(m, n);

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = src[i - 1];
                for (Int32 j = 1; j <= n; j++)
                {
                    if (si == dst[j - 1])
                    {
                        // Matching characters cost nothing: carry the diagonal value over.
                        h[i, j] = h[i - 1, j - 1];
                    }
                    else
                    {
                        // Cheapest of substitution, deletion and insertion, plus one edit.
                        h[i, j] = Math.Min(h[i - 1, j - 1], Math.Min(h[i - 1, j], h[i, j - 1])) + 1;
                    }
                }
            }

            similarity = ComputeSimilarity(h[m, n], m, n);
            return h[m, n];
        }

        /// <summary>
        /// Computes the optimal string alignment distance, i.e. the restricted
        /// Damerau-Levenshtein distance in which only adjacent characters may be transposed.
        /// </summary>
        /// <param name="source">Source string; null is treated like an empty string.</param>
        /// <param name="target">Target string; null is treated like an empty string.</param>
        /// <param name="similarity">
        /// Output: (maxLength - distance) / maxLength, a value from 0 to 1. It is 1 when both
        /// inputs are null/empty and 0 when exactly one of them is.
        /// </param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance</remarks>
        public static Int32 OptimalStringAlignmentDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 trivialDistance;
            if (TryGetTrivialDistance(source, target, out trivialDistance, out similarity))
            {
                return trivialDistance;
            }

            String src = NormalizeCase(source, isCaseSensitive);
            String dst = NormalizeCase(target, isCaseSensitive);
            Int32 m = src.Length;
            Int32 n = dst.Length;
            Int32[,] h = CreateDistanceMatrix(m, n);

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = src[i - 1];
                for (Int32 j = 1; j <= n; j++)
                {
                    Char dj = dst[j - 1];
                    if (si == dj)
                    {
                        h[i, j] = h[i - 1, j - 1];
                    }
                    else
                    {
                        h[i, j] = Math.Min(h[i - 1, j - 1], Math.Min(h[i - 1, j], h[i, j - 1])) + 1;
                    }

                    // Transposition of two adjacent characters: "xy" in the source versus "yx" in the target.
                    if (i > 1 && j > 1 && si == dst[j - 2] && dj == src[i - 2])
                    {
                        h[i, j] = Math.Min(h[i, j], h[i - 2, j - 2] + 1);
                    }
                }
            }

            similarity = ComputeSimilarity(h[m, n], m, n);
            return h[m, n];
        }

        /// <summary>
        /// Computes the unrestricted Damerau-Levenshtein distance, which also allows
        /// deletions and insertions between the two transposed characters.
        /// </summary>
        /// <param name="source">Source string; null is treated like an empty string.</param>
        /// <param name="target">Target string; null is treated like an empty string.</param>
        /// <param name="similarity">
        /// Output: (maxLength - distance) / maxLength, a value from 0 to 1. It is 1 when both
        /// inputs are null/empty and 0 when exactly one of them is.
        /// </param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>
        /// http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
        /// The original author notes that a modified version of this code (without the
        /// similarity output and the case-sensitivity switch) was contributed to Wikipedia.
        /// </remarks>
        public static Int32 DamerauLevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 trivialDistance;
            if (TryGetTrivialDistance(source, target, out trivialDistance, out similarity))
            {
                return trivialDistance;
            }

            String src = NormalizeCase(source, isCaseSensitive);
            String dst = NormalizeCase(target, isCaseSensitive);
            Int32 m = src.Length;
            Int32 n = dst.Length;

            // The matrix is shifted by one extra row and column: h[i + 1, j + 1] holds the
            // distance between src[0..i) and dst[0..j). Row 0 and column 0 hold a sentinel
            // that is larger than any real distance, so a "transposition" that refers to a
            // character which has not been seen yet can never win the Math.Min below.
            Int32[,] h = new Int32[m + 2, n + 2];
            Int32 sentinel = m + n;
            h[0, 0] = sentinel;
            for (Int32 i = 0; i <= m; i++)
            {
                h[i + 1, 1] = i;
                h[i + 1, 0] = sentinel;
            }
            for (Int32 j = 0; j <= n; j++)
            {
                h[1, j + 1] = j;
                h[0, j + 1] = sentinel;
            }

            // Maps a character to the last (1-based) source row in which it occurred.
            // A character that is absent from the map has not occurred yet, i.e. row 0.
            // Keying by character instead of using a fixed alphabet table keeps the
            // algorithm independent of the alphabet size.
            Dictionary<Char, Int32> lastSourceRow = new Dictionary<Char, Int32>();

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = src[i - 1];
                Int32 lastMatchColumn = 0;  // last (1-based) column j' < j with dst[j' - 1] == si
                for (Int32 j = 1; j <= n; j++)
                {
                    Char dj = dst[j - 1];

                    Int32 i1;   // last row before i whose source character equals dj
                    if (!lastSourceRow.TryGetValue(dj, out i1))
                    {
                        i1 = 0;
                    }
                    Int32 j1 = lastMatchColumn;

                    if (si == dj)
                    {
                        h[i + 1, j + 1] = h[i, j];
                        lastMatchColumn = j;
                    }
                    else
                    {
                        h[i + 1, j + 1] = Math.Min(h[i, j], Math.Min(h[i + 1, j], h[i, j + 1])) + 1;
                    }

                    // Transposition: delete the source characters between rows i1 and i,
                    // swap the two end characters (one edit), then insert the target
                    // characters between columns j1 and j.
                    h[i + 1, j + 1] = Math.Min(h[i + 1, j + 1], h[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
                }

                lastSourceRow[si] = i;
            }

            similarity = ComputeSimilarity(h[m + 1, n + 1], m, n);
            return h[m + 1, n + 1];
        }

        /// <summary>
        /// Computes the unrestricted Damerau-Levenshtein distance, which also allows
        /// deletions and insertions between the two transposed characters.
        /// </summary>
        /// <param name="source">Source string; null is treated like an empty string.</param>
        /// <param name="target">Target string; null is treated like an empty string.</param>
        /// <param name="similarity">
        /// Output: (maxLength - distance) / maxLength, a value from 0 to 1. It is 1 when both
        /// inputs are null/empty and 0 when exactly one of them is.
        /// </param>
        /// <param name="isCaseSensitive">True to compare characters case-sensitively.</param>
        /// <returns>The edit distance between the source and target strings.</returns>
        /// <remarks>
        /// An easier-to-read variant of <see cref="DamerauLevenshteinDistance"/>: it looks up
        /// the previous occurrences with <see cref="String.LastIndexOf(Char, Int32, Int32)"/>
        /// in every cell instead of tracking them incrementally.
        /// </remarks>
        public static Int32 EZDamerauLevenshteinDistance(String source, String target, out Double similarity, Boolean isCaseSensitive = false)
        {
            Int32 trivialDistance;
            if (TryGetTrivialDistance(source, target, out trivialDistance, out similarity))
            {
                return trivialDistance;
            }

            String src = NormalizeCase(source, isCaseSensitive);
            String dst = NormalizeCase(target, isCaseSensitive);
            Int32 m = src.Length;
            Int32 n = dst.Length;
            Int32[,] h = CreateDistanceMatrix(m, n);

            for (Int32 i = 1; i <= m; i++)
            {
                Char si = src[i - 1];
                for (Int32 j = 1; j <= n; j++)
                {
                    Char dj = dst[j - 1];
                    if (si == dj)
                    {
                        h[i, j] = h[i - 1, j - 1];
                    }
                    else
                    {
                        h[i, j] = Math.Min(h[i - 1, j - 1], Math.Min(h[i - 1, j], h[i, j - 1])) + 1;
                    }

                    if (i > 1 && j > 1)
                    {
                        // 0-based index of the last dj in src[0..i-2]; -1 when there is none.
                        Int32 i1 = src.LastIndexOf(dj, i - 2, i - 1);
                        if (i1 != -1)
                        {
                            // 0-based index of the last si in dst[0..j-2]; -1 when there is none.
                            Int32 j1 = dst.LastIndexOf(si, j - 2, j - 1);
                            if (j1 != -1)
                            {
                                // Delete the source characters strictly between i1 and i - 1,
                                // swap the two end characters (one edit), then insert the
                                // target characters strictly between j1 and j - 1.
                                h[i, j] = Math.Min(h[i, j], h[i1, j1] + (i - i1 - 2) + 1 + (j - j1 - 2));
                            }
                        }
                    }
                }
            }

            similarity = ComputeSimilarity(h[m, n], m, n);
            return h[m, n];
        }

        /// <summary>
        /// Handles the cases in which at least one input is null or empty.
        /// </summary>
        /// <param name="source">Source string, may be null.</param>
        /// <param name="target">Target string, may be null.</param>
        /// <param name="distance">Output: the distance when the method returns true, otherwise 0.</param>
        /// <param name="similarity">Output: the similarity when the method returns true, otherwise 0.</param>
        /// <returns>True when the result was determined without running the algorithm.</returns>
        private static Boolean TryGetTrivialDistance(String source, String target, out Int32 distance, out Double similarity)
        {
            Boolean sourceIsEmpty = String.IsNullOrEmpty(source);
            Boolean targetIsEmpty = String.IsNullOrEmpty(target);

            if (sourceIsEmpty && targetIsEmpty)
            {
                distance = 0;
                similarity = 1;
                return true;
            }

            if (sourceIsEmpty)
            {
                distance = target.Length;
                similarity = 0;
                return true;
            }

            if (targetIsEmpty)
            {
                distance = source.Length;
                similarity = 0;
                return true;
            }

            distance = 0;
            similarity = 0;
            return false;
        }

        /// <summary>
        /// Returns the string unchanged for case-sensitive comparison, otherwise its
        /// invariant-culture lower-case form.
        /// </summary>
        private static String NormalizeCase(String value, Boolean isCaseSensitive)
        {
            return isCaseSensitive ? value : value.ToLowerInvariant();
        }

        /// <summary>
        /// Creates a (rows + 1) x (columns + 1) matrix whose first column holds 0..rows and
        /// whose first row holds 0..columns (the distances from and to the empty prefix).
        /// </summary>
        private static Int32[,] CreateDistanceMatrix(Int32 rows, Int32 columns)
        {
            Int32[,] matrix = new Int32[rows + 1, columns + 1];
            for (Int32 i = 0; i <= rows; i++)
            {
                matrix[i, 0] = i;
            }
            for (Int32 j = 1; j <= columns; j++)
            {
                matrix[0, j] = j;
            }
            return matrix;
        }

        /// <summary>
        /// Converts a distance into the similarity (maxLength - distance) / maxLength.
        /// Callers only use it when both lengths are non-zero.
        /// </summary>
        private static Double ComputeSimilarity(Int32 distance, Int32 sourceLength, Int32 targetLength)
        {
            Int32 maxLength = Math.Max(sourceLength, targetLength);
            return ((Double)(maxLength - distance)) / maxLength;
        }
    }
}
 

     

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值