基于 Needleman-Wunsch 算法的文本相关性计算

        // 计算字符串(数组)A和B(数组)的相关性:
        static float Correlation(List<string> A, List<string> B)
        {
            List<string> lcs = NeedlemanWunsch(A, B);// 调用Needleman-Wunsch算法子函数
            float x = (float)lcs.Count;
            float corA = x / (float)A.Count;
            float corB = x / (float)B.Count;
            return (corA * corB);
        }

        static List<String> CreateStringList(string s)
        {
            List<String> lst = new List<string>();
            string[] arr = s.Split(Separators, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i < arr.Length; ++i)
                lst.Add(arr[i]);
            return lst;
        }

        // 采用Needleman-Wunsch算法计算字符串A和B的最长公共子序列
        static List<string> NeedlemanWunsch(List<string> A, List<string> B)
        {
            int M = A.Count;
            int N = B.Count;
            List<string>[,] T = new List<string>[M + 1, N + 1];
            for (int i = 0; i <= M; ++i)
            {
                T[i, 0] = new List<string>();
            }
            for (int j = 1; j <= N; ++j)
            {
                T[0, j] = new List<string>();
            }
            for (int k = 1; k <= Math.Max(M, N); ++k)
            {
                if (k <= M)
                {
                    // 计算 T[k,*]
                    for (int n = Math.Min(k, N); n <= N; ++n)
                    {
                        if(T[k, n] == null)
                            T[k, n] = LCS(A[k - 1], B[n - 1], T[k, n - 1], T[k - 1, n], T[k - 1, n - 1]);
                    }
                }
                if (k <= N)
                {
                    // 计算 T[*,k]
                    for (int m = Math.Min(k, M); m <= M; ++m)
                    {
                        if(T[m, k] == null)
                            T[m, k] = LCS(A[m - 1], B[k - 1], T[m, k - 1], T[m - 1, k], T[m - 1, k - 1]);
                    }
                }
            }
            System.Diagnostics.Debug.Assert(T[M, N].Count <= Math.Max(A.Count, B.Count));
            return T[M, N];
        }

        // 计算单步LCS (如何处理长度相等的情况?)
        static List<string> LCS(string a, string b, List<string> s1, List<string> s2, List<string> s3)
        {
            List<string> lcs = null;

            if (s1.Count > s2.Count)
                lcs = s1;
            else
                lcs = s2;

            if (a != b)
            {
                if (s3.Count > lcs.Count)
                    lcs = new List<string>(s3); // 拷贝而非引用
                else
                    lcs = new List<string>(lcs); // 拷贝而非引用
            }
            else
            {
                if (s3.Count + 1 > lcs.Count)
                {
                    lcs = new List<string>(s3); // 拷贝而非引用
                    lcs.Insert(lcs.Count, a);
                }
                else
                    lcs = new List<string>(lcs); // 拷贝而非引用
            }

            return lcs;
        }

        static char[] Separators = new char[] { ',', '~', '!', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-',
                                                 '+', '=', '{', '}', '[', ']', '|', '//', ':', ';', '/"', '/'',
                                                 '<', '>', ',', '.', '?', '/', '/t', '/n', '/r', '`', '#', ' '};

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值