基于 Needleman-Wunsch 算法的文本相关性计算

最新推荐文章于 2024-01-02 14:21:08 发布

Zevin

最新推荐文章于 2024-01-02 14:21:08 发布

阅读量990

点赞数

分类专栏：算法导论　文章标签：算法 float string null

算法导论专栏收录该内容

18 篇文章 0 订阅

订阅专栏

// 计算字符串（数组）A和B（数组）的相关性：
        /// <summary>
        /// Scores how strongly two token lists are related, based on their longest
        /// common subsequence (LCS).
        /// </summary>
        /// <param name="A">First token list.</param>
        /// <param name="B">Second token list.</param>
        /// <returns>
        /// (|LCS| / |A|) * (|LCS| / |B|). The value is 0 when the lists share no
        /// tokens in order (or when either list is empty) and 1 when they are identical.
        /// </returns>
        static float Correlation(List<string> A, List<string> B)
        {
            // An empty list has no common subsequence with anything. Without this
            // guard the ratios below evaluate 0/0 and the method returns NaN.
            if (A.Count == 0 || B.Count == 0)
            {
                return 0f;
            }

            List<string> lcs = NeedlemanWunsch(A, B);
            float common = (float)lcs.Count;
            float shareOfA = common / (float)A.Count;
            float shareOfB = common / (float)B.Count;
            return shareOfA * shareOfB;
        }

        /// <summary>
        /// Splits <paramref name="s"/> on the characters in <c>Separators</c> and
        /// returns the non-empty pieces, in order, as a list.
        /// </summary>
        /// <param name="s">Text to tokenize.</param>
        /// <returns>A new list holding the tokens of <paramref name="s"/>.</returns>
        static List<String> CreateStringList(string s)
        {
            string[] tokens = s.Split(Separators, StringSplitOptions.RemoveEmptyEntries);
            return new List<string>(tokens);
        }

        /// <summary>
        /// Computes a longest common subsequence (LCS) of the token lists
        /// <paramref name="A"/> and <paramref name="B"/> by dynamic programming.
        /// </summary>
        /// <param name="A">First token list.</param>
        /// <param name="B">Second token list.</param>
        /// <returns>
        /// A new list containing one longest common subsequence, in order. The list
        /// is empty when either input is empty or the inputs share no tokens.
        /// </returns>
        /// <remarks>
        /// Only an integer length table is stored; the subsequence is rebuilt by a
        /// single traceback instead of keeping a copied list in every table cell.
        /// When several subsequences have the maximum length, the traceback takes a
        /// diagonal (match) step only if it is strictly longer than both neighbours,
        /// and otherwise moves left only if the left cell is strictly longer than the
        /// upper cell.
        /// </remarks>
        static List<string> NeedlemanWunsch(List<string> A, List<string> B)
        {
            int M = A.Count;
            int N = B.Count;

            // length[i, j] is the LCS length of the first i tokens of A and the
            // first j tokens of B. Row 0 and column 0 stay at their default of 0.
            int[,] length = new int[M + 1, N + 1];
            for (int i = 1; i <= M; ++i)
            {
                for (int j = 1; j <= N; ++j)
                {
                    int best = Math.Max(length[i, j - 1], length[i - 1, j]);
                    if (A[i - 1] == B[j - 1] && length[i - 1, j - 1] + 1 > best)
                    {
                        best = length[i - 1, j - 1] + 1;
                    }
                    length[i, j] = best;
                }
            }

            // Trace back from the bottom-right cell, collecting matched tokens in
            // reverse order.
            List<string> lcs = new List<string>(length[M, N]);
            int row = M;
            int col = N;
            while (row > 0 && col > 0)
            {
                int left = length[row, col - 1];
                int up = length[row - 1, col];
                if (A[row - 1] == B[col - 1] && length[row - 1, col - 1] + 1 > Math.Max(left, up))
                {
                    lcs.Add(A[row - 1]);
                    --row;
                    --col;
                }
                else if (left > up)
                {
                    --col;
                }
                else
                {
                    --row;
                }
            }
            lcs.Reverse();

            // A common subsequence can never be longer than the shorter input.
            System.Diagnostics.Debug.Assert(lcs.Count <= Math.Min(M, N));
            return lcs;
        }

        /// <summary>
        /// Computes one cell of the LCS table from its three neighbouring cells.
        /// </summary>
        /// <param name="a">Current token of the first sequence.</param>
        /// <param name="b">Current token of the second sequence.</param>
        /// <param name="s1">First neighbour candidate; the caller passes the cell to the left.</param>
        /// <param name="s2">Second neighbour candidate; the caller passes the cell above.</param>
        /// <param name="s3">Diagonal candidate, extended by <paramref name="a"/> when the tokens match.</param>
        /// <returns>
        /// Always a fresh list (never a reference to an argument). Ties resolve as
        /// follows: <paramref name="s2"/> wins over an equally long <paramref name="s1"/>,
        /// and the longer of those two wins over an equally long diagonal result.
        /// </returns>
        static List<string> LCS(string a, string b, List<string> s1, List<string> s2, List<string> s3)
        {
            List<string> longerNeighbour = (s1.Count > s2.Count) ? s1 : s2;

            // Matching tokens: extend the diagonal only if that is strictly longer.
            if (a == b && s3.Count + 1 > longerNeighbour.Count)
            {
                List<string> extended = new List<string>(s3);
                extended.Add(a);
                return extended;
            }

            // Differing tokens: the bare diagonal is used only if strictly longer.
            if (a != b && s3.Count > longerNeighbour.Count)
            {
                return new List<string>(s3);
            }

            // Copy rather than alias, so cells never share a list instance.
            return new List<string>(longerNeighbour);
        }

        // Characters treated as token delimiters when splitting text: ASCII
        // punctuation plus space, tab, line feed and carriage return.
        // Backslash escapes ('\\', '\"', '\'', '\t', '\n', '\r') are written out
        // explicitly; a char literal with two characters such as '/t' does not compile.
        static char[] Separators = new char[] { ',', '~', '!', '@', '$', '%', '^', '&', '*', '(', ')', '_', '-',
                                                 '+', '=', '{', '}', '[', ']', '|', '\\', ':', ';', '\"', '\'',
                                                 '<', '>', '.', '?', '/', '\t', '\n', '\r', '`', '#', ' '};

Zevin

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
基于 Needleman-Wunsch 算法的文本相关性计算

// 计算字符串（数组）A和B（数组）的相关性： static float Correlation(List<string> A, List<string> B) { List<string> lcs = NeedlemanWunsch(A, B);// 调用Needleman-Wunsch算法子函数 float x = (float)lcs.Count; …
复制链接

扫一扫

专栏目录