【算法&数据结构体系篇class27】：KMP算法

本文链接：https://blog.csdn.net/studyday1/article/details/130150766

一、KMP算法

假设字符串str长度为N，字符串match长度为M，M <= N

想确定str中是否有某个子串是等于match的。

时间复杂度O(N)

KMP算法：用来解决该类问题——判断字符串str中是否存在等于match的子串，并返回该子串首字符的索引位置；时间复杂度优化到O(N)。

二、KMP算法核心

1）如何理解next数组

2）如何利用next数组加速匹配过程，优化时的两个实质！

三、代码演示

package class27;

/**
 * Knuth-Morris-Pratt (KMP) substring search.
 *
 * Given a text {@code str} of length N and a pattern {@code match} of length M
 * (M <= N), finds the index of the first character of the first occurrence of
 * {@code match} inside {@code str}, or -1 when there is none.
 *
 * The two ideas behind the algorithm:
 * 1) the "next" array: for every pattern position i, the length of the longest
 *    proper prefix of pattern[0..i-1] that is also a suffix of it;
 * 2) using that array to skip alignments that are already known to fail, so the
 *    text index never moves backwards and the search runs in O(N).
 */
public class KMP {

    /**
     * Returns the index in {@code s1} of the first occurrence of {@code s2}.
     *
     * @param s1 the text to search in
     * @param s2 the pattern to search for
     * @return the start index of the first match, or -1 when either argument is
     *         null, {@code s2} is empty, {@code s2} is longer than {@code s1},
     *         or no match exists
     */
    public static int getIndexOf(String s1, String s2) {
        // Invalid input: null, empty pattern, or a pattern longer than the text.
        if (s1 == null || s2 == null || s2.length() == 0 || s1.length() < s2.length()) {
            return -1;
        }

        char[] c1 = s1.toCharArray();
        char[] c2 = s2.toCharArray();

        // O(M), where M is the pattern length (M <= N).
        // The next array tells us where to resume in the pattern after a mismatch,
        // which is what removes the redundant comparisons.
        int[] next = getNextArray(c2);

        int x = 0; // current index in the text c1
        int y = 0; // current index in the pattern c2

        // O(N): x never decreases, and y can only fall back as far as it advanced.
        while (x < c1.length && y < c2.length) {
            if (c1[x] == c2[y]) {
                // Characters match: advance both cursors.
                x++;
                y++;
            } else if (next[y] == -1) {
                // Mismatch at pattern position 0 (the only position whose next
                // value is -1): nothing left to fall back to, move on in the text.
                x++;
            } else {
                // Mismatch further in: reuse the longest prefix that is also a
                // suffix of what has matched so far, and retry against c1[x].
                y = next[y];
            }
        }

        // If y ran off the pattern, the whole pattern matched and x sits just past
        // the match, so the match starts at x - y. Otherwise the text was exhausted.
        return y == c2.length ? x - y : -1;
    }

    /**
     * Builds the KMP "next" (failure) table for a pattern.
     *
     * {@code next[i]} depends only on {@code s2[0..i-1]}: it is the length of the
     * longest prefix of that range that equals a suffix of that range, where the
     * prefix/suffix may not cover the whole range. By convention
     * {@code next[0] = -1} (nothing to the left) and {@code next[1] = 0}.
     *
     * Example: for "abcedabci", next[8] = 3 ("abc") and next[7] = 2 ("ab").
     *
     * Complexity: in every loop iteration either {@code i} or {@code i - cn}
     * strictly increases and neither ever decreases; both are bounded by the
     * pattern length, so the loop runs at most about 2M times, i.e. O(M).
     *
     * @param s2 the pattern characters
     * @return the next table, the same length as {@code s2} (empty for an empty pattern)
     */
    public static int[] getNextArray(char[] s2) {
        // An empty pattern has an empty table (previously this indexed next[0]
        // on a zero-length array and threw).
        if (s2.length == 0) {
            return new int[0];
        }
        if (s2.length == 1) {
            return new int[] {-1};
        }
        int[] next = new int[s2.length];
        next[0] = -1;
        next[1] = 0;
        int i = 2;  // position whose next value is being computed
        int cn = 0; // index of the character compared against s2[i - 1];
                    // it also equals next[i - 1], the current candidate prefix length
        while (i < next.length) {
            if (s2[i - 1] == s2[cn]) {
                // The candidate prefix extends by one character.
                next[i++] = ++cn;
            } else if (cn > 0) {
                // Fall back to the next shorter candidate prefix and retry.
                cn = next[cn];
            } else {
                // No prefix of s2[0..i-1] is also a suffix.
                next[i++] = 0;
            }
        }
        return next;
    }

    /**
     * Test helper: builds a random string of length 1..size over the first
     * {@code possibilities} lowercase letters.
     */
    public static String getRandomString(int possibilities, int size) {
        char[] ans = new char[(int) (Math.random() * size) + 1];
        for (int i = 0; i < ans.length; i++) {
            ans[i] = (char) ((int) (Math.random() * possibilities) + 'a');
        }
        return String.valueOf(ans);
    }

    /** Randomized check of {@link #getIndexOf} against {@link String#indexOf(String)}. */
    public static void main(String[] args) {
        int possibilities = 5;
        int strSize = 20;
        int matchSize = 5;
        int testTimes = 5000000;
        System.out.println("test begin");
        for (int i = 0; i < testTimes; i++) {
            String str = getRandomString(possibilities, strSize);
            String match = getRandomString(possibilities, matchSize);
            if (getIndexOf(str, match) != str.indexOf(match)) {
                System.out.println("Oops!");
            }
        }
        System.out.println("test finish");
    }
}

四、题目一：二叉树是否包含着另外一个二叉树

给定两棵二叉树的头节点head1和head2

想知道head1中是否有某个子树的结构和head2完全一样

package class27;

import java.util.ArrayList;

/**
 * Given the roots of two binary trees, decides whether the first tree contains
 * a subtree that is identical (same shape and same values, all the way down to
 * the leaves) to the second tree.
 *
 * Two approaches are provided:
 * 1) plain recursion that tries every node of the big tree as a candidate root;
 * 2) serialize both trees in pre-order (recording null children explicitly) and
 *    search for the small sequence inside the big one with KMP.
 */
public class TreeEqual {

    /** Binary tree node holding an int value. */
    public static class Node {
        public int value;
        public Node left;
        public Node right;

        public Node(int v) {
            value = v;
        }
    }

    /**
     * Approach 1: recursive check.
     *
     * @param big   root of the tree to search in (may be null)
     * @param small root of the tree to look for (may be null)
     * @return true if some subtree of {@code big} is identical to {@code small};
     *         an empty {@code small} is contained in every tree
     */
    public static boolean containsTree1(Node big, Node small) {
        // The empty tree is contained in any tree.
        if (small == null) {
            return true;
        }
        // A non-empty tree cannot be contained in an empty one.
        if (big == null) {
            return false;
        }
        // Either the tree rooted here matches, or a match lives in a child subtree.
        if (sameTree(big, small)) {
            return true;
        }
        return containsTree1(big.left, small) || containsTree1(big.right, small);
    }

    /**
     * Returns true when the two trees have exactly the same shape and values.
     */
    public static boolean sameTree(Node big, Node small) {
        // If either side is null they are equal only when both are null.
        if (big == null || small == null) {
            return big == small;
        }
        // Equal values alone are not enough: both child subtrees must match too.
        if (big.value != small.value) {
            return false;
        }
        return sameTree(big.left, small.left) && sameTree(big.right, small.right);
    }

    /**
     * Approach 2: pre-order serialization + KMP.
     *
     * @param big   root of the tree to search in (may be null)
     * @param small root of the tree to look for (may be null)
     * @return true if the pre-order sequence of {@code small} occurs as a
     *         contiguous run inside the pre-order sequence of {@code big}
     */
    public static boolean containsTree2(Node big, Node small) {
        if (small == null) {
            return true;
        }
        if (big == null) {
            return false;
        }

        // Pre-order sequences; null entries mark missing children.
        String[] bstr = pre(big).toArray(new String[0]);
        String[] sstr = pre(small).toArray(new String[0]);

        return getIndex(bstr, sstr) != -1;
    }

    /**
     * KMP search over arrays of (possibly null) strings.
     *
     * @param bstr the sequence to search in
     * @param sstr the sequence to search for
     * @return the start index of the first occurrence of {@code sstr} in
     *         {@code bstr}, or -1 when either array is null, {@code sstr} is
     *         empty, {@code sstr} is longer than {@code bstr}, or there is no match
     */
    public static int getIndex(String[] bstr, String[] sstr) {
        // An empty pattern is treated as invalid (it previously crashed while
        // building the next table). A non-empty pattern that fits also implies
        // bstr is non-empty.
        if (bstr == null || sstr == null || sstr.length < 1 || bstr.length < sstr.length) {
            return -1;
        }

        int[] next = getNext(sstr);

        int b = 0; // cursor in bstr
        int s = 0; // cursor in sstr
        while (b < bstr.length && s < sstr.length) {
            if (isEqual(bstr[b], sstr[s])) {
                // Elements match: advance both cursors.
                b++;
                s++;
            } else if (next[s] == -1) {
                // Mismatch at the first pattern element: move on in bstr.
                b++;
            } else {
                // Fall back inside the pattern and retry against bstr[b].
                s = next[s];
            }
        }
        // If s ran off the pattern, the match ends just before b and so starts at
        // b - s; otherwise bstr was exhausted without a match.
        return s == sstr.length ? b - s : -1;
    }

    /**
     * Returns the pre-order serialization of the tree, with a null entry for
     * every missing child.
     */
    public static ArrayList<String> pre(Node head) {
        ArrayList<String> res = new ArrayList<>();
        getPre(head, res);
        return res;
    }

    /** Appends the pre-order serialization of {@code head} to {@code res}. */
    public static void getPre(Node head, ArrayList<String> res) {
        if (head == null) {
            res.add(null);
        } else {
            res.add(String.valueOf(head.value));
            getPre(head.left, res);
            getPre(head.right, res);
        }
    }

    /**
     * Builds the KMP next table for a sequence of (possibly null) strings.
     * {@code next[i]} is the length of the longest proper prefix of
     * {@code str[0..i-1]} that is also a suffix of it; {@code next[0]} is -1.
     *
     * @return the next table, the same length as {@code str} (empty for an empty input)
     */
    public static int[] getNext(String[] str) {
        if (str.length == 0) {
            return new int[0];
        }
        if (str.length == 1) {
            return new int[] {-1};
        }
        int[] next = new int[str.length];
        next[0] = -1;
        next[1] = 0;
        int i = 2;  // position whose next value is being computed
        int cn = 0; // end of the candidate prefix, compared against str[i - 1]
        while (i < next.length) {
            if (isEqual(str[i - 1], str[cn])) {
                // Candidate prefix extends by one element.
                next[i++] = ++cn;
            } else if (cn > 0) {
                // Try the next shorter candidate prefix.
                cn = next[cn];
            } else {
                // No matching prefix at all.
                next[i++] = 0;
            }
        }
        return next;
    }

    /**
     * Null-safe string equality; needed because the serialization uses null to
     * mark missing children.
     */
    public static boolean isEqual(String a, String b) {
        if (a == null) {
            return b == null;
        }
        return a.equals(b);
    }

    /** Test helper: builds a random tree of at most {@code maxLevel} levels. */
    public static Node generateRandomBST(int maxLevel, int maxValue) {
        return generate(1, maxLevel, maxValue);
    }

    /** Test helper: recursively builds a random subtree starting at {@code level}. */
    public static Node generate(int level, int maxLevel, int maxValue) {
        if (level > maxLevel || Math.random() < 0.5) {
            return null;
        }
        Node head = new Node((int) (Math.random() * maxValue));
        head.left = generate(level + 1, maxLevel, maxValue);
        head.right = generate(level + 1, maxLevel, maxValue);
        return head;
    }

    /** Prints the node values in pre-order, separated by spaces. */
    public static void prePrint(Node head) {
        if (head == null) {
            return;
        }
        System.out.print(head.value + " ");
        prePrint(head.left);
        prePrint(head.right);
    }

    /** Randomized cross-check of the two approaches. */
    public static void main(String[] args) {
        int bigTreeLevel = 7;
        int smallTreeLevel = 4;
        int nodeMaxValue = 5;
        int testTimes = 100000;
        System.out.println("test begin");
        for (int i = 0; i < testTimes; i++) {
            Node big = generateRandomBST(bigTreeLevel, nodeMaxValue);
            Node small = generateRandomBST(smallTreeLevel, nodeMaxValue);
            boolean ans1 = containsTree1(big, small);
            boolean ans2 = containsTree2(big, small);
            if (ans1 != ans2) {
                System.out.println("Oops!");
            }
        }
        System.out.println("test finish!");
    }
}

五、题目二：判断str1和str2是否是旋转字符串

判断str1和str2是否是旋转字符串

package class27;

/**
 * Decides whether two strings are rotations of each other.
 *
 * A rotation moves some leading characters of a string to its end, e.g. the
 * rotations of "123456" are:
 *   123456
 *   234561
 *   345612
 *   456123 ...
 * The relation is symmetric: "123456" is in turn a rotation of each of those.
 *
 * Approach: concatenate one string with itself (a + a). Every rotation of a
 * appears as a substring of that doubled string ("123456123456" contains
 * "234561", "345612", ...), so the problem reduces to a KMP substring search.
 */
public class IsRotation {

    /**
     * @param a first string
     * @param b second string
     * @return true if {@code b} is a rotation of {@code a}; false when either is
     *         null or the lengths differ. Two empty strings are considered
     *         rotations of each other.
     */
    public static boolean isRotation(String a, String b) {
        if (a == null || b == null || a.length() != b.length()) {
            return false;
        }
        // Both are empty here (lengths are equal): trivially a rotation.
        // Handled up front because the KMP search rejects an empty pattern.
        if (a.isEmpty()) {
            return true;
        }

        // Every rotation of a is a substring of a + a.
        String aa = a + a;
        return getIndex(aa, b) != -1;
    }

    /**
     * KMP search: index of the first occurrence of {@code b} inside {@code aa}.
     *
     * @return the start index of the first match, or -1 when either argument is
     *         null, {@code b} is empty, or there is no match
     */
    public static int getIndex(String aa, String b) {
        if (aa == null || b == null || b.isEmpty()) {
            return -1;
        }
        char[] c1 = aa.toCharArray();
        char[] c2 = b.toCharArray();
        int ax = 0; // cursor in the text
        int by = 0; // cursor in the pattern

        int[] next = getNextArray(c2);
        while (ax < c1.length && by < c2.length) {
            if (c1[ax] == c2[by]) {
                ax++;
                by++;
            } else if (next[by] == -1) {
                // Mismatch at pattern position 0: advance in the text.
                ax++;
            } else {
                // Fall back inside the pattern; next[by] is always < by, so this
                // makes progress.
                by = next[by];
            }
        }
        // Pattern exhausted means a full match ending just before ax.
        return by == c2.length ? ax - by : -1;
    }

    /**
     * Builds the KMP next table: {@code next[i]} is the length of the longest
     * prefix of {@code c[0..i-1]} that equals a suffix of it, where the prefix
     * may not cover the whole range. {@code next[0]} is -1 by convention.
     *
     * @return the next table, the same length as {@code c} (empty for an empty input)
     */
    public static int[] getNextArray(char[] c) {
        if (c.length == 0) {
            return new int[0];
        }
        // A single character has nothing to its left.
        if (c.length == 1) {
            return new int[] {-1};
        }

        int[] next = new int[c.length];
        next[0] = -1;
        // Only one character precedes position 1 and the prefix may not cover it
        // entirely, so the length is 0. (A value of 1 would make the search fall
        // back from position 1 to position 1 forever.)
        next[1] = 0;
        int index = 2;
        int cn = 0;
        while (index < next.length) {
            if (c[index - 1] == c[cn]) {
                // Extend the candidate prefix and move to the next position.
                next[index++] = ++cn;
            } else if (cn > 0) {
                // Try the next shorter candidate prefix.
                cn = next[cn];
            } else {
                next[index++] = 0;
            }
        }
        return next;
    }

    public static void main(String[] args) {
        String str1 = "yunzuocheng";
        String str2 = "zuochengyun";
        System.out.println(isRotation(str1, str2));
    }
}