一种跳跃重复段的字符搜索算法

下面是一个面试中出现的算法题目:给定两个字符串,用第二个字符串对第一个字符串做split

实现方式有多重,如何做到效率最优,大家可以发挥各自的创想。

下面算法在解决两个大字符串搜索问题,具有较好的效率。通过比较可以看出效率差异。

package com.uu.oxgen.tool;

import com.alibaba.fastjson.JSON;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SplitStringUtil {

    public static void main(String[] args) {
        SplitStringUtil util = new SplitStringUtil();

        String[] strArr = new String[] {
                "alipayshanghai",
                "aaaahacccccdlaaaaaaahaccccssssaaaaaahac",
                "ccccdlcdlahasssssaaassssaaaaaahaccccsshaahacssssaaaaaahssssaaaaaahaccccssss",
                "aaaaaahaccccsshaahacssssaaaaaahssssaaaaaahaccccccsshaahacssssaaaaaaccssssaaaaaahacccccdlassaaaaaahaccccssssaaaa",
        };

        String seperator = "ssssaaaaaahac";
//        seperator = "ha";
//        util.compareTimeCost(strArr, seperator, 1);
        util.compareTimeCost(strArr, seperator, 1000);
        util.compareTimeCost(strArr, seperator, 10000);
        util.compareTimeCost(strArr, seperator, 100000);
        util.compareTimeCost(strArr, seperator, 1000000);
        util.compareTimeCost(strArr, seperator, 10000000);
//        util.compareTimeCost(strArr, seperator, 100000000);
    }

    private void compareTimeCost(String[] strArr, String seperator, int loopCount) {
        System.out.println(String.format("calculate %s times: ", loopCount));
        int loop;
        long startTime;
        String[] result = null;
        int slength = strArr.length;
        // 方案一耗时计算
        startTime = System.currentTimeMillis();
        loop = 0;
        while (loop ++ < loopCount) {
            result = splitFun_1(strArr[loop%slength], seperator);
        }
        System.out.println(String.format("splitFun_1 cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
        // 方案二耗时计算
        startTime = System.currentTimeMillis();
        loop = 0;
        while (loop ++ < loopCount) {
            result = splitFun_2(strArr[loop%slength], seperator);
        }
        System.out.println(String.format("splitFun_2 cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
        // 方案三耗时计算
        startTime = System.currentTimeMillis();
        loop = 0;
        while (loop ++ < loopCount) {
            result = splitFun_3(strArr[loop%slength], seperator);
        }
        System.out.println(String.format("splitFun_3 cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
        // 方案四耗时计算
        startTime = System.currentTimeMillis();
        loop = 0;
        while (loop ++ < loopCount) {
            result = strArr[loop%slength].split(seperator, -1);
        }
        System.out.println(String.format("split cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
    }

    /**************************************方案一****************************************/

    /**
     * 字符串分割
     *
     * @param str
     * @param separator
     * @return
     */
    public String[] splitFun_1(String str, String separator) {
        List<String> results = new ArrayList();

        if (str == null) {
            return null;
        }

        if (str.length() == 0) {
            return new String[] {""};
        }

        if (separator == null || separator.length() == 0) {
            return new String[] {str};
        }

        char[] strArr = str.toCharArray();
        char[] seperatorArr = separator.toCharArray();
        int strLength = str.length();
        int separatorLength = separator.length();

        //各字符的序号,如果存在连续字符,则连续相同字符的索引等于最前面的字符索引
        int[] skipNumArr = calculateSkipNumbs(seperatorArr);

        int begin = 0;
        char firstChar = seperatorArr[0];
        int skipNum;
        for (int i = 0; i < strLength; ) {
            if (firstChar == strArr[i]) {
                skipNum = checkMatch(strArr, i, i + separatorLength, seperatorArr, skipNumArr);
                if (skipNum == separatorLength) {
                    results.add(str.substring(begin, i));
                    begin = i + separatorLength;
                }
                i = i + (skipNum > 1 ? skipNum - 1 : 1);
            } else {
                i++;
            }
        }
        results.add(str.substring(begin, strLength));
        return results.toArray(new String[results.size()]);
    }

    /**
     * 计算跳跃步长
     * 避免来回重复比较,特定情况下可以跳过已经检索过的字符
     *
     * @param seperatorArr
     * @return
     */
    private int[] calculateSkipNumbs(char[] seperatorArr) {
        int separatorLength = seperatorArr.length;
        int[] diffIndexArr = new int[separatorLength];
        diffIndexArr[0] = 1;
        int n = 1;// 前面n个字符组成一组,并且后面重复出现这种组,记录组的长度
        boolean duplicate = false;
        for (int i = 1; i < separatorLength; ) {
            int nextGroupEndIndex = i + n;
            int end = nextGroupEndIndex > separatorLength ? separatorLength : nextGroupEndIndex;
            if (checkNextCharsMatchGroup(seperatorArr, i, n)) {
                while (i < end) {
                    diffIndexArr[i] = i + 1;
                    i ++;
                }
                duplicate = true;
                continue;
            } else {
                if (duplicate) {
                    while (i < end) {
                        diffIndexArr[i] = n;
                        i ++;
                    }
                    duplicate = false;
                } else {
                    diffIndexArr[i] = ++ i;
                }
                n = i;
            }
        }
//        System.out.println(String.format("skipArr: %s", JSON.toJSONString(diffIndexArr)));
        return diffIndexArr;
    }

    /**
     * 检查分割字符接下来的n个字符字符是否等于字符组
     *
     * @param seperatorArr  分割字符串
     * @param start 比较起始位
     * @param n 字符组的长度
     * @return
     */
    private boolean checkNextCharsMatchGroup(char[] seperatorArr, int start, int n) {
        if (start + n > seperatorArr.length) {
            return false;
        }
        int end = start + n;
        for (int i = start, j = 0; i < end; i ++, j ++) {
            if (seperatorArr[j] != seperatorArr[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * 检查两个字符串是否相等
     *
     * @param srcArr
     * @param start
     * @param end
     * @param seperatorArr
     * @param skipNumArr 各字符的序号,如果存在连续字符,则连续相同字符的索引等于最前面的字符索引
     * @return
     */
    private int checkMatch(char[] srcArr, int start, int end, char[] seperatorArr, int[] skipNumArr) {
        if (end > srcArr.length) {
            end = srcArr.length;
        }
        int j = 1;
        for (int i = start + 1; i < end; i++, j++) {
            if (srcArr[i] != seperatorArr[j]) {
                return skipNumArr[j];
            }
        }
        return skipNumArr[end - start - 1];
    }

    /**************************************方案二****************************************/

    /**
     * 字符串分割
     *
     * @param str
     * @param separator
     * @return
     */
    public String[] splitFun_2(String str, String separator) {
        List<String> results = new ArrayList();

        char[] strArr = str.toCharArray();
        char[] seperatorArr = separator.toCharArray();
        int strLength = str.length();
        int separatorLength = separator.length();
        int begin = 0;
        char firstChar = seperatorArr[0];
        for (int i = 0; i < strLength; i++) {
            if (firstChar == strArr[i] && checkMatch(strArr, i, i + separatorLength, seperatorArr)) {
                results.add(str.substring(begin, i));
                begin = i + separatorLength;
            }
        }
        if (begin < strLength) {
            results.add(str.substring(begin, strLength));
        } else if (begin == strLength) {
            results.add("");
        }
        return results.toArray(new String[results.size()]);
    }

    /**
     * 检查两个字符串是否相等
     *
     * @param srcArr
     * @param start
     * @param end
     * @param seArr
     * @return
     */
    private boolean checkMatch(char[] srcArr, int start, int end, char[] seArr) {
        if (end > srcArr.length) {
            return false;
        }
        for (int j = 1, i = start + 1; i < end; i++, j++) {
            if (srcArr[i] != seArr[j]) {
                return false;
            }
        }
        return true;
    }

    /**************************************方案三****************************************/

    /**
     * 字符串分割
     *
     * @param str
     * @param separator
     * @return
     */
    public String[] splitFun_3(String str, String separator) {
        List<String> results = new ArrayList();
        int index = str.indexOf(separator);
        while (index > 0) {
            results.add(str.substring(0, index));
            str = str.substring(index + separator.length());

            index = str.indexOf(separator);
        }
        results.add(str);

        return results.toArray(new String[results.size()]);
    }

}

执行结果如下:

calculate 1000 times: 
splitFun_1 cost time: 20, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_2 cost time: 25, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_3 cost time: 36, result = [hasssshaahacssssaaaaaah, ccc, ...
split cost time: 27, result = [hasssshaahacssssaaaaaah, ccc, ...
calculate 10000 times: 
splitFun_1 cost time: 83, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_2 cost time: 67, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_3 cost time: 40, result = [aaaahacccccdlaaaaaaahacccc, ]
split cost time: 62, result = [aaaahacccccdlaaaaaaahacccc, ]
calculate 100000 times: 
splitFun_1 cost time: 270, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_2 cost time: 299, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_3 cost time: 433, result = [hasssshaahacssssaaaaaah, ccc, ...
split cost time: 304, result = [hasssshaahacssssaaaaaah, ccc, ...
calculate 1000000 times: 
splitFun_1 cost time: 1496, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_2 cost time: 1721, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_3 cost time: 1709, result = [aaaahacccccdlaaaaaaahacccc, ]
split cost time: 1329, result = [aaaahacccccdlaaaaaaahacccc, ]
calculate 10000000 times: 
splitFun_1 cost time: 11262, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_2 cost time: 17799, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_3 cost time: 16961, result = [hasssshaahacssssaaaaaah, ccc, ...
split cost time: 14165, result = [hasssshaahacssssaaaaaah, ccc, ...

Process finished with exit code 0

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值