下面是一个面试中出现的算法题目:给定两个字符串,用第二个字符串对第一个字符串做split
实现方式有多重,如何做到效率最优,大家可以发挥各自的创想。
下面算法在解决两个大字符串搜索问题,具有较好的效率。通过比较可以看出效率差异。
package com.uu.oxgen.tool;
import com.alibaba.fastjson.JSON;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class SplitStringUtil {
public static void main(String[] args) {
SplitStringUtil util = new SplitStringUtil();
String[] strArr = new String[] {
"alipayshanghai",
"aaaahacccccdlaaaaaaahaccccssssaaaaaahac",
"ccccdlcdlahasssssaaassssaaaaaahaccccsshaahacssssaaaaaahssssaaaaaahaccccssss",
"aaaaaahaccccsshaahacssssaaaaaahssssaaaaaahaccccccsshaahacssssaaaaaaccssssaaaaaahacccccdlassaaaaaahaccccssssaaaa",
};
String seperator = "ssssaaaaaahac";
// seperator = "ha";
// util.compareTimeCost(strArr, seperator, 1);
util.compareTimeCost(strArr, seperator, 1000);
util.compareTimeCost(strArr, seperator, 10000);
util.compareTimeCost(strArr, seperator, 100000);
util.compareTimeCost(strArr, seperator, 1000000);
util.compareTimeCost(strArr, seperator, 10000000);
// util.compareTimeCost(strArr, seperator, 100000000);
}
private void compareTimeCost(String[] strArr, String seperator, int loopCount) {
System.out.println(String.format("calculate %s times: ", loopCount));
int loop;
long startTime;
String[] result = null;
int slength = strArr.length;
// 方案一耗时计算
startTime = System.currentTimeMillis();
loop = 0;
while (loop ++ < loopCount) {
result = splitFun_1(strArr[loop%slength], seperator);
}
System.out.println(String.format("splitFun_1 cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
// 方案二耗时计算
startTime = System.currentTimeMillis();
loop = 0;
while (loop ++ < loopCount) {
result = splitFun_2(strArr[loop%slength], seperator);
}
System.out.println(String.format("splitFun_2 cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
// 方案三耗时计算
startTime = System.currentTimeMillis();
loop = 0;
while (loop ++ < loopCount) {
result = splitFun_3(strArr[loop%slength], seperator);
}
System.out.println(String.format("splitFun_3 cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
// 方案四耗时计算
startTime = System.currentTimeMillis();
loop = 0;
while (loop ++ < loopCount) {
result = strArr[loop%slength].split(seperator, -1);
}
System.out.println(String.format("split cost time: %s, result = %s", System.currentTimeMillis() - startTime, Arrays.asList(result)));
}
/**************************************方案一****************************************/
/**
* 字符串分割
*
* @param str
* @param separator
* @return
*/
public String[] splitFun_1(String str, String separator) {
List<String> results = new ArrayList();
if (str == null) {
return null;
}
if (str.length() == 0) {
return new String[] {""};
}
if (separator == null || separator.length() == 0) {
return new String[] {str};
}
char[] strArr = str.toCharArray();
char[] seperatorArr = separator.toCharArray();
int strLength = str.length();
int separatorLength = separator.length();
//各字符的序号,如果存在连续字符,则连续相同字符的索引等于最前面的字符索引
int[] skipNumArr = calculateSkipNumbs(seperatorArr);
int begin = 0;
char firstChar = seperatorArr[0];
int skipNum;
for (int i = 0; i < strLength; ) {
if (firstChar == strArr[i]) {
skipNum = checkMatch(strArr, i, i + separatorLength, seperatorArr, skipNumArr);
if (skipNum == separatorLength) {
results.add(str.substring(begin, i));
begin = i + separatorLength;
}
i = i + (skipNum > 1 ? skipNum - 1 : 1);
} else {
i++;
}
}
results.add(str.substring(begin, strLength));
return results.toArray(new String[results.size()]);
}
/**
* 计算跳跃步长
* 避免来回重复比较,特定情况下可以跳过已经检索过的字符
*
* @param seperatorArr
* @return
*/
private int[] calculateSkipNumbs(char[] seperatorArr) {
int separatorLength = seperatorArr.length;
int[] diffIndexArr = new int[separatorLength];
diffIndexArr[0] = 1;
int n = 1;// 前面n个字符组成一组,并且后面重复出现这种组,记录组的长度
boolean duplicate = false;
for (int i = 1; i < separatorLength; ) {
int nextGroupEndIndex = i + n;
int end = nextGroupEndIndex > separatorLength ? separatorLength : nextGroupEndIndex;
if (checkNextCharsMatchGroup(seperatorArr, i, n)) {
while (i < end) {
diffIndexArr[i] = i + 1;
i ++;
}
duplicate = true;
continue;
} else {
if (duplicate) {
while (i < end) {
diffIndexArr[i] = n;
i ++;
}
duplicate = false;
} else {
diffIndexArr[i] = ++ i;
}
n = i;
}
}
// System.out.println(String.format("skipArr: %s", JSON.toJSONString(diffIndexArr)));
return diffIndexArr;
}
/**
* 检查分割字符接下来的n个字符字符是否等于字符组
*
* @param seperatorArr 分割字符串
* @param start 比较起始位
* @param n 字符组的长度
* @return
*/
private boolean checkNextCharsMatchGroup(char[] seperatorArr, int start, int n) {
if (start + n > seperatorArr.length) {
return false;
}
int end = start + n;
for (int i = start, j = 0; i < end; i ++, j ++) {
if (seperatorArr[j] != seperatorArr[i]) {
return false;
}
}
return true;
}
/**
* 检查两个字符串是否相等
*
* @param srcArr
* @param start
* @param end
* @param seperatorArr
* @param skipNumArr 各字符的序号,如果存在连续字符,则连续相同字符的索引等于最前面的字符索引
* @return
*/
private int checkMatch(char[] srcArr, int start, int end, char[] seperatorArr, int[] skipNumArr) {
if (end > srcArr.length) {
end = srcArr.length;
}
int j = 1;
for (int i = start + 1; i < end; i++, j++) {
if (srcArr[i] != seperatorArr[j]) {
return skipNumArr[j];
}
}
return skipNumArr[end - start - 1];
}
/**************************************方案二****************************************/
/**
* 字符串分割
*
* @param str
* @param separator
* @return
*/
public String[] splitFun_2(String str, String separator) {
List<String> results = new ArrayList();
char[] strArr = str.toCharArray();
char[] seperatorArr = separator.toCharArray();
int strLength = str.length();
int separatorLength = separator.length();
int begin = 0;
char firstChar = seperatorArr[0];
for (int i = 0; i < strLength; i++) {
if (firstChar == strArr[i] && checkMatch(strArr, i, i + separatorLength, seperatorArr)) {
results.add(str.substring(begin, i));
begin = i + separatorLength;
}
}
if (begin < strLength) {
results.add(str.substring(begin, strLength));
} else if (begin == strLength) {
results.add("");
}
return results.toArray(new String[results.size()]);
}
/**
* 检查两个字符串是否相等
*
* @param srcArr
* @param start
* @param end
* @param seArr
* @return
*/
private boolean checkMatch(char[] srcArr, int start, int end, char[] seArr) {
if (end > srcArr.length) {
return false;
}
for (int j = 1, i = start + 1; i < end; i++, j++) {
if (srcArr[i] != seArr[j]) {
return false;
}
}
return true;
}
/**************************************方案三****************************************/
/**
* 字符串分割
*
* @param str
* @param separator
* @return
*/
public String[] splitFun_3(String str, String separator) {
List<String> results = new ArrayList();
int index = str.indexOf(separator);
while (index > 0) {
results.add(str.substring(0, index));
str = str.substring(index + separator.length());
index = str.indexOf(separator);
}
results.add(str);
return results.toArray(new String[results.size()]);
}
}
执行结果如下:
calculate 1000 times:
splitFun_1 cost time: 20, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_2 cost time: 25, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_3 cost time: 36, result = [hasssshaahacssssaaaaaah, ccc, ...
split cost time: 27, result = [hasssshaahacssssaaaaaah, ccc, ...
calculate 10000 times:
splitFun_1 cost time: 83, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_2 cost time: 67, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_3 cost time: 40, result = [aaaahacccccdlaaaaaaahacccc, ]
split cost time: 62, result = [aaaahacccccdlaaaaaaahacccc, ]
calculate 100000 times:
splitFun_1 cost time: 270, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_2 cost time: 299, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_3 cost time: 433, result = [hasssshaahacssssaaaaaah, ccc, ...
split cost time: 304, result = [hasssshaahacssssaaaaaah, ccc, ...
calculate 1000000 times:
splitFun_1 cost time: 1496, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_2 cost time: 1721, result = [aaaahacccccdlaaaaaaahacccc, ]
splitFun_3 cost time: 1709, result = [aaaahacccccdlaaaaaaahacccc, ]
split cost time: 1329, result = [aaaahacccccdlaaaaaaahacccc, ]
calculate 10000000 times:
splitFun_1 cost time: 11262, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_2 cost time: 17799, result = [hasssshaahacssssaaaaaah, ccc, ...
splitFun_3 cost time: 16961, result = [hasssshaahacssssaaaaaah, ccc, ...
split cost time: 14165, result = [hasssshaahacssssaaaaaah, ccc, ...
Process finished with exit code 0