文章目录
背景
在数据开发的过程中,处理日志内容时往往需要用到一些字符串处理方法。在Java中,除了JDK自带的字符串处理方式,一些第三方工具类(比如Apache Commons Lang、Guava等)也提供了强大的字符串处理方法。但是,当我们对大数据量的日志进行处理时,就需要考虑每种字符串处理方法在不同场景下的性能。
有哪些常用的字符串切割方法?
- 使用正则表达式进行匹配;
- JVM自带1: "".split()
- JVM自带2: indexOf()、substring()
- apache common工具类:StringUtils.split()
- guava工具类:Splitter.on().splitToList()
每种字符串切割方法的使用
以一条日志为例,我们按照逗号分隔的规则进行切分,经过处理后得到目标格式的日志:
package com.commonUtils;
import com.google.common.base.Splitter;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Micro-benchmark comparing several ways of splitting a comma-separated log line:
 * a compiled regular expression, {@code indexOf}/{@code substring},
 * Apache Commons {@code StringUtils.split} and Guava {@code Splitter}.
 *
 * <p>Each benchmark method processes {@link #test} exactly {@code count} times and prints the
 * elapsed wall-clock time in milliseconds to standard output.
 */
public class StringSplitCompare {

    /** Separator characters that need a backslash when placed inside a regex character class. */
    private static final Set<Character> charactersToEscape =
            new HashSet<Character>(Arrays.asList('[', ']', '\\', '^'));

    /** Placeholder names (without the leading '$') found by the most recent template parse. */
    private static final List<String> logFieldNames = new ArrayList<String>();

    /** Matches a '$' followed by zero or more word characters, e.g. {@code $1} or {@code $host}. */
    private static final Pattern extractVariablePattern = Pattern.compile("\\$[a-zA-Z0-9_]*");

    /**
     * Returns a regex fragment that matches {@code str} literally, whatever punctuation it
     * contains.
     */
    private static String escapeRE(String str) {
        return Pattern.quote(str);
    }

    private static final String test = "youxiaoshuo, xiaoshuoyou|,strange girl, request the next time, a good girl,a nice girl,I will be try,do you know I scared to lost everything and lost you.emm,yeah,lalala,lalal,lalal,llalala,llllllllllllllllllalalalalalalalallalalalalala,ls";

    private static final String testPattern = "$1,$2,$3,$4";

    /** Prints the result of splitting one short sample with Guava, Commons Lang and the JDK. */
    public static void testSplitter() {
        // comma positions in the sample: 2 6 9 12
        String str = "iw, ie,ts,hh,lsl";
        List<String> splitToList = Splitter.on(',').splitToList(str);
        String[] splitToArray = StringUtils.split(str, ',');
        String[] regexSplitToArray = str.split(",");
        System.out.println(splitToList.toString());
        System.out.println(Arrays.asList(splitToArray).toString());
        System.out.println(Arrays.asList(regexSplitToArray).toString());
    }

    /**
     * Regular-expression variant: matches the whole line against the pattern generated from
     * {@link #testPattern}.
     *
     * @param count number of times the line is processed
     */
    public static void regexp(int count) {
        String regex = getPatternByString(testPattern);
        Pattern compile = Pattern.compile(regex);
        long before = System.currentTimeMillis();
        for (int round = 0; round < count; round++) {
            Matcher matcher = compile.matcher(test);
            if (matcher.matches()) {
                for (int i = 1; i <= matcher.groupCount(); i++) {
                    // System.out.println(matcher.group(i));
                }
            }
        }
        System.out.println("regexp: " + (System.currentTimeMillis() - before));
    }

    /**
     * JDK variant: repeatedly locates the next comma with {@code indexOf} and drops the consumed
     * prefix with {@code substring}.
     *
     * @param count number of times the line is processed
     */
    public static void indexSub(int count) {
        long before = System.currentTimeMillis();
        for (int round = 0; round < count; round++) {
            String str = test;
            while (str.length() > 0) {
                int index = str.indexOf(",");
                if (index < 0) {
                    // System.out.println(str);
                    break;
                }
                // System.out.println(str.substring(0, index));
                str = str.substring(index + 1);
            }
        }
        System.out.println("indexSub:" + (System.currentTimeMillis() - before));
    }

    /**
     * Apache Commons variant: {@code StringUtils.split(test, ',')}.
     *
     * @param count number of times the line is processed
     */
    public static void apacheCommon(int count) {
        long before = System.currentTimeMillis();
        for (int round = 0; round < count; round++) {
            List<String> list = Arrays.asList(StringUtils.split(test, ','));
            for (String item : list) {
                // System.out.println(item);
            }
        }
        System.out.println("apacheCommon:" + (System.currentTimeMillis() - before));
    }

    /**
     * Guava variant: {@code Splitter.on(',').splitToList(test)}.
     *
     * @param count number of times the line is processed
     */
    public static void guavaCommon(int count) {
        long before = System.currentTimeMillis();
        for (int round = 0; round < count; round++) {
            List<String> list = Splitter.on(',').splitToList(test);
            for (String item : list) {
                // System.out.println(item);
            }
        }
        System.out.println("guavaCommon:" + (System.currentTimeMillis() - before));
    }

    /**
     * Translates a template such as {@code "$1,$2,$3,$4"} into a regular expression with one
     * capturing group per {@code $name} placeholder.
     *
     * <p>Literal text between placeholders is matched verbatim. A placeholder that is followed
     * by another character captures everything up to (not including) the next occurrence of
     * that character; a placeholder at the very end of the template captures the rest of the
     * input. The placeholder names are stored in {@link #logFieldNames}, replacing the names
     * from any previous call.
     *
     * @param metaPattern the template to translate
     * @return the regular expression source for {@code metaPattern}
     */
    private static String getPatternByString(String metaPattern) {
        // Start from a clean slate so repeated calls do not accumulate stale names.
        logFieldNames.clear();
        Matcher matcher = extractVariablePattern.matcher(metaPattern);
        StringBuilder parsePatternBuilder = new StringBuilder();
        int length = metaPattern.length();
        int parsedPosition = 0;
        while (matcher.find()) {
            if (parsedPosition < matcher.start()) {
                String residualPattern = metaPattern.substring(parsedPosition, matcher.start());
                parsePatternBuilder.append(escapeRE(residualPattern));
            }
            logFieldNames.add(metaPattern.substring(matcher.start() + 1, matcher.end()));
            parsedPosition = matcher.end();
            if (parsedPosition < length) {
                // The character right after the placeholder terminates the captured field.
                char splitCharacter = metaPattern.charAt(parsedPosition);
                parsePatternBuilder.append("([^");
                if (charactersToEscape.contains(splitCharacter)) {
                    parsePatternBuilder.append("\\");
                }
                parsePatternBuilder.append(splitCharacter);
                parsePatternBuilder.append("]*)");
            } else {
                // Last placeholder with nothing after it: capture the remainder of the input
                // instead of reusing the separator of an earlier placeholder.
                parsePatternBuilder.append("([\\s\\S]*)");
            }
        }
        if (parsedPosition < length) {
            // Trailing literal text gets the same escaping as the text between placeholders.
            parsePatternBuilder.append(escapeRE(metaPattern.substring(parsedPosition)));
        }
        return parsePatternBuilder.toString();
    }

    private static final int mcount1w = 1 * 10000;
    private static final int mcount10w = 10 * 10000;
    private static final int mcount100w = 100 * 10000;
    private static final int mcount1000w = 1000 * 10000;
    private static final int mcount10000w = 10000 * 10000;

    /**
     * Runs the selected benchmark at increasing iteration counts.
     *
     * <p>The numbers in the trailing comments are the author's recorded results (presumably
     * elapsed milliseconds of individual runs) and should be re-measured after any change to
     * the benchmark code.
     */
    public static void main(String[] args) {
        // guavaCommon(mcount1w); // 65 66
        // System.out.println("");
        // guavaCommon(mcount10w); // 102 96
        // System.out.println("");
        // guavaCommon(mcount100w); // 760 825
        // System.out.println("");
        // guavaCommon(mcount1000w); // 4885 5429
        // System.out.println("");
        // guavaCommon(mcount10000w); //51698
        // regexp(mcount1w); // 25 26
        // System.out.println("");
        // regexp(mcount10w); // 151 149
        // System.out.println("");
        // regexp(mcount100w); // 974 883
        // System.out.println("");
        // regexp(mcount1000w); // 7566 7784
        // System.out.println("");
        // regexp(mcount10000w); // 74087
        // indexSub(mcount1w); // 30 29
        // System.out.println("");
        // indexSub(mcount10w); // 86 94
        // System.out.println("");
        // indexSub(mcount100w); // 756 857
        // System.out.println("");
        // indexSub(mcount1000w); // 4423 4571
        // System.out.println("");
        // indexSub(mcount10000w); // 39330
        apacheCommon(mcount1w); // 41 48
        System.out.println("");
        apacheCommon(mcount10w); // 103 108
        System.out.println("");
        apacheCommon(mcount100w); // 707 725
        System.out.println("");
        apacheCommon(mcount1000w); // 5044 5096
        System.out.println("");
        apacheCommon(mcount10000w); // 39987
    }
}
测试结果总结:
从上面的各种字符串匹配性能的测试结果来看,其性能排序:
JDK自带indexOf/substring > apache common StringUtils.split > guava Splitter.on().splitToList() > regexp
在实际生产过程中,对超长日志进行字符串处理时,采用guava的效果反而优于其他几种方法。原因可能是超长复杂日志的处理过程比较复杂,导致最终的性能与上面的测试结果不同,还需要认真对比各方法的源码实现进行分析。
源码分析
使用正则表达式进行匹配 / JVM自带1:"".split()
JVM自带2:indexOf()、substring()
apache common工具类:StringUtils.split()
guava工具类:Splitter.on().splitToList()
Splitter.class中采用迭代器的方式来实现分割字符的定位,其实现了自己的迭代器,并重写了computeNext()方法
/**
 * Produces the next piece of {@code toSplit}, or signals the end of the data once
 * {@code offset} has become -1.
 *
 * <p>Each pass of the loop looks for the next separator starting at {@code offset} (via
 * {@code separatorStart} / {@code separatorEnd}, which are defined outside this excerpt),
 * trims both ends of the candidate piece with {@code trimmer}, skips the piece when it is
 * empty and {@code omitEmptyStrings} is set, and otherwise returns it. When {@code limit}
 * has dropped to 1, the rest of the input is returned as the final piece.
 *
 * @return the next piece as a {@code String}, or the result of {@code endOfData()} when no
 *     input is left to examine
 */
@Override
protected String computeNext() {
/*
* The returned string will be from the end of the last match to the
* beginning of the next one. nextStart is the start position of the
* returned substring, while offset is the place to start looking for a
* separator.
*/
int nextStart = offset;
while (offset != -1) {
int start = nextStart;
int end;
int separatorPosition = separatorStart(offset);
if (separatorPosition == -1) {
// No further separator was reported: the piece runs to the end of the input,
// and setting offset to -1 makes this the last pass of the loop.
end = toSplit.length();
offset = -1;
} else {
// The piece ends where the separator begins; searching resumes after the separator.
end = separatorPosition;
offset = separatorEnd(separatorPosition);
}
if (offset == nextStart) {
/*
* This occurs when some pattern has an empty match, even if it
* doesn't match the empty string -- for example, if it requires
* lookahead or the like. The offset must be increased to look for
* separators beyond this point, without changing the start position
* of the next returned substring -- so nextStart stays the same.
*/
offset++;
if (offset >= toSplit.length()) {
offset = -1;
}
continue;
}
// Drop leading characters accepted by trimmer.
while (start < end && trimmer.matches(toSplit.charAt(start))) {
start++;
}
// Drop trailing characters accepted by trimmer.
while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
end--;
}
if (omitEmptyStrings && start == end) {
// Don't include the (unused) separator in next split string.
nextStart = offset;
continue;
}
if (limit == 1) {
// The limit has been reached, return the rest of the string as the
// final item. This is tested after empty string removal so that
// empty strings do not count towards the limit.
end = toSplit.length();
offset = -1;
// Since we may have changed the end, we need to trim it again.
while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
end--;
}
} else {
// This piece counts against the remaining limit.
limit--;
}
return toSplit.subSequence(start, end).toString();
}
return endOfData();
}
对上述源码及时序图的概括就是:Splitter.on(separator).splitToList(toSplitStr)的处理流程是以迭代器的方式逐个定位分隔符的下标,然后结合上一次的下标调用subSequence(start, end).toString()截取并返回子字符串,整个过程的时间复杂度是O(n)。