Java字符串处理性能对比

背景

在数据开发的过程中,涉及到日志内容处理的时候,往往需要用到一些字符串处理方法。在Java中,除了JDK自带的字符串处理方式,其他的一些工具类库(比如Apache Commons、Guava等)也提供了强大的字符串处理方法。但是当我们对大数据量的日志进行处理的时候,就需要考虑每种字符串处理方法在不同场景下的性能如何。

有哪些常用的字符串切割方法?

  1. 使用正则表达式进行匹配;
  2. JDK自带1: "".split()(即String.split())
  3. JDK自带2: indexOf()、substring()
  4. apache common工具类:StringUtils.split()
  5. guava工具类:Splitter.on().splitToList()

每种字符串切割方法的使用

以一条日志为例,我们按照逗号分隔的规则进行切分,经过处理后得到目标格式的日志:

package com.commonUtils;

import com.google.common.base.Splitter;
import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Micro-benchmark that compares several ways of splitting a comma-separated log line:
 * a compiled regular expression, {@code indexOf}/{@code substring}, Apache Commons
 * {@code StringUtils.split} and Guava {@code Splitter}.
 *
 * <p>Each benchmark method repeats its operation {@code count} times on the shared sample
 * line and prints the elapsed time in milliseconds to standard output.
 */
public class StringSplitCompare {
  /** Delimiter characters that get a backslash prefix when placed inside a {@code [^...]} class. */
  private static final Set<Character> charactersToEscape =
      new HashSet<Character>(Arrays.asList('[', ']'));

  /**
   * Variable names (without the leading {@code $}) found by the most recent
   * {@link #getPatternByString(String)} call.
   */
  private static final List<String> logFieldNames = new ArrayList<String>();

  /** Matches a literal {@code [} or {@code ]} so that it can be backslash-escaped. */
  private static final Pattern escaper = Pattern.compile("([\\[\\]])");

  /** Matches a {@code $name} placeholder: a dollar sign followed by letters, digits or underscores. */
  private static final Pattern extractVariablePattern = Pattern.compile("\\$[a-zA-Z0-9_]*");

  /**
   * Returns {@code str} with every {@code [} and {@code ]} prefixed by a backslash.
   *
   * <p>Only square brackets are escaped; other regex metacharacters pass through unchanged.
   */
  private static String escapeRE(String str) {
    return escaper.matcher(str).replaceAll("\\\\$1");
  }

  /** Sample log line that every benchmark splits. */
  private static final String test = "youxiaoshuo, xiaoshuoyou|,strange girl, request the next time, a good girl,a nice girl,I will be try,do you know I scared to lost everything and lost you.emm,yeah,lalala,lalal,lalal,llalala,llllllllllllllllllalalalalalalalallalalalalala,ls";

  /** Meta pattern that {@link #regexp(int)} turns into a regular expression. */
  private static final String testPattern = "$1,$2,$3,$4";

  /**
   * Splits one small sample string with Guava, Apache Commons and {@code String.split} and
   * prints the three results so that their output can be compared by eye.
   */
  public static void testSplitter() {
    // Commas sit at indices 2, 6, 9 and 12 of the sample below.
    String str = "iw, ie,ts,hh,lsl";
    List<String> splitToList = Splitter.on(',').splitToList(str);
    String[] splitToArray = StringUtils.split(str, ',');
    String[] regexSplitToArray = str.split(",");
    System.out.println(splitToList);
    System.out.println(Arrays.asList(splitToArray));
    System.out.println(Arrays.asList(regexSplitToArray));
  }

  /** Returns the whole milliseconds elapsed since {@code startNanos}, a {@code System.nanoTime()} reading. */
  private static long elapsedMillis(long startNanos) {
    return (System.nanoTime() - startNanos) / 1000000L;
  }

  /**
   * Benchmarks matching the sample line against the regex derived from {@code testPattern}.
   *
   * <p>The pattern is compiled once, outside the timed section.
   *
   * @param count number of match attempts to time; values below 1 time nothing
   */
  public static void regexp(int count) {
    Pattern compiled = Pattern.compile(getPatternByString(testPattern));

    long start = System.nanoTime();

    // NOTE(review): the regex generated from "$1,$2,$3,$4" is ([^,]*),([^,]*),([^,]*),([^,]*),
    // whose last group cannot span a comma, while the sample line contains many more than three
    // commas. matches() therefore returns false here, so this loop times a failed match.
    for (int run = 0; run < count; run++) {
      Matcher matcher = compiled.matcher(test);
      if (matcher.matches()) {
        for (int i = 1; i <= matcher.groupCount(); i++) {
          // Left empty on purpose; enable to inspect the groups:
          // System.out.println(matcher.group(i));
        }
      }
    }

    System.out.println("regexp: " + elapsedMillis(start));
  }

  /**
   * Benchmarks walking the sample line with {@code indexOf(",")} and {@code substring}.
   *
   * @param count number of full passes over the sample line to time; values below 1 time nothing
   */
  public static void indexSub(int count) {
    long start = System.nanoTime();

    for (int run = 0; run < count; run++) {
      String str = test;

      while (str.length() > 0) {
        int index = str.indexOf(",");
        if (index < 0) {
          // str is the last field here: System.out.println(str);
          break;
        }
        // The current field would be str.substring(0, index).
        str = str.substring(index + 1);
      }
    }

    System.out.println("indexSub:" + elapsedMillis(start));
  }

  /**
   * Benchmarks Apache Commons {@code StringUtils.split(test, ',')}.
   *
   * @param count number of splits to time; values below 1 time nothing
   */
  public static void apacheCommon(int count) {
    long start = System.nanoTime();

    for (int run = 0; run < count; run++) {
      List<String> list = Arrays.asList(StringUtils.split(test, ','));
      for (String item : list) {
        // Left empty on purpose; enable to inspect the fields:
        // System.out.println(item);
      }
    }

    System.out.println("apacheCommon:" + elapsedMillis(start));
  }

  /**
   * Benchmarks Guava {@code Splitter.on(',').splitToList(test)}.
   *
   * @param count number of splits to time; values below 1 time nothing
   */
  public static void guavaCommon(int count) {
    long start = System.nanoTime();

    for (int run = 0; run < count; run++) {
      List<String> list = Splitter.on(',').splitToList(test);
      for (String item : list) {
        // Left empty on purpose; enable to inspect the fields:
        // System.out.println(item);
      }
    }

    System.out.println("guavaCommon:" + elapsedMillis(start));
  }

  /**
   * Converts a meta pattern such as {@code "$1,$2"} into a regular expression.
   *
   * <p>Each {@code $name} placeholder becomes a capturing group {@code ([^c]*)}, where
   * {@code c} is the character that immediately follows the placeholder. A placeholder at the
   * very end of the pattern reuses the previous placeholder's delimiter (or the NUL character
   * when there was none). Literal text between and after placeholders is passed through
   * {@link #escapeRE(String)}. The placeholder names are stored in {@code logFieldNames}.
   *
   * @param metaPattern pattern text containing {@code $name} placeholders
   * @return the regular expression source built from {@code metaPattern}
   */
  private static String getPatternByString(String metaPattern) {
    // Start clean so that repeated calls do not pile up duplicate field names.
    logFieldNames.clear();

    Matcher matcher = extractVariablePattern.matcher(metaPattern);
    StringBuilder parsePatternBuilder = new StringBuilder();
    int parsedPosition = 0;
    char splitCharacter = 0;
    int length = metaPattern.length();

    while (matcher.find()) {
      // Literal text between the previous placeholder and this one.
      if (parsedPosition < matcher.start()) {
        String residualPattern = metaPattern.substring(parsedPosition, matcher.start());
        parsePatternBuilder.append(escapeRE(residualPattern));
      }

      // Skip the leading '$' to obtain the field name.
      logFieldNames.add(metaPattern.substring(matcher.start() + 1, matcher.end()));
      parsedPosition = matcher.end();

      // The character right after the placeholder delimits the field; if the placeholder ends
      // the pattern, the previously seen delimiter is kept.
      if (parsedPosition < length) {
        splitCharacter = metaPattern.charAt(parsedPosition);
      }

      parsePatternBuilder.append("([^");
      if (charactersToEscape.contains(splitCharacter)) {
        parsePatternBuilder.append("\\");
      }
      parsePatternBuilder.append(splitCharacter);
      parsePatternBuilder.append("]*)");
    }

    // Trailing literal text is escaped the same way as the literals between placeholders;
    // otherwise a pattern ending in '[' would yield an unterminated character class.
    parsePatternBuilder.append(escapeRE(metaPattern.substring(parsedPosition)));
    return parsePatternBuilder.toString();
  }

  // Iteration counts; the "w" suffix stands for a unit of 10,000.
  private static final int mcount1w = 1 * 10000;
  private static final int mcount10w = 10 * 10000;
  private static final int mcount100w = 100 * 10000;
  private static final int mcount1000w = 1000 * 10000;
  private static final int mcount10000w = 10000 * 10000;

  /**
   * Runs one benchmark family at five iteration counts.
   *
   * <p>The numbers in the trailing comments are timings noted by the original author
   * (presumably elapsed milliseconds from earlier runs; they were not re-measured after the
   * loop-count and timer changes in this class). Uncomment a family to run it.
   *
   * @param args unused
   */
  public static void main(String[] args) {

//    guavaCommon(mcount1w);          // 65           66
//    System.out.println("");
//    guavaCommon(mcount10w);         // 102          96
//    System.out.println("");
//    guavaCommon(mcount100w);        // 760          825
//    System.out.println("");
//    guavaCommon(mcount1000w);       // 4885         5429
//    System.out.println("");
//    guavaCommon(mcount10000w);      //51698


//    regexp(mcount1w);             // 25             26
//    System.out.println("");
//    regexp(mcount10w);            // 151            149
//    System.out.println("");
//    regexp(mcount100w);           // 974            883
//    System.out.println("");
//    regexp(mcount1000w);          // 7566           7784
//    System.out.println("");
//    regexp(mcount10000w);         // 74087


//    indexSub(mcount1w);             // 30           29
//    System.out.println("");
//    indexSub(mcount10w);            // 86           94
//    System.out.println("");
//    indexSub(mcount100w);           // 756          857
//    System.out.println("");
//    indexSub(mcount1000w);          // 4423         4571
//    System.out.println("");
//    indexSub(mcount10000w);         // 39330

    apacheCommon(mcount1w);           // 41       48
    System.out.println("");
    apacheCommon(mcount10w);          // 103      108
    System.out.println("");
    apacheCommon(mcount100w);         // 707      725
    System.out.println("");
    apacheCommon(mcount1000w);        // 5044     5096
    System.out.println("");
    apacheCommon(mcount10000w);       // 39987
  }

}

测试结果总结:

从上面的各种字符串匹配性能的测试结果来看,其性能排序:
JDK自带indexOf/substring > apache common StringUtils.split > guava Splitter.on().splitToList() > regexp
不过在实际生产过程中,对超长日志进行字符串处理时,采用guava的效果反而优于其他几种方法。原因可能是超长复杂日志的处理过程更为复杂,导致最终的性能表现与上面的测试结果不同,还需要认真对比各方法的源码实现进行分析。

源码分析

使用正则表达式进行匹配 / JDK自带1:"".split()

JDK自带2:indexOf()、substring()

apache common工具类:StringUtils.split()

guava工具类:Splitter.on().splitToList()

Splitter.class中采用迭代器的方式来实现分割字符的定位,其实现了自己的迭代器,并重写了computeNext()方法
在这里插入图片描述
在这里插入图片描述

/**
 * Computes the next piece of {@code toSplit} to hand out.
 *
 * <p>Starting at {@code offset}, the method locates the next separator, trims characters
 * matched by {@code trimmer} from both ends of the candidate piece, optionally skips empty
 * pieces, and applies {@code limit}. Once {@code offset} has been set to -1 and no piece is
 * pending, it returns {@code endOfData()}.
 *
 * @return the next piece as a string, or the result of {@code endOfData()} when no piece remains
 */
@Override 
protected String computeNext() {
  /*
   * The returned string will be from the end of the last match to the
   * beginning of the next one. nextStart is the start position of the
   * returned substring, while offset is the place to start looking for a
   * separator.
   */
  int nextStart = offset;
  while (offset != -1) {
    int start = nextStart;
    int end;

    // A result of -1 is treated as "no further separator": the piece then runs to the end
    // of toSplit and offset is set to -1 so the loop stops after this piece.
    int separatorPosition = separatorStart(offset);
    if (separatorPosition == -1) {
      end = toSplit.length();
      offset = -1;
    } else {
      end = separatorPosition;
      offset = separatorEnd(separatorPosition);
    }
    if (offset == nextStart) {
      /*
       * This occurs when some pattern has an empty match, even if it
       * doesn't match the empty string -- for example, if it requires
       * lookahead or the like. The offset must be increased to look for
       * separators beyond this point, without changing the start position
       * of the next returned substring -- so nextStart stays the same.
       */
      offset++;
      if (offset >= toSplit.length()) {
        offset = -1;
      }
      continue;
    }

    // Shrink [start, end) from the left, then from the right, past characters matched by trimmer.
    while (start < end && trimmer.matches(toSplit.charAt(start))) {
      start++;
    }
    while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
      end--;
    }

    if (omitEmptyStrings && start == end) {
      // Don't include the (unused) separator in next split string.
      nextStart = offset;
      continue;
    }

    if (limit == 1) {
      // The limit has been reached, return the rest of the string as the
      // final item.  This is tested after empty string removal so that
      // empty strings do not count towards the limit.
      end = toSplit.length();
      offset = -1;
      // Since we may have changed the end, we need to trim it again.
      while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
        end--;
      }
    } else {
      limit--;
    }

    // Materialize the piece [start, end) as a String.
    return toSplit.subSequence(start, end).toString();
  }
  return endOfData();
}

对上述源码及时序图的概括就是: Splitter.on(separator).splitToList(toSplitStr)的处理流程:以迭代器的方式逐个定位分隔符的下标,然后结合上一次的下标调用toSplit.subSequence(start, end).toString()截取并返回子串(当toSplit是String时,subSequence()等价于JDK原生的substring()),整个过程的时间复杂度是O(n)。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值