一篇文章搞定 Java String 源码分析

最新推荐文章于 2024-04-29 09:58:27 发布

NJU_lemon

最新推荐文章于 2024-04-29 09:58:27 发布

阅读量206

点赞数

分类专栏： Java

本文链接：https://blog.csdn.net/qq_39618369/article/details/108000135

版权

Java 专栏收录该内容

130 篇文章 0 订阅

订阅专栏

String源码分析

之前总是对String的各种方法和实现原理一知半解，花了好久终于梳理好了常用方法的源码
另外关于String常见的面试题，各种对象的"=="问题，请参考这篇博客，写的很清楚

1.实现的接口

java.io.Serializable
这个序列化接口没有任何方法和域，仅用于标识序列化的语义。
Comparable<String>
这个接口只有一个 compareTo(T o) 方法，用于比较两个实例对象的大小。
CharSequence
这个接口是一个只读的字符序列。包括length(), charAt(int index), subSequence(int start, int end)
这几个API接口，值得一提的是，StringBuffer和StringBuilder也实现了这个接口

2.构造函数

2.1 无参构造函数和用String构造

/**
 * Backing byte array that stores the string's characters. Per the article,
 * JDK 8 and earlier declared this as {@code private final char[] value}.
 */
private final byte[] value;
/**
 * Identifies how the bytes in {@code value} are encoded: LATIN1 (one byte per
 * char, 0-255, English and Western European characters only) or UTF-16
 * (two bytes per char, able to represent Chinese characters).
 */
private final byte coder;// LATIN1 = 0; UTF-16 = 1
/* Hash code of the string; defaults to 0. */
 private int hash; // Default to 0
/**
 * Records whether the hash value is zero.
 * NOTE(review): presumably lets a genuinely-zero hash be told apart from
 * "not computed yet" -- confirm against the JDK source.
 */
private boolean hashIsZero; // Default to false;

/** use serialVersionUID from JDK 1.0.2 for interoperability */
private static final long serialVersionUID = -6849794470754667710L;

/**
 * Class-wide switch for compact (one byte per char) string storage; it is a
 * static flag, not a per-string property. The static initializer below sets
 * it to {@code true}. Per the article's note, when it is {@code false} the
 * coder is UTF-16. The code that decides whether a particular string can be
 * compacted is shown later in the article.
 */
static final boolean COMPACT_STRINGS;
static {
    COMPACT_STRINGS = true;
}
/**
 * Creates an empty string by reusing the byte array and coder of the
 * {@code ""} literal, so no new array is allocated.
 */
public String() {
    this.value = "".value;
    this.coder = "".coder;
 }
 /**
  * Creates a string from another string. The new object takes over
  * {@code original}'s byte array reference, coder and hash field as they
  * are; the array itself is not copied.
  *
  * @param original the string whose contents are reused
  */
 public String(String original) {
    this.value = original.value;
    this.coder = original.coder;
    this.hash = original.hash;
}

2.2 基于字节数组创建字符串

1、
/**
 * Builds a string from the whole byte array by delegating to the
 * (bytes, offset, length) constructor with offset 0 and the full length.
 *
 * @param bytes the bytes to decode
 */
public String(byte[] bytes) {
        this(bytes, 0, bytes.length);
    }  
2、
/**
 * Builds a string from the whole byte array using the given charset;
 * delegates to the (bytes, offset, length, charset) constructor.
 *
 * @param bytes   the bytes to decode
 * @param charset the charset used for decoding
 */
public String(byte bytes[], Charset charset) {
        this(bytes, 0, bytes.length, charset);
    }
 /**
  * Builds a string from the whole byte array using the named charset;
  * delegates to the (bytes, offset, length, charsetName) constructor.
  *
  * @param bytes       the bytes to decode
  * @param charsetName name of the charset used for decoding
  * @throws UnsupportedEncodingException declared by this constructor; raised
  *         by the delegate when the charset name is not supported
  */
 public String(byte bytes[], String charsetName)
        throws UnsupportedEncodingException {
    this(bytes, 0, bytes.length, charsetName);
}
3、
/**
 * Builds a string from {@code length} bytes of {@code bytes} starting at
 * {@code offset}, without an explicit charset.
 *
 * @param bytes  the bytes to decode
 * @param offset index of the first byte to decode
 * @param length number of bytes to decode
 * @throws StringIndexOutOfBoundsException if the range is invalid
 */
public String(byte bytes[], int offset, int length) {
        checkBoundsOffCount(offset, length, bytes.length);
        // Decode the requested slice; no charset is passed, so (per the
        // original author's note) the VM's default charset is used.
        StringCoding.Result ret = StringCoding.decode(bytes, offset, length);
        // Take the decoded byte array ...
        this.value = ret.value;
        // ... and the coder that describes its encoding.
        this.coder = ret.coder;
    }
/*
 * Validates an (offset, count) range against an array of the given length.
 * Throws StringIndexOutOfBoundsException when offset or count is negative or
 * when the range would run past the end.
 */
static void checkBoundsOffCount(int offset, int count, int length) {
    boolean inRange = offset >= 0 && count >= 0 && offset <= length - count;
    if (inRange) {
        return;
    }
    throw new StringIndexOutOfBoundsException(
        "offset " + offset + ", count " + count + ", length " + length);
}
4、
 /**
  * Builds a string by decoding a slice of {@code bytes} with the given charset.
  *
  * @param bytes   the bytes to decode
  * @param offset  index of the first byte to decode
  * @param length  number of bytes to decode
  * @param charset the charset used for decoding
  * @throws NullPointerException if {@code charset} is null
  * @throws StringIndexOutOfBoundsException if the range is invalid
  */
 public String(byte bytes[], int offset, int length, Charset charset) {
        // Defensive null check
        if (charset == null)
            throw new NullPointerException("charset");
        checkBoundsOffCount(offset, length, bytes.length);
        // Decode the slice with the specified charset
        StringCoding.Result ret =
            StringCoding.decode(charset, bytes, offset, length);
        this.value = ret.value;
        this.coder = ret.coder;
    }
 /**
  * Builds a string by decoding a slice of {@code bytes} with the charset
  * identified by name.
  *
  * @param bytes       the bytes to decode
  * @param offset      index of the first byte to decode
  * @param length      number of bytes to decode
  * @param charsetName name of the charset used for decoding
  * @throws NullPointerException if {@code charsetName} is null
  * @throws StringIndexOutOfBoundsException if the range is invalid
  * @throws UnsupportedEncodingException per the original note, when the
  *         charset name is wrong
  */
 public String(byte bytes[], int offset, int length, String charsetName)
            throws UnsupportedEncodingException {
        if (charsetName == null)
            throw new NullPointerException("charsetName");
        checkBoundsOffCount(offset, length, bytes.length);
        // Decode the slice with the named charset; a bad name surfaces as
        // UnsupportedEncodingException (per the original author's note).
        StringCoding.Result ret =
            StringCoding.decode(charsetName, bytes, offset, length);
        this.value = ret.value;
        this.coder = ret.coder;
    }
5、
/*
 * Builds a string from 8-bit values. hibyte supplies the high 8 bits of every
 * resulting UTF-16 char; the same value is applied to each character.
 */
public String(byte ascii[], int hibyte, int offset, int count) {
    checkBoundsOffCount(offset, count, ascii.length);
    if (count == 0) {
        // Empty range: share the storage of the "" literal.
        this.value = "".value;
        this.coder = "".coder;
        return;
    }
    // High byte is zero: every char fits in one byte, so store as LATIN1
    // (ASCII is effectively a subset of it).
    if (COMPACT_STRINGS && (byte)hibyte == 0) {
        this.value = Arrays.copyOfRange(ascii, offset, offset + count);
        this.coder = LATIN1;
    } else {// Otherwise build a UTF-16 array
        hibyte <<= 8;
        byte[] val = StringUTF16.newBytesFor(count);
        for (int i = 0; i < count; i++) {
            // Keep only the low 8 bits of the source byte (undoing sign
            // extension) and OR the shifted high byte on top.
            StringUTF16.putChar(val, i, hibyte | (ascii[offset++] & 0xff));
        }
        this.value = val;
        this.coder = UTF16;
    }
}
/**
 * Whole-array variant: delegates to the (ascii, hibyte, offset, count)
 * constructor with offset 0 and the full array length.
 */
public String(byte ascii[], int hibyte) {
    this(ascii, hibyte, 0, ascii.length);
}
/*
 * StringUTF16.putChar stores one char as two consecutive bytes.
 * The char index is doubled to get the byte position. Which half of c lands
 * in the first byte is decided by HI_BYTE_SHIFT / LO_BYTE_SHIFT. With the
 * non-big-endian values (HI_BYTE_SHIFT = 0, LO_BYTE_SHIFT = 8) the low 8 bits
 * go to the smaller index and the high 8 bits to the larger one, e.g.
 * 0x4E2D (20013) is written as 0x2D followed by 0x4E.
 */
static void putChar(byte[] val, int index, int c) {
    assert index >= 0 && index < length(val) : "Trusted caller missed bounds check";
    final int first = index << 1;
    final int second = first + 1;
    val[first] = (byte)(c >> HI_BYTE_SHIFT);
    val[second] = (byte)(c >> LO_BYTE_SHIFT);
}
// Shift amounts used when packing/unpacking a char into two bytes.
// Chosen once, at class initialization, from the result of isBigEndian().
static final int HI_BYTE_SHIFT;
static final int LO_BYTE_SHIFT;
static {
    if (isBigEndian()) {// per the original note: checks whether the CPU is big-endian
        HI_BYTE_SHIFT = 8;
        LO_BYTE_SHIFT = 0;
    } else {
        HI_BYTE_SHIFT = 0;
        LO_BYTE_SHIFT = 8;
    }
}
// NOTE(review): this repeats, token for token, the String(byte[], int)
// constructor already listed just before putChar; a class can declare it
// only once, so treat this copy as a duplicated excerpt.
public String(byte ascii[], int hibyte) {
    this(ascii, hibyte, 0, ascii.length);
}

2.3 基于字符数组创建字符串

1、
/**
 * Builds a string from the whole char array by delegating to the
 * four-argument (char[], int, int, Void) constructor with a null marker.
 */
public String(char value[]) {
        this(value, 0, value.length, null);
    }
    // Package-private constructor the public char[] constructors delegate to.
    // NOTE(review): the Void parameter is never read here; presumably it only
    // distinguishes this overload from the public (char[], int, int) one.
    String(char[] value, int off, int len, Void sig) {
    if (len == 0) {
        // Empty range: share the storage of the "" literal.
        this.value = "".value;
        this.coder = "".coder;
        return;
    }
    if (COMPACT_STRINGS) {// true by default, so this path is tried first
        byte[] val = StringUTF16.compress(value, off, len);
        if (val != null) {
           // Compressible (every char fits in one byte): the chars were
           // converted one-to-one into a byte array.
            this.value = val;
            // Mark the encoding as LATIN1
            this.coder = LATIN1;
            return;
        }
    }
    // At least one char does not fit in a byte (e.g. Chinese characters)
    this.coder = UTF16;
    // StringUTF16.toBytes writes two bytes per char
    this.value = StringUTF16.toBytes(value, off, len);
}
// StringUTF16.compress: tries to pack len chars into a byte array of the
// same length. Returns the packed array when the five-argument overload
// reports that all len chars were written, otherwise null.
public static byte[] compress(char[] val, int off, int len) {
    byte[] packed = new byte[len];
    int written = compress(val, off, packed, 0, len);
    return (written == len) ? packed : null;
}
// Copies len chars from src into dst as single bytes. Returns len when every
// char was copied, or 0 as soon as a char above 0xFF is met.
@HotSpotIntrinsicCandidate
public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
    for (int i = 0; i < len; i++) {
        char c = src[srcOff];
        if (c > 0xFF) {
            // A char such as a Chinese character is > 0xFF; len becomes 0,
            // which makes the three-argument overload above return null.
            len = 0;
            break;
        }
        dst[dstOff] = (byte)c;
        srcOff++;
        dstOff++;
    }
    return len;
}
/* StringUTF16.toBytes: converts len chars into a two-bytes-per-char array. */
@HotSpotIntrinsicCandidate
public static byte[] toBytes(char[] value, int off, int len) {
    // Per the original note, newBytesFor allocates twice len bytes.
    byte[] val = newBytesFor(len);
    for (int i = 0; i < len; i++) {
        // putChar doubles i internally and writes two bytes per char
        // (one for the high 8 bits, one for the low 8 bits).
        putChar(val, i, value[off]);
        off++;
    }
    return val;
}
2、
/**
 * Builds a string from count chars of value starting at offset. rangeCheck
 * runs as an argument expression, so the bounds are validated before the
 * delegated constructor body executes.
 */
public String(char value[], int offset, int count) {
        this(value, offset, count, rangeCheck(value, offset, count));
    }
// Validates (offset, count) against value.length and always returns null;
// the Void return type lets it be used as a constructor argument.
private static Void rangeCheck(char[] value, int offset, int count) {
        // Bounds validation
        checkBoundsOffCount(offset, count, value.length);
        return null;
    }

2.4 基于代码点数组创建字符串

/**
 * Builds a string from count code points of codePoints starting at offset.
 *
 * @throws StringIndexOutOfBoundsException if the range is invalid
 */
public String(int[] codePoints, int offset, int count) {
    checkBoundsOffCount(offset, count, codePoints.length);
    if (count == 0) {
        // Empty range: share the storage of the "" literal.
        this.value = "".value;
        this.coder = "".coder;
        return;
    }
    if (COMPACT_STRINGS) {
        // Non-null only when the one-byte-per-char conversion succeeded.
        byte[] val = StringLatin1.toBytes(codePoints, offset, count);
        if (val != null) {
            this.coder = LATIN1;
            this.value = val;
            return;
        }
    }
    this.coder = UTF16;
    // Not compressible: convert to a UTF-16 byte array instead
    this.value = StringUTF16.toBytes(codePoints, offset, count);
}
/*
 * StringLatin1.toBytes: converts len code points starting at off into one
 * byte each. Returns null as soon as canEncode rejects a code point.
 */
public static byte[] toBytes(int[] val, int off, int len) {
    byte[] latin1 = new byte[len];
    int src = off;
    for (int dst = 0; dst < len; dst++) {
        int codePoint = val[src];
        src++;
        if (!canEncode(codePoint)) {
            // Does not fit in one byte, so no LATIN1 array can be produced.
            return null;
        }
        latin1[dst] = (byte) codePoint;
    }
    return latin1;
}
// True when cp has no bits set above the low 8, i.e. 0 <= cp <= 0xFF.
public static boolean canEncode(int cp) {
    return (cp & ~0xFF) == 0;
}

2.5 基于StringBuffer和StringBuilder创建字符串

/**
 * Builds a string from a StringBuffer by delegating to the String(String)
 * constructor with the result of buffer.toString().
 */
public String(StringBuffer buffer) {
    this(buffer.toString());
}
/* StringBuffer.toString: declared synchronized, so calls are serialized on the buffer. */
@HotSpotIntrinsicCandidate
public synchronized String toString() {
    if (toStringCache == null) {
        // First call with no cache: build the string, remember it in
        // toStringCache and return that same object.
        return toStringCache =
                isLatin1() ? StringLatin1.newString(value, 0, count)
                           : StringUTF16.newString(value, 0, count);
    }
    // Cache present: note that a newly constructed String object is returned
    return new String(toStringCache);
}
// True only when compact strings are enabled and the coder is LATIN1.
final boolean isLatin1() {
    if (!COMPACT_STRINGS) {
        return false;
    }
    return coder == LATIN1;
}
/**
 * Builds a string from a StringBuilder by delegating to the
 * (AbstractStringBuilder, Void) constructor with a null marker.
 */
public String(StringBuilder builder) {
    this(builder, null);
}
// Package-private constructor that copies a builder's content into a new string.
String(AbstractStringBuilder asb, Void sig) {
    byte[] val = asb.getValue();// the builder's byte array (per the original note, already encoded according to its content)
    int length = asb.length();// per the original note: the number of chars (count), not bytes
    if (asb.isLatin1()) {
        this.coder = LATIN1;
        // One byte per char: copy exactly length bytes.
        this.value = Arrays.copyOfRange(val, 0, length);
    } else {
        // The builder may be UTF16 without holding any char that needs two
        // bytes; in that case the content can still be compressed.
        if (COMPACT_STRINGS) {
            byte[] buf = StringUTF16.compress(val, 0, length);
            if (buf != null) {
                this.coder = LATIN1;
                this.value = buf;
                return;
            }
        }
        this.coder = UTF16;
        // Two bytes per char: copy length * 2 bytes.
        this.value = Arrays.copyOfRange(val, 0, length << 1);
    }
}

3.常用函数

3.1 length()

/**
 * Returns the number of chars: the byte-array length shifted right by the
 * value of coder().
 */
public int length() {
    return value.length >> coder();
    // For UTF-16 content each char takes two bytes and coder() returns 1,
    // so value.length / 2 is exactly the number of chars.
}

3.2 isEmpty()

/** Returns true when the backing byte array has length 0. */
public boolean isEmpty() {
    return value.length == 0;
}

3.3 charAt(int index)

/**
 * Returns the char at index, dispatching to the LATIN1 or UTF-16 helper
 * according to how this string is stored.
 */
public char charAt(int index) {
    return isLatin1() ? StringLatin1.charAt(value, index)
                      : StringUTF16.charAt(value, index);
}
/*
 * StringLatin1.charAt: returns the byte at index widened to a char as an
 * unsigned value. Throws StringIndexOutOfBoundsException when index is
 * outside the array.
 */
public static char charAt(byte[] value, int index) {
    boolean inBounds = index >= 0 && index < value.length;
    if (!inBounds) {
        throw new StringIndexOutOfBoundsException(index);
    }
    int unsigned = value[index] & 0xff;
    return (char) unsigned;
}
/* StringUTF16.charAt: checks the index, then reads the char via getChar. */
public static char charAt(byte[] value, int index) {
    checkIndex(index, value);
    return getChar(value, index);
}
// Reassembles one char from the two bytes stored at char position index.
// Each byte is masked to an unsigned value and shifted by HI_BYTE_SHIFT /
// LO_BYTE_SHIFT, so the byte order follows those two constants.
static char getChar(byte[] val, int index) {
    assert index >= 0 && index < length(val) : "Trusted caller missed bounds check";
    final int pos = index << 1;
    int firstPart = (val[pos] & 0xff) << HI_BYTE_SHIFT;
    int secondPart = (val[pos + 1] & 0xff) << LO_BYTE_SHIFT;
    return (char)(firstPart | secondPart);
}

3.4 getChars()
/**
 * Copies the chars in [srcBegin, srcEnd) of this string into dst starting at
 * dstBegin. Both the source range and the destination range are validated
 * before anything is copied.
 */
public void getChars(int srcBegin, int srcEnd, char dst[], int dstBegin) {
    checkBoundsBeginEnd(srcBegin, srcEnd, length());
    final int copyCount = srcEnd - srcBegin;
    checkBoundsOffCount(dstBegin, copyCount, dst.length);
    if (!isLatin1()) {
        StringUTF16.getChars(value, srcBegin, srcEnd, dst, dstBegin);
        return;
    }
    StringLatin1.getChars(value, srcBegin, srcEnd, dst, dstBegin);
}
/* StringLatin1.getChars: delegates to inflate with a length of srcEnd - srcBegin. */
public static void getChars(byte[] value, int srcBegin, int srcEnd, char dst[], int dstBegin) {
    inflate(value, srcBegin, dst, dstBegin, srcEnd - srcBegin);
}
// Widens len bytes of src (from srcOff) into chars of dst (from dstOff),
// treating each byte as an unsigned value.
public static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
    for (int k = 0; k < len; k++) {
        dst[dstOff + k] = (char)(src[srcOff + k] & 0xff);
    }
}
/*
 * StringUTF16.getChars: copies the chars in [srcBegin, srcEnd) into dst,
 * reading each one (two bytes at a time) through getChar.
 */
public static void getChars(byte[] value, int srcBegin, int srcEnd, char dst[], int dstBegin) {
    // The range must be checked here because getChar itself does no checks.
    if (srcBegin < srcEnd) {
        checkBoundsOffCount(srcBegin, srcEnd - srcBegin, value);
    }
    int out = dstBegin;
    int in = srcBegin;
    while (in < srcEnd) {
        dst[out] = getChar(value, in);
        out++;
        in++;
    }
}

3.5 equals()比较字符串内容
/**
 * Compares this string with another object for content equality.
 * Returns true for the same reference, or for another String whose coder
 * matches (when compact strings are enabled) and whose bytes are equal.
 */
public boolean equals(Object anObject) {
    // Same reference: equal without looking at the content.
    if (this == anObject) {
        return true;
    }
    // Anything that is not a String (including null) is never equal.
    if (!(anObject instanceof String)) {
        return false;
    }
    String that = (String) anObject;
    // With compact strings on, differing coders mean differing content.
    if (COMPACT_STRINGS && this.coder != that.coder) {
        return false;
    }
    return StringLatin1.equals(value, that.value);
}
/*
 * StringLatin1.equals: the storage is a byte array, so equality is decided
 * byte by byte. Arrays of different length are never equal.
 */
@HotSpotIntrinsicCandidate
public static boolean equals(byte[] value, byte[] other) {
    if (value.length == other.length) {
        for (int i = 0; i < value.length; i++) {
            if (value[i] != other[i]) {
                return false;
            }
        }
        return true;
    }
    return false;
}

3.6 equalsIgnoreCase()比较字符串内容并且不区分大小写

/**
 * Compares this string with another one, ignoring case.
 * Returns true for the same reference, or for a non-null string of equal
 * length whose full range matches under case-insensitive regionMatches.
 */
public boolean equalsIgnoreCase(String anotherString) {
    if (this == anotherString) {
        return true;
    }
    // A null argument is never equal.
    if (anotherString == null) {
        return false;
    }
    // Different lengths can never match.
    if (anotherString.length() != length()) {
        return false;
    }
    // Case-insensitive comparison over the whole length.
    return regionMatches(true, 0, anotherString, 0, length());
}

3.7 比较函数

compareTo(String anotherString)比较字符串的大小，源字符串大就返回一个大于零的数
    compare(String s1,String s2) 是忽略大小写的比较（下面代码中调用的都是 compareToCI 系列函数，即 String.CASE_INSENSITIVE_ORDER 比较器的实现），忽略大小写后 s1>s2 就返回一个大于零的数

/**
 * Compares this string with another one. The helper is picked from this
 * string's coder and from whether the other string uses the same coder.
 */
public int compareTo(String anotherString) {
    final byte[] mine = value;
    final byte[] theirs = anotherString.value;
    final byte myCoder = coder();
    final boolean sameCoder = myCoder == anotherString.coder();
    if (myCoder == LATIN1) {
        return sameCoder ? StringLatin1.compareTo(mine, theirs)
                         : StringLatin1.compareToUTF16(mine, theirs);
    }
    return sameCoder ? StringUTF16.compareTo(mine, theirs)
                     : StringUTF16.compareToLatin1(mine, theirs);
}
 /**
  * Compares two strings through the case-insensitive (compareToCI) helpers.
  * The helper is picked from s1's coder and from whether s2 shares it.
  */
 public int compare(String s1, String s2) {
    final byte[] left = s1.value;
    final byte[] right = s2.value;
    final byte leftCoder = s1.coder();
    final boolean sameCoder = leftCoder == s2.coder();
    if (leftCoder == LATIN1) {
        return sameCoder ? StringLatin1.compareToCI(left, right)
                         : StringLatin1.compareToCI_UTF16(left, right);
    }
    return sameCoder ? StringUTF16.compareToCI(left, right)
                     : StringUTF16.compareToCI_Latin1(left, right);
}
 /*
  * The comparison helpers used above are all much alike; StringLatin1.compareTo
  * is shown as the example. This overload passes both array lengths on to the
  * four-argument version.
  */
 @HotSpotIntrinsicCandidate
public static int compareTo(byte[] value, byte[] other) {
    int len1 = value.length;
    int len2 = other.length;
    return compareTo(value, other, len1, len2);
}

// Compares the first min(len1, len2) positions. At the first differing
// position the result is the difference of the two chars; if none differs,
// the result is the difference of the lengths.
public static int compareTo(byte[] value, byte[] other, int len1, int len2) {
    final int shared = Math.min(len1, len2);
    int k = 0;
    while (k < shared) {
        if (value[k] != other[k]) {
            return getChar(value, k) - getChar(other, k);
        }
        k++;
    }
    return len1 - len2;
}

**3.8 replace() **

/**
 * Returns a string in which every oldChar is replaced by newChar.
 * Returns this string itself when the two chars are equal or when the
 * helper returns null.
 */
public String replace(char oldChar, char newChar) {
    if (oldChar == newChar) {
        return this;
    }
    String replaced = isLatin1() ? StringLatin1.replace(value, oldChar, newChar)
                                 : StringUTF16.replace(value, oldChar, newChar);
    return (replaced != null) ? replaced : this;
}
/*
 * StringLatin1.replace: replaces oldChar with newChar in a LATIN1 byte array.
 * Returns null when oldChar cannot be encoded in one byte or does not occur,
 * which tells the caller to return the original string.
 */
public static String replace(byte[] value, char oldChar, char newChar) {
    if (canEncode(oldChar)) {
        int len = value.length;
        int i = -1;
        // Find the first occurrence of oldChar; i stops at its index,
        // or reaches len when there is none.
        while (++i < len) {
            if (value[i] == (byte)oldChar) {
                break;
            }
        }
        if (i < len) {
            if (canEncode(newChar)) {
                // newChar also fits in one byte: the result stays LATIN1.
                byte[] buf = StringConcatHelper.newArray(len);
                // Bytes before the first match are copied unchanged.
                for (int j = 0; j < i; j++) {    // TBD arraycopy?
                    buf[j] = value[j];
                }
                // From the first match onward, substitute while copying.
                while (i < len) {
                    byte c = value[i];
                    buf[i] = (c == (byte)oldChar) ? (byte)newChar : c;
                    i++;
                }
                return new String(buf, LATIN1);
            } else {
                // newChar needs two bytes: the result must be UTF-16.
                byte[] buf = StringUTF16.newBytesFor(len);
                // inflate from latin1 to UTF16
                inflate(value, 0, buf, 0, i);
                // From the first match onward, widen each byte to a char
                // and substitute while writing.
                while (i < len) {
                    char c = (char)(value[i] & 0xff);
                    StringUTF16.putChar(buf, i, (c == oldChar) ? newChar : c);
                    i++;
                }
                return new String(buf, UTF16);
            }
        }
    }
    return null; // for string to return this;
}

**3.9 substring() **

1、
/**
 * Returns the part of this string from beginIndex to the end.
 * Returns this string itself when beginIndex is 0.
 *
 * @throws StringIndexOutOfBoundsException if beginIndex is negative or
 *         larger than the length
 */
public String substring(int beginIndex) {
    if (beginIndex < 0) {
        throw new StringIndexOutOfBoundsException(beginIndex);
    }
    final int remaining = length() - beginIndex;
    if (remaining < 0) {
        throw new StringIndexOutOfBoundsException(remaining);
    }
    if (beginIndex == 0) {
        return this;
    }
    if (isLatin1()) {
        return StringLatin1.newString(value, beginIndex, remaining);
    }
    return StringUTF16.newString(value, beginIndex, remaining);
}
/* StringLatin1.newString: builds a LATIN1 string from len bytes of val starting at index. */
public static String newString(byte[] val, int index, int len) {
    // Copy the requested range with Arrays.copyOfRange and tag it as LATIN1
    return new String(Arrays.copyOfRange(val, index, index + len),
                      LATIN1);
}
/*
 * StringUTF16.newString: builds a string from len chars of a UTF-16 byte
 * array starting at char position index. When compact strings are enabled
 * and the range can be compressed, the result is stored as LATIN1.
 */
public static String newString(byte[] val, int index, int len) {
    // Try the compressed form first, when it is allowed.
    byte[] latin1 = String.COMPACT_STRINGS ? compress(val, index, len) : null;
    if (latin1 != null) {
        return new String(latin1, LATIN1);
    }
    // Each char takes two bytes, so byte positions are twice the char positions.
    final int fromByte = index << 1;
    final int toByte = (index + len) << 1;
    return new String(Arrays.copyOfRange(val, fromByte, toByte), UTF16);
}
2、
/**
 * Returns the part of this string in [beginIndex, endIndex).
 * Returns this string itself when the range covers the whole string.
 */
public String substring(int beginIndex, int endIndex) {
    final int total = length();
    checkBoundsBeginEnd(beginIndex, endIndex, total);
    final boolean wholeString = beginIndex == 0 && endIndex == total;
    if (wholeString) {
        return this;
    }
    final int count = endIndex - beginIndex;
    if (isLatin1()) {
        return StringLatin1.newString(value, beginIndex, count);
    }
    return StringUTF16.newString(value, beginIndex, count);
}

一些问题和参考链接

1. String底层使用的是char数组还是byte数组？
JDK 8 及以前 String 使用的是 char 数组，JDK 9 及以后使用的是 byte 数组。因为开发人员发现人们大多数使用的 String 还是拉丁字符，而之前使用的 char 数组每一个 char 占用两个字节，拉丁字符只需要一个字节就可以存储，剩下的一个字节就浪费了，占用更多堆内存，造成 gc 更加频繁。因此在 JDK 9 中将 String 底层的实现改为了 byte 数组：内容全部能用 LATIN1 表示时每个字符只占一个字节，遇到汉字等无法用 LATIN1 表示的字符时，整个字符串改用 UTF-16（每个字符两个字节）存储。

2.大端序和小端序

3.代码点(Code Point)和代码单元(Code Unit)

4. String为什么不可变：
1.常量池思想 （设计考虑）：字符串常量池的需要，若允许改变，那么将会引起各种逻辑错误，例如 String s1 = “abcd”; String s2 = “abcd”; s1,s2都指向常量池中的同一个字符串，如果 String 是可变类，引用 s1 对 String 对象的修改，会直接导致引用 s2 获取错误的值。
2.HashCode：允许String对象缓存HashCode（第一次计算后保存在 hash 字段中），字符串的不变性保证了同一个对象的HashCode不会改变，可以放心缓存；并且不需要重新计算，这就使得字符串很适合作为 HashMap 中的 key，效率大大提高。
3.安全性：String被许多的库（类）用作参数，例如网络连接URL，文件路径等，还有反射机制所需要的String参数等, 假若String不是固定不变的,将会引起各种安全隐患；多线程中，可变对象的值很可能被其他线程改变，造成不可预期的结果。而不可变的 String 可以自由在多个线程之间共享，不需要同步处理。

/**
 * Illustrative example: validates a string and then uses it.
 * The safety of the second step relies on the string not changing after
 * the check, which is what String's immutability provides.
 *
 * @param s the value to validate and then use
 * @return true once the value has passed the check and been used
 * @throws SecurityException if the value fails the security check
 */
boolean connect(String s) {
    if (!isSecure(s)) {
        throw new SecurityException();
    }
    // If the String could be modified elsewhere, the value used here might
    // differ from the one that was just checked, causing unexpected errors.
    causeProblem(s);
    return true;
}

5.String 类是如何实现不可变的？
1、private final成员变量
2、Public 的方法都是复制一份数据
String 有很多 public 方法，这些方法都不会修改原对象；结果与原字符串不同时会创建并返回新的 String 对象，比如 substring 方法（结果就是原字符串本身时则直接返回 this，例如 substring(0)）
3、String 是 final 的
String 被 final 修饰，因此我们不可以继承 String，因此就不能通过继承来重写一些方法。
4、构造函数深拷贝
当传入可变数组 value[] 时，进行 copy 而不是直接将 value[] 赋值给内部变量，直接赋值其实只是复制了引用。

// Defensive copy: the field receives a fresh copy of the caller's array, so
// later changes to the caller's array cannot affect the string.
// NOTE(review): Arrays.copyOf on a char[] yields a char[], so this form only
// type-checks against a char[] value field, i.e. the older (JDK 8 and
// earlier) layout, not the byte[] field shown earlier in the article.
public String(char value[]) {
    this.value = Arrays.copyOf(value, value.length);
}

另外一些函数：

public boolean startsWith(String prefix, int toffset);// whether the text beginning at toffset starts with prefix
public boolean startsWith(String prefix);// whether the string starts with prefix

public boolean endsWith(String suffix);// whether the string ends with suffix

public int indexOf(int ch);// ch is the code point value of the character
public int indexOf(int ch, int fromIndex);// the search starts at fromIndex

public int lastIndexOf(int ch);
public int lastIndexOf(int ch, int fromIndex);

public int indexOf(String str);
public int indexOf(String str, int fromIndex);

public int lastIndexOf(String str);
public int lastIndexOf(String str, int fromIndex) ;

public boolean matches(String regex);// regular-expression match
public String replaceFirst(String regex, String replacement);
public String replaceAll(String regex, String replacement) ;
public String[] split(String regex, int limit);// limit restricts the number of elements in the result array
public String[] split(String regex);

public String replace(CharSequence target, CharSequence replacement) ;
public boolean contains(CharSequence s);

public String toLowerCase(Locale locale);// converts to lower case
public String toLowerCase();
public String toUpperCase(Locale locale);// converts to upper case
public String toUpperCase();

public String trim() ;// removes leading and trailing whitespace (space, tab, newline)
public String strip() ;// new in Java 11; also removes Unicode whitespace, which trim() does not
public String stripLeading();// removes leading whitespace
public String stripTrailing();// removes trailing whitespace
public boolean isBlank();// whether the string is empty or contains only whitespace

NJU_lemon

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
一篇文章搞定 Java String 源码分析

String源码分析之前总是对String的各种方法和实现原理一知半解，花了好久终于梳理好了常用方法的源码另外关于String常见的面试题，各种对象的"=="问题，请参考这篇博客，写的很清楚1.实现的接口java.io.Serializable这个序列化接口没有任何方法和域，仅用于标识序列化的语意。Comparable<String>这个接口只有一个compareTo(T 0)接口，用于对两个实例化对象比较大小。CharSequence这个接口是一个只读的字符序列。包括leng
复制链接

扫一扫