go源码之strings.go 详解一_go string 源码-CSDN博客

本文链接：https://blog.csdn.net/qq_38207878/article/details/131368482

go源码之strings.go 详解一

源码视频解析见b站@蚂蚁过肩被摔

explode

将 s 按 UTF-8 字符拆分成字符串切片，n 为最多拆出的份数；当 n < 0 或 n 大于字符个数时，按字符个数逐个拆分。如果字符个数多于 n，则最后一个元素为剩余的整段字符串。

// explode splits s into a slice of UTF-8 encoded characters, one string per
// character, producing at most n elements. When n is negative or larger than
// the number of characters in s, every character gets its own element. When s
// has more than n characters, the final element holds the unsplit remainder.
func explode(s string, n int) []string {
	total := utf8.RuneCountInString(s)
	if n < 0 || n > total {
		n = total // clamp the requested count to the number of characters
	}
	parts := make([]string, n)
	if n == 0 {
		return parts
	}
	last := n - 1
	// Peel one leading character off s for each slot except the last.
	for i := range parts[:last] {
		_, width := utf8.DecodeRuneInString(s)
		parts[i], s = s[:width], s[width:]
	}
	// Whatever is left (one character or a longer tail) fills the last slot.
	parts[last] = s
	return parts
}

Count

统计 substr 在 s 中不重叠出现的次数；当 substr 为空字符串时，返回 s 的 UTF-8 字符个数加 1。

// Count returns the number of non-overlapping occurrences of substr in s.
// For an empty substr it returns the number of UTF-8 characters in s plus one.
func Count(s, substr string) int {
	switch len(substr) {
	case 0:
		return utf8.RuneCountInString(s) + 1
	case 1:
		// A single byte is counted directly by bytealg.CountString.
		return bytealg.CountString(s, substr[0])
	}
	matches := 0
	remaining := s
	for {
		pos := Index(remaining, substr)
		if pos == -1 {
			return matches
		}
		matches++
		// Resume the search just past the match, so occurrences never overlap.
		remaining = remaining[pos+len(substr):]
	}
}

Contains方法组

// Contains reports whether substr occurs anywhere within s.
func Contains(s, substr string) bool {
	pos := Index(s, substr)
	return pos >= 0
}
// ContainsAny reports whether any character of chars occurs within s.
func ContainsAny(s, chars string) bool {
	pos := IndexAny(s, chars)
	return pos >= 0
}
// ContainsRune reports whether the rune r occurs within s.
func ContainsRune(s string, r rune) bool {
	pos := IndexRune(s, r)
	return pos >= 0
}

LastIndexByte

找到字节 c 在 s 中最后一次出现的下标，不存在返回 -1

// LastIndexByte returns the index of the last occurrence of the byte c in s,
// or -1 if c does not occur in s.
func LastIndexByte(s string, c byte) int {
	// pos is one past the byte being examined, so the scan ends when it hits 0.
	for pos := len(s); pos > 0; pos-- {
		if s[pos-1] == c {
			return pos - 1
		}
	}
	return -1
}

LastIndex

找到sub在s中出现的最后一个下标

// LastIndex returns the byte index of the last occurrence of substr in s, or
// -1 if substr does not occur in s. An empty substr yields len(s).
func LastIndex(s, substr string) int {
	n := len(substr)
	// Cheap special cases that need no hashing.
	if n == 0 {
		return len(s)
	}
	if n == 1 {
		return LastIndexByte(s, substr[0])
	}
	if n == len(s) {
		if substr == s {
			return 0
		}
		return -1
	}
	if n > len(s) {
		return -1
	}

	// Rabin-Karp search from the end of the string.
	target, pow := bytealg.HashStrRev(substr)
	start := len(s) - n // index of the right-most candidate window

	// Hash the right-most window, feeding its bytes from right to left.
	var window uint32
	for i := len(s) - 1; i >= start; i-- {
		window = window*bytealg.PrimeRK + uint32(s[i])
	}
	// A hash hit is only a candidate; confirm it by comparing the bytes.
	if window == target && s[start:] == substr {
		return start
	}

	// Slide the window one byte to the left per step: scale the hash, take in
	// the byte entering on the left (s[i]) and drop the byte leaving on the
	// right (s[i+n]).
	for i := start - 1; i >= 0; i-- {
		window = window*bytealg.PrimeRK + uint32(s[i]) - pow*uint32(s[i+n])
		if window == target && s[i:i+n] == substr {
			return i
		}
	}
	return -1
}

IndexRune

找到一个unicode字符在s的首个索引

// IndexRune returns the byte index of the first occurrence of the Unicode
// code point r in s, or -1 if r does not occur in s. When r is
// utf8.RuneError, it returns the index of the first position at which ranging
// over s yields utf8.RuneError.
func IndexRune(s string, r rune) int {
	// Runes below utf8.RuneSelf are encoded as one byte: search for that byte.
	if 0 <= r && r < utf8.RuneSelf {
		return IndexByte(s, byte(r))
	}
	if r == utf8.RuneError {
		// Ranging decodes s rune by rune; report the first RuneError produced.
		for pos, decoded := range s {
			if decoded == utf8.RuneError {
				return pos
			}
		}
		return -1
	}
	// Runes rejected by utf8.ValidRune are reported as not found.
	if !utf8.ValidRune(r) {
		return -1
	}
	// General case: search for the rune's UTF-8 encoding as a substring.
	return Index(s, string(r))
}

IndexAny

找到chars中任意一个unicode在s的第一个下标

// IndexAny returns the byte index in s of the first occurrence of any Unicode
// code point from chars, or -1 if none occurs or chars is empty.
func IndexAny(s, chars string) int {
	switch len(chars) {
	case 0:
		return -1
	case 1:
		// A one-byte chars is a single rune; a byte at or above utf8.RuneSelf
		// is searched for as utf8.RuneError.
		only := rune(chars[0])
		if only >= utf8.RuneSelf {
			only = utf8.RuneError
		}
		return IndexRune(s, only)
	}
	// For longer s, try a bitset over chars so each byte of s is one lookup.
	// NOTE(review): the threshold 8 looks like a tuning heuristic; confirm.
	if len(s) > 8 {
		set, allASCII := makeASCIISet(chars)
		if allASCII {
			for i := 0; i < len(s); i++ {
				if set.contains(s[i]) {
					return i
				}
			}
			return -1
		}
	}
	// General path: look each rune of s up in chars.
	for pos, r := range s {
		if IndexRune(chars, r) >= 0 {
			return pos
		}
	}
	return -1
}

ASCIISet

奇思妙想

// asciiSet is a 256-bit bitmap stored as eight 32-bit words. Bit c%32 of word
// c/32 records byte c; only bits for bytes below utf8.RuneSelf are ever set.
type asciiSet [8]uint32

// makeASCIISet builds the set of bytes found in chars and reports whether
// every byte of chars is ASCII. It stops at the first non-ASCII byte and
// returns false together with the set built so far.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
	for _, c := range []byte(chars) {
		if c >= utf8.RuneSelf {
			return as, false
		}
		// c>>5 selects the word (c/32) and c&31 the bit within it (c%32).
		word, bit := c>>5, uint32(1)<<(c&31)
		as[word] |= bit
	}
	return as, true
}

// contains reports whether c is inside the set.
func (as *asciiSet) contains(c byte) bool {
	word := as[c>>5]
	mask := uint32(1) << (c & 31)
	return word&mask != 0
}

LastIndexAny

见@IndexAny ，找到chars中任一个unicode字符在s的最后一个下标

// LastIndexAny returns the byte index in s of the last occurrence of any
// Unicode code point from chars, or -1 if none occurs or chars is empty.
func LastIndexAny(s, chars string) int {
	if len(chars) == 0 {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) == 1 {
		// A one-byte s is a single rune; a byte at or above utf8.RuneSelf is
		// looked up as utf8.RuneError.
		only := rune(s[0])
		if only >= utf8.RuneSelf {
			only = utf8.RuneError
		}
		if IndexRune(chars, only) < 0 {
			return -1
		}
		return 0
	}
	// For longer s, try a bitset over chars and scan the bytes of s backwards.
	if len(s) > 8 {
		set, allASCII := makeASCIISet(chars)
		if allASCII {
			for end := len(s); end > 0; end-- {
				if set.contains(s[end-1]) {
					return end - 1
				}
			}
			return -1
		}
	}
	if len(chars) == 1 {
		// Only one rune to look for: compare it against each rune decoded
		// from the end of s.
		want := rune(chars[0])
		if want >= utf8.RuneSelf {
			want = utf8.RuneError
		}
		for end := len(s); end > 0; {
			got, width := utf8.DecodeLastRuneInString(s[:end])
			end -= width // end now indexes the start of the decoded rune
			if got == want {
				return end
			}
		}
		return -1
	}
	// General path: decode runes from the end of s and look each up in chars.
	for end := len(s); end > 0; {
		got, width := utf8.DecodeLastRuneInString(s[:end])
		end -= width // end now indexes the start of the decoded rune
		if IndexRune(chars, got) >= 0 {
			return end
		}
	}
	return -1
}

LastIndexByte

找到byte c在s的最后下标

// LastIndexByte returns the index of the last occurrence of the byte c in s,
// or -1 if c does not occur in s.
func LastIndexByte(s string, c byte) int {
	i := len(s) - 1
	// Walk backwards until a match is found; running off the front leaves
	// i at -1, which is exactly the "not found" result.
	for i >= 0 && s[i] != c {
		i--
	}
	return i
}

Index

获取sub在s出现的首个下标

// Index returns the byte index of the first occurrence of substr in s, or -1
// if substr does not occur in s. An empty substr yields 0.
func Index(s, substr string) int {
	n := len(substr)
	switch {
	case n == 0: // an empty substr matches at the very start
		return 0
	case n == 1: // a single byte: delegate to IndexByte
		return IndexByte(s, substr[0])
	case n == len(s): // equal lengths: only a whole-string match is possible
		if substr == s {
			return 0
		}
		return -1
	case n > len(s): // substr is longer than s, so it cannot occur
		return -1
	case n <= bytealg.MaxLen:
		// Use brute force when s and substr both are small
		if len(s) <= bytealg.MaxBruteForce {
			return bytealg.IndexString(s, substr)
		}
		// n >= 2 here, so the first two bytes of substr can serve as a cheap filter.
		c0 := substr[0]
		c1 := substr[1]
		i := 0
		t := len(s) - n + 1 // one past the last index at which a match can start
		fails := 0          // candidate positions that turned out not to match
		for i < t {
			if s[i] != c0 {
				// IndexByte is faster than bytealg.IndexString, so use it as long as
				// we're not getting lots of false positives.
				o := IndexByte(s[i+1:t], c0)
				if o < 0 {
					return -1
				}
				i += o + 1
			}
			// s[i] == c0 here; check the second byte before the full comparison.
			if s[i+1] == c1 && s[i:i+n] == substr {
				return i
			}
			fails++
			i++
			// Switch to bytealg.IndexString when IndexByte produces too many false positives.
			if fails > bytealg.Cutover(i) {
				r := bytealg.IndexString(s[i:], substr)
				if r >= 0 {
					return r + i // r is relative to s[i:]
				}
				return -1
			}
		}
		return -1
	}
	// substr is longer than bytealg.MaxLen: same first-byte scan as above, but
	// the fallback is Rabin-Karp instead of bytealg.IndexString.
	c0 := substr[0]
	c1 := substr[1]
	i := 0
	t := len(s) - n + 1 // one past the last index at which a match can start
	fails := 0          // candidate positions that turned out not to match
	for i < t {
		if s[i] != c0 {
			// Jump to the next occurrence of the first byte of substr.
			o := IndexByte(s[i+1:t], c0)
			if o < 0 {
				return -1
			}
			i += o + 1
		}
		// s[i] == c0 here; check the second byte before the full comparison.
		if s[i+1] == c1 && s[i:i+n] == substr {
			return i
		}
		i++
		fails++
		// The threshold parses as 4 + (i >> 4): the allowed number of failures
		// grows with the distance already scanned.
		if fails >= 4+i>>4 && i < t {
			// See comment in ../bytes/bytes.go.
			j := bytealg.IndexRabinKarp(s[i:], substr)
			if j < 0 {
				return -1
			}
			return i + j // j is relative to s[i:]
		}
	}
	return -1
}

引用的方法

utf8.RuneCountInString(s) // 获取utf8字符个数
utf8.DecodeRuneInString(s) //获取 s首个utf8字符及其长度
bytealg.CountString(s, substr[0]) // char出现在s的次数
HashStrRev：按从后往前的顺序计算 sep 的 Rabin-Karp hash 值，并返回对应的乘法因子（PrimeRK 的 len(sep) 次方，按 uint32 溢出截断）

// HashStrRev returns the hash of the reverse of sep together with the
// multiplicative factor used by the Rabin-Karp rolling hash. The hash is
// built by feeding the bytes of sep from last to first with PrimeRK as the
// multiplier; the factor is PrimeRK raised to the power len(sep). Both values
// use wrapping uint32 arithmetic.
func HashStrRev(sep string) (uint32, uint32) {
	var hash uint32
	for end := len(sep); end > 0; end-- {
		hash = hash*PrimeRK + uint32(sep[end-1])
	}
	// Exponentiation by squaring: walk the bits of len(sep) from the lowest,
	// multiplying in the current square whenever the bit is set.
	var pow, sq uint32 = 1, PrimeRK
	for bits := len(sep); bits > 0; bits >>= 1 {
		if bits&1 == 1 {
			pow *= sq
		}
		sq *= sq
	}
	return hash, pow
}