一、标记方法和过程
从golang的GC发展历程可以看到,它其实是从保守式GC到准确式GC发展的一个过程,它是追踪式垃圾回收算法(Tracing garbage collection)(另外一种是计数器方法( Reference counting ))。在前面提到过,保守式和准确式GC的主要区别在于对指针类型的准确判断。既然是准确式GC,那么意味着一些全局变量和寄存器等的变量就无法成为GC过程中root根了。其实所谓的标记算法,在GO中使用的是三色标记-清除算法。所以标记当然是使用的三色标记。三色标记最大的优点就是可以支持并行,这可是一个非常重要的特点。
所谓标记的过程,就是遍历root能到达的所有的地方,然后根据实际情况,对其标记为黑、灰。一开始所有的内存对象默认为白色。在看源码的时候儿,可以和前面的提到的增量式垃圾回收和准确式垃圾回收以及其它三色标记算法进行一下对比学习分析。
二、Mark准备源码分析
Go语言的三色标记过程主要在mgcmark.go这个文件中,而在前面分析gcStart这个函数时有:
func gcStart(mode gcMode, forceTrigger bool) {
...
if mode == gcBackgroundMode {
gcBgMarkStartWorkers()
}
...
}
// gcBgMarkStartWorkers walks allp and, for every P that does not yet
// have a background mark worker registered, starts one.
//
// Background marking is done by one G per P, so each P needs its own
// worker goroutine.
func gcBgMarkStartWorkers() {
	for _, pp := range &allp {
		// The walk ends at the first nil or dead P.
		if pp == nil || pp.status == _Pdead {
			break
		}
		if pp.gcBgMarkWorker != 0 {
			// This P already has a worker.
			continue
		}
		// Start the worker, sleep on the bgMarkReady note until it is
		// woken, then clear the note so it can be reused for the next P.
		go gcBgMarkWorker(pp)
		notetsleepg(&work.bgMarkReady, -1)
		noteclear(&work.bgMarkReady)
	}
}
// gcBgMarkWorker is the body of the background mark worker goroutine
// started for the P passed as _p_.
//
// After waking work.bgMarkReady it loops forever: park until woken,
// check that it is still the worker registered on _p_ (exit the loop
// otherwise), run gcDrain in the mode given by _p_.gcMarkWorkerMode,
// add the elapsed time to the matching gcController counter, and, when
// work.nwait has climbed back to work.nproc and gcMarkWorkAvailable
// reports no work, call gcMarkDone.
func gcBgMarkWorker(_p_ *p) {
gp := getg()
// parkInfo carries the state the gopark unlock function needs.
type parkInfo struct {
m muintptr // Release this m on park.
attach puintptr // If non-nil, attach to this p on park.
}
// We pass park to a gopark unlock function, so it can't be on
// the stack (see gopark). Prevent deadlock from recursively
// starting GC by disabling preemption.
gp.m.preemptoff = "GC worker init"
park := new(parkInfo)
gp.m.preemptoff = ""
park.m.set(acquirem())
park.attach.set(_p_)
// Inform gcBgMarkStartWorkers that this worker is ready.
// After this point, the background mark worker is scheduled
// cooperatively by gcController.findRunnable. Hence, it must
// never be preempted, as this would put it into _Grunnable
// and put it on a run queue. Instead, when the preempt flag
// is set, this puts itself into _Gwaiting to be woken up by
// gcController.findRunnable at the appropriate time.
notewakeup(&work.bgMarkReady)
for {
// Go to sleep until woken by gcController.findRunnable.
// We can't releasem yet since even the call to gopark
// may be preempted.
gopark(func(g *g, parkp unsafe.Pointer) bool {
park := (*parkInfo)(parkp)
// The worker G is no longer running, so it's
// now safe to allow preemption.
releasem(park.m.ptr())
// If the worker isn't attached to its P,
// attach now. During initialization and after
// a phase change, the worker may have been
// running on a different P. As soon as we
// attach, the owner P may schedule the
// worker, so this must be done after the G is
// stopped.
if park.attach != 0 {
p := park.attach.ptr()
park.attach.set(nil)
// cas the worker because we may be
// racing with a new worker starting
// on this P.
if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
// The P got a new worker.
// Exit this worker.
return false
}
}
return true
}, unsafe.Pointer(park), "GC worker (idle)", traceEvGoBlock, 0)
// Loop until the P dies and disassociates this
// worker (the P may later be reused, in which case
// it will get a new worker) or we failed to associate.
if _p_.gcBgMarkWorker.ptr() != gp {
break
}
// Disable preemption so we can use the gcw. If the
// scheduler wants to preempt us, we'll stop draining,
// dispose the gcw, and then preempt.
park.m.set(acquirem())
if gcBlackenEnabled == 0 {
throw("gcBgMarkWorker: blackening not enabled")
}
startTime := nanotime()
// Leave the set of waiting workers. If the value after the
// decrement equals nproc, the value before it exceeded nproc.
decnwait := atomic.Xadd(&work.nwait, -1)
if decnwait == work.nproc {
println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
throw("work.nwait was > work.nproc")
}
systemstack(func() {
// Mark our goroutine preemptible so its stack
// can be scanned. This lets two mark workers
// scan each other (otherwise, they would
// deadlock). We must not modify anything on
// the G stack. However, stack shrinking is
// disabled for mark workers, so it is safe to
// read from the G stack.
casgstatus(gp, _Grunning, _Gwaiting)
// The worker mode selects the flags passed to gcDrain.
switch _p_.gcMarkWorkerMode {
default:
throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
case gcMarkWorkerDedicatedMode:
gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
case gcMarkWorkerFractionalMode:
gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
case gcMarkWorkerIdleMode:
gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
}
casgstatus(gp, _Gwaiting, _Grunning)
})
// If we are nearing the end of mark, dispose
// of the cache promptly. We must do this
// before signaling that we're no longer
// working so that other workers can't observe
// no workers and no work while we have this
// cached, and before we compute done.
if gcBlackenPromptly {
_p_.gcw.dispose()
}
// Account for time.
duration := nanotime() - startTime
switch _p_.gcMarkWorkerMode {
case gcMarkWorkerDedicatedMode:
atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
case gcMarkWorkerFractionalMode:
atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
case gcMarkWorkerIdleMode:
atomic.Xaddint64(&gcController.idleMarkTime, duration)
}
// Was this the last worker and did we run out
// of work?
incnwait := atomic.Xadd(&work.nwait, +1)
if incnwait > work.nproc {
println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
"work.nwait=", incnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
// If this worker reached a background mark completion
// point, signal the main GC goroutine.
if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
// Make this G preemptible and disassociate it
// as the worker for this P so
// findRunnableGCWorker doesn't try to
// schedule it.
_p_.gcBgMarkWorker.set(nil)
releasem(park.m.ptr())
gcMarkDone()
// Disable preemption and prepare to reattach
// to the P.
//
// We may be running on a different P at this
// point, so we can't reattach until this G is
// parked.
park.m.set(acquirem())
park.attach.set(_p_)
}
}
}
这个函数的目的就是启动Mark阶段的协程,但这个协程只有到标记启动时才会工作。在这一系列的调用中,除了对GMP的协调,一个重要的函数就是gcDrain这个函数,它相当重要。但先把它放一边,要继续分析一下,和Mark相关的一些准备函数和事件通知参数,然后回过头来再分析这个gcDrain。
继续向下看gcStart中几个函数:
func gcStart(mode gcMode, forceTrigger bool) {
...
gcResetMarkState()
......
if mode == gcBackgroundMode { // Do as much work concurrently as possible
gcController.startCycle()
work.heapGoal = memstats.next_gc
// Enter concurrent mark phase and enable
// write barriers.
//
// Because the world is stopped, all Ps will
// observe that write barriers are enabled by
// the time we start the world and begin
// scanning.
//
// It's necessary to enable write barriers
// during the scan phase for several reasons:
//
// They must be enabled for writes to higher
// stack frames before we scan stacks and
// install stack barriers because this is how
// we track writes to inactive stack frames.
// (Alternatively, we could not install stack
// barriers over frame boundaries with
// up-pointers).
//
// They must be enabled before assists are
// enabled because they must be enabled before
// any non-leaf heap objects are marked. Since
// allocations are blocked until assists can
// happen, we want enable assists as early as
// possible.
setGCPhase(_GCmark)
gcBgMarkPrepare() // Must happen before assist enable.
gcMarkRootPrepare()
// Mark all active tinyalloc blocks. Since we're
// allocating from these, they need to be black like
// other allocations. The alternative is to blacken
// the tiny block on every allocation from it, which
// would slow down the tiny allocator.
gcMarkTinyAllocs()
// At this point all Ps have enabled the write
// barrier, thus maintaining the no white to
// black invariant. Enable mutator assists to
// put back-pressure on fast allocating
// mutators.
atomic.Store(&gcBlackenEnabled, 1)
// Assists and workers can start the moment we start
// the world.
gcController.markStartTime = now
// Concurrent mark.
systemstack(startTheWorldWithSema)
now = nanotime()
work.pauseNS += now - work.pauseStart
work.tMark = now
} else {
t := nanotime()
work.tMark, work.tMarkTerm = t, t
work.heapGoal = work.heap0
if forced {
memstats.numforcedgc++
}
// Perform mark termination. This will restart the world.
gcMarkTermination()
}
...
}
在这里,通过gcResetMarkState来重置一些全局状态和G相关栈的扫描状态,gcBgMarkPrepare函数来设置相关的任务数量,可以看一下这个函数非常简单,只是两个参数的设置。gcMarkRootPrepare对根扫描队列(栈、全局参数及其它杂项)排队并初始化一些与扫描相关的状态,主要是要设置一下任务队列的数量。可以看一下它的最后一行:
work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
而gcMarkTinyAllocs 这个函数用来标记所有待合并的tiny alloc对象,这种操作可以提高内存的使用效率。gcMarkTermination函数终结标记过程,并启动世界(停止STW),这个函数内要把写屏障等停掉,设置清理标记唤醒清理,在STW结束后,开始Sweep。
systemstack(startTheWorldWithSema)这个函数启动世界,同时,后台任务开始启动标记工作。
三、gcDrain源码分析
在上面的代码看到,启动这个函数有三种情况:
func gcBgMarkWorker(_p_ *p) {
......
//处理器专门负责标记对象,不会被调度器抢占;
case gcMarkWorkerDedicatedMode:
gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
//当垃圾收集的后台 CPU 使用率达不到预期时(默认为 25%),启动该类型的工作协程帮助垃圾收集达到利用率的目标,
//因为它只占用同一个 CPU 的部分资源,所以可以被调度;
case gcMarkWorkerFractionalMode:
gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
//当处理器没有可以执行的 Goroutine 时,它会运行垃圾收集的标记任务直到被抢占;
case gcMarkWorkerIdleMode:
gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
}
......
}
runtime.gcControllerState.startCycle会根据全局处理器的个数以及垃圾收集的 CPU 利用率计算出上述的 dedicatedMarkWorkersNeeded和fractionalUtilizationGoal以决定不同模式的工作协程的数量。
好,前置的基本都说明了,开始看这个函数:
// gcDrain scans roots and objects in work buffers, blackening grey
// objects until all roots and work buffers have been drained.
//
// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt
// is set. This implies gcDrainNoBlock.
//
// If flags&gcDrainIdle != 0, gcDrain returns when there is other work
// to do. This implies gcDrainNoBlock.
//
// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is
// unable to get more work. Otherwise, it will block until all
// blocking calls are blocked in gcDrain.
//
// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work
// credit to gcController.bgScanCredit every gcCreditSlack units of
// scan work.
//
//go:nowritebarrier
func gcDrain(gcw *gcWork, flags gcDrainFlags) {
// Draining is only valid while the write barrier is needed;
// anything else means the GC phase is wrong.
if !writeBarrier.needed {
throw("gcDrain phase incorrect")
}
// gp is the curg of the current M; its preempt flag is polled below.
gp := getg().m.curg
// preemptible: stop draining once gp.preempt is set.
preemptible := flags&gcDrainUntilPreempt != 0
// blocking: true only when none of UntilPreempt, Idle or NoBlock is set.
blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainNoBlock) == 0
// flushBgCredit: whether scan work is also passed to gcFlushBgCredit.
flushBgCredit := flags&gcDrainFlushBgCredit != 0
// idle: whether to call pollWork and stop when it reports true.
idle := flags&gcDrainIdle != 0
// Scan work already recorded in gcw on entry.
initScanWork := gcw.scanWork
// idleCheck is the scan work at which to perform the next
// idle check with the scheduler.
// It starts idleCheckThreshold units beyond the initial scan work.
idleCheck := initScanWork + idleCheckThreshold
// Drain root marking jobs.
// Root jobs are claimed one at a time by atomically advancing
// work.markrootNext until the index reaches work.markrootJobs.
if work.markrootNext < work.markrootJobs {
for !(preemptible && gp.preempt) {
job := atomic.Xadd(&work.markrootNext, +1) - 1
if job >= work.markrootJobs {
break
}
// Run root job number job; gcw is passed along to receive the work.
markroot(gcw, job)
if idle && pollWork() {
goto done
}
}
}
// Drain heap marking jobs. Loop until preempted (when preemptible)
// or until no further work item can be obtained.
for !(preemptible && gp.preempt) {
// Try to keep work available on the global queue. We used to
// check if there were waiting workers, but it's better to
// just keep work available than to make workers wait. In the
// worst case, we'll do O(log(_WorkbufSize)) unnecessary
// balances.
// When work.full is zero, move some locally cached work back via balance.
if work.full == 0 {
gcw.balance()
}
// Fetch the next work item: a blocking get, or else the
// non-blocking tryGetFast with tryGet as the fallback.
var b uintptr
if blocking {
b = gcw.get()
} else {
b = gcw.tryGetFast()
if b == 0 {
b = gcw.tryGet()
}
}
if b == 0 {
// work barrier reached or tryGet failed.
break
}
// Scan the object at b; gcw is passed so that the pointers found
// can be queued as further work.
scanobject(b, gcw)
// Flush background scan work credit to the global
// account if we've accumulated enough locally so
// mutator assists can draw on it.
// The threshold for flushing is gcCreditSlack units of scan work.
if gcw.scanWork >= gcCreditSlack {
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
// Only the work done since entry (or since the last flush) is credited.
if flushBgCredit {
gcFlushBgCredit(gcw.scanWork - initScanWork)
initScanWork = 0
}
idleCheck -= gcw.scanWork
gcw.scanWork = 0
// In idle mode, poll for other work once the idle-check budget is used up.
if idle && idleCheck <= 0 {
idleCheck += idleCheckThreshold
if pollWork() {
break
}
}
}
}
// In blocking mode, write barriers are not allowed after this
// point because we must preserve the condition that the work
// buffers are empty.
done:
// Flush remaining scan work credit to the global counter and,
// if requested, to gcFlushBgCredit.
if gcw.scanWork > 0 {
atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
if flushBgCredit {
gcFlushBgCredit(gcw.scanWork - initScanWork)
}
gcw.scanWork = 0
}
}
基本上,gcDrain就是整个标记过程中的核心函数,Drain排出,流出,把没用的垃圾标记扔出,等着后面的Sweep。函数有两个参数,第一个就是前面提到过的gcWork,这是处理缓存和缓存记录的一个重要的数据结构体,第二个是gcDrainFlags,这个参数对协程调度有着重要的作用。
然后看一下Markroot:
// markroot runs root marking job number i, passing gcw to the scanning
// helpers so they can queue the work they find.
//
// The job index space is split into consecutive ranges (mcache
// flushes, data shards, BSS shards, span roots, goroutine stacks,
// rescan stacks) whose sizes come from the counters in work, plus the
// fixed jobs fixedRootFinalizers and fixedRootFreeGStacks. An index
// outside every range throws "markroot: bad index".
func markroot(gcw *gcWork, i uint32) {
// TODO(austin): This is a bit ridiculous. Compute and store
// the bases in gcMarkRootPrepare instead of the counts.
// Derive the first index of each job category from the per-category counts.
baseFlushCache := uint32(fixedRootCount)
baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
baseBSS := baseData + uint32(work.nDataRoots)
baseSpans := baseBSS + uint32(work.nBSSRoots)
baseStacks := baseSpans + uint32(work.nSpanRoots)
baseRescan := baseStacks + uint32(work.nStackRoots)
end := baseRescan + uint32(work.nRescanRoots)
// Note: if you add a case here, please also update heapdump.go:dumproots.
switch {
case baseFlushCache <= i && i < baseData: // flush mcache number i-baseFlushCache
flushmcache(int(i - baseFlushCache))
// Data roots: scan shard i-baseData of each active module's data section.
case baseData <= i && i < baseBSS:
for _, datap := range activeModules() {
markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-baseData))
}
// BSS roots: scan shard i-baseBSS of each active module's bss section.
case baseBSS <= i && i < baseSpans:
for _, datap := range activeModules() {
markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-baseBSS))
}
// Finalizer roots: scan the used entries of every block on the allfin list.
case i == fixedRootFinalizers:
for fb := allfin; fb != nil; fb = fb.alllink {
cnt := uintptr(atomic.Load(&fb.cnt))
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), cnt*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
}
// Free G stacks, via markrootFreeGStacks.
case i == fixedRootFreeGStacks:
// Only do this once per GC cycle; preferably
// concurrently.
if !work.markrootDone {
// Switch to the system stack so we can call
// stackfree.
systemstack(markrootFreeGStacks)
}
// Span roots: shard i-baseSpans is handled by markrootSpans.
case baseSpans <= i && i < baseStacks:
// mark MSpan.specials
markrootSpans(gcw, int(i-baseSpans))
default:
// the rest is scanning goroutine stacks
var gp *g
if baseStacks <= i && i < baseRescan {
gp = allgs[i-baseStacks]
} else if baseRescan <= i && i < end {
gp = work.rescan.list[i-baseRescan].ptr()
if gp.gcRescan != int32(i-baseRescan) {
// Looking for issue #17099.
println("runtime: gp", gp, "found at rescan index", i-baseRescan, "but should be at", gp.gcRescan)
throw("bad g rescan index")
}
} else {
throw("markroot: bad index")
}
// remember when we've first observed the G blocked
// needed only to output in traceback
status := readgstatus(gp) // We are not in a scan state
if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
gp.waitsince = work.tstart
}
// scang must be done on the system stack in case
// we're trying to scan our own stack.
// systemstack runs the closure below on the system stack.
systemstack(func() {
// If this is a self-scan, put the user G in
// _Gwaiting to prevent self-deadlock. It may
// already be in _Gwaiting if this is a mark
// worker or we're in mark termination.
userG := getg().m.curg
selfScan := gp == userG && readgstatus(userG) == _Grunning
if selfScan {
casgstatus(userG, _Grunning, _Gwaiting)
userG.waitreason = "garbage collection scan"
}
// TODO: scang blocks until gp's stack has
// been scanned, which may take a while for
// running goroutines. Consider doing this in
// two phases where the first is non-blocking:
// we scan the stacks we can and ask running
// goroutines to scan themselves; and the
// second blocks.
scang(gp, gcw)
if selfScan {
// Undo the status change made for the self-scan.
casgstatus(userG, _Gwaiting, _Grunning)
}
})
}
}
在此函数里有两个重要函数markrootBlock和scanblock
// markrootBlock scans one shard of the memory block [b0, b0+n0) using
// the pointer mask ptrmask0. Shards are rootBlockBytes long and shard
// selects which one; a shard that starts at or past the end of the
// block is a no-op, and the final shard is clipped to the block end.
//
//go:nowritebarrier
func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
	// A shard must cover a whole number of mask bytes, otherwise the
	// byte offset into ptrmask0 computed below would be wrong.
	if rootBlockBytes%(8*sys.PtrSize) != 0 {
		throw("rootBlockBytes must be a multiple of 8*ptrSize")
	}

	limit := b0 + n0
	start := b0 + uintptr(shard)*rootBlockBytes
	if start >= limit {
		return
	}

	// One mask byte describes 8 pointer-sized words.
	maskOff := uintptr(shard) * (rootBlockBytes / (8 * sys.PtrSize))
	mask := (*uint8)(add(unsafe.Pointer(ptrmask0), maskOff))

	size := uintptr(rootBlockBytes)
	if start+size > limit {
		size = limit - start
	}

	scanblock(start, size, mask, gcw)
}
markrootBlock则是根据ptrmask0对相关的内存块范围内进行扫描,最后也需要调用scanblock函数。
// scanblock walks the n0 bytes starting at b0 the way scanobject
// would, except that pointer locations come from the explicit bitmap
// ptrmask rather than from the heap bitmap.
//
// It is used for non-heap roots, so gcw.bytesMarked and gcw.scanWork
// are left untouched.
//
//go:nowritebarrier
func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) {
	// Work on copies so that a stack trace caused by a throw still
	// shows the block base and extent the caller passed in.
	base, size := b0, n0

	heapLo := mheap_.arena_start
	heapHi := mheap_.arena_used

	var off uintptr
	for off < size {
		// Each mask byte describes the next 8 pointer-sized words.
		mask := uint32(*addb(ptrmask, off/(sys.PtrSize*8)))
		if mask == 0 {
			// No pointers in these 8 words; skip them in one step.
			off += sys.PtrSize * 8
			continue
		}
		for bit := 0; bit < 8 && off < size; bit, off = bit+1, off+sys.PtrSize {
			isPtr := mask&1 != 0
			mask >>= 1
			if !isPtr {
				continue
			}
			// Same work as in scanobject; see comments there.
			target := *(*uintptr)(unsafe.Pointer(base + off))
			if target == 0 || target < heapLo || target >= heapHi {
				continue
			}
			if obj, hbits, span, objIndex := heapBitsForObject(target, base, off); obj != 0 {
				greyobject(obj, base, off, hbits, span, gcw, objIndex)
			}
		}
	}
}
scanblock函数是一个通用的扫描函数, 扫描全局变量和栈空间都会用它, 和scanobject不同的是bitmap需要手动传入。
这里面有一个标记灰色的函数greyobject:
// obj is the start of an object with mark mbits.
// If it isn't already marked, mark it and enqueue into gcw.
// base and off are for debugging only and could be removed.
//go:nowritebarrierrec
func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr) {
// obj should be start of allocation, and so must be at least pointer-aligned.
if obj&(sys.PtrSize-1) != 0 {
throw("greyobject: obj not pointer-aligned")
}
mbits := span.markBitsForIndex(objIndex)
if useCheckmark {
if !mbits.isMarked() {
printlock()
print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
// Dump the source (base) object
gcDumpObject("base", base, off)
// Dump the object
gcDumpObject("obj", obj, ^uintptr(0))
throw("checkmark found unmarked object")
}
if hbits.isCheckmarked(span.elemsize) {
return
}
hbits.setCheckmarked(span.elemsize)
if !hbits.isCheckmarked(span.elemsize) {
throw("setCheckmarked and isCheckmarked disagree")
}
} else {
if debug.gccheckmark > 0 && span.isFree(objIndex) {
print("runtime: marking free object ", hex(obj), " found at *(", hex(base), "+", hex(off), ")\n")
gcDumpObject("base", base, off)
gcDumpObject("obj", obj, ^uintptr(0))
throw("marking free object")
}
// If marked we have nothing to do.已经正确标记则返回
if mbits.isMarked() {
return
}
// mbits.setMarked() // Avoid extra call overhead with manual inlining.
atomic.Or8(mbits.bytep, mbits.mask)
// If this is a noscan object, fast-track it to black
// instead of greying it.
if !hbits.hasPointers(span.elemsize) {
gcw.bytesMarked += uint64(span.elemsize)
return
}
}
// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
// seems like a nice optimization that can be added back in.
// There needs to be time between the PREFETCH and the use.
// Previously we put the obj in an 8 element buffer that is drained at a rate
// to give the PREFETCH time to do its work.
// Use of PREFETCHNTA might be more appropriate than PREFETCH
//将标记对象放入队列,先放入本地,如果失败则部分放入全局部分进入本地
if !gcw.putFast(obj) {
gcw.put(obj)
}
这个函数是就是实际标记灰色的函数。注意几个调试的地方可以忽略。
可以看一下这个put和 putFast函数:
// put adds obj to the work this gcWork holds for the garbage collector
// to trace. obj must point to the beginning of a heap object or an
// oblet.
//
//go:nowritebarrier
func (w *gcWork) put(obj uintptr) {
	var flushed bool

	buf := w.wbuf1.ptr()
	switch {
	case buf == nil:
		// No buffers yet: initialize, after which wbuf1 is empty.
		w.init()
		buf = w.wbuf1.ptr()
	case buf.nobj == len(buf.obj):
		// Primary buffer is full: swap in the secondary one.
		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
		buf = w.wbuf1.ptr()
		if buf.nobj == len(buf.obj) {
			// Both are full: pass this one to putfull and continue
			// with a buffer obtained from getempty.
			putfull(buf)
			buf = getempty()
			w.wbuf1 = wbufptrOf(buf)
			flushed = true
		}
	}

	buf.obj[buf.nobj] = obj
	buf.nobj++

	// If a buffer went onto the full list, tell the GC controller so it
	// can encourage more workers to run. This is deferred to the end so
	// that w is in a consistent state, since enlistWorker may itself
	// manipulate w.
	if flushed && gcphase == _GCmark {
		gcController.enlistWorker()
	}
}
// putFast stores obj in the primary buffer when that can be done
// immediately and reports true. It reports false, storing nothing,
// when there is no primary buffer or it is full; the caller must then
// call put.
//
//go:nowritebarrier
func (w *gcWork) putFast(obj uintptr) bool {
	buf := w.wbuf1.ptr()
	if buf == nil || buf.nobj == len(buf.obj) {
		return false
	}

	buf.obj[buf.nobj] = obj
	buf.nobj++
	return true
}
这个其实很简单,类似于缓存,直接读缓存速度快但可能无法命中,那么就必须是直接put了。在gcWork中有两个缓存区即wbuf1,wbuf2,putfast把对象直接放到buf1中,如果满了就返回false。而put不仅可以从buf1中放入对象,也可以在其满时交换buf1,buf2,当这两个缓冲区都满的时候儿,还可以去全局缓冲区处理。
回到gcDrain函数中看一下负载平衡相关函数:
// balance gives some of the work cached in this gcWork back to the
// global queue. It does nothing when the gcWork has no buffers, or
// when the secondary buffer is empty and the primary one holds at most
// 4 objects.
//
//go:nowritebarrier
func (w *gcWork) balance() {
	if w.wbuf1 == 0 {
		return
	}

	secondary := w.wbuf2.ptr()
	if secondary.nobj != 0 {
		// Move the whole secondary buffer to the full list and replace it
		// with a buffer obtained from getempty.
		putfull(secondary)
		w.wbuf2 = wbufptrOf(getempty())
	} else {
		primary := w.wbuf1.ptr()
		if primary.nobj <= 4 {
			return
		}
		// Continue with whatever buffer handoff returns.
		w.wbuf1 = wbufptrOf(handoff(primary))
	}

	// We flushed a buffer to the full list, so wake a worker.
	if gcphase == _GCmark {
		gcController.enlistWorker()
	}
}
这个没啥可重点说明的,后面的get和tryGet类似前面的put等,不再赘述。
再回到gcDrain中继续分析,看一下scanobject这个函数:
// scanobject scans the object starting at b, adding pointers to gcw.
// b must point to the beginning of a heap object or an oblet.
// scanobject consults the GC bitmap for the pointer mask and the
// spans for the size of the object.
//
// On return, gcw.bytesMarked has grown by the number of bytes covered
// and gcw.scanWork by the offset at which the scan stopped.
//
//go:nowritebarrier
func scanobject(b uintptr, gcw *gcWork) {
// Note that arena_used may change concurrently during
// scanobject and hence scanobject may encounter a pointer to
// a newly allocated heap object that is *not* in
// [start,used). It will not mark this object; however, we
// know that it was just installed by a mutator, which means
// that mutator will execute a write barrier and take care of
// marking it. This is even more pronounced on relaxed memory
// architectures since we access arena_used without barriers
// or synchronization, but the same logic applies.
// Snapshot the arena bounds used by the pointer range check below.
arena_start := mheap_.arena_start
arena_used := mheap_.arena_used
// Find the bits for b and the size of the object at b.
//
// b is either the beginning of an object, in which case this
// is the size of the object to scan, or it points to an
// oblet, in which case we compute the size to scan below.
// Look up the heap bitmap position and the span for b.
hbits := heapBitsForAddr(b)
s := spanOfUnchecked(b)
// The size to scan starts out as the span's element size.
n := s.elemsize
if n == 0 {
throw("scanobject n == 0")
}
// Objects larger than maxObletBytes are scanned in oblet-sized pieces.
if n > maxObletBytes {
// Large object. Break into oblets for better
// parallelism and lower latency.
if b == s.base() {
// It's possible this is a noscan object (not
// from greyobject, but from other code
// paths), in which case we must *not* enqueue
// oblets since their bitmaps will be
// uninitialized.
if !hbits.hasPointers(n) {
// Bypass the whole scan.
gcw.bytesMarked += uint64(n)
return
}
// Enqueue the other oblets to scan later.
// Some oblets may be in b's scalar tail, but
// these will be marked as "no more pointers",
// so we'll drop out immediately when we go to
// scan those. Every oblet after the first is queued here.
for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
if !gcw.putFast(oblet) {
gcw.put(oblet)
}
}
}
// Compute the size of the oblet. Since this object
// must be a large object, s.base() is the beginning
// of the object.
n = s.base() + s.elemsize - b
if n > maxObletBytes {
n = maxObletBytes
}
}
// Walk the n bytes one pointer-sized word at a time.
var i uintptr
for i = 0; i < n; i += sys.PtrSize {
// Find bits for this word.
if i != 0 {
// Avoid needless hbits.next() on last iteration.
hbits = hbits.next()
}
// Load bits once. See CL 22712 and issue 16973 for discussion.
bits := hbits.bits()
// During checkmarking, 1-word objects store the checkmark
// in the type bit for the one word. The only one-word objects
// are pointers, or else they'd be merged with other non-pointer
// data into larger allocations.
if i != 1*sys.PtrSize && bits&bitScan == 0 {
break // no more pointers in this object
}
if bits&bitPointer == 0 {
continue // not a pointer
}
// Work here is duplicated in scanblock and above.
// If you make changes here, make changes there too.
// Load the word stored at offset i.
obj := *(*uintptr)(unsafe.Pointer(b + i))
// At this point we have extracted the next potential pointer.
// Check if it points into heap and not back at the current object.
// The unsigned test obj-b >= n rejects values inside [b, b+n).
if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n {
// Mark the object by handing it to greyobject.
if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i); obj != 0 {
greyobject(obj, b, i, hbits, span, gcw, objIndex)
}
}
}
gcw.bytesMarked += uint64(n)
gcw.scanWork += int64(i)
}
这个问题涉及到一个标黑和标灰的问题,从上面的代码可以看出来,标记灰色就是放入队列,就是说放入队列的都是灰色。那么拿出来后就可以认为变成了黑色。
回到更上一层的gcBgMarkWorker函数,它会在结束前调用gcMarkDone(),而此函数又会调用 gcMarkTermination:
// gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
// to mark termination.
//
// This should be called when all mark work has been drained. In mark
// 1, this includes all root marking jobs, global work buffers, and
// active work buffers in assists and background workers; however,
// work may still be cached in per-P work buffers. In mark 2, per-P
// caches are disabled.
//
// The calling context must be preemptible.
//
// Note that it is explicitly okay to have write barriers in this
// function because completion of concurrent mark is best-effort
// anyway. Any work created by write barriers here will be cleaned up
// by mark termination.
func gcMarkDone() {
top:
// work.markDoneSema serializes the transition; it is released on
// every path below.
semacquire(&work.markDoneSema, 0)
// Re-check transition condition under transition lock.
if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
semrelease(&work.markDoneSema)
return
}
// Disallow starting new workers so that any remaining workers
// in the current mark phase will drain out.
//
// TODO(austin): Should dedicated workers keep an eye on this
// and exit gcDrain promptly?
atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff)
if !gcBlackenPromptly {
// Transition from mark 1 to mark 2.
//
// The global work list is empty, but there can still be work
// sitting in the per-P work caches.
// Flush and disable work caches.
// Disallow caching workbufs and indicate that we're in mark 2.
// Setting this flag is what records the move into mark 2.
gcBlackenPromptly = true
// Prevent completion of mark 2 until we've flushed
// cached workbufs.
atomic.Xadd(&work.nwait, -1)
// GC is set up for mark 2. Let Gs blocked on the
// transition lock go while we flush caches.
semrelease(&work.markDoneSema)
systemstack(func() {
// Flush all currently cached workbufs and
// ensure all Ps see gcBlackenPromptly. This
// also blocks until any remaining mark 1
// workers have exited their loop so we can
// start new mark 2 workers.
forEachP(func(_p_ *p) {
_p_.gcw.dispose()
})
})
// Check that roots are marked. We should be able to
// do this before the forEachP, but based on issue
// #16083 there may be a (harmless) race where we can
// enter mark 2 while some workers are still scanning
// stacks. The forEachP ensures these scans are done.
//
// TODO(austin): Figure out the race and fix this
// properly.
gcMarkRootCheck()
// Now we can start up mark 2 workers.
// These additions undo the subtractions made above.
atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff)
incnwait := atomic.Xadd(&work.nwait, +1)
if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
// This loop will make progress because
// gcBlackenPromptly is now true, so it won't
// take this same "if" branch.
goto top
}
} else {
// Transition to mark termination.
now := nanotime()
work.tMarkTerm = now
work.pauseStart = now
getg().m.preemptoff = "gcing"
systemstack(stopTheWorldWithSema)
// The gcphase is _GCmark, it will transition to _GCmarktermination
// below. The important thing is that the wb remains active until
// all marking is complete. This includes writes made by the GC.
// Record that one root marking pass has completed.
work.markrootDone = true
// Disable assists and background workers. We must do
// this before waking blocked assists.
atomic.Store(&gcBlackenEnabled, 0)
// Wake all blocked assists. These will run when we
// start the world again.
gcWakeAllAssists()
// Likewise, release the transition lock. Blocked
// workers and assists will run when we start the
// world again.
semrelease(&work.markDoneSema)
// endCycle depends on all gcWork cache stats being
// flushed. This is ensured by mark 2.
gcController.endCycle()
// Perform mark termination. This will restart the world.
gcMarkTermination()
}
}
// gcMarkTermination runs the mark termination phase. It expects the
// world to be stopped on entry.
//
// In order, it: switches the phase to _GCmarktermination; runs gcMark
// on the system stack; sets the phase to _GCoff and calls gcSweep;
// updates the pause and CPU-time statistics in memstats and work;
// restarts the world; frees stack spans; prints the gctrace line when
// debug.gctrace > 0; and finally releases worldsema.
func gcMarkTermination() {
// World is stopped.
// Start marktermination which includes enabling the write barrier.
atomic.Store(&gcBlackenEnabled, 0)
gcBlackenPromptly = false
setGCPhase(_GCmarktermination)
work.heap1 = memstats.heap_live
startTime := nanotime()
mp := acquirem()
mp.preemptoff = "gcing"
_g_ := getg()
_g_.m.traceback = 2
gp := _g_.m.curg
casgstatus(gp, _Grunning, _Gwaiting)
gp.waitreason = "garbage collection"
// Run gc on the g0 stack. We do this so that the g stack
// we're currently running on will no longer change. Cuts
// the root set down a bit (g0 stacks are not scanned, and
// we don't need to scan gc's internal state). We also
// need to switch to g0 so we can shrink the stack.
systemstack(func() {
gcMark(startTime)
// Must return immediately.
// The outer function's stack may have moved
// during gcMark (it shrinks stacks, including the
// outer function's stack), so we must not refer
// to any of its variables. Return back to the
// non-system stack to pick up the new addresses
// before continuing.
})
systemstack(func() {
work.heap2 = work.bytesMarked
if debug.gccheckmark > 0 {
// Run a full stop-the-world mark using checkmark bits,
// to check that we didn't forget to mark anything during
// the concurrent mark process.
gcResetMarkState()
initCheckmarks()
gcMark(startTime)
clearCheckmarks()
}
// marking is complete so we can turn the write barrier off
setGCPhase(_GCoff)
gcSweep(work.mode)
if debug.gctrace > 1 {
startTime = nanotime()
// The g stacks have been scanned so
// they have gcscanvalid==true and gcworkdone==true.
// Reset these so that all stacks will be rescanned.
gcResetMarkState()
finishsweep_m()
// Still in STW but gcphase is _GCoff, reset to _GCmarktermination
// At this point all objects will be found during the gcMark which
// does a complete STW mark and object scan.
setGCPhase(_GCmarktermination)
gcMark(startTime)
setGCPhase(_GCoff) // marking is done, turn off wb.
gcSweep(work.mode)
}
})
// Undo the traceback level and goroutine status set before marking.
_g_.m.traceback = 0
casgstatus(gp, _Gwaiting, _Grunning)
if trace.enabled {
traceGCDone()
}
// all done
mp.preemptoff = ""
if gcphase != _GCoff {
throw("gc done but gcphase != _GCoff")
}
// Update timing memstats
now, unixNow := nanotime(), unixnanotime()
work.pauseNS += now - work.pauseStart
work.tEnd = now
atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
// pause_ns and pause_end are indexed by numgc modulo their length.
memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
memstats.pause_total_ns += uint64(work.pauseNS)
// Update work.totaltime.
sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
// We report idle marking time below, but omit it from the
// overall utilization here since it's "free".
markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
cycleCpu := sweepTermCpu + markCpu + markTermCpu
work.totaltime += cycleCpu
// Compute overall GC CPU utilization.
totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
memstats.numgc++
// Reset sweep state.
sweep.nbgsweep = 0
sweep.npausesweep = 0
systemstack(startTheWorldWithSema)
// Update heap profile stats if gcSweep didn't do it. This is
// relatively expensive, so we don't want to do it while the
// world is stopped, but it needs to happen ASAP after
// starting the world to prevent too many allocations from the
// next cycle leaking in. It must happen before releasing
// worldsema since there are applications that do a
// runtime.GC() to update the heap profile and then
// immediately collect the profile.
if _ConcurrentSweep && work.mode != gcForceBlockMode {
mProf_GC()
}
// Free stack spans. This must be done between GC cycles.
systemstack(freeStackSpans)
// Best-effort remove stack barriers so they don't get in the
// way of things like GDB and perf.
lock(&allglock)
myallgs := allgs
unlock(&allglock)
gcTryRemoveAllStackBarriers(myallgs)
// Print gctrace before dropping worldsema. As soon as we drop
// worldsema another cycle could start and smash the stats
// we're trying to print.
if debug.gctrace > 0 {
util := int(memstats.gc_cpu_fraction * 100)
var sbuf [24]byte
printlock()
print("gc ", memstats.numgc,
" @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
util, "%: ")
// Wall-clock durations between consecutive phase timestamps, joined by "+".
prev := work.tSweepTerm
for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
if i != 0 {
print("+")
}
print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
prev = ns
}
print(" ms clock, ")
// CPU times: sweep termination + assist/background/idle mark + mark termination.
for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
if i == 2 || i == 3 {
// Separate mark time components with /.
print("/")
} else if i != 0 {
print("+")
}
print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
}
print(" ms cpu, ",
work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
work.heapGoal>>20, " MB goal, ",
work.maxprocs, " P")
if work.mode != gcBackgroundMode {
print(" (forced)")
}
print("\n")
printunlock()
}
semrelease(&worldsema)
// Careful: another GC cycle may start now.
releasem(mp)
mp = nil
// now that gc is done, kick off finalizer thread if needed
if !concurrentSweep {
// give the queued finalizers, if any, a chance to run
Gosched()
}
}
gcMarkDone的作用是进行状态转换,即从标记1(所有root标记,全局缓存队列和本地缓存队列)转向标记2(禁用本地缓存队列),并最终调用gcMarkTermination,在此函数的英文说明中有详细的说明。而gcMarkTermination的作用在前边已经介绍过了。
需要说明:在后面的新版本中write Barrier和mark root以及scan stack这三个函数,都是生产灰色队列的。也就是为消费者提供灰色对象。scan stack会在Go的栈处理分析中进行分析。
***需要注意的是,三色标记的过程需要进行写屏障处理,这个在后面的一篇文章中分析介绍。
四、总结
早期的GC的三色标记算法,是要STW的,这玩意儿老费时间了,对上层应用非常不友好。那么有人提出了使用写屏障的方法,这样,可以在并行时,保证安全,同时可以适当的降低STW的时间(2~3ms),可这仍然无法满足人们对时间上的迫切要求,Go1.8以后,增加了混合写屏障,进一步将STW的时间缩小到1ms内。
但是无论怎么减少,STW还是没有去除,这就是心病,大家应该明白一件事,这就是一个雷,可能最需要这1ms的时候儿,STW了。所以还是得继续进步,能降维打击才是最好。