承接上一篇文章的调用链路
1.span(server-api)
go-zero的http的server的trace的中间件代码为,看代码注释
// TraceHandler builds an HTTP middleware that wraps every request in an
// OpenTelemetry server span.
//
// serviceName is handed to the semconv attribute builder. path is used as the
// span name; when it is empty the request's URL path is used instead. Requests
// whose span name is in the configured ignore list are served without tracing.
func TraceHandler(serviceName, path string, opts ...TraceOption) func(http.Handler) http.Handler {
	var cfg traceOptions
	for _, apply := range opts {
		apply(&cfg)
	}

	// Routes that must not be traced.
	skipped := collection.NewSet()
	skipped.AddStr(cfg.traceIgnorePaths...)

	return func(next http.Handler) http.Handler {
		tr := otel.Tracer(trace.TraceName)
		prop := otel.GetTextMapPropagator()

		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			name := path
			if name == "" {
				name = r.URL.Path
			}

			// Ignored routes go straight to the next handler, with no span.
			if skipped.Contains(name) {
				next.ServeHTTP(w, r)
				return
			}

			// Pick up whatever trace context the caller put in the request headers.
			parentCtx := prop.Extract(r.Context(), propagation.HeaderCarrier(r.Header))
			spanCtx, span := tr.Start(
				parentCtx,
				name,
				oteltrace.WithSpanKind(oteltrace.SpanKindServer),
				oteltrace.WithAttributes(
					semconv.HTTPServerAttributesFromHTTPRequest(serviceName, name, r)...),
			)
			defer span.End()

			// Convenient for tracking error messages: the span context is
			// written into the response headers as well.
			prop.Inject(spanCtx, propagation.HeaderCarrier(w.Header()))

			// Wrap the writer so the status code is available after the handler ran.
			trw := response.NewWithCodeResponseWriter(w)
			next.ServeHTTP(trw, r.WithContext(spanCtx))

			span.SetAttributes(semconv.HTTPAttributesFromHTTPStatusCode(trw.Code)...)
			span.SetStatus(semconv.SpanStatusFromHTTPStatusCodeAndSpanKind(
				trw.Code, oteltrace.SpanKindServer))
		})
	}
}
tracer := otel.Tracer(trace.TraceName)
// Tracer returns the trace.Tracer registered for the given instrumentation
// name, creating it and caching it in p.namedTracer on first use. An empty
// name is replaced by defaultTracerName. Once the provider has been shut down
// a no-op tracer is returned instead.
func (p *TracerProvider) Tracer(name string, opts ...trace.TracerOption) trace.Tracer {
// This check happens before the mutex is acquired to avoid deadlocking if Tracer() is called from within Shutdown().
if p.isShutdown.Load() {
return trace.NewNoopTracerProvider().Tracer(name, opts...)
}
c := trace.NewTracerConfig(opts...)
if name == "" {
name = defaultTracerName
}
// In the go-zero call path this article follows, name is "go-zero"
// (the TraceName value passed in by the middleware).
is := instrumentation.Scope{
Name: name,
Version: c.InstrumentationVersion(),
SchemaURL: c.SchemaURL(),
}
t, ok := func() (trace.Tracer, bool) {
p.mu.Lock()
defer p.mu.Unlock()
// Must check the flag after acquiring the mutex to avoid returning a valid tracer if Shutdown() ran
// after the first check above but before we acquired the mutex.
if p.isShutdown.Load() {
// ok is reported as true so the "Tracer created" log below is skipped
// for the no-op tracer.
return trace.NewNoopTracerProvider().Tracer(name, opts...), true
}
// Look up whether a tracer is already cached under this scope key.
t, ok := p.namedTracer[is]
if !ok {
// Not cached yet: create the tracer and store it in namedTracer. After
// this first insertion every later call with an equal scope gets the
// same tracer back.
t = &tracer{
provider: p,
instrumentationScope: is,
}
p.namedTracer[is] = t
}
return t, ok
}()
if !ok {
// This code is outside the mutex to not hold the lock while calling third party logging code:
// - That code may do slow things like I/O, which would prolong the duration the lock is held,
// slowing down all tracing consumers.
// - Logging code may be instrumented with tracing and deadlock because it could try
// acquiring the same non-reentrant mutex.
global.Info("Tracer created", "name", name, "version", is.Version, "schemaURL", is.SchemaURL)
}
return t
}
上面的逻辑就是:由第一章设置的TracerProvider创建(或复用)tracer,并把它缓存到自己的namedTracer字段中,key是Name为"go-zero"的instrumentation.Scope,value是包裹了provider的tracer。前一章打印出的span的json数据中也能看到这个Scope:
"InstrumentationLibrary": {
"Name": "go-zero",
"Version": "",
"SchemaURL": ""
}
propagator := otel.GetTextMapPropagator()函数获取的就是上一章设置的propagator,可以去看上一章。ctx := propagator.Extract(r.Context(), propagation.HeaderCarrier(r.Header)) 是从http的header里找到"traceparent"字段的值,并提取到ctx的valueCtx中。由于我是用的postman,没用go-zero的http的client,所以请求里没有插入这个字段的值,我们调试看到的数据中,ctx的value是nil
我们看怎么产生的span
spanCtx, span := tracer.Start( ctx, spanName, oteltrace.WithSpanKind(oteltrace.SpanKindServer), oteltrace.WithAttributes(semconv.HTTPServerAttributesFromHTTPRequest( serviceName, spanName, r)...), )
// SpanConfig collects the settings that span options write into; the tracer
// reads them back through accessor methods when it starts or ends a span.
type SpanConfig struct {
// attributes holds the key/value pairs supplied via WithAttributes.
attributes []attribute.KeyValue
// timestamp is an explicitly requested time; the callers shown in this
// article fall back to their own clock when it is zero.
timestamp time.Time
// links are added to a recording span when it is created.
links []Link
// newRoot requests that the span start a new trace instead of continuing
// the one found in the context.
newRoot bool
// spanKind is the value set via WithSpanKind (server, client, ...).
spanKind SpanKind
// stackTrace presumably asks for a stack trace to be attached when a panic
// is recorded at span end; verify against the option that sets it.
stackTrace bool
}
//span是赋值SpanConfig 的 spanKind
oteltrace.WithSpanKind(oteltrace.SpanKindServer)
//是赋值[]attribute.KeyValue数组
oteltrace.WithAttributes(semconv.HTTPServerAttributesFromHTTPRequest( serviceName, spanName, r)...)
赋值完成后,我们要进入tracer.Start,看下面的源码注释
// Start creates a span called name and returns it together with a context
// that carries it. Span processors registered on the provider are notified
// through OnStart when the new span is recording.
func (tr *tracer) Start(ctx context.Context, name string, options ...trace.SpanStartOption) (context.Context, trace.Span) {
	// Fold the functional options into a span config.
	config := trace.NewSpanStartConfig(options...)

	if ctx == nil {
		// Prevent trace.ContextWithSpan from panicking.
		ctx = context.Background()
	}

	// For local spans created by this SDK, track child span count. Example:
	// the HTTP server middleware already stored its span in ctx, and that ctx
	// is then passed to another component of the same service. A nil interface
	// fails the type assertion, so no separate nil check is needed.
	if parent, ok := trace.SpanFromContext(ctx).(*recordingSpan); ok {
		parent.addChild()
	}

	// A recording span is returned as a recordingSpan, which satisfies both
	// the ReadWriteSpan and runtimeTracer interfaces checked below.
	span := tr.newSpan(ctx, name, &config)

	if rw, ok := span.(ReadWriteSpan); ok && span.IsRecording() {
		// With the setup from the previous article this reaches the batch
		// span processor's OnStart.
		for _, proc := range tr.provider.getSpanProcessors() {
			proc.sp.OnStart(ctx, rw)
		}
	}

	if rtt, ok := span.(runtimeTracer); ok {
		ctx = rtt.runtimeTrace(ctx)
	}

	// Finally store the new span in the returned context.
	return trace.ContextWithSpan(ctx, span), span
}
// newSpan returns a new configured span.
//
// The parent span context is taken from ctx unless the config asks for a new
// root. The sampler's decision then selects between a recording and a
// non-recording span.
func (tr *tracer) newSpan(ctx context.Context, name string, config *trace.SpanConfig) trace.Span {
	// If told explicitly to make this a new root use a zero value SpanContext
	// as a parent which contains an invalid trace ID and is not remote.
	var parent trace.SpanContext
	if config.NewRoot() {
		// New root: overwrite whatever span context ctx carried with the zero value.
		ctx = trace.ContextWithSpanContext(ctx, parent)
	} else {
		// Otherwise inherit the span context stored in ctx (zero value if none).
		parent = trace.SpanContextFromContext(ctx)
	}

	// If there is a valid parent trace ID, use it to ensure the continuity of
	// the trace. Always generate a new span ID so other components can rely
	// on a unique span ID, even if the Span is non-recording.
	traceID := parent.TraceID()
	var spanID trace.SpanID
	if traceID.IsValid() {
		// A parent exists, so the trace ID is reused and only a span ID is generated.
		spanID = tr.provider.idGenerator.NewSpanID(ctx, traceID)
	} else {
		// No valid parent: this is the first span, generate both IDs.
		traceID, spanID = tr.provider.idGenerator.NewIDs(ctx)
	}

	// Ask the configured sampler (the article's setup uses the always-on one).
	decision := tr.provider.sampler.ShouldSample(SamplingParameters{
		ParentContext: ctx,
		TraceID:       traceID,
		Name:          name,
		Kind:          config.SpanKind(),
		Attributes:    config.Attributes(),
		Links:         config.Links(),
	})

	// Start from the parent's flags and set or clear the sampled bit.
	flags := parent.TraceFlags()
	if isSampled(decision) {
		flags |= trace.FlagsSampled
	} else {
		flags &^= trace.FlagsSampled
	}

	// Build the span context of the new span from the collected values.
	sc := trace.NewSpanContext(trace.SpanContextConfig{
		TraceID:    traceID,
		SpanID:     spanID,
		TraceFlags: flags,
		TraceState: decision.Tracestate,
	})

	if isRecording(decision) {
		// Recording spans are what end up in the exported JSON data.
		return tr.newRecordingSpan(parent, sc, name, decision, config)
	}
	// Not recording: hand back a nonRecordingSpan that only carries sc.
	return tr.newNonRecordingSpan(sc)
}
// newRecordingSpan returns a new configured recordingSpan.
//
// psc is the parent's span context and sc the new span's own. The start time
// comes from the config and falls back to the current time when it is zero.
func (tr *tracer) newRecordingSpan(psc, sc trace.SpanContext, name string, sr SamplingResult, config *trace.SpanConfig) *recordingSpan {
	began := config.Timestamp()
	if began.IsZero() {
		began = time.Now()
	}

	span := &recordingSpan{
		// Do not pre-allocate the attributes slice here! Doing so will
		// allocate memory that is likely never going to be used, or if used,
		// will be over-sized. The default Go compiler has been tested to
		// dynamically allocate needed space very well. Benchmarking has shown
		// it to be more performant than what we can predetermine here,
		// especially for the common use case of few to no added
		// attributes.
		parent:      psc,                                                      // parent span context
		spanContext: sc,                                                       // this span's own span context
		spanKind:    trace.ValidateSpanKind(config.SpanKind()),                // kind: client, server, ...
		name:        name,                                                     // span name (the route, for the HTTP middleware)
		startTime:   began,                                                    // start timestamp
		events:      newEvictedQueue(tr.provider.spanLimits.EventCountLimit),  // events, bounded by the provider's limit
		links:       newEvictedQueue(tr.provider.spanLimits.LinkCountLimit),   // links, bounded by the provider's limit
		tracer:      tr,
	}

	links := config.Links()
	for i := range links {
		span.addLink(links[i])
	}

	// Attributes from the sampling result are applied first ...
	span.SetAttributes(sr.Attributes...)
	// ... followed by the attributes supplied through the span options.
	span.SetAttributes(config.Attributes()...)

	return span
}
// newNonRecordingSpan returns a new configured nonRecordingSpan that carries
// only this tracer and the given span context.
func (tr *tracer) newNonRecordingSpan(sc trace.SpanContext) nonRecordingSpan {
	span := nonRecordingSpan{
		tracer: tr,
		sc:     sc,
	}
	return span
}
我们看产生的recordingSpan的数据,ctx有recordingSpan的值
其值为
可以关注attributes的键值对个数8个
下一步propagator.Inject(spanCtx, propagation.HeaderCarrier(w.Header()))
将数据设置到w的header里,可以进行网络传输
下面执行下面的中间件和具体函数,r.WithContext(spanCtx)会将当前产生的recordingSpan通过ctx传递给下层级的函数,如果下层级函数要用,这个就是parent的span了
trw := response.NewWithCodeResponseWriter(w)
next.ServeHTTP(trw, r.WithContext(spanCtx))
执行完后,就会添加recordingSpan的状态码的attribute.KeyValue,并设置recordingSpan的具体状态码的status的属性
span.SetAttributes(semconv.HTTPAttributesFromHTTPStatusCode(trw.Code)...) span.SetStatus(semconv.SpanStatusFromHTTPStatusCodeAndSpanKind( trw.Code, oteltrace.SpanKindServer))
9个了,多了一个状态码的
最后会执行defer span.End(),结束当前span,这个会设置span的结束时间,并将记录写入exporter的队列中。
// End marks the span as finished: it records the end time, attaches an
// exception event if a panic is in flight, and hands a snapshot of the span
// to every registered span processor's OnEnd. It does nothing for a nil span
// or for a span that is no longer recording.
func (s *recordingSpan) End(options ...trace.SpanEndOption) {
// Do not start by checking if the span is being recorded which requires
// acquiring a lock. Make a minimal check that the span is not nil.
if s == nil {
return
}
// Store the end time as soon as possible to avoid artificially increasing
// the span's duration in case some operation below takes a while.
et := internal.MonotonicEndTime(s.startTime)
// Do relative expensive check now that we have an end time and see if we
// need to do any more processing.
if !s.IsRecording() {
return
}
config := trace.NewSpanEndConfig(options...)
// If a panic is in flight, record its type, message and (optionally) stack
// trace as an exception event, then re-raise it. recover only sees the panic
// when End itself is the deferred call, as in `defer span.End()`.
if recovered := recover(); recovered != nil {
// Record but don't stop the panic.
defer panic(recovered)
opts := []trace.EventOption{
trace.WithAttributes(
semconv.ExceptionType(typeStr(recovered)),
semconv.ExceptionMessage(fmt.Sprint(recovered)),
),
}
if config.StackTrace() {
opts = append(opts, trace.WithAttributes(
semconv.ExceptionStacktrace(recordStackTrace()),
))
}
s.addEvent(semconv.ExceptionEventName, opts...)
}
if s.executionTracerTaskEnd != nil {
s.executionTracerTaskEnd()
}
s.mu.Lock()
// Setting endTime to non-zero marks the span as ended and not recording.
// An explicit timestamp from the end options wins over the measured one.
if config.Timestamp().IsZero() {
s.endTime = et
} else {
s.endTime = config.Timestamp()
}
s.mu.Unlock()
sps := s.tracer.provider.getSpanProcessors()
if len(sps) == 0 {
return
}
snap := s.snapshot()
for _, sp := range sps {
// With the setup from the previous article this is the batch span
// processor's OnEnd, which adds the finished span to its queue.
sp.sp.OnEnd(snap)
}
}
2.span(client-rpc)
go-zero的rpc的client的trace的中间件代码为
// UnaryTracingInterceptor is the client-side unary interceptor for
// opentelemetry: it wraps the outgoing call in a client span, records
// message-sent and message-received events, and sets the span status and
// status-code attribute from the call result. The invoker's error is returned
// unchanged.
func UnaryTracingInterceptor(ctx context.Context, method string, req, reply any,
	cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
	ctx, span := startSpan(ctx, method, cc.Target())
	defer span.End()

	ztrace.MessageSent.Event(ctx, 1, req)
	callErr := invoker(ctx, method, req, reply, cc, opts...)
	ztrace.MessageReceived.Event(ctx, 1, reply)

	// Success: only the OK status code attribute is recorded.
	if callErr == nil {
		span.SetAttributes(ztrace.StatusCodeAttr(gcodes.OK))
		return nil
	}

	// Failure: prefer the gRPC status details when the error carries them.
	if st, isStatus := status.FromError(callErr); isStatus {
		span.SetStatus(codes.Error, st.Message())
		span.SetAttributes(ztrace.StatusCodeAttr(st.Code()))
	} else {
		span.SetStatus(codes.Error, callErr.Error())
	}

	return callErr
}
// startSpan starts a client span for the given method and target, injects the
// span context into the outgoing gRPC metadata, and returns a context that
// carries both the span and that metadata.
func startSpan(ctx context.Context, method, target string) (context.Context, trace.Span) {
	// Reuse outgoing metadata already attached to ctx, or start with an empty set.
	outgoing, found := metadata.FromOutgoingContext(ctx)
	if !found {
		outgoing = metadata.MD{}
	}

	tracer := otel.Tracer(ztrace.TraceName)
	spanName, attrs := ztrace.SpanInfo(method, target)
	spanCtx, span := tracer.Start(
		ctx,
		spanName,
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(attrs...),
	)

	// Write the trace context into the metadata so it travels with the request.
	ztrace.Inject(spanCtx, otel.GetTextMapPropagator(), &outgoing)

	return metadata.NewOutgoingContext(spanCtx, outgoing), span
}
由于第一节讲完具体的细节到这里,我们只需要调试一下,看具体的数据,就会明白
由于ctx里没有(mdOutgoingKey{},md)这个键值对,只有上一节生成并传下来的(currentSpanKey,recordingSpan),所以md为空
看一下rpc的client的name和attributes的键值对属性
将上面生成的name和attributes数据传入,由于上一个server-api传下来的recordingSpan,然后成为当前新生成span的父span
最后给ctx设置(mdOutgoingKey{},md),所以ctx现在有两个值。md的键值和第一节header里的一样,这个也是要通过网络传输的,只不过rpc是通过md(metadata)传输
最后添加span的event的属性
最后和第一节一样,将ctx往下面的执行函数传,并设置span的状态和attributes的状态key,可以自己调试看看
3.span(server-rpc)
go-zero的rpc的server的trace的中间件代码为
// UnaryTracingInterceptor is a grpc.UnaryServerInterceptor for opentelemetry.
//
// It wraps the handler call in a server span, records message-received and
// message-sent events, and sets the span status and status-code attribute
// from the handler result. On error the response is dropped and nil is
// returned alongside the error.
func UnaryTracingInterceptor(ctx context.Context, req any, info *grpc.UnaryServerInfo,
	handler grpc.UnaryHandler) (any, error) {
	ctx, span := startSpan(ctx, info.FullMethod)
	defer span.End()

	ztrace.MessageReceived.Event(ctx, 1, req)
	resp, handleErr := handler(ctx, req)

	// Success: record the OK code and the outgoing response message.
	if handleErr == nil {
		span.SetAttributes(ztrace.StatusCodeAttr(gcodes.OK))
		ztrace.MessageSent.Event(ctx, 1, resp)
		return resp, nil
	}

	// Failure: when the error carries a gRPC status, its proto form is what
	// is recorded as the sent message.
	if st, isStatus := status.FromError(handleErr); isStatus {
		span.SetStatus(codes.Error, st.Message())
		span.SetAttributes(ztrace.StatusCodeAttr(st.Code()))
		ztrace.MessageSent.Event(ctx, 1, st.Proto())
	} else {
		span.SetStatus(codes.Error, handleErr.Error())
	}

	return nil, handleErr
}
// startSpan extracts the baggage and remote span context from the incoming
// gRPC metadata and starts a server span whose parent is that remote span
// context.
func startSpan(ctx context.Context, method string) (context.Context, trace.Span) {
	// Use the incoming metadata attached to ctx, or an empty set when absent.
	incoming, found := metadata.FromIncomingContext(ctx)
	if !found {
		incoming = metadata.MD{}
	}

	bags, remote := ztrace.Extract(ctx, otel.GetTextMapPropagator(), &incoming)
	ctx = baggage.ContextWithBaggage(ctx, bags)

	tracer := otel.Tracer(ztrace.TraceName)
	spanName, attrs := ztrace.SpanInfo(method, ztrace.PeerFromCtx(ctx))

	// Mark the extracted span context as the remote parent of the new span.
	parentCtx := trace.ContextWithRemoteSpanContext(ctx, remote)
	return tracer.Start(
		parentCtx,
		spanName,
		trace.WithSpanKind(trace.SpanKindServer),
		trace.WithAttributes(attrs...),
	)
}
rpc的client和server很相似通过前面的代码解释已经很清楚了,就没必要讲代码了,看看调试的数据,就一目了然
首先是rpc的client通过md将数据传了过来,从ctx中获取md
然后将md的数据traceparent的数据,提取到ctx的键值对(currentSpanKey,nonRecordingSpan),这里span的remote设置为true
其实上面是提取两个,还有baggage,但是它为空,就被忽略了
这里的rpc的server的生成的name和attributes键值对数组
最后生成span,注意这里的parentspancontext是client传过来的
最后就是和rpc client一样了,添加事件和状态码。
4.span(client-redis)
go-zero的redis的client的trace的中间件代码为
// startSpan starts a client span for the given redis commands, stores the
// command names on it as a string-slice attribute, and returns the context
// that carries the span.
func (h hook) startSpan(ctx context.Context, cmds ...red.Cmder) context.Context {
	tr := trace.TracerFromContext(ctx)
	spanCtx, span := tr.Start(
		ctx,
		spanName,
		oteltrace.WithSpanKind(oteltrace.SpanKindClient),
	)

	// Collect the name of every command in call order.
	names := make([]string, len(cmds))
	for i, cmd := range cmds {
		names[i] = cmd.Name()
	}
	span.SetAttributes(redisCmdsAttributeKey.StringSlice(names))

	return spanCtx
}
// endSpan ends the span stored in ctx and sets its status from err. A nil
// error and red.Nil both count as success; any other error is recorded on
// the span and marks it as failed.
func (h hook) endSpan(ctx context.Context, err error) {
	span := oteltrace.SpanFromContext(ctx)
	defer span.End()

	if err != nil && err != red.Nil {
		span.SetStatus(codes.Error, err.Error())
		span.RecordError(err)
		return
	}

	span.SetStatus(codes.Ok, "")
}
通过上面的解释,这个就很简单了,就不讲了
5.总结
现在为止,第一章生成的四个span的数据的字段(Name,SpanContext,Parent,SpanKind,StartTime,EndTime,Attributes,Events,...),应该都可以读懂了,可以根据这些字段找到父子级的层级关系和时间长短。其实trace本身原理很简单:如果在同一个进程里,就通过ctx传输数据,这个俗称go的狗链子,到哪里都要带着;如果是网络通信,就通过网络传输数据,可以固定几个字段,搞几个键值对,将数据传输到下层,这里的http是通过header传输,rpc是通过md传输。网上说trace的难点是埋点,我们只是通过go-zero的代码来看埋点的过程,上面讲的四个都是原作者埋的点,通过这两篇文章的解析,应该自己也可以学会go的trace的用法。
最后本人能力有限,如果上面的有纰漏和问题,欢迎大佬指出