go的分布式链路追踪(2)

       承接上一篇文章的调用链路

1.span(server-api)

        go-zero的http的server的trace的中间件代码为,看代码注释

// TraceHandler returns an HTTP middleware that starts a server-kind span for
// each incoming request and ends it once the wrapped handler has returned.
//
// serviceName is forwarded to semconv.HTTPServerAttributesFromHTTPRequest to
// build the span attributes. path is used as the span name; when it is empty
// the request's URL path is used instead. opts are applied to a traceOptions
// value, and requests whose span name matches one of its traceIgnorePaths
// entries are passed through without tracing.
func TraceHandler(serviceName, path string, opts ...TraceOption) func(http.Handler) http.Handler {
	var options traceOptions
	for _, opt := range opts {
		opt(&options)
	}

	ignorePaths := collection.NewSet()
	// Register the routes that must not be traced.
	ignorePaths.AddStr(options.traceIgnorePaths...)

	return func(next http.Handler) http.Handler {
		tracer := otel.Tracer(trace.TraceName)
		propagator := otel.GetTextMapPropagator()

		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			spanName := path
			if len(spanName) == 0 {
				spanName = r.URL.Path
			}
			// Ignored routes go straight to the next handler without tracing.
			if ignorePaths.Contains(spanName) {
				next.ServeHTTP(w, r)
				return
			}

			// Extract whatever trace context the request headers carry so the
			// span started below is created from that context.
			ctx := propagator.Extract(r.Context(), propagation.HeaderCarrier(r.Header))
			spanCtx, span := tracer.Start(
				ctx,
				spanName,
				oteltrace.WithSpanKind(oteltrace.SpanKindServer),
				oteltrace.WithAttributes(semconv.HTTPServerAttributesFromHTTPRequest(
					serviceName, spanName, r)...),
			)
			defer span.End()

			// convenient for tracking error messages
			propagator.Inject(spanCtx, propagation.HeaderCarrier(w.Header()))

			// Wrap the writer so the status code can be read back (trw.Code)
			// after the downstream handler has run.
			trw := response.NewWithCodeResponseWriter(w)
			next.ServeHTTP(trw, r.WithContext(spanCtx))

			// Record the response status code on the span as an attribute and
			// as the span status.
			span.SetAttributes(semconv.HTTPAttributesFromHTTPStatusCode(trw.Code)...)
			span.SetStatus(semconv.SpanStatusFromHTTPStatusCodeAndSpanKind(
				trw.Code, oteltrace.SpanKindServer))
		})
	}
}

        tracer := otel.Tracer(trace.TraceName)

// Tracer returns the tracer registered for the instrumentation scope built
// from name and opts, creating it and caching it in p.namedTracer on first
// use. An empty name is replaced by defaultTracerName. Once the provider has
// been shut down, a no-op tracer is returned instead.
func (p *TracerProvider) Tracer(name string, opts ...trace.TracerOption) trace.Tracer {
	// This check happens before the mutex is acquired to avoid deadlocking if Tracer() is called from within Shutdown().
	if p.isShutdown.Load() {
		return trace.NewNoopTracerProvider().Tracer(name, opts...)
	}
	c := trace.NewTracerConfig(opts...)
	if name == "" {
		name = defaultTracerName
	}
	// In the go-zero call path followed by this article, name is "go-zero";
	// it was assigned earlier by the caller.
	is := instrumentation.Scope{
		Name:      name,
		Version:   c.InstrumentationVersion(),
		SchemaURL: c.SchemaURL(),
	}

	t, ok := func() (trace.Tracer, bool) {
		p.mu.Lock()
		defer p.mu.Unlock()
		// Must check the flag after acquiring the mutex to avoid returning a valid tracer if Shutdown() ran
		// after the first check above but before we acquired the mutex.
		if p.isShutdown.Load() {
			// ok is reported as true here so the "Tracer created" log below is skipped.
			return trace.NewNoopTracerProvider().Tracer(name, opts...), true
		}
		// Check whether a tracer is already cached under this scope.
		t, ok := p.namedTracer[is]
		if !ok {
			// Not cached yet: create the tracer and store it in namedTracer.
			// After this first insertion, later calls with the same scope
			// (here the one named "go-zero") get this tracer back.
			t = &tracer{
				provider:             p,
				instrumentationScope: is,
			}
			p.namedTracer[is] = t
		}
		return t, ok
	}()
	if !ok {
		// This code is outside the mutex to not hold the lock while calling third party logging code:
		// - That code may do slow things like I/O, which would prolong the duration the lock is held,
		//   slowing down all tracing consumers.
		// - Logging code may be instrumented with tracing and deadlock because it could try
		//   acquiring the same non-reentrant mutex.
		global.Info("Tracer created", "name", name, "version", is.Version, "schemaURL", is.SchemaURL)
	}
	return t
}

        上面就是把第一章设置的TracerProvider保存到新建tracer的provider字段中,并在TracerProvider自己的namedTracer字段里登记一项:key是Name为"go-zero"的instrumentation.Scope,value是这个新建的tracer。前一章打印出的span的json数据里也能看到这个Scope:

"InstrumentationLibrary": {
        "Name": "go-zero",
        "Version": "",
        "SchemaURL": ""
    }

propagator := otel.GetTextMapPropagator()获取到的就是上一章设置的propagator,可以去看上一章。ctx := propagator.Extract(r.Context(), propagation.HeaderCarrier(r.Header))是从http的header里找到"traceparent"字段的值,并提取到ctx的valueCtx中。由于我用的是postman,没用go-zero的http client,所以请求里没有带这个字段,调试时看到ctx的value是nil

        我们看怎么产生的span

spanCtx, span := tracer.Start( ctx, spanName, oteltrace.WithSpanKind(oteltrace.SpanKindServer), oteltrace.WithAttributes(semconv.HTTPServerAttributesFromHTTPRequest( serviceName, spanName, r)...), )

// SpanConfig collects the settings that span options fill in; the tracer
// reads it when it creates a span.
type SpanConfig struct {
	// attributes is the key/value list assigned by WithAttributes.
	attributes []attribute.KeyValue
	// timestamp presumably backs Timestamp(), which the SDK uses as the
	// span's start time when it is non-zero — confirm against the accessor.
	timestamp  time.Time
	// links presumably backs Links(), whose entries are added to a new
	// recording span — confirm against the accessor.
	links      []Link
	// newRoot presumably backs NewRoot(), which makes the tracer ignore any
	// parent span context found in ctx — confirm against the accessor.
	newRoot    bool
	// spanKind is assigned by WithSpanKind (client, server, ...).
	spanKind   SpanKind
	// stackTrace presumably backs StackTrace(), which controls whether a
	// stack trace is attached to a recorded panic — confirm against the accessor.
	stackTrace bool
}
//下面这个option是给SpanConfig的spanKind赋值
oteltrace.WithSpanKind(oteltrace.SpanKindServer)
//是赋值[]attribute.KeyValue数组
oteltrace.WithAttributes(semconv.HTTPServerAttributesFromHTTPRequest( serviceName, spanName, r)...)



        赋值完成后,我们进入tracer.Start,看下面的源码注释

// Start creates a span called name and returns it together with a context
// that carries it. A nil ctx is replaced by context.Background(). options are
// folded into a span config before the span is built.
func (tr *tracer) Start(ctx context.Context, name string, options ...trace.SpanStartOption) (context.Context, trace.Span) {
	// Apply the functional options to obtain the span config.
	cfg := trace.NewSpanStartConfig(options...)

	// trace.ContextWithSpan panics on a nil context, so fall back to a plain
	// background context.
	if ctx == nil {
		ctx = context.Background()
	}

	// For local spans created by this SDK, track the child span count. The
	// typical case: an upstream layer of the same service (e.g. the HTTP
	// server middleware) already stored its span in ctx, and a downstream
	// client (e.g. the MySQL layer) now starts a span from that ctx.
	if parent, ok := trace.SpanFromContext(ctx).(*recordingSpan); ok {
		parent.addChild()
	}

	// Build the span. When it is to be recorded this is a recordingSpan,
	// which implements both ReadWriteSpan and runtimeTracer.
	span := tr.newSpan(ctx, name, &cfg)

	if rw, isRW := span.(ReadWriteSpan); isRW && span.IsRecording() {
		// Notify every registered processor; in the article's setup this is
		// the batchSpanProcessor's OnStart from part one.
		for _, proc := range tr.provider.getSpanProcessors() {
			proc.sp.OnStart(ctx, rw)
		}
	}

	if rt, ok := span.(runtimeTracer); ok {
		ctx = rt.runtimeTrace(ctx)
	}

	// Finally store the new span in the returned context.
	return trace.ContextWithSpan(ctx, span), span
}

// newSpan returns a new configured span: it resolves the parent span context,
// allocates the IDs, asks the sampler for a decision and then builds either a
// recording or a non-recording span.
func (tr *tracer) newSpan(ctx context.Context, name string, config *trace.SpanConfig) trace.Span {
	// parent stays the zero-value SpanContext (invalid trace ID, not remote)
	// when the config explicitly asks for a new root.
	var parent trace.SpanContext
	if !config.NewRoot() {
		// Not a forced root: take the span context stored in ctx. Per the
		// article, the empty span context comes back when ctx holds none.
		parent = trace.SpanContextFromContext(ctx)
	} else {
		// Forced root: overwrite whatever ctx carries with the empty parent.
		ctx = trace.ContextWithSpanContext(ctx, parent)
	}

	// If there is a valid parent trace ID, reuse it so the trace stays
	// continuous. A fresh span ID is always generated so other components can
	// rely on it being unique, even for non-recording spans.
	var (
		traceID trace.TraceID
		spanID  trace.SpanID
	)
	if parent.TraceID().IsValid() {
		// A span already exists upstream, so only a span ID is needed.
		traceID = parent.TraceID()
		spanID = tr.provider.idGenerator.NewSpanID(ctx, traceID)
	} else {
		// No usable parent: this is the first span, so generate both IDs.
		traceID, spanID = tr.provider.idGenerator.NewIDs(ctx)
	}

	// Ask the configured sampler. Part one of the article configured the
	// always-on sampler; per the article, the Tracestate it returns is the
	// one taken from the parent span context.
	decision := tr.provider.sampler.ShouldSample(SamplingParameters{
		ParentContext: ctx,
		TraceID:       traceID,
		Name:          name,
		Kind:          config.SpanKind(),
		Attributes:    config.Attributes(),
		Links:         config.Links(),
	})

	// Start from the parent's flags and set or clear the sampled bit
	// according to the decision.
	flags := parent.TraceFlags()
	if isSampled(decision) {
		flags |= trace.FlagsSampled
	} else {
		flags &^= trace.FlagsSampled
	}

	// Assemble the span context of the new span.
	sc := trace.NewSpanContext(trace.SpanContextConfig{
		TraceID:    traceID,
		SpanID:     spanID,
		TraceFlags: flags,
		TraceState: decision.Tracestate,
	})

	// A recorded span is a recordingSpan, which is what the exported JSON
	// data is built from; otherwise a nonRecordingSpan is returned.
	if isRecording(decision) {
		return tr.newRecordingSpan(parent, sc, name, decision, config)
	}
	return tr.newNonRecordingSpan(sc)
}

// newRecordingSpan returns a new configured recordingSpan whose parent span
// context is psc and whose own span context is sc. Links from config are
// added first, then the sampler's attributes, then the config's attributes.
func (tr *tracer) newRecordingSpan(psc, sc trace.SpanContext, name string, sr SamplingResult, config *trace.SpanConfig) *recordingSpan {
	// Use the configured timestamp when one was given, otherwise "now".
	begin := config.Timestamp()
	if begin.IsZero() {
		begin = time.Now()
	}

	limits := tr.provider.spanLimits

	// Do not pre-allocate the attributes slice here! Doing so would allocate
	// memory that is likely never used, or over-sized if it is. The default
	// Go compiler grows it dynamically very well, and benchmarks showed that
	// to be faster than anything predetermined here, especially for the
	// common case of few or no added attributes.
	span := &recordingSpan{
		tracer: tr,

		name:      name,  // span name; for the HTTP server span this is the route
		startTime: begin, // start of the span

		parent:      psc,                                       // parent span context
		spanContext: sc,                                        // this span's own span context
		spanKind:    trace.ValidateSpanKind(config.SpanKind()), // client, server, ...

		events: newEvictedQueue(limits.EventCountLimit), // event storage
		links:  newEvictedQueue(limits.LinkCountLimit),  // link storage
	}

	for _, link := range config.Links() {
		span.addLink(link)
	}

	// Sampler-provided attributes go in first...
	span.SetAttributes(sr.Attributes...)
	// ...followed by the ones supplied through the span start options.
	span.SetAttributes(config.Attributes()...)

	return span
}

// newNonRecordingSpan wraps sc in a nonRecordingSpan that belongs to this
// tracer.
func (tr *tracer) newNonRecordingSpan(sc trace.SpanContext) nonRecordingSpan {
	span := nonRecordingSpan{
		sc:     sc,
		tracer: tr,
	}
	return span
}

        

        我们看产生的recordingSpan的数据,ctx有recordingSpan的值

        其值为

        可以关注attributes的键值对个数8个

        下一步propagator.Inject(spanCtx, propagation.HeaderCarrier(w.Header()))

将数据设置到w的header里,可以进行网络传输

      

        下面执行下面的中间件和具体函数,r.WithContext(spanCtx)会将当前产生的recordingSpan通过ctx传递给下层级的函数,如果下层级函数要用,这个就是parent的span了

trw := response.NewWithCodeResponseWriter(w)

next.ServeHTTP(trw, r.WithContext(spanCtx))

        执行完后,就会添加recordingSpan的状态码的attribute.KeyValue,并设置recordingSpan的具体状态码的status的属性

span.SetAttributes(semconv.HTTPAttributesFromHTTPStatusCode(trw.Code)...) span.SetStatus(semconv.SpanStatusFromHTTPStatusCodeAndSpanKind( trw.Code, oteltrace.SpanKindServer))

        9个了,多了一个状态码的

        最后会执行defer span.End(),结束当前span,这个会设置span的结束时间,并将记录写入exporter的队列中。

// End marks the span as ended: it stores the end time and hands a snapshot of
// the span to every registered span processor. It is a no-op on a nil
// receiver and on a span that is no longer recording.
//
// If End runs while a panic is unwinding (for example through
// `defer span.End()`), the panic is recorded on the span as an exception
// event and then raised again.
func (s *recordingSpan) End(options ...trace.SpanEndOption) {
	// Do not start by checking if the span is being recorded which requires
	// acquiring a lock. Make a minimal check that the span is not nil.
	if s == nil {
		return
	}

	// Store the end time as soon as possible to avoid artificially increasing
	// the span's duration in case some operation below takes a while.
	et := internal.MonotonicEndTime(s.startTime)

	// Do relative expensive check now that we have an end time and see if we
	// need to do any more processing.
	if !s.IsRecording() {
		return
	}

	config := trace.NewSpanEndConfig(options...)
	// If a panic is in flight, record its type, message and (optionally) the
	// stack trace as an exception event, then keep propagating the panic.
	if recovered := recover(); recovered != nil {
		// Record but don't stop the panic.
		defer panic(recovered)
		opts := []trace.EventOption{
			trace.WithAttributes(
				semconv.ExceptionType(typeStr(recovered)),
				semconv.ExceptionMessage(fmt.Sprint(recovered)),
			),
		}

		if config.StackTrace() {
			opts = append(opts, trace.WithAttributes(
				semconv.ExceptionStacktrace(recordStackTrace()),
			))
		}

		s.addEvent(semconv.ExceptionEventName, opts...)
	}

	if s.executionTracerTaskEnd != nil {
		s.executionTracerTaskEnd()
	}

	s.mu.Lock()
	// Setting endTime to non-zero marks the span as ended and not recording.
	// Use the explicitly configured end timestamp when there is one,
	// otherwise the time captured at the top of End.
	if config.Timestamp().IsZero() {
		s.endTime = et
	} else {
		s.endTime = config.Timestamp()
	}
	s.mu.Unlock()

	sps := s.tracer.provider.getSpanProcessors()
	if len(sps) == 0 {
		return
	}
	snap := s.snapshot()
	for _, sp := range sps {
		// In the article's setup this is the batchSpanProcessor's OnEnd from
		// part one, which puts the finished span on its queue.
		sp.sp.OnEnd(snap)
	}
}

2.span(client-rpc)

        go-zero的rpc的client的trace的中间件代码为

// UnaryTracingInterceptor is a gRPC unary client interceptor that runs the
// outgoing call inside a client span.
//
// It emits ztrace.MessageSent for req before the call and
// ztrace.MessageReceived for reply after it. On success the span gets an OK
// status-code attribute. On failure the span status is set to Error, with the
// gRPC status code attached when err carries a gRPC status, and the error is
// returned unchanged.
func UnaryTracingInterceptor(ctx context.Context, method string, req, reply any,
	cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
	ctx, span := startSpan(ctx, method, cc.Target())
	defer span.End()

	ztrace.MessageSent.Event(ctx, 1, req)
	callErr := invoker(ctx, method, req, reply, cc, opts...)
	ztrace.MessageReceived.Event(ctx, 1, reply)

	// Happy path first: tag the span with the OK code and finish.
	if callErr == nil {
		span.SetAttributes(ztrace.StatusCodeAttr(gcodes.OK))
		return nil
	}

	if st, isStatus := status.FromError(callErr); isStatus {
		span.SetStatus(codes.Error, st.Message())
		span.SetAttributes(ztrace.StatusCodeAttr(st.Code()))
	} else {
		// Not a gRPC status error: only the raw message is available.
		span.SetStatus(codes.Error, callErr.Error())
	}

	return callErr
}

// startSpan starts a client span for the outgoing RPC method aimed at target.
// It injects the new span's trace context into the outgoing gRPC metadata and
// returns a context carrying both the span and that metadata.
func startSpan(ctx context.Context, method, target string) (context.Context, trace.Span) {
	// Reuse the outgoing metadata already on ctx, or start with an empty set.
	md, found := metadata.FromOutgoingContext(ctx)
	if !found {
		md = metadata.MD{}
	}

	tracer := otel.Tracer(ztrace.TraceName)
	spanName, attrs := ztrace.SpanInfo(method, target)
	spanCtx, span := tracer.Start(
		ctx,
		spanName,
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(attrs...),
	)

	// Write the trace context into md so it is sent along with the RPC.
	ztrace.Inject(spanCtx, otel.GetTextMapPropagator(), &md)

	return metadata.NewOutgoingContext(spanCtx, md), span
}

        由于第一节已经讲完了具体的细节,到这里我们只需要调试一下,看具体的数据,就会明白

由于ctx里没有(mdOutgoingKey{},md)的值,只有上一节生成并传下来的(currentSpanKey,recordingSpan),所以md为空

       看一下rpc的client的name和attributes的键值对属性

        将上面生成的name和attributes数据传入,由于上一个server-api传下来的recordingSpan,然后成为当前新生成span的父span

        最后在ctx中设置(mdOutgoingKey{},md),所以ctx现在有两个值;md的键值和第一节header里的一样,rpc也是通过它在网络上传输这些数据

        最后添加span的event的属性

        最后和第一节一样,将ctx往下面的执行函数传,并设置span的状态和attributes的状态key,可以自己调试看看

3.span(server-rpc)

        go-zero的rpc的server的trace的中间件代码为

// UnaryTracingInterceptor is a grpc.UnaryServerInterceptor for opentelemetry.
//
// It runs handler inside a server span, emitting ztrace.MessageReceived for
// req before the call. On success it tags the span with the OK code and emits
// ztrace.MessageSent for the response. On failure the span status is set to
// Error; when err carries a gRPC status, its code is attached and
// ztrace.MessageSent is emitted for the status proto. A failed call returns a
// nil response together with the original error.
func UnaryTracingInterceptor(ctx context.Context, req any, info *grpc.UnaryServerInfo,
	handler grpc.UnaryHandler) (any, error) {
	ctx, span := startSpan(ctx, info.FullMethod)
	defer span.End()

	ztrace.MessageReceived.Event(ctx, 1, req)
	resp, callErr := handler(ctx, req)

	// Happy path first: tag the span and report the outgoing response.
	if callErr == nil {
		span.SetAttributes(ztrace.StatusCodeAttr(gcodes.OK))
		ztrace.MessageSent.Event(ctx, 1, resp)
		return resp, nil
	}

	if st, isStatus := status.FromError(callErr); isStatus {
		span.SetStatus(codes.Error, st.Message())
		span.SetAttributes(ztrace.StatusCodeAttr(st.Code()))
		ztrace.MessageSent.Event(ctx, 1, st.Proto())
	} else {
		// Not a gRPC status error: only the raw message is available.
		span.SetStatus(codes.Error, callErr.Error())
	}

	return nil, callErr
}

// startSpan starts a server span for the incoming RPC method. The baggage and
// the remote span context are extracted from the incoming gRPC metadata, so
// the new span is created under the span context sent by the client.
func startSpan(ctx context.Context, method string) (context.Context, trace.Span) {
	// Use the incoming metadata on ctx, or an empty set when there is none.
	md, found := metadata.FromIncomingContext(ctx)
	if !found {
		md = metadata.MD{}
	}

	bags, remote := ztrace.Extract(ctx, otel.GetTextMapPropagator(), &md)
	ctx = baggage.ContextWithBaggage(ctx, bags)

	tracer := otel.Tracer(ztrace.TraceName)
	spanName, attrs := ztrace.SpanInfo(method, ztrace.PeerFromCtx(ctx))

	// Mark the extracted span context as the remote parent before starting.
	parentCtx := trace.ContextWithRemoteSpanContext(ctx, remote)
	return tracer.Start(
		parentCtx,
		spanName,
		trace.WithSpanKind(trace.SpanKindServer),
		trace.WithAttributes(attrs...),
	)
}

        rpc的client和server很相似,通过前面的代码解释已经很清楚了,就没必要讲代码了,看看调试的数据,就一目了然

        首先是rpc的client通过md将数据传了过来,从ctx中获取md

        然后将md中traceparent的数据提取到ctx的键值对(currentSpanKey,nonRecordingSpan)中,这里span的remote设置为true

        其实上面是提取两个,还有baggage,但是它为空,就被忽略了

        这里的rpc的server的生成的name和attributes键值对数组

        最后生成span,注意这里的parentspancontext是client传过来的

        最后就是和rpc client一样了,添加事件和状态码。

4.span(client-redis)

        go-zero的redis的client的trace的中间件代码为

// startSpan starts a client span named spanName using the tracer obtained
// from trace.TracerFromContext(ctx), records the names of cmds on it as a
// string-slice attribute, and returns the context that carries the span.
func (h hook) startSpan(ctx context.Context, cmds ...red.Cmder) context.Context {
	spanCtx, span := trace.TracerFromContext(ctx).Start(
		ctx,
		spanName,
		oteltrace.WithSpanKind(oteltrace.SpanKindClient),
	)

	// Collect the command names in order.
	names := make([]string, len(cmds))
	for i, cmd := range cmds {
		names[i] = cmd.Name()
	}
	span.SetAttributes(redisCmdsAttributeKey.StringSlice(names))

	return spanCtx
}

// endSpan ends the span stored in ctx. A nil error and red.Nil both count as
// success and set the status to Ok; any other error sets the status to Error
// and is recorded on the span.
func (h hook) endSpan(ctx context.Context, err error) {
	span := oteltrace.SpanFromContext(ctx)
	defer span.End()

	if err != nil && err != red.Nil {
		span.SetStatus(codes.Error, err.Error())
		span.RecordError(err)
		return
	}

	span.SetStatus(codes.Ok, "")
}

        通过上面的解释,这个就很简单了,就不讲了

5.总结

        现在为止,第一章生成的四个span的数据的字段(Name,SpanContext,Parent,SpanKind,StartTime,EndTime,Attributes,Events,...),应该都可以读懂了,可以根据这些字段找到父子级的层级关系和时间长短。其实trace本身原理很简单,如果在同一进程里,就通过ctx传输数据,这个俗称go的狗链子,到哪里都要带着,如果是网络通信,就通过网络传输数据,可以固定几个字段,搞几个键值对,将数据传输到下层,这里的http是通过header传输,rpc是通过md传输。网上说trace的难点是埋点,我们只是通过go-zero的代码来看埋点的过程,上面讲的四个都是原作者埋的点,通过这两篇文章的解析,自己应该也可以学会go的trace的用法。

最后本人能力有限,如果上面的有纰漏和问题,欢迎大佬指出

  • 23
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值