最近搭建完gateway模块后想要详细了解下 网关的链路追踪与日志管理,使用的是tempo+grafana+loki,这里三个都需要搭建。
最后实现的是 可以追踪请求信息和日志管理,具体能实现的功能还有很多。
快速定位问题追踪
分析性能瓶颈
理解服务调用关系
监控系统健康状态
优化系统性能
我这里只是简单的搭建了一下,大家可以根据自己的业务深入了解下。
废话不多说,我这里用的 Docker,直接上 Docker 配置。PS:走了很多弯路最终才配置成功,苦不堪言。
docker-compose.yaml
networks:
lokinet: # 更改网络名称,使其更具唯一性
driver: bridge
volumes:
loki-data:
grafana-data:
name: loki-stack
services:
# Tempo runs as user 10001, and docker compose creates the volume as root.
# As such, we need to chown the volume in order for Tempo to start correctly.
tempo:
image: grafana/tempo:latest
container_name: tempo
command: [ "-config.file=/etc/tempo.yaml" ]
volumes:
- ./tempo.yaml:/etc/tempo.yaml
- ./tempo-data:/var/tempo
networks:
- lokinet
ports:
- "14268:14268" # jaeger ingest
- "3200:3200" # tempo
- "9095:9095" # tempo grpc
- "4317:4317" # otlp grpc
- "4318:4318" # otlp http
- "9411:9411" # zipkin
loki:
image: grafana/loki:2.9.0
container_name: lokiserver
ports:
- "3100:3100"
volumes:
- ./loki-config.yaml:/etc/loki/local-config.yaml
- loki-data:/loki
command: -config.file=/etc/loki/local-config.yaml
networks:
lokinet:
aliases:
- lokiserver
restart: unless-stopped
promtail:
image: grafana/promtail:latest
container_name: promtail
depends_on:
- loki # 依赖于 Loki
networks:
- lokinet
volumes:
- ./promtail.yaml:/etc/promtail/config.yaml # Promtail配置文件
- /var/log:/var/log # 挂载本地日志目录
command: -config.file=/etc/promtail/config.yaml
grafana:
image: grafana/grafana:11.2.0
container_name: grafana
ports:
- "3000:3000"
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/provisioning/:/etc/grafana/provisioning/
networks:
- lokinet
depends_on:
- loki
tempo.yaml
stream_over_http_enabled: true
server:
http_listen_port: 3200
log_level: info
query_frontend:
search:
duration_slo: 5s
throughput_bytes_slo: 1.073741824e+09
metadata_slo:
duration_slo: 5s
throughput_bytes_slo: 1.073741824e+09
trace_by_id:
duration_slo: 5s
distributor:
receivers: # this configuration will listen on all ports and protocols that tempo is capable of.
jaeger: # the receives all come from the OpenTelemetry collector. more configuration information can
protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
thrift_http: #
grpc: # for a production deployment you should only enable the receivers you need!
thrift_binary:
thrift_compact:
zipkin:
otlp:
protocols:
http:
grpc:
opencensus:
ingester:
max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally
compactor:
compaction:
block_retention: 1h # overall Tempo trace retention. set for demo purposes
storage:
trace:
backend: local # backend configuration to use
wal:
path: /var/tempo/wal # where to store the wal locally
local:
path: /var/tempo/blocks
loki-config.yaml
auth_enabled: false
server:
http_listen_port: 3100
ingester:
wal:
enabled: true
dir: "/loki/wal" # 指定 WAL 目录
lifecycler:
ring:
kvstore:
store: inmemory
replication_factor: 1
final_sleep: 0s
chunk_idle_period: 5m
chunk_retain_period: 30s
schema_config:
configs:
- from: 2020-05-15
store: boltdb
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h
storage_config:
boltdb:
directory: /loki/index
filesystem:
directory: /loki/chunks
limits_config:
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
promtail.yaml
server:
http_listen_port: 9080
grpc_listen_port: 0
clients:
- url: http://loki:3100/loki/api/v1/push # Loki 服务地址,与 docker-compose 中的服务名保持一致
positions:
filename: /etc/promtail/positions.yaml # 记录日志读取位置
scrape_configs:
- job_name: system
static_configs:
- targets:
- localhost
labels:
job: varlogs
__path__: /var/log/*.log # 要收集的日志路径
gateway-service pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.example.project</groupId>
<artifactId>project-root</artifactId>
<version>1.0-SNAPSHOT</version>
</parent>
<artifactId>service-gateway</artifactId>
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencyManagement>
    <dependencies>
        <!-- OpenTelemetry BOM: pins the versions of every io.opentelemetry
             artifact declared below without an explicit <version>. -->
        <dependency>
            <groupId>io.opentelemetry</groupId>
            <artifactId>opentelemetry-bom</artifactId>
            <version>1.32.0</version>
            <type>pom</type>
            <!-- BUG FIX: without scope=import the BOM is NOT imported into
                 dependencyManagement, so the version-less opentelemetry-api /
                 opentelemetry-sdk / opentelemetry-exporter-otlp dependencies
                 below would fail to resolve. -->
            <scope>import</scope>
        </dependency>
        <!-- Spring Cloud BOM -->
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-dependencies</artifactId>
            <version>2023.0.0</version>
            <type>pom</type>
            <scope>import</scope>
            <exclusions>
                <!-- R2DBC is excluded because the gateway uses no relational
                     database; its auto-configuration is also excluded in
                     application.yaml. -->
                <exclusion>
                    <groupId>io.r2dbc</groupId>
                    <artifactId>r2dbc-spi</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>
</dependencyManagement>
<dependencies>
<!-- Spring Cloud Gateway (reactive, Netty-based) -->
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-gateway</artifactId>
</dependency>
<!-- Spring Boot Starter Actuator: exposes the management endpoints configured in application.yaml -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- Spring Cloud Starter Config (disabled via spring.cloud.config.enabled=false in application.yaml) -->
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-config</artifactId>
</dependency>
<!-- Nacos Discovery: service registration and discovery; drives the lb:// routes -->
<dependency>
<groupId>com.alibaba.cloud</groupId>
<artifactId>spring-cloud-starter-alibaba-nacos-discovery</artifactId>
<version>2023.0.1.3</version>
</dependency>
<!-- Required for lb:// URIs since Ribbon's removal -->
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-loadbalancer</artifactId>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.springframework.boot</groupId>-->
<!-- <artifactId>spring-boot-starter-security</artifactId>-->
<!-- </dependency>-->
<!-- Reactive Redis client, matching the gateway's reactive stack -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis-reactive</artifactId>
</dependency>
<!-- OpenTelemetry API (version from the BOM in dependencyManagement) -->
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-api</artifactId>
</dependency>
<!-- OpenTelemetry SDK -->
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-sdk</artifactId>
</dependency>
<!-- OpenTelemetry OTLP Exporter: ships spans to Tempo on 4317 (gRPC) -->
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-exporter-otlp</artifactId>
</dependency>
<!-- OpenTelemetry Auto Configuration (env/system-property driven SDK setup) -->
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-sdk-extension-autoconfigure</artifactId>
</dependency>
<!-- Bridges trace/span ids into the Logback MDC -->
<dependency>
<groupId>io.opentelemetry.instrumentation</groupId>
<artifactId>opentelemetry-logback-mdc-1.0</artifactId>
<version>1.32.0-alpha</version>
</dependency>
<!-- @WithSpan / @SpanAttribute annotations.
     NOTE(review): 1.27.0 does not match the 1.32.x line used above — verify compatibility. -->
<dependency>
<groupId>io.opentelemetry.instrumentation</groupId>
<artifactId>opentelemetry-instrumentation-annotations</artifactId>
<version>1.27.0</version>
</dependency>
<!-- Micrometer Tracing facade (used by the management.tracing properties) -->
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-tracing</artifactId>
<version>1.1.5</version>
</dependency>
<!-- NOTE(review): log4j-core below is a second logging BACKEND next to Logback
     (pulled in by Spring Boot and required by the Loki appender). Two backends on
     the classpath is a common source of conflicts — consider removing log4j-core
     or replacing it with log4j-to-slf4j. 2.17.1 is also well behind current 2.x. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.17.1</version> <!-- pick a version consistent with the rest of the build -->
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.17.1</version>
</dependency>
<!-- Loki Logback appender used by logback-spring.xml.
     NOTE(review): 1.5.0-rc1 is a release candidate — prefer a final release. -->
<dependency>
<groupId>com.github.loki4j</groupId>
<artifactId>loki-logback-appender</artifactId>
<version>1.5.0-rc1</version>
</dependency>
</dependencies>
</project>
application.yaml
server:
port: 8080 # 配置网关服务的端口
spring:
application:
name: gateway-service
cloud:
nacos:
discovery:
server-addr: localhost:8848
namespace: public
config:
enabled: false
gateway:
discovery:
locator:
enabled: true # 启用服务发现路由
lower-case-service-id: true
routes:
- id: user
uri: lb://user-service # 使用负载均衡,通过服务名进行路由
predicates:
- Path=/user/** # 所有以 /user/ 开头的请求会被路由到 user-service
redis:
host: localhost
port: 6379
autoconfigure:
exclude:
- org.springframework.boot.autoconfigure.r2dbc.R2dbcAutoConfiguration
management:
tracing:
otlp:
endpoint: "http://localhost:4317" # Docker 网络中访问 Tempo
compression: gzip
enabled: true
endpoints:
web:
exposure:
include: "*" # 开启所有端点
endpoint:
health:
show-details: always # 显示详细健康信息
logback-spring.xml
<!-- Logback configuration: every root-level log line is pushed to Loki via the loki4j appender. -->
<configuration>
<appender name="LOKI" class="com.github.loki4j.logback.Loki4jAppender">
<!-- NOTE(review): the hostname "loki" only resolves inside the docker compose
     network. application.yaml points this app at host-local services
     (localhost:8848, localhost:4317), which suggests it runs on the host — in
     that case this should be http://localhost:3100. Please verify. -->
<url>http://loki:3100/loki/api/v1/push</url>
<!-- Static Loki stream labels; "app" is the label to query by in Grafana. -->
<labels>
<label name="app" value="my-java-app"/>
</labels>
<logFormat>json</logFormat> <!-- may be "json" or "simple" -->
<encoder class="com.github.loki4j.logback.JsonEncoder">
<!-- trace_id / span_id are read from the MDC (populated by GlobalLogFilter),
     which is what enables Grafana's logs-to-traces correlation with Tempo. -->
<pattern>
{
"timestamp": "%d{yyyy-MM-dd HH:mm:ss.SSS}",
"level": "%level",
"thread": "%thread",
"logger": "%logger{36}",
"message": "%message",
"trace_id": "%X{trace_id}",
"span_id": "%X{span_id}"
}
</pattern>
</encoder>
<!-- Expose appender metrics (sent/dropped batches) via Micrometer. -->
<metricsEnabled>true</metricsEnabled>
</appender>
<root level="info">
<appender-ref ref="LOKI"/>
</root>
</configuration>
package gateway.config;
import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.common.Attributes;
import io.opentelemetry.context.propagation.ContextPropagators;
import io.opentelemetry.context.propagation.TextMapPropagator;
import io.opentelemetry.sdk.OpenTelemetrySdk;
import io.opentelemetry.sdk.resources.Resource;
import io.opentelemetry.sdk.trace.SdkTracerProvider;
import io.opentelemetry.sdk.trace.export.BatchSpanProcessor;
import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter;
import io.opentelemetry.semconv.ResourceAttributes;
// 更改这两个 import
import io.opentelemetry.api.baggage.propagation.W3CBaggagePropagator;
import io.opentelemetry.api.trace.propagation.W3CTraceContextPropagator;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
@Configuration
public class OpenTelemetryConfig {

    /**
     * Builds a minimal OpenTelemetry SDK that exports spans to Tempo over
     * OTLP/gRPC and registers it as the global instance.
     *
     * @return the configured {@link OpenTelemetry} entry point, also installed
     *         as the process-wide global instance
     */
    @Bean
    public OpenTelemetry openTelemetry() {
        // Identify this service; the name shows up as the service tag in Tempo.
        Attributes serviceAttributes = Attributes.of(
                ResourceAttributes.SERVICE_NAME, "my-java-app");
        Resource serviceResource = Resource.getDefault()
                .merge(Resource.create(serviceAttributes));

        // Export spans in batches over OTLP/gRPC (Tempo listens on 4317).
        OtlpGrpcSpanExporter spanExporter = OtlpGrpcSpanExporter.builder()
                .setEndpoint("http://localhost:4317")
                .build();
        BatchSpanProcessor spanProcessor = BatchSpanProcessor.builder(spanExporter).build();

        SdkTracerProvider tracerProvider = SdkTracerProvider.builder()
                .addSpanProcessor(spanProcessor)
                .setResource(serviceResource)
                .build();

        // Propagate context downstream via the W3C trace-context and baggage headers.
        TextMapPropagator propagator = TextMapPropagator.composite(
                W3CTraceContextPropagator.getInstance(),
                W3CBaggagePropagator.getInstance());

        return OpenTelemetrySdk.builder()
                .setTracerProvider(tracerProvider)
                .setPropagators(ContextPropagators.create(propagator))
                .buildAndRegisterGlobal();
    }
}
package gateway.filter;
import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.SpanKind;
import io.opentelemetry.api.trace.Tracer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import org.springframework.cloud.gateway.filter.GlobalFilter;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
import reactor.core.publisher.Mono;
import org.springframework.web.server.ServerWebExchange;
import org.springframework.http.server.reactive.ServerHttpRequest;
@Component
@Order(0) // lower value = higher priority among global filters
public class GlobalLogFilter implements GlobalFilter {

    private static final Logger logger = LoggerFactory.getLogger(GlobalLogFilter.class);

    private final Tracer tracer;

    public GlobalLogFilter(OpenTelemetry openTelemetry) {
        // Named tracer scoped to this gateway service.
        this.tracer = openTelemetry.getTracer("gateway-service");
    }

    /**
     * Starts a SERVER span for every routed request, tags it with basic HTTP
     * attributes, and logs request/response details with trace_id / span_id
     * placed in the MDC so the Loki appender can correlate logs with Tempo traces.
     *
     * @param exchange the current server exchange
     * @param chain    the remaining gateway filter chain
     * @return completion signal of the downstream chain
     */
    @Override
    public Mono<Void> filter(ServerWebExchange exchange, org.springframework.cloud.gateway.filter.GatewayFilterChain chain) {
        ServerHttpRequest request = exchange.getRequest();

        Span span = tracer.spanBuilder("gateway: " + request.getURI())
                .setSpanKind(SpanKind.SERVER)
                .startSpan();
        span.setAttribute("name", "test");
        span.setAttribute("http.method", request.getMethod().toString());
        span.setAttribute("http.url", request.getURI().toString());

        // BUG FIX: the original read Span.current(), but the new span was never
        // made current, so the MDC received ids from whatever context was active
        // (typically an invalid all-zero context). Read the ids from the span we
        // actually started.
        String traceId = span.getSpanContext().getTraceId();
        String spanId = span.getSpanContext().getSpanId();

        MDC.put("trace_id", traceId);
        MDC.put("span_id", spanId);
        try {
            logger.info("Incoming Request: Method={}, URI={}, Headers={}",
                    request.getMethod(), request.getURI(), request.getHeaders());
        } finally {
            // Clear immediately: Reactor may resume this request on another
            // thread, and leftover MDC values would leak into unrelated requests.
            MDC.remove("trace_id");
            MDC.remove("span_id");
        }

        // doFinally runs on success, error AND cancellation — unlike the original
        // `.then(Mono.fromRunnable(...))`, which was skipped on error, leaving
        // failed requests without a response log.
        return chain.filter(exchange).doFinally(signalType -> {
            // Re-install the MDC: completion may happen on a different thread
            // than the one that started the span.
            MDC.put("trace_id", traceId);
            MDC.put("span_id", spanId);
            try {
                logger.info("Response: Status Code={}, Headers={}",
                        exchange.getResponse().getStatusCode(), exchange.getResponse().getHeaders());
                logger.info("Tracing completed for path: {}", request.getURI());
            } finally {
                MDC.remove("trace_id");
                MDC.remove("span_id");
                // End the span only after the downstream chain has finished so
                // its duration covers the full proxied request.
                span.end();
            }
        });
    }
}
以上是全部配置代码。我这里还有一个 user-service 用来测试,大家可以看上文搭建篇了解网关搭建。最后请求 http://localhost:8080/user-service/users ,就可以在 Tempo 和 Loki 中查看相关信息。(这里有个问题:服务发现配置与静态路由配置存在冲突。正常情况下 user-service 可以直接用静态路由前缀 user 访问,即 http://localhost:8080/user/users ;但由于启用了服务发现,这里直接用的是 Nacos 的 service name。我查了下,这个问题可以通过配置解决,这里没做更改,大家可以自行了解。)
这两个工具可以通过查询语句来实现,具体的大家可以去了解下。我这里不赘述了。