# Single-host SkyWalking stack: Elasticsearch (storage), OAP (backend), UI (web).
version: '3.3'
services:
  # Storage used by the OAP server; runs as a single node with locked memory.
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.5.0
    container_name: elasticsearch
    restart: always
    ports:
      - "9200:9200"
    environment:
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms256m -Xmx256m"
    ulimits:
      # -1 removes the memlock limit, which bootstrap.memory_lock=true relies on.
      memlock:
        soft: -1
        hard: -1

  # SkyWalking backend: 11800 is the port agents report to (see skyapm.json),
  # 12800 is the port the UI queries (see SW_OAP_ADDRESS below).
  oap:
    image: apache/skywalking-oap-server:8.8.0
    container_name: oap
    depends_on:
      - elasticsearch
    links:
      - elasticsearch
    restart: always
    ports:
      - "11800:11800"
      - "12800:12800"
    environment:
      SW_STORAGE: elasticsearch
      SW_STORAGE_ES_CLUSTER_NODES: elasticsearch:9200

  # SkyWalking web UI, published on host port 8080.
  ui:
    # Pinned to the same release as the OAP image; an untagged image resolves
    # to `latest`, which may not match the 8.8.0 backend.
    image: apache/skywalking-ui:8.8.0
    container_name: ui
    depends_on:
      - oap
    links:
      - oap
    restart: always
    ports:
      - "8080:8080"
    environment:
      SW_OAP_ADDRESS: http://oap:12800
2.添加依赖
nuget 包 SkyAPM.Agent.AspNetCore
3.编辑 SkyWalking 配置文件 skyapm.json
{
  "SkyWalking": {
    "ServiceName": "Cnsns.SiteStarV6",
    "Namespace": "",
    "HeaderVersions": ["sw8"],
    "Sampling": {
      "SamplePer3Secs": -1,
      "Percentage": -1.0
    },
    "Logging": {
      "Level": "Information",
      "FilePath": "logs\\skyapm-{Date}.log"
    },
    "Transport": {
      "Interval": 3000,
      "ProtocolVersion": "v8",
      "QueueSize": 30000,
      "BatchSize": 3000,
      "gRPC": {
        "Servers": "111.111.13.11:11800",
        "Timeout": 10000,
        "ConnectTimeout": 10000,
        "ReportTimeout": 600000,
        "Authentication": ""
      }
    }
  }
}
配置文件生成如下
1、安装CLI(SkyAPM.DotNet.CLI)
dotnet tool install -g SkyAPM.DotNet.CLI
2、自动生成 skyapm.json 文件
dotnet skyapm config [service name] [server]:11800
eg: dotnet skyapm config MySkyWalking_OrderService 111.111.13.11:11800
service name 指的就是您配置的 SKYWALKING__SERVICENAME,server 指的是您 SkyWalking 服务端(OAP)的 IP 地址。
4. 在launchSettings.json文件配置SkyWalking
"profiles": { // launch profiles of the project
"IIS Express": { // profile used when launching under IIS Express
"commandName": "IISExpress",
"launchBrowser": true,
"launchUrl": "weatherforecast",
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development",
"ASPNETCORE_HOSTINGSTARTUPASSEMBLIES": "SkyAPM.Agent.AspNetCore",
"SKYWALKING__SERVICENAME": "MySkyWalkingDemoTest"
}
},
"SkyWalkingDemo": { // profile started with the "Project" command (original note reads "castrol"; presumably Kestrel self-hosting — confirm)
"commandName": "Project",
"launchBrowser": true,
"launchUrl": "weatherforecast",
"applicationUrl": "http://localhost:5000",
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development",
"ASPNETCORE_HOSTINGSTARTUPASSEMBLIES": "SkyAPM.Agent.AspNetCore", // required
"SKYWALKING__SERVICENAME": "MySkyWalkingDemoTest" // required; the name that identifies this service in SkyWalking
}
}
}
5.在Program.cs 注册
#region 注册Skywalking
builder.Services.AddSkyApmExtensions(); // add the SkyWalking (SkyAPM) related configuration to the service collection
#endregion
6.调用获取链路追踪的Id
// Accessor for the current entry segment context; its TraceId is what the
// actions of this controller read. Named _segContext so that every action
// in the controller refers to the same field.
private readonly IEntrySegmentContextAccessor _segContext;

/// <summary>
/// Stores the entry segment context accessor supplied to the constructor.
/// </summary>
/// <param name="segContext">Accessor kept for use by the controller actions.</param>
public SkywalkingController(IEntrySegmentContextAccessor segContext)
{
    _segContext = segContext;
}

/// <summary>
/// Returns the trace id of the current entry segment context.
/// </summary>
/// <returns>A content result whose body is <c>TraceId.ToString()</c>.</returns>
public IActionResult GetSkywalkingTraceId()
{
    return Content(_segContext.Context.TraceId.ToString());
}
7.自定义链路追踪
/// <summary>
/// Demo action for custom tracing: writes the current trace id to the console,
/// attaches a log event to the current span before and after a one-second
/// pause, and returns the trace id in the response text.
/// </summary>
/// <returns>An OK result whose text contains the trace id.</returns>
[HttpGet]
public async Task<IActionResult> SkywalkingTest()
{
    // Trace id of the current SkyWalking entry segment context.
    var traceId = _segContext.Context.TraceId;
    Console.WriteLine($"TraceId={traceId}");
    _segContext.Context.Span.AddLog(LogEvent.Message($"SkywalkingTest---Worker running at: {DateTime.Now}"));
    // Simulated workload. Awaited instead of Thread.Sleep so the async action
    // really is asynchronous and does not block the request thread.
    await Task.Delay(1000);
    _segContext.Context.Span.AddLog(LogEvent.Message($"SkywalkingTest---Worker running at--end: {DateTime.Now}"));
    return Ok($"Ok,SkywalkingTest-TraceId={traceId} ");
}
网关和服务之间调用,同上配置。
有时候我们需要发通知(比如某个服务实例出问题了)
8.配置告警规则
进入容器
docker exec -it 12f053748e85 /bin/sh
通过cat alarm-settings.yml可以查阅文件内容,如下:
docker cp 12f053748e85:/skywalking/config/alarm-settings.yml .
# Sample alarm rules.
rules:
  # Rule unique name, must be ended with `_rule`.
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 3
    silence-period: 5
    message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.
  service_sla_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_sla
    op: "<"
    threshold: 8000
    # The length of time to evaluate the metrics
    period: 10
    # How many times after the metrics match the condition, will trigger alarm
    count: 2
    # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
    silence-period: 3
    message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
  service_resp_time_percentile_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_percentile
    op: ">"
    threshold: 1000,1000,1000,1000,1000
    period: 10
    count: 3
    silence-period: 5
    message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
  service_instance_resp_time_rule:
    metrics-name: service_instance_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 2
    silence-period: 5
    message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
  database_access_resp_time_rule:
    metrics-name: database_access_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
  endpoint_relation_resp_time_rule:
    metrics-name: endpoint_relation_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes
#  Active endpoint related metrics alarm will cost more memory than service and service instance metrics alarm.
#  Because the number of endpoint is much more than service and instance.
#
#  endpoint_avg_rule:
#    metrics-name: endpoint_avg
#    op: ">"
#    threshold: 1000
#    period: 10
#    count: 2
#    silence-period: 5
#    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes

# No webhook is active in this sample; uncomment an entry to enable one.
webhooks:
#  - http://127.0.0.1/notify/
#  - http://127.0.0.1/go-wechat/
规则常用指标解读:
- rule name: 规则名称,必须唯一,必须以 _rule结尾;
- metrics-name: OAL(Observability Analysis Language)脚本中的度量名;名称在 SkyWalking 后端服务中已经定义,进入 oap 容器之后,在 /skywalking/config/oal 目录下的 .oal 脚本文件中就可以找到。
- include-names: 本规则告警生效的实体名称,如服务名、端点名;
- exclude-names: 本规则告警不生效(被排除在外)的实体名称,如服务名、端点名;
- threshold: 阈值,可以是一个数组,即可以配置多个值;
- op: 操作符, 可以设定 >, <, =;
- period: 评估指标数据的时间窗口长度,即取最近多长时间内的指标数据来判断是否符合告警规则;以分钟为单位;
- count: 超过阈值条件,达到count次数,触发告警;
- silence-period: 告警触发后的静默期,在该时间内忽略相同的告警消息;默认与 period 相同;
更多告警规则详情,请参照这个地址:https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md
修改告警规则
rules:
  service_test_sal_rule:
    # Metric to evaluate. The message below describes the service success
    # rate, which is the `service_sla` metric; `service_test_sal` is not a
    # metric used anywhere else in these settings.
    metrics-name: service_sla
    # Alarm when the value is lower than the threshold.
    op: "<"
    # Threshold (8000 corresponds to the 80% stated in the message).
    threshold: 8000
    # Length of the evaluation window, in minutes.
    period: 2
    # Alarm as soon as the condition is matched once.
    count: 1
    # Stay silent for 3 minutes after an alarm so it is not repeated.
    silence-period: 3
    # Alarm text; wording matches period: 2 / count: 1 above.
    message: Successful rate of service {name} is lower than 80% in the last 2 minutes
概要:服务成功率在过去2分钟内低于80%
告警API编写
这个本质还是SkyWalking根据规则进行检查,如果符合规则条件,就通过WebHook、gRPCHook、WeChat Hook、Dingtalk Hook等方式进行消息通知;接收到告警数据信息之后,可以自行处理消息。这里为了方便,就采用WebHook的方式进行演示,即触发告警条件之后,SkyWalking会调用配置的WebHook 接口,并传递对应的告警信息;
/// <summary>
/// Data model for a single alarm entry; the webhook action receives a list
/// of these and reads <see cref="alarmMessage"/> from them.
/// Property names are lower camel case and must stay as they are.
/// </summary>
public class AlarmMsg
{
    public int scopeId { get; set; }

    public string? scope { get; set; }

    public string? name { get; set; }

    public string? id0 { get; set; }

    public string? id1 { get; set; }

    public string? ruleName { get; set; }

    /// <summary>Alarm text that the webhook action prints and mails.</summary>
    public string? alarmMessage { get; set; }
}
定义WebHook调用API
/// <summary>
/// Alarm webhook: receives a batch of alarm entries, prints their messages
/// to the console and forwards them through <c>SendMail</c>.
/// </summary>
/// <param name="msgs">
/// Alarm entries read from the request body. A null or empty batch, or a
/// batch without any alarm text, is ignored.
/// </param>
[HttpPost("AlarmMsg")]
public void AlarmMsg([FromBody] List<AlarmMsg> msgs)
{
    // A missing body must not crash the endpoint.
    if (msgs == null)
    {
        return;
    }

    // Keep every alarm text of the batch instead of only the first one,
    // so that no alarm is silently dropped.
    var alarmTexts = msgs
        .Select(entry => entry?.alarmMessage)
        .Where(text => !string.IsNullOrWhiteSpace(text))
        .ToList();

    // Nothing to report: do not send a mail that contains only the prefix.
    if (alarmTexts.Count == 0)
    {
        return;
    }

    string msg = "触发告警:" + string.Join(Environment.NewLine, alarmTexts);
    Console.WriteLine(msg);
    SendMail(msg);
}
配置webHook
# Webhook receiver used in this file: http://192.168.3.105:7900/api/Skywalking/AlarmMsg
# Sample alarm rules.
rules:
  # Rule unique name, must be ended with `_rule`.
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 3
    silence-period: 5
    message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.
  service_sla_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_sla
    op: "<"
    threshold: 8000
    # The length of time to evaluate the metrics
    period: 10
    # How many times after the metrics match the condition, will trigger alarm
    count: 2
    # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
    silence-period: 3
    message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
  service_resp_time_percentile_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_percentile
    op: ">"
    threshold: 1000,1000,1000,1000,1000
    period: 10
    count: 3
    silence-period: 5
    message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
  service_instance_resp_time_rule:
    metrics-name: service_instance_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 2
    silence-period: 5
    message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
  database_access_resp_time_rule:
    metrics-name: database_access_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
  endpoint_relation_resp_time_rule:
    metrics-name: endpoint_relation_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes
#  Active endpoint related metrics alarm will cost more memory than service and service instance metrics alarm.
#  Because the number of endpoint is much more than service and instance.
#
#  endpoint_avg_rule:
#    metrics-name: endpoint_avg
#    op: ">"
#    threshold: 1000
#    period: 10
#    count: 2
#    silence-period: 5
#    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes

# Alarms are posted to every URL listed here.
webhooks:
  - http://192.168.3.105:7900/api/Skywalking/AlarmMsg
#  - http://127.0.0.1/go-wechat/
rules:
  # Alarm rule name; must be unique and end with `_rule`.
  service_sla_rule:
    # Metric to evaluate.
    metrics-name: service_sla
    # Alarm when the value is lower than the threshold.
    op: "<"
    # Threshold (8000 corresponds to the 80% stated in the message).
    threshold: 8000
    # Length of the evaluation window, in minutes.
    period: 10
    # Alarm once the condition has been matched 2 times.
    count: 2
    # Stay silent for 3 minutes after an alarm so it is not repeated.
    silence-period: 3
    # Alarm text.
    message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes

# Alarms are posted to every URL listed here.
webhooks:
  - http://192.168.3.105:7900/api/Skywalking/AlarmMsg
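SkyWalking 为什么能做到无侵入?因为通过环境变量 ASPNETCORE_HOSTINGSTARTUPASSEMBLIES 指定的 SkyAPM.Agent.AspNetCore 程序集,会在应用自身的 IoC 容器注册之前就被加载并完成注册,所以业务代码基本不需要改动。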