核心链路调用
// Wrap the resource, its slot chain and the current context in a new CtEntry.
Entry e = new CtEntry(resourceWrapper, chain, context);
try {
// Start the slot chain for this invocation.
chain.entry(context, resourceWrapper, null, count, prioritized, args);
} catch (BlockException e1) {
// A slot blocked the request: exit the entry, then propagate the exception to the caller.
e.exit(count, args);
throw e1;
} catch (Throwable e1) {
// This should not happen, unless there are errors existing in Sentinel internal.
// The error is only logged here; the entry is still returned below.
RecordLog.info("Sentinel unexpected exception", e1);
}
return e;
资源调用会创建一个CtEntry,然后开始执行上面创建的执行链
NodeSelectorSlot
// Cache of DefaultNode instances keyed by context name. The field is volatile and is
// re-assigned to a freshly built map (see entry below) rather than mutated in place.
private volatile Map<String, DefaultNode> map = new HashMap<String, DefaultNode>(10);
/**
 * Resolves the DefaultNode for the current context name, creating and caching it on
 * first use, then sets it as the context's current node and continues the chain.
 */
@Override
public void entry(Context context, ResourceWrapper resourceWrapper, Object obj, int count, boolean prioritized, Object... args)
throws Throwable {
// Nodes are looked up by context name, so each context gets its own DefaultNode in this slot.
DefaultNode node = map.get(context.getName());
// First check, without holding the lock.
if (node == null) {
synchronized (this) {
// Second check under the lock: another thread may have created the node in the meantime.
node = map.get(context.getName());
if (node == null) {
node = new DefaultNode(resourceWrapper, null);
// Copy the existing entries into a new map, add the new node, then publish the
// new map through the volatile field.
HashMap<String, DefaultNode> cacheMap = new HashMap<String, DefaultNode>(map.size());
cacheMap.putAll(map);
cacheMap.put(context.getName(), node);
map = cacheMap;
}
// Build invocation tree: attach the node as a child of the context's last node.
// This runs whenever the lock-free lookup above missed, even if the second check found the node.
((DefaultNode)context.getLastNode()).addChild(node);
}
}
context.setCurNode(node);
fireEntry(context, resourceWrapper, node, count, prioritized, args);
}
这个slot主要是为了收集资源路径,并将这些资源的调用路径以树状结构存储起来,用于根据调用路径来限流降级。首先判断缓存map中是否已经生成了DefaultNode,如果没有则进入双重检查逻辑,创建DefaultNode并设置资源(一个context+resource确定一个DefaultNode),然后构建请求的资源路径(树状结构),并且设置context中当前调用的node。这个类上有详细的注释说明,我们要仔细看一下:
// Enter a context named "entrance1"; the second argument ("appA") is the caller/origin.
ContextUtil.enter("entrance1", "appA");
// Request an entry for the resource "nodeA".
Entry nodeA = SphU.entry("nodeA");
if (nodeA != null) {
// Release the entry once the protected logic is done.
nodeA.exit();
}
// Leave the context.
ContextUtil.exit();
上述代码通过 ContextUtil.enter() 创建了一个名为 entrance1 的上下文,同时指定调用发起者为 appA;接着通过 SphU.entry()判断请求是否可以正常通过,如果不能顺利通过发生限流降级则会抛出 BlockException。
以上代码将在内存中生成以下结构:
              machine-root
                  /
                 /
          EntranceNode1
              /
             /
    DefaultNode(nodeA)- - - - - -> ClusterNode(nodeA);
每个 DefaultNode 由资源 ID 和入口名称(即 context 名称)来标识,同一个资源在不同的context中会有不同的DefaultNode,比如:
// First context: "entrance1", with "appA" as the caller/origin.
ContextUtil.enter("entrance1", "appA");
Entry nodeA = SphU.entry("nodeA");
if (nodeA != null) {
nodeA.exit();
}
ContextUtil.exit();
// Second context: "entrance2", entering the same resource "nodeA" again.
ContextUtil.enter("entrance2", "appA");
nodeA = SphU.entry("nodeA");
if (nodeA != null) {
nodeA.exit();
}
ContextUtil.exit();
以上代码将在内存中生成以下结构:
                  machine-root
                  /          \
                 /            \
         EntranceNode1    EntranceNode2
               /                \
              /                  \
     DefaultNode(nodeA)    DefaultNode(nodeA)
             |                     |
             +- - - - - - - - - - -+- - - - - - -> ClusterNode(nodeA);
可以看到同一个资源在不同context中会有不同的DefaultNode,但是全局只有一个ClusterNode
ClusterBuilderSlot
// Map from resource to its ClusterNode. It is static, so it is shared by all slot instances;
// it is replaced with a new map (never mutated in place) when a node is added.
private static volatile Map<ResourceWrapper, ClusterNode> clusterNodeMap = new HashMap<>();
// Lock guarding ClusterNode creation and the replacement of clusterNodeMap.
private static final Object lock = new Object();
// ClusterNode held by this slot instance; created lazily on the first entry.
private volatile ClusterNode clusterNode = null;
/**
 * Ensures this slot has a ClusterNode, attaches it to the given DefaultNode, resolves the
 * origin node when the context carries a non-empty origin, and continues the chain.
 */
@Override
public void entry(Context context, ResourceWrapper resourceWrapper, DefaultNode node, int count,
boolean prioritized, Object... args)
throws Throwable {
if (clusterNode == null) {
synchronized (lock) {
// Second check under the lock.
if (clusterNode == null) {
// Create the cluster node.
// Per the original note, it is used for resource-wide (global) request statistics.
clusterNode = new ClusterNode();
// Author's question: why is a ConcurrentHashMap not used here?
// The code copies the current map into a new HashMap, adds the new node and re-assigns the field.
HashMap<ResourceWrapper, ClusterNode> newMap = new HashMap<>(Math.max(clusterNodeMap.size(), 16));
newMap.putAll(clusterNodeMap);
newMap.put(node.getId(), clusterNode);
clusterNodeMap = newMap;
}
}
}
node.setClusterNode(clusterNode);
/*
* if context origin is set, we should get or create a new {@link Node} of
* the specific origin.
*/
// Request origin: skipped only when the origin is the empty string.
if (!"".equals(context.getOrigin())) {
Node originNode = node.getClusterNode().getOrCreateOriginNode(context.getOrigin());
context.getCurEntry().setOriginNode(originNode);
}
fireEntry(context, resourceWrapper, node, count, prioritized, args);
}
源码中可以看到ClusterNode和ResourceWrapper绑定,在这里构建了资源的clusterNode和请求源originNode,这里有个疑问为什么不使用ConcurrentHashMap,而是使用普通HashMap通过加锁的形式来设置ClusterNode呢?
这里可能是出于高并发读的考虑而采用了copy-on-write(写时复制):clusterNodeMap由static volatile修饰,而volatile只保证"引用"本身的可见性,并不保证map内部内容的可见性;如果直接在原HashMap上put,其他线程不一定能看到新增的内容,而且并发读还可能遇到HashMap扩容时的中间状态。通过先复制、再修改、最后整体替换引用的方式,其他线程要么看到旧map,要么看到完整的新map,从而保证数据一致性;读操作完全无锁,只有写操作加锁,起到读写分离的效果,提高读并发。同时临时map在创建时指定了初始容量,减少了因扩容带来的性能波动。
LogSlot
public void entry(Context context, ResourceWrapper resourceWrapper, DefaultNode obj, int count, boolean prioritized, Object… args)
throws Throwable {
try {
fireEntry(context, resourceWrapper, obj, count, prioritized, args);
} catch (BlockException e) {
EagleEyeLogUtil.log(resourceWrapper.getName(), e.getClass().getSimpleName(), e.getRuleLimitApp(),
context.getOrigin(), count);
throw e;
} catch (Throwable e) {
RecordLog.warn(“Unexpected entry exception”, e);
}
}
这里可以看到该slot没做任何逻辑处理,只有在抛出BlockException时或其他异常时打印日志输出
StatisticSlot
/**
 * Runs the remaining slots first and then records the outcome: pass and thread counts on
 * success, block counts on BlockException, exception counts on any other Throwable.
 * Counters are updated on the DefaultNode, on the origin node when one is set, and on
 * Constants.ENTRY_NODE for inbound (EntryType.IN) resources.
 */
@Override
public void entry(Context context, ResourceWrapper resourceWrapper, DefaultNode node, int count,
boolean prioritized, Object... args) throws Throwable {
try {
// Do some checking.
// Run the following slots first; statistics are recorded only after they let the request through.
fireEntry(context, resourceWrapper, node, count, prioritized, args);
// Request passed, add thread count and pass count.
// Increase the thread count of the node.
node.increaseThreadNum();
// Increase the pass count of the node.
node.addPassRequest(count);
// If an origin node exists, update its statistics as well.
if (context.getCurEntry().getOriginNode() != null) {
// Add count for origin node.
// Thread count and pass count of the request origin.
context.getCurEntry().getOriginNode().increaseThreadNum();
context.getCurEntry().getOriginNode().addPassRequest(count);
}
// Update the global statistics.
if (resourceWrapper.getType() == EntryType.IN) {
// Add count for global inbound entry node for global statistics.
Constants.ENTRY_NODE.increaseThreadNum();
Constants.ENTRY_NODE.addPassRequest(count);
}
// Handle pass event with registered entry callback handlers.
// Invoke the registered callbacks.
for (ProcessorSlotEntryCallback<DefaultNode> handler : StatisticSlotCallbackRegistry.getEntryCallbacks()) {
handler.onPass(context, resourceWrapper, node, count, args);
}
} catch (PriorityWaitException ex) {
// Treated as a pass: thread counts are increased and onPass callbacks fire,
// but no pass count is added in this branch.
node.increaseThreadNum();
if (context.getCurEntry().getOriginNode() != null) {
// Add count for origin node.
context.getCurEntry().getOriginNode().increaseThreadNum();
}
if (resourceWrapper.getType() == EntryType.IN) {
// Add count for global inbound entry node for global statistics.
Constants.ENTRY_NODE.increaseThreadNum();
}
// Handle pass event with registered entry callback handlers.
for (ProcessorSlotEntryCallback<DefaultNode> handler : StatisticSlotCallbackRegistry.getEntryCallbacks()) {
handler.onPass(context, resourceWrapper, node, count, args);
}
} catch (BlockException e) {
// Blocked, set block exception to current entry.
// Store the error on the current entry.
context.getCurEntry().setError(e);
// Add block count.
// Count the blocked requests.
node.increaseBlockQps(count);
if (context.getCurEntry().getOriginNode() != null) {
context.getCurEntry().getOriginNode().increaseBlockQps(count);
}
if (resourceWrapper.getType() == EntryType.IN) {
// Add count for global inbound entry node for global statistics.
// Count the blocked requests globally.
Constants.ENTRY_NODE.increaseBlockQps(count);
}
// Handle block event with registered entry callback handlers.
for (ProcessorSlotEntryCallback<DefaultNode> handler : StatisticSlotCallbackRegistry.getEntryCallbacks()) {
handler.onBlocked(e, context, resourceWrapper, node, count, args);
}
throw e;
} catch (Throwable e) {
// Unexpected error, set error to current entry.
context.getCurEntry().setError(e);
// This should not happen.
// Count the call as an exception.
node.increaseExceptionQps(count);
if (context.getCurEntry().getOriginNode() != null) {
context.getCurEntry().getOriginNode().increaseExceptionQps(count);
}
// Count the exception globally.
if (resourceWrapper.getType() == EntryType.IN) {
Constants.ENTRY_NODE.increaseExceptionQps(count);
}
throw e;
}
}
这里可以看到该slot主要完成调用数据的实时统计
SystemSlot
/**
 * Applies the system-level checks to the resource and, if none of them throws,
 * continues the slot chain.
 */
@Override
public void entry(Context context, ResourceWrapper resourceWrapper, DefaultNode node, int count,
boolean prioritized, Object... args) throws Throwable {
// Check against the global (system-wide) metrics.
SystemRuleManager.checkSystem(resourceWrapper);
fireEntry(context, resourceWrapper, node, count, prioritized, args);
}
/**
 * Verifies the system-wide limits for an inbound resource.
 *
 * <p>The checks run in this order and the first violated one throws: total QPS, total
 * thread count, average RT, system load, CPU usage. Nothing is checked when the switch
 * is off or when the resource is not of type {@code EntryType.IN}.
 *
 * @param resourceWrapper the resource being entered
 * @throws BlockException a {@code SystemBlockException} naming the violated limit
 *                        ("qps", "thread", "rt", "load" or "cpu")
 */
public static void checkSystem(ResourceWrapper resourceWrapper) throws BlockException {
    // Nothing to do when the switch is off, or for anything but inbound traffic.
    if (!checkSystemStatus.get() || resourceWrapper.getType() != EntryType.IN) {
        return;
    }

    // Total QPS across all inbound entries.
    double currentQps = 0.0;
    if (Constants.ENTRY_NODE != null) {
        currentQps = Constants.ENTRY_NODE.successQps();
    }
    if (currentQps > qps) {
        throw new SystemBlockException(resourceWrapper.getName(), "qps");
    }

    // Total number of threads currently inside inbound entries.
    int currentThread = 0;
    if (Constants.ENTRY_NODE != null) {
        currentThread = Constants.ENTRY_NODE.curThreadNum();
    }
    if (currentThread > maxThread) {
        throw new SystemBlockException(resourceWrapper.getName(), "thread");
    }

    // Global average response time.
    double rt = 0;
    if (Constants.ENTRY_NODE != null) {
        rt = Constants.ENTRY_NODE.avgRt();
    }
    if (rt > maxRt) {
        throw new SystemBlockException(resourceWrapper.getName(), "rt");
    }

    // Load check (BBR algorithm): only consulted when a load limit is configured and exceeded.
    if (highestSystemLoadIsSet && getCurrentSystemAvgLoad() > highestSystemLoad
        && !checkBbr(currentThread)) {
        throw new SystemBlockException(resourceWrapper.getName(), "load");
    }

    // CPU usage check: same BBR gate, different trigger.
    if (highestCpuUsageIsSet && getCurrentCpuUsage() > highestCpuUsage
        && !checkBbr(currentThread)) {
        throw new SystemBlockException(resourceWrapper.getName(), "cpu");
    }
}
AuthoritySlot
/**
 * Runs the black/white list check for the resource and, if it does not throw,
 * continues the slot chain.
 */
@Override
public void entry(Context context, ResourceWrapper resourceWrapper, DefaultNode node, int count, boolean prioritized, Object... args)
throws Throwable {
checkBlackWhiteAuthority(resourceWrapper, context);
fireEntry(context, resourceWrapper, node, count, prioritized, args);
}
// Looks up the authority rules configured for the resource; returns early when there are none.
// NOTE(review): this excerpt is cut off after the second early return — the closing braces
// and the part that evaluates the rules are not shown here.
void checkBlackWhiteAuthority(ResourceWrapper resource, Context context) throws AuthorityException {
// Fetch the configured authority rules.
Map<String, Set<AuthorityRule>> authorityRules = AuthorityRuleManager.getAuthorityRules();
// No rules at all: skip the check.
if (authorityRules == null) {
return;
}
// Fetch the authority rules for the requested resource.
Set<AuthorityRule> rules = authorityRules.get(resource.getName());
if (rules == null) {
return;
根据配置的黑白名单和调用来源信息,来做黑白名单控制
FlowSlot
/**
 * Checks the flow rules for the resource and, if the check does not throw,
 * continues the slot chain.
 */
@Override
public void entry(Context context, ResourceWrapper resourceWrapper, DefaultNode node, int count,
boolean prioritized, Object... args) throws Throwable {
// Check the flow-control rules.
checkFlow(resourceWrapper, context, node, count, prioritized);
// Invoke the next slot.
fireEntry(context, resourceWrapper, node, count, prioritized, args);
}
/**
 * Evaluates every flow rule registered for the resource, in iteration order.
 *
 * <p>Does nothing when {@code ruleProvider} or {@code resource} is {@code null}, or when
 * the provider returns {@code null} for the resource name.
 *
 * @throws BlockException a {@code FlowException} for the first rule that does not pass
 */
public void checkFlow(Function<String, Collection<FlowRule>> ruleProvider, ResourceWrapper resource,
                      Context context, DefaultNode node, int count, boolean prioritized) throws BlockException {
    if (ruleProvider == null || resource == null) {
        return;
    }
    // Rules are looked up by resource name.
    Collection<FlowRule> matched = ruleProvider.apply(resource.getName());
    if (matched == null) {
        return;
    }
    for (FlowRule candidate : matched) {
        if (canPassCheck(candidate, context, node, count, prioritized)) {
            continue;
        }
        // First failing rule wins; later rules are not evaluated.
        throw new FlowException(candidate.getLimitApp(), candidate);
    }
}
/**
 * Decides whether a single flow rule lets the request through.
 *
 * @return {@code true} when the rule has no limit app; otherwise the result of the
 *         cluster check (cluster-mode rules) or of the local check
 */
public boolean canPassCheck(/*@NonNull*/ FlowRule rule, Context context, DefaultNode node, int acquireCount,
                            boolean prioritized) {
    // A rule without a limit app never blocks.
    if (rule.getLimitApp() == null) {
        return true;
    }
    // Cluster-mode rules are checked through the cluster path, all others locally.
    return rule.isClusterMode()
        ? passClusterCheck(rule, context, node, acquireCount, prioritized)
        : passLocalCheck(rule, context, node, acquireCount, prioritized);
}
/**
 * Local (non-cluster) check: picks the statistics node the rule applies to and asks the
 * rule's rater whether the request can pass. Passes when no node is selected.
 */
private static boolean passLocalCheck(FlowRule rule, Context context, DefaultNode node, int acquireCount,
                                      boolean prioritized) {
    Node statNode = selectNodeByRequesterAndStrategy(rule, context, node);
    // No applicable node means the rule does not apply to this request.
    return statNode == null || rule.getRater().canPass(statNode, acquireCount, prioritized);
}
这里通过资源名称获取dashboard配置的流控规则,依次判断每条规则是否可以通过;判断时会先根据规则的配置找到对应的统计node
/**
 * Selects the statistics node a flow rule should be evaluated against, based on the
 * rule's limit app, its strategy and the origin of the current context.
 *
 * <ul>
 *   <li>limit app equals the origin (and the origin passes {@code filterOrigin}):
 *       the context's origin node for the direct strategy;</li>
 *   <li>limit app is {@code LIMIT_APP_DEFAULT}: the resource's cluster node for the
 *       direct strategy;</li>
 *   <li>limit app is {@code LIMIT_APP_OTHER} and the origin counts as "other" for the
 *       rule's resource: the context's origin node for the direct strategy.</li>
 * </ul>
 * For any non-direct strategy the result of {@code selectReferenceNode} is returned.
 *
 * @return the selected node, or {@code null} when the rule does not apply
 */
static Node selectNodeByRequesterAndStrategy(/*@NonNull*/ FlowRule rule, Context context, DefaultNode node) {
    // The limit app should not be empty.
    final String limitApp = rule.getLimitApp();
    final boolean direct = rule.getStrategy() == RuleConstant.STRATEGY_DIRECT;
    final String origin = context.getOrigin();

    // Rule targets exactly this caller.
    if (limitApp.equals(origin) && filterOrigin(origin)) {
        return direct ? context.getOriginNode() : selectReferenceNode(rule, context, node);
    }

    // Rule targets every caller ("default").
    if (RuleConstant.LIMIT_APP_DEFAULT.equals(limitApp)) {
        return direct ? node.getClusterNode() : selectReferenceNode(rule, context, node);
    }

    // Rule targets "other" callers, and this origin qualifies as one for the rule's resource.
    if (RuleConstant.LIMIT_APP_OTHER.equals(limitApp)
        && FlowRuleManager.isOtherOrigin(origin, rule.getResource())) {
        return direct ? context.getOriginNode() : selectReferenceNode(rule, context, node);
    }

    return null;
}
/**
 * Resolves the node for rules whose strategy refers to another resource or context.
 *
 * @return the cluster node of the referenced resource for the relate strategy; the given
 *         {@code node} for the chain strategy when the referenced name equals the context
 *         name; {@code null} otherwise (including an empty reference resource)
 */
static Node selectReferenceNode(FlowRule rule, Context context, DefaultNode node) {
    String referenced = rule.getRefResource();
    int mode = rule.getStrategy();

    // Without a referenced resource there is nothing to select.
    if (StringUtil.isEmpty(referenced)) {
        return null;
    }

    // Relate: measure the referenced resource instead of the current one.
    if (mode == RuleConstant.STRATEGY_RELATE) {
        return ClusterBuilderSlot.getClusterNode(referenced);
    }

    // Chain: applies only when the request came in through the referenced context.
    boolean chainMatches = mode == RuleConstant.STRATEGY_CHAIN && referenced.equals(context.getName());
    return chainMatches ? node : null;
}
我们先来看看FlowRule中的重要属性
private int grade = RuleConstant.FLOW_GRADE_QPS;// Limit type: thread count or QPS
private double count;// Flow-control threshold
private int strategy = RuleConstant.STRATEGY_DIRECT;// Strategy; the default limits the resource itself
private String refResource;// Referenced resource (or context) used by the non-direct strategies
private int controlBehavior = RuleConstant.CONTROL_BEHAVIOR_DEFAULT;// Control behavior: 0 = fail fast, 1 = warm up, 2 = uniform rate, 3 = warm up + uniform rate
private int warmUpPeriodSec = 10;// Warm-up period, in seconds
private int maxQueueingTimeMs = 500;// Maximum queueing time, in milliseconds
private boolean clusterMode;// Whether the rule runs in cluster flow-control mode
private ClusterFlowConfig clusterConfig;// Cluster flow-control configuration
private TrafficShapingController controller;// Traffic-shaping (throttling) controller
这里根据配置的规则所针对的来源分为3类:
- 设置了针对的来源,则会判断来源和设置的针对来源是否相同,如果相同而且来源不是default或者other则说明命中,再判断设置的限流策略:如果是直接,则返回请求源originNode;如果是请求链路,则返回DefaultNode;如果是关联,则根据规则中设置的关联资源,找到对应关联资源的clusterNode然后返回
- 如果针对来源没有设置,即为default,则是针对当前资源进行流控,再判断设置的限流策略:如果是直接,则返回资源的ClusterNode;如果是请求链路,则返回DefaultNode;如果是关联,则根据规则中设置的关联资源,找到对应关联资源的clusterNode然后返回
- 如果是other,即规则针对的是已单独配置规则的来源之外的其他来源,再判断设置的限流策略:如果是直接,则返回该请求源的originNode;如果是请求链路,则返回DefaultNode;如果是关联,则根据规则中设置的关联资源,找到对应关联资源的clusterNode然后返回
然后根据返回的node上的统计数据判断请求是否可以正常通过,这里我们以DefaultController(快速失败)为例
/**
 * Fail-fast check: lets the request through while the tokens already used plus the
 * requested amount do not exceed the threshold.
 *
 * <p>When the threshold is exceeded, a prioritized request under a QPS-grade rule may
 * borrow from an upcoming window: if the wait returned by {@code tryOccupyNext} is
 * shorter than the occupy timeout, the request is registered as waiting, the thread
 * sleeps for that long and a {@code PriorityWaitException} is thrown to signal that the
 * request passes after waiting.
 *
 * @return {@code true} if the request may pass immediately, {@code false} if it is rejected
 */
@Override
public boolean canPass(Node node, int acquireCount, boolean prioritized) {
    int usedTokens = avgUsedTokens(node);
    boolean overLimit = usedTokens + acquireCount > count;
    if (!overLimit) {
        return true;
    }

    // Over the limit: only prioritized requests under a QPS rule get a second chance.
    if (prioritized && grade == RuleConstant.FLOW_GRADE_QPS) {
        long now = TimeUtil.currentTimeMillis();
        // Wait needed to occupy a later window; equals the occupy timeout when none is available.
        long waitInMs = node.tryOccupyNext(now, acquireCount, count);
        if (waitInMs < OccupyTimeoutProperty.getOccupyTimeout()) {
            // Book the tokens in the future window before waiting for it.
            node.addWaitingRequest(now + waitInMs, acquireCount);
            node.addOccupiedPass(acquireCount);
            sleep(waitInMs);

            // PriorityWaitException indicates that the request will pass after waiting for {@link @waitInMs}.
            throw new PriorityWaitException(waitInMs);
        }
    }
    return false;
}
/**
 * Computes how long a request would have to wait to borrow {@code acquireCount} tokens
 * from an upcoming window.
 *
 * @param currentTime  current time in milliseconds
 * @param acquireCount number of tokens requested
 * @param threshold    threshold used to derive the maximum count for one interval
 * @return the wait in milliseconds, or {@code OccupyTimeoutProperty.getOccupyTimeout()}
 *         when nothing can be borrowed
 */
public long tryOccupyNext(long currentTime, int acquireCount, double threshold) {
// Maximum number of requests allowed within one interval.
double maxCount = threshold * IntervalProperty.INTERVAL / 1000;
// Number of requests already borrowed (waiting).
long currentBorrow = rollingCounterInSecond.waiting();
// Already borrowed up to the maximum: report the occupy timeout
// (the original note says the default is 500 ms — not visible in this excerpt).
if (currentBorrow >= maxCount) {
return OccupyTimeoutProperty.getOccupyTimeout();
}
// Length of one window (the original note says the default is 500 — not visible in this excerpt).
int windowLength = IntervalProperty.INTERVAL / SampleCountProperty.SAMPLE_COUNT;
// End of the current window moved back by one full interval.
long earliestTime = currentTime - currentTime % windowLength + windowLength - IntervalProperty.INTERVAL;
int idx = 0;
/*
* Note: here {@code currentPass} may be less than it really is NOW, because time difference
* since call rollingCounterInSecond.pass(). So in high concurrency, the following code may
* lead more tokens be borrowed.
*/
long currentPass = rollingCounterInSecond.pass();
while (earliestTime < currentTime) {
// Time left in the current window, plus idx whole windows.
long waitInMs = idx * windowLength + windowLength - currentTime % windowLength;
// Stop once the wait would reach the occupy timeout.
if (waitInMs >= OccupyTimeoutProperty.getOccupyTimeout()) {
break;
}
// Pass count of the window that starts at earliestTime.
long windowPass = rollingCounterInSecond.getWindowPass(earliestTime);
// Borrowing is allowed when: current pass + already borrowed + requested - that window's pass <= maxCount.
if (currentPass + currentBorrow + acquireCount - windowPass <= maxCount) {
return waitInMs;
}
// Advance by one window.
earliestTime += windowLength;
// Drop the examined window's passes from the running total.
currentPass -= windowPass;
idx++;
}
return OccupyTimeoutProperty.getOccupyTimeout();
}
判断请求数量是否大于限流数量:如果超过限流数量,再判断是否允许占用后续时间窗的配额;如果可以,则计算需要等待的时间,等待一定时间后放行,否则直接返回false,发生限流
DegradeSlot
/**
 * Runs the degrade-rule check for the resource and, if it does not throw,
 * continues the slot chain.
 */
@Override
public void entry(Context context, ResourceWrapper resourceWrapper, DefaultNode node, int count, boolean prioritized, Object... args)
throws Throwable {
// Check whether the resource is currently degraded.
DegradeRuleManager.checkDegrade(resourceWrapper, context, node, count);
fireEntry(context, resourceWrapper, node, count, prioritized, args);
}
/**
 * Evaluates every degrade rule registered for the resource name, in iteration order.
 * Does nothing when no rules are registered.
 *
 * @throws BlockException a {@code DegradeException} for the first rule that does not pass
 */
public static void checkDegrade(ResourceWrapper resource, Context context, DefaultNode node, int count)
    throws BlockException {
    Set<DegradeRule> configured = degradeRules.get(resource.getName());
    if (configured == null) {
        return;
    }
    for (DegradeRule candidate : configured) {
        if (candidate.passCheck(context, node, count)) {
            continue;
        }
        // First failing rule wins; later rules are not evaluated.
        throw new DegradeException(candidate.getLimitApp(), candidate);
    }
}
我们看下DegradeRule几个核心属性
// Used by passCheck both as the number of over-threshold RT checks tolerated and as a minimum sample size.
private static final int RT_MAX_EXCEED_N = 5;
private double count;// Threshold: error ratio or RT value (passCheck also compares it with the exception count)
private int timeWindow;// Time window; passCheck schedules the reset task after this many seconds
private int grade = RuleConstant.DEGRADE_GRADE_RT;// Degrade grade; defaults to RT
根据资源名称获取所有降级规则,检查是否发生降级
/**
 * Decides whether a request passes this degrade rule, based on the resource's cluster
 * node statistics and the rule's grade (RT, exception ratio or exception count).
 *
 * @return {@code true} if the request may pass; {@code false} if the rule is (or has
 *         just become) tripped
 */
public boolean passCheck(Context context, DefaultNode node, int acquireCount, Object... args) {
// While the cut flag is set, every request is rejected.
if (cut.get()) {
return false;
}
// Cluster-wide statistics node of this rule's resource.
ClusterNode clusterNode = ClusterBuilderSlot.getClusterNode(this.getResource());
if (clusterNode == null) {
return true;
}
// Grade: average response time.
if (grade == RuleConstant.DEGRADE_GRADE_RT) {
double rt = clusterNode.avgRt();
// RT is below the threshold: reset the counter and pass.
if (rt < this.count) {
passCount.set(0);
return true;
}
// Sentinel will degrade the service only if count exceeds.
// Pass until RT_MAX_EXCEED_N consecutive checks have seen the RT at or above the threshold.
if (passCount.incrementAndGet() < RT_MAX_EXCEED_N) {
return true;
}
// Grade: exception ratio.
} else if (grade == RuleConstant.DEGRADE_GRADE_EXCEPTION_RATIO) {
// Exception QPS.
double exception = clusterNode.exceptionQps();
// Success QPS.
double success = clusterNode.successQps();
// Total QPS.
double total = clusterNode.totalQps();
// if total qps less than RT_MAX_EXCEED_N, pass.
// Too few requests to judge: pass.
if (total < RT_MAX_EXCEED_N) {
return true;
}
// Per the original note, the success figure includes the exceptional requests, hence the subtraction.
double realSuccess = success - exception;
if (realSuccess <= 0 && exception < RT_MAX_EXCEED_N) {
return true;
}
// Pass while the exception ratio is below the threshold.
if (exception / success < count) {
return true;
}
// Grade: exception count.
} else if (grade == RuleConstant.DEGRADE_GRADE_EXCEPTION_COUNT) {
double exception = clusterNode.totalException();
// Exception count has not reached the threshold.
if (exception < count) {
return true;
}
}
// Only the caller that flips cut from false to true schedules the ResetTask, to run after
// timeWindow seconds (per the original note, the task switches the degrade flag off again).
if (cut.compareAndSet(false, true)) {
ResetTask resetTask = new ResetTask(this);
pool.schedule(resetTask, timeWindow, TimeUnit.SECONDS);
}
return false;
}
这里根据配置的降级规则进行判断:一个是判断RT时间是否超过阈值,一个是根据错误率判断是否发生降级,另一个是根据发生错误的数量判断是否发生降级。如果发生降级,则开启定时任务,经过一段时间(timeWindow)后将降级开关重新关闭,服务又可以对外提供服务。发生降级则会抛出BlockException,然后StatisticSlot会根据FlowSlot和DegradeSlot的返回结果进行实时数据统计。到这里执行链的核心执行流程就分析完了。