1.简介
就我目前的理解,data stall检测机制是Android在网络校验成功后对网络进行持续监控的一种措施:一旦怀疑当前网络断网,就通知ConnectivityService,并重新校验该网络,以便进行相应的处理。
2.流程分析
2.1 tcp health 获取
NetworkMonitor.java
private class ValidatedState extends State {
/**
 * Entry action of the validated state: logs the validation result, reports it through
 * {@code mEvaluationState}, bumps the validation counter, and then starts TCP socket tracking
 * and periodic polling.
 */
@Override
public void enter() {
    maybeLogEvaluationResult(
            networkEventType(validationStage(), EvaluationResult.VALIDATED));

    // When the user has accepted partial connectivity and HTTPS probing is disabled, report
    // the network as both validated and partial so that settings can keep telling the user
    // that the connection is limited.
    final boolean reportAsPartial = !mUseHttps && mAcceptPartialConnectivity;
    final int result = reportAsPartial
            ? (NETWORK_VALIDATION_RESULT_VALID | NETWORK_VALIDATION_RESULT_PARTIAL)
            : NETWORK_VALIDATION_RESULT_VALID;
    mEvaluationState.reportEvaluationResult(result, null /* redirectUrl */);
    mValidations++;

    initSocketTrackingIfRequired();
    // Kick off the periodic TCP info polling.
    sendTcpPollingEvent();
    maybeStopCollectionAndSendMetrics();
}
看下网络在校验通过后干了啥
/**
 * Polls socket info once through the TCP socket tracker. Does nothing when validation is not
 * required or when no tracker is available.
 */
private void initSocketTrackingIfRequired() {
    final TcpSocketTracker tracker = isValidationRequired() ? getTcpSocketTracker() : null;
    if (tracker == null) {
        return;
    }
    tracker.pollSocketsInfo();
}
/**
 * Request to send a SockDiag Netlink request for each address family, then receive and parse
 * the returned messages. On success this refreshes {@code mSentSinceLastRecv},
 * {@code mLatestReceivedCount} and {@code mLatestPacketFailPercentage}, and drops out-of-date
 * entries from the per-socket cache.
 *
 * This function is not thread-safe and must only be called from a single thread.
 *
 * @return {@code true} if this polling request executed successfully; {@code false} if TCP
 *         info parsing is not supported or the netlink exchange threw.
 */
public boolean pollSocketsInfo() {
// Nothing to do when the dependencies report that TCP info parsing is unsupported.
if (!mDependencies.isTcpInfoParsingSupported()) return false;
FileDescriptor fd = null;
try {
// One timestamp is used for every socket parsed in this poll and for the cleanup below.
final long time = SystemClock.elapsedRealtime();
fd = mDependencies.connectToKernel();
// Accumulates the per-socket packet deltas across all address families.
final TcpStat stat = new TcpStat();
for (final int family : ADDRESS_FAMILIES) {
mDependencies.sendPollingRequest(fd, mSockDiagMsg.get(family));
// Messages are composed with the following format. Stop parsing when receiving
// message with nlmsg_type NLMSG_DONE.
// +------------------+---------------+--------------+--------+
// | Netlink Header | Family Header | Attributes | rtattr |
// | struct nlmsghdr | struct rtmsg | struct rtattr| data |
// +------------------+---------------+--------------+--------+
// : : :
// +------------------+---------------+--------------+--------+
// | Netlink Header | Family Header | Attributes | rtattr |
// | struct nlmsghdr | struct rtmsg | struct rtattr| data |
// +------------------+---------------+--------------+--------+
final ByteBuffer bytes = mDependencies.recvMessage(fd);
try {
while (enoughBytesRemainForValidNlMsg(bytes)) {
final StructNlMsgHdr nlmsghdr = StructNlMsgHdr.parse(bytes);
if (nlmsghdr == null) {
Log.e(TAG, "Badly formatted data.");
break;
}
final int nlmsgLen = nlmsghdr.nlmsg_len;
log("pollSocketsInfo: nlmsghdr=" + nlmsghdr + ", limit=" + bytes.limit());
// End of the message. Stop parsing.
if (nlmsghdr.nlmsg_type == NLMSG_DONE) break;
// Any other message type is unexpected for this request; give up on this family.
if (nlmsghdr.nlmsg_type != SOCK_DIAG_BY_FAMILY) {
Log.e(TAG, "Expect to get family " + family
+ " SOCK_DIAG_BY_FAMILY message but get "
+ nlmsghdr.nlmsg_type);
break;
}
// NOTE(review): when this size check fails, nothing in this loop moves the buffer
// past the message payload, so the next iteration parses whatever bytes follow the
// header - confirm that this cannot mis-parse the remaining data.
if (isValidInetDiagMsgSize(nlmsgLen)) {
// Get the socket cookie value. Composed by two Integers value.
// Corresponds to inet_diag_sockid in
// <linux_src>/include/uapi/linux/inet_diag.h
bytes.position(bytes.position() + IDIAG_COOKIE_OFFSET);
// It's stored in native with 2 int. Parse it as long for convenience.
final long cookie = bytes.getLong();
// Skip the rest part of StructInetDiagMsg.
bytes.position(bytes.position()
+ StructInetDiagMsg.STRUCT_SIZE - IDIAG_COOKIE_OFFSET
- Long.BYTES);
final SocketInfo info = parseSockInfo(bytes, family, nlmsgLen, time);
// Update TcpStats based on previous and current socket info.
stat.accumulate(
calculateLatestPacketsStat(info, mSocketInfos.get(cookie)));
// Remember this snapshot, keyed by cookie, as the baseline for the next poll.
mSocketInfos.put(cookie, info);
}
}
} catch (IllegalArgumentException | BufferUnderflowException e) {
// A parsing failure is reported for this family only; the loop then continues with
// the next address family.
Log.wtf(TAG, "Unexpected socket info parsing, family " + family
+ " buffer:" + bytes + " "
+ Base64.getEncoder().encodeToString(bytes.array()), e);
}
}
// Calculate mLatestReceiveCount, mSentSinceLastRecv and mLatestPacketFailPercentage.
// The sent counter keeps growing while nothing is received and resets to 0 once any packet
// was received in this poll.
mSentSinceLastRecv = (stat.receivedCount == 0)
? (mSentSinceLastRecv + stat.sentCount) : 0;
mLatestReceivedCount = stat.receivedCount;
// Integer percentage of (retransmitted + lost) over sent; 0 when nothing was sent, which
// also avoids a division by zero.
mLatestPacketFailPercentage = ((stat.sentCount != 0)
? ((stat.retransmitCount + stat.lostCount) * 100 / stat.sentCount) : 0);
// Remove out-of-date socket info.
cleanupSocketInfo(time);
return true;
} catch (ErrnoException | SocketException | InterruptedIOException e) {
Log.e(TAG, "Fail to get TCP info via netlink.", e);
} finally {
// Runs on every path, including the early failure ones, so the fd is always handed to
// closeSocketQuietly.
NetworkStackUtils.closeSocketQuietly(fd);
}
// Only reached when the netlink exchange threw one of the exceptions caught above.
return false;
}
获取当前最新的发包数+失败率+收包数
// Number of packets sent since the last received packet. pollSocketsInfo() adds the sent count
// of each poll in which nothing was received, and resets this to 0 as soon as a poll sees a
// received packet.
private int mSentSinceLastRecv;
// The latest fail rate calculated by the latest tcp info, as an integer percentage:
// (retransmitted + lost) * 100 / sent, or 0 when nothing was sent in that poll.
private int mLatestPacketFailPercentage;
// Number of packets received in the latest polling cycle.
private int mLatestReceivedCount;
上面获取收发包情况的逻辑会在如下消息处理中周期性地执行,间隔由getTcpPollingInterval()决定(默认为20s);如果轮询成功并且怀疑出现data stall,则不再继续轮询,而是切换到EvaluatingState重新校验网络
/**
 * Schedules the next {@code EVENT_POLL_TCPINFO} message after the TCP polling interval.
 * Nothing is scheduled when validation is not required.
 */
@VisibleForTesting
void sendTcpPollingEvent() {
    if (!isValidationRequired()) {
        return;
    }
    sendMessageDelayed(EVENT_POLL_TCPINFO, getTcpPollingInterval());
}
case EVENT_POLL_TCPINFO:
    final TcpSocketTracker tst = getTcpSocketTracker();
    if (tst == null) break;
    // Keep polling unless the socket info was retrieved successfully and the result is
    // suspected to be a data stall.
    if (!tst.pollSocketsInfo() || !evaluateDataStall()) {
        sendTcpPollingEvent();
        break;
    }
    // Poll succeeded and a stall is suspected: move on to re-evaluate the network.
    transitionTo(mEvaluatingState);
    break;
2.2 tcp health 判定
/**
 * Checks whether a data stall is currently suspected and, if so, writes a validation log entry.
 *
 * @return {@code true} when {@link #isDataStall()} reports a suspected stall.
 */
boolean evaluateDataStall() {
    final boolean stallSuspected = isDataStall();
    if (stallSuspected) {
        validationLog("Suspecting data stall, reevaluate");
    }
    return stallSuspected;
}
/**
 * Decides whether the network should be suspected of a data stall.
 *
 * <p>The TCP signal is consulted first (when enabled and a tracker is available): any packet
 * received in the latest polling cycle means "no stall", otherwise the tracker's own
 * suspicion check decides. The DNS signal is consulted only when the TCP signal produced no
 * verdict. Whenever a stall is suspected, the detection type is recorded in
 * {@code mDataStallTypeToCollect} and a {@link DataStallReportParcelable} is sent through
 * {@code notifyDataStallSuspected}.
 *
 * @return {@code true} if a data stall is suspected; {@code false} otherwise, including when
 *         validation is not required or when a metered network was probed too recently.
 */
@VisibleForTesting
protected boolean isDataStall() {
    if (!isValidationRequired()) {
        return false;
    }

    // null means "no verdict yet"; a non-null value from the TCP check suppresses the DNS check.
    Boolean result = null;
    final StringJoiner msg = (DBG || VDBG_STALL) ? new StringJoiner(", ") : null;
    // Reevaluation will generate traffic. Thus, set a minimal reevaluation timer to limit the
    // possible traffic cost in metered network.
    if (!mNetworkCapabilities.hasCapability(NET_CAPABILITY_NOT_METERED)
            && (SystemClock.elapsedRealtime() - getLastProbeTime()
            < mDataStallMinEvaluateTime)) {
        return false;
    }

    // Check TCP signal. Suspect it may be a data stall if :
    // 1. TCP connection fail rate(lost+retrans) is higher than threshold.
    // 2. Accumulate enough packets count.
    final TcpSocketTracker tst = getTcpSocketTracker();
    if (dataStallEvaluateTypeEnabled(DATA_STALL_EVALUATION_TYPE_TCP) && tst != null) {
        if (tst.getLatestReceivedCount() > 0) {
            // Something was received in the latest polling cycle, so this is not a stall.
            result = false;
        } else if (tst.isDataStallSuspected()) {
            result = true;
            mDataStallTypeToCollect = DATA_STALL_EVALUATION_TYPE_TCP;

            final DataStallReportParcelable p = new DataStallReportParcelable();
            p.detectionMethod = DETECTION_METHOD_TCP_METRICS;
            p.timestampMillis = SystemClock.elapsedRealtime();
            p.tcpPacketFailRate = tst.getLatestPacketFailPercentage();
            p.tcpMetricsCollectionPeriodMillis = getTcpPollingInterval();
            notifyDataStallSuspected(p);
        }
        if (DBG || VDBG_STALL) {
            msg.add("tcp packets received=" + tst.getLatestReceivedCount())
                    .add("latest tcp fail rate=" + tst.getLatestPacketFailPercentage());
        }
    }

    // Check dns signal. Suspect it may be a data stall if both :
    // 1. The number of consecutive DNS query timeouts >= mConsecutiveDnsTimeoutThreshold.
    // 2. Those consecutive DNS queries happened in the last mValidDataStallDnsTimeThreshold ms.
    final DnsStallDetector dsd = getDnsStallDetector();
    if ((result == null) && (dsd != null)
            && dataStallEvaluateTypeEnabled(DATA_STALL_EVALUATION_TYPE_DNS)) {
        if (dsd.isDataStallSuspected(mConsecutiveDnsTimeoutThreshold,
                mDataStallValidDnsTimeThreshold)) {
            result = true;
            mDataStallTypeToCollect = DATA_STALL_EVALUATION_TYPE_DNS;
            logNetworkEvent(NetworkEvent.NETWORK_CONSECUTIVE_DNS_TIMEOUT_FOUND);

            final DataStallReportParcelable p = new DataStallReportParcelable();
            p.detectionMethod = DETECTION_METHOD_DNS_EVENTS;
            p.timestampMillis = SystemClock.elapsedRealtime();
            // Read the count from the same detector instance that was null-checked and
            // queried above, rather than from the mDnsStallDetector field directly, so the
            // report always matches the detector that made the decision.
            p.dnsConsecutiveTimeouts = dsd.getConsecutiveTimeoutCount();
            notifyDataStallSuspected(p);
        }
        if (DBG || VDBG_STALL) {
            msg.add("consecutive dns timeout count=" + dsd.getConsecutiveTimeoutCount());
        }
    }

    // log only data stall suspected.
    if ((DBG && Boolean.TRUE.equals(result)) || VDBG_STALL) {
        log("isDataStall: result=" + result + ", " + msg);
    }

    // No verdict from either signal counts as "no stall".
    return Boolean.TRUE.equals(result);
}
首先看最近一个轮询周期内TCP是否收到过包,若收到则认为网络正常;否则再判断TCP失败率(丢包+重传)是否高于阈值(默认80%),并且累计的包数量足够,满足时才怀疑断网
/**
 * Default TCP packet fail percentage at which a data stall is suspected.
 *
 * Calculated by ((# of packets lost)+(# of packets retrans))/(# of packets sent)*100. Ideally,
 * a stalled network would show 100%. However, packets that are still in flight may not yet
 * be counted as either lost or retransmitted, which makes the observed percentage lower.
 */
public static final int DEFAULT_TCP_PACKETS_FAIL_PERCENTAGE = 80;
如果TCP信号没有给出结论,再接着判定DNS的情况:若30min内连续5次(均为默认值)DNS查询超时,则怀疑断网
// Default configuration values for data stall detection.
// NOTE(review): presumably these back mConsecutiveDnsTimeoutThreshold and
// mDataStallValidDnsTimeThreshold used by isDataStall() - the wiring is not visible here.
// Default number of consecutive DNS query timeouts needed to suspect a data stall.
public static final int DEFAULT_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD = 5;
// Default time window in milliseconds (30 minutes) in which those consecutive DNS timeouts
// must have happened.
public static final int DEFAULT_DATA_STALL_VALID_DNS_TIME_THRESHOLD_MS = 30 * 60 * 1000;
3.后续处理
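NetworkMonitor怀疑当前网络断网后,会切换到EvaluatingState重新进行网络校验,并把校验结果通知给CS(ConnectivityService)
另外,在怀疑断网的当下,NetworkMonitor就会通过notifyDataStallSuspected回调通知CS,CS再通过onDataStallSuspected告知与该网络匹配且具备权限的各个回调(应用):该网络疑似断网了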
/**
 * Builds a {@link DataStallReport} for the given network and delivers it to every matching,
 * permissioned connectivity diagnostics callback.
 *
 * @param nai the network the stall was suspected on
 * @param timestampMillis timestamp carried in the report
 * @param detectionMethod detection method carried in the report
 * @param extras additional data carried in the report
 */
private void handleDataStallSuspected(
        @NonNull NetworkAgentInfo nai, long timestampMillis, int detectionMethod,
        @NonNull PersistableBundle extras) {
    // The report carries the capabilities returned by getNetworkCapabilitiesWithoutUids.
    final NetworkCapabilities capsWithoutUids =
            getNetworkCapabilitiesWithoutUids(nai.networkCapabilities);
    final DataStallReport stallReport = new DataStallReport(
            nai.network, timestampMillis, detectionMethod, nai.linkProperties,
            capsWithoutUids, extras);

    for (final IConnectivityDiagnosticsCallback callback
            : getMatchingPermissionedCallbacks(nai)) {
        try {
            callback.onDataStallSuspected(stallReport);
        } catch (RemoteException ex) {
            // A failing callback is only logged; the remaining callbacks are still notified.
            loge("Error invoking onDataStallSuspected", ex);
        }
    }
}
4.总结
网络校验成功后,有个叫做data stall的检测机制来持续检测网络可达性。判定标准为:最近一个轮询周期内TCP没有收到任何包,且包失败率(丢包+重传)高于80%;或者在TCP没有给出结论时,30min内DNS连续超时5次。满足其一即怀疑断网,此时会通报给ConnectivityService,并重新进行网络校验。