Sender.completeBatch
if(error != Errors.NONE &&canRetry(batch, error)){// retry
log.warn("Got error produce response with correlation id {} on topic-partition {}, retrying ({} attempts left). Error: {}",
correlationId,
batch.topicPartition,this.retries - batch.attempts -1,
error);//重新把发送失败等着批次 加入到队列里面。this.accumulator.reenqueue(batch, now);this.sensors.recordRetries(batch.topicPartition.topic(), batch.recordCount);}else{
2 超时
Sender.run.this.accumulator.abortExpiredBatches
public List<RecordBatch>abortExpiredBatches(int requestTimeout,long now){
List<RecordBatch> expiredBatches =newArrayList<>();int count =0;//for(Map.Entry<TopicPartition, Deque<RecordBatch>> entry :this.batches.entrySet()){//获取到每个分区的队列 -》 队列里面对应的批次
Deque<RecordBatch> dq = entry.getValue();
TopicPartition tp = entry.getKey();// We only check if the batch should be expired if the partition does not have a batch in flight.// This is to prevent later batches from being expired while an earlier batch is still in progress.// Note that `muted` is only ever populated if `max.in.flight.request.per.connection=1` so this protection// is only active in this case. Otherwise the expiration order is not guaranteed.if(!muted.contains(tp)){synchronized(dq){// iterate over the batches and expire them if they have been in the accumulator for more than requestTimeOut
RecordBatch lastBatch = dq.peekLast();//迭代的看每个分区里面的每个批次
Iterator<RecordBatch> batchIterator = dq.iterator();while(batchIterator.hasNext()){
RecordBatch batch = batchIterator.next();boolean isFull = batch != lastBatch || batch.records.isFull();// check if the batch is expired//TODO 判断一下是否超时if(batch.maybeExpire(requestTimeout, retryBackoffMs, now,this.lingerMs, isFull)){//增加到超时的数据结构里面
expiredBatches.add(batch);
count++;//从数据结构里面移除
batchIterator.remove();//释放资源deallocate(batch);}else{// Stop at the first batch that has not expired.break;}}}}}if(!expiredBatches.isEmpty())
log.trace("Expired {} batches in accumulator", count);return expiredBatches;}
batch.maybeExpire
publicbooleanmaybeExpire(int requestTimeoutMs,long retryBackoffMs,long now,long lingerMs,boolean isFull){boolean expire =false;
String errorMessage = null;/**
* requestTimeoutMs:代表的是请求发送的超时的时间。默认值是30.
* now:当前时间
* lastAppendTime:批次的创建的时间(上一次重试的时间)
* now - this.lastAppendTime 大于30秒,说明批次超时了 还没发送出去。
*/if(!this.inRetry()&& isFull && requestTimeoutMs <(now -this.lastAppendTime)){
expire =true;//记录异常信息
errorMessage =(now -this.lastAppendTime)+" ms has passed since last append";/**
* lingerMs: 100ms,无论如何都要把消息发送出去的时间
*
* createdMs:批次创建的时间
*
* 已经大于30秒了。 说明也是超时了。
*
*/}elseif(!this.inRetry()&& requestTimeoutMs <(now -(this.createdMs + lingerMs))){
expire =true;
errorMessage =(now -(this.createdMs + lingerMs))+" ms has passed since batch creation plus linger time";/**
* 针对重试
* lastAttemptMs: 上一次重试的时间(批次创建的时间)
* retryBackoffMs: 重试的时间间隔
* 说明也是超时了。
*/}elseif(this.inRetry()&& requestTimeoutMs <(now -(this.lastAttemptMs + retryBackoffMs))){
expire =true;
errorMessage =(now -(this.lastAttemptMs + retryBackoffMs))+" ms has passed since last attempt plus backoff time";}if(expire){this.records.close();//调用done方法//方法里面传过去了一个TimeoutException的异常。(超时了)//TODO 处理超时的批次this.done(-1L, Record.NO_TIMESTAMP,newTimeoutException("Expiring "+ recordCount +" record(s) for "+ topicPartition +" due to "+ errorMessage));}return expire;}
3 长时间没有响应的消息处理
NetworkClient.poll.handleTimedOutRequests
privatevoidhandleTimedOutRequests(List<ClientResponse> responses,long now){//获取到请求超时的主机。
List<String> nodeIds =this.inFlightRequests.getNodesWithTimedOutRequests(now,this.requestTimeoutMs);for(String nodeId : nodeIds){// close connection to the node//关闭请求超时的主机的连接this.selector.close(nodeId);
log.debug("Disconnecting from node {} due to request timeout.", nodeId);//我们猜应该是会去修改 连接的状态processDisconnection(responses, nodeId, now);}// we disconnected, so we should probably refresh our metadataif(nodeIds.size()>0)
metadataUpdater.requestUpdate();}
processDisconnection
privatevoidprocessDisconnection(List<ClientResponse> responses, String nodeId,long now){//修改连接状态
connectionStates.disconnected(nodeId, now);//for(ClientRequest request :this.inFlightRequests.clearAll(nodeId)){
log.trace("Cancelled request {} due to node {} being disconnected", request, nodeId);if(!metadataUpdater.maybeHandleDisconnection(request))//对这些请求进行处理//大家会看到一个比较有意思的事//自己封装了一个响应。这个响应里面没有服务端响应消息(服务端没给响应)//失去连接的状态表标识为true
responses.add(newClientResponse(request, now,true, null));}}
disconnected
publicvoiddisconnected(String id,long now){
NodeConnectionState nodeState =nodeState(id);//修缓存的对应主机的连接转态:DISCONNECTED//sender -> 检查网络是否可以举报发送消息的条件 -> 是否可以尝试建立网络连接。//如果主机的状态是:DISCONNECTED,可以尝试初始化连接。//最后调用networkclient的poll方法(Selector 去完成的最后的网络连接)
nodeState.state = ConnectionState.DISCONNECTED;
nodeState.lastConnectAttemptMs = now;}