Pipe核心分析及消息体结构
Pipe是什么
Pipe是SimpleMessenger中的一个复杂的组件。每个Pipe实例都拥有工作在socket上的两个线程,分别负责socket连接上数据的读取和写入;除此之外,Pipe还会将socket上的错误信息汇报给SimpleMessenger,并保持在一个稳定状态,持续不断地为SimpleMessenger提供数据服务。Pipe的不同状态:
STATE_ACCEPTING, STATE_CONNECTING, STATE_OPEN, STATE_STANDBY, STATE_CLOSED, STATE_CLOSING, STATE_WAIT
Pipe::reader()从socket中读取消息:
主要处理逻辑过程:
- 判断pipe的当前状态,如果处于STATE_ACCEPTING,则执行accept(),接收连接。
- 判断Pipe的当前状态,Pipe的状态不是STATE_CONNECTING且也不是STATE_CLOSED,进入主循环体。
- 从连接的socket中读取消息的tag,tag的取值有多种对应着不同的消息类型。
- 根据tag类型,做相应的处理操作,以tag == CEPH_MSGR_TAG_MSG为例。
- 调用read_message()读取消息(message)
- 根据message的类型判断是快速处理(fast_dispatch)还是先放到in_q队列中。
- 执行主体循环逻辑,如果Pipe的状态不再满足循环条件,则跳出主循环,结束reader。
Pipe::reader的函数调用:
Pipe::reader() >Pipe::accept() >Pipe::tcp_read()//读取tag >Pipe::read_message() //读取消息 >Pipe::tcp_read()->Pipe::tcp_read_nonblocking() or >Pipe::tcp_read_nonblocking() >Pipe::buffered_read() >Pipe::do_recv() >recv() //syscall
Pipe::reader部分源代码
// Body of the Pipe reader loop (excerpt: handling of some tag types is
// elided below).
//
// With pipe_lock held it optionally runs accept(), then loops while the
// state is neither STATE_CLOSED nor STATE_CONNECTING: read a one-byte tag
// from the socket with the lock dropped, re-take the lock and act on the
// tag. A CEPH_MSGR_TAG_CLOSE tag breaks out of the loop. On exit the reader
// marks itself as no longer running and calls unlock_maybe_reap().
void Pipe::reader()
{
pipe_lock.Lock();
// 1. Check the current pipe state; if it is STATE_ACCEPTING, run accept()
//    to accept the connection.
if (state == STATE_ACCEPTING) {
accept();
assert(pipe_lock.is_locked());
}
// 2. Check the current pipe state; enter the main loop while it is neither
//    STATE_CONNECTING nor STATE_CLOSED.
// loop.
while (state != STATE_CLOSED &&
state != STATE_CONNECTING) {
assert(pipe_lock.is_locked());
// sleep if (re)connecting
if (state == STATE_STANDBY) {
ldout(msgr->cct,20) << "reader sleeping during reconnect|standby" << dendl;
cond.Wait(pipe_lock);
continue;
}
// get a reference to the AuthSessionHandler while we have the pipe_lock
ceph::shared_ptr<AuthSessionHandler> auth_handler = session_security;
// the lock is dropped for the socket read; every branch below re-takes it
// before touching pipe state or looping again.
pipe_lock.Unlock();
// 3. Read the message tag from the connected socket; the tag can take
//    several values, each corresponding to a different message type.
char tag = -1;
ldout(msgr->cct,20) << "reader reading tag..." << dendl;
if (tcp_read((char*)&tag, 1) < 0) {
pipe_lock.Lock();
ldout(msgr->cct,2) << "reader couldn't read tag, " << cpp_strerror(errno) << dendl;
fault(true);
continue;
}
// 4. Dispatch on the tag type; tag == CEPH_MSGR_TAG_MSG is used as the
//    example here.
...
if(tag == "..."){ // handling of the other tag types is omitted here.
}
else if (tag == CEPH_MSGR_TAG_MSG) {
ldout(msgr->cct,20) << "reader got MSG" << dendl;
Message *m = 0;
// 5. Call read_message() to read the message.
int r = read_message(&m, auth_handler.get());
pipe_lock.Lock();
// no message was produced: fault only if read_message reported an error.
if (!m) {
if (r < 0)
fault(true);
continue;
}
// the state changed while the lock was dropped: release the dispatch
// throttle reservation and drop the message.
if (state == STATE_CLOSED ||
state == STATE_CONNECTING) {
msgr->dispatch_throttle_release(m->get_dispatch_throttle_size());
m->put();
continue;
}
// ensure messages are received safely and reliably
// check received seq#. if it is old, drop the message.
// note that incoming messages may skip ahead. this is convenient for the client
// side queueing because messages can't be renumbered, but the (kernel) client will
// occasionally pull a message out of the sent queue to send elsewhere. in that case
// it doesn't matter if we "got" it or not.
if (m->get_seq() <= in_seq) {
ldout(msgr->cct,0) << "reader got old message "
<< m->get_seq() << " <= " << in_seq << " " << m << " " << *m
<< ", discarding" << dendl;
msgr->dispatch_throttle_release(m->get_dispatch_throttle_size());
m->put();
if (connection_state->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
msgr->cct->_conf->ms_die_on_old_message)
assert(0 == "old msgs despite reconnect_seq feature");
continue;
}
// a gap in sequence numbers is logged, and is fatal only when
// ms_die_on_skipped_message is set.
if (m->get_seq() > in_seq + 1) {
ldout(msgr->cct,0) << "reader missed message? skipped from seq "
<< in_seq << " to " << m->get_seq() << dendl;
if (msgr->cct->_conf->ms_die_on_skipped_message)
assert(0 == "skipped incoming seq");
}
m->set_connection(connection_state.get());
// note last received message.
in_seq = m->get_seq();
cond.Signal(); // wake up writer, to ack this
ldout(msgr->cct,10) << "reader got message "
<< m->get_seq() << " " << m << " " << *m
<< dendl;
in_q->fast_preprocess(m);
// when a delay thread exists the message is always queued on it; 'release'
// is only given a value when the random draw falls under
// ms_inject_delay_probability.
if (delay_thread) {
utime_t release;
if (rand() % 10000 < msgr->cct->_conf->ms_inject_delay_probability * 10000.0) {
release = m->get_recv_stamp();
release += msgr->cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
lsubdout(msgr->cct, ms, 1) << "queue_received will delay until " << release << " on " << m << " " << *m << dendl;
}
delay_thread->queue(release, m);
} else {
// 6. Depending on the message, either dispatch it immediately
//    (fast_dispatch) or put it on the in_q queue first.
if (in_q->can_fast_dispatch(m)) {
// fast_dispatch runs with pipe_lock dropped; reader_dispatching is set
// for the duration of the call.
reader_dispatching = true;
pipe_lock.Unlock();
in_q->fast_dispatch(m);
pipe_lock.Lock();
reader_dispatching = false;
if (state == STATE_CLOSED ||
notify_on_dispatch_done) { // there might be somebody waiting
notify_on_dispatch_done = false;
cond.Signal();
}
} else {
in_q->enqueue(m, m->get_priority(), conn_id);
}
}
}
else if (tag == CEPH_MSGR_TAG_CLOSE) {
ldout(msgr->cct,20) << "reader got CLOSE" << dendl;
pipe_lock.Lock();
// a CLOSE received while already STATE_CLOSING completes the close;
// otherwise the pipe moves to STATE_CLOSING.
if (state == STATE_CLOSING) {
state = STATE_CLOSED;
state_closed.set(1);
} else {
state = STATE_CLOSING;
}
cond.Signal();
break;
}
else {
ldout(msgr->cct,0) << "reader bad tag " << (int)tag << dendl;
pipe_lock.Lock();
fault(true);
}
}
// 7. Keep running the main loop; once the pipe state no longer satisfies
//    the loop condition, leave the loop and finish the reader.
// reap?
reader_running = false;
reader_needs_join = true;
unlock_maybe_reap();
ldout(msgr->cct,10) << "reader done" << dendl;
}
int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
{
int ret = -1;
// envelope
//ldout(msgr->cct,10) << "receiver.read_message from sd " << sd << dendl;
ceph_msg_header header;
ceph_msg_footer footer;
__u32 header_crc = 0;
//接收消息的头部
if (connection_state->has_feature(CEPH_FEATURE_NOSRCADDR)) {
¦ if (tcp_read((char*)&header, sizeof(header)) < 0)
¦ ¦ return -1;
¦ if (msgr->crcflags & MSG_CRC_HEADER) {
¦ ¦ header_crc = ceph_crc32c(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc));
¦ }
} else {
¦ ceph_msg_header_old oldheader;
¦ if (tcp_read((char*)&oldheader, sizeof(oldheader)) < 0)
¦ ¦ return -1;
¦ // this is fugly
¦ memcpy(&header, &oldheader, sizeof(header));
¦ header.src = oldheader.src.name;
¦ header.reserved = oldheader.reserved;
¦ if (msgr->crcflags & MSG_CRC_HEADER) {
¦ ¦ header.crc = oldheader.crc;
¦ ¦ header_crc = ceph_crc32c(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc));
¦ }
}
ldout(msgr->cct,20) << "reader got envelope type=" << header.type
¦ ¦ ¦ ¦ ¦<< " src " << entity_name_t(header.src)
¦ ¦ ¦ ¦ ¦<< " front=" << header.front_len
¦ ¦<< " data=" << header.data_len
¦ ¦<< " off " << header.data_off
¦ ¦ ¦ ¦ ¦<< dendl;
//验证头部的crc
// verify header crc
if ((msgr->crcflags & MSG_CRC_HEADER) && header_crc != header.crc) {
¦ ldout(msgr->cct,0) << "reader got bad header crc " << header_crc << " != " << header.crc << dendl;
¦ return -1;
}
bufferlist front, middle, data;
int front_len, middle_len;
unsigned data_len, data_off;
int aborted;
Message *message;
utime_t recv_stamp = ceph_clock_now(msgr->cct);
//执行simpleMessenger的throttler的策略
if (policy.throttler_messages) {
¦ ldout(msgr->cct,10) << "reader wants " << 1 << " message from policy throttler "
<< policy.throttler_messages->get_current() << "/"
<< policy.throttler_messages->get_max() << dendl;
¦ policy.throttler_messages->get();
}
uint64_t message_size = header.front_len + header.middle_len + header.data_len;
if (message_size) {
¦ if (policy.throttler_bytes) {
¦ ¦ ldout(msgr->cct,10) << "reader wants " << message_size << " bytes from policy throttler "
¦ ¦ ¦ ¦<< policy.throttler_bytes->get_current() << "/"
¦ ¦ ¦ ¦<< policy.throttler_bytes->get_max() << dendl;
¦ ¦ policy.throttler_bytes->get(message_size);
¦ }
¦ // throttle total bytes waiting for dispatch. do this _after_ the
¦ // policy throttle, as this one does not deadlock (unless dispatch
¦ // blocks indefinitely, which it shouldn't). in contrast, the
¦ // policy throttle carries for the lifetime of the message.
¦ ldout(msgr->cct,10) << "reader wants " << message_size << " from dispatch throttler "
¦ ¦ ¦<< msgr->dispatch_throttler.get_current() << "/"
¦ ¦ ¦<< msgr->dispatch_throttler.get_max() << dendl;
¦ msgr->dispatch_throttler.get(message_size);
}
utime_t throttle_stamp = ceph_clock_now(msgr->cct);
//接收消息的front部分
// read front
front_len = header.front_len;
if (front_len) {
¦ bufferptr bp = buffer::create(front_len);
¦ if (tcp_read(bp.c_str(), front_len) < 0)
¦ ¦ goto out_dethrottle;
¦ front.push_back(bp);
¦ ldout(msgr->cct,20) << "reader got front " << front.length() << dendl;
}
//读取消息的middle部分
// read middle
middle_len = header.middle_len;
if (middle_len) {
¦ bufferptr bp = buffer::create(middle_len);
¦ if (tcp_read(bp.c_str(), middle_len) < 0)
¦ ¦ goto out_dethrottle;
¦ middle.push_back(bp);
¦ ldout(msgr->cct,20) << "reader got middle " << middle.length() << dendl;
}
//接收消息的数据体(data)
// read data
data_len = le32_to_cpu(header.data_len);
data_off = le32_to_cpu(header.data_off);
if (data_len) {
¦ unsigned offset = 0;
¦ unsigned left = data_len;
¦ bufferlist newbuf, rxbuf;
¦ bufferlist::iterator blp;
¦ int rxbuf_version = 0;
¦ while (left > 0) {
¦ ¦ // wait for data
¦ ¦ if (tcp_read_wait() < 0)
goto out_dethrottle;
¦ ¦ // get a buffer
¦ ¦ connection_state->lock.Lock();
¦ ¦ map<ceph_tid_t,pair<bufferlist,int> >::iterator p = connection_state->rx_buffers.find(header.tid);
¦ ¦ if (p != connection_state->rx_buffers.end()) {
if (rxbuf.length() == 0 || p->second.second != rxbuf_version) {
¦ ldout(msgr->cct,10) << "reader seleting rx buffer v " << p->second.second
¦ ¦<< " at offset " << offset
¦ ¦<< " len " << p->second.first.length() << dendl;
¦ rxbuf = p->second.first;
¦ rxbuf_version = p->second.second;
¦ // make sure it's big enough
¦ if (rxbuf.length() < data_len)
¦ ¦ rxbuf.push_back(buffer::create(data_len - rxbuf.length()));
¦ blp = p->second.first.begin();
¦ blp.advance(offset);
}
¦ ¦ } else {
if (!newbuf.length()) {
¦ ldout(msgr->cct,20) << "reader allocating new rx buffer at offset " << offset << dendl;
¦ alloc_aligned_buffer(newbuf, data_len, data_off);
¦ blp = newbuf.begin();
¦ blp.advance(offset);
}
¦ ¦ }
¦ ¦ bufferptr bp = blp.get_current_ptr();
¦ ¦ int read = MIN(bp.length(), left);
¦ ¦ ldout(msgr->cct,20) << "reader reading nonblocking into " << (void*)bp.c_str() << " len " << bp.length() << dendl;
¦ ¦ int got = tcp_read_nonblocking(bp.c_str(), read);
¦ ¦ ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl;
¦ ¦ connection_state->lock.Unlock();
¦ ¦ if (got < 0)
goto out_dethrottle;
¦ ¦ if (got > 0) {
blp.advance(got);
data.append(bp, 0, got);
offset += got;
left -= got;
¦ ¦ } // else we got a signal or something; just loop.
¦ }
}
//接收消息的footer部分
// footer
if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) {
¦ if (tcp_read((char*)&footer, sizeof(footer)) < 0)
¦ ¦ goto out_dethrottle;
} else {
¦ ceph_msg_footer_old old_footer;
¦ if (tcp_read((char*)&old_footer, sizeof(old_footer)) < 0)
¦ ¦ goto out_dethrottle;
¦ footer.front_crc = old_footer.front_crc;
¦ footer.middle_crc = old_footer.middle_crc;
¦ footer.data_crc = old_footer.data_crc;
¦ footer.sig = 0;
¦ footer.flags = old_footer.flags;
}
aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
ldout(msgr->cct,10) << "aborted = " << aborted << dendl;
if (aborted) {
¦ ldout(msgr->cct,0) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
¦ ¦ << " byte message.. ABORTED" << dendl;
¦ ret = 0;
¦ goto out_dethrottle;
}
ldout(msgr->cct,20) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
¦ ¦<< " byte message" << dendl;
message = decode_message(msgr->cct, msgr->crcflags, header, footer, front, middle, data);
if (!message) {
¦ ret = -EINVAL;
¦ goto out_dethrottle;
}
//
// Check the signature if one should be present. A zero return indicates success. PLR
//
//验证消息的签名
if (auth_handler == NULL) {
¦ ldout(msgr->cct, 10) << "No session security set" << dendl;
} else {
¦ if (auth_handler->check_message_signature(message)) {
¦ ¦ ldout(msgr->cct, 0) << "Signature check failed" << dendl;
¦ ¦ ret = -EINVAL;
¦ ¦ goto out_dethrottle;
¦ }
}
message->set_byte_throttler(policy.throttler_bytes);
message->set_message_throttler(policy.throttler_messages);
// store reservation size in message, so we don't get confused
// by messages entering the dispatch queue through other paths.
message->set_dispatch_throttle_size(message_size);
message->set_recv_stamp(recv_stamp);
message->set_throttle_stamp(throttle_stamp);
message->set_recv_complete_stamp(ceph_clock_now(msgr->cct));
*pm = message;
return 0;
out_dethrottle:
// release bytes reserved from the throttlers on failure
if (policy.throttler_messages) {
¦ ldout(msgr->cct,10) << "reader releasing " << 1 << " message to policy throttler "
<< policy.throttler_messages->get_current() << "/"
<< policy.throttler_messages->get_max() << dendl;
¦ policy.throttler_messages->put();
}
if (message_size) {
¦ if (policy.throttler_bytes) {
¦ ¦ ldout(msgr->cct,10) << "reader releasing " << message_size << " bytes to policy throttler "
<< policy.throttler_bytes->get_current() << "/"
<< policy.throttler_bytes->get_max() << dendl;
¦ ¦ policy.throttler_bytes->put(message_size);
¦ }
¦ msgr->dispatch_throttle_release(message_size);
}
return ret;
}
`
6. 总结:
reader函数是Pipe reader thread的主要逻辑,负责从连接的socket上接收消息。