Pipe核心分析及消息体结构
Pipe是什么
Pipe是SimpleMessenger中的一个复杂的组件,每个Pipe实例都拥有工作在socket上的两个线程,分别负责socket连接上的读取和写入的数据传输,除此之外还会将socket上的错误信息汇报给SimpleMessenger,同时处于一个稳定状态,来持续不断的为SimpleMessenger提供数据服务。Pipe的不同状态:
STATE_ACCEPTING, STATE_CONNECTING, STATE_OPEN, STATE_STANDBY, STATE_CLOSED, STATE_CLOSING, STATE_WAIT
Pipe::writer()负责将消息通过socket发送出去。
主要处理逻辑过程:
- 只要Pipe的状态不为STATE_CLOSED,Pipe就一直处于writer的主循环中(异常除外)。
- 如果当前Pipe实例的状态为STATE_STANDBY,且不属于服务端(!policy.server),并且有消息要发送,则将Pipe置为STATE_CONNECTING。
- 如果Pipe实例的状态为STATE_CONNECTING,则调用connect()连接服务端,(注:只有客户端才允许调用connect())。
- 如果Pipe的状态为STATE_CLOSING将要关闭,则向对应端发送tag(CEPH_MSGR_TAG_CLOSE)。
- 如果Pipe实例的状态不是STATE_CONNECTING、STATE_WAIT、STATE_STANDBY其中的任意一个,且有消息需要发送,或者存在未ack的消息,则根据具体情况发送消息。
- 假设有正常的消息需要发送,则从out_q中取出一个消息。
- 将message中的header和footer取出来,并将front、middle、data序列化到bufferlist中。
- 调用write_message()将header、footer、序列化后的数据进一步处理。
- 如果Pipe实例没有被关闭,则继续主循环,否则结束writer。
Pipe::writer的函数调用:
```
Pipe::writer()
>Pipe::connect()
>::write() //关闭pipe时调用,发送tag(CEPH_MSGR_TAG_CLOSE)
>Pipe::write_keepalive2() //需要keepalive时调用,或ack keepalive时调用
>Pipe::write_ack() //ack message时调用
>Pipe::_get_next_outgoing() //从out_q中取出需要发送的消息
>Pipe::write_message() //发送消息
>Pipe::do_sendmsg() //发送tag、header、data(front、middle、data)、footer
>::sendmsg() //syscall
```
5. Pipe::writer部分源代码
void Pipe::writer()
{
pipe_lock.Lock();
//1.只要Pipe的状态不为STATE_CLOSED,Pipe就一直处于writer的主循环中(异常除外)。
while (state != STATE_CLOSED) {// && state != STATE_WAIT) {
¦ ldout(msgr->cct,10) << "writer: state = " << get_state_name()
<< " policy.server=" << policy.server << dendl;
//2.如果当前Pipe实例的状态为STATE_STANDBY,且不属于服务,有消息要发送,则将Pipe置为STATE_CONNECTING。
¦ // standby?
¦ if (is_queued() && state == STATE_STANDBY && !policy.server)
¦ ¦ state = STATE_CONNECTING;
//3.如果Pipe实例的状态为STATE_CONNECTING,则调用connect()连接服务端,(注:只有客户端才允许调用connect())。
¦ // connect?
¦ if (state == STATE_CONNECTING) {
¦ ¦ assert(!policy.server);
¦ ¦ connect();
¦ ¦ continue;
¦ }
¦ //4.如果Pipe的状态为STATE_CLOSING将要关闭,则向对应端发送tag(CEPH_MSGR_TAG_CLOSE)。
¦ if (state == STATE_CLOSING) {
¦ ¦ // write close tag
¦ ¦ ldout(msgr->cct,20) << "writer writing CLOSE tag" << dendl;
¦ ¦ char tag = CEPH_MSGR_TAG_CLOSE;
¦ ¦ state = STATE_CLOSED;
¦ ¦ state_closed.set(1);
¦ ¦ pipe_lock.Unlock();
¦ ¦ if (sd) {
int r = ::write(sd, &tag, 1);
// we can ignore r, actually; we don't care if this succeeds.
r++; r = 0; // placate gcc
¦ ¦ }
¦ ¦ pipe_lock.Lock();
¦ ¦ continue;
¦ }
//5.如果Pipe实例的状态不是STATE_CONNECTING STATE_WAIT STATE_STANDBY其中的任意一个且有消息需要发送,或者存在为ack的消息则根据具体情况发送消息。
¦ if (state != STATE_CONNECTING && state != STATE_WAIT && state != STATE_STANDBY &&
(is_queued() || in_seq > in_seq_acked)) {
¦ ¦ // keepalive?
¦ ¦ if (send_keepalive) {
int rc;
if (connection_state->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
¦ pipe_lock.Unlock();
¦ rc = write_keepalive2(CEPH_MSGR_TAG_KEEPALIVE2,
ceph_clock_now(msgr->cct));
} else {
¦ pipe_lock.Unlock();
¦ rc = write_keepalive();
}
pipe_lock.Lock();
if (rc < 0) {
¦ ldout(msgr->cct,2) << "writer couldn't write keepalive[2], "
<< cpp_strerror(errno) << dendl;
¦ fault();
¦ continue;
}
send_keepalive = false;
¦ ¦ }
//ack keepalive2
¦ ¦ if (send_keepalive_ack) {
utime_t t = keepalive_ack_stamp;
pipe_lock.Unlock();
int rc = write_keepalive2(CEPH_MSGR_TAG_KEEPALIVE2_ACK, t);
pipe_lock.Lock();
if (rc < 0) {
¦ ldout(msgr->cct,2) << "writer couldn't write keepalive_ack, " << cpp_strerror(errno) << dendl;
¦ fault();
¦ continue;
}
send_keepalive_ack = false;
¦ ¦ }
¦ ¦ // send ack?
¦ ¦ if (in_seq > in_seq_acked) {
uint64_t send_seq = in_seq;
pipe_lock.Unlock();
//ack message
int rc = write_ack(send_seq);
pipe_lock.Lock();
if (rc < 0) {
¦ ldout(msgr->cct,2) << "writer couldn't write ack, " << cpp_strerror(errno) << dendl;
¦ fault();
¦ continue;
}
in_seq_acked = send_seq;
¦ ¦ }
//6.假设有正常的消息需要发送,则从out_q中取出一个消息。
¦ ¦ // grab outgoing message 取出一个待发送的message
¦ ¦ Message *m = _get_next_outgoing();
//重新构建message
¦ ¦ if (m) {
m->set_seq(++out_seq);
if (!policy.lossy) {
¦ // put on sent list
¦ sent.push_back(m);
¦ m->get();
}
// associate message with Connection (for benefit of encode_payload)
m->set_connection(connection_state.get());
uint64_t features = connection_state->get_features();
if (m->empty_payload())
¦ ldout(msgr->cct,20) << "writer encoding " << m->get_seq() << " features " << features
<< " " << m << " " << *m << dendl;
else
¦ ldout(msgr->cct,20) << "writer half-reencoding " << m->get_seq() << " features " << features
<< " " << m << " " << *m << dendl;
// encode and copy out of *m
m->encode(features, msgr->crcflags);
//7.将message中的header和footer取出来,并将front、middle、data序列化到bufferlist中。
// prepare everything
ceph_msg_header& header = m->get_header();
ceph_msg_footer& footer = m->get_footer();
// Now that we have all the crcs calculated, handle the
// digital signature for the message, if the pipe has session
// security set up. Some session security options do not
// actually calculate and check the signature, but they should
// handle the calls to sign_message and check_signature. PLR
if (session_security.get() == NULL) {
¦ ldout(msgr->cct, 20) << "writer no session security" << dendl;
} else {
¦ if (session_security->sign_message(m)) {
¦ ¦ ldout(msgr->cct, 20) << "writer failed to sign seq # " << header.seq
<< "): sig = " << footer.sig << dendl;
¦ } else {
¦ ¦ ldout(msgr->cct, 20) << "writer signed seq # " << header.seq
<< "): sig = " << footer.sig << dendl;
¦ }
}
bufferlist blist = m->get_payload();
blist.append(m->get_middle());
blist.append(m->get_data());
¦ ¦ ¦ pipe_lock.Unlock();
¦ ¦ ¦ ldout(msgr->cct,20) << "writer sending " << m->get_seq() << " " << m << dendl;
//8.调用write_message()将header、footer、序列化后的数据进一步处理。
int rc = write_message(header, footer, blist);
pipe_lock.Lock();
if (rc < 0) {
¦ ¦ ¦ ¦ ldout(msgr->cct,1) << "writer error sending " << m << ", "
¦ << cpp_strerror(errno) << dendl;
¦ fault();
¦ ¦ ¦ }
m->put();
¦ ¦ }
¦ ¦ continue;
¦ }
¦
¦ // wait
¦ ldout(msgr->cct,20) << "writer sleeping" << dendl;
¦ cond.Wait(pipe_lock);
}
//9.如果Pipe实例状态没有被关闭则,继续主循环,负责结束writer。
ldout(msgr->cct,20) << "writer finishing" << dendl;
// reap?
writer_running = false;
unlock_maybe_reap();
ldout(msgr->cct,10) << "writer done" << dendl;
}
int Pipe::write_message(ceph_msg_header& header, ceph_msg_footer& footer, bufferlist& blist)
{
int ret;
// set up msghdr and iovecs
struct msghdr msg;
memset(&msg, 0, sizeof(msg));
msg.msg_iov = msgvec;
int msglen = 0;
// send tag 填充tag
char tag = CEPH_MSGR_TAG_MSG;
msgvec[msg.msg_iovlen].iov_base = &tag;
msgvec[msg.msg_iovlen].iov_len = 1;
msglen++;
msg.msg_iovlen++;
// send envelope 填充header
ceph_msg_header_old oldheader;
if (connection_state->has_feature(CEPH_FEATURE_NOSRCADDR)) {
¦ msgvec[msg.msg_iovlen].iov_base = (char*)&header;
¦ msgvec[msg.msg_iovlen].iov_len = sizeof(header);
¦ msglen += sizeof(header);
¦ msg.msg_iovlen++;
} else {
¦ memcpy(&oldheader, &header, sizeof(header));
¦ oldheader.src.name = header.src;
¦ oldheader.src.addr = connection_state->get_peer_addr();
¦ oldheader.orig_src = oldheader.src;
¦ oldheader.reserved = header.reserved;
¦ if (msgr->crcflags & MSG_CRC_HEADER) {
oldheader.crc = ceph_crc32c(0, (unsigned char*)&oldheader,
sizeof(oldheader) - sizeof(oldheader.crc));
¦ } else {
oldheader.crc = 0;
¦ }
¦ msgvec[msg.msg_iovlen].iov_base = (char*)&oldheader;
¦ msgvec[msg.msg_iovlen].iov_len = sizeof(oldheader);
¦ msglen += sizeof(oldheader);
¦ msg.msg_iovlen++;
}
// payload (front+data) 填充front、middle、data
list<bufferptr>::const_iterator pb = blist.buffers().begin();
int b_off = 0; // carry-over buffer offset, if any
int bl_pos = 0; // blist pos
int left = blist.length();
while (left > 0) {
¦ int donow = MIN(left, (int)pb->length()-b_off);
¦ if (donow == 0) {
¦ ¦ ldout(msgr->cct,0) << "donow = " << donow << " left " << left << " pb->length " << pb->length()
¦ ¦ ¦ << " b_off " << b_off << dendl;
¦ }
¦ assert(donow > 0);
¦ ldout(msgr->cct,30) << " bl_pos " << bl_pos << " b_off " << b_off
¦ ¦ ¦<< " leftinchunk " << left
¦ ¦ ¦<< " buffer len " << pb->length()
¦ ¦ ¦<< " writing " << donow
¦ ¦ ¦<< dendl;
¦
¦ if (msg.msg_iovlen >= IOV_MAX-2) {
¦ ¦ if (do_sendmsg(&msg, msglen, true))
goto fail;
¦ ¦
¦ ¦ // and restart the iov
¦ ¦ msg.msg_iov = msgvec;
¦ ¦ msg.msg_iovlen = 0;
¦ ¦ msglen = 0;
¦ }
¦
¦ msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str()+b_off);
¦ msgvec[msg.msg_iovlen].iov_len = donow;
¦ msglen += donow;
¦ msg.msg_iovlen++;
¦
¦ left -= donow;
¦ assert(left >= 0);
¦ b_off += donow;
¦ bl_pos += donow;
¦ if (left == 0)
¦ ¦ break;
¦ while (b_off == (int)pb->length()) {
¦ ¦ ++pb;
¦ ¦ b_off = 0;
¦ }
}
assert(left == 0);
// send footer; if receiver doesn't support signatures, use the old footer format
//填充footer
ceph_msg_footer_old old_footer;
if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) {
¦ msgvec[msg.msg_iovlen].iov_base = (void*)&footer;
¦ msgvec[msg.msg_iovlen].iov_len = sizeof(footer);
¦ msglen += sizeof(footer);
¦ msg.msg_iovlen++;
} else {
¦ if (msgr->crcflags & MSG_CRC_HEADER) {
¦ ¦ old_footer.front_crc = footer.front_crc;
¦ ¦ old_footer.middle_crc = footer.middle_crc;
¦ } else {
old_footer.front_crc = old_footer.middle_crc = 0;
¦ }
¦ old_footer.data_crc = msgr->crcflags & MSG_CRC_DATA ? footer.data_crc : 0;
¦ old_footer.flags = footer.flags;
¦ msgvec[msg.msg_iovlen].iov_base = (char*)&old_footer;
¦ msgvec[msg.msg_iovlen].iov_len = sizeof(old_footer);
¦ msglen += sizeof(old_footer);
¦ msg.msg_iovlen++;
}
// send 将填充在msg中的数据发送
if (do_sendmsg(&msg, msglen))
¦ goto fail;
ret = 0;
out:
return ret;
fail:
ret = -1;
goto out;
}
int Pipe::do_sendmsg(struct msghdr *msg, int len, bool more)
{
while (len > 0) {
¦ if (0) { // sanity
¦ ¦ int l = 0;
¦ ¦ for (unsigned i=0; i<msg->msg_iovlen; i++)
l += msg->msg_iov[i].iov_len;
¦ ¦ assert(l == len);
¦ }
¦ int r = ::sendmsg(sd, msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
¦ if (r == 0)
¦ ¦ ldout(msgr->cct,10) << "do_sendmsg hmm do_sendmsg got r==0!" << dendl;
¦ if (r < 0) {
¦ ¦ ldout(msgr->cct,1) << "do_sendmsg error " << cpp_strerror(errno) << dendl;
¦ ¦ return -1;
¦ }
¦ if (state == STATE_CLOSED) {
¦ ¦ ldout(msgr->cct,10) << "do_sendmsg oh look, state == CLOSED, giving up" << dendl;
¦ ¦ errno = EINTR;
¦ ¦ return -1; // close enough
¦ }
¦ len -= r;
¦ if (len == 0) break;
¦
¦ // hrmph. trim r bytes off the front of our message. //保证数据完全发送
¦ ldout(msgr->cct,20) << "do_sendmsg short write did " << r << ", still have " << len << dendl;
¦ while (r > 0) {
¦ ¦ if (msg->msg_iov[0].iov_len <= (size_t)r) {
// lose this whole item
//ldout(msgr->cct,30) << "skipping " << msg->msg_iov[0].iov_len << ", " << (msg->msg_iovlen-1) << " v, " << r << " left" << dendl;
r -= msg->msg_iov[0].iov_len;
msg->msg_iov++;
msg->msg_iovlen--;
¦ ¦ } else {
// partial!
//ldout(msgr->cct,30) << "adjusting " << msg->msg_iov[0].iov_len << ", " << msg->msg_iovlen << " v, " << r << " left" << dendl;
msg->msg_iov[0].iov_base = (char *)msg->msg_iov[0].iov_base + r;
msg->msg_iov[0].iov_len -= r;
break;
¦ ¦ }
¦ }
}
return 0;
}
`
6. 总结
Pipe::writer的主要任务就是从out_q中取出消息,通过socket,将消息安全稳定发送出去。