src/test/msgr/perf_msgr_server.cc
server, 服务端, 建连接
MessengerServer
Messenger *Messenger::create type=async+rdma, lname=server -> Messenger *Messenger::create
new AsyncMessenger -> AsyncMessenger::AsyncMessenger
dispatch_queue
DispatchQueue(CephContext *cct, Messenger *msgr, string &name) 本地快速分发, 节流阀
lookup_or_create_singleton_object<StackSingleton>
single->ready(transport_type)
NetworkStack::create(cct, type) -> std::make_shared<RDMAStack>(c, t)
RDMAStack::RDMAStack
NetworkStack(cct, t) 构造, 线程池默认3个线程,
create_worker(cct, type, worker_id) -> NetworkStack::create_worker -> new RDMAWorker(c, worker_id) -> RDMAWorker::RDMAWorker
Worker(CephContext *c, unsigned worker_id) Stack是一个网络IO框架,封装了所有必要的基础网络接口,并负责管理工作线程。posix、dpdk甚至RDMA等不同的网络后端都需要继承Stack类来实现必要的接口。 所以这会让其他人轻松地将新的网络后端集成到 ceph 中。 否则,每个后端都需要实现整个 Messenger 逻辑,如重新连接、策略处理、会话维持...
w->center.init -> EventCenter::init
driver = new EpollDriver(cct)
driver->init(this, nevent) -> int EpollDriver::init
events = (struct epoll_event*)calloc
epfd = epoll_create(1024)
fcntl(epfd, F_SETFD, FD_CLOEXEC)
file_events.resize(nevent) 5000
pipe_cloexec(fds, 0) -> pipe2(pipefd, O_CLOEXEC | flags) 创建管道, 此处 flags=0, 两端仅设置 close-on-exec(O_CLOEXEC, 执行 exec 时自动关闭), 并非非阻塞
notify_receive_fd = fds[0] 接收端,读端
notify_send_fd = fds[1] 发送端,写端
workers.push_back(w)
Infiniband::Infiniband
device_name(cct->_conf->ms_async_rdma_device_name) 从配置中获取rdma设备名 TODO
port_num( cct->_conf->ms_async_rdma_port_num) 默认为1 端口也从配置文件中获取
verify_prereq -> void Infiniband::verify_prereq
RDMAV_HUGEPAGES_SAFE 设置安全大页
ibv_fork_init
getrlimit(RLIMIT_MEMLOCK, &limit) 获取资源限制的配置
get_num_worker 3
for
w->set_dispatcher(rdma_dispatcher)
w->set_ib(ib)
stack->start()
std::function<void ()> thread = add_thread(i) 暂不执行
w->center.set_owner()
notify_handler = new C_handle_notify(this, cct)
create_file_event(notify_receive_fd, EVENT_READABLE, notify_handler) 将之前管道的读端设置epoll监听
driver->add_event(fd, event->mask, mask)
epoll_ctl(epfd, op, fd, &ee)
event->read_cb = ctxt 设置读事件回调
w->initialize()
w->init_done()
init_cond.notify_all() 通知等待的线程,完成初始化
while (!w->done)
w->center.process_events 循环处理事件 -> int EventCenter::process_events
driver->event_wait(fired_events, &tv) -> int EpollDriver::event_wait
epoll_wait 写端写入c触发执行此处
fired_events[event_id].fd = e->data.fd
event = _get_file_event(fired_events[event_id].fd)
cb = event->read_cb 可读回调
cb->do_request(fired_events[event_id].fd) 处理事件
r = read(fd_or_id, c, sizeof(c)) 读管道对端发来的字符,如:c
cur_process.swap(external_events)
spawn_worker(i, std::move(thread)) 启动新线程,返回join控制器
workers[i]->wait_for_init() 等所有工人完成初始化
local_connection = ceph::make_ref<AsyncConnection> -> AsyncConnection::AsyncConnection
ms_connection_ready_timeout 建连超时时间
ms_connection_idle_timeout 不活跃的时间, 如果两端连接空闲超过15分钟(没有活动的读写),则销毁连接
read_handler = new C_handle_read(this) -> conn->process()
void AsyncConnection::process()
write_handler = new C_handle_write(this) -> conn->handle_write()
void AsyncConnection::handle_write
write_callback_handler = new C_handle_write_callback(this) -> AsyncConnection::handle_write_callback -> AsyncConnection::write 写的时候传递callback
wakeup_handler = new C_time_wakeup(this) -> void AsyncConnection::wakeup_from -> void AsyncConnection::process()
tick_handler = new C_tick_wakeup(this)-> void AsyncConnection::tick 计时器()
protocol->fault() 处理错误
init_local_connection
void ms_deliver_handle_fast_connect
reap_handler = new C_handle_reap(this)
void AsyncMessenger::reap_dead 收割死连接
processors.push_back(new Processor(this, stack->get_worker(i), cct))
Processor::Processor
listen_handler(new C_processor_accept(this))
void Processor::accept() 等待事件触发(客户端执行connect后触发)
listen_sockets -> while (true)
msgr->get_stack()->get_worker()
listen_socket.accept(&cli_socket, opts, &addr, w)
msgr->add_accept
msgr->set_default_policy
dummy_auth.auth_registry.refresh_config()
msgr->set_auth_server(&dummy_auth) 初始化函数,在绑定前调用
server.start()
msgr->bind(addr)
AsyncMessenger::bind
bindv -> int r = p->bind
int Processor::bind
listen_sockets.resize
conf->ms_bind_retry_count 3次重试
worker->center.submit_to lambda []()->void 匿名函数
c->in_thread()
pthread_equal(pthread_self(), owner) 本线程
C_submit_event<func> event(std::move(f), false) f=listen
void do_request -> f() -> listen -> worker->listen(listen_addr, k, opts, &listen_sockets[k]) -> int RDMAWorker::listen 由事件触发执行
ib->init() -> void Infiniband::init
new DeviceList(cct)
ibv_get_device_list 4网口
if (cct->_conf->ms_async_rdma_cm)
new Device(cct, device_list[i]) -> Device::Device
ibv_open_device
ibv_get_device_name
ibv_query_device 参考设备属性: device_attr
get_device 根据配置的设备名在设备列表中查询, 默认取第一个, 如: mlx5_0
binding_port -> void Device::binding_port
new Port(cct, ctxt, port_id) 端口ID从1开始 -> Port::Port
ibv_query_port(ctxt, port_num, &port_attr)
ibv_query_gid(ctxt, port_num, gid_idx, &gid)
ib_physical_port = device->active_port->get_port_num() 获取物理端口
new ProtectionDomain(cct, device) -> Infiniband::ProtectionDomain::ProtectionDomain -> ibv_alloc_pd(device->ctxt)
support_srq = cct->_conf->ms_async_rdma_support_srq 共享接收队列srq
rx_queue_len = device->device_attr.max_srq_wr 最终为4096
tx_queue_len = device->device_attr.max_qp_wr - 1 发送队列为beacon保留1个WR; 配置值形如 1_K(用户自定义字面量操作符 _K, 即 1024)
device->device_attr.max_cqe 设备允许 4194303 完成事件
memory_manager = new MemoryManager(cct, device, pd) -> Infiniband::MemoryManager::MemoryManager 128K -> mem_pool -> boost::pool
memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len) -> void Infiniband::MemoryManager::create_tx_pool
send = new Cluster(*this, size)
send->fill(tx_num) -> int Infiniband::MemoryManager::Cluster::fill
base = (char*)manager.malloc(bytes) -> void* Infiniband::MemoryManager::malloc -> std::malloc(size) 标准分配或分配大页(huge_pages_malloc)
ibv_reg_mr 注册内存
new(chunk) Chunk
free_chunks.push_back(chunk)
create_shared_receive_queue
ibv_create_srq
post_chunks_to_rq -> int Infiniband::post_chunks_to_rq
chunk = get_memory_manager()->get_rx_buffer() -> return reinterpret_cast<Chunk *>(rxbuf_pool.malloc())
ibv_post_srq_recv
dispatcher->polling_start() -> void RDMADispatcher::polling_start
ib->get_memory_manager()->set_rx_stat_logger(perf_logger) -> void PerfCounters::set
tx_cc = ib->create_comp_channel(cct) -> Infiniband::CompletionChannel* Infiniband::create_comp_channel -> new Infiniband::CompletionChannel
tx_cq = ib->create_comp_queue(cct, tx_cc)
cq->init() -> int Infiniband::CompletionChannel::init
ibv_create_comp_channel 创建完成通道 -> NetHandler(cct).set_nonblock(channel->fd) 设置非阻塞
t = std::thread(&RDMADispatcher::polling, this) 启动polling线程 rdma-polling -> void RDMADispatcher::polling
tx_cq->poll_cq(MAX_COMPLETIONS, wc)
handle_tx_event -> tx_chunks.push_back(chunk) -> post_tx_buffer
tx -> void RDMAWorker::handle_pending_message()
handle_rx_event -> void RDMADispatcher::handle_rx_event
conn->post_chunks_to_rq(1) 向接收队列补一个内存块(WR) -> int Infiniband::post_chunks_to_rq
ibv_post_srq_recv | ibv_post_recv
polled[conn].push_back(*response)
qp->remove_rq_wr(chunk)
chunk->clear_qp()
pass_wc -> void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v) -> notify() -> void RDMAConnectedSocketImpl::notify
eventfd_write(notify_fd, event_val) -> eventfd_read(notify_fd, &event_val) <- ssize_t RDMAConnectedSocketImpl::read <- process
new RDMAServerSocketImpl(cct, ib, dispatcher, this, sa, addr_slot)
int r = p->listen(sa, opt) -> int RDMAServerSocketImpl::listen
server_setup_socket = net.create_socket(sa.get_family(), true) -> socket_cloexec
net.set_nonblock
net.set_socket_options
::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len()) 系统调用
::listen backlog=512
*sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p))
cond.notify_all() -> 通知等待的线程
dispatch_event_external -> void EventCenter::dispatch_event_external
external_events.push_back(e)
wakeup()
write(notify_send_fd, &buf, sizeof(buf)) buf=c -> notify_receive_fd, 唤醒 epoll_wait
event.wait()
msgr->add_dispatcher_head(&dispatcher)
ready()
p->start() -> void Processor::start()
worker->center.create_file_event listen_handler -> pro->accept() -> void Processor::accept()
msgr->start() -> int AsyncMessenger::start()
msgr->wait() -> void AsyncMessenger::wait()
客户端建连接, src/test/msgr/perf_msgr_client.cc, gdb --args ceph_perf_msgr_client 175.16.53.62:10001 10 1 1 0 4096
perf_msgr_client.cc -> main
MessengerClient client(public_msgr_type, args[0], think_time)
client.ready
Messenger *msgr = Messenger::create
msgr->set_default_policy -> Policy(bool l, bool s ...
msgr->start() -> int AsyncMessenger::start()
if (!did_bind) 客户端不需要bind
set_myaddrs(newaddrs) -> void Messenger::set_endpoint_addr
_init_local_connection() -> void _init_local_connection()
ms_deliver_handle_fast_connect(local_connection.get()) -> void ms_deliver_handle_fast_connect将新连接通知每个快速调度程序。 每当启动或重新连接新连接时调用此函数 fast_dispatchers为空?
ConnectionRef conn = msgr->connect_to_osd(addrs) 连接到OSD -> ConnectionRef connect_to_osd -> ConnectionRef AsyncMessenger::connect_to
AsyncConnectionRef conn = _lookup_conn(av) 先在连接池查找连接
conn = create_connect(av, type, false) 没找到,新建连接 -> AsyncConnectionRef AsyncMessenger::create_connect
Worker *w = stack->get_worker()
auto conn = ceph::make_ref<AsyncConnection> -> AsyncConnection::AsyncConnection 构造连接
recv_buf = new char[2*recv_max_prefetch] 使用缓冲区读取来避免小的读取开销
new ProtocolV2(this) ceph v2协议, 在v1基础上支持地址向量, 在横幅(banner)交换之后,对等体交换他们的地址向量address vectors
conn->connect(addrs, type, target) -> void AsyncConnection::connect -> _connect -> void AsyncConnection::_connect()
state = STATE_CONNECTING 初始状态机
protocol->connect() -> void ProtocolV2::connect() -> state = START_CONNECT
center->dispatch_event_external(read_handler) -> 触发状态机推进 -> process
conns[addrs] = conn 保存连接 -> ceph::unordered_map<entity_addrvec_t, AsyncConnectionRef> conns 无序map
ClientThread *t = new ClientThread(msgr, c, conn, msg_len, ops, think_time_us) -> ClientThread(Messenger *m 新建客户端线程, 构造数据
m->add_dispatcher_head(&dispatcher)
bufferptr ptr(msg_len) 申请数据指针 -> buffer::ptr::ptr(unsigned l) : _off(0), _len(l)
_raw = buffer::create(l).release() -> ceph::unique_leakable_ptr<buffer::raw> buffer::create, release() 通过返回其保存的指针并将内部指针置空,来交出所有权。此调用不会销毁托管对象,只是让 unique_ptr 不再负责删除该对象。 之后必须由其他实体在某个时刻负责删除它。要强制销毁所指对象,请使用成员函数 reset 或对其执行赋值操作
buffer::create_aligned(len, sizeof(size_t)) -> ceph::unique_leakable_ptr<buffer::raw> buffer::create_aligned
create_aligned_in_mempool -> mempool::mempool_buffer_anon 宏: f(buffer_anon) -> ceph::unique_leakable_ptr<buffer::raw> buffer::create_aligned_in_mempool, 1M: create_aligned_in_mempool (len=1048576, align=8, mempool=18)
len >= CEPH_PAGE_SIZE * 2 如果待分配内存长度大于等于 2 倍 CEPH_PAGE_SIZE(系统页大小 sysconf(_SC_PAGESIZE)=4K, 即 len≥8K), 则用原生posix对齐分配 -> ceph::unique_leakable_ptr<buffer::raw>(new raw_posix_aligned(len, align)) -> raw_posix_aligned(unsigned l, unsigned _align) : raw(l)
r = ::posix_memalign((void**)(void*)&data, align, len);
return raw_combined::create(len, align, mempool) -> src/common/buffer.cc -> static ceph::unique_leakable_ptr<buffer::raw> -> create(unsigned len,
align = std::max<unsigned>(align, sizeof(void *)) = 8
size_t rawlen = round_up_to(sizeof(buffer::raw_combined) 96
size_t datalen = round_up_to(len, alignof(buffer::raw_combined)) 4096
int r = ::posix_memalign((void**)(void*)&ptr, align, rawlen + datalen); 96+4096
new (ptr + datalen) raw_combined(ptr, len, align, mempool))
_raw->nref.store(1, std::memory_order_release)
memset(ptr.c_str(), 0, msg_len) 置0
data.append(ptr) 将data填充全0 -> void buffer::list::append -> void push_back(const ptr& bp)
_buffers.push_back(*ptr_node::create(bp).release())
_len += bp.length()
msgrs.push_back(msgr)
clients.push_back(t)
Cycles::init() -> void Cycles::init() 校准时钟频率
uint64_t start = Cycles::rdtsc()
client.start() -> void start() -> clients[i]->create("client") -> void Thread::create
pthread_create(&thread_id, thread_attr, _entry_func, (void*)this) -> void *Thread::_entry_func
void *entry() override 重写entry
hobject_t hobj(oid, oloc.key -> struct object_t
void build_hash_cache() crc32c
MOSDOp *m = new MOSDOp -> MOSDOp(int inc, long tid,
bufferlist msg_data(data) 拷贝构造函数: 复制 ptr 列表, 底层 raw 缓冲区按引用计数共享, 不深拷贝数据本身
m->write(0, msg_len, msg_data) -> void write 通过消息msg写数据到对端, offset=0, len=4096, buffer_list=bl(msg_data)
add_simple_op(CEPH_OSD_OP_WRITE, off, len) -> ops.push_back(osd_op)
osd_op.op.extent.offset = off
osd_op.op.extent.length = len
ops.push_back(osd_op)
data.claim(bl)
clear()
claim_append(bl) -> void buffer::list::claim_append 接管(claim)并追加: 将 bl 的缓冲区所有权移动过来, 免拷贝
_buffers.splice_back(bl._buffers) 拼接回来
bl._buffers.clear_and_dispose()
header.data_off = off
conn->send_message(m) -> void ProtocolV2::send_message(Message *m) ssize_t RDMAConnectedSocketImpl::send
out_queue[m->get_priority()].emplace_back
connection->center->dispatch_event_external(connection->write_handler) -> void AsyncConnection::handle_write
const auto out_entry = _get_next_outgoing()
more = !out_queue.empty() 如果发送队列不为空,则more为true,表示还有更多的待发送的数据
write_message(out_entry.m, more)
ssize_t total_send_size = connection->outgoing_bl.length() 4406=310+4096
connection->_try_send(more) -> cs.send(outgoing_bl, more) -> ssize_t RDMAConnectedSocketImpl::send
size_t bytes = bl.length() 4KB:4406B=4096+310/1MB:1048886=1048576+310
pending_bl.claim_append(bl) 将 bl 的缓冲区移动(splice)追加到 pending_bl 尾部, bl 随之被清空, 无数据拷贝
ssize_t r = submit(more) ssize_t -> RDMAConnectedSocketImpl::submit
pending_bl.length() 4406
auto it = std::cbegin(pending_bl.buffers()) cbegin()和cend()是C++11新增的,它们返回一个const的迭代器,不能用于修改元素, 常量迭代器
while (it != pending_bl.buffers().end()) 循环, 切片, 分段
if (ib->is_tx_buffer(it->raw_c_str())) 不进该分支
msg/async/rdma:使用 shared_ptr 管理 Infiniband obj
1.不要使用裸指针来管理Infiniband obj
2.直接访问Infiniband obj而不是从RDMA堆栈。 这可以避免在 RDMAWorker 和 RDMADispatcher 中缓存 RDMAStack obj
wait_copy_len += it->length() = 32
tx_buffers.push_back(ib->get_tx_chunk_by_buffer(it->raw_c_str()))
size_t copied = tx_copy_chunk(tx_buffers, wait_copy_len, copy_start, it);
total_copied += tx_copy_chunk(tx_buffers, wait_copy_len, copy_start, it) -> size_t RDMAConnectedSocketImpl::tx_copy_chunk
int RDMAWorker::get_reged_mem -> 获取已注册的内存 int Infiniband::get_tx_buffers -> get_send_buffers -> Infiniband::MemoryManager::Cluster::get_buffers
size_t got = ib->get_memory_manager()->get_tx_buffer_size() * r 131072>4406 获取到的内存满足需求的大小, 1MB, 131072*9=1179648
auto chunk_idx = tx_buffers.size() 9个chunk
Chunk *current_chunk = tx_buffers[chunk_idx]
size_t real_len = current_chunk->write((char*)addr + slice_write_len, start->length() - slice_write_len) -> uint32_t Infiniband::MemoryManager::Chunk::write
memcpy(buffer + offset, buf, write_len) 拷贝内存(循环拷贝)
write_len 4406
pending_bl.clear() 拷贝完释放pb
post_work_request(tx_buffers)
tx_buffers.size() = 1
while (current_buffer != tx_buffers.end())
ibv_post_send -> ibv_poll_cq 触发发端/收端 -> int Infiniband::CompletionQueue::poll_cq <- void RDMADispatcher::polling()
msgr->shutdown()
stop = Cycles::rdtsc()
...
NetworkStack::add_thread
w->center.process_events -> C_handle_read -> conn->process() -> void AsyncConnection::process()
worker->connect(target_addr, opts, &cs) -> int RDMAWorker::connect
ib->init()
dispatcher->polling_start()
new RDMAConnectedSocketImpl -> RDMAConnectedSocketImpl::RDMAConnectedSocketImpl
read_handler(new C_handle_connection_read(this))
established_handler(new C_handle_connection_established(this))
p->try_connect(addr, opts) -> int RDMAConnectedSocketImpl::try_connect
tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr) -> generic_connect -> int NetHandler::generic_connect
create_socket
::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len()) syscall 客户端连接服务端(socket) -> 服务端触发事件(C_processor_accept) -> void Processor::accept()
worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler) -> established_handler -> int RDMAConnectedSocketImpl::handle_connection_established
*socket = ConnectedSocket(std::move(csi))
center->create_file_event(cs.fd(), EVENT_READABLE, read_handler) -> state = STATE_CONNECTING_RE -> void AsyncConnection::process() (回到process)
...
case STATE_CONNECTING_RE
cs.is_connected()
center->create_file_event EVENT_WRITABLE read_handler -> process
logger->tinc -> void PerfCounters::tinc 性能统计(时延统计)
...
protocol->read_event() -> switch (state) -> 判断状态 -> START_ACCEPT -> run_continuation(CONTINUATION(start_server_banner_exchange)) -> 消息状态机 -> class ProtocolV2 : public Protocol
CONTINUATION_RUN(continuation)
CtPtr ProtocolV2::read -> ssize_t AsyncConnection::read -> read_until
read_bulk -> nread = cs.read(buf, len) -> ssize_t RDMAConnectedSocketImpl::read -> eventfd_read(notify_fd, &event_val)
read = read_buffers(buf,len) -> ssize_t RDMAConnectedSocketImpl::read_buffers
buffer_prefetch() 预读 -> void RDMAConnectedSocketImpl::buffer_prefetch
ibv_wc* response = &cqe[i]
chunk->prepare_read(response->byte_len)
buffers.push_back(chunk)
tmp = (*pchunk)->read(buf + read_size, len - read_size) -> uint32_t Infiniband::MemoryManager::Chunk::read
memcpy(buf, buffer + offset, read_len);
(*pchunk)->reset_read_chunk() 将偏移和边界都置0
dispatcher->post_chunk_to_pool(*pchunk) -> void RDMADispatcher::post_chunk_to_pool
ib->post_chunk_to_pool(chunk)
update_post_backlog -> void RDMAConnectedSocketImpl::update_post_backlog
超时处理:
new C_handle_reap(this)
local_worker->create_time_event( ReapDeadConnectionMaxPeriod...
reap_dead
设备属性: device_attr
(gdb) p device_attr
$17 = {
fw_ver = "16.33.1048", '\000' <repeats 53 times>,
node_guid = 8550064101420093112,
sys_image_guid = 8550064101420093112,
max_mr_size = 18446744073709551615,
page_size_cap = 18446744073709547520,
vendor_id = 713,
vendor_part_id = 4119,
hw_ver = 0,
max_qp = 131072,
max_qp_wr = 32768,
---Type <return> to continue, or q <return> to quit---
device_cap_flags = 3983678518,
max_sge = 30,
max_sge_rd = 30,
max_cq = 16777216,
max_cqe = 4194303,
max_mr = 16777216,
max_pd = 8388608,
max_qp_rd_atom = 16,
max_ee_rd_atom = 0,
max_res_rd_atom = 2097152,
max_qp_init_rd_atom = 16,
---Type <return> to continue, or q <return> to quit---
max_ee_init_rd_atom = 0,
atomic_cap = IBV_ATOMIC_HCA,
max_ee = 0,
max_rdd = 0,
max_mw = 16777216,
max_raw_ipv6_qp = 0,
max_raw_ethy_qp = 0,
max_mcast_grp = 2097152,
max_mcast_qp_attach = 240,
max_total_mcast_qp_attach = 503316480,
max_ah = 2147483647,
---Type <return> to continue, or q <return> to quit---
max_fmr = 0,
max_map_per_fmr = 0,
max_srq = 8388608,
max_srq_wr = 32767,
max_srq_sge = 31,
max_pkeys = 128,
local_ca_ack_delay = 16 '\020',
phys_port_cnt = 1 '\001'
}
ceph_msg消息模块_RDMA_性能测试工具源码流程分析_src_test_msgr_perf_msgr_server_client.cc
最新推荐文章于 2024-08-13 10:15:11 发布