ovn-controller的作用是将sbdb中的信息转换成本chassis上的openflow流表信息,而且只会将和本chassis相关的信息进行转换,所以ovn-controller首先得识别出来哪些信息是和本chassis相关的。比如一个vm的vif接口在本chassis,则其对应的logical_switch,logical_switch_port都属于和本chassis相关的信息。
识别local datapath和local port
ovn-controller使用inc-proc-engine nodes机制来追踪sbdb信息变化,对于和datapath和端口相关的变化使用en_runtime_data来追踪,调用的函数为en_runtime_data_run,代码如下
static void
en_runtime_data_run(struct engine_node *node, void *data)
struct ed_type_runtime_data *rt_data = data;
struct sset *active_tunnels = &rt_data->active_tunnels;
struct binding_ctx_in b_ctx_in;
struct binding_ctx_out b_ctx_out;
init_binding_ctx(node, rt_data, &b_ctx_in, &b_ctx_out);
//取出 ofctrl_is_connected engine node,查看是否连接到 br-int
struct ed_type_ofctrl_is_connected *ed_ofctrl_is_connected = engine_get_input_data("ofctrl_is_connected", node);
if (ed_ofctrl_is_connected->connected) {
/* Calculate the active tunnels only if we have an active
* OpenFlow connection to br-int.
* If we don't have a connection to br-int, it could mean
* ovs-vswitchd is down for some reason and the BFD status
* in the Interface rows could be stale. So it's better to
* consider the 'active_tunnels' set to be empty if it's not
* connected. */
//遍历br-int桥上的tunnel接口(带有选项remote_ip),如果使能了bfd,并且是up状态,
//则从 external_ids的ovn-chassis-id 字段解析出 chassis 名字,插入 active_tunnels,
//即 active_tunnels 保存的是本节点有tunnel连接的其他chassis名字。
bfd_calculate_active_tunnels(b_ctx_in.br_int, active_tunnels);
}
binding_run(&b_ctx_in, &b_ctx_out);
//将本chassis的br_int上带有 iface-id 字段的接口保存到 struct local_binding,
//并插入 b_ctx_out->lbinding_data->bindings。
//带有 iface-id 字段说明这是一个逻辑端口对应的物理实体。
if (b_ctx_in->br_int) {
build_local_bindings(b_ctx_in, b_ctx_out);
for (i = 0; i < b_ctx_in->br_int->n_ports; i++) {
const struct ovsrec_port *port_rec = b_ctx_in->br_int->ports[i];
const char *iface_id;
int j;
//跳过和网桥同名的端口
if (!strcmp(port_rec->name, b_ctx_in->br_int->name)) {
continue;
}
struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings;
for (j = 0; j < port_rec->n_interfaces; j++) {
const struct ovsrec_interface *iface_rec;
iface_rec = port_rec->interfaces[j];
//获取 iface-id,值应该为 port_binding 表的 logical_port 字段
iface_id = smap_get(&iface_rec->external_ids, "iface-id");
int64_t ofport = iface_rec->n_ofport ? *iface_rec->ofport : 0;
//如果指定了 iface-id,并且 ofport 为有效值,则将此接口保存到 struct local_binding,
//并插入 b_ctx_out->lbinding_data->bindings
if (iface_id && ofport > 0) {
struct local_binding *lbinding =
local_binding_find(local_bindings, iface_id);
if (!lbinding) {
lbinding = local_binding_create(iface_id, iface_rec);
local_binding_add(local_bindings, lbinding);
//以name为key,插入hash表
shash_add(local_bindings, lbinding->name, lbinding);
} else {
static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(1, 5);
VLOG_WARN_RL(
&rl,
"Invalid configuration: iface-id is configured on "
"interfaces : [%s] and [%s]. Ignoring the "
"configuration on interface [%s]",
lbinding->iface->name, iface_rec->name,
iface_rec->name);
}
update_local_lports(iface_id, b_ctx_out);
smap_replace(b_ctx_out->local_iface_ids, iface_rec->name, iface_id);
}
}
}
}
//遍历port_binding,根据端口类型进行处理
const struct sbrec_port_binding *pb;
SBREC_PORT_BINDING_TABLE_FOR_EACH (pb, b_ctx_in->port_binding_table) {
enum en_lport_type lport_type = get_lport_type(pb);
switch (lport_type) {
//vm接口
case LP_VIF:
consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr);
//选项 requested-chassis 表示主动将vif绑定到指定的chassis
const char *vif_chassis = smap_get(&pb->options, "requested-chassis");
//如果没有指定选项 requested-chassis,则还不能确定此逻辑端口对应的物理实体在本chassis上,可以返回true
//或者指定的选项为本chassis,则此逻辑端口对应的物理实体肯定在本chassis上,返回true。
//如果指定了requested-chassis,但不是本chassis,则返回false,表示此vif没有绑定
//到此chassis。
bool can_bind = can_bind_on_this_chassis(b_ctx_in->chassis_rec, vif_chassis);
return !requested_chassis || !requested_chassis[0]
|| !strcmp(requested_chassis, chassis_rec->name)
|| !strcmp(requested_chassis, chassis_rec->hostname);
if (!lbinding) {
//根据 logical_port 名字查找 bindings 表,能查到说明此逻辑端口对应的物理实体在本chassis上
lbinding = local_binding_find(&b_ctx_out->lbinding_data->bindings, pb->logical_port);
}
struct binding_lport *b_lport = NULL;
if (lbinding) {
struct shash *binding_lports = &b_ctx_out->lbinding_data->lports;
b_lport = local_binding_add_lport(binding_lports, lbinding, pb, LP_VIF);
struct binding_lport *b_lport =
binding_lport_find(binding_lports, pb->logical_port);
bool add_to_lport_list = false;
if (!b_lport) {
b_lport = binding_lport_create(pb, lbinding, b_type);
binding_lport_add(binding_lports, b_lport);
add_to_lport_list = true;
} else if (b_lport->lbinding != lbinding) {
add_to_lport_list = true;
if (!ovs_list_is_empty(&b_lport->list_node)) {
ovs_list_remove(&b_lport->list_node);
}
b_lport->lbinding = lbinding;
b_lport->type = b_type;
}
if (add_to_lport_list) {
if (b_type == LP_VIF) {
ovs_list_push_front(&lbinding->binding_lports, &b_lport->list_node);
} else {
ovs_list_push_back(&lbinding->binding_lports, &b_lport->list_node);
}
}
}
return consider_vif_lport_(pb, can_bind, vif_chassis, b_ctx_in, b_ctx_out, b_lport, qos_map);
//同时满足lbinding_set和can_bind,说明此逻辑端口确实在本chassis上,则保存
//逻辑端口所在datapath到b_ctx_out->local_datapaths,并保存逻辑端口到b_ctx->local_lports
bool lbinding_set = b_lport && is_lbinding_set(b_lport->lbinding);
if (lbinding_set) {
if (can_bind) {
/* We can claim the lport. */
const struct sbrec_port_binding *parent_pb =
binding_lport_get_parent_pb(b_lport);
claim_lport(pb, parent_pb, b_ctx_in->chassis_rec,
b_lport->lbinding->iface,
!b_ctx_in->ovnsb_idl_txn,
!parent_pb, b_ctx_out->tracked_dp_bindings)
sbrec_port_binding_set_chassis(pb, chassis_rec);
sbrec_port_binding_set_encap(pb, encap_rec);
//将此datapath插入b_ctx_out->local_datapaths
add_local_datapath(b_ctx_in->sbrec_datapath_binding_by_key,
b_ctx_in->sbrec_port_binding_by_datapath,
b_ctx_in->sbrec_port_binding_by_name,
pb->datapath, false,
b_ctx_out->local_datapaths,
b_ctx_out->tracked_dp_bindings);
//组合成"dp-key_lport-key",插入 b_ctx->local_lport_ids
update_local_lport_ids(pb, b_ctx_out);
get_unique_lport_key(pb->datapath->tunnel_key, pb->tunnel_key, buf, sizeof(buf));
snprintf(buf, buf_size, "%"PRId64"_%"PRId64, dp_tunnel_key, lport_tunnel_key);
sset_add(b_ctx->local_lport_ids, buf);
//将logical_port插入b_ctx->local_lports
update_local_lports(pb->logical_port, b_ctx_out);
sset_add(b_ctx->local_lports, iface_id);
if (b_lport->lbinding->iface && qos_map && b_ctx_in->ovs_idl_txn) {
get_qos_params(pb, qos_map);
}
} else {
/* We could, but can't claim the lport. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
VLOG_INFO_RL(&rl,
"Not claiming lport %s, chassis %s "
"requested-chassis %s",
pb->logical_port,
b_ctx_in->chassis_rec->name,
vif_chassis);
}
}
break;
}
}
流表转换
openflow流表按照功能划分为如下的table
/* OpenFlow table numbers.
*
* These are heavily documented in ovn-architecture(7), please update it if
* you make any changes. */
#define OFTABLE_PHY_TO_LOG 0 /* Physical-to-logical translation. */
#define OFTABLE_LOG_INGRESS_PIPELINE 8 /* First of LOG_PIPELINE_LEN tables. */
#define OFTABLE_REMOTE_OUTPUT 37 /* Output towards remote chassis (tunnels). */
#define OFTABLE_LOCAL_OUTPUT 38 /* Output handled on the local chassis. */
#define OFTABLE_CHECK_LOOPBACK 39 /* Inport == outport check before egress. */
#define OFTABLE_LOG_EGRESS_PIPELINE 40 /* First of LOG_PIPELINE_LEN tables. */
#define OFTABLE_SAVE_INPORT 64
#define OFTABLE_LOG_TO_PHY 65 /* Logical-to-physical translation. */
#define OFTABLE_MAC_BINDING 66
#define OFTABLE_MAC_LOOKUP 67
#define OFTABLE_CHK_LB_HAIRPIN 68
#define OFTABLE_CHK_LB_HAIRPIN_REPLY 69
#define OFTABLE_CT_SNAT_FOR_VIP 70
#define OFTABLE_GET_FDB 71
#define OFTABLE_LOOKUP_FDB 72
处理流表的engine node是flow_output,调用en_flow_output_run进行处理。
en_runtime_data_run中收集的数据会作为en_flow_output_run的input数据,比如rt_data->local_datapaths,rt_data->active_tunnels,rt_data->local_lport_ids等。en_flow_output_run根据这些数据判断需要转换哪些流表信息。
static void
en_flow_output_run(struct engine_node *node, void *data)
struct ed_type_runtime_data *rt_data =
engine_get_input_data("runtime_data", node);
struct lflow_ctx_in l_ctx_in;
struct lflow_ctx_out l_ctx_out;
init_lflow_ctx(node, rt_data, fo, &l_ctx_in, &l_ctx_out);
//将sbdb的logical_flow表转换到openflow流表中
lflow_run(&l_ctx_in, &l_ctx_out);
struct physical_ctx p_ctx;
init_physical_ctx(node, rt_data, &p_ctx);
//将和物理实体相关的处理添加到openflow流表
physical_run(&p_ctx, &fo->flow_table);
在转换过程中,都会调用get_local_datapath判断datapath是否在本chassis上,如果是的话才会进行转换。
lflow_run
/* Translates logical flows in the Logical_Flow table in the OVN_SB database
* into OpenFlow flows. See ovn-architecture(7) for more information.
*
* 'l_ctx_in' supplies the inputs read here (the port-binding-by-name index,
* the MAC binding, load balancer and FDB tables, and the set of local
* datapaths). 'l_ctx_out' supplies the desired flow table that the helpers
* below are given to fill in.
*
* Four helpers are invoked in sequence: one for the logical flows themselves,
* then one each for neighbor (MAC binding), load-balancer hairpin and FDB
* flows. The last three are each handed 'local_datapaths'. */
void
lflow_run(struct lflow_ctx_in *l_ctx_in, struct lflow_ctx_out *l_ctx_out)
{
/* Record that a full lflow_run pass happened. */
COVERAGE_INC(lflow_run);
/* Logical_Flow rows; takes the whole in/out contexts. */
add_logical_flows(l_ctx_in, l_ctx_out);
/* Flows derived from the MAC binding table. */
add_neighbor_flows(l_ctx_in->sbrec_port_binding_by_name,
l_ctx_in->mac_binding_table, l_ctx_in->local_datapaths,
l_ctx_out->flow_table);
/* Flows derived from the load balancer table. */
add_lb_hairpin_flows(l_ctx_in->lb_table, l_ctx_in->local_datapaths,
l_ctx_out->flow_table);
/* Flows derived from the FDB table. */
add_fdb_flows(l_ctx_in->fdb_table, l_ctx_in->local_datapaths,
l_ctx_out->flow_table);
}
physical_run
void
physical_run(struct physical_ctx *p_ctx,
struct ovn_desired_flow_table *flow_table)
...
/* Set up flows in table 0 for physical-to-logical translation and in table
* 65 for logical-to-physical translation. */
const struct sbrec_port_binding *binding;
SBREC_PORT_BINDING_TABLE_FOR_EACH (binding, p_ctx->port_binding_table) {
consider_port_binding(p_ctx->sbrec_port_binding_by_name,
p_ctx->mff_ovn_geneve, p_ctx->ct_zones,
p_ctx->active_tunnels, p_ctx->local_datapaths,
binding, p_ctx->chassis,
flow_table, &ofpacts);
}
/* Table 0, priority 100.
* ======================
*
* Process packets that arrive from a remote hypervisor (by matching
* on tunnel in_port). */
/* Add flows for Geneve, STT and VXLAN encapsulations. Geneve and STT
* encapsulations have metadata about the ingress and egress logical ports.
* VXLAN encapsulations have metadata about the egress logical port only.
* We set MFF_LOG_DATAPATH, MFF_LOG_INPORT, and MFF_LOG_OUTPORT from the
* tunnel key data where possible, then resubmit to table 38
* (OFTABLE_LOCAL_OUTPUT) to handle packets to the local hypervisor. */
HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
struct match match = MATCH_CATCHALL_INITIALIZER;
match_set_in_port(&match, tun->ofport);
ofpbuf_clear(&ofpacts);
if (tun->type == GENEVE) {
put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
put_move(p_ctx->mff_ovn_geneve, 16, MFF_LOG_INPORT, 0, 15,
&ofpacts);
put_move(p_ctx->mff_ovn_geneve, 0, MFF_LOG_OUTPORT, 0, 16,
&ofpacts);
} else if (tun->type == STT) {
put_move(MFF_TUN_ID, 40, MFF_LOG_INPORT, 0, 15, &ofpacts);
put_move(MFF_TUN_ID, 24, MFF_LOG_OUTPORT, 0, 16, &ofpacts);
put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
} else if (tun->type == VXLAN) {
/* Add flows for non-VTEP tunnels. Split VNI into two 12-bit
* sections and use them for datapath and outport IDs. */
put_move(MFF_TUN_ID, 12, MFF_LOG_OUTPORT, 0, 12, &ofpacts);
put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 12, &ofpacts);
} else {
OVS_NOT_REACHED();
}
put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
&ofpacts, hc_uuid);
}
逻辑通道标识id用24位表示。
逻辑出端口标识id用16位表示,其中范围0-32767表示普通端口,范围32768-65535表示组播组。
ingress
logical_flow的ingress流表被设置在table0到table24中。对应到openflow流表的table8到table32。
logical_flow ingress的流表的动作output,在openflow中会被固定resubmit到openflow table37。
openflow table37的主要作用就是将输出端口在其他chassis的报文(单播和组播)通过tunnel端口发送出去,需要在本地chassis处理的报文会匹配table37中优先级最低的流表,将报文送到table38进行处理。
其他chassis收到tunnel报文后,在table0从报文中提取出datapath id,入端口id和出端口id,并将报文送到table38进行处理。
由上面可知,不管本地chassis处理还是发给其他chassis处理,最终都会到table38,所以openflow table38的作用是处理输出端口为本地chassis的报文。
table38又将报文送到table39处理。
table39检查回环,即输入端口和输出端口是否是同一个端口,如果是则将报文丢弃。如果不是,则将报文送到table40处理。
table40开始egress的处理。
egress
logical_flow的egress流表被设置在table0到table10中。对应到openflow流表的table40到table50。
logical_flow egress的流表的动作output,在openflow中会被固定resubmit到openflow table64。
table64又将报文送到table65处理。
table65负责将报文发送到逻辑端口对应的物理端口。
到其他chassis的处理
在table 37(OFTABLE_REMOTE_OUTPUT)添加通过tunnel到其他chassis的流表如下(实验环境只有一个tunnel):
匹配域中的metadata 0x1表示匹配哪个datapath,reg15 0x2表示匹配输出端口0x2。
执行的动作为: 将datapath id 0x1赋值到NXM_NX_TUN_ID,将输出端口0x2赋值到tun_metadata0,将输入端口NXM_NX_REG14赋值到NXM_NX_TUN_METADATA0,最后将报文从端口ovn-node1-0发出去。
cookie=0x131cd5ba, duration=660713.444s, table=37, n_packets=5, n_bytes=322, priority=100,reg15=0x2,metadata=0x1 actions=load:0x1->NXM_NX_TUN_ID[0..23],set_field:0x2->tun_metadata0,move:NXM_NX_REG14[0..14]->NXM_NX_TUN_METADATA0[16..30],output:"ovn-node1-0"
对应的源码如下
consider_port_binding --> put_remote_port_redirect_overlay
if (!is_ha_remote) {
/* Setup encapsulation */
const struct chassis_tunnel *rem_tun =
get_port_binding_tun(binding);
if (!rem_tun) {
return;
}
//根据隧道类型,设置不同的字段
put_encapsulation(mff_ovn_geneve, tun, binding->datapath, port_key, !strcmp(binding->type, "vtep"), ofpacts_p);
if (tun->type == GENEVE) {
put_load(datapath->tunnel_key, MFF_TUN_ID, 0, 24, ofpacts);
put_load(outport, mff_ovn_geneve, 0, 32, ofpacts);
put_move(MFF_LOG_INPORT, 0, mff_ovn_geneve, 16, 15, ofpacts);
} else if (tun->type == STT) {
put_load(datapath->tunnel_key | ((uint64_t) outport << 24),
MFF_TUN_ID, 0, 64, ofpacts);
put_move(MFF_LOG_INPORT, 0, MFF_TUN_ID, 40, 15, ofpacts);
} else if (tun->type == VXLAN) {
uint64_t vni = datapath->tunnel_key;
if (!is_ramp_switch) {
/* Only some bits are used for regular tunnels. */
vni |= (uint64_t) outport << 12;
}
put_load(vni, MFF_TUN_ID, 0, 24, ofpacts);
} else {
OVS_NOT_REACHED();
}
/* Output to tunnel. */
ofpact_put_OUTPUT(ofpacts_p)->port = rem_tun->ofport;
}
ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
binding->header_.uuid.parts[0],
match, ofpacts_p, &binding->header_.uuid);
组播报文的处理
在table 37(OFTABLE_REMOTE_OUTPUT)添加对组播报文的处理,流表如下
匹配域: metadata 0x1 表示匹配id为0x1的datapath,reg15 0x8000表示输出端口id为0x8000。
执行的动作: 将datapath id 0x1赋值到NXM_NX_TUN_ID,将输出端口(组播组)0x8000赋值到tun_metadata0,将输入端口NXM_NX_REG14赋值到NXM_NX_TUN_METADATA0,最后将报文从端口ovn-node1-0发送到其他chassis上。同时因为是组播报文,还要继续在本chassis上转发,所以resubmit(,38)转到下一个表。
cookie=0xae89cd31, duration=660713.444s, table=37, n_packets=385, n_bytes=26970, priority=100,reg15=0x8000,metadata=0x1 actions=load:0x1->NXM_NX_TUN_ID[0..23],set_field:0x8000->tun_metadata0,move:NXM_NX_REG14[0..14]->NXM_NX_TUN_METADATA0[16..30],output:"ovn-node1-0",resubmit(,38)
对应的源码如下
consider_mc_group
/* Table 37, priority 100.
* =======================
*
* Handle output to the remote chassis in the multicast group, if
* any. */
if (!sset_is_empty(&remote_chassis) || remote_ofpacts.size > 0) {
if (remote_ofpacts.size > 0) {
/* Following delivery to logical patch ports, restore the
* multicast group as the logical output port. */
put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
&remote_ofpacts);
}
const char *chassis_name;
const struct chassis_tunnel *prev = NULL;
SSET_FOR_EACH (chassis_name, &remote_chassis) {
const struct chassis_tunnel *tun
= chassis_tunnel_find(chassis_name, NULL);
if (!tun) {
continue;
}
if (!prev || tun->type != prev->type) {
put_encapsulation(mff_ovn_geneve, tun, mc->datapath,
mc->tunnel_key, true, &remote_ofpacts);
prev = tun;
}
ofpact_put_OUTPUT(&remote_ofpacts)->port = tun->ofport;
}
if (remote_ofpacts.size) {
if (local_ports) {
put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts);
}
ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
mc->header_.uuid.parts[0],
&match, &remote_ofpacts, &mc->header_.uuid);
}