ovn-controller Flow Table Translation

The job of ovn-controller is to translate the information in the SBDB into OpenFlow flows on the local chassis. It only translates information that is relevant to the local chassis, so ovn-controller first has to identify which information that is. For example, if a VM's VIF is on this chassis, then the corresponding logical_switch and logical_switch_port are both relevant to this chassis.

Identifying local datapaths and local ports
ovn-controller uses the incremental processing engine (inc-proc-engine) nodes to track changes in the SBDB. Changes related to datapaths and ports are tracked by the en_runtime_data node.
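How this node is wired up is sketched below, based on main() in controller/ovn-controller.c; the exact set of inputs and change handlers varies between OVN versions, so treat the sketch as illustrative only.

/* en_runtime_data declares the engine nodes it depends on.  When an input
 * changes, the per-input change handler runs if one is registered;
 * otherwise the engine falls back to the node's full run function. */
engine_add_input(&en_runtime_data, &en_ofctrl_is_connected, NULL);
engine_add_input(&en_runtime_data, &en_sb_port_binding,
                 runtime_data_sb_port_binding_handler);

When a full recompute is needed, the run function en_runtime_data_run is called. The code is as follows: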

static void
en_runtime_data_run(struct engine_node *node, void *data)
    struct ed_type_runtime_data *rt_data = data;
    struct sset *active_tunnels = &rt_data->active_tunnels;

    struct binding_ctx_in b_ctx_in;
    struct binding_ctx_out b_ctx_out;
    init_binding_ctx(node, rt_data, &b_ctx_in, &b_ctx_out);

    //Fetch the ofctrl_is_connected engine node and check whether we are connected to br-int
    struct ed_type_ofctrl_is_connected *ed_ofctrl_is_connected = engine_get_input_data("ofctrl_is_connected", node);
    if (ed_ofctrl_is_connected->connected) {
        /* Calculate the active tunnels only if we have an active
         * OpenFlow connection to br-int.
         * If we don't have a connection to br-int, it could mean
         * ovs-vswitchd is down for some reason and the BFD status
         * in the Interface rows could be stale. So it's better to
         * consider 'active_tunnels' set to be empty if it's not
         * connected. */
        //Iterate over the tunnel interfaces on br-int (those with the remote_ip option).
        //For each one with BFD enabled and in the up state, parse the chassis name
        //from the ovn-chassis-id field of external_ids and insert it into active_tunnels.
        //So active_tunnels holds the names of the other chassis that this node has
        //working tunnels to.
        bfd_calculate_active_tunnels(b_ctx_in.br_int, active_tunnels);
    }

    binding_run(&b_ctx_in, &b_ctx_out);
        //Save every interface on this chassis's br-int that has an iface-id field
        //into a struct local_binding and insert it into b_ctx_out->lbinding_data->bindings.
        //An iface-id field means the interface is the physical instantiation of a logical port.
        if (b_ctx_in->br_int) {
            build_local_bindings(b_ctx_in, b_ctx_out);
                for (i = 0; i < b_ctx_in->br_int->n_ports; i++) {
                    const struct ovsrec_port *port_rec = b_ctx_in->br_int->ports[i];
                    const char *iface_id;
                    int j;
                    //Skip the port that has the same name as the bridge
                    if (!strcmp(port_rec->name, b_ctx_in->br_int->name)) {
                        continue;
                    }

                    struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings;
                    for (j = 0; j < port_rec->n_interfaces; j++) {
                        const struct ovsrec_interface *iface_rec;

                        iface_rec = port_rec->interfaces[j];
                        //Get iface-id; its value should match the logical_port field of a Port_Binding row
                        iface_id = smap_get(&iface_rec->external_ids, "iface-id");
                        int64_t ofport = iface_rec->n_ofport ? *iface_rec->ofport : 0;

                        //If iface-id is set and ofport is a valid value, save this interface
                        //into a struct local_binding and insert it into b_ctx_out->lbinding_data->bindings
                        if (iface_id && ofport > 0) {
                            struct local_binding *lbinding =
                                local_binding_find(local_bindings, iface_id);
                            if (!lbinding) {
                                lbinding = local_binding_create(iface_id, iface_rec);
                                local_binding_add(local_bindings, lbinding);
                                    //Insert into the hash table, keyed by name
                                    shash_add(local_bindings, lbinding->name, lbinding);
                            } else {
                                static struct vlog_rate_limit rl =
                                    VLOG_RATE_LIMIT_INIT(1, 5);
                                VLOG_WARN_RL(
                                    &rl,
                                    "Invalid configuration: iface-id is configured on "
                                    "interfaces : [%s] and [%s]. Ignoring the "
                                    "configuration on interface [%s]",
                                    lbinding->iface->name, iface_rec->name,
                                    iface_rec->name);
                            }

                            update_local_lports(iface_id, b_ctx_out);
                            smap_replace(b_ctx_out->local_iface_ids, iface_rec->name, iface_id);
                        }
                    }
                }
        }

        //Iterate over the Port_Binding table and handle each row according to its port type
        const struct sbrec_port_binding *pb;
        SBREC_PORT_BINDING_TABLE_FOR_EACH (pb, b_ctx_in->port_binding_table) {
            enum en_lport_type lport_type = get_lport_type(pb);
            switch (lport_type) {
            //VM interface (VIF)
            case LP_VIF:
                consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr);
                    //The requested-chassis option explicitly binds the VIF to a specific chassis
                    const char *vif_chassis = smap_get(&pb->options, "requested-chassis");
                    //If requested-chassis is not set, we cannot yet rule out that the physical
                    //instantiation of this logical port is on this chassis, so return true.
                    //If it is set to this chassis, the port is definitely on this chassis: return true.
                    //If it is set but names another chassis, return false, meaning this VIF is
                    //not bound to this chassis.
                    bool can_bind = can_bind_on_this_chassis(b_ctx_in->chassis_rec, vif_chassis);
                        return !requested_chassis || !requested_chassis[0]
                               || !strcmp(requested_chassis, chassis_rec->name)
                               || !strcmp(requested_chassis, chassis_rec->hostname);

                    if (!lbinding) {
                        //Look up the bindings table by logical_port name; a hit means the
                        //physical instantiation of this logical port is on this chassis
                        lbinding = local_binding_find(&b_ctx_out->lbinding_data->bindings, pb->logical_port);
                    }

                    struct binding_lport *b_lport = NULL;
                    if (lbinding) {
                        struct shash *binding_lports = &b_ctx_out->lbinding_data->lports;
                        b_lport = local_binding_add_lport(binding_lports, lbinding, pb, LP_VIF);
                            struct binding_lport *b_lport =
                                binding_lport_find(binding_lports, pb->logical_port);
                            bool add_to_lport_list = false;
                            if (!b_lport) {
                                b_lport = binding_lport_create(pb, lbinding, b_type);
                                binding_lport_add(binding_lports, b_lport);
                                add_to_lport_list = true;
                            } else if (b_lport->lbinding != lbinding) {
                                add_to_lport_list = true;
                                if (!ovs_list_is_empty(&b_lport->list_node)) {
                                    ovs_list_remove(&b_lport->list_node);
                                }
                                b_lport->lbinding = lbinding;
                                b_lport->type = b_type;
                            }

                            if (add_to_lport_list) {
                                if (b_type == LP_VIF) {
                                    ovs_list_push_front(&lbinding->binding_lports, &b_lport->list_node);
                                } else {
                                    ovs_list_push_back(&lbinding->binding_lports, &b_lport->list_node);
                                }
                            }
                    }

                    return consider_vif_lport_(pb, can_bind, vif_chassis, b_ctx_in, b_ctx_out, b_lport, qos_map);
                        //If both lbinding_set and can_bind hold, this logical port really is on this
                        //chassis: record its datapath in b_ctx_out->local_datapaths and the logical
                        //port itself in b_ctx_out->local_lports
                        bool lbinding_set = b_lport && is_lbinding_set(b_lport->lbinding);
                        if (lbinding_set) {
                            if (can_bind) {
                                /* We can claim the lport. */
                                const struct sbrec_port_binding *parent_pb =
                                    binding_lport_get_parent_pb(b_lport);

                                claim_lport(pb, parent_pb, b_ctx_in->chassis_rec,
                                                 b_lport->lbinding->iface,
                                                 !b_ctx_in->ovnsb_idl_txn,
                                                 !parent_pb, b_ctx_out->tracked_dp_bindings)
                                    sbrec_port_binding_set_chassis(pb, chassis_rec);
                                    sbrec_port_binding_set_encap(pb, encap_rec);

                                //Insert this datapath into b_ctx_out->local_datapaths
                                add_local_datapath(b_ctx_in->sbrec_datapath_binding_by_key,
                                                   b_ctx_in->sbrec_port_binding_by_datapath,
                                                   b_ctx_in->sbrec_port_binding_by_name,
                                                   pb->datapath, false,
                                                   b_ctx_out->local_datapaths,
                                                   b_ctx_out->tracked_dp_bindings);
                                //Build the "dp-key_lport-key" string and insert it into b_ctx_out->local_lport_ids
                                update_local_lport_ids(pb, b_ctx_out);
                                    get_unique_lport_key(pb->datapath->tunnel_key, pb->tunnel_key, buf, sizeof(buf));
                                        snprintf(buf, buf_size, "%"PRId64"_%"PRId64, dp_tunnel_key, lport_tunnel_key);
                                    sset_add(b_ctx->local_lport_ids, buf);

                                //Insert logical_port into b_ctx_out->local_lports
                                update_local_lports(pb->logical_port, b_ctx_out);
                                    sset_add(b_ctx->local_lports, iface_id);

                                if (b_lport->lbinding->iface && qos_map && b_ctx_in->ovs_idl_txn) {
                                    get_qos_params(pb, qos_map);
                                }
                            } else {
                                /* We could, but can't claim the lport. */
                                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
                                    VLOG_INFO_RL(&rl,
                                                 "Not claiming lport %s, chassis %s "
                                                 "requested-chassis %s",
                                                 pb->logical_port,
                                                 b_ctx_in->chassis_rec->name,
                                                 vif_chassis);
                            }
                        }
                break;
            }
        }

Flow table translation
The OpenFlow tables are divided by function as follows:

/* OpenFlow table numbers.
 *
 * These are heavily documented in ovn-architecture(7), please update it if
 * you make any changes. */
#define OFTABLE_PHY_TO_LOG            0
#define OFTABLE_LOG_INGRESS_PIPELINE  8 /* First of LOG_PIPELINE_LEN tables. */
#define OFTABLE_REMOTE_OUTPUT        37
#define OFTABLE_LOCAL_OUTPUT         38
#define OFTABLE_CHECK_LOOPBACK       39
#define OFTABLE_LOG_EGRESS_PIPELINE  40 /* First of LOG_PIPELINE_LEN tables. */
#define OFTABLE_SAVE_INPORT          64
#define OFTABLE_LOG_TO_PHY           65
#define OFTABLE_MAC_BINDING          66
#define OFTABLE_MAC_LOOKUP           67
#define OFTABLE_CHK_LB_HAIRPIN       68
#define OFTABLE_CHK_LB_HAIRPIN_REPLY 69
#define OFTABLE_CT_SNAT_FOR_VIP      70
#define OFTABLE_GET_FDB              71
#define OFTABLE_LOOKUP_FDB           72

The engine node that produces the flows is flow_output, handled by en_flow_output_run.
The data collected by en_runtime_data_run, such as rt_data->local_datapaths, rt_data->active_tunnels and rt_data->local_lport_ids, is fed into en_flow_output_run as input. en_flow_output_run uses this data to decide which flows need to be translated.

static void
en_flow_output_run(struct engine_node *node, void *data)
    struct ed_type_runtime_data *rt_data =
        engine_get_input_data("runtime_data", node);

    struct lflow_ctx_in l_ctx_in;
    struct lflow_ctx_out l_ctx_out;
    init_lflow_ctx(node, rt_data, fo, &l_ctx_in, &l_ctx_out);
    //Translate the Logical_Flow table of the SBDB into OpenFlow flows
    lflow_run(&l_ctx_in, &l_ctx_out);
    
    struct physical_ctx p_ctx;
    init_physical_ctx(node, rt_data, &p_ctx);
    //Add the flows related to physical entities (ports and tunnels) to the OpenFlow flow table
    physical_run(&p_ctx, &fo->flow_table);

During translation, get_local_datapath is called to check whether a datapath is on the local chassis; only if it is will the corresponding flows be translated.
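For reference, get_local_datapath is essentially a hash-table lookup in the local_datapaths map built by en_runtime_data_run, keyed by the datapath's tunnel_key. A sketch based on controller/ovn-controller.c (details may differ between OVN versions):

/* local_datapaths is populated by add_local_datapath() and uses the
 * datapath tunnel_key directly as the hash value. */
struct local_datapath *
get_local_datapath(const struct hmap *local_datapaths, uint32_t tunnel_key)
{
    struct hmap_node *node = hmap_first_with_hash(local_datapaths, tunnel_key);
    return (node
            ? CONTAINER_OF(node, struct local_datapath, hmap_node)
            : NULL);
}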

lflow_run

/* Translates logical flows in the Logical_Flow table in the OVN_SB database
 * into OpenFlow flows.  See ovn-architecture(7) for more information. */
void
lflow_run(struct lflow_ctx_in *l_ctx_in, struct lflow_ctx_out *l_ctx_out)
{
    COVERAGE_INC(lflow_run);

    add_logical_flows(l_ctx_in, l_ctx_out);
    add_neighbor_flows(l_ctx_in->sbrec_port_binding_by_name,
                       l_ctx_in->mac_binding_table, l_ctx_in->local_datapaths,
                       l_ctx_out->flow_table);
    add_lb_hairpin_flows(l_ctx_in->lb_table, l_ctx_in->local_datapaths,
                         l_ctx_out->flow_table);
    add_fdb_flows(l_ctx_in->fdb_table, l_ctx_in->local_datapaths,
                  l_ctx_out->flow_table);
}

physical_run

void
physical_run(struct physical_ctx *p_ctx,
             struct ovn_desired_flow_table *flow_table)
    ...
    /* Set up flows in table 0 for physical-to-logical translation and in table
     * 64 for logical-to-physical translation. */
    const struct sbrec_port_binding *binding;
    SBREC_PORT_BINDING_TABLE_FOR_EACH (binding, p_ctx->port_binding_table) {
        consider_port_binding(p_ctx->sbrec_port_binding_by_name,
                              p_ctx->mff_ovn_geneve, p_ctx->ct_zones,
                              p_ctx->active_tunnels, p_ctx->local_datapaths,
                              binding, p_ctx->chassis,
                              flow_table, &ofpacts);
    }

    /* Table 0, priority 100.
     * ======================
     *
     * Process packets that arrive from a remote hypervisor (by matching
     * on tunnel in_port). */

    /* Add flows for Geneve, STT and VXLAN encapsulations.  Geneve and STT
     * encapsulations have metadata about the ingress and egress logical ports.
     * VXLAN encapsulations have metadata about the egress logical port only.
     * We set MFF_LOG_DATAPATH, MFF_LOG_INPORT, and MFF_LOG_OUTPORT from the
     * tunnel key data where possible, then resubmit to table 38 to handle
     * packets to the local hypervisor. */
    HMAP_FOR_EACH (tun, hmap_node, &tunnels) {
        struct match match = MATCH_CATCHALL_INITIALIZER;
        match_set_in_port(&match, tun->ofport);

        ofpbuf_clear(&ofpacts);
        if (tun->type == GENEVE) {
            put_move(MFF_TUN_ID, 0,  MFF_LOG_DATAPATH, 0, 24, &ofpacts);
            put_move(p_ctx->mff_ovn_geneve, 16, MFF_LOG_INPORT, 0, 15,
                     &ofpacts);
            put_move(p_ctx->mff_ovn_geneve, 0, MFF_LOG_OUTPORT, 0, 16,
                     &ofpacts);
        } else if (tun->type == STT) {
            put_move(MFF_TUN_ID, 40, MFF_LOG_INPORT,   0, 15, &ofpacts);
            put_move(MFF_TUN_ID, 24, MFF_LOG_OUTPORT,  0, 16, &ofpacts);
            put_move(MFF_TUN_ID,  0, MFF_LOG_DATAPATH, 0, 24, &ofpacts);
        } else if (tun->type == VXLAN) {
            /* Add flows for non-VTEP tunnels. Split VNI into two 12-bit
             * sections and use them for datapath and outport IDs. */
            put_move(MFF_TUN_ID, 12, MFF_LOG_OUTPORT,  0, 12, &ofpacts);
            put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 12, &ofpacts);
        } else {
            OVS_NOT_REACHED();
        }

        put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);

        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
                        &ofpacts, hc_uuid);
    }

The logical datapath identifier is 24 bits wide.
The logical output port identifier is 16 bits wide: the range 0-32767 is used for regular ports and 32768-65535 for multicast groups.
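As a quick illustration of this encoding, the standalone snippet below (not OVN code, just an example of the bit layout) shows how a 16-bit output port key identifies a multicast group, and how the STT encapsulation packs the datapath, output port and input port into the 64-bit tunnel ID, matching the put_move()/put_load() calls for tun->type == STT shown above in physical_run and below in put_encapsulation.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t dp_key = 0x1;         /* 24-bit logical datapath id */
    uint16_t outport_key = 0x8000; /* keys >= 0x8000 are multicast groups */
    uint16_t inport_key = 0x2;     /* logical input port id (15 bits used) */

    bool is_mc_group = outport_key >= 0x8000;

    /* STT layout used by put_encapsulation(): datapath id in bits 0..23,
     * output port in bits 24..39, input port in bits 40..54. */
    uint64_t stt_tun_id = (uint64_t) dp_key
                          | ((uint64_t) outport_key << 24)
                          | ((uint64_t) inport_key << 40);

    printf("multicast=%d stt_tun_id=0x%" PRIx64 "\n", is_mc_group, stt_tun_id);
    return 0;
}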

ingress
The ingress stages of a logical_flow pipeline occupy logical tables 0 to 24, which map to OpenFlow tables 8 to 32.
The output action of an ingress logical flow is always translated into a resubmit to OpenFlow table 37.
The main job of OpenFlow table 37 is to send packets (unicast and multicast) whose output port is on another chassis out through a tunnel port. Packets that need to be handled on the local chassis match the lowest-priority flow in table 37, which resubmits them to table 38.
When another chassis receives the tunneled packet, its table 0 extracts the datapath id, input port id and output port id from the tunnel metadata and sends the packet to table 38.

So whether a packet is handled on the local chassis or sent to another chassis, it eventually reaches table 38; OpenFlow table 38 therefore handles packets whose output port is on the local chassis.

Table 38 then sends the packet to table 39.
Table 39 performs the loopback check: if the input port and the output port are the same, the packet is dropped; otherwise it is sent to table 40.
Table 40 is where egress processing begins.

egress
The egress stages of a logical_flow pipeline occupy logical tables 0 to 10, which map to OpenFlow tables 40 to 50.
The output action of an egress logical flow is always translated into a resubmit to OpenFlow table 64.
Table 64 then sends the packet to table 65.
Table 65 sends the packet out of the physical port that corresponds to the logical output port.
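These fixed offsets come from how consider_logical_flow() picks the OpenFlow table for each logical flow. A sketch of the mapping, based on controller/lflow.c (variable names may differ slightly between OVN versions):

    /* Sketch based on consider_logical_flow() in controller/lflow.c. */
    bool ingress = !strcmp(lflow->pipeline, "ingress");

    /* Logical table N becomes OpenFlow table 8 + N (ingress)
     * or 40 + N (egress). */
    uint8_t first_ptable = (ingress
                            ? OFTABLE_LOG_INGRESS_PIPELINE   /* 8 */
                            : OFTABLE_LOG_EGRESS_PIPELINE);  /* 40 */
    uint8_t ptable = first_ptable + lflow->table_id;

    /* The "output" action resubmits to table 37 (ingress) or 64 (egress). */
    uint8_t output_ptable = (ingress
                             ? OFTABLE_REMOTE_OUTPUT         /* 37 */
                             : OFTABLE_SAVE_INPORT);         /* 64 */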

Output to another chassis
Table 37 (OFTABLE_REMOTE_OUTPUT) holds the flows that forward packets to other chassis through tunnels; the example below comes from a test environment with a single tunnel.
In the match, metadata=0x1 selects the datapath and reg15=0x2 selects output port 0x2.
The actions load the datapath id 0x1 into NXM_NX_TUN_ID, set output port 0x2 in tun_metadata0, move the input port from NXM_NX_REG14 into NXM_NX_TUN_METADATA0, and finally output the packet on port ovn-node1-0.

 cookie=0x131cd5ba, duration=660713.444s, table=37, n_packets=5, n_bytes=322, priority=100,reg15=0x2,metadata=0x1 actions=load:0x1->NXM_NX_TUN_ID[0..23],set_field:0x2->tun_metadata0,move:NXM_NX_REG14[0..14]->NXM_NX_TUN_METADATA0[16..30],output:"ovn-node1-0"

The corresponding source code is:

consider_port_binding --> put_remote_port_redirect_overlay
    if (!is_ha_remote) {
        /* Setup encapsulation */
        const struct chassis_tunnel *rem_tun =
            get_port_binding_tun(binding);
        if (!rem_tun) {
            return;
        }
        //Set different fields depending on the tunnel type
        put_encapsulation(mff_ovn_geneve, tun, binding->datapath, port_key, !strcmp(binding->type, "vtep"), ofpacts_p);
            if (tun->type == GENEVE) {
                put_load(datapath->tunnel_key, MFF_TUN_ID, 0, 24, ofpacts);
                put_load(outport, mff_ovn_geneve, 0, 32, ofpacts);
                put_move(MFF_LOG_INPORT, 0, mff_ovn_geneve, 16, 15, ofpacts);
            } else if (tun->type == STT) {
                put_load(datapath->tunnel_key | ((uint64_t) outport << 24),
                         MFF_TUN_ID, 0, 64, ofpacts);
                put_move(MFF_LOG_INPORT, 0, MFF_TUN_ID, 40, 15, ofpacts);
            } else if (tun->type == VXLAN) {
                uint64_t vni = datapath->tunnel_key;
                if (!is_ramp_switch) {
                    /* Only some bits are used for regular tunnels. */
                    vni |= (uint64_t) outport << 12;
                }
                put_load(vni, MFF_TUN_ID, 0, 24, ofpacts);
            } else {
                OVS_NOT_REACHED();
            }
        /* Output to tunnel. */
        ofpact_put_OUTPUT(ofpacts_p)->port = rem_tun->ofport;
    }
    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
                    binding->header_.uuid.parts[0],
                    match, ofpacts_p, &binding->header_.uuid);

Handling multicast packets
Table 37 (OFTABLE_REMOTE_OUTPUT) also contains flows for multicast packets, such as the one below.
Match: metadata=0x1 selects the datapath with id 0x1, and reg15=0x8000 matches output port id 0x8000, i.e. a multicast group.
Actions: load the datapath id 0x1 into NXM_NX_TUN_ID, set output port 0x8000 in tun_metadata0, move the input port from NXM_NX_REG14 into NXM_NX_TUN_METADATA0, and output the packet on port ovn-node1-0 towards the other chassis. Because this is a multicast packet it must also keep being forwarded on the local chassis, so resubmit(,38) hands it to the next table.

 cookie=0xae89cd31, duration=660713.444s, table=37, n_packets=385, n_bytes=26970, priority=100,reg15=0x8000,metadata=0x1 actions=load:0x1->NXM_NX_TUN_ID[0..23],set_field:0x8000->tun_metadata0,move:NXM_NX_REG14[0..14]->NXM_NX_TUN_METADATA0[16..30],output:"ovn-node1-0",resubmit(,38)

The corresponding source code is:

consider_mc_group
    /* Table 37, priority 100.
     * =======================
     *
     * Handle output to the remote chassis in the multicast group, if
     * any. */
    if (!sset_is_empty(&remote_chassis) || remote_ofpacts.size > 0) {
        if (remote_ofpacts.size > 0) {
            /* Following delivery to logical patch ports, restore the
             * multicast group as the logical output port. */
            put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
                     &remote_ofpacts);
        }

        const char *chassis_name;
        const struct chassis_tunnel *prev = NULL;
        SSET_FOR_EACH (chassis_name, &remote_chassis) {
            const struct chassis_tunnel *tun
                = chassis_tunnel_find(chassis_name, NULL);
            if (!tun) {
                continue;
            }

            if (!prev || tun->type != prev->type) {
                put_encapsulation(mff_ovn_geneve, tun, mc->datapath,
                                  mc->tunnel_key, true, &remote_ofpacts);
                prev = tun;
            }
            ofpact_put_OUTPUT(&remote_ofpacts)->port = tun->ofport;
        }

        if (remote_ofpacts.size) {
            if (local_ports) {
                put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts);
            }
            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
                            mc->header_.uuid.parts[0],
                            &match, &remote_ofpacts, &mc->header_.uuid);
        }

See also: ovn-controller转换流表 - 简书 (Jianshu)
