ovs分类器 flow和miniflow

本文介绍ovs分类器中很重要的两个结构体:flow和miniflow,及其相关函数。

flow
flow中保存的是报文相关的字段和其他一些元数据,用来匹配流表,主要包含如下四个层次内容:

a. metadata: 入端口号,寄存器等信息
b. l2: 源目的mac,vlan和mpls等信息
c. l3: ipv4/ipv6源目的ip,ttl等信息
d. l4: 源目的端口号,icmp code和type等信息。

在分类器中查找流表时,这四部分内容会被分阶段使用:先匹配低层的字段,匹配成功后才继续匹配更高层的字段;一旦匹配失败,就不用再匹配更高层的字段。这样既能加快匹配速度,也能使下发到datapath的流表更模糊。

flow的一个特点是它整个结构是8字节对齐的,在2.8版本中它的大小是672字节。

/* Packet header fields and metadata used as the key when matching flow
 * tables.  Members are grouped into metadata, L2, L3 and L4 sections; the
 * section comments and the pad1/pad2/pad3 members document that each section
 * is kept 64-bit aligned.  Do not reorder members: the layout is part of the
 * contract. */
struct flow {
    /* Metadata */
    /* struct flow_tnl is itself 8-byte aligned. */
    struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. */

    ovs_be64 metadata;          /* OpenFlow Metadata. */

    uint32_t regs[FLOW_N_REGS]; /* Registers. */

    uint32_t skb_priority;      /* Packet priority for QoS. */
    uint32_t pkt_mark;          /* Packet mark. */

    uint32_t dp_hash;           /* Datapath computed hash value. The exact
                                 * computation is opaque to the user space. */
    union flow_in_port in_port; /* Input port.*/

    uint32_t recirc_id;         /* Must be exact match. */
    uint8_t ct_state;           /* Connection tracking state. */
    uint8_t ct_nw_proto;        /* CT orig tuple IP protocol. */
    uint16_t ct_zone;           /* Connection tracking zone. */

    uint32_t ct_mark;           /* Connection mark.*/
    ovs_be32 packet_type;       /* OpenFlow packet type. */

    ovs_u128 ct_label;          /* Connection label. */

    uint32_t conj_id;           /* Conjunction ID. */
    ofp_port_t actset_output;   /* Output port in action set. */

    /* L2, Order the same as in the Ethernet header! (64-bit aligned) */
    struct eth_addr dl_dst;     /* Ethernet destination address. */
    struct eth_addr dl_src;     /* Ethernet source address. */
    ovs_be16 dl_type;           /* Ethernet frame type.
                                   Note: This also holds the Ethertype for L3
                                   packets of type PACKET_TYPE(1, Ethertype) */
    uint8_t pad1[2];            /* Pad to 64 bits. */
    
    union flow_vlan_hdr vlans[FLOW_MAX_VLAN_HEADERS]; /* VLANs */
    
    ovs_be32 mpls_lse[ROUND_UP(FLOW_MAX_MPLS_LABELS, 2)]; /* MPLS label stack
                                                             (with padding). */

    /* L3 (64-bit aligned) */
    ovs_be32 nw_src;            /* IPv4 source address or ARP SPA. */
    ovs_be32 nw_dst;            /* IPv4 destination address or ARP TPA. */
    
    ovs_be32 ct_nw_src;         /* CT orig tuple IPv4 source address. */
    ovs_be32 ct_nw_dst;         /* CT orig tuple IPv4 destination address. */
    
    struct in6_addr ipv6_src;   /* IPv6 source address. */
    struct in6_addr ipv6_dst;   /* IPv6 destination address. */
    struct in6_addr ct_ipv6_src; /* CT orig tuple IPv6 source address. */
    struct in6_addr ct_ipv6_dst; /* CT orig tuple IPv6 destination address. */
    
    ovs_be32 ipv6_label;        /* IPv6 flow label. */
    uint8_t nw_frag;            /* FLOW_FRAG_* flags. */
    uint8_t nw_tos;             /* IP ToS (including DSCP and ECN). */
    uint8_t nw_ttl;             /* IP TTL/Hop Limit. */
    uint8_t nw_proto;           /* IP protocol or low 8 bits of ARP opcode. */
    
    struct in6_addr nd_target;  /* IPv6 neighbor discovery (ND) target. */
    
    struct eth_addr arp_sha;    /* ARP/ND source hardware address. */
    struct eth_addr arp_tha;    /* ARP/ND target hardware address. */
    ovs_be16 tcp_flags;         /* TCP flags/ICMPv6 ND options type.
                                 * With L3 to avoid matching L4. */
    ovs_be16 pad2;              /* Pad to 64 bits. */
    
    struct ovs_key_nsh nsh;     /* Network Service Header keys */

    /* L4 (64-bit aligned) */
    ovs_be16 tp_src;            /* TCP/UDP/SCTP source port/ICMP type. */
    ovs_be16 tp_dst;            /* TCP/UDP/SCTP destination port/ICMP code. */
    ovs_be16 ct_tp_src;         /* CT original tuple source port/ICMP type. */
    ovs_be16 ct_tp_dst;         /* CT original tuple dst port/ICMP code. */
    
    ovs_be32 igmp_group_ip4;    /* IGMP group IPv4 address/ICMPv6 ND reserved
                                 * field.
                                 * Keep last for BUILD_ASSERT_DECL below. */
    /* NOTE(review): igmp_group_ip4 is marked "keep last", yet pad3 follows
     * it; the BUILD_ASSERT_DECL is not shown here -- confirm it accounts
     * for pad3. */
    ovs_be32 pad3;              /* Pad to 64 bits. */
};

miniflow
miniflow是压缩版的struct flow,包含两部分内容。第一部分是flowmap,用来记录和flow的对应关系:flowmap中的每一个bit对应struct flow中的一个uint64_t字段,bit为1表示该uint64_t的值保存在miniflow中(通常是非0值,但也可能是0,比如miniflow_extract会无条件保存dp_hash),bit为0表示flow中对应的uint64_t字段为0。第二部分是紧跟在flowmap后面的内存,由调用者按照flowmap中bit为1的个数 * 8字节申请,用来按顺序保存这些uint64_t的值。

前面说到flow结构体必须是8字节对齐的,就是为了和miniflow配合使用。比如struct flow大小为672字节时,共包含84个8字节,在miniflow中使用84个bit即可表示flow信息,一个bit对应flow的一个uint64_t。

struct flow是一个很大的结构体,前面提到它占用672字节空间,但是大部分字段都是0,是用不到的。如果flow中只有一个8字节包含非0值,则miniflow使用24字节(flowmap占用固定的16字节,加上flowmap后面保存的8字节内存)即可表示flow的全部有用信息,相比flow的672字节,大大节省了内存。

所以使用miniflow表示flow有如下两个好处

a. 使用miniflow可以节省内存
b. 如果只想遍历flow中的非0字段,可以通过miniflow直接找到这些字段,节省遍历时间

miniflow定义如下

/* Number of 64-bit words in struct flow.  The division is exact only when
 * sizeof(struct flow) is a multiple of 8. */
#define FLOW_U64S (sizeof(struct flow) / sizeof(uint64_t))

/* One bitmap unit.  MAP_T_BITS is the number of bits in a unit (64 where
 * unsigned long long is 8 bytes wide). */
typedef unsigned long long map_t;
#define MAP_T_BITS (sizeof(map_t) * CHAR_BIT)

/* Each bit stands for one 64-bit word of struct flow; FLOWMAP_UNITS is the
 * minimum number of map_t units needed to hold FLOW_U64S bits. */
#define FLOWMAP_UNITS DIV_ROUND_UP(FLOW_U64S, MAP_T_BITS)

/* Bitmap with one bit for each 64-bit word of struct flow. */
struct flowmap {
    map_t bits[FLOWMAP_UNITS];
};

/* Compressed form of struct flow: 'map' records which 64-bit words of the
 * flow are stored, and the stored words follow this struct in memory. */
struct miniflow {
    struct flowmap map;
    /* Followed by:
     *     uint64_t values[n];
     * where 'n' is miniflow_n_values(miniflow). */
};

函数

miniflow_extract
miniflow_extract用来从报文中提取flow信息,并保存到miniflow中。调用者应该保证dst指向的miniflow有足够的空间,能够容纳FLOW_U64S * 8字节的值。

/* Extracts flow fields from 'packet' and pushes them into the miniflow
 * 'dst'.
 *
 * Caller is responsible for initializing 'dst' with enough storage for
 * FLOW_U64S * 8 bytes.
 *
 * This is an excerpt: the '...' lines mark code that has been omitted, and
 * the end of the function is not shown. */
void
miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
{
    const struct pkt_metadata *md = &packet->md;
    const void *data = dp_packet_data(packet);
    size_t size = dp_packet_size(packet);
    ovs_be32 packet_type = packet->packet_type;
    uint64_t *values = miniflow_values(dst);
    struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,
                         values + FLOW_U64S };
    const char *frame;
    ovs_be16 dl_type = OVS_BE16_MAX;
    uint8_t nw_frag, nw_tos, nw_ttl, nw_proto;
    uint8_t *ct_nw_proto_p = NULL;
    ovs_be16 ct_tp_src = 0, ct_tp_dst = 0;
    ...
    ...
    /* Push metadata into the miniflow.  skb_priority and pkt_mark are
     * pushed only when at least one of them is non-zero. */
    if (md->skb_priority || md->pkt_mark) {
        miniflow_push_uint32(mf, skb_priority, md->skb_priority);
        miniflow_push_uint32(mf, pkt_mark, md->pkt_mark);
    }
    /* Push md->dp_hash (unconditionally, even when it is zero). */
    miniflow_push_uint32(mf, dp_hash, md->dp_hash);
    /* Push the packet's input port. */
    miniflow_push_uint32(mf, in_port, odp_to_u32(md->in_port.odp_port));
    ...
    ...
    /* Push the L2 fields into the miniflow. */
    /* Link layer. */
    ASSERT_SEQUENTIAL(dl_dst, dl_src);
    miniflow_push_macs(mf, dl_dst, data);

    /* VLAN */
    union flow_vlan_hdr vlans[FLOW_MAX_VLAN_HEADERS];
    size_t num_vlans = parse_vlan(&data, &size, vlans);

    dl_type = parse_ethertype(&data, &size);
    miniflow_push_be16(mf, dl_type, dl_type);
    miniflow_pad_to_64(mf, dl_type);
    if (num_vlans > 0) {
        miniflow_push_words_32(mf, vlans, vlans, num_vlans);
    }
    ...
    ...
    /* Push the L3 fields into the miniflow. */
    /* Push both source and destination address at once. */
    miniflow_push_words(mf, nw_src, &nh->ip_src, 1);
    ...
    ...
    /* Push the L4 fields into the miniflow (only the TCP case is shown). */
    if (OVS_LIKELY(nw_proto == IPPROTO_TCP)) {
        if (OVS_LIKELY(size >= TCP_HEADER_LEN)) {
            const struct tcp_header *tcp = data;

            miniflow_push_be32(mf, arp_tha.ea[2], 0);
            miniflow_push_be32(mf, tcp_flags,
                               TCP_FLAGS_BE32(tcp->tcp_ctl));
            miniflow_push_be16(mf, tp_src, tcp->tcp_src);
            miniflow_push_be16(mf, tp_dst, tcp->tcp_dst);
            miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
            miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
        }
    }

在上面将value保存到miniflow时,用到了几个辅助宏。比如下面的miniflow_push_uint32用来将一个32位的值保存到miniflow中FIELD对应的位置:它首先调用offsetof获取FIELD在flow中的偏移字节数。因为flow是8字节对齐的,所以一个四字节的成员变量要么位于某个8字节的起始位置,要么位于它的中间位置,即偏移对8取模的值只能是0或者4。然后调用miniflow_push_uint32_把值保存到对应的位置:取模为0时设置map中对应的bit为1并写入前4字节;取模为4时确认对应的bit已经为1,写入后4字节,并把data指针移到下一个uint64_t。

/* Stores the 32-bit VALUE that lives at byte offset OFS of struct flow into
 * the miniflow being built through the context MF (which provides 'data' and
 * 'end' members).
 *
 * Only offsets with OFS % 8 equal to 0 or 4 are handled; any other offset
 * stores nothing.  The first 4 bytes of a 64-bit word (OFS % 8 == 0) must be
 * pushed before its last 4 bytes (OFS % 8 == 4), because only the latter
 * advances MF.data to the next word.
 *
 * Comments inside the body use the block-comment form and every body line
 * ends with a continuation backslash: a '//' comment, or a line without the
 * backslash, would end the macro definition early. */
#define miniflow_push_uint32_(MF, OFS, VALUE)   \
    {                                           \
    MINIFLOW_ASSERT(MF.data < MF.end);          \
                                                \
    /* First 4 bytes of the word: mark the word in the map, then store. */ \
    if ((OFS) % 8 == 0) {                       \
        miniflow_set_map(MF, OFS / 8);          \
        *(uint32_t *)MF.data = VALUE;           \
    } else if ((OFS) % 8 == 4) {                \
        /* Last 4 bytes: the word's bit must already be set.  The word is \
         * complete after this store, so move on to the next one. */ \
        miniflow_assert_in_map(MF, OFS / 8);    \
        *((uint32_t *)MF.data + 1) = VALUE;     \
        MF.data++;                              \
    }                                           \
}

/* Pushes the 32-bit VALUE for member FIELD of struct flow; the byte offset
 * is taken from offsetof(struct flow, FIELD). */
#define miniflow_push_uint32(MF, FIELD, VALUE)                      \
    miniflow_push_uint32_(MF, offsetof(struct flow, FIELD), VALUE)

需要注意的是,一定要按照flow中成员的顺序依次保存到miniflow。

miniflow_expand
miniflow_expand用来将miniflow中的值恢复到flow结构体中。

/* Initializes 'dst' as a copy of 'src'.
 *
 * 'dst' is first zero-filled in full, then the values held by the miniflow
 * 'src' are OR-ed in via flow_union_with_miniflow(), so every word that
 * 'src' does not carry ends up as zero. */
void
miniflow_expand(const struct miniflow *src, struct flow *dst)
{
    memset(dst, 0, sizeof *dst);
    flow_union_with_miniflow(dst, src);
}

/* Perform a bitwise OR of miniflow 'src' flow data with the equivalent
 * fields in 'dst', storing the result in 'dst'.
 *
 * Delegates to flow_union_with_miniflow_subset(), passing src->map itself as
 * the subset, i.e. every word recorded in 'src' is merged. */
static inline void
flow_union_with_miniflow(struct flow *dst, const struct miniflow *src)
{
    flow_union_with_miniflow_subset(dst, src, src->map);
}

/* ORs the flow data of miniflow 'src' selected by 'subset' into the matching
 * 64-bit words of 'dst', leaving the result in 'dst'.  'subset' must be a
 * subset of 'src's map.
 *
 * NOTE(review): stored values are consumed one after another from the start
 * of 'src's value array, one per set bit of 'subset' -- confirm that callers
 * only pass subsets for which this positional pairing is intended. */
static inline void
flow_union_with_miniflow_subset(struct flow *dst, const struct miniflow *src,
                                struct flowmap subset)
{
    const uint64_t *next_value = miniflow_get_values(src);
    uint64_t *unit_words = (uint64_t *) dst;
    map_t unit_bits;

    /* Walk the flowmap one map_t unit at a time. */
    FLOWMAP_FOR_EACH_MAP (unit_bits, subset) {
        size_t bit;

        /* Visit every set bit of the current unit. */
        MAP_FOR_EACH_INDEX(bit, unit_bits) {
            unit_words[bit] |= *next_value;
            next_value++;
        }
        /* The next unit describes the following MAP_T_BITS words of 'dst'. */
        unit_words += MAP_T_BITS;
    }
}

也可参考:ovs分类器 flow和miniflow - 简书 (jianshu.com) 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值