OVS datapath流表结构及匹配过程

最新推荐文章于 2023-04-10 21:22:14 发布

majieyue

最新推荐文章于 2023-04-10 21:22:14 发布

阅读量8.4k

点赞数

分类专栏： OpenvSwitch

本文链接：https://blog.csdn.net/majieyue/article/details/52988122

版权

OpenvSwitch 专栏收录该内容

14 篇文章 2 订阅

订阅专栏

datapath流表的查找函数是ovs_flow_tbl_lookup_stats，在此之前，先看下datapath组织流表的方式。
最新2.6的ovs流表，已经不是最早单纯的精确匹配了，而是一种精确匹配+带掩码匹配合并在一起的方式，叫做megaflow，目的是减少datapath里精确流表的条目数。但在我看来，这种方式更多是一厢情愿：在大规模生产环境下，精确流表的条目数并不会因为用了megaflow就变得可控，反而增加了datapath匹配时的复杂度，降低了性能。使用精确匹配、掩码匹配还是hybrid的方式，应该留给运维人员去选择。

/*
 * Top-level flow table state consulted by ovs_flow_tbl_lookup_stats().
 * That lookup path reads ->mask_array, ->ti and ->mask_cache; the other
 * fields are not referenced by the lookup code shown in this file.
 */
struct flow_table {
        /* RCU-protected table instance that masked lookups search. */
        struct table_instance __rcu *ti;
        /* NOTE(review): presumably a second instance keyed by unique flow
         * ID (UFID); not used on the lookup path here -- confirm. */
        struct table_instance __rcu *ufid_ti;
        /* Per-CPU array of cache entries, accessed via this_cpu_ptr(). */
        struct mask_cache_entry __percpu *mask_cache;
        /* RCU-protected array of the masks tried during lookup. */
        struct mask_array __rcu *mask_array;
        /* NOTE(review): presumably the time of the last rehash -- confirm. */
        unsigned long last_rehash;
        /* NOTE(review): presumably the number of flows reachable through
         * 'ti' and 'ufid_ti' respectively -- confirm against the insert
         * and remove paths. */
        unsigned int count;
        unsigned int ufid_count;
};

/*
 * One hash table of flows: an array of buckets, each bucket being a list
 * that masked_flow_lookup() walks.
 */
struct table_instance {
        /* Bucket storage; handed to find_bucket() together with a hash. */
        struct flex_array *buckets;
        /* NOTE(review): presumably the number of buckets in 'buckets'. */
        unsigned int n_buckets;
        /* NOTE(review): presumably used to defer freeing until after an
         * RCU grace period -- confirm. */
        struct rcu_head rcu;
        /* Selects which flow_table.node[] link of each flow is used when
         * walking a bucket of this instance. */
        int node_ver;
        /* NOTE(review): presumably mixed into bucket selection by
         * find_bucket() -- confirm. */
        u32 hash_seed;
        /* NOTE(review): meaning not visible from this file; looks like a
         * flag controlling whether flows survive table teardown. */
        bool keep_flows;
};

struct table_instance *ti 是传统的桶+链表的哈希表结构，其中桶基于struct flex_array的结构体，该结构体是linux内核定义的，本文先不去分析

/*
 * This is meant to replace cases where an array-like
 * structure has gotten too big to fit into kmalloc()
 * and the developer is getting tempted to use
 * vmalloc().
 *
 * The bookkeeping fields and the padding share storage through an
 * anonymous union, so the structure occupies at least
 * FLEX_ARRAY_BASE_SIZE bytes.
 */

struct flex_array {
        union {
                struct {
                        /* NOTE(review): field meanings below are inferred
                         * from their names only -- confirm against the
                         * flex_array implementation. */
                        int element_size;       /* presumably bytes per element */
                        int total_nr_elements;  /* presumably array capacity */
                        int elems_per_part;     /* presumably elements per part */
                        struct reciprocal_value reciprocal_elems;
                        /* Flexible array of pointers to the parts. */
                        struct flex_array_part *parts[];
                };
                /*
                 * This little trick makes sure that
                 * sizeof(flex_array) == PAGE_SIZE
                 * (assumes FLEX_ARRAY_BASE_SIZE is PAGE_SIZE; it is
                 * defined outside this view).
                 */
                char padding[FLEX_ARRAY_BASE_SIZE];
        };
};

除了struct table_instance之外，struct flow_table还增加了缓存机制，这张表是一张精确匹配和掩码匹配的合体，struct mask_array *mask_array是一个掩码结构体struct sw_flow_mask的数组，可以看到struct sw_flow_mask分为两部分：掩码的范围struct sw_flow_key_range以及流的key struct sw_flow_key。

/*
 * Flow key: the packet metadata and header fields a flow is matched on.
 *
 * The lookup code passes a key through ovs_flow_mask_key() and then
 * hashes and compares the result over a struct sw_flow_key_range.
 * ovs_flow_tbl_lookup_stats() also reads ->recirc_id to rehash skb_hash.
 *
 * The structure is aligned to BITS_PER_LONG/8 bytes so that comparisons
 * can be done as longs (see the trailing comment).
 */
struct sw_flow_key {
        /* NOTE(review): presumably raw tunnel option bytes, with
         * tun_opts_len giving the number of valid bytes -- confirm. */
        u8 tun_opts[255];
        u8 tun_opts_len;
        struct ip_tunnel_key tun_key;  /* Encapsulating tunnel key. */
        struct {
                u32     priority;       /* Packet QoS priority. */
                u32     skb_mark;       /* SKB mark. */
                u16     in_port;        /* Input switch port (or DP_MAX_PORTS). */
        } __packed phy; /* Safe when right after 'tun_key'. */
        u8 tun_proto;                   /* Protocol of encapsulating tunnel. */
        u32 ovs_flow_hash;              /* Datapath computed hash value.  */
        u32 recirc_id;                  /* Recirculation ID.  */
        struct {
                u8     src[ETH_ALEN];   /* Ethernet source address. */
                u8     dst[ETH_ALEN];   /* Ethernet destination address. */
                __be16 tci;             /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
                __be16 type;            /* Ethernet frame type. */
        } eth;
        /* Either the MPLS view or the IP view is used; they share storage. */
        union {
                struct {
                        __be32 top_lse; /* top label stack entry */
                } mpls;
                struct {
                        u8     proto;   /* IP protocol or lower 8 bits of ARP opcode. */
                        u8     tos;         /* IP ToS. */
                        u8     ttl;         /* IP TTL/hop limit. */
                        u8     frag;    /* One of OVS_FRAG_TYPE_*. */
                } ip;
        };
        struct {
                __be16 src;             /* TCP/UDP/SCTP source port. */
                __be16 dst;             /* TCP/UDP/SCTP destination port. */
                __be16 flags;           /* TCP flags. */
        } tp;
        /* IPv4/ARP and IPv6/ND address fields share storage. */
        union {
                struct {
                        struct {
                                __be32 src;     /* IP source address. */
                                __be32 dst;     /* IP destination address. */
                        } addr;
                        struct {
                                u8 sha[ETH_ALEN];       /* ARP source hardware address. */
                                u8 tha[ETH_ALEN];       /* ARP target hardware address. */
                        } arp;
                } ipv4;
                struct {
                        struct {
                                struct in6_addr src;    /* IPv6 source address. */
                                struct in6_addr dst;    /* IPv6 destination address. */
                        } addr;
                        __be32 label;                   /* IPv6 flow label. */
                        struct {
                                struct in6_addr target; /* ND target address. */
                                u8 sll[ETH_ALEN];       /* ND source link layer address. */
                                u8 tll[ETH_ALEN];       /* ND target link layer address. */
                        } nd;
                } ipv6;
        };
        struct {
                /* Connection tracking fields. */
                u16 zone;
                u32 mark;
                u8 state;
                struct ovs_key_ct_labels labels;
        } ct;

} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */

/*
 * Range attached to a mask; masked_flow_lookup() passes it to flow_hash()
 * and flow_cmp_masked_key() together with the masked key.
 * NOTE(review): start/end are presumably byte offsets into struct
 * sw_flow_key delimiting the part of the key that is hashed and
 * compared -- confirm against those helpers.
 */
struct sw_flow_key_range {
        unsigned short int start;
        unsigned short int end;
};

/*
 * Array of masks tried by flow_lookup().  flow_lookup() iterates over
 * slots [0, max) and skips slots whose pointer is NULL.
 */
struct mask_array {
        /* NOTE(review): presumably used for RCU-deferred freeing. */
        struct rcu_head rcu;
        /* 'max' bounds the slots scanned by flow_lookup().
         * NOTE(review): 'count' is presumably the number of slots in
         * use -- confirm. */
        int count, max;
        /* Flexible array of RCU-protected mask pointers; may hold NULLs. */
        struct sw_flow_mask __rcu *masks[];
};

struct mask_cache_entry结构体包含了hash值和mask_array数组的索引的key pair，在struct flow_table中，struct mask_cache_entry __percpu *mask_cache实际是一个cuckoo hash表，对应有256条表项，总共有4组key用于计算哈希值（这里用4个不同的key值，同一个哈希函数来计算哈希值，而不是像cuckoo哈希要求的那样提供四个不同的哈希函数，但效果基本是一样的，而对于实际的生产环境，需要更大的cuckoo哈希表项）。在cuckoo哈希查找时，每次key值都会右移8个bit，基于key计算cuckoo hash的索引，即mask_cache的索引，并调用flow_lookup查看flow是否在table_instance里，如果cache未命中，则做一次全量查找，即遍历mask_array查找flow是否在表项中（代价非常大）。

/*
 * ovs_flow_tbl_lookup_stats - find the flow matching 'key', using the
 * per-CPU mask cache to choose which mask to try first.
 *
 * The mask cache maps a packet hash to the index of a probable mask.  It
 * is not tightly coupled to the mask list, so an update to the mask list
 * can leave an inconsistent entry behind; a cached index is therefore
 * only a starting hint for flow_lookup().
 *
 * The cache is per CPU and is probed in MC_HASH_SEGS segments: each probe
 * indexes with the low bits of the hash, and on a collision the hash is
 * shifted right by MC_HASH_SHIFT to pick the slot in the next segment.
 *
 * @tbl:        flow table to search.
 * @key:        key of the packet being looked up.
 * @skb_hash:   packet hash; a value of 0 bypasses the cache entirely.
 * @n_mask_hit: out parameter, reset to 0 here and then handed to
 *              flow_lookup().
 *
 * Returns whatever flow_lookup() returns: the matching flow, or NULL.
 */
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
                                          const struct sw_flow_key *key,
                                          u32 skb_hash,
                                          u32 *n_mask_hit)
{
        struct mask_array *ma = rcu_dereference(tbl->mask_array);
        struct table_instance *ti = rcu_dereference(tbl->ti);
        struct mask_cache_entry *cache, *victim = NULL;
        struct sw_flow *flow;
        u32 probe;
        int seg;

        *n_mask_hit = 0;

        /* No packet hash: the cache cannot be indexed, so search directly
         * starting from mask 0. */
        if (unlikely(!skb_hash)) {
                u32 first_mask = 0;

                return flow_lookup(tbl, ti, ma, key, n_mask_hit, &first_mask);
        }

        /* Pre and post recirculation flows usually carry the same skb_hash.
         * Fold 'recirc_id' into the hash so they do not collide. */
        if (key->recirc_id)
                skb_hash = jhash_1word(skb_hash, key->recirc_id);

        cache = this_cpu_ptr(tbl->mask_cache);

        /* Probe one slot per segment, consuming MC_HASH_SHIFT bits of the
         * hash on every step. */
        for (seg = 0, probe = skb_hash; seg < MC_HASH_SEGS;
             seg++, probe >>= MC_HASH_SHIFT) {
                int slot_idx = probe & (MC_HASH_ENTRIES - 1);
                struct mask_cache_entry *slot = &cache[slot_idx];

                if (slot->skb_hash != skb_hash) {
                        /* Not ours: remember the slot holding the smallest
                         * hash as the replacement candidate. */
                        if (!victim || slot->skb_hash < victim->skb_hash)
                                victim = slot;
                        continue;
                }

                /* Cache hit: start from the cached mask index.  If no flow
                 * is found, invalidate the entry. */
                flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
                                   &slot->mask_index);
                if (!flow)
                        slot->skb_hash = 0;
                return flow;
        }

        /* Cache miss: do a full lookup and, on success, claim the
         * candidate slot for this hash. */
        flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &victim->mask_index);
        if (flow)
                victim->skb_hash = skb_hash;

        return flow;
}

flow_lookup会先尝试mask_cache_entry缓存的索引所对应的mask_array表项，如果没有找到，再全量遍历mask_array的其余表项来查找流

/*
 * flow_lookup - search the flow table with every mask in 'ma', trying the
 * mask at *index first.
 *
 * @tbl:        flow table (not referenced in the body).
 * @ti:         table instance to search.
 * @ma:         mask array; slots may be NULL.
 * @key:        unmasked key of the packet.
 * @n_mask_hit: counter passed through to masked_flow_lookup().
 * @index:      in: index of the mask to try first; out: when the match
 *              comes from a different mask, set to that mask's index.
 *              Left untouched if the hinted mask matches or nothing
 *              matches.
 *
 * Returns the matching flow, or NULL if no mask yields one.
 */
static struct sw_flow *flow_lookup(struct flow_table *tbl,
                                   struct table_instance *ti,
                                   const struct mask_array *ma,
                                   const struct sw_flow_key *key,
                                   u32 *n_mask_hit,
                                   u32 *index)
{
        struct sw_flow_mask *mask;
        struct sw_flow *found;
        int slot;

        /* Fast path: the hinted mask, provided the hint is in range and
         * the slot is populated. */
        if (*index < ma->max) {
                mask = rcu_dereference_ovsl(ma->masks[*index]);
                if (mask) {
                        found = masked_flow_lookup(ti, key, mask, n_mask_hit);
                        if (found)
                                return found;
                }
        }

        /* Slow path: every other populated slot, in order. */
        for (slot = 0; slot < ma->max; slot++) {
                if (slot != *index) {
                        mask = rcu_dereference_ovsl(ma->masks[slot]);
                        if (mask) {
                                found = masked_flow_lookup(ti, key, mask,
                                                           n_mask_hit);
                                if (found) {
                                        /* Report the winning mask back. */
                                        *index = slot;
                                        return found;
                                }
                        }
                }
        }

        return NULL;
}

最后masked_flow_lookup基于掩码查找流：一条流要匹配，除了哈希值和掩码后的流key要相同之外，流所使用的掩码也必须与当前掩码相同

/*
 * masked_flow_lookup - look for a flow in 'ti' that matches 'unmasked'
 * under one specific mask.
 *
 * The key is run through ovs_flow_mask_key() with 'mask', the result is
 * hashed over mask->range, and the bucket chosen by that hash is walked.
 * A flow is a match only if it uses this very mask, has the same stored
 * hash, and flow_cmp_masked_key() accepts it for the masked key.
 *
 * @ti:         table instance to search.
 * @unmasked:   key as extracted from the packet.
 * @mask:       mask to apply before hashing and comparing.
 * @n_mask_hit: incremented once per call.
 *
 * Returns the matching flow, or NULL if the bucket holds none.
 */
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
                                          const struct sw_flow_key *unmasked,
                                          const struct sw_flow_mask *mask,
                                          u32 *n_mask_hit)
{
        struct sw_flow_key masked_key;
        struct hlist_head *bucket;
        struct sw_flow *candidate;
        u32 key_hash;

        ovs_flow_mask_key(&masked_key, unmasked, false, mask);
        key_hash = flow_hash(&masked_key, &mask->range);
        bucket = find_bucket(ti, key_hash);
        (*n_mask_hit)++;

        hlist_for_each_entry_rcu(candidate, bucket, flow_table.node[ti->node_ver]) {
                /* Cheap rejections first, full key comparison last. */
                if (candidate->mask != mask)
                        continue;
                if (candidate->flow_table.hash != key_hash)
                        continue;
                if (!flow_cmp_masked_key(candidate, &masked_key, &mask->range))
                        continue;
                return candidate;
        }

        return NULL;
}