datapath流表的查找函数是ovs_flow_tbl_lookup_stats,在此之前,先看下datapath组织流表的方式。
最新2.6的ovs流表,已经不是最早单纯的精确匹配了,而是一种精确匹配+带掩码匹配合并在一起的方式,叫做megaflow,目的是减少datapath里精确流表的条目数。但在我看来,这种方式只是在yy,在大规模生产环境下,不会因为用了megaflow精确流表就会变得可控,反而增加了datapath匹配时的复杂度,降低了性能,使用精确匹配还是掩码匹配还是hybrid的方式,应该留给运维人员去选择。
/*
 * Top-level datapath flow table: bundles the hash-table instances, the
 * per-CPU mask cache and the mask array consulted during lookup.
 */
struct flow_table {
/* RCU-protected table instance searched by ovs_flow_tbl_lookup_stats(). */
struct table_instance __rcu *ti;
/* Second RCU-protected instance; presumably indexed by unique flow ID
 * (UFID) -- TODO confirm against the insert path. */
struct table_instance __rcu *ufid_ti;
/* Per-CPU array of cache entries pairing an skb hash with a mask index. */
struct mask_cache_entry __percpu *mask_cache;
/* RCU-protected array of masks tried in turn by flow_lookup(). */
struct mask_array __rcu *mask_array;
/* Presumably the time of the last rehash -- TODO confirm units. */
unsigned long last_rehash;
/* Presumably the number of flows held in 'ti' -- TODO confirm. */
unsigned int count;
/* Presumably the number of flows held in 'ufid_ti' -- TODO confirm. */
unsigned int ufid_count;
};
/*
 * One instance of the bucket-based hash table that stores flows.
 */
struct table_instance {
/* Bucket storage, kept in a flex_array. */
struct flex_array *buckets;
/* Number of buckets in 'buckets'. */
unsigned int n_buckets;
/* RCU callback head; presumably used for deferred freeing -- TODO confirm. */
struct rcu_head rcu;
/* Selects which flow_table.node[] list linkage of a flow this instance
 * walks (see masked_flow_lookup()). */
int node_ver;
/* Presumably the seed mixed into bucket selection -- TODO confirm. */
u32 hash_seed;
/* NOTE(review): looks like a flag to keep flows when the instance is
 * destroyed -- confirm against the destroy path. */
bool keep_flows;
};
struct table_instance *ti 是传统的桶+链表的哈希表结构,其中桶基于struct flex_array
的结构体,该结构体是linux内核定义的,本文先不去分析
/*
* This is meant to replace cases where an array-like
* structure has gotten too big to fit into kmalloc()
* and the developer is getting tempted to use
* vmalloc().
*/
/*
 * Array-like container header. The union overlays the bookkeeping fields
 * with a padding buffer so the whole structure has a fixed size.
 */
struct flex_array {
union {
struct {
/* Size of one element; presumably in bytes -- TODO confirm. */
int element_size;
/* Total number of elements the array is sized for. */
int total_nr_elements;
/* Number of elements stored in each part. */
int elems_per_part;
/* Presumably a precomputed reciprocal of elems_per_part used to avoid
 * divisions -- TODO confirm. */
struct reciprocal_value reciprocal_elems;
/* Pointers to the parts that hold the elements. */
struct flex_array_part *parts[];
};
/*
 * This little trick makes sure that
 * sizeof(flex_array) == PAGE_SIZE
 */
char padding[FLEX_ARRAY_BASE_SIZE];
};
};
除了struct table_instance
之外,struct flow_table
还增加了缓存机制,这张表是一张精确匹配和掩码匹配的合体,struct mask_array *mask_array
是一个掩码结构体struct sw_flow_mask
的数组,可以看到struct sw_flow_mask
分为两部分:掩码的范围struct sw_flow_key_range
以及流的key struct sw_flow_key
/*
 * Flow key: the set of packet and metadata fields a flow is matched on.
 * masked_flow_lookup() masks a copy of this key before hashing and
 * comparing it. The structure is aligned to the size of a long so that
 * comparisons can be done a long at a time.
 */
struct sw_flow_key {
/* Tunnel options and their length; presumably in bytes -- TODO confirm. */
u8 tun_opts[255];
u8 tun_opts_len;
struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
u32 skb_mark; /* SKB mark. */
u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
} __packed phy; /* Safe when right after 'tun_key'. */
u8 tun_proto; /* Protocol of encapsulating tunnel. */
u32 ovs_flow_hash; /* Datapath computed hash value. */
u32 recirc_id; /* Recirculation ID. */
struct {
u8 src[ETH_ALEN]; /* Ethernet source address. */
u8 dst[ETH_ALEN]; /* Ethernet destination address. */
__be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
__be16 type; /* Ethernet frame type. */
} eth;
/* MPLS and IP header fields share storage. */
union {
struct {
__be32 top_lse; /* top label stack entry */
} mpls;
struct {
u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
u8 tos; /* IP ToS. */
u8 ttl; /* IP TTL/hop limit. */
u8 frag; /* One of OVS_FRAG_TYPE_*. */
} ip;
};
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
__be16 flags; /* TCP flags. */
} tp;
/* IPv4 (with ARP) and IPv6 (with ND) fields share storage. */
union {
struct {
struct {
__be32 src; /* IP source address. */
__be32 dst; /* IP destination address. */
} addr;
struct {
u8 sha[ETH_ALEN]; /* ARP source hardware address. */
u8 tha[ETH_ALEN]; /* ARP target hardware address. */
} arp;
} ipv4;
struct {
struct {
struct in6_addr src; /* IPv6 source address. */
struct in6_addr dst; /* IPv6 destination address. */
} addr;
__be32 label; /* IPv6 flow label. */
struct {
struct in6_addr target; /* ND target address. */
u8 sll[ETH_ALEN]; /* ND source link layer address. */
u8 tll[ETH_ALEN]; /* ND target link layer address. */
} nd;
} ipv6;
};
struct {
/* Connection tracking fields. */
u16 zone;
u32 mark;
u8 state;
struct ovs_key_ct_labels labels;
} ct;
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
/*
 * Range covered by a mask. It is passed (as mask->range) to flow_hash()
 * and flow_cmp_masked_key() during lookup.
 * NOTE(review): start/end are presumably byte offsets into
 * struct sw_flow_key -- confirm against flow_hash().
 */
struct sw_flow_key_range {
unsigned short int start;
unsigned short int end;
};
/*
 * Array of flow masks. flow_lookup() walks slots [0, max) and skips
 * slots whose pointer is NULL.
 */
struct mask_array {
/* RCU callback head; presumably used for deferred freeing -- TODO confirm. */
struct rcu_head rcu;
/* 'max' bounds the slots scanned by flow_lookup(); 'count' is presumably
 * the number of slots currently in use -- TODO confirm. */
int count, max;
/* RCU-protected mask pointers (flexible array member). */
struct sw_flow_mask __rcu *masks[];
};
struct mask_cache_entry
结构体包含了hash值和mask_array数组的索引的key pair,在struct flow_table
中,struct mask_cache_entry __percpu *mask_cache
实际是一个cuckoo hash表,对应有256条表项,总共有4组key用于计算哈希值(这里用4个不同的key值,同一个哈希函数来计算哈希值,而不是像cuckoo哈希要求的那样提供四个不同的哈希函数,但效果基本是一样的,而对于实际的生产环境,需要更大的cuckoo哈希表项)。在cuckoo哈希查找时,每次key值都会右移8个bit,基于key计算cuckoo hash的索引,即mask_cache的索引,并调用flow_lookup
查看flow是否在table_instance里,如果cache未命中,则做一次全量查找,即遍历mask_array
查找flow是否在表项中(代价非常大)。
/*
 * mask_cache maps a flow to its probable mask. This cache is not tightly
 * coupled: updates to the mask list can leave inconsistent entries in the
 * mask cache.
 * It is a per-CPU cache divided into MC_HASH_SEGS segments.
 * In case of a hash collision the entry is hashed into the next segment.
 */
/*
 * Look up the flow matching 'key' in 'tbl', using this CPU's mask cache
 * to choose which mask flow_lookup() tries first.
 *
 * @tbl:        flow table to search.
 * @key:        unmasked key of the packet.
 * @skb_hash:   packet hash used to index the mask cache; a value of 0
 *              bypasses the cache entirely.
 * @n_mask_hit: reset to 0 here, then passed on to flow_lookup().
 *
 * Returns the matching flow, or NULL when flow_lookup() finds none.
 */
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
					  const struct sw_flow_key *key,
					  u32 skb_hash,
					  u32 *n_mask_hit)
{
	struct mask_array *ma = rcu_dereference(tbl->mask_array);
	struct table_instance *ti = rcu_dereference(tbl->ti);
	struct mask_cache_entry *cache, *victim = NULL;
	struct sw_flow *found;
	u32 probe;
	int seg;

	*n_mask_hit = 0;

	/* No usable hash: skip the cache and start from mask slot 0. */
	if (unlikely(!skb_hash)) {
		u32 first_mask = 0;

		return flow_lookup(tbl, ti, ma, key, n_mask_hit, &first_mask);
	}

	/* Flows before and after recirculation usually carry the same
	 * skb_hash; mix in 'recirc_id' so they do not collide in the cache.
	 */
	if (key->recirc_id)
		skb_hash = jhash_1word(skb_hash, key->recirc_id);

	probe = skb_hash;
	cache = this_cpu_ptr(tbl->mask_cache);

	/* Probe one entry per segment; each segment consumes the next
	 * MC_HASH_SHIFT bits of the hash.
	 */
	for (seg = 0; seg < MC_HASH_SEGS; seg++, probe >>= MC_HASH_SHIFT) {
		int slot_idx = probe & (MC_HASH_ENTRIES - 1);
		struct mask_cache_entry *slot = &cache[slot_idx];

		if (slot->skb_hash == skb_hash) {
			/* Cached hash matches: try the remembered mask first. */
			found = flow_lookup(tbl, ti, ma, key, n_mask_hit,
					    &slot->mask_index);
			if (!found)
				slot->skb_hash = 0;	/* Invalidate the entry. */
			return found;
		}

		/* Remember the entry with the smallest skb_hash as the one
		 * to overwrite on a miss.
		 */
		if (!victim || slot->skb_hash < victim->skb_hash)
			victim = slot;
	}

	/* Cache miss: do a full lookup and record the result on success. */
	found = flow_lookup(tbl, ti, ma, key, n_mask_hit, &victim->mask_index);
	if (found)
		victim->skb_hash = skb_hash;

	return found;
}
flow_lookup
会基于mask_cache_entry
缓存的mask_array
表项或者通过全量遍历mask_array
的表项来查找流
/*
 * Search the flow table through every mask in 'ma'.
 *
 * The mask at slot '*index' is tried first (when that slot is below
 * ma->max and non-NULL). If it does not yield a flow, the remaining
 * slots are scanned in order. When the scan finds a flow, '*index' is
 * updated to the slot that matched.
 *
 * @tbl:        not referenced by this function body.
 * @ti:         table instance handed to masked_flow_lookup().
 * @ma:         mask array to iterate over.
 * @key:        unmasked key of the packet.
 * @n_mask_hit: counter forwarded to masked_flow_lookup().
 * @index:      in: slot to try first; out: slot that matched in the scan.
 *
 * Returns the matching flow, or NULL if no mask produced a match.
 */
static struct sw_flow *flow_lookup(struct flow_table *tbl,
				   struct table_instance *ti,
				   const struct mask_array *ma,
				   const struct sw_flow_key *key,
				   u32 *n_mask_hit,
				   u32 *index)
{
	struct sw_flow_mask *candidate;
	struct sw_flow *hit;
	int pos;

	/* Preferred slot first. */
	if (*index < ma->max) {
		candidate = rcu_dereference_ovsl(ma->masks[*index]);
		hit = candidate ?
		      masked_flow_lookup(ti, key, candidate, n_mask_hit) : NULL;
		if (hit)
			return hit;
	}

	/* Fall back to every other slot, skipping the one already tried. */
	for (pos = 0; pos < ma->max; pos++) {
		if (pos == *index)
			continue;

		candidate = rcu_dereference_ovsl(ma->masks[pos]);
		if (candidate) {
			hit = masked_flow_lookup(ti, key, candidate,
						 n_mask_hit);
			if (hit) {
				*index = pos;	/* Remember the matching slot. */
				return hit;
			}
		}
	}

	return NULL;
}
最后masked_flow_lookup
基于掩码查找流,除了哈希值要相同,流的key要相同,掩码也必须相同
/*
 * Look up a flow in 'ti' using a single mask.
 *
 * The unmasked key is masked with 'mask', hashed over mask->range, and the
 * resulting bucket is walked. A flow matches only if it uses this exact
 * mask, has the same stored hash, and its masked key compares equal.
 *
 * @ti:         table instance whose buckets are searched.
 * @unmasked:   key of the packet before masking.
 * @mask:       mask to apply.
 * @n_mask_hit: incremented once per call.
 *
 * Returns the matching flow, or NULL if the bucket holds none.
 */
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
					  const struct sw_flow_key *unmasked,
					  const struct sw_flow_mask *mask,
					  u32 *n_mask_hit)
{
	struct sw_flow_key masked_key;
	struct hlist_head *bucket;
	struct sw_flow *entry;
	u32 key_hash;

	ovs_flow_mask_key(&masked_key, unmasked, false, mask);
	key_hash = flow_hash(&masked_key, &mask->range);
	bucket = find_bucket(ti, key_hash);
	*n_mask_hit += 1;

	hlist_for_each_entry_rcu(entry, bucket, flow_table.node[ti->node_ver]) {
		if (entry->mask != mask)
			continue;
		if (entry->flow_table.hash != key_hash)
			continue;
		if (flow_cmp_masked_key(entry, &masked_key, &mask->range))
			return entry;
	}

	return NULL;
}
下图是从http://vinllen.com/中找到的,可以参考下整个流匹配的流程