1.datapath的底层数据结构关系
流表采用hash的方式排列存放,流表的hash头结点存储数据结构如下
该hash 桶的初始化函数alloc_buckets (),生成的数据格式可参考如下
2 流表创建流程分析
2.1 flow table数据结构
1> flow_table
流表结构, 每个 datapath 都有一个流表
2> table_instance
流表实例, 其中的 buckets 用来存放具体的 flow 条目,存储方式参见FlexArray
3> sw_flow
flow条目, 其中 key 表示报文的特征, 在进行匹配时, 便是从收到的报文中提取 key , 与flow 条目的 key进行比较
4> sw_flow_key
报文特征. 提取报文特征时,会提取每一层的特征.
5> mask_array
流表掩码集合。老版本 OVS 只支持 exact flow, 即报文特征必须和 flow 中描述完全相同才算匹配; 而在较新的版本中, 支持 wildcarded flow, 可以为 flow 中的特征添加掩码。最常见的例子: 可以设置 flow 条目中的源 IP 和 IP 掩码, 只要进行匹配的报文 IP 在掩码作用后的网段内, 就认为是通过匹配的。
6> sw_flow_mask
掩码条目。其中 refcount 表明有多少个 flow 正在关联它。
7> sw_flow_match
在匹配过程中使用的结构
2.2 创建流表
用户态通过netlink 进行datapath 流表更新的入口函数都定义在dp_flow_genl_ops中,流表创建的入口函数是ovs_flow_cmd_new 函数,代码分析如下:
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = sock_net(skb->sk);
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow *flow = NULL, *new_flow;
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
struct sw_flow_actions *acts;
struct sw_flow_match match;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
int error;
bool log = !a[OVS_FLOW_ATTR_PROBE];
/* Most of the time we need to allocate a new flow, do it before
* locking.
*/
new_flow = ovs_flow_alloc(); //分配存储空间
if (IS_ERR(new_flow)) {
error = PTR_ERR(new_flow);
goto error;
}
/* 创建一个match结构,并将其与空的key和mask关联起来 */
ovs_match_init(&match, &new_flow->key, false, &mask);
/* 解析用户下发的命令,用户的命令以netlink attribute的形式存放, 有key和mask两条属性序列 */
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto err_kfree_flow;
/* Extract flow identifier. */
error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
&new_flow->key, log);
if (error)
goto err_kfree_flow;
/* unmasked key is needed to match when ufid is not used. */
if (ovs_identifier_is_key(&new_flow->id))
match.key = new_flow->id.unmasked_key;
/* 将key & mask 拷贝到 new_flow->key
eg. 192.168.1.101 & 255.255.255.0 = 192.168.1.0
*/
ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);
/* Validate actions. */
error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
&new_flow->key, &acts, log);
if (error) {
OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
goto err_kfree_flow;
}
reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false,
ufid_flags);
if (IS_ERR(reply)) {
error = PTR_ERR(reply);
goto err_kfree_acts;
}
ovs_lock();
dp = get_dp(net, ovs_header->dp_ifindex);
if (unlikely(!dp)) {
error = -ENODEV;
goto err_unlock_ovs;
}
/* Check if this is a duplicate flow */
if (ovs_identifier_is_ufid(&new_flow->id))
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
if (!flow)
flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
if (likely(!flow)) {
rcu_assign_pointer(new_flow->sf_acts, acts);
/* 将新创建的flow插入dp的流表 */
error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
if (unlikely(error)) {
acts = NULL;
goto err_unlock_ovs;
}
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(new_flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW,
ufid_flags);
BUG_ON(error < 0);
}
ovs_unlock();
} else {
struct sw_flow_actions *old_acts;
/* Bail out if we're not allowed to modify an existing flow.
* We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
* because Generic Netlink treats the latter as a dump
* request. We also accept NLM_F_EXCL in case that bug ever
* gets fixed.
*/
if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
| NLM_F_EXCL))) {
error = -EEXIST;
goto err_unlock_ovs;
}
/* The flow identifier has to be the same for flow updates.
* Look for any overlapping flow.
*/
if (unlikely(!ovs_flow_cmp(flow, &match))) {
if (ovs_identifier_is_key(&flow->id))
flow = ovs_flow_tbl_lookup_exact(&dp->table,
&match);
else /* UFID matches but key is different */
flow = NULL;
if (!flow) {
error = -ENOENT;
goto err_unlock_ovs;
}
}
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(flow,
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
OVS_FLOW_CMD_NEW,
ufid_flags);
BUG_ON(error < 0);
}
ovs_unlock();
ovs_nla_free_flow_actions_rcu(old_acts);
ovs_flow_free(new_flow, false);
}
if (reply)
ovs_notify(&dp_flow_genl_family, reply, info);
return 0;
}
/*
 * ovs_nla_get_match - fill 'match' from netlink key and mask attributes.
 * @net:      network namespace, passed through to ovs_key_from_nlattrs().
 * @match:    destination; its key is always filled, its mask only when
 *            match->mask is non-NULL.
 * @nla_key:  nested attribute carrying the flow key.
 * @nla_mask: nested attribute carrying the mask, or NULL.  When NULL and
 *            match->mask is set, an exact-match mask is synthesized from
 *            a copy of @nla_key.
 * @log:      passed to the helpers and to match_validate().
 *
 * Returns 0 on success, -ENOMEM if the temporary mask copy cannot be
 * allocated, -EINVAL if match_validate() rejects the result, or the error
 * reported by one of the parsing helpers.
 */
int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
const struct nlattr *nla_key,
const struct nlattr *nla_mask,
bool log)
{
const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
struct nlattr *newmask = NULL;
u64 key_attrs = 0; /* bitmap of the key attributes */
u64 mask_attrs = 0; /* bitmap of the mask attributes */
int err;
/* Parse nla_key: the nested attributes go into the array 'a' and the
 * bitmap into key_attrs.  Nothing has been allocated yet, so the early
 * failures below can return directly.
 */
err = parse_flow_nlattrs(nla_key, a, &key_attrs, log);
if (err)
return err;
err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log);
if (err)
return err;
/* Fill match->key from the key_attrs bitmap and the attribute array 'a'. */
err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log);
if (err)
return err;
if (match->mask) {
if (!nla_mask) {
/* Create an exact match mask. We need to set to 0xff
* all the 'match->mask' fields that have been touched
* in 'match->key'. We cannot simply memset
* 'match->mask', because padding bytes and fields not
* specified in 'match->key' should be left to 0.
* Instead, we use a stream of netlink attributes,
* copied from 'key' and set to 0xff.
* ovs_key_from_nlattrs() will take care of filling
* 'match->mask' appropriately.
*/
newmask = kmemdup(nla_key,
nla_total_size(nla_len(nla_key)),
GFP_KERNEL);
if (!newmask)
return -ENOMEM;
mask_set_nlattr(newmask, 0xff);
/* The userspace does not send tunnel attributes that
* are 0, but we should not wildcard them nonetheless.
*/
if (match->key->tun_proto)
SW_FLOW_KEY_MEMSET_FIELD(match, tun_key,
0xff, true);
nla_mask = newmask;
}
/* Parse nla_mask: the nested attributes go into 'a' (reusing the array
 * that held the key attributes) and the bitmap into mask_attrs.  From
 * here on newmask may be allocated, so failures go through free_newmask.
 */
err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log);
if (err)
goto free_newmask;
/* Always match on tci. */
SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true);
SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true);
err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log);
if (err)
goto free_newmask;
/* Fill the mask side of 'match' from the mask_attrs bitmap and the
 * attribute array 'a'.
 */
err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true,
log);
if (err)
goto free_newmask;
}
if (!match_validate(match, key_attrs, mask_attrs, log))
err = -EINVAL;
/* Reached on success as well as on failure; newmask is NULL unless the
 * exact-match mask was synthesized above.
 */
free_newmask:
kfree(newmask);
return err;
}
/*
 * ovs_flow_tbl_insert - add a flow, together with its mask, to a flow table.
 * @table: table to insert into.
 * @flow:  flow to insert.
 * @mask:  mask associated with the flow.
 *
 * Returns 0 on success, or the error reported by flow_mask_insert(); in
 * the error case the flow is not inserted by key or by UFID.
 */
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
			const struct sw_flow_mask *mask)
{
	/* Per the original note, this puts the mask on table->mask_list. */
	int rc = flow_mask_insert(table, flow, mask);

	if (rc != 0)
		return rc;

	flow_key_insert(table, flow);

	/* Only flows identified by a UFID are also inserted by UFID. */
	if (!ovs_identifier_is_ufid(&flow->id))
		return 0;

	flow_ufid_insert(table, flow);
	return 0;
}
3 流表查找流程
一般的数据包在 Linux 网络协议中的流向为上图中的蓝色箭头流向:网卡eth0 收到数据包后判断报文走向,如果是本地报文把数据传送到用户态,如果是转发报文根据选路(二层交换或三层路由)把报文送到另一个网卡如eth1。当有 OVS 时,数据流向如红色所示:从网卡 eth0 收到报文后进入ovs 的端口,根据 key 值进行流表匹配,如果匹配成功执行流表对应的 action;如果失败通过upcall 送入用户态处理。
OVS 中, 内核模块 datapath 负责报文的处理和转发: 当它从一个接收端口(vport)收到报文后, 会提取报文中的字段, 查询流表(flow table)进行流匹配; 如果与其中一条 flow 匹配成功, 则执行 flow 中规定的动作(action), 如从另外某个 vport 转发, 这个过程如上图中的 Fast Path 所示; 如果没有匹配上任何一条 flow, 则将报文上送到用户空间, 如上图中的 Slow Path 所示。
3.1 收包处理流程
通过vport注册的回调函数netdev_frame_hook()-> netdev_port_receive()->ovs_vport_receive()处理接收报文,ovs_flow_key_extract()函数生成flow的key内容用以接下来进行流表匹配,最后调用ovs_dp_process_packet()函数进入真正的ovs数据包处理,代码流程如下
3.2 流表查找
流表查找主要是查表关键字的匹配,关键字数据结构如下,根据 skb 中的 Ethernet 帧生成 key 的函数为 ovs_flow_key_extract()
流表查询的入口函数ovs_flow_tbl_lookup_stats(),flow 的匹配策略是和流表中所有mask 和所有key 进行匹配处理,为了加速查询效率,在调用真正的流表查询函数flow_lookup()之前,对于mask 的查询采用了缓存机制,实现原理是首先查询缓存的mask_cache_entry,这些cache 是查询成功后形成的cache,并针对cache 采用分段查询的方式
mask 的缓存机制是后来为了提高查询效率加上的; 支持 mask cache 的版本在 2014 年 8 月 15 日又进行了一次优化, 所以在使用的时候建议直接选择优化后的版本。
在 4.19.136 内核代码的 datapath 实现中,还没有 mask 缓存机制,ovs_flow_tbl_lookup_stats 函数直接调用的就是 masked_flow_lookup,进行全流表的匹配,在阅读代码时需要特别注意。
/*
 * ovs_flow_tbl_lookup_stats - look a key up against every mask of a table.
 * @tbl:        flow table to search.
 * @key:        key to look up.
 * @n_mask_hit: out parameter; reset to 0 and then incremented once for
 *              each mask that is tried, including the one that matches.
 *
 * Walks tbl->mask_list in order and returns the first flow that
 * masked_flow_lookup() finds, or NULL when no mask yields a flow.
 */
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
					  const struct sw_flow_key *key,
					  u32 *n_mask_hit)
{
	struct table_instance *inst = rcu_dereference_ovsl(tbl->ti);
	struct sw_flow_mask *cur;
	struct sw_flow *hit = NULL;

	*n_mask_hit = 0;

	list_for_each_entry_rcu(cur, &tbl->mask_list, list) {
		*n_mask_hit += 1;
		hit = masked_flow_lookup(inst, key, cur);
		if (hit != NULL)
			break;	/* Found */
	}

	/* Still NULL here when the list was empty or nothing matched. */
	return hit;
}
一个数据包到达内核的 datapath 后,如果能匹配到流表,则根据对应的 actions 进行处理;如果未匹配到流表,则使用 upcall 机制上送用户态 ovs 进程,进行慢速路径处理。接下来的章节会分析 actions 处理流程和 upcall 流程。