当ovs收到一条连接的第一个包时,由于datapath没有缓存流表信息,因此需要upcall到用户态,根据用户态流表规则,生成一条缓存流表,然后将缓存流表提交到datapath,后续该连接的消息包就可以直接根据datapath的缓存流表直接完成转发,本文主要描述内核态ovs的upcall的整体处理过程。
ovs_vport_receive
内核datapath收到一个消息包时,进入ovs_vport_receive处理,首先将skb流的关键信息填充到key里,然后根据key通过ovs_dp_process_packet查表,如果有对应的缓存流表信息,则直接完成转发, 没有就触发upcall流程,其中key里主要包含消息流的L2、L3、L4以及CT连接等关键信息。
/*
 * ovs_vport_receive - entry point for a packet received on an OVS vport.
 *
 * Extracts the flow key from @skb (L2/L3/L4 plus conntrack fields), then
 * hands the packet to the datapath flow-table lookup; a lookup miss there
 * triggers the upcall path to userspace.
 *
 * @vport:    virtual port the packet arrived on.
 * @skb:      received packet; freed here on key-extraction failure.
 * @tun_info: encapsulating tunnel metadata, or NULL.
 *
 * Returns 0 on success or a negative errno from key extraction.
 */
int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
                      const struct ip_tunnel_info *tun_info)
{
    struct sw_flow_key key;
    int error;

    OVS_CB(skb)->input_vport = vport;
    OVS_CB(skb)->mru = 0;
    /* Packet is crossing a network-namespace boundary: scrub namespace
     * -local skb state but keep the mark; tunnel metadata from the other
     * namespace is no longer meaningful, so drop it. */
    if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
        u32 mark;

        mark = skb->mark;
        skb_scrub_packet(skb, true);
        skb->mark = mark;
        tun_info = NULL;
    }
    /* Extract flow from 'skb' into 'key'. */
    error = ovs_flow_key_extract(tun_info, skb, &key);
    if (unlikely(error)) {
        kfree_skb(skb);
        return error;
    }
    /* Flow-table lookup and forwarding; when no cached flow matches,
     * this path queues an upcall to userspace. */
    ovs_dp_process_packet(skb, &key);
    return 0;
}
/*
 * sw_flow_key - the key the datapath flow table matches on.
 *
 * Filled in by ovs_flow_key_extract() from a received skb.  Covers tunnel
 * metadata, physical-input fields, L2 (Ethernet/VLAN/MPLS), L3
 * (IPv4/IPv6/ARP/ND), L4 ports/flags, and conntrack state.
 */
struct sw_flow_key {
    u8 tun_opts[255];
    u8 tun_opts_len;
    struct ip_tunnel_key tun_key;   /* Encapsulating tunnel key. */
    struct {
        u32 priority;               /* Packet QoS priority. */
        u32 skb_mark;               /* SKB mark. */
        u16 in_port;                /* Input switch port (or DP_MAX_PORTS). */
    } __packed phy;                 /* Safe when right after 'tun_key'. */
    u8 tun_proto;                   /* Protocol of encapsulating tunnel. */
    u32 ovs_flow_hash;              /* Datapath computed hash value. */
    u32 recirc_id;                  /* Recirculation ID. */
    struct {
        u8 src[ETH_ALEN];           /* Ethernet source address. */
        u8 dst[ETH_ALEN];           /* Ethernet destination address. */
        struct vlan_head vlan;
        struct vlan_head cvlan;
        __be16 type;                /* Ethernet frame type. */
    } eth;
    union {
        struct {
            __be32 top_lse;         /* top label stack entry */
        } mpls;
        struct {
            u8 proto;               /* IP protocol or lower 8 bits of ARP opcode. */
            u8 tos;                 /* IP ToS. */
            u8 ttl;                 /* IP TTL/hop limit. */
            u8 frag;                /* One of OVS_FRAG_TYPE_*. */
        } ip;
    };
    struct {
        __be16 src;                 /* TCP/UDP/SCTP source port. */
        __be16 dst;                 /* TCP/UDP/SCTP destination port. */
        __be16 flags;               /* TCP flags. */
    } tp;
    union {
        struct {
            struct {
                __be32 src;         /* IP source address. */
                __be32 dst;         /* IP destination address. */
            } addr;
            struct {
                u8 sha[ETH_ALEN];   /* ARP source hardware address. */
                u8 tha[ETH_ALEN];   /* ARP target hardware address. */
            } arp;
        } ipv4;
        struct {
            struct {
                struct in6_addr src; /* IPv6 source address. */
                struct in6_addr dst; /* IPv6 destination address. */
            } addr;
            __be32 label;           /* IPv6 flow label. */
            struct {
                struct in6_addr target; /* ND target address. */
                u8 sll[ETH_ALEN];   /* ND source link layer address. */
                u8 tll[ETH_ALEN];   /* ND target link layer address. */
            } nd;
        } ipv6;
    };
    struct {
        /* Connection tracking fields. */
        u16 zone;
        u32 mark;
        u8 state;
        struct ovs_key_ct_labels labels;
    } ct;
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
queue_userspace_packet
当datapath找不到对应的缓存流表时,就会触发upcall,进入queue_userspace_packet处理函数,该函数主要是重新构造一个netlink skb,然后将触发upcall的原始skb以及对应的key填充到netlink skb,并最终通过netlink发送给用户态ovs处理。
/*
 * queue_userspace_packet - deliver a flow-miss packet to userspace.
 *
 * Builds a fresh genetlink skb carrying the flow key, optional attributes
 * (userdata, egress tunnel info, actions, MRU) and the original packet
 * data, then unicasts it over Netlink to the userspace handler identified
 * by upcall_info->portid.
 *
 * @dp:          datapath the packet belongs to.
 * @skb:         packet that missed the flow table (not consumed here).
 * @key:         flow key already extracted from @skb.
 * @upcall_info: upcall parameters (cmd, portid, userdata, ...).
 *
 * Returns 0 on success or a negative errno.
 */
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
                                  const struct sw_flow_key *key,
                                  const struct dp_upcall_info *upcall_info)
{
    struct ovs_header *upcall;
    struct sk_buff *nskb = NULL;
    struct sk_buff *user_skb = NULL; /* to be queued to userspace */
    struct nlattr *nla;
    struct genl_info info = {
        .dst_sk = ovs_dp_get_net(dp)->genl_sock,
        .snd_portid = upcall_info->portid,
    };
    size_t len;
    unsigned int hlen;
    int err, dp_ifindex;

    dp_ifindex = get_dpifindex(dp);
    if (!dp_ifindex)
        return -ENODEV;

    /* A hardware-accelerated VLAN tag lives outside the packet data;
     * push it back inside (on a clone) so userspace sees the full frame. */
    if (skb_vlan_tag_present(skb)) {
        nskb = skb_clone(skb, GFP_ATOMIC);
        if (!nskb)
            return -ENOMEM;
        nskb = __vlan_hwaccel_push_inside(nskb);
        if (!nskb)
            return -ENOMEM;
        skb = nskb;
    }

    if (nla_attr_size(skb->len) > USHRT_MAX) {
        err = -EFBIG;
        goto out;
    }

    /* Complete checksum if needed */
    if (skb->ip_summed == CHECKSUM_PARTIAL &&
        (err = skb_csum_hwoffload_help(skb, 0)))
        goto out;

    /* Older versions of OVS user space enforce alignment of the last
     * Netlink attribute to NLA_ALIGNTO which would require extensive
     * padding logic. Only perform zerocopy if padding is not required.
     */
    if (dp->user_features & OVS_DP_F_UNALIGNED)
        hlen = skb_zerocopy_headlen(skb);
    else
        hlen = skb->len;

    len = upcall_msg_size(upcall_info, hlen);
    /* Allocate a new skb used to hold the upcall message. */
    user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
    if (!user_skb) {
        err = -ENOMEM;
        goto out;
    }

    /* NOTE(review): genlmsg_put() return value is not checked here;
     * presumably 'len' guarantees it cannot fail — confirm upstream. */
    upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
                         0, upcall_info->cmd);
    upcall->dp_ifindex = dp_ifindex;

    /* Copy the packet's flow key into user_skb as OVS_PACKET_ATTR_KEY. */
    err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
    BUG_ON(err);

    if (upcall_info->userdata)
        __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
                  nla_len(upcall_info->userdata),
                  nla_data(upcall_info->userdata));

    if (upcall_info->egress_tun_info) {
        nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
        err = ovs_nla_put_tunnel_info(user_skb,
                                      upcall_info->egress_tun_info);
        BUG_ON(err);
        nla_nest_end(user_skb, nla);
    }

    if (upcall_info->actions_len) {
        nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
        err = ovs_nla_put_actions(upcall_info->actions,
                                  upcall_info->actions_len,
                                  user_skb);
        if (!err)
            nla_nest_end(user_skb, nla);
        else
            nla_nest_cancel(user_skb, nla);
    }

    /* Add OVS_PACKET_ATTR_MRU */
    if (upcall_info->mru) {
        if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
                        upcall_info->mru)) {
            err = -ENOBUFS;
            goto out;
        }
        pad_packet(dp, user_skb);
    }

    /* Only reserve room for attribute header, packet data is added
     * in skb_zerocopy() */
    if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
        err = -ENOBUFS;
        goto out;
    }
    nla->nla_len = nla_attr_size(skb->len);

    /* Copy the triggering packet itself into user_skb as well.  After
     * userspace installs the new flow (OVS_FLOW_CMD_NEW), it re-injects
     * this packet via OVS_PACKET_CMD_EXECUTE, at which point the
     * datapath looks up the freshly cached flow and forwards it. */
    err = skb_zerocopy(user_skb, skb, skb->len, hlen);
    if (err)
        goto out;

    /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
    pad_packet(dp, user_skb);

    ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

    /* Unicast the finished upcall message to userspace over Netlink. */
    err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
    user_skb = NULL;
out:
    if (err)
        skb_tx_error(skb);
    kfree_skb(user_skb);
    kfree_skb(nskb);
    return err;
}
udpif_upcall_handler
内核通过netlink将upcall消息发送给ovs后,就进入ovs用户态处理流程了,ovs会启动handle线程,用于接收内核的upcall消息,handle线程入口函数为udpif_upcall_handler,udpif_upcall_handler的主流程主要是一个poll循环。
/* Handler thread entry point: repeatedly drains upcalls from the kernel
 * until the exit latch is set.  Returns NULL (pthread convention). */
static void *
udpif_upcall_handler(void *arg)
{
    struct handler *h = arg;
    struct udpif *u = h->udpif;

    for (;;) {
        if (latch_is_set(&u->exit_latch)) {
            break;
        }
        if (!recv_upcalls(h)) {
            /* Nothing pending: arm the wakeup sources before sleeping. */
            dpif_recv_wait(u->dpif, h->handler_id);
            latch_wait(&u->exit_latch);
        } else {
            /* Work was done; come straight back for more. */
            poll_immediate_wake();
        }
        poll_block();
    }
    return NULL;
}
recv_upcalls
recv_upcalls会通过epoll_wait监听该handle线程的event事件,当有upcall消息时被唤醒,然后通过nl_sock_recv->recvmsg接收upcall消息,接收完后将upcall消息通过parse_odp_packet接口解析,然后存放到struct dpif_upcall upcall数据结构里,主要包含触发upcall的skb流的key以及完整的skb消息包等信息。
/*
 * dpif_upcall - a parsed upcall message received from the datapath.
 */
struct dpif_upcall {
    /* All types. */
    /* The original packet that triggered the upcall; after the upcall is
     * handled it is re-sent to the datapath via the execute path. */
    struct dp_packet packet;    /* Packet data,'dp_packet' should be the first
                                   member to avoid a hole. This is because
                                   'rte_mbuf' in dp_packet is aligned atleast
                                   on a 64-byte boundary */
    enum dpif_upcall_type type;
    /* Flow key of the packet that triggered the upcall. */
    struct nlattr *key;         /* Flow key. */
    size_t key_len;             /* Length of 'key' in bytes. */
    ovs_u128 ufid;              /* Unique flow identifier for 'key'. */
    struct nlattr *mru;         /* Maximum receive unit. */
    struct nlattr *cutlen;      /* Number of bytes shrink from the end. */
    /* DPIF_UC_ACTION only. */
    struct nlattr *userdata;    /* Argument to OVS_ACTION_ATTR_USERSPACE. */
    struct nlattr *out_tun_key; /* Output tunnel key. */
    struct nlattr *actions;     /* Argument to OVS_ACTION_ATTR_USERSPACE. */
};
process_upcall
当解析完upcall消息的内容后,进入process_upcall流程,这个流程主要是查用户态流表规则,然后为该upcall消息流找到正确的匹配actions动作。
process_upcall
upcall_xlate
xlate_actions
rule_dpif_lookup_from_table(用户态查表过程)
do_xlate_actions
xlate_output_action
/*
 * xlate_output_action - translate an OpenFlow "output" action.
 *
 * Dispatches on the reserved/physical output 'port' and composes the
 * corresponding datapath output, then updates ctx->nf_output_iface for
 * NetFlow accounting (DROP / a single iface / FLOOD / MULTI).
 */
static void
xlate_output_action(struct xlate_ctx *ctx, ofp_port_t port,
                    uint16_t controller_len, bool may_packet_in,
                    bool is_last_action, bool truncate,
                    bool group_bucket_action)
{
    ofp_port_t prev_nf_output_iface = ctx->nf_output_iface;

    ctx->nf_output_iface = NF_OUT_DROP;

    switch (port) {
    case OFPP_IN_PORT:
        compose_output_action(ctx, ctx->xin->flow.in_port.ofp_port, NULL,
                              is_last_action, truncate);
        break;
    case OFPP_TABLE:
        xlate_table_action(ctx, ctx->xin->flow.in_port.ofp_port,
                           0, may_packet_in, true, false, false,
                           do_xlate_actions);
        break;
    case OFPP_NORMAL:
        /* Flow-table lookup resolved to "normal" L2 switching. */
        xlate_normal(ctx);
        break;
    case OFPP_FLOOD:
        flood_packets(ctx, false, is_last_action);
        break;
    case OFPP_ALL:
        flood_packets(ctx, true, is_last_action);
        break;
    case OFPP_CONTROLLER:
        xlate_controller_action(ctx, controller_len,
                                (ctx->in_packet_out ? OFPR_PACKET_OUT
                                 : group_bucket_action ? OFPR_GROUP
                                 : ctx->in_action_set ? OFPR_ACTION_SET
                                 : OFPR_ACTION),
                                0, UINT32_MAX, NULL, 0);
        break;
    case OFPP_NONE:
        break;
    case OFPP_LOCAL:
        /* fallthrough: OFPP_LOCAL is handled like a concrete port. */
    default:
        /* Flow-table lookup resolved to output on a specific port. */
        if (port != ctx->xin->flow.in_port.ofp_port) {
            xlate_output_action_update_mac(ctx);
            compose_output_action(ctx, port, NULL, is_last_action, truncate);
        } else {
            xlate_report_info(ctx, "skipping output to input port");
        }
        break;
    }

    /* Fold this action's output iface into the NetFlow summary state. */
    if (prev_nf_output_iface == NF_OUT_FLOOD) {
        ctx->nf_output_iface = NF_OUT_FLOOD;
    } else if (ctx->nf_output_iface == NF_OUT_DROP) {
        ctx->nf_output_iface = prev_nf_output_iface;
    } else if (prev_nf_output_iface != NF_OUT_DROP &&
               ctx->nf_output_iface != NF_OUT_FLOOD) {
        ctx->nf_output_iface = NF_OUT_MULTI;
    }
}
xlate_output_action根据不同的actions结果走不同的流程,比如当匹配流表的结果为normal时,进入xlate_normal,xlate_normal流程主要是会更新本地的mac表信息,然后新建cache entry,为后续revalidator流程做准备,然后再根据流的目的mac从本地mac表里找port,如果能找到,则设置从对应port转发,如果找不到,则flood到所有port。
/*
 * xlate_normal - implement OFPP_NORMAL (standard L2 learning switch).
 *
 * Learns the source MAC, records an xlate cache entry for the revalidator,
 * applies multicast snooping where enabled, and finally forwards by
 * destination-MAC lookup — flooding when the destination is unknown.
 *
 * NOTE(review): this listing is an excerpt — the declarations of 'flow',
 * 'wc', 'vlan', 'in_xbundle', 'xvlan', 'mac' and 'mac_port' are elided
 * from the original function.
 */
static void
xlate_normal(struct xlate_ctx *ctx)
{
    /* Update the MAC table: record the port the source MAC came from. */
    /* Learn source MAC. */
    bool is_grat_arp = is_gratuitous_arp(flow, wc);
    if (ctx->xin->allow_side_effects
        && flow->packet_type == htonl(PT_ETH)
        && in_port->pt_mode != NETDEV_PT_LEGACY_L3
    ) {
        update_learning_table(ctx, in_xbundle, flow->dl_src, vlan,
                              is_grat_arp);
    }
    /* Record an xcache entry; the revalidator later uses it to refresh
     * the MAC learning table. */
    if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) {
        struct xc_entry *entry;

        /* Save just enough info to update mac learning table later. */
        entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NORMAL);
        entry->normal.ofproto = ctx->xbridge->ofproto;
        entry->normal.in_port = flow->in_port.ofp_port;
        entry->normal.dl_src = flow->dl_src;
        entry->normal.vlan = vlan;
        entry->normal.is_gratuitous_arp = is_grat_arp;
    }

    /* Determine output bundle. */
    if (mcast_snooping_enabled(ctx->xbridge->ms)
        && !eth_addr_is_broadcast(flow->dl_dst)
        && eth_addr_is_multicast(flow->dl_dst)
        && is_ip_any(flow)) {
        struct mcast_snooping *ms = ctx->xbridge->ms;
        struct mcast_group *grp = NULL;
        struct dp_packet *p = CONST_CAST(struct dp_packet *,
                                         ctx->xin->packet);

        /* We will need the whole data for processing the packet below */
        if (p && !dp_packet_is_linear(p)) {
            dp_packet_linearize(p);
        }

        if (is_igmp(flow, wc)) {
            /*
             * IGMP packets need to take the slow path, in order to be
             * processed for mdb updates. That will prevent expires
             * firing off even after hosts have sent reports.
             */
            ctx->xout->slow |= SLOW_ACTION;

            memset(&wc->masks.tp_src, 0xff, sizeof wc->masks.tp_src);
            if (mcast_snooping_is_membership(flow->tp_src) ||
                mcast_snooping_is_query(flow->tp_src)) {
                if (ctx->xin->allow_side_effects && ctx->xin->packet) {
                    update_mcast_snooping_table(ctx, flow, vlan,
                                                in_xbundle, ctx->xin->packet);
                }
            }

            if (mcast_snooping_is_membership(flow->tp_src)) {
                struct mcast_output out = MCAST_OUTPUT_INIT;

                ovs_rwlock_rdlock(&ms->rwlock);
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                /* RFC4541: section 2.1.1, item 1: A snooping switch should
                 * forward IGMP Membership Reports only to those ports where
                 * multicast routers are attached. Alternatively stated: a
                 * snooping switch should not forward IGMP Membership Reports
                 * to ports on which only hosts are attached.
                 * An administrative control may be provided to override this
                 * restriction, allowing the report messages to be flooded to
                 * other ports. */
                xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
                ovs_rwlock_unlock(&ms->rwlock);
                mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
            } else {
                xlate_report(ctx, OFT_DETAIL, "multicast traffic, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
            }
            return;
        } else if (is_mld(flow, wc)) {
            ctx->xout->slow |= SLOW_ACTION;
            if (ctx->xin->allow_side_effects && ctx->xin->packet) {
                update_mcast_snooping_table(ctx, flow, vlan,
                                            in_xbundle, ctx->xin->packet);
            }
            if (is_mld_report(flow, wc)) {
                struct mcast_output out = MCAST_OUTPUT_INIT;

                ovs_rwlock_rdlock(&ms->rwlock);
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
                ovs_rwlock_unlock(&ms->rwlock);
                mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
            } else {
                xlate_report(ctx, OFT_DETAIL, "MLD query, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
            }
        } else {
            if (is_ip_local_multicast(flow, wc)) {
                /* RFC4541: section 2.1.2, item 2: Packets with a dst IP
                 * address in the 224.0.0.x range which are not IGMP must
                 * be forwarded on all ports */
                xlate_report(ctx, OFT_DETAIL,
                             "RFC4541: section 2.1.2, item 2, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
                return;
            }
        }

        /* forwarding to group base ports */
        struct mcast_output out = MCAST_OUTPUT_INIT;

        ovs_rwlock_rdlock(&ms->rwlock);
        if (flow->dl_type == htons(ETH_TYPE_IP)) {
            grp = mcast_snooping_lookup4(ms, flow->nw_dst, vlan);
        } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
            grp = mcast_snooping_lookup(ms, &flow->ipv6_dst, vlan);
        }
        if (grp) {
            xlate_normal_mcast_send_group(ctx, ms, grp, in_xbundle, &out);
            xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
            xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                             &out);
        } else {
            if (mcast_snooping_flood_unreg(ms)) {
                xlate_report(ctx, OFT_DETAIL,
                             "unregistered multicast, flooding");
                out.flood = true;
            } else {
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
            }
        }
        ovs_rwlock_unlock(&ms->rwlock);
        mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
    } else {
        ovs_rwlock_rdlock(&ctx->xbridge->ml->rwlock);
        /* Look up the learned port for the destination MAC. */
        mac = mac_learning_lookup(ctx->xbridge->ml, flow->dl_dst, vlan);
        mac_port = mac ? mac_entry_get_port(ctx->xbridge->ml, mac) : NULL;
        ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock);

        /* Known destination: forward out the learned port. */
        if (mac_port) {
            struct xbundle *mac_xbundle = xbundle_lookup(ctx->xcfg, mac_port);

            if (mac_xbundle
                && mac_xbundle != in_xbundle
                && mac_xbundle->ofbundle != in_xbundle->ofbundle) {
                xlate_report(ctx, OFT_DETAIL, "forwarding to learned port");
                output_normal(ctx, mac_xbundle, &xvlan);
            } else if (!mac_xbundle) {
                xlate_report(ctx, OFT_WARN,
                             "learned port is unknown, dropping");
            } else {
                xlate_report(ctx, OFT_DETAIL,
                             "learned port is input port, dropping");
            }
        } else {
            /* No learned entry for the destination MAC: flood. */
            xlate_report(ctx, OFT_DETAIL,
                         "no learned MAC for destination, flooding");
            xlate_normal_flood(ctx, in_xbundle, &xvlan);
        }
    }
}
handle_upcalls
经过process_upcall流程后,已经为upcall的流生成对应的缓存流表信息了,缓存流表的key信息以及actions动作保存在struct upcall里,接下来就是要将缓存流表put到datapath了;
/*
 * handle_upcalls - turn a batch of processed upcalls into datapath ops.
 *
 * For each upcall this builds up to two operations: a DPIF_OP_FLOW_PUT to
 * install the newly translated flow (keyed by its ukey), and a
 * DPIF_OP_EXECUTE to re-inject the original packet.  The batch is then
 * executed via dpif_operate() and each ukey's state is advanced according
 * to the per-op result.
 */
static void
handle_upcalls(struct udpif *udpif, struct upcall *upcalls,
               size_t n_upcalls)
{
    struct dpif_op *opsp[UPCALL_MAX_BATCH * 2];
    struct ukey_op ops[UPCALL_MAX_BATCH * 2];
    size_t n_ops, n_opsp, i;

    /* Handle the packets individually in order of arrival.
     *
     *   - For SLOW_CFM, SLOW_LACP, SLOW_STP, SLOW_BFD, and SLOW_LLDP,
     *     translation is what processes received packets for these
     *     protocols.
     *
     *   - For SLOW_ACTION, translation executes the actions directly.
     *
     * The loop fills 'ops' with an array of operations to execute in the
     * datapath. */
    n_ops = 0;
    for (i = 0; i < n_upcalls; i++) {
        struct upcall *upcall = &upcalls[i];
        const struct dp_packet *packet = upcall->packet;
        struct ukey_op *op;

        if (should_install_flow(udpif, upcall)) {
            struct udpif_key *ukey = upcall->ukey;

            /* Try to install the ukey; returns true on success. */
            if (ukey_install(udpif, ukey)) {
                upcall->ukey_persists = true;
                /* Freshly installed ukey: queue a flow-create put op. */
                put_op_init(&ops[n_ops++], ukey, DPIF_FP_CREATE);
            }
        }

        /* If there are datapath actions, queue an execute op that
         * re-injects the triggering packet with those actions. */
        if (upcall->odp_actions.size) {
            op = &ops[n_ops++];
            op->ukey = NULL;
            op->dop.type = DPIF_OP_EXECUTE;
            op->dop.execute.packet = CONST_CAST(struct dp_packet *, packet);
            op->dop.execute.flow = upcall->flow;
            odp_key_to_dp_packet(upcall->key, upcall->key_len,
                                 op->dop.execute.packet);
            op->dop.execute.actions = upcall->odp_actions.data;
            op->dop.execute.actions_len = upcall->odp_actions.size;
            op->dop.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0;
            op->dop.execute.probe = false;
            op->dop.execute.mtu = upcall->mru;
        }
    }

    /* Execute batch. */
    n_opsp = 0;
    for (i = 0; i < n_ops; i++) {
        opsp[n_opsp++] = &ops[i].dop;
    }
    dpif_operate(udpif->dpif, opsp, n_opsp, DPIF_OFFLOAD_AUTO);
    /* Advance each ukey's state machine based on its op's outcome. */
    for (i = 0; i < n_ops; i++) {
        struct udpif_key *ukey = ops[i].ukey;

        if (ukey) {
            ovs_mutex_lock(&ukey->mutex);
            if (ops[i].dop.error) {
                COVERAGE_INC(upcall_dp_oper_error);
                transition_ukey(ukey, UKEY_EVICTED);
            } else if (ukey->state < UKEY_OPERATIONAL) {
                COVERAGE_INC(upcall_dp_oper_normal);
                transition_ukey(ukey, UKEY_OPERATIONAL);
            }
            ovs_mutex_unlock(&ukey->mutex);
        }
    }
}
handle_upcalls首先会根据ukey信息判断是否需要重新安装,如果要的话就进入put_op_init,初始化put消息的信息,put_op_init主要是初始化缓存流表的flow key以及actions动作;然后判断缓存流表如果有actions动作,初始化execute信息,准备将触发upcall的skb流重新下发给datapath执行。
static void
put_op_init(struct ukey_op *op, struct udpif_key *ukey,
enum dpif_flow_put_flags flags)
{
//设置缓存流表的flow key信息
op->ukey = ukey;
op->dop.type = DPIF_OP_FLOW_PUT;
op->dop.flow_put.flags = flags;
op->dop.flow_put.key = ukey->key;
op->dop.flow_put.key_len = ukey->key_len;
op->dop.flow_put.mask = ukey->mask;
op->dop.flow_put.mask_len = ukey->mask_len;
op->dop.flow_put.ufid = ukey->ufid_present ? &ukey->ufid : NULL;
op->dop.flow_put.pmd_id = ukey->pmd_id;
op->dop.flow_put.stats = NULL;
//设置缓存流表的actions动作
ukey_get_actions(ukey, &op->dop.flow_put.actions,
&op->dop.flow_put.actions_len);
}
dpif_operate
缓存流表的put动作在dpif_operate流程里执行,如果是内核态的ovs,最终进入dpif_netlink_operate__,这里主要是构造netlink消息,然后通知内核态ovs处理put(对应OVS_FLOW_CMD_NEW)及execute(OVS_PACKET_CMD_EXECUTE)流程;
/*
 * dpif_netlink_operate__ - run a batch of dpif operations over Netlink.
 *
 * Phase 1 encodes each op (flow put/del/get, packet execute) into a
 * genetlink request buffer; phase 2 issues all requests as one
 * nl_transact_multiple() and decodes per-op replies/errors back into
 * the corresponding dpif_op.
 *
 * Returns the number of ops actually processed (at most OPERATE_MAX_OPS;
 * may be fewer when an oversized execute packet truncates the batch).
 */
static size_t
dpif_netlink_operate__(struct dpif_netlink *dpif,
                       struct dpif_op **ops, size_t n_ops)
{
    struct op_auxdata {
        struct nl_transaction txn;

        struct ofpbuf request;
        uint64_t request_stub[1024 / 8];

        struct ofpbuf reply;
        uint64_t reply_stub[1024 / 8];
    } auxes[OPERATE_MAX_OPS];

    struct nl_transaction *txnsp[OPERATE_MAX_OPS];
    size_t i;

    n_ops = MIN(n_ops, OPERATE_MAX_OPS);
    /* Phase 1: encode each operation into its own request buffer. */
    for (i = 0; i < n_ops; i++) {
        struct op_auxdata *aux = &auxes[i];
        struct dpif_op *op = ops[i];
        struct dpif_flow_put *put;
        struct dpif_flow_del *del;
        struct dpif_flow_get *get;
        struct dpif_netlink_flow flow;

        ofpbuf_use_stub(&aux->request,
                        aux->request_stub, sizeof aux->request_stub);
        aux->txn.request = &aux->request;

        ofpbuf_use_stub(&aux->reply, aux->reply_stub, sizeof aux->reply_stub);
        aux->txn.reply = NULL;

        switch (op->type) {
        case DPIF_OP_FLOW_PUT:
            put = &op->flow_put;
            dpif_netlink_init_flow_put(dpif, put, &flow);
            if (put->stats) {
                /* Caller wants stats: ask the kernel to echo the flow. */
                flow.nlmsg_flags |= NLM_F_ECHO;
                aux->txn.reply = &aux->reply;
            }
            dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
            break;

        case DPIF_OP_FLOW_DEL:
            del = &op->flow_del;
            dpif_netlink_init_flow_del(dpif, del, &flow);
            if (del->stats) {
                flow.nlmsg_flags |= NLM_F_ECHO;
                aux->txn.reply = &aux->reply;
            }
            dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
            break;

        case DPIF_OP_EXECUTE:
            /* Can't execute a packet that won't fit in a Netlink attribute. */
            if (OVS_UNLIKELY(nl_attr_oversized(
                                 dp_packet_size(op->execute.packet)))) {
                /* Report an error immediately if this is the first operation.
                 * Otherwise the easiest thing to do is to postpone to the next
                 * call (when this will be the first operation). */
                if (i == 0) {
                    VLOG_ERR_RL(&error_rl,
                                "dropping oversized %"PRIu32"-byte packet",
                                dp_packet_size(op->execute.packet));
                    op->error = ENOBUFS;
                    return 1;
                }
                n_ops = i;
            } else {
                /* We will need to pass the whole to encode the message */
                if (!dp_packet_is_linear(op->execute.packet)) {
                    dp_packet_linearize(op->execute.packet);
                }
                dpif_netlink_encode_execute(dpif->dp_ifindex, &op->execute,
                                            &aux->request);
            }
            break;

        case DPIF_OP_FLOW_GET:
            get = &op->flow_get;
            dpif_netlink_init_flow_get(dpif, get, &flow);
            aux->txn.reply = get->buffer;
            dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
            break;

        default:
            OVS_NOT_REACHED();
        }
    }

    /* Phase 2: fire all requests as a single multi-transaction. */
    for (i = 0; i < n_ops; i++) {
        txnsp[i] = &auxes[i].txn;
    }
    nl_transact_multiple(NETLINK_GENERIC, txnsp, n_ops);

    /* Decode each transaction's result back into its dpif_op. */
    for (i = 0; i < n_ops; i++) {
        struct op_auxdata *aux = &auxes[i];
        struct nl_transaction *txn = &auxes[i].txn;
        struct dpif_op *op = ops[i];
        struct dpif_flow_put *put;
        struct dpif_flow_del *del;
        struct dpif_flow_get *get;

        op->error = txn->error;

        switch (op->type) {
        case DPIF_OP_FLOW_PUT:
            put = &op->flow_put;
            if (put->stats) {
                if (!op->error) {
                    struct dpif_netlink_flow reply;

                    op->error = dpif_netlink_flow_from_ofpbuf(&reply,
                                                              txn->reply);
                    if (!op->error) {
                        dpif_netlink_flow_get_stats(&reply, put->stats);
                    }
                }
            }
            break;

        case DPIF_OP_FLOW_DEL:
            del = &op->flow_del;
            if (del->stats) {
                if (!op->error) {
                    struct dpif_netlink_flow reply;

                    op->error = dpif_netlink_flow_from_ofpbuf(&reply,
                                                              txn->reply);
                    if (!op->error) {
                        dpif_netlink_flow_get_stats(&reply, del->stats);
                    }
                }
            }
            break;

        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_GET:
            get = &op->flow_get;
            if (!op->error) {
                struct dpif_netlink_flow reply;

                op->error = dpif_netlink_flow_from_ofpbuf(&reply, txn->reply);
                if (!op->error) {
                    dpif_netlink_flow_to_dpif_flow(&dpif->dpif, get->flow,
                                                   &reply);
                }
            }
            break;

        default:
            OVS_NOT_REACHED();
        }

        ofpbuf_uninit(&aux->request);
        ofpbuf_uninit(&aux->reply);
    }

    return n_ops;
}
ovs_flow_cmd_new
当内核ovs收到用户态ovs下发的put消息时,进入ovs_flow_cmd_new流程,该流程主要是将用户态下发的缓存流表保存到datapath本地的dp->table里;
ovs_packet_cmd_execute
该流程主要是处理用户态下发的execute消息,主要过程就是根据skb的key消息找到ovs_flow_cmd_new里保存下来的缓存流表,然后根据其actions执行对应的动作。