OVS upcall processing flow

When OVS receives the first packet of a connection, the datapath has no cached flow entry for it, so the packet is upcalled to userspace. Userspace consults its flow rules, generates a cached flow entry, and installs it into the datapath; subsequent packets of the same connection are then forwarded directly from the datapath's flow cache. This article walks through the overall upcall handling path of the kernel-mode OVS datapath.

ovs_vport_receive

When the kernel datapath receives a packet, it enters ovs_vport_receive. The function first extracts the key fields of the skb's flow into a key, then looks up the flow table through ovs_dp_process_packet using that key. If a matching cached flow entry exists, the packet is forwarded directly; otherwise the upcall path is triggered. The key mainly holds the flow's L2, L3, L4 and conntrack (CT) information.

int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
		      const struct ip_tunnel_info *tun_info)
{
	struct sw_flow_key key;
	int error;

	OVS_CB(skb)->input_vport = vport;
	OVS_CB(skb)->mru = 0;
	if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
		u32 mark;

		mark = skb->mark;
		skb_scrub_packet(skb, true);
		skb->mark = mark;
		tun_info = NULL;
	}

	/* Extract flow from 'skb' into 'key'. */
	// Fill 'key' from the skb's flow information
	error = ovs_flow_key_extract(tun_info, skb, &key);
	if (unlikely(error)) {
		kfree_skb(skb);
		return error;
	}

	// Look up the flow table and forward the skb; if no cached flow matches, trigger an upcall
	ovs_dp_process_packet(skb, &key);
	return 0;
}
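
The lookup/upcall decision itself happens in ovs_dp_process_packet(), which is not shown above. The following condensed paraphrase (based on the kernel datapath; statistics updates and some error handling are omitted, and details vary between kernel versions) illustrates the two branches:

/* Condensed paraphrase of ovs_dp_process_packet() -- not verbatim kernel code. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
	const struct vport *p = OVS_CB(skb)->input_vport;
	struct datapath *dp = p->dp;
	struct sw_flow *flow;
	struct sw_flow_actions *sf_acts;
	u32 n_mask_hit;

	/* Look up the cached flow table with the key built in ovs_vport_receive(). */
	flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
					 &n_mask_hit);
	if (unlikely(!flow)) {
		/* Miss: hand the packet to userspace via an upcall. */
		struct dp_upcall_info upcall;

		memset(&upcall, 0, sizeof(upcall));
		upcall.cmd = OVS_PACKET_CMD_MISS;
		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
		upcall.mru = OVS_CB(skb)->mru;
		if (unlikely(ovs_dp_upcall(dp, skb, key, &upcall, 0)))
			kfree_skb(skb);
		else
			consume_skb(skb);
		return;
	}

	/* Hit: execute the cached flow's actions directly. */
	ovs_flow_stats_update(flow, key->tp.flags, skb);
	sf_acts = rcu_dereference(flow->sf_acts);
	ovs_execute_actions(dp, skb, sf_acts, key);
}

The sw_flow_key filled in by ovs_flow_key_extract() and used for this lookup is defined as follows: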

struct sw_flow_key {
	u8 tun_opts[255];
	u8 tun_opts_len;
	struct ip_tunnel_key tun_key;	/* Encapsulating tunnel key. */
	struct {
		u32	priority;	/* Packet QoS priority. */
		u32	skb_mark;	/* SKB mark. */
		u16	in_port;	/* Input switch port (or DP_MAX_PORTS). */
	} __packed phy; /* Safe when right after 'tun_key'. */
	u8 tun_proto;			/* Protocol of encapsulating tunnel. */
	u32 ovs_flow_hash;		/* Datapath computed hash value.  */
	u32 recirc_id;			/* Recirculation ID.  */
	struct {
		u8     src[ETH_ALEN];	/* Ethernet source address. */
		u8     dst[ETH_ALEN];	/* Ethernet destination address. */
		struct vlan_head vlan;
		struct vlan_head cvlan;
		__be16 type;		/* Ethernet frame type. */
	} eth;
	union {
		struct {
			__be32 top_lse;	/* top label stack entry */
		} mpls;
		struct {
			u8     proto;	/* IP protocol or lower 8 bits of ARP opcode. */
			u8     tos;	    /* IP ToS. */
			u8     ttl;	    /* IP TTL/hop limit. */
			u8     frag;	/* One of OVS_FRAG_TYPE_*. */
		} ip;
	};
	struct {
		__be16 src;		/* TCP/UDP/SCTP source port. */
		__be16 dst;		/* TCP/UDP/SCTP destination port. */
		__be16 flags;		/* TCP flags. */
	} tp;
	union {
		struct {
			struct {
				__be32 src;	/* IP source address. */
				__be32 dst;	/* IP destination address. */
			} addr;
			struct {
				u8 sha[ETH_ALEN];	/* ARP source hardware address. */
				u8 tha[ETH_ALEN];	/* ARP target hardware address. */
			} arp;
		} ipv4;
		struct {
			struct {
				struct in6_addr src;	/* IPv6 source address. */
				struct in6_addr dst;	/* IPv6 destination address. */
			} addr;
			__be32 label;			/* IPv6 flow label. */
			struct {
				struct in6_addr target;	/* ND target address. */
				u8 sll[ETH_ALEN];	/* ND source link layer address. */
				u8 tll[ETH_ALEN];	/* ND target link layer address. */
			} nd;
		} ipv6;
	};
	struct {
		/* Connection tracking fields. */
		u16 zone;
		u32 mark;
		u8 state;
		struct ovs_key_ct_labels labels;
	} ct;

} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
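
The trailing __aligned(BITS_PER_LONG/8) lets the datapath compare masked keys one long at a time. The hypothetical helper below (a toy illustration, not the kernel's flow_table.c code) shows the idea behind masked matching: a cached flow matches when the packet key ANDed with the flow's mask equals the flow's stored, already-masked key.

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical, simplified key: four machine words standing in for the
 * real struct sw_flow_key. */
struct toy_flow_key {
	unsigned long words[4];
};

/* Masked comparison done long-by-long, which is why sw_flow_key is aligned
 * to BITS_PER_LONG/8. */
static bool toy_masked_key_equal(const struct toy_flow_key *cached,
				 const struct toy_flow_key *pkt,
				 const struct toy_flow_key *mask)
{
	size_t i;

	for (i = 0; i < sizeof(cached->words) / sizeof(cached->words[0]); i++) {
		if (cached->words[i] != (pkt->words[i] & mask->words[i]))
			return false;
	}
	return true;
}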

queue_userspace_packet

When the datapath finds no matching cached flow, it triggers an upcall and enters queue_userspace_packet. This function builds a new netlink skb, copies the original skb that triggered the upcall together with its key into that netlink skb, and finally sends it to userspace OVS over netlink.

static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
	struct nlattr *nla;
	struct genl_info info = {
		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
		.snd_portid = upcall_info->portid,
	};
	size_t len;
	unsigned int hlen;
	int err, dp_ifindex;

	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex)
		return -ENODEV;

	if (skb_vlan_tag_present(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;

		nskb = __vlan_hwaccel_push_inside(nskb);
		if (!nskb)
			return -ENOMEM;

		skb = nskb;
	}

	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}

	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_csum_hwoffload_help(skb, 0)))
		goto out;

	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	if (dp->user_features & OVS_DP_F_UNALIGNED)
		hlen = skb_zerocopy_headlen(skb);
	else
		hlen = skb->len;

	len = upcall_msg_size(upcall_info, hlen);
	// Allocate a new skb to hold the upcall message
	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}

	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
			     0, upcall_info->cmd);
	upcall->dp_ifindex = dp_ifindex;

	// Put the skb's flow key into user_skb
	err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
	BUG_ON(err);

	if (upcall_info->userdata)
		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));

	if (upcall_info->egress_tun_info) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
		err = ovs_nla_put_tunnel_info(user_skb,
					      upcall_info->egress_tun_info);
		BUG_ON(err);
		nla_nest_end(user_skb, nla);
	}

	if (upcall_info->actions_len) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
		err = ovs_nla_put_actions(upcall_info->actions,
					  upcall_info->actions_len,
					  user_skb);
		if (!err)
			nla_nest_end(user_skb, nla);
		else
			nla_nest_cancel(user_skb, nla);
	}

	/* Add OVS_PACKET_ATTR_MRU */
	if (upcall_info->mru) {
		if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
				upcall_info->mru)) {
			err = -ENOBUFS;
			goto out;
		}
		pad_packet(dp, user_skb);
	}

	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy() */
	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
		err = -ENOBUFS;
		goto out;
	}
	nla->nla_len = nla_attr_size(skb->len);

	// Copy the skb that triggered the upcall into user_skb as well. After userspace
	// has translated the flow and put the cached flow entry into the datapath, the
	// original packet is re-injected through the execute path (OVS_PACKET_CMD_EXECUTE)
	// together with the translated actions, and the datapath forwards it accordingly.
	err = skb_zerocopy(user_skb, skb, skb->len, hlen);
	if (err)
		goto out;

	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	pad_packet(dp, user_skb);

	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

	// Send user_skb to userspace via netlink
	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
	user_skb = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(user_skb);
	kfree_skb(nskb);
	return err;
}
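
Putting the attribute-filling calls above together, the upcall message delivered to userspace looks roughly like this (optional attributes are present only when the corresponding data exists):

/*
 *   struct nlmsghdr
 *   struct genlmsghdr                 (family dp_packet_genl_family, cmd = upcall_info->cmd,
 *                                      e.g. OVS_PACKET_CMD_MISS)
 *   struct ovs_header                 (dp_ifindex)
 *   OVS_PACKET_ATTR_KEY               flow key extracted in ovs_vport_receive()
 *   OVS_PACKET_ATTR_USERDATA          optional, from OVS_ACTION_ATTR_USERSPACE
 *   OVS_PACKET_ATTR_EGRESS_TUN_KEY    optional egress tunnel info
 *   OVS_PACKET_ATTR_ACTIONS           optional actions
 *   OVS_PACKET_ATTR_MRU               optional maximum receive unit
 *   OVS_PACKET_ATTR_PACKET            the original packet (zero-copied when possible)
 */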

udpif_upcall_handler

After the kernel sends the upcall message to OVS over netlink, processing moves to OVS userspace. OVS runs handler threads to receive upcall messages from the kernel; each handler thread's entry function is udpif_upcall_handler, whose main loop simply polls for upcall events and dispatches them.

static void *
udpif_upcall_handler(void *arg)
{
    struct handler *handler = arg;
    struct udpif *udpif = handler->udpif;

    while (!latch_is_set(&handler->udpif->exit_latch)) {
        if (recv_upcalls(handler)) {
            poll_immediate_wake();
        } else {
            dpif_recv_wait(udpif->dpif, handler->handler_id);
            latch_wait(&udpif->exit_latch);
        }
        poll_block();
    }

    return NULL;
}

recv_upcalls

recv_upcalls waits on the handler thread's events via epoll_wait and is woken up when an upcall message arrives. It then receives the message through nl_sock_recv->recvmsg, parses it with parse_odp_packet, and stores the result in a struct dpif_upcall, which mainly contains the flow key of the packet that triggered the upcall as well as the complete packet itself.

struct dpif_upcall {
    /* All types. */
    // The original packet that triggered the upcall; after the upcall has been handled it is re-sent to the datapath via the execute path
    struct dp_packet packet;    /* Packet data,'dp_packet' should be the first
                                   member to avoid a hole. This is because
                                   'rte_mbuf' in dp_packet is aligned atleast
                                   on a 64-byte boundary */
    enum dpif_upcall_type type;
    // Flow key of the packet that triggered the upcall
    struct nlattr *key;         /* Flow key. */
    size_t key_len;             /* Length of 'key' in bytes. */
    ovs_u128 ufid;              /* Unique flow identifier for 'key'. */
    struct nlattr *mru;         /* Maximum receive unit. */
    struct nlattr *cutlen;      /* Number of bytes shrink from the end. */

    /* DPIF_UC_ACTION only. */
    struct nlattr *userdata;    /* Argument to OVS_ACTION_ATTR_USERSPACE. */
    struct nlattr *out_tun_key;    /* Output tunnel key. */
    struct nlattr *actions;    /* Argument to OVS_ACTION_ATTR_USERSPACE. */
};
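
recv_upcalls itself is not reproduced here; the heavily condensed outline below (buffer setup, fitness checks and error handling omitted; see ofproto-dpif-upcall.c for the real code) shows how each struct dpif_upcall is received, translated via process_upcall and finally handed to handle_upcalls as a batch:

/* Heavily condensed outline of recv_upcalls() -- not verbatim OVS code. */
static size_t
recv_upcalls(struct handler *handler)
{
    struct udpif *udpif = handler->udpif;
    struct dpif_upcall dupcalls[UPCALL_MAX_BATCH];
    struct ofpbuf recv_bufs[UPCALL_MAX_BATCH];  /* initialized with ofpbuf_use_stub() */
    struct upcall upcalls[UPCALL_MAX_BATCH];
    struct flow flows[UPCALL_MAX_BATCH];
    size_t n_upcalls = 0;

    while (n_upcalls < UPCALL_MAX_BATCH) {
        struct dpif_upcall *dupcall = &dupcalls[n_upcalls];
        struct upcall *upcall = &upcalls[n_upcalls];
        struct flow *flow = &flows[n_upcalls];

        /* Read one upcall message from the kernel (nl_sock_recv()/recvmsg()
         * underneath, parsed by parse_odp_packet()). */
        if (dpif_recv(udpif->dpif, handler->handler_id, dupcall,
                      &recv_bufs[n_upcalls])) {
            break;
        }

        /* Convert dupcall->key into a struct flow (odp_flow_key_to_flow())
         * and initialize the upcall context. */
        upcall_receive(upcall, udpif->backer, &dupcall->packet, dupcall->type,
                       dupcall->userdata, flow, dupcall->mru, &dupcall->ufid,
                       PMD_ID_NULL);
        upcall->key = dupcall->key;
        upcall->key_len = dupcall->key_len;

        /* Translate the flow against the OpenFlow tables (next section). */
        process_upcall(udpif, upcall, &upcall->odp_actions, &upcall->wc);
        n_upcalls++;
    }

    /* Install flows and re-inject packets for the whole batch (see handle_upcalls). */
    if (n_upcalls) {
        handle_upcalls(udpif, upcalls, n_upcalls);
    }
    return n_upcalls;
}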

process_upcall

After the upcall message has been parsed, processing enters process_upcall. This stage looks up the userspace (OpenFlow) flow tables and translates the matching rules into the correct set of actions for the upcalled flow.

process_upcall
    upcall_xlate
        xlate_actions
            rule_dpif_lookup_from_table (userspace flow table lookup)
                do_xlate_actions
                    xlate_output_action

static void
xlate_output_action(struct xlate_ctx *ctx, ofp_port_t port,
                    uint16_t controller_len, bool may_packet_in,
                    bool is_last_action, bool truncate,
                    bool group_bucket_action)
{
    ofp_port_t prev_nf_output_iface = ctx->nf_output_iface;

    ctx->nf_output_iface = NF_OUT_DROP;

    switch (port) {
    case OFPP_IN_PORT:
        compose_output_action(ctx, ctx->xin->flow.in_port.ofp_port, NULL,
                              is_last_action, truncate);
        break;
    case OFPP_TABLE:
        xlate_table_action(ctx, ctx->xin->flow.in_port.ofp_port,
                           0, may_packet_in, true, false, false,
                           do_xlate_actions);
        break;
    case OFPP_NORMAL:
        // If the flow table lookup result is the NORMAL action, take this path
        xlate_normal(ctx);
        break;
    case OFPP_FLOOD:
        flood_packets(ctx, false, is_last_action);
        break;
    case OFPP_ALL:
        flood_packets(ctx, true, is_last_action);
        break;
    case OFPP_CONTROLLER:
        xlate_controller_action(ctx, controller_len,
                                (ctx->in_packet_out ? OFPR_PACKET_OUT
                                 : group_bucket_action ? OFPR_GROUP
                                 : ctx->in_action_set ? OFPR_ACTION_SET
                                 : OFPR_ACTION),
                                0, UINT32_MAX, NULL, 0);
        break;
    case OFPP_NONE:
        break;
    case OFPP_LOCAL:
    default:
        // If the lookup result is an output to a specific port, take this path
        if (port != ctx->xin->flow.in_port.ofp_port) {
			xlate_output_action_update_mac(ctx);
            compose_output_action(ctx, port, NULL, is_last_action, truncate);
        } else {
            xlate_report_info(ctx, "skipping output to input port");
        }
        break;
    }

    if (prev_nf_output_iface == NF_OUT_FLOOD) {
        ctx->nf_output_iface = NF_OUT_FLOOD;
    } else if (ctx->nf_output_iface == NF_OUT_DROP) {
        ctx->nf_output_iface = prev_nf_output_iface;
    } else if (prev_nf_output_iface != NF_OUT_DROP &&
               ctx->nf_output_iface != NF_OUT_FLOOD) {
        ctx->nf_output_iface = NF_OUT_MULTI;
    }
}

xlate_output_action branches on the action produced by the lookup. For example, when the matching rule's action is NORMAL, it enters xlate_normal. xlate_normal first updates the local MAC learning table, then creates an xlate cache entry to be used later by the revalidator threads, and finally looks up the output port for the flow's destination MAC in the local MAC table: if a port is found, the packet is forwarded out of that port; otherwise it is flooded to all ports.

	static void
	xlate_normal(struct xlate_ctx *ctx)
	{
	 
		// Update the MAC table: record the port associated with the source MAC
		/* Learn source MAC. */
		bool is_grat_arp = is_gratuitous_arp(flow, wc);
		if (ctx->xin->allow_side_effects
			&& flow->packet_type == htonl(PT_ETH)
			&& in_port->pt_mode != NETDEV_PT_LEGACY_L3
		) {
			update_learning_table(ctx, in_xbundle, flow->dl_src, vlan,
								  is_grat_arp);
		}
		// Create an xcache entry; the revalidator threads later use it to refresh the MAC table
		if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) {
			struct xc_entry *entry;

			/* Save just enough info to update mac learning table later. */
			entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NORMAL);
			entry->normal.ofproto = ctx->xbridge->ofproto;
			entry->normal.in_port = flow->in_port.ofp_port;
			entry->normal.dl_src = flow->dl_src;
			entry->normal.vlan = vlan;
			entry->normal.is_gratuitous_arp = is_grat_arp;
		}

		/* Determine output bundle. */
		if (mcast_snooping_enabled(ctx->xbridge->ms)
			&& !eth_addr_is_broadcast(flow->dl_dst)
			&& eth_addr_is_multicast(flow->dl_dst)
			&& is_ip_any(flow)) {
			struct mcast_snooping *ms = ctx->xbridge->ms;
			struct mcast_group *grp = NULL;
			struct dp_packet *p = CONST_CAST(struct dp_packet *,
											 ctx->xin->packet);

			/* We will need the whole data for processing the packet below */
			if (p && !dp_packet_is_linear(p)) {
				dp_packet_linearize(p);
			}

			if (is_igmp(flow, wc)) {
				/*
				 * IGMP packets need to take the slow path, in order to be
				 * processed for mdb updates. That will prevent expires
				 * firing off even after hosts have sent reports.
				 */
				ctx->xout->slow |= SLOW_ACTION;

				memset(&wc->masks.tp_src, 0xff, sizeof wc->masks.tp_src);
				if (mcast_snooping_is_membership(flow->tp_src) ||
					mcast_snooping_is_query(flow->tp_src)) {
					if (ctx->xin->allow_side_effects && ctx->xin->packet) {
						update_mcast_snooping_table(ctx, flow, vlan,
													in_xbundle, ctx->xin->packet);
					}
				}

				if (mcast_snooping_is_membership(flow->tp_src)) {
					struct mcast_output out = MCAST_OUTPUT_INIT;

					ovs_rwlock_rdlock(&ms->rwlock);
					xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
													 &out);
					/* RFC4541: section 2.1.1, item 1: A snooping switch should
					 * forward IGMP Membership Reports only to those ports where
					 * multicast routers are attached.  Alternatively stated: a
					 * snooping switch should not forward IGMP Membership Reports
					 * to ports on which only hosts are attached.
					 * An administrative control may be provided to override this
					 * restriction, allowing the report messages to be flooded to
					 * other ports. */
					xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
					ovs_rwlock_unlock(&ms->rwlock);

					mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
				} else {
					xlate_report(ctx, OFT_DETAIL, "multicast traffic, flooding");
					xlate_normal_flood(ctx, in_xbundle, &xvlan);
				}
				return;
			} else if (is_mld(flow, wc)) {
				ctx->xout->slow |= SLOW_ACTION;
				if (ctx->xin->allow_side_effects && ctx->xin->packet) {
					update_mcast_snooping_table(ctx, flow, vlan,
												in_xbundle, ctx->xin->packet);
				}
				if (is_mld_report(flow, wc)) {
					struct mcast_output out = MCAST_OUTPUT_INIT;

					ovs_rwlock_rdlock(&ms->rwlock);
					xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
													 &out);
					xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
					ovs_rwlock_unlock(&ms->rwlock);

					mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
				} else {
					xlate_report(ctx, OFT_DETAIL, "MLD query, flooding");
					xlate_normal_flood(ctx, in_xbundle, &xvlan);
				}
			} else {
				if (is_ip_local_multicast(flow, wc)) {
					/* RFC4541: section 2.1.2, item 2: Packets with a dst IP
					 * address in the 224.0.0.x range which are not IGMP must
					 * be forwarded on all ports */
					xlate_report(ctx, OFT_DETAIL,
								 "RFC4541: section 2.1.2, item 2, flooding");
					xlate_normal_flood(ctx, in_xbundle, &xvlan);
					return;
				}
			}

			/* forwarding to group base ports */
			struct mcast_output out = MCAST_OUTPUT_INIT;

			ovs_rwlock_rdlock(&ms->rwlock);
			if (flow->dl_type == htons(ETH_TYPE_IP)) {
				grp = mcast_snooping_lookup4(ms, flow->nw_dst, vlan);
			} else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
				grp = mcast_snooping_lookup(ms, &flow->ipv6_dst, vlan);
			}
			if (grp) {
				xlate_normal_mcast_send_group(ctx, ms, grp, in_xbundle, &out);
				xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
				xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
												 &out);
			} else {
				if (mcast_snooping_flood_unreg(ms)) {
					xlate_report(ctx, OFT_DETAIL,
								 "unregistered multicast, flooding");
					out.flood = true;
				} else {
					xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
													 &out);
					xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
				}
			}
			ovs_rwlock_unlock(&ms->rwlock);

			mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
		} else {
			ovs_rwlock_rdlock(&ctx->xbridge->ml->rwlock);
			// Look up the port for the destination MAC in the MAC table
			mac = mac_learning_lookup(ctx->xbridge->ml, flow->dl_dst, vlan);
			mac_port = mac ? mac_entry_get_port(ctx->xbridge->ml, mac) : NULL;
			ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock);

			// If found, forward out of the learned port
			if (mac_port) {
				struct xbundle *mac_xbundle = xbundle_lookup(ctx->xcfg, mac_port);
				if (mac_xbundle
					&& mac_xbundle != in_xbundle
					&& mac_xbundle->ofbundle != in_xbundle->ofbundle) {
					xlate_report(ctx, OFT_DETAIL, "forwarding to learned port");
					output_normal(ctx, mac_xbundle, &xvlan);
				} else if (!mac_xbundle) {
					xlate_report(ctx, OFT_WARN,
								 "learned port is unknown, dropping");
				} else {
					xlate_report(ctx, OFT_DETAIL,
								 "learned port is input port, dropping");
				}
			} else {
				// If the destination MAC is unknown locally, flood to all ports
				xlate_report(ctx, OFT_DETAIL,
							 "no learned MAC for destination, flooding");
				xlate_normal_flood(ctx, in_xbundle, &xvlan);
			}
		}
	}
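
As a side note, the NORMAL path above is essentially classic learning-switch behaviour. The following self-contained toy (purely illustrative, not OVS code; all names are hypothetical) captures the core of it: learn the source MAC on the input port, then forward to the learned port of the destination MAC, or flood if it is unknown.

#include <stdint.h>
#include <string.h>

#define TOY_MAC_TABLE_SIZE 256
#define TOY_FLOOD_PORT     (-1)
#define TOY_DROP_PORT      (-2)

/* Purely illustrative learning-switch table, not OVS's mac-learning module. */
struct toy_mac_entry {
    uint8_t mac[6];
    int     port;
    int     valid;
};

static struct toy_mac_entry toy_mac_table[TOY_MAC_TABLE_SIZE];

static struct toy_mac_entry *toy_mac_lookup(const uint8_t mac[6])
{
    for (int i = 0; i < TOY_MAC_TABLE_SIZE; i++) {
        if (toy_mac_table[i].valid && !memcmp(toy_mac_table[i].mac, mac, 6)) {
            return &toy_mac_table[i];
        }
    }
    return NULL;
}

/* "NORMAL" processing for one packet: learn the source MAC, then pick an output. */
static int toy_normal(const uint8_t src[6], const uint8_t dst[6], int in_port)
{
    struct toy_mac_entry *e = toy_mac_lookup(src);

    if (!e) {                               /* Learn source MAC -> in_port. */
        for (int i = 0; i < TOY_MAC_TABLE_SIZE; i++) {
            if (!toy_mac_table[i].valid) {
                memcpy(toy_mac_table[i].mac, src, 6);
                toy_mac_table[i].valid = 1;
                e = &toy_mac_table[i];
                break;
            }
        }
    }
    if (e) {
        e->port = in_port;                  /* Refresh the learned port. */
    }

    e = toy_mac_lookup(dst);
    if (e) {
        /* Destination known: forward there (drop if it would go back out the
         * input port, as xlate_normal() does). */
        return e->port == in_port ? TOY_DROP_PORT : e->port;
    }
    return TOY_FLOOD_PORT;                  /* Unknown destination MAC: flood. */
}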

handle_upcalls

After process_upcall, a cached flow entry has been generated for the upcalled flow; its key and actions are kept in struct upcall. The next step is to put this cached flow into the datapath.

static void
handle_upcalls(struct udpif *udpif, struct upcall *upcalls,
               size_t n_upcalls)
{
    struct dpif_op *opsp[UPCALL_MAX_BATCH * 2];
    struct ukey_op ops[UPCALL_MAX_BATCH * 2];
    size_t n_ops, n_opsp, i;

    /* Handle the packets individually in order of arrival.
     *
     *   - For SLOW_CFM, SLOW_LACP, SLOW_STP, SLOW_BFD, and SLOW_LLDP,
     *     translation is what processes received packets for these
     *     protocols.
     *
     *   - For SLOW_ACTION, translation executes the actions directly.
     *
     * The loop fills 'ops' with an array of operations to execute in the
     * datapath. */
    n_ops = 0;
    for (i = 0; i < n_upcalls; i++) {
        struct upcall *upcall = &upcalls[i];
        const struct dp_packet *packet = upcall->packet;
        struct ukey_op *op;

        if (should_install_flow(udpif, upcall)) {
            struct udpif_key *ukey = upcall->ukey;

            // Try to install the ukey; returns true on success
            if (ukey_install(udpif, ukey)) {
                upcall->ukey_persists = true;
                // A newly installed ukey requires a flow-create (put) operation to the datapath
                put_op_init(&ops[n_ops++], ukey, DPIF_FP_CREATE);
            }
        }

        // If there are actions, issue an execute op to re-inject the upcalled packet into the datapath
        if (upcall->odp_actions.size) {
            op = &ops[n_ops++];
            op->ukey = NULL;
            op->dop.type = DPIF_OP_EXECUTE;
            op->dop.execute.packet = CONST_CAST(struct dp_packet *, packet);
            op->dop.execute.flow = upcall->flow;
            odp_key_to_dp_packet(upcall->key, upcall->key_len,
                                 op->dop.execute.packet);
            op->dop.execute.actions = upcall->odp_actions.data;
            op->dop.execute.actions_len = upcall->odp_actions.size;
            op->dop.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0;
            op->dop.execute.probe = false;
            op->dop.execute.mtu = upcall->mru;
        }
    }

    /* Execute batch. */
    n_opsp = 0;
    for (i = 0; i < n_ops; i++) {
        opsp[n_opsp++] = &ops[i].dop;
    }
    dpif_operate(udpif->dpif, opsp, n_opsp, DPIF_OFFLOAD_AUTO);
    for (i = 0; i < n_ops; i++) {
        struct udpif_key *ukey = ops[i].ukey;

        if (ukey) {
            ovs_mutex_lock(&ukey->mutex);
            if (ops[i].dop.error) {
                COVERAGE_INC(upcall_dp_oper_error);
                transition_ukey(ukey, UKEY_EVICTED);
            } else if (ukey->state < UKEY_OPERATIONAL) {
                COVERAGE_INC(upcall_dp_oper_normal);
                transition_ukey(ukey, UKEY_OPERATIONAL);
            }
            ovs_mutex_unlock(&ukey->mutex);
        }
    }
}

handle_upcalls first decides, based on the ukey, whether the flow needs to be installed. If so, it calls put_op_init to initialize the put operation, which mainly fills in the cached flow's key and actions. Then, if the translated flow has actions, it also initializes an execute operation so that the packet that triggered the upcall can be re-injected into the datapath for execution.

static void
put_op_init(struct ukey_op *op, struct udpif_key *ukey,
            enum dpif_flow_put_flags flags)
{
    // Set the cached flow's key information
    op->ukey = ukey;
    op->dop.type = DPIF_OP_FLOW_PUT;
    op->dop.flow_put.flags = flags;
    op->dop.flow_put.key = ukey->key;
    op->dop.flow_put.key_len = ukey->key_len;
    op->dop.flow_put.mask = ukey->mask;
    op->dop.flow_put.mask_len = ukey->mask_len;
    op->dop.flow_put.ufid = ukey->ufid_present ? &ukey->ufid : NULL;
    op->dop.flow_put.pmd_id = ukey->pmd_id;
    op->dop.flow_put.stats = NULL;
    // Set the cached flow's actions
    ukey_get_actions(ukey, &op->dop.flow_put.actions,
                     &op->dop.flow_put.actions_len);
}

dpif_operate

The put of the cached flow is carried out in dpif_operate. For the kernel datapath, this eventually reaches dpif_netlink_operate__, which builds the netlink messages and asks the kernel OVS to handle the put (OVS_FLOW_CMD_NEW) and execute (OVS_PACKET_CMD_EXECUTE) operations.

static size_t
dpif_netlink_operate__(struct dpif_netlink *dpif,
                       struct dpif_op **ops, size_t n_ops)
{
    struct op_auxdata {
        struct nl_transaction txn;

        struct ofpbuf request;
        uint64_t request_stub[1024 / 8];

        struct ofpbuf reply;
        uint64_t reply_stub[1024 / 8];
    } auxes[OPERATE_MAX_OPS];

    struct nl_transaction *txnsp[OPERATE_MAX_OPS];
    size_t i;

    n_ops = MIN(n_ops, OPERATE_MAX_OPS);
    for (i = 0; i < n_ops; i++) {
        struct op_auxdata *aux = &auxes[i];
        struct dpif_op *op = ops[i];
        struct dpif_flow_put *put;
        struct dpif_flow_del *del;
        struct dpif_flow_get *get;
        struct dpif_netlink_flow flow;

        ofpbuf_use_stub(&aux->request,
                        aux->request_stub, sizeof aux->request_stub);
        aux->txn.request = &aux->request;

        ofpbuf_use_stub(&aux->reply, aux->reply_stub, sizeof aux->reply_stub);
        aux->txn.reply = NULL;

        switch (op->type) {
        case DPIF_OP_FLOW_PUT:
            put = &op->flow_put;
            dpif_netlink_init_flow_put(dpif, put, &flow);
            if (put->stats) {
                flow.nlmsg_flags |= NLM_F_ECHO;
                aux->txn.reply = &aux->reply;
            }
            dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
            break;

        case DPIF_OP_FLOW_DEL:
            del = &op->flow_del;
            dpif_netlink_init_flow_del(dpif, del, &flow);
            if (del->stats) {
                flow.nlmsg_flags |= NLM_F_ECHO;
                aux->txn.reply = &aux->reply;
            }
            dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
            break;

        case DPIF_OP_EXECUTE:
            /* Can't execute a packet that won't fit in a Netlink attribute. */
            if (OVS_UNLIKELY(nl_attr_oversized(
                                 dp_packet_size(op->execute.packet)))) {
                /* Report an error immediately if this is the first operation.
                 * Otherwise the easiest thing to do is to postpone to the next
                 * call (when this will be the first operation). */
                if (i == 0) {
                    VLOG_ERR_RL(&error_rl,
                                "dropping oversized %"PRIu32"-byte packet",
                                dp_packet_size(op->execute.packet));
                    op->error = ENOBUFS;
                    return 1;
                }
                n_ops = i;
            } else {
                /* We will need to pass the whole to encode the message */
                if (!dp_packet_is_linear(op->execute.packet)) {
                    dp_packet_linearize(op->execute.packet);
                }

                dpif_netlink_encode_execute(dpif->dp_ifindex, &op->execute,
                                            &aux->request);
            }
            break;

        case DPIF_OP_FLOW_GET:
            get = &op->flow_get;
            dpif_netlink_init_flow_get(dpif, get, &flow);
            aux->txn.reply = get->buffer;
            dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
            break;

        default:
            OVS_NOT_REACHED();
        }
    }

    for (i = 0; i < n_ops; i++) {
        txnsp[i] = &auxes[i].txn;
    }
    nl_transact_multiple(NETLINK_GENERIC, txnsp, n_ops);

    for (i = 0; i < n_ops; i++) {
        struct op_auxdata *aux = &auxes[i];
        struct nl_transaction *txn = &auxes[i].txn;
        struct dpif_op *op = ops[i];
        struct dpif_flow_put *put;
        struct dpif_flow_del *del;
        struct dpif_flow_get *get;

        op->error = txn->error;

        switch (op->type) {
        case DPIF_OP_FLOW_PUT:
            put = &op->flow_put;
            if (put->stats) {
                if (!op->error) {
                    struct dpif_netlink_flow reply;

                    op->error = dpif_netlink_flow_from_ofpbuf(&reply,
                                                              txn->reply);
                    if (!op->error) {
                        dpif_netlink_flow_get_stats(&reply, put->stats);
                    }
                }
            }
            break;

        case DPIF_OP_FLOW_DEL:
            del = &op->flow_del;
            if (del->stats) {
                if (!op->error) {
                    struct dpif_netlink_flow reply;

                    op->error = dpif_netlink_flow_from_ofpbuf(&reply,
                                                              txn->reply);
                    if (!op->error) {
                        dpif_netlink_flow_get_stats(&reply, del->stats);
                    }
                }
            }
            break;

        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_GET:
            get = &op->flow_get;
            if (!op->error) {
                struct dpif_netlink_flow reply;

                op->error = dpif_netlink_flow_from_ofpbuf(&reply, txn->reply);
                if (!op->error) {
                    dpif_netlink_flow_to_dpif_flow(&dpif->dpif, get->flow,
                                                   &reply);
                }
            }
            break;

        default:
            OVS_NOT_REACHED();
        }

        ofpbuf_uninit(&aux->request);
        ofpbuf_uninit(&aux->reply);
    }

    return n_ops;
}
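
For reference, the DPIF_OP_FLOW_PUT request that dpif_netlink_flow_to_ofpbuf() builds and nl_transact_multiple() sends looks roughly like this (the exact attribute set varies slightly with OVS version and flags):

/*
 *   struct nlmsghdr
 *   struct genlmsghdr              (ovs_flow genl family, cmd = OVS_FLOW_CMD_NEW for
 *                                   DPIF_FP_CREATE; NLM_F_ECHO is set when stats are wanted)
 *   struct ovs_header              (dp_ifindex)
 *   OVS_FLOW_ATTR_UFID             unique flow identifier, when present
 *   OVS_FLOW_ATTR_KEY              flow key from ukey->key
 *   OVS_FLOW_ATTR_MASK             flow mask from ukey->mask
 *   OVS_FLOW_ATTR_ACTIONS          translated actions from ukey_get_actions()
 *
 * For DPIF_OP_EXECUTE, dpif_netlink_encode_execute() similarly builds an
 * OVS_PACKET_CMD_EXECUTE request carrying OVS_PACKET_ATTR_PACKET,
 * OVS_PACKET_ATTR_KEY and OVS_PACKET_ATTR_ACTIONS.
 */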

ovs_flow_cmd_new

When the kernel OVS receives the put message from userspace, it enters ovs_flow_cmd_new, which mainly stores the cached flow delivered from userspace into the datapath's local dp->table.
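
The function is not reproduced in full here; a heavily condensed sketch of its main steps (paraphrased from the kernel datapath; error handling, ufid handling and the reply path are omitted) looks like this:

/* Condensed sketch of ovs_flow_cmd_new() -- not verbatim kernel code. */
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
	struct net *net = sock_net(skb->sk);
	struct nlattr **a = info->attrs;
	struct ovs_header *ovs_header = info->userhdr;
	struct sw_flow *new_flow;
	struct sw_flow_actions *acts;
	struct sw_flow_mask mask;
	struct sw_flow_match match;
	struct datapath *dp;
	bool log = !a[OVS_FLOW_ATTR_PROBE];

	new_flow = ovs_flow_alloc();

	/* Parse OVS_FLOW_ATTR_KEY / OVS_FLOW_ATTR_MASK into the flow's key and mask. */
	ovs_match_init(&match, &new_flow->key, false, &mask);
	ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
			  a[OVS_FLOW_ATTR_MASK], log);

	/* Validate and copy OVS_FLOW_ATTR_ACTIONS (the actions userspace translated). */
	ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
			     &acts, log);
	rcu_assign_pointer(new_flow->sf_acts, acts);

	/* Insert the new cached flow into the datapath's flow table (dp->table). */
	dp = get_dp(net, ovs_header->dp_ifindex);
	ovs_flow_tbl_insert(&dp->table, new_flow, &mask);

	return 0;
}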

ovs_packet_cmd_execute

This path handles the execute message (OVS_PACKET_CMD_EXECUTE) sent down from userspace: the kernel rebuilds an skb from the packet carried in the message, extracts its flow key, and then runs the actions carried in the same message, i.e. the actions userspace just translated and also installed via ovs_flow_cmd_new, through ovs_execute_actions. The packet that originally triggered the upcall is thus forwarded in the same way later packets will be by the newly installed cached flow.
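
A heavily condensed sketch of this path (again paraphrased from the kernel datapath, with error handling omitted) is shown below; note that the actions executed come from the message itself, not from a flow-table lookup:

/* Condensed sketch of ovs_packet_cmd_execute() -- not verbatim kernel code. */
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
	struct ovs_header *ovs_header = info->userhdr;
	struct net *net = sock_net(skb->sk);
	struct nlattr **a = info->attrs;
	struct sw_flow_actions *acts;
	struct sk_buff *packet;
	struct sw_flow *flow;
	struct datapath *dp;
	int len;

	/* Rebuild an skb from OVS_PACKET_ATTR_PACKET. */
	len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
	packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
	skb_reserve(packet, NET_IP_ALIGN);
	nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);

	/* Extract the flow key from OVS_PACKET_ATTR_KEY and the packet itself. */
	flow = ovs_flow_alloc();
	ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], packet,
				       &flow->key, false);

	/* The actions come from OVS_PACKET_ATTR_ACTIONS in the execute message
	 * (the ones userspace just translated), not from a flow-table lookup. */
	ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key,
			     &acts, false);
	rcu_assign_pointer(flow->sf_acts, acts);

	rcu_read_lock();
	dp = get_dp_rcu(net, ovs_header->dp_ifindex);
	OVS_CB(packet)->input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
	ovs_execute_actions(dp, packet, rcu_dereference(flow->sf_acts), &flow->key);
	rcu_read_unlock();

	ovs_flow_free(flow, false);
	return 0;
}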

 
