OVS dp支持的action都在do_execute_actions函数中定义,支持的action包括:
OVS_ACTION_ATTR_OUTPUT
OVS_ACTION_ATTR_USERSPACE
OVS_ACTION_ATTR_HASH
OVS_ACTION_ATTR_PUSH_MPLS
OVS_ACTION_ATTR_POP_MPLS
OVS_ACTION_ATTR_PUSH_VLAN
OVS_ACTION_ATTR_POP_VLAN
OVS_ACTION_ATTR_RECIRC
OVS_ACTION_ATTR_SET
OVS_ACTION_ATTR_SET_MASKED
OVS_ACTION_ATTR_SET_TO_MASKED
OVS_ACTION_ATTR_SAMPLE
OVS_ACTION_ATTR_CT
本系列要完成这些action的分析。其中output已经在之前分析datapath主流程时介绍过,这里不再重复。
一、OVS_ACTION_ATTR_USERSPACE
OVS_ACTION_ATTR_USERSPACE的处理函数为output_userspace函数,本节以此函数作为入口进行分析。
1、output_userspace函数
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, const struct nlattr *attr,
const struct nlattr *actions, int actions_len)
{
struct ip_tunnel_info info;
struct dp_upcall_info upcall;
const struct nlattr *a;
int rem;
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_ACTION; //封装upcall对象
upcall.mru = OVS_CB(skb)->mru;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0; //获取userspace action相关的信息
a = nla_next(a, &rem)) {
switch (nla_type(a)) {
case OVS_USERSPACE_ATTR_USERDATA:
upcall.userdata = a;
break;
case OVS_USERSPACE_ATTR_PID:
upcall.portid = nla_get_u32(a);
break;
case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
/* Get out tunnel info. */
struct vport *vport;
vport = ovs_vport_rcu(dp, nla_get_u32(a));
if (vport) {
int err;
upcall.egress_tun_info = &info;
err = ovs_vport_get_egress_tun_info(vport, skb,
&upcall);
if (err)
upcall.egress_tun_info = NULL;
}
break;
}
case OVS_USERSPACE_ATTR_ACTIONS: {
/* Include actions. */
upcall.actions = actions;
upcall.actions_len = actions_len;
break;
}
} /* End of switch. */
}
return ovs_dp_upcall(dp, skb, key, &upcall); //upcall
}
2、ovs_dp_upcall函数
/*
 * Send 'skb' to userspace as described by 'upcall_info'.
 *
 * A zero portid means nobody is listening and yields -ENOTCONN.  GSO
 * packets go through queue_gso_packets(), everything else through
 * queue_userspace_packet().  Any failure is counted in the per-CPU
 * n_lost statistic.
 *
 * Returns 0 on success or a negative errno.
 */
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
		  const struct sw_flow_key *key,
		  const struct dp_upcall_info *upcall_info)
{
	struct dp_stats_percpu *pcpu_stats;
	int rc;

	if (upcall_info->portid == 0)
		rc = -ENOTCONN;
	else if (skb_is_gso(skb))
		rc = queue_gso_packets(dp, skb, key, upcall_info);
	else
		rc = queue_userspace_packet(dp, skb, key, upcall_info);

	if (!rc)
		return 0;

	/* Account for the lost upcall. */
	pcpu_stats = this_cpu_ptr(dp->stats_percpu);
	u64_stats_update_begin(&pcpu_stats->syncp);
	pcpu_stats->n_lost++;
	u64_stats_update_end(&pcpu_stats->syncp);

	return rc;
}
3、queue_userspace_packet函数
/*
 * Build one Netlink upcall message for 'skb' and unicast it to the
 * userspace socket identified by upcall_info->portid.
 *
 * The message carries the flow key and, when present in 'upcall_info',
 * the userdata, egress tunnel key, action list and MRU, followed by the
 * packet itself as OVS_PACKET_ATTR_PACKET.
 *
 * Failures while filling the message (genlmsg_put(), ovs_nla_put_key(),
 * nla_nest_start(), ovs_nla_put_egress_tunnel_key()) take the regular
 * error path - free the message and return an errno - instead of hitting
 * BUG_ON() or dereferencing a NULL pointer.
 *
 * Returns 0 on success or a negative errno.
 */
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
	struct nlattr *nla;
	struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
		.snd_portid = upcall_info->portid,
	};
	size_t len;
	unsigned int hlen;
	int err, dp_ifindex;

	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex)
		return -ENODEV;

	if (skb_vlan_tag_present(skb)) {
		/* Work on a clone so the caller's skb is left untouched. */
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;

		nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto,
						 skb_vlan_tag_get(nskb));
		if (!nskb)
			return -ENOMEM;

		/* The tag has just been written into the packet data, so
		 * clear the out-of-band TCI on the clone; presumably this
		 * keeps the tag from being represented twice.
		 */
		vlan_set_tci(nskb, 0);
		skb = nskb;
	}

	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}

	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto out;

	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	if (dp->user_features & OVS_DP_F_UNALIGNED)
		hlen = skb_zerocopy_headlen(skb);
	else
		hlen = skb->len;

	/* Allocate the upcall message. */
	len = upcall_msg_size(upcall_info, hlen);
	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}

	/* The message is tagged with dp_packet_genl_family and
	 * upcall_info->cmd.
	 */
	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
			     0, upcall_info->cmd);
	if (!upcall) {
		err = -EINVAL;
		goto out;
	}
	upcall->dp_ifindex = dp_ifindex;

	/* Attach the flow key. */
	err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
	if (err)
		goto out;

	/* Attach the userdata, if any. */
	if (upcall_info->userdata)
		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));

	/* Attach the egress tunnel key, if any. */
	if (upcall_info->egress_tun_info) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
		if (!nla) {
			err = -EMSGSIZE;
			goto out;
		}
		err = ovs_nla_put_egress_tunnel_key(user_skb,
						    upcall_info->egress_tun_info,
						    upcall_info->egress_tun_opts);
		if (err)
			goto out;
		nla_nest_end(user_skb, nla);
	}

	/* Attach the action list, if any.  A failure to serialise the
	 * actions only drops this attribute; the upcall is still sent.
	 */
	if (upcall_info->actions_len) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
		if (!nla) {
			err = -EMSGSIZE;
			goto out;
		}
		err = ovs_nla_put_actions(upcall_info->actions,
					  upcall_info->actions_len,
					  user_skb);
		if (!err)
			nla_nest_end(user_skb, nla);
		else
			nla_nest_cancel(user_skb, nla);
	}

	/* Add OVS_PACKET_ATTR_MRU */
	if (upcall_info->mru) {
		if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
				upcall_info->mru)) {
			err = -ENOBUFS;
			goto out;
		}
		pad_packet(dp, user_skb);
	}

	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy()
	 */
	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
		err = -ENOBUFS;
		goto out;
	}
	nla->nla_len = nla_attr_size(skb->len);

	/* Append the packet itself. */
	err = skb_zerocopy(user_skb, skb, skb->len, hlen);
	if (err)
		goto out;

	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	pad_packet(dp, user_skb);

	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

	/* Send the Netlink message; user_skb is not ours afterwards. */
	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
	user_skb = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(user_skb);
	kfree_skb(nskb);
	return err;
}
到此可以看到userspace action和精确流表未匹配导致的upcall在处理流程上是比较一致的,两者都是通过调用ovs_dp_upcall函数实现信息发送到用户态程序。upcall处理线程是如何处理的不在本篇分析,将在后续给出分析。
通过userspace能够实现什么功能呢? 现在还想不出,等分析upcall处理后,再回过头来回答这个问题。
二、OVS_ACTION_ATTR_HASH
本节分析OVS_ACTION_ATTR_HASH action,该action的处理函数为execute_hash函数
1、execute_hash函数
/*
 * Handle OVS_ACTION_ATTR_HASH: derive a non-zero hash from the skb hash
 * and the action's hash_basis, and store it in key->ovs_flow_hash.
 */
static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
			 const struct nlattr *attr)
{
	struct ovs_action_hash *act = nla_data(attr);
	u32 value;

	/* OVS_HASH_ALG_L4 is the only possible hash algorithm. */
	value = jhash_1word(skb_get_hash(skb), act->hash_basis);

	/* A result of zero is replaced by 1, so the stored hash is never 0. */
	key->ovs_flow_hash = value ? value : 0x1;
}
该action仅对key的ovs_flow_hash成员变量进行了修改,从该变量的使用地方逆推,最终是queue_userspace_packet会使用,该函数是把报文发送给用户态进程,本次就看下queue_userspace_packet函数是如何使用到该成员变量的。
2、queue_userspace_packet函数
/*
 * Build one Netlink upcall message for 'skb' and unicast it to the
 * userspace socket identified by upcall_info->portid.
 *
 * The message carries the flow key and, when present in 'upcall_info',
 * the userdata, egress tunnel key, action list and MRU, followed by the
 * packet itself as OVS_PACKET_ATTR_PACKET.
 *
 * Failures while filling the message (genlmsg_put(), ovs_nla_put_key(),
 * nla_nest_start(), ovs_nla_put_egress_tunnel_key()) take the regular
 * error path - free the message and return an errno - instead of hitting
 * BUG_ON() or dereferencing a NULL pointer.
 *
 * Returns 0 on success or a negative errno.
 */
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
	struct nlattr *nla;
	struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
		.snd_portid = upcall_info->portid,
	};
	size_t len;
	unsigned int hlen;
	int err, dp_ifindex;

	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex)
		return -ENODEV;

	if (skb_vlan_tag_present(skb)) {
		/* Work on a clone so the caller's skb is left untouched. */
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;

		nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto,
						 skb_vlan_tag_get(nskb));
		if (!nskb)
			return -ENOMEM;

		/* The tag has just been written into the packet data, so
		 * clear the out-of-band TCI on the clone; presumably this
		 * keeps the tag from being represented twice.
		 */
		vlan_set_tci(nskb, 0);
		skb = nskb;
	}

	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}

	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto out;

	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	if (dp->user_features & OVS_DP_F_UNALIGNED)
		hlen = skb_zerocopy_headlen(skb);
	else
		hlen = skb->len;

	/* Allocate the upcall message. */
	len = upcall_msg_size(upcall_info, hlen);
	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}

	/* The message is tagged with dp_packet_genl_family and
	 * upcall_info->cmd.
	 */
	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
			     0, upcall_info->cmd);
	if (!upcall) {
		err = -EINVAL;
		goto out;
	}
	upcall->dp_ifindex = dp_ifindex;

	/* Attach the flow key.  This is where key->ovs_flow_hash ends up
	 * being used: ovs_nla_put_key() serialises it into the message.
	 */
	err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
	if (err)
		goto out;

	/* Attach the userdata, if any. */
	if (upcall_info->userdata)
		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));

	/* Attach the egress tunnel key, if any. */
	if (upcall_info->egress_tun_info) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
		if (!nla) {
			err = -EMSGSIZE;
			goto out;
		}
		err = ovs_nla_put_egress_tunnel_key(user_skb,
						    upcall_info->egress_tun_info,
						    upcall_info->egress_tun_opts);
		if (err)
			goto out;
		nla_nest_end(user_skb, nla);
	}

	/* Attach the action list, if any.  A failure to serialise the
	 * actions only drops this attribute; the upcall is still sent.
	 */
	if (upcall_info->actions_len) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
		if (!nla) {
			err = -EMSGSIZE;
			goto out;
		}
		err = ovs_nla_put_actions(upcall_info->actions,
					  upcall_info->actions_len,
					  user_skb);
		if (!err)
			nla_nest_end(user_skb, nla);
		else
			nla_nest_cancel(user_skb, nla);
	}

	/* Add OVS_PACKET_ATTR_MRU */
	if (upcall_info->mru) {
		if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
				upcall_info->mru)) {
			err = -ENOBUFS;
			goto out;
		}
		pad_packet(dp, user_skb);
	}

	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy()
	 */
	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
		err = -ENOBUFS;
		goto out;
	}
	nla->nla_len = nla_attr_size(skb->len);

	/* Append the packet itself. */
	err = skb_zerocopy(user_skb, skb, skb->len, hlen);
	if (err)
		goto out;

	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	pad_packet(dp, user_skb);

	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

	/* Send the Netlink message; user_skb is not ours afterwards. */
	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
	user_skb = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(user_skb);
	kfree_skb(nskb);
	return err;
}
3、ovs_nla_put_key函数
/*
 * Serialise a flow key into 'skb' as a nested attribute of type 'attr'.
 *
 * Returns 0 on success, -EMSGSIZE if the nest cannot be opened, or the
 * error reported by __ovs_nla_put_key().  On that latter error the nest
 * is left open, exactly as the key writer left it.
 */
int ovs_nla_put_key(const struct sw_flow_key *swkey,
		    const struct sw_flow_key *output, int attr, bool is_mask,
		    struct sk_buff *skb)
{
	struct nlattr *nest = nla_nest_start(skb, attr);
	int rc;

	if (!nest)
		return -EMSGSIZE;

	rc = __ovs_nla_put_key(swkey, output, is_mask, skb);
	if (!rc)
		nla_nest_end(skb, nest);

	return rc;
}
4、__ovs_nla_put_key函数
static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
const struct sw_flow_key *output, bool is_mask,
struct sk_buff *skb)
{
struct ovs_key_ethernet *eth_key;
struct nlattr *nla, *encap;
if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
goto nla_put_failure;
if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) //使用到该变量
goto nla_put_failure;
if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
goto nla_put_failure;
if ((swkey->tun_key.u.ipv4.dst || is_mask)) {
const void *opts = NULL;
if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
swkey->tun_opts_len))
goto nla_put_failure;
}
if (swkey->phy.in_port == DP_MAX_PORTS) {
if (is_mask && (output->phy.in_port == 0xffff))
if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
goto nla_put_failure;
三、OVS_ACTION_ATTR_PUSH_VLAN
本节分析OVS_ACTION_ATTR_PUSH_VLAN action,该action的处理函数为push_vlan。
1、push_vlan函数
/*
 * Handle OVS_ACTION_ATTR_PUSH_VLAN.
 *
 * If the packet does not carry a VLAN tag yet, the flow key's TCI is
 * updated directly; otherwise the key is invalidated.  The tag itself is
 * pushed by skb_vlan_push(), whose result is returned.
 */
static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
		     const struct ovs_action_push_vlan *vlan)
{
	if (!skb_vlan_tag_present(skb)) {
		/* First tag: the key can simply take over the new TCI. */
		key->eth.tci = vlan->vlan_tci;
	} else {
		/* Already tagged: mark the key as no longer valid. */
		invalidate_flow_key(key);
	}

	/* VLAN_TAG_PRESENT is masked out of the TCI handed to the skb. */
	return skb_vlan_push(skb, vlan->vlan_tpid,
			     ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
}
2、skb_vlan_push函数
#define skb_vlan_push rpl_skb_vlan_push
/*
 * Push a VLAN tag onto 'skb'.
 *
 * If the skb already carries an out-of-band tag (skb_vlan_tag_present()),
 * that existing tag is first written into the packet data; the new tag
 * (vlan_proto/vlan_tci) is then stored out-of-band with
 * __vlan_hwaccel_put_tag(), which does not touch the packet data.
 *
 * Returns 0 on success or the error from __vlan_insert_tag().  On error
 * skb->data is restored to where it pointed on entry.
 */
int rpl_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
	if (skb_vlan_tag_present(skb)) {
		/* Distance between skb->data and the MAC header. */
		unsigned int offset = skb->data - skb_mac_header(skb);
		int err;

		/* __vlan_insert_tag expect skb->data pointing to mac header.
		 * So change skb->data before calling it and change back to
		 * original position later
		 */
		__skb_push(skb, offset);
		err = __vlan_insert_tag(skb, skb->vlan_proto,
					skb_vlan_tag_get(skb));
		if (err) {
			/* Undo the push so the caller sees skb->data unchanged. */
			__skb_pull(skb, offset);
			return err;
		}

		/* The L2 header grew by one VLAN header. */
		skb->mac_len += VLAN_HLEN;

		/* Fold the inserted VLAN header into a CHECKSUM_COMPLETE sum.
		 * This is done while skb->data still points at the MAC header,
		 * so the bytes summed are the ones right after the two MAC
		 * addresses regardless of 'offset'.
		 */
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->csum = csum_add(skb->csum, csum_partial(skb->data
					+ (2 * ETH_ALEN), VLAN_HLEN, 0));

		/* Move skb->data back to the same layer it pointed at before. */
		__skb_pull(skb, offset);
	}
	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
	return 0;
}
3、__vlan_insert_tag函数
/* Note: the 'proto' argument of __vlan_insert_tag() is dropped by this
 * macro; rpl_vlan_insert_tag() always writes ETH_P_8021Q as the tag type.
 */
#define __vlan_insert_tag(skb, proto, tci) rpl_vlan_insert_tag(skb, tci)
/*
 * Insert a VLAN header carrying 'vlan_tci' into the packet data of 'skb'.
 * skb->data must point at the MAC header on entry.
 *
 * Returns 0 on success or -ENOMEM if the headroom cannot be made
 * available/writable.
 */
static inline int rpl_vlan_insert_tag(struct sk_buff *skb, u16 vlan_tci)
{
struct vlan_ethhdr *veth;
/* Make sure VLAN_HLEN bytes of headroom are available (skb_cow_head). */
if (skb_cow_head(skb, VLAN_HLEN) < 0)
return -ENOMEM;
/* Move skb->data back by VLAN_HLEN; veth is the start of the new,
 * larger L2 header.
 */
veth = (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN);
/* Move the mac addresses to the beginning of the new header. */
memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);
/* The MAC header now starts VLAN_HLEN bytes earlier. */
skb->mac_header -= VLAN_HLEN;
/* first, the ethernet type */
/* veth points at the destination MAC, but h_vlan_proto is a member of
 * struct vlan_ethhdr - presumably laid out right after the two MAC
 * addresses - so this writes the tag type behind the addresses, not
 * over them.
 */
veth->h_vlan_proto = htons(ETH_P_8021Q);
/* now, the TCI */
veth->h_vlan_TCI = htons(vlan_tci);
return 0;
}
push vlan动作的处理过程是:如果报文已经带有vlan(即skb->vlan_tci已经置位),那么先修改报文的数据,把这个已有的vlan头插入到报文中;然后再把新的vlan信息设置到skb->vlan_tci中(不修改报文数据),该vlan头由硬件在发送时添加到报文中。
四、OVS_ACTION_ATTR_POP_VLAN
本节分析OVS_ACTION_ATTR_POP_VLAN action的处理函数pop_vlan。
1、pop_vlan函数
/*
 * Handle OVS_ACTION_ATTR_POP_VLAN: pop one VLAN tag via skb_vlan_pop()
 * and bring the flow key in line with the result.
 *
 * Returns the result of skb_vlan_pop(); the key is updated even when
 * that call reports an error.
 */
static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
	int rc = skb_vlan_pop(skb);

	if (!skb_vlan_tag_present(skb)) {
		/* No tag left on the skb: clear the key's TCI. */
		key->eth.tci = 0;
	} else {
		/* A tag is still present: mark the key as no longer valid. */
		invalidate_flow_key(key);
	}

	return rc;
}
2、skb_vlan_pop函数
#define skb_vlan_pop rpl_skb_vlan_pop
/*
 * Pop the outermost VLAN tag of 'skb'.
 *
 * Step 1 removes the outer tag: an out-of-band tag is simply cleared,
 * otherwise the tag is removed from the packet data (if skb->protocol
 * says the packet is VLAN-tagged and it is long enough).
 * Step 2 handles stacked tags: if the packet is still VLAN-tagged after
 * step 1, the next tag is taken out of the packet data and stored as the
 * out-of-band tag.
 *
 * Returns 0 on success (including "nothing to pop") or the error from
 * __skb_vlan_pop().
 */
int rpl_skb_vlan_pop(struct sk_buff *skb)
{
u16 vlan_tci;
__be16 vlan_proto;
int err;
/* Step 1: out-of-band tag present -> dropping it is enough. */
if (likely(skb_vlan_tag_present(skb))) {
skb->vlan_tci = 0;
} else {
/* Not a VLAN frame, or too short to hold a VLAN header: nothing to do. */
if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
skb->protocol != htons(ETH_P_8021AD)) ||
skb->len < VLAN_ETH_HLEN))
return 0;
/* Remove the tag from the packet data. */
err = __skb_vlan_pop(skb, &vlan_tci);
if (err)
return err;
}
/* move next vlan tag to hw accel tag */
/* Only relevant for stacked tags (QinQ); otherwise we are done. */
if (likely((skb->protocol != htons(ETH_P_8021Q) &&
skb->protocol != htons(ETH_P_8021AD)) ||
skb->len < VLAN_ETH_HLEN))
return 0;
/* NOTE(review): the proto recorded for the moved tag is always 802.1Q,
 * even when skb->protocol was ETH_P_8021AD - confirm this is intended.
 */
vlan_proto = htons(ETH_P_8021Q);
/* Take the inner tag out of the packet data ... */
err = __skb_vlan_pop(skb, &vlan_tci);
if (unlikely(err))
return err;
/* ... and keep it as the out-of-band tag. */
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
}
3、__skb_vlan_pop函数
/*
 * Remove one VLAN header from the packet data of 'skb' and return its TCI
 * (host byte order) through 'vlan_tci'.
 *
 * skb->data may point past the MAC header on entry; it is moved to the
 * MAC header for the operation and moved forward again by the same
 * amount before returning, on both the success and the error path.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr;
/* Distance between skb->data and the MAC header. */
unsigned int offset = skb->data - skb_mac_header(skb);
int err;
/* Make skb->data point at the MAC header. */
__skb_push(skb, offset);
err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
if (unlikely(err))
goto pull;
/* Take the VLAN header bytes (right after the two MAC addresses) out of
 * the receive checksum.
 */
skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
/* Read the TCI now, before the memmove below rewrites this region. */
*vlan_tci = ntohs(vhdr->h_vlan_TCI);
/* Shift both MAC addresses VLAN_HLEN bytes towards the payload ... */
memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
/* ... and drop the now unused first VLAN_HLEN bytes. */
__skb_pull(skb, VLAN_HLEN);
/* Presumably sets skb->protocol from the encapsulated protocol in vhdr. */
vlan_set_encap_proto(skb, vhdr);
/* The MAC header starts VLAN_HLEN bytes later now. */
skb->mac_header += VLAN_HLEN;
/* Keep the network header at least ETH_HLEN behind the MAC header. */
if (skb_network_offset(skb) < ETH_HLEN)
skb_set_network_header(skb, ETH_HLEN);
/* Recompute mac_len for the shortened L2 header. */
skb_reset_mac_len(skb);
pull:
/* Move skb->data forward again by the offset it had on entry. */
__skb_pull(skb, offset);
return err;
}
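pop vlan:如果报文的vlan已经解析出来,即放在skb的vlan_tci变量中,那么直接把该变量赋值为0即可;否则就需要修改skb的报文数据,把外层vlan头剥掉。无论哪种情况,如果剥掉外层vlan之后报文仍然带有vlan头(QinQ场景),还会把下一层vlan头从报文数据中取出,转存到skb的vlan_tci中。因此软件最多会对报文数据做两次pop,存放在skb->vlan_tci中的那一层则不需要修改报文数据。最后,pop_vlan根据skb是否仍带有vlan来决定把key的tci清0,还是把key置为无效。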
五、OVS_ACTION_ATTR_SET
本节分析OVS_ACTION_ATTR_SET action的处理函数execute_set_action函数。
1、execute_set_action函数
/*
 * Handle OVS_ACTION_ATTR_SET.
 *
 * Only OVS_KEY_ATTR_TUNNEL_INFO is accepted here: the tunnel destination
 * carried by the action replaces the skb's current dst.  Every other
 * attribute type yields -EINVAL.
 */
static int execute_set_action(struct sk_buff *skb,
			      struct sw_flow_key *flow_key,
			      const struct nlattr *a)
{
	struct ovs_tunnel_info *tun;

	/* Only tunnel set execution is supported without a mask. */
	if (nla_type(a) != OVS_KEY_ATTR_TUNNEL_INFO)
		return -EINVAL;

	tun = nla_data(a);

	/* Release the old dst and attach the tunnel dst to the skb. */
	ovs_skb_dst_drop(skb);
	/* NOTE(review): ovs_dst_hold() is presumably a compat wrapper that
	 * may be a no-op on some kernels - confirm.
	 */
	ovs_dst_hold((struct dst_entry *)tun->tun_dst);
	ovs_skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);

	return 0;
}
该action的处理函数非常简单,仅设置了一个参数。 我们来看看是如何被使用到的,以vxlan隧道为例,我们从vxlan端口的send函数(vxlan_xmit)入手来看。
2、vxlan_xmit函数
#define vxlan_xmit rpl_vxlan_xmit
/*
 * Transmit 'skb' on its VXLAN device.
 *
 * The packet is only sent when the device is not in proxy mode, collects
 * metadata, and the skb carries tunnel info marked for TX.  In every
 * other case a warning is logged and the skb is freed.
 *
 * Always returns NETDEV_TX_OK.
 */
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* Tunnel info attached to the skb (the dst set by the set action). */
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	if (!(vxlan->flags & VXLAN_F_PROXY) &&
	    (vxlan->flags & VXLAN_F_COLLECT_METADATA) &&
	    info && (info->mode & IP_TUNNEL_INFO_TX)) {
		vxlan_xmit_one(skb, dev, NULL, false);
	} else {
		pr_warn("vxlan: unsupported flag set %x", vxlan->flags);
		kfree_skb(skb);
	}

	return NETDEV_TX_OK;
}
vxlan报文发送流程,不在这里分析。 通过分析,该action的作用是封装报文,通过隧道发送报文。 在dp的层面,只有一个tunnel端口(每种tunnel隧道一个),而其他类型的端口可以是多个的,从这里也可以看到tunnel端口只是配置信息不同而已,所以只需要一个端口,配置信息在action中提供。
六、OVS_ACTION_ATTR_RECIRC
本节分析OVS_ACTION_ATTR_RECIRC action的处理函数execute_recirc。
1、execute_recirc函数
/*
 * Handle OVS_ACTION_ATTR_RECIRC: queue the packet as a deferred action so
 * that it is looked up again later with the recirc_id taken from 'a'.
 *
 * Returns an error only if the flow key had to be rebuilt and that
 * failed; running out of memory or FIFO slots drops the recirculation
 * and returns 0.
 */
static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
			  struct sw_flow_key *key,
			  const struct nlattr *a, int rem)
{
	struct deferred_action *deferred;

	/* An invalidated key must be rebuilt from the packet first. */
	if (!is_flow_key_valid(key)) {
		int rc = ovs_flow_key_update(skb, key);

		if (rc)
			return rc;
	}
	BUG_ON(!is_flow_key_valid(key));

	if (!nla_is_last(a, rem)) {
		/* Recirc action is not the last action of the action
		 * list, need to clone the skb.
		 */
		skb = skb_clone(skb, GFP_ATOMIC);

		/* Skip the recirc action when out of memory, but
		 * continue on with the rest of the action list.
		 */
		if (!skb)
			return 0;
	}

	/* No action list is attached: the entry means "recirculate". */
	deferred = add_deferred_actions(skb, key, NULL);
	if (deferred) {
		deferred->pkt_key.recirc_id = nla_get_u32(a);
		return 0;
	}

	/* FIFO full: drop this packet (or clone) and warn. */
	kfree_skb(skb);
	if (net_ratelimit())
		pr_warn("%s: deferred action limit reached, drop recirc action\n",
			ovs_dp_name(dp));

	return 0;
}
2、add_deferred_actions函数
/* Return queue entry if fifo is not full */
static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
const struct sw_flow_key *key,
const struct nlattr *attr)
{
struct action_fifo *fifo;
struct deferred_action *da;
fifo = this_cpu_ptr(action_fifos);
da = action_fifo_put(fifo); //添加一个deferred_action
if (da) {
da->skb = skb;
da->actions = attr; //recirc action,actions为空
da->pkt_key = *key;
}
return da;
}
3、action_fifo_put函数
/*
 * Hand out the next free slot of 'fifo' and advance its head.
 *
 * Returns NULL once head has reached DEFERRED_ACTION_FIFO_SIZE - 1.
 */
static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
{
	struct deferred_action *slot;

	if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1)
		return NULL;

	slot = &fifo->fifo[fifo->head];
	fifo->head++;

	return slot;
}
从上面可知,OVS_ACTION_ATTR_RECIRC action就是在per-CPU的action_fifos对象(通过this_cpu_ptr获取)中添加一个deferred_action。 这些actions在什么时候被使用呢? 答案是ovs_execute_actions函数。
4、ovs_execute_actions函数
/*
 * Execute the action list 'acts' against 'skb'.
 *
 * Nesting is tracked in the per-CPU exec_actions_level counter; once it
 * reaches EXEC_ACTIONS_LEVEL_LIMIT the packet is dropped with -ELOOP.
 * Deferred actions are processed only by the outermost invocation (the
 * one that saw level 0), i.e. after the regular action list has run.
 *
 * Returns the result of do_execute_actions(), or -ELOOP.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	const int depth = this_cpu_read(exec_actions_level);
	int rc;

	if (unlikely(depth >= EXEC_ACTIONS_LEVEL_LIMIT)) {
		if (net_ratelimit())
			pr_warn("%s: packet loop detected, dropping.\n",
				ovs_dp_name(dp));

		kfree_skb(skb);
		return -ELOOP;
	}

	this_cpu_inc(exec_actions_level);
	rc = do_execute_actions(dp, skb, key,
				acts->actions, acts->actions_len);

	/* Only the outermost call drains the deferred-action FIFO. */
	if (depth == 0)
		process_deferred_actions(dp);

	this_cpu_dec(exec_actions_level);

	/* This return status currently does not reflect the errors
	 * encountered during deferred actions execution. Probably needs to
	 * be fixed in the future.
	 */
	return rc;
}
5、process_deferred_actions函数
/*
 * Drain this CPU's deferred-action FIFO.
 *
 * Entries with an action list are executed with do_execute_actions();
 * entries without one (queued by recirc) are fed back into
 * ovs_dp_process_packet() with their saved key.  The FIFO is reset once
 * it has been drained.
 */
static void process_deferred_actions(struct datapath *dp)
{
	struct action_fifo *fifo = this_cpu_ptr(action_fifos);

	/* Do not touch the FIFO in case there is no deferred actions. */
	if (action_fifo_is_empty(fifo))
		return;

	/* Finishing executing all deferred actions. */
	for (;;) {
		struct deferred_action *entry = action_fifo_get(fifo);
		struct sk_buff *pkt = entry->skb;
		struct sw_flow_key *pkt_key = &entry->pkt_key;
		const struct nlattr *acts = entry->actions;

		if (!acts) {
			/* Recirculation: process the packet again using
			 * the key saved in the entry.
			 */
			ovs_dp_process_packet(pkt, pkt_key);
		} else {
			do_execute_actions(dp, pkt, pkt_key, acts,
					   nla_len(acts));
		}

		if (action_fifo_is_empty(fifo))
			break;
	}

	/* Reset FIFO for the next packet. */
	action_fifo_init(fifo);
}
OVS_ACTION_ATTR_RECIRC action提供了重复处理的功能,但是这样的功能价值是什么? 现在还没想明白。
七、OVS_ACTION_ATTR_SET_MASKED 和 OVS_ACTION_ATTR_SET_TO_MASKED
本节分析OVS_ACTION_ATTR_SET_MASKED 和 OVS_ACTION_ATTR_SET_TO_MASKED action,处理函数为execute_masked_set_action函数。
1、execute_masked_set_action函数
/*
 * Handle OVS_ACTION_ATTR_SET_MASKED / OVS_ACTION_ATTR_SET_TO_MASKED:
 * dispatch on the key attribute type of 'a' and apply the masked value
 * to the packet and to 'flow_key'.
 *
 * Returns 0 on success (also for attribute types not listed below),
 * -EINVAL for tunnel info and conntrack attributes, or the error from
 * the per-protocol setter.
 */
static int execute_masked_set_action(struct sk_buff *skb,
				     struct sw_flow_key *flow_key,
				     const struct nlattr *a)
{
	switch (nla_type(a)) {
	case OVS_KEY_ATTR_PRIORITY:
		/* Packet priority. */
		OVS_SET_MASKED(skb->priority, nla_get_u32(a),
			       *get_mask(a, u32 *));
		flow_key->phy.priority = skb->priority;
		return 0;

	case OVS_KEY_ATTR_SKB_MARK:
		/* Packet mark. */
		OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *));
		flow_key->phy.skb_mark = skb->mark;
		return 0;

	case OVS_KEY_ATTR_ETHERNET:
		/* Source / destination MAC. */
		return set_eth_addr(skb, flow_key, nla_data(a),
				    get_mask(a, struct ovs_key_ethernet *));

	case OVS_KEY_ATTR_IPV4:
		/* IPv4 source, destination, TOS and TTL. */
		return set_ipv4(skb, flow_key, nla_data(a),
				get_mask(a, struct ovs_key_ipv4 *));

	case OVS_KEY_ATTR_IPV6:
		/* IPv6 header fields. */
		return set_ipv6(skb, flow_key, nla_data(a),
				get_mask(a, struct ovs_key_ipv6 *));

	case OVS_KEY_ATTR_TCP:
		/* TCP ports. */
		return set_tcp(skb, flow_key, nla_data(a),
			       get_mask(a, struct ovs_key_tcp *));

	case OVS_KEY_ATTR_UDP:
		/* UDP ports. */
		return set_udp(skb, flow_key, nla_data(a),
			       get_mask(a, struct ovs_key_udp *));

	case OVS_KEY_ATTR_SCTP:
		return set_sctp(skb, flow_key, nla_data(a),
				get_mask(a, struct ovs_key_sctp *));

	case OVS_KEY_ATTR_MPLS:
		return set_mpls(skb, flow_key, nla_data(a),
				get_mask(a, __be32 *));

	case OVS_KEY_ATTR_TUNNEL_INFO:
		/* Masked data not supported for tunnel. */
	case OVS_KEY_ATTR_CT_STATE:
	case OVS_KEY_ATTR_CT_ZONE:
	case OVS_KEY_ATTR_CT_MARK:
	case OVS_KEY_ATTR_CT_LABELS:
		return -EINVAL;
	}

	/* Any other attribute type is accepted without doing anything. */
	return 0;
}
2、set_eth_addr函数
static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
const struct ovs_key_ethernet *key,
const struct ovs_key_ethernet *mask)
{
int err;
err = skb_ensure_writable(skb, ETH_HLEN);
if (unlikely(err))
return err;
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src,
mask->eth_src);
ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
mask->eth_dst);
ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
return 0;
}
3、set_ipv4函数
/*
 * Apply a masked update to the IPv4 header of 'skb' (source address,
 * destination address, TOS, TTL) and mirror the changes into 'flow_key'.
 * A field is only touched when its mask is non-zero.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
		    const struct ovs_key_ipv4 *key,
		    const struct ovs_key_ipv4 *mask)
{
	struct iphdr *iph;
	int rc;

	rc = skb_ensure_writable(skb, skb_network_offset(skb) +
				 sizeof(struct iphdr));
	if (unlikely(rc))
		return rc;

	iph = ip_hdr(skb);

	/* Setting an IP addresses is typically only a side effect of
	 * matching on them in the current userspace implementation, so it
	 * makes sense to check if the value actually changed.
	 */
	if (mask->ipv4_src) {
		__be32 src = OVS_MASKED(iph->saddr, key->ipv4_src,
					mask->ipv4_src);

		if (unlikely(src != iph->saddr)) {
			set_ip_addr(skb, iph, &iph->saddr, src);
			flow_key->ipv4.addr.src = src;
		}
	}

	if (mask->ipv4_dst) {
		__be32 dst = OVS_MASKED(iph->daddr, key->ipv4_dst,
					mask->ipv4_dst);

		if (unlikely(dst != iph->daddr)) {
			set_ip_addr(skb, iph, &iph->daddr, dst);
			flow_key->ipv4.addr.dst = dst;
		}
	}

	if (mask->ipv4_tos) {
		ipv4_change_dsfield(iph, ~mask->ipv4_tos, key->ipv4_tos);
		flow_key->ip.tos = iph->tos;
	}

	if (mask->ipv4_ttl) {
		set_ip_ttl(skb, iph, key->ipv4_ttl, mask->ipv4_ttl);
		flow_key->ip.ttl = iph->ttl;
	}

	return 0;
}
4、set_udp函数
/*
 * Apply a masked update to the UDP source and destination ports of 'skb'
 * and mirror the result into 'flow_key'.
 *
 * When the datagram carries a checksum that is not CHECKSUM_PARTIAL, the
 * ports are changed through set_tp_port() and only if they differ;
 * otherwise the ports are simply overwritten.  The skb hash is cleared in
 * both cases.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
		   const struct ovs_key_udp *key,
		   const struct ovs_key_udp *mask)
{
	struct udphdr *udph;
	__be16 new_src, new_dst;
	int rc;

	rc = skb_ensure_writable(skb, skb_transport_offset(skb) +
				 sizeof(struct udphdr));
	if (unlikely(rc))
		return rc;

	udph = udp_hdr(skb);

	/* Either of the masks is non-zero, so do not bother checking them. */
	new_src = OVS_MASKED(udph->source, key->udp_src, mask->udp_src);
	new_dst = OVS_MASKED(udph->dest, key->udp_dst, mask->udp_dst);

	if (!udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		/* No checksum to maintain here: plain overwrite. */
		udph->source = new_src;
		udph->dest = new_dst;
		flow_key->tp.src = new_src;
		flow_key->tp.dst = new_dst;
	} else {
		if (likely(new_src != udph->source)) {
			set_tp_port(skb, &udph->source, new_src, &udph->check);
			flow_key->tp.src = new_src;
		}
		if (likely(new_dst != udph->dest)) {
			set_tp_port(skb, &udph->dest, new_dst, &udph->check);
			flow_key->tp.dst = new_dst;
		}

		/* Never leave the checksum field at zero on this path. */
		if (unlikely(!udph->check))
			udph->check = CSUM_MANGLED_0;
	}

	skb_clear_hash(skb);

	return 0;
}
本节分析的action的作用是修改skb报文,通过key和mask两个值可以修改任意sw_flow_key结构体定义的字段。基于该框架,可以任意修改报文内容。例如arp代答等等。
八、OVS_ACTION_ATTR_SAMPLE
本节分析OVS_ACTION_ATTR_SAMPLE action的处理函数sample。
1、sample函数
/*
 * Handle OVS_ACTION_ATTR_SAMPLE.
 *
 * The nested attributes of 'attr' provide a sampling probability and a
 * nested action list.  A packet that is not selected is left alone.  For
 * a selected packet the nested actions are either executed right away
 * (the single-userspace-action special case) or queued, together with a
 * clone of the packet, as deferred actions.
 *
 * Returns the result of output_userspace() in the special case and 0
 * otherwise.
 */
static int sample(struct datapath *dp, struct sk_buff *skb,
		  struct sw_flow_key *key, const struct nlattr *attr,
		  const struct nlattr *actions, int actions_len)
{
	const struct nlattr *nested_acts = NULL;
	const struct nlattr *a = nla_data(attr);
	int rem = nla_len(attr);

	while (rem > 0) {
		int type = nla_type(a);

		if (type == OVS_SAMPLE_ATTR_PROBABILITY) {
			/* Sampling probability: skip the packet when it is
			 * zero or the random draw exceeds it.
			 */
			u32 probability = nla_get_u32(a);

			if (!probability || prandom_u32() > probability)
				return 0;
		} else if (type == OVS_SAMPLE_ATTR_ACTIONS) {
			/* Actions to apply to sampled packets. */
			nested_acts = a;
		}

		a = nla_next(a, &rem);
	}

	rem = nla_len(nested_acts);
	a = nla_data(nested_acts);

	/* Actions list is empty, do nothing */
	if (unlikely(!rem))
		return 0;

	/* The only known usage of sample action is having a single user-space
	 * action. Treat this usage as a special case.
	 * The output_userspace() should clone the skb to be sent to the
	 * user space. This skb will be consumed by its caller.
	 */
	if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
		   nla_is_last(a, rem)))
		return output_userspace(dp, skb, key, a, actions, actions_len);

	skb = skb_clone(skb, GFP_ATOMIC);
	if (!skb)
		/* Skip the sample action when out of memory. */
		return 0;

	/* Queue the clone with the nested actions; they run once the
	 * deferred-action FIFO is processed.
	 * NOTE(review): 'a' is the first nested action attribute - confirm
	 * that the deferred executor derives the intended list length.
	 */
	if (!add_deferred_actions(skb, key, a)) {
		if (net_ratelimit())
			pr_warn("%s: deferred actions limit reached, dropping sample action\n",
				ovs_dp_name(dp));

		kfree_skb(skb);
	}

	return 0;
}
output_userspace在前几篇已经分析过,会把报文上传到用户态,用户态如何处理后续分析。add_deferred_actions会把报文放在fifo数组中,在报文处理的最后时刻处理,看ovs_execute_actions函数。
2、ovs_execute_actions函数
/*
 * Execute a list of actions against 'skb'.
 *
 * Nesting is tracked in the per-CPU exec_actions_level counter; once it
 * reaches EXEC_ACTIONS_LEVEL_LIMIT the packet is dropped with -ELOOP.
 *
 * Returns the result of do_execute_actions(), or -ELOOP.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	const int depth = this_cpu_read(exec_actions_level);
	int rc;

	if (unlikely(depth >= EXEC_ACTIONS_LEVEL_LIMIT)) {
		if (net_ratelimit())
			pr_warn("%s: packet loop detected, dropping.\n",
				ovs_dp_name(dp));

		kfree_skb(skb);
		return -ELOOP;
	}

	this_cpu_inc(exec_actions_level);
	rc = do_execute_actions(dp, skb, key,
				acts->actions, acts->actions_len);

	/* When this function is entered again while actions are executing,
	 * the level read above is non-zero and the deferred actions are left
	 * for the outermost call.
	 */
	if (depth == 0)
		process_deferred_actions(dp);

	this_cpu_dec(exec_actions_level);

	/* This return status currently does not reflect the errors
	 * encountered during deferred actions execution. Probably needs to
	 * be fixed in the future.
	 */
	return rc;
}
我们再看一下process_deferred_actions函数是怎么处理的。
3、process_deferred_actions函数
/*
 * Drain this CPU's deferred-action FIFO.
 *
 * Entries that carry an action list (queued by sample) are executed with
 * do_execute_actions(); entries without one (queued by recirc) are fed
 * back into ovs_dp_process_packet().  The FIFO is reset once drained.
 */
static void process_deferred_actions(struct datapath *dp)
{
	struct action_fifo *fifo = this_cpu_ptr(action_fifos);

	/* Do not touch the FIFO in case there is no deferred actions. */
	if (action_fifo_is_empty(fifo))
		return;

	/* Finishing executing all deferred actions. */
	for (;;) {
		struct deferred_action *entry = action_fifo_get(fifo);
		struct sk_buff *pkt = entry->skb;
		struct sw_flow_key *pkt_key = &entry->pkt_key;
		const struct nlattr *acts = entry->actions;

		if (!acts) {
			/* Recirc entries: process the packet again. */
			ovs_dp_process_packet(pkt, pkt_key);
		} else {
			/* Sample entries: run the queued action list. */
			do_execute_actions(dp, pkt, pkt_key, acts,
					   nla_len(acts));
		}

		if (action_fifo_is_empty(fifo))
			break;
	}

	/* Reset FIFO for the next packet. */
	action_fifo_init(fifo);
}
sample总体提供两个功能,1)概率性地发送报文到用户态;2)两次处理报文的能力(自定义处理动作),为什么提供这个能力? 作用是什么? 希望通过进一步分析,能够回答这个问题。
原文链接:https://blog.csdn.net/one_clouder/article/details/52418570