struct vport是OVS中表示端口(虚拟设备)的结构,个人认为它非常类似于kernel里的struct net_device结构
/**
* struct vport - one port within a datapath
* @rcu: RCU callback head for deferred destruction.
* @port_no: Index into @dp's @ports array.
* @dp: Datapath to which this port belongs.
* @kobj: Represents /sys/class/net/<devname>/brport.
* @linkname: The name of the link from /sys/class/net/<datapath>/brif to this
* &struct vport. (We keep this around so that we can delete it if the
* device gets renamed.) Set to the null string when no link exists.
* @node: Element in @dp's @port_list.
* NOTE(review): struct datapath as shown in this file has no port_list
* member - confirm whether this field is still used.
* @upcall_pid: The Netlink port to use for packets received on this port that
* miss the flow table.
* @hash_node: Element in @dev_table hash table in vport.c.
* @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
* @ops: Class structure.
* @percpu_stats: Points to per-CPU statistics used and maintained by vport
* @stats_lock: Protects @err_stats and @offset_stats.
* @err_stats: Error statistics used and maintained by vport (embedded in this
* structure, not a pointer).
* @offset_stats: Added to actual statistics as a sop to compatibility with
* XAPI for Citrix XenServer. Deprecated.
*
* Type-specific private data is requested through the size argument of
* ovs_vport_alloc() and reached through accessors such as
* netdev_vport_priv().
*/
struct vport {
struct rcu_head rcu;
u16 port_no;
struct datapath *dp;
struct kobject kobj;
char linkname[IFNAMSIZ];
struct list_head node;
u32 upcall_pid;
struct hlist_node hash_node;
struct hlist_node dp_hash_node;
const struct vport_ops *ops;
struct vport_percpu_stats __percpu *percpu_stats;
spinlock_t stats_lock;
struct vport_err_stats err_stats;
struct ovs_vport_stats offset_stats;
};
和vport紧密相关的是struct datapath
/**
* struct datapath - datapath for flow-based packet switching
* @rcu: RCU callback head for deferred destruction.
* @list_node: Element in global 'dps' list.
* @ifobj: Represents /sys/class/net/<devname>/brif. Protected by RTNL.
* @table: Current flow table. Protected by genl_lock and RCU.
* @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by
* RTNL and RCU.
* @stats_percpu: Per-CPU datapath statistics.
* @net: Reference to net namespace. Only present when CONFIG_NET_NS is
* defined.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
*/
struct datapath {
struct rcu_head rcu;
struct list_head list_node;
struct kobject ifobj;
/* Flow table. */
struct flow_table __rcu *table;
/* Switch ports. */
struct hlist_head *ports;
/* Stats. */
struct dp_stats_percpu __percpu *stats_percpu;
#ifdef CONFIG_NET_NS
/* Network namespace ref. */
struct net *net;
#endif
};
我的理解是,无论vport还是datapath都是OVS用的虚拟设备,datapath中包含了多个vport,通过datapath->ports, vport->dp_hash_node的哈希表关联起来, vport->dp指向vport属于的datapath
datapath同时包含了一个flow_table
struct ovs_skb_cb用来作为sk_buff的私有结构,
/**
* struct ovs_skb_cb - OVS data in skb CB
* @flow: The flow associated with this packet. May be %NULL if no flow.
* @tun_id: ID of the tunnel that encapsulated this packet. It is 0 if the
* packet was not received on a tunnel.
* @ip_summed: Consistently stores L4 checksumming status across different
* kernel versions. Only present when NEED_CSUM_NORMALIZE is defined.
* @csum_start: Stores the offset from which to start checksumming independent
* of the transport header on all kernel versions. Only present when
* NEED_CSUM_NORMALIZE is defined.
* @vlan_tci: Provides a substitute for the skb->vlan_tci field on kernels
* before 2.6.27. Only present when NEED_VLAN_FIELD is defined.
*/
struct ovs_skb_cb {
struct sw_flow *flow;
__be64 tun_id;
#ifdef NEED_CSUM_NORMALIZE
enum csum_type ip_summed;
u16 csum_start;
#endif
#ifdef NEED_VLAN_FIELD
u16 vlan_tci;
#endif
};
/* Reinterprets the control buffer (skb->cb) of 'skb' as a struct ovs_skb_cb. */
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
vlan_tci表示了802.1q的Tag Control Identifier,包括3bits 的Priority Code Point ( aka. CoS ),1bit 的Drop Eligible,和12bits的VLAN ID
tun_id表示了这个skb在OVS中的tunnel ID
flow表示了skb所属于的流,这是个openflow的概念,sw_flow->sw_flow_key用于唯一标识一个流,sw_flow->sw_flow_actions用于记录流match了之后的行为
struct vport_ops定义了vport的行为,
/**
* struct vport_ops - definition of a type of virtual port
*
* @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
* @flags: Flags of type VPORT_F_* that influence how the generic vport layer
* handles this vport.
* @init: Called at module initialization. If VPORT_F_REQUIRED is set then the
* failure of this function will cause the module to not load. If the flag is
* not set and initialization fails then no vports of this type can be created.
* @exit: Called at module unload.
* @create: Create a new vport configured as specified. On success returns
* a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
* @destroy: Destroys a vport. Must call ovs_vport_free() on the vport but not
* before an RCU grace period has elapsed.
* @set_options: Modify the configuration of an existing vport. May be %NULL
* if modification is not supported.
* @get_options: Appends vport-specific attributes for the configuration of an
* existing vport to a &struct sk_buff. May be %NULL for a vport that does not
* have any configuration.
* @set_addr: Set the device's MAC address. May be null if not supported.
* @get_name: Get the device's name.
* @get_addr: Get the device's MAC address.
* @get_config: Get the device's configuration.
* @get_kobj: Get the kobj associated with the device (may return null).
* @get_dev_flags: Get the device's flags.
* @is_running: Checks whether the device is running.
* @get_operstate: Get the device's operating state.
* @get_ifindex: Get the system interface index associated with the device.
* May be null if the device does not have an ifindex.
* @get_mtu: Get the device's MTU. May be %NULL if the device does not have an
* MTU (as e.g. some tunnels do not). Must be implemented if @get_ifindex is
* implemented.
* @send: Send a packet on the device. Returns the length of the packet sent.
*/
struct vport_ops {
enum ovs_vport_type type;
u32 flags;
/* Called at module init and exit respectively. */
int (*init)(void);
void (*exit)(void);
/* Called with RTNL lock. */
struct vport *(*create)(const struct vport_parms *);
void (*destroy)(struct vport *);
int (*set_options)(struct vport *, struct nlattr *);
int (*get_options)(const struct vport *, struct sk_buff *);
int (*set_addr)(struct vport *, const unsigned char *);
/* Called with rcu_read_lock or RTNL lock. */
const char *(*get_name)(const struct vport *);
const unsigned char *(*get_addr)(const struct vport *);
void (*get_config)(const struct vport *, void *);
struct kobject *(*get_kobj)(const struct vport *);
unsigned (*get_dev_flags)(const struct vport *);
int (*is_running)(const struct vport *);
unsigned char (*get_operstate)(const struct vport *);
int (*get_ifindex)(const struct vport *);
int (*get_mtu)(const struct vport *);
int (*send)(struct vport *, struct sk_buff *);
};
struct vport 有private data 的数据部分,是紧跟在vport后面的一段线性数据空间,可以通过vport_priv,vport_from_priv来操作
struct vport 其实是个基类,实际应用时会有netdev_vport, internal_vport, patch_vport, gre_vport等,相应的vport_ops为ovs_netdev_vport_ops, ovs_internal_vport_ops, ovs_patch_vport_ops, ovs_gre_vport_ops
下面跟进vport-netdev设备,对于任意的net_device设备,如果要成为OVS的vport,需要把OVS的接收函数hook到net_device的包接收函数中,这样net_device的进包就不会进入常规的内核协议栈中,而是由OVS接过来处理
netdev_vport结构就是一个struct net_device*的封装
/**
* struct netdev_vport - private data of a vport backed by a network device
* @dev: The wrapped network device. netdev_create() obtains it with
* dev_get_by_name() and releases it with dev_put() on its error path.
*/
struct netdev_vport {
struct net_device *dev;
};
我们来看netdev_create函数:
/**
 * netdev_create - attach an existing network device to a datapath as a vport
 * @parms: Creation parameters; @parms->name names the device to attach.
 *
 * Allocates a vport with room for a &struct netdev_vport, looks the device
 * up by name in the datapath's network namespace, rejects loopback,
 * non-Ethernet and OVS-internal devices, registers netdev_frame_hook() as
 * the device's receive handler, switches the device to promiscuous mode,
 * disables LRO and marks the device with %IFF_OVS_DATAPATH.
 *
 * Called with RTNL lock (see the create hook in &struct vport_ops).
 *
 * Returns the new vport on success, otherwise an ERR_PTR() value: the error
 * from ovs_vport_alloc(), -ENODEV if no device has that name, -EINVAL if the
 * device is of an unsupported kind, or the error from
 * netdev_rx_handler_register().
 */
static struct vport *netdev_create(const struct vport_parms *parms)
{
	struct vport *vport;
	struct netdev_vport *netdev_vport;
	int err;

	vport = ovs_vport_alloc(sizeof(struct netdev_vport),
				&ovs_netdev_vport_ops, parms);
	if (IS_ERR(vport)) {
		err = PTR_ERR(vport);
		goto error;
	}

	netdev_vport = netdev_vport_priv(vport);

	/* On success this holds a device reference; error_put drops it. */
	netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp),
					    parms->name);
	if (!netdev_vport->dev) {
		err = -ENODEV;
		goto error_free_vport;
	}

	if (netdev_vport->dev->flags & IFF_LOOPBACK ||
	    netdev_vport->dev->type != ARPHRD_ETHER ||
	    ovs_is_internal_dev(netdev_vport->dev)) {
		err = -EINVAL;
		goto error_put;
	}

	err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
					 vport);
	if (err)
		goto error_put;

	dev_set_promiscuity(netdev_vport->dev, 1);
	/* LRO first appeared in 2.6.24, so it must be switched off on that
	 * kernel and every later one; older kernels have nothing to disable.
	 * The guard used to read "<", which compiled the call only for
	 * kernels without LRO and left it enabled everywhere it exists. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
	dev_disable_lro(netdev_vport->dev);
#endif
	netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;

	return vport;

error_put:
	dev_put(netdev_vport->dev);
error_free_vport:
	ovs_vport_free(vport);
error:
	return ERR_PTR(err);
}
首先调用 ovs_vport_alloc(sizeof(struct netdev_vport), &ovs_netdev_vport_ops, vport_parms* parms) 创建一个vport结构,后面跟一个vport_private,即一个netdev_vport结构,这两个结构都在一段连续线性内存中。
接着调用内核函数dev_get_by_name,该函数基于net_device->name_hlist(这里全局的net_device都基于name_hlist形成一个hash表),通过name查找到对应的net_device
如果发现net_device是loopback, 或不是以太网接口,或是internal vport(internal vport表示这个vport可以把包直接交给内核,可以认为是内核连到OVS的一个虚拟网口),报错返回
调用netdev_rx_handler_register把net_device->br_port 设置为 vport,这样net_device 算是连到bridge上了,最后调用dev_set_promiscuity设置混杂模式后结束
这里要注意的是,2.6.36之后的内核专门用了一个rx_handler函数指针来替代之前内核版本中的br_handle_frame_hook,用于接收传到bridge上的包。本人用的是redhat发布的2.6.32的内核版本,因此netdev_rx_handler_register里rx_handler = netdev_frame_hook这一步被省去了,取而代之的是在netdev_init中进行了br_handle_frame_hook的初始化
/* Module-init hook of the netdev vport type: installs netdev_frame_hook() as
 * the bridge frame hook so that frames the bridge code would intercept are
 * handed to OVS instead.  Always returns 0. */
static int netdev_init(void)
{
	br_handle_frame_hook = netdev_frame_hook;

	return 0;
}
netdev_frame_hook实际上是调用了netdev_port_receive:
/*
 * Receive path of a netdev vport: prepares 'skb' and passes it to the
 * generic vport layer.  A packet arriving without a vport, or whose checksum
 * state cannot be normalized, is freed and dropped.
 *
 * Must be called with rcu_read_lock.
 */
static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
{
	if (unlikely(!vport))
		goto drop;

	/* Work on a private copy so that earlier consumers of the packet
	 * (e.g. tcpdump via AF_PACKET) do not see our modifications.  Nobody
	 * runs after us because handle_bridge() is told the packet was
	 * taken. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return;

	/* Extend the data area by ETH_HLEN bytes at the front. */
	skb_push(skb, ETH_HLEN);

	if (unlikely(compute_ip_summed(skb, false)))
		goto drop;

	vlan_copy_skb_tci(skb);
	ovs_vport_receive(vport, skb);
	return;

drop:
	kfree_skb(skb);
}
可以看出netdev_port_receive实际是调用了ovs_vport_receive:
void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
{
struct vport_percpu_stats *stats;
stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
u64_stats_update_begin(&stats->sync);
stats->rx_packets++;
stats->rx_bytes += skb->len;
u64_stats_update_end(&stats->sync);
if (!(vport->ops->flags & VPORT_F_FLOW))
OVS_CB(skb)->flow = NULL;
if (!(vport->ops->flags & VPORT_F_TUN_ID))
OVS_CB(skb)->tun_id = 0;
ovs_dp_process_received_packet(vport, skb);
}
ovs_vport_receive首先更新了接收统计信息,然后根据vport->ops->flags对ovs_skb_cb->flow和ovs_skb_cb->tun_id做相应的清零(没有VPORT_F_FLOW标志时flow置为NULL,没有VPORT_F_TUN_ID标志时tun_id置为0),接着调用ovs_dp_process_received_packet
/*
 * Datapath entry point for a packet received on port 'p'.
 *
 * If the packet does not already carry a flow, a key is extracted from it
 * and looked up in the datapath's flow table.  A hit executes the flow's
 * actions and counts as n_hit; a miss sends the packet up with
 * OVS_PACKET_CMD_MISS to the port's upcall_pid and counts as n_missed.  A
 * packet whose key cannot be extracted is freed without touching either
 * counter.
 *
 * Must be called with rcu_read_lock.
 */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{
	struct datapath *dp = p->dp;
	struct dp_stats_percpu *stats;
	struct sw_flow *flow;
	u64 *stats_counter;
	int error;

	stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

	if (!OVS_CB(skb)->flow) {
		struct sw_flow_key key;
		int key_len;

		/* Build the lookup key from the packet and its input port. */
		error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
		if (unlikely(error)) {
			kfree_skb(skb);
			return;
		}

		flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table),
					   &key, key_len);
		if (likely(flow)) {
			OVS_CB(skb)->flow = flow;
		} else {
			struct dp_upcall_info upcall;

			/* No matching flow: report the miss via upcall. */
			upcall.cmd = OVS_PACKET_CMD_MISS;
			upcall.key = &key;
			upcall.userdata = NULL;
			upcall.pid = p->upcall_pid;
			ovs_dp_upcall(dp, skb, &upcall);
			consume_skb(skb);
			stats_counter = &stats->n_missed;
			goto out;
		}
	}

	stats_counter = &stats->n_hit;
	ovs_flow_used(OVS_CB(skb)->flow, skb);
	ovs_execute_actions(dp, skb);

out:
	/* Bump whichever datapath counter was selected above. */
	u64_stats_update_begin(&stats->sync);
	(*stats_counter)++;
	u64_stats_update_end(&stats->sync);
}
函数里很大一部分是和skb所属的flow相关,我们知道OVS是遵守OpenFlow规范的,所以OVS和bridge很大一块不同就是OVS在处理skb的时候有一个flow的概念在里面。首先会调用ovs_flow_extract基于skb算出一个key出来,之后调用ovs_flow_tbl_lookup查找这个flow;如果查不到(miss),就通过ovs_dp_upcall把包以OVS_PACKET_CMD_MISS上送给vport->upcall_pid对应的Netlink端口;如果查到了,最后调用ovs_execute_actions执行这个flow的动作
ovs_execute_actions会调用do_execute_actions,后者一般情况下,会找出out_port出来,然后调用do_output把skb从out_port口发送出去,发送函数为ovs_vport_send
ovs_vport_send:
/**
 * ovs_vport_send - send a packet on a device
 *
 * @vport: vport on which to send the packet
 * @skb: skb to send
 *
 * Passes the packet to the send hook of the vport's type and, when that hook
 * reports a non-zero length, adds the packet to the per-CPU transmit
 * statistics.  Returns the length reported by the hook.  Either RTNL lock or
 * rcu_read_lock must be held.
 */
int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	struct vport_percpu_stats *cpu_stats;
	int sent;

	sent = vport->ops->send(vport, skb);
	if (unlikely(!sent))
		return 0;

	cpu_stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());

	u64_stats_update_begin(&cpu_stats->sync);
	cpu_stats->tx_packets++;
	cpu_stats->tx_bytes += sent;
	u64_stats_update_end(&cpu_stats->sync);

	return sent;
}
还有像ovs_vport_init, ovs_vport_exit, ovs_vport_destroy, ovs_vport_set_xxxx, ovs_vport_get_xxxx 这些函数就不一一介绍了,请自己阅读datapath/vport.c的源码
最后来看下vport-netdev的特殊实现,可以看出vport只是一个基类,而vport根据设备的不同而不同,netdev vport应该是最普遍的场景,前面我们看了接收的流程,对于发送,vport->ops->send会调用到netdev_send函数
/*
 * Send hook of the netdev vport type: transmits 'skb' on the network device
 * wrapped by 'vport'.
 *
 * Returns the number of bytes handed to dev_queue_xmit() (summed over all
 * segments when the packet is segmented here), or 0 when nothing was queued.
 * The over-MTU drop goes through the error label, which frees the packet and
 * records VPORT_E_TX_DROPPED; the other failure paths return 0 directly.
 */
static int netdev_send(struct vport *vport, struct sk_buff *skb)
{
struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
int mtu = netdev_vport->dev->mtu;
int len;
/* Drop packets longer than the device MTU unless they are GSO packets;
 * the warning is rate limited. */
if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
if (net_ratelimit())
pr_warn("%s: dropped over-mtu packet: %d > %d\n",
ovs_dp_name(vport->dp), packet_length(skb), mtu);
goto error;
}
这段代码发现如果skb->len大过了MTU,同时skb又不允许gso,那么直接丢弃
下面一大段是和vlan相关的操作,如果没有vlan的话,那就直接通过dev_queue_xmit把skb发送出去
skb->dev = netdev_vport->dev;
forward_ip_summed(skb, true);
/* The packet carries a VLAN tag but the device cannot accept it out of
 * band (dev_supports_vlan_tx() is false), so the tag is inserted into the
 * packet data here with __vlan_put_tag(). */
if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) {
int features;
features = netif_skb_features(skb);
/* With vlan_tso unset, the segmentation offload features are masked out
 * for tagged packets. */
if (!vlan_tso)
features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
NETIF_F_UFO | NETIF_F_FSO);
if (netif_needs_gso(skb, features)) {
struct sk_buff *nskb;
nskb = skb_gso_segment(skb, features);
/* NULL result: no segment list was produced, so the original skb is
 * tagged and sent as one packet (after un-sharing it if it is cloned). */
if (!nskb) {
if (unlikely(skb_cloned(skb) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) {
kfree_skb(skb);
return 0;
}
skb_shinfo(skb)->gso_type &= ~SKB_GSO_DODGY;
goto tag;
}
/* Segmentation failed: drop the packet. */
if (IS_ERR(nskb)) {
kfree_skb(skb);
return 0;
}
/* Segmentation succeeded: release the original and walk the segment
 * list, tagging and queueing each segment on its own. */
consume_skb(skb);
skb = nskb;
len = 0;
do {
nskb = skb->next;
skb->next = NULL;
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
/* A NULL result means this segment is skipped and not counted in len.
 * NOTE(review): presumably __vlan_put_tag() frees the skb on failure -
 * confirm. */
if (likely(skb)) {
len += skb->len;
vlan_set_tci(skb, 0);
dev_queue_xmit(skb);
}
skb = nskb;
} while (skb);
return len;
}
如果设备不支持tso/gso,那么需要调用skb_gso_segment在内核里进行分段,如果分段成功会返回一个skb的list,然后对list里每一个skb,调用dev_queue_xmit发送出去
tag:
/* Insert the tag into the packet data and clear the out-of-band TCI. */
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
if (unlikely(!skb))
return 0;
vlan_set_tci(skb, 0);
}
下面是vlan无关部分,直接通过dev_queue_xmit发送skb
/* Read the length before queueing; skb is not touched afterwards. */
len = skb->len;
dev_queue_xmit(skb);
return len;
error:
/* Drop path: free the packet and record the drop for this vport. */
kfree_skb(skb);
ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
return 0;
}
开发者可以去参考datapath/README文档