Linux 内核下Bridge配置以及报文处理流程实现

背景       

        作为网络工程师,对报文的二三层转发过程应该比较熟悉,但对于常见的bridge接口转发、vlan报文转发在Linux内核中是如何实现的,却未必清楚。希望就此问题能和大家一起了解学习,这里先从概念、配置、报文处理几个方面对Bridge进行了解学习。

概念

        Bridge是一种虚拟网络设备,工作在数据链路层(二层),用于连接两个或多个网络接口,实现不同网络之间的通信。

配置

 添加网桥设备

       命令:brctl addbr br0(网桥接口名),通过strace命令跟踪系统调用过程如下:

root@ubuntu:~# strace brctl addbr br0
execve("/sbin/brctl", ["brctl", "addbr", "br0"], 0x7ffee6d50f30 /* 22 vars */) = 0
brk(NULL)                               = 0x562cf2203000
.....
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
ioctl(3, SIOCBRADDBR, "br0")            = 0
exit_group(0)                           = ?
+++ exited with 0 +++

          由此可见brctl命令通过ioctl与内核通信,addbr具体命令对应为SIOCBRADDBR,网桥名“br0”作为ioctl参数下发到内核。

添加网桥接口

        命令: brctl addif br0 veth0_br,系统调用过程如下:

root@ubuntu:~# strace brctl addif br0 veth0_br
execve("/sbin/brctl", ["brctl", "addif", "br0", "veth0_br"], 0x7fffa6b16108 /* 22 vars */) = 0
.....
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="veth0_br", }) = 0
close(4)                                = 0
ioctl(3, SIOCBRADDIF)                   = 0
exit_group(0)                           = ?
+++ exited with 0 +++

        addif具体命令对应为SIOCBRADDIF。

内核实现

        在linux 内核内部,Bridge的相关实现以模块的形式插入内核,模块初始化接口为br_init,br_init初始化过程中会调用brioctl_set设置Bridge ioctl命令的相关处理函数,具体实现如下:

/* Called from br_init: installs br_ioctl_stub as the handler for bridge ioctl commands. */
brioctl_set(br_ioctl_stub);

/*
 * br_ioctl_stub - dispatch a bridge ioctl command.
 * @net:  network namespace the request applies to
 * @br:   bridge targeted by the add/delete-interface commands
 * @cmd:  ioctl command number
 * @ifr:  interface request; ifr_ifindex is read for SIOCBRADDIF/SIOCBRDELIF
 * @uarg: user-space argument (the bridge name for SIOCBRADDBR/SIOCBRDELBR)
 *
 * The whole dispatch runs between rtnl_lock() and rtnl_unlock().
 *
 * Returns the result of the selected handler, -EPERM when the caller lacks
 * CAP_NET_ADMIN for add/delete bridge, -EFAULT when the name cannot be
 * copied from user space, or -EOPNOTSUPP for any other command.
 */
int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd,
		  struct ifreq *ifr, void __user *uarg)
{
	char ifname[IFNAMSIZ];
	int rc;

	rtnl_lock();

	if (cmd == SIOCGIFBR || cmd == SIOCSIFBR) {
		rc = old_deviceless(net, uarg);
	} else if (cmd == SIOCBRADDBR || cmd == SIOCBRDELBR) {
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
			rc = -EPERM;
		} else if (copy_from_user(ifname, uarg, IFNAMSIZ)) {
			rc = -EFAULT;
		} else {
			/* Force termination: user space may pass an unterminated name. */
			ifname[IFNAMSIZ - 1] = 0;

			if (cmd == SIOCBRADDBR)
				rc = br_add_bridge(net, ifname);	/* create the bridge device */
			else
				rc = br_del_bridge(net, ifname);	/* remove the bridge device */
		}
	} else if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) {
		/* Add or remove a port; the third argument selects "add". */
		rc = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF);
	} else {
		rc = -EOPNOTSUPP;
	}

	rtnl_unlock();

	return rc;
}

        addbr 具体处理函数为:br_add_bridge()

int br_add_bridge(struct net *net, const char *name)
{
	struct net_device *dev;
	int res;

	/* 基于接口名参数,分配创建netdev,并初始化 */
	dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
			   br_dev_setup);

	if (!dev)
		return -ENOMEM;

	/* 设置dev 所属网络命名空间,网络命名空间实现不同系统间隔离 */
	dev_net_set(dev, net);
	/* 设置dev 对应的netlink消息处理函数 */
	dev->rtnl_link_ops = &br_link_ops;


	/* 注册网络设备 */
	res = register_netdevice(dev);
	if (res)
		free_netdev(dev);
	return res;
}

        addif处理函数这里做了一层封装, 具体实现如下:

/*
 * add_del_if - add or remove the device with index @ifindex as a port of @br.
 * @br:      bridge to modify
 * @ifindex: interface index of the port device, looked up in @br's namespace
 * @isadd:   non-zero to add the port, zero to remove it
 *
 * Returns -EPERM without CAP_NET_ADMIN, -EINVAL when no device has that
 * index, otherwise the result of br_add_if() / br_del_if().
 */
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
	struct net *ns = dev_net(br->dev);
	struct net_device *port_dev;

	if (!ns_capable(ns->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	port_dev = __dev_get_by_index(ns, ifindex);
	if (!port_dev)
		return -EINVAL;

	return isadd ? br_add_if(br, port_dev, NULL)
		     : br_del_if(br, port_dev);
}

        addif 添加网络接口的实际处理函数为br_add_if();添加网络接口的实质是分配并初始化一个net_bridge_port的实例,将其添加到网桥设备的端口记录中。 

/*
 * br_add_if - attach network device @dev to bridge @br as a new port.
 * @br:     bridge receiving the port
 * @dev:    device to enslave
 * @extack: netlink extended-ack used for error messages (may be NULL; the
 *          ioctl path shown in add_del_if() passes NULL)
 *
 * Rejects devices that cannot be bridged, allocates a net_bridge_port with
 * new_nbp(), registers the bridge receive handler on @dev, links @dev under
 * br->dev and inserts the port into br->port_list.
 *
 * Returns 0 on success or a negative errno.
 *
 * NOTE: this listing is an excerpt. Lines consisting only of dots mark code
 * that was omitted, including the err2..err6 unwind labels targeted by the
 * gotos below.
 */
int br_add_if(struct net_bridge *br, struct net_device *dev,
	      struct netlink_ext_ack *extack)
{
	struct net_bridge_port *p;
	int err = 0;
	unsigned br_hr, dev_hr;
	bool changed_addr, fdb_synced = false;

	/* Don't allow bridging non-ethernet like devices. */
	/* Loopback devices and non-Ethernet devices are refused. */
	if ((dev->flags & IFF_LOOPBACK) ||
	    dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
	    !is_valid_ether_addr(dev->dev_addr))
		return -EINVAL;

	/* No bridging of bridges */
	/* A device whose xmit routine is br_dev_xmit is itself a bridge: refuse it. */
	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) {
		NL_SET_ERR_MSG(extack,
			       "Can not enslave a bridge to a bridge");
		return -ELOOP;
	}

	/* Device has master upper dev */
	/* A device that already has a master upper device is refused. */
	if (netdev_master_upper_dev_get(dev))
		return -EBUSY;

	/* No bridging devices that dislike that (e.g. wireless) */
	/* Devices flagged IFF_DONT_BRIDGE have opted out of bridging. */
	if (dev->priv_flags & IFF_DONT_BRIDGE) {
		NL_SET_ERR_MSG(extack,
			       "Device does not allow enslaving to a bridge");
		return -EOPNOTSUPP;
	}

	/* Allocate and initialise a new net_bridge_port for this device. */
	p = new_nbp(br, dev);
	if (IS_ERR(p))
		return PTR_ERR(p);

	.......

	/* Add the port to sysfs. */
	err = br_sysfs_addif(p);
	if (err)
		goto err2;

	err = br_netpoll_enable(p);
	if (err)
		goto err3;

	/*
	 * Install the bridge receive handler on the port device, with the
	 * net_bridge_port as its private data.
	 */
	err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p);
	if (err)
		goto err4;

	/* Mark the device as being a bridge port. */
	dev->priv_flags |= IFF_BRIDGE_PORT;

	err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack);
	if (err)
		goto err5;

	dev_disable_lro(dev);

	/* Insert the new net_bridge_port into the bridge's port list. */
	list_add_rcu(&p->list, &br->port_list);

	nbp_update_port_count(br);
	if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) {
		/* When updating the port count we also update all ports'
		 * promiscuous mode.
		 * A port leaving promiscuous mode normally gets the bridge's
		 * fdb synced to the unicast filter (if supported), however,
		 * `br_port_clear_promisc` does not distinguish between
		 * non-promiscuous ports and *new* ports, so we need to
		 * sync explicitly here.
		 */
		fdb_synced = br_fdb_sync_static(br, p) == 0;
		if (!fdb_synced)
			netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n");
	}

	netdev_update_features(br->dev);

	/*
	 * Reconcile headroom: raise the bridge's if the new port needs more,
	 * otherwise apply the bridge's value to the port.
	 */
	br_hr = br->dev->needed_headroom;
	dev_hr = netdev_get_fwd_headroom(dev);
	if (br_hr < dev_hr)
		update_headroom(br, dev_hr);
	else
		netdev_set_rx_headroom(dev, br_hr);

	/* Record the port's own MAC as a local fdb entry; failure is only logged. */
	if (br_fdb_add_local(br, p, dev->dev_addr, 0))
		netdev_err(dev, "failed insert local address bridge forwarding table\n");

	if (br->dev->addr_assign_type != NET_ADDR_SET) {
		/* Ask for permission to use this MAC address now, even if we
		 * don't end up choosing it below.
		 */
		err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
		if (err)
			goto err6;
	}

	err = nbp_vlan_init(p, extack);
	if (err) {
		netdev_err(dev, "failed to initialize vlan filtering on this port\n");
		goto err6;
	}

	/* Under br->lock: recompute the bridge id and enable the port if both ends are up. */
	spin_lock_bh(&br->lock);
	changed_addr = br_stp_recalculate_bridge_id(br);

	if (netif_running(dev) && netif_oper_up(dev) &&
	    (br->dev->flags & IFF_UP))
		br_stp_enable_port(p);
	spin_unlock_bh(&br->lock);

	br_ifinfo_notify(RTM_NEWLINK, NULL, p);

	/* Notify listeners when recomputing the bridge id reported a change. */
	if (changed_addr)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);

	br_mtu_auto_adjust(br);
	br_set_gso_limits(br);

	kobject_uevent(&p->kobj, KOBJ_ADD);

	return 0;

.....
	return err;
}

        删除网桥设备、网桥接口与添加为逆操作,在此不再赘述。

 报文处理

 拓扑

         常见的桥接转发的网络拓扑如图所示:

        DUT(设备)有两个物理口eth0、eth1,分别与PC1、PC2相连,eth0、eth1同时加入到网桥接口 Br0,以PC1发送报文到PC2为例,介绍网桥设备报文的接收发送处理流程。

        1)PC1发送报文,报文到达eth0,

        2)网卡eth0检测到报文到达,通过DMA技术将报文存放到网卡对应的缓冲区(ring_buffer),产生硬件中断通告CPU报文到来

        3)CPU硬件中断处理,将报文从ring_buffer中拷贝到内核报文缓冲区skb_buffer中,并放入报文接收队列中,触发收包软中断,

        4)软中断处理将报文从接收队列移入处理队列,

        5)调用__netif_receive_skb(skb)对报文进行处理。

        这里重点介绍__netif_receive_skb的后续处理过程

        __netif_receive_skb核心处理逻辑对应__netif_receive_skb_core,

__netif_receive_skb_core

具体实现如下:

/*
 * __netif_receive_skb_core - core receive-path processing for one skb.
 * @pskb:       in/out pointer to the skb being received
 * @pfmemalloc: when true, the tap delivery loops are skipped and only
 *              protocols accepted by skb_pfmemalloc_protocol() continue
 * @ppt_prev:   not referenced in the portion shown here
 *
 * Resets the skb header offsets, optionally runs generic XDP, strips a VLAN
 * tag, delivers the skb to the ptype_all taps, runs the ingress hooks and
 * finally calls the receiving device's rx_handler (for a bridge port this is
 * the handler installed by br_add_if()).
 *
 * NOTE: this listing is an excerpt. The line of dots near the end marks
 * omitted code, which includes the `out` and `drop` labels targeted below.
 */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
				    struct packet_type **ppt_prev)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct sk_buff *skb = *pskb;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);

	trace_netif_receive_skb(skb);

	/* Remember the device the skb arrived on; skb->dev may change on later rounds. */
	orig_dev = skb->dev;

	/* Reset the network-header offset. */
	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		/* Reset the transport-header offset if nobody has set it yet. */
		skb_reset_transport_header(skb);
	/* Reset the recorded MAC header length. */
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	/* Generic XDP: any verdict other than XDP_PASS ends processing here. */
	if (static_branch_unlikely(&generic_xdp_needed_key)) {
		int ret2;

		migrate_disable();
		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
		migrate_enable();

		if (ret2 != XDP_PASS) {
			ret = NET_RX_DROP;
			goto out;
		}
	}

	/* If skb->protocol is a VLAN ethertype, strip the tag from the frame. */
	if (eth_type_vlan(skb->protocol)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

	if (skb_skip_tc_classify(skb))
		goto skip_classify;

	if (pfmemalloc)
		goto skip_taps;

	/*
	 * Deliver to every handler on the global ptype_all list and then on
	 * the device's own ptype_all list. Delivery lags one entry behind
	 * (pt_prev), so the last handler found is still pending afterwards.
	 */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_branch_unlikely(&ingress_needed_key)) {
		bool another = false;

		nf_skip_egress(skb, true);
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
					 &another);
		/* The ingress handler may request a fresh pass or take the skb. */
		if (another)
			goto another_round;
		if (!skb)
			goto out;

		nf_skip_egress(skb, false);
		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
	skb_reset_redirect(skb);
skip_classify:
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	/* VLAN-tagged skb: flush the pending handler, then let VLAN receive run. */
	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}
	
	/*
	 * Fetch the receive handler installed on the device. For a bridge
	 * port it is registered when the port is added (br_add_if) and is
	 * br_handle_frame.
	 */
	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			/* The handler took the skb; nothing more to do here. */
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			/* The handler asks for the skb to be processed again from the top. */
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			break;
		case RX_HANDLER_PASS:
			/* Continue with normal processing below. */
			break;
		default:
			BUG();
		}
	}

	......
	return ret;
}

        该函数的作用:一方面当报文为某种特定协议报文时,将报文发往对应协议层处理;另一方面调用网络设备的接收处理函数对报文进行处理(每个网络设备只能注册一个报文接收处理函数)。对于网桥下的设备有如下函数调用关系:

br_add_if接口内部有如下对应实现:

/* Excerpt from br_add_if: install the handler chosen by br_get_rx_handler() on dev, with port p as its data. */
err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p);

根据函数实现进一步展开:
/*
 * br_get_rx_handler - choose the receive handler for a bridge port device.
 * @dev: the port device
 *
 * Returns br_handle_frame_dummy when netdev_uses_dsa() is true for @dev,
 * and br_handle_frame otherwise.
 */
rx_handler_func_t *br_get_rx_handler(const struct net_device *dev)
{
	return netdev_uses_dsa(dev) ? br_handle_frame_dummy : br_handle_frame;
}

/*
 * netdev_rx_handler_register - install a receive handler on a network device.
 * @dev:             device to install the handler on
 * @rx_handler:      the receive handler function
 * @rx_handler_data: private data stored alongside the handler (for a bridge
 *                   port this is the net_bridge_port passed by br_add_if)
 *
 * Returns 0 on success, -EBUSY when netdev_is_rx_handler_busy() reports the
 * device as busy, or -EINVAL when the device is flagged IFF_NO_RX_HANDLER.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	int err = 0;

	if (netdev_is_rx_handler_busy(dev)) {
		err = -EBUSY;
	} else if (dev->priv_flags & IFF_NO_RX_HANDLER) {
		err = -EINVAL;
	} else {
		/* Note: rx_handler_data must be set before rx_handler */
		rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
		rcu_assign_pointer(dev->rx_handler, rx_handler);
	}

	return err;
}

        由此可见网桥接口接收报文对应的处理函数为:br_handle_frame;

br_handle_frame

/*
 * br_handle_frame - receive handler installed on bridge port devices.
 * @pskb: in/out pointer to the received skb
 *
 * Filters out frames the bridge must not forward, gives link-local
 * (01:80:c2:00:00:0X) destinations their special treatment and hands
 * everything else to nf_hook_bridge_pre() when the port state allows it.
 *
 * Returns RX_HANDLER_PASS when the caller should keep processing the skb,
 * RX_HANDLER_CONSUMED when the bridge has taken or freed it, or whatever
 * nf_hook_bridge_pre() returns on the forwarding path.
 */
static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
	struct net_bridge_port *p;	/* bridge port the frame arrived on */
	struct sk_buff *skb = *pskb;
	const unsigned char *dest = eth_hdr(skb)->h_dest;

	/* Loopback packets are not bridged; let the caller carry on. */
	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;

	/* Drop frames whose source MAC is not a valid Ethernet address. */
	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
		goto drop;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return RX_HANDLER_CONSUMED;

	/* Start from a zeroed bridge control block. */
	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));

	/* Look up the net_bridge_port that belongs to the receiving device. */
	p = br_port_get_rcu(skb->dev);
	if (p->flags & BR_VLAN_TUNNEL)
		br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p));

	/*
	 * Destination is a link-local reserved address (01:80:c2:00:00:0X,
	 * see the table below). These are used by special protocols;
	 * ordinary frames skip this branch.
	 */
	if (unlikely(is_link_local_ether_addr(dest))) {
		u16 fwd_mask = p->br->group_fwd_mask_required;

		/*
		 * See IEEE 802.1D Table 7-10 Reserved addresses
		 *
		 * Assignment		 		Value
		 * Bridge Group Address		01-80-C2-00-00-00
		 * (MAC Control) 802.3		01-80-C2-00-00-01
		 * (Link Aggregation) 802.3	01-80-C2-00-00-02
		 * 802.1X PAE address		01-80-C2-00-00-03
		 *
		 * 802.1AB LLDP 		01-80-C2-00-00-0E
		 *
		 * Others reserved for future standardization
		 */
		fwd_mask |= p->group_fwd_mask;
		switch (dest[5]) {
		case 0x00:	/* Bridge Group Address */
			/* If STP is turned off,
			   then must forward to keep loop detection */
			if (p->br->stp_enabled == BR_NO_STP ||
			    fwd_mask & (1u << dest[5]))
				goto forward;
			*pskb = skb;
			__br_handle_local_finish(skb);
			return RX_HANDLER_PASS;

		case 0x01:	/* IEEE MAC (Pause) */
			goto drop;

		case 0x0E:	/* 802.1AB LLDP */
			fwd_mask |= p->br->group_fwd_mask;
			if (fwd_mask & (1u << dest[5]))
				goto forward;
			*pskb = skb;
			__br_handle_local_finish(skb);
			return RX_HANDLER_PASS;

		default:
			/* Allow selective forwarding for most other protocols */
			fwd_mask |= p->br->group_fwd_mask;
			if (fwd_mask & (1u << dest[5]))
				goto forward;
		}

		/* The else clause should be hit when nf_hook():
		 *   - returns < 0 (drop/error)
		 *   - returns = 0 (stolen/nf_queue)
		 * Thus return 1 from the okfn() to signal the skb is ok to pass
		 */
		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
			    dev_net(skb->dev), NULL, skb, skb->dev, NULL,
			    br_handle_local_finish) == 1) {
			return RX_HANDLER_PASS;
		} else {
			return RX_HANDLER_CONSUMED;
		}
	}

	/*
	 * Special handling for particular frame types (the article cites MRP
	 * and CFM as examples). A zero return means "not handled" and
	 * processing continues below.
	 */
	if (unlikely(br_process_frame_type(p, skb)))
		return RX_HANDLER_PASS;

forward:
	/* With MST enabled the per-port state check below is bypassed. */
	if (br_mst_is_enabled(p->br))
		goto defer_stp_filtering;

	switch (p->state) {
	case BR_STATE_FORWARDING:
	case BR_STATE_LEARNING:
	/* Main path: mark frames addressed to the bridge itself, then run the pre-routing hooks. */
defer_stp_filtering:
		if (ether_addr_equal(p->br->dev->dev_addr, dest))
			skb->pkt_type = PACKET_HOST;

		return nf_hook_bridge_pre(skb, pskb);
	default:
		/* Any other port state, and every "goto drop" above: free the skb. */
drop:
		kfree_skb(skb);
	}
	return RX_HANDLER_CONSUMED;
}

        核心处理为nf_hook_bridge_pre()

nf_hook_bridge_pre

        nf_hook_bridge_pre用于依次执行在NF_BR_PRE_ROUTING挂载点上注册的Bridge Netfilter钩子函数,并在钩子全部放行后调用br_handle_frame_finish,具体实现为:

/*
 * nf_hook_bridge_pre - run the bridge NF_BR_PRE_ROUTING hooks on @skb and,
 * unless a hook takes the skb, continue with br_handle_frame_finish().
 * @skb:  frame being bridged
 * @pskb: caller's skb pointer, updated before returning RX_HANDLER_PASS
 *
 * Returns RX_HANDLER_PASS when a hook accepts the skb with
 * br_netfilter_broute set, RX_HANDLER_CONSUMED in every other case.
 *
 * NOTE: this listing is an excerpt. The line of dots stands for omitted
 * code, which includes the declarations of e, net, state, i, verdict
 * and ret.
 */
static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb)
{
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
	......
	/* No hooks registered at NF_BR_PRE_ROUTING: go straight to the finish function. */
	e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]);
	if (!e)
		goto frame_finish;
	/*
	 * Set up the hook state for NF_BR_PRE_ROUTING / NFPROTO_BRIDGE;
	 * br_handle_frame_finish is passed as the last argument.
	 */
	nf_hook_state_init(&state, NF_BR_PRE_ROUTING,
			   NFPROTO_BRIDGE, skb->dev, NULL, NULL,
			   net, br_handle_frame_finish);

	/* Call each hook registered at NF_BR_PRE_ROUTING in turn. */
	for (i = 0; i < e->num_hook_entries; i++) {
		verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state);
		/* Act on the verdict returned by the hook. */
		switch (verdict & NF_VERDICT_MASK) {
		case NF_ACCEPT:
			/* Accepted with br_netfilter_broute set: hand the skb back to the caller. */
			if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) {
				*pskb = skb;
				return RX_HANDLER_PASS;
			}
			break;
		case NF_DROP:
			kfree_skb(skb);
			return RX_HANDLER_CONSUMED;
		case NF_QUEUE:
			/* A return of 1 from nf_queue() moves on to the next hook. */
			ret = nf_queue(skb, &state, i, verdict);
			if (ret == 1)
				continue;
			return RX_HANDLER_CONSUMED;
		default: /* STOLEN */
			return RX_HANDLER_CONSUMED;
		}
	}
frame_finish:
	net = dev_net(skb->dev);
	br_handle_frame_finish(net, NULL, skb);
#else
	br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
#endif
	return RX_HANDLER_CONSUMED;
}

        下一步处理为:br_handle_frame_finish

br_handle_frame_finish 

/*
 * br_handle_frame_finish - learn the source address and decide where the
 * frame goes: up the local stack, out of one port, or flooded.
 * @net: network namespace (not referenced in the portion shown)
 * @sk:  socket (not referenced in the portion shown)
 * @skb: frame received on a bridge port
 *
 * Returns 0, or the result of br_pass_frame_up() when the frame is
 * delivered locally. Frames that cannot be handled are freed.
 *
 * NOTE: this listing is an excerpt. The line of dots after the port-state
 * check marks omitted code; vid and vlan are only declared in what is shown.
 */
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
	enum br_pkt_type pkt_type = BR_PKT_UNICAST;
	struct net_bridge_fdb_entry *dst = NULL;
	struct net_bridge_mcast_port *pmctx;
	struct net_bridge_mdb_entry *mdst;
	bool local_rcv, mcast_hit = false;
	struct net_bridge_mcast *brmctx;
	struct net_bridge_vlan *vlan;
	struct net_bridge *br;
	u16 vid = 0;
	u8 state;

	/* The receiving device has no bridge port: drop. */
	if (!p)
		goto drop;

	br = p->br;

	/* With MST enabled the state is taken as forwarding; otherwise disabled ports drop. */
	if (br_mst_is_enabled(br)) {
		state = BR_STATE_FORWARDING;
	} else {
		if (p->state == BR_STATE_DISABLED)
			goto drop;

		state = p->state;
	}

	.......

	/* insert into forwarding database after filtering to avoid spoofing */
	if (p->flags & BR_LEARNING)
		/*
		 * Update the forwarding table with the source MAC -> port
		 * mapping, so frames later addressed to that MAC leave
		 * through this port.
		 */
		br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0);

	local_rcv = !!(br->dev->flags & IFF_PROMISC);
	/* Classify by destination MAC: broadcast, multicast, or unicast (the default). */
	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
		/* by definition the broadcast is also a multicast address */
		if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) {
			pkt_type = BR_PKT_BROADCAST;
			local_rcv = true;
		} else {
			pkt_type = BR_PKT_MULTICAST;
			if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid))
				goto drop;
		}
	}

	/* A port in the learning state has updated the fdb above but does not forward. */
	if (state == BR_STATE_LEARNING)
		goto drop;

	BR_INPUT_SKB_CB(skb)->brdev = br->dev;
	BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);

	/* ARP/RARP frames, or IPv6 ICMPv6 frames with neighbour suppression enabled. */
	if (IS_ENABLED(CONFIG_INET) &&
	    (skb->protocol == htons(ETH_P_ARP) ||
	     skb->protocol == htons(ETH_P_RARP))) {
		br_do_proxy_suppress_arp(skb, br, vid, p);
	} else if (IS_ENABLED(CONFIG_IPV6) &&
		   skb->protocol == htons(ETH_P_IPV6) &&
		   br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
		   pskb_may_pull(skb, sizeof(struct ipv6hdr) +
				 sizeof(struct nd_msg)) &&
		   ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *msg, _msg;

			msg = br_is_nd_neigh_msg(skb, &_msg);
			if (msg)
				br_do_suppress_nd(skb, br, vid, p, msg);
	}

	switch (pkt_type) {
	case BR_PKT_MULTICAST:
		/* Look up the multicast group entry for this frame. */
		mdst = br_mdb_get(brmctx, skb, vid);
		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
		    br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) {
			if ((mdst && mdst->host_joined) ||
			    br_multicast_is_router(brmctx, skb)) {
				local_rcv = true;
				br->dev->stats.multicast++;
			}
			mcast_hit = true;
		} else {
			local_rcv = true;
			br->dev->stats.multicast++;
		}
		break;
	case BR_PKT_UNICAST:
		/* Unicast: look up the forwarding entry for the destination MAC. */
		dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid);
		break;
	default:
		break;
	}

	if (dst) {
		unsigned long now = jiffies;

		/* The entry is flagged local: the frame is for this host. */
		if (test_bit(BR_FDB_LOCAL, &dst->flags))
			return br_pass_frame_up(skb);

		/* Refresh the entry's last-used time. */
		if (now != dst->used)
			dst->used = now;
		/* Known unicast: forward out of the port recorded in the entry. */
		br_forward(dst->dst, skb, local_rcv, false);
	} else {
		/*
		 * No fdb entry: flood. This covers broadcast, multicast
		 * without a group hit, and unicast to an unknown destination;
		 * a multicast group hit uses br_multicast_flood instead.
		 */
		if (!mcast_hit)
			br_flood(br, skb, pkt_type, local_rcv, false, vid);
		else
			br_multicast_flood(mdst, skb, brmctx, local_rcv, false);
	}

	/* Also deliver to the local stack when requested. */
	if (local_rcv)
		return br_pass_frame_up(skb);

out:
	return 0;
drop:
	kfree_skb(skb);
	goto out;
}

        从br_handle_frame_finish的实现来看,主要做了如下几件事:

        1)根据目的MAC确定是单播、广播、还是组播;单播,则依据目的MAC,查找网桥对应的转发条目;组播,获取组播组信息(包含组播组接口列表)

        2)依据报文的源MAC,及报文入接口更新网桥转发表, 从而保证当有发往SRC MAC 的报文时可以从对应的接口发送出去。

        3)根据转发类型,确定转发处理函数,分几种情况: 

                a) 单播,到本机的:br_pass_frame_up,转发报文且存在端口-MAC映射关系的:br_forward,未找到端口映射关系的:br_flood

                b) 多播,泛洪:组播走br_multicast_flood,广播走br_flood

本机报文

        如上所述,通过桥接口到本机的报文处理逻辑走br_pass_frame_up,具体处理流程如下:

        1)判断br是否属于某个Vlan,是vlan报文处理br_handle_vlan

        2)网桥报文上送协议栈br_netif_receive_skb

                调用netif_receive_skb接口

                        继续调用netif_receive_skb_internal

                                __netif_receive_skb

                                        __netif_receive_skb_one_core

                                        在接口__netif_receive_skb_one_core内部,根据报文协议,上送对应的协议层处理,并最终上送具体应用

转发报文

单播

        网桥报文单播通过br_forward接口处理,具体流程如下:

        1)通过核心函数__br_forward处理

                a) 判断br是否属于某个Vlan,是vlan报文处理br_handle_vlan

                b)设置NF_BR_FORWARD Hook点,调用相应的Hook函数br_forward_finish处理

                        br_forward_finish函数处理,触发NF_BR_POST_ROUTING Hook点处理函数br_dev_queue_push_xmit

                                dev_queue_xmit 报文外发,通过网络设备注册的报文发送函数将报文发出,对于网桥设备而言,发送接口对应为br_dev_xmit

多播

        广播处理函数为:br_flood

        组播处理函数为:br_multicast_flood

        广播与组播的区别在于广播将报文发往除来源口外的所有接口;组播将报文发往除来源口外的属于组播组的其他接口,最终的发送逻辑还是由__br_forward处理,如上所述,不再重复。

网桥处理流程总结

        进入网桥的数据报文分为几个类型,

        1)报文是本机发送给自己的,桥不处理,上送上层协议栈

        2)接收报文的物理接口不是网桥接口,桥不处理,上送协议栈

        3)进入网桥,入端口状态为Disabled,直接丢弃

        4)报文源地址无效(广播、多播,以及00:00:00:00:00:00)丢包

        5)如果是STP的BPDU包,交给上层协议栈

        6)如果是发往本机的报文,上送上层协议栈

        7)需要转发的报文分三种情况:

                a)广播或组播,除接收端口外的所有端口都需要转发一份(组播的话,只转发给组播组内的其他端口)

                b)单播并且在端口-MAC映射表中能找到端口映射的,只需往映射端口转发一份

                c)单播但找不到端口映射的,则除了接收端口外其余端口都需要转发。

参考链接 

Linux 网桥实现分析

  • 15
    点赞
  • 25
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值