内核协议栈中链路层的处理主要位于linux内核中的./net/bridge/中。数据包在链路层的流转如下图所示:
其中,netif_receive_skb是整个内核协议栈的起点,在5.15.153版本中,其定义为:
/*
 * Wrap netif_receive_skb_internal() between the entry and exit
 * tracepoints and hand its result back to the caller unchanged.
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int status;

	trace_netif_receive_skb_entry(skb);
	status = netif_receive_skb_internal(skb);
	trace_netif_receive_skb_exit(status);

	return status;
}
EXPORT_SYMBOL(netif_receive_skb);
之后马上调用 netif_receive_skb_internal 进一步处理,netif_receive_skb_internal的定义为:
/*
 * Run the timestamp checks on the skb, then either queue it on the
 * backlog of the CPU chosen by RPS or process it on the current CPU
 * through __netif_receive_skb().
 *
 * Returns NET_RX_SUCCESS when skb_defer_rx_timestamp() reports true,
 * otherwise the result of enqueue_to_backlog() or __netif_receive_skb().
 */
static int netif_receive_skb_internal(struct sk_buff *skb)
{
	int status;

	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);

	/* Timestamp handling: nothing more to do here once it is deferred. */
	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();

	/* Multi-core handling: ask RPS which CPU should take this skb. */
#ifdef CONFIG_RPS
	if (static_branch_unlikely(&rps_needed)) {
		struct rps_dev_flow scratch_flow;
		struct rps_dev_flow *flow = &scratch_flow;
		int target_cpu;

		target_cpu = get_rps_cpu(skb->dev, skb, &flow);
		if (target_cpu >= 0) {
			status = enqueue_to_backlog(skb, target_cpu,
						    &flow->last_qtail);
			rcu_read_unlock();
			return status;
		}
	}
#endif
	status = __netif_receive_skb(skb);
	rcu_read_unlock();

	return status;
}
在进行时间戳处理之后,紧接着就是RPS(Receive Packet Steering)的处理,它主要用于将从网卡收上协议栈的包交由指定的CPU处理,从而充分利用多核性能。接下来,数据包将传递给__netif_receive_skb处理,__netif_receive_skb的定义为:
/*
 * Dispatch the skb to __netif_receive_skb_one_core().  The second
 * argument is true only when sk_memalloc_socks() and skb_pfmemalloc()
 * both hold; in that case the call is bracketed by
 * memalloc_noreclaim_save()/memalloc_noreclaim_restore().
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	unsigned int saved_flag;
	int status;

	/* Common case first: not a pfmemalloc skb. */
	if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
		return __netif_receive_skb_one_core(skb, false);

	saved_flag = memalloc_noreclaim_save();
	status = __netif_receive_skb_one_core(skb, true);
	memalloc_noreclaim_restore(saved_flag);

	return status;
}
接下来数据包将传递给 __netif_receive_skb_one_core ,根据上面 sk_memalloc_socks 及 skb_pfmemalloc 的返回结果,将为 __netif_receive_skb_one_core 设定不同的第二参数。__netif_receive_skb_one_core的定义为:
/*
 * Run a single skb through __netif_receive_skb_core() and, when that
 * leaves a pending handler in pt_prev, invoke the handler's func callback.
 * pfmemalloc is forwarded unchanged to the core routine.
 * Returns the core routine's result, or the handler's result when one ran.
 */
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
/* Captured before the core routine runs; it receives &skb and may update it. */
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
int ret;
ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
/* INDIRECT_CALL_INET names ipv6_rcv and ip_rcv as the expected targets of pt_prev->func. */
if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev);
return ret;
}
接下来,数据包将进一步传递给 __netif_receive_skb_core 处理,__netif_receive_skb_core 的定义为:
/*
 * Core receive routine.  In order, the skb goes through: generic XDP,
 * VLAN untagging, the ptype_all taps, sch_handle_ingress()/nf_ingress(),
 * VLAN device lookup, the receiving device's rx_handler, and finally the
 * per-protocol handler lists.
 *
 * pskb:       in/out skb pointer; the local skb is written back at the
 *             out label because several steps below reassign it.
 * pfmemalloc: when true the taps are skipped and the skb is dropped
 *             unless skb_pfmemalloc_protocol() accepts it.
 * ppt_prev:   receives the last pending handler, left for the caller
 *             to invoke.
 *
 * ret starts as NET_RX_DROP, becomes NET_RX_SUCCESS when the rx_handler
 * reports RX_HANDLER_CONSUMED, and is otherwise whatever the last
 * deliver_skb()/ingress call stored in it.
 */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
orig_dev = skb->dev;
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
/*
 * Restart point: jumped to when sch_handle_ingress(), vlan_do_receive()
 * or the rx_handler (RX_HANDLER_ANOTHER) asks for another pass.
 */
another_round:
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
/* Generic XDP: any verdict other than XDP_PASS ends processing with NET_RX_DROP. */
if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret2;
migrate_disable();
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
migrate_enable();
if (ret2 != XDP_PASS) {
ret = NET_RX_DROP;
goto out;
}
}
/* VLAN ethertype in skb->protocol: untag; a NULL result ends processing. */
if (eth_type_vlan(skb->protocol)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
if (skb_skip_tc_classify(skb))
goto skip_classify;
if (pfmemalloc)
goto skip_taps;
/*
 * Taps are served one step behind: an entry receives the skb only once
 * the next entry has been found, so the last one stays pending in pt_prev.
 */
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
bool another = false;
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
&another);
if (another)
goto another_round;
if (!skb)
goto out;
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
skb_reset_redirect(skb);
skip_classify:
/* pfmemalloc skbs continue only if skb_pfmemalloc_protocol() accepts them. */
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
if (skb_vlan_tag_present(skb)) {
/* Flush the pending handler before vlan_do_receive() gets &skb. */
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb))
goto another_round;
else if (unlikely(!skb))
goto out;
}
// Handlers are registered with netdev_rx_handler_register().
// For a bridge the callback is normally br_handle_frame, registered from br_add_if.
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
/*
 * CONSUMED: the handler took the skb, report success.
 * ANOTHER:  start over from another_round.
 * EXACT:    continue, but skip the ptype_base lookup below.
 * PASS:     continue unchanged.
 */
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS;
goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
break;
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
if (skb_vlan_tag_get_id(skb)) {
/* Vlan id is non 0 and vlan_do_receive() above couldn't
* find vlan device.
*/
skb->pkt_type = PACKET_OTHERHOST;
} else if (eth_type_vlan(skb->protocol)) {
/* Outer header is 802.1P with vlan 0, inner header is
* 802.1Q or 802.1AD and vlan_do_receive() above could
* not find vlan dev for vlan id 0.
*/
__vlan_hwaccel_clear_tag(skb);
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
if (vlan_do_receive(&skb))
/* After stripping off 802.1P header with vlan 0
* vlan dev is found for inner header.
*/
goto another_round;
else if (unlikely(!skb))
goto out;
else
/* We have stripped outer 802.1P vlan 0 header.
* But could not find vlan dev.
* check again for vlan id to set OTHERHOST.
*/
goto check_vlan_id;
}
/* Note: we might in the future use prio bits
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
__vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&orig_dev->ptype_specific);
/* skb->dev differs from orig_dev: also consult the current device's list. */
if (unlikely(skb->dev != orig_dev)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&skb->dev->ptype_specific);
}
if (pt_prev) {
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
*ppt_prev = pt_prev;
} else {
/*
 * No handler is pending: count the drop (rx_nohandler when the
 * rx_handler asked for exact delivery, rx_dropped otherwise) and free the skb.
 */
drop:
if (!deliver_exact)
atomic_long_inc(&skb->dev->rx_dropped);
else
atomic_long_inc(&skb->dev->rx_nohandler);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
/* The invariant here is that if *ppt_prev is not NULL
* then skb should also be non-NULL.
*
* Apparently *ppt_prev assignment above holds this invariant due to
* skb dereferencing near it.
*/
*pskb = skb;
return ret;
}
__netif_receive_skb_core 函数略显复杂,因为在这里塞了许多网络关键技术节点处理,比如XDP generic节点处理、VLAN tag处理、tc处理、以及正常的网桥处理等。暂且不管那些新技术的处理,数据包下一步将被 rx_handler(&skb) 处理,rx_handler 取自 skb->dev->rx_handler,也就是收上此数据报文的网卡所注册的回调函数。netdev_rx_handler_register 函数用于为网卡注册回调函数,它在 br_add_if 中被调用,具体为:
err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p);
br_get_rx_handler的定义为:
/*
 * Select the rx_handler for a bridge port device: the dummy handler when
 * netdev_uses_dsa() is true for the device, br_handle_frame otherwise.
 */
rx_handler_func_t *br_get_rx_handler(const struct net_device *dev)
{
	return netdev_uses_dsa(dev) ? br_handle_frame_dummy : br_handle_frame;
}
因此,在未使用DSA的情况下,上述的注册过程等价于:
err = netdev_rx_handler_register(dev, br_handle_frame, p);
因此,数据包在桥中的下一个处理函数为 br_handle_frame ,其定义为:
/*
 * rx_handler installed on bridge ports.  Decides whether the frame is
 * passed on untouched (RX_HANDLER_PASS), dropped, treated as a link-local
 * frame, or handed to nf_hook_bridge_pre() for the forwarding path.
 *
 * pskb: in/out skb pointer; updated where the local skb (possibly
 *       replaced by skb_share_check()) is passed back to the caller.
 */
static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
// Loopback check: such packets are passed on without bridge processing
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
// Check that the source MAC address is valid; drop the frame otherwise
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto drop;
// skb share check; a NULL result means there is nothing left to process
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return RX_HANDLER_CONSUMED;
memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
// Fetch the bridge port attached to the receiving device
p = br_port_get_rcu(skb->dev);
if (p->flags & BR_VLAN_TUNNEL)
br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p));
if (unlikely(is_link_local_ether_addr(dest))) {
u16 fwd_mask = p->br->group_fwd_mask_required;
/*
* See IEEE 802.1D Table 7-10 Reserved addresses
*
* Assignment Value
* Bridge Group Address 01-80-C2-00-00-00
* (MAC Control) 802.3 01-80-C2-00-00-01
* (Link Aggregation) 802.3 01-80-C2-00-00-02
* 802.1X PAE address 01-80-C2-00-00-03
*
* 802.1AB LLDP 01-80-C2-00-00-0E
*
* Others reserved for future standardization
*/
fwd_mask |= p->group_fwd_mask;
/* The last address byte selects the reserved address and its bit in fwd_mask. */
switch (dest[5]) {
case 0x00: /* Bridge Group Address */
/* If STP is turned off,
then must forward to keep loop detection */
if (p->br->stp_enabled == BR_NO_STP ||
fwd_mask & (1u << dest[5]))
goto forward;
*pskb = skb;
__br_handle_local_finish(skb);
return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
goto drop;
case 0x0E: /* 802.1AB LLDP */
fwd_mask |= p->br->group_fwd_mask;
if (fwd_mask & (1u << dest[5]))
goto forward;
*pskb = skb;
__br_handle_local_finish(skb);
return RX_HANDLER_PASS;
default:
/* Allow selective forwarding for most other protocols */
fwd_mask |= p->br->group_fwd_mask;
if (fwd_mask & (1u << dest[5]))
goto forward;
}
/* The else clause should be hit when nf_hook():
* - returns < 0 (drop/error)
* - returns = 0 (stolen/nf_queue)
* Thus return 1 from the okfn() to signal the skb is ok to pass
*/
if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
br_handle_local_finish) == 1) {
return RX_HANDLER_PASS;
} else {
return RX_HANDLER_CONSUMED;
}
}
if (unlikely(br_process_frame_type(p, skb)))
return RX_HANDLER_PASS;
/*
 * Forwarding path: only ports in FORWARDING or LEARNING state continue
 * to nf_hook_bridge_pre(); in any other state the frame is freed.
 */
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
case BR_STATE_LEARNING:
/* Destination equals the bridge device's own address: mark as PACKET_HOST. */
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
return nf_hook_bridge_pre(skb, pskb);
default:
drop:
kfree_skb(skb);
}
return RX_HANDLER_CONSUMED;
}
br_handle_frame 也略显复杂,但是,对于一般的二层包而言,它要么被协议栈丢弃,要么匹配收包网卡的MAC进入更上层协议栈处理,要么未匹配收包网卡MAC而被转发。
1、二层转发
我们首先考虑转发的情况,因为它意味着此报文不会上升至网络层,情况较为简单。在判断报文需要被转发后,将进入转发处理:
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
case BR_STATE_LEARNING:
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
return nf_hook_bridge_pre(skb, pskb);
显然,又进入了Netfilter框架的处理,函数 nf_hook_bridge_pre 的定义为:
/*
 * Walk the NF_BR_PRE_ROUTING bridge hooks by hand and then continue in
 * br_handle_frame_finish().
 *
 * Returns an RX_HANDLER_* value: RX_HANDLER_PASS when an accepting hook
 * left br_netfilter_broute set in the skb control block (the skb is then
 * written back through pskb), RX_HANDLER_CONSUMED in every other case.
 */
static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb)
{
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
struct nf_hook_entries *e = NULL;
struct nf_hook_state state;
unsigned int verdict, i;
struct net *net;
int ret;
net = dev_net(skb->dev);
#ifdef HAVE_JUMP_LABEL
/* Static key test is false: skip the hook traversal entirely. */
if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING]))
goto frame_finish;
#endif
e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]);
if (!e)
goto frame_finish;
nf_hook_state_init(&state, NF_BR_PRE_ROUTING,
NFPROTO_BRIDGE, skb->dev, NULL, NULL,
net, br_handle_frame_finish);
/* Call each hook in turn and act on its verdict. */
for (i = 0; i < e->num_hook_entries; i++) {
verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state);
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) {
*pskb = skb;
return RX_HANDLER_PASS;
}
break;
case NF_DROP:
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
case NF_QUEUE:
/* nf_queue() returning 1 moves on to the next hook; anything else stops here. */
ret = nf_queue(skb, &state, i, verdict);
if (ret == 1)
continue;
return RX_HANDLER_CONSUMED;
default: /* STOLEN */
return RX_HANDLER_CONSUMED;
}
}
frame_finish:
net = dev_net(skb->dev);
br_handle_frame_finish(net, NULL, skb);
#else
/* No bridge netfilter family configured: go straight to the finish function. */
br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
#endif
return RX_HANDLER_CONSUMED;
}
在这里,如果Netfilter框架被启用,则首先遍历NF_BR_PRE_ROUTING处注册的全部回调函数,之后进入 br_handle_frame_finish。 br_handle_frame_finish 函数的定义为:
/*
 * Continuation of the bridge input path after the NF_BR_PRE_ROUTING hooks.
 * Updates the FDB with the source MAC when the port has BR_LEARNING set,
 * classifies the destination as unicast, broadcast or multicast, and then
 * forwards, floods and/or passes the frame up to the bridge device.
 *
 * Returns the result of br_pass_frame_up() when the frame is delivered
 * locally, 0 otherwise (including when the frame is dropped).
 */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
enum br_pkt_type pkt_type = BR_PKT_UNICAST;
struct net_bridge_fdb_entry *dst = NULL;
struct net_bridge_mcast_port *pmctx;
struct net_bridge_mdb_entry *mdst;
bool local_rcv, mcast_hit = false;
struct net_bridge_mcast *brmctx;
struct net_bridge_vlan *vlan;
struct net_bridge *br;
u16 vid = 0;
u8 state;
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
brmctx = &p->br->multicast_ctx;
pmctx = &p->multicast_ctx;
state = p->state;
/* Ingress check fails: leave through out, which does not free the skb here. */
if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid,
&state, &vlan))
goto out;
nbp_switchdev_frame_mark(p, skb);
/* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
if (p->flags & BR_LEARNING)
br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0);
/* Deliver locally as well when the bridge device is in promiscuous mode. */
local_rcv = !!(br->dev->flags & IFF_PROMISC);
if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
/* by definition the broadcast is also a multicast address */
if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) {
pkt_type = BR_PKT_BROADCAST;
local_rcv = true;
} else {
pkt_type = BR_PKT_MULTICAST;
if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid))
goto drop;
}
}
/* LEARNING state: the FDB update above has run, but nothing is forwarded. */
if (state == BR_STATE_LEARNING)
goto drop;
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
/* ARP/RARP and IPv6 neighbour-discovery frames get proxy/suppression handling. */
if (IS_ENABLED(CONFIG_INET) &&
(skb->protocol == htons(ETH_P_ARP) ||
skb->protocol == htons(ETH_P_RARP))) {
br_do_proxy_suppress_arp(skb, br, vid, p);
} else if (IS_ENABLED(CONFIG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6) &&
br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
pskb_may_pull(skb, sizeof(struct ipv6hdr) +
sizeof(struct nd_msg)) &&
ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
struct nd_msg *msg, _msg;
msg = br_is_nd_neigh_msg(skb, &_msg);
if (msg)
br_do_suppress_nd(skb, br, vid, p, msg);
}
switch (pkt_type) {
case BR_PKT_MULTICAST:
mdst = br_mdb_get(brmctx, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) {
if ((mdst && mdst->host_joined) ||
br_multicast_is_router(brmctx, skb)) {
local_rcv = true;
DEV_STATS_INC(br->dev, multicast);
}
mcast_hit = true;
} else {
local_rcv = true;
DEV_STATS_INC(br->dev, multicast);
}
break;
case BR_PKT_UNICAST:
// Look up the destination MAC in the layer-2 FDB
dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid);
break;
default:
break;
}
if (dst) {
unsigned long now = jiffies;
/* FDB entry flagged BR_FDB_LOCAL: hand the frame up instead of forwarding. */
if (test_bit(BR_FDB_LOCAL, &dst->flags))
return br_pass_frame_up(skb);
if (now != dst->used)
dst->used = now;
// Bridge forwarding through the port recorded in the FDB entry
br_forward(dst->dst, skb, local_rcv, false);
} else {
if (!mcast_hit)
// Unknown unicast destinations are simply flooded; this looks somewhat risky and can easily cause a broadcast storm
br_flood(br, skb, pkt_type, local_rcv, false);
else
// Flooding for multicast addresses; this can easily cause a broadcast storm
br_multicast_flood(mdst, skb, brmctx, local_rcv, false);
}
if (local_rcv)
return br_pass_frame_up(skb);
out:
return 0;
drop:
kfree_skb(skb);
goto out;
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
在这里将根据报文的目的MAC去决定转发逻辑。对于单播报文,需要去查询fdb表,以根据目的MAC地址确定出端口,实际上,这里就是二层交换机的转发逻辑。
br_forward(dst->dst, skb, local_rcv, false);
而对于组播报文以及fdb表内查询不到的MAC地址,则将报文转发给入端口以外的全部端口,也就是泛洪,当然,这种方式很容易引起广播风暴。
if (!mcast_hit)
//对于不确定的单播地址直接泛洪,这看起来有些危险,容易导致广播风暴
br_flood(br, skb, pkt_type, local_rcv, false);
else
//对于组播地址泛洪,容易导致广播风暴
br_multicast_flood(mdst, skb, brmctx, local_rcv, false);
对于可以找到出端口的单播报文,将送给 br_forward 进一步处理,br_forward 的定义为:
/*
 * Forward skb out of port @to.  When @to has a backup port and the
 * carrier of @to is down, the backup port is used instead.
 *
 * @local_rcv:  when true the port is served through deliver_clone() and
 *              the skb is not freed here on the failure path; when false
 *              the skb goes to __br_forward() or is freed on failure.
 * @local_orig: passed through to deliver_clone()/__br_forward().
 */
void br_forward(const struct net_bridge_port *to,
		struct sk_buff *skb, bool local_rcv, bool local_orig)
{
	if (unlikely(!to))
		goto out;

	/* redirect to backup link if the destination port is down */
	if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
		struct net_bridge_port *standby;

		standby = rcu_dereference(to->backup_port);
		if (unlikely(!standby))
			goto out;
		to = standby;
	}

	if (!should_deliver(to, skb))
		goto out;

	if (local_rcv)
		deliver_clone(to, skb, local_orig);
	else
		__br_forward(to, skb, local_orig);
	return;

out:
	if (!local_rcv)
		kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(br_forward);
之后将进一步进入 __br_forward 处理,__br_forward 的定义为:
/*
 * Send skb out of port @to through the bridge netfilter hooks.
 *
 * @local_orig selects the hook: NF_BR_LOCAL_OUT when true (with a
 * netpoll shortcut that bypasses the hook), NF_BR_FORWARD when false.
 * Either way br_forward_finish is passed to NF_HOOK as the continuation.
 */
static void __br_forward(const struct net_bridge_port *to,
			 struct sk_buff *skb, bool local_orig)
{
	struct net_bridge_vlan_group *vlan_grp;
	struct net_device *in_dev;
	struct net *netns;
	int hooknum;

	/* Mark the skb for forwarding offload early so that br_handle_vlan()
	 * can know whether to pop the VLAN header on egress or keep it.
	 */
	nbp_switchdev_frame_mark_tx_fwd_offload(to, skb);

	vlan_grp = nbp_vlan_group_rcu(to);
	skb = br_handle_vlan(to->br, to, vlan_grp, skb);
	if (!skb)
		return;

	/* Remember the ingress device, then retarget the skb at the egress port. */
	in_dev = skb->dev;
	skb->dev = to->dev;

	if (local_orig) {
		if (unlikely(netpoll_tx_running(to->br->dev))) {
			skb_push(skb, ETH_HLEN);
			if (is_skb_forwardable(skb->dev, skb))
				br_netpoll_send_skb(to, skb);
			else
				kfree_skb(skb);
			return;
		}
		hooknum = NF_BR_LOCAL_OUT;
		netns = dev_net(skb->dev);
		in_dev = NULL;
	} else {
		if (skb_warn_if_lro(skb)) {
			kfree_skb(skb);
			return;
		}
		hooknum = NF_BR_FORWARD;
		skb_forward_csum(skb);
		netns = dev_net(in_dev);
	}

	/* Run the hook functions; br_forward_finish runs afterwards. */
	NF_HOOK(NFPROTO_BRIDGE, hooknum,
		netns, NULL, skb, in_dev, skb->dev,
		br_forward_finish);
}
在这里,报文被送入 NF_BR_FORWARD 的回调函数。从前面给出的代码可以看到,转发路径上传入的 local_orig 总为 false,因为我们讨论的是转发逻辑;只有报文是从本机往外发出时,才会将 local_orig 置为 true,表明这是由本机发出的报文,走OUT逻辑(即 NF_BR_LOCAL_OUT)。
无论是OUT逻辑还是FORWARD逻辑,在相应的回调函数处理完毕后,都会交由 br_forward_finish 处理, br_forward_finish 的定义为:
/*
 * Continuation after the FORWARD/LOCAL_OUT hook: clear the skb timestamp
 * and run the NF_BR_POST_ROUTING hook with br_dev_queue_push_xmit as the
 * next continuation.  Returns the NF_HOOK result.
 */
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int verdict;

	skb->tstamp = 0;
	verdict = NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb,
			  NULL, skb->dev, br_dev_queue_push_xmit);

	return verdict;
}
EXPORT_SYMBOL_GPL(br_forward_finish);
可以看出,马不停蹄地进入了下一个钩子 NF_BR_POST_ROUTING 的回调函数,然后交由 br_dev_queue_push_xmit 函数处理,br_dev_queue_push_xmit 的定义为:
/*
 * Last step of the bridge transmit path: push the Ethernet header back,
 * check that the skb is forwardable on its device, fix up the network
 * header offset for VLAN frames with CHECKSUM_PARTIAL, and queue the skb
 * with dev_queue_xmit().
 *
 * Always returns 0; an skb that fails a check is freed instead of sent.
 */
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_push(skb, ETH_HLEN);
	if (!is_skb_forwardable(skb->dev, skb))
		goto drop;

	br_drop_fake_rtable(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (eth_type_vlan(skb->protocol)) {
			int hdr_depth;

			if (!vlan_get_protocol_and_depth(skb, skb->protocol,
							 &hdr_depth))
				goto drop;

			skb_set_network_header(skb, hdr_depth);
		}
	}

	br_switchdev_frame_set_offload_fwd_mark(skb);
	dev_queue_xmit(skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
最终通过 dev_queue_xmit(skb) 将报文交给网卡处理,二层转发逻辑到此结束。
2、送至上层协议栈处理
对于进入上层协议栈处理的情况,对应的代码为:
static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
...
if (unlikely(is_link_local_ether_addr(dest))) {
...
if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
br_handle_local_finish) == 1) {
return RX_HANDLER_PASS;
} else {
return RX_HANDLER_CONSUMED;
}
...
}
...
}
因此,接下来首先进入NF_BR_LOCAL_IN处的钩子函数(如果有注册此钩子函数的话)处理,之后再进入br_handle_local_finish处理。
br_handle_local_finish 的处理非常简单,其定义为:
/*
 * okfn passed to NF_HOOK for NF_BR_LOCAL_IN: runs the local-finish
 * helper and reports 1.  net and sk are not used.
 */
static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int okfn_called = 1;

	__br_handle_local_finish(skb);

	/* 1 tells the NF_HOOK caller that the okfn ran and the skb may be used. */
	return okfn_called;
}
__br_handle_local_finish 的处理也十分简单,其定义如下:
/*
 * Record the frame's source MAC in the FDB, but only when every
 * learning condition below holds.  The checks run in this order and
 * stop at the first one that fails.
 */
static void __br_handle_local_finish(struct sk_buff *skb)
{
	struct net_bridge_port *port = br_port_get_rcu(skb->dev);
	u16 vid = 0;

	if (!(port->flags & BR_LEARNING))
		return;
	if (!nbp_state_should_learn(port))
		return;
	if (br_opt_get(port->br, BROPT_NO_LL_LEARN))
		return;
	/* check if vlan is allowed, to avoid spoofing */
	if (!br_should_learn(port, skb, &vid))
		return;

	br_fdb_update(port->br, port, eth_hdr(skb)->h_source, vid, 0);
}
主要是在端口开启了学习功能(BR_LEARNING)、未设置 BROPT_NO_LL_LEARN 且通过VLAN检查的情况下更新一下fdb表。那么问题来了,二层的报文如何进一步传递到三层处理呢?
上述函数执行完毕后依次出栈,直到返回到 __netif_receive_skb_one_core ,其定义上面有,但不妨在这里再次给出:
/*
 * Run a single skb through __netif_receive_skb_core() and, when that
 * leaves a pending handler in pt_prev, invoke the handler's func callback.
 * pfmemalloc is forwarded unchanged to the core routine.
 * Returns the core routine's result, or the handler's result when one ran.
 */
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
/* Captured before the core routine runs; it receives &skb and may update it. */
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
int ret;
ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
/* INDIRECT_CALL_INET names ipv6_rcv and ip_rcv as the expected targets of pt_prev->func. */
if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev);
return ret;
}
可以看到,在 __netif_receive_skb_core 返回之后,将进一步根据上层协议类型进入对应的处理函数,IPv4报文将送由 ip_rcv 处理而IPv6报文将由 ipv6_rcv 处理。
至此,Linux网络内核中的二层处理结束,所有报文要么被丢弃,要么被二层转发,要么被送到更高层的协议栈处理。