基本概念
核心函数
ethernet_input_init
初始化函数,主循环之前会调用。
/*
 * Init function for the ethernet input path (per the original note, it is
 * called before the main loop starts).
 *
 * Sets up the three ethernet input node variants, initializes the
 * ethertype -> next-node lookup, pre-sizes the VLAN/QinQ parsing structures
 * and reserves entry 0 of the VLAN and QinQ table pools as "invalid" tables.
 *
 * Always returns 0 (no error).
 */
static clib_error_t *
ethernet_input_init (vlib_main_t * vm)
{
  /* VLAN and QinQ are both supported by this input path. */
  ethernet_main_t *em = &ethernet_main;

  /* Only used as pool_get() out-parameters; never read afterwards. */
  __attribute__ ((unused)) vlan_table_t *invalid_vlan_table;
  __attribute__ ((unused)) qinq_table_t *invalid_qinq_table;

  /*
   * Per the original author's note: this mainly assigns the nodes'
   * format_buffer / unformat_buffer hooks and also initializes the packet
   * generator support (most basic protocols ship their own pg
   * implementation).
   */
  ethernet_setup_node (vm, ethernet_input_node.index);
  ethernet_setup_node (vm, ethernet_input_type_node.index);
  ethernet_setup_node (vm, ethernet_input_not_l2_node.index);

  /*
   * Initialize the structure (a sparse vector, per the original note) used
   * to pick the next node from the layer-3 protocol / ethertype.
   */
  next_by_ethertype_init (&em->l3_next);

  /* Initialize pools and vector for vlan parsing. */
  vec_validate (em->main_intfs, 10);	/* 10 main interfaces */
  pool_alloc (em->vlan_pool, 10);
  pool_alloc (em->qinq_pool, 1);

  /* The first vlan pool will always be reserved for an invalid table. */
  pool_get (em->vlan_pool, invalid_vlan_table);	/* first id = 0 */

  /* The first qinq pool will always be reserved for an invalid table. */
  pool_get (em->qinq_pool, invalid_qinq_table);	/* first id = 0 */

  return 0;
}
ethernet_input_inline 完成了该node业务逻辑功能
/*
 * Shared worker implementing the ethernet input node logic for all
 * ethernet_input_variant_t variants.
 *
 * For every buffer in from_frame it:
 *   - parses the ethernet/VLAN header via parse_header(),
 *   - looks up the matching (sub)interface via eth_vlan_table_lookups() and
 *     identify_subint(),
 *   - rewrites sw_if_index[VLIB_RX] to the identified subinterface (the
 *     original value is kept when an error was flagged),
 *   - accumulates RX counters for subinterfaces that differ from the
 *     original RX interface, batching consecutive packets that hit the same
 *     subinterface,
 *   - selects the next node via determine_next_node() and enqueues the
 *     buffer to it.
 *
 * vm         - vlib main
 * node       - runtime of the node this variant is running as
 * from_frame - frame holding the buffer indices to process
 * variant    - which input variant is being executed
 *
 * Returns the number of buffers in from_frame.
 */
static_always_inline uword
ethernet_input_inline (vlib_main_t * vm,
                       vlib_node_runtime_t * node,
                       vlib_frame_t * from_frame,
                       ethernet_input_variant_t variant)
{
  vnet_main_t *vnm = vnet_get_main ();
  ethernet_main_t *em = &ethernet_main;
  vlib_node_runtime_t *error_node;
  u32 n_left_from, next_index, *from, *to_next;
  u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
  u32 cpu_index = os_get_cpu_number ();

  /*
   * Every variant other than ETHERNET_INPUT_VARIANT_ETHERNET reports its
   * errors against ethernet_input_node's runtime; the ETHERNET variant uses
   * the runtime it was called with (presumably ethernet_input_node itself),
   * so the error counters end up shared.
   */
  if (variant != ETHERNET_INPUT_VARIANT_ETHERNET)
    error_node = vlib_node_get_runtime (vm, ethernet_input_node.index);
  else
    error_node = node;

  /* Buffer indices carried by the incoming frame. */
  from = vlib_frame_vector_args (from_frame);
  /* Number of packets in the frame. */
  n_left_from = from_frame->n_vectors;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    vlib_trace_frame_buffers_only (vm, node,
                                   from,
                                   n_left_from,
                                   sizeof (from[0]),
                                   sizeof (ethernet_input_trace_t));

  /*
   * Speculatively reuse the next index cached in the node runtime; it is
   * corrected per packet by the validate-enqueue calls further down.
   */
  next_index = node->cached_next_index;

  /* Subinterface the stats batch is accumulated for, kept across calls. */
  stats_sw_if_index = node->runtime_data[0];
  stats_n_packets = stats_n_bytes = 0;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      /* Get the frame (and free slot count) used to hand buffers to next_index. */
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /*
       * Dual loop: handle two packets per iteration while prefetching the
       * following two, hence the requirement of at least 4 remaining.
       */
      while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          u32 bi0, bi1;
          vlib_buffer_t *b0, *b1;
          u8 next0, next1, error0, error1;
          u16 type0, orig_type0, type1, orig_type1;
          u16 outer_id0, inner_id0, outer_id1, inner_id1;
          u32 match_flags0, match_flags1;
          u32 old_sw_if_index0, new_sw_if_index0, len0, old_sw_if_index1,
            new_sw_if_index1, len1;
          vnet_hw_interface_t *hi0, *hi1;
          main_intf_t *main_intf0, *main_intf1;
          vlan_intf_t *vlan_intf0, *vlan_intf1;
          qinq_intf_t *qinq_intf0, *qinq_intf1;
          u32 is_l20, is_l21;

          /* Prefetch next iteration. */
          {
            vlib_buffer_t *b2, *b3;

            b2 = vlib_get_buffer (vm, from[2]);
            b3 = vlib_get_buffer (vm, from[3]);

            vlib_prefetch_buffer_header (b2, STORE);
            vlib_prefetch_buffer_header (b3, STORE);

            CLIB_PREFETCH (b2->data, sizeof (ethernet_header_t), LOAD);
            CLIB_PREFETCH (b3->data, sizeof (ethernet_header_t), LOAD);
          }

          /* Speculatively enqueue both buffers to the current next frame. */
          bi0 = from[0];
          bi1 = from[1];
          to_next[0] = bi0;
          to_next[1] = bi1;
          from += 2;
          to_next += 2;
          n_left_to_next -= 2;
          n_left_from -= 2;

          b0 = vlib_get_buffer (vm, bi0);
          b1 = vlib_get_buffer (vm, bi1);

          error0 = error1 = ETHERNET_ERROR_NONE;

          /*
           * Parse the layer-2 header. Per the original author's note, nested
           * encapsulations are unwrapped as well and the buffer's
           * current_data ends up pointing at the layer-3 header.
           */
          parse_header (variant,
                        b0,
                        &type0,
                        &orig_type0, &outer_id0, &inner_id0, &match_flags0);

          parse_header (variant,
                        b1,
                        &type1,
                        &orig_type1, &outer_id1, &inner_id1, &match_flags1);

          old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
          old_sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];

          eth_vlan_table_lookups (em,
                                  vnm,
                                  old_sw_if_index0,
                                  orig_type0,
                                  outer_id0,
                                  inner_id0,
                                  &hi0,
                                  &main_intf0, &vlan_intf0, &qinq_intf0);

          eth_vlan_table_lookups (em,
                                  vnm,
                                  old_sw_if_index1,
                                  orig_type1,
                                  outer_id1,
                                  inner_id1,
                                  &hi1,
                                  &main_intf1, &vlan_intf1, &qinq_intf1);

          identify_subint (hi0,
                           b0,
                           match_flags0,
                           main_intf0,
                           vlan_intf0,
                           qinq_intf0, &new_sw_if_index0, &error0, &is_l20);

          identify_subint (hi1,
                           b1,
                           match_flags1,
                           main_intf1,
                           vlan_intf1,
                           qinq_intf1, &new_sw_if_index1, &error1, &is_l21);

          /* Save RX sw_if_index for later nodes (unchanged on error). */
          vnet_buffer (b0)->sw_if_index[VLIB_RX] =
            error0 !=
            ETHERNET_ERROR_NONE ? old_sw_if_index0 : new_sw_if_index0;
          vnet_buffer (b1)->sw_if_index[VLIB_RX] =
            error1 !=
            ETHERNET_ERROR_NONE ? old_sw_if_index1 : new_sw_if_index1;

          /*
           * Check if there is a stat to take (valid and non-main
           * sw_if_index for pkt 0 or pkt 1).
           *
           * The counters are updated speculatively: both packets are first
           * added to the running batch and the addition is undone if they
           * do not both belong to stats_sw_if_index.
           */
          if (((new_sw_if_index0 != ~0)
               && (new_sw_if_index0 != old_sw_if_index0))
              || ((new_sw_if_index1 != ~0)
                  && (new_sw_if_index1 != old_sw_if_index1)))
            {
              /* Packet length measured from the start of the ethernet header. */
              len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
                - vnet_buffer (b0)->ethernet.start_of_ethernet_header;
              len1 = vlib_buffer_length_in_chain (vm, b1) + b1->current_data
                - vnet_buffer (b1)->ethernet.start_of_ethernet_header;

              stats_n_packets += 2;
              stats_n_bytes += len0 + len1;

              if (PREDICT_FALSE
                  (!(new_sw_if_index0 == stats_sw_if_index
                     && new_sw_if_index1 == stats_sw_if_index)))
                {
                  /* Misprediction: take the pair back out of the batch. */
                  stats_n_packets -= 2;
                  stats_n_bytes -= len0 + len1;

                  /* Count each qualifying packet individually. */
                  if (new_sw_if_index0 != old_sw_if_index0
                      && new_sw_if_index0 != ~0)
                    vlib_increment_combined_counter (vnm->
                                                     interface_main.combined_sw_if_counters
                                                     +
                                                     VNET_INTERFACE_COUNTER_RX,
                                                     cpu_index,
                                                     new_sw_if_index0, 1,
                                                     len0);
                  if (new_sw_if_index1 != old_sw_if_index1
                      && new_sw_if_index1 != ~0)
                    vlib_increment_combined_counter (vnm->
                                                     interface_main.combined_sw_if_counters
                                                     +
                                                     VNET_INTERFACE_COUNTER_RX,
                                                     cpu_index,
                                                     new_sw_if_index1, 1,
                                                     len1);

                  /*
                   * Both packets agree on a subinterface: flush the pending
                   * batch and start batching for that subinterface instead.
                   */
                  if (new_sw_if_index0 == new_sw_if_index1)
                    {
                      if (stats_n_packets > 0)
                        {
                          vlib_increment_combined_counter
                            (vnm->interface_main.combined_sw_if_counters
                             + VNET_INTERFACE_COUNTER_RX,
                             cpu_index,
                             stats_sw_if_index,
                             stats_n_packets, stats_n_bytes);
                          stats_n_packets = stats_n_bytes = 0;
                        }
                      stats_sw_if_index = new_sw_if_index0;
                    }
                }
            }

          /* The NOT_L2 variant never treats a packet as L2. */
          if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
            is_l20 = is_l21 = 0;

          /*
           * Decide the next node; depending on configuration this can be
           * chosen per protocol.
           */
          determine_next_node (em, variant, is_l20, type0, b0, &error0,
                               &next0);
          determine_next_node (em, variant, is_l21, type1, b1, &error1,
                               &next1);

          b0->error = error_node->errors[error0];
          b1->error = error_node->errors[error1];

          /*
           * Verify speculative enqueue: fix up the two buffers if their
           * real next node differs from next_index.
           */
          vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, bi1, next0,
                                           next1);
        }

      /* Single loop: handle the remaining packets one at a time. */
      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t *b0;
          u8 error0, next0;
          u16 type0, orig_type0;
          u16 outer_id0, inner_id0;
          u32 match_flags0;
          u32 old_sw_if_index0, new_sw_if_index0, len0;
          vnet_hw_interface_t *hi0;
          main_intf_t *main_intf0;
          vlan_intf_t *vlan_intf0;
          qinq_intf_t *qinq_intf0;
          u32 is_l20;

          /* Prefetch next iteration. */
          if (n_left_from > 1)
            {
              vlib_buffer_t *p2;

              p2 = vlib_get_buffer (vm, from[1]);
              vlib_prefetch_buffer_header (p2, STORE);
              CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD);
            }

          /* Speculatively enqueue the buffer to the current next frame. */
          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          error0 = ETHERNET_ERROR_NONE;

          parse_header (variant,
                        b0,
                        &type0,
                        &orig_type0, &outer_id0, &inner_id0, &match_flags0);

          old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];

          eth_vlan_table_lookups (em,
                                  vnm,
                                  old_sw_if_index0,
                                  orig_type0,
                                  outer_id0,
                                  inner_id0,
                                  &hi0,
                                  &main_intf0, &vlan_intf0, &qinq_intf0);

          identify_subint (hi0,
                           b0,
                           match_flags0,
                           main_intf0,
                           vlan_intf0,
                           qinq_intf0, &new_sw_if_index0, &error0, &is_l20);

          /* Save RX sw_if_index for later nodes (unchanged on error). */
          vnet_buffer (b0)->sw_if_index[VLIB_RX] =
            error0 !=
            ETHERNET_ERROR_NONE ? old_sw_if_index0 : new_sw_if_index0;

          /*
           * Increment subinterface stats.
           * Note that interface-level counters have already been incremented
           * prior to calling this function. Thus only subinterface counters
           * are incremented here.
           *
           * Interface level counters include packets received on the main
           * interface and all subinterfaces. Subinterface level counters
           * include only those packets received on that subinterface.
           *
           * Increment stats if the subint is valid and it is not the main
           * intf.
           */
          if ((new_sw_if_index0 != ~0)
              && (new_sw_if_index0 != old_sw_if_index0))
            {
              /* Packet length measured from the start of the ethernet header. */
              len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
                - vnet_buffer (b0)->ethernet.start_of_ethernet_header;

              stats_n_packets += 1;
              stats_n_bytes += len0;

              /*
               * Batch stat increments from the same subinterface so counters
               * don't need to be incremented for every packet.
               */
              if (PREDICT_FALSE (new_sw_if_index0 != stats_sw_if_index))
                {
                  /* Misprediction: take the packet back out of the batch. */
                  stats_n_packets -= 1;
                  stats_n_bytes -= len0;

                  if (new_sw_if_index0 != ~0)
                    vlib_increment_combined_counter
                      (vnm->interface_main.combined_sw_if_counters
                       + VNET_INTERFACE_COUNTER_RX,
                       cpu_index, new_sw_if_index0, 1, len0);

                  /* Flush the batch pending for the previous subinterface. */
                  if (stats_n_packets > 0)
                    {
                      vlib_increment_combined_counter
                        (vnm->interface_main.combined_sw_if_counters
                         + VNET_INTERFACE_COUNTER_RX,
                         cpu_index,
                         stats_sw_if_index, stats_n_packets, stats_n_bytes);
                      stats_n_packets = stats_n_bytes = 0;
                    }
                  stats_sw_if_index = new_sw_if_index0;
                }
            }

          /* The NOT_L2 variant never treats a packet as L2. */
          if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
            is_l20 = 0;

          determine_next_node (em, variant, is_l20, type0, b0, &error0,
                               &next0);

          b0->error = error_node->errors[error0];

          /* Verify speculative enqueue. */
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  /* Increment any remaining batched stats. */
  if (stats_n_packets > 0)
    {
      vlib_increment_combined_counter
        (vnm->interface_main.combined_sw_if_counters
         + VNET_INTERFACE_COUNTER_RX,
         cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
      /* Remember the batched subinterface for the next invocation. */
      node->runtime_data[0] = stats_sw_if_index;
    }

  return from_frame->n_vectors;
}
以下几个函数用于hook该node,替代determine_next_node中默认的下一跳node挑选机制
/默认有ETHERNET_TYPE_IP4,ETHERNET_TYPE_IP6,ETHERNET_TYPE_MPLS_UNICAST三种协议,用户可以自己添加更多协议。可以根据协议来做不同下一跳/
void
ethernet_register_input_type (vlib_main_t * vm,
ethernet_type_t type, u32 node_index)
{
ethernet_main_t *em = ðernet_main;
ethernet_type_info_t *ti;
u32 i;
{
clib_error_t *error = vlib_call_init_function (vm, ethernet_init);
if (error)
clib_error_report (error);
}
ti = ethernet_get_type_info (em, type);
ti->node_index = node_index;
ti->next_index = vlib_node_add_next (vm,
ethernet_input_node.index, node_index);
i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
ASSERT (i == ti->next_index);
i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
ASSERT (i == ti->next_index);
// Add the L3 node for this ethertype to the next nodes structure
next_by_ethertype_register (&em->l3_next, type, ti->next_index);
// Call the registration functions for other nodes that want a mapping
l2bvi_register_input_type (vm, type, node_index);
}
/vlan包下一跳判定机制,可以还原ethernet头部,根据这里注册的值作跳转/
void
ethernet_register_l2_input (vlib_main_t * vm, u32 node_index)
{
ethernet_main_t *em = ðernet_main;
u32 i;
em->l2_next =
vlib_node_add_next (vm, ethernet_input_node.index, node_index);
/*
* Even if we never use these arcs, we have to align the next indices...
*/
i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
ASSERT (i == em->l2_next);
i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
ASSERT (i == em->l2_next);
}
/*
 * Register a next node for L3 redirect, and enable L3 redirect.
 *
 * Per the original note, once this has been called most next-node decisions
 * are effectively determined by the node registered here: the cached
 * IPv4 / IPv6 / MPLS next indices in em->l3_next are all overwritten with
 * the redirect arc.
 *
 * The arc is added to all three ethernet input node variants, matching
 * ethernet_register_input_type() and ethernet_register_l2_input(), so the
 * shared next index is valid on every variant and later registrations stay
 * aligned.
 */
void
ethernet_register_l3_redirect (vlib_main_t * vm, u32 node_index)
{
  ethernet_main_t *em = &ethernet_main;
  u32 i;

  em->redirect_l3 = 1;
  em->redirect_l3_next = vlib_node_add_next (vm,
                                             ethernet_input_node.index,
                                             node_index);

  /*
   * Change the cached next nodes to the redirect node
   */
  em->l3_next.input_next_ip4 = em->redirect_l3_next;
  em->l3_next.input_next_ip6 = em->redirect_l3_next;
  em->l3_next.input_next_mpls = em->redirect_l3_next;

  /*
   * Even if we never use these arcs, we have to align the next indices...
   */
  i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
  ASSERT (i == em->redirect_l3_next);

  /*
   * The not-L2 variant needs the arc too; without it the redirect index
   * cached above would not refer to a valid arc on that node.
   */
  i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);
  ASSERT (i == em->redirect_l3_next);
}