vpp process类型节点调度过程
vpp节点类型
VLIB_NODE_TYPE_PROCESS:process类型节点可以被挂起也可以被恢复,main线程上调度
/* Kinds of node that can be registered on the vlib graph. */
typedef enum
{
  /* Interior node of the call graph (may also act as an output node). */
  VLIB_NODE_TYPE_INTERNAL = 0,

  /* Node that feeds data into the processing graph; invoked on every
     iteration of the main loop. */
  VLIB_NODE_TYPE_INPUT,

  /* Node invoked ahead of all input nodes, e.g. to clean out driver
     TX rings before input is processed. */
  VLIB_NODE_TYPE_PRE_INPUT,

  /* "Process" node: can be suspended and resumed later. */
  VLIB_NODE_TYPE_PROCESS,

  /* Number of node types; keep as the last enumerator. */
  VLIB_N_NODE_TYPE,
} vlib_node_type_t;
process节点注册
/* *INDENT-OFF* */
/* Registers the "ip4-full-reassembly-expire-walk" node as a process-type
   node whose body is ip4_full_reass_walk_expired. */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
/* Function run as the body of the process. */
.function = ip4_full_reass_walk_expired,
/* Process node: can be suspended and later resumed. */
.type = VLIB_NODE_TYPE_PROCESS,
.name = "ip4-full-reassembly-expire-walk",
.format_trace = format_ip4_full_reass_trace,
/* Error table: element count and the strings themselves. */
.n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
.error_strings = ip4_full_reass_error_strings,
};
process类型节点调度流程
每个process节点是由jump机制构成的一个协程,协程主要用于等待、处理事件。
使用longjmp/setjmp
的轻量级多任务协程,由应用进程自行进行调度,不受操作系统调度机制的影响,上下文切换只损耗调用longjmp/setjmp
的时间。
协程中运行的函数类似于线程函数,区别在于协程函数可以被挂起(suspend)、等待事件、再被恢复(resume),这些能力都基于setjmp/longjmp实现
以x86_64 cpu来说,clib_longjmp_t
存放rbx, rbp, r12, r13, r14, r15, rip, rsp寄存器(x86_64下指令指针是rip,VPP源码注释中沿用了eip的写法),利用这些寄存器实现跳转功能
#if defined(__x86_64__)
/* rbx, rbp, r12, r13, r14, r15, eip, rsp */
#define CLIB_ARCH_LONGJMP_REGS 8
#elif defined(i386)
/* ebx, ebp, esi, edi, eip, rsp */
#define CLIB_ARCH_LONGJMP_REGS 6
#elif (defined(__powerpc64__) || defined(__powerpc__))
/* Saved-context buffer used by clib_setjmp / clib_longjmp: one machine
   word per saved register (CLIB_ARCH_LONGJMP_REGS of them), 16-byte
   aligned. */
typedef struct
{
uword regs[CLIB_ARCH_LONGJMP_REGS];
} clib_longjmp_t __attribute__ ((aligned (16)));
/* Jump back to the context saved in `save`; `return_value` is what the
   matching clib_setjmp call then returns. */
void clib_longjmp (clib_longjmp_t * save, uword return_value);
/* Save the current context into `save`. Returns
   `return_value_not_taken` if no jump is taken; otherwise returns the
   value passed to clib_longjmp when a long jump is taken. */
uword clib_setjmp (clib_longjmp_t * save, uword return_value_not_taken);
/* Call `func (func_arg)` on the given stack and return its result. */
uword clib_calljmp (uword (*func) (uword func_arg),
uword func_arg, void *stack);
clib_setjmp
作用:保存上下文,即上述的寄存器。
返回值:首次调用(跳转功能还未执行过),返回传入的第二个参数,即return_value_not_taken
,否则返回clib_longjmp
的返回值(如果long jump已执行)
clib_longjmp
作用:恢复clib_setjmp
时调用栈环境的存储数据,跳转回去继续执行
返回值:clib_longjmp本身不会返回到调用处;传入的第二个参数return_value会成为对应clib_setjmp的返回值
clib_calljmp
作用:在给定的栈空间上执行function
process节点初始化 (以ip4_full_reass_expire_node节点为例)
vlib_main_or_worker_loop
-> dispatch_process
-> vlib_process_startup
通过执行dispatch_process
完成最初对各process节点的一次调度,即初始化操作
/* Dispatch loop entered with is_main set or clear (excerpt: elided
   portions are marked "......").  The part shown here is the one-time
   start-up of all process nodes, performed only when is_main is set. */
static_always_inline void
vlib_main_or_worker_loop (vlib_main_t * vm, int is_main)
{
......
/* Start all processes. */
if (is_main)
{
uword i;
/*
* Perform an initial barrier sync. Pays no attention to
* the barrier sync hold-down timer scheme, which won't work
* at this point in time.
*/
vlib_worker_thread_initial_barrier_sync_and_release (vm);
/* Presumably ~0 means "no process is currently running" -- confirm
   against dispatch_process. */
nm->current_process_index = ~0;
/* Dispatch every registered process once with a null frame, feeding
   the timestamp returned by each dispatch into the next one. */
for (i = 0; i < vec_len (nm->processes); i++)
cpu_time_now = dispatch_process (vm, nm->processes[i], /* frame */ 0,
cpu_time_now);
}
......
}
vlib_process_startup
中,clib_setjmp
设置return_longjmp,此时返回值r为VLIB_PROCESS_RETURN_LONGJMP_RETURN
- 然后
clib_calljmp
在给定的栈空间上(给每个process节点在注册时分配的p->stack
)调用vlib_process_bootstrap
/* Called in main stack.

   Starts process `p` for the first time: saves the current context in
   p->return_longjmp, then runs vlib_process_bootstrap on the process's
   own stack.

   @param vm - vlib_main_t pointer
   @param p  - process to start
   @param f  - frame handed on to the process function
   @returns the value delivered through p->return_longjmp, i.e. the
            second argument of whichever clib_longjmp jumped back here
            (or the result of clib_calljmp if it returns normally). */
static_always_inline uword
vlib_process_startup (vlib_main_t * vm, vlib_process_t * p, vlib_frame_t * f)
{
vlib_process_bootstrap_args_t a;
uword r;
/* Bundle the arguments; clib_calljmp passes a single uword to its
   callee. */
a.vm = vm;
a.process = p;
a.frame = f;
/* Step 1: save the return point.  The direct call yields
   VLIB_PROCESS_RETURN_LONGJMP_RETURN; a later clib_longjmp on
   p->return_longjmp re-enters here with a different value. */
r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN);
if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN)
{
vlib_process_start_switch_stack (vm, p);
/* Step 2: run the bootstrap on the process stack.  The stack argument
   is the address just past the end of p->stack (presumably because
   the stack grows downwards -- confirm per architecture). */
r = clib_calljmp (vlib_process_bootstrap, pointer_to_uword (&a),
(void *) p->stack + (1 << p->log2_n_stack_bytes)); // Step 2
}
else
/* Arrived via clib_longjmp on p->return_longjmp: complete the stack
   switch that the jumping side started. */
vlib_process_finish_switch_stack (vm);
return r;
}
- 真正执行各process节点的function
/* Called in process stack.

   Entry point invoked through clib_calljmp: unpacks the bootstrap
   arguments, runs the process's node function and, if that function
   ever returns, jumps back to the context saved in p->return_longjmp
   with the function's result.

   @param _a - vlib_process_bootstrap_args_t pointer encoded as a uword
   @returns the node function's result (the clib_longjmp before the
            return statement is expected to transfer control first). */
static uword
vlib_process_bootstrap (uword _a)
{
vlib_process_bootstrap_args_t *a;
vlib_main_t *vm;
vlib_node_runtime_t *node;
vlib_frame_t *f;
vlib_process_t *p;
uword n;
a = uword_to_pointer (_a, vlib_process_bootstrap_args_t *);
vm = a->vm;
p = a->process;
/* Completes the stack switch started by the caller before
   clib_calljmp. */
vlib_process_finish_switch_stack (vm);
f = a->frame;
node = &p->node_runtime;
/* Step 3: run the process's own function. */
n = node->function (vm, node, f); //************Step 3***************
ASSERT (vlib_process_stack_is_valid (p));
/* The function returned: announce the switch away from the process
   stack and hand `n` back through the saved return context. */
vlib_process_start_switch_stack (vm, 0);
clib_longjmp (&p->return_longjmp, n);
return n;
}
- 每个process节点都会在开头处调用
vlib_process_wait_for_event_or_clock
或vlib_process_wait_for_event
/* Process function of the "ip4-full-reassembly-expire-walk" node
   (excerpt: the rest of the loop body is elided as "......" and the
   function's closing brace is not part of the excerpt).  Each loop
   iteration waits for an event or a timeout and then fetches the
   pending event type. */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * f)
{
ip4_full_reass_main_t *rm = &ip4_full_reass_main;
uword event_type, *event_data = 0;
while (true)
{
/* Step 4: wait for an event or for the walk interval to elapse; the
   interval is converted from milliseconds to seconds. */
vlib_process_wait_for_event_or_clock (vm,
(f64)
rm->expire_walk_interval_ms /
(f64) MSEC_PER_SEC);
event_type = vlib_process_get_events (vm, &event_data);
switch (event_type)
{
case ~0: /* no events => timeout */
/* nothing to do here */
break;
case IP4_EVENT_CONFIG_CHANGED:
/* Nothing extra to do for this event in the excerpt shown. */
break;
default:
clib_warning ("BUG: event type 0x%wx", event_type);
break;
}
......
}
vlib_process_wait_for_event_or_clock
先检查等待时间dt是否为零,以及non_empty_event_type_bitmap是否有置位(有置位说明已有事件需要处理),任一条件成立则直接返回。否则将suspend状态标记置位,标识当前是suspend状态,等待event或clock。
- 设置resume_longjmp,此时返回值r为VLIB_PROCESS_RESUME_LONGJMP_SUSPEND
- 调用
clib_longjmp
跳转到vlib_process_startup
->clib_setjmp
设置return_longjmp处,此时返回值r应为clib_longjmp
的第二个参数,即VLIB_PROCESS_RETURN_LONGJMP_SUSPEND
- 上述步骤完成了一个process节点的初始调度,各process node暂时处于suspend状态,等待某个条件(时钟或事件)的到来进入resume状态并执行业务逻辑
/** Suspend a cooperative multi-tasking thread
Waits for an event, or for the indicated number of seconds to elapse

Returns immediately, without suspending, when the requested time is
treated as zero or when an event is already pending for the process.

@param vm - vlib_main_t pointer
@param dt - timeout, in seconds.
@returns the remaining time interval (dt itself when no suspend
happened)
*/
always_inline f64
vlib_process_wait_for_event_or_clock (vlib_main_t * vm, f64 dt)
{
vlib_node_main_t *nm = &vm->node_main;
vlib_process_t *p;
f64 wakeup_time;
uword r;
/* The calling process is the one recorded as currently running. */
p = vec_elt (nm->processes, nm->current_process_index);
/* Step 5: nothing to wait for if the timeout is zero or an event
   type already has pending data. */
if (vlib_process_suspend_time_is_zero (dt)
|| !clib_bitmap_is_zero (p->non_empty_event_type_bitmap))
return dt;
wakeup_time = vlib_time_now (vm) + dt;
/* Suspend waiting for both clock and event to occur. */
p->flags |= (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT
| VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK);
/* Step 6: save the resume point.  The direct call yields
   VLIB_PROCESS_RESUME_LONGJMP_SUSPEND; a later clib_longjmp on
   p->resume_longjmp re-enters here with a different value. */
r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
{
/* Seconds scaled by 1e5 -- presumably timer-wheel ticks of 10us each;
   confirm against the timer wheel configuration. */
p->resume_clock_interval = dt * 1e5;
vlib_process_start_switch_stack (vm, 0);
/* Step 7: jump back to the context saved in p->return_longjmp,
   reporting that the process suspended. */
clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
}
else
/* Re-entered via clib_longjmp on p->resume_longjmp: complete the
   stack switch and carry on. */
vlib_process_finish_switch_stack (vm);
/* Return amount of time still left to sleep.
If <= 0 then we've been waken up by the clock (and not an event). */
return wakeup_time - vlib_time_now (vm);
}
/* Dispatches process `p` once (excerpt: elided portions are marked
   "......").  The part shown starts the process and, if it suspended
   itself, records a pending-frame entry for it and arms a timer when
   the process is waiting on the clock. */
static u64
dispatch_process (vlib_main_t * vm,
vlib_process_t * p, vlib_frame_t * f, u64 last_time_stamp)
{
......
/* In the walkthrough above this returns
   VLIB_PROCESS_RETURN_LONGJMP_SUSPEND, because the process suspends
   itself in its first wait call. */
n_vectors = vlib_process_startup (vm, p, f);
nm->current_process_index = old_process_index;
ASSERT (n_vectors != VLIB_PROCESS_RETURN_LONGJMP_RETURN);
is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND;
/* The process has entered the suspended state. */
if (is_suspend)
{
vlib_pending_frame_t *pf;
n_vectors = 0;
/* Record which node runtime and frame belong to the suspended
   process. */
pool_get (nm->suspended_process_frames, pf);
pf->node_runtime_index = node->runtime_index;
pf->frame = f;
pf->next_frame_index = ~0;
p->n_suspends += 1;
/* Index of pf within the suspended_process_frames pool. */
p->suspended_process_frame_index = pf - nm->suspended_process_frames;
/* Clock-based wakeup, built on the timer wheel. */
if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK)
{
TWT (tw_timer_wheel) * tw =
(TWT (tw_timer_wheel) *) nm->timing_wheel;
p->stop_timer_handle =
TW (tw_timer_start) (tw,
vlib_timing_wheel_data_set_suspended_process
(node->runtime_index) /* [sic] pool idex */ ,
0 /* timer_id */ ,
/* Interval set by vlib_process_wait_for_event_or_clock. */
p->resume_clock_interval);
}
}
......
}
resume
需要恢复的process有两种情况,一种是等待的时钟已经到期,一种是等待的事件发生
data_from_advancing_timing_wheel
数组存放所有超时的process index。不仅等待时钟的,等待事件的process,也会向数组加入自己的index
/* Check if process nodes have expired from timing wheel. */
ASSERT (nm->data_from_advancing_timing_wheel != 0);
/* Optional event-log record, taken only when graph-dispatch tracing is
   enabled. */
if (PREDICT_FALSE (vm->elog_trace_graph_dispatch))
ed = ELOG_DATA (&vlib_global_main.elog_main, es);
/* Expire timers up to the current time.  The existing vector is passed
   in and the returned vector replaces it. */
nm->data_from_advancing_timing_wheel =
TW (tw_timer_expire_timers_vec)
((TWT (tw_timer_wheel) *) nm->timing_wheel, vlib_time_now (vm),
nm->data_from_advancing_timing_wheel);
ASSERT (nm->data_from_advancing_timing_wheel != 0);
if (PREDICT_FALSE (vm->elog_trace_graph_dispatch))
{
ed = ELOG_DATA (&vlib_global_main.elog_main, ee);
/* Log the number of entries returned by the expiry call. */
ed->nready_procs =
_vec_len (nm->data_from_advancing_timing_wheel);
}
dispatch_suspended_process
-> vlib_process_resume
- 设置return_longjmp,此时返回值r为VLIB_PROCESS_RETURN_LONGJMP_RETURN
- 调用
clib_longjmp
,跳转到第一次调度时设置resume_longjmp
处,即vlib_process_wait_for_event_or_clock
->clib_setjmp
处,此时clib_setjmp
返回值应为VLIB_PROCESS_RESUME_LONGJMP_RESUME
/* Resumes suspended process `p`: clears its suspended/resume-pending
   flags, saves the current context in p->return_longjmp and jumps to
   the context the process saved in p->resume_longjmp.

   @param vm - vlib_main_t pointer
   @param p  - process to resume
   @returns the value delivered through p->return_longjmp, i.e. the
            second argument of the clib_longjmp that jumps back here. */
static_always_inline uword
vlib_process_resume (vlib_main_t * vm, vlib_process_t * p)
{
uword r;
p->flags &= ~(VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
| VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT
| VLIB_PROCESS_RESUME_PENDING);
/* Step 1: save the return point.  The direct call yields
   VLIB_PROCESS_RETURN_LONGJMP_RETURN. */
r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN);
if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN)
{
vlib_process_start_switch_stack (vm, p);
/* Step 2: continue the process at its saved resume point; the
   clib_setjmp there returns VLIB_PROCESS_RESUME_LONGJMP_RESUME. */
clib_longjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_RESUME);
}
else
/* Arrived via clib_longjmp on p->return_longjmp: complete the stack
   switch that the jumping side started. */
vlib_process_finish_switch_stack (vm);
return r;
}
/** Suspend a cooperative multi-tasking thread
Waits for an event, or for the indicated number of seconds to elapse

Shown a second time to follow the resume path: when the process is
resumed, control re-enters at the clib_setjmp below and takes the
else branch.

@param vm - vlib_main_t pointer
@param dt - timeout, in seconds.
@returns the remaining time interval (dt itself when no suspend
happened)
*/
always_inline f64
vlib_process_wait_for_event_or_clock (vlib_main_t * vm, f64 dt)
{
vlib_node_main_t *nm = &vm->node_main;
vlib_process_t *p;
f64 wakeup_time;
uword r;
p = vec_elt (nm->processes, nm->current_process_index);
/* No suspend when the timeout is zero or an event is already
   pending. */
if (vlib_process_suspend_time_is_zero (dt)
|| !clib_bitmap_is_zero (p->non_empty_event_type_bitmap))
return dt;
wakeup_time = vlib_time_now (vm) + dt;
/* Suspend waiting for both clock and event to occur. */
p->flags |= (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT
| VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK);
/* Resume point: returns VLIB_PROCESS_RESUME_LONGJMP_SUSPEND on the
   direct call, or the value passed to clib_longjmp when the process is
   resumed through p->resume_longjmp. */
r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
{
p->resume_clock_interval = dt * 1e5;
vlib_process_start_switch_stack (vm, 0);
clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
}
else /* the resume path takes this branch */
vlib_process_finish_switch_stack (vm);
/* Return amount of time still left to sleep.
If <= 0 then we've been waken up by the clock (and not an event). */
return wakeup_time - vlib_time_now (vm);
}
- 返回到ip4_full_reass_walk_expired节点,执行业务逻辑,即重组buffer的老化流程,各节点的业务逻辑执行均为死循环,下一次调用到
vlib_process_wait_for_event_or_clock
时,设置resume_longjmp,此时返回值r为VLIB_PROCESS_RESUME_LONGJMP_SUSPEND
- 调用
clib_longjmp
跳转到vlib_process_resume
->clib_setjmp
设置return_longjmp处,此时返回值r应为clib_longjmp
的第二个参数,即VLIB_PROCESS_RETURN_LONGJMP_SUSPEND
- 进入到下一轮的suspend状态,在vpp main线程的循环中,一直持续着suspend、resume的状态切换过程,每次resume时,执行节点的业务逻辑
/* Save the resume point; the direct call returns
   VLIB_PROCESS_RESUME_LONGJMP_SUSPEND. */
r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
{
/* Suspend path: record the timer interval, then jump to the context
   saved in p->return_longjmp, reporting a suspend. */
p->resume_clock_interval = dt * 1e5;
vlib_process_start_switch_stack (vm, 0);
clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
}
else
/* Re-entered via clib_longjmp on p->resume_longjmp. */
vlib_process_finish_switch_stack (vm);
参考文档链接
- https://developer.aliyun.com/article/610474?spm=a2c6h.13262185.profile.29.67b32e31kB4Qt4
- https://blog.csdn.net/sjin_1314/article/details/106170429
- https://zhuanlan.zhihu.com/p/484606752
- https://blog.csdn.net/weixin_39915694/article/details/111253624
- https://segmentfault.com/a/1190000019613786