VPP Source Code Analysis

This article analyzes the VPP source code in detail, covering the plugin-loading flow at process startup, the coroutine mechanism built on clib_calljmp and clib_longjmp, and how work is handled in the main thread and the worker threads. It also walks through command-line configuration and the packet forwarding path, in particular the processing steps of DPDK device input.

This analysis looks at the VPP source code (version 18.01) from three angles, intended only as entry points for newcomers:

1. The process startup flow and how plugins are loaded;

2. How a feature is switched on from the command line;

3. The packet receive/transmit and forwarding path.

I. Process startup flow:

  1. Running vpp -c /etc/vpp/startup.conf loads a large number of .so libraries:

vlib_plugin_early_init:356: plugin path /home/share/vpp/build-root/install-vpp_debug-native/vpp/lib64/vpp_plugins
load_one_plugin:184: Loaded plugin: acl_plugin.so (Access Control Lists)
load_one_plugin:184: Loaded plugin: dpdk_plugin.so (Data Plane Development Kit (DPDK))
load_one_plugin:184: Loaded plugin: flowprobe_plugin.so (Flow per Packet)
load_one_plugin:184: Loaded plugin: flowtable_plugin.so (sample of flowtable)
load_one_plugin:184: Loaded plugin: gtpu_plugin.so (GTPv1-U)
load_one_plugin:184: Loaded plugin: ila_plugin.so (Identifier-locator addressing for IPv6)
load_one_plugin:184: Loaded plugin: ioam_plugin.so (Inbound OAM)
load_one_plugin:114: Plugin disabled (default): ixge_plugin.so
load_one_plugin:184: Loaded plugin: kubeproxy_plugin.so (kube-proxy data plane)
load_one_plugin:184: Loaded plugin: l2e_plugin.so (L2 Emulation)
load_one_plugin:184: Loaded plugin: lb_plugin.so (Load Balancer)
load_one_plugin:184: Loaded plugin: libsixrd_plugin.so (IPv6 Rapid Deployment on IPv4 Infrastructure (RFC5969))
load_one_plugin:184: Loaded plugin: memif_plugin.so (Packet Memory Interface (experimetal))
load_one_plugin:184: Loaded plugin: nat_plugin.so (Network Address Translation)
load_one_plugin:184: Loaded plugin: pktdump_plugin.so (Sample of VPP Plugin)
load_one_plugin:184: Loaded plugin: pppoe_plugin.so (PPPoE)
load_one_plugin:184: Loaded plugin: stn_plugin.so (VPP Steals the NIC for Container integration)
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/vxlan_gpe_ioam_export_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/kubeproxy_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/gtpu_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/stn_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/acl_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/ioam_export_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/udp_ping_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/lb_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/ioam_pot_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/nat_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/ioam_vxlan_gpe_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/pppoe_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/dpdk_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/ioam_trace_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/flowprobe_test_plugin.so
load_one_plugin:63: Loaded plugin: /usr/lib/vpp_api_test_plugins/memif_test_plugin.so
vlib_pci_bind_to_uio: Skipping PCI device 0000:0b:00.0 as host interface eth1 is up
dpdk_config:1240: EAL init args: -c e -n 4 --huge-dir /run/vpp/hugepages --file-prefix vpp -b 0000:0b:00.0 --master-lcore 1 --socket-mem 64 
EAL: 512 hugepages of size 2097152 reserved, but no mounted hugetlbfs found for that size
EAL: VFIO support initialized
EAL:   Invalid NUMA socket, default to 0
EAL:   Invalid NUMA socket, default to 0
EAL:   Invalid NUMA socket, default to 0
DPDK physical memory layout:
Segment 0: IOVA:0x100000000, len:1073741824, virt:0x7fc500000000, socket_id:0, hugepage_sz:1073741824, nchannel:0, nrank:0
unix_physmem_region_iommu_register: ioctl (VFIO_IOMMU_MAP_DMA): Invalid argument
0: dpdk_ipsec_process:1011: not enough DPDK crypto resources, default to OpenSSL
0: dpdk_lib_init:225: DPDK drivers found 3 ports...
    _______    _        _   _____  ___ 
 __/ __/ _ \  (_)__    | | / / _ \/ _ \
 _/ _// // / / / _ \   | |/ / ___/ ___/
 /_/ /____(_)_/\___/   |___/_/  /_/    

Let's first look at how the VPP source loads these .so libraries: in the main() initialization phase, vlib_plugin_early_init is called first.

int
vlib_plugin_early_init (vlib_main_t * vm)
{
  plugin_main_t *pm = &vlib_plugin_main;

  if (pm->plugin_path == 0)
    pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0);

  clib_warning ("plugin path %s", pm->plugin_path);

  pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword));
  pm->vlib_main = vm;

  return vlib_load_new_plugins (pm, 1 /* from_early_init */ );
}

Next, vlib_load_new_plugins iterates over every plugin path, and each .so is loaded by load_one_plugin:


int
vlib_load_new_plugins (plugin_main_t * pm, int from_early_init)
{
  DIR *dp;
  struct dirent *entry;
  struct stat statb;
  uword *p;
  plugin_info_t *pi;
  u8 **plugin_path;
  u32 *load_fail_indices = 0;
  int i;

  plugin_path = split_plugin_path (pm);

  for (i = 0; i < vec_len (plugin_path); i++)
    {
      dp = opendir ((char *) plugin_path[i]);

      if (dp == 0)
	continue;

      while ((entry = readdir (dp)))
	{
	  u8 *plugin_name;
	  u8 *filename;

	  if (pm->plugin_name_filter)
	    {
	      int j;
	      for (j = 0; j < vec_len (pm->plugin_name_filter); j++)
		if (entry->d_name[j] != pm->plugin_name_filter[j])
		  goto next;
	    }

	  filename = format (0, "%s/%s%c", plugin_path[i], entry->d_name, 0);

	  /* Only accept .so */
	  char *ext = strrchr ((const char *) filename, '.');
	  /* unreadable */
	  if (!ext || (strcmp (ext, ".so") != 0) ||
	      stat ((char *) filename, &statb) < 0)
	    {
	    ignore:
	      vec_free (filename);
	      continue;
	    }

	  /* a dir or other things which aren't plugins */
	  if (!S_ISREG (statb.st_mode))
	    goto ignore;

	  plugin_name = format (0, "%s%c", entry->d_name, 0);
	  /* Have we seen this plugin already? */
	  p = hash_get_mem (pm->plugin_by_name_hash, plugin_name);
	  if (p == 0)
	    {
	      /* No, add it to the plugin vector */
	      vec_add2 (pm->plugin_info, pi, 1);
	      pi->name = plugin_name;
	      pi->filename = filename;
	      pi->file_info = statb;
	      hash_set_mem (pm->plugin_by_name_hash, plugin_name,
			    pi - pm->plugin_info);
	    }
	next:
	  ;
	}
      closedir (dp);
      vec_free (plugin_path[i]);
    }
  vec_free (plugin_path);


  /*
   * Sort the plugins by name. This is important.
   * API traces contain absolute message numbers.
   * Loading plugins in directory (vs. alphabetical) order
   * makes trace replay incredibly fragile.
   */
  vec_sort_with_function (pm->plugin_info, plugin_name_sort_cmp);

  /*
   * Attempt to load the plugins
   */
  for (i = 0; i < vec_len (pm->plugin_info); i++)
    {
      pi = vec_elt_at_index (pm->plugin_info, i);

      if (load_one_plugin (pm, pi, from_early_init))
	{
	  /* Make a note of any which fail to load */
	  vec_add1 (load_fail_indices, i);
	  hash_unset_mem (pm->plugin_by_name_hash, pi->name);
	  vec_free (pi->name);
	  vec_free (pi->filename);
	}
    }

  /* Remove plugin info vector elements corresponding to load failures */
  if (vec_len (load_fail_indices) > 0)
    {
      for (i = vec_len (load_fail_indices) - 1; i >= 0; i--)
	vec_delete (pm->plugin_info, 1, load_fail_indices[i]);
      vec_free (load_fail_indices);
    }

  /* Recreate the plugin name hash */
  for (i = 0; i < vec_len (pm->plugin_info); i++)
    {
      pi = vec_elt_at_index (pm->plugin_info, i);
      hash_unset_mem (pm->plugin_by_name_hash, pi->name);
      hash_set_mem (pm->plugin_by_name_hash, pi->name, pi - pm->plugin_info);
    }

  return 0;
}

In load_one_plugin, the ELF header of the shared object is read first (see elf.c) to locate the plugin registration section,

and the dynamic library is then loaded explicitly with dlopen/dlsym:


static int
load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init)
{
  void *handle;
  clib_error_t *error;
  elf_main_t em = { 0 };
  elf_section_t *section;
  u8 *data;
  char *version_required;
  vlib_plugin_registration_t *reg;
  plugin_config_t *pc = 0;
  uword *p;

  if (elf_read_file (&em, (char *) pi->filename))
    return -1;

  error = elf_get_section_by_name (&em, ".vlib_plugin_registration",
				   &section);
  if (error)
    {
      clib_warning ("Not a plugin: %s\n", (char *) pi->name);
      return -1;
    }

  data = elf_get_section_contents (&em, section->index, 1);
  reg = (vlib_plugin_registration_t *) data;

  if (vec_len (data) != sizeof (*reg))
    {
      clib_warning ("vlib_plugin_registration size mismatch in plugin %s\n",
		    (char *) pi->name);
      goto error;
    }

  p = hash_get_mem (pm->config_index_by_name, pi->name);
  if (p)
    {
      pc = vec_elt_at_index (pm->configs, p[0]);
      if (pc->is_disabled)
	{
	  clib_warning ("Plugin disabled: %s", pi->name);
	  goto error;
	}
      if (reg->default_disabled && pc->is_enabled == 0)
	{
	  clib_warning ("Plugin disabled (default): %s", pi->name);
	  goto error;
	}
    }
  else if (reg->default_disabled)
    {
      clib_warning ("Plugin disabled (default): %s", pi->name);
      goto error;
    }

  version_required = str_array_to_vec ((char *) &reg->version_required,
				       sizeof (reg->version_required));

  if ((strlen (version_required) > 0) &&
      (strncmp (vlib_plugin_app_version, version_required,
		strlen (version_required))))
    {
      clib_warning ("Plugin %s version mismatch: %s != %s",
		    pi->name, vlib_plugin_app_version, reg->version_required);
      if (!(pc && pc->skip_version_check == 1))
	{
	  vec_free (version_required);
	  goto error;
	}
    }

  vec_free (version_required);
  vec_free (data);
  elf_main_free (&em);

  handle = dlopen ((char *) pi->filename, RTLD_LAZY);

  if (handle == 0)
    {
      clib_warning ("%s", dlerror ());
      clib_warning ("Failed to load plugin '%s'", pi->name);
      os_exit (1);
    }

  pi->handle = handle;

  reg = dlsym (pi->handle, "vlib_plugin_registration");

  if (reg == 0)
    {
      /* This should never happen unless somebody chagnes registration macro */
      clib_warning ("Missing plugin registration in plugin '%s'", pi->name);
      os_exit (1);
    }

  pi->reg = reg;
  pi->version = str_array_to_vec ((char *) &reg->version,
				  sizeof (reg->version));

  if (reg->early_init)
    {
      clib_error_t *(*ei) (vlib_main_t *);
      void *h;

      h = dlsym (pi->handle, reg->early_init);
      if (h)
	{
	  ei = h;
	  error = (*ei) (pm->vlib_main);
	  if (error)
	    {
	      clib_error_report (error);
	      os_exit (1);
	    }
	}
      else
	clib_warning ("Plugin %s: early init function %s set but not found",
		      (char *) pi->name, reg->early_init);
    }

  if (reg->description)
    clib_warning ("Loaded plugin: %s (%s)", pi->name, reg->description);
  else
    clib_warning ("Loaded plugin: %s", pi->name);

  return 0;
error:
  vec_free (data);
  elf_main_free (&em);
  return -1;
}

A plugin registers itself with VLIB_PLUGIN_REGISTER, which places the registration structure in the dedicated .vlib_plugin_registration section:

#define VLIB_PLUGIN_REGISTER() \
  vlib_plugin_registration_t vlib_plugin_registration \
  __attribute__((__section__(".vlib_plugin_registration")))
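
For reference, a plugin typically provides this registration near the top of its main source file. A minimal sketch (the version string and description are placeholders):

#include <vlib/vlib.h>
#include <vnet/plugin/plugin.h>

/* *INDENT-OFF* */
VLIB_PLUGIN_REGISTER () = {
    .version = "1.0",                   /* compared against vlib_plugin_app_version */
    .description = "Sample of VPP Plugin",
};
/* *INDENT-ON* */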

 2. Thread and coroutine initialization

After the plugins are loaded, VPP handles its work on the main thread through a coroutine mechanism (a coroutine is a lightweight thread with its own stack):


int
vlib_unix_main (int argc, char *argv[])
{
  vlib_main_t *vm = &vlib_global_main;	/* one and only time for this! */
  unformat_input_t input;
  clib_error_t *e;
  int i;

  vm->argv = (u8 **) argv;
  vm->name = argv[0];
  vm->heap_base = clib_mem_get_heap ();
  ASSERT (vm->heap_base);

  unformat_init_command_line (&input, (char **) vm->argv);
  if ((e = vlib_plugin_config (vm, &input)))
    {
      clib_error_report (e);
      return 1;
    }
  unformat_free (&input);

  i = vlib_plugin_early_init (vm);
  if (i)
    return i;

  unformat_init_command_line (&input, (char **) vm->argv);
  if (vm->init_functions_called == 0)
    vm->init_functions_called = hash_create (0, /* value bytes */ 0);
  e = vlib_call_all_config_functions (vm, &input, 1 /* early */ );
  if (e != 0)
    {
      clib_error_report (e);
      return 1;
    }
  unformat_free (&input);

  vlib_thread_stack_init (0);

  __os_thread_index = 0;
  vm->thread_index = 0;

  i = clib_calljmp (thread0, (uword) vm,
		    (void *) (vlib_thread_stacks[0] +	/* array of per-thread stacks */
			      VLIB_THREAD_STACK_SIZE));	/* coroutine: thread0 now runs on its own stack, so main and the coroutine execute separately */
  return i;
}

VPP implements its coroutine mechanism with clib_setjmp, clib_longjmp and clib_calljmp. Their roles are:

clib_setjmp records a jump label (the point execution can later return to);

clib_longjmp jumps back to a previously recorded label;

clib_calljmp (entry function, argument, stack) calls the entry function on the supplied private stack.
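
A minimal sketch (not VPP code) of how these three primitives cooperate, mirroring what vlib_unix_main and vlib_process_startup do:

#include <vppinfra/longjmp.h>

static clib_longjmp_t return_jmp;	/* label recorded by the caller */
static u8 co_stack[64 << 10];		/* private stack for the coroutine */

static uword
co_entry (uword arg)
{
  /* ... work runs here, on the private stack ... */
  clib_longjmp (&return_jmp, 1);	/* hand control back; clib_setjmp now returns 1 */
  return 0;				/* never reached */
}

static void
run_once (void)
{
  /* The direct call returns 0; when co_entry longjmps back it returns 1. */
  if (clib_setjmp (&return_jmp, 0) == 0)
    clib_calljmp (co_entry, 0 /* arg */ ,
		  co_stack + sizeof (co_stack) /* stacks grow downward */ );
}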

The worker threads are then initialized from inside the thread0 coroutine of the main thread:



/* Called early in the init sequence */

clib_error_t *
vlib_thread_init (vlib_main_t * vm)
{
  vlib_thread_main_t *tm = &vlib_thread_main;
  vlib_worker_thread_t *w;
  vlib_thread_registration_t *tr;
  u32 n_vlib_mains = 1;
  u32 first_index = 1;
  u32 i;
  uword *avail_cpu;

  /* get bitmaps of active cpu cores and sockets */
  tm->cpu_core_bitmap =
    clib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online");
  tm->cpu_socket_bitmap =
    clib_sysfs_list_to_bitmap ("/sys/devices/system/node/online");

  avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap);

  /* skip cores */
  for (i = 0; i < tm->skip_cores; i++)
    {
      uword c = clib_bitmap_first_set (avail_cpu);
      if (c == ~0)
	return clib_error_return (0, "no available cpus to skip");

      avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
    }

  /* grab cpu for main thread */
  if (!tm->main_lcore)
    {
      tm->main_lcore = clib_bitmap_first_set (avail_cpu);
      if (tm->main_lcore == (u8) ~ 0)
	return clib_error_return (0, "no available cpus to be used for the"
				  " main thread");
    }
  else
    {
      if (clib_bitmap_get (avail_cpu, tm->main_lcore) == 0)
	return clib_error_return (0, "cpu %u is not available to be used"
				  " for the main thread", tm->main_lcore);
    }
  avail_cpu = clib_bitmap_set (avail_cpu, tm->main_lcore, 0);

  /* assume that there is socket 0 only if there is no data from sysfs */
  if (!tm->cpu_socket_bitmap)
    tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1);

  /* pin main thread to main_lcore  */
  if (tm->cb.vlib_thread_set_lcore_cb)
    {
      tm->cb.vlib_thread_set_lcore_cb (0, tm->main_lcore);
    }
  else
    {
      cpu_set_t cpuset;
      CPU_ZERO (&cpuset);
      CPU_SET (tm->main_lcore, &cpuset);
      pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset);
    }

  /* as many threads as stacks... */
  vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1,
			CLIB_CACHE_LINE_BYTES);

  /* Preallocate thread 0 */
  _vec_len (vlib_worker_threads) = 1;
  w = vlib_worker_threads;
  w->thread_mheap = clib_mem_get_heap ();
  w->thread_stack = vlib_thread_stacks[0];
  w->lcore_id = tm->main_lcore;
  w->lwp = syscall (SYS_gettid);
  w->thread_id = pthread_self ();
  tm->n_vlib_mains = 1;

  if (tm->sched_policy != ~0)
    {
      struct sched_param sched_param;
      if (!sched_getparam (w->lwp, &sched_param))
	{
	  if (tm->sched_priority != ~0)
	    sched_param.sched_priority = tm->sched_priority;
	  sched_setscheduler (w->lwp, tm->sched_policy, &sched_param);
	}
    }

  /* assign threads to cores and set n_vlib_mains */
  tr = tm->next;

  while (tr)
    {
      vec_add1 (tm->registrations, tr);
      tr = tr->next;
    }

  vec_sort_with_function (tm->registrations, sort_registrations_by_no_clone);

  for (i = 0; i < vec_len (tm->registrations); i++)
    {
      int j;
      tr = tm->registrations[i];
      tr->first_index = first_index;
      first_index += tr->count;
      n_vlib_mains += (tr->no_data_structure_clone == 0) ? tr->count : 0;

      /* construct coremask */
      if (tr->use_pthreads || !tr->count)
	continue;

      if (tr->coremask)
	{
	  uword c;
          /* *INDENT-OFF* */
          clib_bitmap_foreach (c, tr->coremask, ({
            if (clib_bitmap_get(avail_cpu, c) == 0)
              return clib_error_return (0, "cpu %u is not available to be used"
                                        " for the '%s' thread",c, tr->name);

            avail_cpu = clib_bitmap_set(avail_cpu, c, 0);
          }));
/* *INDENT-ON* */

	}
      else
	{
	  for (j = 0; j < tr->count; j++)
	    {
	      uword c = clib_bitmap_first_set (avail_cpu);
	      if (c == ~0)
		return clib_error_return (0,
					  "no available cpus to be used for"
					  " the '%s' thread", tr->name);

	      avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
	      tr->coremask = clib_bitmap_set (tr->coremask, c, 1);
	    }
	}
    }

  clib_bitmap_free (avail_cpu);

  tm->n_vlib_mains = n_vlib_mains;

  vec_validate_aligned (vlib_worker_threads, first_index - 1,
			CLIB_CACHE_LINE_BYTES);

  return 0;
}

The vlib_main function initializes the vlib_worker_threads worker threads; the related settings live in /etc/vpp/startup.conf.
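
The thread-related part of that configuration might look roughly like this (core numbers are examples only, matching the EAL arguments in the log above):

cpu {
  main-core 1
  corelist-workers 2-3
}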

Worker startup is hooked in with VLIB_MAIN_LOOP_ENTER_FUNCTION, and the registered start_workers function is what launches the worker threads:

VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers);
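
The registrations that start_workers walks (tm->registrations) come from VLIB_REGISTER_THREAD; the worker registration in vlib/threads.c looks roughly like this:

/* *INDENT-OFF* */
VLIB_REGISTER_THREAD (worker_thread_reg, static) = {
  .name = "workers",
  .short_name = "wk",
  .function = vlib_worker_thread_fn,
};
/* *INDENT-ON* */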


static clib_error_t *
start_workers (vlib_main_t * vm)
{
  int i, j;
  vlib_worker_thread_t *w;
  vlib_main_t *vm_clone;
  void *oldheap;
  vlib_thread_main_t *tm = &vlib_thread_main;
  vlib_thread_registration_t *tr;
  vlib_node_runtime_t *rt;
  u32 n_vlib_mains = tm->n_vlib_mains;
  u32 worker_thread_index;
  u8 *main_heap = clib_mem_get_per_cpu_heap ();
  mheap_t *main_heap_header = mheap_header (main_heap);

  vec_reset_length (vlib_worker_threads);

  /* Set up the main thread */
  vec_add2_aligned (vlib_worker_threads, w, 1, CLIB_CACHE_LINE_BYTES);
  w->elog_track.name = "main thread";
  elog_track_register (&vm->elog_main, &w->elog_track);

  if (vec_len (tm->thread_prefix))
    {
      w->name = format (0, "%v_main%c", tm->thread_prefix, '\0');
      vlib_set_thread_name ((char *) w->name);
    }

  /*
   * Truth of the matter: we always use at least two
   * threads. So, make the main heap thread-safe
   * and make the event log thread-safe.
   */
  main_heap_header->flags |= MHEAP_FLAG_THREAD_SAFE;
  vm->elog_main.lock =
    clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
  vm->elog_main.lock[0] = 0;

  if (n_vlib_mains > 1)
    {
      /* Replace hand-crafted length-1 vector with a real vector */
      vlib_mains = 0;

      vec_validate_aligned (vlib_mains, tm->n_vlib_mains - 1,
			    CLIB_CACHE_LINE_BYTES);
      _vec_len (vlib_mains) = 0;
      vec_add1_aligned (vlib_mains, vm, CLIB_CACHE_LINE_BYTES);

      vlib_worker_threads->wait_at_barrier =
	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
      vlib_worker_threads->workers_at_barrier =
	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);

      vlib_worker_threads->node_reforks_required =
	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);

      /* Ask for an initial barrier sync */
      *vlib_worker_threads->workers_at_barrier = 0;
      *vlib_worker_threads->wait_at_barrier = 1;

      /* Without update or refork */
      *vlib_worker_threads->node_reforks_required = 0;
      vm->need_vlib_worker_thread_node_runtime_update = 0;

      /* init timing */
      vm->barrier_epoch = 0;
      vm->barrier_no_close_before = 0;

      worker_thread_index = 1;

      for (i = 0; i < vec_len (tm->registrations); i++)
	{
	  vlib_node_main_t *nm, *nm_clone;
	  vlib_buffer_main_t *bm_clone;
	  vlib_buffer_free_list_t *fl_clone, *fl_orig;
	  vlib_buffer_free_list_t *orig_freelist_pool;
	  int k;

	  tr = tm->registrations[i];

	  if (tr->count == 0)
	    continue;

	  for (k = 0; k < tr->count; k++)
	    {
	      vlib_node_t *n;

	      vec_add2 (vlib_worker_threads, w, 1);
	      if (tr->mheap_size)
		w->thread_mheap =
		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
	      else
		w->thread_mheap = main_heap;

	      w->thread_stack =
		vlib_thread_stack_init (w - vlib_worker_threads);
	      w->thread_function = tr->function;
	      w->thread_function_arg = w;
	      w->instance_id = k;
	      w->registration = tr;

	      w->elog_track.name =
		(char *) format (0, "%s %d", tr->name, k + 1);
	      vec_add1 (w->elog_track.name, 0);
	      elog_track_register (&vm->elog_main, &w->elog_track);

	      if (tr->no_data_structure_clone)
		continue;

	      /* Fork vlib_global_main et al. Look for bugs here */
	      oldheap = clib_mem_set_heap (w->thread_mheap);

	      vm_clone = clib_mem_alloc (sizeof (*vm_clone));
	      clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone));

	      vm_clone->thread_index = worker_thread_index;
	      vm_clone->heap_base = w->thread_mheap;
	      vm_clone->mbuf_alloc_list = 0;
	      vm_clone->init_functions_called =
		hash_create (0, /* value bytes */ 0);
	      vm_clone->pending_rpc_requests = 0;
	      vec_validate (vm_clone->pending_rpc_requests, 0);
	      _vec_len (vm_clone->pending_rpc_requests) = 0;
	      memset (&vm_clone->random_buffer, 0,
		      sizeof (vm_clone->random_buffer));

	      nm = &vlib_mains[0]->node_main;
	      nm_clone = &vm_clone->node_main;
	      /* fork next frames array, preserving node runtime indices */
	      nm_clone->next_frames = vec_dup (nm->next_frames);
	      for (j = 0; j < vec_len (nm_clone->next_frames); j++)
		{
		  vlib_next_frame_t *nf = &nm_clone->next_frames[j];
		  u32 save_node_runtime_index;
		  u32 save_flags;

		  save_node_runtime_index = nf->node_runtime_index;
		  save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
		  vlib_next_frame_init (nf);
		  nf->node_runtime_index = save_node_runtime_index;
		  nf->flags = save_flags;
		}

	      /* fork the frame dispatch queue */
	      nm_clone->pending_frames = 0;
	      vec_validate (nm_clone->pending_frames, 10);	/* $$$$$?????? */
	      _vec_len (nm_clone->pending_frames) = 0;

	      /* fork nodes */
	      nm_clone->nodes = 0;

	      /* Allocate all nodes in single block for speed */
	      n = clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*n));

	      for (j = 0; j < vec_len (nm->nodes); j++)
		{
		  clib_memcpy (n, nm->nodes[j], sizeof (*n));
		  /* none of the copied nodes have enqueue rights given out */
		  n->owner_node_index = VLIB_INVALID_NODE_INDEX;
		  memset (&n->stats_total, 0, sizeof (n->stats_total));
		  memset (&n->stats_last_clear, 0,
			  sizeof (n->stats_last_clear));
		  vec_add1 (nm_clone->nodes, n);
		  n++;
		}
	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
	      vec_foreach (rt,
			   nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
	      {
		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
		rt->thread_index = vm_clone->thread_index;
		/* copy initial runtime_data from node */
		if (n->runtime_data && n->runtime_data_bytes > 0)
		  clib_memcpy (rt->runtime_data, n->runtime_data,
			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
					 n->runtime_data_bytes));
	      }

	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
	      {
		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
		rt->thread_index = vm_clone->thread_index;
		/* copy initial runtime_data from node */
		if (n->runtime_data && n->runtime_data_bytes > 0)
		  clib_memcpy (rt->runtime_data, n->runtime_data,
			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
					 n->runtime_data_bytes));
	      }

	      nm_clone->processes = vec_dup (nm->processes);

	      /* zap the (per worker) frame freelists, etc */
	      nm_clone->frame_sizes = 0;
	      nm_clone->frame_size_hash = hash_create (0, sizeof (uword));

	      /* Packet trace buffers are guaranteed to be empty, nothing to do here */

	      clib_mem_set_heap (oldheap);
	      vec_add1_aligned (vlib_mains, vm_clone, CLIB_CACHE_LINE_BYTES);

	      vm_clone->error_main.counters =
		vec_dup (vlib_mains[0]->error_main.counters);
	      vm_clone->error_main.counters_last_clear =
		vec_dup (vlib_mains[0]->error_main.counters_last_clear);

	      /* Fork the vlib_buffer_main_t free lists, etc. */
	      bm_clone = vec_dup (vm_clone->buffer_main);
	      vm_clone->buffer_main = bm_clone;

	      orig_freelist_pool = bm_clone->buffer_free_list_pool;
	      bm_clone->buffer_free_list_pool = 0;

            /* *INDENT-OFF* */
            pool_foreach (fl_orig, orig_freelist_pool,
                          ({
                            pool_get_aligned (bm_clone->buffer_free_list_pool,
                                              fl_clone, CLIB_CACHE_LINE_BYTES);
                            ASSERT (fl_orig - orig_freelist_pool
                                    == fl_clone - bm_clone->buffer_free_list_pool);

                            fl_clone[0] = fl_orig[0];
                            fl_clone->buffers = 0;
                            fl_clone->n_alloc = 0;
                          }));
/* *INDENT-ON* */

	      worker_thread_index++;
	    }
	}
    }
  else
    {
      /* only have non-data-structure copy threads to create... */
      for (i = 0; i < vec_len (tm->registrations); i++)
	{
	  tr = tm->registrations[i];

	  for (j = 0; j < tr->count; j++)
	    {
	      vec_add2 (vlib_worker_threads, w, 1);
	      if (tr->mheap_size)
		w->thread_mheap =
		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
	      else
		w->thread_mheap = main_heap;
	      w->thread_stack =
		vlib_thread_stack_init (w - vlib_worker_threads);
	      w->thread_function = tr->function;
	      w->thread_function_arg = w;
	      w->instance_id = j;
	      w->elog_track.name =
		(char *) format (0, "%s %d", tr->name, j + 1);
	      w->registration = tr;
	      vec_add1 (w->elog_track.name, 0);
	      elog_track_register (&vm->elog_main, &w->elog_track);
	    }
	}
    }

  worker_thread_index = 1;

  for (i = 0; i < vec_len (tm->registrations); i++)
    {
      clib_error_t *err;
      int j;

      tr = tm->registrations[i];

      if (tr->use_pthreads || tm->use_pthreads)
	{
	  for (j = 0; j < tr->count; j++)
	    {
	      w = vlib_worker_threads + worker_thread_index++;
	      err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
					    w, 0);
	      if (err)
		clib_error_report (err);
	    }
	}
      else
	{
	  uword c;
          /* *INDENT-OFF* */
          clib_bitmap_foreach (c, tr->coremask, ({
            w = vlib_worker_threads + worker_thread_index++;
	    err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
					  w, c);
	    if (err)
	      clib_error_report (err);
          }));
          /* *INDENT-ON* */
	}
    }
  vlib_worker_thread_barrier_sync (vm);
  vlib_worker_thread_barrier_release (vm);
  return 0;
}

VPP runs as one main thread plus several worker threads, with a coroutine mechanism inside each thread; this takes full advantage of per-core thread affinity and the low cost of coroutine switching.

3. Static node registration and dynamic node arrangement

Static nodes are registered with VLIB_REGISTER_NODE and are linked in at build time;

during main() initialization they are collected by vlib_register_all_static_nodes and wired into the graph by register_node:


void
vlib_register_all_static_nodes (vlib_main_t * vm)
{
  vlib_node_registration_t *r;

  static char *null_node_error_strings[] = {
    "blackholed packets",
  };

  static vlib_node_registration_t null_node_reg = {
    .function = null_node_fn,
    .vector_size = sizeof (u32),
    .name = "null-node",
    .n_errors = 1,
    .error_strings = null_node_error_strings,
  };

  /* make sure that node index 0 is not used by
     real node */
  register_node (vm, &null_node_reg);

  r = vm->node_main.node_registrations;
  while (r)
    {
      register_node (vm, r);
      r = r->next_registration;
    }
}
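
For comparison, a minimal static node registration looks like this (the node name and handler body are hypothetical):

static uword
sample_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
		vlib_frame_t * frame)
{
  /* a real node would walk the frame's buffer indices here and
     enqueue each buffer to one of its next nodes */
  return frame->n_vectors;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (sample_node) = {
  .function = sample_node_fn,
  .name = "sample-node",
  .vector_size = sizeof (u32),
  .type = VLIB_NODE_TYPE_INTERNAL,
  .n_next_nodes = 1,
  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */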

Dynamic nodes, by contrast, can be spliced into the graph while the program is running;

see vnet_config_add_feature, vnet_config_del_feature and find_config_with_features:


static vnet_config_t *
find_config_with_features (vlib_main_t * vm,
			   vnet_config_main_t * cm,
			   vnet_config_feature_t * feature_vector)
{
  u32 last_node_index = ~0;
  vnet_config_feature_t *f;
  u32 *config_string;
  uword *p;
  vnet_config_t *c;

  config_string = cm->config_string_temp;
  cm->config_string_temp = 0;
  if (config_string)
    _vec_len (config_string) = 0;

  vec_foreach (f, feature_vector)
  {
    /* Connect node graph. */
    f->next_index = add_next (vm, cm, last_node_index, f->node_index);
    last_node_index = f->node_index;

    /* Store next index in config string. */
    vec_add1 (config_string, f->next_index);

    /* Store feature config. */
    vec_add (config_string, f->feature_config, vec_len (f->feature_config));
  }

  /* Terminate config string with next for end node. */
  if (last_node_index == ~0 || last_node_index != cm->end_node_index)
    {
      u32 next_index = add_next (vm, cm, last_node_index, cm->end_node_index);
      vec_add1 (config_string, next_index);
    }

  /* See if config string is unique. */
  p = hash_get_mem (cm->config_string_hash, config_string);
  if (p)
    {
      /* Not unique.  Share existing config. */
      cm->config_string_temp = config_string;	/* we'll use it again later. */
      free_feature_vector (feature_vector);
      c = pool_elt_at_index (cm->config_pool, p[0]);
    }
  else
    {
      u32 *d;

      pool_get (cm->config_pool, c);
      c->index = c - cm->config_pool;
      c->features = feature_vector;
      c->config_string_vector = config_string;

      /* Allocate copy of config string in heap.
         VLIB buffers will maintain pointers to heap as they read out
         configuration data. */
      c->config_string_heap_index
	= heap_alloc (cm->config_string_heap, vec_len (config_string) + 1,
		      c->config_string_heap_handle);

      /* First element in heap points back to pool index. */
      d =
	vec_elt_at_index (cm->config_string_heap,
			  c->config_string_heap_index);
      d[0] = c->index;
      clib_memcpy (d + 1, config_string, vec_bytes (config_string));
      hash_set_mem (cm->config_string_hash, config_string, c->index);

      c->reference_count = 0;	/* will be incremented by caller. */
    }

  return c;
}
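
At the API level, a feature node is usually toggled on an interface with vnet_feature_enable_disable, which internally rebuilds the per-interface config string through vnet_config_add_feature / vnet_config_del_feature. A hedged sketch (the arc and node names are examples):

#include <vnet/feature/feature.h>

static int
sample_feature_enable (u32 sw_if_index, int enable)
{
  /* splice the (hypothetical) node "my-feature-node" into or out of the
     ip4-unicast arc of one interface */
  return vnet_feature_enable_disable ("ip4-unicast",	/* arc name  */
				      "my-feature-node",	/* node name */
				      sw_if_index, enable,
				      0 /* feature config */ ,
				      0 /* config bytes */ );
}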

 II. Command-line configuration analysis

The command line is implemented inside a coroutine (a VLIB process); the startup code is:


/* Called in main stack. */
static_always_inline uword
vlib_process_startup (vlib_main_t * vm, vlib_process_t * p, vlib_frame_t * f)
{
  vlib_process_bootstrap_args_t a;
  uword r;

  a.vm = vm;
  a.process = p;
  a.frame = f;

  r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN);
  if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN)
    r = clib_calljmp (vlib_process_bootstrap, pointer_to_uword (&a),
		      (void *) p->stack + (1 << p->log2_n_stack_bytes));

  return r;
}

unix_cli_process is registered as the function of the CLI process node; every command a feature registers with VLIB_CLI_COMMAND is ultimately dispatched from it through unix_cli_process_input. The CLI session node itself is created in unix_cli_file_add:


/** Store a new CLI session.
 * @param name The name of the session.
 * @param fd   The file descriptor for the session I/O.
 * @return The session ID.
 */
static u32
unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd)
{
  unix_main_t *um = &unix_main;
  clib_file_main_t *fm = &file_main;
  unix_cli_file_t *cf;
  clib_file_t template = { 0 };
  vlib_main_t *vm = um->vlib_main;
  vlib_node_t *n;

  name = (char *) format (0, "unix-cli-%s", name);

  if (vec_len (cm->unused_cli_process_node_indices) > 0)
    {
      uword l = vec_len (cm->unused_cli_process_node_indices);

      /* Find node and give it new name. */
      n = vlib_get_node (vm, cm->unused_cli_process_node_indices[l - 1]);
      vec_free (n->name);
      n->name = (u8 *) name;

      vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING);

      _vec_len (cm->unused_cli_process_node_indices) = l - 1;
    }
  else
    {
      static vlib_node_registration_t r = {
	.function = unix_cli_process,
	.type = VLIB_NODE_TYPE_PROCESS,
	.process_log2_n_stack_bytes = 16,
      };

      r.name = name;
      vlib_register_node (vm, &r);
      vec_free (name);

      n = vlib_get_node (vm, r.index);
    }

  pool_get (cm->cli_file_pool, cf);
  memset (cf, 0, sizeof (*cf));

  template.read_function = unix_cli_read_ready;
  template.write_function = unix_cli_write_ready;
  template.error_function = unix_cli_error_detected;
  template.file_descriptor = fd;
  template.private_data = cf - cm->cli_file_pool;

  cf->process_node_index = n->index;
  cf->clib_file_index = clib_file_add (fm, &template);
  cf->output_vector = 0;
  cf->input_vector = 0;

  vlib_start_process (vm, n->runtime_index);

  vlib_process_t *p = vlib_get_process_from_node (vm, n);
  p->output_function = unix_vlib_cli_output;
  p->output_function_arg = cf - cm->cli_file_pool;

  return cf - cm->cli_file_pool;
}
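
A command handled through this path is registered with VLIB_CLI_COMMAND; a minimal sketch (command path and handler are hypothetical):

static clib_error_t *
show_sample_command_fn (vlib_main_t * vm, unformat_input_t * input,
			vlib_cli_command_t * cmd)
{
  vlib_cli_output (vm, "hello from the sample command");
  return 0;
}

/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_sample_command, static) = {
  .path = "show sample",
  .short_help = "show sample",
  .function = show_sample_command_fn,
};
/* *INDENT-ON* */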

 III. Packet forwarding

#0  dpdk_device_input (dm=0x7fc770b66900 <dpdk_main>, xd=0x7fc771c7cc40, 
    node=0x7fc771c50228, thread_index=2, queue_id=0, maybe_multiseg=1)
    at /home/share/vpp/build-data/../src/plugins/dpdk/device/node.c:224
#1  0x00007fc76fd6082b in dpdk_input_avx2 (vm=0x7fc771c62f78, 
    node=0x7fc771c50228, f=0x0)
    at /home/share/vpp/build-data/../src/plugins/dpdk/device/node.c:606
#2  0x00007fc7b32b2c9a in dispatch_node (vm=0x7fc771c62f78, node=0x7fc771c50228, 
    type=VLIB_NODE_TYPE_INPUT, dispatch_state=VLIB_NODE_STATE_POLLING, 
    frame=0x0, last_time_stamp=7989642574756)
    at /home/share/vpp/build-data/../src/vlib/main.c:988
#3  0x00007fc7b32b4a79 in vlib_main_or_worker_loop (vm=0x7fc771c62f78, is_main=0)
    at /home/share/vpp/build-data/../src/vlib/main.c:1506
#4  0x00007fc7b32b5520 in vlib_worker_loop (vm=0x7fc771c62f78)
    at /home/share/vpp/build-data/../src/vlib/main.c:1634
#5  0x00007fc7b32f6de7 in vlib_worker_thread_fn (arg=0x7fc770ef4580)
    at /home/share/vpp/build-data/../src/vlib/threads.c:1736
#6  0x00007fc7b21fbdc4 in clib_calljmp ()
    at /home/share/vpp/build-data/../src/vppinfra/longjmp.S:110
#7  0x00007fc60cb8fd70 in ?? ()
#8  0x00007fc7b32f1e56 in vlib_worker_thread_bootstrap_fn (arg=0x7fc770ef4580)
    at /home/share/vpp/build-data/../src/vlib/threads.c:680

VPP can receive and transmit packets with or without DPDK. As the backtrace above shows, the worker threads also run on the coroutine mechanism. Node-level packet processing works as follows:

1. For a node registered by a feature with VLIB_REGISTER_NODE, VPP first obtains the node index;

2. VLIB_NODE_TYPE_PRE_INPUT nodes check, epoll-style, whether any I/O has data pending;

3. VLIB_NODE_TYPE_INPUT nodes such as dpdk_device_input call the DPDK API to receive packets, pulling mbufs off the NIC queue with dpdk_rx_burst (see the sketch below).
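
At its core dpdk_rx_burst wraps DPDK's rte_eth_rx_burst. A hedged illustration (not the VPP code itself; port selection simplified):

#include <rte_ethdev.h>

/* Pull up to max_pkts mbufs from one RX queue of a port; the return value
   is how many were actually dequeued (0 when the ring is empty). */
static inline uint16_t
rx_burst_sketch (uint16_t port_id, uint16_t queue_id,
		 struct rte_mbuf **rx_pkts, uint16_t max_pkts)
{
  return rte_eth_rx_burst (port_id, queue_id, rx_pkts, max_pkts);
}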


/*
 * This function is used when there are no worker threads.
 * The main thread performs IO and forwards the packets.
 */
static_always_inline u32
dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
		   vlib_node_runtime_t * node, u32 thread_index, u16 queue_id,
		   int maybe_multiseg)
{
  u32 n_buffers;
  u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
  u32 n_left_to_next, *to_next;
  u32 mb_index;
  vlib_main_t *vm = vlib_get_main ();
  uword n_rx_bytes = 0;
  u32 n_trace, trace_cnt __attribute__ ((unused));
  vlib_buffer_free_list_t *fl;
  vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index);

  if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
    return 0;

  n_buffers = dpdk_rx_burst (dm, xd, queue_id);

  if (n_buffers == 0)
    {
      return 0;
    }

  vec_reset_length (xd->d_trace_buffers[thread_index]);
  trace_cnt = n_trace = vlib_get_trace_count (vm, node);

  if (n_trace > 0)
    {
      u32 n = clib_min (n_trace, n_buffers);
      mb_index = 0;

      while (n--)
	{
	  struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++];
	  vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);	/* convert the rte_mbuf into a vlib_buffer_t */
	  vec_add1 (xd->d_trace_buffers[thread_index],
		    vlib_get_buffer_index (vm, b));
	}
    }

  fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

  /* Update buffer template */
  vnet_buffer (bt)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
  bt->error = node->errors[DPDK_ERROR_NONE];
  /* as DPDK is allocating empty buffers from mempool provided before interface
     start for each queue, it is safe to store this in the template */
  bt->buffer_pool_index = xd->buffer_pool_for_queue[queue_id];

  mb_index = 0;

  while (n_buffers > 0)
    {
      vlib_buffer_t *b0, *b1, *b2, *b3;
      u32 bi0, next0;
      u32 bi1, next1;
      u32 bi2, next2;
      u32 bi3, next3;
      u8 error0, error1, error2, error3;
      i16 offset0, offset1, offset2, offset3;
      u64 or_ol_flags;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_buffers >= 12 && n_left_to_next >= 4)
	{
	  struct rte_mbuf *mb0, *mb1, *mb2, *mb3;

	  /* prefetches are interleaved with the rest of the code to reduce
	     pressure on L1 cache */
	  dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 8]);
	  dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 4]);

	  mb0 = xd->rx_vectors[queue_id][mb_index];
	  mb1 = xd->rx_vectors[queue_id][mb_index + 1];
	  mb2 = xd->rx_vectors[queue_id][mb_index + 2];
	  mb3 = xd->rx_vectors[queue_id][mb_index + 3];

	  ASSERT (mb0);
	  ASSERT (mb1);
	  ASSERT (mb2);
	  ASSERT (mb3);

	  if (maybe_multiseg)
	    {
	      if (PREDICT_FALSE (mb0->nb_segs > 1))
		dpdk_prefetch_buffer (mb0->next);
	      if (PREDICT_FALSE (mb1->nb_segs > 1))
		dpdk_prefetch_buffer (mb1->next);
	      if (PREDICT_FALSE (mb2->nb_segs > 1))
		dpdk_prefetch_buffer (mb2->next);
	      if (PREDICT_FALSE (mb3->nb_segs > 1))
		dpdk_prefetch_buffer (mb3->next);
	    }
	  /* process four mbufs per iteration */
	  b0 = vlib_buffer_from_rte_mbuf (mb0);
	  b1 = vlib_buffer_from_rte_mbuf (mb1);
	  b2 = vlib_buffer_from_rte_mbuf (mb2);
	  b3 = vlib_buffer_from_rte_mbuf (mb3);

	  dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 9]);
	  dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 5]);

	  clib_memcpy64_x4 (b0, b1, b2, b3, bt);

	  dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 10]);
	  dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 7]);

	  bi0 = vlib_get_buffer_index (vm, b0);
	  bi1 = vlib_get_buffer_index (vm, b1);
	  bi2 = vlib_get_buffer_index (vm, b2);
	  bi3 = vlib_get_buffer_index (vm, b3);

	  to_next[0] = bi0;
	  to_next[1] = bi1;
	  to_next[2] = bi2;
	  to_next[3] = bi3;
	  to_next += 4;
	  n_left_to_next -= 4;

	  if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
	    {
	      next0 = next1 = next2 = next3 = xd->per_interface_next_index;
	    }
	  else
	    {
	      next0 = dpdk_rx_next_from_etype (mb0);
	      next1 = dpdk_rx_next_from_etype (mb1);
	      next2 = dpdk_rx_next_from_etype (mb2);
	      next3 = dpdk_rx_next_from_etype (mb3);
	    }

	  dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 11]);
	  dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 6]);

	  or_ol_flags = (mb0->ol_flags | mb1->ol_flags |
			 mb2->ol_flags | mb3->ol_flags);
	  if (PREDICT_FALSE (or_ol_flags & PKT_RX_IP_CKSUM_BAD))
	    {
	      dpdk_rx_error_from_mb (mb0, &next0, &error0);
	      dpdk_rx_error_from_mb (mb1, &next1, &error1);
	      dpdk_rx_error_from_mb (mb2, &next2, &error2);
	      dpdk_rx_error_from_mb (mb3, &next3, &error3);
	      b0->error = node->errors[error0];
	      b1->error = node->errors[error1];
	      b2->error = node->errors[error2];
	      b3->error = node->errors[error3];
	    }

	  offset0 = device_input_next_node_advance[next0];
	  b0->current_data = mb0->data_off + offset0 - RTE_PKTMBUF_HEADROOM;
	  b0->flags |= device_input_next_node_flags[next0];
	  vnet_buffer (b0)->l3_hdr_offset = b0->current_data;
	  vnet_buffer (b0)->l2_hdr_offset =
	    mb0->data_off - RTE_PKTMBUF_HEADROOM;
	  b0->current_length = mb0->data_len - offset0;
	  n_rx_bytes += mb0->pkt_len;

	  offset1 = device_input_next_node_advance[next1];
	  b1->current_data = mb1->data_off + offset1 - RTE_PKTMBUF_HEADROOM;
	  b1->flags |= device_input_next_node_flags[next1];
	  vnet_buffer (b1)->l3_hdr_offset = b1->current_data;
	  vnet_buffer (b1)->l2_hdr_offset =
	    mb1->data_off - RTE_PKTMBUF_HEADROOM;
	  b1->current_length = mb1->data_len - offset1;
	  n_rx_bytes += mb1->pkt_len;

	  offset2 = device_input_next_node_advance[next2];
	  b2->current_data = mb2->data_off + offset2 - RTE_PKTMBUF_HEADROOM;
	  b2->flags |= device_input_next_node_flags[next2];
	  vnet_buffer (b2)->l3_hdr_offset = b2->current_data;
	  vnet_buffer (b2)->l2_hdr_offset =
	    mb2->data_off - RTE_PKTMBUF_HEADROOM;
	  b2->current_length = mb2->data_len - offset2;
	  n_rx_bytes += mb2->pkt_len;

	  offset3 = device_input_next_node_advance[next3];
	  b3->current_data = mb3->data_off + offset3 - RTE_PKTMBUF_HEADROOM;
	  b3->flags |= device_input_next_node_flags[next3];
	  vnet_buffer (b3)->l3_hdr_offset = b3->current_data;
	  vnet_buffer (b3)->l2_hdr_offset =
	    mb3->data_off - RTE_PKTMBUF_HEADROOM;
	  b3->current_length = mb3->data_len - offset3;
	  n_rx_bytes += mb3->pkt_len;


	  /* Process subsequent segments of multi-segment packets */
	  if (maybe_multiseg)
	    {
	      dpdk_process_subseq_segs (vm, b0, mb0, fl);
	      dpdk_process_subseq_segs (vm, b1, mb1, fl);
	      dpdk_process_subseq_segs (vm, b2, mb2, fl);
	      dpdk_process_subseq_segs (vm, b3, mb3, fl);
	    }

	  /*
	   * Turn this on if you run into
	   * "bad monkey" contexts, and you want to know exactly
	   * which nodes they've visited... See main.c...
	   */
	  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
	  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1);
	  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2);
	  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3);

	  /* Do we have any driver RX features configured on the interface? */
	  vnet_feature_start_device_input_x4 (xd->vlib_sw_if_index,
					      &next0, &next1, &next2, &next3,
					      b0, b1, b2, b3);

	  vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, bi1, bi2, bi3,
					   next0, next1, next2, next3);
	  n_buffers -= 4;
	  mb_index += 4;
	}
      while (n_buffers > 0 && n_left_to_next > 0)
	{
	  struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index];

	  if (PREDICT_TRUE (n_buffers > 3))
	    {
	      dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 2]);
	      dpdk_prefetch_ethertype (xd->rx_vectors[queue_id]
				       [mb_index + 1]);
	    }

	  ASSERT (mb0);

	  b0 = vlib_buffer_from_rte_mbuf (mb0);

	  /* Prefetch one next segment if it exists. */
	  if (PREDICT_FALSE (mb0->nb_segs > 1))
	    dpdk_prefetch_buffer (mb0->next);

	  clib_memcpy (b0, bt, CLIB_CACHE_LINE_BYTES);

	  bi0 = vlib_get_buffer_index (vm, b0);

	  to_next[0] = bi0;
	  to_next++;
	  n_left_to_next--;

	  if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
	    next0 = xd->per_interface_next_index;
	  else
	    next0 = dpdk_rx_next_from_etype (mb0);

	  dpdk_rx_error_from_mb (mb0, &next0, &error0);
	  b0->error = node->errors[error0];

	  offset0 = device_input_next_node_advance[next0];
	  b0->current_data = mb0->data_off + offset0 - RTE_PKTMBUF_HEADROOM;
	  b0->flags |= device_input_next_node_flags[next0];
	  vnet_buffer (b0)->l3_hdr_offset = b0->current_data;
	  vnet_buffer (b0)->l2_hdr_offset =
	    mb0->data_off - RTE_PKTMBUF_HEADROOM;
	  b0->current_length = mb0->data_len - offset0;
	  n_rx_bytes += mb0->pkt_len;

	  /* Process subsequent segments of multi-segment packets */
	  dpdk_process_subseq_segs (vm, b0, mb0, fl);

	  /*
	   * Turn this on if you run into
	   * "bad monkey" contexts, and you want to know exactly
	   * which nodes they've visited... See main.c...
	   */
	  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);

	  /* Do we have any driver RX features configured on the interface? */
	  vnet_feature_start_device_input_x1 (xd->vlib_sw_if_index, &next0,
					      b0);

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, next0);
	  n_buffers--;
	  mb_index++;
	}
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0))
    {
      dpdk_rx_trace (dm, node, xd, queue_id,
		     xd->d_trace_buffers[thread_index],
		     vec_len (xd->d_trace_buffers[thread_index]));
      vlib_set_trace_count (vm, node,
			    n_trace -
			    vec_len (xd->d_trace_buffers[thread_index]));
    }

  vlib_increment_combined_counter
    (vnet_get_main ()->interface_main.combined_sw_if_counters
     + VNET_INTERFACE_COUNTER_RX,
     thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);

  vnet_device_increment_rx_packets (thread_index, mb_index);

  return mb_index;
}

A closing thought: building technical depth is a gradual, day-by-day process; don't expect overnight mastery. At the same time, what you accumulate should match what the current market actually needs, and reinventing the wheel is best avoided. The design ideas behind many open-source projects are well worth studying for reference.
