Xen中的异常和中断(系统调用)、gdt、ldt

HYPERVISOR_set_gdt

static void xen_load_gdt(const struct desc_ptr *dtr)

/*
  * load_gdt for early boot, when the gdt is only mapped once
  */
 static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)

 

/*
 * Excerpt: Xen PV's pv_cpu_ops, routing GDT/IDT loads through the
 * hypercall-backed implementations shown above.
 * NOTE(review): the initializer is elided in the original article and the
 * trailing ';' is missing — this fragment is not compilable as-is.
 */
static const struct pv_cpu_ops xen_cpu_ops __initconst = {

     .load_gdt = xen_load_gdt,
     .load_idt = xen_load_idt,

}

 

/* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu) 
 {
     struct desc_ptr gdt_descr;
 
     gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
     gdt_descr.size = GDT_SIZE - 1; 
     load_gdt(&gdt_descr);
 }
 EXPORT_SYMBOL_GPL(load_direct_gdt);
 
 /* Load a fixmap remapping of the per-cpu GDT */
 void load_fixmap_gdt(int cpu) 
 {
     struct desc_ptr gdt_descr;
 
     gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
     gdt_descr.size = GDT_SIZE - 1; 
     load_gdt(&gdt_descr);
 }
 EXPORT_SYMBOL_GPL(load_fixmap_gdt);

/*
  * Current gdt points %fs at the "master" per-cpu area: after this,
  * it's on the real one.
  *
  * Loads the CPU's own GDT and then reloads the per-cpu segment base,
  * in that order (the per-cpu reload depends on the new GDT being live).
  */
 void switch_to_new_gdt(int cpu) 
 {
     /* Load the original GDT */
     load_direct_gdt(cpu);
     /* Reload the per-cpu base */
     load_percpu_segment(cpu);
 }
/*
  * Set up the GDT and segment registers for -fstack-protector.  Until
  * we do this, we have to be careful not to call any stack-protected
  * function, which is most of the kernel.
  */
 static void __init xen_setup_gdt(int cpu) 
 {
     /*
      * Temporarily install the boot-time variants: this early the GDT is
      * only mapped once (see xen_load_gdt_boot above), so the normal
      * write/load paths cannot be used yet.
      */
     pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
     pv_ops.cpu.load_gdt = xen_load_gdt_boot;
 
     setup_stack_canary_segment(cpu);
     switch_to_new_gdt(cpu);
 
     /* Restore the regular hypercall-based implementations. */
     pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
     pv_ops.cpu.load_gdt = xen_load_gdt;
 }
/* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
    /*
      * Set up kernel GDT and segment registers, mainly so that
      * -fstack-protector code can be executed.
      */
     xen_setup_gdt(0);

/* NOTE(review): excerpt — the rest of xen_start_kernel is elided here. */
}

/*
 * Early setup to make printk work.
 * Switches the boot CPU onto its own per-cpu GDT and marks it callout/online.
 */
 void __init native_smp_prepare_boot_cpu(void)
 {
     const int cpu = smp_processor_id();

     switch_to_new_gdt(cpu);
     /* boot_cpu_init() already put this CPU into cpu_online_mask. */
     cpumask_set_cpu(cpu, cpu_callout_mask);
     cpu_set_state_online(cpu);
 }
/*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
  * and IDT. We reload them nevertheless, this function acts as a
  * 'CPU state barrier', nothing should get across.
  * A lot of state is already set up in PDA init for 64 bit
  */
 #ifdef CONFIG_X86_64
 
 void cpu_init(void)
 {
    /* NOTE(review): excerpt — 'cpu' is declared in the elided part of the
     * original function, and the matching #endif is also elided. */
    switch_to_new_gdt(cpu);

}

 

/* Excerpt: Xen PV SMP hooks (function bodies elided in the article). */
static void __init xen_pv_smp_prepare_boot_cpu(void)

/* Boot-CPU preparation is routed through the PV-specific hook above.
 * NOTE(review): initializer elided; trailing ';' missing in the excerpt. */
static const struct smp_ops xen_smp_ops __initconst = { 
     .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,

}

void __init xen_hvm_smp_init(void)
 

/* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
 

#ifdef CONFIG_X86_32
     /* 32-bit PV: the guest kernel is demoted to RPL 1, unless Xen offers
      * supervisor-mode-kernel, in which case it stays at RPL 0. */
     pv_info.kernel_rpl = 1; 
     if (xen_feature(XENFEAT_supervisor_mode_kernel))
         pv_info.kernel_rpl = 0; 

/* NOTE(review): excerpt — the matching #endif and the remainder of the
 * function are elided in the original article. */
}

 

特权级是实现系统虚拟化的关键因素,因为通过其可以将整个系统划分为虚拟化管理、系统内核、用户空间等不同部分,实现分级保护和资源共享。因此,深入分析gdt、ldt、异常、中断等的实现方式,对于理解由只含Linux内核至虚拟化管理器的出现,具有重要的理论和实践意义。本文将结合源码分析、xen的docs说明,充分阐述上述问题的解决方案。

 

Xen是怎么做到“特权解除”的呢?当然是通过设置GDT表格了

特权解除:是指解除正常情况下运行于ring0的段,比如中断处理程序,为了虚拟化需要,此时解除其特权,将其运行于ring1

当用户程序通过系统调用时,其跳转到的中断处理程序运行于ring1。但是,在中断处理程序中,有部分指令是必须在ring0才能执行的,比如lgdt、写CR0/CR3等特权指令,此时,便会自动陷入到Xen,由Xen进行模拟。

也就是说,用户程序触发特权指令的执行,会经历两次特权级切换(陷入):第一次是通过系统调用由ring3进入位于ring1的内核,第二次是内核中的特权指令触发陷入、进入ring0的Xen。这说明,中断发生时的中断处理程序还是以前位于内核的代码,但是其运行级别为ring1,部分指令还需要再次陷入Xen,才能完成执行。

特权指令为什么会自动陷入呢?
Xen具体是怎么做到的呢?

GDT本质上是一个预先设定好的表格,里面包含了地址、权限等信息;在使用过程中,通过lgdt指令实现加载。同时,由于lgdt指令是敏感指令,因此,guest OS在启动过程中会将其陷入、并进行模拟。在guest os启动前,会通过struct vcpu结构体预置gdt表地址,然后在模拟lgdt指令时实现加载。

GDT:gdt开始时是由xen设定的,决定了guest os的启动位置;如果guest os不同意,可以通过hypercall进行修改。guest os在启动的时候,会利用xen为其提供的约定的GDT,这个GDT不在guest os内存空间内。如果,guest OS不想利用该GDT所提供的空间,而是想利用其它的位于ring1 或者ring3的"flat"空间,那么guest os需要首先从其内存空间内分配一块GDT内存,然后向xen注册。该注册过程是利用 int set_gdt(unsigned long *frame_list, int entries)函数实现的,其中,frame_list最多由14个machine page frame组成,新的GDT位于其内。在注册之后,这些frame只能为只读;之所以只能利用前14个frame,是因为第15、16个frame会被用来存放xen的GDT项,xen保留项的具体内容参见(xen/include/public/arch-x86_32.h)(这就是说一共有16个frames)。entries代表frames中的项数。

file:./xen/arch/x86/domain.c

dt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) : per_cpu(compat_gdt_table, cpu);

可以看出来,如果是32位半虚拟化(PV compat)guest,其将用compat_gdt_table;64位PV guest则用gdt_table

file:./xen/arch/x86/smpboot.c

    per_cpu(compat_gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
     if ( gdt == NULL )
         goto out; 
     memcpy(gdt, boot_cpu_compat_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;

可以看出,compat_gdt_table对应boot_cpu_compat_gdt_table

继续追file:./xen/arch/x86/boot/x86_64.S

GLOBAL(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
         .quad 0x0000000000000000     /* reserved                          */
         .quad 0x00cffa000000ffff     /* 0xe023 ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
         .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
         .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
         .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
 
         .align PAGE_SIZE, 0
 /* NB. Even rings != 0 get access to the full 4Gb, as only the            */
 /*     (compatibility) machine->physical mapping table lives there.       */
 GLOBAL(boot_cpu_compat_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
         .quad 0x00cfba000000ffff     /* 0xe019 ring 1 code, compatibility */
         .quad 0x00cfb2000000ffff     /* 0xe021 ring 1 data                */
         .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
         .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
         .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
         .align PAGE_SIZE, 0

 

哈哈,各位看清楚了吗?这就是秘密啦:在32位半虚拟化(compat)GDT中,guest OS的内核代码段被放到了ring 1(上面的0xe019/0xe021项);而在64位GDT中并没有ring1的内核段,64位PV guest的内核运行于ring 3。

 /*
  * Install a guest-supplied GDT for PV vcpu @v.
  *
  * @frames:  guest frame numbers holding the new GDT; rewritten in place
  *           to machine frame numbers as each page is validated.
  * @entries: number of 8-byte descriptors; capped at FIRST_RESERVED_GDT_ENTRY
  *           because Xen reserves the tail of the GDT for itself.
  *
  * Each page holds 512 descriptors (4096 / 8), hence DIV_ROUND_UP(entries, 512).
  * Returns 0 on success, -EINVAL on any validation failure.
  */
 long pv_set_gdt(struct vcpu *v, unsigned long *frames, unsigned int entries)
 {
     struct domain *d = v->domain;
     l1_pgentry_t *pl1e;
     unsigned int i, nr_frames = DIV_ROUND_UP(entries, 512);
 
     if ( entries > FIRST_RESERVED_GDT_ENTRY )
         return -EINVAL;
 
     /* Check the pages in the new GDT. */
     for ( i = 0; i < nr_frames; i++ )
     {
         struct page_info *page;
 
         page = get_page_from_gfn(d, frames[i], NULL, P2M_ALLOC);
         if ( !page )
             goto fail;
         /* Type-pin each frame as a descriptor page — presumably so the
          * contents are validated before being used as a GDT (TODO confirm
          * against the PGT_seg_desc_page type handler). */
         if ( !get_page_type(page, PGT_seg_desc_page) )
         {
             put_page(page);
             goto fail;
         }
         frames[i] = mfn_x(page_to_mfn(page));
     }
 
     /* Tear down the old GDT. */
     pv_destroy_gdt(v);
 
     /* Install the new GDT. */
     v->arch.pv_vcpu.gdt_ents = entries;
     pl1e = pv_gdt_ptes(v);
     for ( i = 0; i < nr_frames; i++ )
     {
         v->arch.pv_vcpu.gdt_frames[i] = frames[i];
         l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR_RW));
     }
 
     return 0;
 
  fail:
     /* i counts the frames already referenced; unwind exactly those. */
     while ( i-- > 0 )
     {
         put_page_and_type(mfn_to_page(_mfn(frames[i])));
     }
     return -EINVAL;
 }
 

下面是原生(64位)PV guest使用的set_gdt超级调用入口(注意:它走的是pv_set_gdt,并非硬件虚拟化模式):

/*
 * set_gdt hypercall entry for native-width PV guests.
 *
 * Copies up to nr_frames guest frame numbers (at most 14 given the
 * FIRST_RESERVED_GDT_ENTRY cap, so frames[16] is sufficient), then installs
 * them via pv_set_gdt() under the domain lock and flushes the local TLB on
 * success.
 */
long do_set_gdt(XEN_GUEST_HANDLE_PARAM(xen_ulong_t) frame_list,
                 unsigned int entries)
 {
     unsigned int nr_frames = DIV_ROUND_UP(entries, 512);
     unsigned long frames[16];
     struct vcpu *curr = current;
     long ret;
 
     /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
     if ( entries > FIRST_RESERVED_GDT_ENTRY )
         return -EINVAL;
 
     if ( copy_from_guest(frames, frame_list, nr_frames) )
         return -EFAULT;
 
     domain_lock(curr->domain);
 
     if ( (ret = pv_set_gdt(curr, frames, entries)) == 0 )
         flush_tlb_local();
 
     domain_unlock(curr->domain);
 
     return ret;
 }
 

下面是32位(compat)半虚拟化guest使用的set_gdt入口,它把32位的frame号逐个拷贝、扩宽后同样交给pv_set_gdt:

/*
 * set_gdt hypercall entry for 32-bit (compat) PV guests.
 *
 * Same contract as do_set_gdt(), but the guest's frame list holds 32-bit
 * values, so each entry is copied individually and widened to
 * unsigned long before being handed to pv_set_gdt().
 */
int compat_set_gdt(XEN_GUEST_HANDLE_PARAM(uint) frame_list,
                    unsigned int entries)
 {
     struct vcpu *curr = current;
     unsigned int i, nr_frames = DIV_ROUND_UP(entries, 512);
     unsigned long frames[16];
     int ret;
 
     /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */
     if ( entries > FIRST_RESERVED_GDT_ENTRY )
         return -EINVAL;
 
     if ( !guest_handle_okay(frame_list, nr_frames) )
         return -EFAULT;
 
     for ( i = 0; i < nr_frames; ++i )
     {   
         unsigned int frame;
 
         if ( __copy_from_guest(&frame, frame_list, 1) )
             return -EFAULT;
 
         frames[i] = frame;
         guest_handle_add_offset(frame_list, 1); 
     }   
 
     domain_lock(curr->domain);
 
     if ( (ret = pv_set_gdt(curr, frames, entries)) == 0 ) 
         flush_tlb_local();
 
     domain_unlock(curr->domain);
 
     return ret;
 }
 

/*
 * Xen-side per-vcpu state for a paravirtualized guest: guest GDT/LDT
 * location, kernel stack, control registers, and the event/syscall
 * callback entry points the guest registered with Xen.
 */
struct pv_vcpu
 {
     /* map_domain_page() mapping cache. */
     struct mapcache_vcpu mapcache;
 
     struct trap_info *trap_ctxt;
 
     /* Guest GDT: machine frame numbers and entry count, as installed by
      * pv_set_gdt(); LDT base/size, updated via mmu_update. */
     unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE];
     unsigned long ldt_base;
     unsigned int gdt_ents, ldt_ents;
 
     unsigned long kernel_ss, kernel_sp;
     unsigned long ctrlreg[8];
 
     unsigned long event_callback_eip;
     unsigned long failsafe_callback_eip;
     union {
         unsigned long syscall_callback_eip;
         struct {
             unsigned int event_callback_cs;
             unsigned int failsafe_callback_cs;
         };  
     };  
 
     unsigned long syscall32_callback_eip;
     unsigned long sysenter_callback_eip;
     unsigned short syscall32_callback_cs;
     unsigned short sysenter_callback_cs;
     bool_t syscall32_disables_events;
     bool_t sysenter_disables_events;
 
     /* Segment base addresses. */
     unsigned long fs_base;
     unsigned long gs_base_kernel;
     unsigned long gs_base_user;
 
     /* Bounce information for propagating an exception to guest OS. */
     struct trap_bounce trap_bounce;
 
     /* I/O-port access bitmap. */
     XEN_GUEST_HANDLE(uint8) iobmp; /* Guest kernel vaddr of the bitmap. */
     unsigned int iobmp_limit; /* Number of ports represented in the bitmap. */
 #define IOPL(val) MASK_INSR(val, X86_EFLAGS_IOPL)
     unsigned int iopl;        /* Current IOPL for this VCPU, shifted left by
                                * 12 to match the eflags register. */
 
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
     spinlock_t shadow_ldt_lock;
 
     /* data breakpoint extension MSRs */
     uint32_t dr_mask[4];
     /* Deferred VA-based update state. */
     bool_t need_update_runstate_area;
     struct vcpu_time_info pending_system_time;
 };
/*
 * x86 architectural per-vcpu state, shared between PV and HVM guests;
 * the guest-type-specific part lives in the pv_vcpu/hvm_vcpu union below.
 */
struct arch_vcpu
 {
     /*
      * guest context (mirroring struct vcpu_guest_context) common
      * between pv and hvm guests
      */
 
     void              *fpu_ctxt;
     unsigned long      vgc_flags;
     struct cpu_user_regs user_regs;
     unsigned long      debugreg[8];
 
     /* other state */
 
     unsigned long      flags; /* TF_ */
 
     struct vpmu_struct vpmu;
 
     /* Virtual Machine Extensions */
     union {
         struct pv_vcpu pv_vcpu;
         struct hvm_vcpu hvm_vcpu;
     };
 
     pagetable_t guest_table_user;       /* (MFN) x86/64 user-space pagetable */
     pagetable_t guest_table;            /* (MFN) guest notion of cr3 */
     struct page_info *old_guest_table;  /* partially destructed pagetable */
     struct page_info *old_guest_ptpg;   /* containing page table of the */
                                         /* former, if any */
     /* guest_table holds a ref to the page, and also a type-count unless
      * shadow refcounts are in use */
     pagetable_t shadow_table[4];        /* (MFN) shadow(s) of guest */
     pagetable_t monitor_table;          /* (MFN) hypervisor PT (for HVM) */
     unsigned long cr3;                  /* (MA) value to install in HW CR3 */
 
     /*
      * The save area for Processor Extended States and the bitmask of the
      * XSAVE/XRSTOR features. They are used by: 1) when a vcpu (which has
      * dirtied FPU/SSE) is scheduled out we XSAVE the states here; 2) in
      * #NM handler, we XRSTOR the states we XSAVE-ed;
      */
     struct xsave_struct *xsave_area;
     uint64_t xcr0;
     /* Accumulated eXtended features mask for using XSAVE/XRESTORE by Xen
      * itself, as we can never know whether guest OS depends on content
      * preservation whenever guest OS clears one feature flag (for example,
      * temporarily).
      * However, processor should not be able to touch eXtended states before
      * it explicitly enables it via xcr0.
      */
     uint64_t xcr0_accum;
     /* This variable determines whether nonlazy extended state has been used,
      * and thus should be saved/restored. */

     bool_t nonlazy_xstate_used;
 
     /* Restore all FPU state (lazy and non-lazy state) on context switch? */
     bool fully_eager_fpu;
 
     struct vmce vmce;
 
     struct paging_vcpu paging;
 
     uint32_t gdbsx_vcpu_event;
 
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
 
     struct arch_vm_event *vm_event;
 
     struct msr_vcpu_policy *msr;
 
     struct {
         bool next_interrupt_enabled;
     } monitor;
 };
/* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. 
  *ADDED by me: dom->arch_hooks->vcpu calls XEN_DOMCTL_setvcpucontext when boot a guest os. And the source code is in xc_dom_boot.c
  * VCPUOP_initialise is called by guest os itself?
  *
  * NOTE(review): excerpt — the '××××' markers below stand for elided code;
  * variables such as 'compat', 'flags' and 'i' are declared in the elided
  * parts. This fragment is not compilable as-is. */
 int arch_set_info_guest(
     struct vcpu *v, vcpu_guest_context_u c)
 {

××××
             /* First initialisation: accept the LDT location from the context. */
             if ( !v->is_initialised )
     {
         if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
             return -EINVAL;
 
         v->arch.pv_vcpu.ldt_base = c(ldt_base);
         v->arch.pv_vcpu.ldt_ents = c(ldt_ents);
     }
     else
     {
         /* Re-initialisation: the supplied context must match the live
          * page tables and GDT/LDT state exactly. */
         unsigned long pfn = pagetable_get_pfn(v->arch.guest_table);
         bool fail;
 
         if ( !compat )
         {
             fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3];
             if ( pagetable_is_null(v->arch.guest_table_user) )
                 fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel);
             else
             {
                 pfn = pagetable_get_pfn(v->arch.guest_table_user);
                 fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1];
             }
         } else {
             l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn));
 
             pfn = l4e_get_pfn(*l4tab);
             unmap_domain_page(l4tab);
             fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3];
         }
 
         for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i )
             fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]);
         fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents);
 
         fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base);
         fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents);
 
         if ( fail )
            return -EOPNOTSUPP;
     }
 
     v->arch.pv_vcpu.kernel_ss = c(kernel_ss);
     v->arch.pv_vcpu.kernel_sp = c(kernel_sp);
     for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i )
         v->arch.pv_vcpu.ctrlreg[i] = c(ctrlreg[i]);
 
     v->arch.pv_vcpu.event_callback_eip = c(event_callback_eip);
     v->arch.pv_vcpu.failsafe_callback_eip = c(failsafe_callback_eip);

×××××××

}

 

/* Excerpt: libxc's 32-bit vcpu hook — pushes the built context to Xen via
 * xc_vcpu_setcontext(). The '***' markers stand for elided code. */
static int vcpu_x86_32(struct xc_dom_image *dom){

***

rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);

***

}

 

/* Excerpt: the 32-bit PAE arch hooks, wiring .vcpu to vcpu_x86_32
 * (full initializer shown later in this article; trailing ';' elided). */
static struct xc_dom_arch xc_dom_32_pae = {

.vcpu = vcpu_x86_32,

}

 

/* Register the per-guest-type arch hook tables with libxc's domain
 * builder (32-bit PAE PV, 64-bit PV, and 32-bit HVM). */
static void __init register_arch_hooks(void)

{

xc_dom_register_arch_hooks(&xc_dom_32_pae);

xc_dom_register_arch_hooks(&xc_dom_64);

xc_dom_register_arch_hooks(&xc_hvm_32);

}

 

/*
 * Final boot-time setup for a freshly built domain image: run the early
 * arch hooks, sanity-check domain info, set up the p2m and page tables,
 * write the start-info and hypercall pages, run the late arch hooks, and
 * finally hand the vcpu context to Xen so the VM can run.
 */
int xc_dom_boot_image(struct xc_dom_image *dom)
{
    xc_dominfo_t dominfo;
    int rc;

    DOMPRINTF_CALLED(dom->xch);

    /* misc stuff */
    if ( (rc = dom->arch_hooks->bootearly(dom)) != 0 )
        return rc;

    /* collect some info */
    rc = xc_domain_getinfo(dom->xch, dom->guest_domid, 1, &dominfo);
    if ( rc < 0 )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: getdomaininfo failed (rc=%d)", __FUNCTION__, rc);
        return rc;
    }
    if ( rc == 0 || dominfo.domid != dom->guest_domid )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: Huh? No domains found (nr_domains=%d) "
                     "or domid mismatch (%d != %d)", __FUNCTION__,
                     rc, dominfo.domid, dom->guest_domid);
        return -1;
    }
    dom->shared_info_mfn = dominfo.shared_info_frame;

    /* sanity checks */
    if ( !xc_dom_compat_check(dom) )
        return -1;

    /* initial mm setup */
    if ( (rc = xc_dom_update_guest_p2m(dom)) != 0 )
        return rc;
    if ( dom->arch_hooks->setup_pgtables )
        if ( (rc = dom->arch_hooks->setup_pgtables(dom)) != 0 )
            return rc;

    /* start info page */
    if ( dom->arch_hooks->start_info )
        dom->arch_hooks->start_info(dom);

    /* hypercall page */
    if ( (rc = setup_hypercall_page(dom)) != 0 )
        return rc;
    xc_dom_log_memory_footprint(dom);

    /* misc x86 stuff */
    if ( (rc = dom->arch_hooks->bootlate(dom)) != 0 )
        return rc;

    /* let the vm run */
    if ( (rc = dom->arch_hooks->vcpu(dom)) != 0 )
        return rc;
    xc_dom_unmap_all(dom);

    return rc;
}

 

/*
 * Build the guest image for a domain: initialise the domctl interface,
 * parse the kernel image, describe and size guest memory, build the image,
 * boot it, and set up grant tables.  Every failing step is logged and
 * collapsed to ERROR_FAIL at 'out'.
 *
 * FIX(review): the original excerpt contained a stray '}' between the
 * xc_dom_boot_image() and xc_dom_gnttab_init() steps, leaving the function
 * with unbalanced braces; it has been removed.
 */
static int libxl__build_dom(libxl__gc *gc, uint32_t domid,
                            libxl_domain_config *d_config,
                            libxl__domain_build_state *state,
                            struct xc_dom_image *dom)
{
    libxl_domain_build_info *const info = &d_config->b_info;
    uint64_t mem_kb;
    int ret;

    if ( (ret = xc_dom_boot_xen_init(dom, CTX->xch, domid)) != 0 ) {
        LOGE(ERROR, "xc_dom_boot_xen_init failed");
        goto out;
    }
#ifdef GUEST_RAM_BASE
    if ( (ret = xc_dom_rambase_init(dom, GUEST_RAM_BASE)) != 0 ) {
        LOGE(ERROR, "xc_dom_rambase failed");
        goto out;
    }
#endif
    if ( (ret = xc_dom_parse_image(dom)) != 0 ) {
        LOG(ERROR, "xc_dom_parse_image failed");
        goto out;
    }
    if ( (ret = libxl__arch_domain_init_hw_description(gc, info, state, dom)) != 0 ) {
        LOGE(ERROR, "libxl__arch_domain_init_hw_description failed");
        goto out;
    }

    /* HVM containers carve video memory out of the maximum allocation. */
    mem_kb = dom->container_type == XC_DOM_HVM_CONTAINER ?
        (info->max_memkb - info->video_memkb) : info->target_memkb;
    if ( (ret = xc_dom_mem_init(dom, mem_kb / 1024)) != 0 ) {
        LOGE(ERROR, "xc_dom_mem_init failed");
        goto out;
    }
    if ( (ret = xc_dom_boot_mem_init(dom)) != 0 ) {
        LOGE(ERROR, "xc_dom_boot_mem_init failed");
        goto out;
    }
    if ( (ret = libxl__arch_domain_finalise_hw_description(gc, domid, d_config, dom)) != 0 ) {
        LOGE(ERROR, "libxl__arch_domain_finalise_hw_description failed");
        goto out;
    }
    if ( (ret = xc_dom_build_image(dom)) != 0 ) {
        LOGE(ERROR, "xc_dom_build_image failed");
        goto out;
    }
    if ( (ret = xc_dom_boot_image(dom)) != 0 ) {
        LOGE(ERROR, "xc_dom_boot_image failed");
        goto out;
    }
    if ( (ret = xc_dom_gnttab_init(dom)) != 0 ) {
        LOGE(ERROR, "xc_dom_gnttab_init failed");
        goto out;
    }
    if ((ret = libxl__arch_build_dom_finish(gc, info, dom, state)) != 0) {
        LOGE(ERROR, "libxl__arch_build_dom_finish failed");
        goto out;
    }

out:
    return ret != 0 ? ERROR_FAIL : 0;
}

 

/* Excerpt: PV domain build entry point — builds the domain, then records
 * the console/xenstore frame numbers into the build state (translated
 * guests store PFNs directly; untranslated ones go through the p2m).
 * NOTE(review): 'ret' and 'dom' are declared in elided code. */
int libxl__build_pv(libxl__gc *gc, uint32_t domid,

libxl_domain_config *d_config, libxl__domain_build_state *state)

{

ret = libxl__build_dom(gc, domid, d_config, state, dom);

if (ret != 0)

goto out;

 

if (xc_dom_translated(dom)) {

state->console_mfn = dom->console_pfn;

state->store_mfn = dom->xenstore_pfn;

state->vuart_gfn = dom->vuart_gfn;

} else {

state->console_mfn = xc_dom_p2m(dom, dom->console_pfn);

state->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);

}

 

ret = 0;

out:

xc_dom_release(dom);

return ret == 0 ? 0 : ERROR_FAIL;

}

 

/* Excerpt: libxl__domain_build dispatches to libxl__build_pv for PV guests
 * (parameters and surrounding code elided). */
int libxl__domain_build(){

 

ret = libxl__build_pv(gc, domid, d_config, state);

}

 

/* Excerpt: once the bootloader has run (or was skipped), a fresh domain
 * (not a restore / soft reset) proceeds to libxl__domain_build(). */
static void domcreate_bootloader_done(libxl__egc *egc,

libxl__bootloader_state *bl,

int rc)

{

if (restore_fd < 0 && dcs->domid_soft_reset == INVALID_DOMID) {

rc = libxl__domain_build(gc, d_config, domid, state);

domcreate_rebuild_done(egc, dcs, rc);

return;

}

 

}

 

/* Excerpt: kicks off domain creation — restores skip the bootloader and
 * jump straight to domcreate_bootloader_done(); fresh creates configure
 * the bootloader state and run it asynchronously.
 * NOTE(review): 'restore_fd', 'bootdisk', 'd_config', 'ret' and the code
 * reaching 'error_out' are in elided parts of the original function. */
static void initiate_domain_create(libxl__egc *egc,

libxl__domain_create_state *dcs)

{

if (restore_fd >= 0 || dcs->domid_soft_reset != INVALID_DOMID) {

LOGD(DEBUG, domid, "restoring, not running bootloader");

domcreate_bootloader_done(egc, &dcs->bl, 0);

} else {

LOGD(DEBUG, domid, "running bootloader");

dcs->bl.callback = domcreate_bootloader_done;

dcs->bl.console_available = domcreate_bootloader_console_available;

dcs->bl.info = &d_config->b_info;

dcs->bl.disk = bootdisk;

dcs->bl.domid = dcs->guest_domid;

 

dcs->bl.kernel = &dcs->build_state.pv_kernel;

dcs->bl.ramdisk = &dcs->build_state.pv_ramdisk;

 

libxl__bootloader_run(egc, &dcs->bl);

}

return;

 

error_out:

assert(ret);

domcreate_complete(egc, dcs, ret);

}

 

/* Excerpt: asynchronous domain-create entry — sets up the create state
 * (elided) and starts the state machine via initiate_domain_create(). */
static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,

uint32_t *domid, int restore_fd, int send_back_fd,

const libxl_domain_restore_params *params,

const libxl_asyncop_how *ao_how,

const libxl_asyncprogress_how *aop_console_how)

{

initiate_domain_create(egc, &cdcs->dcs);

 

return AO_INPROGRESS;

 

out_err:

return AO_CREATE_FAIL(rc);

 

}

 

/*
 * Public libxl entry point for creating a brand-new domain (not a restore:
 * restore_fd and send_back_fd are passed as -1, params as NULL).
 */
int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config,
                            uint32_t *domid,
                            const libxl_asyncop_how *ao_how,
                            const libxl_asyncprogress_how *aop_console_how)
{
    unset_disk_colo_restore(d_config);

    return do_domain_create(ctx, d_config, domid, -1, -1, NULL,
                            ao_how, aop_console_how);
}

 

/* Excerpt: xl's create_domain calls into libxl_domain_create_new
 * (setup and teardown code elided). */
int create_domain(struct domain_create *dom_info){

ret = libxl_domain_create_new(ctx, &d_config, &domid,0, autoconnect_console_how);

 

}

 

/* Excerpt: handler for the 'xl create' command (argument parsing elided). */
int main_create(int argc, char **argv){

rc = create_domain(&dom_info);

}

 

/* Excerpt: xl command table entry wiring "create" to main_create
 * (other entries elided; trailing ';' missing in the excerpt). */
struct cmd_spec cmd_table[] = {

{ "create",

&main_create, 1, 1,

"Create a domain from config file <filename>",

"<ConfigFile> [options] [vars]",

"-h Print this help.\n"

"-p Leave the domain paused after it is created.\n"

"-c Connect to the console after the domain is created.\n"

"-f FILE, --defconfig=FILE\n Use the given configuration file.\n"

"-q, --quiet Quiet.\n"

"-n, --dryrun Dry run - prints the resulting configuration\n"

" (deprecated in favour of global -N option).\n"

"-d Enable debug messages.\n"

"-F Run in foreground until death of the domain.\n"

"-e Do not wait in the background for the death of the domain.\n"

"-V, --vncviewer Connect to the VNC display after the domain is created.\n"

"-A, --vncviewer-autopass\n"

" Pass VNC password to viewer via stdin."

},

}

 

 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
  * and IDT. We reload them nevertheless, this function acts as a
  * 'CPU state barrier', nothing should get across.
  */
 void cpu_init(void)
 {     
 
     int cpu = smp_processor_id();
 
     if (cpumask_test_and_set_cpu(cpu, &cpu_initialized)) {
         printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
         /* Double initialisation: park this CPU forever with IRQs on. */
         for (;;) local_irq_enable();
     }   
     if (opt_cpu_info)
         printk("Initializing CPU#%d\n", cpu);
 
     if (cpu_has_pat)
         wrmsrl(MSR_IA32_CR_PAT, host_pat);
 
     /* Install correct page table. */
     write_ptbase(current);
 
     /* Ensure FPU gets initialised for each domain. */
     stts();
 
     /* Clear all 6 debug registers: */
 #define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
     CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD

}

 

对于gdt、ldt,32位的check_descriptor是通过 fixup_guest_code_selector将其DPL设为1(位于./xen/arch/x86/x86_32/mm.c),而对于64位,则是将其设为0(位于./xen/arch/x86/x86_64/mm.c)。其中,fixup_guest_code_selector被arch_set_info_guest函数调用,arch_set_info_guest函数注释如下:

/*
  * Initialise VCPU state. The context can be supplied by either the
  * toolstack (XEN_DOMCTL_setvcpucontext) or the guest
  * (VCPUOP_initialise) and therefore must be properly validated.
  */

这就是说,XEN_DOMCTL_setvcpucontext被libxc调用;VCPUOP_initialise作为超级调用Hypercall_vcpu_op被guest os调用,其实现在./xen/common/domain.c中do_vcpu_op函数实现。

libxc具体分析如下:

/**
  * This function will create a domain for a paravirtualized Linux
  * using file names pointing to kernel and ramdisk
  *
  * @parm xch a handle to an open hypervisor interface
  * @parm domid the id of the domain
  * @parm mem_mb memory size in megabytes
  * @parm image_name name of the kernel image file
  * @parm ramdisk_name name of the ramdisk image file
  * @parm cmdline command line string
  * @parm flags domain creation flags
  * @parm store_evtchn the store event channel for this domain to use
  * @parm store_mfn returned with the mfn of the store page
  * @parm console_evtchn the console event channel for this domain to use
  * @parm console_mfn returned with the mfn of the console page
  * @return 0 on success, -1 on failure
  */
 int xc_linux_build(xc_interface *xch,
                    uint32_t domid,
                    unsigned int mem_mb,
                    const char *image_name,
                    const char *ramdisk_name,
                    const char *cmdline,
                    const char *features,
                    unsigned long flags,
                    unsigned int store_evtchn,
                    unsigned long *store_mfn,
                    unsigned int console_evtchn,
                    unsigned long *console_mfn);

/*
 * Build and boot a PV Linux domain from kernel/initrd files: allocate the
 * dom image, load the files, initialise memory, build and boot the image,
 * then report the console and xenstore machine frame numbers.
 */
int xc_linux_build(xc_interface *xch, uint32_t domid,
                    unsigned int mem_mb,           
                    const char *image_name,        
                    const char *initrd_name,       
                    const char *cmdline,           
                    const char *features,          
                    unsigned long flags,           
                    unsigned int store_evtchn,     
                    unsigned long *store_mfn,      
                    unsigned int console_evtchn,   
                    unsigned long *console_mfn)    
 {
     struct xc_dom_image *dom;
     int rc;
 
     xc_dom_loginit(xch);
     dom = xc_dom_allocate(xch, cmdline, features);
     if (dom == NULL)
         return -1;
     if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 )
         goto out;
     /* The initrd is optional: skip the module load if no name was given. */
     if ( initrd_name && strlen(initrd_name) &&
          ((rc = xc_dom_module_file(dom, initrd_name, NULL)) != 0) )
         goto out;
 
     dom->flags |= flags;
     dom->console_evtchn = console_evtchn;
     dom->xenstore_evtchn = store_evtchn;
 
     if ( (rc = xc_dom_boot_xen_init(dom, xch, domid)) != 0 )
         goto out;
     if ( (rc = xc_dom_parse_image(dom)) != 0 )
         goto out;
     if ( (rc = xc_dom_mem_init(dom, mem_mb)) != 0 )
         goto out;
     if ( (rc = xc_dom_boot_mem_init(dom)) != 0 )
         goto out;
     if ( (rc = xc_dom_build_image(dom)) != 0 )
         goto out;
     if ( (rc = xc_dom_boot_image(dom)) != 0 ) 
         goto out;
     if ( (rc = xc_dom_gnttab_init(dom)) != 0) 
         goto out;

    /* Translate guest PFNs to machine frames for the caller. */
    *console_mfn = xc_dom_p2m(dom, dom->console_pfn);
     *store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
 
  out:
     xc_dom_release(dom);
     return rc;
 }
 

/*
 * Final boot step for a built domain image (formatted copy of the listing
 * earlier in this article): early arch hooks, domain-info sanity checks,
 * p2m/page-table setup, start-info and hypercall pages, late arch hooks,
 * and finally the vcpu hook — which pushes the context to Xen so the VM
 * can run.
 */
int xc_dom_boot_image(struct xc_dom_image *dom)
 {
     xc_dominfo_t info;
     int rc;
 
     DOMPRINTF_CALLED(dom->xch);
 
     /* misc stuff*/
     if ( (rc = dom->arch_hooks->bootearly(dom)) != 0 )
         return rc;
 
     /* collect some info */
     rc = xc_domain_getinfo(dom->xch, dom->guest_domid, 1, &info);
     if ( rc < 0 )
     {
         xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                      "%s: getdomaininfo failed (rc=%d)", __FUNCTION__, rc);
         return rc;
     }
     if ( rc == 0 || info.domid != dom->guest_domid )
     {
         xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                      "%s: Huh? No domains found (nr_domains=%d) "
                      "or domid mismatch (%d != %d)", __FUNCTION__,
                      rc, info.domid, dom->guest_domid);
         return -1;
     }
     dom->shared_info_mfn = info.shared_info_frame;
 
     /* sanity checks */
     if ( !xc_dom_compat_check(dom) )
         return -1;
    
     /* initial mm setup */   
     if ( (rc = xc_dom_update_guest_p2m(dom)) != 0 )
         return rc;
     if ( dom->arch_hooks->setup_pgtables )
         if ( (rc = dom->arch_hooks->setup_pgtables(dom)) != 0 )
             return rc;
    
     /* start info page */
     if ( dom->arch_hooks->start_info )
         dom->arch_hooks->start_info(dom);
     /* hypercall page */
     if ( (rc = setup_hypercall_page(dom)) != 0 )
         return rc;
     xc_dom_log_memory_footprint(dom);
 
     /* misc x86 stuff */
     if ( (rc = dom->arch_hooks->bootlate(dom)) != 0 )
         return rc;
 
     /* let the vm run */
     if ( (rc = dom->arch_hooks->vcpu(dom)) != 0 )
         return rc;
     xc_dom_unmap_all(dom);
 
     return rc;
 }
 

/* Excerpt: the domain-builder image carries a pointer to its per-guest-type
 * arch hook table (other fields elided; trailing ';' missing). */
struct xc_dom_image

{

      struct xc_dom_arch *arch_hooks;

}

 

/*
 * Arch hook table for 32-bit PAE PV guests; .vcpu = vcpu_x86_32 is the
 * hook that pushes the built vcpu context to Xen.
 */
static struct xc_dom_arch xc_dom_32_pae = {
     .guest_type = "xen-3.0-x86_32p",
     .native_protocol = XEN_IO_PROTO_ABI_X86_32,
     .page_shift = PAGE_SHIFT_X86,  
     .sizeof_pfn = 4,
     .p2m_base_supported = 0, 
     .arch_private_size = sizeof(struct xc_dom_image_x86),
     .alloc_magic_pages = alloc_magic_pages_pv,
     .alloc_pgtables = alloc_pgtables_x86_32_pae,
     .alloc_p2m_list = alloc_p2m_list_x86_32,
     .setup_pgtables = setup_pgtables_x86_32_pae,
     .start_info = start_info_x86_32,
     .shared_info = shared_info_x86_32,
     .vcpu = vcpu_x86_32,
     .meminit = meminit_pv,
     .bootearly = bootearly,
     .bootlate = bootlate_pv,
 }; 
 

/* Excerpt (repeated from above): the .vcpu hook pushes the initial context
 * via xc_vcpu_setcontext(); '*' marks elided code. */
static int vcpu_x86_32(struct xc_dom_image *dom)

{

*

rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);

*

}

int xc_vcpu_setcontext(xc_interface *xch, 
    xc_domain_unbind_pt_irq   | 1396                        uint32_t domid,
    xc_domain_unbind_pt_irq_in| 1397                        uint32_t vcpu,
    xc_domain_unbind_pt_spi_ir| 1398                        vcpu_guest_context_any_t *ctxt)

{

*

     DECLARE_DOMCTL;
     DECLARE_HYPERCALL_BOUNCE(ctxt, sizeof(vcpu_guest_context_any_t), XC_HYPERCALL_BUFFER_BOUNCE_IN);
     int rc;
    
     if ( xc_hypercall_bounce_pre(xch, ctxt) )
         return -1;
    
     domctl.cmd = XEN_DOMCTL_setvcpucontext;
     domctl.domain = domid;
     domctl.u.vcpucontext.vcpu = vcpu;
     set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
    
     rc = do_domctl(xch, &domctl);  
    
     xc_hypercall_bounce_post(xch, ctxt);
    
     return rc;
 

*

}

/* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. 
  *ADDED by me: dom->arch_hooks->vcpu calls XEN_DOMCTL_setvcpucontext when boot a guest os. And the source code is in xc_dom_boot.c
  * VCPUOP_initialise is called by guest os itself?
  * NOTE(review): declaration repeated from the fuller excerpt above;
  * the function body is elided here. */
 int arch_set_info_guest(
     struct vcpu *v, vcpu_guest_context_u c)
 {  
 

/*
 * SEGMENT DESCRIPTOR TABLES
 */
/*
 * A number of GDT entries are reserved by Xen. These are not situated at the
 * start of the GDT because some stupid OSes export hard-coded selector values
 * in their ABI. These hard-coded values are always near the start of the GDT,
 * so Xen places itself out of the way, at the far end of the GDT.
 */
/* 14 pages * 4096 bytes/page / 8 bytes per descriptor = entry 7168. */
#define FIRST_RESERVED_GDT_PAGE  14
#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)

xen也允许guest os去改变指定的segment  descriptor,这是通过hypercall update_descriptor(uint64_t ma, uint64_t desc)实现的。

LDT:guest os也可以自主更新LDT,这是通过mmu_update实现的,该调用将LDT的基地址和表项数作为参数。LDT也可以更新特定项,方法同GDT更新特定项。

对于gdt、ldt,32位的check_descriptor是通过 fixup_guest_code_selector将其DPL设为1(位于./xen/arch/x86/x86_32/mm.c),而对于64位,则是将其设为0(位于./xen/arch/x86/x86_64/mm.c)

 

 

 

系统调用(软中断)、异常是由内而外;硬件中断一般是由外而内:

 

 

Xen does not allow the guest kernel to set up the IDT for the processor, but
allows the guest kernel to pass on an IDT it desires to Xen by means of a
hypercall. Xen makes its own IDT on behalf of the guest kernel that the
processor accesses. However, when it does this the stack that the hardware
jumps to is at privilege level 0 and is a stack accessible only to Xen.

Xen emulates the hardware behavior to the OS by creating a bounce
frame on the Linux Kernel stack just as the x86 hardware would do. How-
ever, Xen does not turn the control directly to the Interrupt Service Routine
as set by the Linux kernel. Instead, the Xen hypervisor jumps to the “hy-
pervisor callback” routine defined in the XenoLinux kernel after creating a
bounce frame on the kernel stack of the domain.

 

Segments are defined in two tables, the GDT and LDT. Xen guests can update
the LDT in the same way they update the page tables, but they can only modify
the GDT via an explicit hypercall.--摘《The Definitive Guide to the Xen Hypervisor》)

 

 

参考资料

http://www.sprg.uniroma2.it/kernelhacking2008/lectures/lkhc08-06.pdf

xen Interface manual Xen v3.0 for x86 

非常有用的材料

Xen/IA64 interrupt virtualization

IDT

In vanilla Linux the IDT is initialized in trap_init() using set__gate() functions. Because Xen handles the IDT, it requires all calls to these function to be replaced with a single call to the HYPERVISOR_set_trap_table() hypercall.

HYPERVISOR_set_trap_table() accepts as a parameter the virtual IDT of the guest, represented by the trap_table structure (of type struct trap_info) in traps-xen.c.

struct trap_info resembles a trap or interrupt gate, having fields for vector, handler segment selector and offset.

Xen maintains two IDT's, one global IDT (its own) and other per domain IDT. Xen uses global IDT to register the entire trap handler except for system call handler (int 0x80).

 

Virtual IDT

•  A virtual IDT is provided by guest OS for setting up interrupt vector table.

•  The exception stack frame presented to a virtual trap handler is identical to its native equivalent.

 

Xen guest 中的trap table 见linux-2.6-xen-sparse /arch/i386/kernel/traps-xen.c

/*
 * Virtual IDT of the XenoLinux guest, handed to Xen via
 * HYPERVISOR_set_trap_table() in trap_init() below.
 * Each trap_info_t entry is { vector, flags, cs, address }:
 *   vector  - exception/interrupt number
 *   flags   - low 2 bits: lowest privilege level allowed to raise the
 *             gate (3 for int3, overflow and the system-call vector so
 *             user space may invoke them);
 *             the |4 bit presumably asks Xen to keep event delivery
 *             disabled on entry -- TODO confirm against
 *             xen/include/public/xen.h (TI_GET_IF)
 *   cs      - handler code segment selector
 *   address - handler entry point
 * The table is terminated by an all-zero entry.
 */
static trap_info_t trap_table[] = {

       {  0, 0, __KERNEL_CS, (unsigned long)divide_error        },

       {  1, 0|4, __KERNEL_CS, (unsigned long)debug                     },

       {  3, 3|4, __KERNEL_CS, (unsigned long)int3                 },

       {  4, 3, __KERNEL_CS, (unsigned long)overflow                   },

       {  5, 0, __KERNEL_CS, (unsigned long)bounds               },

       {  6, 0, __KERNEL_CS, (unsigned long)invalid_op                  },

       {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },

       {  9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },

       { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS         },

       { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present   },

       { 12, 0, __KERNEL_CS, (unsigned long)stack_segment            },

       { 13, 0, __KERNEL_CS, (unsigned long)general_protection              },

       { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault         },

       { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment            },

       { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error             },

       { 17, 0, __KERNEL_CS, (unsigned long)alignment_check         },

#ifdef CONFIG_X86_MCE

       { 18, 0, __KERNEL_CS, (unsigned long)machine_check           },

#endif

       { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },

       /* int 0x80: DPL 3 so user space can make system calls directly. */
       { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call },

       {  0, 0,     0, 0                                      }

};

 

/*
 * trap_init: paravirtualized replacement for the native IDT setup.
 * Vanilla Linux populates the IDT here with set_*_gate() calls; under
 * Xen the guest may not touch the real IDT, so the whole virtual IDT
 * (trap_table above) is handed to the hypervisor in a single hypercall
 * and Xen installs the gates on the guest's behalf.
 */
void trap_init(void)   /* (void): empty parens would mean "unspecified args" in C */
{
    HYPERVISOR_set_trap_table(trap_table);
}

hypercall为什么要使用中断门

hypercall 使用中断门见hypercall篇“Xen中的实现”小节。

trap/interrupt gate for hypercall

a curious question about IDT descriptor type for hypercall. What's the reason to use interrupt-gate type (14) for hypercall (0x82) on 32bit Xen?

 

回答:

Everything's an interrupt gate on 32-bit Xen, so that we can safely (atomically) save away guest segment register state. NMI is the only real pain, and I suppose MCE too.

Interrupt handlers save and restore segment registers. We could fault on a reload of a segment register and lose the original segment register value.

 

trap的处理流程

入口和出口见xen/arch/x86/x86-32/entry.S中handle_exception。

 

可分为以下几种情况

1 guest application的系统调用,直接切换到ring 1 的guest kernel执行,见后面的小节。

2 其余情况由xen的异常处理程序处理,发生异常 ==> 陷入VMM. When an exception occurs the processor transfers control to the Xen hypervisor, using the Xen exception handlers in entry.S.

2.1 如下面的异常 in xen/arch/x86/traps.c,都将调用do_trap

DO_ERROR_NOCODE(TRAP_divide_error,    divide_error)

DO_ERROR_NOCODE(TRAP_overflow,        overflow)

DO_ERROR_NOCODE(TRAP_bounds,          bounds)

DO_ERROR_NOCODE(TRAP_copro_seg,       coprocessor_segment_overrun)

DO_ERROR(       TRAP_invalid_tss,     invalid_TSS)

DO_ERROR(       TRAP_no_segment,      segment_not_present)

DO_ERROR(       TRAP_stack_error,     stack_segment)

DO_ERROR_NOCODE(TRAP_copro_error,     coprocessor_error)

DO_ERROR(       TRAP_alignment_check, alignment_check)

DO_ERROR_NOCODE(TRAP_simd_error,      simd_coprocessor_error)

do_trap()==> 判断trap是否来自Guest OS ==> 如果是,调用do_guest_trap()。否则xen panic。
              Guest OS App --> VMM --> Guest OS Kernel

2.2  GPE和Invalid op有自己的处理函数do_general_protection 和do_invalid_op。特别值得一提的是do_general_protection,有时候guest kernel执行sensitive instruction会导致GPE,所以调用emulate_privileged_op模拟执行,其他的处理类似do_trap

 

一个示例见http://wiki.xensource.com/xenwiki/XenMemoryManagement

do_guest_trap的处理

Gets from the guest context the gate for the exception

Creates the exception frame required by the guest OS to process the exception

Then iret is executed to return control to the guest OS exception handler

 

另外提一下 The Definitive Guide to the Xen Hypervisor 7.2 p120的说法不确切。

“The code path for delivering a trap is significantly simpler than that for events.When the guest is run on a particular (physical) CPU, the hypervisor installs an Interrupt Descriptor Table (IDT) on behalf of the guest domain. This means that the interrupt handling path does not involve the hypervisor at all, for all interrupts are handled by the guest.”

guest的System Call

trap table有如下项:

{ SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call },

前面已经提到int 80h被特殊对待,If everything is 32-bit, "int 80" will be used, but it'll be directed directly to the guest kernel in ring 1 (i.e. the hypervisor isn't involved).

具体的实现见:

in xen/arch/x86/traps.c:do_set_trap_table():

 if ( cur.vector == 0x80 )

     init_int80_direct_trap(curr);

init_int80_direct_trap 将设置int80_desc,然后进程切换时paravirt_ctxt_switch_to =>set_int80_direct_trap

 

When a VM gets scheduled, its system call handler (from per domain IDT table) is registered with the processor(VCPU内). Hence when a domain/VM executes a system call, its own handler is executed.

==》这样X86_32就可以不陷入VMM了。而且可以做到每个Guest OS的system call不同。

 

Implementation differs for x86_64: Xen registers its own system call handler with the processor and from that handler routes the request to VM/Domain specific handler.

==》因为x86_64的Kernel也是在Ring-3上(和以前的Ring-0不同),以前的system call不能用了,只能改写。

 

http://hal.archives-ouvertes.fr/docs/00/43/10/31/PDF/Technical_Report_Syscall_Interception.pdf

System Calls in x86_32

 

 

 

 

xen: add more Xen dom0 support

Xen和guest都有各自的init_IRQ函数、irq_desc全局数组、do_IRQ处理函数,以及中断返回处理。简单来说,xen的中断处理借鉴了Linux的实现。

全景

来自Xen Intro- version 1.0的材料非常精当

Registration (or binding) of irqs in guest domains:

 

第一部分:guest的初始化,guest的irq实际和evtchn绑定:

The guest OS calls init_IRQ() when it boots (start_kernel() method calls init_IRQ() ; file init/main.c). (init_IRQ() is in file sparse/arch/xen/kernel/evtchn.c) There can be 256 physical irqs; so there is an array called irq_desc with 256 entries. (file sparse/include/linux/irq.h)

All elements in this array are initialized in init_IRQ() so that their status is disabled (IRQ_DISABLED).

Now, when a physical driver starts it usually calls request_irq(). This method eventually calls setup_irq() (both in sparse/kernel/irq/manage.c). which calls startup_pirq(). startup_pirq() send a hypercall to the hypervisor (HYPERVISOR_event_channel_op) in order to bind the physical irq (pirq).The hypercall is of type EVTCHNOP_bind_pirq. See: startup_pirq() (file sparse/arch/xen/kernel/evtchn.c)

 

1:在xen 3.1中已经不包含这个文件sparse/kernel/irq/manage.c,该文件在Linux内核中

2physical driver 对应static struct hw_interrupt_type pirq_type 。

/*
 * Interrupt-controller ops for physical IRQs in the Xen-aware guest
 * (abridged excerpt -- the real initializer also fills .shutdown,
 * .enable, .disable, etc.). setup_irq() invokes .startup, which here
 * binds the physical IRQ to an event channel via a hypercall.
 */
static struct hw_interrupt_type pirq_type = {

       .typename = "Phys-irq",

       .startup  = startup_pirq,

       };

setup_irq中有这样调用desc->handler->startup(irq)

 

第二部分:Xen

On the Hypervisor side, handling this hypervisor call is done in: evtchn_bind_pirq() method (file /common/event_channel.c) which calls pirq_guest_bind() (file arch/x86/irq.c). The pirq_guest_bind() changes the status of the corresponding irq_desc array element to be enabled (~IRQ_DISABLED[3]). it also calls startup() method. Now when an interrupts arrives from the controller (the APIC), we arrive at do_IRQ() method as is also in usual linux kernel

(also in arch/x86/irq.c). The Hypervisor handles only timer and serial interrupts. Other interrupts are passed to the domains by calling _do_IRQ_guest() (In fact, the IRQ_GUEST flag is set for all interrupts except for timer and serial interrupts). _do_IRQ_guest() sends the interrupt by calling send_guest_pirq() to all guests who are registered on this IRQ. The send_guest_pirq() creates an event channel (an instance of evtchn注[4]) and sets the pending flag of this event channel (by calling evtchn_set_pending()). Then, asynchronously, Xen will notify this domain regarding this interrupt (unless it is masked).

[3]: 注意此处的irq_desc是xen的irq_desc,而第一部分提到设置为IRQ_DISABLED的是guest的irq_desc。

[4]:这个说法不确切。“The send_guest_pirq() creates an event channel”——event channel其实在evtchn_bind_pirq时已经分配好,send_guest_pirq只是根据pirq找到该evtchn而已。

Xen中断的处理

初始化init_IRQ函数在xen/arch/x86/i8259.c文件中

 

When an interrupt occurs control passes to the Xen common_interrupt routine(见文件asm/asm_defns.h中的宏BUILD_COMMON_IRQ), that calls the Xen do_IRQ function.(该函数在xen/arch/x86/irq.c文件中)

do_IRQ:

Checks who has the responsibility to handle the interrupt:

The VMM: the interrupt is handled internally by the VMM

One or more guest OSes: it calls the __do_IRQ_guest function

__do_IRQ_guest:

For each domain that has a binding to the IRQ sets to 1 the pending flag of the event channel via send_guest_pirq

 

xen仅仅需要处理2个物理中断,即串口中断(ns16550)和计时器中断,分见于函数ns16550_init_postirq和early_time_init。

guest中断的处理

In Xen interrupts to be notified to the Linux guest OS are handled through the event channels notification mechanism.

During startup the guest OS installs two handlers (event and failsafe) via the HYPERVISOR_set_callbacks hypercall:

The event callback is the handler to be called to notify an event to the guest OS

The failsafe callback is used when a fault occurs when using the event callback

 

linux-2.6-xen-sparse/arch/i386/mach-xen/setup.c中有代码如下

void __init machine_specific_arch_setup(void)

{

       static struct callback_register __initdata event = {

              .type = CALLBACKTYPE_event,

              .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },

       };

       static struct callback_register __initdata failsafe = {

              .type = CALLBACKTYPE_failsafe,

              .address = { __KERNEL_CS, (unsigned long)failsafe_callback },

       };

       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);

       if (ret == 0)

              ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);

hypervisor_callback 在linux-2.6-xen-sparse/arch/i386/kernel/entry-xen.S文件中,其实现和作用见“xen的ret_from_intr”小节的分析。

 

可以看到:The event callback handler is hypervisor_callback function (is the installed at startup), that calls evtchn_do_upcall. 具体的分析见evtchn分析篇。

evtchn_do_upcall:

Checks for pending events

2 Resets to zero the pending flag

3 Uses the evtchn_to_irq array to identify the IRQ binding for the event channel

4 Calls Linux do_IRQ interrupt handler function

Andrndr

 

 

Dom 0或driver domain的物理中断

http://blog.csdn.net/snailhit/article/details/6413399

 “A guest in Domain 0, or in a driver domain, will want to set up physical IRQ to event channel mappings for the various devices under its control. Before doing this, of course, it will want to discover which devices are already bound to which IRQs. Typically, this is done via BIOS or APIC calls. This is not permitted in Xen, however, so they are forced to use the HYPERVISOR_physdev_op hypercall.”

startup_pirq, enable_pirq等几个操作都调用了HYPERVISOR_physdev_op超级调用.

 

construct_dom0中有如下代码:

/* DOM0 is permitted full I/O capabilities. */

    rc |= irqs_permit_access(dom0, 0, NR_IRQS-1);

问题:

Driver Domain是不是通过XEN_DOMCTL_irq_permission 打开中断?

xen的ret_from_intr

xen/arch/x86/x86-32/entry.S

 

通过CS来判断这个中断是否发生在ring0,如果是就跳到restore_all_xen返回,如果不是就跳到test_all_events,这里就开始进行guest中断的检测和处理。

 

# Common interrupt-return path (xen/arch/x86/x86_32/entry.S excerpt).
# Decides whether the interrupted context was Xen itself (ring 0) or a
# guest (ring 1/3, or virtual-8086 mode) and branches accordingly.
ENTRY(ret_from_intr)

        GET_CURRENT(%ebx)               # %ebx = current vcpu

        movl  UREGS_eflags(%esp),%eax   # saved EFLAGS (carries the VM bit)

        movb  UREGS_cs(%esp),%al        # low byte = saved CS (RPL in bits 0-1)

        testl $(3|X86_EFLAGS_VM),%eax   # nonzero RPL or VM86 => came from guest

        jnz   test_all_events           # guest was interrupted: deliver events

        jmp   restore_all_xen           # Xen was interrupted: plain return

 

test_guest_events先检查upcall_mask,如果没有置位再检查upcall_pending

# Event-delivery check on the way back to a guest (abridged excerpt).
# First honour the per-vcpu upcall_mask (virtual "interrupts disabled"),
# then look at upcall_pending; only if unmasked AND pending do we build
# a bounce frame that redirects the guest to its event callback.
test_all_events:

       …..

test_guest_events:

        movl VCPU_vcpu_info(%ebx),%eax

        testb $0xFF,VCPUINFO_upcall_mask(%eax)      # events masked by guest?

        jnz  restore_all_guest                      # yes: just return to guest

        testb $0xFF,VCPUINFO_upcall_pending(%eax)   # anything pending?

        jz   restore_all_guest                      # no: just return to guest

/*process_guest_events:*/

        sti

        leal VCPU_trap_bounce(%ebx),%edx            # %edx = &vcpu->trap_bounce

        movl VCPU_event_addr(%ebx),%eax             # guest's event callback EIP

        movl %eax,TRAPBOUNCE_eip(%edx)

        movl VCPU_event_sel(%ebx),%eax              # guest's event callback CS

        movw %ax,TRAPBOUNCE_cs(%edx)

        movb $TBF_INTERRUPT,TRAPBOUNCE_flags(%edx)

        call create_bounce_frame                    # fake an interrupt frame

        jmp  test_all_events                        # re-check before returning

# Heavily abridged excerpt: builds the exception frame on the guest
# kernel stack (mimicking what x86 hardware would push) and patches the
# saved CS:EIP so that the eventual iret lands in the guest's handler.
create_bounce_frame:

testl $~3,%eax                          # reject a null/ring-only selector

        jz   domain_crash_synchronous   # malformed guest state: kill domain

        movl %eax,UREGS_cs+4(%esp)      # return CS := guest handler selector

        movl TRAPBOUNCE_eip(%edx),%eax

        movl %eax,UREGS_eip+4(%esp)     # return EIP := guest handler entry

        ret

 

如果有事件的话, 首先通过create_bounce_frame构造帧。create_bounce_frame的参数从哪里来呢?这就要回到前面提到的HYPERVISOR_set_callbacks。

HYPERVISOR_set_callbacks在xen中的实现为:

do_set_callbacks=>register_guest_callback, 该函数纪录了guest中传递过来的callback信息.

/*
 * Xen side of HYPERVISOR_set_callbacks (do_set_callbacks calls here):
 * records the guest-supplied callback entry points in the vcpu's saved
 * guest context, so entry.S can later load them into the trap_bounce
 * structure when delivering events.
 *
 * NOTE(review): excerpt truncated by the article -- only the
 * CALLBACKTYPE_event case is shown; the switch's other cases, its
 * closing brace and the final "return ret;" are elided.
 */
static long register_guest_callback(struct callback_register *reg)

{

    long ret = 0;

    struct vcpu *v = current;

 

    switch ( reg->type )

    {

    case CALLBACKTYPE_event:

        /* Saved CS:EIP of the guest's hypervisor_callback handler. */
        v->arch.guest_context.event_callback_cs     = reg->address.cs;

        v->arch.guest_context.event_callback_eip    = reg->address.eip;

        break;

}

 

xen/arch/x86/x86-32/asm-offset.c中有如下代码

OFFSET(VCPU_event_addr, struct vcpu,

           arch.guest_context.event_callback_eip);

这样的话,可以看到hypervisor_callback被准备为create_bounce_frame的参数。所以当通过restore_all_guest返回guest时,hypervisor_callback被调用。

 

http://166.111.68.94/moin/projects/rtarmor/xen_related/xen_linux_interrupt

 

 

1. What is GDT
在Protected Mode下,一个重要的必不可少的数据结构就是GDT(Global Descriptor Table)。
为什么要有GDT?我们首先考虑一下在Real Mode下的编程模型:
在Real Mode下,我们对一个内存地址的访问是通过Segment:Offset的方式来进行的,其中Segment是一个段的Base Address,一个Segment的最大长度是64 KB,这是16-bit系统所能表示的最大长度。而Offset则是相对于此Segment Base Address的偏移量。Base Address+Offset就是一个内存绝对地址。由此,我们可以看出,一个段具备两个因素:Base Address和Limit(段的最大长度),而对一个内存地址的访问,则是需要指出:使用哪个段?以及相对于这个段Base Address的Offset,这个Offset应该小于此段的Limit。当然对于16-bit系统,Limit不要指定,默认为最大长度64KB,而 16-bit的Offset也永远不可能大于此Limit。我们在实际编程的时候,使用16-bit段寄存器CS(Code Segment),DS(Data Segment),SS(Stack Segment)来指定Segment,CPU将段积存器中的数值向左偏移4-bit,放到20-bit的地址线上就成为20-bit的Base Address。

到了Protected Mode,内存的管理模式分为两种,段模式和页模式,其中页模式也是基于段模式的。也就是说,Protected Mode的内存管理模式事实上是:纯段模式和段页式。进一步说,段模式是必不可少的,而页模式则是可选的——如果使用页模式,则是段页式;否则这是纯段模式。

既然是这样,我们就先不去考虑页模式。对于段模式来讲,访问一个内存地址仍然使用Segment:Offset的方式,这是很自然的。由于 Protected Mode运行在32-bit系统上,那么Segment的两个因素:Base Address和Limit也都是32位的。IA-32允许将一个段的Base Address设为32-bit所能表示的任何值(Limit则可以被设为32-bit所能表示的,以2^12为倍数的任何指),而不象Real Mode下,一个段的Base Address只能是16的倍数(因为其低4-bit是通过左移运算得来的,只能为0,从而达到使用16-bit段寄存器表示20-bit Base Address的目的),而一个段的Limit只能为固定值64 KB。另外,Protected Mode,顾名思义,又为段模式提供了保护机制,也就说一个段的描述符需要规定对自身的访问权限(Access)。所以,在Protected Mode下,对一个段的描述则包括3方面因素:[Base Address, Limit, Access],它们加在一起被放在一个64-bit长的数据结构中,被称为段描述符。这种情况下,如果我们直接通过一个64-bit段描述符来引用一个段的时候,就必须使用一个64-bit长的段积存器装入这个段描述符。但Intel为了保持向后兼容,将段积存器仍然规定为16-bit(尽管每个段积存器事实上有一个64-bit长的不可见部分,但对于程序员来说,段积存器就是16-bit的),那么很明显,我们无法通过16-bit长度的段积存器来直接引用64-bit的段描述符。
怎么办?解决的方法就是把这些长度为64-bit的段描述符放入一个数组中,而将段寄存器中的值作为下标索引来间接引用(事实上,是将段寄存器中的高13 -bit的内容作为索引)。这个全局的数组就是GDT。事实上,在GDT中存放的不仅仅是段描述符,还有其它描述符,它们都是64-bit长,我们随后再讨论。

GDT可以被放在内存的任何位置,那么当程序员通过段寄存器来引用一个段描述符时,CPU必须知道GDT的入口,也就是基地址放在哪里,所以Intel的设计者门提供了一个寄存器GDTR用来存放GDT的入口地址,程序员将GDT设定在内存中某个位置之后,可以通过LGDT指令将GDT的入口地址装入此积存器,从此以后,CPU就根据此积存器中的内容作为GDT的入口来访问GDT了。

GDT是Protected Mode所必须的数据结构,也是唯一的——不应该,也不可能有多个。另外,正象它的名字(Global Descriptor Table)所揭示的,它是全局可见的,对任何一个任务而言都是这样。

除了GDT之外,IA-32还允许程序员构建与GDT类似的数据结构,它们被称作LDT(Local Descriptor Table),但与GDT不同的是,LDT在系统中可以存在多个,并且从LDT的名字可以得知,LDT不是全局可见的,它们只对引用它们的任务可见,每个任务最多可以拥有一个LDT。另外,每一个LDT自身作为一个段存在,它们的段描述符被放在GDT中。

IA-32为LDT的入口地址也提供了一个寄存器LDTR,因为在任何时刻只能有一个任务在运行,所以LDT寄存器全局也只需要有一个。如果一个任务拥有自身的LDT,那么当它需要引用自身的LDT时,它需要通过LLDT将其LDT的段描述符装入此寄存器。LLDT指令与LGDT指令不同的时,LGDT指令的操作数是一个32-bit的内存地址,这个内存地址处存放的是一个32-bit GDT的入口地址,以及16-bit的GDT Limit。而LLDT指令的操作数是一个16-bit的选择子,这个选择子主要内容是:被装入的LDT的段描述符在GDT中的索引值——这一点和刚才所讨论的通过段积存器引用段的模式是一样的。

 

 

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值