4.3 时钟虚拟化


kvm支持的时钟有8254,local apic timer,kvmclock等,本节将分析8254和kvmclock

4.3.1 8254时钟虚拟化

struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)

{

    a. 建立内核线程

      pit->worker_task = kthread_run(kthread_worker_fn,&pit->worker,

                     "kvm-pit/%d", pid_nr);

    b. 准备workqueue

       init_kthread_work(&pit->expired,pit_do_work);

    c. 初始化一个高精度定时器(hrtimer),这个定时器就作为我们虚拟时钟的时钟源

       hrtimer_init(&pit_state->timer,CLOCK_MONOTONIC, HRTIMER_MODE_ABS);

    d. pit_state->irq_ack_notifier.irq_acked= kvm_pit_ack_irq;时钟中断模拟ack

     .kvm_register_irq_mask_notifier(kvm, 0,&pit->mask_notifier);

    e. kvm_pit_reset(pit);

    f.     kvm_iodevice_init(&pit->dev,&pit_dev_ops); //注册io虚拟化操作

    ret = kvm_io_bus_register_dev(kvm,KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,

                    KVM_PIT_MEM_LENGTH, &pit->dev);

}

 

当guestos需要通过寄存器操作启动一个时钟控制器时,pit_load_count==》create_pit_timer, 会启动一个hr timer来模拟时钟中断源

 

    hrtimer_cancel(&ps->timer);

    flush_kthread_work(&ps->pit->expired);

    ps->period = interval;

    ps->is_periodic = is_period;

    ps->timer.function = pit_timer_fn;

    ps->kvm = ps->pit->kvm;

    atomic_set(&ps->pending, 0);

    ps->irq_ack = 1;

    .......

    hrtimer_start(&ps->timer,ktime_add_ns(ktime_get(), interval),

            HRTIMER_MODE_ABS);

 

pit_timer_fn完成时钟中断计数(pending)的累加

staticenum hrtimer_restart pit_timer_fn(struct hrtimer *data)

{

    struct kvm_kpit_state *ps =container_of(data, struct kvm_kpit_state, timer);

    struct kvm_pit *pt =ps->kvm->arch.vpit;

   

   //如果时钟中断需要重新注入,就直接累加;否则那么不进行累加,直接合并时钟中断

    if (ps->reinject ||!atomic_read(&ps->pending)) {

       atomic_inc(&ps->pending);

       queue_kthread_work(&pt->worker,&pt->expired);

    }

 

    if (ps->is_periodic) {

       //  如果定时器周期触发,则再次启动定时器,否则销毁

       hrtimer_add_expires_ns(&ps->timer,ps->period);

       return HRTIMER_RESTART;

    } else

       return HRTIMER_NORESTART;

}

 

当定时器将时钟中断pending增加,并且添加完工作队列以后,接着就触发下面的时钟中断注入,如果上一个中断被接收,接着触发下一个。代码如下:

static void pit_do_work(struct kthread_work *work)

{

    .......

    spin_lock(&ps->inject_lock);

    if (ps->irq_ack){

       ps->irq_ack= 0;

       inject = 1;

    }

    spin_unlock(&ps->inject_lock);

    if (inject) {

        ///*模拟一个高电平和一个低电平,发送给PIC,触发时钟中断。*/

       kvm_set_irq(kvm,kvm->arch.vpit->irq_source_id, 0, 1, false);

       kvm_set_irq(kvm,kvm->arch.vpit->irq_source_id, 0, 0, false);

 

       //nmi watchdog support

       if(kvm->arch.vapics_in_nmi_mode > 0)

           kvm_for_each_vcpu(i,vcpu, kvm)

              kvm_apic_nmi_wd_deliver(vcpu);

    }

}

 

kvm_pit_ack_irq实现中断的ack应答虚拟化

/*
 * Ack notifier for the emulated PIT interrupt.
 *
 * @kian: the notifier embedded in a struct kvm_kpit_state.
 *
 * Runs under ps->inject_lock: consumes one pending tick, re-queues the
 * injection work if more ticks are still pending, and sets irq_ack so the
 * next injection is allowed.
 */
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
    struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
                      irq_ack_notifier);
    int value;

    spin_lock(&ps->inject_lock);
    /* One tick has been delivered: take it off the pending counter. */
    value = atomic_dec_return(&ps->pending);
    if (value < 0)
        /* pending was already 0, so this ack is spurious: undo the decrement. */
        atomic_inc(&ps->pending);
    else if (value > 0)
        /* Ticks are still accumulated: queue the work to inject the next one. */
        queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
    ps->irq_ack = 1; /* mark the interrupt as acked */
    spin_unlock(&ps->inject_lock);
}

 

4.3.2 kvmclock时钟虚拟化

  时间虚拟化的一种实现方式是通过时钟中断计数,进而换算得到时间。这种方式在虚拟机里存在问题:vcpu有时会被调度出物理cpu,使时钟中断不能准时到达guest os。另外一种方式,如模拟HPET,guest os在需要的时候去读当前的时间,这种方式会使虚拟机频繁地VM-exit,影响性能。为此kvm引入了基于半虚拟化的时钟kvmclock,这种方式需要在guest上实现一个kvmclock驱动,建立guest os到VMM的通道,guest os通过这个通道向vmm查询时间。

 

(1) guest os kvmclock 驱动

源码路径: arch\x86\kernel\kvmclock.c 和 arch\x86\kernel\pvclock.c

kvmclock_init ==>

    a.  kvm_register_clock

    src =&hv_clock[cpu].pvti;

    low =(int)slow_virt_to_phys(src) | 1;

    high =((u64)slow_virt_to_phys(src) >> 32);

    ret =native_write_msr_safe(msr_kvm_system_time, low, high);

    通过写msr寄存器的方式将hv_clock[cpu].pvti的gpa通知给vmm.

    b.  改写x86的函数指针

           pv_time_ops.sched_clock = kvm_clock_read;

       x86_platform.calibrate_tsc =kvm_get_tsc_khz;

       x86_platform.get_wallclock =kvm_get_wallclock;

       x86_platform.set_wallclock = kvm_set_wallclock;

例如x86_platform.get_wallclock 默认为mach_get_cmos_time(从cmos取得wallclock).

wallclock指的是墙上时间,即现实世界的绝对时间;在kvmclock中它由系统开机时刻的绝对时间加上开机以来的运行时间得到。

   c. clocksource_register_hz(&kvm_clock,NSEC_PER_SEC); //注册系统时钟源

/* Clocksource descriptor registered by the guest for kvm-clock. */
static struct clocksource kvm_clock = {
    .name   = "kvm-clock",
    .rating = 400, /* a rating of 400 marks an ideal clock source */
    .read   = kvm_clock_get_cycles,
    .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
    .mask   = CLOCKSOURCE_MASK(64),
};

由于kvm-clock将rating 设为400,这样会使clocksource_register_hz==》__clocksource_register_scale==》 clocksource_select==》__clocksource_select将guest os 的curr_clocksource设为kvmclock

 

下面重点分析kvm_clock_read和kvm_get_wallclock

/*
 * Guest-side wallclock read.
 *
 * @now: filled in by pvclock_read_wallclock().
 *
 * Writes the physical address of wall_clock to the wall-clock MSR as a
 * low/high 32-bit pair, then reads the result back using the current CPU's
 * pvti entry.
 */
static void kvm_get_wallclock(struct timespec *now)
{
    struct pvclock_vcpu_time_info *vcpu_time;
    int low, high;
    int cpu;

    /* Split the physical address of wall_clock into two 32-bit halves. */
    low = (int)__pa_symbol(&wall_clock);
    high = ((u64)__pa_symbol(&wall_clock) >> 32);
    native_write_msr(msr_kvm_wall_clock, low, high);

    /* Preemption is disabled while the per-cpu hv_clock entry is in use. */
    preempt_disable();
    cpu = smp_processor_id();
    vcpu_time = &hv_clock[cpu].pvti;
    pvclock_read_wallclock(&wall_clock, vcpu_time, now);
    preempt_enable();
}

a. native_write_msr(msr_kvm_wall_clock, low, high);通知vmm要取wall_clock并将wall_clock的gpa告诉vmm.

 

b. pvclock_read_wallclock 返回vmm设置好的wallclock. wall_clock在返回前相当于是guest 与vmm间的共享内存.

 

pvclock_read_wallclock的访问

void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
                struct pvclock_vcpu_time_info *vcpu_time,
                struct timespec *ts)

{

 

    //等待vmm设置好wall_clock, 用version来标记数据是否更新

    do {

       version =wall_clock->version;

       rmb();     /* fetch version before time */

       now.tv_sec  = wall_clock->sec;

       now.tv_nsec =wall_clock->nsec;

       rmb();     /* fetch time before checking version */

    } while ((wall_clock->version & 1) || (version != wall_clock->version));

    //这时wall_clock记录的是系统开机时的时间

 

    //取得系统运行的时间, vcpu_time作为共享内存,其地址在kvm_register_clock通知了vmm

    delta =pvclock_clocksource_read(vcpu_time); 

 

    //两者相加为wall_clock,

    delta += now.tv_sec *(u64)NSEC_PER_SEC + now.tv_nsec;

 

    now.tv_nsec =do_div(delta, NSEC_PER_SEC);

    now.tv_sec = delta;

 

    set_normalized_timespec(ts,now.tv_sec, now.tv_nsec);

}

 

static cycle_t kvm_clock_read(void) ==> pvclock_clocksource_read

cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info*src)

{

 

    do {

       version =__pvclock_read_cycles(src, &ret, &flags); // __native_read_tsc

    } while((src->version & 1) || version != src->version);

 

    ......

    if ((valid_flags &PVCLOCK_TSC_STABLE_BIT) &&

       (flags & PVCLOCK_TSC_STABLE_BIT))

       return ret;

    last =atomic64_read(&last_value);

    do {

       if (ret < last)

           return last;

       last =atomic64_cmpxchg(&last_value, last, ret);

    } while (unlikely(last!= ret));

 

    return ret;

}

 

(2) VMM kvmclock实现

msr的实现:

kvm_set_msr_common ==>  case MSR_KVM_WALL_CLOCK ==> kvm_write_wall_clock

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)

{

    .......

   //a. 读guest version

    r = kvm_read_guest(kvm,wall_clock, &version, sizeof(version));

    if (r)

       return;

 

    if (version & 1)

       ++version;  /* first time write, random junk */

 

    ++version;

 

    kvm_write_guest(kvm,wall_clock, &version, sizeof(version));//更新version

 

    getboottime(&boot);//得到系统的boot时间

 

    if(kvm->arch.kvmclock_offset) {

       struct timespec ts =ns_to_timespec(kvm->arch.kvmclock_offset);

       boot =timespec_sub(boot, ts);

    }

    wc.sec = boot.tv_sec;

    wc.nsec = boot.tv_nsec;

    wc.version = version;

 

    kvm_write_guest(kvm,wall_clock, &wc, sizeof(wc)); //更新guest wall_clock

 

    version++;

    kvm_write_guest(kvm,wall_clock, &version, sizeof(version)); //更新version,完成通讯

}

 

kvm_read_guest/kvm_write_guest 的工作原理是通过gpa得到对应page 的hva和页内偏移,然后就能读写内存了

/*
 * Copy len bytes of guest memory starting at guest physical address gpa
 * into data.
 *
 * The copy is done in chunks whose size comes from next_segment(); each
 * chunk is read with kvm_read_guest_page() from consecutive guest frames.
 * Only the first chunk starts at a non-zero in-page offset.
 *
 * Returns 0 on success, or the negative value returned by
 * kvm_read_guest_page() on the first failing chunk.
 */
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
    gfn_t frame = gpa >> PAGE_SHIFT;
    int page_off = offset_in_page(gpa);
    int chunk;

    for (chunk = next_segment(len, page_off); chunk != 0;
         chunk = next_segment(len, page_off)) {
        int err = kvm_read_guest_page(kvm, frame, data, page_off, chunk);

        if (err < 0)
            return err;

        /* Later chunks begin at the start of the following frame. */
        page_off = 0;
        len -= chunk;
        data += chunk;
        frame++;
    }

    return 0;
}

/*
 * Read len bytes at the given offset within guest frame gfn into data.
 *
 * The frame is translated with gfn_to_hva_prot() and the bytes are copied
 * with kvm_read_hva() (per the original note, this ends up in
 * __copy_from_user).
 *
 * Returns 0 on success, -EFAULT if the translation yields an error address
 * or the copy fails.
 */
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
            int len)
{
    unsigned long hva = gfn_to_hva_prot(kvm, gfn, NULL);

    if (kvm_is_error_hva(hva))
        return -EFAULT;

    if (kvm_read_hva(data, (void __user *)hva + offset, len))
        return -EFAULT;

    return 0;
}

 

kvm_set_msr_common ==> case MSR_KVM_SYSTEM_TIME

    a. kvmclock_reset //vcpu->arch.pv_time_enabled = false

    b. kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE,vcpu);

    c.        if (kvm_gfn_to_hva_cache_init(vcpu->kvm,

            &vcpu->arch.pv_time, data &~1ULL,

            sizeof(struct pvclock_vcpu_time_info)))

           vcpu->arch.pv_time_enabled= false;

       else

           vcpu->arch.pv_time_enabled= true;

kvm_gfn_to_hva_cache_init会得到guest os 的hv_clock[cpu].pvti

 

 

vcpu_enter_guest==> KVM_REQ_GLOBAL_CLOCK_UPDATE  kvm_gen_kvmclock_update(vcpu);

    set_bit(KVM_REQ_CLOCK_UPDATE,&v->requests);

    schedule_delayed_work(&kvm->arch.kvmclock_update_work,

                  KVMCLOCK_UPDATE_DELAY);

 

由于在kvm_arch_init_vm时:

    INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work,kvmclock_update_fn);

    INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work,kvmclock_sync_fn);

 

所以kvm->arch.kvmclock_update_work==》

static void kvmclock_update_fn(struct work_struct *work)

{

    。。。。。。

   //对每个vcpu设置KVM_REQ_CLOCK_UPDATE

    kvm_for_each_vcpu(i,vcpu, kvm) {

       set_bit(KVM_REQ_CLOCK_UPDATE,&vcpu->requests);

       kvm_vcpu_kick(vcpu);

    }

}

 

vcpu_enter_guest==>KVM_REQ_CLOCK_UPDATE  kvm_guest_time_update(vcpu);

kvm_guest_time_update会将时间更新到vcpu->arch.pv_time所指向的guest共享内存

 

4.3.3 Cpu Steal time

 Cpu Steal time指的是vcpu 等待 real cpu 的时间, 因为vcpu会发生vm-exit而进入vmm;进入vmm 后到重新vm-entry的时间就是一次cpu steal time. 该指标是衡量vm性能的重要指标。 通过半虚拟化技术guest os能得到cpu steal time. VMM与guest通讯机制与上一节类似,本节就不讨论了。

 

(1) Guest os 实现

  1. kvm_guest_init注册函数指针pv_time_ops.steal_clock = kvm_steal_clock; 在非guest(非虚拟化)环境下,

    该函数为native_steal_clock, 直接返回0

 

  2. Guest os 通过kvm_register_steal_time 通知vmm 共享内存地址:

  wrmsrl(MSR_KVM_STEAL_TIME,(slow_virt_to_phys(st) | KVM_MSR_ENABLED));

 

内核kernel\sched\core.c update_rq_clock ==> update_rq_clock_task ==>

                 paravirt_steal_clock(cpu_of(rq))==> pv_time_ops.steal_clock;

 

(2) vmm 实现

kvm_set_msr_common ==》 case MSR_KVM_STEAL_TIME

  a. kvm_gfn_to_hva_cache_init得到guest os gpa -> hva

  b. vcpu->arch.st.last_steal= current->sched_info.run_delay;

  c. accumulate_steal_time(vcpu);

static void accumulate_steal_time(struct kvm_vcpu *vcpu)

{

    .......

    delta =current->sched_info.run_delay - vcpu->arch.st.last_steal;

    vcpu->arch.st.last_steal= current->sched_info.run_delay;

    vcpu->arch.st.accum_steal= delta;

}

第一次调用时delta会为0, 但当以后vcpu_load时kvm_arch_vcpu_load会重新调用accumulate_steal_time

 

  d. kvm_make_request(KVM_REQ_STEAL_UPDATE,vcpu);

 

vcpu_enter_guest ==> record_steal_time(vcpu);

static void record_steal_time(struct kvm_vcpu *vcpu)

{

  

    ............ //kvm_read_guest_cached

    vcpu->arch.st.steal.steal+= vcpu->arch.st.accum_steal;

    vcpu->arch.st.steal.version+= 2;

    vcpu->arch.st.accum_steal= 0;

    ......... //kvm_write_guest_cached

}

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值