上一篇<<linux虚拟化: kvm: 初始化及创建用户过程>>介绍了基于内核的虚拟机(KVM):它是一种内建于Linux的开源虚拟化技术,每个虚拟资源(虚拟机)可以表示为虚拟用户(guest,来宾),kvm通过获取影子(虚拟)物理位,模拟出专用的寄存器,及页回收、用户统计等功能。本篇分析一款主要基于AMD、海光芯片的硬件虚拟化技术:SVM,全名为安全虚拟机(Secure Virtual Machine),即AMD-V;注意它与机器学习中的“支持向量机(support vector machines)”只是缩写相同,二者并无关系。
svm是AMD64处理器(以及与之兼容的海光处理器)提供的硬件辅助虚拟化扩展;Intel EM64T CPU上对应的技术是VT-x(VMX),而不是svm。这种硬件虚拟化技术可以提高虚拟机的性能并降低开销,但您需要在使用任何依赖它的软件之前先启用它(通常在BIOS/UEFI中开启)。
“SVM模式”是允许AMD处理器使用安全虚拟机(SVM)指令集(例如VMRUN)的功能。KVM分别为AMD/海光(SVM)和Intel(VMX)提供各自的后端实现,这对于需要同时兼容Intel、AMD和海光处理器的应用程序很有帮助。
svm通过调用kvm_init,注册svm_init_ops初始操作结构,完成硬件的一系列初始化,并提供包含各功能操作结构的访问方式。
svm_init_ops结构包括:
has_svm 检查amd 或 海光芯片是否支持svm
is_disabled 检查是否禁用功能
svm_hardware_setup svm硬件相关设置
svm_check_processor_compat 检查处理器兼容性
svm_x86_ops svm(x86)操作结构
amd_pmu_ops amd性能监控单元操作
svm硬件相关设置:
获取待分配页的阶(order)大小;如果启用了NX大页面缓解,则影子分页和NPT都需要NX;分配页,获取页面映射的虚拟地址,填充内存(0xFF,每字节);获取kvm MSR(模型特定寄存器)列表中的成员,经过偏移运算后写入msrpm_offsets[MSRPM_OFFSETS]列表;获取/设置标志位,并放入kvm_uret_msrs_list列表;检查暂停过滤(pause filter)支持,包括暂停过滤拦截、暂停过滤器阈值;KVM的MMU本身不支持使用2级分页,因此如果主机使用2级分页,则不支持NPT(因为主机CR4在VMRUN时保持不变);强制VM的NPT级别等于主机的分页级别;设置shadow_me_value和shadow_me_mask(kvm内存管理单元设置掩码和值);sev硬件设置…;svm_hv硬件设置;设置刷新tlb相关函数;svm cpu初始化…;avic硬件设置(包括LBR虚拟化支持);注册供IOMMU驱动调用、用于唤醒vcpu任务的回调;svm设置cpu(寄存器)功能等等。
目录
1. 函数分析
1.1 svm_init
svm特征的kvm初始化
svm 安全虚拟机(Secure Virtual Machine)
/*
 * Module init entry point for the SVM flavour of KVM.
 *
 * Runs the structure size checks and then registers the SVM init-ops table
 * with the generic KVM core. Returns whatever kvm_init() returns.
 */
static int __init svm_init(void)
{
	int ret;

	/*
	 * Size checks for the important structures. According to the original
	 * notes these cover: the virtual control (VMCB) save area, the guest
	 * communication (GHCB) save area, the SEV-ES state save area, the
	 * virtual control (VMCB) control area, and the guest-hypervisor
	 * communication block.
	 */
	__unused_size_checks();

	/* Register SVM with KVM; a vCPU is sized and aligned as struct vcpu_svm. */
	ret = kvm_init(&svm_init_ops,
		       sizeof(struct vcpu_svm),
		       __alignof__(struct vcpu_svm),
		       THIS_MODULE);

	return ret;
}
__unused_size_checks
svm_init_ops
2. 源码结构
svm_init_ops svm初始操作
/*
 * Init-time operations table for SVM; svm_init() passes its address to
 * kvm_init(). The per-field notes below are translated from the original
 * annotations.
 */
static struct kvm_x86_init_ops svm_init_ops __initdata = {
.cpu_has_kvm_support = has_svm, // checks whether the AMD or Hygon CPU supports SVM
.disabled_by_bios = is_disabled, // checks whether the feature has been disabled
// (per the original notes: inspects the setting bits of the VM_CR MSR)
// #define MSR_VM_CR 0xc0010114
.hardware_setup = svm_hardware_setup, // SVM hardware-related setup
.check_processor_compatibility = svm_check_processor_compat, // SVM processor compatibility check
.runtime_ops = &svm_x86_ops, // SVM (x86) runtime operations table
.pmu_ops = &amd_pmu_ops, // AMD performance monitoring unit operations
};
svm_hardware_setup
amd_pmu_ops
amd_pmu_ops amd性能监控单元操作
pmu 性能监控单元(performance monitoring unit)
/*
 * AMD implementation of the KVM PMU (performance monitoring unit) operations.
 * svm_init_ops refers to this table through its .pmu_ops field.
 */
struct kvm_pmu_ops amd_pmu_ops __initdata = {
.hw_event_available = amd_hw_event_available, // whether a hardware event is available
.pmc_is_enabled = amd_pmc_is_enabled, // nominally: check whether a PMC is enabled by comparing it against the global_ctrl bits
// per the original notes, AMD CPUs have no global_ctrl MSR, so every PMC is treated as enabled (returns true)
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
.is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx,
.is_valid_msr = amd_is_valid_msr,
.get_msr = amd_pmu_get_msr,
.set_msr = amd_pmu_set_msr,
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.reset = amd_pmu_reset,
};
svm_direct_access_msrs svm MSR(模型特定寄存器)直接访问列表
static const struct svm_direct_access_msrs {
u32 index; /* MSR索引 */
bool always; /* 如果最初清除拦截,则为true */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true }, // 传统模式SYSCALL目标
{ .index = MSR_IA32_SYSENTER_CS, .always = true }, // cs
{ .index = MSR_IA32_SYSENTER_EIP, .always = false }, // eip
{ .index = MSR_IA32_SYSENTER_ESP, .always = false }, // esp
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true }, // 64位GS基地址
{ .index = MSR_FS_BASE, .always = true }, // 64位FS基地址
{ .index = MSR_KERNEL_GS_BASE, .always = true }, // SwapGS GS影子
{ .index = MSR_LSTAR, .always = true }, // 长模式SYSCALL目标
{ .index = MSR_CSTAR, .always = true }, // 兼容模式SYSCALL目标
{ .index = MSR_SYSCALL_MASK, .always = true }, // 系统调用的EFLAGS掩码
#endif
{ .index = MSR_IA32_SPEC_CTRL, .always = false }, // 预测控制
{ .index = MSR_IA32_PRED_CMD, .always = false }, // 预测命令
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, // 最后分支预测
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, // 最后分支跳转
{ .index = MSR_IA32_LASTINTFROMIP, .always = false }, // 最后分支中断(预测)
{ .index = MSR_IA32_LASTINTTOIP, .always = false }, // 最后分支中断(执行)
{ .index = MSR_EFER, .always = false }, // 读取扩展功能寄存器,32 or 64位
{ .index = MSR_IA32_CR_PAT, .always = false }, // 页属性表控制寄存器
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, // 安全加密虚拟化状态 用户(来宾)虚拟层通信块
{ .index = MSR_TSC_AUX, .always = false }, // 辅助TSC
// TSC 时间戳计数器(Time Stamp Counter)
{ .index = X2APIC_MSR(APIC_ID), .always = false }, apic id
// #define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
// #define APIC_BASE_MSR 0x800
// APIC 高级可编程中断控制器(Advanced Programmable Interrupt Controller)
{ .index = X2APIC_MSR(APIC_LVR), .always = false }, // 传统I2C虚拟寄存器
{ .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, // 任务优先权
{ .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, // 公断优先权
{ .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, // 处理器优先权
{ .index = X2APIC_MSR(APIC_EOI), .always = false }, // 退出IO
{ .index = X2APIC_MSR(APIC_RRR), .always = false },
{ .index = X2APIC_MSR(APIC_LDR), .always = false }, // ldr
{ .index = X2APIC_MSR(APIC_DFR), .always = false },
{ .index = X2APIC_MSR(APIC_SPIV), .always = false },
{ .index = X2APIC_MSR(APIC_ISR), .always = false }, // 中断服务寄存器
{ .index = X2APIC_MSR(APIC_TMR), .always = false },
{ .index = X2APIC_MSR(APIC_IRR), .always = false }, // 中断请求寄存器
{ .index = X2APIC_MSR(APIC_ESR), .always = false },
{ .index = X2APIC_MSR(APIC_ICR), .always = false },
{ .index = X2APIC_MSR(APIC_ICR2), .always = false },
/*
* 注:
* AMD不虚拟化APIC TSC期限计时器模式,但它由KVM模拟
* 当设置APIC LVTT(0x832)寄存器位18时,中航工业硬件将生成GP故障
* 因此,始终拦截MSR 0x832,不要设置direct_access_MSR
*/
{ .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
{ .index = X2APIC_MSR(APIC_LVTPC), .always = false },
{ .index = X2APIC_MSR(APIC_LVT0), .always = false },
{ .index = X2APIC_MSR(APIC_LVT1), .always = false },
{ .index = X2APIC_MSR(APIC_LVTERR), .always = false },
{ .index = X2APIC_MSR(APIC_TMICT), .always = false },
{ .index = X2APIC_MSR(APIC_TMCCT), .always = false },
{ .index = X2APIC_MSR(APIC_TDCR), .always = false },
{ .index = MSR_INVALID, .always = false },
};
svm_x86_ops svm(x86)操作结构
static struct kvm_x86_ops svm_x86_ops __initdata = {
.name = "kvm_amd", // 名称
.hardware_unsetup = svm_hardware_unsetup, // 硬件移除设置(复原),包括释放页等
.hardware_enable = svm_hardware_enable, // 设置硬件状态,并启动/刷新虚拟事件
// 清除全局启用位,用更新的PERF_ctr_virt_mask重新编程PERF_CTL寄存器,然后再次设置全局启用位
.hardware_disable = svm_hardware_disable, // 关闭当前CPU上的SVM,刷新虚拟事件
// 我们只屏蔽掉仅主机位,以便在禁用SVM时仅主机计数工作
// 如果有人在SVM被禁用时设置了仅来宾计数器,则仅来宾位仍然被设置,计数器将不计数任何内容
.has_emulated_msr = svm_has_emulated_msr, // 判断是不是模拟器特征寄存器
.vcpu_create = svm_vcpu_create,
.vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
.prepare_switch_to_guest = svm_prepare_switch_to_guest,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
.vcpu_blocking = avic_vcpu_blocking,
.vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
.get_segment = svm_get_segment,
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = svm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
.post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
.get_idt = svm_get_idt,
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
.flush_tlb_all = svm_flush_tlb_current,
.flush_tlb_current = svm_flush_tlb_current,
.flush_tlb_gva = svm_flush_tlb_gva,
.flush_tlb_guest = svm_flush_tlb_current,
.vcpu_pre_run = svm_vcpu_pre_run,
.vcpu_run = svm_vcpu_run,
.handle_exit = svm_handle_exit,
.skip_emulated_instruction = svm_skip_emulated_instruction,
.update_emulated_instruction = NULL,
.set_interrupt_shadow = svm_set_interrupt_shadow,
.get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
.inject_irq = svm_inject_irq,
.inject_nmi = svm_inject_nmi,
.inject_exception = svm_inject_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
.nmi_allowed = svm_nmi_allowed,
.get_nmi_mask = svm_get_nmi_mask,
.set_nmi_mask = svm_set_nmi_mask,
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
.set_virtual_apic_mode = avic_set_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
.apicv_post_state_restore = avic_apicv_po