KVM_CREATE_IRQCHIP(中断控制器初始化)
KVM_CREATE_IRQCHIP用于在虚拟机初始化阶段创建虚拟中断控制器(irqchip)。当KVM收到与虚拟机关联的ioctl系统调用时,由kvm_vm_ioctl()函数进行处理,并调用下面的kvm_arch_vm_ioctl()函数;该函数完成了初始化PIC和IOAPIC控制器模块、配置默认中断路由等任务:
/*
 * Excerpt of the VM-level ioctl handler showing only the KVM_CREATE_IRQCHIP
 * branch. Lines consisting of "..." mark code that was elided from this
 * excerpt, so error handling between the steps is not visible here.
 */
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
/*
* This union makes it completely explicit to gcc-3.x
* that these two variables' stack usage should be
* combined, not added together.
*/
union {
struct kvm_pit_state ps;
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
switch (ioctl) {
...
case KVM_CREATE_IRQCHIP: {
/* The visible creation steps run between lock and unlock of kvm->lock. */
mutex_lock(&kvm->lock);
...
// Initialize the virtual PIC
r = kvm_pic_init(kvm);
...
// Initialize the virtual IOAPIC
r = kvm_ioapic_init(kvm);
...
// Install the default interrupt routing
r = kvm_setup_default_irq_routing(kvm);
...
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
...
mutex_unlock(&kvm->lock);
break;
}
...
其中的kvm_setup_default_irq_routing会依次调用kvm_set_irq_routing和setup_routing_entry,最终调用kvm_set_routing_entry函数完成路由配置。其核心操作是将各类型中断控制器的中断置位函数(即回调函数)与该类型控制器的路由入口进行绑定,以备后续发生中断请求时调用。
整体函数调用链如图:
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
/* We can't check irqchip_in_kernel() here as some callers are
* currently initializing the irqchip. Other callers should therefore
* check kvm_arch_can_set_irq_routing() before calling this function.
*/
switch (ue->type) {
case KVM_IRQ_ROUTING_IRQCHIP: //中断路由芯片
if (irqchip_split(kvm))
return -EINVAL;
e->irqchip.pin = ue->u.irqchip.pin;//设置中断芯片引脚
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_SLAVE:
e->irqchip.pin += PIC_NUM_PINS / 2;
fallthrough;
case KVM_IRQCHIP_PIC_MASTER:
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
return -EINVAL;
设置处理 PIC 中断的回调函数
e->set = kvm_set_pic_irq;
break;
case KVM_IRQCHIP_IOAPIC:
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
return -EINVAL;
// 设置处理 IOPIC 中断的回调函数
e->set = kvm_set_ioapic_irq;
break;
default:
return -EINVAL;
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
break;
case KVM_IRQ_ROUTING_MSI:
// 设置处理 MSI 中断的回调函数
e->set = kvm_set_msi;
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
break;
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_set_sint;
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
#ifdef CONFIG_KVM_XEN
case KVM_IRQ_ROUTING_XEN_EVTCHN:
return kvm_xen_setup_evtchn(kvm, e, ue);
#endif
default:
return -EINVAL;
}
return 0;
}
KVM_IRQ_LINE(中断注入)
KVM_IRQ_LINE用于虚拟机向VMM的虚拟apic发送中断请求,再由VMM将中断交付虚拟cpu处理。当kvm_vm_ioctl函数被调用并处理KVM_IRQ_LINE请求时,会调用kvm_vm_ioctl_irq_line,该函数调用kvm_set_irq完成中断注入,其核心任务就是根据中断控制器类型调用之前所绑定的中断回调函数,这些中断回调函数会将中断请求写入虚拟cpu的vmcs中。
/* kvm_set_irq - assert or de-assert an interrupt of a KVM guest
* @kvm: pointer to the KVM instance
* @irq_source_id: identifier used to tell interrupt sources apart
* @irq: interrupt number (GSI) to operate on
* @level: line level; typically 1 raises the interrupt and 0 clears it
* @line_status: status of the interrupt line, passed through to the callback
*
* Return value:
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
* = 0 Interrupt was coalesced (a previous interrupt is still pending)
* > 0 Number of CPUs the interrupt was delivered to
*/
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status)
{
struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
int ret = -1, i, idx;
trace_kvm_set_irq(irq, level, irq_source_id);
/*
* Only the GSI lookup runs inside the SRCU read-side section; i is the
* number of irq_set entries to walk. The callbacks below operate on the
* local irq_set array after the read lock has been dropped.
*/
idx = srcu_read_lock(&kvm->irq_srcu);
i = kvm_irq_map_gsi(kvm, irq_set, irq);
srcu_read_unlock(&kvm->irq_srcu, idx);
/* Every mapped entry is invoked instead of calling one handler directly
* because it is not possible to detect whether the guest uses the PIC
* or the IOAPIC. So the bit is set in both; the guest will ignore the
* injection that does not apply to it.
*/
while (i--) {
int r;
r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
line_status);
/* Negative results are skipped and do not affect ret. */
if (r < 0)
continue;
/* Sum the non-negative results; ret stays -1 only if none succeeded. */
ret = r + ((ret < 0) ? 0 : ret);
}
return ret;
}
当需要注入的中断为msi类型时,kvm_vm_ioctl会将其作为KVM_SIGNAL_MSI类型的请求进行处理,依次调用kvm_send_userspace_msi和kvm_set_msi:
/*
 * Inject an MSI described by a userspace struct kvm_msi.
 * Returns -EINVAL when the irqchip is not in the kernel or when msi->flags
 * has any bit set other than KVM_MSI_VALID_DEVID; otherwise returns the
 * result of kvm_set_msi().
 */
int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
{
/* Temporary on-stack route; only its msi fields are filled in. */
struct kvm_kernel_irq_routing_entry route;
if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID))
return -EINVAL;
route.msi.address_lo = msi->address_lo;
route.msi.address_hi = msi->address_hi;
route.msi.data = msi->data;
route.msi.flags = msi->flags;
route.msi.devid = msi->devid;
/* Always injected with level 1 and line_status false. */
return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false);
}
/*
 * MSI injection callback: builds a kvm_lapic_irq from routing entry e and
 * hands it to kvm_irq_delivery_to_apic(). Returns -1 without delivering
 * anything when level is 0. irq_source_id and line_status are not used here.
 */
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
{
struct kvm_lapic_irq irq;
if (!level)
return -1;
kvm_set_msi_irq(e, &irq); // fill in the kvm_lapic_irq structure
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); // deliver the interrupt
}
/*
 * Decode the MSI address/data words stored in routing entry e into the
 * fields of *irq.
 */
void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
//tracepoint kvm_msi_set_irq
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
// Parse the MSI message and fill in the kvm_lapic_irq structure
irq->dest_id = (e->msi.address_lo &
MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
irq->vector = (e->msi.data &
MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
/* dest_mode and trig_mode keep the masked bit in place (not shifted down). */
irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
/* Delivery mode is taken from bits 8-10 of the data word (mask 0x700). */
irq->delivery_mode = e->msi.data & 0x700;
irq->msi_redir_hint = ((e->msi.address_lo
& MSI_ADDR_REDIRECTION_LOWPRI) > 0);
/* level and shorthand are fixed for every MSI decoded here. */
irq->level = 1;
irq->shorthand = 0;
}
其中ioapic的回调函数kvm_set_ioapic_irq依次调用kvm_ioapic_set_irq、ioapic_set_irq,最后调用ioapic_service函数。ioapic_service主要是找到该中断在重定向表(redirection table)中的表项,然后查找中断的目的地信息并转发到对应vcpu的lapic去处理。之后会调用kvm_irq_delivery_to_apic,负责将中断分发给lapic:
/*
 * Deliver irq to the local APIC of every vCPU that matches its destination,
 * or to a single vCPU when lowest-priority delivery is requested.
 * Returns the value produced by the fast path or by kvm_apic_set_irq();
 * r stays -1 if no vCPU accepted the interrupt.
 */
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map)
{
int i, r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
/*
 * A physical broadcast (dest_mode 0, dest_id 0xff) combined with
 * lowest-priority delivery is logged and turned into fixed delivery.
 */
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
kvm_lowest_prio_delivery(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
// Fast path: look up the LAPIC through the kvm_apic_map kept in kvm; r is returned if it handled the irq
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
return r;
/* Slow path: pick one or more target vCPUs in the VM and deliver the
interrupt to them. For lowest-priority delivery, lowest points to the
vCPU with the lowest priority once the loop has finished.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu)) // skip: this vCPU has no LAPIC present
continue;
if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
irq->dest_id, irq->dest_mode)) // skip: this vCPU's LAPIC does not match the destination
continue;
if (!kvm_lowest_prio_delivery(irq)) { // not lowest-priority: deliver to every matching vCPU
if (r < 0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
} else if (kvm_lapic_enabled(vcpu)) { // lowest-priority delivery requested
// and this vCPU's LAPIC is enabled
if (!lowest)
lowest = vcpu;
else if (kvm_apic_compare_prio(vcpu, lowest) < 0) // this vCPU's LAPIC priority is lower
// than that of the current lowest
lowest = vcpu;
}
}
/* Lowest-priority case: deliver once, to the selected vCPU. */
if (lowest)
r = kvm_apic_set_irq(lowest, irq, dest_map);
return r;
}
选取不同类型的中断控制器的回调函数(kvm_set_ioapic_irq -> kvm_ioapic_set_irq -> ioapic_set_irq、kvm_set_pic_irq -> kvm_pic_set_irq、kvm_set_msi)作为ebpf程序的挂载点,可以监控中断注入的详细信息,如下:
TIME(ms) COMM PID DELAY TYPE/PIN DST/VEC OTHERS
962804587.768667 CPU 0/KVM 269394 773 MSI /- 0x2/40 Fixed |physical|edge |- |-
962805792.231419 vhost-529746 529767 3008 MSI /- 0x1/40 Fixed |physical|edge |- |-
962805792.234556 vhost-269394 269403 1442 MSI /- 0x3/40 Fixed |physical|edge |- |-
962805792.243754 vhost-426070 426078 1323 MSI /- 0x5/35 Fixed |physical|edge |- |-
962806603.650275 CPU 0/KVM 269394 3738 MSI /- 0x2/40 Fixed |physical|edge |- |-
962806603.713743 CPU 0/KVM 269394 1414 MSI /- 0x2/40 Fixed |physical|edge |- |-
962806816.308239 qemu-system-x86 269394 29495 IOAPIC /21 0x4/39 Fixed |physical|level|- |-
962806816.359852 qemu-system-x86 269394 38615 PIC slave /2 - /- - |- |level|masked|-
962806816.400501 qemu-system-x86 269394 1259 IOAPIC /10 0 /0 Fixed |physical|edge |masked|-
962806816.408792 qemu-system-x86 269394 1270 PIC slave /2 - /- - |- |level|masked|-
962806816.410425 qemu-system-x86 269394 226 IOAPIC /10 0 /0 Fixed |physical|edge |masked|-
962809792.316035 vhost-426070 426078 1747 MSI /- 0x5/35 Fixed |physical|edge |- |-
962810635.636493 CPU 0/KVM 269394 3034 MSI /- 0x2/40 Fixed |physical|edge |- |-
962810635.694923 CPU 0/KVM 269394 897 MSI /- 0x2/40 Fixed |physical|edge |- |-
962811776.481253 vhost-269394 269403 3719 MSI /- 0x3/40 Fixed |physical|edge |- |-
962811776.523581 vhost-529746 529767 1664 MSI /- 0x1/40 Fixed |physical|edge |- |-
962811776.654516 vhost-426070 426078 1522 MSI /- 0x5/35 Fixed |physical|edge |- |-
962812652.302519 CPU 2/KVM 269394 2605 MSI /- 0x2/40 Fixed |physical|edge |- |-
962812652.342239 CPU 2/KVM 269394 749 MSI /- 0x2/40 Fixed |physical|edge |- |-
962813344.856419 qemu-system-x86 269394 23230 IOAPIC /21 0x4/39 Fixed |physical|level|- |-
962813344.899277 qemu-system-x86 269394 5472 PIC slave /2 - /- - |- |level|masked|-
详细代码:https://github.com/nanshuaibo/lmp/tree/develop/eBPF_Supermarket/kvm_watcher