kvm中断虚拟化

1、x86平台主要使用的中断类型有pic、apic及msi中断,在多核系统下的apic结构图如下所示,每个cpu有一个lapic,外部中断通过ioapic转发到lapic,如果是msi中断,则绕过了io apic直接发给lapic。

2、kvm初始化过程为每个虚拟机维护一个pic主控制器(master)、一个pic从控制器(slave)以及一个ioapic控制器,每个vcpu维护一个lapic控制器。同时每个虚拟机有一张中断路由表(kvm_irq_routing_table)。中断路由表里的chip二维数组以[irqchip][pin]为下标保存非msi中断的gsi号,每个中断都有自己的routing_entry,routing_entry保存了中断的类型(pic、ioapic、msi)、中断号、以及set触发函数,所有的routing_entry以gsi为索引挂接到route_table的map链表里(同一个gsi可能同时关联pic、ioapic两种irqchip的routing_entry)。

ioapic里还维护了一张中断重定向表(redirtbl),负责为每个ioapic引脚(总共24个引脚)收到的中断选择路由到哪个lapic,每个vcpu的lapic控制器则模拟了主要的apic寄存器(IRR、ISR、EOI等)。

3、中断路由表初始过程

kvm创建好pic、ioapic控制器后,会先使用default_routing(arch/x86/kvm/irq_comm.c)安装默认的中断路由表。

kvm_arch_vm_ioctl
    kvm_create_pic
    kvm_ioapic_init
    kvm_setup_default_irq_routing
        kvm_set_irq_routing
            setup_routing_entry

static int setup_routing_entry(struct kvm *kvm,
			       struct kvm_irq_routing_table *rt,
			       struct kvm_kernel_irq_routing_entry *e,
			       const struct kvm_irq_routing_entry *ue)
{
	int r = -EINVAL;
	struct kvm_kernel_irq_routing_entry *ei;

	/*
	 * Do not allow GSI to be mapped to the same irqchip more than once.
	 * Allow only one to one mapping between GSI and non-irqchip routing.
	 */
	hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
		if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
		    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
			return r;

	e->gsi = ue->gsi;
	e->type = ue->type;
        //设置每个routing_entry信息
	r = kvm_set_routing_entry(kvm, e, ue);
	if (r)
		goto out;
	if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
		rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi;
     
        //将routing_entry连接到route_table的map链表
	hlist_add_head(&e->link, &rt->map[e->gsi]);
	r = 0;
out:
	return r;
}

/*
 * Translate a userspace routing entry (ue) into the kernel entry e:
 * pick the set() callback and copy either the irqchip pin or the MSI
 * message.
 *
 * Returns 0 on success and -EINVAL for an unknown route type, an unknown
 * irqchip, an out-of-range pin, or an MSI route for which
 * kvm_msi_route_invalid() returns non-zero.  Fields of e that were already
 * written stay written when -EINVAL is returned.
 */
int kvm_set_routing_entry(struct kvm *kvm,
			  struct kvm_kernel_irq_routing_entry *e,
			  const struct kvm_irq_routing_entry *ue)
{
	unsigned pin_limit;
	int pin_offset = 0;

	if (ue->type == KVM_IRQ_ROUTING_MSI) {
		e->set = kvm_set_msi;
		e->msi.address_lo = ue->u.msi.address_lo;
		e->msi.address_hi = ue->u.msi.address_hi;
		e->msi.data = ue->u.msi.data;

		if (kvm_msi_route_invalid(kvm, e))
			return -EINVAL;
		return 0;
	}

	if (ue->type != KVM_IRQ_ROUTING_IRQCHIP)
		return -EINVAL;

	switch (ue->u.irqchip.irqchip) {
	case KVM_IRQCHIP_PIC_MASTER:
		e->set = kvm_set_pic_irq;
		pin_limit = PIC_NUM_PINS;
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		e->set = kvm_set_pic_irq;
		pin_limit = PIC_NUM_PINS;
		/* Slave pins are stored shifted by 8 in e->irqchip.pin. */
		pin_offset = 8;
		break;
	case KVM_IRQCHIP_IOAPIC:
		pin_limit = KVM_IOAPIC_NUM_PINS;
		e->set = kvm_set_ioapic_irq;
		break;
	default:
		return -EINVAL;
	}

	e->irqchip.irqchip = ue->u.irqchip.irqchip;
	e->irqchip.pin = ue->u.irqchip.pin + pin_offset;

	/* Reject pins beyond what the selected chip provides. */
	if (e->irqchip.pin >= pin_limit)
		return -EINVAL;

	return 0;
}

setup_routing_entry的ue参数即为default_routing数组中的某一项,以上的流程主要就是将default_routing定义的路由信息逐项保存到routing_table里。default_routing初始化定义了0-23号(共24个)中断的基本信息,如中断type(都是非msi的IRQCHIP类型,包括pic、ioapic),中断gsi号等,其中0-15号中断同时关联pic和ioapic,16-23号中断只关联ioapic。中断路由表除了初始化安装外,还可以通过KVM_SET_GSI_ROUTING重新安装。

/*
 * Default GSI routing table.
 *
 * IOAPIC_ROUTING_ENTRY(irq) routes GSI irq to IOAPIC pin irq.
 * PIC_ROUTING_ENTRY(irq) routes GSI irq to pin (irq % 8) of the PIC chosen
 * by SELECT_PIC(irq) (SELECT_PIC is defined outside this excerpt).
 * ROUTING_ENTRY1 expands to the IOAPIC route only; ROUTING_ENTRY2 expands
 * to both the IOAPIC and the PIC route for the same GSI.
 *
 * The macro argument is parenthesized everywhere it is expanded so that an
 * expression argument cannot change the meaning of the initializer.
 */
#define IOAPIC_ROUTING_ENTRY(irq) \
	{ .gsi = (irq), .type = KVM_IRQ_ROUTING_IRQCHIP,	\
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)

#define PIC_ROUTING_ENTRY(irq) \
	{ .gsi = (irq), .type = KVM_IRQ_ROUTING_IRQCHIP,	\
	  .u.irqchip = { .irqchip = SELECT_PIC((irq)), .pin = (irq) % 8 } }
#define ROUTING_ENTRY2(irq) \
	IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)

/* GSIs 0-15 get an IOAPIC and a PIC route; GSIs 16-23 are IOAPIC-only. */
static const struct kvm_irq_routing_entry default_routing[] = {
	ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
	ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
	ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
	ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
	ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
	ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
	ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
	ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
	ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
	ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
	ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
	ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
};

4、中断触发流程

当vfio或vhost等后端通过eventfd唤醒kvm中断处理函数后,会进入irqfd_inject,然后调用kvm_set_irq。kvm_set_irq主要是查找中断路由表,找到中断对应的routing_entry,然后调用其set触发函数。如果是ioapic类型的中断,则会调用kvm_set_ioapic_irq,最后进入ioapic_service处理函数。ioapic_service主要是根据引脚号找到重定向表(redirtbl)中对应的表项,如果该表项被屏蔽(mask)则直接返回,否则取出目的地(dest_id)、向量号(vector)、触发模式等信息,通过kvm_irq_delivery_to_apic转发到对应vcpu的lapic去处理。

/*
 * Deliver the interrupt on IOAPIC pin irq using that pin's redirection
 * table entry.
 *
 * Returns -1 when the entry is masked, otherwise the value returned by
 * kvm_irq_delivery_to_apic().
 */
static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
{
	union kvm_ioapic_redirect_entry *rte = &ioapic->redirtbl[irq];
	struct kvm_lapic_irq irqe;
	int delivered;

	/* A masked pin delivers nothing. */
	if (rte->fields.mask)
		return -1;

	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
		     "vector=%x trig_mode=%x\n",
		     rte->fields.dest_id, rte->fields.dest_mode,
		     rte->fields.delivery_mode, rte->fields.vector,
		     rte->fields.trig_mode);

	/* Build the LAPIC message from the redirection entry. */
	irqe.vector = rte->fields.vector;
	irqe.delivery_mode = rte->fields.delivery_mode << 8;
	irqe.dest_mode = rte->fields.dest_mode;
	irqe.dest_id = rte->fields.dest_id;
	irqe.trig_mode = rte->fields.trig_mode;
	irqe.level = 1;
	irqe.shorthand = 0;
	irqe.msi_redir_hint = false;

	/* Remember that this edge-triggered pin has been delivered. */
	if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
		ioapic->irr_delivered |= 1 << irq;

	if (irq != RTC_GSI || !line_status) {
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
						     NULL);
	} else {
		/*
		 * pending_eoi cannot ever become negative (see
		 * rtc_status_pending_eoi_check_valid) and the caller
		 * ensures that it is only called if it is >= zero, namely
		 * if rtc_irq_check_coalesced returns false).
		 */
		BUG_ON(ioapic->rtc_status.pending_eoi != 0);
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
						     &ioapic->rtc_status.dest_map);
		/* A negative delivery result is recorded as zero pending EOIs. */
		ioapic->rtc_status.pending_eoi = (delivered < 0 ? 0 : delivered);
	}

	/* A delivered level-triggered interrupt sets remote_irr on the entry. */
	if (delivered && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
		rte->fields.remote_irr = 1;

	return delivered;
}

lapic收到中断后,会根据不同的delivery_mode调用不同的处理函数,以常见的APIC_DM_FIXED为例,处理函数还会判断是否启用apicv功能,使用apicv和不使用apicv走不同的触发流程。

/*
 * Accept an interrupt on a local APIC (APIC_DM_FIXED path only).
 *
 * NOTE(review): this is an abridged excerpt.  The enclosing switch on
 * delivery_mode, the declaration of the vcpu local and the return
 * statement are not shown here.
 */
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
			     int vector, int level, int trig_mode,
			     struct dest_map *dest_map)
{
	case APIC_DM_FIXED:

		// Bring the vector's TMR bit in line with trig_mode; it is
		// only written when the current bit differs.
		if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
			if (trig_mode)
				kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
			else
				apic_clear_vector(vector, apic->regs + APIC_TMR);
		}
		// Choose the delivery path depending on whether APICv is active
		if (vcpu->arch.apicv_active)
			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
		else {
			// Set the vector's bit in the IRR
			kvm_lapic_set_irr(vector, apic);
			// Flag a pending interrupt-request event on the vCPU
			kvm_make_request(KVM_REQ_EVENT, vcpu);
			// Kick the vCPU (original note: pull it back to the host)
			kvm_vcpu_kick(vcpu);
		}
		break;
}

1)、如果使能了apicv,最终调用vmx_deliver_posted_interrupt,使用中断posting的方式来通知vcpu处理中断。

/*
 * Deliver interrupt `vector` to vcpu through its posted-interrupt
 * descriptor (vmx->pi_desc).
 */
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int nested_ret;
	int on_was_set;

	/* Nested-virtualization case: a zero return means we are done. */
	nested_ret = vmx_deliver_nested_posted_interrupt(vcpu, vector);
	if (!nested_ret)
		return;

	/* Set the vector's bit in pi_desc; stop if it was already set. */
	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	/* Set pi_desc.on to signal that an interrupt is waiting. */
	on_was_set = pi_test_and_set_on(&vmx->pi_desc);
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * Original author's note (helper not visible here): if the vCPU is
	 * running in guest mode, kvm_vcpu_trigger_posted_interrupt() sends
	 * it the POSTED_INTR_VECTOR IPI, which the vCPU handles in non-root
	 * mode without a VM exit.  Otherwise the vCPU is kicked so that it
	 * notices the pending interrupt on its next VM entry.
	 *
	 * The trigger helper is not called when pi_desc.on was already set.
	 */
	if (on_was_set) {
		kvm_vcpu_kick(vcpu);
		return;
	}
	if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
		kvm_vcpu_kick(vcpu);
}

2)、如果没有使能apicv功能,则标记lapic的IRR寄存器,通过kvm_make_request标记vcpu有中断请求事件,然后触发vcpu vm-exit。当vcpu重新回到Guest模式时,会检查是否有中断请求事件,如果有,则设置ISR、PPR等寄存器信息。

vcpu_enter_guest
    inject_pending_event
        kvm_cpu_get_interrupt
            kvm_get_apic_interrupt
                kvm_queue_interrupt

/*
 * Take the interrupt reported by kvm_apic_has_interrupt() and move it from
 * pending to in-service on the vCPU's local APIC.
 *
 * Returns the vector, or -1 when kvm_apic_has_interrupt() returns -1.
 */
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
	/* Original note: the highest-priority vector pending in the IRR. */
	int vector = kvm_apic_has_interrupt(vcpu);
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (vector != -1) {
		/*
		 * We get here even with APIC virtualization enabled, if doing
		 * nested virtualization and L1 runs with the "acknowledge
		 * interrupt on exit" mode.  Then we cannot inject the
		 * interrupt via RVI, because the process would deliver it
		 * through the IDT.
		 */

		/* Set the ISR bit: the vCPU is now servicing this vector. */
		apic_set_isr(vector, apic);
		/* Update the PPR. */
		apic_update_ppr(apic);
		/* Clear the vector's IRR bit. */
		apic_clear_irr(vector, apic);
	}

	return vector;
}

最后再调用vmx_inject_irq将之前保存在kvm_queued_interrupt的中断信息写到vmcs的VM_ENTRY_INTR_INFO_FIELD,等vcpu执行vm_entry时,就能感知到该中断的存在。

/*
 * Inject the queued interrupt vcpu->arch.interrupt.nr into the guest.
 *
 * NOTE(review): this excerpt stops after the VM_ENTRY_INTR_INFO_FIELD write;
 * the function's closing brace (and anything that may follow the write) is
 * not shown here.
 */
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	uint32_t intr;
	int irq = vcpu->arch.interrupt.nr;

	trace_kvm_inj_virq(irq);

	++vcpu->stat.irq_injections;
	/*
	 * vm86_active set: inject through kvm_inject_realmode_interrupt()
	 * instead of the VMCS, and request a triple fault if that does not
	 * return EMULATE_DONE.
	 */
	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		/* For a soft interrupt, pass the exiting instruction length. */
		if (vcpu->arch.interrupt.soft)
			inc_eip = vcpu->arch.event_exit_inst_len;
		if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	/* Build the interruption info: vector, valid bit and interrupt type. */
	intr = irq | INTR_INFO_VALID_MASK;
	if (vcpu->arch.interrupt.soft) {
		intr |= INTR_TYPE_SOFT_INTR;
		/* Soft interrupts also write the instruction length field. */
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
	} else
		intr |= INTR_TYPE_EXT_INTR;
	/* Write the interruption info into the VMCS VM-entry field. */
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值