X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fsvm.c;h=468ff6e721cedc72c82fecdac39328193a320372;hb=66b7138f913635b40822890ba8913548f8b285b8;hp=3bb6d4b92b34a63cb1fd90c747b797dea7ad1d5e;hpb=9c4e40b9949868928bd7fec67a4b0902d90e57ed;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3bb6d4b..468ff6e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -46,17 +47,14 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NRIP (1 << 3) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) - -/* Turn on to get debugging output*/ -/* #define NESTED_DEBUG */ +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ -#ifdef NESTED_DEBUG -#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args) -#else -#define nsvm_printk(fmt, args...) do {} while(0) -#endif +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 @@ -81,6 +79,9 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; + /* A VMEXIT is required but not yet emulated */ + bool exit_required; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -108,6 +109,11 @@ struct vcpu_svm { u32 *msrpm; struct nested_state nested; + + bool nmi_singlestep; + + unsigned int3_injected; + unsigned long int3_rip; }; /* enable NPT for AMD64 and X86 with PAE */ @@ -120,16 +126,15 @@ static int npt = 1; module_param(npt, int, S_IRUGO); -static int nested = 0; +static int nested = 1; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_exit_handled(struct vcpu_svm *svm); +static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -232,24 +237,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; -} - -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - /* If we are within a nested VM we'd better #VMEXIT and let the - guest handle the exception */ - if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; + vcpu->arch.efer = efer; } static int is_external_interrupt(u32 info) @@ -264,7 +252,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; + ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; return ret & mask; } @@ -284,7 +272,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!svm->next_rip) { - if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != + if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -297,6 +285,39 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } +static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* If we are within a nested VM we'd better #VMEXIT and let the + guest handle the exception */ + if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + + if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { + unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + + /* + * For guest debugging where we have to reinject #BP if some + * INT3 is guest-owned: + * Emulate nRIP by moving RIP forward. Will fail if injection + * raises a fault that is not intercepted. Still better than + * failing in all cases. + */ + skip_emulated_instruction(&svm->vcpu); + rip = kvm_rip_read(&svm->vcpu); + svm->int3_rip = rip + svm->vmcb->save.cs.base; + svm->int3_injected = rip - old_rip; + } + + svm->vmcb->control.event_inj = nr + | SVM_EVTINJ_VALID + | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) + | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj_err = error_code; +} + static int has_svm(void) { const char *msg; @@ -314,75 +335,79 @@ static void svm_hardware_disable(void *garbage) cpu_svm_disable(); } -static void svm_hardware_enable(void *garbage) +static int svm_hardware_enable(void *garbage) { - struct svm_cpu_data *svm_data; + struct svm_cpu_data *sd; uint64_t efer; - struct descriptor_table gdt_descr; + struct desc_ptr gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_SVME) + return -EBUSY; + if (!has_svm()) { - printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); - return; + printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", + me); + return -EINVAL; } - svm_data = per_cpu(svm_data, me); + sd = per_cpu(svm_data, me); - if (!svm_data) { - printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", + if (!sd) { + printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", me); - return; + return -EINVAL; } - svm_data->asid_generation = 1; - svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; - svm_data->next_asid = svm_data->max_asid + 1; + sd->asid_generation = 1; + sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; + sd->next_asid = sd->max_asid + 1; kvm_get_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.base; - svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); + gdt = (struct desc_struct *)gdt_descr.address; + sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - rdmsrl(MSR_EFER, efer); wrmsrl(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, - page_to_pfn(svm_data->save_area) << PAGE_SHIFT); + wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + + return 0; } static void svm_cpu_uninit(int cpu) { - struct svm_cpu_data *svm_data - = per_cpu(svm_data, raw_smp_processor_id()); + struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id()); - if (!svm_data) + if (!sd) return; per_cpu(svm_data, raw_smp_processor_id()) = NULL; - __free_page(svm_data->save_area); - kfree(svm_data); + __free_page(sd->save_area); + kfree(sd); } static int svm_cpu_init(int cpu) { - struct svm_cpu_data *svm_data; + struct svm_cpu_data *sd; int r; - svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!svm_data) + sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); + if (!sd) return -ENOMEM; - svm_data->cpu = cpu; - svm_data->save_area = alloc_page(GFP_KERNEL); + sd->cpu = cpu; + sd->save_area = alloc_page(GFP_KERNEL); r = -ENOMEM; - if (!svm_data->save_area) + if (!sd->save_area) goto err_1; - per_cpu(svm_data, cpu) = svm_data; + per_cpu(svm_data, cpu) = sd; return 0; err_1: - kfree(svm_data); + kfree(sd); return r; } @@ -474,7 +499,7 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_SVME); } - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) goto err; @@ -508,7 +533,7 @@ static __exit void svm_hardware_unsetup(void) { int cpu; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); @@ -537,6 +562,8 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -549,13 +576,19 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -566,6 +599,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -623,11 +657,12 @@ static void init_vmcb(struct vcpu_svm *svm) save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - /* - * cr0 val on cpu init should be 0x60000010, we enable cpu - * cache by default. the orderly way is to enable cache in bios. + /* This is the guest-visible cr0 value. + * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; + svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -637,13 +672,9 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; - /* enable caching because the QEMU Bios doesn't enable it */ - save->cr0 = X86_CR0_ET; save->cr3 = 0; save->cr4 = 0; } @@ -652,6 +683,11 @@ static void init_vmcb(struct vcpu_svm *svm) svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { + control->pause_filter_count = 3000; + control->intercept |= (1ULL << INTERCEPT_PAUSE); + } + enable_gif(svm); } @@ -691,29 +727,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); @@ -723,13 +758,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: @@ -756,15 +796,18 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int i; if (unlikely(cpu != vcpu->cpu)) { - u64 tsc_this, delta; - - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; - svm->vmcb->control.tsc_offset += delta; + u64 delta; + + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -783,7 +826,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - rdtscll(vcpu->arch.host_tsc); + vcpu->arch.host_tsc = native_read_tsc(); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -914,74 +957,106 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu) return save->cpl; } -static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->limit = svm->vmcb->save.idtr.limit; - dt->base = svm->vmcb->save.idtr.base; + dt->size = svm->vmcb->save.idtr.limit; + dt->address = svm->vmcb->save.idtr.base; } -static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.idtr.limit = dt->limit; - svm->vmcb->save.idtr.base = dt->base ; + svm->vmcb->save.idtr.limit = dt->size; + svm->vmcb->save.idtr.base = dt->address ; } -static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->limit = svm->vmcb->save.gdtr.limit; - dt->base = svm->vmcb->save.gdtr.base; + dt->size = svm->vmcb->save.gdtr.limit; + dt->address = svm->vmcb->save.gdtr.base; } -static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.gdtr.limit = dt->limit; - svm->vmcb->save.gdtr.base = dt->base ; + svm->vmcb->save.gdtr.limit = dt->size; + svm->vmcb->save.gdtr.base = dt->address ; +} + +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ } static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb; + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + if (is_nested(svm)) { + struct vmcb *hsave = svm->nested.hsave; + + hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read; + vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write; + } + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + if (is_nested(svm)) { + struct vmcb *hsave = svm->nested.hsave; + + hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif - if (npt_enabled) - goto set; + vcpu->arch.cr0 = cr0; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -989,6 +1064,7 @@ set: */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1041,7 +1117,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) svm->vmcb->control.intercept_exceptions &= ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); - if (vcpu->arch.singlestep) + if (svm->nmi_singlestep) svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { @@ -1056,26 +1132,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) vcpu->guest_debug = 0; } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int old_debug = vcpu->guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - vcpu->guest_debug = dbg->control; - - update_db_intercept(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; else svm->vmcb->save.dr7 = vcpu->arch.dr7; - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - else if (old_debug & KVM_GUESTDBG_SINGLESTEP) - svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - - return 0; + update_db_intercept(vcpu); } static void load_host_msrs(struct kvm_vcpu *vcpu) @@ -1092,91 +1158,85 @@ static void save_host_msrs(struct kvm_vcpu *vcpu) #endif } -static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { - if (svm_data->next_asid > svm_data->max_asid) { - ++svm_data->asid_generation; - svm_data->next_asid = 1; + if (sd->next_asid > sd->max_asid) { + ++sd->asid_generation; + sd->next_asid = 1; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - svm->asid_generation = svm_data->asid_generation; - svm->vmcb->control.asid = svm_data->next_asid++; + svm->asid_generation = sd->asid_generation; + svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; switch (dr) { case 0 ... 3: - val = vcpu->arch.db[dr]; + *dest = vcpu->arch.db[dr]; break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; + *dest = vcpu->arch.dr6; else - val = svm->vmcb->save.dr6; + *dest = svm->vmcb->save.dr6; break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; + *dest = vcpu->arch.dr7; else - val = svm->vmcb->save.dr7; + *dest = svm->vmcb->save.dr7; break; - default: - val = 0; } - return val; + return EMULATE_DONE; } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - switch (dr) { case 0 ... 3: vcpu->arch.db[dr] = value; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; + break; } + + return EMULATE_DONE; } -static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int pf_interception(struct vcpu_svm *svm) { u64 fault_address; u32 error_code; @@ -1185,31 +1245,24 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); - /* - * FIXME: Tis shouldn't be necessary here, but there is a flush - * missing in the MMU code. Until we find this bug, flush the - * complete TLB here on an NPF - */ - if (npt_enabled) - svm_flush_tlb(&svm->vcpu); - else { - if (kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - } + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } -static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int db_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && - !svm->vcpu.arch.singlestep) { + !svm->nmi_singlestep) { kvm_queue_exception(&svm->vcpu, DB_VECTOR); return 1; } - if (svm->vcpu.arch.singlestep) { - svm->vcpu.arch.singlestep = false; + if (svm->nmi_singlestep) { + svm->nmi_singlestep = false; if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); @@ -1228,35 +1281,56 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int bp_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = BP_VECTOR; return 0; } -static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int ud_interception(struct vcpu_svm *svm) { int er; - er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } -static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; + struct vcpu_svm *svm = to_svm(vcpu); + u32 excp; + + if (is_nested(svm)) { + u32 h_excp, n_excp; + + h_excp = svm->nested.hsave->control.intercept_exceptions; + n_excp = svm->nested.intercept_exceptions; + h_excp &= ~(1 << NM_VECTOR); + excp = h_excp | n_excp; + } else { + excp = svm->vmcb->control.intercept_exceptions; + excp &= ~(1 << NM_VECTOR); + } + + svm->vmcb->control.intercept_exceptions = excp; + svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } -static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int mc_interception(struct vcpu_svm *svm) { /* * On an #MC intercept the MCE handler is not called automatically in @@ -1269,8 +1343,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int shutdown_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + /* * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. @@ -1282,7 +1358,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 0; } -static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int io_interception(struct vcpu_svm *svm) { u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ int size, in, string; @@ -1296,7 +1372,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (string) { if (emulate_instruction(&svm->vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + 0, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1306,33 +1382,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; skip_emulated_instruction(&svm->vcpu); - return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); + return kvm_emulate_pio(&svm->vcpu, in, size, port); } -static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nmi_interception(struct vcpu_svm *svm) { return 1; } -static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int intr_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.irq_exits; return 1; } -static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nop_on_interception(struct vcpu_svm *svm) { return 1; } -static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int halt_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } -static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmmcall_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); @@ -1342,7 +1418,7 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1359,112 +1435,89 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { - if (is_nested(svm)) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk("VMexit -> EXCP 0x%x\n", nr); - return 1; - } - } - - return 0; -} - -static inline int nested_svm_intr(struct vcpu_svm *svm) -{ - if (is_nested(svm)) { - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return 0; + int vmexit; - if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return 0; + if (!is_nested(svm)) + return 0; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk("VMexit -> INTR\n"); - return 1; - } - } + vmexit = nested_svm_intercept(svm); + if (vmexit == NESTED_EXIT_DONE) + svm->nested.exit_required = true; - return 0; + return vmexit; } -static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) +/* This function returns true if it is save to enable the irq window */ +static inline bool nested_svm_intr(struct vcpu_svm *svm) { - struct page *page; + if (!is_nested(svm)) + return true; - down_read(¤t->mm->mmap_sem); - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return true; - if (is_error_page(page)) { - printk(KERN_INFO "%s: could not find page at 0x%llx\n", - __func__, gpa); - kvm_release_page_clean(page); - kvm_inject_gp(&svm->vcpu, 0); - return NULL; + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return false; + + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + + if (svm->nested.intercept & 1ULL) { + /* + * The #vmexit can't be emulated here directly because this + * code path runs with irqs and preemtion disabled. A + * #vmexit emulation might sleep. Only signal request for + * the #vmexit here. + */ + svm->nested.exit_required = true; + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + return false; } - return page; + + return true; } -static int nested_svm_do(struct vcpu_svm *svm, - u64 arg1_gpa, u64 arg2_gpa, void *opaque, - int (*handler)(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque)) +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) { - struct page *arg1_page; - struct page *arg2_page = NULL; - void *arg1; - void *arg2 = NULL; - int retval; + struct page *page; - arg1_page = nested_svm_get_page(svm, arg1_gpa); - if(arg1_page == NULL) - return 1; + might_sleep(); - if (arg2_gpa) { - arg2_page = nested_svm_get_page(svm, arg2_gpa); - if(arg2_page == NULL) { - kvm_release_page_clean(arg1_page); - return 1; - } - } + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) + goto error; - arg1 = kmap_atomic(arg1_page, KM_USER0); - if (arg2_gpa) - arg2 = kmap_atomic(arg2_page, KM_USER1); + *_page = page; - retval = handler(svm, arg1, arg2, opaque); + return kmap(page); - kunmap_atomic(arg1, KM_USER0); - if (arg2_gpa) - kunmap_atomic(arg2, KM_USER1); +error: + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); - kvm_release_page_dirty(arg1_page); - if (arg2_gpa) - kvm_release_page_dirty(arg2_page); + return NULL; +} - return retval; +static void nested_svm_unmap(struct page *page) +{ + kunmap(page); + kvm_release_page_dirty(page); } -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, - void *arg1, void *arg2, - void *opaque) +static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - u8 *msrpm = (u8 *)arg2; - u32 t0, t1; - u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u32 param = svm->vmcb->control.exit_info_1 & 1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + bool ret = false; + u32 t0, t1; + u8 val; - if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return 0; + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return false; switch (msr) { case 0 ... 0x1fff: @@ -1482,88 +1535,105 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, t0 %= 8; break; default: - return 1; - break; + ret = true; + goto out; } - if (msrpm[t1] & ((1 << param) << t0)) - return 1; - return 0; + if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1)) + ret = val & ((1 << param) << t0); + +out: + return ret; } -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; - bool vmexit = false; - if (kvm_override) { - switch (exit_code) { - case SVM_EXIT_INTR: - case SVM_EXIT_NMI: - return 0; + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return NESTED_EXIT_HOST; /* For now we are always handling NPFs when using them */ - case SVM_EXIT_NPF: - if (npt_enabled) - return 0; - break; - /* When we're shadowing, trap PFs */ - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - if (!npt_enabled) - return 0; - break; - default: - break; - } + case SVM_EXIT_NPF: + if (npt_enabled) + return NESTED_EXIT_HOST; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return NESTED_EXIT_HOST; + break; + case SVM_EXIT_EXCP_BASE + NM_VECTOR: + nm_interception(svm); + break; + default: + break; } + return NESTED_EXIT_CONTINUE; +} + +/* + * If this function returns true, this #vmexit was already handled + */ +static int nested_svm_intercept(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + int vmexit = NESTED_EXIT_HOST; + switch (exit_code) { case SVM_EXIT_MSR: - if (nested_svm_do(svm, svm->nested.vmcb, svm->nested.vmcb_msrpm, - NULL, nested_svm_exit_handled_msr)) - vmexit = true; + vmexit = nested_svm_exit_handled_msr(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); if (svm->nested.intercept_cr_read & cr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); if (svm->nested.intercept_cr_write & cr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); if (svm->nested.intercept_dr_read & dr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); if (svm->nested.intercept_dr_write & dr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); if (svm->nested.intercept_exceptions & excp_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - nsvm_printk("exit code: 0x%x\n", exit_code); if (svm->nested.intercept & exit_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; } } - if (vmexit) { - nsvm_printk("#VMEXIT reason=%04x\n", exit_code); + return vmexit; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + int vmexit; + + vmexit = nested_svm_intercept(svm); + + if (vmexit == NESTED_EXIT_DONE) nested_svm_vmexit(svm); - } return vmexit; } @@ -1600,12 +1670,25 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->lbr_ctl = from->lbr_ctl; } -static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static int nested_svm_vmexit(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + struct page *page; + + trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, + vmcb->control.exit_info_1, + vmcb->control.exit_info_2, + vmcb->control.exit_int_info, + vmcb->control.exit_int_info_err); + + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); + if (!nested_vmcb) + return 1; + + /* Exit nested SVM mode */ + svm->nested.vmcb = 0; /* Give the current vmcb to the guest */ disable_gif(svm); @@ -1616,9 +1699,13 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; + nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); if (npt_enabled) nested_vmcb->save.cr3 = vmcb->save.cr3; + else + nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; nested_vmcb->save.rflags = vmcb->save.rflags; nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; @@ -1636,6 +1723,22 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + + /* + * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have + * to make sure that we do not lose injected events. So check event_inj + * here and copy it to exit_int_info if it is valid. + * Exit_int_info and event_inj can't be both valid because the case + * below only happens on a VMRUN instruction intercept which has + * no valid exit_int_info set. + */ + if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { + struct vmcb_control_area *nc = &nested_vmcb->control; + + nc->exit_int_info = vmcb->control.event_inj; + nc->exit_int_info_err = vmcb->control.event_inj_err; + } + nested_vmcb->control.tlb_ctl = 0; nested_vmcb->control.event_inj = 0; nested_vmcb->control.event_inj_err = 0; @@ -1647,10 +1750,6 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, /* Restore the original control entries */ copy_vmcb_control_area(vmcb, hsave); - /* Kill any pending exceptions */ - if (svm->vcpu.arch.exception.pending == true) - nsvm_printk("WARNING: Pending Exception\n"); - kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); @@ -1678,18 +1777,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - /* Exit nested SVM mode */ - svm->nested.vmcb = 0; - - return 0; -} - -static int nested_svm_vmexit(struct vcpu_svm *svm) -{ - nsvm_printk("VMexit\n"); - if (nested_svm_do(svm, svm->nested.vmcb, 0, - NULL, nested_svm_vmexit_real)) - return 1; + nested_svm_unmap(page); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -1697,27 +1785,45 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } -static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + u32 *nested_msrpm; + struct page *page; int i; - u32 *nested_msrpm = (u32*)arg1; + + nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page); + if (!nested_msrpm) + return false; + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); - return 0; + nested_svm_unmap(page); + + return true; } -static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + struct page *page; + u64 vmcb_gpa; - /* nested_vmcb is our indicator if nested SVM is activated */ - svm->nested.vmcb = svm->vmcb->save.rax; + vmcb_gpa = svm->vmcb->save.rax; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return false; + + trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb, + nested_vmcb->save.rip, + nested_vmcb->control.int_ctl, + nested_vmcb->control.event_inj, + nested_vmcb->control.nested_ctl); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -1731,8 +1837,8 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -1780,21 +1886,6 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->save.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; - /* We don't want a nested guest to be more powerful than the guest, - so all intercepts are ORed */ - svm->vmcb->control.intercept_cr_read |= - nested_vmcb->control.intercept_cr_read; - svm->vmcb->control.intercept_cr_write |= - nested_vmcb->control.intercept_cr_write; - svm->vmcb->control.intercept_dr_read |= - nested_vmcb->control.intercept_dr_read; - svm->vmcb->control.intercept_dr_write |= - nested_vmcb->control.intercept_dr_write; - svm->vmcb->control.intercept_exceptions |= - nested_vmcb->control.intercept_exceptions; - - svm->vmcb->control.intercept |= nested_vmcb->control.intercept; - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; /* cache intercepts */ @@ -1806,37 +1897,51 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->nested.intercept = nested_vmcb->control.intercept; force_new_asid(&svm->vcpu); - svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; - svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err; svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; - if (nested_vmcb->control.int_ctl & V_IRQ_MASK) { - nsvm_printk("nSVM Injecting Interrupt: 0x%x\n", - nested_vmcb->control.int_ctl); - } if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n", - nested_vmcb->control.exit_int_info, - nested_vmcb->control.int_state); + if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + /* We only want the cr8 intercept bits of the guest */ + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + } + /* We don't want a nested guest to be more powerful than the guest, + so all intercepts are ORed */ + svm->vmcb->control.intercept_cr_read |= + nested_vmcb->control.intercept_cr_read; + svm->vmcb->control.intercept_cr_write |= + nested_vmcb->control.intercept_cr_write; + svm->vmcb->control.intercept_dr_read |= + nested_vmcb->control.intercept_dr_read; + svm->vmcb->control.intercept_dr_write |= + nested_vmcb->control.intercept_dr_write; + svm->vmcb->control.intercept_exceptions |= + nested_vmcb->control.intercept_exceptions; + + svm->vmcb->control.intercept |= nested_vmcb->control.intercept; + + svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; - if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID) - nsvm_printk("Injecting Event: 0x%x\n", - nested_vmcb->control.event_inj); svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + nested_svm_unmap(page); + + /* nested_vmcb is our indicator if nested SVM is activated */ + svm->nested.vmcb = vmcb_gpa; + enable_gif(svm); - return 0; + return true; } -static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; @@ -1850,69 +1955,79 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; - - return 1; -} - -static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) -{ - return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); } -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) +static int vmload_interception(struct vcpu_svm *svm) { - return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); -} + struct vmcb *nested_vmcb; + struct page *page; -static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); + nested_svm_unmap(page); return 1; } -static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmsave_interception(struct vcpu_svm *svm) { + struct vmcb *nested_vmcb; + struct page *page; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); + nested_svm_unmap(page); return 1; } -static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmrun_interception(struct vcpu_svm *svm) { - nsvm_printk("VMrun\n"); if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - if (nested_svm_do(svm, svm->vmcb->save.rax, 0, - NULL, nested_svm_vmrun)) + if (!nested_svm_vmrun(svm)) return 1; - if (nested_svm_do(svm, svm->nested.vmcb_msrpm, 0, - NULL, nested_svm_vmrun_msrpm)) - return 1; + if (!nested_svm_vmrun_msrpm(svm)) + goto failed; + + return 1; + +failed: + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); return 1; } -static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int stgi_interception(struct vcpu_svm *svm) { if (nested_svm_check_permissions(svm)) return 1; @@ -1925,7 +2040,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int clgi_interception(struct vcpu_svm *svm) { if (nested_svm_check_permissions(svm)) return 1; @@ -1942,10 +2057,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - nsvm_printk("INVLPGA\n"); + + trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], + vcpu->arch.regs[VCPU_REGS_RAX]); /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); @@ -1955,15 +2072,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int invalid_op_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int skinit_interception(struct vcpu_svm *svm) +{ + trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); + + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + +static int invalid_op_interception(struct vcpu_svm *svm) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } -static int task_switch_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; int reason; @@ -2013,14 +2136,14 @@ static int task_switch_interception(struct vcpu_svm *svm, return kvm_task_switch(&svm->vcpu, tss_selector, reason); } -static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; kvm_emulate_cpuid(&svm->vcpu); return 1; } -static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); @@ -2028,26 +2151,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } -static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int invlpg_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } -static int emulate_on_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int emulate_on_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } -static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int cr8_write_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ - emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); + emulate_instruction(&svm->vcpu, 0, 0, 0); if (irqchip_in_kernel(svm->vcpu.kvm)) { svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; return 1; @@ -2064,10 +2188,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) switch (ecx) { case MSR_IA32_TSC: { - u64 tsc; + u64 tsc_offset; + + if (is_nested(svm)) + tsc_offset = svm->nested.hsave->control.tsc_offset; + else + tsc_offset = svm->vmcb->control.tsc_offset; - rdtscll(tsc); - *data = svm->vmcb->control.tsc_offset + tsc; + *data = tsc_offset + native_read_tsc(); break; } case MSR_K6_STAR: @@ -2129,14 +2257,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) return 0; } -static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int rdmsr_interception(struct vcpu_svm *svm) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2153,10 +2282,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) switch (ecx) { case MSR_IA32_TSC: { - u64 tsc; + u64 tsc_offset = data - native_read_tsc(); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = tsc_offset; + } + + svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - rdtscll(tsc); - svm->vmcb->control.tsc_offset = data - tsc; break; } case MSR_K6_STAR: @@ -2215,33 +2351,36 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) return 0; } -static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int wrmsr_interception(struct vcpu_svm *svm) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } -static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int msr_interception(struct vcpu_svm *svm) { if (svm->vmcb->control.exit_info_1) - return wrmsr_interception(svm, kvm_run); + return wrmsr_interception(svm); else - return rdmsr_interception(svm, kvm_run); + return rdmsr_interception(svm); } -static int interrupt_window_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int interrupt_window_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2259,13 +2398,18 @@ static int interrupt_window_interception(struct vcpu_svm *svm, return 1; } -static int (*svm_exit_handlers[])(struct vcpu_svm *svm, - struct kvm_run *kvm_run) = { +static int pause_interception(struct vcpu_svm *svm) +{ + kvm_vcpu_on_spin(&(svm->vcpu)); + return 1; +} + +static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, @@ -2274,11 +2418,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, @@ -2295,6 +2445,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, + [SVM_EXIT_PAUSE] = pause_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, @@ -2308,44 +2459,52 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, - [SVM_EXIT_SKINIT] = invalid_op_interception, + [SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, [SVM_EXIT_MWAIT] = invalid_op_interception, [SVM_EXIT_NPF] = pf_interception, }; -static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; trace_kvm_exit(exit_code, svm->vmcb->save.rip); + if (unlikely(svm->nested.exit_required)) { + nested_svm_vmexit(svm); + svm->nested.exit_required = false; + + return 1; + } + if (is_nested(svm)) { - nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", - exit_code, svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); - if (nested_svm_exit_handled(svm, true)) + int vmexit; + + trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, + svm->vmcb->control.exit_info_1, + svm->vmcb->control.exit_info_2, + svm->vmcb->control.exit_int_info, + svm->vmcb->control.exit_int_info_err); + + vmexit = nested_svm_exit_special(svm); + + if (vmexit == NESTED_EXIT_CONTINUE) + vmexit = nested_svm_exit_handled(svm); + + if (vmexit == NESTED_EXIT_DONE) return 1; } svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2369,15 +2528,15 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } - return svm_exit_handlers[exit_code](svm, kvm_run); + return svm_exit_handlers[exit_code](svm); } static void reload_tss(struct kvm_vcpu *vcpu) { int cpu = raw_smp_processor_id(); - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); } @@ -2385,12 +2544,12 @@ static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* FIXME: handle wraparound of asid_generation */ - if (svm->asid_generation != svm_data->asid_generation) - new_asid(svm, svm_data); + if (svm->asid_generation != sd->asid_generation) + new_asid(svm, sd); } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -2431,6 +2590,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + if (irr == -1) return; @@ -2446,28 +2608,53 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu) !(svm->vcpu.arch.hflags & HF_NMI_MASK); } +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); +} + +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (masked) { + svm->vcpu.arch.hflags |= HF_NMI_MASK; + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + } else { + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + } +} + static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - return (vmcb->save.rflags & X86_EFLAGS_IF) && - !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - gif_set(svm) && - !is_nested(svm); + int ret; + + if (!gif_set(svm) || + (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) + return 0; + + ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); + + if (is_nested(svm)) + return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); + + return ret; } static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - nsvm_printk("Trying to open IRQ window\n"); - - nested_svm_intr(svm); /* In case GIF=0 we can't rely on the CPU to tell us when * GIF becomes 1, because that's a separate STGI/VMRUN intercept. * The next time we get that intercept, this function will be * called again though and we'll get the vintr intercept. */ - if (gif_set(svm)) { + if (gif_set(svm) && nested_svm_intr(svm)) { svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } @@ -2484,7 +2671,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) /* Something prevents NMI from been injected. Single step over possible problem (IRET or exception injection or interrupt shadow) */ - vcpu->arch.singlestep = true; + svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_intercept(vcpu); } @@ -2507,6 +2694,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); @@ -2518,6 +2708,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; @@ -2528,6 +2721,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; + unsigned int3_injected = svm->int3_injected; + + svm->int3_injected = 0; if (svm->vcpu.arch.hflags & HF_IRET_MASK) svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); @@ -2547,12 +2743,21 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->vcpu.arch.nmi_injected = true; break; case SVM_EXITINTINFO_TYPE_EXEPT: - /* In case of software exception do not reinject an exception - vector, but re-execute and instruction instead */ if (is_nested(svm)) break; - if (kvm_exception_is_soft(vector)) + /* + * In case of software exceptions, do not reinject the vector, + * but re-execute the instruction instead. Rewind RIP first + * if we emulated INT3 before. + */ + if (kvm_exception_is_soft(vector)) { + if (vector == BP_VECTOR && int3_injected && + kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) + kvm_rip_write(&svm->vcpu, + kvm_rip_read(&svm->vcpu) - + int3_injected); break; + } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_queue_exception_e(&svm->vcpu, vector, err); @@ -2574,13 +2779,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) #define R "e" #endif -static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u16 fs_selector; u16 gs_selector; u16 ldt_selector; + /* + * A vmexit emulation is required before the vcpu can be executed + * again. + */ + if (unlikely(svm->nested.exit_required)) + return; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; @@ -2593,8 +2805,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fs_selector = kvm_read_fs(); gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); - if (!is_nested(svm)) - svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; @@ -2714,12 +2925,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2768,6 +2973,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_READ_CR0, "read_cr0" }, { SVM_EXIT_READ_CR3, "read_cr3" }, @@ -2821,9 +3030,24 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ + return false; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; + if (is_nested(svm)) + svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR; + update_cr0_intercept(svm); } static struct kvm_x86_ops svm_x86_ops = { @@ -2852,6 +3076,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -2866,6 +3091,8 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -2880,6 +3107,8 @@ static struct kvm_x86_ops svm_x86_ops = { .queue_exception = svm_queue_exception, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, + .get_nmi_mask = svm_get_nmi_mask, + .set_nmi_mask = svm_set_nmi_mask, .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, @@ -2889,7 +3118,11 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, }; static int __init svm_init(void)