X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fvmx.c;h=778f059ae423aea2c66f882141082059ad3e1218;hb=3cfc3092;hp=6610181267b13bb5e1ad3e44298b821dd6833d1c;hpb=e799794e02a368f79c3fae26aabaaadd0b7466ce;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6610181..778f059 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -34,6 +35,8 @@ #include #include +#include "trace.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -58,12 +61,36 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually small than 41 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; +module_param(ple_gap, int, S_IRUGO); + +static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, int, S_IRUGO); + struct vmcs { u32 revision_id; u32 abort; char data[0]; }; +struct shared_msr_entry { + unsigned index; + u64 data; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; @@ -71,13 +98,12 @@ struct vcpu_vmx { int launched; u8 fail; u32 idt_vectoring_info; - struct kvm_msr_entry *guest_msrs; - struct kvm_msr_entry *host_msrs; + struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; - int msr_offset_efer; #ifdef CONFIG_X86_64 - int msr_offset_kernel_gs_base; + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; #endif struct vmcs *vmcs; struct { @@ -85,7 +111,6 @@ struct vcpu_vmx { u16 fs_sel, gs_sel, ldt_sel; int gs_ldt_reload_needed; int fs_reload_needed; - int guest_efer_loaded; } host_state; struct { int vm86_active; @@ -104,7 +129,6 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; - enum emulation_result invalid_state_emulation_result; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; @@ -173,6 +197,8 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static u64 host_efer; + static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* @@ -181,28 +207,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu); */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static void load_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - wrmsrl(e[i].index, e[i].data); -} - -static void save_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - rdmsrl(e[i].index, e[i].data); -} - static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -317,6 +327,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void) SECONDARY_EXEC_UNRESTRICTED_GUEST; } +static inline int cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -345,7 +361,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) int i; for (i = 0; i < vmx->nmsrs; ++i) - if (vmx->guest_msrs[i].index == msr) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) return i; return -1; } @@ -376,7 +392,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) : : "a" (&operand), "c" (ext) : "cc", "memory"); } -static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -537,10 +553,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; + /* + * Unconditionally intercept #DB so we can maintain dr6 without + * reading it every exit. + */ + eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } @@ -565,15 +583,13 @@ static void reload_tss(void) load_TR_desc(); } -static void load_transition_efer(struct vcpu_vmx *vmx) +static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { - int efer_offset = vmx->msr_offset_efer; - u64 host_efer = vmx->host_msrs[efer_offset].data; - u64 guest_efer = vmx->guest_msrs[efer_offset].data; + u64 guest_efer; u64 ignore_bits; - if (efer_offset < 0) - return; + guest_efer = vmx->vcpu.arch.shadow_efer; + /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless * outside long mode @@ -586,26 +602,18 @@ static void load_transition_efer(struct vcpu_vmx *vmx) ignore_bits &= ~(u64)EFER_SCE; #endif if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return; + return false; - vmx->host_state.guest_efer_loaded = 1; guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; - wrmsrl(MSR_EFER, guest_efer); - vmx->vcpu.stat.efer_reload++; -} - -static void reload_host_efer(struct vcpu_vmx *vmx) -{ - if (vmx->host_state.guest_efer_loaded) { - vmx->host_state.guest_efer_loaded = 0; - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); - } + vmx->guest_msrs[efer_offset].data = guest_efer; + return true; } static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; if (vmx->host_state.loaded) return; @@ -642,13 +650,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - save_msrs(vmx->host_msrs + - vmx->msr_offset_kernel_gs_base, 1); - + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + } #endif - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_transition_efer(vmx); + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data); } static void __vmx_load_host_state(struct vcpu_vmx *vmx) @@ -676,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) local_irq_restore(flags); } reload_tss(); - save_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_msrs(vmx->host_msrs, vmx->save_nmsrs); - reload_host_efer(vmx); +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + } +#endif } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -701,7 +713,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); kvm_migrate_timers(vcpu); - vpid_sync_vcpu_all(vmx); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); @@ -775,7 +787,12 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - return vmcs_readl(GUEST_RFLAGS); + unsigned long rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) + rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -864,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, /* * Swap MSR entry in host/guest MSR entry array. */ -#ifdef CONFIG_X86_64 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) { - struct kvm_msr_entry tmp; + struct shared_msr_entry tmp; tmp = vmx->guest_msrs[to]; vmx->guest_msrs[to] = vmx->guest_msrs[from]; vmx->guest_msrs[from] = tmp; - tmp = vmx->host_msrs[to]; - vmx->host_msrs[to] = vmx->host_msrs[from]; - vmx->host_msrs[from] = tmp; } -#endif /* * Set up the vmcs to automatically save and restore system @@ -885,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs; + int save_nmsrs, index; unsigned long *msr_bitmap; vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { - int index; - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); @@ -903,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. @@ -915,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); } #endif - vmx->save_nmsrs = save_nmsrs; + index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx, index)) + move_msr_up(vmx, index, save_nmsrs++); -#ifdef CONFIG_X86_64 - vmx->msr_offset_kernel_gs_base = - __find_msr_index(vmx, MSR_KERNEL_GS_BASE); -#endif - vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); + vmx->save_nmsrs = save_nmsrs; if (cpu_has_vmx_msr_bitmap()) { if (is_long_mode(&vmx->vcpu)) @@ -963,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { u64 data; - struct kvm_msr_entry *msr; + struct shared_msr_entry *msr; if (!pdata) { printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); @@ -978,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_GS_BASE: data = vmcs_readl(GUEST_GS_BASE); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(to_vmx(vcpu)); + data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + break; +#endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); -#endif case MSR_IA32_TSC: data = guest_read_tsc(); break; @@ -997,6 +1006,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { + vmx_load_host_state(to_vmx(vcpu)); data = msr->data; break; } @@ -1015,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr; + struct shared_msr_entry *msr; u64 host_tsc; int ret = 0; @@ -1031,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(vmx); + vmx->msr_guest_kernel_gs_base = data; + break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); @@ -1053,9 +1067,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } /* Otherwise falls through to kvm_set_msr_common */ default: - vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { + vmx_load_host_state(vmx); msr->data = data; break; } @@ -1084,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int old_debug = vcpu->guest_debug; - unsigned long flags; - - vcpu->guest_debug = dbg->control; - if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) - vcpu->guest_debug = 0; - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); else vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - flags = vmcs_readl(GUEST_RFLAGS); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - else if (old_debug & KVM_GUESTDBG_SINGLESTEP) - flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - update_exception_bitmap(vcpu); - - return 0; } static __init int cpu_has_kvm_support(void) @@ -1126,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void) /* locked but not enabled */ } -static void hardware_enable(void *garbage) +static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old; + if (read_cr4() & X86_CR4_VMXE) + return -EBUSY; + INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); if ((old & (FEATURE_CONTROL_LOCKED | @@ -1146,6 +1147,10 @@ static void hardware_enable(void *garbage) asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); + + ept_sync_global(); + + return 0; } static void vmclear_local_vcpus(void) @@ -1237,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1251,12 +1257,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ - min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; + _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid); } @@ -1334,15 +1337,17 @@ static void free_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) { free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; + } } static __init int alloc_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct vmcs *vmcs; vmcs = alloc_vmcs_cpu(cpu); @@ -1381,6 +1386,12 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + + if (!cpu_has_vmx_ple()) + ple_gap = 0; + return alloc_kvm_area(); } @@ -1523,8 +1534,16 @@ continue_rmode: static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + if (!msr) + return; + /* + * Force kernel_gs_base reloading before EFER changes, as control + * of this msr depends on is_long_mode(). + */ + vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.shadow_efer = efer; if (!msr) return; @@ -1628,7 +1647,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - *hw_cr0 &= ~X86_CR0_WP; } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1637,9 +1655,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - if (!(vcpu->arch.cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; } + + if (!(cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; } static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, @@ -1713,7 +1732,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : - VMX_EPT_IDENTITY_PAGETABLE_ADDR; + vcpu->kvm->arch.ept_identity_map_addr; + ept_load_pdptrs(vcpu); } vmx_flush_tlb(vcpu); @@ -1767,16 +1787,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - struct kvm_segment kvm_seg; - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ return 3; - vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); - return kvm_seg.selector & 3; + return vmcs_read16(GUEST_CS_SELECTOR) & 3; } static u32 vmx_segment_access_rights(struct kvm_segment *var) @@ -2116,7 +2133,7 @@ static int init_rmode_identity_map(struct kvm *kvm) if (likely(kvm->arch.ept_identity_pagetable_done)) return 1; ret = 0; - identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2185,14 +2202,15 @@ static int alloc_identity_pagetable(struct kvm *kvm) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); if (r) goto out; kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); + kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: up_write(&kvm->slots_lock); return r; @@ -2295,9 +2313,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (!ple_gap) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmcs_write32(PLE_WINDOW, ple_window); + } + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -2365,10 +2390,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (wrmsr_safe(index, data_low, data_high) < 0) continue; data = data_low | ((u64)data_high << 32); - vmx->host_msrs[j].index = index; - vmx->host_msrs[j].reserved = 0; - vmx->host_msrs[j].data = data; - vmx->guest_msrs[j] = vmx->host_msrs[j]; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; ++vmx->nmsrs; } @@ -2499,7 +2522,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = 0x60000010; + vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); @@ -2547,7 +2570,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) uint32_t intr; int irq = vcpu->arch.interrupt.nr; - KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -2616,6 +2639,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + if (!cpu_has_virtual_nmis()) + return to_vmx(vcpu)->soft_vnmi_blocked; + else + return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + GUEST_INTR_STATE_NMI); +} + +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + if (vmx->soft_vnmi_blocked != masked) { + vmx->soft_vnmi_blocked = masked; + vmx->vnmi_blocked_time = 0; + } + } else { + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && @@ -2648,7 +2699,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) return 1; /* * Forward all other exceptions that are valid in real mode. @@ -2699,15 +2750,16 @@ static void kvm_machine_check(void) #endif } -static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_machine_check(struct kvm_vcpu *vcpu) { /* already handled by vcpu_run */ return 1; } -static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; unsigned long cr2, rip, dr6; u32 vect_info; @@ -2717,12 +2769,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (is_machine_check(intr_info)) - return handle_machine_check(vcpu, kvm_run); + return handle_machine_check(vcpu); if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) - printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " - "intr info 0x%x\n", __func__, vect_info, intr_info); + !is_page_fault(intr_info)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -2733,7 +2790,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -2748,8 +2805,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); - KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, - (u32)((u64)cr2 >> 32), handler); + trace_kvm_page_fault(cr2, error_code); + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); @@ -2792,21 +2849,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; - KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); return 1; } -static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_triple_fault(struct kvm_vcpu *vcpu) { - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int size, in, string; @@ -2817,8 +2872,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) string = (exit_qualification & 16) != 0; if (string) { - if (emulate_instruction(vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -2828,7 +2882,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) port = exit_qualification >> 16; skip_emulated_instruction(vcpu); - return kvm_emulate_pio(vcpu, kvm_run, in, size, port); + return kvm_emulate_pio(vcpu, in, size, port); } static void @@ -2842,9 +2896,9 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_cr(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification, val; int cr; int reg; @@ -2853,21 +2907,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + val = kvm_register_read(vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr0(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 3: - kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr3(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr4(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 8: { @@ -2879,7 +2931,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; if (cr8_prev <= cr8) return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } }; @@ -2889,23 +2941,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); - KVMTRACE_0D(CLTS, vcpu, handler); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: kvm_register_write(vcpu, reg, vcpu->arch.cr3); - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + trace_kvm_cr_read(cr, vcpu->arch.cr3); skip_emulated_instruction(vcpu); return 1; case 8: - kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); - KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), handler); + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; } @@ -2918,18 +2966,20 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: break; } - kvm_run->exit_reason = 0; + vcpu->run->exit_reason = 0; pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } -static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; int dr, reg; + if (!kvm_require_cpl(vcpu, 0)) + return 1; dr = vmcs_readl(GUEST_DR7); if (dr & DR7_GD) { /* @@ -2938,13 +2988,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - kvm_run->debug.arch.dr6 = vcpu->arch.dr6; - kvm_run->debug.arch.dr7 = dr; - kvm_run->debug.arch.pc = + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr; + vcpu->run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + vmcs_readl(GUEST_RIP); - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; @@ -2973,7 +3023,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) val = 0; } kvm_register_write(vcpu, reg, val); - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { val = vcpu->arch.regs[reg]; switch (dr) { @@ -3006,19 +3055,18 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } break; } - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; } -static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_cpuid(struct kvm_vcpu *vcpu) { kvm_emulate_cpuid(vcpu); return 1; } -static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data; @@ -3028,8 +3076,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_read(ecx, data); /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; @@ -3038,14 +3085,13 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); if (vmx_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); @@ -3056,14 +3102,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { return 1; } -static int handle_interrupt_window(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_interrupt_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3072,7 +3116,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - KVMTRACE_0D(PEND_INTR, vcpu, handler); ++vcpu->stat.irq_window_exits; /* @@ -3080,34 +3123,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * possible */ if (!irqchip_in_kernel(vcpu->kvm) && - kvm_run->request_interrupt_window && + vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } return 1; } -static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_halt(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); return kvm_emulate_halt(vcpu); } -static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmcall(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); kvm_emulate_hypercall(vcpu); return 1; } -static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmx_insn(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3116,14 +3159,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); /* TODO: Add support for VT-d/pass-through device */ return 1; } -static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_apic_access(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; enum emulation_result er; @@ -3132,18 +3175,18 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + er = emulate_instruction(vcpu, 0, 0, 0); if (er != EMULATE_DONE) { printk(KERN_ERR "Fail to handle apic access vmexit! Offset is 0x%lx\n", offset); - return -ENOTSUPP; + return -ENOEXEC; } return 1; } -static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; @@ -3197,7 +3240,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; @@ -3207,7 +3250,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (exit_qualification & (1 << 6)) { printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -ENOTSUPP; + return -EINVAL; } gla_validity = (exit_qualification >> 7) & 0x3; @@ -3218,16 +3261,100 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vmcs_readl(GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; return 0; } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(gpa, exit_qualification); return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } -static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static u64 ept_rsvd_mask(u64 spte, int level) +{ + int i; + u64 mask = 0; + + for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) + mask |= (1ULL << i); + + if (level > 2) + /* bits 7:3 reserved */ + mask |= 0xf8; + else if (level == 2) { + if (spte & (1ULL << 7)) + /* 2MB ref, bits 20:12 reserved */ + mask |= 0x1ff000; + else + /* bits 6:3 reserved */ + mask |= 0x78; + } + + return mask; +} + +static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, + int level) +{ + printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); + + /* 010b (write-only) */ + WARN_ON((spte & 0x7) == 0x2); + + /* 110b (write/execute) */ + WARN_ON((spte & 0x7) == 0x6); + + /* 100b (execute-only) and value not supported by logical processor */ + if (!cpu_has_vmx_ept_execute_only()) + WARN_ON((spte & 0x7) == 0x4); + + /* not 000b */ + if ((spte & 0x7)) { + u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); + + if (rsvd_bits != 0) { + printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", + __func__, rsvd_bits); + WARN_ON(1); + } + + if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + u64 ept_mem_type = (spte & 0x38) >> 3; + + if (ept_mem_type == 2 || ept_mem_type == 3 || + ept_mem_type == 7) { + printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", + __func__, ept_mem_type); + WARN_ON(1); + } + } + } +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) +{ + u64 sptes[4]; + int nr_sptes, i; + gpa_t gpa; + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + + printk(KERN_ERR "EPT: Misconfiguration.\n"); + printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); + + nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); + + for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) + ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + + return 0; +} + +static int handle_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3240,36 +3367,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; - - local_irq_enable(); - preempt_enable(); + int ret = 1; while (!guest_state_valid(vcpu)) { - err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + err = emulate_instruction(vcpu, 0, 0, 0); - if (err == EMULATE_DO_MMIO) - break; + if (err == EMULATE_DO_MMIO) { + ret = 0; + goto out; + } if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, "emulation failure"); - break; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + ret = 0; + goto out; } if (signal_pending(current)) - break; + goto out; if (need_resched()) schedule(); } - preempt_disable(); - local_irq_disable(); + vmx->emulation_required = 0; +out: + return ret; +} + +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); - vmx->invalid_state_emulation_result = err; + return 1; } /* @@ -3277,8 +3418,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) = { +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, @@ -3306,8 +3446,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, }; static const int kvm_vmx_max_exit_handlers = @@ -3317,22 +3459,17 @@ static const int kvm_vmx_max_exit_handlers = * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), - (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); - /* If we need to emulate an MMIO from handle_invalid_guest_state - * we just return 0 */ - if (vmx->emulation_required && emulate_invalid_guest_state) { - if (guest_state_valid(vcpu)) - vmx->emulation_required = 0; - return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO; - } + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return handle_invalid_guest_state(vcpu); /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ @@ -3340,8 +3477,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); if (unlikely(vmx->fail)) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); return 0; } @@ -3374,10 +3511,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); + return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_reason; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = exit_reason; } return 0; } @@ -3413,10 +3550,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, &vmx->vcpu, handler); + (exit_intr_info & INTR_INFO_VALID_MASK)) asm("int $2"); - } idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3517,23 +3652,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (enable_ept && is_paging(vcpu)) { - vmcs_writel(GUEST_CR3, vcpu->arch.cr3); - ept_load_pdptrs(vcpu); - } /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); - /* Handle invalid guest state instead of entering VMX */ - if (vmx->emulation_required && emulate_invalid_guest_state) { - handle_invalid_guest_state(vcpu, kvm_run); + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required && emulate_invalid_guest_state) return; - } if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); @@ -3553,7 +3683,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); - set_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + set_debugreg(vcpu->arch.dr6, 6); asm( /* Store host registers */ @@ -3564,11 +3695,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%"R"sp, %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%cr2, %%"R"dx \n\t" + "cmp %%"R"ax, %%"R"dx \n\t" + "je 2f \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%"R"ax, %%cr2 \n\t" "mov %c[rax](%0), %%"R"ax \n\t" "mov %c[rbx](%0), %%"R"bx \n\t" "mov %c[rdx](%0), %%"R"dx \n\t" @@ -3650,7 +3786,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - get_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + get_debugreg(vcpu->arch.dr6, 6); vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) @@ -3685,7 +3822,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) __clear_bit(vmx->vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); - kfree(vmx->host_msrs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -3712,10 +3848,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit_vcpu; } - vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vmx->host_msrs) - goto free_guest_msrs; - vmx->vmcs = alloc_vmcs(); if (!vmx->vmcs) goto free_msrs; @@ -3733,17 +3865,19 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (enable_ept) + if (enable_ept) { + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = + VMX_EPT_IDENTITY_PAGETABLE_ADDR; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + } return &vmx->vcpu; free_vmcs: free_vmcs(vmx->vmcs); free_msrs: - kfree(vmx->host_msrs); -free_guest_msrs: kfree(vmx->guest_msrs); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); @@ -3799,6 +3933,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } +static const struct trace_print_flags vmx_exit_reasons_str[] = { + { EXIT_REASON_EXCEPTION_NMI, "exception" }, + { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, + { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, + { EXIT_REASON_NMI_WINDOW, "nmi_window" }, + { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, + { EXIT_REASON_CR_ACCESS, "cr_access" }, + { EXIT_REASON_DR_ACCESS, "dr_access" }, + { EXIT_REASON_CPUID, "cpuid" }, + { EXIT_REASON_MSR_READ, "rdmsr" }, + { EXIT_REASON_MSR_WRITE, "wrmsr" }, + { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, + { EXIT_REASON_HLT, "halt" }, + { EXIT_REASON_INVLPG, "invlpg" }, + { EXIT_REASON_VMCALL, "hypercall" }, + { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, + { EXIT_REASON_APIC_ACCESS, "apic_access" }, + { EXIT_REASON_WBINVD, "wbinvd" }, + { EXIT_REASON_TASK_SWITCH, "task_switch" }, + { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + { -1, NULL } +}; + +static bool vmx_gb_page_enable(void) +{ + return false; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3851,6 +4013,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .queue_exception = vmx_queue_exception, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, @@ -3858,11 +4022,19 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, + + .exit_reasons_str = vmx_exit_reasons_str, + .gb_page_enable = vmx_gb_page_enable, }; static int __init vmx_init(void) { - int r; + int r, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < NR_VMX_MSR; ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) @@ -3924,8 +4096,6 @@ static int __init vmx_init(void) if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - ept_sync_global(); - return 0; out3: