X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fvmx.c;h=07491c9c6ed03c1306b4bf5f7dfe04db9e830441;hb=2aaf69dcee864f4fb6402638dd2f263324ac839f;hp=61c2a3a8d20a9635da6c0d2d48705dba84ac1156;hpb=5e4a0b3c1b899bb0ba28bde6edf95c5ddeb48b5c;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 61c2a3a..07491c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -16,8 +16,6 @@ */ #include "irq.h" -#include "vmx.h" -#include "segment_descriptor.h" #include "mmu.h" #include @@ -27,9 +25,15 @@ #include #include #include +#include "kvm_cache_regs.h" +#include "x86.h" #include #include +#include +#include + +#define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -37,6 +41,18 @@ MODULE_LICENSE("GPL"); static int bypass_guest_pf = 1; module_param(bypass_guest_pf, bool, 0); +static int enable_vpid = 1; +module_param(enable_vpid, bool, 0); + +static int flexpriority_enabled = 1; +module_param(flexpriority_enabled, bool, 0); + +static int enable_ept = 1; +module_param(enable_ept, bool, 0); + +static int emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -45,6 +61,8 @@ struct vmcs { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct list_head local_vcpus_link; + unsigned long host_rsp; int launched; u8 fail; u32 idt_vectoring_info; @@ -71,6 +89,13 @@ struct vcpu_vmx { unsigned rip; } irq; } rmode; + int vpid; + bool emulation_required; + + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -78,13 +103,19 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode_tss(struct kvm *kvm); +static int init_rmode(struct kvm *kvm); +static u64 construct_eptp(unsigned long root_hpa); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; +static struct page *vmx_msr_bitmap; + +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); +static DEFINE_SPINLOCK(vmx_vpid_lock); static struct vmcs_config { int size; @@ -97,6 +128,11 @@ static struct vmcs_config { u32 vmentry_ctrl; } vmcs_config; +static struct vmx_capability { + u32 ept; + u32 vpid; +} vmx_capability; + #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ .selector = GUEST_##seg##_SELECTOR, \ @@ -176,6 +212,11 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static inline int cpu_has_vmx_msr_bitmap(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); +} + static inline int cpu_has_vmx_tpr_shadow(void) { return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); @@ -194,8 +235,35 @@ static inline int cpu_has_secondary_exec_ctrls(void) static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { + return flexpriority_enabled + && (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); +} + +static inline int cpu_has_vmx_invept_individual_addr(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); +} + +static inline int cpu_has_vmx_invept_context(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); +} + +static inline int cpu_has_vmx_invept_global(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); +} + +static inline int cpu_has_vmx_ept(void) +{ return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + SECONDARY_EXEC_ENABLE_EPT); +} + +static inline int vm_need_ept(void) +{ + return (cpu_has_vmx_ept() && enable_ept); } static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) @@ -204,6 +272,17 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) (irqchip_in_kernel(kvm))); } +static inline int cpu_has_vmx_vpid(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID); +} + +static inline int cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -214,6 +293,32 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } +static inline void __invvpid(int ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + + asm volatile (__ex(ASM_VMX_INVVPID) + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:" + : : "a"(&operand), "c"(ext) : "cc", "memory"); +} + +static inline void __invept(int ext, u64 eptp, gpa_t gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + + asm volatile (__ex(ASM_VMX_INVEPT) + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:\n" + : : "a" (&operand), "c" (ext) : "cc", "memory"); +} + static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -229,7 +334,7 @@ static void vmcs_clear(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); u8 error; - asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory"); if (error) @@ -247,21 +352,58 @@ static void __vcpu_clear(void *arg) if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; rdtscll(vmx->vcpu.arch.host_tsc); + list_del(&vmx->local_vcpus_link); + vmx->vcpu.cpu = -1; + vmx->launched = 0; } static void vcpu_clear(struct vcpu_vmx *vmx) { if (vmx->vcpu.cpu == -1) return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); - vmx->launched = 0; + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); +} + +static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +{ + if (vmx->vpid == 0) + return; + + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +} + +static inline void ept_sync_global(void) +{ + if (cpu_has_vmx_invept_global()) + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +} + +static inline void ept_sync_context(u64 eptp) +{ + if (vm_need_ept()) { + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); + } +} + +static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) +{ + if (vm_need_ept()) { + if (cpu_has_vmx_invept_individual_addr()) + __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, + eptp, gpa); + else + ept_sync_context(eptp); + } } static unsigned long vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (ASM_VMX_VMREAD_RDX_RAX + asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) : "=a"(value) : "d"(field) : "cc"); return value; } @@ -296,7 +438,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) { u8 error; - asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" : "=q"(error) : "a"(value), "d"(field) : "cc"); if (unlikely(error)) vmwrite_error(field, value); @@ -314,10 +456,8 @@ static void vmcs_write32(unsigned long field, u32 value) static void vmcs_write64(unsigned long field, u64 value) { -#ifdef CONFIG_X86_64 - vmcs_writel(field, value); -#else vmcs_writel(field, value); +#ifndef CONFIG_X86_64 asm volatile (""); vmcs_writel(field+1, value >> 32); #endif @@ -341,27 +481,26 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) - eb |= 1u << 1; + eb |= 1u << DB_VECTOR; if (vcpu->arch.rmode.active) eb = ~0; + if (vm_need_ept()) + eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ vmcs_write32(EXCEPTION_BITMAP, eb); } static void reload_tss(void) { -#ifndef CONFIG_X86_64 - /* * VT restores TR but not its size. Useless. */ struct descriptor_table gdt; - struct segment_descriptor *descs; + struct desc_struct *descs; - get_gdt(&gdt); + kvm_get_gdt(&gdt); descs = (void *)gdt.base; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); -#endif } static void load_transition_efer(struct vcpu_vmx *vmx) @@ -414,9 +553,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - vmx->host_state.ldt_sel = read_ldt(); + vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = read_fs(); + vmx->host_state.fs_sel = kvm_read_fs(); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -424,7 +563,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = read_gs(); + vmx->host_state.gs_sel = kvm_read_gs(); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -450,7 +589,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) load_transition_efer(vmx); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void __vmx_load_host_state(struct vcpu_vmx *vmx) { unsigned long flags; @@ -460,15 +599,15 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - load_fs(vmx->host_state.fs_sel); + kvm_load_fs(vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { - load_ldt(vmx->host_state.ldt_sel); + kvm_load_ldt(vmx->host_state.ldt_sel); /* * If we have to reload gs, we must take care to * preserve our gs base. */ local_irq_save(flags); - load_gs(vmx->host_state.gs_sel); + kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif @@ -480,6 +619,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_host_efer(vmx); } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + preempt_disable(); + __vmx_load_host_state(vmx); + preempt_enable(); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -488,18 +634,23 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 phys_addr = __pa(vmx->vmcs); - u64 tsc_this, delta; + u64 tsc_this, delta, new_offset; if (vcpu->cpu != cpu) { vcpu_clear(vmx); - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); + vpid_sync_vcpu_all(vmx); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { u8 error; per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc"); if (error) @@ -516,8 +667,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Linux uses per-cpu TSS and GDT, so set these when switching * processors. */ - vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ - get_gdt(&dt); + vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ + kvm_get_gdt(&dt); vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); @@ -527,14 +678,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Make sure the time stamp counter is monotonous. */ rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; - vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); + if (tsc_this < vcpu->arch.host_tsc) { + delta = vcpu->arch.host_tsc - tsc_this; + new_offset = vmcs_read64(TSC_OFFSET) + delta; + vmcs_write64(TSC_OFFSET, new_offset); + } } } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_load_host_state(to_vmx(vcpu)); + __vmx_load_host_state(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -557,11 +711,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); } -static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) -{ - vcpu_clear(to_vmx(vcpu)); -} - static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { return vmcs_readl(GUEST_RFLAGS); @@ -579,9 +728,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) unsigned long rip; u32 interruptibility; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs_writel(GUEST_RIP, rip); + kvm_rip_write(vcpu, rip); /* * We emulated an instruction, so temporary interrupt blocking @@ -597,19 +746,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = nr; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (nr == BP_VECTOR) + vmx->rmode.irq.rip++; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + nr | INTR_TYPE_SOFT_INTR + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); } static bool vmx_exception_injected(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + return false; } /* @@ -638,6 +803,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs; + vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { @@ -763,11 +929,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: + vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); - load_transition_efer(vmx); - } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -788,12 +951,30 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_TIME_STAMP_COUNTER: guest_write_tsc(data); break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + /* + * Just discard all writes to the performance counters; this + * should keep both older linux and windows 64-bit guests + * happy + */ + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); + + break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + /* Otherwise falls through to kvm_set_msr_common */ default: + vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (vmx->host_state.loaded) - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); @@ -802,24 +983,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) return ret; } -/* - * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. - */ -static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); -} - -/* - * Syncs rsp and rip back into the vmcs. Should be called after possible - * modification. - */ -static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + default: + break; + } } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -862,23 +1038,14 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field; - - idtv_info_field = vmx->idt_vectoring_info; - if (idtv_info_field & INTR_INFO_VALID_MASK) { - if (is_external_interrupt(idtv_info_field)) - return idtv_info_field & VECTORING_INFO_VECTOR_MASK; - else - printk(KERN_DEBUG "pending exception: not handled yet\n"); - } - return -1; + if (!vcpu->arch.interrupt.pending) + return -1; + return vcpu->arch.interrupt.nr; } static __init int cpu_has_kvm_support(void) { - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ + return cpu_has_vmx(); } static __init int vmx_disabled_by_bios(void) @@ -886,9 +1053,9 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - == MSR_IA32_FEATURE_CONTROL_LOCKED; + return (msr & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + == FEATURE_CONTROL_LOCKED; /* locked but not enabled */ } @@ -898,23 +1065,46 @@ static void hardware_enable(void *garbage) u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old; + INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - != (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + if ((old & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + != (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); } +static void vmclear_local_vcpus(void) +{ + int cpu = raw_smp_processor_id(); + struct vcpu_vmx *vmx, *n; + + list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), + local_vcpus_link) + __vcpu_clear(vmx); +} + + +/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() + * tricks. + */ +static void kvm_cpu_vmxoff(void) +{ + asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); +} + static void hardware_disable(void *garbage) { - asm volatile (ASM_VMX_VMXOFF : : : "cc"); + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -939,7 +1129,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) { u32 vmx_msr_low, vmx_msr_high; - u32 min, opt; + u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; @@ -947,7 +1137,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmentry_control = 0; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = 0; + opt = PIN_BASED_VIRTUAL_NMIS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -957,10 +1147,14 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING; + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | + CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) @@ -971,10 +1165,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) ~CPU_BASED_CR8_STORE_EXITING; #endif if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min = 0; - opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, + min2 = 0; + opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_VPID | + SECONDARY_EXEC_ENABLE_EPT; + if (adjust_vmx_controls(min2, opt2, + MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; } @@ -983,17 +1180,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { + /* CR3 accesses and invlpg don't need to cause VM Exits when EPT + enabled */ + min &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; + rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, + vmx_capability.ept, vmx_capability.vpid); + } min = 0; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = 0; + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; - min = opt = 0; + min = 0; + opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -1082,6 +1292,10 @@ static __init int hardware_setup(void) { if (setup_vmcs_config(&vmcs_config) < 0) return -EIO; + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + return alloc_kvm_area(); } @@ -1109,7 +1323,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); @@ -1126,6 +1342,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + return; + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); @@ -1166,7 +1385,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); @@ -1188,6 +1409,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + goto continue_rmode; + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); vmcs_write32(GUEST_SS_LIMIT, 0xffff); vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); @@ -1203,8 +1427,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); +continue_rmode: kvm_mmu_reset_context(vcpu); - init_rmode_tss(vcpu->kvm); + init_rmode(vcpu->kvm); } #ifdef CONFIG_X86_64 @@ -1216,7 +1441,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { printk(KERN_DEBUG "%s: tss fixup for long mode. \n", - __FUNCTION__); + __func__); vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); @@ -1241,14 +1466,77 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vpid_sync_vcpu_all(to_vmx(vcpu)); + if (vm_need_ept()) + ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } +static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +{ + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { + printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); + return; + } + vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); + } +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); + +static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, + unsigned long cr0, + struct kvm_vcpu *vcpu) +{ + if (!(cr0 & X86_CR0_PG)) { + /* From paging/starting to nonpaging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | + (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, vcpu->arch.cr4); + *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; + *hw_cr0 &= ~X86_CR0_WP; + } else if (!is_paging(vcpu)) { + /* From nonpaging to paging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, vcpu->arch.cr4); + if (!(vcpu->arch.cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; + } +} + +static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, + struct kvm_vcpu *vcpu) +{ + if (!is_paging(vcpu)) { + *hw_cr4 &= ~X86_CR4_PAE; + *hw_cr4 |= X86_CR4_PSE; + } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) + *hw_cr4 &= ~X86_CR4_PAE; +} + static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | + KVM_VM_CR0_ALWAYS_ON; + vmx_fpu_deactivate(vcpu); if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) @@ -1266,31 +1554,62 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif + if (vm_need_ept()) + ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) vmx_fpu_activate(vcpu); } +static u64 construct_eptp(unsigned long root_hpa) +{ + u64 eptp; + + /* TODO write the value reading from MSR */ + eptp = VMX_EPT_DEFAULT_MT | + VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + eptp |= (root_hpa & PAGE_MASK); + + return eptp; +} + static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - vmcs_writel(GUEST_CR3, cr3); + unsigned long guest_cr3; + u64 eptp; + + guest_cr3 = cr3; + if (vm_need_ept()) { + eptp = construct_eptp(cr3); + vmcs_write64(EPT_POINTER, eptp); + ept_sync_context(eptp); + ept_load_pdptrs(vcpu); + guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : + VMX_EPT_IDENTITY_PAGETABLE_ADDR; + } + + vmx_flush_tlb(vcpu); + vmcs_writel(GUEST_CR3, guest_cr3); if (vcpu->arch.cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); + unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + vcpu->arch.cr4 = cr4; -} + if (vm_need_ept()) + ept_update_paging_mode_cr4(&hw_cr4, vcpu); -#ifdef CONFIG_X86_64 + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, hw_cr4); +} static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { @@ -1298,6 +1617,8 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); vcpu->arch.shadow_efer = efer; + if (!msr) + return; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1314,8 +1635,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) setup_msrs(vmx); } -#endif - static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1346,6 +1665,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + return 0; + + if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + return 3; + + vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); + return kvm_seg.selector & 3; +} + static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -1428,6 +1761,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SELECTOR_RPL_MASK; + + if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.dpl > cs_rpl) + return false; + } else if (cs.type & AR_TYPE_CODE_MASK) { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SELECTOR_RPL_MASK; + + if ((ss.type != 3) || (ss.type != 7)) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SELECTOR_RPL_MASK; + + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SELECTOR_RPL_MASK) == + (ss.selector & SELECTOR_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +static bool guest_state_valid(struct kvm_vcpu *vcpu) +{ + /* real mode guest state checks */ + if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; @@ -1435,12 +1948,12 @@ static int init_rmode_tss(struct kvm *kvm) int ret = 0; int r; - down_read(¤t->mm->mmap_sem); r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(u16)); if (r < 0) goto out; r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); @@ -1458,7 +1971,41 @@ static int init_rmode_tss(struct kvm *kvm) ret = 1; out: - up_read(¤t->mm->mmap_sem); + return ret; +} + +static int init_rmode_identity_map(struct kvm *kvm) +{ + int i, r, ret; + pfn_t identity_map_pfn; + u32 tmp; + + if (!vm_need_ept()) + return 1; + if (unlikely(!kvm->arch.ept_identity_pagetable)) { + printk(KERN_ERR "EPT: identity-mapping pagetable " + "haven't been allocated!\n"); + return 0; + } + if (likely(kvm->arch.ept_identity_pagetable_done)) + return 1; + ret = 0; + identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); + if (r < 0) + goto out; + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + r = kvm_write_guest_page(kvm, identity_map_pfn, + &tmp, i * sizeof(tmp), sizeof(tmp)); + if (r < 0) + goto out; + } + kvm->arch.ept_identity_pagetable_done = true; + ret = 1; +out: return ret; } @@ -1469,7 +2016,7 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0x93); + vmcs_write32(sf->ar_bytes, 0xf3); } static int alloc_apic_access_page(struct kvm *kvm) @@ -1488,31 +2035,96 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - down_read(¤t->mm->mmap_sem); kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); - up_read(¤t->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; } -/* - * Sets up the vmcs for emulated real mode. - */ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) +static int alloc_identity_pagetable(struct kvm *kvm) { - u32 host_sysenter_cs; - u32 junk; - unsigned long a; - struct descriptor_table dt; - int i; - unsigned long kvm_vmx_return; + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; + + down_write(&kvm->slots_lock); + if (kvm->arch.ept_identity_pagetable) + goto out; + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.memory_size = PAGE_SIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + + kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, + VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); +out: + up_write(&kvm->slots_lock); + return r; +} + +static void allocate_vpid(struct vcpu_vmx *vmx) +{ + int vpid; + + vmx->vpid = 0; + if (!enable_vpid || !cpu_has_vmx_vpid()) + return; + spin_lock(&vmx_vpid_lock); + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) { + vmx->vpid = vpid; + __set_bit(vpid, vmx_vpid_bitmap); + } + spin_unlock(&vmx_vpid_lock); +} + +static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) +{ + void *va; + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + va = kmap(msr_bitmap); + if (msr <= 0x1fff) { + __clear_bit(msr, va + 0x000); /* read-low */ + __clear_bit(msr, va + 0x800); /* write-low */ + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + __clear_bit(msr, va + 0x400); /* read-high */ + __clear_bit(msr, va + 0xc00); /* write-high */ + } + kunmap(msr_bitmap); +} + +/* + * Sets up the vmcs for emulated real mode. + */ +static int vmx_vcpu_setup(struct vcpu_vmx *vmx) +{ + u32 host_sysenter_cs, msr_low, msr_high; + u32 junk; + u64 host_pat; + unsigned long a; + struct descriptor_table dt; + int i; + unsigned long kvm_vmx_return; u32 exec_control; /* I/O */ vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ @@ -1527,6 +2139,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING; #endif } + if (!vm_need_ept()) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); if (cpu_has_secondary_exec_ctrls()) { @@ -1534,6 +2150,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!vm_need_ept()) + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -1548,8 +2168,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); @@ -1563,7 +2183,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - get_idt(&dt); + kvm_get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); @@ -1579,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_SYSENTER_EIP, a); vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + vmcs_write64(HOST_IA32_PAT, host_pat); + } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + /* Write the default value follow host pat */ + vmcs_write64(GUEST_IA32_PAT, host_pat); + /* Keep arch.pat sync with GUEST_IA32_PAT */ + vmx->vcpu.arch.pat = host_pat; + } + for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; @@ -1609,21 +2243,34 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } +static int init_rmode(struct kvm *kvm) +{ + if (!init_rmode_tss(kvm)) + return 0; + if (!init_rmode_identity_map(kvm)) + return 0; + return 1; +} + static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; int ret; - if (!init_rmode_tss(vmx->vcpu.kvm)) { + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + down_read(&vcpu->kvm->slots_lock); + if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; } vmx->vcpu.arch.rmode.active = 0; + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - set_cr8(&vmx->vcpu, 0); + kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (vmx->vcpu.vcpu_id == 0) msr |= MSR_IA32_APICBASE_BSP; @@ -1631,6 +2278,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) fx_init(&vmx->vcpu); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. @@ -1642,8 +2290,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); @@ -1667,10 +2313,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RFLAGS, 0x02); if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); + kvm_rip_write(vcpu, 0xfff0); else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); + kvm_rip_write(vcpu, 0); + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); @@ -1706,39 +2352,122 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx->vcpu.arch.cr0 = 0x60000010; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 vmx_set_efer(&vmx->vcpu, 0); -#endif vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); - return 0; + vpid_sync_vcpu_all(vmx); + + ret = 0; + + /* HACK: Don't enable emulation on guest boot/reset */ + vmx->emulation_required = 0; out: + up_read(&vcpu->kvm->slots_lock); return ret; } +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + if (!cpu_has_virtual_nmis()) { + enable_irq_window(vcpu); + return; + } + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { struct vcpu_vmx *vmx = to_vmx(vcpu); + KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + + ++vcpu->stat.irq_injections; if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); + vmx->rmode.irq.rip = kvm_rip_read(vcpu); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } + + ++vcpu->stat.nmi_injections; + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = NMI_VECTOR; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + NMI_VECTOR | INTR_TYPE_SOFT_INTR | + INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); +} + +static void vmx_update_window_states(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + + vcpu->arch.nmi_window_open = + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_NMI)); + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) + vcpu->arch.nmi_window_open = 0; + + vcpu->arch.interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS))); +} + static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { int word_index = __ffs(vcpu->arch.irq_summary); @@ -1748,44 +2477,52 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); if (!vcpu->arch.irq_pending[word_index]) clear_bit(word_index, &vcpu->arch.irq_summary); - vmx_inject_irq(vcpu, irq); + kvm_queue_interrupt(vcpu, irq); } - static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 cpu_based_vm_exec_control; + vmx_update_window_states(vcpu); - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (vcpu->arch.irq_summary + || kvm_run->request_interrupt_window) + enable_irq_window(vcpu); + return; + } - if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && - !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ - kvm_do_inject_irq(vcpu); + if (vcpu->arch.interrupt_window_open) { + if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) + kvm_do_inject_irq(vcpu); - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + if (vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + } if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - /* - * Interrupts blocked. Wait for unblock. - */ - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - else - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + enable_irq_window(vcpu); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; struct kvm_userspace_memory_region tss_mem = { - .slot = 8, + .slot = TSS_PRIVATE_MEMSLOT, .guest_phys_addr = addr, .memory_size = PAGE_SIZE * 3, .flags = 0, @@ -1819,9 +2556,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->arch.rmode.active) - return 0; - /* * Instruction with address size override prefix opcode 0x67 * Cause the #SS fault with 0 error code in VM86 mode. @@ -1829,6 +2563,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) return 1; + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + switch (vec) { + case DE_VECTOR: + case DB_VECTOR: + case BP_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + kvm_queue_exception(vcpu, vec); + return 1; + } return 0; } @@ -1846,7 +2599,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " - "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); + "intr info 0x%x\n", __func__, vect_info, intr_info); if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; @@ -1854,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ if (is_no_device(intr_info)) { @@ -1870,11 +2623,18 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } error_code = 0; - rip = vmcs_readl(GUEST_RIP); - if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) + rip = kvm_rip_read(vcpu); + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { + /* EPT won't cause page fault directly */ + if (vm_need_ept()) + BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); + KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, + (u32)((u64)cr2 >> 32), handler); + if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) + kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } @@ -1903,6 +2663,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { ++vcpu->stat.irq_exits; + KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); return 1; } @@ -1935,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; + skip_emulated_instruction(vcpu); return kvm_emulate_pio(vcpu, kvm_run, in, size, port); } @@ -1960,25 +2722,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), + handler); switch (cr) { case 0: - vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 3: - vcpu_load_rsp_rip(vcpu); - set_cr3(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 4: - vcpu_load_rsp_rip(vcpu); - set_cr4(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - set_cr8(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; @@ -1987,31 +2749,33 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) }; break; case 2: /* clts */ - vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); + KVMTRACE_0D(CLTS, vcpu, handler); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, vcpu->arch.cr3); + KVMTRACE_3D(CR_READ, vcpu, (u32)cr, + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), + handler); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = get_cr8(vcpu); - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); + KVMTRACE_2D(CR_READ, vcpu, (u32)cr, + (u32)kvm_register_read(vcpu, reg), handler); skip_emulated_instruction(vcpu); return 1; } break; case 3: /* lmsw */ - lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); skip_emulated_instruction(vcpu); return 1; @@ -2037,7 +2801,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & 7; reg = (exit_qualification >> 8) & 15; - vcpu_load_rsp_rip(vcpu); if (exit_qualification & 16) { /* mov from dr */ switch (dr) { @@ -2050,11 +2813,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->arch.regs[reg] = val; + kvm_register_write(vcpu, reg, val); + KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { /* mov to dr */ } - vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; } @@ -2075,6 +2838,9 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } + KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), + handler); + /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; @@ -2088,6 +2854,9 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), + handler); + if (vmx_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); return 1; @@ -2112,6 +2881,10 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + KVMTRACE_0D(PEND_INTR, vcpu, handler); + ++vcpu->stat.irq_window_exits; + /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -2119,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, if (kvm_run->request_interrupt_window && !vcpu->arch.irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - ++vcpu->stat.irq_window_exits; return 0; } return 1; @@ -2138,6 +2910,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); @@ -2165,6 +2946,136 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; + u16 tss_selector; + int reason; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + reason = (u32)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && + (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) + == INTR_TYPE_NMI_INTR) { + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } + tss_selector = exit_qualification; + + return kvm_task_switch(vcpu, tss_selector, reason); +} + +static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + enum emulation_result er; + gpa_t gpa; + unsigned long hva; + int gla_validity; + int r; + + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + if (exit_qualification & (1 << 6)) { + printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); + return -ENOTSUPP; + } + + gla_validity = (exit_qualification >> 7) & 0x3; + if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); + printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = 0; + return -ENOTSUPP; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!kvm_is_error_hva(hva)) { + r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); + if (r < 0) { + printk(KERN_ERR "EPT: Not enough memory!\n"); + return -ENOMEM; + } + return 1; + } else { + /* must be MMIO */ + er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + if (er == EMULATE_FAIL) { + printk(KERN_ERR + "EPT: Fail to handle EPT violation vmexit!er is %d\n", + er); + printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + return -ENOTSUPP; + } else if (er == EMULATE_DO_MMIO) + return 0; + } + return 1; +} + +static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 cpu_based_vm_exec_control; + + /* clear pending NMI */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + ++vcpu->stat.nmi_window_exits; + + return 1; +} + +static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int err; + + preempt_enable(); + local_irq_enable(); + + while (!guest_state_valid(vcpu)) { + err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + if (err == EMULATE_DO_MMIO) + break; + + if (err != EMULATE_DONE) { + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; + } + + if (signal_pending(current)) + break; + if (need_resched()) + schedule(); + } + + local_irq_disable(); + preempt_disable(); + + /* Guest state should be valid now except if we need to + * emulate an MMIO */ + if (guest_state_valid(vcpu)) + vmx->emulation_required = 0; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -2175,6 +3086,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, @@ -2183,10 +3095,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, }; static const int kvm_vmx_max_exit_handlers = @@ -2202,6 +3117,21 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; + KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), + (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + + /* If we need to emulate an MMIO from handle_invalid_guest_state + * we just return 0 */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return 0; + + /* Access CR3 don't cause VMExit in paging mode, so we need + * to sync with guest real CR3. */ + if (vm_need_ept() && is_paging(vcpu)) { + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + ept_load_pdptrs(vcpu); + } + if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2210,9 +3140,33 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - exit_reason != EXIT_REASON_EXCEPTION_NMI) - printk(KERN_WARNING "%s: unexpected, valid vectoring info and " - "exit reason is 0x%x\n", __FUNCTION__, exit_reason); + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) + printk(KERN_WARNING "%s: unexpected, valid vectoring info " + "(0x%x) and exit reason is 0x%x\n", + __func__, vectoring_info, exit_reason); + + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (vcpu->arch.interrupt_window_open) { + vmx->soft_vnmi_blocked = 0; + vcpu->arch.nmi_window_open = 1; + } else if (vmx->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.nmi_window_open = 1; + } + } + if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); @@ -2223,10 +3177,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ -} - static void update_tpr_threshold(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -2244,71 +3194,101 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); } -static void enable_irq_window(struct kvm_vcpu *vcpu) +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - u32 cpu_based_vm_exec_control; + u32 exit_intr_info; + u32 idt_vectoring_info; + bool unblock_nmi; + u8 vector; + int type; + bool idtv_info_valid; + u32 error; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 25.7.1.2 + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + */ + if (unblock_nmi && vector != DF_VECTOR) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->soft_vnmi_blocked)) + vmx->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); + + idt_vectoring_info = vmx->idt_vectoring_info; + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + if (vmx->vcpu.arch.nmi_injected) { + /* + * SDM 3: 25.7.1.2 + * Clear bit "block by NMI" before VM entry if a NMI delivery + * faulted. + */ + if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->vcpu.arch.nmi_injected = false; + } + kvm_clear_exception_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + error = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, error); + } else + kvm_queue_exception(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } + kvm_clear_interrupt_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { + kvm_queue_interrupt(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } } static void vmx_intr_assist(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field; - int has_ext_irq, interrupt_window_open; - int vector; - update_tpr_threshold(vcpu); - has_ext_irq = kvm_cpu_has_interrupt(vcpu); - intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmx->idt_vectoring_info; - if (intr_info_field & INTR_INFO_VALID_MASK) { - if (idtv_info_field & INTR_INFO_VALID_MASK) { - /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); - } - if (has_ext_irq) - enable_irq_window(vcpu); - return; - } - if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; + vmx_update_window_states(vcpu); - vmx_inject_irq(vcpu, vect); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); return; } - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - - if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - if (unlikely(has_ext_irq)) + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu)) enable_irq_window(vcpu); return; } - if (!has_ext_irq) - return; - interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - if (interrupt_window_open) { - vector = kvm_cpu_get_interrupt(vcpu); - vmx_inject_irq(vcpu, vector); - kvm_timer_intr_post(vcpu, vector); - } else - enable_irq_window(vcpu); + if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { + if (vcpu->arch.interrupt_window_open) + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); + else + enable_irq_window(vcpu); + } + if (vcpu->arch.interrupt.pending) { + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); + } } /* @@ -2320,9 +3300,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) static void fixup_rmode_irq(struct vcpu_vmx *vmx) { vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; @@ -2334,11 +3314,34 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | vmx->rmode.irq.vector; } +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) + vmx->entry_time = ktime_get(); + + /* Handle invalid guest state instead of entering VMX */ + if (vmx->emulation_required && emulate_invalid_guest_state) { + handle_invalid_guest_state(vcpu, kvm_run); + return; + } + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* * Loading guest fpu may have cleared host cr0.ts */ @@ -2346,26 +3349,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm( /* Store host registers */ -#ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" - "push %%rcx \n\t" -#else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif - ASM_VMX_VMWRITE_RSP_RDX "\n\t" + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" + __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" - "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" "mov %c[r10](%0), %%r10 \n\t" @@ -2374,34 +3376,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%0), %%r13 \n\t" "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ -#else - "mov %c[cr2](%0), %%eax \n\t" - "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ #endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + /* Enter guest mode */ "jne .Llaunched \n\t" - ASM_VMX_VMLAUNCH "\n\t" + __ex(ASM_VMX_VMLAUNCH) "\n\t" "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " ASM_VMX_VMRESUME "\n\t" + ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp); popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" "mov %%r10, %c[r10](%0) \n\t" @@ -2410,28 +3403,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" - "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" - - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" -#else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" - "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" - - "pop %%ebp; pop %%ebp; pop %%edx \n\t" #endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), @@ -2451,20 +3432,20 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" + , R"bx", R"di", R"si" #ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" #endif ); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_dirty = 0; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); - vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + vmx_update_window_states(vcpu); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; @@ -2472,16 +3453,24 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + (intr_info & INTR_INFO_VALID_MASK)) { + KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); + } + + vmx_complete_interrupts(vmx); } +#undef R +#undef Q + static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 0, 1); + vcpu_clear(vmx); free_vmcs(vmx->vmcs); vmx->vmcs = NULL; } @@ -2491,6 +3480,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); kfree(vmx->host_msrs); kfree(vmx->guest_msrs); @@ -2507,6 +3500,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); + allocate_vpid(vmx); + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) goto free_vcpu; @@ -2538,6 +3533,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; + if (vm_need_ept()) + if (alloc_identity_pagetable(kvm) != 0) + goto free_vmcs; + return &vmx->vcpu; free_vmcs: @@ -2567,6 +3566,16 @@ static void __init vmx_check_processor_compat(void *rtn) } } +static int get_ept_level(void) +{ + return VMX_EPT_DEFAULT_GAW + 1; +} + +static int vmx_get_mt_mask_shift(void) +{ + return VMX_EPT_MT_EPTE_SHIFT; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -2584,7 +3593,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .prepare_guest_switch = vmx_save_host_state, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .vcpu_decache = vmx_vcpu_decache, .set_guest_debug = set_guest_debug, .guest_debug_pre = kvm_guest_debug_pre, @@ -2593,20 +3601,18 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_segment_base = vmx_get_segment_base, .get_segment = vmx_get_segment, .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, -#ifdef CONFIG_X86_64 .set_efer = vmx_set_efer, -#endif .get_idt = vmx_get_idt, .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .cache_regs = vcpu_load_rsp_rip, - .decache_regs = vcpu_put_rsp_rip, + .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -2624,11 +3630,13 @@ static struct kvm_x86_ops vmx_x86_ops = { .inject_pending_vectors = do_interrupt_requests, .set_tss_addr = vmx_set_tss_addr, + .get_tdp_level = get_ept_level, + .get_mt_mask_shift = vmx_get_mt_mask_shift, }; static int __init vmx_init(void) { - void *iova; + void *va; int r; vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2641,28 +3649,61 @@ static int __init vmx_init(void) goto out; } + vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_msr_bitmap) { + r = -ENOMEM; + goto out1; + } + /* * Allow direct access to the PC debug port (it is often used for I/O * delays, but the vmexits simply slow things down). */ - iova = kmap(vmx_io_bitmap_a); - memset(iova, 0xff, PAGE_SIZE); - clear_bit(0x80, iova); + va = kmap(vmx_io_bitmap_a); + memset(va, 0xff, PAGE_SIZE); + clear_bit(0x80, va); kunmap(vmx_io_bitmap_a); - iova = kmap(vmx_io_bitmap_b); - memset(iova, 0xff, PAGE_SIZE); + va = kmap(vmx_io_bitmap_b); + memset(va, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); + va = kmap(vmx_msr_bitmap); + memset(va, 0xff, PAGE_SIZE); + kunmap(vmx_msr_bitmap); + + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) - goto out1; + goto out2; + + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); + + if (vm_need_ept()) { + bypass_guest_pf = 0; + kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | + VMX_EPT_WRITABLE_MASK); + kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, + VMX_EPT_EXECUTABLE_MASK, + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); + ept_sync_global(); + return 0; +out2: + __free_page(vmx_msr_bitmap); out1: __free_page(vmx_io_bitmap_b); out: @@ -2672,6 +3713,7 @@ out: static void __exit vmx_exit(void) { + __free_page(vmx_msr_bitmap); __free_page(vmx_io_bitmap_b); __free_page(vmx_io_bitmap_a);