KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area
index a187570..3251251 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,25 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+/*
+ * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
+ * ple_gap:    upper bound on the amount of time between two successive
+ *             executions of PAUSE in a loop. Also indicates whether PLE is
+ *             enabled. Testing shows this time is usually smaller than 41
+ *             cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ *             in a PAUSE loop. Tests indicate that most spinlocks are held for
+ *             less than 2^12 cycles.
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer to SDM volume 3b sections 21.6.13 & 22.1.3.
+ */
+#define KVM_VMX_DEFAULT_PLE_GAP    41
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
+module_param(ple_gap, int, S_IRUGO);
+
+static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, int, S_IRUGO);
+
 struct vmcs {
        u32 revision_id;
        u32 abort;
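
The two ple_* thresholds interact roughly as follows: the hardware tracks the
gap between successive PAUSE executions, and only an unbroken run of
closely-spaced PAUSEs that outlasts ple_window triggers a VM exit. A minimal
sketch of that decision logic (illustrative only; the function and its
bookkeeping variables are not kernel code, see SDM vol. 3b 21.6.13):

    /*
     * Illustrative model of how the CPU applies ple_gap/ple_window in
     * VMX non-root mode (not kernel code).
     */
    static int ple_should_exit(u64 now_tsc, u64 *first_pause, u64 *last_pause)
    {
            if (now_tsc - *last_pause > ple_gap)
                    *first_pause = now_tsc; /* gap too big: new spin episode */
            *last_pause = now_tsc;

            /* spinning continuously for longer than ple_window => VM exit */
            return now_tsc - *first_pause > ple_window;
    }

Note that ple_gap doubles as the enable flag: hardware_setup() clears it to 0
when the CPU lacks PLE, and vmx_vcpu_setup() masks the exec control off in
that case.
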
@@ -80,7 +99,8 @@ struct vcpu_vmx {
        int                   save_nmsrs;
        int                   msr_offset_efer;
 #ifdef CONFIG_X86_64
-       int                   msr_offset_kernel_gs_base;
+       u64                   msr_host_kernel_gs_base;
+       u64                   msr_guest_kernel_gs_base;
 #endif
        struct vmcs          *vmcs;
        struct {
@@ -183,7 +203,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
  */
 static const u32 vmx_msr_index[] = {
 #ifdef CONFIG_X86_64
-       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 #endif
        MSR_EFER, MSR_K6_STAR,
 };
@@ -319,6 +339,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
                SECONDARY_EXEC_UNRESTRICTED_GUEST;
 }
 
+static inline int cpu_has_vmx_ple(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
        return flexpriority_enabled &&
@@ -649,10 +675,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 #endif
 
 #ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu))
-               save_msrs(vmx->host_msrs +
-                         vmx->msr_offset_kernel_gs_base, 1);
-
+       if (is_long_mode(&vmx->vcpu)) {
+               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+               wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       }
 #endif
        load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
        load_transition_efer(vmx);
@@ -686,6 +712,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
        save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
        load_msrs(vmx->host_msrs, vmx->save_nmsrs);
        reload_host_efer(vmx);
+#ifdef CONFIG_X86_64
+       if (is_long_mode(&vmx->vcpu)) {
+               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+               wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+       }
+#endif
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
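
With the MSR out of the autoload area, the two hunks above form a matched pair
of explicit rdmsrl/wrmsrl swaps. Condensed for illustration (the comments are
editorial, not from the source):

    /* vmx_save_host_state(): entering guest MSR state (long mode only) */
    rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);  /* stash host */
    wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); /* install guest */

    /* ... guest runs and may rewrite the MSR via wrmsr/swapgs ... */

    /* __vmx_load_host_state(): returning to host MSR state */
    rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); /* capture guest */
    wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);  /* restore host */

The read-back on the exit path is what lets the guest's own writes to the MSR
survive without an autoload slot.
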
@@ -915,9 +947,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                index = __find_msr_index(vmx, MSR_CSTAR);
                if (index >= 0)
                        move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
                /*
                 * MSR_K6_STAR is only needed on long mode guests, and only
                 * if efer.sce is enabled.
@@ -929,10 +958,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 #endif
        vmx->save_nmsrs = save_nmsrs;
 
-#ifdef CONFIG_X86_64
-       vmx->msr_offset_kernel_gs_base =
-               __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
        vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
 
        if (cpu_has_vmx_msr_bitmap()) {
@@ -990,6 +1015,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        case MSR_GS_BASE:
                data = vmcs_readl(GUEST_GS_BASE);
                break;
+       case MSR_KERNEL_GS_BASE:
+               vmx_load_host_state(to_vmx(vcpu));
+               data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+               break;
        case MSR_EFER:
                return kvm_get_msr_common(vcpu, msr_index, pdata);
 #endif
@@ -1043,6 +1072,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
        case MSR_GS_BASE:
                vmcs_writel(GUEST_GS_BASE, data);
                break;
+       case MSR_KERNEL_GS_BASE:
+               vmx_load_host_state(vmx);
+               vmx->msr_guest_kernel_gs_base = data;
+               break;
 #endif
        case MSR_IA32_SYSENTER_CS:
                vmcs_write32(GUEST_SYSENTER_CS, data);
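
Both new cases call vmx_load_host_state() before touching
msr_guest_kernel_gs_base: while the vcpu is in guest-MSR state the live value
sits in the hardware register, so the cached field is stale until the
host-state path syncs it back. The same pattern in isolation (hypothetical
helper names, not kernel API):

    /* Illustrative lazy-cache accessors (not actual kernel functions). */
    static u64 guest_kernel_gs_base_read(struct vcpu_vmx *vmx)
    {
            vmx_load_host_state(vmx); /* sync live MSR value into the cache */
            return vmx->msr_guest_kernel_gs_base;
    }

    static void guest_kernel_gs_base_write(struct vcpu_vmx *vmx, u64 data)
    {
            vmx_load_host_state(vmx); /* so the next guest entry loads data */
            vmx->msr_guest_kernel_gs_base = data;
    }
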
@@ -1096,30 +1129,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
        }
 }
 
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-       int old_debug = vcpu->guest_debug;
-       unsigned long flags;
-
-       vcpu->guest_debug = dbg->control;
-       if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
-               vcpu->guest_debug = 0;
-
        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
                vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
        else
                vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
 
-       flags = vmcs_readl(GUEST_RFLAGS);
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-       else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
-               flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-       vmcs_writel(GUEST_RFLAGS, flags);
-
        update_exception_bitmap(vcpu);
-
-       return 0;
 }
 
 static __init int cpu_has_kvm_support(void)
@@ -1256,7 +1273,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_WBINVD_EXITING |
                        SECONDARY_EXEC_ENABLE_VPID |
                        SECONDARY_EXEC_ENABLE_EPT |
-                       SECONDARY_EXEC_UNRESTRICTED_GUEST;
+                       SECONDARY_EXEC_UNRESTRICTED_GUEST |
+                       SECONDARY_EXEC_PAUSE_LOOP_EXITING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
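
PLE can be requested unconditionally in opt2 because optional bits that the
capability MSR disallows are silently dropped. A simplified sketch of what the
existing adjust_vmx_controls() helper does (names abbreviated; see vmx.c for
the real version):

    static __init int adjust_controls_sketch(u32 ctl_min, u32 ctl_opt,
                                             u32 msr, u32 *result)
    {
            u32 low, high, ctl = ctl_min | ctl_opt;

            rdmsr(msr, low, high);
            ctl &= high; /* bit clear in high word => must be zero */
            ctl |= low;  /* bit set in low word    => must be one  */

            if (ctl_min & ~ctl) /* a required bit is unsupported */
                    return -EIO;

            *result = ctl;
            return 0;
    }
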
@@ -1350,15 +1368,17 @@ static void free_kvm_area(void)
 {
        int cpu;
 
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu) {
                free_vmcs(per_cpu(vmxarea, cpu));
+               per_cpu(vmxarea, cpu) = NULL;
+       }
 }
 
 static __init int alloc_kvm_area(void)
 {
        int cpu;
 
-       for_each_online_cpu(cpu) {
+       for_each_possible_cpu(cpu) {
                struct vmcs *vmcs;
 
                vmcs = alloc_vmcs_cpu(cpu);
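
The hunk cuts off before alloc_kvm_area()'s failure handling, but that path is
why clearing per_cpu(vmxarea, cpu) above matters: on a mid-loop allocation
failure the cleanup can walk every possible cpu, including ones never reached.
A sketch of the full loop (simplified from the surrounding file):

    static __init int alloc_kvm_area(void)
    {
            int cpu;

            for_each_possible_cpu(cpu) {
                    struct vmcs *vmcs = alloc_vmcs_cpu(cpu);

                    if (!vmcs) {
                            /* free_vmcs(NULL) is harmless: free_pages()
                             * ignores address 0 */
                            free_kvm_area();
                            return -ENOMEM;
                    }
                    per_cpu(vmxarea, cpu) = vmcs;
            }
            return 0;
    }

Iterating over possible rather than online cpus also covers cpus that are
offline at module load but hotplugged later.
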
@@ -1400,6 +1420,9 @@ static __init int hardware_setup(void)
        if (enable_ept && !cpu_has_vmx_ept_2m_page())
                kvm_disable_largepages();
 
+       if (!cpu_has_vmx_ple())
+               ple_gap = 0;
+
        return alloc_kvm_area();
 }
 
@@ -1544,6 +1567,11 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
 
+       /*
+        * Force kernel_gs_base reloading before EFER changes, as control
+        * of this msr depends on is_long_mode().
+        */
+       vmx_load_host_state(to_vmx(vcpu));
        vcpu->arch.shadow_efer = efer;
        if (!msr)
                return;
@@ -2312,9 +2340,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                        exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
                if (!enable_unrestricted_guest)
                        exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+               if (!ple_gap)
+                       exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
 
+       if (ple_gap) {
+               vmcs_write32(PLE_GAP, ple_gap);
+               vmcs_write32(PLE_WINDOW, ple_window);
+       }
+
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
@@ -2516,7 +2551,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-       vmx->vcpu.arch.cr0 = 0x60000010;
+       vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
        vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
        vmx_set_cr4(&vmx->vcpu, 0);
        vmx_set_efer(&vmx->vcpu, 0);
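
The named form is bit-for-bit identical to the old magic number
(X86_CR0_ET = 0x00000010, X86_CR0_NW = 0x20000000, X86_CR0_CD = 0x40000000),
matching the architectural CR0 value after RESET/INIT. A compile-time check
one could drop in to prove the equivalence:

    BUILD_BUG_ON((X86_CR0_NW | X86_CR0_CD | X86_CR0_ET) != 0x60000010);
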
@@ -3362,6 +3397,18 @@ out:
 }
 
 /*
+ * Indicates a busy-waiting vcpu spinning on a lock. Plain PAUSE exiting is
+ * not enabled, so we only get here on CPUs with Pause-Loop Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+       skip_emulated_instruction(vcpu);
+       kvm_vcpu_on_spin(vcpu);
+
+       return 1;
+}
+
+/*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
  * to be done to userspace and return 0.
@@ -3397,6 +3444,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
+       [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 };
 
 static const int kvm_vmx_max_exit_handlers =
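
handle_pause() defers the actual back-off policy to kvm_vcpu_on_spin(). In
kernels of this vintage that helper was essentially a short timed sleep; a
hedged reconstruction (later kernels replaced the sleep with a directed yield
toward the likely lock holder):

    /* Approximation of kvm_vcpu_on_spin() circa 2.6.32 (illustrative). */
    void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
    {
            ktime_t expires;
            DEFINE_WAIT(wait);

            prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

            /* sleep for ~100 us and hope the lock holder gets scheduled */
            expires = ktime_add_ns(ktime_get(), 100000UL);
            schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);

            finish_wait(&vcpu->wq, &wait);
    }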