static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * ple_gap: upper bound on the amount of time between two successive
+ * executions of PAUSE in a loop. Also indicate if ple enabled.
+ * According to test, this time is usually small than 41 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ * in a PAUSE loop. Tests indicate that most spinlocks are held for
+ * less than 2^12 cycles
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer SDM volume 3b section 21.6.13 & 22.1.3.
+ */
+#define KVM_VMX_DEFAULT_PLE_GAP 41
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
+module_param(ple_gap, int, S_IRUGO);
+
+static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, int, S_IRUGO);
+
struct vmcs {
u32 revision_id;
u32 abort;
int save_nmsrs;
int msr_offset_efer;
#ifdef CONFIG_X86_64
- int msr_offset_kernel_gs_base;
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
#endif
struct vmcs *vmcs;
struct {
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_K6_STAR,
};
SECONDARY_EXEC_UNRESTRICTED_GUEST;
}
+static inline int cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled &&
#endif
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- save_msrs(vmx->host_msrs +
- vmx->msr_offset_kernel_gs_base, 1);
-
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ }
#endif
load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
load_transition_efer(vmx);
save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
load_msrs(vmx->host_msrs, vmx->save_nmsrs);
reload_host_efer(vmx);
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ }
+#endif
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
index = __find_msr_index(vmx, MSR_CSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
#endif
vmx->save_nmsrs = save_nmsrs;
-#ifdef CONFIG_X86_64
- vmx->msr_offset_kernel_gs_base =
- __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
if (cpu_has_vmx_msr_bitmap()) {
case MSR_GS_BASE:
data = vmcs_readl(GUEST_GS_BASE);
break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(to_vmx(vcpu));
+ data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ break;
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
#endif
case MSR_GS_BASE:
vmcs_writel(GUEST_GS_BASE, data);
break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(vmx);
+ vmx->msr_guest_kernel_gs_base = data;
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
vmcs_write32(GUEST_SYSENTER_CS, data);
}
}
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
{
- int old_debug = vcpu->guest_debug;
- unsigned long flags;
-
- vcpu->guest_debug = dbg->control;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
- vcpu->guest_debug = 0;
-
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
else
vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- flags = vmcs_readl(GUEST_RFLAGS);
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
- else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
- flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- vmcs_writel(GUEST_RFLAGS, flags);
-
update_exception_bitmap(vcpu);
-
- return 0;
}
static __init int cpu_has_kvm_support(void)
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
- SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ SECONDARY_EXEC_UNRESTRICTED_GUEST |
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
{
int cpu;
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu) {
free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
}
static __init int alloc_kvm_area(void)
{
int cpu;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
vmcs = alloc_vmcs_cpu(cpu);
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
+ if (!cpu_has_vmx_ple())
+ ple_gap = 0;
+
return alloc_kvm_area();
}
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ /*
+ * Force kernel_gs_base reloading before EFER changes, as control
+ * of this msr depends on is_long_mode().
+ */
+ vmx_load_host_state(to_vmx(vcpu));
vcpu->arch.shadow_efer = efer;
if (!msr)
return;
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (!ple_gap)
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
+ if (ple_gap) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmcs_write32(PLE_WINDOW, ple_window);
+ }
+
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx->vcpu.arch.cr0 = 0x60000010;
+ vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
vmx_set_efer(&vmx->vcpu, 0);
}
/*
+ * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
+ * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ kvm_vcpu_on_spin(vcpu);
+
+ return 1;
+}
+
+/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
* to be done to userspace and return 0.
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
};
static const int kvm_vmx_max_exit_handlers =