#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
+#define CREATE_TRACE_POINTS
+#include "trace.h"
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
+#include <asm/mce.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
- { "request_nmi", VCPU_STAT(request_nmi_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
- { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
++vcpu->stat.pf_guest;
if (vcpu->arch.exception.pending) {
- if (vcpu->arch.exception.nr == PF_VECTOR) {
- printk(KERN_DEBUG "kvm: inject_page_fault:"
- " double fault 0x%lx\n", addr);
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+ switch(vcpu->arch.exception.nr) {
+ case DF_VECTOR:
/* triple fault -> shutdown */
set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return;
+ case PF_VECTOR:
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ return;
+ default:
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ vcpu->arch.exception.pending = false;
+ break;
}
- return;
}
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+ if (is_present_gpte(pdpte[i]) &&
+ (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
ret = 1;
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
out:
return ret;
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return true;
+
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
return;
}
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
- KVMTRACE_1D(LMSW, vcpu,
- (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
- handler);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
+ unsigned long old_cr4 = vcpu->arch.cr4;
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
if (cr4 & CR4_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
kvm_inject_gp(vcpu, 0);
return;
}
- } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.cr3)) {
printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
kvm_inject_gp(vcpu, 0);
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
- vcpu->arch.mmu.base_role.cr4_pge = !!(cr4 & X86_CR4_PGE);
- kvm_mmu_sync_global(vcpu);
+ vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
return;
}
+ if (efer & EFER_FFXSR) {
+ struct kvm_cpuid_entry2 *feat;
+
+ feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+ printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ }
+
if (efer & EFER_SVME) {
struct kvm_cpuid_entry2 *feat;
efer |= vcpu->arch.shadow_efer & EFER_LMA;
vcpu->arch.shadow_efer = efer;
+
+ vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+ kvm_mmu_reset_context(vcpu);
}
void kvm_enable_efer_bits(u64 mask)
hv_clock->tsc_to_system_mul);
}
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
+ unsigned long this_tsc_khz;
if ((!vcpu->time_page))
return;
- if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
- kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = tsc_khz;
+ this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+ if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+ kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
+ put_cpu_var(cpu_tsc_khz);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
- &vcpu->hv_clock.tsc_timestamp);
+ kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
ktime_get_ts(&ts);
local_irq_restore(flags);
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+
+ if (!vcpu->time_page)
+ return 0;
+ set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+ return 1;
+}
+
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
return false;
}
+static bool valid_pat_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ for (i = 0; i < 8; i++)
+ if (!valid_pat_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ return valid_mtrr_type(data & 0xff);
+}
+
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
- if (!msr_mtrr_valid(msr))
+ if (!mtrr_valid(vcpu, msr, data))
return 1;
if (msr == MSR_MTRRdefType) {
return 0;
}
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
switch (msr) {
- case MSR_EFER:
- set_efer(vcpu, data);
- break;
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __func__, data);
- break;
case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __func__, data);
+ vcpu->arch.mcg_status = data;
break;
case MSR_IA32_MCG_CTL:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __func__, data);
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ /* only 0 or all 1s can be written to IA32_MCi_CTL */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && data != ~(u64)0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+ case MSR_EFER:
+ set_efer(vcpu, data);
break;
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
vcpu->arch.time_page = NULL;
}
- kvm_write_guest_time(vcpu);
+ kvm_request_guest_time_update(vcpu);
break;
}
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return set_msr_mce(vcpu, msr, data);
+
+ /* Performance counters are not protected by a CPUID bit,
+ * so we should check all of them in the generic path for the sake of
+ * cross vendor migration.
+ * Writing a zero into the event select MSRs disables them,
+ * which we perfectly emulate ;-). Any other value should be at least
+ * reported, some guests depend on them.
+ */
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ if (data != 0)
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ /* at least RHEL 4 unconditionally writes to the perfctr registers,
+ * so we ignore writes to make it happy.
+ */
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
return 0;
}
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
+ data = 0;
+ break;
case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
case MSR_IA32_MCG_CTL:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_MC0_MISC+20:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ u32 offset = msr - MSR_IA32_MC0_CTL;
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data;
+
+ switch (msr) {
+ case MSR_IA32_PLATFORM_ID:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
data = 0;
break;
case MSR_MTRRcap:
case MSR_KVM_SYSTEM_TIME:
data = vcpu->arch.time;
break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return get_msr_mce(vcpu, msr, pdata);
default:
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
case KVM_CAP_PIT:
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_REINJECT_CONTROL:
+ case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_ASSIGN_DEV_IRQ:
+ case KVM_CAP_IRQFD:
+ case KVM_CAP_PIT2:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
case KVM_CAP_IOMMU:
r = iommu_found();
break;
- case KVM_CAP_CLOCKSOURCE:
- r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
break;
default:
r = 0;
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
- if (n < num_msrs_to_save)
+ if (n < msr_list.nmsrs)
goto out;
r = -EFAULT;
if (copy_to_user(user_msr_list->indices, &msrs_to_save,
num_msrs_to_save * sizeof(u32)))
goto out;
- if (copy_to_user(user_msr_list->indices
- + num_msrs_to_save * sizeof(u32),
+ if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
&emulated_msrs,
ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
goto out;
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
- cpuid_arg->entries);
+ cpuid_arg->entries);
if (r)
goto out;
r = 0;
break;
}
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ u64 mce_cap;
+
+ mce_cap = KVM_MCE_CAP_SUPPORTED;
+ r = -EFAULT;
+ if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ goto out;
+ r = 0;
+ break;
+ }
default:
r = -EINVAL;
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
- kvm_write_guest_time(vcpu);
+ kvm_request_guest_time_update(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int is_efer_nx(void)
{
- u64 efer;
+ unsigned long long efer = 0;
- rdmsrl(MSR_EFER, efer);
+ rdmsrl_safe(MSR_EFER, &efer);
return efer & EFER_NX;
}
}
static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
{
int r;
}
static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
{
int r;
goto out;
r = -EFAULT;
if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
return 0;
}
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index)
+ u32 index)
{
entry->function = function;
entry->index = index;
cpuid_count(entry->function, entry->index,
- &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
entry->flags = 0;
}
+#define F(x) bit(X86_FEATURE_##x)
+
static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
- const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
- bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
- bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
- bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
- bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
- bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
- bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
- bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
- bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
- bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
- const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
- bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
- bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
- bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
- bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
- bit(X86_FEATURE_PGE) |
- bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
- bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
- bit(X86_FEATURE_SYSCALL) |
- (bit(X86_FEATURE_NX) && is_efer_nx()) |
+ unsigned f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
- bit(X86_FEATURE_LM) |
+ unsigned f_lm = F(LM);
+#else
+ unsigned f_lm = 0;
#endif
- bit(X86_FEATURE_MMXEXT) |
- bit(X86_FEATURE_3DNOWEXT) |
- bit(X86_FEATURE_3DNOW);
- const u32 kvm_supported_word3_x86_features =
- bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+
+ /* cpuid 1.edx */
+ const u32 kvm_supported_word0_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */;
+ /* cpuid 0x80000001.edx */
+ const u32 kvm_supported_word1_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+ /* cpuid 1.ecx */
+ const u32 kvm_supported_word4_x86_features =
+ F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved, XSAVE, OSXSAVE */;
+ /* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
- bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
- bit(X86_FEATURE_SVM);
+ F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+ 0 /* SKINIT */ | 0 /* WDT */;
- /* all func 2 cpuid_count() should be called on the same cpu */
+ /* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
do_cpuid_1_ent(entry, function, index);
++*nent;
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
- entry->ecx &= kvm_supported_word3_x86_features;
+ entry->ecx &= kvm_supported_word4_x86_features;
break;
/* function 2 entries are STATEFUL. That is, repeated cpuid commands
* may return different values. This forces us to get_cpu() before
put_cpu();
}
+#undef F
+
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+ struct kvm_cpuid_entry2 __user *entries)
{
struct kvm_cpuid_entry2 *cpuid_entries;
int limit, nent = 0, r = -E2BIG;
limit = cpuid_entries[0].eax;
for (func = 1; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
+ &nent, cpuid->nent);
r = -E2BIG;
if (nent >= cpuid->nent)
goto out_free;
limit = cpuid_entries[nent - 1].eax;
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
+ &nent, cpuid->nent);
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
r = -EFAULT;
if (copy_to_user(entries, cpuid_entries,
- nent * sizeof(struct kvm_cpuid_entry2)))
+ nent * sizeof(struct kvm_cpuid_entry2)))
goto out_free;
cpuid->nent = nent;
r = 0;
return -ENXIO;
vcpu_load(vcpu);
- set_bit(irq->irq, vcpu->arch.irq_pending);
- set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+ kvm_queue_interrupt(vcpu, irq->irq, false);
vcpu_put(vcpu);
return 0;
}
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num)
+ goto out;
+ if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ printk(KERN_DEBUG "kvm: set_mce: "
+ "injects mce exception while "
+ "previous one is in progress!\n");
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
- cpuid_arg->entries);
+ cpuid_arg->entries);
if (r)
goto out;
break;
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
- cpuid_arg->entries);
+ cpuid_arg->entries);
if (r)
goto out;
r = -EFAULT;
kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
default:
r = -EINVAL;
}
out:
- if (lapic)
- kfree(lapic);
+ kfree(lapic);
return r;
}
return -EINVAL;
down_write(&kvm->slots_lock);
+ spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+ spin_unlock(&kvm->mmu_lock);
up_write(&kvm->slots_lock);
return 0;
}
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
+ spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
memslot = &kvm->memslots[log->slot];
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
union {
struct kvm_pit_state ps;
struct kvm_memory_alias alias;
+ struct kvm_pit_config pit_config;
} u;
switch (ioctl) {
}
} else
goto out;
+ r = kvm_setup_default_irq_routing(kvm);
+ if (r) {
+ kfree(kvm->arch.vpic);
+ kfree(kvm->arch.vioapic);
+ goto out;
+ }
break;
case KVM_CREATE_PIT:
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
+ mutex_lock(&kvm->lock);
+ r = -EEXIST;
+ if (kvm->arch.vpit)
+ goto create_pit_unlock;
r = -ENOMEM;
- kvm->arch.vpit = kvm_create_pit(kvm);
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
r = 0;
+ create_pit_unlock:
+ mutex_unlock(&kvm->lock);
break;
+ case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
if (irqchip_in_kernel(kvm)) {
- mutex_lock(&kvm->lock);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- mutex_unlock(&kvm->lock);
+ __s32 status;
+ mutex_lock(&kvm->irq_lock);
+ status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event.irq, irq_event.level);
+ mutex_unlock(&kvm->irq_lock);
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ irq_event.status = status;
+ if (copy_to_user(argp, &irq_event,
+ sizeof irq_event))
+ goto out;
+ }
r = 0;
}
break;
if (vcpu->arch.apic) {
dev = &vcpu->arch.apic->dev;
- if (dev->in_range(dev, addr, len, is_write))
+ if (kvm_iodevice_in_range(dev, addr, len, is_write))
return dev;
}
return NULL;
return dev;
}
-int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
void *data = val;
int r = X86EMUL_CONTINUE;
return r;
}
-int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
void *data = val;
int r = X86EMUL_CONTINUE;
*/
mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
+ mutex_unlock(&vcpu->kvm->lock);
if (mmio_dev) {
kvm_iodevice_read(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
- mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
*/
mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
+ mutex_unlock(&vcpu->kvm->lock);
if (mmio_dev) {
kvm_iodevice_write(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
- mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
int emulate_clts(struct kvm_vcpu *vcpu)
{
- KVMTRACE_0D(CLTS, vcpu, handler);
kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
return X86EMUL_CONTINUE;
}
u16 error_code,
int emulation_type)
{
- int r;
+ int r, shadow_mask;
struct decode_cache *c;
kvm_clear_exception_queue(vcpu);
}
}
+ if (emulation_type & EMULTYPE_SKIP) {
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+ return EMULATE_DONE;
+ }
+
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+ if (r == 0)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
if (vcpu->arch.pio.string)
return EMULATE_DO_MMIO;
{
/* TODO: String I/O for in kernel device */
- mutex_lock(&vcpu->kvm->lock);
if (vcpu->arch.pio.in)
kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
vcpu->arch.pio.size,
kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
vcpu->arch.pio.size,
pd);
- mutex_unlock(&vcpu->kvm->lock);
}
static void pio_string_write(struct kvm_io_device *pio_dev,
void *pd = vcpu->arch.pio_data;
int i;
- mutex_lock(&vcpu->kvm->lock);
for (i = 0; i < io->cur_count; i++) {
kvm_iodevice_write(pio_dev, io->port,
io->size,
pd);
pd += io->size;
}
- mutex_unlock(&vcpu->kvm->lock);
}
static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, 1);
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
+ mutex_lock(&vcpu->kvm->lock);
pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
+ mutex_unlock(&vcpu->kvm->lock);
if (pio_dev) {
kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
complete_pio(vcpu);
vcpu->arch.pio.down = down;
vcpu->arch.pio.rep = rep;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+ size, count);
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
vcpu->arch.pio.guest_gva = address;
+ mutex_lock(&vcpu->kvm->lock);
pio_dev = vcpu_find_pio_dev(vcpu, port,
vcpu->arch.pio.cur_count,
!vcpu->arch.pio.in);
+ mutex_unlock(&vcpu->kvm->lock);
+
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+static void bounce_off(void *info)
+{
+ /* nothing */
+}
+
+static unsigned int ref_freq;
+static unsigned long tsc_khz_ref;
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i, send_ipi = 0;
+
+ if (!ref_freq)
+ ref_freq = freq->old;
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+ per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ if (!kvm_request_guest_time_update(vcpu))
+ continue;
+ if (vcpu->cpu != smp_processor_id())
+ send_ipi++;
+ }
+ }
+ spin_unlock(&kvm_lock);
+
+ if (freq->old < freq->new && send_ipi) {
+ /*
+ * We upscale the frequency. Must make the guest
+ * doesn't see old kvmclock values while running with
+ * the new frequency, otherwise we risk the guest sees
+ * time go backwards.
+ *
+ * In case we update the frequency for another cpu
+ * (which might be in guest context) send an interrupt
+ * to kick the cpu out of guest context. Next time
+ * guest context is entered kvmclock will be updated,
+ * so the guest will not see stale values.
+ */
+ smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+ }
+ return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
int kvm_arch_init(void *opaque)
{
- int r;
+ int r, cpu;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+ PT_DIRTY_MASK, PT64_NX_MASK, 0);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ tsc_khz_ref = tsc_khz;
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+
return 0;
out:
void kvm_arch_exit(void)
{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
- KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
if (!is_long_mode(vcpu)) {
nr &= 0xFFFFFFFF;
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
return 0;
}
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
- (u32)((u64)value >> 32), handler);
return value;
}
void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
unsigned long *rflags)
{
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
- (u32)((u64)val >> 32), handler);
-
switch (cr) {
case 0:
kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
return 0;
if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
- !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
return 0;
return 1;
}
if (!best || e->function > best->function)
best = e;
}
-
return best;
}
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+ return 36;
+}
+
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 function, index;
kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
- KVMTRACE_5D(CPUID, vcpu, function,
- (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+ trace_kvm_cpuid(function,
+ kvm_register_read(vcpu, VCPU_REGS_RAX),
+ kvm_register_read(vcpu, VCPU_REGS_RBX),
+ kvm_register_read(vcpu, VCPU_REGS_RCX),
+ kvm_register_read(vcpu, VCPU_REGS_RDX));
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- return (!vcpu->arch.irq_summary &&
+ return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
kvm_run->request_interrupt_window &&
- vcpu->arch.interrupt_window_open &&
- (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+ kvm_arch_interrupt_allowed(vcpu));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu,
kvm_run->ready_for_interrupt_injection = 1;
else
kvm_run->ready_for_interrupt_injection =
- (vcpu->arch.interrupt_window_open &&
- vcpu->arch.irq_summary == 0);
+ kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu);
}
static void vapic_enter(struct kvm_vcpu *vcpu)
up_read(&vcpu->kvm->slots_lock);
}
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops->update_cr8_intercept)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ /* try to reinject previous events if any */
+ if (vcpu->arch.nmi_injected) {
+ kvm_x86_ops->set_nmi(vcpu);
+ return;
+ }
+
+ if (vcpu->arch.interrupt.pending) {
+ kvm_x86_ops->set_irq(vcpu);
+ return;
+ }
+
+ /* try to inject new event if pending */
+ if (vcpu->arch.nmi_pending) {
+ if (kvm_x86_ops->nmi_allowed(vcpu)) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
+ }
+ } else if (kvm_cpu_has_interrupt(vcpu)) {
+ if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+ false);
+ kvm_x86_ops->set_irq(vcpu);
+ }
+ }
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
+ bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+ kvm_run->request_interrupt_window;
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ kvm_write_guest_time(vcpu);
if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
}
}
- clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
- kvm_inject_pending_timer_irqs(vcpu);
-
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
local_irq_disable();
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ smp_mb__after_clear_bit();
+
if (vcpu->requests || need_resched() || signal_pending(current)) {
local_irq_enable();
preempt_enable();
goto out;
}
- vcpu->guest_mode = 1;
- /*
- * Make sure that guest_mode assignment won't happen after
- * testing the pending IRQ vector bitmap.
- */
- smp_wmb();
-
if (vcpu->arch.exception.pending)
__queue_exception(vcpu);
- else if (irqchip_in_kernel(vcpu->kvm))
- kvm_x86_ops->inject_pending_irq(vcpu);
else
- kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+ inject_pending_irq(vcpu, kvm_run);
+
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
- kvm_lapic_sync_to_vapic(vcpu);
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
up_read(&vcpu->kvm->slots_lock);
set_debugreg(vcpu->arch.eff_db[3], 3);
}
- KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+ trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu, kvm_run);
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(vcpu->arch.host_dr6, 6);
set_debugreg(vcpu->arch.host_dr7, 7);
- vcpu->guest_mode = 0;
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
++vcpu->stat.exits;
profile_hit(KVM_PROFILING, (void *)rip);
}
- if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
- vcpu->arch.exception.pending = false;
kvm_lapic_sync_from_vapic(vcpu);
return r;
}
+
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
kvm_vcpu_block(vcpu);
down_read(&vcpu->kvm->slots_lock);
if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
- if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ {
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
- if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
- r = -EINTR;
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ break;
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ default:
+ r = -EINTR;
+ break;
+ }
+ }
}
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- }
- if (signal_pending(current)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- }
- if (need_resched()) {
- up_read(&vcpu->kvm->slots_lock);
- kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- }
+ if (r <= 0)
+ break;
+
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_resched(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
}
}
struct kvm_sregs *sregs)
{
struct descriptor_table dt;
- int pending_vec;
vcpu_load(vcpu);
sregs->efer = vcpu->arch.shadow_efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm)) {
- memset(sregs->interrupt_bitmap, 0,
- sizeof sregs->interrupt_bitmap);
- pending_vec = kvm_x86_ops->get_irq(vcpu);
- if (pending_vec >= 0)
- set_bit(pending_vec,
- (unsigned long *)sregs->interrupt_bitmap);
- } else
- memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
- sizeof sregs->interrupt_bitmap);
+ memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+ if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ set_bit(vcpu->arch.interrupt.nr,
+ (unsigned long *)sregs->interrupt_bitmap);
vcpu_put(vcpu);
tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
- tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
}
static int load_state_from_tss32(struct kvm_vcpu *vcpu,
}
static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
- u32 old_tss_base,
- struct desc_struct *nseg_desc)
+ u16 old_tss_sel, u32 old_tss_base,
+ struct desc_struct *nseg_desc)
{
struct tss_segment_16 tss_segment_16;
int ret = 0;
&tss_segment_16, sizeof tss_segment_16))
goto out;
+ if (old_tss_sel != 0xffff) {
+ tss_segment_16.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_16.prev_task_link,
+ sizeof tss_segment_16.prev_task_link))
+ goto out;
+ }
+
if (load_state_from_tss16(vcpu, &tss_segment_16))
goto out;
}
static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
- u32 old_tss_base,
+ u16 old_tss_sel, u32 old_tss_base,
struct desc_struct *nseg_desc)
{
struct tss_segment_32 tss_segment_32;
&tss_segment_32, sizeof tss_segment_32))
goto out;
+ if (old_tss_sel != 0xffff) {
+ tss_segment_32.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_32.prev_task_link,
+ sizeof tss_segment_32.prev_task_link))
+ goto out;
+ }
+
if (load_state_from_tss32(vcpu, &tss_segment_32))
goto out;
kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
}
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
if (nseg_desc.type & 8)
- ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
- &nseg_desc);
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
else
- ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
- &nseg_desc);
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
u32 eflags = kvm_x86_ops->get_rflags(vcpu);
struct kvm_sregs *sregs)
{
int mmu_reset_needed = 0;
- int i, pending_vec, max_bits;
+ int pending_vec, max_bits;
struct descriptor_table dt;
vcpu_load(vcpu);
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
- vcpu->arch.cr3 = sregs->cr3;
+
+ down_read(&vcpu->kvm->slots_lock);
+ if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
+ vcpu->arch.cr3 = sregs->cr3;
+ else
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ up_read(&vcpu->kvm->slots_lock);
kvm_set_cr8(vcpu, sregs->cr8);
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- if (!irqchip_in_kernel(vcpu->kvm)) {
- memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
- sizeof vcpu->arch.irq_pending);
- vcpu->arch.irq_summary = 0;
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
- if (vcpu->arch.irq_pending[i])
- __set_bit(i, &vcpu->arch.irq_summary);
- } else {
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap,
- max_bits);
- /* Only pending external irq is handled here */
- if (pending_vec < max_bits) {
- kvm_x86_ops->set_irq(vcpu, pending_vec);
- pr_debug("Set back pending irq %d\n",
- pending_vec);
- }
- kvm_pic_clear_isr_ack(vcpu->kvm);
+ max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_pic_clear_isr_ack(vcpu->kvm);
}
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
/* Older userspace won't unhalt the vcpu on reset. */
- if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
!(vcpu->arch.cr0 & X86_CR0_PE))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
kvm_x86_ops->vcpu_free(vcpu);
}
kvm = vcpu->kvm;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+ if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
goto fail_mmu_destroy;
}
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_mmu_destroy;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
return 0;
fail_mmu_destroy:
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
+ struct kvm_vcpu *vcpu;
/*
* Unpin any mmu pages first.
*/
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_unload_vcpu_mmu(vcpu);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
}
void kvm_arch_sync_events(struct kvm *kvm)
}
}
+ spin_lock(&kvm->mmu_lock);
if (!kvm->arch.n_requested_mmu_pages) {
unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
}
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
return 0;
void kvm_arch_flush_shadow(struct kvm *kvm)
{
kvm_mmu_zap_all(kvm);
+ kvm_reload_remote_mmus(kvm);
}
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
|| vcpu->arch.nmi_pending;
}
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
- printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
- int ipi_pcpu = vcpu->cpu;
- int cpu = get_cpu();
+ int me;
+ int cpu = vcpu->cpu;
if (waitqueue_active(&vcpu->wq)) {
wake_up_interruptible(&vcpu->wq);
++vcpu->stat.halt_wakeup;
}
- /*
- * We may be called synchronously with irqs disabled in guest mode,
- * So need not to call smp_call_function_single() in that case.
- */
- if (vcpu->guest_mode && vcpu->cpu != cpu)
- smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+
+ me = get_cpu();
+ if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ smp_send_reschedule(cpu);
put_cpu();
}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);