X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fx86.c;h=49079a46687b740f2e91d2bbfbc31c1c5094c22a;hb=a5355c27d26001865a5ac32c868c82a523c275d3;hp=3f3cb7107c03179f8b46fc45488b72ce21075d66;hpb=fa89a81766e33343fa8f0fe0e371819bf94a17a1;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f3cb71..49079a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -30,15 +30,18 @@ #include #include #include -#include #include #include #include #include +#include +#include +#include #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -67,6 +70,8 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -86,6 +91,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, + { "request_nmi", VCPU_STAT(request_nmi_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -93,6 +99,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, + { "nmi_injections", VCPU_STAT(nmi_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -100,230 +107,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } }; -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct list_head *ptr; - struct kvm_assigned_dev_kernel *match; - - list_for_each(ptr, head) { - match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) -{ - struct kvm_assigned_dev_kernel *assigned_dev; - - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); - - /* This is taken to safely inject irq inside the guest. When - * the interrupt injection (or the ioapic code) uses a - * finer-grained lock, update this - */ - mutex_lock(&assigned_dev->kvm->lock); - kvm_set_irq(assigned_dev->kvm, - assigned_dev->guest_irq, 1); - mutex_unlock(&assigned_dev->kvm->lock); - kvm_put_kvm(assigned_dev->kvm); -} - -/* FIXME: Implement the OR logic needed to make shared interrupts on - * this line behave properly - */ -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - kvm_get_kvm(assigned_dev->kvm); - schedule_work(&assigned_dev->interrupt_work); - disable_irq_nosync(irq); - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev; - - if (kian->gsi == -1) - return; - - dev = container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - kvm_set_irq(dev->kvm, dev->guest_irq, 0); - enable_irq(dev->host_irq); -} - -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) { - mutex_unlock(&kvm->lock); - return -EINVAL; - } - - if (match->irq_requested) { - match->guest_irq = assigned_irq->guest_irq; - match->ack_notifier.gsi = assigned_irq->guest_irq; - mutex_unlock(&kvm->lock); - return 0; - } - - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); - - if (irqchip_in_kernel(kvm)) { - if (!capable(CAP_SYS_RAWIO)) { - return -EPERM; - goto out; - } - - if (assigned_irq->host_irq) - match->host_irq = assigned_irq->host_irq; - else - match->host_irq = match->dev->irq; - match->guest_irq = assigned_irq->guest_irq; - match->ack_notifier.gsi = assigned_irq->guest_irq; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); - - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. - */ - if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_device", (void *)match)) { - printk(KERN_INFO "%s: couldn't allocate irq for pv " - "device\n", __func__); - r = -EIO; - goto out; - } - } - - match->irq_requested = true; -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EINVAL; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_bus_and_slot(assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->dev = dev; - - match->kvm = kvm; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - -out: - mutex_unlock(&kvm->lock); - return r; -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - mutex_unlock(&kvm->lock); - return r; -} - -static void kvm_free_assigned_devices(struct kvm *kvm) -{ - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); - - if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) { - free_irq(assigned_dev->host_irq, - (void *)assigned_dev); - - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev-> - ack_notifier); - } - - if (cancel_work_sync(&assigned_dev->interrupt_work)) - /* We had pending work. That means we will have to take - * care of kvm_put_kvm. - */ - kvm_put_kvm(kvm); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); - } -} - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -386,6 +176,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; + if (vcpu->arch.exception.pending) { if (vcpu->arch.exception.nr == PF_VECTOR) { printk(KERN_DEBUG "kvm: inject_page_fault:" @@ -530,6 +321,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); return; } @@ -573,6 +365,8 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -580,6 +374,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); return; } @@ -652,6 +447,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. @@ -666,7 +466,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, + MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; @@ -691,6 +491,28 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) return; } + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; @@ -792,10 +614,12 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + __func__, tsc_khz, hv_clock->tsc_shift, hv_clock->tsc_to_system_mul); } +static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -806,9 +630,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; - if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { - kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = tsc_khz; + if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { + kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); } /* Keep irq disabled to prevent changes to the clock */ @@ -839,6 +663,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static int kvm_request_guest_time_update(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + + if (!vcpu->time_page) + return 0; + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + return 1; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -865,10 +699,38 @@ static bool msr_mtrr_valid(unsigned msr) static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - vcpu->arch.mtrr[msr - 0x200] = data; + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + + kvm_mmu_reset_context(vcpu); return 0; } @@ -904,6 +766,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); @@ -932,17 +795,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); if (is_error_page(vcpu->arch.time_page)) { kvm_release_page_clean(vcpu->arch.time_page); vcpu->arch.time_page = NULL; } - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); break; } default: @@ -966,10 +827,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - *pdata = vcpu->arch.mtrr[msr - 0x200]; + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + return 0; } @@ -1000,6 +888,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: + case MSR_VM_HSAVE_PA: data = 0; break; case MSR_MTRRcap: @@ -1122,7 +1011,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQCHIP: case KVM_CAP_HLT: case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_USER_MEMORY: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: case KVM_CAP_CLOCKSOURCE: @@ -1130,6 +1018,8 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: + case KVM_CAP_REINJECT_CONTROL: + case KVM_CAP_IRQ_INJECT_STATUS: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1147,6 +1037,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PV_MMU: r = !tdp_enabled; break; + case KVM_CAP_IOMMU: + r = iommu_found(); + break; default: r = 0; break; @@ -1197,7 +1090,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; @@ -1217,7 +1110,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -1228,9 +1121,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int is_efer_nx(void) { - u64 efer; + unsigned long long efer = 0; - rdmsrl(MSR_EFER, efer); + rdmsrl_safe(MSR_EFER, &efer); return efer & EFER_NX; } @@ -1295,8 +1188,8 @@ out: } static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1315,8 +1208,8 @@ out: } static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1325,7 +1218,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, goto out; r = -EFAULT; if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; return 0; @@ -1334,18 +1227,13 @@ out: return r; } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) + u32 index) { entry->function = function; entry->index = index; cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); entry->flags = 0; } @@ -1371,19 +1259,21 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_SYSCALL) | - (bit(X86_FEATURE_NX) && is_efer_nx()) | + (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) | #ifdef CONFIG_X86_64 bit(X86_FEATURE_LM) | #endif + bit(X86_FEATURE_FXSR_OPT) | bit(X86_FEATURE_MMXEXT) | bit(X86_FEATURE_3DNOWEXT) | bit(X86_FEATURE_3DNOW); const u32 kvm_supported_word3_x86_features = bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); + bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | + bit(X86_FEATURE_SVM); - /* all func 2 cpuid_count() should be called on the same cpu */ + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); ++*nent; @@ -1404,6 +1294,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, int t, times = entry->eax & 0xff; entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times && *nent < maxnent; ++t) { do_cpuid_1_ent(&entry[t], function, 0); entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; @@ -1434,7 +1325,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff; + level_type = entry[i - 1].ecx & 0xff00; if (!level_type) break; do_cpuid_1_ent(&entry[i], function, i); @@ -1456,7 +1347,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG; @@ -1473,7 +1364,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[0].eax; for (func = 1; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ -1482,10 +1373,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -EFAULT; if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) + nent * sizeof(struct kvm_cpuid_entry2))) goto out_free; cpuid->nent = nent; r = 0; @@ -1534,6 +1425,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_inject_nmi(vcpu); + vcpu_put(vcpu); + + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -1593,6 +1493,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = 0; break; } + case KVM_NMI: { + r = kvm_vcpu_ioctl_nmi(vcpu); + if (r) + goto out; + r = 0; + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -1613,7 +1520,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; break; @@ -1626,7 +1533,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; r = -EFAULT; @@ -1846,6 +1753,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) return r; } +static int kvm_vm_ioctl_reinject(struct kvm *kvm, + struct kvm_reinject_control *control) +{ + if (!kvm->arch.vpit) + return -ENXIO; + kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + return 0; +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -1943,13 +1859,26 @@ long kvm_arch_vm_ioctl(struct file *filp, } } else goto out; + r = kvm_setup_default_irq_routing(kvm); + if (r) { + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + goto out; + } break; case KVM_CREATE_PIT: + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpit) + goto create_pit_unlock; r = -ENOMEM; kvm->arch.vpit = kvm_create_pit(kvm); if (kvm->arch.vpit) r = 0; + create_pit_unlock: + mutex_unlock(&kvm->lock); break; + case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -1957,9 +1886,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { + __s32 status; mutex_lock(&kvm->lock); - kvm_set_irq(kvm, irq_event.irq, irq_event.level); + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); + if (ioctl == KVM_IRQ_LINE_STATUS) { + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } r = 0; } break; @@ -2013,28 +1950,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } case KVM_GET_PIT: { r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) @@ -2064,6 +1979,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_REINJECT_CONTROL: { + struct kvm_reinject_control control; + r = -EFAULT; + if (copy_from_user(&control, argp, sizeof(control))) + goto out; + r = kvm_vm_ioctl_reinject(kvm, &control); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -2117,10 +2043,38 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; + } +out: + return r; +} + +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; @@ -2128,27 +2082,27 @@ int emulator_read_std(unsigned long addr, while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; if (gpa == UNMAPPED_GVA) { r = X86EMUL_PROPAGATE_FAULT; goto out; } - ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); + ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { r = X86EMUL_UNHANDLEABLE; goto out; } - bytes -= tocopy; - data += tocopy; - addr += tocopy; + bytes -= towrite; + data += towrite; + addr += towrite; } out: return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); + static int emulator_read_emulated(unsigned long addr, void *val, @@ -2170,8 +2124,8 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -2205,7 +2159,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); return 1; } @@ -2305,9 +2259,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, val = *(u64 *)new; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); kaddr = kmap_atomic(page, KM_USER0); set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); @@ -2327,6 +2279,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) { + kvm_mmu_invlpg(vcpu, address); return X86EMUL_CONTINUE; } @@ -2375,7 +2328,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -2383,7 +2336,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = emulator_read_std, + .read_std = kvm_read_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -2485,40 +2438,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } -} - static int pio_copy_data(struct kvm_vcpu *vcpu) { void *p = vcpu->arch.pio_data; - void *q; + gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + int ret; - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, - PAGE_KERNEL); - if (!q) { - free_pio_guest_pages(vcpu); - return -ENOMEM; - } - q += vcpu->arch.pio.guest_page_offset; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - memcpy(q, p, bytes); + ret = kvm_write_guest_virt(q, p, bytes, vcpu); else - memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; - vunmap(q); - free_pio_guest_pages(vcpu); - return 0; + ret = kvm_read_guest_virt(q, p, bytes, vcpu); + return ret; } int complete_pio(struct kvm_vcpu *vcpu) @@ -2629,7 +2561,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 0; vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; vcpu->arch.pio.rep = 0; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2642,8 +2573,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); - kvm_x86_ops->skip_emulated_instruction(vcpu); - pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); if (pio_dev) { kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); @@ -2659,9 +2588,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i, ret = 0; - int nr_pages = 1; - struct page *page; + int ret = 0; struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2673,7 +2600,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 1; vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); vcpu->arch.pio.rep = rep; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2693,15 +2619,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, else in_page = offset_in_page(address) + size; now = min(count, (unsigned long)in_page / size); - if (!now) { - /* - * String I/O straddles page boundary. Pin two guest pages - * so that we satisfy atomicity constraints. Do just one - * transaction to avoid complexity. - */ - nr_pages = 2; + if (!now) now = 1; - } if (down) { /* * String I/O in reverse. Yuck. Kill the guest, fix later. @@ -2716,15 +2635,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); - for (i = 0; i < nr_pages; ++i) { - page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - if (!page) { - kvm_inject_gp(vcpu, 0); - free_pio_guest_pages(vcpu); - return 1; - } - } + vcpu->arch.pio.guest_gva = address; pio_dev = vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, @@ -2732,7 +2643,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret >= 0 && pio_dev) { + if (ret == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (ret == 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -2747,9 +2662,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +static void bounce_off(void *info) +{ + /* nothing */ +} + +static unsigned int ref_freq; +static unsigned long tsc_khz_ref; + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, send_ipi = 0; + + if (!ref_freq) + ref_freq = freq->old; + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (vcpu->cpu != freq->cpu) + continue; + if (!kvm_request_guest_time_update(vcpu)) + continue; + if (vcpu->cpu != smp_processor_id()) + send_ipi++; + } + } + spin_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* + * We upscale the frequency. Must make the guest + * doesn't see old kvmclock values while running with + * the new frequency, otherwise we risk the guest sees + * time go backwards. + * + * In case we update the frequency for another cpu + * (which might be in guest context) send an interrupt + * to kick the cpu out of guest context. Next time + * guest context is entered kvmclock will be updated, + * so the guest will not see stale values. + */ + smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + } + return 0; +} + +static struct notifier_block kvmclock_cpufreq_notifier_block = { + .notifier_call = kvmclock_cpufreq_notifier +}; + int kvm_arch_init(void *opaque) { - int r; + int r, cpu; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -2779,7 +2757,16 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); + PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); + + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + tsc_khz_ref = tsc_khz; + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + return 0; out: @@ -2788,6 +2775,9 @@ out: void kvm_arch_exit(void) { + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -2798,11 +2788,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - up_read(&vcpu->kvm->slots_lock); - kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - return -EINTR; return 1; } else { vcpu->run->exit_reason = KVM_EXIT_HLT; @@ -2972,7 +2957,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; j == i; j = (j + 1) % nent) { + for (j = i + 1; ; j = (j + 1) % nent) { struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; if (ej->function == e->function) { ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; @@ -2992,25 +2977,20 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) return 0; if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) return 0; return 1; } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index) { int i; - u32 function, index; - struct kvm_cpuid_entry2 *e, *best; + struct kvm_cpuid_entry2 *best = NULL; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + e = &vcpu->arch.cpuid_entries[i]; if (is_matching_cpuid_entry(e, function, index)) { if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) @@ -3025,6 +3005,21 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) if (!best || e->function > best->function) best = e; } + return best; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); if (best) { kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); @@ -3077,9 +3072,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); vcpu->arch.apic->vapic_page = page; } @@ -3097,24 +3090,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) up_read(&vcpu->kvm->slots_lock); } -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - - down_read(&vcpu->kvm->slots_lock); - vapic_enter(vcpu); - -again: if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) kvm_mmu_unload(vcpu); @@ -3126,6 +3105,10 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, @@ -3151,25 +3134,13 @@ again: local_irq_disable(); - if (vcpu->requests || need_resched()) { + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - vcpu->guest_mode = 1; /* * Make sure that guest_mode assignment won't happen after @@ -3190,10 +3161,34 @@ again: kvm_guest_enter(); + get_debugreg(vcpu->arch.host_dr6, 6); + get_debugreg(vcpu->arch.host_dr7, 7); + if (unlikely(vcpu->arch.switch_db_regs)) { + get_debugreg(vcpu->arch.host_db[0], 0); + get_debugreg(vcpu->arch.host_db[1], 1); + get_debugreg(vcpu->arch.host_db[2], 2); + get_debugreg(vcpu->arch.host_db[3], 3); + + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + } KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.host_db[0], 0); + set_debugreg(vcpu->arch.host_db[1], 1); + set_debugreg(vcpu->arch.host_db[2], 2); + set_debugreg(vcpu->arch.host_db[3], 3); + } + set_debugreg(vcpu->arch.host_dr6, 6); + set_debugreg(vcpu->arch.host_dr7, 7); + vcpu->guest_mode = 0; local_irq_enable(); @@ -3227,26 +3222,63 @@ again: kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(kvm_run, vcpu); +out: + return r; +} - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) - goto again; +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -out: - up_read(&vcpu->kvm->slots_lock); - if (r > 0) { - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - goto again; + down_read(&vcpu->kvm->slots_lock); + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu, kvm_run); + else { + up_read(&vcpu->kvm->slots_lock); + kvm_vcpu_block(vcpu); + down_read(&vcpu->kvm->slots_lock); + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) + r = -EINTR; + } + + if (r > 0) { + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); + } + } } + up_read(&vcpu->kvm->slots_lock); post_kvm_run_save(vcpu, kvm_run); vapic_exit(vcpu); @@ -3266,6 +3298,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; } @@ -3342,7 +3375,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * Don't leak debug flags in case they were set for guest debugging */ - if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); vcpu_put(vcpu); @@ -3503,9 +3536,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, kvm_desct->padding = 0; } -static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, - u16 selector, - struct descriptor_table *dtable) +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + u16 selector, + struct descriptor_table *dtable) { if (selector & 1 << 2) { struct kvm_segment kvm_seg; @@ -3530,7 +3563,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); @@ -3549,7 +3582,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) return 1; @@ -3590,7 +3623,7 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } -int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { .base = selector << 4, @@ -3937,6 +3970,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, pr_debug("Set back pending irq %d\n", pending_vec); } + kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -3949,20 +3983,43 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + /* Older userspace won't unhalt the vcpu on reset. */ + if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !(vcpu->arch.cr0 & X86_CR0_PE)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_put(vcpu); return 0; } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { - int r; + int i, r; vcpu_load(vcpu); + if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < KVM_NR_DB_REGS; ++i) + vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; + vcpu->arch.switch_db_regs = + (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + } else { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); + } + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_queue_exception(vcpu, DB_VECTOR); + else if (dbg->control & KVM_GUESTDBG_INJECT_BP) + kvm_queue_exception(vcpu, BP_VECTOR); + vcpu_put(vcpu); return r; @@ -4105,6 +4162,11 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + kvm_x86_ops->vcpu_free(vcpu); } @@ -4121,6 +4183,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) /* We do fxsave: this must be aligned. */ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); + vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); if (r == 0) @@ -4146,6 +4209,14 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = false; + + vcpu->arch.switch_db_regs = 0; + memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr7 = DR7_FIXED_1; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -4233,8 +4304,14 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.oos_global_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ + set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + + rdtscll(kvm->arch.vm_init_tsc); + return kvm; } @@ -4264,9 +4341,14 @@ static void kvm_free_vcpus(struct kvm *kvm) } +void kvm_arch_sync_events(struct kvm *kvm) +{ + kvm_free_all_assigned_devices(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_free_assigned_devices(kvm); + kvm_iommu_unmap_guest(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -4344,7 +4426,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending; } static void vcpu_kick_intr(void *info)