KVM: Reduce runnability interface with arch support code
[safe/jmp/linux-2.6] / arch / x86 / kvm / x86.c
index d8adc1d..b87d65d 100644 (file)
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 #include <linux/cpufreq.h>
+#include <trace/events/kvm.h>
+#undef TRACE_INCLUDE_FILE
+#define CREATE_TRACE_POINTS
+#include "trace.h"
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -75,12 +79,13 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
 
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                                    struct kvm_cpuid_entry2 __user *entries);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-                                             u32 function, u32 index);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
+/*
+ * When set, MSR accesses that reach the default case of
+ * kvm_set_msr_common()/kvm_get_msr_common() are logged and ignored
+ * (reads return 0) instead of failing.  Declared bool to match the
+ * parameter type it is registered with.
+ */
+bool ignore_msrs = 0;
+module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
        { "pf_guest", VCPU_STAT(pf_guest) },
@@ -181,16 +186,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
        ++vcpu->stat.pf_guest;
 
        if (vcpu->arch.exception.pending) {
-               if (vcpu->arch.exception.nr == PF_VECTOR) {
-                       printk(KERN_DEBUG "kvm: inject_page_fault:"
-                                       " double fault 0x%lx\n", addr);
-                       vcpu->arch.exception.nr = DF_VECTOR;
-                       vcpu->arch.exception.error_code = 0;
-               } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+               switch(vcpu->arch.exception.nr) {
+               case DF_VECTOR:
                        /* triple fault -> shutdown */
                        set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       return;
+               case PF_VECTOR:
+                       vcpu->arch.exception.nr = DF_VECTOR;
+                       vcpu->arch.exception.error_code = 0;
+                       return;
+               default:
+                       /* replace the previous exception with the new one in
+                          the hope that re-executing the instruction will
+                          regenerate the lost exception */
+                       vcpu->arch.exception.pending = false;
+                       break;
                }
-               return;
        }
        vcpu->arch.cr2 = addr;
        kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -212,13 +223,6 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
-static void __queue_exception(struct kvm_vcpu *vcpu)
-{
-       kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
-                                    vcpu->arch.exception.has_error_code,
-                                    vcpu->arch.exception.error_code);
-}
-
 /*
  * Load the pae pdptrs.  Return true is they are all valid.
  */
@@ -341,9 +345,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
+/*
+ * Emulate LMSW: keep every bit of the current CR0 except the low four,
+ * take the low four bits from @msw, and install the result through
+ * kvm_set_cr0().
+ */
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
        kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
-       KVMTRACE_1D(LMSW, vcpu,
-                   (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
-                   handler);
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
@@ -828,6 +829,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_EFER:
                set_efer(vcpu, data);
                break;
+       case MSR_K7_HWCR:
+               data &= ~(u64)0x40;     /* ignore flush filter disable */
+               if (data != 0) {
+                       pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+                               data);
+                       return 1;
+               }
+               break;
+       case MSR_FAM10H_MMIO_CONF_BASE:
+               if (data != 0) {
+                       pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+                               "0x%llx\n", data);
+                       return 1;
+               }
+               break;
+       case MSR_AMD64_NB_CFG:
+               break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!data) {
                        /* We support the non-activated case already */
@@ -843,12 +861,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_UCODE_WRITE:
        case MSR_VM_HSAVE_PA:
+       case MSR_AMD64_PATCH_LOADER:
                break;
        case 0x200 ... 0x2ff:
                return set_msr_mtrr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
                kvm_set_apic_base(vcpu, data);
                break;
+       case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+               return kvm_x2apic_msr_write(vcpu, msr, data);
        case MSR_IA32_MISC_ENABLE:
                vcpu->arch.ia32_misc_enable_msr = data;
                break;
@@ -886,9 +907,46 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
                return set_msr_mce(vcpu, msr, data);
+
+       /* Performance counters are not protected by a CPUID bit,
+        * so we should check all of them in the generic path for the sake of
+        * cross vendor migration.
+        * Writing a zero into the event select MSRs disables them,
+        * which we perfectly emulate ;-). Any other value should be at least
+        * reported, some guests depend on them.
+        */
+       case MSR_P6_EVNTSEL0:
+       case MSR_P6_EVNTSEL1:
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               if (data != 0)
+                       pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                               "0x%x data 0x%llx\n", msr, data);
+               break;
+       /* At least RHEL 4 unconditionally writes to the perfctr registers,
+        * so we ignore the writes to keep it happy.
+        */
+       case MSR_P6_PERFCTR0:
+       case MSR_P6_PERFCTR1:
+       case MSR_K7_PERFCTR0:
+       case MSR_K7_PERFCTR1:
+       case MSR_K7_PERFCTR2:
+       case MSR_K7_PERFCTR3:
+               pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                       "0x%x data 0x%llx\n", msr, data);
+               break;
        default:
-               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
-               return 1;
+               if (!ignore_msrs) {
+                       pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+                               msr, data);
+                       return 1;
+               } else {
+                       pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+                               msr, data);
+                       break;
+               }
        }
        return 0;
 }
@@ -995,6 +1053,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_P6_EVNTSEL0:
        case MSR_P6_EVNTSEL1:
        case MSR_K7_EVNTSEL0:
+       case MSR_K8_INT_PENDING_MSG:
+       case MSR_AMD64_NB_CFG:
+       case MSR_FAM10H_MMIO_CONF_BASE:
                data = 0;
                break;
        case MSR_MTRRcap:
@@ -1008,6 +1069,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_APICBASE:
                data = kvm_get_apic_base(vcpu);
                break;
+       case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+               return kvm_x2apic_msr_read(vcpu, msr, pdata);
+               break;
        case MSR_IA32_MISC_ENABLE:
                data = vcpu->arch.ia32_misc_enable_msr;
                break;
@@ -1034,8 +1098,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
                return get_msr_mce(vcpu, msr, pdata);
        default:
-               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
-               return 1;
+               if (!ignore_msrs) {
+                       pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+                       return 1;
+               } else {
+                       pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+                       data = 0;
+               }
+               break;
        }
        *pdata = data;
        return 0;
@@ -1135,7 +1205,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IRQ_INJECT_STATUS:
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_IRQFD:
+       case KVM_CAP_IOEVENTFD:
        case KVM_CAP_PIT2:
+       case KVM_CAP_PIT_STATE2:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -1308,6 +1380,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
        vcpu->arch.cpuid_nent = cpuid->nent;
        cpuid_fix_nx_cap(vcpu);
        r = 0;
+       kvm_apic_set_version(vcpu);
 
 out_free:
        vfree(cpuid_entries);
@@ -1329,6 +1402,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
                           cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
                goto out;
        vcpu->arch.cpuid_nent = cpuid->nent;
+       kvm_apic_set_version(vcpu);
        return 0;
 
 out:
@@ -1404,7 +1478,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
                0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
                0 /* Reserved, DCA */ | F(XMM4_1) |
-               F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+               F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
                0 /* Reserved, XSAVE, OSXSAVE */;
        /* cpuid 0x80000001.ecx */
        const u32 kvm_supported_word6_x86_features =
@@ -1425,6 +1499,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        case 1:
                entry->edx &= kvm_supported_word0_x86_features;
                entry->ecx &= kvm_supported_word4_x86_features;
+               /* we support x2apic emulation even if host does not support
+                * it since we emulate x2apic in software */
+               entry->ecx |= F(X2APIC);
                break;
        /* function 2 entries are STATEFUL. That is, repeated cpuid commands
         * may return different values. This forces us to get_cpu() before
@@ -1952,19 +2029,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
        r = 0;
        switch (chip->chip_id) {
        case KVM_IRQCHIP_PIC_MASTER:
+               spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[0],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
+               spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_PIC_SLAVE:
+               spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[1],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
+               spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_IOAPIC:
+               mutex_lock(&kvm->irq_lock);
                memcpy(ioapic_irqchip(kvm),
                        &chip->chip.ioapic,
                        sizeof(struct kvm_ioapic_state));
+               mutex_unlock(&kvm->irq_lock);
                break;
        default:
                r = -EINVAL;
@@ -1978,7 +2061,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
        int r = 0;
 
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return r;
 }
 
@@ -1986,8 +2071,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
        int r = 0;
 
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
-       kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+       kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       return r;
+}
+
+/*
+ * Copy the in-kernel PIT channel state and flags into @ps while holding
+ * the PIT state lock.  Always returns 0.
+ */
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
+       ps->flags = kvm->arch.vpit->pit_state.flags;
+       memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+              sizeof(ps->channels));
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+
+       return 0;
+}
+
+/*
+ * Install the PIT channel state and flags from @ps under the PIT state
+ * lock, then reload channel 0's count.  The last argument passed to
+ * kvm_pit_load_count() is 1 only when KVM_PIT_FLAGS_HPET_LEGACY goes from
+ * clear (old state) to set (new state).  Always returns 0.
+ */
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+       u32 was_legacy, now_legacy;
+       int legacy_start;
+
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
+
+       /* sample the old flag before the state is overwritten below */
+       was_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       now_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       legacy_start = (!was_legacy && now_legacy) ? 1 : 0;
+
+       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
+              sizeof(kvm->arch.vpit->pit_state.channels));
+       kvm->arch.vpit->pit_state.flags = ps->flags;
+       kvm_pit_load_count(kvm, 0,
+                          kvm->arch.vpit->pit_state.channels[0].count,
+                          legacy_start);
+
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       return 0;
+}
 
@@ -1996,7 +2112,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 {
        if (!kvm->arch.vpit)
                return -ENXIO;
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return 0;
 }
 
@@ -2046,6 +2164,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
         */
        union {
                struct kvm_pit_state ps;
+               struct kvm_pit_state2 ps2;
                struct kvm_memory_alias alias;
                struct kvm_pit_config pit_config;
        } u;
@@ -2116,7 +2235,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               mutex_lock(&kvm->lock);
+               down_write(&kvm->slots_lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
@@ -2125,7 +2244,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->lock);
+               up_write(&kvm->slots_lock);
                break;
        case KVM_IRQ_LINE_STATUS:
        case KVM_IRQ_LINE: {
@@ -2228,6 +2347,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_GET_PIT2: {
+               r = -ENXIO;
+               if (!kvm->arch.vpit)
+                       goto out;
+               r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_PIT2: {
+               r = -EFAULT;
+               if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+                       goto out;
+               r = -ENXIO;
+               if (!kvm->arch.vpit)
+                       goto out;
+               r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
        case KVM_REINJECT_CONTROL: {
                struct kvm_reinject_control control;
                r =  -EFAULT;
@@ -2261,35 +2406,23 @@ static void kvm_init_msr_list(void)
        num_msrs_to_save = j;
 }
 
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr, int len,
-                                               int is_write)
+/*
+ * Try to complete an MMIO write inside the kernel.  The vcpu's own APIC
+ * device (vcpu->arch.apic), when present, is tried first; if it is absent
+ * or kvm_iodevice_write() returns non-zero, the write is passed on to the
+ * VM-wide MMIO bus.  Returns 0 when the APIC device took the write,
+ * otherwise the kvm_io_bus_write() result.
+ */
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+                          const void *v)
 {
-       struct kvm_io_device *dev;
+       if (vcpu->arch.apic &&
+           !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+               return 0;
 
-       if (vcpu->arch.apic) {
-               dev = &vcpu->arch.apic->dev;
-               if (kvm_iodevice_in_range(dev, addr, len, is_write))
-                       return dev;
-       }
-       return NULL;
+       return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr, int len,
-                                               int is_write)
+/*
+ * Try to complete an MMIO read inside the kernel.  The vcpu's own APIC
+ * device (vcpu->arch.apic), when present, is tried first; if it is absent
+ * or kvm_iodevice_read() returns non-zero, the read is passed on to the
+ * VM-wide MMIO bus.  Returns 0 when the APIC device served the read,
+ * otherwise the kvm_io_bus_read() result.
+ */
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 {
-       struct kvm_io_device *dev;
+       if (vcpu->arch.apic &&
+           !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+               return 0;
 
-       dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
-       if (dev == NULL)
-               dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
-                                         is_write);
-       return dev;
+       return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@ -2358,11 +2491,12 @@ static int emulator_read_emulated(unsigned long addr,
                                  unsigned int bytes,
                                  struct kvm_vcpu *vcpu)
 {
-       struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
 
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+                              vcpu->mmio_phys_addr, *(u64 *)val);
                vcpu->mmio_read_completed = 0;
                return X86EMUL_CONTINUE;
        }
@@ -2383,14 +2517,13 @@ mmio:
        /*
         * Is this MMIO handled locally?
         */
-       mutex_lock(&vcpu->kvm->lock);
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
-       mutex_unlock(&vcpu->kvm->lock);
-       if (mmio_dev) {
-               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+       if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
                return X86EMUL_CONTINUE;
        }
 
+       trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
        vcpu->mmio_size = bytes;
@@ -2416,7 +2549,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                                           unsigned int bytes,
                                           struct kvm_vcpu *vcpu)
 {
-       struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
 
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
@@ -2434,16 +2566,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                return X86EMUL_CONTINUE;
 
 mmio:
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
        /*
         * Is this MMIO handled locally?
         */
-       mutex_lock(&vcpu->kvm->lock);
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
-       mutex_unlock(&vcpu->kvm->lock);
-       if (mmio_dev) {
-               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+       if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
                return X86EMUL_CONTINUE;
-       }
 
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
@@ -2532,7 +2660,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 
+/*
+ * Emulate CLTS: clear the TS bit of the vcpu's CR0 through the arch
+ * set_cr0 hook.  Always returns X86EMUL_CONTINUE.
+ */
 int emulate_clts(struct kvm_vcpu *vcpu)
 {
-       KVMTRACE_0D(CLTS, vcpu, handler);
        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
        return X86EMUL_CONTINUE;
 }
@@ -2633,14 +2760,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
-               /* Reject the instructions other than VMCALL/VMMCALL when
-                * try to emulate invalid opcode */
+               /* Only allow emulation of specific instructions on #UD
+                * (namely VMMCALL, sysenter, sysexit, syscall) */
                c = &vcpu->arch.emulate_ctxt.decode;
-               if ((emulation_type & EMULTYPE_TRAP_UD) &&
-                   (!(c->twobyte && c->b == 0x01 &&
-                     (c->modrm_reg == 0 || c->modrm_reg == 3) &&
-                      c->modrm_mod == 3 && c->modrm_rm == 1)))
-                       return EMULATE_FAIL;
+               if (emulation_type & EMULTYPE_TRAP_UD) {
+                       if (!c->twobyte)
+                               return EMULATE_FAIL;
+                       switch (c->b) {
+                       case 0x01: /* VMMCALL */
+                               if (c->modrm_mod != 3 || c->modrm_rm != 1)
+                                       return EMULATE_FAIL;
+                               break;
+                       case 0x34: /* sysenter */
+                       case 0x35: /* sysexit */
+                               if (c->modrm_mod != 0 || c->modrm_rm != 0)
+                                       return EMULATE_FAIL;
+                               break;
+                       case 0x05: /* syscall */
+                               if (c->modrm_mod != 0 || c->modrm_rm != 0)
+                                       return EMULATE_FAIL;
+                               break;
+                       default:
+                               return EMULATE_FAIL;
+                       }
+
+                       if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+                               return EMULATE_FAIL;
+               }
 
                ++vcpu->stat.insn_emulation;
                if (r)  {
@@ -2760,48 +2906,40 @@ int complete_pio(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kernel_pio(struct kvm_io_device *pio_dev,
-                      struct kvm_vcpu *vcpu,
-                      void *pd)
+/*
+ * Hand the vcpu's pending port I/O access (vcpu->arch.pio) to the in-kernel
+ * PIO bus: a bus read when pio.in is set, a bus write otherwise, using @pd
+ * as the data buffer.  Returns the kvm_io_bus_read()/kvm_io_bus_write()
+ * result; kvm_emulate_pio() treats 0 as "handled in the kernel".
+ */
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
        /* TODO: String I/O for in kernel device */
+       int r;
 
        if (vcpu->arch.pio.in)
-               kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
-                                 vcpu->arch.pio.size,
-                                 pd);
+               r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+                                   vcpu->arch.pio.size, pd);
        else
-               kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
-                                  vcpu->arch.pio.size,
-                                  pd);
+               r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+                                    vcpu->arch.pio.size, pd);
+       return r;
 }
 
-static void pio_string_write(struct kvm_io_device *pio_dev,
-                            struct kvm_vcpu *vcpu)
+/*
+ * Write io->cur_count items of io->size bytes each, taken consecutively
+ * from vcpu->arch.pio_data, to io->port on the in-kernel PIO bus.  Stops at
+ * the first item for which kvm_io_bus_write() returns non-zero and returns
+ * -EOPNOTSUPP; returns 0 when every item was written.
+ */
+static int pio_string_write(struct kvm_vcpu *vcpu)
 {
        struct kvm_pio_request *io = &vcpu->arch.pio;
        void *pd = vcpu->arch.pio_data;
-       int i;
+       int i, r = 0;
 
        for (i = 0; i < io->cur_count; i++) {
-               kvm_iodevice_write(pio_dev, io->port,
-                                  io->size,
-                                  pd);
+               if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+                                    io->port, io->size, pd)) {
+                       r = -EOPNOTSUPP;
+                       break;
+               }
                pd += io->size;
        }
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-                                              gpa_t addr, int len,
-                                              int is_write)
-{
-       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+       return r;
 }
 
 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                  int size, unsigned port)
 {
-       struct kvm_io_device *pio_dev;
        unsigned long val;
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2815,21 +2953,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        vcpu->arch.pio.down = 0;
        vcpu->arch.pio.rep = 0;
 
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, 1);
 
        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
        memcpy(vcpu->arch.pio_data, &val, 4);
 
-       mutex_lock(&vcpu->kvm->lock);
-       pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
-       mutex_unlock(&vcpu->kvm->lock);
-       if (pio_dev) {
-               kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+       if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
                complete_pio(vcpu);
                return 1;
        }
@@ -2843,7 +2973,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 {
        unsigned now, in_page;
        int ret = 0;
-       struct kvm_io_device *pio_dev;
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2856,12 +2985,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        vcpu->arch.pio.down = down;
        vcpu->arch.pio.rep = rep;
 
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, count);
 
        if (!count) {
                kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2891,12 +3016,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
        vcpu->arch.pio.guest_gva = address;
 
-       mutex_lock(&vcpu->kvm->lock);
-       pio_dev = vcpu_find_pio_dev(vcpu, port,
-                                   vcpu->arch.pio.cur_count,
-                                   !vcpu->arch.pio.in);
-       mutex_unlock(&vcpu->kvm->lock);
-
        if (!vcpu->arch.pio.in) {
                /* string PIO write */
                ret = pio_copy_data(vcpu);
@@ -2904,16 +3023,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                        kvm_inject_gp(vcpu, 0);
                        return 1;
                }
-               if (ret == 0 && pio_dev) {
-                       pio_string_write(pio_dev, vcpu);
+               if (ret == 0 && !pio_string_write(vcpu)) {
                        complete_pio(vcpu);
                        if (vcpu->arch.pio.count == 0)
                                ret = 1;
                }
-       } else if (pio_dev)
-               pr_unimpl(vcpu, "no string pio read support yet, "
-                      "port %x size %d count %ld\n",
-                       port, size, count);
+       }
+       /* no string PIO read support yet */
 
        return ret;
 }
@@ -2946,10 +3062,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = kvm->vcpus[i];
-                       if (!vcpu)
-                               continue;
+               kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (vcpu->cpu != freq->cpu)
                                continue;
                        if (!kvm_request_guest_time_update(vcpu))
@@ -3042,7 +3155,6 @@ void kvm_arch_exit(void)
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.halt_exits;
-       KVMTRACE_0D(HLT, vcpu, handler);
        if (irqchip_in_kernel(vcpu->kvm)) {
                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
                return 1;
@@ -3073,7 +3185,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
        a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
 
-       KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+       trace_kvm_hypercall(nr, a0, a1, a2, a3);
 
        if (!is_long_mode(vcpu)) {
                nr &= 0xFFFFFFFF;
@@ -3173,8 +3285,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
                return 0;
        }
-       KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
-                   (u32)((u64)value >> 32), handler);
 
        return value;
 }
@@ -3182,9 +3292,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
                     unsigned long *rflags)
 {
-       KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
-                   (u32)((u64)val >> 32), handler);
-
        switch (cr) {
        case 0:
                kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -3294,11 +3401,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
                kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
        }
        kvm_x86_ops->skip_emulated_instruction(vcpu);
-       KVMTRACE_5D(CPUID, vcpu, function,
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+       trace_kvm_cpuid(function,
+                       kvm_register_read(vcpu, VCPU_REGS_RAX),
+                       kvm_register_read(vcpu, VCPU_REGS_RBX),
+                       kvm_register_read(vcpu, VCPU_REGS_RCX),
+                       kvm_register_read(vcpu, VCPU_REGS_RDX));
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 
@@ -3377,9 +3484,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        /* try to reinject previous events if any */
+       if (vcpu->arch.exception.pending) {
+               kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+                                         vcpu->arch.exception.has_error_code,
+                                         vcpu->arch.exception.error_code);
+               return;
+       }
+
        if (vcpu->arch.nmi_injected) {
                kvm_x86_ops->set_nmi(vcpu);
                return;
@@ -3453,16 +3567,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        smp_mb__after_clear_bit();
 
        if (vcpu->requests || need_resched() || signal_pending(current)) {
+               set_bit(KVM_REQ_KICK, &vcpu->requests);
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }
 
-       if (vcpu->arch.exception.pending)
-               __queue_exception(vcpu);
-       else
-               inject_pending_irq(vcpu, kvm_run);
+       inject_pending_event(vcpu, kvm_run);
 
        /* enable NMI/IRQ window open exits if needed */
        if (vcpu->arch.nmi_pending)
@@ -3494,7 +3606,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                set_debugreg(vcpu->arch.eff_db[3], 3);
        }
 
-       KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+       trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu, kvm_run);
 
        if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -4281,13 +4393,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
        vcpu->arch.cr2 = sregs->cr2;
        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-
-       down_read(&vcpu->kvm->slots_lock);
-       if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
-               vcpu->arch.cr3 = sregs->cr3;
-       else
-               set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
-       up_read(&vcpu->kvm->slots_lock);
+       vcpu->arch.cr3 = sregs->cr3;
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
@@ -4678,20 +4784,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+/*
+ * Tear down every vcpu of @kvm in three passes: unload each vcpu's MMU,
+ * free each vcpu via kvm_arch_vcpu_free(), and finally, under kvm->lock,
+ * clear the kvm->vcpus[] slots and reset online_vcpus to 0.
+ */
 static void kvm_free_vcpus(struct kvm *kvm)
 {
        unsigned int i;
+       struct kvm_vcpu *vcpu;
 
        /*
         * Unpin any mmu pages first.
         */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_arch_vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_unload_vcpu_mmu(vcpu);
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arch_vcpu_free(vcpu);
+
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+               kvm->vcpus[i] = NULL;
 
+       atomic_set(&kvm->online_vcpus, 0);
+       mutex_unlock(&kvm->lock);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -4782,8 +4890,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
+/*
+ * Report whether @vcpu has something to run.  Non-zero when its mp_state is
+ * RUNNABLE or SIPI_RECEIVED, when an NMI is pending, or when
+ * kvm_cpu_has_interrupt() reports an interrupt and
+ * kvm_arch_interrupt_allowed() says it may currently be delivered.
+ */
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-              || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-              || vcpu->arch.nmi_pending;
+               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+               || vcpu->arch.nmi_pending ||
+               (kvm_arch_interrupt_allowed(vcpu) &&
+                kvm_cpu_has_interrupt(vcpu));
 }
 
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -4807,3 +4917,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
        return kvm_x86_ops->interrupt_allowed(vcpu);
 }
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);