Merge branch 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 8 Dec 2009 16:02:38 +0000 (08:02 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 8 Dec 2009 16:02:38 +0000 (08:02 -0800)
* 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (84 commits)
  KVM: VMX: Fix comparison of guest efer with stale host value
  KVM: s390: Fix prefix register checking in arch/s390/kvm/sigp.c
  KVM: Drop user return notifier when disabling virtualization on a cpu
  KVM: VMX: Disable unrestricted guest when EPT disabled
  KVM: x86 emulator: limit instructions to 15 bytes
  KVM: s390: Make psw available on all exits, not just a subset
  KVM: x86: Add KVM_GET/SET_VCPU_EVENTS
  KVM: VMX: Report unexpected simultaneous exceptions as internal errors
  KVM: Allow internal errors reported to userspace to carry extra data
  KVM: Reorder IOCTLs in main kvm.h
  KVM: x86: Polish exception injection via KVM_SET_GUEST_DEBUG
  KVM: only clear irq_source_id if irqchip is present
  KVM: x86: disallow KVM_{SET,GET}_LAPIC without allocated in-kernel lapic
  KVM: x86: disallow multiple KVM_CREATE_IRQCHIP
  KVM: VMX: Remove vmx->msr_offset_efer
  KVM: MMU: update invlpg handler comment
  KVM: VMX: move CR3/PDPTR update to vmx_set_cr3
  KVM: remove duplicated task_switch check
  KVM: powerpc: Fix BUILD_BUG_ON condition
  KVM: VMX: Use shared msr infrastructure
  ...

Trivial conflicts due to new Kconfig options in arch/Kconfig and kernel/Makefile

45 files changed:
Documentation/kvm/api.txt
arch/Kconfig
arch/ia64/include/asm/kvm.h
arch/ia64/include/asm/kvm_host.h
arch/ia64/kvm/Makefile
arch/ia64/kvm/kvm-ia64.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/timing.h
arch/s390/include/asm/kvm.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/sigp.c
arch/x86/Kconfig
arch/x86/include/asm/kvm.h
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/svm.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/vmx.h
arch/x86/kernel/process.c
arch/x86/kernel/signal.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/emulate.c
arch/x86/kvm/i8254.c
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm.h
include/linux/kvm_host.h
include/linux/user-return-notifier.h [new file with mode: 0644]
kernel/Makefile
kernel/fork.c
kernel/user-return-notifier.c [new file with mode: 0644]
virt/kvm/assigned-dev.c [new file with mode: 0644]
virt/kvm/eventfd.c
virt/kvm/ioapic.c
virt/kvm/ioapic.h
virt/kvm/irq_comm.c
virt/kvm/kvm_main.c

index 5a4bc8c..e1a1141 100644 (file)
@@ -593,6 +593,115 @@ struct kvm_irqchip {
        } chip;
 };
 
+4.27 KVM_XEN_HVM_CONFIG
+
+Capability: KVM_CAP_XEN_HVM
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_xen_hvm_config (in)
+Returns: 0 on success, -1 on error
+
+Sets the MSR that the Xen HVM guest uses to initialize its hypercall
+page, and provides the starting address and size of the hypercall
+blobs in userspace.  When the guest writes the MSR, kvm copies one
+page of a blob (32- or 64-bit, depending on the vcpu mode) to guest
+memory.
+
+struct kvm_xen_hvm_config {
+       __u32 flags;
+       __u32 msr;
+       __u64 blob_addr_32;
+       __u64 blob_addr_64;
+       __u8 blob_size_32;
+       __u8 blob_size_64;
+       __u8 pad2[30];
+};
+
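A rough userspace sketch of driving this ioctl (assuming <linux/kvm.h> and
<sys/ioctl.h> are included); vm_fd, the two blob buffers and MSR 0x40000000
(the value Xen guests conventionally probe) are assumptions of the sketch,
not something defined by this series:

struct kvm_xen_hvm_config cfg = {
        .msr          = 0x40000000,
        .blob_addr_32 = (__u64)(unsigned long)blob_32,
        .blob_addr_64 = (__u64)(unsigned long)blob_64,
        .blob_size_32 = 1,      /* blob sizes are in pages */
        .blob_size_64 = 1,
};

if (ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg) < 0)
        perror("KVM_XEN_HVM_CONFIG");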
+4.28 KVM_GET_CLOCK
+
+Capability: KVM_CAP_ADJUST_CLOCK
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_clock_data (out)
+Returns: 0 on success, -1 on error
+
+Gets the current timestamp of kvmclock as seen by the current guest. In
+conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity in scenarios
+such as migration.
+
+struct kvm_clock_data {
+       __u64 clock;  /* kvmclock current value */
+       __u32 flags;
+       __u32 pad[9];
+};
+
+4.29 KVM_SET_CLOCK
+
+Capability: KVM_CAP_ADJUST_CLOCK
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_clock_data (in)
+Returns: 0 on success, -1 on error
+
+Sets the current timestamp of kvmclock to the value specified in its parameter.
+In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity in
+scenarios such as migration.
+
+struct kvm_clock_data {
+       __u64 clock;  /* kvmclock current value */
+       __u32 flags;
+       __u32 pad[9];
+};
+
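Taken together, the two clock ioctls let a management tool carry the kvmclock
value from the source to the destination during migration.  A minimal sketch,
assuming src_vm_fd/dst_vm_fd are VM file descriptors and clearing flags since
this series defines no flag values:

struct kvm_clock_data data;

if (ioctl(src_vm_fd, KVM_GET_CLOCK, &data) < 0)    /* on the source */
        perror("KVM_GET_CLOCK");

data.flags = 0;
if (ioctl(dst_vm_fd, KVM_SET_CLOCK, &data) < 0)    /* on the destination */
        perror("KVM_SET_CLOCK");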
+4.30 KVM_GET_VCPU_EVENTS
+
+Capability: KVM_CAP_VCPU_EVENTS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_vcpu_events (out)
+Returns: 0 on success, -1 on error
+
+Gets currently pending exceptions, interrupts, and NMIs as well as related
+states of the vcpu.
+
+struct kvm_vcpu_events {
+       struct {
+               __u8 injected;
+               __u8 nr;
+               __u8 has_error_code;
+               __u8 pad;
+               __u32 error_code;
+       } exception;
+       struct {
+               __u8 injected;
+               __u8 nr;
+               __u8 soft;
+               __u8 pad;
+       } interrupt;
+       struct {
+               __u8 injected;
+               __u8 pending;
+               __u8 masked;
+               __u8 pad;
+       } nmi;
+       __u32 sipi_vector;
+       __u32 flags;   /* must be zero */
+};
+
+4.31 KVM_SET_VCPU_EVENTS
+
+Capability: KVM_CAP_VCPU_EVENTS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_vcpu_events (in)
+Returns: 0 on success, -1 on error
+
+Sets pending exceptions, interrupts, and NMIs as well as related states of the
+vcpu.
+
+See KVM_GET_VCPU_EVENTS for the data structure.
+
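A minimal save/restore sketch for these two ioctls, assuming vcpu_fd is the
vcpu file descriptor (they are per-vcpu ioctls) and that userspace does not
modify the event state in between:

struct kvm_vcpu_events events;

if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
        perror("KVM_GET_VCPU_EVENTS");

/* ... transfer 'events' to the destination vcpu ... */

events.flags = 0;       /* must be zero */
if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
        perror("KVM_SET_VCPU_EVENTS");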
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
index eef3bbb..d828758 100644 (file)
@@ -83,6 +83,13 @@ config KRETPROBES
        def_bool y
        depends on KPROBES && HAVE_KRETPROBES
 
+config USER_RETURN_NOTIFIER
+       bool
+       depends on HAVE_USER_RETURN_NOTIFIER
+       help
+         Provide a kernel-internal notification when a cpu is about to
+         switch to user mode.
+
 config HAVE_IOREMAP_PROT
        bool
 
@@ -132,5 +139,7 @@ config HAVE_HW_BREAKPOINT
        select ANON_INODES
        select PERF_EVENTS
 
+config HAVE_USER_RETURN_NOTIFIER
+       bool
 
 source "kernel/gcov/Kconfig"
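For reference, a sketch of how a consumer might use the interface behind
USER_RETURN_NOTIFIER (see include/linux/user-return-notifier.h added by this
series); the callback and registration site below are illustrative, not part
of the patch:

#include <linux/user-return-notifier.h>

static void restore_host_msrs(struct user_return_notifier *urn)
{
        /* undo per-cpu MSR changes before this cpu returns to user mode */
}

static struct user_return_notifier msr_urn = {
        .on_user_return = restore_host_msrs,
};

/* register on the current cpu; do_notify_resume() fires the notifiers via
 * fire_user_return_notifiers() when TIF_USER_RETURN_NOTIFY is set */
user_return_notifier_register(&msr_urn);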
index 18a7e49..bc90c75 100644 (file)
@@ -60,6 +60,7 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_PIC_MASTER   0
 #define KVM_IRQCHIP_PIC_SLAVE    1
 #define KVM_IRQCHIP_IOAPIC       2
+#define KVM_NR_IRQCHIPS          3
 
 #define KVM_CONTEXT_SIZE       8*1024
 
index d9b6325..a362e67 100644 (file)
@@ -475,7 +475,6 @@ struct kvm_arch {
        struct list_head assigned_dev_head;
        struct iommu_domain *iommu_domain;
        int iommu_flags;
-       struct hlist_head irq_ack_notifier_list;
 
        unsigned long irq_sources_bitmap;
        unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
index 0bb99b7..1089b3e 100644 (file)
@@ -49,7 +49,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-               coalesced_mmio.o irq_comm.o)
+               coalesced_mmio.o irq_comm.o assigned-dev.o)
 
 ifeq ($(CONFIG_IOMMU_API),y)
 common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
index 0ad09f0..5fdeec5 100644 (file)
@@ -124,7 +124,7 @@ long ia64_pal_vp_create(u64 *vpd, u64 *host_iva, u64 *opt_handler)
 
 static  DEFINE_SPINLOCK(vp_lock);
 
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
        long  status;
        long  tmp_base;
@@ -137,7 +137,7 @@ void kvm_arch_hardware_enable(void *garbage)
        slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
        local_irq_restore(saved_psr);
        if (slot < 0)
-               return;
+               return -EINVAL;
 
        spin_lock(&vp_lock);
        status = ia64_pal_vp_init_env(kvm_vsa_base ?
@@ -145,7 +145,7 @@ void kvm_arch_hardware_enable(void *garbage)
                        __pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base);
        if (status != 0) {
                printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n");
-               return ;
+               return -EINVAL;
        }
 
        if (!kvm_vsa_base) {
@@ -154,6 +154,8 @@ void kvm_arch_hardware_enable(void *garbage)
        }
        spin_unlock(&vp_lock);
        ia64_ptr_entry(0x3, slot);
+
+       return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
@@ -851,8 +853,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
        r = 0;
        switch (chip->chip_id) {
        case KVM_IRQCHIP_IOAPIC:
-               memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm),
-                               sizeof(struct kvm_ioapic_state));
+               r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
                break;
        default:
                r = -EINVAL;
@@ -868,9 +869,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
        r = 0;
        switch (chip->chip_id) {
        case KVM_IRQCHIP_IOAPIC:
-               memcpy(ioapic_irqchip(kvm),
-                               &chip->chip.ioapic,
-                               sizeof(struct kvm_ioapic_state));
+               r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
                break;
        default:
                r = -EINVAL;
@@ -944,7 +943,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 {
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
-       int r = -EINVAL;
+       int r = -ENOTTY;
 
        switch (ioctl) {
        case KVM_SET_MEMORY_REGION: {
@@ -985,10 +984,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
                if (irqchip_in_kernel(kvm)) {
                        __s32 status;
-                       mutex_lock(&kvm->irq_lock);
                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                    irq_event.irq, irq_event.level);
-                       mutex_unlock(&kvm->irq_lock);
                        if (ioctl == KVM_IRQ_LINE_STATUS) {
                                irq_event.status = status;
                                if (copy_to_user(argp, &irq_event,
index 2a4551f..5902bbc 100644 (file)
@@ -78,8 +78,9 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
        return r;
 }
 
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
+       return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
@@ -421,7 +422,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
        switch (ioctl) {
        default:
-               r = -EINVAL;
+               r = -ENOTTY;
        }
 
        return r;
index 806ef67..8167d42 100644 (file)
@@ -51,7 +51,7 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
 
        /* The BUILD_BUG_ON below breaks in funny ways, commented out
         * for now ... -BenH
-       BUILD_BUG_ON(__builtin_constant_p(type));
+       BUILD_BUG_ON(!__builtin_constant_p(type));
        */
        switch (type) {
        case EXT_INTR_EXITS:
index 3dfcaeb..82b32a1 100644 (file)
@@ -1,6 +1,5 @@
 #ifndef __LINUX_KVM_S390_H
 #define __LINUX_KVM_S390_H
-
 /*
  * asm-s390/kvm.h - KVM s390 specific structures and definitions
  *
@@ -15,6 +14,8 @@
  */
 #include <linux/types.h>
 
+#define __KVM_S390
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
        /* general purpose regs for s390 */
index 07ced89..f8bcaef 100644 (file)
@@ -74,9 +74,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 static unsigned long long *facilities;
 
 /* Section: not file related */
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
        /* every s390 is virtualization enabled ;-) */
+       return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
@@ -116,10 +117,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
 
 int kvm_dev_ioctl_check_extension(long ext)
 {
+       int r;
+
        switch (ext) {
+       case KVM_CAP_S390_PSW:
+               r = 1;
+               break;
        default:
-               return 0;
+               r = 0;
        }
+       return r;
 }
 
 /* Section: vm related */
@@ -150,7 +157,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                break;
        }
        default:
-               r = -EINVAL;
+               r = -ENOTTY;
        }
 
        return r;
@@ -419,8 +426,10 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
        vcpu_load(vcpu);
        if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
                rc = -EBUSY;
-       else
-               vcpu->arch.sie_block->gpsw = psw;
+       else {
+               vcpu->run->psw_mask = psw.mask;
+               vcpu->run->psw_addr = psw.addr;
+       }
        vcpu_put(vcpu);
        return rc;
 }
@@ -508,9 +517,6 @@ rerun_vcpu:
 
        switch (kvm_run->exit_reason) {
        case KVM_EXIT_S390_SIEIC:
-               vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask;
-               vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
-               break;
        case KVM_EXIT_UNKNOWN:
        case KVM_EXIT_INTR:
        case KVM_EXIT_S390_RESET:
@@ -519,6 +525,9 @@ rerun_vcpu:
                BUG();
        }
 
+       vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
+       vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
+
        might_fault();
 
        do {
@@ -538,8 +547,6 @@ rerun_vcpu:
                /* intercept cannot be handled in-kernel, prepare kvm-run */
                kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
                kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode;
-               kvm_run->s390_sieic.mask     = vcpu->arch.sie_block->gpsw.mask;
-               kvm_run->s390_sieic.addr     = vcpu->arch.sie_block->gpsw.addr;
                kvm_run->s390_sieic.ipa      = vcpu->arch.sie_block->ipa;
                kvm_run->s390_sieic.ipb      = vcpu->arch.sie_block->ipb;
                rc = 0;
@@ -551,6 +558,9 @@ rerun_vcpu:
                rc = 0;
        }
 
+       kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
+       kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
+
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
index 40c8c67..15ee111 100644 (file)
@@ -188,9 +188,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 
        /* make sure that the new value is valid memory */
        address = address & 0x7fffe000u;
-       if ((copy_from_guest(vcpu, &tmp,
-               (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) ||
-          (copy_from_guest(vcpu, &tmp, (u64) (address +
+       if ((copy_from_user(&tmp, (void __user *)
+               (address + vcpu->arch.sie_block->gmsor) , 1)) ||
+          (copy_from_user(&tmp, (void __user *)(address +
                        vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) {
                *reg |= SIGP_STAT_INVALID_PARAMETER;
                return 1; /* invalid parameter */
index 178084b..1b2182b 100644 (file)
@@ -51,6 +51,7 @@ config X86
        select HAVE_KERNEL_LZMA
        select HAVE_HW_BREAKPOINT
        select HAVE_ARCH_KMEMCHECK
+       select HAVE_USER_RETURN_NOTIFIER
 
 config OUTPUT_FORMAT
        string
index 4a5fe91..950df43 100644 (file)
@@ -19,6 +19,8 @@
 #define __KVM_HAVE_MSIX
 #define __KVM_HAVE_MCE
 #define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_PIC_MASTER   0
 #define KVM_IRQCHIP_PIC_SLAVE    1
 #define KVM_IRQCHIP_IOAPIC       2
+#define KVM_NR_IRQCHIPS          3
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
@@ -250,4 +253,31 @@ struct kvm_reinject_control {
        __u8 pit_reinject;
        __u8 reserved[31];
 };
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+       struct {
+               __u8 injected;
+               __u8 nr;
+               __u8 has_error_code;
+               __u8 pad;
+               __u32 error_code;
+       } exception;
+       struct {
+               __u8 injected;
+               __u8 nr;
+               __u8 soft;
+               __u8 pad;
+       } interrupt;
+       struct {
+               __u8 injected;
+               __u8 pending;
+               __u8 masked;
+               __u8 pad;
+       } nmi;
+       __u32 sipi_vector;
+       __u32 flags;
+       __u32 reserved[10];
+};
+
 #endif /* _ASM_X86_KVM_H */
index b7ed2c4..7c18e12 100644 (file)
@@ -129,7 +129,7 @@ struct decode_cache {
        u8 seg_override;
        unsigned int d;
        unsigned long regs[NR_VCPU_REGS];
-       unsigned long eip;
+       unsigned long eip, eip_orig;
        /* modrm */
        u8 modrm;
        u8 modrm_mod;
index d838922..4f865e8 100644 (file)
@@ -354,7 +354,6 @@ struct kvm_vcpu_arch {
        unsigned int time_offset;
        struct page *time_page;
 
-       bool singlestep; /* guest is single stepped by KVM */
        bool nmi_pending;
        bool nmi_injected;
 
@@ -371,6 +370,10 @@ struct kvm_vcpu_arch {
        u64 mcg_status;
        u64 mcg_ctl;
        u64 *mce_banks;
+
+       /* used for guest single stepping over the given code position */
+       u16 singlestep_cs;
+       unsigned long singlestep_rip;
 };
 
 struct kvm_mem_alias {
@@ -397,7 +400,6 @@ struct kvm_arch{
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
        struct kvm_pit *vpit;
-       struct hlist_head irq_ack_notifier_list;
        int vapics_in_nmi_mode;
 
        unsigned int tss_addr;
@@ -410,8 +412,10 @@ struct kvm_arch{
        gpa_t ept_identity_map_addr;
 
        unsigned long irq_sources_bitmap;
-       unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
        u64 vm_init_tsc;
+       s64 kvmclock_offset;
+
+       struct kvm_xen_hvm_config xen_hvm_config;
 };
 
 struct kvm_vm_stat {
@@ -461,7 +465,7 @@ struct descriptor_table {
 struct kvm_x86_ops {
        int (*cpu_has_kvm_support)(void);          /* __init */
        int (*disabled_by_bios)(void);             /* __init */
-       void (*hardware_enable)(void *dummy);      /* __init */
+       int (*hardware_enable)(void *dummy);
        void (*hardware_disable)(void *dummy);
        void (*check_processor_compatibility)(void *rtn);
        int (*hardware_setup)(void);               /* __init */
@@ -477,8 +481,8 @@ struct kvm_x86_ops {
        void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
        void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
-       int (*set_guest_debug)(struct kvm_vcpu *vcpu,
-                              struct kvm_guest_debug *dbg);
+       void (*set_guest_debug)(struct kvm_vcpu *vcpu,
+                               struct kvm_guest_debug *dbg);
        int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
        int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
        u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -506,8 +510,8 @@ struct kvm_x86_ops {
 
        void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
-       void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
-       int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+       void (*run)(struct kvm_vcpu *vcpu);
+       int (*handle_exit)(struct kvm_vcpu *vcpu);
        void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
        void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
        u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,6 +523,8 @@ struct kvm_x86_ops {
                                bool has_error_code, u32 error_code);
        int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
        int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+       bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+       void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -568,7 +574,7 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE         (1 << 0)
 #define EMULTYPE_TRAP_UD           (1 << 1)
 #define EMULTYPE_SKIP              (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
+int emulate_instruction(struct kvm_vcpu *vcpu,
                        unsigned long cr2, u16 error_code, int emulation_type);
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +591,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
                     int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
                           int size, unsigned long count, int down,
                            gva_t address, int rep, unsigned port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -616,6 +622,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -802,4 +811,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 
+void kvm_define_shared_msr(unsigned index, u32 msr);
+void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+
 #endif /* _ASM_X86_KVM_HOST_H */
index 85574b7..1fecb7e 100644 (file)
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
        u16 intercept_dr_write;
        u32 intercept_exceptions;
        u64 intercept;
-       u8 reserved_1[44];
+       u8 reserved_1[42];
+       u16 pause_filter_count;
        u64 iopm_base_pa;
        u64 msrpm_base_pa;
        u64 tsc_offset;
index d27d0a2..375c917 100644 (file)
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
 #define TIF_SECCOMP            8       /* secure computing */
 #define TIF_MCE_NOTIFY         10      /* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
 #define TIF_NOTSC              16      /* TSC is not accessible in userland */
 #define TIF_IA32               17      /* 32bit process */
 #define TIF_FORK               18      /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY                (1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_NOTSC             (1 << TIF_NOTSC)
 #define _TIF_IA32              (1 << TIF_IA32)
 #define _TIF_FORK              (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK                                            \
-       (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+       (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |       \
+        _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW                                                        \
        (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE         0x10000000
index 272514c..2b49454 100644 (file)
@@ -56,6 +56,7 @@
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST      0x00000080
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING      0x00000400
 
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
@@ -144,6 +145,8 @@ enum vmcs_field {
        VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
        TPR_THRESHOLD                   = 0x0000401c,
        SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
+       PLE_GAP                         = 0x00004020,
+       PLE_WINDOW                      = 0x00004022,
        VM_INSTRUCTION_ERROR            = 0x00004400,
        VM_EXIT_REASON                  = 0x00004402,
        VM_EXIT_INTR_INFO               = 0x00004404,
@@ -248,6 +251,7 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_PAUSE_INSTRUCTION   40
 #define EXIT_REASON_MCE_DURING_VMENTRY  41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
index 744508e..5e2ba63 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
 #include <asm/system.h>
@@ -209,6 +210,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
+       propagate_user_return_notify(prev_p, next_p);
 }
 
 int sys_fork(struct pt_regs *regs)
index fbf3b07..74fe6d8 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -863,6 +864,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
                if (current->replacement_session_keyring)
                        key_replace_session_keyring();
        }
+       if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+               fire_user_return_notifiers();
 
 #ifdef CONFIG_X86_32
        clear_thread_flag(TIF_IRET);
index b84e571..4cd4983 100644 (file)
@@ -28,6 +28,7 @@ config KVM
        select HAVE_KVM_IRQCHIP
        select HAVE_KVM_EVENTFD
        select KVM_APIC_ARCHITECTURE
+       select USER_RETURN_NOTIFIER
        ---help---
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent
index 0e7fe78..31a7035 100644 (file)
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
 CFLAGS_vmx.o := -I.
 
 kvm-y                  += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                               coalesced_mmio.o irq_comm.o eventfd.o)
+                               coalesced_mmio.o irq_comm.o eventfd.o \
+                               assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)        += $(addprefix ../../../virt/kvm/, iommu.o)
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
index 1be5cd6..7e8faea 100644 (file)
@@ -75,6 +75,8 @@
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
+/* Misc flags */
+#define No64       (1<<28)
 /* Source 2 operand type */
 #define Src2None    (0<<29)
 #define Src2CL      (1<<29)
@@ -92,19 +94,23 @@ static u32 opcode_table[256] = {
        /* 0x00 - 0x07 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+       ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
        /* 0x08 - 0x0F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+       ImplicitOps | Stack | No64, 0,
        /* 0x10 - 0x17 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+       ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
        /* 0x18 - 0x1F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+       ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
        /* 0x20 - 0x27 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -133,7 +139,8 @@ static u32 opcode_table[256] = {
        DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
        DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
        /* 0x60 - 0x67 */
-       0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+       ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+       0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
        0, 0, 0, 0,
        /* 0x68 - 0x6F */
        SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -158,7 +165,7 @@ static u32 opcode_table[256] = {
        /* 0x90 - 0x97 */
        DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
        /* 0x98 - 0x9F */
-       0, 0, SrcImm | Src2Imm16, 0,
+       0, 0, SrcImm | Src2Imm16 | No64, 0,
        ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
        /* 0xA0 - 0xA7 */
        ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +192,7 @@ static u32 opcode_table[256] = {
        ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
        /* 0xC8 - 0xCF */
        0, 0, 0, ImplicitOps | Stack,
-       ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
+       ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
        /* 0xD0 - 0xD7 */
        ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
        ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,7 +205,7 @@ static u32 opcode_table[256] = {
        ByteOp | SrcImmUByte, SrcImmUByte,
        /* 0xE8 - 0xEF */
        SrcImm | Stack, SrcImm | ImplicitOps,
-       SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
+       SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
        SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
        SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
        /* 0xF0 - 0xF7 */
@@ -244,11 +251,13 @@ static u32 twobyte_table[256] = {
        /* 0x90 - 0x9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xA0 - 0xA7 */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+       ImplicitOps | Stack, ImplicitOps | Stack,
+       0, DstMem | SrcReg | ModRM | BitOp,
        DstMem | SrcReg | Src2ImmByte | ModRM,
        DstMem | SrcReg | Src2CL | ModRM, 0, 0,
        /* 0xA8 - 0xAF */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+       ImplicitOps | Stack, ImplicitOps | Stack,
+       0, DstMem | SrcReg | ModRM | BitOp,
        DstMem | SrcReg | Src2ImmByte | ModRM,
        DstMem | SrcReg | Src2CL | ModRM,
        ModRM, 0,
@@ -613,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 {
        int rc = 0;
 
+       /* x86 instructions are limited to 15 bytes. */
+       if (eip + size - ctxt->decode.eip_orig > 15)
+               return X86EMUL_UNHANDLEABLE;
        eip += ctxt->cs_base;
        while (size--) {
                rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -871,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        /* Shadow copy of register state. Committed on successful emulation. */
 
        memset(c, 0, sizeof(struct decode_cache));
-       c->eip = kvm_rip_read(ctxt->vcpu);
+       c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
        ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
        memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -962,6 +974,11 @@ done_prefixes:
                }
        }
 
+       if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+               kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
+               return -1;
+       }
+
        if (c->d & Group) {
                group = c->d & GroupMask;
                c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1186,6 +1203,69 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
        return rc;
 }
 
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+{
+       struct decode_cache *c = &ctxt->decode;
+       struct kvm_segment segment;
+
+       kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+
+       c->src.val = segment.selector;
+       emulate_push(ctxt);
+}
+
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
+                            struct x86_emulate_ops *ops, int seg)
+{
+       struct decode_cache *c = &ctxt->decode;
+       unsigned long selector;
+       int rc;
+
+       rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+       if (rc != 0)
+               return rc;
+
+       rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
+       return rc;
+}
+
+static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+       unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+       int reg = VCPU_REGS_RAX;
+
+       while (reg <= VCPU_REGS_RDI) {
+               (reg == VCPU_REGS_RSP) ?
+               (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+
+               emulate_push(ctxt);
+               ++reg;
+       }
+}
+
+static int emulate_popa(struct x86_emulate_ctxt *ctxt,
+                       struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+       int reg = VCPU_REGS_RDI;
+
+       while (reg >= VCPU_REGS_RAX) {
+               if (reg == VCPU_REGS_RSP) {
+                       register_address_increment(c, &c->regs[VCPU_REGS_RSP],
+                                                       c->op_bytes);
+                       --reg;
+               }
+
+               rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+               if (rc != 0)
+                       break;
+               --reg;
+       }
+       return rc;
+}
+
 static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
                                struct x86_emulate_ops *ops)
 {
@@ -1707,18 +1787,45 @@ special_insn:
              add:              /* add */
                emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
                break;
+       case 0x06:              /* push es */
+               emulate_push_sreg(ctxt, VCPU_SREG_ES);
+               break;
+       case 0x07:              /* pop es */
+               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+               if (rc != 0)
+                       goto done;
+               break;
        case 0x08 ... 0x0d:
              or:               /* or */
                emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
                break;
+       case 0x0e:              /* push cs */
+               emulate_push_sreg(ctxt, VCPU_SREG_CS);
+               break;
        case 0x10 ... 0x15:
              adc:              /* adc */
                emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
                break;
+       case 0x16:              /* push ss */
+               emulate_push_sreg(ctxt, VCPU_SREG_SS);
+               break;
+       case 0x17:              /* pop ss */
+               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+               if (rc != 0)
+                       goto done;
+               break;
        case 0x18 ... 0x1d:
              sbb:              /* sbb */
                emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
                break;
+       case 0x1e:              /* push ds */
+               emulate_push_sreg(ctxt, VCPU_SREG_DS);
+               break;
+       case 0x1f:              /* pop ds */
+               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+               if (rc != 0)
+                       goto done;
+               break;
        case 0x20 ... 0x25:
              and:              /* and */
                emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1857,14 @@ special_insn:
                if (rc != 0)
                        goto done;
                break;
+       case 0x60:      /* pusha */
+               emulate_pusha(ctxt);
+               break;
+       case 0x61:      /* popa */
+               rc = emulate_popa(ctxt, ops);
+               if (rc != 0)
+                       goto done;
+               break;
        case 0x63:              /* movsxd */
                if (ctxt->mode != X86EMUL_MODE_PROT64)
                        goto cannot_emulate;
@@ -1761,7 +1876,7 @@ special_insn:
                break;
        case 0x6c:              /* insb */
        case 0x6d:              /* insw/insd */
-                if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+                if (kvm_emulate_pio_string(ctxt->vcpu,
                                1,
                                (c->d & ByteOp) ? 1 : c->op_bytes,
                                c->rep_prefix ?
@@ -1777,7 +1892,7 @@ special_insn:
                return 0;
        case 0x6e:              /* outsb */
        case 0x6f:              /* outsw/outsd */
-               if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+               if (kvm_emulate_pio_string(ctxt->vcpu,
                                0,
                                (c->d & ByteOp) ? 1 : c->op_bytes,
                                c->rep_prefix ?
@@ -2070,7 +2185,7 @@ special_insn:
        case 0xef: /* out (e/r)ax,dx */
                port = c->regs[VCPU_REGS_RDX];
                io_dir_in = 0;
-       do_io:  if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+       do_io:  if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
                                   (c->d & ByteOp) ? 1 : c->op_bytes,
                                   port) != 0) {
                        c->eip = saved_eip;
@@ -2297,6 +2412,14 @@ twobyte_insn:
                        jmp_rel(c, c->src.val);
                c->dst.type = OP_NONE;
                break;
+       case 0xa0:        /* push fs */
+               emulate_push_sreg(ctxt, VCPU_SREG_FS);
+               break;
+       case 0xa1:       /* pop fs */
+               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+               if (rc != 0)
+                       goto done;
+               break;
        case 0xa3:
              bt:               /* bt */
                c->dst.type = OP_NONE;
@@ -2308,6 +2431,14 @@ twobyte_insn:
        case 0xa5: /* shld cl, r, r/m */
                emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
                break;
+       case 0xa8:      /* push gs */
+               emulate_push_sreg(ctxt, VCPU_SREG_GS);
+               break;
+       case 0xa9:      /* pop gs */
+               rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+               if (rc != 0)
+                       goto done;
+               break;
        case 0xab:
              bts:              /* bts */
                /* only subword offset */
index 144e7f6..fab7440 100644 (file)
@@ -688,10 +688,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
        struct kvm_vcpu *vcpu;
        int i;
 
-       mutex_lock(&kvm->irq_lock);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-       mutex_unlock(&kvm->irq_lock);
 
        /*
         * Provides NMI watchdog support via Virtual Wire mode.
index 01f1516..d057c0c 100644 (file)
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
        s->isr_ack |= (1 << irq);
        if (s != &s->pics_state->pics[0])
                irq += 8;
+       /*
+        * We are dropping the lock while calling the ack notifiers since ack
+        * notifier callbacks for assigned devices call into the PIC recursively.
+        * Other interrupts may be delivered to the PIC while the lock is dropped,
+        * but it should be safe since PIC state is already updated at this stage.
+        */
+       spin_unlock(&s->pics_state->lock);
        kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+       spin_lock(&s->pics_state->lock);
 }
 
 void kvm_pic_clear_isr_ack(struct kvm *kvm)
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
 static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 {
        s->isr |= 1 << irq;
-       if (s->auto_eoi) {
-               if (s->rotate_on_auto_eoi)
-                       s->priority_add = (irq + 1) & 7;
-               pic_clear_isr(s, irq);
-       }
        /*
         * We don't clear a level sensitive interrupt here
         */
        if (!(s->elcr & (1 << irq)))
                s->irr &= ~(1 << irq);
+
+       if (s->auto_eoi) {
+               if (s->rotate_on_auto_eoi)
+                       s->priority_add = (irq + 1) & 7;
+               pic_clear_isr(s, irq);
+       }
+
 }
 
 int kvm_pic_read_irq(struct kvm *kvm)
@@ -225,22 +235,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-       int irq, irqbase, n;
+       int irq;
        struct kvm *kvm = s->pics_state->irq_request_opaque;
        struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+       u8 irr = s->irr, isr = s->imr;
 
-       if (s == &s->pics_state->pics[0])
-               irqbase = 0;
-       else
-               irqbase = 8;
-
-       for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
-               if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-                       if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
-                               n = irq + irqbase;
-                               kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
-                       }
-       }
        s->last_irr = 0;
        s->irr = 0;
        s->imr = 0;
@@ -256,6 +255,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
        s->rotate_on_auto_eoi = 0;
        s->special_fully_nested_mode = 0;
        s->init4 = 0;
+
+       for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+               if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+                       if (irr & (1 << irq) || isr & (1 << irq)) {
+                               pic_clear_isr(s, irq);
+                       }
+       }
 }
 
 static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
                                priority = get_priority(s, s->isr);
                                if (priority != 8) {
                                        irq = (priority + s->priority_add) & 7;
-                                       pic_clear_isr(s, irq);
                                        if (cmd == 5)
                                                s->priority_add = (irq + 1) & 7;
+                                       pic_clear_isr(s, irq);
                                        pic_update_irq(s->pics_state);
                                }
                                break;
index 7d6058a..be399e2 100644 (file)
@@ -71,6 +71,7 @@ struct kvm_pic {
        int output;             /* intr from master PIC */
        struct kvm_io_device dev;
        void (*ack_notifier)(void *opaque, int irq);
+       unsigned long irq_states[16];
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
@@ -85,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
 
 static inline int irqchip_in_kernel(struct kvm *kvm)
 {
-       return pic_irqchip(kvm) != NULL;
+       int ret;
+
+       ret = (pic_irqchip(kvm) != NULL);
+       smp_rmb();
+       return ret;
 }
 
 void kvm_pic_reset(struct kvm_kpic_state *s);
index 23c2176..cd60c0b 100644 (file)
@@ -32,7 +32,6 @@
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <asm/atomic.h>
-#include <asm/apicdef.h>
 #include "kvm_cache_regs.h"
 #include "irq.h"
 #include "trace.h"
@@ -471,11 +470,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
                trigger_mode = IOAPIC_LEVEL_TRIG;
        else
                trigger_mode = IOAPIC_EDGE_TRIG;
-       if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
-               mutex_lock(&apic->vcpu->kvm->irq_lock);
+       if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
                kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
-               mutex_unlock(&apic->vcpu->kvm->irq_lock);
-       }
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +500,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
                   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
                   irq.vector);
 
-       mutex_lock(&apic->vcpu->kvm->irq_lock);
        kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
-       mutex_unlock(&apic->vcpu->kvm->irq_lock);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
index 818b92a..4c3e5b2 100644 (file)
@@ -2789,7 +2789,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
        if (r)
                goto out;
 
-       er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+       er = emulate_instruction(vcpu, cr2, error_code, 0);
 
        switch (er) {
        case EMULATE_DONE:
@@ -2800,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
        case EMULATE_FAIL:
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
                return 0;
        default:
                BUG();
index 72558f8..a601713 100644 (file)
@@ -467,7 +467,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
                level = iterator.level;
                sptep = iterator.sptep;
 
-               /* FIXME: properly handle invlpg on large guest pages */
                if (level == PT_PAGE_TABLE_LEVEL  ||
                    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
                    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
index c17404a..3de0b37 100644 (file)
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
 
 #define NESTED_EXIT_HOST       0       /* Exit handled on host level */
 #define NESTED_EXIT_DONE       1       /* Exit caused nested vmexit  */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
-/* Turn on to get debugging output*/
-/* #define NESTED_DEBUG */
-
-#ifdef NESTED_DEBUG
-#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
-#else
-#define nsvm_printk(fmt, args...) do {} while(0)
-#endif
-
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
        /* gpa pointers to the real vectors */
        u64 vmcb_msrpm;
 
+       /* A VMEXIT is required but not yet emulated */
+       bool exit_required;
+
        /* cache for intercepts of the guest */
        u16 intercept_cr_read;
        u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
        u32 *msrpm;
 
        struct nested_state nested;
+
+       bool nmi_singlestep;
 };
 
 /* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if (!svm->next_rip) {
-               if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
+               if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
                                EMULATE_DONE)
                        printk(KERN_DEBUG "%s: NOP\n", __func__);
                return;
@@ -316,7 +313,7 @@ static void svm_hardware_disable(void *garbage)
        cpu_svm_disable();
 }
 
-static void svm_hardware_enable(void *garbage)
+static int svm_hardware_enable(void *garbage)
 {
 
        struct svm_cpu_data *svm_data;
@@ -325,16 +322,21 @@ static void svm_hardware_enable(void *garbage)
        struct desc_struct *gdt;
        int me = raw_smp_processor_id();
 
+       rdmsrl(MSR_EFER, efer);
+       if (efer & EFER_SVME)
+               return -EBUSY;
+
        if (!has_svm()) {
-               printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
-               return;
+               printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
+                      me);
+               return -EINVAL;
        }
        svm_data = per_cpu(svm_data, me);
 
        if (!svm_data) {
-               printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+               printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
                       me);
-               return;
+               return -EINVAL;
        }
 
        svm_data->asid_generation = 1;
@@ -345,11 +347,12 @@ static void svm_hardware_enable(void *garbage)
        gdt = (struct desc_struct *)gdt_descr.base;
        svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
-       rdmsrl(MSR_EFER, efer);
        wrmsrl(MSR_EFER, efer | EFER_SVME);
 
        wrmsrl(MSR_VM_HSAVE_PA,
               page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
+
+       return 0;
 }
 
 static void svm_cpu_uninit(int cpu)
@@ -476,7 +479,7 @@ static __init int svm_hardware_setup(void)
                kvm_enable_efer_bits(EFER_SVME);
        }
 
-       for_each_online_cpu(cpu) {
+       for_each_possible_cpu(cpu) {
                r = svm_cpu_init(cpu);
                if (r)
                        goto err;
@@ -510,7 +513,7 @@ static __exit void svm_hardware_unsetup(void)
 {
        int cpu;
 
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
 
        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -625,11 +628,12 @@ static void init_vmcb(struct vcpu_svm *svm)
        save->rip = 0x0000fff0;
        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
-       /*
-        * cr0 val on cpu init should be 0x60000010, we enable cpu
-        * cache by default. the orderly way is to enable cache in bios.
+       /* This is the guest-visible cr0 value.
+        * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
         */
-       save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
+       svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+       kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+
        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */
 
@@ -644,8 +648,6 @@ static void init_vmcb(struct vcpu_svm *svm)
                control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
                                                 INTERCEPT_CR3_MASK);
                save->g_pat = 0x0007040600070406ULL;
-               /* enable caching because the QEMU Bios doesn't enable it */
-               save->cr0 = X86_CR0_ET;
                save->cr3 = 0;
                save->cr4 = 0;
        }
@@ -654,6 +656,11 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm->nested.vmcb = 0;
        svm->vcpu.arch.hflags = 0;
 
+       if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+               control->pause_filter_count = 3000;
+               control->intercept |= (1ULL << INTERCEPT_PAUSE);
+       }
+
        enable_gif(svm);
 }
 
@@ -758,14 +765,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        int i;
 
        if (unlikely(cpu != vcpu->cpu)) {
-               u64 tsc_this, delta;
+               u64 delta;
 
                /*
                 * Make sure that the guest sees a monotonically
                 * increasing TSC.
                 */
-               rdtscll(tsc_this);
-               delta = vcpu->arch.host_tsc - tsc_this;
+               delta = vcpu->arch.host_tsc - native_read_tsc();
                svm->vmcb->control.tsc_offset += delta;
                if (is_nested(svm))
                        svm->nested.hsave->control.tsc_offset += delta;
@@ -787,7 +793,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 
-       rdtscll(vcpu->arch.host_tsc);
+       vcpu->arch.host_tsc = native_read_tsc();
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1045,7 +1051,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
        svm->vmcb->control.intercept_exceptions &=
                ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
 
-       if (vcpu->arch.singlestep)
+       if (svm->nmi_singlestep)
                svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
 
        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1066,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
                vcpu->guest_debug = 0;
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-       int old_debug = vcpu->guest_debug;
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       vcpu->guest_debug = dbg->control;
-
-       update_db_intercept(vcpu);
-
        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
                svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
        else
                svm->vmcb->save.dr7 = vcpu->arch.dr7;
 
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-       else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
-               svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
-       return 0;
+       update_db_intercept(vcpu);
 }
 
 static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1180,7 +1176,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
        }
 }
 
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int pf_interception(struct vcpu_svm *svm)
 {
        u64 fault_address;
        u32 error_code;
@@ -1194,17 +1190,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
 
-static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int db_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        if (!(svm->vcpu.guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
-               !svm->vcpu.arch.singlestep) {
+               !svm->nmi_singlestep) {
                kvm_queue_exception(&svm->vcpu, DB_VECTOR);
                return 1;
        }
 
-       if (svm->vcpu.arch.singlestep) {
-               svm->vcpu.arch.singlestep = false;
+       if (svm->nmi_singlestep) {
+               svm->nmi_singlestep = false;
                if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
                        svm->vmcb->save.rflags &=
                                ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,25 +1221,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int bp_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        kvm_run->exit_reason = KVM_EXIT_DEBUG;
        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
        kvm_run->debug.arch.exception = BP_VECTOR;
        return 0;
 }
 
-static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int ud_interception(struct vcpu_svm *svm)
 {
        int er;
 
-       er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+       er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
        if (er != EMULATE_DONE)
                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
 }
 
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nm_interception(struct vcpu_svm *svm)
 {
        svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
        if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1251,7 +1251,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int mc_interception(struct vcpu_svm *svm)
 {
        /*
         * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1264,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int shutdown_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        /*
         * VMCB is undefined after a SHUTDOWN intercept
         * so reinitialize it.
@@ -1277,7 +1279,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 0;
 }
 
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int io_interception(struct vcpu_svm *svm)
 {
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
@@ -1291,7 +1293,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
        if (string) {
                if (emulate_instruction(&svm->vcpu,
-                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+                                       0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }
@@ -1301,33 +1303,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
 
        skip_emulated_instruction(&svm->vcpu);
-       return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
+       return kvm_emulate_pio(&svm->vcpu, in, size, port);
 }
 
-static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nmi_interception(struct vcpu_svm *svm)
 {
        return 1;
 }
 
-static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int intr_interception(struct vcpu_svm *svm)
 {
        ++svm->vcpu.stat.irq_exits;
        return 1;
 }
 
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nop_on_interception(struct vcpu_svm *svm)
 {
        return 1;
 }
 
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int halt_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
        skip_emulated_instruction(&svm->vcpu);
        return kvm_emulate_halt(&svm->vcpu);
 }
 
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmmcall_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
        skip_emulated_instruction(&svm->vcpu);
@@ -1378,8 +1380,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 
        svm->vmcb->control.exit_code = SVM_EXIT_INTR;
 
-       if (nested_svm_exit_handled(svm)) {
-               nsvm_printk("VMexit -> INTR\n");
+       if (svm->nested.intercept & 1ULL) {
+               /*
+                * The #vmexit can't be emulated here directly because this
+                * code path runs with irqs and preemption disabled. A
+                * #vmexit emulation might sleep. Only signal the request
+                * for the #vmexit here.
+                */
+               svm->nested.exit_required = true;
+               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
                return 1;
        }
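Editor's note: the comment above defers the nested #vmexit because this path runs with IRQs and preemption disabled while the emulation may sleep; the request is completed later in handle_exit (further down in this diff). A minimal, standalone sketch of that defer-then-complete pattern, with names that are illustrative rather than kernel APIs:

    /* Illustrative sketch only: the names are hypothetical, not kernel APIs. */
    #include <stdbool.h>

    struct vcpu_state {
            bool exit_required;     /* set in atomic context, consumed later */
    };

    /* Runs with interrupts and preemption disabled: may only record the request. */
    static void request_nested_exit(struct vcpu_state *v)
    {
            v->exit_required = true;
    }

    /* Runs in a context that may sleep: completes the deferred work. */
    static int complete_nested_exit(struct vcpu_state *v)
    {
            if (!v->exit_required)
                    return 0;
            /* the potentially sleeping #vmexit emulation would happen here */
            v->exit_required = false;
            return 1;
    }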
 
@@ -1390,10 +1399,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
 {
        struct page *page;
 
-       down_read(&current->mm->mmap_sem);
        page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
-       up_read(&current->mm->mmap_sem);
-
        if (is_error_page(page))
                goto error;
 
@@ -1532,14 +1538,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
        }
        default: {
                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
-               nsvm_printk("exit code: 0x%x\n", exit_code);
                if (svm->nested.intercept & exit_bits)
                        vmexit = NESTED_EXIT_DONE;
        }
        }
 
        if (vmexit == NESTED_EXIT_DONE) {
-               nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
                nested_svm_vmexit(svm);
        }
 
@@ -1584,6 +1588,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
 
+       trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+                                      vmcb->control.exit_info_1,
+                                      vmcb->control.exit_info_2,
+                                      vmcb->control.exit_int_info,
+                                      vmcb->control.exit_int_info_err);
+
        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
        if (!nested_vmcb)
                return 1;
@@ -1617,6 +1627,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
        nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
        nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+       /*
+        * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+        * to make sure that we do not lose injected events. So check event_inj
+        * here and copy it to exit_int_info if it is valid.
+        * Exit_int_info and event_inj can't both be valid because the case
+        * below only happens on a VMRUN instruction intercept which has
+        * no valid exit_int_info set.
+        */
+       if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+               struct vmcb_control_area *nc = &nested_vmcb->control;
+
+               nc->exit_int_info     = vmcb->control.event_inj;
+               nc->exit_int_info_err = vmcb->control.event_inj_err;
+       }
+
        nested_vmcb->control.tlb_ctl           = 0;
        nested_vmcb->control.event_inj         = 0;
        nested_vmcb->control.event_inj_err     = 0;
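Editor's note: a standalone restatement of the rule in the comment above, with the valid bit written out as a placeholder constant (a sketch; the real definition is SVM_EVTINJ_VALID in svm.h):

    #include <stdint.h>

    #define EVTINJ_VALID_BIT (1u << 31)    /* placeholder for SVM_EVTINJ_VALID */

    /* Forward a still-pending injected event into the nested guest's exit info
     * so that it is not lost when the #VMEXIT is reflected. */
    static void propagate_pending_event(uint32_t event_inj, uint32_t event_inj_err,
                                         uint32_t *exit_int_info,
                                         uint32_t *exit_int_info_err)
    {
            if (!(event_inj & EVTINJ_VALID_BIT))
                    return;
            *exit_int_info     = event_inj;
            *exit_int_info_err = event_inj_err;
    }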
@@ -1628,10 +1654,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        /* Restore the original control entries */
        copy_vmcb_control_area(vmcb, hsave);
 
-       /* Kill any pending exceptions */
-       if (svm->vcpu.arch.exception.pending == true)
-               nsvm_printk("WARNING: Pending Exception\n");
-
        kvm_clear_exception_queue(&svm->vcpu);
        kvm_clear_interrupt_queue(&svm->vcpu);
 
@@ -1702,6 +1724,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
        /* nested_vmcb is our indicator if nested SVM is activated */
        svm->nested.vmcb = svm->vmcb->save.rax;
 
+       trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+                              nested_vmcb->save.rip,
+                              nested_vmcb->control.int_ctl,
+                              nested_vmcb->control.event_inj,
+                              nested_vmcb->control.nested_ctl);
+
        /* Clear internal status */
        kvm_clear_exception_queue(&svm->vcpu);
        kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1789,28 +1817,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
        svm->nested.intercept            = nested_vmcb->control.intercept;
 
        force_new_asid(&svm->vcpu);
-       svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
-       svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
-       if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
-               nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
-                               nested_vmcb->control.int_ctl);
-       }
        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
        else
                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
-       nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
-                       nested_vmcb->control.exit_int_info,
-                       nested_vmcb->control.int_state);
-
        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
        svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
-       if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
-               nsvm_printk("Injecting Event: 0x%x\n",
-                               nested_vmcb->control.event_inj);
        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
@@ -1837,7 +1852,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
        to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
 }
 
-static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmload_interception(struct vcpu_svm *svm)
 {
        struct vmcb *nested_vmcb;
 
@@ -1857,7 +1872,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmsave_interception(struct vcpu_svm *svm)
 {
        struct vmcb *nested_vmcb;
 
@@ -1877,10 +1892,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmrun_interception(struct vcpu_svm *svm)
 {
-       nsvm_printk("VMrun\n");
-
        if (nested_svm_check_permissions(svm))
                return 1;
 
@@ -1907,7 +1920,7 @@ failed:
        return 1;
 }
 
-static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int stgi_interception(struct vcpu_svm *svm)
 {
        if (nested_svm_check_permissions(svm))
                return 1;
@@ -1920,7 +1933,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int clgi_interception(struct vcpu_svm *svm)
 {
        if (nested_svm_check_permissions(svm))
                return 1;
@@ -1937,10 +1950,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpga_interception(struct vcpu_svm *svm)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
-       nsvm_printk("INVLPGA\n");
+
+       trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
+                         vcpu->arch.regs[VCPU_REGS_RAX]);
 
        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
        kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1965,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int invalid_op_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
+static int skinit_interception(struct vcpu_svm *svm)
 {
+       trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+
        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
 }
 
-static int task_switch_interception(struct vcpu_svm *svm,
-                                   struct kvm_run *kvm_run)
+static int invalid_op_interception(struct vcpu_svm *svm)
+{
+       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm)
 {
        u16 tss_selector;
        int reason;
@@ -2008,14 +2029,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
        return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
 
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cpuid_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
        kvm_emulate_cpuid(&svm->vcpu);
        return 1;
 }
 
-static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int iret_interception(struct vcpu_svm *svm)
 {
        ++svm->vcpu.stat.nmi_window_exits;
        svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2023,26 +2044,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpg_interception(struct vcpu_svm *svm)
 {
-       if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+       if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
                pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
        return 1;
 }
 
-static int emulate_on_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
+static int emulate_on_interception(struct vcpu_svm *svm)
 {
-       if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
+       if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
                pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
        return 1;
 }
 
-static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cr8_write_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
        /* instruction emulation calls kvm_set_cr8() */
-       emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+       emulate_instruction(&svm->vcpu, 0, 0, 0);
        if (irqchip_in_kernel(svm->vcpu.kvm)) {
                svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
                return 1;
@@ -2128,7 +2150,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
        return 0;
 }
 
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int rdmsr_interception(struct vcpu_svm *svm)
 {
        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data;
@@ -2221,7 +2243,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
        return 0;
 }
 
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int wrmsr_interception(struct vcpu_svm *svm)
 {
        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2237,17 +2259,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int msr_interception(struct vcpu_svm *svm)
 {
        if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm, kvm_run);
+               return wrmsr_interception(svm);
        else
-               return rdmsr_interception(svm, kvm_run);
+               return rdmsr_interception(svm);
 }
 
-static int interrupt_window_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
+static int interrupt_window_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        svm_clear_vintr(svm);
        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
        /*
@@ -2265,8 +2288,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
        return 1;
 }
 
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
-                                     struct kvm_run *kvm_run) = {
+static int pause_interception(struct vcpu_svm *svm)
+{
+       kvm_vcpu_on_spin(&(svm->vcpu));
+       return 1;
+}
+
+static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
@@ -2301,6 +2329,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_CPUID]                        = cpuid_interception,
        [SVM_EXIT_IRET]                         = iret_interception,
        [SVM_EXIT_INVD]                         = emulate_on_interception,
+       [SVM_EXIT_PAUSE]                        = pause_interception,
        [SVM_EXIT_HLT]                          = halt_interception,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
@@ -2314,26 +2343,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
-       [SVM_EXIT_SKINIT]                       = invalid_op_interception,
+       [SVM_EXIT_SKINIT]                       = skinit_interception,
        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
        [SVM_EXIT_MONITOR]                      = invalid_op_interception,
        [SVM_EXIT_MWAIT]                        = invalid_op_interception,
        [SVM_EXIT_NPF]                          = pf_interception,
 };
 
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
        u32 exit_code = svm->vmcb->control.exit_code;
 
        trace_kvm_exit(exit_code, svm->vmcb->save.rip);
 
+       if (unlikely(svm->nested.exit_required)) {
+               nested_svm_vmexit(svm);
+               svm->nested.exit_required = false;
+
+               return 1;
+       }
+
        if (is_nested(svm)) {
                int vmexit;
 
-               nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
-                           exit_code, svm->vmcb->control.exit_info_1,
-                           svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
+               trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+                                       svm->vmcb->control.exit_info_1,
+                                       svm->vmcb->control.exit_info_2,
+                                       svm->vmcb->control.exit_int_info,
+                                       svm->vmcb->control.exit_int_info_err);
 
                vmexit = nested_svm_exit_special(svm);
 
@@ -2383,7 +2422,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       return svm_exit_handlers[exit_code](svm, kvm_run);
+       return svm_exit_handlers[exit_code](svm);
 }
 
 static void reload_tss(struct kvm_vcpu *vcpu)
@@ -2460,20 +2499,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
                !(svm->vcpu.arch.hflags & HF_NMI_MASK);
 }
 
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (masked) {
+               svm->vcpu.arch.hflags |= HF_NMI_MASK;
+               svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+       } else {
+               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+               svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+       }
+}
+
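Editor's note: these accessors are wired into the ops table at the end of this svm.c diff; a small sketch of the save/restore round trip a generic caller could perform through such a pair (the struct and caller names here are hypothetical, not the real kvm_x86_ops plumbing):

    #include <stdbool.h>

    struct nmi_mask_ops {
            bool (*get_nmi_mask)(void *vcpu);
            void (*set_nmi_mask)(void *vcpu, bool masked);
    };

    /* Capture the current NMI-masked state, e.g. for a userspace snapshot. */
    static bool snapshot_nmi_masked(const struct nmi_mask_ops *ops, void *vcpu)
    {
            return ops->get_nmi_mask(vcpu);
    }

    /* Reinstate a previously captured state when the snapshot is loaded back. */
    static void restore_nmi_masked(const struct nmi_mask_ops *ops, void *vcpu, bool masked)
    {
            ops->set_nmi_mask(vcpu, masked);
    }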
 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
-       return (vmcb->save.rflags & X86_EFLAGS_IF) &&
-               !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-               gif_set(svm) &&
-               !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
+       int ret;
+
+       if (!gif_set(svm) ||
+            (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+               return 0;
+
+       ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+
+       if (is_nested(svm))
+               return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
+
+       return ret;
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       nsvm_printk("Trying to open IRQ window\n");
 
        nested_svm_intr(svm);
 
@@ -2498,7 +2564,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
        /* Something prevents NMI from being injected. Single step over
           possible problem (IRET or exception injection or interrupt
           shadow) */
-       vcpu->arch.singlestep = true;
+       svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
        update_db_intercept(vcpu);
 }
@@ -2588,13 +2654,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 #define R "e"
 #endif
 
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        u16 fs_selector;
        u16 gs_selector;
        u16 ldt_selector;
 
+       /*
+        * A vmexit emulation is required before the vcpu can be executed
+        * again.
+        */
+       if (unlikely(svm->nested.exit_required))
+               return;
+
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2893,6 +2966,8 @@ static struct kvm_x86_ops svm_x86_ops = {
        .queue_exception = svm_queue_exception,
        .interrupt_allowed = svm_interrupt_allowed,
        .nmi_allowed = svm_nmi_allowed,
+       .get_nmi_mask = svm_get_nmi_mask,
+       .set_nmi_mask = svm_set_nmi_mask,
        .enable_nmi_window = enable_nmi_window,
        .enable_irq_window = enable_irq_window,
        .update_cr8_intercept = update_cr8_intercept,
index 0d480e7..816e044 100644 (file)
@@ -349,6 +349,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
                  __entry->coalesced ? " (coalesced)" : "")
 );
 
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmrun,
+           TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+                    __u32 event_inj, bool npt),
+           TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
+
+       TP_STRUCT__entry(
+               __field(        __u64,          rip             )
+               __field(        __u64,          vmcb            )
+               __field(        __u64,          nested_rip      )
+               __field(        __u32,          int_ctl         )
+               __field(        __u32,          event_inj       )
+               __field(        bool,           npt             )
+       ),
+
+       TP_fast_assign(
+               __entry->rip            = rip;
+               __entry->vmcb           = vmcb;
+               __entry->nested_rip     = nested_rip;
+               __entry->int_ctl        = int_ctl;
+               __entry->event_inj      = event_inj;
+               __entry->npt            = npt;
+       ),
+
+       TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
+                 "event_inj: 0x%08x npt: %s\n",
+               __entry->rip, __entry->vmcb, __entry->nested_rip,
+               __entry->int_ctl, __entry->event_inj,
+               __entry->npt ? "on" : "off")
+);
+
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT(kvm_nested_vmexit,
+           TP_PROTO(__u64 rip, __u32 exit_code,
+                    __u64 exit_info1, __u64 exit_info2,
+                    __u32 exit_int_info, __u32 exit_int_info_err),
+           TP_ARGS(rip, exit_code, exit_info1, exit_info2,
+                   exit_int_info, exit_int_info_err),
+
+       TP_STRUCT__entry(
+               __field(        __u64,          rip                     )
+               __field(        __u32,          exit_code               )
+               __field(        __u64,          exit_info1              )
+               __field(        __u64,          exit_info2              )
+               __field(        __u32,          exit_int_info           )
+               __field(        __u32,          exit_int_info_err       )
+       ),
+
+       TP_fast_assign(
+               __entry->rip                    = rip;
+               __entry->exit_code              = exit_code;
+               __entry->exit_info1             = exit_info1;
+               __entry->exit_info2             = exit_info2;
+               __entry->exit_int_info          = exit_int_info;
+               __entry->exit_int_info_err      = exit_int_info_err;
+       ),
+       TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+                 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+                 __entry->rip,
+                 ftrace_print_symbols_seq(p, __entry->exit_code,
+                                          kvm_x86_ops->exit_reasons_str),
+                 __entry->exit_info1, __entry->exit_info2,
+                 __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+           TP_PROTO(__u32 exit_code,
+                    __u64 exit_info1, __u64 exit_info2,
+                    __u32 exit_int_info, __u32 exit_int_info_err),
+           TP_ARGS(exit_code, exit_info1, exit_info2,
+                   exit_int_info, exit_int_info_err),
+
+       TP_STRUCT__entry(
+               __field(        __u32,          exit_code               )
+               __field(        __u64,          exit_info1              )
+               __field(        __u64,          exit_info2              )
+               __field(        __u32,          exit_int_info           )
+               __field(        __u32,          exit_int_info_err       )
+       ),
+
+       TP_fast_assign(
+               __entry->exit_code              = exit_code;
+               __entry->exit_info1             = exit_info1;
+               __entry->exit_info2             = exit_info2;
+               __entry->exit_int_info          = exit_int_info;
+               __entry->exit_int_info_err      = exit_int_info_err;
+       ),
+
+       TP_printk("reason: %s ext_inf1: 0x%016llx "
+                 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+                 ftrace_print_symbols_seq(p, __entry->exit_code,
+                                          kvm_x86_ops->exit_reasons_str),
+               __entry->exit_info1, __entry->exit_info2,
+               __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+           TP_PROTO(__u64 rip),
+           TP_ARGS(rip),
+
+       TP_STRUCT__entry(
+               __field(        __u64,  rip     )
+       ),
+
+       TP_fast_assign(
+               __entry->rip    =       rip
+       ),
+
+       TP_printk("rip: 0x%016llx\n", __entry->rip)
+);
+
+/*
+ * Tracepoint for the INVLPGA instruction
+ */
+TRACE_EVENT(kvm_invlpga,
+           TP_PROTO(__u64 rip, int asid, u64 address),
+           TP_ARGS(rip, asid, address),
+
+       TP_STRUCT__entry(
+               __field(        __u64,  rip     )
+               __field(        int,    asid    )
+               __field(        __u64,  address )
+       ),
+
+       TP_fast_assign(
+               __entry->rip            =       rip;
+               __entry->asid           =       asid;
+               __entry->address        =       address;
+       ),
+
+       TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+                 __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for the SKINIT instruction
+ */
+TRACE_EVENT(kvm_skinit,
+           TP_PROTO(__u64 rip, __u32 slb),
+           TP_ARGS(rip, slb),
+
+       TP_STRUCT__entry(
+               __field(        __u64,  rip     )
+               __field(        __u32,  slb     )
+       ),
+
+       TP_fast_assign(
+               __entry->rip            =       rip;
+               __entry->slb            =       slb;
+       ),
+
+       TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+                 __entry->rip, __entry->slb)
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
index ed53b42..d4918d6 100644 (file)
@@ -61,12 +61,37 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+/*
+ * These two parameters are used to configure the controls for Pause-Loop Exiting:
+ * ple_gap:    upper bound on the amount of time between two successive
+ *             executions of PAUSE in a loop. Also indicates whether PLE is
+ *             enabled. Testing shows this time is usually smaller than 41 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ *             in a PAUSE loop. Tests indicate that most spinlocks are held for
+ *             less than 2^12 cycles.
+ * Time is measured based on a counter that runs at the same rate as the TSC;
+ * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
+ */
+#define KVM_VMX_DEFAULT_PLE_GAP    41
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
+module_param(ple_gap, int, S_IRUGO);
+
+static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, int, S_IRUGO);
+
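Editor's note: for reference, the gating these two module parameters drive later in this vmx.c diff, condensed into a standalone sketch (the write callback and the field/bit arguments are placeholders for the real vmcs_write32() and VMCS constants):

    /* Sketch only: gap == 0 means Pause-Loop Exiting stays disabled. */
    static void program_ple_sketch(int cpu_supports_ple, int *gap, int window,
                                   unsigned int *secondary_exec_control,
                                   unsigned int ple_exiting_bit,
                                   void (*write_field)(unsigned int field, unsigned int val),
                                   unsigned int gap_field, unsigned int window_field)
    {
            if (!cpu_supports_ple)
                    *gap = 0;                       /* hardware lacks PLE: disable it */

            if (!*gap) {
                    *secondary_exec_control &= ~ple_exiting_bit;
                    return;
            }

            write_field(gap_field, *gap);
            write_field(window_field, window);
    }

With the defaults above, a guest PAUSE loop that keeps spinning for more than ple_window TSC-rate cycles triggers a PAUSE-loop exit.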
 struct vmcs {
        u32 revision_id;
        u32 abort;
        char data[0];
 };
 
+struct shared_msr_entry {
+       unsigned index;
+       u64 data;
+       u64 mask;
+};
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        struct list_head      local_vcpus_link;
@@ -74,13 +99,12 @@ struct vcpu_vmx {
        int                   launched;
        u8                    fail;
        u32                   idt_vectoring_info;
-       struct kvm_msr_entry *guest_msrs;
-       struct kvm_msr_entry *host_msrs;
+       struct shared_msr_entry *guest_msrs;
        int                   nmsrs;
        int                   save_nmsrs;
-       int                   msr_offset_efer;
 #ifdef CONFIG_X86_64
-       int                   msr_offset_kernel_gs_base;
+       u64                   msr_host_kernel_gs_base;
+       u64                   msr_guest_kernel_gs_base;
 #endif
        struct vmcs          *vmcs;
        struct {
@@ -88,7 +112,6 @@ struct vcpu_vmx {
                u16           fs_sel, gs_sel, ldt_sel;
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
-               int           guest_efer_loaded;
        } host_state;
        struct {
                int vm86_active;
@@ -107,7 +130,6 @@ struct vcpu_vmx {
        } rmode;
        int vpid;
        bool emulation_required;
-       enum emulation_result invalid_state_emulation_result;
 
        /* Support for vnmi-less CPUs */
        int soft_vnmi_blocked;
@@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field {
        VMX_SEGMENT_FIELD(LDTR),
 };
 
+static u64 host_efer;
+
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
 /*
@@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
  */
 static const u32 vmx_msr_index[] = {
 #ifdef CONFIG_X86_64
-       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+       MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 #endif
        MSR_EFER, MSR_K6_STAR,
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
-       int i;
-
-       for (i = 0; i < n; ++i)
-               wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
-       int i;
-
-       for (i = 0; i < n; ++i)
-               rdmsrl(e[i].index, e[i].data);
-}
-
 static inline int is_page_fault(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
                SECONDARY_EXEC_UNRESTRICTED_GUEST;
 }
 
+static inline int cpu_has_vmx_ple(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
        return flexpriority_enabled &&
@@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
        int i;
 
        for (i = 0; i < vmx->nmsrs; ++i)
-               if (vmx->guest_msrs[i].index == msr)
+               if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
                        return i;
        return -1;
 }
@@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
                        : : "a" (&operand), "c" (ext) : "cc", "memory");
 }
 
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
 
@@ -570,17 +584,12 @@ static void reload_tss(void)
        load_TR_desc();
 }
 
-static void load_transition_efer(struct vcpu_vmx *vmx)
+static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 {
-       int efer_offset = vmx->msr_offset_efer;
-       u64 host_efer;
        u64 guest_efer;
        u64 ignore_bits;
 
-       if (efer_offset < 0)
-               return;
-       host_efer = vmx->host_msrs[efer_offset].data;
-       guest_efer = vmx->guest_msrs[efer_offset].data;
+       guest_efer = vmx->vcpu.arch.shadow_efer;
 
        /*
         * NX is emulated; LMA and LME handled by hardware; SCE meaningless
@@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
 #endif
-       if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
-               return;
-
-       vmx->host_state.guest_efer_loaded = 1;
        guest_efer &= ~ignore_bits;
        guest_efer |= host_efer & ignore_bits;
-       wrmsrl(MSR_EFER, guest_efer);
-       vmx->vcpu.stat.efer_reload++;
-}
-
-static void reload_host_efer(struct vcpu_vmx *vmx)
-{
-       if (vmx->host_state.guest_efer_loaded) {
-               vmx->host_state.guest_efer_loaded = 0;
-               load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
-       }
+       vmx->guest_msrs[efer_offset].data = guest_efer;
+       vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+       return true;
 }
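Editor's note: a worked, standalone version of the ignore_bits blending done above, assuming a 64-bit host and with the EFER bit positions written out explicitly (a sketch for illustration; the kernel uses the EFER_* constants from msr-index.h):

    #include <stdint.h>

    #define SK_EFER_SCE (1ULL << 0)
    #define SK_EFER_LME (1ULL << 8)
    #define SK_EFER_LMA (1ULL << 10)
    #define SK_EFER_NX  (1ULL << 11)

    /* Guest bits outside ignore_bits are honoured; host bits inside it win. */
    static uint64_t blend_efer(uint64_t host_efer, uint64_t guest_efer)
    {
            uint64_t ignore_bits = SK_EFER_NX | SK_EFER_SCE | SK_EFER_LMA | SK_EFER_LME;

            if (guest_efer & SK_EFER_LMA)
                    ignore_bits &= ~SK_EFER_SCE;    /* SCE does matter to a 64-bit guest */

            return (guest_efer & ~ignore_bits) | (host_efer & ignore_bits);
    }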
 
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int i;
 
        if (vmx->host_state.loaded)
                return;
@@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 #endif
 
 #ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu))
-               save_msrs(vmx->host_msrs +
-                         vmx->msr_offset_kernel_gs_base, 1);
-
+       if (is_long_mode(&vmx->vcpu)) {
+               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+               wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       }
 #endif
-       load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-       load_transition_efer(vmx);
+       for (i = 0; i < vmx->save_nmsrs; ++i)
+               kvm_set_shared_msr(vmx->guest_msrs[i].index,
+                                  vmx->guest_msrs[i].data,
+                                  vmx->guest_msrs[i].mask);
 }
 
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
                local_irq_restore(flags);
        }
        reload_tss();
-       save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-       load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-       reload_host_efer(vmx);
+#ifdef CONFIG_X86_64
+       if (is_long_mode(&vmx->vcpu)) {
+               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+               wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+       }
+#endif
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
-#ifdef CONFIG_X86_64
 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 {
-       struct kvm_msr_entry tmp;
+       struct shared_msr_entry tmp;
 
        tmp = vmx->guest_msrs[to];
        vmx->guest_msrs[to] = vmx->guest_msrs[from];
        vmx->guest_msrs[from] = tmp;
-       tmp = vmx->host_msrs[to];
-       vmx->host_msrs[to] = vmx->host_msrs[from];
-       vmx->host_msrs[from] = tmp;
 }
-#endif
 
 /*
  * Set up the vmcs to automatically save and restore system
@@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
  */
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
-       int save_nmsrs;
+       int save_nmsrs, index;
        unsigned long *msr_bitmap;
 
        vmx_load_host_state(vmx);
        save_nmsrs = 0;
 #ifdef CONFIG_X86_64
        if (is_long_mode(&vmx->vcpu)) {
-               int index;
-
                index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
                if (index >= 0)
                        move_msr_up(vmx, index, save_nmsrs++);
@@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                index = __find_msr_index(vmx, MSR_CSTAR);
                if (index >= 0)
                        move_msr_up(vmx, index, save_nmsrs++);
-               index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-               if (index >= 0)
-                       move_msr_up(vmx, index, save_nmsrs++);
                /*
                 * MSR_K6_STAR is only needed on long mode guests, and only
                 * if efer.sce is enabled.
@@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                        move_msr_up(vmx, index, save_nmsrs++);
        }
 #endif
-       vmx->save_nmsrs = save_nmsrs;
+       index = __find_msr_index(vmx, MSR_EFER);
+       if (index >= 0 && update_transition_efer(vmx, index))
+               move_msr_up(vmx, index, save_nmsrs++);
 
-#ifdef CONFIG_X86_64
-       vmx->msr_offset_kernel_gs_base =
-               __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
-       vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+       vmx->save_nmsrs = save_nmsrs;
 
        if (cpu_has_vmx_msr_bitmap()) {
                if (is_long_mode(&vmx->vcpu))
@@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
        u64 data;
-       struct kvm_msr_entry *msr;
+       struct shared_msr_entry *msr;
 
        if (!pdata) {
                printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        case MSR_GS_BASE:
                data = vmcs_readl(GUEST_GS_BASE);
                break;
+       case MSR_KERNEL_GS_BASE:
+               vmx_load_host_state(to_vmx(vcpu));
+               data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+               break;
+#endif
        case MSR_EFER:
                return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
        case MSR_IA32_TSC:
                data = guest_read_tsc();
                break;
@@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
        default:
+               vmx_load_host_state(to_vmx(vcpu));
                msr = find_msr_entry(to_vmx(vcpu), msr_index);
                if (msr) {
                        vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_msr_entry *msr;
+       struct shared_msr_entry *msr;
        u64 host_tsc;
        int ret = 0;
 
@@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
        case MSR_GS_BASE:
                vmcs_writel(GUEST_GS_BASE, data);
                break;
+       case MSR_KERNEL_GS_BASE:
+               vmx_load_host_state(vmx);
+               vmx->msr_guest_kernel_gs_base = data;
+               break;
 #endif
        case MSR_IA32_SYSENTER_CS:
                vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
        }
 }
 
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-       int old_debug = vcpu->guest_debug;
-       unsigned long flags;
-
-       vcpu->guest_debug = dbg->control;
-       if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
-               vcpu->guest_debug = 0;
-
        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
                vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
        else
                vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
 
-       flags = vmcs_readl(GUEST_RFLAGS);
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-       else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
-               flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-       vmcs_writel(GUEST_RFLAGS, flags);
-
        update_exception_bitmap(vcpu);
-
-       return 0;
 }
 
 static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void)
        /* locked but not enabled */
 }
 
-static void hardware_enable(void *garbage)
+static int hardware_enable(void *garbage)
 {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
        u64 old;
 
+       if (read_cr4() & X86_CR4_VMXE)
+               return -EBUSY;
+
        INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
        if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage)
        asm volatile (ASM_VMX_VMXON_RAX
                      : : "a"(&phys_addr), "m"(phys_addr)
                      : "memory", "cc");
+
+       ept_sync_global();
+
+       return 0;
 }
 
 static void vmclear_local_vcpus(void)
@@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_WBINVD_EXITING |
                        SECONDARY_EXEC_ENABLE_VPID |
                        SECONDARY_EXEC_ENABLE_EPT |
-                       SECONDARY_EXEC_UNRESTRICTED_GUEST;
+                       SECONDARY_EXEC_UNRESTRICTED_GUEST |
+                       SECONDARY_EXEC_PAUSE_LOOP_EXITING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1337,17 @@ static void free_kvm_area(void)
 {
        int cpu;
 
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu) {
                free_vmcs(per_cpu(vmxarea, cpu));
+               per_cpu(vmxarea, cpu) = NULL;
+       }
 }
 
 static __init int alloc_kvm_area(void)
 {
        int cpu;
 
-       for_each_online_cpu(cpu) {
+       for_each_possible_cpu(cpu) {
                struct vmcs *vmcs;
 
                vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1389,9 @@ static __init int hardware_setup(void)
        if (enable_ept && !cpu_has_vmx_ept_2m_page())
                kvm_disable_largepages();
 
+       if (!cpu_has_vmx_ple())
+               ple_gap = 0;
+
        return alloc_kvm_area();
 }
 
@@ -1536,8 +1534,16 @@ continue_rmode:
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+       struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+       if (!msr)
+               return;
 
+       /*
+        * Force kernel_gs_base reloading before EFER changes, as control
+        * of this msr depends on is_long_mode().
+        */
+       vmx_load_host_state(to_vmx(vcpu));
        vcpu->arch.shadow_efer = efer;
        if (!msr)
                return;
@@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                vmcs_write64(EPT_POINTER, eptp);
                guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
                        vcpu->kvm->arch.ept_identity_map_addr;
+               ept_load_pdptrs(vcpu);
        }
 
        vmx_flush_tlb(vcpu);
@@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                                ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                if (vmx->vpid == 0)
                        exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-               if (!enable_ept)
+               if (!enable_ept) {
                        exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+                       enable_unrestricted_guest = 0;
+               }
                if (!enable_unrestricted_guest)
                        exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+               if (!ple_gap)
+                       exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
 
+       if (ple_gap) {
+               vmcs_write32(PLE_GAP, ple_gap);
+               vmcs_write32(PLE_WINDOW, ple_window);
+       }
+
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
@@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                if (wrmsr_safe(index, data_low, data_high) < 0)
                        continue;
                data = data_low | ((u64)data_high << 32);
-               vmx->host_msrs[j].index = index;
-               vmx->host_msrs[j].reserved = 0;
-               vmx->host_msrs[j].data = data;
-               vmx->guest_msrs[j] = vmx->host_msrs[j];
+               vmx->guest_msrs[j].index = i;
+               vmx->guest_msrs[j].data = 0;
+               vmx->guest_msrs[j].mask = -1ull;
                ++vmx->nmsrs;
        }
 
@@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-       vmx->vcpu.arch.cr0 = 0x60000010;
+       vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
        vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
        vmx_set_cr4(&vmx->vcpu, 0);
        vmx_set_efer(&vmx->vcpu, 0);
@@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
                                GUEST_INTR_STATE_NMI));
 }
 
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+       if (!cpu_has_virtual_nmis())
+               return to_vmx(vcpu)->soft_vnmi_blocked;
+       else
+               return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+                         GUEST_INTR_STATE_NMI);
+}
+
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!cpu_has_virtual_nmis()) {
+               if (vmx->soft_vnmi_blocked != masked) {
+                       vmx->soft_vnmi_blocked = masked;
+                       vmx->vnmi_blocked_time = 0;
+               }
+       } else {
+               if (masked)
+                       vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                     GUEST_INTR_STATE_NMI);
+               else
+                       vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                       GUEST_INTR_STATE_NMI);
+       }
+}
+
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
        return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         * Cause the #SS fault with 0 error code in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-               if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
+               if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
                        return 1;
        /*
         * Forward all other exceptions that are valid in real mode.
@@ -2710,15 +2753,16 @@ static void kvm_machine_check(void)
 #endif
 }
 
-static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_machine_check(struct kvm_vcpu *vcpu)
 {
        /* already handled by vcpu_run */
        return 1;
 }
 
-static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_exception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
        u32 intr_info, ex_no, error_code;
        unsigned long cr2, rip, dr6;
        u32 vect_info;
@@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        if (is_machine_check(intr_info))
-               return handle_machine_check(vcpu, kvm_run);
+               return handle_machine_check(vcpu);
 
        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-                                               !is_page_fault(intr_info))
-               printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
-                      "intr info 0x%x\n", __func__, vect_info, intr_info);
+           !is_page_fault(intr_info)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+               vcpu->run->internal.ndata = 2;
+               vcpu->run->internal.data[0] = vect_info;
+               vcpu->run->internal.data[1] = intr_info;
+               return 0;
+       }
 
        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
                return 1;  /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
 
        if (is_invalid_opcode(intr_info)) {
-               er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+               er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
                if (er != EMULATE_DONE)
                        kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
@@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 0;
 }
 
-static int handle_external_interrupt(struct kvm_vcpu *vcpu,
-                                    struct kvm_run *kvm_run)
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.irq_exits;
        return 1;
 }
 
-static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
 {
-       kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+       vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
 }
 
-static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_io(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
        int size, in, string;
@@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        string = (exit_qualification & 16) != 0;
 
        if (string) {
-               if (emulate_instruction(vcpu,
-                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+               if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }
@@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        port = exit_qualification >> 16;
 
        skip_emulated_instruction(vcpu);
-       return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
+       return kvm_emulate_pio(vcpu, in, size, port);
 }
 
 static void
@@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
-static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_cr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification, val;
        int cr;
@@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                                        return 1;
                                if (cr8_prev <= cr8)
                                        return 1;
-                               kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+                               vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
                                return 0;
                        }
                };
@@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        default:
                break;
        }
-       kvm_run->exit_reason = 0;
+       vcpu->run->exit_reason = 0;
        pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
               (int)(exit_qualification >> 4) & 3, cr);
        return 0;
 }
 
-static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_dr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
        unsigned long val;
@@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 * guest debugging itself.
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
-                       kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
-                       kvm_run->debug.arch.dr7 = dr;
-                       kvm_run->debug.arch.pc =
+                       vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+                       vcpu->run->debug.arch.dr7 = dr;
+                       vcpu->run->debug.arch.pc =
                                vmcs_readl(GUEST_CS_BASE) +
                                vmcs_readl(GUEST_RIP);
-                       kvm_run->debug.arch.exception = DB_VECTOR;
-                       kvm_run->exit_reason = KVM_EXIT_DEBUG;
+                       vcpu->run->debug.arch.exception = DB_VECTOR;
+                       vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                        return 0;
                } else {
                        vcpu->arch.dr7 &= ~DR7_GD;
@@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_cpuid(struct kvm_vcpu *vcpu)
 {
        kvm_emulate_cpuid(vcpu);
        return 1;
 }
 
-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_rdmsr(struct kvm_vcpu *vcpu)
 {
        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
        u64 data;
@@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
@@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
-                                     struct kvm_run *kvm_run)
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int handle_interrupt_window(struct kvm_vcpu *vcpu,
-                                  struct kvm_run *kvm_run)
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
@@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
         * possible
         */
        if (!irqchip_in_kernel(vcpu->kvm) &&
-           kvm_run->request_interrupt_window &&
+           vcpu->run->request_interrupt_window &&
            !kvm_cpu_has_interrupt(vcpu)) {
-               kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+               vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
                return 0;
        }
        return 1;
 }
 
-static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_halt(struct kvm_vcpu *vcpu)
 {
        skip_emulated_instruction(vcpu);
        return kvm_emulate_halt(vcpu);
 }
 
-static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
        skip_emulated_instruction(vcpu);
        kvm_emulate_hypercall(vcpu);
        return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
 
-static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
@@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
        skip_emulated_instruction(vcpu);
        /* TODO: Add support for VT-d/pass-through device */
        return 1;
 }
 
-static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
        enum emulation_result er;
@@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        offset = exit_qualification & 0xffful;
 
-       er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+       er = emulate_instruction(vcpu, 0, 0, 0);
 
        if (er !=  EMULATE_DONE) {
                printk(KERN_ERR
@@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_task_switch(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification;
@@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
        gpa_t gpa;
@@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                        vmcs_readl(GUEST_LINEAR_ADDRESS));
                printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
                        (long unsigned int)exit_qualification);
-               kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-               kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+               vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
                return 0;
        }
 
@@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
        }
 }
 
-static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
        u64 sptes[4];
        int nr_sptes, i;
@@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
                ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
 
-       kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-       kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+       vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+       vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
 
        return 0;
 }
 
-static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
@@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
-static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
-                               struct kvm_run *kvm_run)
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        enum emulation_result err = EMULATE_DONE;
-
-       local_irq_enable();
-       preempt_enable();
+       int ret = 1;
 
        while (!guest_state_valid(vcpu)) {
-               err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+               err = emulate_instruction(vcpu, 0, 0, 0);
 
-               if (err == EMULATE_DO_MMIO)
-                       break;
+               if (err == EMULATE_DO_MMIO) {
+                       ret = 0;
+                       goto out;
+               }
 
                if (err != EMULATE_DONE) {
                        kvm_report_emulation_failure(vcpu, "emulation failure");
-                       break;
+                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+                       vcpu->run->internal.ndata = 0;
+                       ret = 0;
+                       goto out;
                }
 
                if (signal_pending(current))
-                       break;
+                       goto out;
                if (need_resched())
                        schedule();
        }
 
-       preempt_disable();
-       local_irq_disable();
+       vmx->emulation_required = 0;
+out:
+       return ret;
+}
 
-       vmx->invalid_state_emulation_result = err;
+/*
+ * Indicate a busy-waiting vcpu in spinlock. We do not enable plain PAUSE
+ * exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+       skip_emulated_instruction(vcpu);
+       kvm_vcpu_on_spin(vcpu);
+
+       return 1;
 }
 
 /*
@@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
  * to be done to userspace and return 0.
  */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
-                                     struct kvm_run *kvm_run) = {
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
@@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
+       [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers =
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
 
-       /* If we need to emulate an MMIO from handle_invalid_guest_state
-        * we just return 0 */
-       if (vmx->emulation_required && emulate_invalid_guest_state) {
-               if (guest_state_valid(vcpu))
-                       vmx->emulation_required = 0;
-               return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
-       }
+       /* If guest state is invalid, start emulating */
+       if (vmx->emulation_required && emulate_invalid_guest_state)
+               return handle_invalid_guest_state(vcpu);
 
        /* Access CR3 don't cause VMExit in paging mode, so we need
         * to sync with guest real CR3. */
@@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 
        if (unlikely(vmx->fail)) {
-               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-               kvm_run->fail_entry.hardware_entry_failure_reason
+               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
                return 0;
        }
@@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
-               return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
+               return kvm_vmx_exit_handlers[exit_reason](vcpu);
        else {
-               kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-               kvm_run->hw.hardware_exit_reason = exit_reason;
+               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+               vcpu->run->hw.hardware_exit_reason = exit_reason;
        }
        return 0;
 }
@@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 #define Q "l"
 #endif
 
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (enable_ept && is_paging(vcpu)) {
-               vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
-               ept_load_pdptrs(vcpu);
-       }
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
                vmx->entry_time = ktime_get();
 
-       /* Handle invalid guest state instead of entering VMX */
-       if (vmx->emulation_required && emulate_invalid_guest_state) {
-               handle_invalid_guest_state(vcpu, kvm_run);
+       /* Don't enter VMX if guest state is invalid; let the exit handler
+          start emulation until we arrive back at a valid state */
+       if (vmx->emulation_required && emulate_invalid_guest_state)
                return;
-       }
 
        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
        spin_unlock(&vmx_vpid_lock);
        vmx_free_vmcs(vcpu);
-       kfree(vmx->host_msrs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                goto uninit_vcpu;
        }
 
-       vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!vmx->host_msrs)
-               goto free_guest_msrs;
-
        vmx->vmcs = alloc_vmcs();
        if (!vmx->vmcs)
                goto free_msrs;
@@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 free_vmcs:
        free_vmcs(vmx->vmcs);
 free_msrs:
-       kfree(vmx->host_msrs);
-free_guest_msrs:
        kfree(vmx->guest_msrs);
 uninit_vcpu:
        kvm_vcpu_uninit(&vmx->vcpu);
@@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .queue_exception = vmx_queue_exception,
        .interrupt_allowed = vmx_interrupt_allowed,
        .nmi_allowed = vmx_nmi_allowed,
+       .get_nmi_mask = vmx_get_nmi_mask,
+       .set_nmi_mask = vmx_set_nmi_mask,
        .enable_nmi_window = enable_nmi_window,
        .enable_irq_window = enable_irq_window,
        .update_cr8_intercept = update_cr8_intercept,
@@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-       int r;
+       int r, i;
+
+       rdmsrl_safe(MSR_EFER, &host_efer);
+
+       for (i = 0; i < NR_VMX_MSR; ++i)
+               kvm_define_shared_msr(i, vmx_msr_index[i]);
 
        vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_io_bitmap_a)
@@ -4049,8 +4099,6 @@ static int __init vmx_init(void)
        if (bypass_guest_pf)
                kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
-       ept_sync_global();
-
        return 0;
 
 out3:
index 4fc8017..9d06896 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 #include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/kvm.h>
 #undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
@@ -88,6 +89,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+       int nr;
+       struct kvm_shared_msr {
+               u32 msr;
+               u64 value;
+       } msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+       struct user_return_notifier urn;
+       bool registered;
+       u64 current_value[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
        { "pf_guest", VCPU_STAT(pf_guest) },
@@ -124,6 +144,72 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
 
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+       unsigned slot;
+       struct kvm_shared_msr *global;
+       struct kvm_shared_msrs *locals
+               = container_of(urn, struct kvm_shared_msrs, urn);
+
+       for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+               global = &shared_msrs_global.msrs[slot];
+               if (global->value != locals->current_value[slot]) {
+                       wrmsrl(global->msr, global->value);
+                       locals->current_value[slot] = global->value;
+               }
+       }
+       locals->registered = false;
+       user_return_notifier_unregister(urn);
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+       int cpu;
+       u64 value;
+
+       if (slot >= shared_msrs_global.nr)
+               shared_msrs_global.nr = slot + 1;
+       shared_msrs_global.msrs[slot].msr = msr;
+       rdmsrl_safe(msr, &value);
+       shared_msrs_global.msrs[slot].value = value;
+       for_each_online_cpu(cpu)
+               per_cpu(shared_msrs, cpu).current_value[slot] = value;
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+       unsigned i;
+       struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
+
+       for (i = 0; i < shared_msrs_global.nr; ++i)
+               locals->current_value[i] = shared_msrs_global.msrs[i].value;
+}
+
+void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+{
+       struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+       if (((value ^ smsr->current_value[slot]) & mask) == 0)
+               return;
+       smsr->current_value[slot] = value;
+       wrmsrl(shared_msrs_global.msrs[slot].msr, value);
+       if (!smsr->registered) {
+               smsr->urn.on_user_return = kvm_on_user_return;
+               user_return_notifier_register(&smsr->urn);
+               smsr->registered = true;
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
+static void drop_user_return_notifiers(void *ignore)
+{
+       struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+       if (smsr->registered)
+               kvm_on_user_return(&smsr->urn);
+}
+
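
The block above is the new shared-MSR machinery: kvm_define_shared_msr() snapshots the host value of an MSR once per slot, kvm_set_shared_msr() switches it to a guest value lazily, and the user-return notifier restores the host value before the thread returns to userspace. A minimal, hypothetical sketch of a vendor-module caller (slot numbering and the MSR list are illustrative; the real user in this series is vmx.c):

        static const u32 my_msrs[] = { MSR_K6_STAR, MSR_LSTAR };

        static int __init my_vendor_init(void)
        {
                int i;

                /* Record which MSRs are shared and snapshot their host values. */
                for (i = 0; i < ARRAY_SIZE(my_msrs); ++i)
                        kvm_define_shared_msr(i, my_msrs[i]);
                return 0;
        }

        static void my_load_guest_msrs(u64 guest_star)
        {
                /* Writes the MSR only if the masked value changed and arms the
                 * user-return notifier so the host value is restored later. */
                kvm_set_shared_msr(0, guest_star, ~0ull);
        }
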
 unsigned long segment_base(u16 selector)
 {
        struct descriptor_table gdt;
@@ -485,16 +571,19 @@ static inline u32 bit(int bitno)
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  *
  * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
+ * capabilities of the host cpu. The capability test skips MSRs that are
+ * kvm-specific; those are placed at the beginning of the list.
  */
+
+#define KVM_SAVE_MSRS_BEGIN    2
 static u32 msrs_to_save[] = {
+       MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_K6_STAR,
 #ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-       MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-       MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+       MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
@@ -678,7 +767,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
        /* With all the info we got, fill in the values */
 
        vcpu->hv_clock.system_time = ts.tv_nsec +
-                                    (NSEC_PER_SEC * (u64)ts.tv_sec);
+                                    (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+
        /*
         * The interface expects us to write an even number signaling that the
         * update is finished. Since the guest won't see the intermediate
@@ -836,6 +926,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        return 0;
 }
 
+static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+{
+       struct kvm *kvm = vcpu->kvm;
+       int lm = is_long_mode(vcpu);
+       u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+               : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+       u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+               : kvm->arch.xen_hvm_config.blob_size_32;
+       u32 page_num = data & ~PAGE_MASK;
+       u64 page_addr = data & PAGE_MASK;
+       u8 *page;
+       int r;
+
+       r = -E2BIG;
+       if (page_num >= blob_size)
+               goto out;
+       r = -ENOMEM;
+       page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!page)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
+               goto out_free;
+       if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+               goto out_free;
+       r = 0;
+out_free:
+       kfree(page);
+out:
+       return r;
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        switch (msr) {
@@ -951,6 +1073,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                        "0x%x data 0x%llx\n", msr, data);
                break;
        default:
+               if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
+                       return xen_hvm_config(vcpu, data);
                if (!ignore_msrs) {
                        pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
                                msr, data);
@@ -1225,6 +1349,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_PIT2:
        case KVM_CAP_PIT_STATE2:
        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+       case KVM_CAP_XEN_HVM:
+       case KVM_CAP_ADJUST_CLOCK:
+       case KVM_CAP_VCPU_EVENTS:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -1239,8 +1366,8 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_NR_MEMSLOTS:
                r = KVM_MEMORY_SLOTS;
                break;
-       case KVM_CAP_PV_MMU:
-               r = !tdp_enabled;
+       case KVM_CAP_PV_MMU:    /* obsolete */
+               r = 0;
                break;
        case KVM_CAP_IOMMU:
                r = iommu_found();
@@ -1327,6 +1454,12 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        kvm_x86_ops->vcpu_load(vcpu, cpu);
+       if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
+               unsigned long khz = cpufreq_quick_get(cpu);
+               if (!khz)
+                       khz = tsc_khz;
+               per_cpu(cpu_tsc_khz, cpu) = khz;
+       }
        kvm_request_guest_time_update(vcpu);
 }
 
@@ -1760,6 +1893,61 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+                                              struct kvm_vcpu_events *events)
+{
+       vcpu_load(vcpu);
+
+       events->exception.injected = vcpu->arch.exception.pending;
+       events->exception.nr = vcpu->arch.exception.nr;
+       events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+       events->exception.error_code = vcpu->arch.exception.error_code;
+
+       events->interrupt.injected = vcpu->arch.interrupt.pending;
+       events->interrupt.nr = vcpu->arch.interrupt.nr;
+       events->interrupt.soft = vcpu->arch.interrupt.soft;
+
+       events->nmi.injected = vcpu->arch.nmi_injected;
+       events->nmi.pending = vcpu->arch.nmi_pending;
+       events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+
+       events->sipi_vector = vcpu->arch.sipi_vector;
+
+       events->flags = 0;
+
+       vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+                                             struct kvm_vcpu_events *events)
+{
+       if (events->flags)
+               return -EINVAL;
+
+       vcpu_load(vcpu);
+
+       vcpu->arch.exception.pending = events->exception.injected;
+       vcpu->arch.exception.nr = events->exception.nr;
+       vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+       vcpu->arch.exception.error_code = events->exception.error_code;
+
+       vcpu->arch.interrupt.pending = events->interrupt.injected;
+       vcpu->arch.interrupt.nr = events->interrupt.nr;
+       vcpu->arch.interrupt.soft = events->interrupt.soft;
+       if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
+               kvm_pic_clear_isr_ack(vcpu->kvm);
+
+       vcpu->arch.nmi_injected = events->nmi.injected;
+       vcpu->arch.nmi_pending = events->nmi.pending;
+       kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+
+       vcpu->arch.sipi_vector = events->sipi_vector;
+
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
@@ -1770,6 +1958,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 
        switch (ioctl) {
        case KVM_GET_LAPIC: {
+               r = -EINVAL;
+               if (!vcpu->arch.apic)
+                       goto out;
                lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
                r = -ENOMEM;
@@ -1785,6 +1976,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
        case KVM_SET_LAPIC: {
+               r = -EINVAL;
+               if (!vcpu->arch.apic)
+                       goto out;
                lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
                r = -ENOMEM;
                if (!lapic)
@@ -1911,6 +2105,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
                break;
        }
+       case KVM_GET_VCPU_EVENTS: {
+               struct kvm_vcpu_events events;
+
+               kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+               r = -EFAULT;
+               if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+                       break;
+               r = 0;
+               break;
+       }
+       case KVM_SET_VCPU_EVENTS: {
+               struct kvm_vcpu_events events;
+
+               r = -EFAULT;
+               if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+                       break;
+
+               r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+               break;
+       }
        default:
                r = -EINVAL;
        }
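
The two cases above expose the new KVM_GET/SET_VCPU_EVENTS interface. A minimal userspace sketch (hypothetical helper; vcpu_fd comes from KVM_CREATE_VCPU, KVM_CAP_VCPU_EVENTS was probed, and <sys/ioctl.h>, <linux/kvm.h>, <err.h> are assumed to be included):

        static void mask_nmis(int vcpu_fd)
        {
                struct kvm_vcpu_events events;

                if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                        err(1, "KVM_GET_VCPU_EVENTS");
                events.nmi.masked = 1;          /* e.g. keep NMIs blocked */
                events.flags = 0;               /* no flags defined yet */
                if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
                        err(1, "KVM_SET_VCPU_EVENTS");
        }
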
@@ -2039,9 +2254,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
                        sizeof(struct kvm_pic_state));
                break;
        case KVM_IRQCHIP_IOAPIC:
-               memcpy(&chip->chip.ioapic,
-                       ioapic_irqchip(kvm),
-                       sizeof(struct kvm_ioapic_state));
+               r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
                break;
        default:
                r = -EINVAL;
@@ -2071,11 +2284,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
                spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_IOAPIC:
-               mutex_lock(&kvm->irq_lock);
-               memcpy(ioapic_irqchip(kvm),
-                       &chip->chip.ioapic,
-                       sizeof(struct kvm_ioapic_state));
-               mutex_unlock(&kvm->irq_lock);
+               r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
                break;
        default:
                r = -EINVAL;
@@ -2183,7 +2392,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 {
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
-       int r = -EINVAL;
+       int r = -ENOTTY;
        /*
         * This union makes it completely explicit to gcc-3.x
         * that these two variables' stack usage should be
@@ -2245,25 +2454,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (r)
                        goto out;
                break;
-       case KVM_CREATE_IRQCHIP:
+       case KVM_CREATE_IRQCHIP: {
+               struct kvm_pic *vpic;
+
+               mutex_lock(&kvm->lock);
+               r = -EEXIST;
+               if (kvm->arch.vpic)
+                       goto create_irqchip_unlock;
                r = -ENOMEM;
-               kvm->arch.vpic = kvm_create_pic(kvm);
-               if (kvm->arch.vpic) {
+               vpic = kvm_create_pic(kvm);
+               if (vpic) {
                        r = kvm_ioapic_init(kvm);
                        if (r) {
-                               kfree(kvm->arch.vpic);
-                               kvm->arch.vpic = NULL;
-                               goto out;
+                               kfree(vpic);
+                               goto create_irqchip_unlock;
                        }
                } else
-                       goto out;
+                       goto create_irqchip_unlock;
+               smp_wmb();
+               kvm->arch.vpic = vpic;
+               smp_wmb();
                r = kvm_setup_default_irq_routing(kvm);
                if (r) {
+                       mutex_lock(&kvm->irq_lock);
                        kfree(kvm->arch.vpic);
                        kfree(kvm->arch.vioapic);
-                       goto out;
+                       kvm->arch.vpic = NULL;
+                       kvm->arch.vioapic = NULL;
+                       mutex_unlock(&kvm->irq_lock);
                }
+       create_irqchip_unlock:
+               mutex_unlock(&kvm->lock);
                break;
+       }
        case KVM_CREATE_PIT:
                u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
                goto create_pit;
@@ -2293,10 +2516,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
                if (irqchip_in_kernel(kvm)) {
                        __s32 status;
-                       mutex_lock(&kvm->irq_lock);
                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                        irq_event.irq, irq_event.level);
-                       mutex_unlock(&kvm->irq_lock);
                        if (ioctl == KVM_IRQ_LINE_STATUS) {
                                irq_event.status = status;
                                if (copy_to_user(argp, &irq_event,
@@ -2422,6 +2643,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_XEN_HVM_CONFIG: {
+               r = -EFAULT;
+               if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
+                                  sizeof(struct kvm_xen_hvm_config)))
+                       goto out;
+               r = -EINVAL;
+               if (kvm->arch.xen_hvm_config.flags)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_CLOCK: {
+               struct timespec now;
+               struct kvm_clock_data user_ns;
+               u64 now_ns;
+               s64 delta;
+
+               r = -EFAULT;
+               if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+                       goto out;
+
+               r = -EINVAL;
+               if (user_ns.flags)
+                       goto out;
+
+               r = 0;
+               ktime_get_ts(&now);
+               now_ns = timespec_to_ns(&now);
+               delta = user_ns.clock - now_ns;
+               kvm->arch.kvmclock_offset = delta;
+               break;
+       }
+       case KVM_GET_CLOCK: {
+               struct timespec now;
+               struct kvm_clock_data user_ns;
+               u64 now_ns;
+
+               ktime_get_ts(&now);
+               now_ns = timespec_to_ns(&now);
+               user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+               user_ns.flags = 0;
+
+               r = -EFAULT;
+               if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+                       goto out;
+               r = 0;
+               break;
+       }
+
        default:
                ;
        }
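
KVM_GET_CLOCK and KVM_SET_CLOCK let userspace save and restore the kvmclock base, e.g. across save/restore or migration. A minimal, hypothetical sketch (vm_fd is the VM descriptor, KVM_CAP_ADJUST_CLOCK was probed, and the usual ioctl/err includes are assumed):

        static struct kvm_clock_data saved_clock;

        static void save_kvmclock(int vm_fd)
        {
                if (ioctl(vm_fd, KVM_GET_CLOCK, &saved_clock) < 0)
                        err(1, "KVM_GET_CLOCK");
        }

        static void restore_kvmclock(int vm_fd)
        {
                saved_clock.flags = 0;          /* no flags defined yet */
                if (ioctl(vm_fd, KVM_SET_CLOCK, &saved_clock) < 0)
                        err(1, "KVM_SET_CLOCK");
        }
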
@@ -2434,7 +2704,8 @@ static void kvm_init_msr_list(void)
        u32 dummy[2];
        unsigned i, j;
 
-       for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+       /* Skip the KVM-specific MSRs at the beginning of the list. */
+       for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
                        continue;
                if (j < i)
@@ -2758,13 +3029,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
 }
 
 int emulate_instruction(struct kvm_vcpu *vcpu,
-                       struct kvm_run *run,
                        unsigned long cr2,
                        u16 error_code,
                        int emulation_type)
 {
        int r, shadow_mask;
        struct decode_cache *c;
+       struct kvm_run *run = vcpu->run;
 
        kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2784,7 +3055,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
                vcpu->arch.emulate_ctxt.vcpu = vcpu;
-               vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+               vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
                vcpu->arch.emulate_ctxt.mode =
                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
                        ? X86EMUL_MODE_REAL : cs_l
@@ -2862,7 +3133,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                return EMULATE_DO_MMIO;
        }
 
-       kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+       kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
        if (vcpu->mmio_is_write) {
                vcpu->mmio_needed = 0;
@@ -2970,8 +3241,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
        return r;
 }
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-                 int size, unsigned port)
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
 {
        unsigned long val;
 
@@ -3000,7 +3270,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
 
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
                  int size, unsigned long count, int down,
                  gva_t address, int rep, unsigned port)
 {
@@ -3073,9 +3343,6 @@ static void bounce_off(void *info)
        /* nothing */
 }
 
-static unsigned int  ref_freq;
-static unsigned long tsc_khz_ref;
-
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                     void *data)
 {
@@ -3084,14 +3351,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
        struct kvm_vcpu *vcpu;
        int i, send_ipi = 0;
 
-       if (!ref_freq)
-               ref_freq = freq->old;
-
        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
                return 0;
        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
                return 0;
-       per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+       per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
 
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3128,9 +3392,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
         .notifier_call  = kvmclock_cpufreq_notifier
 };
 
+static void kvm_timer_init(void)
+{
+       int cpu;
+
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+               cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+                                         CPUFREQ_TRANSITION_NOTIFIER);
+               for_each_online_cpu(cpu) {
+                       unsigned long khz = cpufreq_get(cpu);
+                       if (!khz)
+                               khz = tsc_khz;
+                       per_cpu(cpu_tsc_khz, cpu) = khz;
+               }
+       } else {
+               for_each_possible_cpu(cpu)
+                       per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+       }
+}
+
 int kvm_arch_init(void *opaque)
 {
-       int r, cpu;
+       int r;
        struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
 
        if (kvm_x86_ops) {
@@ -3162,13 +3445,7 @@ int kvm_arch_init(void *opaque)
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
-       for_each_possible_cpu(cpu)
-               per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
-       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-               tsc_khz_ref = tsc_khz;
-               cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
-                                         CPUFREQ_TRANSITION_NOTIFIER);
-       }
+       kvm_timer_init();
 
        return 0;
 
@@ -3296,7 +3573,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
                   unsigned long *rflags)
 {
        kvm_lmsw(vcpu, msw);
-       *rflags = kvm_x86_ops->get_rflags(vcpu);
+       *rflags = kvm_get_rflags(vcpu);
 }
 
 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
@@ -3334,7 +3611,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
        switch (cr) {
        case 0:
                kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
-               *rflags = kvm_x86_ops->get_rflags(vcpu);
+               *rflags = kvm_get_rflags(vcpu);
                break;
        case 2:
                vcpu->arch.cr2 = val;
@@ -3454,18 +3731,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
  *
  * No need to exit to userspace if we already have an interrupt queued.
  */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
-                                         struct kvm_run *kvm_run)
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 {
        return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
-               kvm_run->request_interrupt_window &&
+               vcpu->run->request_interrupt_window &&
                kvm_arch_interrupt_allowed(vcpu));
 }
 
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
-                             struct kvm_run *kvm_run)
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 {
-       kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+       struct kvm_run *kvm_run = vcpu->run;
+
+       kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
        if (irqchip_in_kernel(vcpu->kvm))
@@ -3526,7 +3803,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void inject_pending_event(struct kvm_vcpu *vcpu)
 {
        /* try to reinject previous events if any */
        if (vcpu->arch.exception.pending) {
@@ -3562,11 +3839,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
 }
 
-static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
        int r;
        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
-               kvm_run->request_interrupt_window;
+               vcpu->run->request_interrupt_window;
 
        if (vcpu->requests)
                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3587,12 +3864,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                        kvm_x86_ops->tlb_flush(vcpu);
                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
                                       &vcpu->requests)) {
-                       kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
+                       vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
                        r = 0;
                        goto out;
                }
                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
-                       kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+                       vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
                        r = 0;
                        goto out;
                }
@@ -3616,7 +3893,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                goto out;
        }
 
-       inject_pending_event(vcpu, kvm_run);
+       inject_pending_event(vcpu);
 
        /* enable NMI/IRQ window open exits if needed */
        if (vcpu->arch.nmi_pending)
@@ -3642,7 +3919,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
-       kvm_x86_ops->run(vcpu, kvm_run);
+       kvm_x86_ops->run(vcpu);
 
        /*
         * If the guest has used debug registers, at least dr7
@@ -3684,13 +3961,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        kvm_lapic_sync_from_vapic(vcpu);
 
-       r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+       r = kvm_x86_ops->handle_exit(vcpu);
 out:
        return r;
 }
 
 
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
        int r;
 
@@ -3710,7 +3987,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        r = 1;
        while (r > 0) {
                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-                       r = vcpu_enter_guest(vcpu, kvm_run);
+                       r = vcpu_enter_guest(vcpu);
                else {
                        up_read(&vcpu->kvm->slots_lock);
                        kvm_vcpu_block(vcpu);
@@ -3738,14 +4015,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                if (kvm_cpu_has_pending_timer(vcpu))
                        kvm_inject_pending_timer_irqs(vcpu);
 
-               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+               if (dm_request_for_irq_injection(vcpu)) {
                        r = -EINTR;
-                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.request_irq_exits;
                }
                if (signal_pending(current)) {
                        r = -EINTR;
-                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.signal_exits;
                }
                if (need_resched()) {
@@ -3756,7 +4033,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
 
        up_read(&vcpu->kvm->slots_lock);
-       post_kvm_run_save(vcpu, kvm_run);
+       post_kvm_run_save(vcpu);
 
        vapic_exit(vcpu);
 
@@ -3789,15 +4066,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                if (r)
                        goto out;
        }
-#if CONFIG_HAS_IOMEM
        if (vcpu->mmio_needed) {
                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
                vcpu->mmio_read_completed = 1;
                vcpu->mmio_needed = 0;
 
                down_read(&vcpu->kvm->slots_lock);
-               r = emulate_instruction(vcpu, kvm_run,
-                                       vcpu->arch.mmio_fault_cr2, 0,
+               r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
                                        EMULTYPE_NO_DECODE);
                up_read(&vcpu->kvm->slots_lock);
                if (r == EMULATE_DO_MMIO) {
@@ -3808,12 +4083,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                        goto out;
                }
        }
-#endif
        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
                kvm_register_write(vcpu, VCPU_REGS_RAX,
                                     kvm_run->hypercall.ret);
 
-       r = __vcpu_run(vcpu, kvm_run);
+       r = __vcpu_run(vcpu);
 
 out:
        if (vcpu->sigset_active)
@@ -3847,13 +4121,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 #endif
 
        regs->rip = kvm_rip_read(vcpu);
-       regs->rflags = kvm_x86_ops->get_rflags(vcpu);
-
-       /*
-        * Don't leak debug flags in case they were set for guest debugging
-        */
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+       regs->rflags = kvm_get_rflags(vcpu);
 
        vcpu_put(vcpu);
 
@@ -3881,12 +4149,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
        kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
        kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
-
 #endif
 
        kvm_rip_write(vcpu, regs->rip);
-       kvm_x86_ops->set_rflags(vcpu, regs->rflags);
-
+       kvm_set_rflags(vcpu, regs->rflags);
 
        vcpu->arch.exception.pending = false;
 
@@ -4105,7 +4371,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
 {
        return (seg != VCPU_SREG_LDTR) &&
                (seg != VCPU_SREG_TR) &&
-               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
+               (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
 }
 
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
@@ -4133,7 +4399,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
 {
        tss->cr3 = vcpu->arch.cr3;
        tss->eip = kvm_rip_read(vcpu);
-       tss->eflags = kvm_x86_ops->get_rflags(vcpu);
+       tss->eflags = kvm_get_rflags(vcpu);
        tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
        tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
        tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4157,7 +4423,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
        kvm_set_cr3(vcpu, tss->cr3);
 
        kvm_rip_write(vcpu, tss->eip);
-       kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
+       kvm_set_rflags(vcpu, tss->eflags | 2);
 
        kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
        kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4195,7 +4461,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
                                struct tss_segment_16 *tss)
 {
        tss->ip = kvm_rip_read(vcpu);
-       tss->flag = kvm_x86_ops->get_rflags(vcpu);
+       tss->flag = kvm_get_rflags(vcpu);
        tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
        tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
        tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4210,14 +4476,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
        tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-       tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
 }
 
 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
                                 struct tss_segment_16 *tss)
 {
        kvm_rip_write(vcpu, tss->ip);
-       kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
+       kvm_set_rflags(vcpu, tss->flag | 2);
        kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
        kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
        kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4363,8 +4628,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
        }
 
        if (reason == TASK_SWITCH_IRET) {
-               u32 eflags = kvm_x86_ops->get_rflags(vcpu);
-               kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+               u32 eflags = kvm_get_rflags(vcpu);
+               kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
        }
 
        /* set back link to prev task only if NT bit is set in eflags
@@ -4372,11 +4637,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
        if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
                old_tss_sel = 0xffff;
 
-       /* set back link to prev task only if NT bit is set in eflags
-          note that old_tss_sel is not used afetr this point */
-       if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
-               old_tss_sel = 0xffff;
-
        if (nseg_desc.type & 8)
                ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
                                         old_tss_base, &nseg_desc);
@@ -4385,8 +4645,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
                                         old_tss_base, &nseg_desc);
 
        if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
-               u32 eflags = kvm_x86_ops->get_rflags(vcpu);
-               kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+               u32 eflags = kvm_get_rflags(vcpu);
+               kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
        }
 
        if (reason != TASK_SWITCH_IRET) {
@@ -4438,8 +4698,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
        mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-       if (!is_long_mode(vcpu) && is_pae(vcpu))
+       if (!is_long_mode(vcpu) && is_pae(vcpu)) {
                load_pdptrs(vcpu, vcpu->arch.cr3);
+               mmu_reset_needed = 1;
+       }
 
        if (mmu_reset_needed)
                kvm_mmu_reset_context(vcpu);
@@ -4480,12 +4742,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg)
 {
+       unsigned long rflags;
        int i, r;
 
        vcpu_load(vcpu);
 
-       if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
-           (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
+       if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
+               r = -EBUSY;
+               if (vcpu->arch.exception.pending)
+                       goto unlock_out;
+               if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+                       kvm_queue_exception(vcpu, DB_VECTOR);
+               else
+                       kvm_queue_exception(vcpu, BP_VECTOR);
+       }
+
+       /*
+        * Read rflags as long as potentially injected trace flags are still
+        * filtered out.
+        */
+       rflags = kvm_get_rflags(vcpu);
+
+       vcpu->guest_debug = dbg->control;
+       if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+               vcpu->guest_debug = 0;
+
+       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                for (i = 0; i < KVM_NR_DB_REGS; ++i)
                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
                vcpu->arch.switch_db_regs =
@@ -4496,13 +4778,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
        }
 
-       r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+               vcpu->arch.singlestep_cs =
+                       get_segment_selector(vcpu, VCPU_SREG_CS);
+               vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
+       }
+
+       /*
+        * Trigger an rflags update that will inject or remove the trace
+        * flags.
+        */
+       kvm_set_rflags(vcpu, rflags);
+
+       kvm_x86_ops->set_guest_debug(vcpu, dbg);
 
-       if (dbg->control & KVM_GUESTDBG_INJECT_DB)
-               kvm_queue_exception(vcpu, DB_VECTOR);
-       else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
-               kvm_queue_exception(vcpu, BP_VECTOR);
+       r = 0;
 
+unlock_out:
        vcpu_put(vcpu);
 
        return r;
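
With this rework, KVM_SET_GUEST_DEBUG both programs hardware breakpoints/single-step and, via KVM_GUESTDBG_INJECT_DB/BP, queues debug exceptions, while the TF/RF bits used for single-stepping are hidden behind the kvm_get_rflags()/kvm_set_rflags() wrappers added below. A minimal, hypothetical userspace sketch that single-steps a vcpu (vcpu_fd and the usual includes assumed):

        static void enable_singlestep(int vcpu_fd)
        {
                struct kvm_guest_debug dbg = {
                        .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
                };

                if (ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg) < 0)
                        err(1, "KVM_SET_GUEST_DEBUG");
                /* The next KVM_RUN should exit with KVM_EXIT_DEBUG after a
                 * single guest instruction. */
        }
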
@@ -4703,14 +4995,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
        return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
-       kvm_x86_ops->hardware_enable(garbage);
+       /*
+        * Since this may be called from a hotplug notification,
+        * we can't get the CPU frequency directly.
+        */
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+               int cpu = raw_smp_processor_id();
+               per_cpu(cpu_tsc_khz, cpu) = 0;
+       }
+
+       kvm_shared_msr_cpu_online();
+
+       return kvm_x86_ops->hardware_enable(garbage);
 }
 
 void kvm_arch_hardware_disable(void *garbage)
 {
        kvm_x86_ops->hardware_disable(garbage);
+       drop_user_return_notifiers(garbage);
 }
 
 int kvm_arch_hardware_setup(void)
@@ -4948,8 +5252,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
        return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+       unsigned long rflags;
+
+       rflags = kvm_x86_ops->get_rflags(vcpu);
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+               rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+       return rflags;
+}
+EXPORT_SYMBOL_GPL(kvm_get_rflags);
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+           vcpu->arch.singlestep_cs ==
+                       get_segment_selector(vcpu, VCPU_SREG_CS) &&
+           vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+               rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+       kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+EXPORT_SYMBOL_GPL(kvm_set_rflags);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
index f8f8900..2d241da 100644 (file)
 
 #define KVM_API_VERSION 12
 
-/* for KVM_TRACE_ENABLE, deprecated */
+/* *** Deprecated interfaces *** */
+
+#define KVM_TRC_SHIFT           16
+
+#define KVM_TRC_ENTRYEXIT       (1 << KVM_TRC_SHIFT)
+#define KVM_TRC_HANDLER         (1 << (KVM_TRC_SHIFT + 1))
+
+#define KVM_TRC_VMENTRY         (KVM_TRC_ENTRYEXIT + 0x01)
+#define KVM_TRC_VMEXIT          (KVM_TRC_ENTRYEXIT + 0x02)
+#define KVM_TRC_PAGE_FAULT      (KVM_TRC_HANDLER + 0x01)
+
+#define KVM_TRC_HEAD_SIZE       12
+#define KVM_TRC_CYCLE_SIZE      8
+#define KVM_TRC_EXTRA_MAX       7
+
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
+
 struct kvm_user_trace_setup {
-       __u32 buf_size; /* sub_buffer size of each per-cpu */
-       __u32 buf_nr; /* the number of sub_buffers of each per-cpu */
+       __u32 buf_size;
+       __u32 buf_nr;
+};
+
+#define __KVM_DEPRECATED_MAIN_W_0x06 \
+       _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
+#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
+#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
+
+#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
+
+struct kvm_breakpoint {
+       __u32 enabled;
+       __u32 padding;
+       __u64 address;
+};
+
+struct kvm_debug_guest {
+       __u32 enabled;
+       __u32 pad;
+       struct kvm_breakpoint breakpoints[4];
+       __u32 singlestep;
 };
 
+#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+
+/* *** End of deprecated interfaces *** */
+
+
 /* for KVM_CREATE_MEMORY_REGION */
 struct kvm_memory_region {
        __u32 slot;
@@ -99,6 +163,7 @@ struct kvm_pit_config {
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
+#define KVM_INTERNAL_ERROR_SIMUL_EX 2
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
@@ -116,6 +181,11 @@ struct kvm_run {
        __u64 cr8;
        __u64 apic_base;
 
+#ifdef __KVM_S390
+       /* the processor status word for s390 */
+       __u64 psw_mask; /* psw upper half */
+       __u64 psw_addr; /* psw lower half */
+#endif
        union {
                /* KVM_EXIT_UNKNOWN */
                struct {
@@ -167,8 +237,6 @@ struct kvm_run {
                /* KVM_EXIT_S390_SIEIC */
                struct {
                        __u8 icptcode;
-                       __u64 mask; /* psw upper half */
-                       __u64 addr; /* psw lower half */
                        __u16 ipa;
                        __u32 ipb;
                } s390_sieic;
@@ -187,6 +255,9 @@ struct kvm_run {
                } dcr;
                struct {
                        __u32 suberror;
+                       /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
+                       __u32 ndata;
+                       __u64 data[16];
                } internal;
                /* Fix the size of the union. */
                char padding[256];
@@ -329,24 +400,6 @@ struct kvm_ioeventfd {
        __u8  pad[36];
 };
 
-#define KVM_TRC_SHIFT           16
-/*
- * kvm trace categories
- */
-#define KVM_TRC_ENTRYEXIT       (1 << KVM_TRC_SHIFT)
-#define KVM_TRC_HANDLER         (1 << (KVM_TRC_SHIFT + 1)) /* only 12 bits */
-
-/*
- * kvm trace action
- */
-#define KVM_TRC_VMENTRY         (KVM_TRC_ENTRYEXIT + 0x01)
-#define KVM_TRC_VMEXIT          (KVM_TRC_ENTRYEXIT + 0x02)
-#define KVM_TRC_PAGE_FAULT      (KVM_TRC_HANDLER + 0x01)
-
-#define KVM_TRC_HEAD_SIZE       12
-#define KVM_TRC_CYCLE_SIZE      8
-#define KVM_TRC_EXTRA_MAX       7
-
 #define KVMIO 0xAE
 
 /*
@@ -367,12 +420,10 @@ struct kvm_ioeventfd {
  */
 #define KVM_GET_VCPU_MMAP_SIZE    _IO(KVMIO,   0x04) /* in bytes */
 #define KVM_GET_SUPPORTED_CPUID   _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
-/*
- * ioctls for kvm trace
- */
-#define KVM_TRACE_ENABLE          _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
-#define KVM_TRACE_PAUSE           _IO(KVMIO,  0x07)
-#define KVM_TRACE_DISABLE         _IO(KVMIO,  0x08)
+#define KVM_TRACE_ENABLE          __KVM_DEPRECATED_MAIN_W_0x06
+#define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
+#define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
+
 /*
  * Extension capability list.
  */
@@ -436,6 +487,15 @@ struct kvm_ioeventfd {
 #endif
 #define KVM_CAP_IOEVENTFD 36
 #define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+#ifdef __KVM_HAVE_XEN_HVM
+#define KVM_CAP_XEN_HVM 38
+#endif
+#define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_INTERNAL_ERROR_DATA 40
+#ifdef __KVM_HAVE_VCPU_EVENTS
+#define KVM_CAP_VCPU_EVENTS 41
+#endif
+#define KVM_CAP_S390_PSW 42
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -488,6 +548,18 @@ struct kvm_x86_mce {
 };
 #endif
 
+#ifdef KVM_CAP_XEN_HVM
+struct kvm_xen_hvm_config {
+       __u32 flags;
+       __u32 msr;
+       __u64 blob_addr_32;
+       __u64 blob_addr_64;
+       __u8 blob_size_32;
+       __u8 blob_size_64;
+       __u8 pad2[30];
+};
+#endif
+
 #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
 
 struct kvm_irqfd {
@@ -497,55 +569,66 @@ struct kvm_irqfd {
        __u8  pad[20];
 };
 
+struct kvm_clock_data {
+       __u64 clock;
+       __u32 flags;
+       __u32 pad[9];
+};
+
 /*
  * ioctls for VM fds
  */
-#define KVM_SET_MEMORY_REGION     _IOW(KVMIO, 0x40, struct kvm_memory_region)
+#define KVM_SET_MEMORY_REGION     _IOW(KVMIO,  0x40, struct kvm_memory_region)
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
  */
-#define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
-#define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
-#define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
-#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO, 0x44)
-#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO, 0x45)
-#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
+#define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
+#define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
+#define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO,  0x43, struct kvm_memory_alias)
+#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
                                        struct kvm_userspace_memory_region)
-#define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
-#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
+#define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
+#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
 /* Device model IOC */
-#define KVM_CREATE_IRQCHIP       _IO(KVMIO,  0x60)
-#define KVM_IRQ_LINE             _IOW(KVMIO, 0x61, struct kvm_irq_level)
-#define KVM_GET_IRQCHIP                  _IOWR(KVMIO, 0x62, struct kvm_irqchip)
-#define KVM_SET_IRQCHIP                  _IOR(KVMIO,  0x63, struct kvm_irqchip)
-#define KVM_CREATE_PIT           _IO(KVMIO,  0x64)
-#define KVM_GET_PIT              _IOWR(KVMIO, 0x65, struct kvm_pit_state)
-#define KVM_SET_PIT              _IOR(KVMIO,  0x66, struct kvm_pit_state)
-#define KVM_IRQ_LINE_STATUS      _IOWR(KVMIO, 0x67, struct kvm_irq_level)
+#define KVM_CREATE_IRQCHIP        _IO(KVMIO,   0x60)
+#define KVM_IRQ_LINE              _IOW(KVMIO,  0x61, struct kvm_irq_level)
+#define KVM_GET_IRQCHIP           _IOWR(KVMIO, 0x62, struct kvm_irqchip)
+#define KVM_SET_IRQCHIP           _IOR(KVMIO,  0x63, struct kvm_irqchip)
+#define KVM_CREATE_PIT            _IO(KVMIO,   0x64)
+#define KVM_GET_PIT               _IOWR(KVMIO, 0x65, struct kvm_pit_state)
+#define KVM_SET_PIT               _IOR(KVMIO,  0x66, struct kvm_pit_state)
+#define KVM_IRQ_LINE_STATUS       _IOWR(KVMIO, 0x67, struct kvm_irq_level)
 #define KVM_REGISTER_COALESCED_MMIO \
                        _IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
 #define KVM_UNREGISTER_COALESCED_MMIO \
                        _IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
-#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
-                                  struct kvm_assigned_pci_dev)
-#define KVM_SET_GSI_ROUTING       _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
+#define KVM_ASSIGN_PCI_DEVICE     _IOR(KVMIO,  0x69, \
+                                      struct kvm_assigned_pci_dev)
+#define KVM_SET_GSI_ROUTING       _IOW(KVMIO,  0x6a, struct kvm_irq_routing)
 /* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
-#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
-                           struct kvm_assigned_irq)
-#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
-#define KVM_REINJECT_CONTROL      _IO(KVMIO, 0x71)
-#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
-                                    struct kvm_assigned_pci_dev)
-#define KVM_ASSIGN_SET_MSIX_NR \
-                       _IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr)
-#define KVM_ASSIGN_SET_MSIX_ENTRY \
-                       _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
-#define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
-#define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
-#define KVM_CREATE_PIT2                   _IOW(KVMIO, 0x77, struct kvm_pit_config)
-#define KVM_SET_BOOT_CPU_ID        _IO(KVMIO, 0x78)
-#define KVM_IOEVENTFD             _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
+#define KVM_ASSIGN_IRQ            __KVM_DEPRECATED_VM_R_0x70
+#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO,  0x70, struct kvm_assigned_irq)
+#define KVM_REINJECT_CONTROL      _IO(KVMIO,   0x71)
+#define KVM_DEASSIGN_PCI_DEVICE   _IOW(KVMIO,  0x72, \
+                                      struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR    _IOW(KVMIO,  0x73, \
+                                      struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO,  0x74, \
+                                      struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ      _IOW(KVMIO,  0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD                 _IOW(KVMIO,  0x76, struct kvm_irqfd)
+#define KVM_CREATE_PIT2                  _IOW(KVMIO,  0x77, struct kvm_pit_config)
+#define KVM_SET_BOOT_CPU_ID       _IO(KVMIO,   0x78)
+#define KVM_IOEVENTFD             _IOW(KVMIO,  0x79, struct kvm_ioeventfd)
+#define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
+#define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_PIT_STATE2 */
+#define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
+#define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 
 /*
  * ioctls for vcpu fds
@@ -558,7 +641,7 @@ struct kvm_irqfd {
 #define KVM_TRANSLATE             _IOWR(KVMIO, 0x85, struct kvm_translation)
 #define KVM_INTERRUPT             _IOW(KVMIO,  0x86, struct kvm_interrupt)
 /* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
-#define KVM_DEBUG_GUEST           __KVM_DEPRECATED_DEBUG_GUEST
+#define KVM_DEBUG_GUEST           __KVM_DEPRECATED_VCPU_W_0x87
 #define KVM_GET_MSRS              _IOWR(KVMIO, 0x88, struct kvm_msrs)
 #define KVM_SET_MSRS              _IOW(KVMIO,  0x89, struct kvm_msrs)
 #define KVM_SET_CPUID             _IOW(KVMIO,  0x8a, struct kvm_cpuid)
@@ -570,7 +653,7 @@ struct kvm_irqfd {
 #define KVM_SET_CPUID2            _IOW(KVMIO,  0x90, struct kvm_cpuid2)
 #define KVM_GET_CPUID2            _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
 /* Available with KVM_CAP_VAPIC */
-#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO,  0x92, struct kvm_tpr_access_ctl)
+#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
 /* Available with KVM_CAP_VAPIC */
 #define KVM_SET_VAPIC_ADDR        _IOW(KVMIO,  0x93, struct kvm_vapic_addr)
 /* valid for virtual machine (for floating interrupt)_and_ vcpu */
@@ -582,66 +665,23 @@ struct kvm_irqfd {
 /* initial ipl psw for s390 */
 #define KVM_S390_SET_INITIAL_PSW  _IOW(KVMIO,  0x96, struct kvm_s390_psw)
 /* initial reset for s390 */
-#define KVM_S390_INITIAL_RESET    _IO(KVMIO,  0x97)
+#define KVM_S390_INITIAL_RESET    _IO(KVMIO,   0x97)
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
 /* Available with KVM_CAP_NMI */
-#define KVM_NMI                   _IO(KVMIO,  0x9a)
+#define KVM_NMI                   _IO(KVMIO,   0x9a)
 /* Available with KVM_CAP_SET_GUEST_DEBUG */
 #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
 /* MCE for x86 */
 #define KVM_X86_SETUP_MCE         _IOW(KVMIO,  0x9c, __u64)
 #define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO,  0x9d, __u64)
 #define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9e, struct kvm_x86_mce)
-
-/*
- * Deprecated interfaces
- */
-struct kvm_breakpoint {
-       __u32 enabled;
-       __u32 padding;
-       __u64 address;
-};
-
-struct kvm_debug_guest {
-       __u32 enabled;
-       __u32 pad;
-       struct kvm_breakpoint breakpoints[4];
-       __u32 singlestep;
-};
-
-#define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO,  0x87, struct kvm_debug_guest)
-
+/* IA64 stack access */
 #define KVM_IA64_VCPU_GET_STACK   _IOR(KVMIO,  0x9a, void *)
 #define KVM_IA64_VCPU_SET_STACK   _IOW(KVMIO,  0x9b, void *)
-
-#define KVM_GET_PIT2   _IOR(KVMIO,   0x9f, struct kvm_pit_state2)
-#define KVM_SET_PIT2   _IOW(KVMIO,   0xa0, struct kvm_pit_state2)
-
-#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
-#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
-#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
-#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
-#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
-#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
-#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
-#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
-#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
-#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
-#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
-#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
-#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
-#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
-#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
-#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
-#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
-#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
-#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
-#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
-#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
-#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
-#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
-#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
+/* Available with KVM_CAP_VCPU_EVENTS */
+#define KVM_GET_VCPU_EVENTS       _IOR(KVMIO,  0x9f, struct kvm_vcpu_events)
+#define KVM_SET_VCPU_EVENTS       _IOW(KVMIO,  0xa0, struct kvm_vcpu_events)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 
@@ -696,4 +736,4 @@ struct kvm_assigned_msix_entry {
        __u16 padding[3];
 };
 
-#endif
+#endif /* __LINUX_KVM_H */
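
The renumbered ioctl table above also introduces KVM_SET_CLOCK/KVM_GET_CLOCK (struct kvm_clock_data, guarded by KVM_CAP_ADJUST_CLOCK) and KVM_XEN_HVM_CONFIG. As a hedged userspace sketch, saving and restoring the guest's kvmclock could look like the following; the vm fd is assumed to come from KVM_CREATE_VM, the capability check is omitted, and this is illustrative rather than qemu-kvm code.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Illustrative only: snapshot the guest clock, then reload it later,
 * e.g. on a migration target.  Requires KVM_CAP_ADJUST_CLOCK. */
static int save_restore_clock(int vm_fd)
{
        struct kvm_clock_data data = { 0 };

        if (ioctl(vm_fd, KVM_GET_CLOCK, &data) < 0) {
                perror("KVM_GET_CLOCK");
                return -1;
        }
        printf("guest clock: %llu ns\n", (unsigned long long)data.clock);

        if (ioctl(vm_fd, KVM_SET_CLOCK, &data) < 0) {
                perror("KVM_SET_CLOCK");
                return -1;
        }
        return 0;
}
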
index b7bbb5d..bd5a616 100644 (file)
@@ -120,7 +120,7 @@ struct kvm_kernel_irq_routing_entry {
        u32 gsi;
        u32 type;
        int (*set)(struct kvm_kernel_irq_routing_entry *e,
-                   struct kvm *kvm, int level);
+                  struct kvm *kvm, int irq_source_id, int level);
        union {
                struct {
                        unsigned irqchip;
@@ -128,9 +128,28 @@ struct kvm_kernel_irq_routing_entry {
                } irqchip;
                struct msi_msg msi;
        };
-       struct list_head link;
+       struct hlist_node link;
+};
+
+#ifdef __KVM_HAVE_IOAPIC
+
+struct kvm_irq_routing_table {
+       int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
+       struct kvm_kernel_irq_routing_entry *rt_entries;
+       u32 nr_rt_entries;
+       /*
+        * Array indexed by gsi. Each entry contains a list of the irq chips
+        * the gsi is connected to.
+        */
+       struct hlist_head map[0];
 };
 
+#else
+
+struct kvm_irq_routing_table {};
+
+#endif
+
 struct kvm {
        spinlock_t mmu_lock;
        spinlock_t requests_lock;
@@ -166,8 +185,9 @@ struct kvm {
 
        struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-       struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */
+       struct kvm_irq_routing_table *irq_routing;
        struct hlist_head mask_notifier_list;
+       struct hlist_head irq_ack_notifier_list;
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
@@ -266,6 +286,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
@@ -325,7 +346,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
-void kvm_arch_hardware_enable(void *garbage);
+int kvm_arch_hardware_enable(void *garbage);
 void kvm_arch_hardware_disable(void *garbage);
 int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
@@ -390,7 +411,12 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
                                      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
+#ifdef __KVM_HAVE_IOAPIC
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+                                  union kvm_ioapic_redirect_entry *entry,
+                                  unsigned long *deliver_bitmask);
+#endif
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian);
@@ -552,4 +578,21 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
        return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 }
 #endif
+
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+                                 unsigned long arg);
+
+#else
+
+static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+                                               unsigned long arg)
+{
+       return -ENOTTY;
+}
+
 #endif
+
+#endif
+
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644 (file)
index 0000000..9c4a445
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef _LINUX_USER_RETURN_NOTIFIER_H
+#define _LINUX_USER_RETURN_NOTIFIER_H
+
+#ifdef CONFIG_USER_RETURN_NOTIFIER
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct user_return_notifier {
+       void (*on_user_return)(struct user_return_notifier *urn);
+       struct hlist_node link;
+};
+
+
+void user_return_notifier_register(struct user_return_notifier *urn);
+void user_return_notifier_unregister(struct user_return_notifier *urn);
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+                                               struct task_struct *next)
+{
+       if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
+               clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
+               set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
+       }
+}
+
+void fire_user_return_notifiers(void);
+
+static inline void clear_user_return_notifier(struct task_struct *p)
+{
+       clear_tsk_thread_flag(p, TIF_USER_RETURN_NOTIFY);
+}
+
+#else
+
+struct user_return_notifier {};
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+                                               struct task_struct *next)
+{
+}
+
+static inline void fire_user_return_notifiers(void) {}
+
+static inline void clear_user_return_notifier(struct task_struct *p) {}
+
+#endif
+
+#endif
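
The header above only declares the API; a hedged sketch of a hypothetical consumer follows. The example_* names are made up for illustration, but the calling convention, register and run the callback in atomic context and unregister from the same cpu, follows the comments in kernel/user-return-notifier.c further down.

#include <linux/user-return-notifier.h>
#include <linux/percpu.h>
#include <linux/kernel.h>

/* Hypothetical per-cpu consumer: arm a callback that runs just before this
 * cpu returns to userspace, and disarm it from inside the callback. */
struct example_state {
        struct user_return_notifier urn;
        bool registered;
};

static DEFINE_PER_CPU(struct example_state, example_state);

static void example_on_user_return(struct user_return_notifier *urn)
{
        struct example_state *st = container_of(urn, struct example_state, urn);

        st->registered = false;
        user_return_notifier_unregister(urn);   /* same cpu, atomic context */
}

static void example_arm(void)
{
        struct example_state *st = &get_cpu_var(example_state);

        if (!st->registered) {
                st->urn.on_user_return = example_on_user_return;
                user_return_notifier_register(&st->urn);
                st->registered = true;
        }
        put_cpu_var(example_state);
}
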
index 9943202..864ff75 100644 (file)
@@ -99,6 +99,7 @@ obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
index 3d6f121..edeff9c 100644 (file)
@@ -64,6 +64,7 @@
 #include <linux/magic.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
                goto out;
 
        setup_thread_stack(tsk, orig);
+       clear_user_return_notifier(tsk);
        stackend = end_of_stack(tsk);
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
 
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644 (file)
index 0000000..03e2d6f
--- /dev/null
@@ -0,0 +1,46 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
+
+/*
+ * Request a notification when the current cpu returns to userspace.  Must be
+ * called in atomic context.  The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+       set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+       hlist_add_head(&urn->link, &URN_LIST_HEAD);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier.  Must be called from atomic
+ * context, and from the same cpu on which registration occurred.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+       hlist_del(&urn->link);
+       if (hlist_empty(&URN_LIST_HEAD))
+               clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+       struct user_return_notifier *urn;
+       struct hlist_node *tmp1, *tmp2;
+       struct hlist_head *head;
+
+       head = &get_cpu_var(return_notifier_list);
+       hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+               urn->on_user_return(urn);
+       put_cpu_var(return_notifier_list);
+}
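
The firing side above is meant to be wired into the architecture's return-to-userspace path (arch/x86/kernel/signal.c and thread_info.h are also touched by this merge, though not shown here). The sketch below is a rough placeholder for such a hook, assuming the arch defines a _TIF_USER_RETURN_NOTIFY mask in its thread_info.h; the function name is invented for illustration.

#include <linux/user-return-notifier.h>

/* Hypothetical arch hook: on the way back to userspace, run any pending
 * user-return notifiers for this cpu if the thread flag is set. */
static void example_exit_to_user(unsigned long thread_info_flags)
{
        if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
                fire_user_return_notifiers();
}
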
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
new file mode 100644 (file)
index 0000000..fd9c097
--- /dev/null
@@ -0,0 +1,818 @@
+/*
+ * Kernel-based Virtual Machine - device assignment support
+ *
+ * Copyright (C) 2006-9 Red Hat, Inc
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include "irq.h"
+
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+                                                     int assigned_dev_id)
+{
+       struct list_head *ptr;
+       struct kvm_assigned_dev_kernel *match;
+
+       list_for_each(ptr, head) {
+               match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+               if (match->assigned_dev_id == assigned_dev_id)
+                       return match;
+       }
+       return NULL;
+}
+
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+                                   *assigned_dev, int irq)
+{
+       int i, index;
+       struct msix_entry *host_msix_entries;
+
+       host_msix_entries = assigned_dev->host_msix_entries;
+
+       index = -1;
+       for (i = 0; i < assigned_dev->entries_nr; i++)
+               if (irq == host_msix_entries[i].vector) {
+                       index = i;
+                       break;
+               }
+       if (index < 0) {
+               printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");
+               return 0;
+       }
+
+       return index;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev;
+       struct kvm *kvm;
+       int i;
+
+       assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+                                   interrupt_work);
+       kvm = assigned_dev->kvm;
+
+       spin_lock_irq(&assigned_dev->assigned_dev_lock);
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+               struct kvm_guest_msix_entry *guest_entries =
+                       assigned_dev->guest_msix_entries;
+               for (i = 0; i < assigned_dev->entries_nr; i++) {
+                       if (!(guest_entries[i].flags &
+                                       KVM_ASSIGNED_MSIX_PENDING))
+                               continue;
+                       guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+                       kvm_set_irq(assigned_dev->kvm,
+                                   assigned_dev->irq_source_id,
+                                   guest_entries[i].vector, 1);
+               }
+       } else
+               kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+                           assigned_dev->guest_irq, 1);
+
+       spin_unlock_irq(&assigned_dev->assigned_dev_lock);
+}
+
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+       unsigned long flags;
+       struct kvm_assigned_dev_kernel *assigned_dev =
+               (struct kvm_assigned_dev_kernel *) dev_id;
+
+       spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+               int index = find_index_from_host_irq(assigned_dev, irq);
+               if (index < 0)
+                       goto out;
+               assigned_dev->guest_msix_entries[index].flags |=
+                       KVM_ASSIGNED_MSIX_PENDING;
+       }
+
+       schedule_work(&assigned_dev->interrupt_work);
+
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+               disable_irq_nosync(irq);
+               assigned_dev->host_irq_disabled = true;
+       }
+
+out:
+       spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
+       return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+       struct kvm_assigned_dev_kernel *dev;
+       unsigned long flags;
+
+       if (kian->gsi == -1)
+               return;
+
+       dev = container_of(kian, struct kvm_assigned_dev_kernel,
+                          ack_notifier);
+
+       kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
+
+       /* The guest irq may be shared so this ack may be
+        * from another device.
+        */
+       spin_lock_irqsave(&dev->assigned_dev_lock, flags);
+       if (dev->host_irq_disabled) {
+               enable_irq(dev->host_irq);
+               dev->host_irq_disabled = false;
+       }
+       spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
+}
+
+static void deassign_guest_irq(struct kvm *kvm,
+                              struct kvm_assigned_dev_kernel *assigned_dev)
+{
+       kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+       assigned_dev->ack_notifier.gsi = -1;
+
+       if (assigned_dev->irq_source_id != -1)
+               kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+       assigned_dev->irq_source_id = -1;
+       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+                             struct kvm_assigned_dev_kernel *assigned_dev)
+{
+       /*
+        * In kvm_free_device_irq, cancel_work_sync() returns true if:
+        * 1. the work was scheduled and has been cancelled, or
+        * 2. the work callback has already executed.
+        *
+        * The first case guarantees that the irq is disabled and no further
+        * events can arrive.  In the second case the irq may still be enabled
+        * (e.g. for MSI), so we disable it here to prevent further events.
+        *
+        * Note that this may result in a nested disable if the interrupt type
+        * is INTx, which is fine since we are about to free it anyway.
+        *
+        * If this function is called as part of VM destruction, make sure the
+        * kvm state is still valid at this point, since we may also have to
+        * wait for interrupt_work to complete.
+        */
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+               int i;
+               for (i = 0; i < assigned_dev->entries_nr; i++)
+                       disable_irq_nosync(assigned_dev->
+                                          host_msix_entries[i].vector);
+
+               cancel_work_sync(&assigned_dev->interrupt_work);
+
+               for (i = 0; i < assigned_dev->entries_nr; i++)
+                       free_irq(assigned_dev->host_msix_entries[i].vector,
+                                (void *)assigned_dev);
+
+               assigned_dev->entries_nr = 0;
+               kfree(assigned_dev->host_msix_entries);
+               kfree(assigned_dev->guest_msix_entries);
+               pci_disable_msix(assigned_dev->dev);
+       } else {
+               /* Deal with MSI and INTx */
+               disable_irq_nosync(assigned_dev->host_irq);
+               cancel_work_sync(&assigned_dev->interrupt_work);
+
+               free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+               if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+                       pci_disable_msi(assigned_dev->dev);
+       }
+
+       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+                           struct kvm_assigned_dev_kernel *assigned_dev,
+                           unsigned long irq_requested_type)
+{
+       unsigned long guest_irq_type, host_irq_type;
+
+       if (!irqchip_in_kernel(kvm))
+               return -EINVAL;
+       /* no irq assignment to deassign */
+       if (!assigned_dev->irq_requested_type)
+               return -ENXIO;
+
+       host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+       guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+       if (host_irq_type)
+               deassign_host_irq(kvm, assigned_dev);
+       if (guest_irq_type)
+               deassign_guest_irq(kvm, assigned_dev);
+
+       return 0;
+}
+
+static void kvm_free_assigned_irq(struct kvm *kvm,
+                                 struct kvm_assigned_dev_kernel *assigned_dev)
+{
+       kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+                                    struct kvm_assigned_dev_kernel
+                                    *assigned_dev)
+{
+       kvm_free_assigned_irq(kvm, assigned_dev);
+
+       pci_reset_function(assigned_dev->dev);
+
+       pci_release_regions(assigned_dev->dev);
+       pci_disable_device(assigned_dev->dev);
+       pci_dev_put(assigned_dev->dev);
+
+       list_del(&assigned_dev->list);
+       kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+       struct list_head *ptr, *ptr2;
+       struct kvm_assigned_dev_kernel *assigned_dev;
+
+       list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+               assigned_dev = list_entry(ptr,
+                                         struct kvm_assigned_dev_kernel,
+                                         list);
+
+               kvm_free_assigned_device(kvm, assigned_dev);
+       }
+}
+
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+                                           struct kvm_assigned_dev_kernel *dev)
+{
+       dev->host_irq = dev->dev->irq;
+       /* Even though this is PCI, we don't want to use shared
+        * interrupts. Sharing host devices with guest-assigned devices
+        * on the same interrupt line is not a happy situation: there
+        * are going to be long delays in accepting, acking, etc.
+        */
+       if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
+                       0, "kvm_assigned_intx_device", (void *)dev))
+               return -EIO;
+       return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+                                          struct kvm_assigned_dev_kernel *dev)
+{
+       int r;
+
+       if (!dev->dev->msi_enabled) {
+               r = pci_enable_msi(dev->dev);
+               if (r)
+                       return r;
+       }
+
+       dev->host_irq = dev->dev->irq;
+       if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
+                       "kvm_assigned_msi_device", (void *)dev)) {
+               pci_disable_msi(dev->dev);
+               return -EIO;
+       }
+
+       return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+                                           struct kvm_assigned_dev_kernel *dev)
+{
+       int i, r = -EINVAL;
+
+       /* host_msix_entries and guest_msix_entries should have been
+        * initialized */
+       if (dev->entries_nr == 0)
+               return r;
+
+       r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+       if (r)
+               return r;
+
+       for (i = 0; i < dev->entries_nr; i++) {
+               r = request_irq(dev->host_msix_entries[i].vector,
+                               kvm_assigned_dev_intr, 0,
+                               "kvm_assigned_msix_device",
+                               (void *)dev);
+               /* FIXME: free requested_irq's on failure */
+               if (r)
+                       return r;
+       }
+
+       return 0;
+}
+
+#endif
+
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+                               struct kvm_assigned_dev_kernel *dev,
+                               struct kvm_assigned_irq *irq)
+{
+       dev->guest_irq = irq->guest_irq;
+       dev->ack_notifier.gsi = irq->guest_irq;
+       return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *dev,
+                       struct kvm_assigned_irq *irq)
+{
+       dev->guest_irq = irq->guest_irq;
+       dev->ack_notifier.gsi = -1;
+       dev->host_irq_disabled = false;
+       return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *dev,
+                       struct kvm_assigned_irq *irq)
+{
+       dev->guest_irq = irq->guest_irq;
+       dev->ack_notifier.gsi = -1;
+       dev->host_irq_disabled = false;
+       return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+                          struct kvm_assigned_dev_kernel *dev,
+                          __u32 host_irq_type)
+{
+       int r = -EEXIST;
+
+       if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+               return r;
+
+       switch (host_irq_type) {
+       case KVM_DEV_IRQ_HOST_INTX:
+               r = assigned_device_enable_host_intx(kvm, dev);
+               break;
+#ifdef __KVM_HAVE_MSI
+       case KVM_DEV_IRQ_HOST_MSI:
+               r = assigned_device_enable_host_msi(kvm, dev);
+               break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+       case KVM_DEV_IRQ_HOST_MSIX:
+               r = assigned_device_enable_host_msix(kvm, dev);
+               break;
+#endif
+       default:
+               r = -EINVAL;
+       }
+
+       if (!r)
+               dev->irq_requested_type |= host_irq_type;
+
+       return r;
+}
+
+static int assign_guest_irq(struct kvm *kvm,
+                           struct kvm_assigned_dev_kernel *dev,
+                           struct kvm_assigned_irq *irq,
+                           unsigned long guest_irq_type)
+{
+       int id;
+       int r = -EEXIST;
+
+       if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+               return r;
+
+       id = kvm_request_irq_source_id(kvm);
+       if (id < 0)
+               return id;
+
+       dev->irq_source_id = id;
+
+       switch (guest_irq_type) {
+       case KVM_DEV_IRQ_GUEST_INTX:
+               r = assigned_device_enable_guest_intx(kvm, dev, irq);
+               break;
+#ifdef __KVM_HAVE_MSI
+       case KVM_DEV_IRQ_GUEST_MSI:
+               r = assigned_device_enable_guest_msi(kvm, dev, irq);
+               break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+       case KVM_DEV_IRQ_GUEST_MSIX:
+               r = assigned_device_enable_guest_msix(kvm, dev, irq);
+               break;
+#endif
+       default:
+               r = -EINVAL;
+       }
+
+       if (!r) {
+               dev->irq_requested_type |= guest_irq_type;
+               kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+       } else
+               kvm_free_irq_source_id(kvm, dev->irq_source_id);
+
+       return r;
+}
+
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+                                  struct kvm_assigned_irq *assigned_irq)
+{
+       int r = -EINVAL;
+       struct kvm_assigned_dev_kernel *match;
+       unsigned long host_irq_type, guest_irq_type;
+
+       if (!capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       if (!irqchip_in_kernel(kvm))
+               return r;
+
+       mutex_lock(&kvm->lock);
+       r = -ENODEV;
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_irq->assigned_dev_id);
+       if (!match)
+               goto out;
+
+       host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+       guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
+
+       r = -EINVAL;
+       /* can only assign one type at a time */
+       if (hweight_long(host_irq_type) > 1)
+               goto out;
+       if (hweight_long(guest_irq_type) > 1)
+               goto out;
+       if (host_irq_type == 0 && guest_irq_type == 0)
+               goto out;
+
+       r = 0;
+       if (host_irq_type)
+               r = assign_host_irq(kvm, match, host_irq_type);
+       if (r)
+               goto out;
+
+       if (guest_irq_type)
+               r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+                                        struct kvm_assigned_irq
+                                        *assigned_irq)
+{
+       int r = -ENODEV;
+       struct kvm_assigned_dev_kernel *match;
+
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_irq->assigned_dev_id);
+       if (!match)
+               goto out;
+
+       r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+                                     struct kvm_assigned_pci_dev *assigned_dev)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+       struct pci_dev *dev;
+
+       down_read(&kvm->slots_lock);
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_dev->assigned_dev_id);
+       if (match) {
+               /* device already assigned */
+               r = -EEXIST;
+               goto out;
+       }
+
+       match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+       if (match == NULL) {
+               printk(KERN_INFO "%s: Couldn't allocate memory\n",
+                      __func__);
+               r = -ENOMEM;
+               goto out;
+       }
+       dev = pci_get_bus_and_slot(assigned_dev->busnr,
+                                  assigned_dev->devfn);
+       if (!dev) {
+               printk(KERN_INFO "%s: host device not found\n", __func__);
+               r = -EINVAL;
+               goto out_free;
+       }
+       if (pci_enable_device(dev)) {
+               printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+               r = -EBUSY;
+               goto out_put;
+       }
+       r = pci_request_regions(dev, "kvm_assigned_device");
+       if (r) {
+               printk(KERN_INFO "%s: Could not get access to device regions\n",
+                      __func__);
+               goto out_disable;
+       }
+
+       pci_reset_function(dev);
+
+       match->assigned_dev_id = assigned_dev->assigned_dev_id;
+       match->host_busnr = assigned_dev->busnr;
+       match->host_devfn = assigned_dev->devfn;
+       match->flags = assigned_dev->flags;
+       match->dev = dev;
+       spin_lock_init(&match->assigned_dev_lock);
+       match->irq_source_id = -1;
+       match->kvm = kvm;
+       match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+       INIT_WORK(&match->interrupt_work,
+                 kvm_assigned_dev_interrupt_work_handler);
+
+       list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+       if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+               if (!kvm->arch.iommu_domain) {
+                       r = kvm_iommu_map_guest(kvm);
+                       if (r)
+                               goto out_list_del;
+               }
+               r = kvm_assign_device(kvm, match);
+               if (r)
+                       goto out_list_del;
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       up_read(&kvm->slots_lock);
+       return r;
+out_list_del:
+       list_del(&match->list);
+       pci_release_regions(dev);
+out_disable:
+       pci_disable_device(dev);
+out_put:
+       pci_dev_put(dev);
+out_free:
+       kfree(match);
+       mutex_unlock(&kvm->lock);
+       up_read(&kvm->slots_lock);
+       return r;
+}
+
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+               struct kvm_assigned_pci_dev *assigned_dev)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_dev->assigned_dev_id);
+       if (!match) {
+               printk(KERN_INFO "%s: device hasn't been assigned before, "
+                 "so cannot be deassigned\n", __func__);
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+               kvm_deassign_device(kvm, match);
+
+       kvm_free_assigned_device(kvm, match);
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+                                   struct kvm_assigned_msix_nr *entry_nr)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *adev;
+
+       mutex_lock(&kvm->lock);
+
+       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     entry_nr->assigned_dev_id);
+       if (!adev) {
+               r = -EINVAL;
+               goto msix_nr_out;
+       }
+
+       if (adev->entries_nr == 0) {
+               adev->entries_nr = entry_nr->entry_nr;
+               if (adev->entries_nr == 0 ||
+                   adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+                       r = -EINVAL;
+                       goto msix_nr_out;
+               }
+
+               adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+                                               entry_nr->entry_nr,
+                                               GFP_KERNEL);
+               if (!adev->host_msix_entries) {
+                       r = -ENOMEM;
+                       goto msix_nr_out;
+               }
+               adev->guest_msix_entries = kzalloc(
+                               sizeof(struct kvm_guest_msix_entry) *
+                               entry_nr->entry_nr, GFP_KERNEL);
+               if (!adev->guest_msix_entries) {
+                       kfree(adev->host_msix_entries);
+                       r = -ENOMEM;
+                       goto msix_nr_out;
+               }
+       } else /* Not allowed to set the MSI-X number twice */
+               r = -EINVAL;
+msix_nr_out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+                                      struct kvm_assigned_msix_entry *entry)
+{
+       int r = 0, i;
+       struct kvm_assigned_dev_kernel *adev;
+
+       mutex_lock(&kvm->lock);
+
+       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     entry->assigned_dev_id);
+
+       if (!adev) {
+               r = -EINVAL;
+               goto msix_entry_out;
+       }
+
+       for (i = 0; i < adev->entries_nr; i++)
+               if (adev->guest_msix_entries[i].vector == 0 ||
+                   adev->guest_msix_entries[i].entry == entry->entry) {
+                       adev->guest_msix_entries[i].entry = entry->entry;
+                       adev->guest_msix_entries[i].vector = entry->gsi;
+                       adev->host_msix_entries[i].entry = entry->entry;
+                       break;
+               }
+       if (i == adev->entries_nr) {
+               r = -ENOSPC;
+               goto msix_entry_out;
+       }
+
+msix_entry_out:
+       mutex_unlock(&kvm->lock);
+
+       return r;
+}
+#endif
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+                                 unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+       int r = -ENOTTY;
+
+       switch (ioctl) {
+       case KVM_ASSIGN_PCI_DEVICE: {
+               struct kvm_assigned_pci_dev assigned_dev;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+                       goto out;
+               r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_ASSIGN_IRQ: {
+               r = -EOPNOTSUPP;
+               break;
+       }
+#ifdef KVM_CAP_ASSIGN_DEV_IRQ
+       case KVM_ASSIGN_DEV_IRQ: {
+               struct kvm_assigned_irq assigned_irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+                       goto out;
+               r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_DEASSIGN_DEV_IRQ: {
+               struct kvm_assigned_irq assigned_irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+                       goto out;
+               r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+               if (r)
+                       goto out;
+               break;
+       }
+#endif
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
+       case KVM_DEASSIGN_PCI_DEVICE: {
+               struct kvm_assigned_pci_dev assigned_dev;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+                       goto out;
+               r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+               if (r)
+                       goto out;
+               break;
+       }
+#endif
+#ifdef KVM_CAP_IRQ_ROUTING
+       case KVM_SET_GSI_ROUTING: {
+               struct kvm_irq_routing routing;
+               struct kvm_irq_routing __user *urouting;
+               struct kvm_irq_routing_entry *entries;
+
+               r = -EFAULT;
+               if (copy_from_user(&routing, argp, sizeof(routing)))
+                       goto out;
+               r = -EINVAL;
+               if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+                       goto out;
+               if (routing.flags)
+                       goto out;
+               r = -ENOMEM;
+               entries = vmalloc(routing.nr * sizeof(*entries));
+               if (!entries)
+                       goto out;
+               r = -EFAULT;
+               urouting = argp;
+               if (copy_from_user(entries, urouting->entries,
+                                  routing.nr * sizeof(*entries)))
+                       goto out_free_irq_routing;
+               r = kvm_set_irq_routing(kvm, entries, routing.nr,
+                                       routing.flags);
+       out_free_irq_routing:
+               vfree(entries);
+               break;
+       }
+#endif /* KVM_CAP_IRQ_ROUTING */
+#ifdef __KVM_HAVE_MSIX
+       case KVM_ASSIGN_SET_MSIX_NR: {
+               struct kvm_assigned_msix_nr entry_nr;
+               r = -EFAULT;
+               if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+                       goto out;
+               r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_ASSIGN_SET_MSIX_ENTRY: {
+               struct kvm_assigned_msix_entry entry;
+               r = -EFAULT;
+               if (copy_from_user(&entry, argp, sizeof entry))
+                       goto out;
+               r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+               if (r)
+                       goto out;
+               break;
+       }
+#endif
+       }
+out:
+       return r;
+}
+
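
For context, the VM ioctls dispatched by kvm_vm_ioctl_assigned_device() above are driven from userspace roughly as sketched below. The device id, bus/devfn and guest irq values are placeholders, and a real caller such as qemu-kvm also checks the relevant capabilities and handles errors; treat this as an illustration of the call sequence only.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

/* Illustrative only: assign a host PCI device to the VM and route its
 * INTx line to guest irq 10.  All numeric values are placeholders. */
static int assign_example(int vm_fd)
{
        struct kvm_assigned_pci_dev dev;
        struct kvm_assigned_irq irq;

        memset(&dev, 0, sizeof(dev));
        dev.assigned_dev_id = 1;                /* any id unique per VM */
        dev.busnr = 0x00;
        dev.devfn = (0x19 << 3) | 0x0;          /* placeholder 00:19.0 */
        dev.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;

        if (ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
                return -1;

        memset(&irq, 0, sizeof(irq));
        irq.assigned_dev_id = dev.assigned_dev_id;
        irq.guest_irq = 10;
        irq.flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX;

        return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}
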
index bb4ebd8..30f70fd 100644 (file)
@@ -61,10 +61,8 @@ irqfd_inject(struct work_struct *work)
        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
        struct kvm *kvm = irqfd->kvm;
 
-       mutex_lock(&kvm->irq_lock);
        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
-       mutex_unlock(&kvm->irq_lock);
 }
 
 /*
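
The hunk above removes kvm->irq_lock around the injection because kvm_set_irq() no longer needs it (see the RCU-based routing lookup in virt/kvm/irq_comm.c below). For context, the irqfd path is driven from userspace roughly as follows; the gsi is arbitrary and error handling is abbreviated, so this is a sketch rather than qemu-kvm code.

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <unistd.h>

/* Illustrative only: bind an eventfd to guest GSI 5 so that each write to
 * the eventfd triggers irqfd_inject() in the kernel. */
static int irqfd_example(int vm_fd)
{
        struct kvm_irqfd irqfd = { 0 };
        uint64_t one = 1;
        int efd = eventfd(0, 0);

        if (efd < 0)
                return -1;

        irqfd.fd = efd;
        irqfd.gsi = 5;
        if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0)
                return -1;

        return write(efd, &one, sizeof(one)) == sizeof(one) ? 0 : -1;
}
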
index 9fe140b..38a2d20 100644 (file)
@@ -182,6 +182,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
        union kvm_ioapic_redirect_entry entry;
        int ret = 1;
 
+       mutex_lock(&ioapic->lock);
        if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
                entry = ioapic->redirtbl[irq];
                level ^= entry.fields.polarity;
@@ -198,34 +199,51 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
                }
                trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
        }
+       mutex_unlock(&ioapic->lock);
+
        return ret;
 }
 
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin,
-                                   int trigger_mode)
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
+                                    int trigger_mode)
 {
-       union kvm_ioapic_redirect_entry *ent;
+       int i;
+
+       for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+               union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
 
-       ent = &ioapic->redirtbl[pin];
+               if (ent->fields.vector != vector)
+                       continue;
 
-       kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
+               /*
+                * We drop the lock while calling the ack notifiers because the
+                * ack notifier callbacks for assigned devices call back into the
+                * IOAPIC recursively. Since remote_irr is cleared only after the
+                * notifiers have run, if the same vector is raised while the lock
+                * is dropped it will be recorded in irr and delivered once the
+                * ack notifier returns.
+                */
+               mutex_unlock(&ioapic->lock);
+               kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+               mutex_lock(&ioapic->lock);
+
+               if (trigger_mode != IOAPIC_LEVEL_TRIG)
+                       continue;
 
-       if (trigger_mode == IOAPIC_LEVEL_TRIG) {
                ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
                ent->fields.remote_irr = 0;
-               if (!ent->fields.mask && (ioapic->irr & (1 << pin)))
-                       ioapic_service(ioapic, pin);
+               if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+                       ioapic_service(ioapic, i);
        }
 }
 
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 {
        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-       int i;
 
-       for (i = 0; i < IOAPIC_NUM_PINS; i++)
-               if (ioapic->redirtbl[i].fields.vector == vector)
-                       __kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
+       mutex_lock(&ioapic->lock);
+       __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
+       mutex_unlock(&ioapic->lock);
 }
 
 static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
@@ -250,8 +268,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
        ioapic_debug("addr %lx\n", (unsigned long)addr);
        ASSERT(!(addr & 0xf));  /* check alignment */
 
-       mutex_lock(&ioapic->kvm->irq_lock);
        addr &= 0xff;
+       mutex_lock(&ioapic->lock);
        switch (addr) {
        case IOAPIC_REG_SELECT:
                result = ioapic->ioregsel;
@@ -265,6 +283,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
                result = 0;
                break;
        }
+       mutex_unlock(&ioapic->lock);
+
        switch (len) {
        case 8:
                *(u64 *) val = result;
@@ -277,7 +297,6 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
        default:
                printk(KERN_WARNING "ioapic: wrong length %d\n", len);
        }
-       mutex_unlock(&ioapic->kvm->irq_lock);
        return 0;
 }
 
@@ -293,15 +312,15 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
                     (void*)addr, len, val);
        ASSERT(!(addr & 0xf));  /* check alignment */
 
-       mutex_lock(&ioapic->kvm->irq_lock);
        if (len == 4 || len == 8)
                data = *(u32 *) val;
        else {
                printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
-               goto unlock;
+               return 0;
        }
 
        addr &= 0xff;
+       mutex_lock(&ioapic->lock);
        switch (addr) {
        case IOAPIC_REG_SELECT:
                ioapic->ioregsel = data;
@@ -312,15 +331,14 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
                break;
 #ifdef CONFIG_IA64
        case IOAPIC_REG_EOI:
-               kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG);
+               __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG);
                break;
 #endif
 
        default:
                break;
        }
-unlock:
-       mutex_unlock(&ioapic->kvm->irq_lock);
+       mutex_unlock(&ioapic->lock);
        return 0;
 }
 
@@ -349,6 +367,7 @@ int kvm_ioapic_init(struct kvm *kvm)
        ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
        if (!ioapic)
                return -ENOMEM;
+       mutex_init(&ioapic->lock);
        kvm->arch.vioapic = ioapic;
        kvm_ioapic_reset(ioapic);
        kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -360,3 +379,26 @@ int kvm_ioapic_init(struct kvm *kvm)
        return ret;
 }
 
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+       struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+       if (!ioapic)
+               return -EINVAL;
+
+       mutex_lock(&ioapic->lock);
+       memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+       mutex_unlock(&ioapic->lock);
+       return 0;
+}
+
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+       struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+       if (!ioapic)
+               return -EINVAL;
+
+       mutex_lock(&ioapic->lock);
+       memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+       mutex_unlock(&ioapic->lock);
+       return 0;
+}
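
kvm_get_ioapic() and kvm_set_ioapic() give the arch ioctl code a locked way to snapshot or load the IOAPIC state instead of copying the structure directly. A hedged sketch of a caller follows; the real user is the KVM_GET_IRQCHIP path in arch/x86/kvm/x86.c, which is not part of this hunk, and the helper name here is invented.

#include <linux/kvm_host.h>
#include "ioapic.h"

/* Illustrative caller: fill the chip union returned to userspace for
 * KVM_GET_IRQCHIP from the IOAPIC, under the new per-ioapic lock. */
static int example_get_irqchip_ioapic(struct kvm *kvm, struct kvm_irqchip *chip)
{
        if (chip->chip_id != KVM_IRQCHIP_IOAPIC)
                return -EINVAL;
        return kvm_get_ioapic(kvm, &chip->chip.ioapic);
}
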
index 7080b71..419c43b 100644 (file)
@@ -41,9 +41,11 @@ struct kvm_ioapic {
        u32 irr;
        u32 pad;
        union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+       unsigned long irq_states[IOAPIC_NUM_PINS];
        struct kvm_io_device dev;
        struct kvm *kvm;
        void (*ack_notifier)(void *opaque, int irq);
+       struct mutex lock;
 };
 
 #ifdef DEBUG
@@ -73,4 +75,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                struct kvm_lapic_irq *irq);
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+
 #endif
index 001663f..9b07734 100644 (file)
 
 #include "ioapic.h"
 
+static inline int kvm_irq_line_state(unsigned long *irq_state,
+                                    int irq_source_id, int level)
+{
+       /* Logical OR for level trig interrupt */
+       if (level)
+               set_bit(irq_source_id, irq_state);
+       else
+               clear_bit(irq_source_id, irq_state);
+
+       return !!(*irq_state);
+}
+
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
-                          struct kvm *kvm, int level)
+                          struct kvm *kvm, int irq_source_id, int level)
 {
 #ifdef CONFIG_X86
-       return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level);
+       struct kvm_pic *pic = pic_irqchip(kvm);
+       level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
+                                  irq_source_id, level);
+       return kvm_pic_set_irq(pic, e->irqchip.pin, level);
 #else
        return -1;
 #endif
 }
 
 static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
-                             struct kvm *kvm, int level)
+                             struct kvm *kvm, int irq_source_id, int level)
 {
-       return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
+       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+       level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
+                                  irq_source_id, level);
+
+       return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
 }
 
 inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -63,8 +82,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        int i, r = -1;
        struct kvm_vcpu *vcpu, *lowest = NULL;
 
-       WARN_ON(!mutex_is_locked(&kvm->irq_lock));
-
        if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
                        kvm_is_dm_lowest_prio(irq))
                printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
@@ -96,10 +113,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 }
 
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-                      struct kvm *kvm, int level)
+                      struct kvm *kvm, int irq_source_id, int level)
 {
        struct kvm_lapic_irq irq;
 
+       if (!level)
+               return -1;
+
        trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
 
        irq.dest_id = (e->msi.address_lo &
@@ -116,78 +136,67 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
        return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
-/* This should be called with the kvm->irq_lock mutex held
+/*
  * Return value:
  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
  *  = 0   Interrupt was coalesced (previous irq is still pending)
  *  > 0   Number of CPUs interrupt was delivered to
  */
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
 {
-       struct kvm_kernel_irq_routing_entry *e;
-       unsigned long *irq_state, sig_level;
-       int ret = -1;
+       struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
+       int ret = -1, i = 0;
+       struct kvm_irq_routing_table *irq_rt;
+       struct hlist_node *n;
 
        trace_kvm_set_irq(irq, level, irq_source_id);
 
-       WARN_ON(!mutex_is_locked(&kvm->irq_lock));
-
-       if (irq < KVM_IOAPIC_NUM_PINS) {
-               irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
-
-               /* Logical OR for level trig interrupt */
-               if (level)
-                       set_bit(irq_source_id, irq_state);
-               else
-                       clear_bit(irq_source_id, irq_state);
-               sig_level = !!(*irq_state);
-       } else if (!level)
-               return ret;
-       else /* Deal with MSI/MSI-X */
-               sig_level = 1;
-
        /* Not possible to detect if the guest uses the PIC or the
         * IOAPIC.  So set the bit in both. The guest will ignore
         * writes to the unused one.
         */
-       list_for_each_entry(e, &kvm->irq_routing, link)
-               if (e->gsi == irq) {
-                       int r = e->set(e, kvm, sig_level);
-                       if (r < 0)
-                               continue;
+       rcu_read_lock();
+       irq_rt = rcu_dereference(kvm->irq_routing);
+       if (irq < irq_rt->nr_rt_entries)
+               hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
+                       irq_set[i++] = *e;
+       rcu_read_unlock();
+
+       while (i--) {
+               int r;
+               r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
+               if (r < 0)
+                       continue;
+
+               ret = r + ((ret < 0) ? 0 : ret);
+       }
 
-                       ret = r + ((ret < 0) ? 0 : ret);
-               }
        return ret;
 }
 
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
-       struct kvm_kernel_irq_routing_entry *e;
        struct kvm_irq_ack_notifier *kian;
        struct hlist_node *n;
-       unsigned gsi = pin;
+       int gsi;
 
        trace_kvm_ack_irq(irqchip, pin);
 
-       list_for_each_entry(e, &kvm->irq_routing, link)
-               if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
-                   e->irqchip.irqchip == irqchip &&
-                   e->irqchip.pin == pin) {
-                       gsi = e->gsi;
-                       break;
-               }
-
-       hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
-               if (kian->gsi == gsi)
-                       kian->irq_acked(kian);
+       rcu_read_lock();
+       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+       if (gsi != -1)
+               hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
+                                        link)
+                       if (kian->gsi == gsi)
+                               kian->irq_acked(kian);
+       rcu_read_unlock();
 }
 
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian)
 {
        mutex_lock(&kvm->irq_lock);
-       hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+       hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
        mutex_unlock(&kvm->irq_lock);
 }
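
With the routing table now an RCU-protected pointer, kvm_set_irq() and kvm_notify_acked_irq() above take only rcu_read_lock(), copy what they need out of the table, and call the handlers after dropping the read lock; kvm->irq_lock is no longer needed on the injection path. Below is a small userspace sketch of that snapshot-then-call pattern, using a pthread rwlock as a stand-in for RCU (real RCU additionally lets readers run without taking any lock at all):

#include <pthread.h>

#define MAX_ENTRIES 8

struct entry { int gsi; int (*set)(int gsi, int level); };

struct table {
        int nr;
        struct entry map[MAX_ENTRIES];
};

static struct table *routing;                           /* published table */
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

/* Reader: copy the matching entries out, then invoke them unlocked. */
static int set_irq(int gsi, int level)
{
        struct entry snap[MAX_ENTRIES];
        int i, n = 0, ret = -1;

        pthread_rwlock_rdlock(&lock);
        if (routing)
                for (i = 0; i < routing->nr; i++)
                        if (routing->map[i].gsi == gsi)
                                snap[n++] = routing->map[i];
        pthread_rwlock_unlock(&lock);

        while (n--) {
                int r = snap[n].set(snap[n].gsi, level);
                if (r >= 0)
                        ret = (ret < 0 ? 0 : ret) + r;  /* count CPUs hit */
        }
        return ret;
}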
 
@@ -195,8 +204,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                    struct kvm_irq_ack_notifier *kian)
 {
        mutex_lock(&kvm->irq_lock);
-       hlist_del_init(&kian->link);
+       hlist_del_init_rcu(&kian->link);
        mutex_unlock(&kvm->irq_lock);
+       synchronize_rcu();
 }
 
 int kvm_request_irq_source_id(struct kvm *kvm)
@@ -205,16 +215,17 @@ int kvm_request_irq_source_id(struct kvm *kvm)
        int irq_source_id;
 
        mutex_lock(&kvm->irq_lock);
-       irq_source_id = find_first_zero_bit(bitmap,
-                               sizeof(kvm->arch.irq_sources_bitmap));
+       irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
 
-       if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
+       if (irq_source_id >= BITS_PER_LONG) {
                printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
-               return -EFAULT;
+               irq_source_id = -EFAULT;
+               goto unlock;
        }
 
        ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
        set_bit(irq_source_id, bitmap);
+unlock:
        mutex_unlock(&kvm->irq_lock);
 
        return irq_source_id;
@@ -228,13 +239,23 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 
        mutex_lock(&kvm->irq_lock);
        if (irq_source_id < 0 ||
-           irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
+           irq_source_id >= BITS_PER_LONG) {
                printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
-               return;
+               goto unlock;
        }
-       for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
-               clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
        clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+       if (!irqchip_in_kernel(kvm))
+               goto unlock;
+
+       for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
+               clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
+               if (i >= 16)
+                       continue;
+#ifdef CONFIG_X86
+               clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+#endif
+       }
+unlock:
        mutex_unlock(&kvm->irq_lock);
 }
 
@@ -243,7 +264,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
 {
        mutex_lock(&kvm->irq_lock);
        kimn->irq = irq;
-       hlist_add_head(&kimn->link, &kvm->mask_notifier_list);
+       hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list);
        mutex_unlock(&kvm->irq_lock);
 }
 
@@ -251,8 +272,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
                                      struct kvm_irq_mask_notifier *kimn)
 {
        mutex_lock(&kvm->irq_lock);
-       hlist_del(&kimn->link);
+       hlist_del_rcu(&kimn->link);
        mutex_unlock(&kvm->irq_lock);
+       synchronize_rcu();
 }
 
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -260,33 +282,37 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
        struct kvm_irq_mask_notifier *kimn;
        struct hlist_node *n;
 
-       WARN_ON(!mutex_is_locked(&kvm->irq_lock));
-
-       hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
                if (kimn->irq == irq)
                        kimn->func(kimn, mask);
-}
-
-static void __kvm_free_irq_routing(struct list_head *irq_routing)
-{
-       struct kvm_kernel_irq_routing_entry *e, *n;
-
-       list_for_each_entry_safe(e, n, irq_routing, link)
-               kfree(e);
+       rcu_read_unlock();
 }
 
 void kvm_free_irq_routing(struct kvm *kvm)
 {
-       mutex_lock(&kvm->irq_lock);
-       __kvm_free_irq_routing(&kvm->irq_routing);
-       mutex_unlock(&kvm->irq_lock);
+       /* Called only during vm destruction. Nobody can use the pointer
+          at this stage */
+       kfree(kvm->irq_routing);
 }
 
-static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+                              struct kvm_kernel_irq_routing_entry *e,
                               const struct kvm_irq_routing_entry *ue)
 {
        int r = -EINVAL;
        int delta;
+       struct kvm_kernel_irq_routing_entry *ei;
+       struct hlist_node *n;
+
+       /*
+        * Do not allow GSI to be mapped to the same irqchip more than once.
+        * Allow only one to one mapping between GSI and MSI.
+        */
+       hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
+               if (ei->type == KVM_IRQ_ROUTING_MSI ||
+                   ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+                       return r;
 
        e->gsi = ue->gsi;
        e->type = ue->type;
@@ -309,6 +335,9 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
                }
                e->irqchip.irqchip = ue->u.irqchip.irqchip;
                e->irqchip.pin = ue->u.irqchip.pin + delta;
+               if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+                       goto out;
+               rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
                break;
        case KVM_IRQ_ROUTING_MSI:
                e->set = kvm_set_msi;
@@ -319,6 +348,8 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
        default:
                goto out;
        }
+
+       hlist_add_head(&e->link, &rt->map[e->gsi]);
        r = 0;
 out:
        return r;
@@ -330,43 +361,53 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        unsigned nr,
                        unsigned flags)
 {
-       struct list_head irq_list = LIST_HEAD_INIT(irq_list);
-       struct list_head tmp = LIST_HEAD_INIT(tmp);
-       struct kvm_kernel_irq_routing_entry *e = NULL;
-       unsigned i;
+       struct kvm_irq_routing_table *new, *old;
+       u32 i, j, nr_rt_entries = 0;
        int r;
 
        for (i = 0; i < nr; ++i) {
+               if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
+                       return -EINVAL;
+               nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+       }
+
+       nr_rt_entries += 1;
+
+       new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
+                     + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+                     GFP_KERNEL);
+
+       if (!new)
+               return -ENOMEM;
+
+       new->rt_entries = (void *)&new->map[nr_rt_entries];
+
+       new->nr_rt_entries = nr_rt_entries;
+       for (i = 0; i < 3; i++)
+               for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
+                       new->chip[i][j] = -1;
+
+       for (i = 0; i < nr; ++i) {
                r = -EINVAL;
-               if (ue->gsi >= KVM_MAX_IRQ_ROUTES)
-                       goto out;
                if (ue->flags)
                        goto out;
-               r = -ENOMEM;
-               e = kzalloc(sizeof(*e), GFP_KERNEL);
-               if (!e)
-                       goto out;
-               r = setup_routing_entry(e, ue);
+               r = setup_routing_entry(new, &new->rt_entries[i], ue);
                if (r)
                        goto out;
                ++ue;
-               list_add(&e->link, &irq_list);
-               e = NULL;
        }
 
        mutex_lock(&kvm->irq_lock);
-       list_splice(&kvm->irq_routing, &tmp);
-       INIT_LIST_HEAD(&kvm->irq_routing);
-       list_splice(&irq_list, &kvm->irq_routing);
-       INIT_LIST_HEAD(&irq_list);
-       list_splice(&tmp, &irq_list);
+       old = kvm->irq_routing;
+       rcu_assign_pointer(kvm->irq_routing, new);
        mutex_unlock(&kvm->irq_lock);
+       synchronize_rcu();
 
+       new = old;
        r = 0;
 
 out:
-       kfree(e);
-       __kvm_free_irq_routing(&irq_list);
+       kfree(new);
        return r;
 }
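
On the update side, kvm_set_irq_routing() above builds the complete replacement table in a single allocation, publishes it with rcu_assign_pointer(), and waits in synchronize_rcu() before freeing the old table, so no reader ever sees a half-built or freed table. Continuing the stand-in sketch above, the writer half would look roughly like this (the brief write-lock section plays the role of rcu_assign_pointer() plus synchronize_rcu()):

#include <stdlib.h>

/* Writer: build the new table fully, then swap it in; the old one can be
 * freed once no reader can still hold a pointer into it. */
static int set_routing(const struct entry *entries, int nr)
{
        struct table *new, *old;
        int i;

        if (nr > MAX_ENTRIES)
                return -1;
        new = calloc(1, sizeof(*new));
        if (!new)
                return -1;
        for (i = 0; i < nr; i++)
                new->map[i] = entries[i];
        new->nr = nr;

        pthread_rwlock_wrlock(&lock);
        old = routing;
        routing = new;                  /* ~ rcu_assign_pointer() */
        pthread_rwlock_unlock(&lock);
        free(old);                      /* safe: readers copy by value under
                                           the read lock, so none can still
                                           reference "old" here — the same
                                           guarantee synchronize_rcu() gives */
        return 0;
}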
 
index 7495ce3..f92ba13 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/swap.h>
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
+#include <linux/compat.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
 #include "coalesced_mmio.h"
 #endif
 
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include "irq.h"
-#endif
-
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
@@ -75,6 +70,8 @@ DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
 static cpumask_var_t cpus_hardware_enabled;
+static int kvm_usage_count;
+static atomic_t hardware_enable_failed;
 
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
@@ -85,615 +82,13 @@ struct dentry *kvm_debugfs_dir;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
+static int hardware_enable_all(void);
+static void hardware_disable_all(void);
 
 static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
-                                                     int assigned_dev_id)
-{
-       struct list_head *ptr;
-       struct kvm_assigned_dev_kernel *match;
-
-       list_for_each(ptr, head) {
-               match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
-               if (match->assigned_dev_id == assigned_dev_id)
-                       return match;
-       }
-       return NULL;
-}
-
-static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
-                                   *assigned_dev, int irq)
-{
-       int i, index;
-       struct msix_entry *host_msix_entries;
-
-       host_msix_entries = assigned_dev->host_msix_entries;
-
-       index = -1;
-       for (i = 0; i < assigned_dev->entries_nr; i++)
-               if (irq == host_msix_entries[i].vector) {
-                       index = i;
-                       break;
-               }
-       if (index < 0) {
-               printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-               return 0;
-       }
-
-       return index;
-}
-
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev;
-       struct kvm *kvm;
-       int i;
-
-       assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
-                                   interrupt_work);
-       kvm = assigned_dev->kvm;
-
-       mutex_lock(&kvm->irq_lock);
-       spin_lock_irq(&assigned_dev->assigned_dev_lock);
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-               struct kvm_guest_msix_entry *guest_entries =
-                       assigned_dev->guest_msix_entries;
-               for (i = 0; i < assigned_dev->entries_nr; i++) {
-                       if (!(guest_entries[i].flags &
-                                       KVM_ASSIGNED_MSIX_PENDING))
-                               continue;
-                       guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
-                       kvm_set_irq(assigned_dev->kvm,
-                                   assigned_dev->irq_source_id,
-                                   guest_entries[i].vector, 1);
-               }
-       } else
-               kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                           assigned_dev->guest_irq, 1);
-
-       spin_unlock_irq(&assigned_dev->assigned_dev_lock);
-       mutex_unlock(&assigned_dev->kvm->irq_lock);
-}
-
-static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
-{
-       unsigned long flags;
-       struct kvm_assigned_dev_kernel *assigned_dev =
-               (struct kvm_assigned_dev_kernel *) dev_id;
-
-       spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-               int index = find_index_from_host_irq(assigned_dev, irq);
-               if (index < 0)
-                       goto out;
-               assigned_dev->guest_msix_entries[index].flags |=
-                       KVM_ASSIGNED_MSIX_PENDING;
-       }
-
-       schedule_work(&assigned_dev->interrupt_work);
-
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
-               disable_irq_nosync(irq);
-               assigned_dev->host_irq_disabled = true;
-       }
-
-out:
-       spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
-       return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
-       struct kvm_assigned_dev_kernel *dev;
-       unsigned long flags;
-
-       if (kian->gsi == -1)
-               return;
-
-       dev = container_of(kian, struct kvm_assigned_dev_kernel,
-                          ack_notifier);
-
-       kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
-
-       /* The guest irq may be shared so this ack may be
-        * from another device.
-        */
-       spin_lock_irqsave(&dev->assigned_dev_lock, flags);
-       if (dev->host_irq_disabled) {
-               enable_irq(dev->host_irq);
-               dev->host_irq_disabled = false;
-       }
-       spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
-}
-
-static void deassign_guest_irq(struct kvm *kvm,
-                              struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
-       assigned_dev->ack_notifier.gsi = -1;
-
-       if (assigned_dev->irq_source_id != -1)
-               kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
-       assigned_dev->irq_source_id = -1;
-       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
-}
-
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void deassign_host_irq(struct kvm *kvm,
-                             struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       /*
-        * In kvm_free_device_irq, cancel_work_sync return true if:
-        * 1. work is scheduled, and then cancelled.
-        * 2. work callback is executed.
-        *
-        * The first one ensured that the irq is disabled and no more events
-        * would happen. But for the second one, the irq may be enabled (e.g.
-        * for MSI). So we disable irq here to prevent further events.
-        *
-        * Notice this maybe result in nested disable if the interrupt type is
-        * INTx, but it's OK for we are going to free it.
-        *
-        * If this function is a part of VM destroy, please ensure that till
-        * now, the kvm state is still legal for probably we also have to wait
-        * interrupt_work done.
-        */
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-               int i;
-               for (i = 0; i < assigned_dev->entries_nr; i++)
-                       disable_irq_nosync(assigned_dev->
-                                          host_msix_entries[i].vector);
-
-               cancel_work_sync(&assigned_dev->interrupt_work);
-
-               for (i = 0; i < assigned_dev->entries_nr; i++)
-                       free_irq(assigned_dev->host_msix_entries[i].vector,
-                                (void *)assigned_dev);
-
-               assigned_dev->entries_nr = 0;
-               kfree(assigned_dev->host_msix_entries);
-               kfree(assigned_dev->guest_msix_entries);
-               pci_disable_msix(assigned_dev->dev);
-       } else {
-               /* Deal with MSI and INTx */
-               disable_irq_nosync(assigned_dev->host_irq);
-               cancel_work_sync(&assigned_dev->interrupt_work);
-
-               free_irq(assigned_dev->host_irq, (void *)assigned_dev);
-
-               if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
-                       pci_disable_msi(assigned_dev->dev);
-       }
-
-       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
-}
-
-static int kvm_deassign_irq(struct kvm *kvm,
-                           struct kvm_assigned_dev_kernel *assigned_dev,
-                           unsigned long irq_requested_type)
-{
-       unsigned long guest_irq_type, host_irq_type;
-
-       if (!irqchip_in_kernel(kvm))
-               return -EINVAL;
-       /* no irq assignment to deassign */
-       if (!assigned_dev->irq_requested_type)
-               return -ENXIO;
-
-       host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
-       guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
-
-       if (host_irq_type)
-               deassign_host_irq(kvm, assigned_dev);
-       if (guest_irq_type)
-               deassign_guest_irq(kvm, assigned_dev);
-
-       return 0;
-}
-
-static void kvm_free_assigned_irq(struct kvm *kvm,
-                                 struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
-                                    struct kvm_assigned_dev_kernel
-                                    *assigned_dev)
-{
-       kvm_free_assigned_irq(kvm, assigned_dev);
-
-       pci_reset_function(assigned_dev->dev);
-
-       pci_release_regions(assigned_dev->dev);
-       pci_disable_device(assigned_dev->dev);
-       pci_dev_put(assigned_dev->dev);
-
-       list_del(&assigned_dev->list);
-       kfree(assigned_dev);
-}
-
-void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
-       struct list_head *ptr, *ptr2;
-       struct kvm_assigned_dev_kernel *assigned_dev;
-
-       list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-               assigned_dev = list_entry(ptr,
-                                         struct kvm_assigned_dev_kernel,
-                                         list);
-
-               kvm_free_assigned_device(kvm, assigned_dev);
-       }
-}
-
-static int assigned_device_enable_host_intx(struct kvm *kvm,
-                                           struct kvm_assigned_dev_kernel *dev)
-{
-       dev->host_irq = dev->dev->irq;
-       /* Even though this is PCI, we don't want to use shared
-        * interrupts. Sharing host devices with guest-assigned devices
-        * on the same interrupt line is not a happy situation: there
-        * are going to be long delays in accepting, acking, etc.
-        */
-       if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
-                       0, "kvm_assigned_intx_device", (void *)dev))
-               return -EIO;
-       return 0;
-}
-
-#ifdef __KVM_HAVE_MSI
-static int assigned_device_enable_host_msi(struct kvm *kvm,
-                                          struct kvm_assigned_dev_kernel *dev)
-{
-       int r;
-
-       if (!dev->dev->msi_enabled) {
-               r = pci_enable_msi(dev->dev);
-               if (r)
-                       return r;
-       }
-
-       dev->host_irq = dev->dev->irq;
-       if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
-                       "kvm_assigned_msi_device", (void *)dev)) {
-               pci_disable_msi(dev->dev);
-               return -EIO;
-       }
-
-       return 0;
-}
-#endif
-
-#ifdef __KVM_HAVE_MSIX
-static int assigned_device_enable_host_msix(struct kvm *kvm,
-                                           struct kvm_assigned_dev_kernel *dev)
-{
-       int i, r = -EINVAL;
-
-       /* host_msix_entries and guest_msix_entries should have been
-        * initialized */
-       if (dev->entries_nr == 0)
-               return r;
-
-       r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
-       if (r)
-               return r;
-
-       for (i = 0; i < dev->entries_nr; i++) {
-               r = request_irq(dev->host_msix_entries[i].vector,
-                               kvm_assigned_dev_intr, 0,
-                               "kvm_assigned_msix_device",
-                               (void *)dev);
-               /* FIXME: free requested_irq's on failure */
-               if (r)
-                       return r;
-       }
-
-       return 0;
-}
-
-#endif
-
-static int assigned_device_enable_guest_intx(struct kvm *kvm,
-                               struct kvm_assigned_dev_kernel *dev,
-                               struct kvm_assigned_irq *irq)
-{
-       dev->guest_irq = irq->guest_irq;
-       dev->ack_notifier.gsi = irq->guest_irq;
-       return 0;
-}
-
-#ifdef __KVM_HAVE_MSI
-static int assigned_device_enable_guest_msi(struct kvm *kvm,
-                       struct kvm_assigned_dev_kernel *dev,
-                       struct kvm_assigned_irq *irq)
-{
-       dev->guest_irq = irq->guest_irq;
-       dev->ack_notifier.gsi = -1;
-       dev->host_irq_disabled = false;
-       return 0;
-}
-#endif
-#ifdef __KVM_HAVE_MSIX
-static int assigned_device_enable_guest_msix(struct kvm *kvm,
-                       struct kvm_assigned_dev_kernel *dev,
-                       struct kvm_assigned_irq *irq)
-{
-       dev->guest_irq = irq->guest_irq;
-       dev->ack_notifier.gsi = -1;
-       dev->host_irq_disabled = false;
-       return 0;
-}
-#endif
-
-static int assign_host_irq(struct kvm *kvm,
-                          struct kvm_assigned_dev_kernel *dev,
-                          __u32 host_irq_type)
-{
-       int r = -EEXIST;
-
-       if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
-               return r;
-
-       switch (host_irq_type) {
-       case KVM_DEV_IRQ_HOST_INTX:
-               r = assigned_device_enable_host_intx(kvm, dev);
-               break;
-#ifdef __KVM_HAVE_MSI
-       case KVM_DEV_IRQ_HOST_MSI:
-               r = assigned_device_enable_host_msi(kvm, dev);
-               break;
-#endif
-#ifdef __KVM_HAVE_MSIX
-       case KVM_DEV_IRQ_HOST_MSIX:
-               r = assigned_device_enable_host_msix(kvm, dev);
-               break;
-#endif
-       default:
-               r = -EINVAL;
-       }
-
-       if (!r)
-               dev->irq_requested_type |= host_irq_type;
-
-       return r;
-}
-
-static int assign_guest_irq(struct kvm *kvm,
-                           struct kvm_assigned_dev_kernel *dev,
-                           struct kvm_assigned_irq *irq,
-                           unsigned long guest_irq_type)
-{
-       int id;
-       int r = -EEXIST;
-
-       if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
-               return r;
-
-       id = kvm_request_irq_source_id(kvm);
-       if (id < 0)
-               return id;
-
-       dev->irq_source_id = id;
-
-       switch (guest_irq_type) {
-       case KVM_DEV_IRQ_GUEST_INTX:
-               r = assigned_device_enable_guest_intx(kvm, dev, irq);
-               break;
-#ifdef __KVM_HAVE_MSI
-       case KVM_DEV_IRQ_GUEST_MSI:
-               r = assigned_device_enable_guest_msi(kvm, dev, irq);
-               break;
-#endif
-#ifdef __KVM_HAVE_MSIX
-       case KVM_DEV_IRQ_GUEST_MSIX:
-               r = assigned_device_enable_guest_msix(kvm, dev, irq);
-               break;
-#endif
-       default:
-               r = -EINVAL;
-       }
-
-       if (!r) {
-               dev->irq_requested_type |= guest_irq_type;
-               kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
-       } else
-               kvm_free_irq_source_id(kvm, dev->irq_source_id);
-
-       return r;
-}
-
-/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-                                  struct kvm_assigned_irq *assigned_irq)
-{
-       int r = -EINVAL;
-       struct kvm_assigned_dev_kernel *match;
-       unsigned long host_irq_type, guest_irq_type;
-
-       if (!capable(CAP_SYS_RAWIO))
-               return -EPERM;
-
-       if (!irqchip_in_kernel(kvm))
-               return r;
-
-       mutex_lock(&kvm->lock);
-       r = -ENODEV;
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_irq->assigned_dev_id);
-       if (!match)
-               goto out;
-
-       host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
-       guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
-
-       r = -EINVAL;
-       /* can only assign one type at a time */
-       if (hweight_long(host_irq_type) > 1)
-               goto out;
-       if (hweight_long(guest_irq_type) > 1)
-               goto out;
-       if (host_irq_type == 0 && guest_irq_type == 0)
-               goto out;
-
-       r = 0;
-       if (host_irq_type)
-               r = assign_host_irq(kvm, match, host_irq_type);
-       if (r)
-               goto out;
-
-       if (guest_irq_type)
-               r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
-                                        struct kvm_assigned_irq
-                                        *assigned_irq)
-{
-       int r = -ENODEV;
-       struct kvm_assigned_dev_kernel *match;
-
-       mutex_lock(&kvm->lock);
-
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_irq->assigned_dev_id);
-       if (!match)
-               goto out;
-
-       r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
-                                     struct kvm_assigned_pci_dev *assigned_dev)
-{
-       int r = 0;
-       struct kvm_assigned_dev_kernel *match;
-       struct pci_dev *dev;
-
-       down_read(&kvm->slots_lock);
-       mutex_lock(&kvm->lock);
-
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_dev->assigned_dev_id);
-       if (match) {
-               /* device already assigned */
-               r = -EEXIST;
-               goto out;
-       }
-
-       match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
-       if (match == NULL) {
-               printk(KERN_INFO "%s: Couldn't allocate memory\n",
-                      __func__);
-               r = -ENOMEM;
-               goto out;
-       }
-       dev = pci_get_bus_and_slot(assigned_dev->busnr,
-                                  assigned_dev->devfn);
-       if (!dev) {
-               printk(KERN_INFO "%s: host device not found\n", __func__);
-               r = -EINVAL;
-               goto out_free;
-       }
-       if (pci_enable_device(dev)) {
-               printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
-               r = -EBUSY;
-               goto out_put;
-       }
-       r = pci_request_regions(dev, "kvm_assigned_device");
-       if (r) {
-               printk(KERN_INFO "%s: Could not get access to device regions\n",
-                      __func__);
-               goto out_disable;
-       }
-
-       pci_reset_function(dev);
-
-       match->assigned_dev_id = assigned_dev->assigned_dev_id;
-       match->host_busnr = assigned_dev->busnr;
-       match->host_devfn = assigned_dev->devfn;
-       match->flags = assigned_dev->flags;
-       match->dev = dev;
-       spin_lock_init(&match->assigned_dev_lock);
-       match->irq_source_id = -1;
-       match->kvm = kvm;
-       match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-       INIT_WORK(&match->interrupt_work,
-                 kvm_assigned_dev_interrupt_work_handler);
-
-       list_add(&match->list, &kvm->arch.assigned_dev_head);
-
-       if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-               if (!kvm->arch.iommu_domain) {
-                       r = kvm_iommu_map_guest(kvm);
-                       if (r)
-                               goto out_list_del;
-               }
-               r = kvm_assign_device(kvm, match);
-               if (r)
-                       goto out_list_del;
-       }
-
-out:
-       mutex_unlock(&kvm->lock);
-       up_read(&kvm->slots_lock);
-       return r;
-out_list_del:
-       list_del(&match->list);
-       pci_release_regions(dev);
-out_disable:
-       pci_disable_device(dev);
-out_put:
-       pci_dev_put(dev);
-out_free:
-       kfree(match);
-       mutex_unlock(&kvm->lock);
-       up_read(&kvm->slots_lock);
-       return r;
-}
-#endif
-
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
-static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
-               struct kvm_assigned_pci_dev *assigned_dev)
-{
-       int r = 0;
-       struct kvm_assigned_dev_kernel *match;
-
-       mutex_lock(&kvm->lock);
-
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_dev->assigned_dev_id);
-       if (!match) {
-               printk(KERN_INFO "%s: device hasn't been assigned before, "
-                 "so cannot be deassigned\n", __func__);
-               r = -EINVAL;
-               goto out;
-       }
-
-       if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
-               kvm_deassign_device(kvm, match);
-
-       kvm_free_assigned_device(kvm, match);
-
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-#endif
-
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
        if (pfn_valid(pfn)) {
@@ -949,6 +344,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 
 static struct kvm *kvm_create_vm(void)
 {
+       int r = 0;
        struct kvm *kvm = kvm_arch_create_vm();
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        struct page *page;
@@ -956,16 +352,21 @@ static struct kvm *kvm_create_vm(void)
 
        if (IS_ERR(kvm))
                goto out;
+
+       r = hardware_enable_all();
+       if (r)
+               goto out_err_nodisable;
+
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-       INIT_LIST_HEAD(&kvm->irq_routing);
        INIT_HLIST_HEAD(&kvm->mask_notifier_list);
+       INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
 
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page) {
-               kfree(kvm);
-               return ERR_PTR(-ENOMEM);
+               r = -ENOMEM;
+               goto out_err;
        }
        kvm->coalesced_mmio_ring =
                        (struct kvm_coalesced_mmio_ring *)page_address(page);
@@ -973,15 +374,13 @@ static struct kvm *kvm_create_vm(void)
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        {
-               int err;
                kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
-               err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
-               if (err) {
+               r = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+               if (r) {
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
                        put_page(page);
 #endif
-                       kfree(kvm);
-                       return ERR_PTR(err);
+                       goto out_err;
                }
        }
 #endif
@@ -1005,6 +404,12 @@ static struct kvm *kvm_create_vm(void)
 #endif
 out:
        return kvm;
+
+out_err:
+       hardware_disable_all();
+out_err_nodisable:
+       kfree(kvm);
+       return ERR_PTR(r);
 }
 
 /*
@@ -1063,6 +468,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_arch_flush_shadow(kvm);
 #endif
        kvm_arch_destroy_vm(kvm);
+       hardware_disable_all();
        mmdrop(mm);
 }
 
@@ -1689,9 +1095,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                if (signal_pending(current))
                        break;
 
-               vcpu_put(vcpu);
                schedule();
-               vcpu_load(vcpu);
        }
 
        finish_wait(&vcpu->wq, &wait);
@@ -1705,6 +1109,21 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+{
+       ktime_t expires;
+       DEFINE_WAIT(wait);
+
+       prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+       /* Sleep for 100 us, and hope lock-holder got scheduled */
+       expires = ktime_add_ns(ktime_get(), 100000UL);
+       schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+
+       finish_wait(&vcpu->wq, &wait);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
+
 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct kvm_vcpu *vcpu = vma->vm_file->private_data;
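
kvm_vcpu_on_spin(), added in the hunk above, is the generic half of the guest busy-wait handling in this series: when arch code learns that the guest is spinning on a lock (for example via AMD's Pause Filter intercept), the vcpu thread yields the physical CPU for roughly 100 us instead of burning it, in the hope that the lock holder gets to run. A hedged sketch of an arch-side caller, modelled on the SVM pause intercept; treat the names as illustrative rather than a quote of svm.c:

static int pause_interception(struct vcpu_svm *svm)
{
        /* Guest executed PAUSE in a tight loop: back off briefly. */
        kvm_vcpu_on_spin(&svm->vcpu);
        return 1;                       /* handled, re-enter the guest */
}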
@@ -1828,88 +1247,6 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
        return 0;
 }
 
-#ifdef __KVM_HAVE_MSIX
-static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
-                                   struct kvm_assigned_msix_nr *entry_nr)
-{
-       int r = 0;
-       struct kvm_assigned_dev_kernel *adev;
-
-       mutex_lock(&kvm->lock);
-
-       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     entry_nr->assigned_dev_id);
-       if (!adev) {
-               r = -EINVAL;
-               goto msix_nr_out;
-       }
-
-       if (adev->entries_nr == 0) {
-               adev->entries_nr = entry_nr->entry_nr;
-               if (adev->entries_nr == 0 ||
-                   adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
-                       r = -EINVAL;
-                       goto msix_nr_out;
-               }
-
-               adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
-                                               entry_nr->entry_nr,
-                                               GFP_KERNEL);
-               if (!adev->host_msix_entries) {
-                       r = -ENOMEM;
-                       goto msix_nr_out;
-               }
-               adev->guest_msix_entries = kzalloc(
-                               sizeof(struct kvm_guest_msix_entry) *
-                               entry_nr->entry_nr, GFP_KERNEL);
-               if (!adev->guest_msix_entries) {
-                       kfree(adev->host_msix_entries);
-                       r = -ENOMEM;
-                       goto msix_nr_out;
-               }
-       } else /* Not allowed set MSI-X number twice */
-               r = -EINVAL;
-msix_nr_out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
-                                      struct kvm_assigned_msix_entry *entry)
-{
-       int r = 0, i;
-       struct kvm_assigned_dev_kernel *adev;
-
-       mutex_lock(&kvm->lock);
-
-       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     entry->assigned_dev_id);
-
-       if (!adev) {
-               r = -EINVAL;
-               goto msix_entry_out;
-       }
-
-       for (i = 0; i < adev->entries_nr; i++)
-               if (adev->guest_msix_entries[i].vector == 0 ||
-                   adev->guest_msix_entries[i].entry == entry->entry) {
-                       adev->guest_msix_entries[i].entry = entry->entry;
-                       adev->guest_msix_entries[i].vector = entry->gsi;
-                       adev->host_msix_entries[i].entry = entry->entry;
-                       break;
-               }
-       if (i == adev->entries_nr) {
-               r = -ENOSPC;
-               goto msix_entry_out;
-       }
-
-msix_entry_out:
-       mutex_unlock(&kvm->lock);
-
-       return r;
-}
-#endif
-
 static long kvm_vcpu_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
 {
@@ -2168,112 +1505,6 @@ static long kvm_vm_ioctl(struct file *filp,
                break;
        }
 #endif
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-       case KVM_ASSIGN_PCI_DEVICE: {
-               struct kvm_assigned_pci_dev assigned_dev;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-                       goto out;
-               r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_ASSIGN_IRQ: {
-               r = -EOPNOTSUPP;
-               break;
-       }
-#ifdef KVM_CAP_ASSIGN_DEV_IRQ
-       case KVM_ASSIGN_DEV_IRQ: {
-               struct kvm_assigned_irq assigned_irq;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-                       goto out;
-               r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_DEASSIGN_DEV_IRQ: {
-               struct kvm_assigned_irq assigned_irq;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-                       goto out;
-               r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
-               if (r)
-                       goto out;
-               break;
-       }
-#endif
-#endif
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
-       case KVM_DEASSIGN_PCI_DEVICE: {
-               struct kvm_assigned_pci_dev assigned_dev;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-                       goto out;
-               r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
-               if (r)
-                       goto out;
-               break;
-       }
-#endif
-#ifdef KVM_CAP_IRQ_ROUTING
-       case KVM_SET_GSI_ROUTING: {
-               struct kvm_irq_routing routing;
-               struct kvm_irq_routing __user *urouting;
-               struct kvm_irq_routing_entry *entries;
-
-               r = -EFAULT;
-               if (copy_from_user(&routing, argp, sizeof(routing)))
-                       goto out;
-               r = -EINVAL;
-               if (routing.nr >= KVM_MAX_IRQ_ROUTES)
-                       goto out;
-               if (routing.flags)
-                       goto out;
-               r = -ENOMEM;
-               entries = vmalloc(routing.nr * sizeof(*entries));
-               if (!entries)
-                       goto out;
-               r = -EFAULT;
-               urouting = argp;
-               if (copy_from_user(entries, urouting->entries,
-                                  routing.nr * sizeof(*entries)))
-                       goto out_free_irq_routing;
-               r = kvm_set_irq_routing(kvm, entries, routing.nr,
-                                       routing.flags);
-       out_free_irq_routing:
-               vfree(entries);
-               break;
-       }
-#endif /* KVM_CAP_IRQ_ROUTING */
-#ifdef __KVM_HAVE_MSIX
-       case KVM_ASSIGN_SET_MSIX_NR: {
-               struct kvm_assigned_msix_nr entry_nr;
-               r = -EFAULT;
-               if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
-                       goto out;
-               r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_ASSIGN_SET_MSIX_ENTRY: {
-               struct kvm_assigned_msix_entry entry;
-               r = -EFAULT;
-               if (copy_from_user(&entry, argp, sizeof entry))
-                       goto out;
-               r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
-               if (r)
-                       goto out;
-               break;
-       }
-#endif
        case KVM_IRQFD: {
                struct kvm_irqfd data;
 
@@ -2305,11 +1536,59 @@ static long kvm_vm_ioctl(struct file *filp,
 #endif
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+               if (r == -ENOTTY)
+                       r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
        }
 out:
        return r;
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_kvm_dirty_log {
+       __u32 slot;
+       __u32 padding1;
+       union {
+               compat_uptr_t dirty_bitmap; /* one bit per page */
+               __u64 padding2;
+       };
+};
+
+static long kvm_vm_compat_ioctl(struct file *filp,
+                          unsigned int ioctl, unsigned long arg)
+{
+       struct kvm *kvm = filp->private_data;
+       int r;
+
+       if (kvm->mm != current->mm)
+               return -EIO;
+       switch (ioctl) {
+       case KVM_GET_DIRTY_LOG: {
+               struct compat_kvm_dirty_log compat_log;
+               struct kvm_dirty_log log;
+
+               r = -EFAULT;
+               if (copy_from_user(&compat_log, (void __user *)arg,
+                                  sizeof(compat_log)))
+                       goto out;
+               log.slot         = compat_log.slot;
+               log.padding1     = compat_log.padding1;
+               log.padding2     = compat_log.padding2;
+               log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
+
+               r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+               if (r)
+                       goto out;
+               break;
+       }
+       default:
+               r = kvm_vm_ioctl(filp, ioctl, arg);
+       }
+
+out:
+       return r;
+}
+#endif
+
 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct page *page[1];
@@ -2344,7 +1623,9 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
 static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
-       .compat_ioctl   = kvm_vm_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = kvm_vm_compat_ioctl,
+#endif
        .mmap           = kvm_vm_mmap,
 };
 
@@ -2372,6 +1653,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
        case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
+       case KVM_CAP_INTERNAL_ERROR_DATA:
                return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
        case KVM_CAP_IRQ_ROUTING:
@@ -2442,11 +1724,21 @@ static struct miscdevice kvm_dev = {
 static void hardware_enable(void *junk)
 {
        int cpu = raw_smp_processor_id();
+       int r;
 
        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
+
        cpumask_set_cpu(cpu, cpus_hardware_enabled);
-       kvm_arch_hardware_enable(NULL);
+
+       r = kvm_arch_hardware_enable(NULL);
+
+       if (r) {
+               cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+               atomic_inc(&hardware_enable_failed);
+               printk(KERN_INFO "kvm: enabling virtualization on "
+                                "CPU%d failed\n", cpu);
+       }
 }
 
 static void hardware_disable(void *junk)
@@ -2459,11 +1751,52 @@ static void hardware_disable(void *junk)
        kvm_arch_hardware_disable(NULL);
 }
 
+static void hardware_disable_all_nolock(void)
+{
+       BUG_ON(!kvm_usage_count);
+
+       kvm_usage_count--;
+       if (!kvm_usage_count)
+               on_each_cpu(hardware_disable, NULL, 1);
+}
+
+static void hardware_disable_all(void)
+{
+       spin_lock(&kvm_lock);
+       hardware_disable_all_nolock();
+       spin_unlock(&kvm_lock);
+}
+
+static int hardware_enable_all(void)
+{
+       int r = 0;
+
+       spin_lock(&kvm_lock);
+
+       kvm_usage_count++;
+       if (kvm_usage_count == 1) {
+               atomic_set(&hardware_enable_failed, 0);
+               on_each_cpu(hardware_enable, NULL, 1);
+
+               if (atomic_read(&hardware_enable_failed)) {
+                       hardware_disable_all_nolock();
+                       r = -EBUSY;
+               }
+       }
+
+       spin_unlock(&kvm_lock);
+
+       return r;
+}
+
 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
                           void *v)
 {
        int cpu = (long)v;
 
+       if (!kvm_usage_count)
+               return NOTIFY_OK;
+
        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
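
hardware_enable_all()/hardware_disable_all() above turn the per-CPU virtualization extensions on only while at least one VM exists: kvm_create_vm() takes a reference, kvm_destroy_vm() drops it, and the hotplug, suspend and resume paths check kvm_usage_count before touching hardware. A generic userspace sketch of that first-user/last-user pattern (enable_hw()/disable_hw() are placeholders for the per-CPU VMXON/VMXOFF work):

#include <pthread.h>

static int usage_count;
static pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;

static int enable_hw(void)   { return 0; }      /* placeholder */
static void disable_hw(void) { }                /* placeholder */

static int get_hw(void)
{
        int r = 0;

        pthread_mutex_lock(&count_lock);
        if (++usage_count == 1) {
                r = enable_hw();
                if (r)
                        usage_count--;          /* roll back on failure */
        }
        pthread_mutex_unlock(&count_lock);
        return r;
}

static void put_hw(void)
{
        pthread_mutex_lock(&count_lock);
        if (--usage_count == 0)
                disable_hw();
        pthread_mutex_unlock(&count_lock);
}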
@@ -2666,13 +1999,15 @@ static void kvm_exit_debug(void)
 
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 {
-       hardware_disable(NULL);
+       if (kvm_usage_count)
+               hardware_disable(NULL);
        return 0;
 }
 
 static int kvm_resume(struct sys_device *dev)
 {
-       hardware_enable(NULL);
+       if (kvm_usage_count)
+               hardware_enable(NULL);
        return 0;
 }
 
@@ -2747,7 +2082,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
                        goto out_free_1;
        }
 
-       on_each_cpu(hardware_enable, NULL, 1);
        r = register_cpu_notifier(&kvm_cpu_notifier);
        if (r)
                goto out_free_2;
@@ -2797,7 +2131,6 @@ out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
 out_free_2:
-       on_each_cpu(hardware_disable, NULL, 1);
 out_free_1:
        kvm_arch_hardware_unsetup();
 out_free_0a: