KVM: x86: Push potential exception error code on task switches
[safe/jmp/linux-2.6] / arch / x86 / kvm / x86.c
index 53bc06a..58a295c 100644 (file)
@@ -39,8 +39,9 @@
 #include <linux/cpufreq.h>
 #include <linux/user-return-notifier.h>
 #include <linux/srcu.h>
+#include <linux/slab.h>
 #include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
+
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
@@ -222,34 +223,6 @@ static void drop_user_return_notifiers(void *ignore)
                kvm_on_user_return(&smsr->urn);
 }
 
-unsigned long segment_base(u16 selector)
-{
-       struct descriptor_table gdt;
-       struct desc_struct *d;
-       unsigned long table_base;
-       unsigned long v;
-
-       if (selector == 0)
-               return 0;
-
-       kvm_get_gdt(&gdt);
-       table_base = gdt.base;
-
-       if (selector & 4) {           /* from ldt */
-               u16 ldt_selector = kvm_read_ldt();
-
-               table_base = segment_base(ldt_selector);
-       }
-       d = (struct desc_struct *)(table_base + (selector & ~7));
-       v = get_desc_base(d);
-#ifdef CONFIG_X86_64
-       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
-       return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
        if (irqchip_in_kernel(vcpu->kvm))
@@ -428,41 +401,38 @@ out:
 
 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
-       if (cr0 & CR0_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
-                      cr0, vcpu->arch.cr0);
+       cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+       if (cr0 & 0xffffffff00000000UL) {
                kvm_inject_gp(vcpu, 0);
                return;
        }
+#endif
+
+       cr0 &= ~CR0_RESERVED_BITS;
 
        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-               printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-               printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
-                      "and a clear PE flag\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
-               if ((vcpu->arch.shadow_efer & EFER_LME)) {
+               if ((vcpu->arch.efer & EFER_LME)) {
                        int cs_db, cs_l;
 
                        if (!is_pae(vcpu)) {
-                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
-                                      "in long mode while PAE is disabled\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
                        if (cs_l) {
-                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
-                                      "in long mode while CS.L == 1\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
 
@@ -470,8 +440,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                } else
 #endif
                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-                       printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
-                              "reserved bits\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
@@ -479,7 +447,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        }
 
        kvm_x86_ops->set_cr0(vcpu, cr0);
-       vcpu->arch.cr0 = cr0;
 
        kvm_mmu_reset_context(vcpu);
        return;
@@ -488,7 +455,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-       kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+       kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
@@ -498,28 +465,23 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 
        if (cr4 & CR4_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (is_long_mode(vcpu)) {
                if (!(cr4 & X86_CR4_PAE)) {
-                       printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
-                              "in long mode\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
                   && ((cr4 ^ old_cr4) & pdptr_bits)
                   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-               printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (cr4 & X86_CR4_VMXE) {
-               printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
                kvm_inject_gp(vcpu, 0);
                return;
        }
@@ -540,21 +502,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
        if (is_long_mode(vcpu)) {
                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-                       printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else {
                if (is_pae(vcpu)) {
                        if (cr3 & CR3_PAE_RESERVED_BITS) {
-                               printk(KERN_DEBUG
-                                      "set_cr3: #GP, reserved bits\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-                               printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
-                                      "reserved bits\n");
                                kvm_inject_gp(vcpu, 0);
                                return;
                        }
@@ -586,7 +543,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        if (cr8 & CR8_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
                kvm_inject_gp(vcpu, 0);
                return;
        }
@@ -606,6 +562,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+/*
+ * Emulate a guest write of @val to debug register @dr.
+ * DR0-DR3 are stored as given.  DR4/DR5 queue #UD while CR4.DE is set and
+ * are otherwise handled as DR6/DR7.  A DR6/DR7 value with any of the upper
+ * 32 bits set gets #GP(0) instead of being stored.
+ *
+ * Returns 0 when the value was stored, 1 when an exception was queued.
+ */
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+       const bool hw_bp = vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP;
+
+       if (dr >= 0 && dr <= 3) {
+               vcpu->arch.db[dr] = val;
+               if (!hw_bp)
+                       vcpu->arch.eff_db[dr] = val;
+               return 0;
+       }
+
+       if ((dr == 4 || dr == 5) && kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (val & 0xffffffff00000000ULL) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       if (dr == 4 || dr == 6) {
+               vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+               return 0;
+       }
+
+       /* every remaining index (5, 7, out of range) is handled as DR7 */
+       vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+       if (!hw_bp) {
+               kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
+               vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+/*
+ * Emulate a guest read of debug register @dr, storing the value in *@val.
+ * DR0-DR3 come straight from vcpu->arch.db[].  DR4/DR5 queue #UD while
+ * CR4.DE is set (leaving *@val untouched) and otherwise read as DR6/DR7.
+ * Indexes other than 0-6 are read as DR7.
+ *
+ * Returns 0 on success, 1 when an exception was queued.
+ */
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+       if (dr >= 0 && dr <= 3) {
+               *val = vcpu->arch.db[dr];
+               return 0;
+       }
+
+       if ((dr == 4 || dr == 5) && kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       /* DR4 reads the DR6 storage; everything else left reads DR7 */
+       if (dr == 4 || dr == 6)
+               *val = vcpu->arch.dr6;
+       else
+               *val = vcpu->arch.dr7;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
 static inline u32 bit(int bitno)
 {
        return 1 << (bitno & 31);
@@ -620,9 +650,11 @@ static inline u32 bit(int bitno)
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN    2
+#define KVM_SAVE_MSRS_BEGIN    5
 static u32 msrs_to_save[] = {
        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+       HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+       HV_X64_MSR_APIC_ASSIST_PAGE,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_K6_STAR,
 #ifdef CONFIG_X86_64
@@ -640,15 +672,12 @@ static u32 emulated_msrs[] = {
 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
        if (efer & efer_reserved_bits) {
-               printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
-                      efer);
                kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (is_paging(vcpu)
-           && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
-               printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+           && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
                kvm_inject_gp(vcpu, 0);
                return;
        }
@@ -658,7 +687,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
-                       printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
@@ -669,7 +697,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
-                       printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
                        kvm_inject_gp(vcpu, 0);
                        return;
                }
@@ -678,9 +705,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
        kvm_x86_ops->set_efer(vcpu, efer);
 
        efer &= ~EFER_LMA;
-       efer |= vcpu->arch.shadow_efer & EFER_LMA;
+       efer |= vcpu->arch.efer & EFER_LMA;
 
-       vcpu->arch.shadow_efer = efer;
+       vcpu->arch.efer = efer;
 
        vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
        kvm_mmu_reset_context(vcpu);
@@ -958,9 +985,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                if (msr >= MSR_IA32_MC0_CTL &&
                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
                        u32 offset = msr - MSR_IA32_MC0_CTL;
-                       /* only 0 or all 1s can be written to IA32_MCi_CTL */
+                       /* only 0 or all 1s can be written to IA32_MCi_CTL
+                        * some Linux kernels though clear bit 10 in bank 4 to
+                        * work around a BIOS/GART TBL issue on AMD K8s, ignore
+                        * this to avoid an uncaught #GP in the guest
+                        */
                        if ((offset & 0x3) == 0 &&
-                           data != 0 && data != ~(u64)0)
+                           data != 0 && (data | (1 << 10)) != ~(u64)0)
                                return -1;
                        vcpu->arch.mce_banks[offset] = data;
                        break;
@@ -1002,6 +1033,100 @@ out:
        return r;
 }
 
+static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+       return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE) != 0;
+}
+
+/*
+ * Tell whether @msr is one of the Hyper-V MSRs treated as partition-wide
+ * here: HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL.
+ */
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+       if (msr == HV_X64_MSR_GUEST_OS_ID)
+               return true;
+       if (msr == HV_X64_MSR_HYPERCALL)
+               return true;
+       return false;
+}
+
+/* Handle a write to a partition-wide Hyper-V MSR; 0 = done, 1 = failed. */
+static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       struct kvm *kvm = vcpu->kvm;
+
+       switch (msr) {
+       case HV_X64_MSR_GUEST_OS_ID:
+               kvm->arch.hv_guest_os_id = data;
+               /* setting guest os id to zero disables hypercall page */
+               if (!kvm->arch.hv_guest_os_id)
+                       kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+               break;
+       case HV_X64_MSR_HYPERCALL: {
+               u64 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+               unsigned long addr;
+               u8 instructions[4];
+
+               /* if guest os id is not set hypercall should remain disabled */
+               if (!kvm->arch.hv_guest_os_id)
+                       break;
+               if (data & HV_X64_MSR_HYPERCALL_ENABLE) {
+                       addr = gfn_to_hva(kvm, gfn);
+                       if (kvm_is_error_hva(addr))
+                               return 1;
+                       kvm_x86_ops->patch_hypercall(vcpu, instructions);
+                       instructions[3] = 0xc3; /* ret */
+                       if (copy_to_user((void __user *)addr, instructions, 4))
+                               return 1;
+                       /* page was written via its hva: tell dirty logging */
+                       mark_page_dirty(kvm, gfn);
+               }
+               kvm->arch.hv_hypercall = data;
+               break;
+       }
+       default:
+               pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+                         "data 0x%llx\n", msr, data);
+               return 1;
+       }
+       return 0;
+}
+
+/* Handle a guest write to a per-vCPU Hyper-V MSR; 0 means success. */
+static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       switch (msr) {
+       case HV_X64_MSR_APIC_ASSIST_PAGE: {
+               u64 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
+               unsigned long addr;
+
+               if (data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE) {
+                       addr = gfn_to_hva(vcpu->kvm, gfn);
+                       if (kvm_is_error_hva(addr))
+                               return 1;
+                       if (clear_user((void __user *)addr, PAGE_SIZE))
+                               return 1;
+                       /* page was zeroed via its hva: tell dirty logging */
+                       mark_page_dirty(vcpu->kvm, gfn);
+               }
+               vcpu->arch.hv_vapic = data;
+               break;
+       }
+       case HV_X64_MSR_EOI:
+               return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+       case HV_X64_MSR_ICR:
+               return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+       case HV_X64_MSR_TPR:
+               return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+       default:
+               pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+                         "data 0x%llx\n", msr, data);
+               return 1;
+       }
+       return 0;
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        switch (msr) {
@@ -1010,6 +1135,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                break;
        case MSR_K7_HWCR:
                data &= ~(u64)0x40;     /* ignore flush filter disable */
+               data &= ~(u64)0x100;    /* ignore ignne emulation enable */
                if (data != 0) {
                        pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
                                data);
@@ -1116,6 +1242,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
                        "0x%x data 0x%llx\n", msr, data);
                break;
+       case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+               if (kvm_hv_msr_partition_wide(msr)) {
+                       int r;
+                       mutex_lock(&vcpu->kvm->lock);
+                       r = set_msr_hyperv_pw(vcpu, msr, data);
+                       mutex_unlock(&vcpu->kvm->lock);
+                       return r;
+               } else
+                       return set_msr_hyperv(vcpu, msr, data);
+               break;
        default:
                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
                        return xen_hvm_config(vcpu, data);
@@ -1215,6 +1351,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        return 0;
 }
 
+/*
+ * Read a partition-wide Hyper-V MSR into *@pdata.
+ * Returns 0 on success; 1 (leaving *@pdata alone) for an unknown MSR.
+ */
+static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       struct kvm *kvm = vcpu->kvm;
+
+       switch (msr) {
+       case HV_X64_MSR_GUEST_OS_ID:
+               *pdata = kvm->arch.hv_guest_os_id;
+               return 0;
+       case HV_X64_MSR_HYPERCALL:
+               *pdata = kvm->arch.hv_hypercall;
+               return 0;
+       default:
+               pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+               return 1;
+       }
+}
+
+/* Read a per-vCPU Hyper-V MSR into *@pdata; returns 0 on success. */
+static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       struct kvm_vcpu *cur;
+       u64 index = 0;
+       int i;
+
+       switch (msr) {
+       case HV_X64_MSR_VP_INDEX:
+               /* position of this vCPU in the VM's vCPU walk; 0 if absent */
+               kvm_for_each_vcpu(i, cur, vcpu->kvm)
+                       if (cur == vcpu)
+                               index = i;
+               *pdata = index;
+               return 0;
+       case HV_X64_MSR_EOI:
+               return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+       case HV_X64_MSR_ICR:
+               return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+       case HV_X64_MSR_TPR:
+               return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+       default:
+               pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+               return 1;
+       }
+}
+
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
        u64 data;
@@ -1266,7 +1450,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                data |= (((uint64_t)4ULL) << 40);
                break;
        case MSR_EFER:
-               data = vcpu->arch.shadow_efer;
+               data = vcpu->arch.efer;
                break;
        case MSR_KVM_WALL_CLOCK:
                data = vcpu->kvm->arch.wall_clock;
@@ -1281,6 +1465,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
                return get_msr_mce(vcpu, msr, pdata);
+       case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+               if (kvm_hv_msr_partition_wide(msr)) {
+                       int r;
+                       mutex_lock(&vcpu->kvm->lock);
+                       r = get_msr_hyperv_pw(vcpu, msr, pdata);
+                       mutex_unlock(&vcpu->kvm->lock);
+                       return r;
+               } else
+                       return get_msr_hyperv(vcpu, msr, pdata);
+               break;
        default:
                if (!ignore_msrs) {
                        pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1396,6 +1590,12 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_XEN_HVM:
        case KVM_CAP_ADJUST_CLOCK:
        case KVM_CAP_VCPU_EVENTS:
+       case KVM_CAP_HYPERV:
+       case KVM_CAP_HYPERV_VAPIC:
+       case KVM_CAP_HYPERV_SPIN:
+       case KVM_CAP_PCI_SEGMENT:
+       case KVM_CAP_DEBUGREGS:
+       case KVM_CAP_X86_ROBUST_SINGLESTEP:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -1509,8 +1709,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       kvm_x86_ops->vcpu_put(vcpu);
        kvm_put_guest_fpu(vcpu);
+       kvm_x86_ops->vcpu_put(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -1641,10 +1841,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                         u32 index, int *nent, int maxnent)
 {
        unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-       unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
 #ifdef CONFIG_X86_64
+       unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+                               ? F(GBPAGES) : 0;
        unsigned f_lm = F(LM);
 #else
+       unsigned f_gbpages = 0;
        unsigned f_lm = 0;
 #endif
        unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
@@ -1945,14 +2147,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 {
        vcpu_load(vcpu);
 
-       events->exception.injected = vcpu->arch.exception.pending;
+       events->exception.injected =
+               vcpu->arch.exception.pending &&
+               !kvm_exception_is_soft(vcpu->arch.exception.nr);
        events->exception.nr = vcpu->arch.exception.nr;
        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
        events->exception.error_code = vcpu->arch.exception.error_code;
 
-       events->interrupt.injected = vcpu->arch.interrupt.pending;
+       events->interrupt.injected =
+               vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
        events->interrupt.nr = vcpu->arch.interrupt.nr;
-       events->interrupt.soft = vcpu->arch.interrupt.soft;
+       events->interrupt.soft = 0;
+       events->interrupt.shadow =
+               kvm_x86_ops->get_interrupt_shadow(vcpu,
+                       KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
        events->nmi.injected = vcpu->arch.nmi_injected;
        events->nmi.pending = vcpu->arch.nmi_pending;
@@ -1961,7 +2169,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        events->sipi_vector = vcpu->arch.sipi_vector;
 
        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-                        | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+                        | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+                        | KVM_VCPUEVENT_VALID_SHADOW);
 
        vcpu_put(vcpu);
 }
@@ -1970,7 +2179,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
 {
        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
-                             | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+                             | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+                             | KVM_VCPUEVENT_VALID_SHADOW))
                return -EINVAL;
 
        vcpu_load(vcpu);
@@ -1985,6 +2195,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
        vcpu->arch.interrupt.soft = events->interrupt.soft;
        if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
                kvm_pic_clear_isr_ack(vcpu->kvm);
+       if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+               kvm_x86_ops->set_interrupt_shadow(vcpu,
+                                                 events->interrupt.shadow);
 
        vcpu->arch.nmi_injected = events->nmi.injected;
        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -1999,6 +2212,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+/* Fill *dbgregs for userspace; zero it first so unset bytes leak nothing. */
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+                                            struct kvm_debugregs *dbgregs)
+{
+       memset(dbgregs, 0, sizeof(*dbgregs));   /* also clears ->flags */
+
+       vcpu_load(vcpu);
+       memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+       dbgregs->dr6 = vcpu->arch.dr6;
+       dbgregs->dr7 = vcpu->arch.dr7;
+       vcpu_put(vcpu);
+}
+
+/* Load db[], dr6, dr7 from *dbgregs; -EINVAL on flags or oversized dr6/dr7. */
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+                                           struct kvm_debugregs *dbgregs)
+{
+       /* kvm_set_dr() refuses DR6/DR7 values above 32 bits; so must we */
+       if (dbgregs->flags ||
+           ((dbgregs->dr6 | dbgregs->dr7) & 0xffffffff00000000ULL))
+               return -EINVAL;
+
+       vcpu_load(vcpu);
+       memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+       vcpu->arch.dr6 = dbgregs->dr6;
+       vcpu->arch.dr7 = dbgregs->dr7;
+       vcpu_put(vcpu);
+       return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
@@ -2177,6 +2420,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
                break;
        }
+       case KVM_GET_DEBUGREGS: {
+               struct kvm_debugregs dbgregs;
+
+               kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+               r = -EFAULT;
+               if (copy_to_user(argp, &dbgregs,
+                                sizeof(struct kvm_debugregs)))
+                       break;
+               r = 0;
+               break;
+       }
+       case KVM_SET_DEBUGREGS: {
+               struct kvm_debugregs dbgregs;
+
+               r = -EFAULT;
+               if (copy_from_user(&dbgregs, argp,
+                                  sizeof(struct kvm_debugregs)))
+                       break;
+
+               r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+               break;
+       }
        default:
                r = -EINVAL;
        }
@@ -2208,14 +2474,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
                return -EINVAL;
 
-       down_write(&kvm->slots_lock);
+       mutex_lock(&kvm->slots_lock);
        spin_lock(&kvm->mmu_lock);
 
        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
        spin_unlock(&kvm->mmu_lock);
-       up_write(&kvm->slots_lock);
+       mutex_unlock(&kvm->slots_lock);
        return 0;
 }
 
@@ -2292,7 +2558,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
        if (!aliases)
                goto out;
 
-       down_write(&kvm->slots_lock);
+       mutex_lock(&kvm->slots_lock);
 
        /* invalidate any gfn reference in case of deletion/shrinking */
        memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
@@ -2328,7 +2594,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
        r = 0;
 
 out_unlock:
-       up_write(&kvm->slots_lock);
+       mutex_unlock(&kvm->slots_lock);
 out:
        return r;
 }
@@ -2366,18 +2632,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
        r = 0;
        switch (chip->chip_id) {
        case KVM_IRQCHIP_PIC_MASTER:
-               spin_lock(&pic_irqchip(kvm)->lock);
+               raw_spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[0],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
-               spin_unlock(&pic_irqchip(kvm)->lock);
+               raw_spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_PIC_SLAVE:
-               spin_lock(&pic_irqchip(kvm)->lock);
+               raw_spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[1],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
-               spin_unlock(&pic_irqchip(kvm)->lock);
+               raw_spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_IOAPIC:
                r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2457,12 +2723,13 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                                      struct kvm_dirty_log *log)
 {
-       int r, n, i;
+       int r, i;
        struct kvm_memory_slot *memslot;
+       unsigned long n;
        unsigned long is_dirty = 0;
        unsigned long *dirty_bitmap = NULL;
 
-       down_write(&kvm->slots_lock);
+       mutex_lock(&kvm->slots_lock);
 
        r = -EINVAL;
        if (log->slot >= KVM_MEMORY_SLOTS)
@@ -2473,7 +2740,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        if (!memslot->dirty_bitmap)
                goto out;
 
-       n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+       n = kvm_dirty_bitmap_bytes(memslot);
 
        r = -ENOMEM;
        dirty_bitmap = vmalloc(n);
@@ -2512,7 +2779,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 out_free:
        vfree(dirty_bitmap);
 out:
-       up_write(&kvm->slots_lock);
+       mutex_unlock(&kvm->slots_lock);
        return r;
 }
 
@@ -2595,6 +2862,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (vpic) {
                        r = kvm_ioapic_init(kvm);
                        if (r) {
+                               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+                                                         &vpic->dev);
                                kfree(vpic);
                                goto create_irqchip_unlock;
                        }
@@ -2606,10 +2875,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = kvm_setup_default_irq_routing(kvm);
                if (r) {
                        mutex_lock(&kvm->irq_lock);
-                       kfree(kvm->arch.vpic);
-                       kfree(kvm->arch.vioapic);
-                       kvm->arch.vpic = NULL;
-                       kvm->arch.vioapic = NULL;
+                       kvm_ioapic_destroy(kvm);
+                       kvm_destroy_pic(kvm);
                        mutex_unlock(&kvm->irq_lock);
                }
        create_irqchip_unlock:
@@ -2625,7 +2892,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               down_write(&kvm->slots_lock);
+               mutex_lock(&kvm->slots_lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
@@ -2634,7 +2901,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               up_write(&kvm->slots_lock);
+               mutex_unlock(&kvm->slots_lock);
                break;
        case KVM_IRQ_LINE_STATUS:
        case KVM_IRQ_LINE: {
@@ -2643,11 +2910,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = -EFAULT;
                if (copy_from_user(&irq_event, argp, sizeof irq_event))
                        goto out;
+               r = -ENXIO;
                if (irqchip_in_kernel(kvm)) {
                        __s32 status;
                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                        irq_event.irq, irq_event.level);
                        if (ioctl == KVM_IRQ_LINE_STATUS) {
+                               r = -EFAULT;
                                irq_event.status = status;
                                if (copy_to_user(argp, &irq_event,
                                                        sizeof irq_event))
@@ -2863,14 +3132,53 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
        return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
 }
 
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                              struct kvm_vcpu *vcpu)
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+                       struct kvm_segment *var, int seg)
+{
+       kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+                    struct kvm_segment *var, int seg)
+{
+       kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       access |= PFERR_FETCH_MASK;
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       access |= PFERR_WRITE_MASK;
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+/* uses this to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+                                     struct kvm_vcpu *vcpu, u32 access,
+                                     u32 *error)
 {
        void *data = val;
        int r = X86EMUL_CONTINUE;
 
        while (bytes) {
-               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
@@ -2893,14 +3201,40 @@ out:
        return r;
 }
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                               struct kvm_vcpu *vcpu)
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
+                               struct kvm_vcpu *vcpu, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
+                                         access | PFERR_FETCH_MASK, error);
+}
+
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+                              struct kvm_vcpu *vcpu, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+                                         error);
+}
+
+static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+                              struct kvm_vcpu *vcpu, u32 *error)
+{
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
+
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
+                                      unsigned int bytes,
+                                      struct kvm_vcpu *vcpu,
+                                      u32 *error)
 {
        void *data = val;
        int r = X86EMUL_CONTINUE;
 
        while (bytes) {
-               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
+                                                      PFERR_WRITE_MASK, error);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
@@ -2923,13 +3257,13 @@ out:
        return r;
 }
 
-
 static int emulator_read_emulated(unsigned long addr,
                                  void *val,
                                  unsigned int bytes,
                                  struct kvm_vcpu *vcpu)
 {
        gpa_t                 gpa;
+       u32 error_code;
 
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
@@ -2939,17 +3273,20 @@ static int emulator_read_emulated(unsigned long addr,
                return X86EMUL_CONTINUE;
        }
 
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+
+       if (gpa == UNMAPPED_GVA) {
+               kvm_inject_page_fault(vcpu, addr, error_code);
+               return X86EMUL_PROPAGATE_FAULT;
+       }
 
        /* For APIC access vmexit */
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                goto mmio;
 
-       if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+       if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
                                == X86EMUL_CONTINUE)
                return X86EMUL_CONTINUE;
-       if (gpa == UNMAPPED_GVA)
-               return X86EMUL_PROPAGATE_FAULT;
 
 mmio:
        /*
@@ -2988,11 +3325,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                                           struct kvm_vcpu *vcpu)
 {
        gpa_t                 gpa;
+       u32 error_code;
 
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
 
        if (gpa == UNMAPPED_GVA) {
-               kvm_inject_page_fault(vcpu, addr, 2);
+               kvm_inject_page_fault(vcpu, addr, error_code);
                return X86EMUL_PROPAGATE_FAULT;
        }
 
@@ -3021,9 +3359,9 @@ mmio:
 }
 
 int emulator_write_emulated(unsigned long addr,
-                                  const void *val,
-                                  unsigned int bytes,
-                                  struct kvm_vcpu *vcpu)
+                           const void *val,
+                           unsigned int bytes,
+                           struct kvm_vcpu *vcpu)
 {
        /* Crossing a page boundary? */
        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3041,87 +3379,178 @@ int emulator_write_emulated(unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+       (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+#  define CMPXCHG64(ptr, old, new) \
+       (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+#endif
+
 static int emulator_cmpxchg_emulated(unsigned long addr,
                                     const void *old,
                                     const void *new,
                                     unsigned int bytes,
                                     struct kvm_vcpu *vcpu)
 {
-       printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-#ifndef CONFIG_X86_64
-       /* guests cmpxchg8b have to be emulated atomically */
-       if (bytes == 8) {
-               gpa_t gpa;
-               struct page *page;
-               char *kaddr;
-               u64 val;
+       gpa_t gpa;
+       struct page *page;
+       char *kaddr;
+       bool exchanged;
 
-               gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       /* guests cmpxchg8b have to be emulated atomically */
+       if (bytes > 8 || (bytes & (bytes - 1)))
+               goto emul_write;
 
-               if (gpa == UNMAPPED_GVA ||
-                  (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-                       goto emul_write;
+       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
 
-               if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
-                       goto emul_write;
+       if (gpa == UNMAPPED_GVA ||
+           (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+               goto emul_write;
 
-               val = *(u64 *)new;
+       if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+               goto emul_write;
 
-               page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 
-               kaddr = kmap_atomic(page, KM_USER0);
-               set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
-               kunmap_atomic(kaddr, KM_USER0);
-               kvm_release_page_dirty(page);
+       kaddr = kmap_atomic(page, KM_USER0);
+       kaddr += offset_in_page(gpa);
+       switch (bytes) {
+       case 1:
+               exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+               break;
+       case 2:
+               exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+               break;
+       case 4:
+               exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+               break;
+       case 8:
+               exchanged = CMPXCHG64(kaddr, old, new);
+               break;
+       default:
+               BUG();
        }
+       kunmap_atomic(kaddr, KM_USER0);
+       kvm_release_page_dirty(page);
+
+       if (!exchanged)
+               return X86EMUL_CMPXCHG_FAILED;
+
+       kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+
+       return X86EMUL_CONTINUE;
+
 emul_write:
-#endif
+       printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
        return emulator_write_emulated(addr, new, bytes, vcpu);
 }
 
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
-       return kvm_x86_ops->get_segment_base(vcpu, seg);
-}
+       /* TODO: String I/O for in kernel device */
+       int r;
 
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
-{
-       kvm_mmu_invlpg(vcpu, address);
-       return X86EMUL_CONTINUE;
+       if (vcpu->arch.pio.in)
+               r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+                                   vcpu->arch.pio.size, pd);
+       else
+               r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+                                    vcpu->arch.pio.port, vcpu->arch.pio.size,
+                                    pd);
+       return r;
 }
 
-int emulate_clts(struct kvm_vcpu *vcpu)
+
+static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
+                            unsigned int count, struct kvm_vcpu *vcpu)
 {
-       kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
-       return X86EMUL_CONTINUE;
+       if (vcpu->arch.pio.count)
+               goto data_avail;
+
+       trace_kvm_pio(1, port, size, 1);
+
+       vcpu->arch.pio.port = port;
+       vcpu->arch.pio.in = 1;
+       vcpu->arch.pio.count  = count;
+       vcpu->arch.pio.size = size;
+
+       if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+       data_avail:
+               memcpy(val, vcpu->arch.pio_data, size * count);
+               vcpu->arch.pio.count = 0;
+               return 1;
+       }
+
+       vcpu->run->exit_reason = KVM_EXIT_IO;
+       vcpu->run->io.direction = KVM_EXIT_IO_IN;
+       vcpu->run->io.size = size;
+       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+       vcpu->run->io.count = count;
+       vcpu->run->io.port = port;
+
+       return 0;
 }
 
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+static int emulator_pio_out_emulated(int size, unsigned short port,
+                             const void *val, unsigned int count,
+                             struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = ctxt->vcpu;
+       trace_kvm_pio(0, port, size, 1);
 
-       switch (dr) {
-       case 0 ... 3:
-               *dest = kvm_x86_ops->get_dr(vcpu, dr);
-               return X86EMUL_CONTINUE;
-       default:
-               pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
-               return X86EMUL_UNHANDLEABLE;
+       vcpu->arch.pio.port = port;
+       vcpu->arch.pio.in = 0;
+       vcpu->arch.pio.count = count;
+       vcpu->arch.pio.size = size;
+
+       memcpy(vcpu->arch.pio_data, val, size * count);
+
+       if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+               vcpu->arch.pio.count = 0;
+               return 1;
        }
+
+       vcpu->run->exit_reason = KVM_EXIT_IO;
+       vcpu->run->io.direction = KVM_EXIT_IO_OUT;
+       vcpu->run->io.size = size;
+       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+       vcpu->run->io.count = count;
+       vcpu->run->io.port = port;
+
+       return 0;
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+       return kvm_x86_ops->get_segment_base(vcpu, seg);
+}
+
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+{
+       kvm_mmu_invlpg(vcpu, address);
+       return X86EMUL_CONTINUE;
+}
+
+int emulate_clts(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+       kvm_x86_ops->fpu_activate(vcpu);
+       return X86EMUL_CONTINUE;
+}
+
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+{
+       return kvm_get_dr(ctxt->vcpu, dr, dest);
 }
 
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 {
        unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-       int exception;
 
-       kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
-       if (exception) {
-               /* FIXME: better handling */
-               return X86EMUL_UNHANDLEABLE;
-       }
-       return X86EMUL_CONTINUE;
+       return kvm_set_dr(ctxt->vcpu, dr, value & mask);
 }
 
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3135,18 +3564,174 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 
        rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
 
-       kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
+       kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
 
        printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
               context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
 }
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+       return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+{
+       unsigned long value;
+
+       switch (cr) {
+       case 0:
+               value = kvm_read_cr0(vcpu);
+               break;
+       case 2:
+               value = vcpu->arch.cr2;
+               break;
+       case 3:
+               value = vcpu->arch.cr3;
+               break;
+       case 4:
+               value = kvm_read_cr4(vcpu);
+               break;
+       case 8:
+               value = kvm_get_cr8(vcpu);
+               break;
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+               return 0;
+       }
+
+       return value;
+}
+
+static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+{
+       switch (cr) {
+       case 0:
+               kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+               break;
+       case 2:
+               vcpu->arch.cr2 = val;
+               break;
+       case 3:
+               kvm_set_cr3(vcpu, val);
+               break;
+       case 4:
+               kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+               break;
+       case 8:
+               kvm_set_cr8(vcpu, val & 0xfUL);
+               break;
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+       }
+}
+
+static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+{
+       return kvm_x86_ops->get_cpl(vcpu);
+}
+
+static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->get_gdt(vcpu, dt);
+}
+
+static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
+                                          struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment var;
+
+       kvm_get_segment(vcpu, &var, seg);
+
+       if (var.unusable)
+               return false;
+
+       if (var.g)
+               var.limit >>= 12;
+       set_desc_limit(desc, var.limit);
+       set_desc_base(desc, (unsigned long)var.base);
+       desc->type = var.type;
+       desc->s = var.s;
+       desc->dpl = var.dpl;
+       desc->p = var.present;
+       desc->avl = var.avl;
+       desc->l = var.l;
+       desc->d = var.db;
+       desc->g = var.g;
+
+       return true;
+}
+
+static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
+                                          struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment var;
+
+       /* needed to preserve selector */
+       kvm_get_segment(vcpu, &var, seg);
+
+       var.base = get_desc_base(desc);
+       var.limit = get_desc_limit(desc);
+       if (desc->g)
+               var.limit = (var.limit << 12) | 0xfff;
+       var.type = desc->type;
+       var.present = desc->p;
+       var.dpl = desc->dpl;
+       var.db = desc->d;
+       var.s = desc->s;
+       var.l = desc->l;
+       var.g = desc->g;
+       var.avl = desc->avl;
+       var.present = desc->p;
+       var.unusable = !var.present;
+       var.padding = 0;
+
+       kvm_set_segment(vcpu, &var, seg);
+       return;
+}
+
+static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment kvm_seg;
+
+       kvm_get_segment(vcpu, &kvm_seg, seg);
+       return kvm_seg.selector;
+}
+
+static void emulator_set_segment_selector(u16 sel, int seg,
+                                         struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment kvm_seg;
+
+       kvm_get_segment(vcpu, &kvm_seg, seg);
+       kvm_seg.selector = sel;
+       kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+       kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
 static struct x86_emulate_ops emulate_ops = {
-       .read_std            = kvm_read_guest_virt,
+       .read_std            = kvm_read_guest_virt_system,
+       .write_std           = kvm_write_guest_virt_system,
+       .fetch               = kvm_fetch_guest_virt,
        .read_emulated       = emulator_read_emulated,
        .write_emulated      = emulator_write_emulated,
        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
+       .pio_in_emulated     = emulator_pio_in_emulated,
+       .pio_out_emulated    = emulator_pio_out_emulated,
+       .get_cached_descriptor = emulator_get_cached_descriptor,
+       .set_cached_descriptor = emulator_set_cached_descriptor,
+       .get_segment_selector = emulator_get_segment_selector,
+       .set_segment_selector = emulator_set_segment_selector,
+       .get_gdt             = emulator_get_gdt,
+       .get_cr              = emulator_get_cr,
+       .set_cr              = emulator_set_cr,
+       .cpl                 = emulator_get_cpl,
+       .set_rflags          = emulator_set_rflags,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3177,21 +3762,23 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
        cache_all_regs(vcpu);
 
        vcpu->mmio_is_write = 0;
-       vcpu->arch.pio.string = 0;
 
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                int cs_db, cs_l;
                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
                vcpu->arch.emulate_ctxt.vcpu = vcpu;
-               vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+               vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+               vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
                vcpu->arch.emulate_ctxt.mode =
+                       (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-                       ? X86EMUL_MODE_REAL : cs_l
+                       ? X86EMUL_MODE_VM86 : cs_l
                        ? X86EMUL_MODE_PROT64 : cs_db
                        ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+               trace_kvm_emulate_insn_start(vcpu);
 
                /* Only allow emulation of specific instructions on #UD
                 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3224,6 +3811,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                ++vcpu->stat.insn_emulation;
                if (r)  {
                        ++vcpu->stat.insn_emulation_fail;
+                       trace_kvm_emulate_insn_failed(vcpu);
                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
                                return EMULATE_DONE;
                        return EMULATE_FAIL;
@@ -3235,16 +3823,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                return EMULATE_DONE;
        }
 
+restart:
        r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
        shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 
        if (r == 0)
                kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
 
-       if (vcpu->arch.pio.string)
+       if (vcpu->arch.pio.count) {
+               if (!vcpu->arch.pio.in)
+                       vcpu->arch.pio.count = 0;
                return EMULATE_DO_MMIO;
+       }
 
-       if ((r || vcpu->mmio_is_write) && run) {
+       if (r || vcpu->mmio_is_write) {
                run->exit_reason = KVM_EXIT_MMIO;
                run->mmio.phys_addr = vcpu->mmio_phys_addr;
                memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3254,219 +3846,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
        if (r) {
                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-                       return EMULATE_DONE;
+                       goto done;
                if (!vcpu->mmio_needed) {
+                       ++vcpu->stat.insn_emulation_fail;
+                       trace_kvm_emulate_insn_failed(vcpu);
                        kvm_report_emulation_failure(vcpu, "mmio");
                        return EMULATE_FAIL;
                }
                return EMULATE_DO_MMIO;
        }
 
-       kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
        if (vcpu->mmio_is_write) {
                vcpu->mmio_needed = 0;
                return EMULATE_DO_MMIO;
        }
 
-       return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
-       void *p = vcpu->arch.pio_data;
-       gva_t q = vcpu->arch.pio.guest_gva;
-       unsigned bytes;
-       int ret;
-
-       bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
-       if (vcpu->arch.pio.in)
-               ret = kvm_write_guest_virt(q, p, bytes, vcpu);
-       else
-               ret = kvm_read_guest_virt(q, p, bytes, vcpu);
-       return ret;
-}
-
-int complete_pio(struct kvm_vcpu *vcpu)
-{
-       struct kvm_pio_request *io = &vcpu->arch.pio;
-       long delta;
-       int r;
-       unsigned long val;
-
-       if (!io->string) {
-               if (io->in) {
-                       val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-                       memcpy(&val, vcpu->arch.pio_data, io->size);
-                       kvm_register_write(vcpu, VCPU_REGS_RAX, val);
-               }
-       } else {
-               if (io->in) {
-                       r = pio_copy_data(vcpu);
-                       if (r)
-                               return r;
-               }
-
-               delta = 1;
-               if (io->rep) {
-                       delta *= io->cur_count;
-                       /*
-                        * The size of the register should really depend on
-                        * current address size.
-                        */
-                       val = kvm_register_read(vcpu, VCPU_REGS_RCX);
-                       val -= delta;
-                       kvm_register_write(vcpu, VCPU_REGS_RCX, val);
-               }
-               if (io->down)
-                       delta = -delta;
-               delta *= io->size;
-               if (io->in) {
-                       val = kvm_register_read(vcpu, VCPU_REGS_RDI);
-                       val += delta;
-                       kvm_register_write(vcpu, VCPU_REGS_RDI, val);
-               } else {
-                       val = kvm_register_read(vcpu, VCPU_REGS_RSI);
-                       val += delta;
-                       kvm_register_write(vcpu, VCPU_REGS_RSI, val);
-               }
-       }
-
-       io->count -= io->cur_count;
-       io->cur_count = 0;
-
-       return 0;
-}
-
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
-       /* TODO: String I/O for in kernel device */
-       int r;
-
-       if (vcpu->arch.pio.in)
-               r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
-                                   vcpu->arch.pio.size, pd);
-       else
-               r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-                                    vcpu->arch.pio.port, vcpu->arch.pio.size,
-                                    pd);
-       return r;
-}
-
-static int pio_string_write(struct kvm_vcpu *vcpu)
-{
-       struct kvm_pio_request *io = &vcpu->arch.pio;
-       void *pd = vcpu->arch.pio_data;
-       int i, r = 0;
-
-       for (i = 0; i < io->cur_count; i++) {
-               if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-                                    io->port, io->size, pd)) {
-                       r = -EOPNOTSUPP;
-                       break;
-               }
-               pd += io->size;
-       }
-       return r;
-}
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
-{
-       unsigned long val;
-
-       vcpu->run->exit_reason = KVM_EXIT_IO;
-       vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-       vcpu->run->io.size = vcpu->arch.pio.size = size;
-       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
-       vcpu->run->io.port = vcpu->arch.pio.port = port;
-       vcpu->arch.pio.in = in;
-       vcpu->arch.pio.string = 0;
-       vcpu->arch.pio.down = 0;
-       vcpu->arch.pio.rep = 0;
-
-       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
-                     size, 1);
+done:
+       if (vcpu->arch.exception.pending)
+               vcpu->arch.emulate_ctxt.restart = false;
 
-       val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-       memcpy(vcpu->arch.pio_data, &val, 4);
+       if (vcpu->arch.emulate_ctxt.restart)
+               goto restart;
 
-       if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-               complete_pio(vcpu);
-               return 1;
-       }
-       return 0;
+       return EMULATE_DONE;
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
+EXPORT_SYMBOL_GPL(emulate_instruction);
 
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-                 int size, unsigned long count, int down,
-                 gva_t address, int rep, unsigned port)
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 {
-       unsigned now, in_page;
-       int ret = 0;
-
-       vcpu->run->exit_reason = KVM_EXIT_IO;
-       vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-       vcpu->run->io.size = vcpu->arch.pio.size = size;
-       vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
-       vcpu->run->io.port = vcpu->arch.pio.port = port;
-       vcpu->arch.pio.in = in;
-       vcpu->arch.pio.string = 1;
-       vcpu->arch.pio.down = down;
-       vcpu->arch.pio.rep = rep;
-
-       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
-                     size, count);
-
-       if (!count) {
-               kvm_x86_ops->skip_emulated_instruction(vcpu);
-               return 1;
-       }
-
-       if (!down)
-               in_page = PAGE_SIZE - offset_in_page(address);
-       else
-               in_page = offset_in_page(address) + size;
-       now = min(count, (unsigned long)in_page / size);
-       if (!now)
-               now = 1;
-       if (down) {
-               /*
-                * String I/O in reverse.  Yuck.  Kill the guest, fix later.
-                */
-               pr_unimpl(vcpu, "guest string pio down\n");
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-       vcpu->run->io.count = now;
-       vcpu->arch.pio.cur_count = now;
-
-       if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
-               kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-       vcpu->arch.pio.guest_gva = address;
-
-       if (!vcpu->arch.pio.in) {
-               /* string PIO write */
-               ret = pio_copy_data(vcpu);
-               if (ret == X86EMUL_PROPAGATE_FAULT) {
-                       kvm_inject_gp(vcpu, 0);
-                       return 1;
-               }
-               if (ret == 0 && !pio_string_write(vcpu)) {
-                       complete_pio(vcpu);
-                       if (vcpu->arch.pio.count == 0)
-                               ret = 1;
-               }
-       }
-       /* no string PIO read support yet */
-
+       unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+       int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+       /* do not return to emulator after return from userspace */
+       vcpu->arch.pio.count = 0;
        return ret;
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
 static void bounce_off(void *info)
 {
@@ -3614,11 +4028,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
                return a0 | ((gpa_t)a1 << 32);
 }
 
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+       u64 param, ingpa, outgpa, ret;
+       uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+       bool fast, longmode;
+       int cs_db, cs_l;
+
+       /*
+        * hypercall generates UD from non zero cpl and real mode
+        * per HYPER-V spec
+        */
+       if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
+       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+       longmode = is_long_mode(vcpu) && cs_l == 1;
+
+       if (!longmode) {
+               param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+                       (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+               ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+                       (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+               outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+                       (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+       }
+#ifdef CONFIG_X86_64
+       else {
+               param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+               ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+               outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+       }
+#endif
+
+       code = param & 0xffff;
+       fast = (param >> 16) & 0x1;
+       rep_cnt = (param >> 32) & 0xfff;
+       rep_idx = (param >> 48) & 0xfff;
+
+       trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+       switch (code) {
+       case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+               kvm_vcpu_on_spin(vcpu);
+               break;
+       default:
+               res = HV_STATUS_INVALID_HYPERCALL_CODE;
+               break;
+       }
+
+       ret = res | (((u64)rep_done & 0xfff) << 32);
+       if (longmode) {
+               kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+       } else {
+               kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
+               kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
+       }
+
+       return 1;
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
        int r = 1;
 
+       if (kvm_hv_hypercall_enabled(vcpu->kvm))
+               return kvm_hv_hypercall(vcpu);
+
        nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
        a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
        a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -3661,10 +4140,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 {
        char instruction[3];
-       int ret = 0;
        unsigned long rip = kvm_rip_read(vcpu);
 
-
        /*
         * Blow out the MMU to ensure that no other VCPU has an active mapping
         * to ensure that the updated hypercall appears atomically across all
@@ -3673,92 +4150,24 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
        kvm_mmu_zap_all(vcpu->kvm);
 
        kvm_x86_ops->patch_hypercall(vcpu, instruction);
-       if (emulator_write_emulated(rip, instruction, 3, vcpu)
-           != X86EMUL_CONTINUE)
-               ret = -EFAULT;
-
-       return ret;
-}
 
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
-       return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+       return emulator_write_emulated(rip, instruction, 3, vcpu);
 }
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-       struct descriptor_table dt = { limit, base };
+       struct desc_ptr dt = { limit, base }; /* positional init; presumably size = limit, address = base -- confirm field order */
 
        kvm_x86_ops->set_gdt(vcpu, &dt);
 }
 
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-       struct descriptor_table dt = { limit, base };
+       struct desc_ptr dt = { limit, base }; /* positional init; presumably size = limit, address = base -- confirm field order */
 
        kvm_x86_ops->set_idt(vcpu, &dt);
 }
 
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-                  unsigned long *rflags)
-{
-       kvm_lmsw(vcpu, msw);
-       *rflags = kvm_get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
-       unsigned long value;
-
-       switch (cr) {
-       case 0:
-               value = vcpu->arch.cr0;
-               break;
-       case 2:
-               value = vcpu->arch.cr2;
-               break;
-       case 3:
-               value = vcpu->arch.cr3;
-               break;
-       case 4:
-               value = kvm_read_cr4(vcpu);
-               break;
-       case 8:
-               value = kvm_get_cr8(vcpu);
-               break;
-       default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-               return 0;
-       }
-
-       return value;
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
-                    unsigned long *rflags)
-{
-       switch (cr) {
-       case 0:
-               kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
-               *rflags = kvm_get_rflags(vcpu);
-               break;
-       case 2:
-               vcpu->arch.cr2 = val;
-               break;
-       case 3:
-               kvm_set_cr3(vcpu, val);
-               break;
-       case 4:
-               kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
-               break;
-       case 8:
-               kvm_set_cr8(vcpu, val & 0xfUL);
-               break;
-       default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-       }
-}
-
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 {
        struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -3822,9 +4231,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
 
+       best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+       if (!best || best->eax < 0x80000008)
+               goto not_found;
        best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
        if (best)
                return best->eax & 0xff;
+not_found:
        return 36;
 }
 
@@ -3938,6 +4351,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 {
        /* try to reinject previous events if any */
        if (vcpu->arch.exception.pending) {
+               trace_kvm_inj_exception(vcpu->arch.exception.nr,
+                                       vcpu->arch.exception.has_error_code,
+                                       vcpu->arch.exception.error_code);
                kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
                                          vcpu->arch.exception.has_error_code,
                                          vcpu->arch.exception.error_code);
@@ -4004,12 +4420,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        r = 0;
                        goto out;
                }
+               if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+                       vcpu->fpu_active = 0;
+                       kvm_x86_ops->fpu_deactivate(vcpu);
+               }
        }
 
        preempt_disable();
 
        kvm_x86_ops->prepare_guest_switch(vcpu);
-       kvm_load_guest_fpu(vcpu);
+       if (vcpu->fpu_active)
+               kvm_load_guest_fpu(vcpu);
 
        local_irq_disable();
 
@@ -4193,24 +4614,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (!irqchip_in_kernel(vcpu->kvm))
                kvm_set_cr8(vcpu, kvm_run->cr8);
 
-       if (vcpu->arch.pio.cur_count) {
-               r = complete_pio(vcpu);
-               if (r)
-                       goto out;
-       }
-       if (vcpu->mmio_needed) {
-               memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-               vcpu->mmio_read_completed = 1;
-               vcpu->mmio_needed = 0;
-
+       if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+           vcpu->arch.emulate_ctxt.restart) {
+               if (vcpu->mmio_needed) {
+                       memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+                       vcpu->mmio_read_completed = 1;
+                       vcpu->mmio_needed = 0;
+               }
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-               r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
-                                       EMULTYPE_NO_DECODE);
+               r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
                srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
                if (r == EMULATE_DO_MMIO) {
-                       /*
-                        * Read-modify-write.  Back to userspace.
-                        */
                        r = 0;
                        goto out;
                }
@@ -4293,12 +4707,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        return 0;
 }
 
-void kvm_get_segment(struct kvm_vcpu *vcpu,
-                    struct kvm_segment *var, int seg)
-{
-       kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 {
        struct kvm_segment cs;
@@ -4312,7 +4720,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
-       struct descriptor_table dt;
+       struct desc_ptr dt;
 
        vcpu_load(vcpu);
 
@@ -4327,18 +4735,18 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
        kvm_x86_ops->get_idt(vcpu, &dt);
-       sregs->idt.limit = dt.limit;
-       sregs->idt.base = dt.base;
+       sregs->idt.limit = dt.size;
+       sregs->idt.base = dt.address;
        kvm_x86_ops->get_gdt(vcpu, &dt);
-       sregs->gdt.limit = dt.limit;
-       sregs->gdt.base = dt.base;
+       sregs->gdt.limit = dt.size;
+       sregs->gdt.base = dt.address;
 
-       sregs->cr0 = vcpu->arch.cr0;
+       sregs->cr0 = kvm_read_cr0(vcpu);
        sregs->cr2 = vcpu->arch.cr2;
        sregs->cr3 = vcpu->arch.cr3;
        sregs->cr4 = kvm_read_cr4(vcpu);
        sregs->cr8 = kvm_get_cr8(vcpu);
-       sregs->efer = vcpu->arch.shadow_efer;
+       sregs->efer = vcpu->arch.efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
 
        memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4370,439 +4778,32 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
-                       struct kvm_segment *var, int seg)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+                   bool has_error_code, u32 error_code)
 {
-       kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
-                                  struct kvm_segment *kvm_desct)
-{
-       kvm_desct->base = get_desc_base(seg_desc);
-       kvm_desct->limit = get_desc_limit(seg_desc);
-       if (seg_desc->g) {
-               kvm_desct->limit <<= 12;
-               kvm_desct->limit |= 0xfff;
-       }
-       kvm_desct->selector = selector;
-       kvm_desct->type = seg_desc->type;
-       kvm_desct->present = seg_desc->p;
-       kvm_desct->dpl = seg_desc->dpl;
-       kvm_desct->db = seg_desc->d;
-       kvm_desct->s = seg_desc->s;
-       kvm_desct->l = seg_desc->l;
-       kvm_desct->g = seg_desc->g;
-       kvm_desct->avl = seg_desc->avl;
-       if (!selector)
-               kvm_desct->unusable = 1;
-       else
-               kvm_desct->unusable = 0;
-       kvm_desct->padding = 0;
-}
-
-static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
-                                         u16 selector,
-                                         struct descriptor_table *dtable)
-{
-       if (selector & 1 << 2) {
-               struct kvm_segment kvm_seg;
-
-               kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
-               if (kvm_seg.unusable)
-                       dtable->limit = 0;
-               else
-                       dtable->limit = kvm_seg.limit;
-               dtable->base = kvm_seg.base;
-       }
-       else
-               kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8 bytes segments */
-static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-                                        struct desc_struct *seg_desc)
-{
-       struct descriptor_table dtable;
-       u16 index = selector >> 3;
-
-       get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-       if (dtable.limit < index * 8 + 7) {
-               kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
-               return 1;
-       }
-       return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
-}
-
-/* allowed just for 8 bytes segments */
-static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-                                        struct desc_struct *seg_desc)
-{
-       struct descriptor_table dtable;
-       u16 index = selector >> 3;
-
-       get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-       if (dtable.limit < index * 8 + 7)
-               return 1;
-       return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
-}
-
-static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
-                            struct desc_struct *seg_desc)
-{
-       u32 base_addr = get_desc_base(seg_desc);
-
-       return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
-}
-
-static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
-{
-       struct kvm_segment kvm_seg;
-
-       kvm_get_segment(vcpu, &kvm_seg, seg);
-       return kvm_seg.selector;
-}
-
-static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
-                                               u16 selector,
-                                               struct kvm_segment *kvm_seg)
-{
-       struct desc_struct seg_desc;
-
-       if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
-               return 1;
-       seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
-       return 0;
-}
-
-static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
-       struct kvm_segment segvar = {
-               .base = selector << 4,
-               .limit = 0xffff,
-               .selector = selector,
-               .type = 3,
-               .present = 1,
-               .dpl = 3,
-               .db = 0,
-               .s = 1,
-               .l = 0,
-               .g = 0,
-               .avl = 0,
-               .unusable = 0,
-       };
-       kvm_x86_ops->set_segment(vcpu, &segvar, seg);
-       return 0;
-}
-
-static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
-{
-       return (seg != VCPU_SREG_LDTR) &&
-               (seg != VCPU_SREG_TR) &&
-               (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
-}
-
-static void kvm_check_segment_descriptor(struct kvm_vcpu *vcpu, int seg,
-                                        u16 selector)
-{
-       /* NULL selector is not valid for CS and SS */
-       if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
-               if (!selector)
-                       kvm_queue_exception_e(vcpu, TS_VECTOR, selector >> 3);
-}
-
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-                               int type_bits, int seg)
-{
-       struct kvm_segment kvm_seg;
-
-       if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
-               return kvm_load_realmode_segment(vcpu, selector, seg);
-       if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
-               return 1;
-
-       kvm_check_segment_descriptor(vcpu, seg, selector);
-       kvm_seg.type |= type_bits;
-
-       if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
-           seg != VCPU_SREG_LDTR)
-               if (!kvm_seg.s)
-                       kvm_seg.unusable = 1;
-
-       kvm_set_segment(vcpu, &kvm_seg, seg);
-       return 0;
-}
-
-static void save_state_to_tss32(struct kvm_vcpu *vcpu,
-                               struct tss_segment_32 *tss)
-{
-       tss->cr3 = vcpu->arch.cr3;
-       tss->eip = kvm_rip_read(vcpu);
-       tss->eflags = kvm_get_rflags(vcpu);
-       tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-       tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-       tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-       tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-       tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-       tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-       tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
-       tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
-       tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-       tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-       tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-       tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-       tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
-       tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
-       tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss32(struct kvm_vcpu *vcpu,
-                                 struct tss_segment_32 *tss)
-{
-       kvm_set_cr3(vcpu, tss->cr3);
-
-       kvm_rip_write(vcpu, tss->eip);
-       kvm_set_rflags(vcpu, tss->eflags | 2);
-
-       kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
-       kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
-       kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
-       kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
-       kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
-       kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
-       kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
-       kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
-
-       if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
-               return 1;
-       return 0;
-}
-
-static void save_state_to_tss16(struct kvm_vcpu *vcpu,
-                               struct tss_segment_16 *tss)
-{
-       tss->ip = kvm_rip_read(vcpu);
-       tss->flag = kvm_get_rflags(vcpu);
-       tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-       tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-       tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-       tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-       tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-       tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-       tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
-       tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
-
-       tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-       tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-       tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-       tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-       tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss16(struct kvm_vcpu *vcpu,
-                                struct tss_segment_16 *tss)
-{
-       kvm_rip_write(vcpu, tss->ip);
-       kvm_set_rflags(vcpu, tss->flag | 2);
-       kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
-       kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
-       kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
-       kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
-       kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
-       kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
-       kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
-       kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
-
-       if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
-               return 1;
-
-       if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
-               return 1;
-       return 0;
-}
-
-static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
-                             u16 old_tss_sel, u32 old_tss_base,
-                             struct desc_struct *nseg_desc)
-{
-       struct tss_segment_16 tss_segment_16;
-       int ret = 0;
-
-       if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-                          sizeof tss_segment_16))
-               goto out;
-
-       save_state_to_tss16(vcpu, &tss_segment_16);
-
-       if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-                           sizeof tss_segment_16))
-               goto out;
-
-       if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
-                          &tss_segment_16, sizeof tss_segment_16))
-               goto out;
-
-       if (old_tss_sel != 0xffff) {
-               tss_segment_16.prev_task_link = old_tss_sel;
-
-               if (kvm_write_guest(vcpu->kvm,
-                                   get_tss_base_addr(vcpu, nseg_desc),
-                                   &tss_segment_16.prev_task_link,
-                                   sizeof tss_segment_16.prev_task_link))
-                       goto out;
-       }
-
-       if (load_state_from_tss16(vcpu, &tss_segment_16))
-               goto out;
-
-       ret = 1;
-out:
-       return ret;
-}
-
-static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
-                      u16 old_tss_sel, u32 old_tss_base,
-                      struct desc_struct *nseg_desc)
-{
-       struct tss_segment_32 tss_segment_32;
-       int ret = 0;
-
-       if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-                          sizeof tss_segment_32))
-               goto out;
-
-       save_state_to_tss32(vcpu, &tss_segment_32);
-
-       if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-                           sizeof tss_segment_32))
-               goto out;
-
-       if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
-                          &tss_segment_32, sizeof tss_segment_32))
-               goto out;
-
-       if (old_tss_sel != 0xffff) {
-               tss_segment_32.prev_task_link = old_tss_sel;
-
-               if (kvm_write_guest(vcpu->kvm,
-                                   get_tss_base_addr(vcpu, nseg_desc),
-                                   &tss_segment_32.prev_task_link,
-                                   sizeof tss_segment_32.prev_task_link))
-                       goto out;
-       }
-
-       if (load_state_from_tss32(vcpu, &tss_segment_32))
-               goto out;
-
-       ret = 1;
-out:
-       return ret;
-}
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
-{
-       struct kvm_segment tr_seg;
-       struct desc_struct cseg_desc;
-       struct desc_struct nseg_desc;
-       int ret = 0;
-       u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
-       u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
-
-       old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
-
-       /* FIXME: Handle errors. Failure to read either TSS or their
-        * descriptors should generate a pagefault.
-        */
-       if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
-               goto out;
-
-       if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
-               goto out;
-
-       if (reason != TASK_SWITCH_IRET) {
-               int cpl;
-
-               cpl = kvm_x86_ops->get_cpl(vcpu);
-               if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
-                       kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-                       return 1;
-               }
-       }
-
-       if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
-               kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
-               return 1;
-       }
-
-       if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
-               cseg_desc.type &= ~(1 << 1); //clear the B flag
-               save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
-       }
-
-       if (reason == TASK_SWITCH_IRET) {
-               u32 eflags = kvm_get_rflags(vcpu);
-               kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
-       }
+       int cs_db, cs_l, ret;
+       cache_all_regs(vcpu); /* presumably pulls every guest register into the vcpu cache before emulating -- confirm */
 
-       /* set back link to prev task only if NT bit is set in eflags
-          note that old_tss_sel is not used afetr this point */
-       if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
-               old_tss_sel = 0xffff;
+       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); /* cs_db/cs_l pick the protected-mode flavour below */
 
-       if (nseg_desc.type & 8)
-               ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
-                                        old_tss_base, &nseg_desc);
-       else
-               ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
-                                        old_tss_base, &nseg_desc);
+       vcpu->arch.emulate_ctxt.vcpu = vcpu; /* seed the emulator context from the current vcpu state */
+       vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+       vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+       vcpu->arch.emulate_ctxt.mode = /* first match wins: REAL, VM86, PROT64, PROT32, else PROT16 */
+               (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+               (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+               ? X86EMUL_MODE_VM86 : cs_l
+               ? X86EMUL_MODE_PROT64 : cs_db
+               ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
-       if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
-               u32 eflags = kvm_get_rflags(vcpu);
-               kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
-       }
+       ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+                                  tss_selector, reason, has_error_code,
+                                  error_code); /* has_error_code/error_code are handed to the emulator unchanged */
 
-       if (reason != TASK_SWITCH_IRET) {
-               nseg_desc.type |= (1 << 1);
-               save_guest_segment_descriptor(vcpu, tss_selector,
-                                             &nseg_desc);
-       }
+       if (ret == X86EMUL_CONTINUE) /* write the emulator's eflags back only on success */
+               kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
-       kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
-       seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
-       tr_seg.type = 11;
-       kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
-out:
-       return ret;
+       return (ret != X86EMUL_CONTINUE); /* 0 when the emulator returned X86EMUL_CONTINUE, 1 otherwise */
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
@@ -4811,15 +4812,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
        int mmu_reset_needed = 0;
        int pending_vec, max_bits;
-       struct descriptor_table dt;
+       struct desc_ptr dt;
 
        vcpu_load(vcpu);
 
-       dt.limit = sregs->idt.limit;
-       dt.base = sregs->idt.base;
+       dt.size = sregs->idt.limit;
+       dt.address = sregs->idt.base;
        kvm_x86_ops->set_idt(vcpu, &dt);
-       dt.limit = sregs->gdt.limit;
-       dt.base = sregs->gdt.base;
+       dt.size = sregs->gdt.limit;
+       dt.address = sregs->gdt.base;
        kvm_x86_ops->set_gdt(vcpu, &dt);
 
        vcpu->arch.cr2 = sregs->cr2;
@@ -4828,11 +4829,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
-       mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
+       mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
        kvm_x86_ops->set_efer(vcpu, sregs->efer);
        kvm_set_apic_base(vcpu, sregs->apic_base);
 
-       mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+       mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;
 
@@ -4871,7 +4872,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        /* Older userspace won't unhalt the vcpu on reset. */
        if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
-           !(vcpu->arch.cr0 & X86_CR0_PE))
+           !is_protmode(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
        vcpu_put(vcpu);
@@ -4918,11 +4919,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
        }
 
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-               vcpu->arch.singlestep_cs =
-                       get_segment_selector(vcpu, VCPU_SREG_CS);
-               vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
-       }
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+               vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+                       get_segment_base(vcpu, VCPU_SREG_CS);
 
        /*
         * Trigger an rflags update that will inject or remove the trace
@@ -4973,7 +4972,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 
        vcpu_load(vcpu);
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+       gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        tr->physical_address = gpa;
        tr->valid = gpa != UNMAPPED_GVA;
@@ -5055,14 +5054,14 @@ EXPORT_SYMBOL_GPL(fx_init);
 
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+       if (vcpu->guest_fpu_loaded) /* already loaded; fpu_active is not consulted in this function */
                return;
 
        vcpu->guest_fpu_loaded = 1;
        kvm_fx_save(&vcpu->arch.host_fx_image);
        kvm_fx_restore(&vcpu->arch.guest_fx_image);
+       trace_kvm_fpu(1); /* traced with 1 on load; kvm_put_guest_fpu() traces 0 */
 }
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
@@ -5073,8 +5072,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
        kvm_fx_save(&vcpu->arch.guest_fx_image);
        kvm_fx_restore(&vcpu->arch.host_fx_image);
        ++vcpu->stat.fpu_reload;
+       set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+       trace_kvm_fpu(0);
 }
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
@@ -5305,6 +5305,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                put_page(kvm->arch.apic_access_page);
        if (kvm->arch.ept_identity_pagetable)
                put_page(kvm->arch.ept_identity_pagetable);
+       cleanup_srcu_struct(&kvm->srcu);
        kfree(kvm->arch.aliases);
        kfree(kvm);
 }
@@ -5411,13 +5412,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
        return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+       unsigned long current_rip = kvm_rip_read(vcpu) + /* current rip ... */
+               get_segment_base(vcpu, VCPU_SREG_CS); /* ... plus the CS segment base */
+
+       return current_rip == linear_rip; /* true when rip + CS base equals the caller's linear_rip */
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
        unsigned long rflags;
 
        rflags = kvm_x86_ops->get_rflags(vcpu);
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+               rflags &= ~X86_EFLAGS_TF; /* mask TF out of the returned value while KVM_GUESTDBG_SINGLESTEP is set */
        return rflags;
 }
 EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5425,10 +5435,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
-           vcpu->arch.singlestep_cs ==
-                       get_segment_selector(vcpu, VCPU_SREG_CS) &&
-           vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
-               rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+           kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) /* rip + CS base still equals the recorded singlestep_rip */
+               rflags |= X86_EFLAGS_TF; /* force TF on before handing rflags to the backend */
        kvm_x86_ops->set_rflags(vcpu, rflags);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5444,3 +5452,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);