KVM: convert custom marker based tracing to event traces

[safe/jmp/linux-2.6] / arch / x86 / kvm / x86.c
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index a1f1461..892a7a6 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -36,11 +36,15 @@
  #include <linux/highmem.h>
  #include <linux/iommu.h>
  #include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
+#define CREATE_TRACE_POINTS
+#include "trace.h"
  
  #include <asm/uaccess.h>
  #include <asm/msr.h>
  #include <asm/desc.h>
  #include <asm/mtrr.h>
+#include <asm/mce.h>
  
  #define MAX_IO_MSRS 256
  #define CR0_RESERVED_BITS                                              \
@@ -54,6 +58,10 @@
                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  
  #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
  /* EFER defaults:
   * - enable syscall per default because its emulated by KVM
   * - enable LME and LMA per default on 64 bit KVM
@@ -90,7 +98,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
         { "hypercalls", VCPU_STAT(hypercalls) },
         { "request_irq", VCPU_STAT(request_irq_exits) },
-       { "request_nmi", VCPU_STAT(request_nmi_exits) },
         { "irq_exits", VCPU_STAT(irq_exits) },
         { "host_state_reload", VCPU_STAT(host_state_reload) },
         { "efer_reload", VCPU_STAT(efer_reload) },
@@ -107,7 +114,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "mmu_recycled", VM_STAT(mmu_recycled) },
         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
         { "mmu_unsync", VM_STAT(mmu_unsync) },
-       { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
         { "largepages", VM_STAT(lpages) },
         { NULL }
@@ -177,16 +183,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
         ++vcpu->stat.pf_guest;
  
         if (vcpu->arch.exception.pending) {
-               if (vcpu->arch.exception.nr == PF_VECTOR) {
-                       printk(KERN_DEBUG "kvm: inject_page_fault:"
-                                       " double fault 0x%lx\n", addr);
-                       vcpu->arch.exception.nr = DF_VECTOR;
-                       vcpu->arch.exception.error_code = 0;
-               } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+               switch(vcpu->arch.exception.nr) {
+               case DF_VECTOR:
                         /* triple fault -> shutdown */
                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       return;
+               case PF_VECTOR:
+                       vcpu->arch.exception.nr = DF_VECTOR;
+                       vcpu->arch.exception.error_code = 0;
+                       return;
+               default:
+                       /* replace previous exception with a new one in a hope
+                          that instruction re-execution will regenerate lost
+                          exception */
+                       vcpu->arch.exception.pending = false;
+                       break;
                 }
-               return;
         }
         vcpu->arch.cr2 = addr;
         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -233,7 +245,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
                 goto out;
         }
         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+               if (is_present_gpte(pdpte[i]) &&
+                   (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
                         ret = 0;
                         goto out;
                 }
@@ -241,6 +254,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
         ret = 1;
  
         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+       __set_bit(VCPU_EXREG_PDPTR,
+                 (unsigned long *)&vcpu->arch.regs_avail);
+       __set_bit(VCPU_EXREG_PDPTR,
+                 (unsigned long *)&vcpu->arch.regs_dirty);
  out:
  
         return ret;
@@ -256,6 +273,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
         if (is_long_mode(vcpu) || !is_pae(vcpu))
                 return false;
  
+       if (!test_bit(VCPU_EXREG_PDPTR,
+                     (unsigned long *)&vcpu->arch.regs_avail))
+               return true;
+
         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
         if (r < 0)
                 goto out;
@@ -320,7 +341,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         kvm_x86_ops->set_cr0(vcpu, cr0);
         vcpu->arch.cr0 = cr0;
  
-       kvm_mmu_sync_global(vcpu);
         kvm_mmu_reset_context(vcpu);
         return;
  }
@@ -329,14 +349,14 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
  void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
  {
         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
-       KVMTRACE_1D(LMSW, vcpu,
-                   (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
-                   handler);
  }
  EXPORT_SYMBOL_GPL(kvm_lmsw);
  
  void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  {
+       unsigned long old_cr4 = vcpu->arch.cr4;
+       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
         if (cr4 & CR4_RESERVED_BITS) {
                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
                 kvm_inject_gp(vcpu, 0);
@@ -350,7 +370,8 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                         kvm_inject_gp(vcpu, 0);
                         return;
                 }
-       } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
+       } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+                  && ((cr4 ^ old_cr4) & pdptr_bits)
                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
                 kvm_inject_gp(vcpu, 0);
@@ -364,8 +385,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
         }
         kvm_x86_ops->set_cr4(vcpu, cr4);
         vcpu->arch.cr4 = cr4;
-       vcpu->arch.mmu.base_role.cr4_pge = !!(cr4 & X86_CR4_PGE);
-       kvm_mmu_sync_global(vcpu);
+       vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
         kvm_mmu_reset_context(vcpu);
  }
  EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -464,7 +484,7 @@ static u32 msrs_to_save[] = {
  #ifdef CONFIG_X86_64
         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
  #endif
-       MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+       MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
         MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
  };
  
@@ -490,6 +510,17 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
                 return;
         }
  
+       if (efer & EFER_FFXSR) {
+               struct kvm_cpuid_entry2 *feat;
+
+               feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+               if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+                       printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
+                       kvm_inject_gp(vcpu, 0);
+                       return;
+               }
+       }
+
         if (efer & EFER_SVME) {
                 struct kvm_cpuid_entry2 *feat;
  
@@ -507,6 +538,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
         efer |= vcpu->arch.shadow_efer & EFER_LMA;
  
         vcpu->arch.shadow_efer = efer;
+
+       vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+       kvm_mmu_reset_context(vcpu);
  }
  
  void kvm_enable_efer_bits(u64 mask)
@@ -606,25 +640,29 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
                  hv_clock->tsc_to_system_mul);
  }
  
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+
  static void kvm_write_guest_time(struct kvm_vcpu *v)
  {
         struct timespec ts;
         unsigned long flags;
         struct kvm_vcpu_arch *vcpu = &v->arch;
         void *shared_kaddr;
+       unsigned long this_tsc_khz;
  
         if ((!vcpu->time_page))
                 return;
  
-       if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
-               kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
-               vcpu->hv_clock_tsc_khz = tsc_khz;
+       this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+       if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+               kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+               vcpu->hv_clock_tsc_khz = this_tsc_khz;
         }
+       put_cpu_var(cpu_tsc_khz);
  
         /* Keep irq disabled to prevent changes to the clock */
         local_irq_save(flags);
-       kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
-                         &vcpu->hv_clock.tsc_timestamp);
+       kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
         ktime_get_ts(&ts);
         local_irq_restore(flags);
  
@@ -649,6 +687,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
  }
  
+/*
+ * Flag KVM_REQ_KVMCLOCK_UPDATE in the vcpu's request bitmap, but only when
+ * the vcpu has a time page set up.
+ *
+ * Returns 1 if the request bit was set, 0 if there is no time page and
+ * nothing was flagged.
+ */
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+       int have_time_page = v->arch.time_page != NULL;
+
+       if (have_time_page)
+               set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+
+       return have_time_page;
+}
+
  static bool msr_mtrr_valid(unsigned msr)
  {
         switch (msr) {
@@ -673,11 +721,48 @@ static bool msr_mtrr_valid(unsigned msr)
         return false;
  }
  
+/* True for the memory type encodings accepted in a PAT entry: 0, 1, 4-7. */
+static bool valid_pat_type(unsigned t)
+{
+       switch (t) {
+       case 0:
+       case 1:
+       case 4 ... 7:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/* True for the memory type encodings accepted in an MTRR: 0, 1, 4-6. */
+static bool valid_mtrr_type(unsigned t)
+{
+       switch (t) {
+       case 0:
+       case 1:
+       case 4 ... 6:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/*
+ * Check whether @data may be written to the MTRR/PAT register @msr.
+ *
+ * The MSR index itself must pass msr_mtrr_valid().  After that:
+ *  - MSR_IA32_CR_PAT: each of the eight bytes must be a valid PAT type;
+ *  - MSR_MTRRdefType: no bits outside 0xcff may be set and the low byte
+ *    must be a valid MTRR type;
+ *  - fixed-range MTRRs (MSR_MTRRfix64K_00000..MSR_MTRRfix4K_F8000): each of
+ *    the eight bytes must be a valid MTRR type;
+ *  - anything else (variable MTRRs): the low byte must be a valid MTRR type.
+ *
+ * @vcpu is not used by the checks.
+ */
+static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       int byte;
+
+       if (!msr_mtrr_valid(msr))
+               return false;
+
+       if (msr == MSR_IA32_CR_PAT) {
+               /* walk the value one byte at a time, lowest byte first */
+               for (byte = 0; byte < 8; byte++, data >>= 8) {
+                       if (!valid_pat_type(data & 0xff))
+                               return false;
+               }
+               return true;
+       }
+
+       if (msr == MSR_MTRRdefType)
+               return !(data & ~0xcff) && valid_mtrr_type(data & 0xff);
+
+       if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+               for (byte = 0; byte < 8; byte++, data >>= 8) {
+                       if (!valid_mtrr_type(data & 0xff))
+                               return false;
+               }
+               return true;
+       }
+
+       /* variable MTRRs */
+       return valid_mtrr_type(data & 0xff);
+}
+
  static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
  {
         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
  
-       if (!msr_mtrr_valid(msr))
+       if (!mtrr_valid(vcpu, msr, data))
                 return 1;
  
         if (msr == MSR_MTRRdefType) {
@@ -710,23 +795,43 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
         return 0;
  }
  
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
  {
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
+
         switch (msr) {
-       case MSR_EFER:
-               set_efer(vcpu, data);
-               break;
-       case MSR_IA32_MC0_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-                      __func__, data);
-               break;
         case MSR_IA32_MCG_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-                       __func__, data);
+               vcpu->arch.mcg_status = data;
                 break;
         case MSR_IA32_MCG_CTL:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-                       __func__, data);
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               if (data != 0 && data != ~(u64)0)
+                       return -1;
+               vcpu->arch.mcg_ctl = data;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       /* only 0 or all 1s can be written to IA32_MCi_CTL */
+                       if ((offset & 0x3) == 0 &&
+                           data != 0 && data != ~(u64)0)
+                               return -1;
+                       vcpu->arch.mce_banks[offset] = data;
+                       break;
+               }
+               return 1;
+       }
+       return 0;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       switch (msr) {
+       case MSR_EFER:
+               set_efer(vcpu, data);
                 break;
         case MSR_IA32_DEBUGCTLMSR:
                 if (!data) {
@@ -779,9 +884,43 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                         vcpu->arch.time_page = NULL;
                 }
  
-               kvm_write_guest_time(vcpu);
+               kvm_request_guest_time_update(vcpu);
                 break;
         }
+       case MSR_IA32_MCG_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+               return set_msr_mce(vcpu, msr, data);
+
+       /* Performance counters are not protected by a CPUID bit,
+        * so we should check all of them in the generic path for the sake of
+        * cross vendor migration.
+        * Writing a zero into the event select MSRs disables them,
+        * which we perfectly emulate ;-). Any other value should be at least
+        * reported, some guests depend on them.
+        */
+       case MSR_P6_EVNTSEL0:
+       case MSR_P6_EVNTSEL1:
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               if (data != 0)
+                       pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                               "0x%x data 0x%llx\n", msr, data);
+               break;
+       /* at least RHEL 4 unconditionally writes to the perfctr registers,
+        * so we ignore writes to make it happy.
+        */
+       case MSR_P6_PERFCTR0:
+       case MSR_P6_PERFCTR1:
+       case MSR_K7_PERFCTR0:
+       case MSR_K7_PERFCTR1:
+       case MSR_K7_PERFCTR2:
+       case MSR_K7_PERFCTR3:
+               pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                       "0x%x data 0x%llx\n", msr, data);
+               break;
         default:
                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
                 return 1;
@@ -837,26 +976,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         return 0;
  }
  
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
  {
         u64 data;
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
  
         switch (msr) {
-       case 0xc0010010: /* SYSCFG */
-       case 0xc0010015: /* HWCR */
-       case MSR_IA32_PLATFORM_ID:
         case MSR_IA32_P5_MC_ADDR:
         case MSR_IA32_P5_MC_TYPE:
-       case MSR_IA32_MC0_CTL:
-       case MSR_IA32_MCG_STATUS:
+               data = 0;
+               break;
         case MSR_IA32_MCG_CAP:
+               data = vcpu->arch.mcg_cap;
+               break;
         case MSR_IA32_MCG_CTL:
-       case MSR_IA32_MC0_MISC:
-       case MSR_IA32_MC0_MISC+4:
-       case MSR_IA32_MC0_MISC+8:
-       case MSR_IA32_MC0_MISC+12:
-       case MSR_IA32_MC0_MISC+16:
-       case MSR_IA32_MC0_MISC+20:
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               data = vcpu->arch.mcg_ctl;
+               break;
+       case MSR_IA32_MCG_STATUS:
+               data = vcpu->arch.mcg_status;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       data = vcpu->arch.mce_banks[offset];
+                       break;
+               }
+               return 1;
+       }
+       *pdata = data;
+       return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       u64 data;
+
+       switch (msr) {
+       case MSR_IA32_PLATFORM_ID:
         case MSR_IA32_UCODE_REV:
         case MSR_IA32_EBL_CR_POWERON:
         case MSR_IA32_DEBUGCTLMSR:
@@ -864,7 +1024,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         case MSR_IA32_LASTBRANCHTOIP:
         case MSR_IA32_LASTINTFROMIP:
         case MSR_IA32_LASTINTTOIP:
+       case MSR_K8_SYSCFG:
+       case MSR_K7_HWCR:
         case MSR_VM_HSAVE_PA:
+       case MSR_P6_EVNTSEL0:
+       case MSR_P6_EVNTSEL1:
+       case MSR_K7_EVNTSEL0:
                 data = 0;
                 break;
         case MSR_MTRRcap:
@@ -896,6 +1061,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         case MSR_KVM_SYSTEM_TIME:
                 data = vcpu->arch.time;
                 break;
+       case MSR_IA32_P5_MC_ADDR:
+       case MSR_IA32_P5_MC_TYPE:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MCG_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+               return get_msr_mce(vcpu, msr, pdata);
         default:
                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
                 return 1;
@@ -989,11 +1161,16 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
         case KVM_CAP_SET_TSS_ADDR:
         case KVM_CAP_EXT_CPUID:
+       case KVM_CAP_CLOCKSOURCE:
         case KVM_CAP_PIT:
         case KVM_CAP_NOP_IO_DELAY:
         case KVM_CAP_MP_STATE:
         case KVM_CAP_SYNC_MMU:
         case KVM_CAP_REINJECT_CONTROL:
+       case KVM_CAP_IRQ_INJECT_STATUS:
+       case KVM_CAP_ASSIGN_DEV_IRQ:
+       case KVM_CAP_IRQFD:
+       case KVM_CAP_PIT2:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
@@ -1014,8 +1191,8 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_IOMMU:
                 r = iommu_found();
                 break;
-       case KVM_CAP_CLOCKSOURCE:
-               r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
+       case KVM_CAP_MCE:
+               r = KVM_MAX_MCE_BANKS;
                 break;
         default:
                 r = 0;
@@ -1045,14 +1222,13 @@ long kvm_arch_dev_ioctl(struct file *filp,
                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
                         goto out;
                 r = -E2BIG;
-               if (n < num_msrs_to_save)
+               if (n < msr_list.nmsrs)
                         goto out;
                 r = -EFAULT;
                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
                                  num_msrs_to_save * sizeof(u32)))
                         goto out;
-               if (copy_to_user(user_msr_list->indices
-                                + num_msrs_to_save * sizeof(u32),
+               if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
                                  &emulated_msrs,
                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
                         goto out;
@@ -1067,7 +1243,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
                         goto out;
                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
-                       cpuid_arg->entries);
+                                                     cpuid_arg->entries);
                 if (r)
                         goto out;
  
@@ -1077,6 +1253,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
                 r = 0;
                 break;
         }
+       case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+               u64 mce_cap;
+
+               mce_cap = KVM_MCE_CAP_SUPPORTED;
+               r = -EFAULT;
+               if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+                       goto out;
+               r = 0;
+               break;
+       }
         default:
                 r = -EINVAL;
         }
@@ -1087,7 +1273,7 @@ out:
  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
         kvm_x86_ops->vcpu_load(vcpu, cpu);
-       kvm_write_guest_time(vcpu);
+       kvm_request_guest_time_update(vcpu);
  }
  
  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1098,9 +1284,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
  
  static int is_efer_nx(void)
  {
-       u64 efer;
+       unsigned long long efer = 0;
  
-       rdmsrl(MSR_EFER, efer);
+       rdmsrl_safe(MSR_EFER, &efer);
         return efer & EFER_NX;
  }
  
@@ -1165,8 +1351,8 @@ out:
  }
  
  static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
-                                   struct kvm_cpuid2 *cpuid,
-                                   struct kvm_cpuid_entry2 __user *entries)
+                                    struct kvm_cpuid2 *cpuid,
+                                    struct kvm_cpuid_entry2 __user *entries)
  {
         int r;
  
@@ -1185,8 +1371,8 @@ out:
  }
  
  static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
-                                   struct kvm_cpuid2 *cpuid,
-                                   struct kvm_cpuid_entry2 __user *entries)
+                                    struct kvm_cpuid2 *cpuid,
+                                    struct kvm_cpuid_entry2 __user *entries)
  {
         int r;
  
@@ -1195,7 +1381,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
                 goto out;
         r = -EFAULT;
         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
-                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+                        vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
                 goto out;
         return 0;
  
@@ -1205,51 +1391,64 @@ out:
  }
  
  static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-                         u32 index)
+                          u32 index)
  {
         entry->function = function;
         entry->index = index;
         cpuid_count(entry->function, entry->index,
-               &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+                   &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
         entry->flags = 0;
  }
  
+#define F(x) bit(X86_FEATURE_##x)
+
  static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                          u32 index, int *nent, int maxnent)
  {
-       const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
-               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-               bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
-               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
-               bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
-               bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
-               bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
-       const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
-               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-               bit(X86_FEATURE_PGE) |
-               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
-               bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
-               bit(X86_FEATURE_SYSCALL) |
-               (bit(X86_FEATURE_NX) && is_efer_nx()) |
+       unsigned f_nx = is_efer_nx() ? F(NX) : 0;
  #ifdef CONFIG_X86_64
-               bit(X86_FEATURE_LM) |
+       unsigned f_lm = F(LM);
+#else
+       unsigned f_lm = 0;
  #endif
-               bit(X86_FEATURE_MMXEXT) |
-               bit(X86_FEATURE_3DNOWEXT) |
-               bit(X86_FEATURE_3DNOW);
-       const u32 kvm_supported_word3_x86_features =
-               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+
+       /* cpuid 1.edx */
+       const u32 kvm_supported_word0_x86_features =
+               F(FPU) | F(VME) | F(DE) | F(PSE) |
+               F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+               F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+               F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+               F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+               0 /* Reserved, DS, ACPI */ | F(MMX) |
+               F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+               0 /* HTT, TM, Reserved, PBE */;
+       /* cpuid 0x80000001.edx */
+       const u32 kvm_supported_word1_x86_features =
+               F(FPU) | F(VME) | F(DE) | F(PSE) |
+               F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+               F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+               F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+               F(PAT) | F(PSE36) | 0 /* Reserved */ |
+               f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+               F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+               0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+       /* cpuid 1.ecx */
+       const u32 kvm_supported_word4_x86_features =
+               F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+               0 /* DS-CPL, VMX, SMX, EST */ |
+               0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+               0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+               0 /* Reserved, DCA */ | F(XMM4_1) |
+               F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+               0 /* Reserved, XSAVE, OSXSAVE */;
+       /* cpuid 0x80000001.ecx */
         const u32 kvm_supported_word6_x86_features =
-               bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
-               bit(X86_FEATURE_SVM);
+               F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+               F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+               F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+               0 /* SKINIT */ | 0 /* WDT */;
  
-       /* all func 2 cpuid_count() should be called on the same cpu */
+       /* all calls to cpuid_count() should be made on the same cpu */
         get_cpu();
         do_cpuid_1_ent(entry, function, index);
         ++*nent;
@@ -1260,7 +1459,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 break;
         case 1:
                 entry->edx &= kvm_supported_word0_x86_features;
-               entry->ecx &= kvm_supported_word3_x86_features;
+               entry->ecx &= kvm_supported_word4_x86_features;
                 break;
         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
          * may return different values. This forces us to get_cpu() before
@@ -1322,8 +1521,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
         put_cpu();
  }
  
+#undef F
+
  static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-                                   struct kvm_cpuid_entry2 __user *entries)
+                                    struct kvm_cpuid_entry2 __user *entries)
  {
         struct kvm_cpuid_entry2 *cpuid_entries;
         int limit, nent = 0, r = -E2BIG;
@@ -1340,7 +1541,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
         limit = cpuid_entries[0].eax;
         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
-                               &nent, cpuid->nent);
+                            &nent, cpuid->nent);
         r = -E2BIG;
         if (nent >= cpuid->nent)
                 goto out_free;
@@ -1349,10 +1550,14 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
         limit = cpuid_entries[nent - 1].eax;
         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
-                              &nent, cpuid->nent);
+                            &nent, cpuid->nent);
+       r = -E2BIG;
+       if (nent >= cpuid->nent)
+               goto out_free;
+
         r = -EFAULT;
         if (copy_to_user(entries, cpuid_entries,
-                       nent * sizeof(struct kvm_cpuid_entry2)))
+                        nent * sizeof(struct kvm_cpuid_entry2)))
                 goto out_free;
         cpuid->nent = nent;
         r = 0;
@@ -1393,8 +1598,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
                 return -ENXIO;
         vcpu_load(vcpu);
  
-       set_bit(irq->irq, vcpu->arch.irq_pending);
-       set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+       kvm_queue_interrupt(vcpu, irq->irq, false);
  
         vcpu_put(vcpu);
  
@@ -1419,6 +1623,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
         return 0;
  }
  
+/*
+ * Set up machine-check emulation for @vcpu from the IA32_MCG_CAP image
+ * @mcg_cap passed in by userspace.
+ *
+ * The low byte of @mcg_cap is the number of MCE banks.  It comes straight
+ * from userspace and drives the mce_banks[] writes below, so it must be
+ * non-zero and must not exceed KVM_MAX_MCE_BANKS (the count reported for
+ * KVM_CAP_MCE).  Any bit outside KVM_MCE_CAP_SUPPORTED, the bank count
+ * (0xff) and 0xff0000 is rejected.
+ *
+ * NOTE(review): this assumes mce_banks[] holds at least
+ * 4 * KVM_MAX_MCE_BANKS entries; the allocation is not visible here, so
+ * confirm, and tighten the bound if the array is smaller.
+ *
+ * Returns 0 on success, or -EINVAL (leaving the vcpu untouched) when
+ * @mcg_cap is not acceptable.
+ */
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+                                       u64 mcg_cap)
+{
+       unsigned bank_num = mcg_cap & 0xff, bank;
+
+       /* an unchecked count would let the loop below write out of bounds */
+       if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
+               return -EINVAL;
+       if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+               return -EINVAL;
+
+       vcpu->arch.mcg_cap = mcg_cap;
+       /* Init IA32_MCG_CTL to all 1s */
+       if (mcg_cap & MCG_CTL_P)
+               vcpu->arch.mcg_ctl = ~(u64)0;
+       /* Init IA32_MCi_CTL to all 1s; each bank owns four slots */
+       for (bank = 0; bank < bank_num; bank++)
+               vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+       return 0;
+}
+
+/*
+ * Record a machine-check event supplied by userspace in the vcpu's emulated
+ * MCE bank registers and, for an uncorrected error, queue MC_VECTOR.
+ *
+ * Each bank occupies four consecutive mce_banks[] slots: slot 0 is the
+ * bank's IA32_MCi_CTL value, slot 1 its status word, slot 2 the address and
+ * slot 3 the misc value.
+ *
+ * Returns -EINVAL if @mce names a bank at or beyond the count held in the
+ * low byte of mcg_cap, or if its status lacks MCI_STATUS_VAL.  Returns 0
+ * otherwise, including when the event is dropped because reporting is
+ * disabled.  @mce->status may gain MCI_STATUS_OVER as a side effect.
+ */
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+                                     struct kvm_x86_mce *mce)
+{
+       u64 cap = vcpu->arch.mcg_cap;
+       unsigned nr_banks = cap & 0xff;
+       u64 *regs;
+
+       if (!(mce->status & MCI_STATUS_VAL) || mce->bank >= nr_banks)
+               return -EINVAL;
+
+       regs = vcpu->arch.mce_banks + 4 * mce->bank;
+
+       if (!(mce->status & MCI_STATUS_UC)) {
+               /*
+                * Corrected error: it must not displace a valid uncorrected
+                * error already logged in the bank; just note the overflow.
+                */
+               if ((regs[1] & MCI_STATUS_VAL) && (regs[1] & MCI_STATUS_UC)) {
+                       regs[1] |= MCI_STATUS_OVER;
+                       return 0;
+               }
+               if (regs[1] & MCI_STATUS_VAL)
+                       mce->status |= MCI_STATUS_OVER;
+               regs[2] = mce->addr;
+               regs[3] = mce->misc;
+               regs[1] = mce->status;
+               return 0;
+       }
+
+       /*
+        * Uncorrected error.  Reporting is disabled, and the event silently
+        * dropped, unless IA32_MCG_CTL (when present) and the bank's
+        * IA32_MCi_CTL are both all 1s.
+        */
+       if ((cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0)
+               return 0;
+       if (regs[0] != ~(u64)0)
+               return 0;
+
+       if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+           !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+               printk(KERN_DEBUG "kvm: set_mce: "
+                      "injects mce exception while "
+                      "previous one is in progress!\n");
+               set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+               return 0;
+       }
+
+       if (regs[1] & MCI_STATUS_VAL)
+               mce->status |= MCI_STATUS_OVER;
+       regs[2] = mce->addr;
+       regs[3] = mce->misc;
+       vcpu->arch.mcg_status = mce->mcg_status;
+       regs[1] = mce->status;
+       kvm_queue_exception(vcpu, MC_VECTOR);
+       return 0;
+}
+
  long kvm_arch_vcpu_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
  {
@@ -1496,7 +1774,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
                         goto out;
                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
-                               cpuid_arg->entries);
+                                             cpuid_arg->entries);
                 if (r)
                         goto out;
                 break;
@@ -1509,7 +1787,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
                         goto out;
                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
-                               cpuid_arg->entries);
+                                             cpuid_arg->entries);
                 if (r)
                         goto out;
                 r = -EFAULT;
@@ -1552,12 +1830,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
                 break;
         }
+       case KVM_X86_SETUP_MCE: {
+               u64 mcg_cap;
+
+               r = -EFAULT;
+               if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+               break;
+       }
+       case KVM_X86_SET_MCE: {
+               struct kvm_x86_mce mce;
+
+               r = -EFAULT;
+               if (copy_from_user(&mce, argp, sizeof mce))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+               break;
+       }
         default:
                 r = -EINVAL;
         }
  out:
-       if (lapic)
-               kfree(lapic);
+       kfree(lapic);
         return r;
  }
  
@@ -1578,10 +1873,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
                 return -EINVAL;
  
         down_write(&kvm->slots_lock);
+       spin_lock(&kvm->mmu_lock);
  
         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
  
+       spin_unlock(&kvm->mmu_lock);
         up_write(&kvm->slots_lock);
         return 0;
  }
@@ -1757,7 +2054,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
  
         /* If nothing is dirty, don't bother messing with page tables. */
         if (is_dirty) {
+               spin_lock(&kvm->mmu_lock);
                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
+               spin_unlock(&kvm->mmu_lock);
                 kvm_flush_remote_tlbs(kvm);
                 memslot = &kvm->memslots[log->slot];
                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -1783,6 +2082,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
         union {
                 struct kvm_pit_state ps;
                 struct kvm_memory_alias alias;
+               struct kvm_pit_config pit_config;
         } u;
  
         switch (ioctl) {
@@ -1835,13 +2135,34 @@ long kvm_arch_vm_ioctl(struct file *filp,
                         }
                 } else
                         goto out;
+               r = kvm_setup_default_irq_routing(kvm);
+               if (r) {
+                       kfree(kvm->arch.vpic);
+                       kfree(kvm->arch.vioapic);
+                       goto out;
+               }
                 break;
         case KVM_CREATE_PIT:
+               u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+               goto create_pit;
+       case KVM_CREATE_PIT2:
+               r = -EFAULT;
+               if (copy_from_user(&u.pit_config, argp,
+                                  sizeof(struct kvm_pit_config)))
+                       goto out;
+       create_pit:
+               mutex_lock(&kvm->lock);
+               r = -EEXIST;
+               if (kvm->arch.vpit)
+                       goto create_pit_unlock;
                 r = -ENOMEM;
-               kvm->arch.vpit = kvm_create_pit(kvm);
+               kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
                 if (kvm->arch.vpit)
                         r = 0;
+       create_pit_unlock:
+               mutex_unlock(&kvm->lock);
                 break;
+       case KVM_IRQ_LINE_STATUS:
         case KVM_IRQ_LINE: {
                 struct kvm_irq_level irq_event;
  
@@ -1849,10 +2170,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
                         goto out;
                 if (irqchip_in_kernel(kvm)) {
-                       mutex_lock(&kvm->lock);
-                       kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                   irq_event.irq, irq_event.level);
-                       mutex_unlock(&kvm->lock);
+                       __s32 status;
+                       mutex_lock(&kvm->irq_lock);
+                       status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+                                       irq_event.irq, irq_event.level);
+                       mutex_unlock(&kvm->irq_lock);
+                       if (ioctl == KVM_IRQ_LINE_STATUS) {
+                               irq_event.status = status;
+                               if (copy_to_user(argp, &irq_event,
+                                                       sizeof irq_event))
+                                       goto out;
+                       }
                         r = 0;
                 }
                 break;
@@ -1979,7 +2307,7 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
  
         if (vcpu->arch.apic) {
                 dev = &vcpu->arch.apic->dev;
-               if (dev->in_range(dev, addr, len, is_write))
+               if (kvm_iodevice_in_range(dev, addr, len, is_write))
                         return dev;
         }
         return NULL;
@@ -1999,8 +2327,8 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
         return dev;
  }
  
-int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                       struct kvm_vcpu *vcpu)
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+                              struct kvm_vcpu *vcpu)
  {
         void *data = val;
         int r = X86EMUL_CONTINUE;
@@ -2029,8 +2357,8 @@ out:
         return r;
  }
  
-int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                        struct kvm_vcpu *vcpu)
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+                               struct kvm_vcpu *vcpu)
  {
         void *data = val;
         int r = X86EMUL_CONTINUE;
@@ -2092,12 +2420,11 @@ mmio:
          */
         mutex_lock(&vcpu->kvm->lock);
         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
+       mutex_unlock(&vcpu->kvm->lock);
         if (mmio_dev) {
                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
-               mutex_unlock(&vcpu->kvm->lock);
                 return X86EMUL_CONTINUE;
         }
-       mutex_unlock(&vcpu->kvm->lock);
  
         vcpu->mmio_needed = 1;
         vcpu->mmio_phys_addr = gpa;
@@ -2147,12 +2474,11 @@ mmio:
          */
         mutex_lock(&vcpu->kvm->lock);
         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
+       mutex_unlock(&vcpu->kvm->lock);
         if (mmio_dev) {
                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
-               mutex_unlock(&vcpu->kvm->lock);
                 return X86EMUL_CONTINUE;
         }
-       mutex_unlock(&vcpu->kvm->lock);
  
         vcpu->mmio_needed = 1;
         vcpu->mmio_phys_addr = gpa;
@@ -2241,7 +2567,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
  
  int emulate_clts(struct kvm_vcpu *vcpu)
  {
-       KVMTRACE_0D(CLTS, vcpu, handler);
         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
         return X86EMUL_CONTINUE;
  }
@@ -2312,7 +2637,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                         u16 error_code,
                         int emulation_type)
  {
-       int r;
+       int r, shadow_mask;
         struct decode_cache *c;
  
         kvm_clear_exception_queue(vcpu);
@@ -2360,7 +2685,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                 }
         }
  
+       if (emulation_type & EMULTYPE_SKIP) {
+               kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+               return EMULATE_DONE;
+       }
+
         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+       shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+       if (r == 0)
+               kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
  
         if (vcpu->arch.pio.string)
                 return EMULATE_DO_MMIO;
@@ -2466,7 +2800,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
  {
         /* TODO: String I/O for in kernel device */
  
-       mutex_lock(&vcpu->kvm->lock);
         if (vcpu->arch.pio.in)
                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
                                   vcpu->arch.pio.size,
@@ -2475,7 +2808,6 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
                                    vcpu->arch.pio.size,
                                    pd);
-       mutex_unlock(&vcpu->kvm->lock);
  }
  
  static void pio_string_write(struct kvm_io_device *pio_dev,
@@ -2485,14 +2817,12 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
         void *pd = vcpu->arch.pio_data;
         int i;
  
-       mutex_lock(&vcpu->kvm->lock);
         for (i = 0; i < io->cur_count; i++) {
                 kvm_iodevice_write(pio_dev, io->port,
                                    io->size,
                                    pd);
                 pd += io->size;
         }
-       mutex_unlock(&vcpu->kvm->lock);
  }
  
  static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
@@ -2519,17 +2849,15 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
         vcpu->arch.pio.down = 0;
         vcpu->arch.pio.rep = 0;
  
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, 1);
  
         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
         memcpy(vcpu->arch.pio_data, &val, 4);
  
+       mutex_lock(&vcpu->kvm->lock);
         pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
+       mutex_unlock(&vcpu->kvm->lock);
         if (pio_dev) {
                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
                 complete_pio(vcpu);
@@ -2558,12 +2886,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
         vcpu->arch.pio.down = down;
         vcpu->arch.pio.rep = rep;
  
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, count);
  
         if (!count) {
                 kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2593,9 +2917,12 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
  
         vcpu->arch.pio.guest_gva = address;
  
+       mutex_lock(&vcpu->kvm->lock);
         pio_dev = vcpu_find_pio_dev(vcpu, port,
                                     vcpu->arch.pio.cur_count,
                                     !vcpu->arch.pio.in);
+       mutex_unlock(&vcpu->kvm->lock);
+
         if (!vcpu->arch.pio.in) {
                 /* string PIO write */
                 ret = pio_copy_data(vcpu);
@@ -2618,9 +2945,69 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
  }
  EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
  
+static void bounce_off(void *info)
+{
+       /* nothing: the IPI itself is what kicks the cpu out of guest context */
+}
+
+static unsigned int  ref_freq;         /* first freq->old seen by the notifier */
+static unsigned long tsc_khz_ref;      /* tsc_khz as sampled in kvm_arch_init() */
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+                                    void *data)
+{
+       struct cpufreq_freqs *freq = data;
+       struct kvm *kvm;
+       struct kvm_vcpu *vcpu;
+       int i, send_ipi = 0;
+
+       if (!ref_freq)
+               ref_freq = freq->old;   /* latched once, on the first notification */
+
+       if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+               return 0;       /* slowing down: rescale on POSTCHANGE instead */
+       if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+               return 0;       /* speeding up: already rescaled on PRECHANGE */
+       per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+
+       spin_lock(&kvm_lock);
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               kvm_for_each_vcpu(i, vcpu, kvm) {
+                       if (vcpu->cpu != freq->cpu)
+                               continue;
+                       if (!kvm_request_guest_time_update(vcpu))
+                               continue;
+                       if (vcpu->cpu != smp_processor_id())
+                               send_ipi++;     /* affected vcpu is on a remote cpu */
+               }
+       }
+       spin_unlock(&kvm_lock);
+
+       if (freq->old < freq->new && send_ipi) {
+               /*
+                * We upscale the frequency.  Must make sure the guest
+                * doesn't see old kvmclock values while running with
+                * the new frequency, otherwise we risk the guest seeing
+                * time go backwards.
+                *
+                * In case we update the frequency for another cpu
+                * (which might be in guest context) send an interrupt
+                * to kick the cpu out of guest context.  Next time
+                * guest context is entered kvmclock will be updated,
+                * so the guest will not see stale values.
+                */
+               smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+       }
+       return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+        .notifier_call  = kvmclock_cpufreq_notifier /* (un)registered in kvm_arch_init()/kvm_arch_exit() */
+};
+
  int kvm_arch_init(void *opaque)
  {
-       int r;
+       int r, cpu;
         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
  
         if (kvm_x86_ops) {
@@ -2650,7 +3037,16 @@ int kvm_arch_init(void *opaque)
         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+                       PT_DIRTY_MASK, PT64_NX_MASK, 0);
+
+       for_each_possible_cpu(cpu)
+               per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+               tsc_khz_ref = tsc_khz;
+               cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+                                         CPUFREQ_TRANSITION_NOTIFIER);
+       }
+
         return 0;
  
  out:
@@ -2659,6 +3055,9 @@ out:
  
  void kvm_arch_exit(void)
  {
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+               cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+                                           CPUFREQ_TRANSITION_NOTIFIER);
         kvm_x86_ops = NULL;
         kvm_mmu_module_exit();
  }
@@ -2666,7 +3065,6 @@ void kvm_arch_exit(void)
  int kvm_emulate_halt(struct kvm_vcpu *vcpu)
  {
         ++vcpu->stat.halt_exits;
-       KVMTRACE_0D(HLT, vcpu, handler);
         if (irqchip_in_kernel(vcpu->kvm)) {
                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
                 return 1;
@@ -2697,7 +3095,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
  
-       KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+       trace_kvm_hypercall(nr, a0, a1, a2, a3);
  
         if (!is_long_mode(vcpu)) {
                 nr &= 0xFFFFFFFF;
@@ -2797,8 +3195,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
                 return 0;
         }
-       KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
-                   (u32)((u64)value >> 32), handler);
  
         return value;
  }
@@ -2806,9 +3202,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
  void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
                      unsigned long *rflags)
  {
-       KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
-                   (u32)((u64)val >> 32), handler);
-
         switch (cr) {
         case 0:
                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -2858,7 +3251,7 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
                 return 0;
         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
-               !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+           !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
                 return 0;
         return 1;
  }
@@ -2886,10 +3279,19 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                         if (!best || e->function > best->function)
                                 best = e;
         }
-
         return best;
  }
  
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+       if (best)
+               return best->eax & 0xff;        /* EAX[7:0] of guest CPUID leaf 0x80000008 */
+       return 36;      /* fallback when the guest CPUID has no such leaf */
+}
+
  void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
  {
         u32 function, index;
@@ -2909,11 +3311,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
         }
         kvm_x86_ops->skip_emulated_instruction(vcpu);
-       KVMTRACE_5D(CPUID, vcpu, function,
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+       trace_kvm_cpuid(function,
+                       kvm_register_read(vcpu, VCPU_REGS_RAX),
+                       kvm_register_read(vcpu, VCPU_REGS_RBX),
+                       kvm_register_read(vcpu, VCPU_REGS_RCX),
+                       kvm_register_read(vcpu, VCPU_REGS_RDX));
  }
  EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
  
@@ -2926,10 +3328,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
                                           struct kvm_run *kvm_run)
  {
-       return (!vcpu->arch.irq_summary &&
+       return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
                 kvm_run->request_interrupt_window &&
-               vcpu->arch.interrupt_window_open &&
-               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+               kvm_arch_interrupt_allowed(vcpu));
  }
  
  static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -2942,8 +3343,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
                 kvm_run->ready_for_interrupt_injection = 1;
         else
                 kvm_run->ready_for_interrupt_injection =
-                                       (vcpu->arch.interrupt_window_open &&
-                                        vcpu->arch.irq_summary == 0);
+                       kvm_arch_interrupt_allowed(vcpu) &&
+                       !kvm_cpu_has_interrupt(vcpu) &&
+                       !kvm_event_needs_reinjection(vcpu);
  }
  
  static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -2972,9 +3374,60 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
         up_read(&vcpu->kvm->slots_lock);
  }
  
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+       int max_irr, tpr;
+
+       if (!kvm_x86_ops->update_cr8_intercept)
+               return;         /* backend provides no such hook */
+
+       if (!vcpu->arch.apic->vapic_addr)       /* caller gates on kvm_lapic_enabled() */
+               max_irr = kvm_lapic_find_highest_irr(vcpu);
+       else
+               max_irr = -1;   /* vapic_addr set: skip the irr lookup */
+
+       if (max_irr != -1)
+               max_irr >>= 4;  /* presumably vector -> priority class, like tpr */
+
+       tpr = kvm_lapic_get_cr8(vcpu);
+
+       kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       /* try to reinject previous events if any; NMI first, then irq */
+       if (vcpu->arch.nmi_injected) {
+               kvm_x86_ops->set_nmi(vcpu);
+               return;
+       }
+
+       if (vcpu->arch.interrupt.pending) {
+               kvm_x86_ops->set_irq(vcpu);
+               return;
+       }
+
+       /* try to inject new event if pending; a pending NMI shadows irqs */
+       if (vcpu->arch.nmi_pending) {
+               if (kvm_x86_ops->nmi_allowed(vcpu)) {
+                       vcpu->arch.nmi_pending = false;
+                       vcpu->arch.nmi_injected = true;
+                       kvm_x86_ops->set_nmi(vcpu);
+               }
+       } else if (kvm_cpu_has_interrupt(vcpu)) {       /* only when no NMI is pending */
+               if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+                       kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+                                           false);
+                       kvm_x86_ops->set_irq(vcpu);
+               }
+       }
+}
+
  static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
  {
         int r;
+       bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+               kvm_run->request_interrupt_window;
  
         if (vcpu->requests)
                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -2987,6 +3440,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         if (vcpu->requests) {
                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
                         __kvm_migrate_timers(vcpu);
+               if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+                       kvm_write_guest_time(vcpu);
                 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
                         kvm_mmu_sync_roots(vcpu);
                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
@@ -3004,9 +3459,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 }
         }
  
-       clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-       kvm_inject_pending_timer_irqs(vcpu);
-
         preempt_disable();
  
         kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3014,6 +3466,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
  
         local_irq_disable();
  
+       clear_bit(KVM_REQ_KICK, &vcpu->requests);
+       smp_mb__after_clear_bit();
+
         if (vcpu->requests || need_resched() || signal_pending(current)) {
                 local_irq_enable();
                 preempt_enable();
@@ -3021,21 +3476,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 goto out;
         }
  
-       vcpu->guest_mode = 1;
-       /*
-        * Make sure that guest_mode assignment won't happen after
-        * testing the pending IRQ vector bitmap.
-        */
-       smp_wmb();
-
         if (vcpu->arch.exception.pending)
                 __queue_exception(vcpu);
-       else if (irqchip_in_kernel(vcpu->kvm))
-               kvm_x86_ops->inject_pending_irq(vcpu);
         else
-               kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+               inject_pending_irq(vcpu, kvm_run);
+
+       /* enable NMI/IRQ window open exits if needed */
+       if (vcpu->arch.nmi_pending)
+               kvm_x86_ops->enable_nmi_window(vcpu);
+       else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+               kvm_x86_ops->enable_irq_window(vcpu);
  
-       kvm_lapic_sync_to_vapic(vcpu);
+       if (kvm_lapic_enabled(vcpu)) {
+               update_cr8_intercept(vcpu);
+               kvm_lapic_sync_to_vapic(vcpu);
+       }
  
         up_read(&vcpu->kvm->slots_lock);
  
@@ -3056,7 +3511,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 set_debugreg(vcpu->arch.eff_db[3], 3);
         }
  
-       KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+       trace_kvm_entry(vcpu->vcpu_id);
         kvm_x86_ops->run(vcpu, kvm_run);
  
         if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -3069,7 +3524,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         set_debugreg(vcpu->arch.host_dr6, 6);
         set_debugreg(vcpu->arch.host_dr7, 7);
  
-       vcpu->guest_mode = 0;
+       set_bit(KVM_REQ_KICK, &vcpu->requests);
         local_irq_enable();
  
         ++vcpu->stat.exits;
@@ -3096,8 +3551,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 profile_hit(KVM_PROFILING, (void *)rip);
         }
  
-       if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
-               vcpu->arch.exception.pending = false;
  
         kvm_lapic_sync_from_vapic(vcpu);
  
@@ -3106,6 +3559,7 @@ out:
         return r;
  }
  
+
  static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
  {
         int r;
@@ -3132,29 +3586,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                         kvm_vcpu_block(vcpu);
                         down_read(&vcpu->kvm->slots_lock);
                         if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
-                               if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+                       {
+                               switch(vcpu->arch.mp_state) {
+                               case KVM_MP_STATE_HALTED:
                                         vcpu->arch.mp_state =
-                                                       KVM_MP_STATE_RUNNABLE;
-                       if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
-                               r = -EINTR;
+                                               KVM_MP_STATE_RUNNABLE;
+                               case KVM_MP_STATE_RUNNABLE:
+                                       break;
+                               case KVM_MP_STATE_SIPI_RECEIVED:
+                               default:
+                                       r = -EINTR;
+                                       break;
+                               }
+                       }
                 }
  
-               if (r > 0) {
-                       if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-                               r = -EINTR;
-                               kvm_run->exit_reason = KVM_EXIT_INTR;
-                               ++vcpu->stat.request_irq_exits;
-                       }
-                       if (signal_pending(current)) {
-                               r = -EINTR;
-                               kvm_run->exit_reason = KVM_EXIT_INTR;
-                               ++vcpu->stat.signal_exits;
-                       }
-                       if (need_resched()) {
-                               up_read(&vcpu->kvm->slots_lock);
-                               kvm_resched(vcpu);
-                               down_read(&vcpu->kvm->slots_lock);
-                       }
+               if (r <= 0)
+                       break;
+
+               clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+               if (kvm_cpu_has_pending_timer(vcpu))
+                       kvm_inject_pending_timer_irqs(vcpu);
+
+               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+                       r = -EINTR;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       ++vcpu->stat.request_irq_exits;
+               }
+               if (signal_pending(current)) {
+                       r = -EINTR;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       ++vcpu->stat.signal_exits;
+               }
+               if (need_resched()) {
+                       up_read(&vcpu->kvm->slots_lock);
+                       kvm_resched(vcpu);
+                       down_read(&vcpu->kvm->slots_lock);
                 }
         }
  
@@ -3318,7 +3785,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
  {
         struct descriptor_table dt;
-       int pending_vec;
  
         vcpu_load(vcpu);
  
@@ -3348,16 +3814,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
         sregs->efer = vcpu->arch.shadow_efer;
         sregs->apic_base = kvm_get_apic_base(vcpu);
  
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               memset(sregs->interrupt_bitmap, 0,
-                      sizeof sregs->interrupt_bitmap);
-               pending_vec = kvm_x86_ops->get_irq(vcpu);
-               if (pending_vec >= 0)
-                       set_bit(pending_vec,
-                               (unsigned long *)sregs->interrupt_bitmap);
-       } else
-               memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
-                      sizeof sregs->interrupt_bitmap);
+       memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+       if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+               set_bit(vcpu->arch.interrupt.nr,
+                       (unsigned long *)sregs->interrupt_bitmap);
  
         vcpu_put(vcpu);
  
@@ -3564,7 +4025,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-       tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
  }
  
  static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3661,8 +4121,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
  }
  
  static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
-                      u32 old_tss_base,
-                      struct desc_struct *nseg_desc)
+                             u16 old_tss_sel, u32 old_tss_base,
+                             struct desc_struct *nseg_desc)
  {
         struct tss_segment_16 tss_segment_16;
         int ret = 0;
@@ -3681,6 +4141,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
                            &tss_segment_16, sizeof tss_segment_16))
                 goto out;
  
+       if (old_tss_sel != 0xffff) {
+               tss_segment_16.prev_task_link = old_tss_sel;
+
+               if (kvm_write_guest(vcpu->kvm,
+                                   get_tss_base_addr(vcpu, nseg_desc),
+                                   &tss_segment_16.prev_task_link,
+                                   sizeof tss_segment_16.prev_task_link))
+                       goto out;
+       }
+
         if (load_state_from_tss16(vcpu, &tss_segment_16))
                 goto out;
  
@@ -3690,7 +4160,7 @@ out:
  }
  
  static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
-                      u32 old_tss_base,
+                      u16 old_tss_sel, u32 old_tss_base,
                        struct desc_struct *nseg_desc)
  {
         struct tss_segment_32 tss_segment_32;
@@ -3710,6 +4180,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
                            &tss_segment_32, sizeof tss_segment_32))
                 goto out;
  
+       if (old_tss_sel != 0xffff) {
+               tss_segment_32.prev_task_link = old_tss_sel;
+
+               if (kvm_write_guest(vcpu->kvm,
+                                   get_tss_base_addr(vcpu, nseg_desc),
+                                   &tss_segment_32.prev_task_link,
+                                   sizeof tss_segment_32.prev_task_link))
+                       goto out;
+       }
+
         if (load_state_from_tss32(vcpu, &tss_segment_32))
                 goto out;
  
@@ -3763,14 +4243,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
                 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
         }
  
-       kvm_x86_ops->skip_emulated_instruction(vcpu);
+       /* set back link to prev task only if NT bit is set in eflags;
+          note that old_tss_sel is not used after this point */
+       if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+               old_tss_sel = 0xffff;
+
+       /* set back link to prev task only if NT bit is set in eflags;
+          note that old_tss_sel is not used after this point */
+       if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+               old_tss_sel = 0xffff;
  
         if (nseg_desc.type & 8)
-               ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
-                                        &nseg_desc);
+               ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+                                        old_tss_base, &nseg_desc);
         else
-               ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
-                                        &nseg_desc);
+               ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+                                        old_tss_base, &nseg_desc);
  
         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
@@ -3796,7 +4284,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
  {
         int mmu_reset_needed = 0;
-       int i, pending_vec, max_bits;
+       int pending_vec, max_bits;
         struct descriptor_table dt;
  
         vcpu_load(vcpu);
@@ -3810,7 +4298,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
  
         vcpu->arch.cr2 = sregs->cr2;
         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-       vcpu->arch.cr3 = sregs->cr3;
+
+       down_read(&vcpu->kvm->slots_lock);
+       if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
+               vcpu->arch.cr3 = sregs->cr3;
+       else
+               set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+       up_read(&vcpu->kvm->slots_lock);
  
         kvm_set_cr8(vcpu, sregs->cr8);
  
@@ -3832,25 +4326,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
         if (mmu_reset_needed)
                 kvm_mmu_reset_context(vcpu);
  
-       if (!irqchip_in_kernel(vcpu->kvm)) {
-               memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
-                      sizeof vcpu->arch.irq_pending);
-               vcpu->arch.irq_summary = 0;
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
-                       if (vcpu->arch.irq_pending[i])
-                               __set_bit(i, &vcpu->arch.irq_summary);
-       } else {
-               max_bits = (sizeof sregs->interrupt_bitmap) << 3;
-               pending_vec = find_first_bit(
-                       (const unsigned long *)sregs->interrupt_bitmap,
-                       max_bits);
-               /* Only pending external irq is handled here */
-               if (pending_vec < max_bits) {
-                       kvm_x86_ops->set_irq(vcpu, pending_vec);
-                       pr_debug("Set back pending irq %d\n",
-                                pending_vec);
-               }
-               kvm_pic_clear_isr_ack(vcpu->kvm);
+       max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+       pending_vec = find_first_bit(
+               (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+       if (pending_vec < max_bits) {
+               kvm_queue_interrupt(vcpu, pending_vec, false);
+               pr_debug("Set back pending irq %d\n", pending_vec);
+               if (irqchip_in_kernel(vcpu->kvm))
+                       kvm_pic_clear_isr_ack(vcpu->kvm);
         }
  
         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3864,7 +4347,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
  
         /* Older userspace won't unhalt the vcpu on reset. */
-       if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+       if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
             !(vcpu->arch.cr0 & X86_CR0_PE))
                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4042,6 +4525,11 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
  
  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
  {
+       if (vcpu->arch.time_page) {
+               kvm_release_page_dirty(vcpu->arch.time_page);
+               vcpu->arch.time_page = NULL;
+       }
+
         kvm_x86_ops->vcpu_free(vcpu);
  }
  
@@ -4130,7 +4618,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
         kvm = vcpu->kvm;
  
         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+       if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
         else
                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -4152,6 +4640,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                         goto fail_mmu_destroy;
         }
  
+       vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+                                      GFP_KERNEL);
+       if (!vcpu->arch.mce_banks) {
+               r = -ENOMEM;
+               goto fail_mmu_destroy;
+       }
+       vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
         return 0;
  
  fail_mmu_destroy:
@@ -4179,7 +4675,6 @@ struct  kvm *kvm_arch_create_vm(void)
                 return ERR_PTR(-ENOMEM);
  
         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-       INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
  
         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4200,20 +4695,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
  static void kvm_free_vcpus(struct kvm *kvm)
  {
         unsigned int i;
+       struct kvm_vcpu *vcpu;
  
         /*
          * Unpin any mmu pages first.
          */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_arch_vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_unload_vcpu_mmu(vcpu);
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arch_vcpu_free(vcpu);
  
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+               kvm->vcpus[i] = NULL;
+
+       atomic_set(&kvm->online_vcpus, 0);
+       mutex_unlock(&kvm->lock);
  }
  
  void kvm_arch_sync_events(struct kvm *kvm)
@@ -4282,12 +4779,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                 }
         }
  
+       spin_lock(&kvm->mmu_lock);
         if (!kvm->arch.n_requested_mmu_pages) {
                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
         }
  
         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+       spin_unlock(&kvm->mmu_lock);
         kvm_flush_remote_tlbs(kvm);
  
         return 0;
@@ -4296,6 +4795,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
  void kvm_arch_flush_shadow(struct kvm *kvm)
  {
         kvm_mmu_zap_all(kvm);
+       kvm_reload_remote_mmus(kvm);
  }
  
  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -4305,28 +4805,30 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
                || vcpu->arch.nmi_pending;
  }
  
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-       struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-       printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
  void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
  {
-       int ipi_pcpu = vcpu->cpu;
-       int cpu = get_cpu();
+       int me;
+       int cpu = vcpu->cpu;
  
         if (waitqueue_active(&vcpu->wq)) {
                 wake_up_interruptible(&vcpu->wq);
                 ++vcpu->stat.halt_wakeup;
         }
-       /*
-        * We may be called synchronously with irqs disabled in guest mode,
-        * So need not to call smp_call_function_single() in that case.
-        */
-       if (vcpu->guest_mode && vcpu->cpu != cpu)
-               smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+
+       me = get_cpu();
+       if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+               if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+                       smp_send_reschedule(cpu);
         put_cpu();
  }
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+       return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);