Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

[safe/jmp/linux-2.6] / arch / x86 / kvm / x86.c
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index b2f91b9..dd9bc8f 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -39,6 +39,8 @@
  #include <linux/cpufreq.h>
  #include <linux/user-return-notifier.h>
  #include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
  #include <trace/events/kvm.h>
  #undef TRACE_INCLUDE_FILE
  #define CREATE_TRACE_POINTS
@@ -432,8 +434,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
  
  #ifdef CONFIG_X86_64
         if (cr0 & 0xffffffff00000000UL) {
-               printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
-                      cr0, kvm_read_cr0(vcpu));
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
@@ -442,14 +442,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         cr0 &= ~CR0_RESERVED_BITS;
  
         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-               printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
  
         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-               printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
-                      "and a clear PE flag\n");
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
@@ -460,15 +457,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                         int cs_db, cs_l;
  
                         if (!is_pae(vcpu)) {
-                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
-                                      "in long mode while PAE is disabled\n");
                                 kvm_inject_gp(vcpu, 0);
                                 return;
                         }
                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
                         if (cs_l) {
-                               printk(KERN_DEBUG "set_cr0: #GP, start paging "
-                                      "in long mode while CS.L == 1\n");
                                 kvm_inject_gp(vcpu, 0);
                                 return;
  
@@ -476,8 +469,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                 } else
  #endif
                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-                       printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
-                              "reserved bits\n");
                         kvm_inject_gp(vcpu, 0);
                         return;
                 }
@@ -504,28 +495,23 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
  
         if (cr4 & CR4_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
  
         if (is_long_mode(vcpu)) {
                 if (!(cr4 & X86_CR4_PAE)) {
-                       printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
-                              "in long mode\n");
                         kvm_inject_gp(vcpu, 0);
                         return;
                 }
         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
                    && ((cr4 ^ old_cr4) & pdptr_bits)
                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-               printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
  
         if (cr4 & X86_CR4_VMXE) {
-               printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
@@ -546,21 +532,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  
         if (is_long_mode(vcpu)) {
                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-                       printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
                         kvm_inject_gp(vcpu, 0);
                         return;
                 }
         } else {
                 if (is_pae(vcpu)) {
                         if (cr3 & CR3_PAE_RESERVED_BITS) {
-                               printk(KERN_DEBUG
-                                      "set_cr3: #GP, reserved bits\n");
                                 kvm_inject_gp(vcpu, 0);
                                 return;
                         }
                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-                               printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
-                                      "reserved bits\n");
                                 kvm_inject_gp(vcpu, 0);
                                 return;
                         }
@@ -592,7 +573,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
  void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
  {
         if (cr8 & CR8_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
@@ -648,15 +628,12 @@ static u32 emulated_msrs[] = {
  static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
  {
         if (efer & efer_reserved_bits) {
-               printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
-                      efer);
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
  
         if (is_paging(vcpu)
             && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
-               printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
                 kvm_inject_gp(vcpu, 0);
                 return;
         }
@@ -666,7 +643,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
  
                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
-                       printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
                         kvm_inject_gp(vcpu, 0);
                         return;
                 }
@@ -677,7 +653,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
  
                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
-                       printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
                         kvm_inject_gp(vcpu, 0);
                         return;
                 }
@@ -966,9 +941,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                 if (msr >= MSR_IA32_MC0_CTL &&
                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
                         u32 offset = msr - MSR_IA32_MC0_CTL;
-                       /* only 0 or all 1s can be written to IA32_MCi_CTL */
+                       /* Only 0 or all 1s can be written to IA32_MCi_CTL.
+                        * Some Linux kernels, though, clear bit 10 in bank 4 to
+                        * work around a BIOS/GART TBL issue on AMD K8s; ignore
+                        * this to avoid an uncaught #GP in the guest.
+                        */
                         if ((offset & 0x3) == 0 &&
-                           data != 0 && data != ~(u64)0)
+                           data != 0 && (data | (1 << 10)) != ~(u64)0)
                                 return -1;
                         vcpu->arch.mce_banks[offset] = data;
                         break;
@@ -1570,6 +1549,7 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_HYPERV_VAPIC:
         case KVM_CAP_HYPERV_SPIN:
         case KVM_CAP_PCI_SEGMENT:
+       case KVM_CAP_X86_ROBUST_SINGLESTEP:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
@@ -1733,6 +1713,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
         if (copy_from_user(cpuid_entries, entries,
                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
                 goto out_free;
+       vcpu_load(vcpu);
         for (i = 0; i < cpuid->nent; i++) {
                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1750,6 +1731,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
         r = 0;
         kvm_apic_set_version(vcpu);
         kvm_x86_ops->cpuid_update(vcpu);
+       vcpu_put(vcpu);
  
  out_free:
         vfree(cpuid_entries);
@@ -1770,9 +1752,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
                 goto out;
+       vcpu_load(vcpu);
         vcpu->arch.cpuid_nent = cpuid->nent;
         kvm_apic_set_version(vcpu);
         kvm_x86_ops->cpuid_update(vcpu);
+       vcpu_put(vcpu);
         return 0;
  
  out:
@@ -2542,18 +2526,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
         r = 0;
         switch (chip->chip_id) {
         case KVM_IRQCHIP_PIC_MASTER:
-               spin_lock(&pic_irqchip(kvm)->lock);
+               raw_spin_lock(&pic_irqchip(kvm)->lock);
                 memcpy(&pic_irqchip(kvm)->pics[0],
                         &chip->chip.pic,
                         sizeof(struct kvm_pic_state));
-               spin_unlock(&pic_irqchip(kvm)->lock);
+               raw_spin_unlock(&pic_irqchip(kvm)->lock);
                 break;
         case KVM_IRQCHIP_PIC_SLAVE:
-               spin_lock(&pic_irqchip(kvm)->lock);
+               raw_spin_lock(&pic_irqchip(kvm)->lock);
                 memcpy(&pic_irqchip(kvm)->pics[1],
                         &chip->chip.pic,
                         sizeof(struct kvm_pic_state));
-               spin_unlock(&pic_irqchip(kvm)->lock);
+               raw_spin_unlock(&pic_irqchip(kvm)->lock);
                 break;
         case KVM_IRQCHIP_IOAPIC:
                 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2633,8 +2617,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                                       struct kvm_dirty_log *log)
  {
-       int r, n, i;
+       int r, i;
         struct kvm_memory_slot *memslot;
+       unsigned long n;
         unsigned long is_dirty = 0;
         unsigned long *dirty_bitmap = NULL;
  
@@ -2649,7 +2634,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
         if (!memslot->dirty_bitmap)
                 goto out;
  
-       n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+       n = kvm_dirty_bitmap_bytes(memslot);
  
         r = -ENOMEM;
         dirty_bitmap = vmalloc(n);
@@ -3039,14 +3024,41 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
         return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
  }
  
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                              struct kvm_vcpu *vcpu)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       access |= PFERR_FETCH_MASK;
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       access |= PFERR_WRITE_MASK;
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+/* use this to access any of the guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+       return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+                                     struct kvm_vcpu *vcpu, u32 access,
+                                     u32 *error)
  {
         void *data = val;
         int r = X86EMUL_CONTINUE;
  
         while (bytes) {
-               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
                 unsigned offset = addr & (PAGE_SIZE-1);
                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
                 int ret;
@@ -3069,14 +3081,37 @@ out:
         return r;
  }
  
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
+                               struct kvm_vcpu *vcpu, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
+                                         access | PFERR_FETCH_MASK, error);
+}
+
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+                              struct kvm_vcpu *vcpu, u32 *error)
+{
+       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+                                         error);
+}
+
+static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+                              struct kvm_vcpu *vcpu, u32 *error)
+{
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
+
  static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-                               struct kvm_vcpu *vcpu)
+                               struct kvm_vcpu *vcpu, u32 *error)
  {
         void *data = val;
         int r = X86EMUL_CONTINUE;
  
         while (bytes) {
-               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
                 unsigned offset = addr & (PAGE_SIZE-1);
                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
                 int ret;
@@ -3106,6 +3141,7 @@ static int emulator_read_emulated(unsigned long addr,
                                   struct kvm_vcpu *vcpu)
  {
         gpa_t                 gpa;
+       u32 error_code;
  
         if (vcpu->mmio_read_completed) {
                 memcpy(val, vcpu->mmio_data, bytes);
@@ -3115,17 +3151,20 @@ static int emulator_read_emulated(unsigned long addr,
                 return X86EMUL_CONTINUE;
         }
  
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+
+       if (gpa == UNMAPPED_GVA) {
+               kvm_inject_page_fault(vcpu, addr, error_code);
+               return X86EMUL_PROPAGATE_FAULT;
+       }
  
         /* For APIC access vmexit */
         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                 goto mmio;
  
-       if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+       if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
                                 == X86EMUL_CONTINUE)
                 return X86EMUL_CONTINUE;
-       if (gpa == UNMAPPED_GVA)
-               return X86EMUL_PROPAGATE_FAULT;
  
  mmio:
         /*
@@ -3164,11 +3203,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                                            struct kvm_vcpu *vcpu)
  {
         gpa_t                 gpa;
+       u32 error_code;
  
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
  
         if (gpa == UNMAPPED_GVA) {
-               kvm_inject_page_fault(vcpu, addr, 2);
+               kvm_inject_page_fault(vcpu, addr, error_code);
                 return X86EMUL_PROPAGATE_FAULT;
         }
  
@@ -3232,7 +3272,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                 char *kaddr;
                 u64 val;
  
-               gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
  
                 if (gpa == UNMAPPED_GVA ||
                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3297,7 +3337,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
  
         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
  
-       kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
+       kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
  
         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -3305,7 +3345,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
  EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
  
  static struct x86_emulate_ops emulate_ops = {
-       .read_std            = kvm_read_guest_virt,
+       .read_std            = kvm_read_guest_virt_system,
+       .fetch               = kvm_fetch_guest_virt,
         .read_emulated       = emulator_read_emulated,
         .write_emulated      = emulator_write_emulated,
         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
@@ -3348,8 +3389,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
                 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
                 vcpu->arch.emulate_ctxt.mode =
+                       (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-                       ? X86EMUL_MODE_REAL : cs_l
+                       ? X86EMUL_MODE_VM86 : cs_l
                         ? X86EMUL_MODE_PROT64 : cs_db
                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
  
@@ -3441,12 +3483,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
         gva_t q = vcpu->arch.pio.guest_gva;
         unsigned bytes;
         int ret;
+       u32 error_code;
  
         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
         if (vcpu->arch.pio.in)
-               ret = kvm_write_guest_virt(q, p, bytes, vcpu);
+               ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
         else
-               ret = kvm_read_guest_virt(q, p, bytes, vcpu);
+               ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
+
+       if (ret == X86EMUL_PROPAGATE_FAULT)
+               kvm_inject_page_fault(vcpu, q, error_code);
+
         return ret;
  }
  
@@ -3467,7 +3514,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
                 if (io->in) {
                         r = pio_copy_data(vcpu);
                         if (r)
-                               return r;
+                               goto out;
                 }
  
                 delta = 1;
@@ -3494,7 +3541,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
                 }
         }
-
+out:
         io->count -= io->cur_count;
         io->cur_count = 0;
  
@@ -3537,6 +3584,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
  {
         unsigned long val;
  
+       trace_kvm_pio(!in, port, size, 1);
+
         vcpu->run->exit_reason = KVM_EXIT_IO;
         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
         vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3548,9 +3597,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
         vcpu->arch.pio.down = 0;
         vcpu->arch.pio.rep = 0;
  
-       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
-                     size, 1);
-
         if (!vcpu->arch.pio.in) {
                 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
                 memcpy(vcpu->arch.pio_data, &val, 4);
@@ -3571,6 +3617,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
         unsigned now, in_page;
         int ret = 0;
  
+       trace_kvm_pio(!in, port, size, count);
+
         vcpu->run->exit_reason = KVM_EXIT_IO;
         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
         vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3582,9 +3630,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
         vcpu->arch.pio.down = down;
         vcpu->arch.pio.rep = rep;
  
-       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
-                     size, count);
-
         if (!count) {
                 kvm_x86_ops->skip_emulated_instruction(vcpu);
                 return 1;
@@ -3616,10 +3661,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
         if (!vcpu->arch.pio.in) {
                 /* string PIO write */
                 ret = pio_copy_data(vcpu);
-               if (ret == X86EMUL_PROPAGATE_FAULT) {
-                       kvm_inject_gp(vcpu, 0);
+               if (ret == X86EMUL_PROPAGATE_FAULT)
                         return 1;
-               }
                 if (ret == 0 && !pio_string_write(vcpu)) {
                         complete_pio(vcpu);
                         if (vcpu->arch.pio.count == 0)
@@ -3705,6 +3748,51 @@ static void kvm_timer_init(void)
         }
  }
  
+static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static int kvm_is_in_guest(void)
+{
+       return percpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+       int user_mode = 3;
+
+       if (percpu_read(current_vcpu))
+               user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+
+       return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+       unsigned long ip = 0;
+
+       if (percpu_read(current_vcpu))
+               ip = kvm_rip_read(percpu_read(current_vcpu));
+
+       return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+       .is_in_guest            = kvm_is_in_guest,
+       .is_user_mode           = kvm_is_user_mode,
+       .get_guest_ip           = kvm_get_guest_ip,
+};
+
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
+{
+       percpu_write(current_vcpu, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
+
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
+{
+       percpu_write(current_vcpu, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
+
  int kvm_arch_init(void *opaque)
  {
         int r;
@@ -3741,6 +3829,8 @@ int kvm_arch_init(void *opaque)
  
         kvm_timer_init();
  
+       perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
         return 0;
  
  out:
@@ -3749,6 +3839,8 @@ out:
  
  void kvm_arch_exit(void)
  {
+       perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
                                             CPUFREQ_TRANSITION_NOTIFIER);
@@ -4423,7 +4515,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 kvm_set_cr8(vcpu, kvm_run->cr8);
  
         if (vcpu->arch.pio.cur_count) {
+               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
                 r = complete_pio(vcpu);
+               srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
                 if (r)
                         goto out;
         }
@@ -4655,6 +4749,9 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
  {
         struct descriptor_table dtable;
         u16 index = selector >> 3;
+       int ret;
+       u32 err;
+       gva_t addr;
  
         get_segment_descriptor_dtable(vcpu, selector, &dtable);
  
@@ -4662,7 +4759,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
                 return X86EMUL_PROPAGATE_FAULT;
         }
-       return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+       addr = dtable.base + index * 8;
+       ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
+                                        vcpu,  &err);
+       if (ret == X86EMUL_PROPAGATE_FAULT)
+               kvm_inject_page_fault(vcpu, addr, err);
+
+       return ret;
  }
  
  /* allowed just for 8 bytes segments */
@@ -4676,15 +4779,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
  
         if (dtable.limit < index * 8 + 7)
                 return 1;
-       return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+       return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+}
+
+static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
+                              struct desc_struct *seg_desc)
+{
+       u32 base_addr = get_desc_base(seg_desc);
+
+       return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
  }
  
-static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
+static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
                              struct desc_struct *seg_desc)
  {
         u32 base_addr = get_desc_base(seg_desc);
  
-       return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
+       return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
  }
  
  static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4712,7 +4823,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
                 .unusable = 0,
         };
         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
-       return 0;
+       return X86EMUL_CONTINUE;
  }
  
  static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
@@ -4722,43 +4833,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
                 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
  }
  
-static void kvm_check_segment_descriptor(struct kvm_vcpu *vcpu, int seg,
-                                        u16 selector)
-{
-       /* NULL selector is not valid for CS and SS */
-       if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
-               if (!selector)
-                       kvm_queue_exception_e(vcpu, TS_VECTOR, selector >> 3);
-}
-
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-                               int type_bits, int seg)
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
  {
         struct kvm_segment kvm_seg;
         struct desc_struct seg_desc;
+       u8 dpl, rpl, cpl;
+       unsigned err_vec = GP_VECTOR;
+       u32 err_code = 0;
+       bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+       int ret;
  
         if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
                 return kvm_load_realmode_segment(vcpu, selector, seg);
  
-       if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
-               return 1;
+       /* NULL selector is not valid for TR, CS and SS */
+       if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+           && null_selector)
+               goto exception;
+
+       /* TR should be in GDT only */
+       if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+               goto exception;
+
+       ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
+       if (ret)
+               return ret;
+
         seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
  
-       kvm_check_segment_descriptor(vcpu, seg, selector);
-       kvm_seg.type |= type_bits;
+       if (null_selector) { /* for NULL selector skip all following checks */
+               kvm_seg.unusable = 1;
+               goto load;
+       }
+
+       err_code = selector & 0xfffc;
+       err_vec = GP_VECTOR;
  
-       if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
-           seg != VCPU_SREG_LDTR)
-               if (!kvm_seg.s)
-                       kvm_seg.unusable = 1;
+       /* can't load system descriptor into segment selector */
+       if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+               goto exception;
  
-       kvm_set_segment(vcpu, &kvm_seg, seg);
-       if (selector && !kvm_seg.unusable && kvm_seg.s) {
+       if (!kvm_seg.present) {
+               err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+               goto exception;
+       }
+
+       rpl = selector & 3;
+       dpl = kvm_seg.dpl;
+       cpl = kvm_x86_ops->get_cpl(vcpu);
+
+       switch (seg) {
+       case VCPU_SREG_SS:
+               /*
+                * segment is not a writable data segment or segment
+                * selector's RPL != CPL or segment descriptor's DPL != CPL
+                */
+               if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+                       goto exception;
+               break;
+       case VCPU_SREG_CS:
+               if (!(kvm_seg.type & 8))
+                       goto exception;
+
+               if (kvm_seg.type & 4) {
+                       /* conforming */
+                       if (dpl > cpl)
+                               goto exception;
+               } else {
+                       /* nonconforming */
+                       if (rpl > cpl || dpl != cpl)
+                               goto exception;
+               }
+               /* CS(RPL) <- CPL */
+               selector = (selector & 0xfffc) | cpl;
+            break;
+       case VCPU_SREG_TR:
+               if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+                       goto exception;
+               break;
+       case VCPU_SREG_LDTR:
+               if (kvm_seg.s || kvm_seg.type != 2)
+                       goto exception;
+               break;
+       default: /*  DS, ES, FS, or GS */
+               /*
+                * segment is not a data or readable code segment or
+                * ((segment is a data or nonconforming code segment)
+                * and (both RPL and CPL > DPL))
+                */
+               if ((kvm_seg.type & 0xa) == 0x8 ||
+                   (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+                       goto exception;
+               break;
+       }
+
+       if (!kvm_seg.unusable && kvm_seg.s) {
                 /* mark segment as accessed */
+               kvm_seg.type |= 1;
                 seg_desc.type |= 1;
                 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
         }
-       return 0;
+load:
+       kvm_set_segment(vcpu, &kvm_seg, seg);
+       return X86EMUL_CONTINUE;
+exception:
+       kvm_queue_exception_e(vcpu, err_vec, err_code);
+       return X86EMUL_PROPAGATE_FAULT;
  }
  
  static void save_state_to_tss32(struct kvm_vcpu *vcpu,
@@ -4784,6 +4964,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
  }
  
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
+{
+       struct kvm_segment kvm_seg;
+       kvm_get_segment(vcpu, &kvm_seg, seg);
+       kvm_seg.selector = sel;
+       kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
  static int load_state_from_tss32(struct kvm_vcpu *vcpu,
                                   struct tss_segment_32 *tss)
  {
@@ -4801,25 +4989,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
  
-       if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
+       /*
+        * SDM says that segment selectors are loaded before segment
+        * descriptors
+        */
+       kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+       kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+       kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+       kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+       kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+       kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+       kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+       /*
+        * Now load segment descriptors. If a fault happens at this stage
+        * it is handled in the context of the new task
+        */
+       if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+       if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+       if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+       if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+       if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
+       if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
+       if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
                 return 1;
         return 0;
  }
@@ -4859,19 +5063,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
  
-       if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
+       /*
+        * SDM says that segment selectors are loaded before segment
+        * descriptors
+        */
+       kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+       kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+       kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+       kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+       kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+       /*
+        * Now load segment descriptors. If a fault happens at this stage
+        * it is handled in the context of the new task
+        */
+       if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+       if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+       if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+       if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
                 return 1;
  
-       if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+       if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
                 return 1;
         return 0;
  }
@@ -4893,7 +5111,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
                             sizeof tss_segment_16))
                 goto out;
  
-       if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+       if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
                            &tss_segment_16, sizeof tss_segment_16))
                 goto out;
  
@@ -4901,7 +5119,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
                 tss_segment_16.prev_task_link = old_tss_sel;
  
                 if (kvm_write_guest(vcpu->kvm,
-                                   get_tss_base_addr(vcpu, nseg_desc),
+                                   get_tss_base_addr_write(vcpu, nseg_desc),
                                     &tss_segment_16.prev_task_link,
                                     sizeof tss_segment_16.prev_task_link))
                         goto out;
@@ -4932,7 +5150,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
                             sizeof tss_segment_32))
                 goto out;
  
-       if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+       if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
                            &tss_segment_32, sizeof tss_segment_32))
                 goto out;
  
@@ -4940,7 +5158,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
                 tss_segment_32.prev_task_link = old_tss_sel;
  
                 if (kvm_write_guest(vcpu->kvm,
-                                   get_tss_base_addr(vcpu, nseg_desc),
+                                   get_tss_base_addr_write(vcpu, nseg_desc),
                                     &tss_segment_32.prev_task_link,
                                     sizeof tss_segment_32.prev_task_link))
                         goto out;
@@ -4962,8 +5180,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
         int ret = 0;
         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+       u32 desc_limit;
  
-       old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
+       old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
  
         /* FIXME: Handle errors. Failure to read either TSS or their
          * descriptors should generate a pagefault.
@@ -4984,7 +5203,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
                 }
         }
  
-       if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
+       desc_limit = get_desc_limit(&nseg_desc);
+       if (!nseg_desc.p ||
+           ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
+            desc_limit < 0x2b)) {
                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
                 return 1;
         }
@@ -5198,7 +5420,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
  
         vcpu_load(vcpu);
         idx = srcu_read_lock(&vcpu->kvm->srcu);
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+       gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
         srcu_read_unlock(&vcpu->kvm->srcu, idx);
         tr->physical_address = gpa;
         tr->valid = gpa != UNMAPPED_GVA;