KVM: VMX: initialize TSC offset relative to vm creation time
[safe/jmp/linux-2.6] / arch / x86 / kvm / vmx.c
index 2180109..3312047 100644 (file)
@@ -16,7 +16,6 @@
  */
 
 #include "irq.h"
-#include "vmx.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
 
 #include <asm/io.h>
 #include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -127,7 +128,7 @@ static struct vmcs_config {
        u32 vmentry_ctrl;
 } vmcs_config;
 
-struct vmx_capability {
+static struct vmx_capability {
        u32 ept;
        u32 vpid;
 } vmx_capability;
@@ -188,21 +189,21 @@ static inline int is_page_fault(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 }
 
 static inline int is_no_device(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 }
 
 static inline int is_invalid_opcode(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 }
 
 static inline int is_external_interrupt(u32 intr_info)
@@ -479,8 +480,13 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
        if (!vcpu->fpu_active)
                eb |= 1u << NM_VECTOR;
-       if (vcpu->guest_debug.enabled)
-               eb |= 1u << DB_VECTOR;
+       if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+               if (vcpu->guest_debug &
+                   (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       eb |= 1u << DB_VECTOR;
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       eb |= 1u << BP_VECTOR;
+       }
        if (vcpu->arch.rmode.active)
                eb = ~0;
        if (vm_need_ept())
@@ -746,29 +752,33 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                                bool has_error_code, u32 error_code)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-       if (has_error_code)
+       if (has_error_code) {
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+               intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+       }
 
        if (vcpu->arch.rmode.active) {
                vmx->rmode.irq.pending = true;
                vmx->rmode.irq.vector = nr;
                vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-               if (nr == BP_VECTOR)
+               if (nr == BP_VECTOR || nr == OF_VECTOR)
                        vmx->rmode.irq.rip++;
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                            nr | INTR_TYPE_SOFT_INTR
-                            | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
-                            | INTR_INFO_VALID_MASK);
+               intr_info |= INTR_TYPE_SOFT_INTR;
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
                kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
                return;
        }
 
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                    nr | INTR_TYPE_EXCEPTION
-                    | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
-                    | INTR_INFO_VALID_MASK);
+       if (nr == BP_VECTOR || nr == OF_VECTOR) {
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+               intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+       } else
+               intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 }
 
 static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
@@ -855,11 +865,8 @@ static u64 guest_read_tsc(void)
  * writes 'guest_tsc' into guest's timestamp counter "register"
  * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
  */
-static void guest_write_tsc(u64 guest_tsc)
+static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 {
-       u64 host_tsc;
-
-       rdtscll(host_tsc);
        vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
 }
 
@@ -902,6 +909,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
        default:
+               vmx_load_host_state(to_vmx(vcpu));
                msr = find_msr_entry(to_vmx(vcpu), msr_index);
                if (msr) {
                        data = msr->data;
@@ -923,6 +931,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_msr_entry *msr;
+       u64 host_tsc;
        int ret = 0;
 
        switch (msr_index) {
@@ -948,7 +957,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_TIME_STAMP_COUNTER:
-               guest_write_tsc(data);
+               rdtscll(host_tsc);
+               guest_write_tsc(data, host_tsc);
                break;
        case MSR_P6_PERFCTR0:
        case MSR_P6_PERFCTR1:
@@ -962,6 +972,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
 
                break;
+       case MSR_IA32_CR_PAT:
+               if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+                       vmcs_write64(GUEST_IA32_PAT, data);
+                       vcpu->arch.pat = data;
+                       break;
+               }
+               /* Otherwise falls through to kvm_set_msr_common */
        default:
                vmx_load_host_state(vmx);
                msr = find_msr_entry(vmx, msr_index);
@@ -990,40 +1007,28 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
        }
 }
 
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-       unsigned long dr7 = 0x400;
-       int old_singlestep;
-
-       old_singlestep = vcpu->guest_debug.singlestep;
-
-       vcpu->guest_debug.enabled = dbg->enabled;
-       if (vcpu->guest_debug.enabled) {
-               int i;
-
-               dr7 |= 0x200;  /* exact */
-               for (i = 0; i < 4; ++i) {
-                       if (!dbg->breakpoints[i].enabled)
-                               continue;
-                       vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
-                       dr7 |= 2 << (i*2);    /* global enable */
-                       dr7 |= 0 << (i*4+16); /* execution breakpoint */
-               }
+       int old_debug = vcpu->guest_debug;
+       unsigned long flags;
 
-               vcpu->guest_debug.singlestep = dbg->singlestep;
-       } else
-               vcpu->guest_debug.singlestep = 0;
+       vcpu->guest_debug = dbg->control;
+       if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+               vcpu->guest_debug = 0;
 
-       if (old_singlestep && !vcpu->guest_debug.singlestep) {
-               unsigned long flags;
+       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+               vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
+       else
+               vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
 
-               flags = vmcs_readl(GUEST_RFLAGS);
+       flags = vmcs_readl(GUEST_RFLAGS);
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+               flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+       else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
                flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-               vmcs_writel(GUEST_RFLAGS, flags);
-       }
+       vmcs_writel(GUEST_RFLAGS, flags);
 
        update_exception_bitmap(vcpu);
-       vmcs_writel(GUEST_DR7, dr7);
 
        return 0;
 }
@@ -1037,8 +1042,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
 
 static __init int cpu_has_kvm_support(void)
 {
-       unsigned long ecx = cpuid_ecx(1);
-       return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+       return cpu_has_vmx();
 }
 
 static __init int vmx_disabled_by_bios(void)
@@ -1084,13 +1088,22 @@ static void vmclear_local_vcpus(void)
                __vcpu_clear(vmx);
 }
 
-static void hardware_disable(void *garbage)
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
 {
-       vmclear_local_vcpus();
        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
        write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
+static void hardware_disable(void *garbage)
+{
+       vmclear_local_vcpus();
+       kvm_cpu_vmxoff();
+}
+
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
                                      u32 msr, u32 *result)
 {
@@ -1181,12 +1194,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-       opt = 0;
+       opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
 
-       min = opt = 0;
+       min = 0;
+       opt = VM_ENTRY_LOAD_IA32_PAT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control) < 0)
                return -EIO;
@@ -2092,8 +2106,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
  */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
-       u32 host_sysenter_cs;
+       u32 host_sysenter_cs, msr_low, msr_high;
        u32 junk;
+       u64 host_pat, tsc_this, tsc_base;
        unsigned long a;
        struct descriptor_table dt;
        int i;
@@ -2181,6 +2196,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        rdmsrl(MSR_IA32_SYSENTER_EIP, a);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
 
+       if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+               rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+               host_pat = msr_low | ((u64) msr_high << 32);
+               vmcs_write64(HOST_IA32_PAT, host_pat);
+       }
+       if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+               rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+               host_pat = msr_low | ((u64) msr_high << 32);
+               /* Write the default value follow host pat */
+               vmcs_write64(GUEST_IA32_PAT, host_pat);
+               /* Keep arch.pat sync with GUEST_IA32_PAT */
+               vmx->vcpu.arch.pat = host_pat;
+       }
+
        for (i = 0; i < NR_VMX_MSR; ++i) {
                u32 index = vmx_msr_index[i];
                u32 data_low, data_high;
@@ -2207,6 +2236,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
        vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
 
+       tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
+       rdtscll(tsc_this);
+       if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
+               tsc_base = tsc_this;
+
+       guest_write_tsc(0, tsc_base);
 
        return 0;
 }
@@ -2286,7 +2321,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                kvm_rip_write(vcpu, 0);
        kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
-       /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
        vmcs_writel(GUEST_DR7, 0x400);
 
        vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -2299,8 +2333,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
-       guest_write_tsc(0);
-
        /* Special registers */
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 
@@ -2453,8 +2485,15 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 {
        vmx_update_window_states(vcpu);
 
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+               vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+                               GUEST_INTR_STATE_STI |
+                               GUEST_INTR_STATE_MOV_SS);
+
        if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-               if (vcpu->arch.nmi_window_open) {
+               if (vcpu->arch.interrupt.pending) {
+                       enable_nmi_window(vcpu);
+               } else if (vcpu->arch.nmi_window_open) {
                        vcpu->arch.nmi_pending = false;
                        vcpu->arch.nmi_injected = true;
                } else {
@@ -2464,15 +2503,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
        }
        if (vcpu->arch.nmi_injected) {
                vmx_inject_nmi(vcpu);
-               if (vcpu->arch.nmi_pending || kvm_run->request_nmi_window)
+               if (vcpu->arch.nmi_pending)
                        enable_nmi_window(vcpu);
                else if (vcpu->arch.irq_summary
                         || kvm_run->request_interrupt_window)
                        enable_irq_window(vcpu);
                return;
        }
-       if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
-               enable_nmi_window(vcpu);
 
        if (vcpu->arch.interrupt_window_open) {
                if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
@@ -2490,7 +2527,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
        int ret;
        struct kvm_userspace_memory_region tss_mem = {
-               .slot = 8,
+               .slot = TSS_PRIVATE_MEMSLOT,
                .guest_phys_addr = addr,
                .memory_size = PAGE_SIZE * 3,
                .flags = 0,
@@ -2503,24 +2540,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
        return 0;
 }
 
-static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
-{
-       struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-
-       set_debugreg(dbg->bp[0], 0);
-       set_debugreg(dbg->bp[1], 1);
-       set_debugreg(dbg->bp[2], 2);
-       set_debugreg(dbg->bp[3], 3);
-
-       if (dbg->singlestep) {
-               unsigned long flags;
-
-               flags = vmcs_readl(GUEST_RFLAGS);
-               flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-               vmcs_writel(GUEST_RFLAGS, flags);
-       }
-}
-
 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
                                  int vec, u32 err_code)
 {
@@ -2537,9 +2556,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         *        the required debugging infrastructure rework.
         */
        switch (vec) {
-       case DE_VECTOR:
        case DB_VECTOR:
+               if (vcpu->guest_debug &
+                   (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return 0;
+               kvm_queue_exception(vcpu, vec);
+               return 1;
        case BP_VECTOR:
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return 0;
+               /* fall through */
+       case DE_VECTOR:
        case OF_VECTOR:
        case BR_VECTOR:
        case UD_VECTOR:
@@ -2556,8 +2583,8 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 intr_info, error_code;
-       unsigned long cr2, rip;
+       u32 intr_info, ex_no, error_code;
+       unsigned long cr2, rip, dr6;
        u32 vect_info;
        enum emulation_result er;
 
@@ -2616,14 +2643,30 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                return 1;
        }
 
-       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
-           (INTR_TYPE_EXCEPTION | 1)) {
+       ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+       switch (ex_no) {
+       case DB_VECTOR:
+               dr6 = vmcs_readl(EXIT_QUALIFICATION);
+               if (!(vcpu->guest_debug &
+                     (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+                       vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
+                       kvm_queue_exception(vcpu, DB_VECTOR);
+                       return 1;
+               }
+               kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+               kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+               /* fall through */
+       case BP_VECTOR:
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
-               return 0;
+               kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+               kvm_run->debug.arch.exception = ex_no;
+               break;
+       default:
+               kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+               kvm_run->ex.exception = ex_no;
+               kvm_run->ex.error_code = error_code;
+               break;
        }
-       kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
-       kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
-       kvm_run->ex.error_code = error_code;
        return 0;
 }
 
@@ -2664,6 +2707,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        rep = (exit_qualification & 32) != 0;
        port = exit_qualification >> 16;
 
+       skip_emulated_instruction(vcpu);
        return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
 }
 
@@ -2761,21 +2805,44 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        unsigned long val;
        int dr, reg;
 
-       /*
-        * FIXME: this code assumes the host is debugging the guest.
-        *        need to deal with guest debugging itself too.
-        */
+       dr = vmcs_readl(GUEST_DR7);
+       if (dr & DR7_GD) {
+               /*
+                * As the vm-exit takes precedence over the debug trap, we
+                * need to emulate the latter, either for the host or the
+                * guest debugging itself.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+                       kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
+                       kvm_run->debug.arch.dr7 = dr;
+                       kvm_run->debug.arch.pc =
+                               vmcs_readl(GUEST_CS_BASE) +
+                               vmcs_readl(GUEST_RIP);
+                       kvm_run->debug.arch.exception = DB_VECTOR;
+                       kvm_run->exit_reason = KVM_EXIT_DEBUG;
+                       return 0;
+               } else {
+                       vcpu->arch.dr7 &= ~DR7_GD;
+                       vcpu->arch.dr6 |= DR6_BD;
+                       vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+                       kvm_queue_exception(vcpu, DB_VECTOR);
+                       return 1;
+               }
+       }
+
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-       dr = exit_qualification & 7;
-       reg = (exit_qualification >> 8) & 15;
-       if (exit_qualification & 16) {
-               /* mov from dr */
+       dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+       reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+       if (exit_qualification & TYPE_MOV_FROM_DR) {
                switch (dr) {
+               case 0 ... 3:
+                       val = vcpu->arch.db[dr];
+                       break;
                case 6:
-                       val = 0xffff0ff0;
+                       val = vcpu->arch.dr6;
                        break;
                case 7:
-                       val = 0x400;
+                       val = vcpu->arch.dr7;
                        break;
                default:
                        val = 0;
@@ -2783,7 +2850,38 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                kvm_register_write(vcpu, reg, val);
                KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
        } else {
-               /* mov to dr */
+               val = vcpu->arch.regs[reg];
+               switch (dr) {
+               case 0 ... 3:
+                       vcpu->arch.db[dr] = val;
+                       if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+                               vcpu->arch.eff_db[dr] = val;
+                       break;
+               case 4 ... 5:
+                       if (vcpu->arch.cr4 & X86_CR4_DE)
+                               kvm_queue_exception(vcpu, UD_VECTOR);
+                       break;
+               case 6:
+                       if (val & 0xffffffff00000000ULL) {
+                               kvm_queue_exception(vcpu, GP_VECTOR);
+                               break;
+                       }
+                       vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+                       break;
+               case 7:
+                       if (val & 0xffffffff00000000ULL) {
+                               kvm_queue_exception(vcpu, GP_VECTOR);
+                               break;
+                       }
+                       vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+                       if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+                               vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+                               vcpu->arch.switch_db_regs =
+                                       (val & DR7_BP_EN_MASK);
+                       }
+                       break;
+               }
+               KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
        }
        skip_emulated_instruction(vcpu);
        return 1;
@@ -2934,7 +3032,18 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
        tss_selector = exit_qualification;
 
-       return kvm_task_switch(vcpu, tss_selector, reason);
+       if (!kvm_task_switch(vcpu, tss_selector, reason))
+               return 0;
+
+       /* clear all local breakpoint enable flags */
+       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
+
+       /*
+        * TODO: What about debug traps on tss switch?
+        *       Are we supposed to inject them and update dr6?
+        */
+
+       return 1;
 }
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -3005,14 +3114,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
        ++vcpu->stat.nmi_window_exits;
 
-       /*
-        * If the user space waits to inject a NMI, exit as soon as possible
-        */
-       if (kvm_run->request_nmi_window && !vcpu->arch.nmi_pending) {
-               kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
-               return 0;
-       }
-
        return 1;
 }
 
@@ -3028,16 +3129,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
        while (!guest_state_valid(vcpu)) {
                err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
 
-               switch (err) {
-                       case EMULATE_DONE:
-                               break;
-                       case EMULATE_DO_MMIO:
-                               kvm_report_emulation_failure(vcpu, "mmio");
-                               /* TODO: Handle MMIO */
-                               return;
-                       default:
-                               kvm_report_emulation_failure(vcpu, "emulation failure");
-                               return;
+               if (err == EMULATE_DO_MMIO)
+                       break;
+
+               if (err != EMULATE_DONE) {
+                       kvm_report_emulation_failure(vcpu, "emulation failure");
+                       return;
                }
 
                if (signal_pending(current))
@@ -3049,8 +3146,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
        local_irq_disable();
        preempt_disable();
 
-       /* Guest state should be valid now, no more emulation should be needed */
-       vmx->emulation_required = 0;
+       /* Guest state should be valid now except if we need to
+        * emulate an MMIO */
+       if (guest_state_valid(vcpu))
+               vmx->emulation_required = 0;
 }
 
 /*
@@ -3097,6 +3196,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
                    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
 
+       /* If we need to emulate an MMIO from handle_invalid_guest_state
+        * we just return 0 */
+       if (vmx->emulation_required && emulate_invalid_guest_state)
+               return 0;
+
        /* Access CR3 don't cause VMExit in paging mode, so we need
         * to sync with guest real CR3. */
        if (vm_need_ept() && is_paging(vcpu)) {
@@ -3124,7 +3228,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        vmx->soft_vnmi_blocked = 0;
                        vcpu->arch.nmi_window_open = 1;
                } else if (vmx->vnmi_blocked_time > 1000000000LL &&
-                   (kvm_run->request_nmi_window || vcpu->arch.nmi_pending)) {
+                          vcpu->arch.nmi_pending) {
                        /*
                         * This CPU don't support us in finding the end of an
                         * NMI-blocked window if the guest runs with IRQs
@@ -3137,16 +3241,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        vmx->soft_vnmi_blocked = 0;
                        vmx->vcpu.arch.nmi_window_open = 1;
                }
-
-               /*
-                * If the user space waits to inject an NNI, exit ASAP
-                */
-               if (vcpu->arch.nmi_window_open && kvm_run->request_nmi_window
-                   && !vcpu->arch.nmi_pending) {
-                       kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
-                       ++vcpu->stat.nmi_window_exits;
-                       return 0;
-               }
        }
 
        if (exit_reason < kvm_vmx_max_exit_handlers
@@ -3219,7 +3313,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
                        vmx->vcpu.arch.nmi_injected = false;
        }
        kvm_clear_exception_queue(&vmx->vcpu);
-       if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+       if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
+                               type == INTR_TYPE_SOFT_EXCEPTION)) {
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
                        error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
                        kvm_queue_exception_e(&vmx->vcpu, vector, error);
@@ -3240,6 +3335,11 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 
        vmx_update_window_states(vcpu);
 
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+               vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+                               GUEST_INTR_STATE_STI |
+                               GUEST_INTR_STATE_MOV_SS);
+
        if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
                if (vcpu->arch.interrupt.pending) {
                        enable_nmi_window(vcpu);
@@ -3267,7 +3367,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
        }
        if (vcpu->arch.interrupt.pending) {
                vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-               kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+               if (kvm_cpu_has_interrupt(vcpu))
+                       enable_irq_window(vcpu);
        }
 }
 
@@ -3327,6 +3428,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         */
        vmcs_writel(HOST_CR0, read_cr0());
 
+       set_debugreg(vcpu->arch.dr6, 6);
+
        asm(
                /* Store host registers */
                "push %%"R"dx; push %%"R"bp;"
@@ -3421,6 +3524,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
        vcpu->arch.regs_dirty = 0;
 
+       get_debugreg(vcpu->arch.dr6, 6);
+
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        if (vmx->rmode.irq.pending)
                fixup_rmode_irq(vmx);
@@ -3551,6 +3656,11 @@ static int get_ept_level(void)
        return VMX_EPT_DEFAULT_GAW + 1;
 }
 
+static int vmx_get_mt_mask_shift(void)
+{
+       return VMX_EPT_MT_EPTE_SHIFT;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -3570,7 +3680,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .vcpu_put = vmx_vcpu_put,
 
        .set_guest_debug = set_guest_debug,
-       .guest_debug_pre = kvm_guest_debug_pre,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
        .get_segment_base = vmx_get_segment_base,
@@ -3606,6 +3715,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
        .set_tss_addr = vmx_set_tss_addr,
        .get_tdp_level = get_ept_level,
+       .get_mt_mask_shift = vmx_get_mt_mask_shift,
 };
 
 static int __init vmx_init(void)
@@ -3661,11 +3771,10 @@ static int __init vmx_init(void)
        if (vm_need_ept()) {
                bypass_guest_pf = 0;
                kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
-                       VMX_EPT_WRITABLE_MASK |
-                       VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
-                       VMX_EPT_IGMT_BIT);
+                       VMX_EPT_WRITABLE_MASK);
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-                               VMX_EPT_EXECUTABLE_MASK);
+                               VMX_EPT_EXECUTABLE_MASK,
+                               VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
                kvm_enable_tdp();
        } else
                kvm_disable_tdp();