KVM: add ioeventfd support
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Amit Shah    <amit.shah@qumranet.com>
14  *   Ben-Ami Yassour <benami@il.ibm.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20
21 #include <linux/kvm_host.h>
22 #include "irq.h"
23 #include "mmu.h"
24 #include "i8254.h"
25 #include "tss.h"
26 #include "kvm_cache_regs.h"
27 #include "x86.h"
28
29 #include <linux/clocksource.h>
30 #include <linux/interrupt.h>
31 #include <linux/kvm.h>
32 #include <linux/fs.h>
33 #include <linux/vmalloc.h>
34 #include <linux/module.h>
35 #include <linux/mman.h>
36 #include <linux/highmem.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/cpufreq.h>
40 #include <trace/events/kvm.h>
41 #undef TRACE_INCLUDE_FILE
42 #define CREATE_TRACE_POINTS
43 #include "trace.h"
44
45 #include <asm/uaccess.h>
46 #include <asm/msr.h>
47 #include <asm/desc.h>
48 #include <asm/mtrr.h>
49 #include <asm/mce.h>
50
51 #define MAX_IO_MSRS 256
52 #define CR0_RESERVED_BITS                                               \
53         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
54                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
55                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
56 #define CR4_RESERVED_BITS                                               \
57         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
58                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
59                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
60                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
61
62 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
63
64 #define KVM_MAX_MCE_BANKS 32
65 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
66
67 /* EFER defaults:
68  * - enable syscall by default because it's emulated by KVM
69  * - enable LME and LMA by default on 64-bit KVM
70  */
71 #ifdef CONFIG_X86_64
72 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
73 #else
74 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
75 #endif
76
77 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
78 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
79
80 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
81                                     struct kvm_cpuid_entry2 __user *entries);
82
83 struct kvm_x86_ops *kvm_x86_ops;
84 EXPORT_SYMBOL_GPL(kvm_x86_ops);
85
86 int ignore_msrs = 0;
87 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
88
89 struct kvm_stats_debugfs_item debugfs_entries[] = {
90         { "pf_fixed", VCPU_STAT(pf_fixed) },
91         { "pf_guest", VCPU_STAT(pf_guest) },
92         { "tlb_flush", VCPU_STAT(tlb_flush) },
93         { "invlpg", VCPU_STAT(invlpg) },
94         { "exits", VCPU_STAT(exits) },
95         { "io_exits", VCPU_STAT(io_exits) },
96         { "mmio_exits", VCPU_STAT(mmio_exits) },
97         { "signal_exits", VCPU_STAT(signal_exits) },
98         { "irq_window", VCPU_STAT(irq_window_exits) },
99         { "nmi_window", VCPU_STAT(nmi_window_exits) },
100         { "halt_exits", VCPU_STAT(halt_exits) },
101         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
102         { "hypercalls", VCPU_STAT(hypercalls) },
103         { "request_irq", VCPU_STAT(request_irq_exits) },
104         { "irq_exits", VCPU_STAT(irq_exits) },
105         { "host_state_reload", VCPU_STAT(host_state_reload) },
106         { "efer_reload", VCPU_STAT(efer_reload) },
107         { "fpu_reload", VCPU_STAT(fpu_reload) },
108         { "insn_emulation", VCPU_STAT(insn_emulation) },
109         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
110         { "irq_injections", VCPU_STAT(irq_injections) },
111         { "nmi_injections", VCPU_STAT(nmi_injections) },
112         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
113         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
114         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
115         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
116         { "mmu_flooded", VM_STAT(mmu_flooded) },
117         { "mmu_recycled", VM_STAT(mmu_recycled) },
118         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
119         { "mmu_unsync", VM_STAT(mmu_unsync) },
120         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
121         { "largepages", VM_STAT(lpages) },
122         { NULL }
123 };
124
125 unsigned long segment_base(u16 selector)
126 {
127         struct descriptor_table gdt;
128         struct desc_struct *d;
129         unsigned long table_base;
130         unsigned long v;
131
132         if (selector == 0)
133                 return 0;
134
135         asm("sgdt %0" : "=m"(gdt));
136         table_base = gdt.base;
137
138         if (selector & 4) {           /* from ldt */
139                 u16 ldt_selector;
140
141                 asm("sldt %0" : "=g"(ldt_selector));
142                 table_base = segment_base(ldt_selector);
143         }
144         d = (struct desc_struct *)(table_base + (selector & ~7));
145         v = d->base0 | ((unsigned long)d->base1 << 16) |
146                 ((unsigned long)d->base2 << 24);
147 #ifdef CONFIG_X86_64
148         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
149                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
150 #endif
151         return v;
152 }
153 EXPORT_SYMBOL_GPL(segment_base);
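/*
 * Worked example of the base reconstruction above (descriptor field values
 * chosen arbitrarily): with base0 = 0x8000, base1 = 0x12 and base2 = 0x34
 * the resulting base is 0x8000 | (0x12 << 16) | (0x34 << 24) = 0x34128000.
 * On 64-bit, system descriptors (LDT/TSS, hence the type check) carry an
 * extra base3 dword that lands in bits 63:32.
 */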
154
155 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
156 {
157         if (irqchip_in_kernel(vcpu->kvm))
158                 return vcpu->arch.apic_base;
159         else
160                 return vcpu->arch.apic_base;
161 }
162 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
163
164 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
165 {
166         /* TODO: reserve bits check */
167         if (irqchip_in_kernel(vcpu->kvm))
168                 kvm_lapic_set_base(vcpu, data);
169         else
170                 vcpu->arch.apic_base = data;
171 }
172 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
173
174 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
175 {
176         WARN_ON(vcpu->arch.exception.pending);
177         vcpu->arch.exception.pending = true;
178         vcpu->arch.exception.has_error_code = false;
179         vcpu->arch.exception.nr = nr;
180 }
181 EXPORT_SYMBOL_GPL(kvm_queue_exception);
182
183 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
184                            u32 error_code)
185 {
186         ++vcpu->stat.pf_guest;
187
188         if (vcpu->arch.exception.pending) {
189                 switch(vcpu->arch.exception.nr) {
190                 case DF_VECTOR:
191                         /* triple fault -> shutdown */
192                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
193                         return;
194                 case PF_VECTOR:
195                         vcpu->arch.exception.nr = DF_VECTOR;
196                         vcpu->arch.exception.error_code = 0;
197                         return;
198                 default:
199                         /* replace previous exception with a new one in the hope
200                            that instruction re-execution will regenerate the lost
201                            exception */
202                         vcpu->arch.exception.pending = false;
203                         break;
204                 }
205         }
206         vcpu->arch.cr2 = addr;
207         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
208 }
209
210 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
211 {
212         vcpu->arch.nmi_pending = 1;
213 }
214 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
215
216 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
217 {
218         WARN_ON(vcpu->arch.exception.pending);
219         vcpu->arch.exception.pending = true;
220         vcpu->arch.exception.has_error_code = true;
221         vcpu->arch.exception.nr = nr;
222         vcpu->arch.exception.error_code = error_code;
223 }
224 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
225
226 static void __queue_exception(struct kvm_vcpu *vcpu)
227 {
228         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
229                                      vcpu->arch.exception.has_error_code,
230                                      vcpu->arch.exception.error_code);
231 }
232
233 /*
234  * Load the PAE pdptrs.  Return true if they are all valid.
235  */
236 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
237 {
238         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
239         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
240         int i;
241         int ret;
242         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
243
244         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
245                                   offset * sizeof(u64), sizeof(pdpte));
246         if (ret < 0) {
247                 ret = 0;
248                 goto out;
249         }
250         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
251                 if (is_present_gpte(pdpte[i]) &&
252                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
253                         ret = 0;
254                         goto out;
255                 }
256         }
257         ret = 1;
258
259         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
260         __set_bit(VCPU_EXREG_PDPTR,
261                   (unsigned long *)&vcpu->arch.regs_avail);
262         __set_bit(VCPU_EXREG_PDPTR,
263                   (unsigned long *)&vcpu->arch.regs_dirty);
264 out:
265
266         return ret;
267 }
268 EXPORT_SYMBOL_GPL(load_pdptrs);
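/*
 * Note on the offset arithmetic above: in PAE mode the PDPT is 32-byte
 * aligned, so CR3 bits 11:5 select one of the 32-byte blocks within the
 * page.  As an example with an arbitrary value, cr3 = 0x12345040 yields a
 * page offset of 0x40, i.e. block 2, and offset = (0x40 >> 5) << 2 = 8
 * u64 entries; kvm_read_guest_page() converts that back to byte offset
 * 8 * sizeof(u64) = 64.
 */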
269
270 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
271 {
272         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
273         bool changed = true;
274         int r;
275
276         if (is_long_mode(vcpu) || !is_pae(vcpu))
277                 return false;
278
279         if (!test_bit(VCPU_EXREG_PDPTR,
280                       (unsigned long *)&vcpu->arch.regs_avail))
281                 return true;
282
283         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
284         if (r < 0)
285                 goto out;
286         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
287 out:
288
289         return changed;
290 }
291
292 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
293 {
294         if (cr0 & CR0_RESERVED_BITS) {
295                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
296                        cr0, vcpu->arch.cr0);
297                 kvm_inject_gp(vcpu, 0);
298                 return;
299         }
300
301         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
302                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
303                 kvm_inject_gp(vcpu, 0);
304                 return;
305         }
306
307         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
308                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
309                        "and a clear PE flag\n");
310                 kvm_inject_gp(vcpu, 0);
311                 return;
312         }
313
314         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
315 #ifdef CONFIG_X86_64
316                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
317                         int cs_db, cs_l;
318
319                         if (!is_pae(vcpu)) {
320                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
321                                        "in long mode while PAE is disabled\n");
322                                 kvm_inject_gp(vcpu, 0);
323                                 return;
324                         }
325                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
326                         if (cs_l) {
327                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
328                                        "in long mode while CS.L == 1\n");
329                                 kvm_inject_gp(vcpu, 0);
330                                 return;
331
332                         }
333                 } else
334 #endif
335                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
336                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
337                                "reserved bits\n");
338                         kvm_inject_gp(vcpu, 0);
339                         return;
340                 }
341
342         }
343
344         kvm_x86_ops->set_cr0(vcpu, cr0);
345         vcpu->arch.cr0 = cr0;
346
347         kvm_mmu_reset_context(vcpu);
348         return;
349 }
350 EXPORT_SYMBOL_GPL(kvm_set_cr0);
351
352 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
353 {
354         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
355 }
356 EXPORT_SYMBOL_GPL(kvm_lmsw);
357
358 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
359 {
360         unsigned long old_cr4 = vcpu->arch.cr4;
361         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
362
363         if (cr4 & CR4_RESERVED_BITS) {
364                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
365                 kvm_inject_gp(vcpu, 0);
366                 return;
367         }
368
369         if (is_long_mode(vcpu)) {
370                 if (!(cr4 & X86_CR4_PAE)) {
371                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
372                                "in long mode\n");
373                         kvm_inject_gp(vcpu, 0);
374                         return;
375                 }
376         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
377                    && ((cr4 ^ old_cr4) & pdptr_bits)
378                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
379                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
380                 kvm_inject_gp(vcpu, 0);
381                 return;
382         }
383
384         if (cr4 & X86_CR4_VMXE) {
385                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
386                 kvm_inject_gp(vcpu, 0);
387                 return;
388         }
389         kvm_x86_ops->set_cr4(vcpu, cr4);
390         vcpu->arch.cr4 = cr4;
391         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
392         kvm_mmu_reset_context(vcpu);
393 }
394 EXPORT_SYMBOL_GPL(kvm_set_cr4);
395
396 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
397 {
398         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
399                 kvm_mmu_sync_roots(vcpu);
400                 kvm_mmu_flush_tlb(vcpu);
401                 return;
402         }
403
404         if (is_long_mode(vcpu)) {
405                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
406                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
407                         kvm_inject_gp(vcpu, 0);
408                         return;
409                 }
410         } else {
411                 if (is_pae(vcpu)) {
412                         if (cr3 & CR3_PAE_RESERVED_BITS) {
413                                 printk(KERN_DEBUG
414                                        "set_cr3: #GP, reserved bits\n");
415                                 kvm_inject_gp(vcpu, 0);
416                                 return;
417                         }
418                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
419                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
420                                        "reserved bits\n");
421                                 kvm_inject_gp(vcpu, 0);
422                                 return;
423                         }
424                 }
425                 /*
426                  * We don't check reserved bits in nonpae mode, because
427                  * this isn't enforced, and VMware depends on this.
428                  */
429         }
430
431         /*
432          * Does the new cr3 value map to physical memory? (Note, we
433          * catch an invalid cr3 even in real-mode, because it would
434          * cause trouble later on when we turn on paging anyway.)
435          *
436          * A real CPU would silently accept an invalid cr3 and would
437          * attempt to use it - with largely undefined (and often hard
438          * to debug) behavior on the guest side.
439          */
440         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
441                 kvm_inject_gp(vcpu, 0);
442         else {
443                 vcpu->arch.cr3 = cr3;
444                 vcpu->arch.mmu.new_cr3(vcpu);
445         }
446 }
447 EXPORT_SYMBOL_GPL(kvm_set_cr3);
448
449 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
450 {
451         if (cr8 & CR8_RESERVED_BITS) {
452                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
453                 kvm_inject_gp(vcpu, 0);
454                 return;
455         }
456         if (irqchip_in_kernel(vcpu->kvm))
457                 kvm_lapic_set_tpr(vcpu, cr8);
458         else
459                 vcpu->arch.cr8 = cr8;
460 }
461 EXPORT_SYMBOL_GPL(kvm_set_cr8);
462
463 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
464 {
465         if (irqchip_in_kernel(vcpu->kvm))
466                 return kvm_lapic_get_cr8(vcpu);
467         else
468                 return vcpu->arch.cr8;
469 }
470 EXPORT_SYMBOL_GPL(kvm_get_cr8);
471
472 static inline u32 bit(int bitno)
473 {
474         return 1 << (bitno & 31);
475 }
476
477 /*
478  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
479  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
480  *
481  * This list is modified at module load time to reflect the
482  * capabilities of the host cpu.
483  */
484 static u32 msrs_to_save[] = {
485         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
486         MSR_K6_STAR,
487 #ifdef CONFIG_X86_64
488         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
489 #endif
490         MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
491         MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
492 };
493
494 static unsigned num_msrs_to_save;
495
496 static u32 emulated_msrs[] = {
497         MSR_IA32_MISC_ENABLE,
498 };
499
500 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
501 {
502         if (efer & efer_reserved_bits) {
503                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
504                        efer);
505                 kvm_inject_gp(vcpu, 0);
506                 return;
507         }
508
509         if (is_paging(vcpu)
510             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
511                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
512                 kvm_inject_gp(vcpu, 0);
513                 return;
514         }
515
516         if (efer & EFER_FFXSR) {
517                 struct kvm_cpuid_entry2 *feat;
518
519                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
520                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
521                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
522                         kvm_inject_gp(vcpu, 0);
523                         return;
524                 }
525         }
526
527         if (efer & EFER_SVME) {
528                 struct kvm_cpuid_entry2 *feat;
529
530                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
531                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
532                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
533                         kvm_inject_gp(vcpu, 0);
534                         return;
535                 }
536         }
537
538         kvm_x86_ops->set_efer(vcpu, efer);
539
540         efer &= ~EFER_LMA;
541         efer |= vcpu->arch.shadow_efer & EFER_LMA;
542
543         vcpu->arch.shadow_efer = efer;
544
545         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
546         kvm_mmu_reset_context(vcpu);
547 }
548
549 void kvm_enable_efer_bits(u64 mask)
550 {
551        efer_reserved_bits &= ~mask;
552 }
553 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
554
555
556 /*
557  * Writes msr value into the appropriate "register".
558  * Returns 0 on success, non-0 otherwise.
559  * Assumes vcpu_load() was already called.
560  */
561 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
562 {
563         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
564 }
565
566 /*
567  * Adapt set_msr() to msr_io()'s calling convention
568  */
569 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
570 {
571         return kvm_set_msr(vcpu, index, *data);
572 }
573
574 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
575 {
576         static int version;
577         struct pvclock_wall_clock wc;
578         struct timespec now, sys, boot;
579
580         if (!wall_clock)
581                 return;
582
583         version++;
584
585         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
586
587         /*
588          * The guest calculates current wall clock time by adding
589          * system time (updated by kvm_write_guest_time below) to the
590          * wall clock specified here.  guest system time equals host
591          * system time for us, thus we must fill in host boot time here.
592          */
593         now = current_kernel_time();
594         ktime_get_ts(&sys);
595         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
596
597         wc.sec = boot.tv_sec;
598         wc.nsec = boot.tv_nsec;
599         wc.version = version;
600
601         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
602
603         version++;
604         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
605 }
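/*
 * Example of the boot-time computation above (arbitrary numbers): if the
 * host wall clock reads 1,000,000 s while its monotonic clock reads 500 s,
 * wc holds 999,500 s, and a guest recovers the current wall time as
 * wc + its kvmclock system time.  The surrounding version writes keep the
 * counter odd only while wc itself is being rewritten, so a reader that
 * sees an even, unchanged version knows it got a consistent snapshot.
 */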
606
607 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
608 {
609         uint32_t quotient, remainder;
610
611         /* Don't try to replace with do_div(); this one calculates
612          * "(dividend << 32) / divisor" */
613         __asm__ ( "divl %4"
614                   : "=a" (quotient), "=d" (remainder)
615                   : "0" (0), "1" (dividend), "r" (divisor) );
616         return quotient;
617 }
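/*
 * div_frac() yields dividend/divisor as a 0.32 fixed-point fraction.  For
 * example, div_frac(1000000000, 2000000000) computes
 * (10^9 << 32) / (2 * 10^9) = 2^31 = 0x80000000, i.e. one half.
 */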
618
619 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
620 {
621         uint64_t nsecs = 1000000000LL;
622         int32_t  shift = 0;
623         uint64_t tps64;
624         uint32_t tps32;
625
626         tps64 = tsc_khz * 1000LL;
627         while (tps64 > nsecs*2) {
628                 tps64 >>= 1;
629                 shift--;
630         }
631
632         tps32 = (uint32_t)tps64;
633         while (tps32 <= (uint32_t)nsecs) {
634                 tps32 <<= 1;
635                 shift++;
636         }
637
638         hv_clock->tsc_shift = shift;
639         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
640
641         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
642                  __func__, tsc_khz, hv_clock->tsc_shift,
643                  hv_clock->tsc_to_system_mul);
644 }
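/*
 * A minimal sketch of how a guest consumes the two fields computed above,
 * assuming the usual pvclock scaling rule: for tsc_khz = 2000000 (a 2 GHz
 * TSC) the loops leave tsc_shift = 0 and
 * tsc_to_system_mul = div_frac(10^9, 2 * 10^9) = 0x80000000, so
 *
 *	delta = tsc_now - tsc_timestamp;
 *	delta = tsc_shift >= 0 ? delta << tsc_shift : delta >> -tsc_shift;
 *	ns    = (delta * tsc_to_system_mul) >> 32;   (= delta / 2 here)
 *
 * i.e. two TSC cycles account for one nanosecond, as expected for 2 GHz.
 */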
645
646 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
647
648 static void kvm_write_guest_time(struct kvm_vcpu *v)
649 {
650         struct timespec ts;
651         unsigned long flags;
652         struct kvm_vcpu_arch *vcpu = &v->arch;
653         void *shared_kaddr;
654         unsigned long this_tsc_khz;
655
656         if ((!vcpu->time_page))
657                 return;
658
659         this_tsc_khz = get_cpu_var(cpu_tsc_khz);
660         if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
661                 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
662                 vcpu->hv_clock_tsc_khz = this_tsc_khz;
663         }
664         put_cpu_var(cpu_tsc_khz);
665
666         /* Keep irq disabled to prevent changes to the clock */
667         local_irq_save(flags);
668         kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
669         ktime_get_ts(&ts);
670         local_irq_restore(flags);
671
672         /* With all the info we got, fill in the values */
673
674         vcpu->hv_clock.system_time = ts.tv_nsec +
675                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
676         /*
677          * The interface expects us to write an even number signaling that the
678          * update is finished. Since the guest won't see the intermediate
679          * state, we just increase by 2 at the end.
680          */
681         vcpu->hv_clock.version += 2;
682
683         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
684
685         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
686                sizeof(vcpu->hv_clock));
687
688         kunmap_atomic(shared_kaddr, KM_USER0);
689
690         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
691 }
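/*
 * The version handling above pairs with a seqcount-style reader in the
 * guest.  A minimal sketch of that reader, assuming the standard pvclock
 * field names (generic scale()/rdtsc() stand in for the real helpers):
 *
 *	do {
 *		version = hv_clock->version;
 *		rmb();
 *		ns = scale(rdtsc() - hv_clock->tsc_timestamp) +
 *		     hv_clock->system_time;
 *		rmb();
 *	} while ((hv_clock->version & 1) || version != hv_clock->version);
 *
 * An odd or changed version means the host was mid-update and the read is
 * retried; here the whole struct is copied in one memcpy, so the guest
 * only ever observes even values.
 */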
692
693 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
694 {
695         struct kvm_vcpu_arch *vcpu = &v->arch;
696
697         if (!vcpu->time_page)
698                 return 0;
699         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
700         return 1;
701 }
702
703 static bool msr_mtrr_valid(unsigned msr)
704 {
705         switch (msr) {
706         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
707         case MSR_MTRRfix64K_00000:
708         case MSR_MTRRfix16K_80000:
709         case MSR_MTRRfix16K_A0000:
710         case MSR_MTRRfix4K_C0000:
711         case MSR_MTRRfix4K_C8000:
712         case MSR_MTRRfix4K_D0000:
713         case MSR_MTRRfix4K_D8000:
714         case MSR_MTRRfix4K_E0000:
715         case MSR_MTRRfix4K_E8000:
716         case MSR_MTRRfix4K_F0000:
717         case MSR_MTRRfix4K_F8000:
718         case MSR_MTRRdefType:
719         case MSR_IA32_CR_PAT:
720                 return true;
721         case 0x2f8:
722                 return true;
723         }
724         return false;
725 }
726
727 static bool valid_pat_type(unsigned t)
728 {
729         return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
730 }
731
732 static bool valid_mtrr_type(unsigned t)
733 {
734         return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
735 }
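/*
 * The masks above are just set-membership tests: (1 << t) & 0xf3 is
 * non-zero exactly for t in {0, 1, 4, 5, 6, 7} (0xf3 = 11110011b), and
 * 0x73 = 01110011b likewise selects the valid MTRR types {0, 1, 4, 5, 6}.
 */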
736
737 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
738 {
739         int i;
740
741         if (!msr_mtrr_valid(msr))
742                 return false;
743
744         if (msr == MSR_IA32_CR_PAT) {
745                 for (i = 0; i < 8; i++)
746                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
747                                 return false;
748                 return true;
749         } else if (msr == MSR_MTRRdefType) {
750                 if (data & ~0xcff)
751                         return false;
752                 return valid_mtrr_type(data & 0xff);
753         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
754                 for (i = 0; i < 8 ; i++)
755                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
756                                 return false;
757                 return true;
758         }
759
760         /* variable MTRRs */
761         return valid_mtrr_type(data & 0xff);
762 }
763
764 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
765 {
766         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
767
768         if (!mtrr_valid(vcpu, msr, data))
769                 return 1;
770
771         if (msr == MSR_MTRRdefType) {
772                 vcpu->arch.mtrr_state.def_type = data;
773                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
774         } else if (msr == MSR_MTRRfix64K_00000)
775                 p[0] = data;
776         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
777                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
778         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
779                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
780         else if (msr == MSR_IA32_CR_PAT)
781                 vcpu->arch.pat = data;
782         else {  /* Variable MTRRs */
783                 int idx, is_mtrr_mask;
784                 u64 *pt;
785
786                 idx = (msr - 0x200) / 2;
787                 is_mtrr_mask = msr - 0x200 - 2 * idx;
788                 if (!is_mtrr_mask)
789                         pt =
790                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
791                 else
792                         pt =
793                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
794                 *pt = data;
795         }
796
797         kvm_mmu_reset_context(vcpu);
798         return 0;
799 }
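/*
 * Variable-range MTRRs come in base/mask MSR pairs starting at 0x200
 * (MTRRphysBase0, MTRRphysMask0, MTRRphysBase1, ...).  Worked example:
 * msr = 0x205 gives idx = (0x205 - 0x200) / 2 = 2 and is_mtrr_mask = 1,
 * i.e. the mask register of variable range 2.
 */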
800
801 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
802 {
803         u64 mcg_cap = vcpu->arch.mcg_cap;
804         unsigned bank_num = mcg_cap & 0xff;
805
806         switch (msr) {
807         case MSR_IA32_MCG_STATUS:
808                 vcpu->arch.mcg_status = data;
809                 break;
810         case MSR_IA32_MCG_CTL:
811                 if (!(mcg_cap & MCG_CTL_P))
812                         return 1;
813                 if (data != 0 && data != ~(u64)0)
814                         return -1;
815                 vcpu->arch.mcg_ctl = data;
816                 break;
817         default:
818                 if (msr >= MSR_IA32_MC0_CTL &&
819                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
820                         u32 offset = msr - MSR_IA32_MC0_CTL;
821                         /* only 0 or all 1s can be written to IA32_MCi_CTL */
822                         if ((offset & 0x3) == 0 &&
823                             data != 0 && data != ~(u64)0)
824                                 return -1;
825                         vcpu->arch.mce_banks[offset] = data;
826                         break;
827                 }
828                 return 1;
829         }
830         return 0;
831 }
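/*
 * Each MCE bank spans four consecutive MSRs (CTL, STATUS, ADDR, MISC), so
 * offset = msr - MSR_IA32_MC0_CTL indexes mce_banks[] directly and
 * (offset & 0x3) == 0 picks out a bank's CTL register.  For example,
 * IA32_MC1_STATUS has offset 5 and accepts any value, whereas IA32_MC1_CTL
 * (offset 4) only accepts 0 or all ones.
 */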
832
833 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
834 {
835         switch (msr) {
836         case MSR_EFER:
837                 set_efer(vcpu, data);
838                 break;
839         case MSR_K7_HWCR:
840                 data &= ~(u64)0x40;     /* ignore flush filter disable */
841                 if (data != 0) {
842                         pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
843                                 data);
844                         return 1;
845                 }
846                 break;
847         case MSR_FAM10H_MMIO_CONF_BASE:
848                 if (data != 0) {
849                         pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
850                                 "0x%llx\n", data);
851                         return 1;
852                 }
853                 break;
854         case MSR_AMD64_NB_CFG:
855                 break;
856         case MSR_IA32_DEBUGCTLMSR:
857                 if (!data) {
858                         /* We support the non-activated case already */
859                         break;
860                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
861                         /* Values other than LBR and BTF are vendor-specific,
862                            thus reserved and should throw a #GP */
863                         return 1;
864                 }
865                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
866                         __func__, data);
867                 break;
868         case MSR_IA32_UCODE_REV:
869         case MSR_IA32_UCODE_WRITE:
870         case MSR_VM_HSAVE_PA:
871         case MSR_AMD64_PATCH_LOADER:
872                 break;
873         case 0x200 ... 0x2ff:
874                 return set_msr_mtrr(vcpu, msr, data);
875         case MSR_IA32_APICBASE:
876                 kvm_set_apic_base(vcpu, data);
877                 break;
878         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
879                 return kvm_x2apic_msr_write(vcpu, msr, data);
880         case MSR_IA32_MISC_ENABLE:
881                 vcpu->arch.ia32_misc_enable_msr = data;
882                 break;
883         case MSR_KVM_WALL_CLOCK:
884                 vcpu->kvm->arch.wall_clock = data;
885                 kvm_write_wall_clock(vcpu->kvm, data);
886                 break;
887         case MSR_KVM_SYSTEM_TIME: {
888                 if (vcpu->arch.time_page) {
889                         kvm_release_page_dirty(vcpu->arch.time_page);
890                         vcpu->arch.time_page = NULL;
891                 }
892
893                 vcpu->arch.time = data;
894
895                 /* we verify if the enable bit is set... */
896                 if (!(data & 1))
897                         break;
898
899                 /* ...but clean it before doing the actual write */
900                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
901
902                 vcpu->arch.time_page =
903                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
904
905                 if (is_error_page(vcpu->arch.time_page)) {
906                         kvm_release_page_clean(vcpu->arch.time_page);
907                         vcpu->arch.time_page = NULL;
908                 }
909
910                 kvm_request_guest_time_update(vcpu);
911                 break;
912         }
913         case MSR_IA32_MCG_CTL:
914         case MSR_IA32_MCG_STATUS:
915         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
916                 return set_msr_mce(vcpu, msr, data);
917
918         /* Performance counters are not protected by a CPUID bit,
919          * so we should check all of them in the generic path for the sake of
920          * cross vendor migration.
921          * Writing a zero into the event select MSRs disables them,
922  * which we perfectly emulate ;-). Any other value should at least be
923  * reported; some guests depend on them.
924          */
925         case MSR_P6_EVNTSEL0:
926         case MSR_P6_EVNTSEL1:
927         case MSR_K7_EVNTSEL0:
928         case MSR_K7_EVNTSEL1:
929         case MSR_K7_EVNTSEL2:
930         case MSR_K7_EVNTSEL3:
931                 if (data != 0)
932                         pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
933                                 "0x%x data 0x%llx\n", msr, data);
934                 break;
935         /* at least RHEL 4 unconditionally writes to the perfctr registers,
936          * so we ignore writes to make it happy.
937          */
938         case MSR_P6_PERFCTR0:
939         case MSR_P6_PERFCTR1:
940         case MSR_K7_PERFCTR0:
941         case MSR_K7_PERFCTR1:
942         case MSR_K7_PERFCTR2:
943         case MSR_K7_PERFCTR3:
944                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
945                         "0x%x data 0x%llx\n", msr, data);
946                 break;
947         default:
948                 if (!ignore_msrs) {
949                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
950                                 msr, data);
951                         return 1;
952                 } else {
953                         pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
954                                 msr, data);
955                         break;
956                 }
957         }
958         return 0;
959 }
960 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
961
962
963 /*
964  * Reads an msr value (of 'msr_index') into 'pdata'.
965  * Returns 0 on success, non-0 otherwise.
966  * Assumes vcpu_load() was already called.
967  */
968 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
969 {
970         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
971 }
972
973 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
974 {
975         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
976
977         if (!msr_mtrr_valid(msr))
978                 return 1;
979
980         if (msr == MSR_MTRRdefType)
981                 *pdata = vcpu->arch.mtrr_state.def_type +
982                          (vcpu->arch.mtrr_state.enabled << 10);
983         else if (msr == MSR_MTRRfix64K_00000)
984                 *pdata = p[0];
985         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
986                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
987         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
988                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
989         else if (msr == MSR_IA32_CR_PAT)
990                 *pdata = vcpu->arch.pat;
991         else {  /* Variable MTRRs */
992                 int idx, is_mtrr_mask;
993                 u64 *pt;
994
995                 idx = (msr - 0x200) / 2;
996                 is_mtrr_mask = msr - 0x200 - 2 * idx;
997                 if (!is_mtrr_mask)
998                         pt =
999                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1000                 else
1001                         pt =
1002                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1003                 *pdata = *pt;
1004         }
1005
1006         return 0;
1007 }
1008
1009 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1010 {
1011         u64 data;
1012         u64 mcg_cap = vcpu->arch.mcg_cap;
1013         unsigned bank_num = mcg_cap & 0xff;
1014
1015         switch (msr) {
1016         case MSR_IA32_P5_MC_ADDR:
1017         case MSR_IA32_P5_MC_TYPE:
1018                 data = 0;
1019                 break;
1020         case MSR_IA32_MCG_CAP:
1021                 data = vcpu->arch.mcg_cap;
1022                 break;
1023         case MSR_IA32_MCG_CTL:
1024                 if (!(mcg_cap & MCG_CTL_P))
1025                         return 1;
1026                 data = vcpu->arch.mcg_ctl;
1027                 break;
1028         case MSR_IA32_MCG_STATUS:
1029                 data = vcpu->arch.mcg_status;
1030                 break;
1031         default:
1032                 if (msr >= MSR_IA32_MC0_CTL &&
1033                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1034                         u32 offset = msr - MSR_IA32_MC0_CTL;
1035                         data = vcpu->arch.mce_banks[offset];
1036                         break;
1037                 }
1038                 return 1;
1039         }
1040         *pdata = data;
1041         return 0;
1042 }
1043
1044 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1045 {
1046         u64 data;
1047
1048         switch (msr) {
1049         case MSR_IA32_PLATFORM_ID:
1050         case MSR_IA32_UCODE_REV:
1051         case MSR_IA32_EBL_CR_POWERON:
1052         case MSR_IA32_DEBUGCTLMSR:
1053         case MSR_IA32_LASTBRANCHFROMIP:
1054         case MSR_IA32_LASTBRANCHTOIP:
1055         case MSR_IA32_LASTINTFROMIP:
1056         case MSR_IA32_LASTINTTOIP:
1057         case MSR_K8_SYSCFG:
1058         case MSR_K7_HWCR:
1059         case MSR_VM_HSAVE_PA:
1060         case MSR_P6_EVNTSEL0:
1061         case MSR_P6_EVNTSEL1:
1062         case MSR_K7_EVNTSEL0:
1063         case MSR_K8_INT_PENDING_MSG:
1064         case MSR_AMD64_NB_CFG:
1065         case MSR_FAM10H_MMIO_CONF_BASE:
1066                 data = 0;
1067                 break;
1068         case MSR_MTRRcap:
1069                 data = 0x500 | KVM_NR_VAR_MTRR;
1070                 break;
1071         case 0x200 ... 0x2ff:
1072                 return get_msr_mtrr(vcpu, msr, pdata);
1073         case 0xcd: /* fsb frequency */
1074                 data = 3;
1075                 break;
1076         case MSR_IA32_APICBASE:
1077                 data = kvm_get_apic_base(vcpu);
1078                 break;
1079         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1080                 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1081                 break;
1082         case MSR_IA32_MISC_ENABLE:
1083                 data = vcpu->arch.ia32_misc_enable_msr;
1084                 break;
1085         case MSR_IA32_PERF_STATUS:
1086                 /* TSC increment by tick */
1087                 data = 1000ULL;
1088                 /* CPU multiplier */
1089                 data |= (((uint64_t)4ULL) << 40);
1090                 break;
1091         case MSR_EFER:
1092                 data = vcpu->arch.shadow_efer;
1093                 break;
1094         case MSR_KVM_WALL_CLOCK:
1095                 data = vcpu->kvm->arch.wall_clock;
1096                 break;
1097         case MSR_KVM_SYSTEM_TIME:
1098                 data = vcpu->arch.time;
1099                 break;
1100         case MSR_IA32_P5_MC_ADDR:
1101         case MSR_IA32_P5_MC_TYPE:
1102         case MSR_IA32_MCG_CAP:
1103         case MSR_IA32_MCG_CTL:
1104         case MSR_IA32_MCG_STATUS:
1105         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1106                 return get_msr_mce(vcpu, msr, pdata);
1107         default:
1108                 if (!ignore_msrs) {
1109                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1110                         return 1;
1111                 } else {
1112                         pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1113                         data = 0;
1114                 }
1115                 break;
1116         }
1117         *pdata = data;
1118         return 0;
1119 }
1120 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1121
1122 /*
1123  * Read or write a bunch of msrs. All parameters are kernel addresses.
1124  *
1125  * @return number of msrs set successfully.
1126  */
1127 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1128                     struct kvm_msr_entry *entries,
1129                     int (*do_msr)(struct kvm_vcpu *vcpu,
1130                                   unsigned index, u64 *data))
1131 {
1132         int i;
1133
1134         vcpu_load(vcpu);
1135
1136         down_read(&vcpu->kvm->slots_lock);
1137         for (i = 0; i < msrs->nmsrs; ++i)
1138                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1139                         break;
1140         up_read(&vcpu->kvm->slots_lock);
1141
1142         vcpu_put(vcpu);
1143
1144         return i;
1145 }
1146
1147 /*
1148  * Read or write a bunch of msrs. Parameters are user addresses.
1149  *
1150  * @return number of msrs set successfully.
1151  */
1152 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1153                   int (*do_msr)(struct kvm_vcpu *vcpu,
1154                                 unsigned index, u64 *data),
1155                   int writeback)
1156 {
1157         struct kvm_msrs msrs;
1158         struct kvm_msr_entry *entries;
1159         int r, n;
1160         unsigned size;
1161
1162         r = -EFAULT;
1163         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1164                 goto out;
1165
1166         r = -E2BIG;
1167         if (msrs.nmsrs >= MAX_IO_MSRS)
1168                 goto out;
1169
1170         r = -ENOMEM;
1171         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1172         entries = vmalloc(size);
1173         if (!entries)
1174                 goto out;
1175
1176         r = -EFAULT;
1177         if (copy_from_user(entries, user_msrs->entries, size))
1178                 goto out_free;
1179
1180         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1181         if (r < 0)
1182                 goto out_free;
1183
1184         r = -EFAULT;
1185         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1186                 goto out_free;
1187
1188         r = n;
1189
1190 out_free:
1191         vfree(entries);
1192 out:
1193         return r;
1194 }
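/*
 * msr_io() backs the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls.  A minimal
 * userspace sketch of reading one MSR through this path (error handling
 * omitted, EFER index assumed from asm/msr-index.h):
 *
 *	struct { struct kvm_msrs hdr; struct kvm_msr_entry e[1]; } req = {
 *		.hdr.nmsrs  = 1,
 *		.e[0].index = MSR_EFER,
 *	};
 *	ret = ioctl(vcpu_fd, KVM_GET_MSRS, &req);  (returns # of MSRs read)
 *	value = req.e[0].data;
 */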
1195
1196 int kvm_dev_ioctl_check_extension(long ext)
1197 {
1198         int r;
1199
1200         switch (ext) {
1201         case KVM_CAP_IRQCHIP:
1202         case KVM_CAP_HLT:
1203         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1204         case KVM_CAP_SET_TSS_ADDR:
1205         case KVM_CAP_EXT_CPUID:
1206         case KVM_CAP_CLOCKSOURCE:
1207         case KVM_CAP_PIT:
1208         case KVM_CAP_NOP_IO_DELAY:
1209         case KVM_CAP_MP_STATE:
1210         case KVM_CAP_SYNC_MMU:
1211         case KVM_CAP_REINJECT_CONTROL:
1212         case KVM_CAP_IRQ_INJECT_STATUS:
1213         case KVM_CAP_ASSIGN_DEV_IRQ:
1214         case KVM_CAP_IRQFD:
1215         case KVM_CAP_IOEVENTFD:
1216         case KVM_CAP_PIT2:
1217         case KVM_CAP_PIT_STATE2:
1218                 r = 1;
1219                 break;
1220         case KVM_CAP_COALESCED_MMIO:
1221                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1222                 break;
1223         case KVM_CAP_VAPIC:
1224                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1225                 break;
1226         case KVM_CAP_NR_VCPUS:
1227                 r = KVM_MAX_VCPUS;
1228                 break;
1229         case KVM_CAP_NR_MEMSLOTS:
1230                 r = KVM_MEMORY_SLOTS;
1231                 break;
1232         case KVM_CAP_PV_MMU:
1233                 r = !tdp_enabled;
1234                 break;
1235         case KVM_CAP_IOMMU:
1236                 r = iommu_found();
1237                 break;
1238         case KVM_CAP_MCE:
1239                 r = KVM_MAX_MCE_BANKS;
1240                 break;
1241         default:
1242                 r = 0;
1243                 break;
1244         }
1245         return r;
1246
1247 }
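/*
 * Userspace probes the capabilities advertised above, including
 * KVM_CAP_IOEVENTFD for the ioeventfd support named in the commit subject,
 * via the KVM_CHECK_EXTENSION ioctl on /dev/kvm.  A minimal sketch:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IOEVENTFD) > 0)
 *		have_ioeventfd = true;
 */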
1248
1249 long kvm_arch_dev_ioctl(struct file *filp,
1250                         unsigned int ioctl, unsigned long arg)
1251 {
1252         void __user *argp = (void __user *)arg;
1253         long r;
1254
1255         switch (ioctl) {
1256         case KVM_GET_MSR_INDEX_LIST: {
1257                 struct kvm_msr_list __user *user_msr_list = argp;
1258                 struct kvm_msr_list msr_list;
1259                 unsigned n;
1260
1261                 r = -EFAULT;
1262                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1263                         goto out;
1264                 n = msr_list.nmsrs;
1265                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1266                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1267                         goto out;
1268                 r = -E2BIG;
1269                 if (n < msr_list.nmsrs)
1270                         goto out;
1271                 r = -EFAULT;
1272                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1273                                  num_msrs_to_save * sizeof(u32)))
1274                         goto out;
1275                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1276                                  &emulated_msrs,
1277                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1278                         goto out;
1279                 r = 0;
1280                 break;
1281         }
1282         case KVM_GET_SUPPORTED_CPUID: {
1283                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1284                 struct kvm_cpuid2 cpuid;
1285
1286                 r = -EFAULT;
1287                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1288                         goto out;
1289                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1290                                                       cpuid_arg->entries);
1291                 if (r)
1292                         goto out;
1293
1294                 r = -EFAULT;
1295                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1296                         goto out;
1297                 r = 0;
1298                 break;
1299         }
1300         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1301                 u64 mce_cap;
1302
1303                 mce_cap = KVM_MCE_CAP_SUPPORTED;
1304                 r = -EFAULT;
1305                 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1306                         goto out;
1307                 r = 0;
1308                 break;
1309         }
1310         default:
1311                 r = -EINVAL;
1312         }
1313 out:
1314         return r;
1315 }
1316
1317 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1318 {
1319         kvm_x86_ops->vcpu_load(vcpu, cpu);
1320         kvm_request_guest_time_update(vcpu);
1321 }
1322
1323 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1324 {
1325         kvm_x86_ops->vcpu_put(vcpu);
1326         kvm_put_guest_fpu(vcpu);
1327 }
1328
1329 static int is_efer_nx(void)
1330 {
1331         unsigned long long efer = 0;
1332
1333         rdmsrl_safe(MSR_EFER, &efer);
1334         return efer & EFER_NX;
1335 }
1336
1337 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1338 {
1339         int i;
1340         struct kvm_cpuid_entry2 *e, *entry;
1341
1342         entry = NULL;
1343         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1344                 e = &vcpu->arch.cpuid_entries[i];
1345                 if (e->function == 0x80000001) {
1346                         entry = e;
1347                         break;
1348                 }
1349         }
1350         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1351                 entry->edx &= ~(1 << 20);
1352                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1353         }
1354 }
1355
1356 /* used when an old userspace passes the legacy cpuid format to a newer kernel */
1357 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1358                                     struct kvm_cpuid *cpuid,
1359                                     struct kvm_cpuid_entry __user *entries)
1360 {
1361         int r, i;
1362         struct kvm_cpuid_entry *cpuid_entries;
1363
1364         r = -E2BIG;
1365         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1366                 goto out;
1367         r = -ENOMEM;
1368         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1369         if (!cpuid_entries)
1370                 goto out;
1371         r = -EFAULT;
1372         if (copy_from_user(cpuid_entries, entries,
1373                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1374                 goto out_free;
1375         for (i = 0; i < cpuid->nent; i++) {
1376                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1377                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1378                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1379                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1380                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1381                 vcpu->arch.cpuid_entries[i].index = 0;
1382                 vcpu->arch.cpuid_entries[i].flags = 0;
1383                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1384                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1385                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1386         }
1387         vcpu->arch.cpuid_nent = cpuid->nent;
1388         cpuid_fix_nx_cap(vcpu);
1389         r = 0;
1390         kvm_apic_set_version(vcpu);
1391
1392 out_free:
1393         vfree(cpuid_entries);
1394 out:
1395         return r;
1396 }
1397
1398 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1399                                      struct kvm_cpuid2 *cpuid,
1400                                      struct kvm_cpuid_entry2 __user *entries)
1401 {
1402         int r;
1403
1404         r = -E2BIG;
1405         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1406                 goto out;
1407         r = -EFAULT;
1408         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1409                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1410                 goto out;
1411         vcpu->arch.cpuid_nent = cpuid->nent;
1412         kvm_apic_set_version(vcpu);
1413         return 0;
1414
1415 out:
1416         return r;
1417 }
1418
1419 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1420                                      struct kvm_cpuid2 *cpuid,
1421                                      struct kvm_cpuid_entry2 __user *entries)
1422 {
1423         int r;
1424
1425         r = -E2BIG;
1426         if (cpuid->nent < vcpu->arch.cpuid_nent)
1427                 goto out;
1428         r = -EFAULT;
1429         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1430                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1431                 goto out;
1432         return 0;
1433
1434 out:
1435         cpuid->nent = vcpu->arch.cpuid_nent;
1436         return r;
1437 }
1438
1439 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1440                            u32 index)
1441 {
1442         entry->function = function;
1443         entry->index = index;
1444         cpuid_count(entry->function, entry->index,
1445                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1446         entry->flags = 0;
1447 }
1448
1449 #define F(x) bit(X86_FEATURE_##x)
1450
1451 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1452                          u32 index, int *nent, int maxnent)
1453 {
1454         unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1455 #ifdef CONFIG_X86_64
1456         unsigned f_lm = F(LM);
1457 #else
1458         unsigned f_lm = 0;
1459 #endif
1460
1461         /* cpuid 1.edx */
1462         const u32 kvm_supported_word0_x86_features =
1463                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1464                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1465                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1466                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1467                 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1468                 0 /* Reserved, DS, ACPI */ | F(MMX) |
1469                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1470                 0 /* HTT, TM, Reserved, PBE */;
1471         /* cpuid 0x80000001.edx */
1472         const u32 kvm_supported_word1_x86_features =
1473                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1474                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1475                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1476                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1477                 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1478                 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1479                 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
1480                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1481         /* cpuid 1.ecx */
1482         const u32 kvm_supported_word4_x86_features =
1483                 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1484                 0 /* DS-CPL, VMX, SMX, EST */ |
1485                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1486                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1487                 0 /* Reserved, DCA */ | F(XMM4_1) |
1488                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1489                 0 /* Reserved, XSAVE, OSXSAVE */;
1490         /* cpuid 0x80000001.ecx */
1491         const u32 kvm_supported_word6_x86_features =
1492                 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1493                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1494                 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1495                 0 /* SKINIT */ | 0 /* WDT */;
1496
1497         /* all calls to cpuid_count() should be made on the same cpu */
1498         get_cpu();
1499         do_cpuid_1_ent(entry, function, index);
1500         ++*nent;
1501
1502         switch (function) {
1503         case 0:
1504                 entry->eax = min(entry->eax, (u32)0xb);
1505                 break;
1506         case 1:
1507                 entry->edx &= kvm_supported_word0_x86_features;
1508                 entry->ecx &= kvm_supported_word4_x86_features;
1509                 /* we support x2apic emulation even if host does not support
1510                  * it since we emulate x2apic in software */
1511                 entry->ecx |= F(X2APIC);
1512                 break;
1513         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1514          * may return different values. This forces us to get_cpu() before
1515          * issuing the first command, and also to emulate this annoying behavior
1516          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1517         case 2: {
1518                 int t, times = entry->eax & 0xff;
1519
1520                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1521                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1522                 for (t = 1; t < times && *nent < maxnent; ++t) {
1523                         do_cpuid_1_ent(&entry[t], function, 0);
1524                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1525                         ++*nent;
1526                 }
1527                 break;
1528         }
1529         /* function 4 and 0xb have additional index. */
1530         case 4: {
1531                 int i, cache_type;
1532
1533                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1534                 /* read more entries until cache_type is zero */
1535                 for (i = 1; *nent < maxnent; ++i) {
1536                         cache_type = entry[i - 1].eax & 0x1f;
1537                         if (!cache_type)
1538                                 break;
1539                         do_cpuid_1_ent(&entry[i], function, i);
1540                         entry[i].flags |=
1541                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1542                         ++*nent;
1543                 }
1544                 break;
1545         }
1546         case 0xb: {
1547                 int i, level_type;
1548
1549                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1550                 /* read more entries until level_type is zero */
1551                 for (i = 1; *nent < maxnent; ++i) {
1552                         level_type = entry[i - 1].ecx & 0xff00;
1553                         if (!level_type)
1554                                 break;
1555                         do_cpuid_1_ent(&entry[i], function, i);
1556                         entry[i].flags |=
1557                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1558                         ++*nent;
1559                 }
1560                 break;
1561         }
1562         case 0x80000000:
1563                 entry->eax = min(entry->eax, 0x8000001a);
1564                 break;
1565         case 0x80000001:
1566                 entry->edx &= kvm_supported_word1_x86_features;
1567                 entry->ecx &= kvm_supported_word6_x86_features;
1568                 break;
1569         }
1570         put_cpu();
1571 }
1572
1573 #undef F
1574
1575 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1576                                      struct kvm_cpuid_entry2 __user *entries)
1577 {
1578         struct kvm_cpuid_entry2 *cpuid_entries;
1579         int limit, nent = 0, r = -E2BIG;
1580         u32 func;
1581
1582         if (cpuid->nent < 1)
1583                 goto out;
1584         r = -ENOMEM;
1585         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1586         if (!cpuid_entries)
1587                 goto out;
1588
1589         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1590         limit = cpuid_entries[0].eax;
1591         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1592                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1593                              &nent, cpuid->nent);
1594         r = -E2BIG;
1595         if (nent >= cpuid->nent)
1596                 goto out_free;
1597
1598         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1599         limit = cpuid_entries[nent - 1].eax;
1600         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1601                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1602                              &nent, cpuid->nent);
1603         r = -E2BIG;
1604         if (nent >= cpuid->nent)
1605                 goto out_free;
1606
1607         r = -EFAULT;
1608         if (copy_to_user(entries, cpuid_entries,
1609                          nent * sizeof(struct kvm_cpuid_entry2)))
1610                 goto out_free;
1611         cpuid->nent = nent;
1612         r = 0;
1613
1614 out_free:
1615         vfree(cpuid_entries);
1616 out:
1617         return r;
1618 }
1619
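/*
 * Illustrative only, not part of the kernel build: the handler above backs
 * the KVM_GET_SUPPORTED_CPUID ioctl on /dev/kvm.  Userspace passes a
 * struct kvm_cpuid2 whose ->nent gives the size of the trailing entries
 * array and retries with a larger array on -E2BIG.  A minimal sketch of
 * that pattern (the helper name and the starting size are arbitrary):
 *
 *	#include <errno.h>
 *	#include <stdlib.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *
 *	static struct kvm_cpuid2 *get_supported_cpuid(int kvm_fd)
 *	{
 *		int nent = 64;
 *		struct kvm_cpuid2 *cpuid;
 *
 *		for (;;) {
 *			cpuid = calloc(1, sizeof(*cpuid) +
 *				       nent * sizeof(struct kvm_cpuid_entry2));
 *			if (!cpuid)
 *				return NULL;
 *			cpuid->nent = nent;
 *			if (!ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid))
 *				return cpuid;	/* ->nent now holds the count */
 *			free(cpuid);
 *			if (errno != E2BIG)
 *				return NULL;
 *			nent *= 2;	/* array too small, grow and retry */
 *		}
 *	}
 */
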
1620 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1621                                     struct kvm_lapic_state *s)
1622 {
1623         vcpu_load(vcpu);
1624         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1625         vcpu_put(vcpu);
1626
1627         return 0;
1628 }
1629
1630 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1631                                     struct kvm_lapic_state *s)
1632 {
1633         vcpu_load(vcpu);
1634         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1635         kvm_apic_post_state_restore(vcpu);
1636         vcpu_put(vcpu);
1637
1638         return 0;
1639 }
1640
1641 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1642                                     struct kvm_interrupt *irq)
1643 {
1644         if (irq->irq < 0 || irq->irq >= 256)
1645                 return -EINVAL;
1646         if (irqchip_in_kernel(vcpu->kvm))
1647                 return -ENXIO;
1648         vcpu_load(vcpu);
1649
1650         kvm_queue_interrupt(vcpu, irq->irq, false);
1651
1652         vcpu_put(vcpu);
1653
1654         return 0;
1655 }
1656
1657 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1658 {
1659         vcpu_load(vcpu);
1660         kvm_inject_nmi(vcpu);
1661         vcpu_put(vcpu);
1662
1663         return 0;
1664 }
1665
1666 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1667                                            struct kvm_tpr_access_ctl *tac)
1668 {
1669         if (tac->flags)
1670                 return -EINVAL;
1671         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1672         return 0;
1673 }
1674
1675 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1676                                         u64 mcg_cap)
1677 {
1678         int r;
1679         unsigned bank_num = mcg_cap & 0xff, bank;
1680
1681         r = -EINVAL;
1682         if (!bank_num)
1683                 goto out;
1684         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1685                 goto out;
1686         r = 0;
1687         vcpu->arch.mcg_cap = mcg_cap;
1688         /* Init IA32_MCG_CTL to all 1s */
1689         if (mcg_cap & MCG_CTL_P)
1690                 vcpu->arch.mcg_ctl = ~(u64)0;
1691         /* Init IA32_MCi_CTL to all 1s */
1692         for (bank = 0; bank < bank_num; bank++)
1693                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
1694 out:
1695         return r;
1696 }
1697
1698 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1699                                       struct kvm_x86_mce *mce)
1700 {
1701         u64 mcg_cap = vcpu->arch.mcg_cap;
1702         unsigned bank_num = mcg_cap & 0xff;
1703         u64 *banks = vcpu->arch.mce_banks;
1704
1705         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
1706                 return -EINVAL;
1707         /*
1708          * if IA32_MCG_CTL is not all 1s, reporting of uncorrected
1709          * errors is disabled
1710          */
1711         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
1712             vcpu->arch.mcg_ctl != ~(u64)0)
1713                 return 0;
1714         banks += 4 * mce->bank;
1715         /*
1716          * if IA32_MCi_CTL is not all 1s, reporting of uncorrected
1717          * errors is disabled for the bank
1718          */
1719         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
1720                 return 0;
1721         if (mce->status & MCI_STATUS_UC) {
1722                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1723                     !(vcpu->arch.cr4 & X86_CR4_MCE)) {
1724                         printk(KERN_DEBUG "kvm: set_mce: "
1725                                "injects mce exception while "
1726                                "previous one is in progress!\n");
1727                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1728                         return 0;
1729                 }
1730                 if (banks[1] & MCI_STATUS_VAL)
1731                         mce->status |= MCI_STATUS_OVER;
1732                 banks[2] = mce->addr;
1733                 banks[3] = mce->misc;
1734                 vcpu->arch.mcg_status = mce->mcg_status;
1735                 banks[1] = mce->status;
1736                 kvm_queue_exception(vcpu, MC_VECTOR);
1737         } else if (!(banks[1] & MCI_STATUS_VAL)
1738                    || !(banks[1] & MCI_STATUS_UC)) {
1739                 if (banks[1] & MCI_STATUS_VAL)
1740                         mce->status |= MCI_STATUS_OVER;
1741                 banks[2] = mce->addr;
1742                 banks[3] = mce->misc;
1743                 banks[1] = mce->status;
1744         } else
1745                 banks[1] |= MCI_STATUS_OVER;
1746         return 0;
1747 }
1748
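/*
 * Illustrative only, not part of the kernel build: the two handlers above
 * back the KVM_X86_SETUP_MCE and KVM_X86_SET_MCE vcpu ioctls.  A minimal
 * userspace sketch that enables MCE emulation and injects one uncorrected
 * error into bank 0; kvm_fd/vcpu_fd are assumed open, error checking is
 * omitted, and the raw bits follow the architectural MCA layout:
 *
 *	__u64 mcg_cap;
 *	struct kvm_x86_mce mce = {
 *		.bank       = 0,
 *		.status     = (1ULL << 63) | (1ULL << 61),	/* VAL | UC */
 *		.mcg_status = (1ULL << 2)  | (1ULL << 0),	/* MCIP | RIPV */
 *	};
 *
 *	ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap);
 *	ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
 *	ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
 */
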
1749 long kvm_arch_vcpu_ioctl(struct file *filp,
1750                          unsigned int ioctl, unsigned long arg)
1751 {
1752         struct kvm_vcpu *vcpu = filp->private_data;
1753         void __user *argp = (void __user *)arg;
1754         int r;
1755         struct kvm_lapic_state *lapic = NULL;
1756
1757         switch (ioctl) {
1758         case KVM_GET_LAPIC: {
1759                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1760
1761                 r = -ENOMEM;
1762                 if (!lapic)
1763                         goto out;
1764                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1765                 if (r)
1766                         goto out;
1767                 r = -EFAULT;
1768                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1769                         goto out;
1770                 r = 0;
1771                 break;
1772         }
1773         case KVM_SET_LAPIC: {
1774                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1775                 r = -ENOMEM;
1776                 if (!lapic)
1777                         goto out;
1778                 r = -EFAULT;
1779                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1780                         goto out;
1781                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1782                 if (r)
1783                         goto out;
1784                 r = 0;
1785                 break;
1786         }
1787         case KVM_INTERRUPT: {
1788                 struct kvm_interrupt irq;
1789
1790                 r = -EFAULT;
1791                 if (copy_from_user(&irq, argp, sizeof irq))
1792                         goto out;
1793                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1794                 if (r)
1795                         goto out;
1796                 r = 0;
1797                 break;
1798         }
1799         case KVM_NMI: {
1800                 r = kvm_vcpu_ioctl_nmi(vcpu);
1801                 if (r)
1802                         goto out;
1803                 r = 0;
1804                 break;
1805         }
1806         case KVM_SET_CPUID: {
1807                 struct kvm_cpuid __user *cpuid_arg = argp;
1808                 struct kvm_cpuid cpuid;
1809
1810                 r = -EFAULT;
1811                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1812                         goto out;
1813                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1814                 if (r)
1815                         goto out;
1816                 break;
1817         }
1818         case KVM_SET_CPUID2: {
1819                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1820                 struct kvm_cpuid2 cpuid;
1821
1822                 r = -EFAULT;
1823                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1824                         goto out;
1825                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1826                                               cpuid_arg->entries);
1827                 if (r)
1828                         goto out;
1829                 break;
1830         }
1831         case KVM_GET_CPUID2: {
1832                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1833                 struct kvm_cpuid2 cpuid;
1834
1835                 r = -EFAULT;
1836                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1837                         goto out;
1838                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1839                                               cpuid_arg->entries);
1840                 if (r)
1841                         goto out;
1842                 r = -EFAULT;
1843                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1844                         goto out;
1845                 r = 0;
1846                 break;
1847         }
1848         case KVM_GET_MSRS:
1849                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1850                 break;
1851         case KVM_SET_MSRS:
1852                 r = msr_io(vcpu, argp, do_set_msr, 0);
1853                 break;
1854         case KVM_TPR_ACCESS_REPORTING: {
1855                 struct kvm_tpr_access_ctl tac;
1856
1857                 r = -EFAULT;
1858                 if (copy_from_user(&tac, argp, sizeof tac))
1859                         goto out;
1860                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1861                 if (r)
1862                         goto out;
1863                 r = -EFAULT;
1864                 if (copy_to_user(argp, &tac, sizeof tac))
1865                         goto out;
1866                 r = 0;
1867                 break;
1868         }
1869         case KVM_SET_VAPIC_ADDR: {
1870                 struct kvm_vapic_addr va;
1871
1872                 r = -EINVAL;
1873                 if (!irqchip_in_kernel(vcpu->kvm))
1874                         goto out;
1875                 r = -EFAULT;
1876                 if (copy_from_user(&va, argp, sizeof va))
1877                         goto out;
1878                 r = 0;
1879                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1880                 break;
1881         }
1882         case KVM_X86_SETUP_MCE: {
1883                 u64 mcg_cap;
1884
1885                 r = -EFAULT;
1886                 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
1887                         goto out;
1888                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
1889                 break;
1890         }
1891         case KVM_X86_SET_MCE: {
1892                 struct kvm_x86_mce mce;
1893
1894                 r = -EFAULT;
1895                 if (copy_from_user(&mce, argp, sizeof mce))
1896                         goto out;
1897                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1898                 break;
1899         }
1900         default:
1901                 r = -EINVAL;
1902         }
1903 out:
1904         kfree(lapic);
1905         return r;
1906 }
1907
1908 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1909 {
1910         int ret;
1911
1912         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1913                 return -1;
1914         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1915         return ret;
1916 }
1917
1918 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1919                                           u32 kvm_nr_mmu_pages)
1920 {
1921         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1922                 return -EINVAL;
1923
1924         down_write(&kvm->slots_lock);
1925         spin_lock(&kvm->mmu_lock);
1926
1927         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1928         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1929
1930         spin_unlock(&kvm->mmu_lock);
1931         up_write(&kvm->slots_lock);
1932         return 0;
1933 }
1934
1935 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1936 {
1937         return kvm->arch.n_alloc_mmu_pages;
1938 }
1939
1940 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1941 {
1942         int i;
1943         struct kvm_mem_alias *alias;
1944
1945         for (i = 0; i < kvm->arch.naliases; ++i) {
1946                 alias = &kvm->arch.aliases[i];
1947                 if (gfn >= alias->base_gfn
1948                     && gfn < alias->base_gfn + alias->npages)
1949                         return alias->target_gfn + gfn - alias->base_gfn;
1950         }
1951         return gfn;
1952 }
1953
1954 /*
1955  * Set a new alias region.  Aliases map a portion of physical memory into
1956  * another portion.  This is useful for memory windows, for example the PC
1957  * VGA region.
1958  */
1959 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1960                                          struct kvm_memory_alias *alias)
1961 {
1962         int r, n;
1963         struct kvm_mem_alias *p;
1964
1965         r = -EINVAL;
1966         /* General sanity checks */
1967         if (alias->memory_size & (PAGE_SIZE - 1))
1968                 goto out;
1969         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1970                 goto out;
1971         if (alias->slot >= KVM_ALIAS_SLOTS)
1972                 goto out;
1973         if (alias->guest_phys_addr + alias->memory_size
1974             < alias->guest_phys_addr)
1975                 goto out;
1976         if (alias->target_phys_addr + alias->memory_size
1977             < alias->target_phys_addr)
1978                 goto out;
1979
1980         down_write(&kvm->slots_lock);
1981         spin_lock(&kvm->mmu_lock);
1982
1983         p = &kvm->arch.aliases[alias->slot];
1984         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1985         p->npages = alias->memory_size >> PAGE_SHIFT;
1986         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1987
1988         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1989                 if (kvm->arch.aliases[n - 1].npages)
1990                         break;
1991         kvm->arch.naliases = n;
1992
1993         spin_unlock(&kvm->mmu_lock);
1994         kvm_mmu_zap_all(kvm);
1995
1996         up_write(&kvm->slots_lock);
1997
1998         return 0;
1999
2000 out:
2001         return r;
2002 }
2003
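/*
 * Illustrative only, not part of the kernel build: the handler above backs
 * the KVM_SET_MEMORY_ALIAS vm ioctl.  A minimal userspace sketch that
 * points the legacy VGA window at part of an already registered
 * framebuffer slot (the target address is purely hypothetical):
 *
 *	struct kvm_memory_alias alias = {
 *		.slot             = 0,
 *		.guest_phys_addr  = 0xa0000,	/* legacy VGA window */
 *		.memory_size      = 0x20000,	/* 128 KiB */
 *		.target_phys_addr = 0xf0000000,	/* hypothetical framebuffer */
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_MEMORY_ALIAS, &alias);
 */
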
2004 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2005 {
2006         int r;
2007
2008         r = 0;
2009         switch (chip->chip_id) {
2010         case KVM_IRQCHIP_PIC_MASTER:
2011                 memcpy(&chip->chip.pic,
2012                         &pic_irqchip(kvm)->pics[0],
2013                         sizeof(struct kvm_pic_state));
2014                 break;
2015         case KVM_IRQCHIP_PIC_SLAVE:
2016                 memcpy(&chip->chip.pic,
2017                         &pic_irqchip(kvm)->pics[1],
2018                         sizeof(struct kvm_pic_state));
2019                 break;
2020         case KVM_IRQCHIP_IOAPIC:
2021                 memcpy(&chip->chip.ioapic,
2022                         ioapic_irqchip(kvm),
2023                         sizeof(struct kvm_ioapic_state));
2024                 break;
2025         default:
2026                 r = -EINVAL;
2027                 break;
2028         }
2029         return r;
2030 }
2031
2032 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2033 {
2034         int r;
2035
2036         r = 0;
2037         switch (chip->chip_id) {
2038         case KVM_IRQCHIP_PIC_MASTER:
2039                 spin_lock(&pic_irqchip(kvm)->lock);
2040                 memcpy(&pic_irqchip(kvm)->pics[0],
2041                         &chip->chip.pic,
2042                         sizeof(struct kvm_pic_state));
2043                 spin_unlock(&pic_irqchip(kvm)->lock);
2044                 break;
2045         case KVM_IRQCHIP_PIC_SLAVE:
2046                 spin_lock(&pic_irqchip(kvm)->lock);
2047                 memcpy(&pic_irqchip(kvm)->pics[1],
2048                         &chip->chip.pic,
2049                         sizeof(struct kvm_pic_state));
2050                 spin_unlock(&pic_irqchip(kvm)->lock);
2051                 break;
2052         case KVM_IRQCHIP_IOAPIC:
2053                 mutex_lock(&kvm->irq_lock);
2054                 memcpy(ioapic_irqchip(kvm),
2055                         &chip->chip.ioapic,
2056                         sizeof(struct kvm_ioapic_state));
2057                 mutex_unlock(&kvm->irq_lock);
2058                 break;
2059         default:
2060                 r = -EINVAL;
2061                 break;
2062         }
2063         kvm_pic_update_irq(pic_irqchip(kvm));
2064         return r;
2065 }
2066
2067 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2068 {
2069         int r = 0;
2070
2071         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2072         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2073         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2074         return r;
2075 }
2076
2077 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2078 {
2079         int r = 0;
2080
2081         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2082         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
2083         kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2084         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2085         return r;
2086 }
2087
2088 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2089 {
2090         int r = 0;
2091
2092         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2093         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2094                 sizeof(ps->channels));
2095         ps->flags = kvm->arch.vpit->pit_state.flags;
2096         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2097         return r;
2098 }
2099
2100 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2101 {
2102         int r = 0, start = 0;
2103         u32 prev_legacy, cur_legacy;
2104         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2105         prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2106         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2107         if (!prev_legacy && cur_legacy)
2108                 start = 1;
2109         memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2110                sizeof(kvm->arch.vpit->pit_state.channels));
2111         kvm->arch.vpit->pit_state.flags = ps->flags;
2112         kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2113         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2114         return r;
2115 }
2116
2117 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2118                                  struct kvm_reinject_control *control)
2119 {
2120         if (!kvm->arch.vpit)
2121                 return -ENXIO;
2122         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2123         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2124         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2125         return 0;
2126 }
2127
2128 /*
2129  * Get (and clear) the dirty memory log for a memory slot.
2130  */
2131 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2132                                       struct kvm_dirty_log *log)
2133 {
2134         int r;
2135         int n;
2136         struct kvm_memory_slot *memslot;
2137         int is_dirty = 0;
2138
2139         down_write(&kvm->slots_lock);
2140
2141         r = kvm_get_dirty_log(kvm, log, &is_dirty);
2142         if (r)
2143                 goto out;
2144
2145         /* If nothing is dirty, don't bother messing with page tables. */
2146         if (is_dirty) {
2147                 spin_lock(&kvm->mmu_lock);
2148                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2149                 spin_unlock(&kvm->mmu_lock);
2150                 kvm_flush_remote_tlbs(kvm);
2151                 memslot = &kvm->memslots[log->slot];
2152                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
2153                 memset(memslot->dirty_bitmap, 0, n);
2154         }
2155         r = 0;
2156 out:
2157         up_write(&kvm->slots_lock);
2158         return r;
2159 }
2160
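/*
 * Illustrative only, not part of the kernel build: the function above backs
 * the KVM_GET_DIRTY_LOG vm ioctl.  Userspace supplies a bitmap with one bit
 * per page of the slot, sized to match the ALIGN(npages, BITS_PER_LONG) / 8
 * copy above.  A minimal sketch, assuming a 64-bit host, 4 KiB pages and a
 * slot of memory_size bytes:
 *
 *	unsigned long npages = memory_size / 4096;
 *	struct kvm_dirty_log log = { .slot = slot };
 *
 *	log.dirty_bitmap = calloc(1, ((npages + 63) / 64) * 8);
 *	if (!ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log)) {
 *		/* bit N set => page N was written since the previous call */
 *	}
 */
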
2161 long kvm_arch_vm_ioctl(struct file *filp,
2162                        unsigned int ioctl, unsigned long arg)
2163 {
2164         struct kvm *kvm = filp->private_data;
2165         void __user *argp = (void __user *)arg;
2166         int r = -EINVAL;
2167         /*
2168          * This union makes it completely explicit to gcc-3.x
2169          * that these two variables' stack usage should be
2170          * combined, not added together.
2171          */
2172         union {
2173                 struct kvm_pit_state ps;
2174                 struct kvm_pit_state2 ps2;
2175                 struct kvm_memory_alias alias;
2176                 struct kvm_pit_config pit_config;
2177         } u;
2178
2179         switch (ioctl) {
2180         case KVM_SET_TSS_ADDR:
2181                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
2182                 if (r < 0)
2183                         goto out;
2184                 break;
2185         case KVM_SET_MEMORY_REGION: {
2186                 struct kvm_memory_region kvm_mem;
2187                 struct kvm_userspace_memory_region kvm_userspace_mem;
2188
2189                 r = -EFAULT;
2190                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2191                         goto out;
2192                 kvm_userspace_mem.slot = kvm_mem.slot;
2193                 kvm_userspace_mem.flags = kvm_mem.flags;
2194                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2195                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2196                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2197                 if (r)
2198                         goto out;
2199                 break;
2200         }
2201         case KVM_SET_NR_MMU_PAGES:
2202                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2203                 if (r)
2204                         goto out;
2205                 break;
2206         case KVM_GET_NR_MMU_PAGES:
2207                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2208                 break;
2209         case KVM_SET_MEMORY_ALIAS:
2210                 r = -EFAULT;
2211                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2212                         goto out;
2213                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2214                 if (r)
2215                         goto out;
2216                 break;
2217         case KVM_CREATE_IRQCHIP:
2218                 r = -ENOMEM;
2219                 kvm->arch.vpic = kvm_create_pic(kvm);
2220                 if (kvm->arch.vpic) {
2221                         r = kvm_ioapic_init(kvm);
2222                         if (r) {
2223                                 kfree(kvm->arch.vpic);
2224                                 kvm->arch.vpic = NULL;
2225                                 goto out;
2226                         }
2227                 } else
2228                         goto out;
2229                 r = kvm_setup_default_irq_routing(kvm);
2230                 if (r) {
2231                         kfree(kvm->arch.vpic);
2232                         kfree(kvm->arch.vioapic);
2233                         goto out;
2234                 }
2235                 break;
2236         case KVM_CREATE_PIT:
2237                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2238                 goto create_pit;
2239         case KVM_CREATE_PIT2:
2240                 r = -EFAULT;
2241                 if (copy_from_user(&u.pit_config, argp,
2242                                    sizeof(struct kvm_pit_config)))
2243                         goto out;
2244         create_pit:
2245                 down_write(&kvm->slots_lock);
2246                 r = -EEXIST;
2247                 if (kvm->arch.vpit)
2248                         goto create_pit_unlock;
2249                 r = -ENOMEM;
2250                 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
2251                 if (kvm->arch.vpit)
2252                         r = 0;
2253         create_pit_unlock:
2254                 up_write(&kvm->slots_lock);
2255                 break;
2256         case KVM_IRQ_LINE_STATUS:
2257         case KVM_IRQ_LINE: {
2258                 struct kvm_irq_level irq_event;
2259
2260                 r = -EFAULT;
2261                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2262                         goto out;
2263                 if (irqchip_in_kernel(kvm)) {
2264                         __s32 status;
2265                         mutex_lock(&kvm->irq_lock);
2266                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2267                                         irq_event.irq, irq_event.level);
2268                         mutex_unlock(&kvm->irq_lock);
2269                         if (ioctl == KVM_IRQ_LINE_STATUS) {
2270                                 irq_event.status = status;
2271                                 if (copy_to_user(argp, &irq_event,
2272                                                         sizeof irq_event))
2273                                         goto out;
2274                         }
2275                         r = 0;
2276                 }
2277                 break;
2278         }
2279         case KVM_GET_IRQCHIP: {
2280                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2281                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2282
2283                 r = -ENOMEM;
2284                 if (!chip)
2285                         goto out;
2286                 r = -EFAULT;
2287                 if (copy_from_user(chip, argp, sizeof *chip))
2288                         goto get_irqchip_out;
2289                 r = -ENXIO;
2290                 if (!irqchip_in_kernel(kvm))
2291                         goto get_irqchip_out;
2292                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
2293                 if (r)
2294                         goto get_irqchip_out;
2295                 r = -EFAULT;
2296                 if (copy_to_user(argp, chip, sizeof *chip))
2297                         goto get_irqchip_out;
2298                 r = 0;
2299         get_irqchip_out:
2300                 kfree(chip);
2301                 if (r)
2302                         goto out;
2303                 break;
2304         }
2305         case KVM_SET_IRQCHIP: {
2306                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2307                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2308
2309                 r = -ENOMEM;
2310                 if (!chip)
2311                         goto out;
2312                 r = -EFAULT;
2313                 if (copy_from_user(chip, argp, sizeof *chip))
2314                         goto set_irqchip_out;
2315                 r = -ENXIO;
2316                 if (!irqchip_in_kernel(kvm))
2317                         goto set_irqchip_out;
2318                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
2319                 if (r)
2320                         goto set_irqchip_out;
2321                 r = 0;
2322         set_irqchip_out:
2323                 kfree(chip);
2324                 if (r)
2325                         goto out;
2326                 break;
2327         }
2328         case KVM_GET_PIT: {
2329                 r = -EFAULT;
2330                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2331                         goto out;
2332                 r = -ENXIO;
2333                 if (!kvm->arch.vpit)
2334                         goto out;
2335                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2336                 if (r)
2337                         goto out;
2338                 r = -EFAULT;
2339                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2340                         goto out;
2341                 r = 0;
2342                 break;
2343         }
2344         case KVM_SET_PIT: {
2345                 r = -EFAULT;
2346                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
2347                         goto out;
2348                 r = -ENXIO;
2349                 if (!kvm->arch.vpit)
2350                         goto out;
2351                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2352                 if (r)
2353                         goto out;
2354                 r = 0;
2355                 break;
2356         }
2357         case KVM_GET_PIT2: {
2358                 r = -ENXIO;
2359                 if (!kvm->arch.vpit)
2360                         goto out;
2361                 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
2362                 if (r)
2363                         goto out;
2364                 r = -EFAULT;
2365                 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
2366                         goto out;
2367                 r = 0;
2368                 break;
2369         }
2370         case KVM_SET_PIT2: {
2371                 r = -EFAULT;
2372                 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
2373                         goto out;
2374                 r = -ENXIO;
2375                 if (!kvm->arch.vpit)
2376                         goto out;
2377                 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
2378                 if (r)
2379                         goto out;
2380                 r = 0;
2381                 break;
2382         }
2383         case KVM_REINJECT_CONTROL: {
2384                 struct kvm_reinject_control control;
2385                 r = -EFAULT;
2386                 if (copy_from_user(&control, argp, sizeof(control)))
2387                         goto out;
2388                 r = kvm_vm_ioctl_reinject(kvm, &control);
2389                 if (r)
2390                         goto out;
2391                 r = 0;
2392                 break;
2393         }
2394         default:
2395                 ;
2396         }
2397 out:
2398         return r;
2399 }
2400
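/*
 * Illustrative only, not part of the kernel build: the vm ioctls handled
 * above are typically used by userspace in this order -- create the
 * in-kernel PIC/IOAPIC, create the PIT, then raise and lower interrupt
 * lines.  A minimal sketch (IRQ 4 is just an example line, error checking
 * omitted):
 *
 *	struct kvm_pit_config pit = { .flags = KVM_PIT_SPEAKER_DUMMY };
 *	struct kvm_irq_level irq = { .irq = 4, .level = 1 };
 *
 *	ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);
 *	ioctl(vm_fd, KVM_CREATE_PIT2, &pit);
 *
 *	ioctl(vm_fd, KVM_IRQ_LINE, &irq);	/* assert */
 *	irq.level = 0;
 *	ioctl(vm_fd, KVM_IRQ_LINE, &irq);	/* deassert */
 */
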
2401 static void kvm_init_msr_list(void)
2402 {
2403         u32 dummy[2];
2404         unsigned i, j;
2405
2406         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2407                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2408                         continue;
2409                 if (j < i)
2410                         msrs_to_save[j] = msrs_to_save[i];
2411                 j++;
2412         }
2413         num_msrs_to_save = j;
2414 }
2415
2416 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2417                            const void *v)
2418 {
2419         if (vcpu->arch.apic &&
2420             !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2421                 return 0;
2422
2423         return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
2424 }
2425
2426 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2427 {
2428         if (vcpu->arch.apic &&
2429             !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2430                 return 0;
2431
2432         return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
2433 }
2434
2435 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2436                                struct kvm_vcpu *vcpu)
2437 {
2438         void *data = val;
2439         int r = X86EMUL_CONTINUE;
2440
2441         while (bytes) {
2442                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2443                 unsigned offset = addr & (PAGE_SIZE-1);
2444                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2445                 int ret;
2446
2447                 if (gpa == UNMAPPED_GVA) {
2448                         r = X86EMUL_PROPAGATE_FAULT;
2449                         goto out;
2450                 }
2451                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2452                 if (ret < 0) {
2453                         r = X86EMUL_UNHANDLEABLE;
2454                         goto out;
2455                 }
2456
2457                 bytes -= toread;
2458                 data += toread;
2459                 addr += toread;
2460         }
2461 out:
2462         return r;
2463 }
2464
2465 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2466                                 struct kvm_vcpu *vcpu)
2467 {
2468         void *data = val;
2469         int r = X86EMUL_CONTINUE;
2470
2471         while (bytes) {
2472                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2473                 unsigned offset = addr & (PAGE_SIZE-1);
2474                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2475                 int ret;
2476
2477                 if (gpa == UNMAPPED_GVA) {
2478                         r = X86EMUL_PROPAGATE_FAULT;
2479                         goto out;
2480                 }
2481                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2482                 if (ret < 0) {
2483                         r = X86EMUL_UNHANDLEABLE;
2484                         goto out;
2485                 }
2486
2487                 bytes -= towrite;
2488                 data += towrite;
2489                 addr += towrite;
2490         }
2491 out:
2492         return r;
2493 }
2494
2495
2496 static int emulator_read_emulated(unsigned long addr,
2497                                   void *val,
2498                                   unsigned int bytes,
2499                                   struct kvm_vcpu *vcpu)
2500 {
2501         gpa_t                 gpa;
2502
2503         if (vcpu->mmio_read_completed) {
2504                 memcpy(val, vcpu->mmio_data, bytes);
2505                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
2506                                vcpu->mmio_phys_addr, *(u64 *)val);
2507                 vcpu->mmio_read_completed = 0;
2508                 return X86EMUL_CONTINUE;
2509         }
2510
2511         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2512
2513         /* For APIC access vmexit */
2514         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2515                 goto mmio;
2516
2517         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2518                                 == X86EMUL_CONTINUE)
2519                 return X86EMUL_CONTINUE;
2520         if (gpa == UNMAPPED_GVA)
2521                 return X86EMUL_PROPAGATE_FAULT;
2522
2523 mmio:
2524         /*
2525          * Is this MMIO handled locally?
2526          */
2527         if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2528                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
2529                 return X86EMUL_CONTINUE;
2530         }
2531
2532         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
2533
2534         vcpu->mmio_needed = 1;
2535         vcpu->mmio_phys_addr = gpa;
2536         vcpu->mmio_size = bytes;
2537         vcpu->mmio_is_write = 0;
2538
2539         return X86EMUL_UNHANDLEABLE;
2540 }
2541
2542 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2543                           const void *val, int bytes)
2544 {
2545         int ret;
2546
2547         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2548         if (ret < 0)
2549                 return 0;
2550         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2551         return 1;
2552 }
2553
2554 static int emulator_write_emulated_onepage(unsigned long addr,
2555                                            const void *val,
2556                                            unsigned int bytes,
2557                                            struct kvm_vcpu *vcpu)
2558 {
2559         gpa_t                 gpa;
2560
2561         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2562
2563         if (gpa == UNMAPPED_GVA) {
2564                 kvm_inject_page_fault(vcpu, addr, 2);
2565                 return X86EMUL_PROPAGATE_FAULT;
2566         }
2567
2568         /* For APIC access vmexit */
2569         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2570                 goto mmio;
2571
2572         if (emulator_write_phys(vcpu, gpa, val, bytes))
2573                 return X86EMUL_CONTINUE;
2574
2575 mmio:
2576         trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
2577         /*
2578          * Is this MMIO handled locally?
2579          */
2580         if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2581                 return X86EMUL_CONTINUE;
2582
2583         vcpu->mmio_needed = 1;
2584         vcpu->mmio_phys_addr = gpa;
2585         vcpu->mmio_size = bytes;
2586         vcpu->mmio_is_write = 1;
2587         memcpy(vcpu->mmio_data, val, bytes);
2588
2589         return X86EMUL_CONTINUE;
2590 }
2591
2592 int emulator_write_emulated(unsigned long addr,
2593                                    const void *val,
2594                                    unsigned int bytes,
2595                                    struct kvm_vcpu *vcpu)
2596 {
2597         /* Crossing a page boundary? */
2598         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2599                 int rc, now;
2600
2601                 now = -addr & ~PAGE_MASK;
2602                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2603                 if (rc != X86EMUL_CONTINUE)
2604                         return rc;
2605                 addr += now;
2606                 val += now;
2607                 bytes -= now;
2608         }
2609         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2610 }
2611 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2612
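/*
 * Note on the split above: "now = -addr & ~PAGE_MASK" is the number of
 * bytes from addr up to the next page boundary.  For example, with 4 KiB
 * pages, addr == 0x1ffe and bytes == 4 gives now == 2, so two bytes go to
 * the first page and the remaining two are written by the second call.
 */
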
2613 static int emulator_cmpxchg_emulated(unsigned long addr,
2614                                      const void *old,
2615                                      const void *new,
2616                                      unsigned int bytes,
2617                                      struct kvm_vcpu *vcpu)
2618 {
2619         static int reported;
2620
2621         if (!reported) {
2622                 reported = 1;
2623                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
2624         }
2625 #ifndef CONFIG_X86_64
2626         /* the guest's cmpxchg8b has to be emulated atomically */
2627         if (bytes == 8) {
2628                 gpa_t gpa;
2629                 struct page *page;
2630                 char *kaddr;
2631                 u64 val;
2632
2633                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2634
2635                 if (gpa == UNMAPPED_GVA ||
2636                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2637                         goto emul_write;
2638
2639                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2640                         goto emul_write;
2641
2642                 val = *(u64 *)new;
2643
2644                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2645
2646                 kaddr = kmap_atomic(page, KM_USER0);
2647                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2648                 kunmap_atomic(kaddr, KM_USER0);
2649                 kvm_release_page_dirty(page);
2650         }
2651 emul_write:
2652 #endif
2653
2654         return emulator_write_emulated(addr, new, bytes, vcpu);
2655 }
2656
2657 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2658 {
2659         return kvm_x86_ops->get_segment_base(vcpu, seg);
2660 }
2661
2662 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2663 {
2664         kvm_mmu_invlpg(vcpu, address);
2665         return X86EMUL_CONTINUE;
2666 }
2667
2668 int emulate_clts(struct kvm_vcpu *vcpu)
2669 {
2670         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2671         return X86EMUL_CONTINUE;
2672 }
2673
2674 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2675 {
2676         struct kvm_vcpu *vcpu = ctxt->vcpu;
2677
2678         switch (dr) {
2679         case 0 ... 3:
2680                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2681                 return X86EMUL_CONTINUE;
2682         default:
2683                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2684                 return X86EMUL_UNHANDLEABLE;
2685         }
2686 }
2687
2688 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2689 {
2690         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2691         int exception;
2692
2693         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2694         if (exception) {
2695                 /* FIXME: better handling */
2696                 return X86EMUL_UNHANDLEABLE;
2697         }
2698         return X86EMUL_CONTINUE;
2699 }
2700
2701 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2702 {
2703         u8 opcodes[4];
2704         unsigned long rip = kvm_rip_read(vcpu);
2705         unsigned long rip_linear;
2706
2707         if (!printk_ratelimit())
2708                 return;
2709
2710         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2711
2712         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2713
2714         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2715                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2716 }
2717 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2718
2719 static struct x86_emulate_ops emulate_ops = {
2720         .read_std            = kvm_read_guest_virt,
2721         .read_emulated       = emulator_read_emulated,
2722         .write_emulated      = emulator_write_emulated,
2723         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2724 };
2725
2726 static void cache_all_regs(struct kvm_vcpu *vcpu)
2727 {
2728         kvm_register_read(vcpu, VCPU_REGS_RAX);
2729         kvm_register_read(vcpu, VCPU_REGS_RSP);
2730         kvm_register_read(vcpu, VCPU_REGS_RIP);
2731         vcpu->arch.regs_dirty = ~0;
2732 }
2733
2734 int emulate_instruction(struct kvm_vcpu *vcpu,
2735                         struct kvm_run *run,
2736                         unsigned long cr2,
2737                         u16 error_code,
2738                         int emulation_type)
2739 {
2740         int r, shadow_mask;
2741         struct decode_cache *c;
2742
2743         kvm_clear_exception_queue(vcpu);
2744         vcpu->arch.mmio_fault_cr2 = cr2;
2745         /*
2746          * TODO: fix x86_emulate.c to use guest_read/write_register
2747          * instead of direct ->regs accesses; this can save hundreds of
2748          * cycles on Intel for instructions that don't read/change RSP,
2749          * for example.
2750          */
2751         cache_all_regs(vcpu);
2752
2753         vcpu->mmio_is_write = 0;
2754         vcpu->arch.pio.string = 0;
2755
2756         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2757                 int cs_db, cs_l;
2758                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2759
2760                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2761                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2762                 vcpu->arch.emulate_ctxt.mode =
2763                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2764                         ? X86EMUL_MODE_REAL : cs_l
2765                         ? X86EMUL_MODE_PROT64 : cs_db
2766                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2767
2768                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2769
2770                 /* Only allow emulation of specific instructions on #UD
2771                  * (namely VMMCALL, sysenter, sysexit, syscall) */
2772                 c = &vcpu->arch.emulate_ctxt.decode;
2773                 if (emulation_type & EMULTYPE_TRAP_UD) {
2774                         if (!c->twobyte)
2775                                 return EMULATE_FAIL;
2776                         switch (c->b) {
2777                         case 0x01: /* VMMCALL */
2778                                 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2779                                         return EMULATE_FAIL;
2780                                 break;
2781                         case 0x34: /* sysenter */
2782                         case 0x35: /* sysexit */
2783                                 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2784                                         return EMULATE_FAIL;
2785                                 break;
2786                         case 0x05: /* syscall */
2787                                 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2788                                         return EMULATE_FAIL;
2789                                 break;
2790                         default:
2791                                 return EMULATE_FAIL;
2792                         }
2793
2794                         if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
2795                                 return EMULATE_FAIL;
2796                 }
2797
2798                 ++vcpu->stat.insn_emulation;
2799                 if (r) {
2800                         ++vcpu->stat.insn_emulation_fail;
2801                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2802                                 return EMULATE_DONE;
2803                         return EMULATE_FAIL;
2804                 }
2805         }
2806
2807         if (emulation_type & EMULTYPE_SKIP) {
2808                 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2809                 return EMULATE_DONE;
2810         }
2811
2812         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2813         shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2814
2815         if (r == 0)
2816                 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2817
2818         if (vcpu->arch.pio.string)
2819                 return EMULATE_DO_MMIO;
2820
2821         if ((r || vcpu->mmio_is_write) && run) {
2822                 run->exit_reason = KVM_EXIT_MMIO;
2823                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2824                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2825                 run->mmio.len = vcpu->mmio_size;
2826                 run->mmio.is_write = vcpu->mmio_is_write;
2827         }
2828
2829         if (r) {
2830                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2831                         return EMULATE_DONE;
2832                 if (!vcpu->mmio_needed) {
2833                         kvm_report_emulation_failure(vcpu, "mmio");
2834                         return EMULATE_FAIL;
2835                 }
2836                 return EMULATE_DO_MMIO;
2837         }
2838
2839         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2840
2841         if (vcpu->mmio_is_write) {
2842                 vcpu->mmio_needed = 0;
2843                 return EMULATE_DO_MMIO;
2844         }
2845
2846         return EMULATE_DONE;
2847 }
2848 EXPORT_SYMBOL_GPL(emulate_instruction);
2849
2850 static int pio_copy_data(struct kvm_vcpu *vcpu)
2851 {
2852         void *p = vcpu->arch.pio_data;
2853         gva_t q = vcpu->arch.pio.guest_gva;
2854         unsigned bytes;
2855         int ret;
2856
2857         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2858         if (vcpu->arch.pio.in)
2859                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2860         else
2861                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2862         return ret;
2863 }
2864
2865 int complete_pio(struct kvm_vcpu *vcpu)
2866 {
2867         struct kvm_pio_request *io = &vcpu->arch.pio;
2868         long delta;
2869         int r;
2870         unsigned long val;
2871
2872         if (!io->string) {
2873                 if (io->in) {
2874                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2875                         memcpy(&val, vcpu->arch.pio_data, io->size);
2876                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2877                 }
2878         } else {
2879                 if (io->in) {
2880                         r = pio_copy_data(vcpu);
2881                         if (r)
2882                                 return r;
2883                 }
2884
2885                 delta = 1;
2886                 if (io->rep) {
2887                         delta *= io->cur_count;
2888                         /*
2889                          * The size of the register should really depend on
2890                          * the current address size.
2891                          */
2892                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2893                         val -= delta;
2894                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2895                 }
2896                 if (io->down)
2897                         delta = -delta;
2898                 delta *= io->size;
2899                 if (io->in) {
2900                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2901                         val += delta;
2902                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2903                 } else {
2904                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2905                         val += delta;
2906                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2907                 }
2908         }
2909
2910         io->count -= io->cur_count;
2911         io->cur_count = 0;
2912
2913         return 0;
2914 }
2915
2916 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
2917 {
2918         /* TODO: string I/O for an in-kernel device */
2919         int r;
2920
2921         if (vcpu->arch.pio.in)
2922                 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2923                                     vcpu->arch.pio.size, pd);
2924         else
2925                 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2926                                      vcpu->arch.pio.size, pd);
2927         return r;
2928 }
2929
2930 static int pio_string_write(struct kvm_vcpu *vcpu)
2931 {
2932         struct kvm_pio_request *io = &vcpu->arch.pio;
2933         void *pd = vcpu->arch.pio_data;
2934         int i, r = 0;
2935
2936         for (i = 0; i < io->cur_count; i++) {
2937                 if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
2938                                      io->port, io->size, pd)) {
2939                         r = -EOPNOTSUPP;
2940                         break;
2941                 }
2942                 pd += io->size;
2943         }
2944         return r;
2945 }
2946
2947 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2948                   int size, unsigned port)
2949 {
2950         unsigned long val;
2951
2952         vcpu->run->exit_reason = KVM_EXIT_IO;
2953         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2954         vcpu->run->io.size = vcpu->arch.pio.size = size;
2955         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2956         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2957         vcpu->run->io.port = vcpu->arch.pio.port = port;
2958         vcpu->arch.pio.in = in;
2959         vcpu->arch.pio.string = 0;
2960         vcpu->arch.pio.down = 0;
2961         vcpu->arch.pio.rep = 0;
2962
2963         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
2964                       size, 1);
2965
2966         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2967         memcpy(vcpu->arch.pio_data, &val, 4);
2968
2969         if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
2970                 complete_pio(vcpu);
2971                 return 1;
2972         }
2973         return 0;
2974 }
2975 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2976
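/*
 * Illustrative only, not part of the kernel build: when kvm_emulate_pio()
 * above cannot complete the access in the kernel it returns 0 and the vcpu
 * exits to userspace with KVM_EXIT_IO; the data sits in the vcpu's
 * mmap()ed run area at run->io.data_offset.  A minimal sketch of the
 * userspace side, where run is the struct kvm_run mapping of the vcpu fd
 * and handle_in()/handle_out() are hypothetical helpers:
 *
 *	ioctl(vcpu_fd, KVM_RUN, 0);
 *	if (run->exit_reason == KVM_EXIT_IO) {
 *		void *data = (char *)run + run->io.data_offset;
 *
 *		if (run->io.direction == KVM_EXIT_IO_OUT)
 *			handle_out(run->io.port, data,
 *				   run->io.size, run->io.count);
 *		else
 *			handle_in(run->io.port, data,
 *				  run->io.size, run->io.count);
 *	}
 */
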
2977 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2978                   int size, unsigned long count, int down,
2979                   gva_t address, int rep, unsigned port)
2980 {
2981         unsigned now, in_page;
2982         int ret = 0;
2983
2984         vcpu->run->exit_reason = KVM_EXIT_IO;
2985         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2986         vcpu->run->io.size = vcpu->arch.pio.size = size;
2987         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2988         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2989         vcpu->run->io.port = vcpu->arch.pio.port = port;
2990         vcpu->arch.pio.in = in;
2991         vcpu->arch.pio.string = 1;
2992         vcpu->arch.pio.down = down;
2993         vcpu->arch.pio.rep = rep;
2994
2995         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
2996                       size, count);
2997
2998         if (!count) {
2999                 kvm_x86_ops->skip_emulated_instruction(vcpu);
3000                 return 1;
3001         }
3002
3003         if (!down)
3004                 in_page = PAGE_SIZE - offset_in_page(address);
3005         else
3006                 in_page = offset_in_page(address) + size;
3007         now = min(count, (unsigned long)in_page / size);
3008         if (!now)
3009                 now = 1;
3010         if (down) {
3011                 /*
3012                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
3013                  */
3014                 pr_unimpl(vcpu, "guest string pio down\n");
3015                 kvm_inject_gp(vcpu, 0);
3016                 return 1;
3017         }
3018         vcpu->run->io.count = now;
3019         vcpu->arch.pio.cur_count = now;
3020
3021         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3022                 kvm_x86_ops->skip_emulated_instruction(vcpu);
3023
3024         vcpu->arch.pio.guest_gva = address;
3025
3026         if (!vcpu->arch.pio.in) {
3027                 /* string PIO write */
3028                 ret = pio_copy_data(vcpu);
3029                 if (ret == X86EMUL_PROPAGATE_FAULT) {
3030                         kvm_inject_gp(vcpu, 0);
3031                         return 1;
3032                 }
3033                 if (ret == 0 && !pio_string_write(vcpu)) {
3034                         complete_pio(vcpu);
3035                         if (vcpu->arch.pio.count == 0)
3036                                 ret = 1;
3037                 }
3038         }
3039         /* no string PIO read support yet */
3040
3041         return ret;
3042 }
3043 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
3044
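/*
 * Dummy IPI handler: its only purpose is to kick a remote cpu out of
 * guest mode so kvmclock is refreshed on the next guest entry.
 */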
3045 static void bounce_off(void *info)
3046 {
3047         /* nothing */
3048 }
3049
3050 static unsigned int  ref_freq;
3051 static unsigned long tsc_khz_ref;
3052
3053 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3054                                      void *data)
3055 {
3056         struct cpufreq_freqs *freq = data;
3057         struct kvm *kvm;
3058         struct kvm_vcpu *vcpu;
3059         int i, send_ipi = 0;
3060
3061         if (!ref_freq)
3062                 ref_freq = freq->old;
3063
3064         if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3065                 return 0;
3066         if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3067                 return 0;
3068         per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
3069
3070         spin_lock(&kvm_lock);
3071         list_for_each_entry(kvm, &vm_list, vm_list) {
3072                 kvm_for_each_vcpu(i, vcpu, kvm) {
3073                         if (vcpu->cpu != freq->cpu)
3074                                 continue;
3075                         if (!kvm_request_guest_time_update(vcpu))
3076                                 continue;
3077                         if (vcpu->cpu != smp_processor_id())
3078                                 send_ipi++;
3079                 }
3080         }
3081         spin_unlock(&kvm_lock);
3082
3083         if (freq->old < freq->new && send_ipi) {
3084                 /*
3085                  * We upscale the frequency.  Make sure the guest
3086                  * doesn't see stale kvmclock values while running at
3087                  * the new frequency; otherwise the guest could see
3088                  * time go backwards.
3089                  *
3090                  * In case we update the frequency for another cpu
3091                  * (which might be in guest context) send an interrupt
3092                  * to kick the cpu out of guest context.  Next time
3093                  * guest context is entered kvmclock will be updated,
3094                  * so the guest will not see stale values.
3095                  */
3096                 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
3097         }
3098         return 0;
3099 }
3100
3101 static struct notifier_block kvmclock_cpufreq_notifier_block = {
3102         .notifier_call  = kvmclock_cpufreq_notifier
3103 };
3104
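/*
 * Arch-wide init, called when the vendor module (kvm-intel/kvm-amd)
 * registers its kvm_x86_ops.  Sets up the MMU shadow-pte masks, the
 * per-cpu TSC frequency values and, when the TSC is not constant, a
 * cpufreq notifier to keep kvmclock in sync.
 */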
3105 int kvm_arch_init(void *opaque)
3106 {
3107         int r, cpu;
3108         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3109
3110         if (kvm_x86_ops) {
3111                 printk(KERN_ERR "kvm: already loaded the other module\n");
3112                 r = -EEXIST;
3113                 goto out;
3114         }
3115
3116         if (!ops->cpu_has_kvm_support()) {
3117                 printk(KERN_ERR "kvm: no hardware support\n");
3118                 r = -EOPNOTSUPP;
3119                 goto out;
3120         }
3121         if (ops->disabled_by_bios()) {
3122                 printk(KERN_ERR "kvm: disabled by bios\n");
3123                 r = -EOPNOTSUPP;
3124                 goto out;
3125         }
3126
3127         r = kvm_mmu_module_init();
3128         if (r)
3129                 goto out;
3130
3131         kvm_init_msr_list();
3132
3133         kvm_x86_ops = ops;
3134         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3135         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
3136         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3137                         PT_DIRTY_MASK, PT64_NX_MASK, 0);
3138
3139         for_each_possible_cpu(cpu)
3140                 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3141         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3142                 tsc_khz_ref = tsc_khz;
3143                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3144                                           CPUFREQ_TRANSITION_NOTIFIER);
3145         }
3146
3147         return 0;
3148
3149 out:
3150         return r;
3151 }
3152
3153 void kvm_arch_exit(void)
3154 {
3155         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
3156                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
3157                                             CPUFREQ_TRANSITION_NOTIFIER);
3158         kvm_x86_ops = NULL;
3159         kvm_mmu_module_exit();
3160 }
3161
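/*
 * HLT emulation: with an in-kernel irqchip the vcpu simply moves to
 * the HALTED state; otherwise userspace is asked to handle the halt.
 */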
3162 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
3163 {
3164         ++vcpu->stat.halt_exits;
3165         if (irqchip_in_kernel(vcpu->kvm)) {
3166                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
3167                 return 1;
3168         } else {
3169                 vcpu->run->exit_reason = KVM_EXIT_HLT;
3170                 return 0;
3171         }
3172 }
3173 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
3174
3175 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3176                            unsigned long a1)
3177 {
3178         if (is_long_mode(vcpu))
3179                 return a0;
3180         else
3181                 return a0 | ((gpa_t)a1 << 32);
3182 }
3183
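/*
 * Hypercall ABI: number in RAX, arguments in RBX/RCX/RDX/RSI, return
 * value in RAX.  For non-long-mode guests the inputs are truncated to
 * 32 bits.
 */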
3184 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3185 {
3186         unsigned long nr, a0, a1, a2, a3, ret;
3187         int r = 1;
3188
3189         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3190         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3191         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
3192         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
3193         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
3194
3195         trace_kvm_hypercall(nr, a0, a1, a2, a3);
3196
3197         if (!is_long_mode(vcpu)) {
3198                 nr &= 0xFFFFFFFF;
3199                 a0 &= 0xFFFFFFFF;
3200                 a1 &= 0xFFFFFFFF;
3201                 a2 &= 0xFFFFFFFF;
3202                 a3 &= 0xFFFFFFFF;
3203         }
3204
3205         switch (nr) {
3206         case KVM_HC_VAPIC_POLL_IRQ:
3207                 ret = 0;
3208                 break;
3209         case KVM_HC_MMU_OP:
3210                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
3211                 break;
3212         default:
3213                 ret = -KVM_ENOSYS;
3214                 break;
3215         }
3216         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3217         ++vcpu->stat.hypercalls;
3218         return r;
3219 }
3220 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3221
3222 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3223 {
3224         char instruction[3];
3225         int ret = 0;
3226         unsigned long rip = kvm_rip_read(vcpu);
3227
3228
3229         /*
3230          * Blow out the MMU so that no other VCPU keeps a stale mapping,
3231          * which ensures the patched hypercall appears atomically across
3232          * all VCPUs.
3233          */
3234         kvm_mmu_zap_all(vcpu->kvm);
3235
3236         kvm_x86_ops->patch_hypercall(vcpu, instruction);
3237         if (emulator_write_emulated(rip, instruction, 3, vcpu)
3238             != X86EMUL_CONTINUE)
3239                 ret = -EFAULT;
3240
3241         return ret;
3242 }
3243
3244 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3245 {
3246         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3247 }
3248
3249 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3250 {
3251         struct descriptor_table dt = { limit, base };
3252
3253         kvm_x86_ops->set_gdt(vcpu, &dt);
3254 }
3255
3256 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3257 {
3258         struct descriptor_table dt = { limit, base };
3259
3260         kvm_x86_ops->set_idt(vcpu, &dt);
3261 }
3262
3263 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3264                    unsigned long *rflags)
3265 {
3266         kvm_lmsw(vcpu, msw);
3267         *rflags = kvm_x86_ops->get_rflags(vcpu);
3268 }
3269
3270 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3271 {
3272         unsigned long value;
3273
3274         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3275         switch (cr) {
3276         case 0:
3277                 value = vcpu->arch.cr0;
3278                 break;
3279         case 2:
3280                 value = vcpu->arch.cr2;
3281                 break;
3282         case 3:
3283                 value = vcpu->arch.cr3;
3284                 break;
3285         case 4:
3286                 value = vcpu->arch.cr4;
3287                 break;
3288         case 8:
3289                 value = kvm_get_cr8(vcpu);
3290                 break;
3291         default:
3292                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __func__, cr);
3293                 return 0;
3294         }
3295
3296         return value;
3297 }
3298
3299 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3300                      unsigned long *rflags)
3301 {
3302         switch (cr) {
3303         case 0:
3304                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3305                 *rflags = kvm_x86_ops->get_rflags(vcpu);
3306                 break;
3307         case 2:
3308                 vcpu->arch.cr2 = val;
3309                 break;
3310         case 3:
3311                 kvm_set_cr3(vcpu, val);
3312                 break;
3313         case 4:
3314                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3315                 break;
3316         case 8:
3317                 kvm_set_cr8(vcpu, val & 0xfUL);
3318                 break;
3319         default:
3320                 vcpu_printf(vcpu, "%s: unexpected cr %d\n", __func__, cr);
3321         }
3322 }
3323
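/*
 * Some CPUID leaves (e.g. leaf 2) are "stateful": successive reads of
 * the same function return successive entries.  Advance the read-next
 * marker to the following entry with the same function number.
 */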
3324 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3325 {
3326         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3327         int j, nent = vcpu->arch.cpuid_nent;
3328
3329         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3330         /* when no next entry is found, the current entry[i] is reselected */
3331         for (j = i + 1; ; j = (j + 1) % nent) {
3332                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3333                 if (ej->function == e->function) {
3334                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3335                         return j;
3336                 }
3337         }
3338         return 0; /* silence gcc, even though control never reaches here */
3339 }
3340
3341 /* find an entry with matching function, matching index (if needed), and that
3342  * should be read next (if it's stateful) */
3343 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3344         u32 function, u32 index)
3345 {
3346         if (e->function != function)
3347                 return 0;
3348         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3349                 return 0;
3350         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3351             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3352                 return 0;
3353         return 1;
3354 }
3355
3356 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3357                                               u32 function, u32 index)
3358 {
3359         int i;
3360         struct kvm_cpuid_entry2 *best = NULL;
3361
3362         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3363                 struct kvm_cpuid_entry2 *e;
3364
3365                 e = &vcpu->arch.cpuid_entries[i];
3366                 if (is_matching_cpuid_entry(e, function, index)) {
3367                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3368                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3369                         best = e;
3370                         break;
3371                 }
3372                 /*
3373                  * Both basic or both extended?
3374                  */
3375                 if (((e->function ^ function) & 0x80000000) == 0)
3376                         if (!best || e->function > best->function)
3377                                 best = e;
3378         }
3379         return best;
3380 }
3381
3382 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3383 {
3384         struct kvm_cpuid_entry2 *best;
3385
3386         best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3387         if (best)
3388                 return best->eax & 0xff;
3389         return 36;
3390 }
3391
3392 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3393 {
3394         u32 function, index;
3395         struct kvm_cpuid_entry2 *best;
3396
3397         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3398         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3399         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3400         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3401         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3402         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3403         best = kvm_find_cpuid_entry(vcpu, function, index);
3404         if (best) {
3405                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3406                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3407                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3408                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3409         }
3410         kvm_x86_ops->skip_emulated_instruction(vcpu);
3411         trace_kvm_cpuid(function,
3412                         kvm_register_read(vcpu, VCPU_REGS_RAX),
3413                         kvm_register_read(vcpu, VCPU_REGS_RBX),
3414                         kvm_register_read(vcpu, VCPU_REGS_RCX),
3415                         kvm_register_read(vcpu, VCPU_REGS_RDX));
3416 }
3417 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3418
3419 /*
3420  * Check if userspace requested an interrupt window, and that the
3421  * interrupt window is open.
3422  *
3423  * No need to exit to userspace if we already have an interrupt queued.
3424  */
3425 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3426                                           struct kvm_run *kvm_run)
3427 {
3428         return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3429                 kvm_run->request_interrupt_window &&
3430                 kvm_arch_interrupt_allowed(vcpu));
3431 }
3432
3433 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3434                               struct kvm_run *kvm_run)
3435 {
3436         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3437         kvm_run->cr8 = kvm_get_cr8(vcpu);
3438         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3439         if (irqchip_in_kernel(vcpu->kvm))
3440                 kvm_run->ready_for_interrupt_injection = 1;
3441         else
3442                 kvm_run->ready_for_interrupt_injection =
3443                         kvm_arch_interrupt_allowed(vcpu) &&
3444                         !kvm_cpu_has_interrupt(vcpu) &&
3445                         !kvm_event_needs_reinjection(vcpu);
3446 }
3447
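/*
 * Pin the guest page backing the virtual-APIC area so the TPR can be
 * synced to/from it around guest entry without faulting.
 */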
3448 static void vapic_enter(struct kvm_vcpu *vcpu)
3449 {
3450         struct kvm_lapic *apic = vcpu->arch.apic;
3451         struct page *page;
3452
3453         if (!apic || !apic->vapic_addr)
3454                 return;
3455
3456         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3457
3458         vcpu->arch.apic->vapic_page = page;
3459 }
3460
3461 static void vapic_exit(struct kvm_vcpu *vcpu)
3462 {
3463         struct kvm_lapic *apic = vcpu->arch.apic;
3464
3465         if (!apic || !apic->vapic_addr)
3466                 return;
3467
3468         down_read(&vcpu->kvm->slots_lock);
3469         kvm_release_page_dirty(apic->vapic_page);
3470         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3471         up_read(&vcpu->kvm->slots_lock);
3472 }
3473
3474 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3475 {
3476         int max_irr, tpr;
3477
3478         if (!kvm_x86_ops->update_cr8_intercept)
3479                 return;
3480
3481         if (!vcpu->arch.apic->vapic_addr)
3482                 max_irr = kvm_lapic_find_highest_irr(vcpu);
3483         else
3484                 max_irr = -1;
3485
3486         if (max_irr != -1)
3487                 max_irr >>= 4;
3488
3489         tpr = kvm_lapic_get_cr8(vcpu);
3490
3491         kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3492 }
3493
3494 static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3495 {
3496         /* try to reinject previous events if any */
3497         if (vcpu->arch.nmi_injected) {
3498                 kvm_x86_ops->set_nmi(vcpu);
3499                 return;
3500         }
3501
3502         if (vcpu->arch.interrupt.pending) {
3503                 kvm_x86_ops->set_irq(vcpu);
3504                 return;
3505         }
3506
3507         /* try to inject new event if pending */
3508         if (vcpu->arch.nmi_pending) {
3509                 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3510                         vcpu->arch.nmi_pending = false;
3511                         vcpu->arch.nmi_injected = true;
3512                         kvm_x86_ops->set_nmi(vcpu);
3513                 }
3514         } else if (kvm_cpu_has_interrupt(vcpu)) {
3515                 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3516                         kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3517                                             false);
3518                         kvm_x86_ops->set_irq(vcpu);
3519                 }
3520         }
3521 }
3522
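/*
 * One guest entry/exit cycle: process pending vcpu requests, inject
 * exceptions/interrupts, swap debug registers if needed, run the guest
 * with interrupts disabled and hand the resulting exit to the
 * vendor-specific handler.  Returns <= 0 if the caller must drop back
 * to userspace.
 */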
3523 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3524 {
3525         int r;
3526         bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3527                 kvm_run->request_interrupt_window;
3528
3529         if (vcpu->requests)
3530                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
3531                         kvm_mmu_unload(vcpu);
3532
3533         r = kvm_mmu_reload(vcpu);
3534         if (unlikely(r))
3535                 goto out;
3536
3537         if (vcpu->requests) {
3538                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
3539                         __kvm_migrate_timers(vcpu);
3540                 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3541                         kvm_write_guest_time(vcpu);
3542                 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
3543                         kvm_mmu_sync_roots(vcpu);
3544                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
3545                         kvm_x86_ops->tlb_flush(vcpu);
3546                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3547                                        &vcpu->requests)) {
3548                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
3549                         r = 0;
3550                         goto out;
3551                 }
3552                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3553                         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
3554                         r = 0;
3555                         goto out;
3556                 }
3557         }
3558
3559         preempt_disable();
3560
3561         kvm_x86_ops->prepare_guest_switch(vcpu);
3562         kvm_load_guest_fpu(vcpu);
3563
3564         local_irq_disable();
3565
3566         clear_bit(KVM_REQ_KICK, &vcpu->requests);
3567         smp_mb__after_clear_bit();
3568
3569         if (vcpu->requests || need_resched() || signal_pending(current)) {
3570                 set_bit(KVM_REQ_KICK, &vcpu->requests);
3571                 local_irq_enable();
3572                 preempt_enable();
3573                 r = 1;
3574                 goto out;
3575         }
3576
3577         if (vcpu->arch.exception.pending)
3578                 __queue_exception(vcpu);
3579         else
3580                 inject_pending_irq(vcpu, kvm_run);
3581
3582         /* enable NMI/IRQ window open exits if needed */
3583         if (vcpu->arch.nmi_pending)
3584                 kvm_x86_ops->enable_nmi_window(vcpu);
3585         else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3586                 kvm_x86_ops->enable_irq_window(vcpu);
3587
3588         if (kvm_lapic_enabled(vcpu)) {
3589                 update_cr8_intercept(vcpu);
3590                 kvm_lapic_sync_to_vapic(vcpu);
3591         }
3592
3593         up_read(&vcpu->kvm->slots_lock);
3594
3595         kvm_guest_enter();
3596
3597         get_debugreg(vcpu->arch.host_dr6, 6);
3598         get_debugreg(vcpu->arch.host_dr7, 7);
3599         if (unlikely(vcpu->arch.switch_db_regs)) {
3600                 get_debugreg(vcpu->arch.host_db[0], 0);
3601                 get_debugreg(vcpu->arch.host_db[1], 1);
3602                 get_debugreg(vcpu->arch.host_db[2], 2);
3603                 get_debugreg(vcpu->arch.host_db[3], 3);
3604
3605                 set_debugreg(0, 7);
3606                 set_debugreg(vcpu->arch.eff_db[0], 0);
3607                 set_debugreg(vcpu->arch.eff_db[1], 1);
3608                 set_debugreg(vcpu->arch.eff_db[2], 2);
3609                 set_debugreg(vcpu->arch.eff_db[3], 3);
3610         }
3611
3612         trace_kvm_entry(vcpu->vcpu_id);
3613         kvm_x86_ops->run(vcpu, kvm_run);
3614
3615         if (unlikely(vcpu->arch.switch_db_regs)) {
3616                 set_debugreg(0, 7);
3617                 set_debugreg(vcpu->arch.host_db[0], 0);
3618                 set_debugreg(vcpu->arch.host_db[1], 1);
3619                 set_debugreg(vcpu->arch.host_db[2], 2);
3620                 set_debugreg(vcpu->arch.host_db[3], 3);
3621         }
3622         set_debugreg(vcpu->arch.host_dr6, 6);
3623         set_debugreg(vcpu->arch.host_dr7, 7);
3624
3625         set_bit(KVM_REQ_KICK, &vcpu->requests);
3626         local_irq_enable();
3627
3628         ++vcpu->stat.exits;
3629
3630         /*
3631          * We must have an instruction between local_irq_enable() and
3632          * kvm_guest_exit(), so the timer interrupt isn't delayed by
3633          * the interrupt shadow.  The stat.exits increment will do nicely.
3634          * But we need to prevent reordering, hence this barrier():
3635          */
3636         barrier();
3637
3638         kvm_guest_exit();
3639
3640         preempt_enable();
3641
3642         down_read(&vcpu->kvm->slots_lock);
3643
3644         /*
3645          * Profile KVM exit RIPs:
3646          */
3647         if (unlikely(prof_on == KVM_PROFILING)) {
3648                 unsigned long rip = kvm_rip_read(vcpu);
3649                 profile_hit(KVM_PROFILING, (void *)rip);
3650         }
3651
3652
3653         kvm_lapic_sync_from_vapic(vcpu);
3654
3655         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
3656 out:
3657         return r;
3658 }
3659
3660
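/*
 * Main vcpu run loop: enter the guest while the vcpu is RUNNABLE,
 * block while it is halted, and break out on signals, userspace
 * interrupt-window requests or errors.
 */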
3661 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3662 {
3663         int r;
3664
3665         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3666                 pr_debug("vcpu %d received sipi with vector # %x\n",
3667                          vcpu->vcpu_id, vcpu->arch.sipi_vector);
3668                 kvm_lapic_reset(vcpu);
3669                 r = kvm_arch_vcpu_reset(vcpu);
3670                 if (r)
3671                         return r;
3672                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3673         }
3674
3675         down_read(&vcpu->kvm->slots_lock);
3676         vapic_enter(vcpu);
3677
3678         r = 1;
3679         while (r > 0) {
3680                 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3681                         r = vcpu_enter_guest(vcpu, kvm_run);
3682                 else {
3683                         up_read(&vcpu->kvm->slots_lock);
3684                         kvm_vcpu_block(vcpu);
3685                         down_read(&vcpu->kvm->slots_lock);
3686                         if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) {
3687                                 switch (vcpu->arch.mp_state) {
3689                                 case KVM_MP_STATE_HALTED:
3690                                         vcpu->arch.mp_state =
3691                                                 KVM_MP_STATE_RUNNABLE;
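                                        /* fall through */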
3692                                 case KVM_MP_STATE_RUNNABLE:
3693                                         break;
3694                                 case KVM_MP_STATE_SIPI_RECEIVED:
3695                                 default:
3696                                         r = -EINTR;
3697                                         break;
3698                                 }
3699                         }
3700                 }
3701
3702                 if (r <= 0)
3703                         break;
3704
3705                 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3706                 if (kvm_cpu_has_pending_timer(vcpu))
3707                         kvm_inject_pending_timer_irqs(vcpu);
3708
3709                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3710                         r = -EINTR;
3711                         kvm_run->exit_reason = KVM_EXIT_INTR;
3712                         ++vcpu->stat.request_irq_exits;
3713                 }
3714                 if (signal_pending(current)) {
3715                         r = -EINTR;
3716                         kvm_run->exit_reason = KVM_EXIT_INTR;
3717                         ++vcpu->stat.signal_exits;
3718                 }
3719                 if (need_resched()) {
3720                         up_read(&vcpu->kvm->slots_lock);
3721                         kvm_resched(vcpu);
3722                         down_read(&vcpu->kvm->slots_lock);
3723                 }
3724         }
3725
3726         up_read(&vcpu->kvm->slots_lock);
3727         post_kvm_run_save(vcpu, kvm_run);
3728
3729         vapic_exit(vcpu);
3730
3731         return r;
3732 }
3733
3734 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3735 {
3736         int r;
3737         sigset_t sigsaved;
3738
3739         vcpu_load(vcpu);
3740
3741         if (vcpu->sigset_active)
3742                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3743
3744         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3745                 kvm_vcpu_block(vcpu);
3746                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3747                 r = -EAGAIN;
3748                 goto out;
3749         }
3750
3751         /* re-sync apic's tpr */
3752         if (!irqchip_in_kernel(vcpu->kvm))
3753                 kvm_set_cr8(vcpu, kvm_run->cr8);
3754
3755         if (vcpu->arch.pio.cur_count) {
3756                 r = complete_pio(vcpu);
3757                 if (r)
3758                         goto out;
3759         }
3760 #ifdef CONFIG_HAS_IOMEM
3761         if (vcpu->mmio_needed) {
3762                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3763                 vcpu->mmio_read_completed = 1;
3764                 vcpu->mmio_needed = 0;
3765
3766                 down_read(&vcpu->kvm->slots_lock);
3767                 r = emulate_instruction(vcpu, kvm_run,
3768                                         vcpu->arch.mmio_fault_cr2, 0,
3769                                         EMULTYPE_NO_DECODE);
3770                 up_read(&vcpu->kvm->slots_lock);
3771                 if (r == EMULATE_DO_MMIO) {
3772                         /*
3773                          * Read-modify-write.  Back to userspace.
3774                          */
3775                         r = 0;
3776                         goto out;
3777                 }
3778         }
3779 #endif
3780         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3781                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3782                                      kvm_run->hypercall.ret);
3783
3784         r = __vcpu_run(vcpu, kvm_run);
3785
3786 out:
3787         if (vcpu->sigset_active)
3788                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3789
3790         vcpu_put(vcpu);
3791         return r;
3792 }
3793
3794 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3795 {
3796         vcpu_load(vcpu);
3797
3798         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3799         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3800         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3801         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3802         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3803         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3804         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3805         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3806 #ifdef CONFIG_X86_64
3807         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3808         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3809         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3810         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3811         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3812         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3813         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3814         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3815 #endif
3816
3817         regs->rip = kvm_rip_read(vcpu);
3818         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3819
3820         /*
3821          * Don't leak debug flags in case they were set for guest debugging
3822          */
3823         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3824                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3825
3826         vcpu_put(vcpu);
3827
3828         return 0;
3829 }
3830
3831 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3832 {
3833         vcpu_load(vcpu);
3834
3835         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3836         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3837         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3838         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3839         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3840         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3841         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3842         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3843 #ifdef CONFIG_X86_64
3844         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3845         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3846         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3847         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3848         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3849         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3850         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3851         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3852
3853 #endif
3854
3855         kvm_rip_write(vcpu, regs->rip);
3856         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3857
3858
3859         vcpu->arch.exception.pending = false;
3860
3861         vcpu_put(vcpu);
3862
3863         return 0;
3864 }
3865
3866 void kvm_get_segment(struct kvm_vcpu *vcpu,
3867                      struct kvm_segment *var, int seg)
3868 {
3869         kvm_x86_ops->get_segment(vcpu, var, seg);
3870 }
3871
3872 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3873 {
3874         struct kvm_segment cs;
3875
3876         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3877         *db = cs.db;
3878         *l = cs.l;
3879 }
3880 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3881
3882 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3883                                   struct kvm_sregs *sregs)
3884 {
3885         struct descriptor_table dt;
3886
3887         vcpu_load(vcpu);
3888
3889         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3890         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3891         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3892         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3893         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3894         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3895
3896         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3897         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3898
3899         kvm_x86_ops->get_idt(vcpu, &dt);
3900         sregs->idt.limit = dt.limit;
3901         sregs->idt.base = dt.base;
3902         kvm_x86_ops->get_gdt(vcpu, &dt);
3903         sregs->gdt.limit = dt.limit;
3904         sregs->gdt.base = dt.base;
3905
3906         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3907         sregs->cr0 = vcpu->arch.cr0;
3908         sregs->cr2 = vcpu->arch.cr2;
3909         sregs->cr3 = vcpu->arch.cr3;
3910         sregs->cr4 = vcpu->arch.cr4;
3911         sregs->cr8 = kvm_get_cr8(vcpu);
3912         sregs->efer = vcpu->arch.shadow_efer;
3913         sregs->apic_base = kvm_get_apic_base(vcpu);
3914
3915         memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3916
3917         if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3918                 set_bit(vcpu->arch.interrupt.nr,
3919                         (unsigned long *)sregs->interrupt_bitmap);
3920
3921         vcpu_put(vcpu);
3922
3923         return 0;
3924 }
3925
3926 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3927                                     struct kvm_mp_state *mp_state)
3928 {
3929         vcpu_load(vcpu);
3930         mp_state->mp_state = vcpu->arch.mp_state;
3931         vcpu_put(vcpu);
3932         return 0;
3933 }
3934
3935 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3936                                     struct kvm_mp_state *mp_state)
3937 {
3938         vcpu_load(vcpu);
3939         vcpu->arch.mp_state = mp_state->mp_state;
3940         vcpu_put(vcpu);
3941         return 0;
3942 }
3943
3944 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3945                         struct kvm_segment *var, int seg)
3946 {
3947         kvm_x86_ops->set_segment(vcpu, var, seg);
3948 }
3949
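/*
 * Convert a raw GDT/LDT descriptor into the kvm_segment layout,
 * expanding a page-granular limit into a byte limit.  A null selector
 * marks the segment unusable.
 */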
3950 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3951                                    struct kvm_segment *kvm_desct)
3952 {
3953         kvm_desct->base = seg_desc->base0;
3954         kvm_desct->base |= seg_desc->base1 << 16;
3955         kvm_desct->base |= seg_desc->base2 << 24;
3956         kvm_desct->limit = seg_desc->limit0;
3957         kvm_desct->limit |= seg_desc->limit << 16;
3958         if (seg_desc->g) {
3959                 kvm_desct->limit <<= 12;
3960                 kvm_desct->limit |= 0xfff;
3961         }
3962         kvm_desct->selector = selector;
3963         kvm_desct->type = seg_desc->type;
3964         kvm_desct->present = seg_desc->p;
3965         kvm_desct->dpl = seg_desc->dpl;
3966         kvm_desct->db = seg_desc->d;
3967         kvm_desct->s = seg_desc->s;
3968         kvm_desct->l = seg_desc->l;
3969         kvm_desct->g = seg_desc->g;
3970         kvm_desct->avl = seg_desc->avl;
3971         if (!selector)
3972                 kvm_desct->unusable = 1;
3973         else
3974                 kvm_desct->unusable = 0;
3975         kvm_desct->padding = 0;
3976 }
3977
3978 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3979                                           u16 selector,
3980                                           struct descriptor_table *dtable)
3981 {
3982         if (selector & 1 << 2) {
3983                 struct kvm_segment kvm_seg;
3984
3985                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3986
3987                 if (kvm_seg.unusable)
3988                         dtable->limit = 0;
3989                 else
3990                         dtable->limit = kvm_seg.limit;
3991                 dtable->base = kvm_seg.base;
3992         } else
3994                 kvm_x86_ops->get_gdt(vcpu, dtable);
3995 }
3996
3997 /* valid only for 8-byte segment descriptors */
3998 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3999                                          struct desc_struct *seg_desc)
4000 {
4001         gpa_t gpa;
4002         struct descriptor_table dtable;
4003         u16 index = selector >> 3;
4004
4005         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4006
4007         if (dtable.limit < index * 8 + 7) {
4008                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4009                 return 1;
4010         }
4011         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
4012         gpa += index * 8;
4013         return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
4014 }
4015
4016 /* valid only for 8-byte segment descriptors */
4017 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4018                                          struct desc_struct *seg_desc)
4019 {
4020         gpa_t gpa;
4021         struct descriptor_table dtable;
4022         u16 index = selector >> 3;
4023
4024         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4025
4026         if (dtable.limit < index * 8 + 7)
4027                 return 1;
4028         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
4029         gpa += index * 8;
4030         return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
4031 }
4032
4033 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
4034                              struct desc_struct *seg_desc)
4035 {
4036         u32 base_addr;
4037
4038         base_addr = seg_desc->base0;
4039         base_addr |= (seg_desc->base1 << 16);
4040         base_addr |= (seg_desc->base2 << 24);
4041
4042         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
4043 }
4044
4045 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4046 {
4047         struct kvm_segment kvm_seg;
4048
4049         kvm_get_segment(vcpu, &kvm_seg, seg);
4050         return kvm_seg.selector;
4051 }
4052
4053 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4054                                                 u16 selector,
4055                                                 struct kvm_segment *kvm_seg)
4056 {
4057         struct desc_struct seg_desc;
4058
4059         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4060                 return 1;
4061         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4062         return 0;
4063 }
4064
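/*
 * Real-mode segment load: no descriptor lookup, the base is simply
 * selector << 4 with a 64K limit and fully permissive attributes.
 */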
4065 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4066 {
4067         struct kvm_segment segvar = {
4068                 .base = selector << 4,
4069                 .limit = 0xffff,
4070                 .selector = selector,
4071                 .type = 3,
4072                 .present = 1,
4073                 .dpl = 3,
4074                 .db = 0,
4075                 .s = 1,
4076                 .l = 0,
4077                 .g = 0,
4078                 .avl = 0,
4079                 .unusable = 0,
4080         };
4081         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4082         return 0;
4083 }
4084
4085 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4086                                 int type_bits, int seg)
4087 {
4088         struct kvm_segment kvm_seg;
4089
4090         if (!(vcpu->arch.cr0 & X86_CR0_PE))
4091                 return kvm_load_realmode_segment(vcpu, selector, seg);
4092         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4093                 return 1;
4094         kvm_seg.type |= type_bits;
4095
4096         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
4097             seg != VCPU_SREG_LDTR)
4098                 if (!kvm_seg.s)
4099                         kvm_seg.unusable = 1;
4100
4101         kvm_set_segment(vcpu, &kvm_seg, seg);
4102         return 0;
4103 }
4104
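/*
 * Helpers for hardware task-switch emulation: dump the outgoing vcpu
 * state into a 32-bit TSS image and reload it from the incoming one.
 */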
4105 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4106                                 struct tss_segment_32 *tss)
4107 {
4108         tss->cr3 = vcpu->arch.cr3;
4109         tss->eip = kvm_rip_read(vcpu);
4110         tss->eflags = kvm_x86_ops->get_rflags(vcpu);
4111         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4112         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4113         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4114         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4115         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4116         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4117         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4118         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4119         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4120         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4121         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4122         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4123         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4124         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4125         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4126 }
4127
4128 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4129                                   struct tss_segment_32 *tss)
4130 {
4131         kvm_set_cr3(vcpu, tss->cr3);
4132
4133         kvm_rip_write(vcpu, tss->eip);
4134         kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
4135
4136         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4137         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4138         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4139         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4140         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4141         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4142         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4143         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4144
4145         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
4146                 return 1;
4147
4148         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4149                 return 1;
4150
4151         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4152                 return 1;
4153
4154         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4155                 return 1;
4156
4157         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4158                 return 1;
4159
4160         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
4161                 return 1;
4162
4163         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
4164                 return 1;
4165         return 0;
4166 }
4167
4168 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4169                                 struct tss_segment_16 *tss)
4170 {
4171         tss->ip = kvm_rip_read(vcpu);
4172         tss->flag = kvm_x86_ops->get_rflags(vcpu);
4173         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4174         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4175         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4176         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4177         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4178         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4179         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
4180         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
4181
4182         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4183         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4184         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4185         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4186         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4187         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4188 }
4189
4190 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4191                                  struct tss_segment_16 *tss)
4192 {
4193         kvm_rip_write(vcpu, tss->ip);
4194         kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
4195         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4196         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4197         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
4198         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
4199         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
4200         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
4201         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4202         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4203
4204         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
4205                 return 1;
4206
4207         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4208                 return 1;
4209
4210         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4211                 return 1;
4212
4213         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4214                 return 1;
4215
4216         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4217                 return 1;
4218         return 0;
4219 }
4220
4221 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4222                               u16 old_tss_sel, u32 old_tss_base,
4223                               struct desc_struct *nseg_desc)
4224 {
4225         struct tss_segment_16 tss_segment_16;
4226         int ret = 0;
4227
4228         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4229                            sizeof tss_segment_16))
4230                 goto out;
4231
4232         save_state_to_tss16(vcpu, &tss_segment_16);
4233
4234         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4235                             sizeof tss_segment_16))
4236                 goto out;
4237
4238         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4239                            &tss_segment_16, sizeof tss_segment_16))
4240                 goto out;
4241
4242         if (old_tss_sel != 0xffff) {
4243                 tss_segment_16.prev_task_link = old_tss_sel;
4244
4245                 if (kvm_write_guest(vcpu->kvm,
4246                                     get_tss_base_addr(vcpu, nseg_desc),
4247                                     &tss_segment_16.prev_task_link,
4248                                     sizeof tss_segment_16.prev_task_link))
4249                         goto out;
4250         }
4251
4252         if (load_state_from_tss16(vcpu, &tss_segment_16))
4253                 goto out;
4254
4255         ret = 1;
4256 out:
4257         return ret;
4258 }
4259
4260 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4261                        u16 old_tss_sel, u32 old_tss_base,
4262                        struct desc_struct *nseg_desc)
4263 {
4264         struct tss_segment_32 tss_segment_32;
4265         int ret = 0;
4266
4267         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4268                            sizeof tss_segment_32))
4269                 goto out;
4270
4271         save_state_to_tss32(vcpu, &tss_segment_32);
4272
4273         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4274                             sizeof tss_segment_32))
4275                 goto out;
4276
4277         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4278                            &tss_segment_32, sizeof tss_segment_32))
4279                 goto out;
4280
4281         if (old_tss_sel != 0xffff) {
4282                 tss_segment_32.prev_task_link = old_tss_sel;
4283
4284                 if (kvm_write_guest(vcpu->kvm,
4285                                     get_tss_base_addr(vcpu, nseg_desc),
4286                                     &tss_segment_32.prev_task_link,
4287                                     sizeof tss_segment_32.prev_task_link))
4288                         goto out;
4289         }
4290
4291         if (load_state_from_tss32(vcpu, &tss_segment_32))
4292                 goto out;
4293
4294         ret = 1;
4295 out:
4296         return ret;
4297 }
4298
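/*
 * Emulate a task switch: save the current state into the old TSS,
 * load the new TSS (16- or 32-bit depending on the descriptor type),
 * maintain the busy and NT flags, and finally set CR0.TS.
 */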
4299 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4300 {
4301         struct kvm_segment tr_seg;
4302         struct desc_struct cseg_desc;
4303         struct desc_struct nseg_desc;
4304         int ret = 0;
4305         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4306         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4307
4308         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
4309
4310         /* FIXME: Handle errors. Failure to read either TSS or its
4311          * descriptor should generate a page fault.
4312          */
4313         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
4314                 goto out;
4315
4316         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
4317                 goto out;
4318
4319         if (reason != TASK_SWITCH_IRET) {
4320                 int cpl;
4321
4322                 cpl = kvm_x86_ops->get_cpl(vcpu);
4323                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
4324                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4325                         return 1;
4326                 }
4327         }
4328
4329         if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
4330                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4331                 return 1;
4332         }
4333
4334         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
4335                 cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
4336                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
4337         }
4338
4339         if (reason == TASK_SWITCH_IRET) {
4340                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4341                 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4342         }
4343
4344         /* set back link to prev task only if NT bit is set in eflags
4345            note that old_tss_sel is not used after this point */
4346         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4347                 old_tss_sel = 0xffff;
4348
4354         if (nseg_desc.type & 8)
4355                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4356                                          old_tss_base, &nseg_desc);
4357         else
4358                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
4359                                          old_tss_base, &nseg_desc);
4360
4361         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4362                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4363                 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4364         }
4365
4366         if (reason != TASK_SWITCH_IRET) {
4367                 nseg_desc.type |= (1 << 1);
4368                 save_guest_segment_descriptor(vcpu, tss_selector,
4369                                               &nseg_desc);
4370         }
4371
4372         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
4373         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4374         tr_seg.type = 11;
4375         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
4376 out:
4377         return ret;
4378 }
4379 EXPORT_SYMBOL_GPL(kvm_task_switch);
4380
4381 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4382                                   struct kvm_sregs *sregs)
4383 {
4384         int mmu_reset_needed = 0;
4385         int pending_vec, max_bits;
4386         struct descriptor_table dt;
4387
4388         vcpu_load(vcpu);
4389
4390         dt.limit = sregs->idt.limit;
4391         dt.base = sregs->idt.base;
4392         kvm_x86_ops->set_idt(vcpu, &dt);
4393         dt.limit = sregs->gdt.limit;
4394         dt.base = sregs->gdt.base;
4395         kvm_x86_ops->set_gdt(vcpu, &dt);
4396
4397         vcpu->arch.cr2 = sregs->cr2;
4398         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4399         vcpu->arch.cr3 = sregs->cr3;
4400
4401         kvm_set_cr8(vcpu, sregs->cr8);
4402
4403         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4404         kvm_x86_ops->set_efer(vcpu, sregs->efer);
4405         kvm_set_apic_base(vcpu, sregs->apic_base);
4406
4407         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4408
4409         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4410         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4411         vcpu->arch.cr0 = sregs->cr0;
4412
4413         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4414         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4415         if (!is_long_mode(vcpu) && is_pae(vcpu))
4416                 load_pdptrs(vcpu, vcpu->arch.cr3);
4417
4418         if (mmu_reset_needed)
4419                 kvm_mmu_reset_context(vcpu);
4420
4421         max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4422         pending_vec = find_first_bit(
4423                 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
4424         if (pending_vec < max_bits) {
4425                 kvm_queue_interrupt(vcpu, pending_vec, false);
4426                 pr_debug("Set back pending irq %d\n", pending_vec);
4427                 if (irqchip_in_kernel(vcpu->kvm))
4428                         kvm_pic_clear_isr_ack(vcpu->kvm);
4429         }
4430
4431         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4432         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4433         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4434         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4435         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4436         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4437
4438         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4439         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4440
4441         /* Older userspace won't unhalt the vcpu on reset. */
4442         if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4443             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4444             !(vcpu->arch.cr0 & X86_CR0_PE))
4445                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4446
4447         vcpu_put(vcpu);
4448
4449         return 0;
4450 }
4451
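/*
 * KVM_SET_GUEST_DEBUG: when userspace enables hardware breakpoints the
 * effective debug registers come from dbg->arch.debugreg, otherwise
 * the guest's own values are used; optionally queue a #DB or #BP.
 */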
4452 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4453                                         struct kvm_guest_debug *dbg)
4454 {
4455         int i, r;
4456
4457         vcpu_load(vcpu);
4458
4459         if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
4460             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
4461                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4462                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4463                 vcpu->arch.switch_db_regs =
4464                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4465         } else {
4466                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4467                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4468                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4469         }
4470
4471         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
4472
4473         if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4474                 kvm_queue_exception(vcpu, DB_VECTOR);
4475         else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4476                 kvm_queue_exception(vcpu, BP_VECTOR);
4477
4478         vcpu_put(vcpu);
4479
4480         return r;
4481 }
4482
4483 /*
4484  * fxsave FPU state.  Taken from x86_64/processor.h.  To be removed once
4485  * we have asm/x86/processor.h.
4486  */
4487 struct fxsave {
4488         u16     cwd;
4489         u16     swd;
4490         u16     twd;
4491         u16     fop;
4492         u64     rip;
4493         u64     rdp;
4494         u32     mxcsr;
4495         u32     mxcsr_mask;
4496         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4497 #ifdef CONFIG_X86_64
4498         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4499 #else
4500         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4501 #endif
4502 };
4503
4504 /*
4505  * Translate a guest virtual address to a guest physical address.
4506  */
4507 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4508                                     struct kvm_translation *tr)
4509 {
4510         unsigned long vaddr = tr->linear_address;
4511         gpa_t gpa;
4512
4513         vcpu_load(vcpu);
4514         down_read(&vcpu->kvm->slots_lock);
4515         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4516         up_read(&vcpu->kvm->slots_lock);
4517         tr->physical_address = gpa;
4518         tr->valid = gpa != UNMAPPED_GVA;
4519         tr->writeable = 1;
4520         tr->usermode = 0;
4521         vcpu_put(vcpu);
4522
4523         return 0;
4524 }
4525
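/*
 * Copy the guest's FXSAVE image into the kvm_fpu structure handed back
 * to userspace (KVM_GET_FPU).
 */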
4526 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4527 {
4528         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4529
4530         vcpu_load(vcpu);
4531
4532         memcpy(fpu->fpr, fxsave->st_space, 128);
4533         fpu->fcw = fxsave->cwd;
4534         fpu->fsw = fxsave->swd;
4535         fpu->ftwx = fxsave->twd;
4536         fpu->last_opcode = fxsave->fop;
4537         fpu->last_ip = fxsave->rip;
4538         fpu->last_dp = fxsave->rdp;
4539         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4540
4541         vcpu_put(vcpu);
4542
4543         return 0;
4544 }
4545
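/*
 * Load the guest's FXSAVE image from the kvm_fpu structure supplied by
 * userspace (KVM_SET_FPU).  The counterpart of the getter above.
 */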
4546 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4547 {
4548         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4549
4550         vcpu_load(vcpu);
4551
4552         memcpy(fxsave->st_space, fpu->fpr, 128);
4553         fxsave->cwd = fpu->fcw;
4554         fxsave->swd = fpu->fsw;
4555         fxsave->twd = fpu->ftwx;
4556         fxsave->fop = fpu->last_opcode;
4557         fxsave->rip = fpu->last_ip;
4558         fxsave->rdp = fpu->last_dp;
4559         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4560
4561         vcpu_put(vcpu);
4562
4563         return 0;
4564 }
4565
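/*
 * Reset the vcpu's guest FPU image to a clean power-on state (MXCSR at
 * its 0x1f80 default, register space zeroed) without disturbing the
 * host FPU image, and set CR0.ET for the guest.
 */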
4566 void fx_init(struct kvm_vcpu *vcpu)
4567 {
4568         unsigned after_mxcsr_mask;
4569
4570         /*
4571          * Touch the FPU for the first time in a non-atomic context: if
4572          * this is the first FPU instruction, the exception handler will
4573          * fire before the instruction returns and will have to allocate
4574          * RAM with GFP_KERNEL.
4575          */
4576         if (!used_math())
4577                 kvm_fx_save(&vcpu->arch.host_fx_image);
4578
4579         /* Initialize guest FPU by resetting ours and saving into guest's */
4580         preempt_disable();
4581         kvm_fx_save(&vcpu->arch.host_fx_image);
4582         kvm_fx_finit();
4583         kvm_fx_save(&vcpu->arch.guest_fx_image);
4584         kvm_fx_restore(&vcpu->arch.host_fx_image);
4585         preempt_enable();
4586
4587         vcpu->arch.cr0 |= X86_CR0_ET;
4588         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4589         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4590         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4591                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4592 }
4593 EXPORT_SYMBOL_GPL(fx_init);
4594
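/*
 * Swap the guest FPU state in before entering the guest: save the host
 * image and restore the guest image.  No-op if the guest FPU is not
 * active or is already loaded.
 */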
4595 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4596 {
4597         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4598                 return;
4599
4600         vcpu->guest_fpu_loaded = 1;
4601         kvm_fx_save(&vcpu->arch.host_fx_image);
4602         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4603 }
4604 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4605
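/*
 * Swap the guest FPU state back out: save the guest image, restore the
 * host image and account the reload in the vcpu statistics.
 */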
4606 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4607 {
4608         if (!vcpu->guest_fpu_loaded)
4609                 return;
4610
4611         vcpu->guest_fpu_loaded = 0;
4612         kvm_fx_save(&vcpu->arch.guest_fx_image);
4613         kvm_fx_restore(&vcpu->arch.host_fx_image);
4614         ++vcpu->stat.fpu_reload;
4615 }
4616 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4617
4618 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4619 {
4620         if (vcpu->arch.time_page) {
4621                 kvm_release_page_dirty(vcpu->arch.time_page);
4622                 vcpu->arch.time_page = NULL;
4623         }
4624
4625         kvm_x86_ops->vcpu_free(vcpu);
4626 }
4627
4628 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4629                                                 unsigned int id)
4630 {
4631         return kvm_x86_ops->vcpu_create(kvm, id);
4632 }
4633
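/*
 * Second-stage vcpu setup: reset the vcpu to its power-on state and set
 * up its MMU.  On failure the vcpu is freed here.
 */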
4634 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4635 {
4636         int r;
4637
4638         /* We do fxsave: this must be aligned. */
4639         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4640
4641         vcpu->arch.mtrr_state.have_fixed = 1;
4642         vcpu_load(vcpu);
4643         r = kvm_arch_vcpu_reset(vcpu);
4644         if (r == 0)
4645                 r = kvm_mmu_setup(vcpu);
4646         vcpu_put(vcpu);
4647         if (r < 0)
4648                 goto free_vcpu;
4649
4650         return 0;
4651 free_vcpu:
4652         kvm_x86_ops->vcpu_free(vcpu);
4653         return r;
4654 }
4655
4656 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4657 {
4658         vcpu_load(vcpu);
4659         kvm_mmu_unload(vcpu);
4660         vcpu_put(vcpu);
4661
4662         kvm_x86_ops->vcpu_free(vcpu);
4663 }
4664
4665 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4666 {
4667         vcpu->arch.nmi_pending = false;
4668         vcpu->arch.nmi_injected = false;
4669
4670         vcpu->arch.switch_db_regs = 0;
4671         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4672         vcpu->arch.dr6 = DR6_FIXED_1;
4673         vcpu->arch.dr7 = DR7_FIXED_1;
4674
4675         return kvm_x86_ops->vcpu_reset(vcpu);
4676 }
4677
4678 void kvm_arch_hardware_enable(void *garbage)
4679 {
4680         kvm_x86_ops->hardware_enable(garbage);
4681 }
4682
4683 void kvm_arch_hardware_disable(void *garbage)
4684 {
4685         kvm_x86_ops->hardware_disable(garbage);
4686 }
4687
4688 int kvm_arch_hardware_setup(void)
4689 {
4690         return kvm_x86_ops->hardware_setup();
4691 }
4692
4693 void kvm_arch_hardware_unsetup(void)
4694 {
4695         kvm_x86_ops->hardware_unsetup();
4696 }
4697
4698 void kvm_arch_check_processor_compat(void *rtn)
4699 {
4700         kvm_x86_ops->check_processor_compatibility(rtn);
4701 }
4702
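/*
 * First-stage vcpu init: choose the initial mp_state (only the BSP
 * starts runnable when the irqchip is in the kernel), allocate the PIO
 * scratch page, create the MMU and, if needed, the in-kernel local APIC
 * and the MCE bank array.
 */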
4703 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4704 {
4705         struct page *page;
4706         struct kvm *kvm;
4707         int r;
4708
4709         BUG_ON(vcpu->kvm == NULL);
4710         kvm = vcpu->kvm;
4711
4712         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4713         if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
4714                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4715         else
4716                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4717
4718         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4719         if (!page) {
4720                 r = -ENOMEM;
4721                 goto fail;
4722         }
4723         vcpu->arch.pio_data = page_address(page);
4724
4725         r = kvm_mmu_create(vcpu);
4726         if (r < 0)
4727                 goto fail_free_pio_data;
4728
4729         if (irqchip_in_kernel(kvm)) {
4730                 r = kvm_create_lapic(vcpu);
4731                 if (r < 0)
4732                         goto fail_mmu_destroy;
4733         }
4734
4735         vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
4736                                        GFP_KERNEL);
4737         if (!vcpu->arch.mce_banks) {
4738                 r = -ENOMEM;
4739                 goto fail_mmu_destroy;
4740         }
4741         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4742
4743         return 0;
4744
4745 fail_mmu_destroy:
4746         kvm_mmu_destroy(vcpu);
4747 fail_free_pio_data:
4748         free_page((unsigned long)vcpu->arch.pio_data);
4749 fail:
4750         return r;
4751 }
4752
4753 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4754 {
4755         kvm_free_lapic(vcpu);
4756         down_read(&vcpu->kvm->slots_lock);
4757         kvm_mmu_destroy(vcpu);
4758         up_read(&vcpu->kvm->slots_lock);
4759         free_page((unsigned long)vcpu->arch.pio_data);
4760 }
4761
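/*
 * Allocate and minimally initialize the per-VM state: the MMU page and
 * assigned-device lists, the reserved userspace IRQ source ID and the
 * TSC value sampled at VM creation time.
 */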
4762 struct  kvm *kvm_arch_create_vm(void)
4763 {
4764         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4765
4766         if (!kvm)
4767                 return ERR_PTR(-ENOMEM);
4768
4769         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4770         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4771
4772         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4773         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4774
4775         rdtscll(kvm->arch.vm_init_tsc);
4776
4777         return kvm;
4778 }
4779
4780 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4781 {
4782         vcpu_load(vcpu);
4783         kvm_mmu_unload(vcpu);
4784         vcpu_put(vcpu);
4785 }
4786
4787 static void kvm_free_vcpus(struct kvm *kvm)
4788 {
4789         unsigned int i;
4790         struct kvm_vcpu *vcpu;
4791
4792         /*
4793          * Unpin any mmu pages first.
4794          */
4795         kvm_for_each_vcpu(i, vcpu, kvm)
4796                 kvm_unload_vcpu_mmu(vcpu);
4797         kvm_for_each_vcpu(i, vcpu, kvm)
4798                 kvm_arch_vcpu_free(vcpu);
4799
4800         mutex_lock(&kvm->lock);
4801         for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
4802                 kvm->vcpus[i] = NULL;
4803
4804         atomic_set(&kvm->online_vcpus, 0);
4805         mutex_unlock(&kvm->lock);
4806 }
4807
4808 void kvm_arch_sync_events(struct kvm *kvm)
4809 {
4810         kvm_free_all_assigned_devices(kvm);
4811 }
4812
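/*
 * Tear down a VM: detach IOMMU mappings, free the in-kernel PIT, PIC
 * and IOAPIC, the vcpus and guest memory, and drop the pages pinned for
 * APIC access and the EPT identity page table.
 */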
4813 void kvm_arch_destroy_vm(struct kvm *kvm)
4814 {
4815         kvm_iommu_unmap_guest(kvm);
4816         kvm_free_pit(kvm);
4817         kfree(kvm->arch.vpic);
4818         kfree(kvm->arch.vioapic);
4819         kvm_free_vcpus(kvm);
4820         kvm_free_physmem(kvm);
4821         if (kvm->arch.apic_access_page)
4822                 put_page(kvm->arch.apic_access_page);
4823         if (kvm->arch.ept_identity_pagetable)
4824                 put_page(kvm->arch.ept_identity_pagetable);
4825         kfree(kvm);
4826 }
4827
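/*
 * Arch hook for memory slot updates.  For legacy (!user_alloc) slots
 * the backing anonymous mapping is created or torn down here; in all
 * cases the shadow MMU page budget is recomputed (unless userspace
 * requested a fixed number), write access is stripped from the slot's
 * shadow pages and remote TLBs are flushed.
 *
 * Illustrative userspace slot registration (sketch only; vm_fd from
 * KVM_CREATE_VM, ram_ptr/ram_size are placeholders):
 *
 *   struct kvm_userspace_memory_region mem = {
 *           .slot            = 0,
 *           .guest_phys_addr = 0,
 *           .memory_size     = ram_size,
 *           .userspace_addr  = (__u64)ram_ptr,
 *   };
 *   ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
 */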
4828 int kvm_arch_set_memory_region(struct kvm *kvm,
4829                                 struct kvm_userspace_memory_region *mem,
4830                                 struct kvm_memory_slot old,
4831                                 int user_alloc)
4832 {
4833         int npages = mem->memory_size >> PAGE_SHIFT;
4834         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4835
4836         /* To keep backward compatibility with older userspace,
4837          * x86 needs to handle the !user_alloc case.
4838          */
4839         if (!user_alloc) {
4840                 if (npages && !old.rmap) {
4841                         unsigned long userspace_addr;
4842
4843                         down_write(&current->mm->mmap_sem);
4844                         userspace_addr = do_mmap(NULL, 0,
4845                                                  npages * PAGE_SIZE,
4846                                                  PROT_READ | PROT_WRITE,
4847                                                  MAP_PRIVATE | MAP_ANONYMOUS,
4848                                                  0);
4849                         up_write(&current->mm->mmap_sem);
4850
4851                         if (IS_ERR((void *)userspace_addr))
4852                                 return PTR_ERR((void *)userspace_addr);
4853
4854                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
4855                         spin_lock(&kvm->mmu_lock);
4856                         memslot->userspace_addr = userspace_addr;
4857                         spin_unlock(&kvm->mmu_lock);
4858                 } else {
4859                         if (!old.user_alloc && old.rmap) {
4860                                 int ret;
4861
4862                                 down_write(&current->mm->mmap_sem);
4863                                 ret = do_munmap(current->mm, old.userspace_addr,
4864                                                 old.npages * PAGE_SIZE);
4865                                 up_write(&current->mm->mmap_sem);
4866                                 if (ret < 0)
4867                                         printk(KERN_WARNING
4868                                        "kvm_vm_ioctl_set_memory_region: "
4869                                        "failed to munmap memory\n");
4870                         }
4871                 }
4872         }
4873
4874         spin_lock(&kvm->mmu_lock);
4875         if (!kvm->arch.n_requested_mmu_pages) {
4876                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4877                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4878         }
4879
4880         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4881         spin_unlock(&kvm->mmu_lock);
4882         kvm_flush_remote_tlbs(kvm);
4883
4884         return 0;
4885 }
4886
4887 void kvm_arch_flush_shadow(struct kvm *kvm)
4888 {
4889         kvm_mmu_zap_all(kvm);
4890         kvm_reload_remote_mmus(kvm);
4891 }
4892
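/*
 * A vcpu is runnable if it is in the RUNNABLE or SIPI_RECEIVED state or
 * has an NMI pending.
 */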
4893 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4894 {
4895         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4896                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4897                || vcpu->arch.nmi_pending;
4898 }
4899
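/*
 * Kick a vcpu out of guest mode or out of its halt wait: wake the halt
 * waitqueue and, if the vcpu is running on another online CPU, send it
 * a reschedule IPI unless a kick is already pending.
 */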
4900 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4901 {
4902         int me;
4903         int cpu = vcpu->cpu;
4904
4905         if (waitqueue_active(&vcpu->wq)) {
4906                 wake_up_interruptible(&vcpu->wq);
4907                 ++vcpu->stat.halt_wakeup;
4908         }
4909
4910         me = get_cpu();
4911         if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4912                 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4913                         smp_send_reschedule(cpu);
4914         put_cpu();
4915 }
4916
4917 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4918 {
4919         return kvm_x86_ops->interrupt_allowed(vcpu);
4920 }
4921
4922 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4923 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4924 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4925 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4926 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);