KVM: VMX: Fix comparison of guest efer with stale host value
arch/x86/kvm/x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Amit Shah    <amit.shah@qumranet.com>
14  *   Ben-Ami Yassour <benami@il.ibm.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20
21 #include <linux/kvm_host.h>
22 #include "irq.h"
23 #include "mmu.h"
24 #include "i8254.h"
25 #include "tss.h"
26 #include "kvm_cache_regs.h"
27 #include "x86.h"
28
29 #include <linux/clocksource.h>
30 #include <linux/interrupt.h>
31 #include <linux/kvm.h>
32 #include <linux/fs.h>
33 #include <linux/vmalloc.h>
34 #include <linux/module.h>
35 #include <linux/mman.h>
36 #include <linux/highmem.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/cpufreq.h>
40 #include <linux/user-return-notifier.h>
41 #include <trace/events/kvm.h>
42 #undef TRACE_INCLUDE_FILE
43 #define CREATE_TRACE_POINTS
44 #include "trace.h"
45
46 #include <asm/uaccess.h>
47 #include <asm/msr.h>
48 #include <asm/desc.h>
49 #include <asm/mtrr.h>
50 #include <asm/mce.h>
51
52 #define MAX_IO_MSRS 256
53 #define CR0_RESERVED_BITS                                               \
54         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
55                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
56                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
57 #define CR4_RESERVED_BITS                                               \
58         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
59                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
60                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
61                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
62
63 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
64
65 #define KVM_MAX_MCE_BANKS 32
66 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
67
68 /* EFER defaults:
69  * - enable syscall by default because it is emulated by KVM
70  * - enable LME and LMA by default on 64 bit KVM
71  */
72 #ifdef CONFIG_X86_64
73 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
74 #else
75 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
76 #endif
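/*
 * For reference: on 64-bit hosts the mask above leaves only SCE (bit 0),
 * LME (bit 8) and LMA (bit 10) writable (~0xfffffffffffffafe == 0x501);
 * on 32-bit hosts only SCE.  Further bits (e.g. EFER_NX) become writable
 * only when a vendor module clears them via kvm_enable_efer_bits() below.
 */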
77
78 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
79 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
80
81 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
82 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
83                                     struct kvm_cpuid_entry2 __user *entries);
84
85 struct kvm_x86_ops *kvm_x86_ops;
86 EXPORT_SYMBOL_GPL(kvm_x86_ops);
87
88 int ignore_msrs = 0;
89 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
90
91 #define KVM_NR_SHARED_MSRS 16
92
93 struct kvm_shared_msrs_global {
94         int nr;
95         struct kvm_shared_msr {
96                 u32 msr;
97                 u64 value;
98         } msrs[KVM_NR_SHARED_MSRS];
99 };
100
101 struct kvm_shared_msrs {
102         struct user_return_notifier urn;
103         bool registered;
104         u64 current_value[KVM_NR_SHARED_MSRS];
105 };
106
107 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
108 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
109
110 struct kvm_stats_debugfs_item debugfs_entries[] = {
111         { "pf_fixed", VCPU_STAT(pf_fixed) },
112         { "pf_guest", VCPU_STAT(pf_guest) },
113         { "tlb_flush", VCPU_STAT(tlb_flush) },
114         { "invlpg", VCPU_STAT(invlpg) },
115         { "exits", VCPU_STAT(exits) },
116         { "io_exits", VCPU_STAT(io_exits) },
117         { "mmio_exits", VCPU_STAT(mmio_exits) },
118         { "signal_exits", VCPU_STAT(signal_exits) },
119         { "irq_window", VCPU_STAT(irq_window_exits) },
120         { "nmi_window", VCPU_STAT(nmi_window_exits) },
121         { "halt_exits", VCPU_STAT(halt_exits) },
122         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
123         { "hypercalls", VCPU_STAT(hypercalls) },
124         { "request_irq", VCPU_STAT(request_irq_exits) },
125         { "irq_exits", VCPU_STAT(irq_exits) },
126         { "host_state_reload", VCPU_STAT(host_state_reload) },
127         { "efer_reload", VCPU_STAT(efer_reload) },
128         { "fpu_reload", VCPU_STAT(fpu_reload) },
129         { "insn_emulation", VCPU_STAT(insn_emulation) },
130         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
131         { "irq_injections", VCPU_STAT(irq_injections) },
132         { "nmi_injections", VCPU_STAT(nmi_injections) },
133         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
134         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
135         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
136         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
137         { "mmu_flooded", VM_STAT(mmu_flooded) },
138         { "mmu_recycled", VM_STAT(mmu_recycled) },
139         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
140         { "mmu_unsync", VM_STAT(mmu_unsync) },
141         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
142         { "largepages", VM_STAT(lpages) },
143         { NULL }
144 };
145
146 static void kvm_on_user_return(struct user_return_notifier *urn)
147 {
148         unsigned slot;
149         struct kvm_shared_msr *global;
150         struct kvm_shared_msrs *locals
151                 = container_of(urn, struct kvm_shared_msrs, urn);
152
153         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
154                 global = &shared_msrs_global.msrs[slot];
155                 if (global->value != locals->current_value[slot]) {
156                         wrmsrl(global->msr, global->value);
157                         locals->current_value[slot] = global->value;
158                 }
159         }
160         locals->registered = false;
161         user_return_notifier_unregister(urn);
162 }
163
164 void kvm_define_shared_msr(unsigned slot, u32 msr)
165 {
166         int cpu;
167         u64 value;
168
169         if (slot >= shared_msrs_global.nr)
170                 shared_msrs_global.nr = slot + 1;
171         shared_msrs_global.msrs[slot].msr = msr;
172         rdmsrl_safe(msr, &value);
173         shared_msrs_global.msrs[slot].value = value;
174         for_each_online_cpu(cpu)
175                 per_cpu(shared_msrs, cpu).current_value[slot] = value;
176 }
177 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
178
179 static void kvm_shared_msr_cpu_online(void)
180 {
181         unsigned i;
182         struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
183
184         for (i = 0; i < shared_msrs_global.nr; ++i)
185                 locals->current_value[i] = shared_msrs_global.msrs[i].value;
186 }
187
188 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
189 {
190         struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
191
192         if (((value ^ smsr->current_value[slot]) & mask) == 0)
193                 return;
194         smsr->current_value[slot] = value;
195         wrmsrl(shared_msrs_global.msrs[slot].msr, value);
196         if (!smsr->registered) {
197                 smsr->urn.on_user_return = kvm_on_user_return;
198                 user_return_notifier_register(&smsr->urn);
199                 smsr->registered = true;
200         }
201 }
202 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
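/*
 * Illustrative usage sketch (not part of this file): a vendor module is
 * expected to register each MSR it switches between host and guest values
 * once at setup time, e.g.
 *
 *	kvm_define_shared_msr(0, MSR_K6_STAR);
 *
 * and then update it whenever the guest value changes, e.g.
 *
 *	kvm_set_shared_msr(0, guest_star_value, -1ull);
 *
 * kvm_on_user_return() then restores the host values lazily, the first
 * time the CPU returns to userspace rather than on every exit.
 */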
203
204 static void drop_user_return_notifiers(void *ignore)
205 {
206         struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
207
208         if (smsr->registered)
209                 kvm_on_user_return(&smsr->urn);
210 }
211
212 unsigned long segment_base(u16 selector)
213 {
214         struct descriptor_table gdt;
215         struct desc_struct *d;
216         unsigned long table_base;
217         unsigned long v;
218
219         if (selector == 0)
220                 return 0;
221
222         kvm_get_gdt(&gdt);
223         table_base = gdt.base;
224
225         if (selector & 4) {           /* from ldt */
226                 u16 ldt_selector = kvm_read_ldt();
227
228                 table_base = segment_base(ldt_selector);
229         }
230         d = (struct desc_struct *)(table_base + (selector & ~7));
231         v = get_desc_base(d);
232 #ifdef CONFIG_X86_64
233         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
234                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
235 #endif
236         return v;
237 }
238 EXPORT_SYMBOL_GPL(segment_base);
239
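/*
 * Note: both branches below return the same cached vcpu->arch.apic_base;
 * the irqchip_in_kernel() split simply mirrors kvm_set_apic_base() below,
 * where the two cases do differ.
 */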
240 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
241 {
242         if (irqchip_in_kernel(vcpu->kvm))
243                 return vcpu->arch.apic_base;
244         else
245                 return vcpu->arch.apic_base;
246 }
247 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
248
249 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
250 {
251         /* TODO: reserve bits check */
252         if (irqchip_in_kernel(vcpu->kvm))
253                 kvm_lapic_set_base(vcpu, data);
254         else
255                 vcpu->arch.apic_base = data;
256 }
257 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
258
259 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
260 {
261         WARN_ON(vcpu->arch.exception.pending);
262         vcpu->arch.exception.pending = true;
263         vcpu->arch.exception.has_error_code = false;
264         vcpu->arch.exception.nr = nr;
265 }
266 EXPORT_SYMBOL_GPL(kvm_queue_exception);
267
268 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
269                            u32 error_code)
270 {
271         ++vcpu->stat.pf_guest;
272
273         if (vcpu->arch.exception.pending) {
274                 switch(vcpu->arch.exception.nr) {
275                 case DF_VECTOR:
276                         /* triple fault -> shutdown */
277                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
278                         return;
279                 case PF_VECTOR:
280                         vcpu->arch.exception.nr = DF_VECTOR;
281                         vcpu->arch.exception.error_code = 0;
282                         return;
283                 default:
284                         /* replace the previous exception with the new one,
285                            in the hope that instruction re-execution will
286                            regenerate the lost exception */
287                         vcpu->arch.exception.pending = false;
288                         break;
289                 }
290         }
291         vcpu->arch.cr2 = addr;
292         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
293 }
294
295 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
296 {
297         vcpu->arch.nmi_pending = 1;
298 }
299 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
300
301 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
302 {
303         WARN_ON(vcpu->arch.exception.pending);
304         vcpu->arch.exception.pending = true;
305         vcpu->arch.exception.has_error_code = true;
306         vcpu->arch.exception.nr = nr;
307         vcpu->arch.exception.error_code = error_code;
308 }
309 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
310
311 /*
312  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
313  * a #GP and return false.
314  */
315 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
316 {
317         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
318                 return true;
319         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
320         return false;
321 }
322 EXPORT_SYMBOL_GPL(kvm_require_cpl);
323
324 /*
325  * Load the pae pdptrs.  Return true if they are all valid.
326  */
327 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
328 {
329         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
330         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
331         int i;
332         int ret;
333         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
334
335         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
336                                   offset * sizeof(u64), sizeof(pdpte));
337         if (ret < 0) {
338                 ret = 0;
339                 goto out;
340         }
341         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
342                 if (is_present_gpte(pdpte[i]) &&
343                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
344                         ret = 0;
345                         goto out;
346                 }
347         }
348         ret = 1;
349
350         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
351         __set_bit(VCPU_EXREG_PDPTR,
352                   (unsigned long *)&vcpu->arch.regs_avail);
353         __set_bit(VCPU_EXREG_PDPTR,
354                   (unsigned long *)&vcpu->arch.regs_dirty);
355 out:
356
357         return ret;
358 }
359 EXPORT_SYMBOL_GPL(load_pdptrs);
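/*
 * Worked example for the offset computation in load_pdptrs() above: in PAE
 * mode CR3 points to a 32-byte aligned table of four PDPTEs.  For, say,
 * cr3 = 0x12340a60 we get pdpt_gfn = 0x12340 and
 * offset = (0xa60 >> 5) << 2 = 0x14c u64 slots, i.e. byte offset
 * 0x14c * 8 = 0xa60 within that page; the low five (flag/ignored) bits of
 * CR3 are simply masked off.
 */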
360
361 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
362 {
363         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
364         bool changed = true;
365         int r;
366
367         if (is_long_mode(vcpu) || !is_pae(vcpu))
368                 return false;
369
370         if (!test_bit(VCPU_EXREG_PDPTR,
371                       (unsigned long *)&vcpu->arch.regs_avail))
372                 return true;
373
374         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
375         if (r < 0)
376                 goto out;
377         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
378 out:
379
380         return changed;
381 }
382
383 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
384 {
385         if (cr0 & CR0_RESERVED_BITS) {
386                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
387                        cr0, vcpu->arch.cr0);
388                 kvm_inject_gp(vcpu, 0);
389                 return;
390         }
391
392         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
393                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
394                 kvm_inject_gp(vcpu, 0);
395                 return;
396         }
397
398         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
399                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
400                        "and a clear PE flag\n");
401                 kvm_inject_gp(vcpu, 0);
402                 return;
403         }
404
405         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
406 #ifdef CONFIG_X86_64
407                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
408                         int cs_db, cs_l;
409
410                         if (!is_pae(vcpu)) {
411                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
412                                        "in long mode while PAE is disabled\n");
413                                 kvm_inject_gp(vcpu, 0);
414                                 return;
415                         }
416                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
417                         if (cs_l) {
418                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
419                                        "in long mode while CS.L == 1\n");
420                                 kvm_inject_gp(vcpu, 0);
421                                 return;
422
423                         }
424                 } else
425 #endif
426                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
427                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
428                                "reserved bits\n");
429                         kvm_inject_gp(vcpu, 0);
430                         return;
431                 }
432
433         }
434
435         kvm_x86_ops->set_cr0(vcpu, cr0);
436         vcpu->arch.cr0 = cr0;
437
438         kvm_mmu_reset_context(vcpu);
439         return;
440 }
441 EXPORT_SYMBOL_GPL(kvm_set_cr0);
442
443 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
444 {
445         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
446 }
447 EXPORT_SYMBOL_GPL(kvm_lmsw);
448
449 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
450 {
451         unsigned long old_cr4 = vcpu->arch.cr4;
452         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
453
454         if (cr4 & CR4_RESERVED_BITS) {
455                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
456                 kvm_inject_gp(vcpu, 0);
457                 return;
458         }
459
460         if (is_long_mode(vcpu)) {
461                 if (!(cr4 & X86_CR4_PAE)) {
462                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
463                                "in long mode\n");
464                         kvm_inject_gp(vcpu, 0);
465                         return;
466                 }
467         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
468                    && ((cr4 ^ old_cr4) & pdptr_bits)
469                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
470                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
471                 kvm_inject_gp(vcpu, 0);
472                 return;
473         }
474
475         if (cr4 & X86_CR4_VMXE) {
476                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
477                 kvm_inject_gp(vcpu, 0);
478                 return;
479         }
480         kvm_x86_ops->set_cr4(vcpu, cr4);
481         vcpu->arch.cr4 = cr4;
482         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
483         kvm_mmu_reset_context(vcpu);
484 }
485 EXPORT_SYMBOL_GPL(kvm_set_cr4);
486
487 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
488 {
489         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
490                 kvm_mmu_sync_roots(vcpu);
491                 kvm_mmu_flush_tlb(vcpu);
492                 return;
493         }
494
495         if (is_long_mode(vcpu)) {
496                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
497                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
498                         kvm_inject_gp(vcpu, 0);
499                         return;
500                 }
501         } else {
502                 if (is_pae(vcpu)) {
503                         if (cr3 & CR3_PAE_RESERVED_BITS) {
504                                 printk(KERN_DEBUG
505                                        "set_cr3: #GP, reserved bits\n");
506                                 kvm_inject_gp(vcpu, 0);
507                                 return;
508                         }
509                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
510                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
511                                        "reserved bits\n");
512                                 kvm_inject_gp(vcpu, 0);
513                                 return;
514                         }
515                 }
516                 /*
517                  * We don't check reserved bits in nonpae mode, because
518                  * this isn't enforced, and VMware depends on this.
519                  */
520         }
521
522         /*
523          * Does the new cr3 value map to physical memory? (Note, we
524          * catch an invalid cr3 even in real-mode, because it would
525          * cause trouble later on when we turn on paging anyway.)
526          *
527          * A real CPU would silently accept an invalid cr3 and would
528          * attempt to use it - with largely undefined (and often hard
529          * to debug) behavior on the guest side.
530          */
531         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
532                 kvm_inject_gp(vcpu, 0);
533         else {
534                 vcpu->arch.cr3 = cr3;
535                 vcpu->arch.mmu.new_cr3(vcpu);
536         }
537 }
538 EXPORT_SYMBOL_GPL(kvm_set_cr3);
539
540 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
541 {
542         if (cr8 & CR8_RESERVED_BITS) {
543                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
544                 kvm_inject_gp(vcpu, 0);
545                 return;
546         }
547         if (irqchip_in_kernel(vcpu->kvm))
548                 kvm_lapic_set_tpr(vcpu, cr8);
549         else
550                 vcpu->arch.cr8 = cr8;
551 }
552 EXPORT_SYMBOL_GPL(kvm_set_cr8);
553
554 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
555 {
556         if (irqchip_in_kernel(vcpu->kvm))
557                 return kvm_lapic_get_cr8(vcpu);
558         else
559                 return vcpu->arch.cr8;
560 }
561 EXPORT_SYMBOL_GPL(kvm_get_cr8);
562
563 static inline u32 bit(int bitno)
564 {
565         return 1 << (bitno & 31);
566 }
567
568 /*
569  * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
570  * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
571  *
572  * This list is modified at module load time to reflect the
573  * capabilities of the host cpu. This capabilities test skips MSRs that are
574  * kvm-specific. Those are put in the beginning of the list.
575  */
576
577 #define KVM_SAVE_MSRS_BEGIN     2
578 static u32 msrs_to_save[] = {
579         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
580         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
581         MSR_K6_STAR,
582 #ifdef CONFIG_X86_64
583         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
584 #endif
585         MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
586 };
587
588 static unsigned num_msrs_to_save;
589
590 static u32 emulated_msrs[] = {
591         MSR_IA32_MISC_ENABLE,
592 };
593
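/*
 * Note that set_efer() below deliberately ignores the LMA bit in the
 * guest-written value: LMA is set and cleared by the processor as a side
 * effect of enabling paging in long mode, so the current value from
 * shadow_efer is carried over instead.
 */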
594 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
595 {
596         if (efer & efer_reserved_bits) {
597                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
598                        efer);
599                 kvm_inject_gp(vcpu, 0);
600                 return;
601         }
602
603         if (is_paging(vcpu)
604             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
605                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
606                 kvm_inject_gp(vcpu, 0);
607                 return;
608         }
609
610         if (efer & EFER_FFXSR) {
611                 struct kvm_cpuid_entry2 *feat;
612
613                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
614                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
615                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
616                         kvm_inject_gp(vcpu, 0);
617                         return;
618                 }
619         }
620
621         if (efer & EFER_SVME) {
622                 struct kvm_cpuid_entry2 *feat;
623
624                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
625                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
626                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
627                         kvm_inject_gp(vcpu, 0);
628                         return;
629                 }
630         }
631
632         kvm_x86_ops->set_efer(vcpu, efer);
633
634         efer &= ~EFER_LMA;
635         efer |= vcpu->arch.shadow_efer & EFER_LMA;
636
637         vcpu->arch.shadow_efer = efer;
638
639         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
640         kvm_mmu_reset_context(vcpu);
641 }
642
643 void kvm_enable_efer_bits(u64 mask)
644 {
645        efer_reserved_bits &= ~mask;
646 }
647 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
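/*
 * Illustrative use of kvm_enable_efer_bits(): a vendor module would clear
 * reserved bits for features the host actually supports, e.g. (assuming a
 * host CPU with NX):
 *
 *	kvm_enable_efer_bits(EFER_NX);
 */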
648
649
650 /*
651  * Writes msr value into the appropriate "register".
652  * Returns 0 on success, non-0 otherwise.
653  * Assumes vcpu_load() was already called.
654  */
655 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
656 {
657         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
658 }
659
660 /*
661  * Adapt set_msr() to msr_io()'s calling convention
662  */
663 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
664 {
665         return kvm_set_msr(vcpu, index, *data);
666 }
667
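/*
 * The wall clock below is published with a seqlock-like protocol: the
 * version is bumped to an odd value before the payload is written and to
 * an even value afterwards.  A guest reader is expected to do, roughly:
 *
 *	do {
 *		v = wc->version;
 *		rmb();
 *		sec = wc->sec; nsec = wc->nsec;
 *		rmb();
 *	} while ((v & 1) || v != wc->version);
 */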
668 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
669 {
670         static int version;
671         struct pvclock_wall_clock wc;
672         struct timespec now, sys, boot;
673
674         if (!wall_clock)
675                 return;
676
677         version++;
678
679         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
680
681         /*
682          * The guest calculates current wall clock time by adding
683          * system time (updated by kvm_write_guest_time below) to the
684          * wall clock specified here.  guest system time equals host
685  * wall clock specified here.  Guest system time equals host
686          */
687         now = current_kernel_time();
688         ktime_get_ts(&sys);
689         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
690
691         wc.sec = boot.tv_sec;
692         wc.nsec = boot.tv_nsec;
693         wc.version = version;
694
695         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
696
697         version++;
698         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
699 }
700
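/*
 * div_frac() below returns the 0.32 fixed-point fraction dividend/divisor,
 * e.g. div_frac(1, 2) == 0x80000000 and
 * div_frac(1000000000, 3000000000u) == 0x55555555 (one third, rounded down).
 */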
701 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
702 {
703         uint32_t quotient, remainder;
704
705         /* Don't try to replace with do_div(), this one calculates
706          * "(dividend << 32) / divisor" */
707         __asm__ ( "divl %4"
708                   : "=a" (quotient), "=d" (remainder)
709                   : "0" (0), "1" (dividend), "r" (divisor) );
710         return quotient;
711 }
712
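/*
 * Worked example for kvm_set_time_scale(): on a 2 GHz host (tsc_khz ==
 * 2000000) tps64 starts at 2e9, neither scaling loop runs, so
 * tsc_shift == 0 and tsc_to_system_mul == div_frac(1e9, 2e9) == 0x80000000,
 * i.e. the guest multiplies TSC deltas by 0.5 to obtain nanoseconds.  On a
 * 1 GHz host the second loop runs once, giving tsc_shift == 1 with the same
 * multiplier.
 */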
713 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
714 {
715         uint64_t nsecs = 1000000000LL;
716         int32_t  shift = 0;
717         uint64_t tps64;
718         uint32_t tps32;
719
720         tps64 = tsc_khz * 1000LL;
721         while (tps64 > nsecs*2) {
722                 tps64 >>= 1;
723                 shift--;
724         }
725
726         tps32 = (uint32_t)tps64;
727         while (tps32 <= (uint32_t)nsecs) {
728                 tps32 <<= 1;
729                 shift++;
730         }
731
732         hv_clock->tsc_shift = shift;
733         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
734
735         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
736                  __func__, tsc_khz, hv_clock->tsc_shift,
737                  hv_clock->tsc_to_system_mul);
738 }
739
740 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
741
742 static void kvm_write_guest_time(struct kvm_vcpu *v)
743 {
744         struct timespec ts;
745         unsigned long flags;
746         struct kvm_vcpu_arch *vcpu = &v->arch;
747         void *shared_kaddr;
748         unsigned long this_tsc_khz;
749
750         if ((!vcpu->time_page))
751                 return;
752
753         this_tsc_khz = get_cpu_var(cpu_tsc_khz);
754         if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
755                 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
756                 vcpu->hv_clock_tsc_khz = this_tsc_khz;
757         }
758         put_cpu_var(cpu_tsc_khz);
759
760         /* Keep irq disabled to prevent changes to the clock */
761         local_irq_save(flags);
762         kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
763         ktime_get_ts(&ts);
764         local_irq_restore(flags);
765
766         /* With all the info we got, fill in the values */
767
768         vcpu->hv_clock.system_time = ts.tv_nsec +
769                                      (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
770
771         /*
772          * The interface expects us to write an even number signaling that the
773          * update is finished. Since the guest won't see the intermediate
774          * state, we just increase by 2 at the end.
775          */
776         vcpu->hv_clock.version += 2;
777
778         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
779
780         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
781                sizeof(vcpu->hv_clock));
782
783         kunmap_atomic(shared_kaddr, KM_USER0);
784
785         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
786 }
787
788 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
789 {
790         struct kvm_vcpu_arch *vcpu = &v->arch;
791
792         if (!vcpu->time_page)
793                 return 0;
794         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
795         return 1;
796 }
797
798 static bool msr_mtrr_valid(unsigned msr)
799 {
800         switch (msr) {
801         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
802         case MSR_MTRRfix64K_00000:
803         case MSR_MTRRfix16K_80000:
804         case MSR_MTRRfix16K_A0000:
805         case MSR_MTRRfix4K_C0000:
806         case MSR_MTRRfix4K_C8000:
807         case MSR_MTRRfix4K_D0000:
808         case MSR_MTRRfix4K_D8000:
809         case MSR_MTRRfix4K_E0000:
810         case MSR_MTRRfix4K_E8000:
811         case MSR_MTRRfix4K_F0000:
812         case MSR_MTRRfix4K_F8000:
813         case MSR_MTRRdefType:
814         case MSR_IA32_CR_PAT:
815                 return true;
816         case 0x2f8:
817                 return true;
818         }
819         return false;
820 }
821
822 static bool valid_pat_type(unsigned t)
823 {
824         return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
825 }
826
827 static bool valid_mtrr_type(unsigned t)
828 {
829         return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
830 }
831
832 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
833 {
834         int i;
835
836         if (!msr_mtrr_valid(msr))
837                 return false;
838
839         if (msr == MSR_IA32_CR_PAT) {
840                 for (i = 0; i < 8; i++)
841                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
842                                 return false;
843                 return true;
844         } else if (msr == MSR_MTRRdefType) {
845                 if (data & ~0xcff)
846                         return false;
847                 return valid_mtrr_type(data & 0xff);
848         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
849                 for (i = 0; i < 8 ; i++)
850                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
851                                 return false;
852                 return true;
853         }
854
855         /* variable MTRRs */
856         return valid_mtrr_type(data & 0xff);
857 }
858
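/*
 * Variable-range MTRRs come in base/mask pairs starting at MSR 0x200:
 * 0x200/0x201 are MTRRphysBase0/MTRRphysMask0, 0x202/0x203 the next pair,
 * and so on.  Hence idx = (msr - 0x200) / 2 selects the range and the low
 * bit (is_mtrr_mask) selects base vs. mask, e.g. MSR 0x203 decodes to
 * var_ranges[1].mask_lo.
 */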
859 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
860 {
861         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
862
863         if (!mtrr_valid(vcpu, msr, data))
864                 return 1;
865
866         if (msr == MSR_MTRRdefType) {
867                 vcpu->arch.mtrr_state.def_type = data;
868                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
869         } else if (msr == MSR_MTRRfix64K_00000)
870                 p[0] = data;
871         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
872                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
873         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
874                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
875         else if (msr == MSR_IA32_CR_PAT)
876                 vcpu->arch.pat = data;
877         else {  /* Variable MTRRs */
878                 int idx, is_mtrr_mask;
879                 u64 *pt;
880
881                 idx = (msr - 0x200) / 2;
882                 is_mtrr_mask = msr - 0x200 - 2 * idx;
883                 if (!is_mtrr_mask)
884                         pt =
885                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
886                 else
887                         pt =
888                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
889                 *pt = data;
890         }
891
892         kvm_mmu_reset_context(vcpu);
893         return 0;
894 }
895
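/*
 * Each MCE bank is exposed as four consecutive MSRs starting at
 * MSR_IA32_MC0_CTL (CTL, STATUS, ADDR, MISC), so msr - MSR_IA32_MC0_CTL
 * indexes vcpu->arch.mce_banks[] directly, and (offset & 3) == 0 identifies
 * a bank's CTL register, which only accepts 0 or all-ones.
 */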
896 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
897 {
898         u64 mcg_cap = vcpu->arch.mcg_cap;
899         unsigned bank_num = mcg_cap & 0xff;
900
901         switch (msr) {
902         case MSR_IA32_MCG_STATUS:
903                 vcpu->arch.mcg_status = data;
904                 break;
905         case MSR_IA32_MCG_CTL:
906                 if (!(mcg_cap & MCG_CTL_P))
907                         return 1;
908                 if (data != 0 && data != ~(u64)0)
909                         return -1;
910                 vcpu->arch.mcg_ctl = data;
911                 break;
912         default:
913                 if (msr >= MSR_IA32_MC0_CTL &&
914                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
915                         u32 offset = msr - MSR_IA32_MC0_CTL;
916                         /* only 0 or all 1s can be written to IA32_MCi_CTL */
917                         if ((offset & 0x3) == 0 &&
918                             data != 0 && data != ~(u64)0)
919                                 return -1;
920                         vcpu->arch.mce_banks[offset] = data;
921                         break;
922                 }
923                 return 1;
924         }
925         return 0;
926 }
927
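/*
 * For the Xen HVM hypercall-page MSR the written value encodes both the
 * source and the destination: the page-offset bits select which page of the
 * hypercall blob to copy (page_num), while the page-aligned bits give the
 * guest physical address to copy it to (page_addr).
 */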
928 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
929 {
930         struct kvm *kvm = vcpu->kvm;
931         int lm = is_long_mode(vcpu);
932         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
933                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
934         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
935                 : kvm->arch.xen_hvm_config.blob_size_32;
936         u32 page_num = data & ~PAGE_MASK;
937         u64 page_addr = data & PAGE_MASK;
938         u8 *page;
939         int r;
940
941         r = -E2BIG;
942         if (page_num >= blob_size)
943                 goto out;
944         r = -ENOMEM;
945         page = kzalloc(PAGE_SIZE, GFP_KERNEL);
946         if (!page)
947                 goto out;
948         r = -EFAULT;
949         if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
950                 goto out_free;
951         if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
952                 goto out_free;
953         r = 0;
954 out_free:
955         kfree(page);
956 out:
957         return r;
958 }
959
960 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
961 {
962         switch (msr) {
963         case MSR_EFER:
964                 set_efer(vcpu, data);
965                 break;
966         case MSR_K7_HWCR:
967                 data &= ~(u64)0x40;     /* ignore flush filter disable */
968                 if (data != 0) {
969                         pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
970                                 data);
971                         return 1;
972                 }
973                 break;
974         case MSR_FAM10H_MMIO_CONF_BASE:
975                 if (data != 0) {
976                         pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
977                                 "0x%llx\n", data);
978                         return 1;
979                 }
980                 break;
981         case MSR_AMD64_NB_CFG:
982                 break;
983         case MSR_IA32_DEBUGCTLMSR:
984                 if (!data) {
985                         /* We support the non-activated case already */
986                         break;
987                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
988                         /* Values other than LBR and BTF are vendor-specific,
989                            thus reserved and should throw a #GP */
990                         return 1;
991                 }
992                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
993                         __func__, data);
994                 break;
995         case MSR_IA32_UCODE_REV:
996         case MSR_IA32_UCODE_WRITE:
997         case MSR_VM_HSAVE_PA:
998         case MSR_AMD64_PATCH_LOADER:
999                 break;
1000         case 0x200 ... 0x2ff:
1001                 return set_msr_mtrr(vcpu, msr, data);
1002         case MSR_IA32_APICBASE:
1003                 kvm_set_apic_base(vcpu, data);
1004                 break;
1005         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1006                 return kvm_x2apic_msr_write(vcpu, msr, data);
1007         case MSR_IA32_MISC_ENABLE:
1008                 vcpu->arch.ia32_misc_enable_msr = data;
1009                 break;
1010         case MSR_KVM_WALL_CLOCK:
1011                 vcpu->kvm->arch.wall_clock = data;
1012                 kvm_write_wall_clock(vcpu->kvm, data);
1013                 break;
1014         case MSR_KVM_SYSTEM_TIME: {
1015                 if (vcpu->arch.time_page) {
1016                         kvm_release_page_dirty(vcpu->arch.time_page);
1017                         vcpu->arch.time_page = NULL;
1018                 }
1019
1020                 vcpu->arch.time = data;
1021
1022                 /* we verify if the enable bit is set... */
1023                 if (!(data & 1))
1024                         break;
1025
1026                 /* ...but clean it before doing the actual write */
1027                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
1028
1029                 vcpu->arch.time_page =
1030                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1031
1032                 if (is_error_page(vcpu->arch.time_page)) {
1033                         kvm_release_page_clean(vcpu->arch.time_page);
1034                         vcpu->arch.time_page = NULL;
1035                 }
1036
1037                 kvm_request_guest_time_update(vcpu);
1038                 break;
1039         }
1040         case MSR_IA32_MCG_CTL:
1041         case MSR_IA32_MCG_STATUS:
1042         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1043                 return set_msr_mce(vcpu, msr, data);
1044
1045         /* Performance counters are not protected by a CPUID bit,
1046          * so we should check all of them in the generic path for the sake of
1047          * cross vendor migration.
1048          * Writing a zero into the event select MSRs disables them,
1049          * which we perfectly emulate ;-). Any other value should be at least
1050          * reported, some guests depend on them.
1051          * reported; some guests depend on them.
1052         case MSR_P6_EVNTSEL0:
1053         case MSR_P6_EVNTSEL1:
1054         case MSR_K7_EVNTSEL0:
1055         case MSR_K7_EVNTSEL1:
1056         case MSR_K7_EVNTSEL2:
1057         case MSR_K7_EVNTSEL3:
1058                 if (data != 0)
1059                         pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1060                                 "0x%x data 0x%llx\n", msr, data);
1061                 break;
1062         /* at least RHEL 4 unconditionally writes to the perfctr registers,
1063          * so we ignore writes to make it happy.
1064          */
1065         case MSR_P6_PERFCTR0:
1066         case MSR_P6_PERFCTR1:
1067         case MSR_K7_PERFCTR0:
1068         case MSR_K7_PERFCTR1:
1069         case MSR_K7_PERFCTR2:
1070         case MSR_K7_PERFCTR3:
1071                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1072                         "0x%x data 0x%llx\n", msr, data);
1073                 break;
1074         default:
1075                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1076                         return xen_hvm_config(vcpu, data);
1077                 if (!ignore_msrs) {
1078                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1079                                 msr, data);
1080                         return 1;
1081                 } else {
1082                         pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1083                                 msr, data);
1084                         break;
1085                 }
1086         }
1087         return 0;
1088 }
1089 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1090
1091
1092 /*
1093  * Reads an msr value (of 'msr_index') into 'pdata'.
1094  * Returns 0 on success, non-0 otherwise.
1095  * Assumes vcpu_load() was already called.
1096  */
1097 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1098 {
1099         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1100 }
1101
1102 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1103 {
1104         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1105
1106         if (!msr_mtrr_valid(msr))
1107                 return 1;
1108
1109         if (msr == MSR_MTRRdefType)
1110                 *pdata = vcpu->arch.mtrr_state.def_type +
1111                          (vcpu->arch.mtrr_state.enabled << 10);
1112         else if (msr == MSR_MTRRfix64K_00000)
1113                 *pdata = p[0];
1114         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1115                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1116         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1117                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1118         else if (msr == MSR_IA32_CR_PAT)
1119                 *pdata = vcpu->arch.pat;
1120         else {  /* Variable MTRRs */
1121                 int idx, is_mtrr_mask;
1122                 u64 *pt;
1123
1124                 idx = (msr - 0x200) / 2;
1125                 is_mtrr_mask = msr - 0x200 - 2 * idx;
1126                 if (!is_mtrr_mask)
1127                         pt =
1128                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1129                 else
1130                         pt =
1131                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1132                 *pdata = *pt;
1133         }
1134
1135         return 0;
1136 }
1137
1138 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1139 {
1140         u64 data;
1141         u64 mcg_cap = vcpu->arch.mcg_cap;
1142         unsigned bank_num = mcg_cap & 0xff;
1143
1144         switch (msr) {
1145         case MSR_IA32_P5_MC_ADDR:
1146         case MSR_IA32_P5_MC_TYPE:
1147                 data = 0;
1148                 break;
1149         case MSR_IA32_MCG_CAP:
1150                 data = vcpu->arch.mcg_cap;
1151                 break;
1152         case MSR_IA32_MCG_CTL:
1153                 if (!(mcg_cap & MCG_CTL_P))
1154                         return 1;
1155                 data = vcpu->arch.mcg_ctl;
1156                 break;
1157         case MSR_IA32_MCG_STATUS:
1158                 data = vcpu->arch.mcg_status;
1159                 break;
1160         default:
1161                 if (msr >= MSR_IA32_MC0_CTL &&
1162                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1163                         u32 offset = msr - MSR_IA32_MC0_CTL;
1164                         data = vcpu->arch.mce_banks[offset];
1165                         break;
1166                 }
1167                 return 1;
1168         }
1169         *pdata = data;
1170         return 0;
1171 }
1172
1173 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1174 {
1175         u64 data;
1176
1177         switch (msr) {
1178         case MSR_IA32_PLATFORM_ID:
1179         case MSR_IA32_UCODE_REV:
1180         case MSR_IA32_EBL_CR_POWERON:
1181         case MSR_IA32_DEBUGCTLMSR:
1182         case MSR_IA32_LASTBRANCHFROMIP:
1183         case MSR_IA32_LASTBRANCHTOIP:
1184         case MSR_IA32_LASTINTFROMIP:
1185         case MSR_IA32_LASTINTTOIP:
1186         case MSR_K8_SYSCFG:
1187         case MSR_K7_HWCR:
1188         case MSR_VM_HSAVE_PA:
1189         case MSR_P6_PERFCTR0:
1190         case MSR_P6_PERFCTR1:
1191         case MSR_P6_EVNTSEL0:
1192         case MSR_P6_EVNTSEL1:
1193         case MSR_K7_EVNTSEL0:
1194         case MSR_K7_PERFCTR0:
1195         case MSR_K8_INT_PENDING_MSG:
1196         case MSR_AMD64_NB_CFG:
1197         case MSR_FAM10H_MMIO_CONF_BASE:
1198                 data = 0;
1199                 break;
1200         case MSR_MTRRcap:
1201                 data = 0x500 | KVM_NR_VAR_MTRR;
1202                 break;
1203         case 0x200 ... 0x2ff:
1204                 return get_msr_mtrr(vcpu, msr, pdata);
1205         case 0xcd: /* fsb frequency */
1206                 data = 3;
1207                 break;
1208         case MSR_IA32_APICBASE:
1209                 data = kvm_get_apic_base(vcpu);
1210                 break;
1211         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1212                 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1213                 break;
1214         case MSR_IA32_MISC_ENABLE:
1215                 data = vcpu->arch.ia32_misc_enable_msr;
1216                 break;
1217         case MSR_IA32_PERF_STATUS:
1218                 /* TSC increment by tick */
1219                 data = 1000ULL;
1220                 /* CPU multiplier */
1221                 data |= (((uint64_t)4ULL) << 40);
1222                 break;
1223         case MSR_EFER:
1224                 data = vcpu->arch.shadow_efer;
1225                 break;
1226         case MSR_KVM_WALL_CLOCK:
1227                 data = vcpu->kvm->arch.wall_clock;
1228                 break;
1229         case MSR_KVM_SYSTEM_TIME:
1230                 data = vcpu->arch.time;
1231                 break;
1232         case MSR_IA32_P5_MC_ADDR:
1233         case MSR_IA32_P5_MC_TYPE:
1234         case MSR_IA32_MCG_CAP:
1235         case MSR_IA32_MCG_CTL:
1236         case MSR_IA32_MCG_STATUS:
1237         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1238                 return get_msr_mce(vcpu, msr, pdata);
1239         default:
1240                 if (!ignore_msrs) {
1241                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1242                         return 1;
1243                 } else {
1244                         pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1245                         data = 0;
1246                 }
1247                 break;
1248         }
1249         *pdata = data;
1250         return 0;
1251 }
1252 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1253
1254 /*
1255  * Read or write a bunch of msrs. All parameters are kernel addresses.
1256  *
1257  * @return number of msrs set successfully.
1258  */
1259 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1260                     struct kvm_msr_entry *entries,
1261                     int (*do_msr)(struct kvm_vcpu *vcpu,
1262                                   unsigned index, u64 *data))
1263 {
1264         int i;
1265
1266         vcpu_load(vcpu);
1267
1268         down_read(&vcpu->kvm->slots_lock);
1269         for (i = 0; i < msrs->nmsrs; ++i)
1270                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1271                         break;
1272         up_read(&vcpu->kvm->slots_lock);
1273
1274         vcpu_put(vcpu);
1275
1276         return i;
1277 }
1278
1279 /*
1280  * Read or write a bunch of msrs. Parameters are user addresses.
1281  *
1282  * @return number of msrs set successfully.
1283  */
1284 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1285                   int (*do_msr)(struct kvm_vcpu *vcpu,
1286                                 unsigned index, u64 *data),
1287                   int writeback)
1288 {
1289         struct kvm_msrs msrs;
1290         struct kvm_msr_entry *entries;
1291         int r, n;
1292         unsigned size;
1293
1294         r = -EFAULT;
1295         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1296                 goto out;
1297
1298         r = -E2BIG;
1299         if (msrs.nmsrs >= MAX_IO_MSRS)
1300                 goto out;
1301
1302         r = -ENOMEM;
1303         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1304         entries = vmalloc(size);
1305         if (!entries)
1306                 goto out;
1307
1308         r = -EFAULT;
1309         if (copy_from_user(entries, user_msrs->entries, size))
1310                 goto out_free;
1311
1312         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1313         if (r < 0)
1314                 goto out_free;
1315
1316         r = -EFAULT;
1317         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1318                 goto out_free;
1319
1320         r = n;
1321
1322 out_free:
1323         vfree(entries);
1324 out:
1325         return r;
1326 }
1327
1328 int kvm_dev_ioctl_check_extension(long ext)
1329 {
1330         int r;
1331
1332         switch (ext) {
1333         case KVM_CAP_IRQCHIP:
1334         case KVM_CAP_HLT:
1335         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1336         case KVM_CAP_SET_TSS_ADDR:
1337         case KVM_CAP_EXT_CPUID:
1338         case KVM_CAP_CLOCKSOURCE:
1339         case KVM_CAP_PIT:
1340         case KVM_CAP_NOP_IO_DELAY:
1341         case KVM_CAP_MP_STATE:
1342         case KVM_CAP_SYNC_MMU:
1343         case KVM_CAP_REINJECT_CONTROL:
1344         case KVM_CAP_IRQ_INJECT_STATUS:
1345         case KVM_CAP_ASSIGN_DEV_IRQ:
1346         case KVM_CAP_IRQFD:
1347         case KVM_CAP_IOEVENTFD:
1348         case KVM_CAP_PIT2:
1349         case KVM_CAP_PIT_STATE2:
1350         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1351         case KVM_CAP_XEN_HVM:
1352         case KVM_CAP_ADJUST_CLOCK:
1353         case KVM_CAP_VCPU_EVENTS:
1354                 r = 1;
1355                 break;
1356         case KVM_CAP_COALESCED_MMIO:
1357                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1358                 break;
1359         case KVM_CAP_VAPIC:
1360                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1361                 break;
1362         case KVM_CAP_NR_VCPUS:
1363                 r = KVM_MAX_VCPUS;
1364                 break;
1365         case KVM_CAP_NR_MEMSLOTS:
1366                 r = KVM_MEMORY_SLOTS;
1367                 break;
1368         case KVM_CAP_PV_MMU:    /* obsolete */
1369                 r = 0;
1370                 break;
1371         case KVM_CAP_IOMMU:
1372                 r = iommu_found();
1373                 break;
1374         case KVM_CAP_MCE:
1375                 r = KVM_MAX_MCE_BANKS;
1376                 break;
1377         default:
1378                 r = 0;
1379                 break;
1380         }
1381         return r;
1382
1383 }
1384
1385 long kvm_arch_dev_ioctl(struct file *filp,
1386                         unsigned int ioctl, unsigned long arg)
1387 {
1388         void __user *argp = (void __user *)arg;
1389         long r;
1390
1391         switch (ioctl) {
1392         case KVM_GET_MSR_INDEX_LIST: {
1393                 struct kvm_msr_list __user *user_msr_list = argp;
1394                 struct kvm_msr_list msr_list;
1395                 unsigned n;
1396
1397                 r = -EFAULT;
1398                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1399                         goto out;
1400                 n = msr_list.nmsrs;
1401                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1402                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1403                         goto out;
1404                 r = -E2BIG;
1405                 if (n < msr_list.nmsrs)
1406                         goto out;
1407                 r = -EFAULT;
1408                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1409                                  num_msrs_to_save * sizeof(u32)))
1410                         goto out;
1411                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1412                                  &emulated_msrs,
1413                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1414                         goto out;
1415                 r = 0;
1416                 break;
1417         }
1418         case KVM_GET_SUPPORTED_CPUID: {
1419                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1420                 struct kvm_cpuid2 cpuid;
1421
1422                 r = -EFAULT;
1423                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1424                         goto out;
1425                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1426                                                       cpuid_arg->entries);
1427                 if (r)
1428                         goto out;
1429
1430                 r = -EFAULT;
1431                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1432                         goto out;
1433                 r = 0;
1434                 break;
1435         }
1436         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1437                 u64 mce_cap;
1438
1439                 mce_cap = KVM_MCE_CAP_SUPPORTED;
1440                 r = -EFAULT;
1441                 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1442                         goto out;
1443                 r = 0;
1444                 break;
1445         }
1446         default:
1447                 r = -EINVAL;
1448         }
1449 out:
1450         return r;
1451 }
1452
1453 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1454 {
1455         kvm_x86_ops->vcpu_load(vcpu, cpu);
1456         if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1457                 unsigned long khz = cpufreq_quick_get(cpu);
1458                 if (!khz)
1459                         khz = tsc_khz;
1460                 per_cpu(cpu_tsc_khz, cpu) = khz;
1461         }
1462         kvm_request_guest_time_update(vcpu);
1463 }
1464
1465 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1466 {
1467         kvm_x86_ops->vcpu_put(vcpu);
1468         kvm_put_guest_fpu(vcpu);
1469 }
1470
1471 static int is_efer_nx(void)
1472 {
1473         unsigned long long efer = 0;
1474
1475         rdmsrl_safe(MSR_EFER, &efer);
1476         return efer & EFER_NX;
1477 }
1478
1479 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1480 {
1481         int i;
1482         struct kvm_cpuid_entry2 *e, *entry;
1483
1484         entry = NULL;
1485         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1486                 e = &vcpu->arch.cpuid_entries[i];
1487                 if (e->function == 0x80000001) {
1488                         entry = e;
1489                         break;
1490                 }
1491         }
1492         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1493                 entry->edx &= ~(1 << 20);
1494                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1495         }
1496 }
1497
1498 /* when an old userspace process fills a new kernel module */
1499 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1500                                     struct kvm_cpuid *cpuid,
1501                                     struct kvm_cpuid_entry __user *entries)
1502 {
1503         int r, i;
1504         struct kvm_cpuid_entry *cpuid_entries;
1505
1506         r = -E2BIG;
1507         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1508                 goto out;
1509         r = -ENOMEM;
1510         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1511         if (!cpuid_entries)
1512                 goto out;
1513         r = -EFAULT;
1514         if (copy_from_user(cpuid_entries, entries,
1515                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1516                 goto out_free;
1517         for (i = 0; i < cpuid->nent; i++) {
1518                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1519                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1520                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1521                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1522                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1523                 vcpu->arch.cpuid_entries[i].index = 0;
1524                 vcpu->arch.cpuid_entries[i].flags = 0;
1525                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1526                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1527                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1528         }
1529         vcpu->arch.cpuid_nent = cpuid->nent;
1530         cpuid_fix_nx_cap(vcpu);
1531         r = 0;
1532         kvm_apic_set_version(vcpu);
1533
1534 out_free:
1535         vfree(cpuid_entries);
1536 out:
1537         return r;
1538 }
1539
1540 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1541                                      struct kvm_cpuid2 *cpuid,
1542                                      struct kvm_cpuid_entry2 __user *entries)
1543 {
1544         int r;
1545
1546         r = -E2BIG;
1547         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1548                 goto out;
1549         r = -EFAULT;
1550         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1551                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1552                 goto out;
1553         vcpu->arch.cpuid_nent = cpuid->nent;
1554         kvm_apic_set_version(vcpu);
1555         return 0;
1556
1557 out:
1558         return r;
1559 }
1560
1561 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1562                                      struct kvm_cpuid2 *cpuid,
1563                                      struct kvm_cpuid_entry2 __user *entries)
1564 {
1565         int r;
1566
1567         r = -E2BIG;
1568         if (cpuid->nent < vcpu->arch.cpuid_nent)
1569                 goto out;
1570         r = -EFAULT;
1571         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1572                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1573                 goto out;
1574         return 0;
1575
1576 out:
1577         cpuid->nent = vcpu->arch.cpuid_nent;
1578         return r;
1579 }
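/*
 * Illustrative userspace flow (not part of this file): the table installed
 * with KVM_SET_CPUID2 is normally derived from KVM_GET_SUPPORTED_CPUID on
 * the /dev/kvm fd (a struct kvm_cpuid2 followed by its entries) and then
 * applied per vcpu:
 *
 *     ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
 *     ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
 *
 * KVM_GET_CPUID2 is the read-back path; it fails with -E2BIG if the
 * caller's buffer holds fewer than cpuid_nent entries.
 */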
1580
1581 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1582                            u32 index)
1583 {
1584         entry->function = function;
1585         entry->index = index;
1586         cpuid_count(entry->function, entry->index,
1587                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1588         entry->flags = 0;
1589 }
1590
1591 #define F(x) bit(X86_FEATURE_##x)
1592
1593 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1594                          u32 index, int *nent, int maxnent)
1595 {
1596         unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1597         unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1598 #ifdef CONFIG_X86_64
1599         unsigned f_lm = F(LM);
1600 #else
1601         unsigned f_lm = 0;
1602 #endif
1603
1604         /* cpuid 1.edx */
1605         const u32 kvm_supported_word0_x86_features =
1606                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1607                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1608                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1609                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1610                 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1611                 0 /* Reserved, DS, ACPI */ | F(MMX) |
1612                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1613                 0 /* HTT, TM, Reserved, PBE */;
1614         /* cpuid 0x80000001.edx */
1615         const u32 kvm_supported_word1_x86_features =
1616                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1617                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1618                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1619                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1620                 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1621                 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1622                 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
1623                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1624         /* cpuid 1.ecx */
1625         const u32 kvm_supported_word4_x86_features =
1626                 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1627                 0 /* DS-CPL, VMX, SMX, EST */ |
1628                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1629                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1630                 0 /* Reserved, DCA */ | F(XMM4_1) |
1631                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1632                 0 /* Reserved, XSAVE, OSXSAVE */;
1633         /* cpuid 0x80000001.ecx */
1634         const u32 kvm_supported_word6_x86_features =
1635                 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1636                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1637                 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1638                 0 /* SKINIT */ | 0 /* WDT */;
1639
1640         /* all calls to cpuid_count() should be made on the same cpu */
1641         get_cpu();
1642         do_cpuid_1_ent(entry, function, index);
1643         ++*nent;
1644
1645         switch (function) {
1646         case 0:
1647                 entry->eax = min(entry->eax, (u32)0xb);
1648                 break;
1649         case 1:
1650                 entry->edx &= kvm_supported_word0_x86_features;
1651                 entry->ecx &= kvm_supported_word4_x86_features;
1652                 /* we support x2apic emulation even if the host does not
1653                  * support it, since we emulate x2apic in software */
1654                 entry->ecx |= F(X2APIC);
1655                 break;
1656         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1657          * may return different values. This forces us to get_cpu() before
1658          * issuing the first command, and also to emulate this annoying behavior
1659          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1660         case 2: {
1661                 int t, times = entry->eax & 0xff;
1662
1663                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1664                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1665                 for (t = 1; t < times && *nent < maxnent; ++t) {
1666                         do_cpuid_1_ent(&entry[t], function, 0);
1667                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1668                         ++*nent;
1669                 }
1670                 break;
1671         }
1672         /* functions 4 and 0xb have an additional index. */
1673         case 4: {
1674                 int i, cache_type;
1675
1676                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1677                 /* read more entries until cache_type is zero */
1678                 for (i = 1; *nent < maxnent; ++i) {
1679                         cache_type = entry[i - 1].eax & 0x1f;
1680                         if (!cache_type)
1681                                 break;
1682                         do_cpuid_1_ent(&entry[i], function, i);
1683                         entry[i].flags |=
1684                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1685                         ++*nent;
1686                 }
1687                 break;
1688         }
1689         case 0xb: {
1690                 int i, level_type;
1691
1692                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1693                 /* read more entries until level_type is zero */
1694                 for (i = 1; *nent < maxnent; ++i) {
1695                         level_type = entry[i - 1].ecx & 0xff00;
1696                         if (!level_type)
1697                                 break;
1698                         do_cpuid_1_ent(&entry[i], function, i);
1699                         entry[i].flags |=
1700                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1701                         ++*nent;
1702                 }
1703                 break;
1704         }
1705         case 0x80000000:
1706                 entry->eax = min(entry->eax, 0x8000001a);
1707                 break;
1708         case 0x80000001:
1709                 entry->edx &= kvm_supported_word1_x86_features;
1710                 entry->ecx &= kvm_supported_word6_x86_features;
1711                 break;
1712         }
1713         put_cpu();
1714 }
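/*
 * Note that do_cpuid_ent() reports the host's CPUID output filtered through
 * the kvm_supported_word* masks above, so a feature is advertised only if
 * both the host CPU has it and KVM can virtualize it; X2APIC is the one bit
 * forced on regardless, since it is emulated entirely in software.
 */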
1715
1716 #undef F
1717
1718 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1719                                      struct kvm_cpuid_entry2 __user *entries)
1720 {
1721         struct kvm_cpuid_entry2 *cpuid_entries;
1722         int limit, nent = 0, r = -E2BIG;
1723         u32 func;
1724
1725         if (cpuid->nent < 1)
1726                 goto out;
1727         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1728                 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1729         r = -ENOMEM;
1730         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1731         if (!cpuid_entries)
1732                 goto out;
1733
1734         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1735         limit = cpuid_entries[0].eax;
1736         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1737                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1738                              &nent, cpuid->nent);
1739         r = -E2BIG;
1740         if (nent >= cpuid->nent)
1741                 goto out_free;
1742
1743         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1744         limit = cpuid_entries[nent - 1].eax;
1745         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1746                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1747                              &nent, cpuid->nent);
1748         r = -E2BIG;
1749         if (nent >= cpuid->nent)
1750                 goto out_free;
1751
1752         r = -EFAULT;
1753         if (copy_to_user(entries, cpuid_entries,
1754                          nent * sizeof(struct kvm_cpuid_entry2)))
1755                 goto out_free;
1756         cpuid->nent = nent;
1757         r = 0;
1758
1759 out_free:
1760         vfree(cpuid_entries);
1761 out:
1762         return r;
1763 }
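/*
 * Illustrative userspace pattern (not part of this file; kvm_fd is assumed,
 * error handling omitted): since the ioctl returns -E2BIG when nent is too
 * small, callers typically retry with a larger table:
 *
 *     struct kvm_cpuid2 *c;
 *     int n = 64;
 *
 *     for (;;) {
 *             c = malloc(sizeof(*c) + n * sizeof(struct kvm_cpuid_entry2));
 *             c->nent = n;
 *             if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, c) == 0)
 *                     break;
 *             free(c);
 *             n *= 2;
 *     }
 */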
1764
1765 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1766                                     struct kvm_lapic_state *s)
1767 {
1768         vcpu_load(vcpu);
1769         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1770         vcpu_put(vcpu);
1771
1772         return 0;
1773 }
1774
1775 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1776                                     struct kvm_lapic_state *s)
1777 {
1778         vcpu_load(vcpu);
1779         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1780         kvm_apic_post_state_restore(vcpu);
1781         update_cr8_intercept(vcpu);
1782         vcpu_put(vcpu);
1783
1784         return 0;
1785 }
1786
1787 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1788                                     struct kvm_interrupt *irq)
1789 {
1790         if (irq->irq < 0 || irq->irq >= 256)
1791                 return -EINVAL;
1792         if (irqchip_in_kernel(vcpu->kvm))
1793                 return -ENXIO;
1794         vcpu_load(vcpu);
1795
1796         kvm_queue_interrupt(vcpu, irq->irq, false);
1797
1798         vcpu_put(vcpu);
1799
1800         return 0;
1801 }
1802
1803 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1804 {
1805         vcpu_load(vcpu);
1806         kvm_inject_nmi(vcpu);
1807         vcpu_put(vcpu);
1808
1809         return 0;
1810 }
1811
1812 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1813                                            struct kvm_tpr_access_ctl *tac)
1814 {
1815         if (tac->flags)
1816                 return -EINVAL;
1817         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1818         return 0;
1819 }
1820
1821 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1822                                         u64 mcg_cap)
1823 {
1824         int r;
1825         unsigned bank_num = mcg_cap & 0xff, bank;
1826
1827         r = -EINVAL;
1828         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
1829                 goto out;
1830         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1831                 goto out;
1832         r = 0;
1833         vcpu->arch.mcg_cap = mcg_cap;
1834         /* Init IA32_MCG_CTL to all 1s */
1835         if (mcg_cap & MCG_CTL_P)
1836                 vcpu->arch.mcg_ctl = ~(u64)0;
1837         /* Init IA32_MCi_CTL to all 1s */
1838         for (bank = 0; bank < bank_num; bank++)
1839                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
1840 out:
1841         return r;
1842 }
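/*
 * MCG_CAP layout as used above: bits 7:0 hold the bank count and bit 8
 * (MCG_CTL_P) says whether IA32_MCG_CTL exists; bits 23:16 (presumably the
 * extended-register count) are let through, everything else is rejected.
 * Example: mcg_cap == 0x10a requests 10 banks plus MCG_CTL support.
 */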
1843
1844 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1845                                       struct kvm_x86_mce *mce)
1846 {
1847         u64 mcg_cap = vcpu->arch.mcg_cap;
1848         unsigned bank_num = mcg_cap & 0xff;
1849         u64 *banks = vcpu->arch.mce_banks;
1850
1851         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
1852                 return -EINVAL;
1853         /*
1854          * if IA32_MCG_CTL is not all 1s, the uncorrected error
1855          * reporting is disabled
1856          */
1857         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
1858             vcpu->arch.mcg_ctl != ~(u64)0)
1859                 return 0;
1860         banks += 4 * mce->bank;
1861         /*
1862          * if IA32_MCi_CTL is not all 1s, the uncorrected error
1863          * reporting is disabled for the bank
1864          */
1865         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
1866                 return 0;
1867         if (mce->status & MCI_STATUS_UC) {
1868                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1869                     !(vcpu->arch.cr4 & X86_CR4_MCE)) {
1870                         printk(KERN_DEBUG "kvm: set_mce: "
1871                                "injects mce exception while "
1872                                "previous one is in progress!\n");
1873                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1874                         return 0;
1875                 }
1876                 if (banks[1] & MCI_STATUS_VAL)
1877                         mce->status |= MCI_STATUS_OVER;
1878                 banks[2] = mce->addr;
1879                 banks[3] = mce->misc;
1880                 vcpu->arch.mcg_status = mce->mcg_status;
1881                 banks[1] = mce->status;
1882                 kvm_queue_exception(vcpu, MC_VECTOR);
1883         } else if (!(banks[1] & MCI_STATUS_VAL)
1884                    || !(banks[1] & MCI_STATUS_UC)) {
1885                 if (banks[1] & MCI_STATUS_VAL)
1886                         mce->status |= MCI_STATUS_OVER;
1887                 banks[2] = mce->addr;
1888                 banks[3] = mce->misc;
1889                 banks[1] = mce->status;
1890         } else
1891                 banks[1] |= MCI_STATUS_OVER;
1892         return 0;
1893 }
1894
1895 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1896                                                struct kvm_vcpu_events *events)
1897 {
1898         vcpu_load(vcpu);
1899
1900         events->exception.injected = vcpu->arch.exception.pending;
1901         events->exception.nr = vcpu->arch.exception.nr;
1902         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
1903         events->exception.error_code = vcpu->arch.exception.error_code;
1904
1905         events->interrupt.injected = vcpu->arch.interrupt.pending;
1906         events->interrupt.nr = vcpu->arch.interrupt.nr;
1907         events->interrupt.soft = vcpu->arch.interrupt.soft;
1908
1909         events->nmi.injected = vcpu->arch.nmi_injected;
1910         events->nmi.pending = vcpu->arch.nmi_pending;
1911         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
1912
1913         events->sipi_vector = vcpu->arch.sipi_vector;
1914
1915         events->flags = 0;
1916
1917         vcpu_put(vcpu);
1918 }
1919
1920 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1921                                               struct kvm_vcpu_events *events)
1922 {
1923         if (events->flags)
1924                 return -EINVAL;
1925
1926         vcpu_load(vcpu);
1927
1928         vcpu->arch.exception.pending = events->exception.injected;
1929         vcpu->arch.exception.nr = events->exception.nr;
1930         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
1931         vcpu->arch.exception.error_code = events->exception.error_code;
1932
1933         vcpu->arch.interrupt.pending = events->interrupt.injected;
1934         vcpu->arch.interrupt.nr = events->interrupt.nr;
1935         vcpu->arch.interrupt.soft = events->interrupt.soft;
1936         if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
1937                 kvm_pic_clear_isr_ack(vcpu->kvm);
1938
1939         vcpu->arch.nmi_injected = events->nmi.injected;
1940         vcpu->arch.nmi_pending = events->nmi.pending;
1941         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
1942
1943         vcpu->arch.sipi_vector = events->sipi_vector;
1944
1945         vcpu_put(vcpu);
1946
1947         return 0;
1948 }
1949
1950 long kvm_arch_vcpu_ioctl(struct file *filp,
1951                          unsigned int ioctl, unsigned long arg)
1952 {
1953         struct kvm_vcpu *vcpu = filp->private_data;
1954         void __user *argp = (void __user *)arg;
1955         int r;
1956         struct kvm_lapic_state *lapic = NULL;
1957
1958         switch (ioctl) {
1959         case KVM_GET_LAPIC: {
1960                 r = -EINVAL;
1961                 if (!vcpu->arch.apic)
1962                         goto out;
1963                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1964
1965                 r = -ENOMEM;
1966                 if (!lapic)
1967                         goto out;
1968                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1969                 if (r)
1970                         goto out;
1971                 r = -EFAULT;
1972                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1973                         goto out;
1974                 r = 0;
1975                 break;
1976         }
1977         case KVM_SET_LAPIC: {
1978                 r = -EINVAL;
1979                 if (!vcpu->arch.apic)
1980                         goto out;
1981                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1982                 r = -ENOMEM;
1983                 if (!lapic)
1984                         goto out;
1985                 r = -EFAULT;
1986                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1987                         goto out;
1988                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1989                 if (r)
1990                         goto out;
1991                 r = 0;
1992                 break;
1993         }
1994         case KVM_INTERRUPT: {
1995                 struct kvm_interrupt irq;
1996
1997                 r = -EFAULT;
1998                 if (copy_from_user(&irq, argp, sizeof irq))
1999                         goto out;
2000                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2001                 if (r)
2002                         goto out;
2003                 r = 0;
2004                 break;
2005         }
2006         case KVM_NMI: {
2007                 r = kvm_vcpu_ioctl_nmi(vcpu);
2008                 if (r)
2009                         goto out;
2010                 r = 0;
2011                 break;
2012         }
2013         case KVM_SET_CPUID: {
2014                 struct kvm_cpuid __user *cpuid_arg = argp;
2015                 struct kvm_cpuid cpuid;
2016
2017                 r = -EFAULT;
2018                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2019                         goto out;
2020                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2021                 if (r)
2022                         goto out;
2023                 break;
2024         }
2025         case KVM_SET_CPUID2: {
2026                 struct kvm_cpuid2 __user *cpuid_arg = argp;
2027                 struct kvm_cpuid2 cpuid;
2028
2029                 r = -EFAULT;
2030                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2031                         goto out;
2032                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
2033                                               cpuid_arg->entries);
2034                 if (r)
2035                         goto out;
2036                 break;
2037         }
2038         case KVM_GET_CPUID2: {
2039                 struct kvm_cpuid2 __user *cpuid_arg = argp;
2040                 struct kvm_cpuid2 cpuid;
2041
2042                 r = -EFAULT;
2043                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2044                         goto out;
2045                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
2046                                               cpuid_arg->entries);
2047                 if (r)
2048                         goto out;
2049                 r = -EFAULT;
2050                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2051                         goto out;
2052                 r = 0;
2053                 break;
2054         }
2055         case KVM_GET_MSRS:
2056                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2057                 break;
2058         case KVM_SET_MSRS:
2059                 r = msr_io(vcpu, argp, do_set_msr, 0);
2060                 break;
2061         case KVM_TPR_ACCESS_REPORTING: {
2062                 struct kvm_tpr_access_ctl tac;
2063
2064                 r = -EFAULT;
2065                 if (copy_from_user(&tac, argp, sizeof tac))
2066                         goto out;
2067                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
2068                 if (r)
2069                         goto out;
2070                 r = -EFAULT;
2071                 if (copy_to_user(argp, &tac, sizeof tac))
2072                         goto out;
2073                 r = 0;
2074                 break;
2075         }
2076         case KVM_SET_VAPIC_ADDR: {
2077                 struct kvm_vapic_addr va;
2078
2079                 r = -EINVAL;
2080                 if (!irqchip_in_kernel(vcpu->kvm))
2081                         goto out;
2082                 r = -EFAULT;
2083                 if (copy_from_user(&va, argp, sizeof va))
2084                         goto out;
2085                 r = 0;
2086                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
2087                 break;
2088         }
2089         case KVM_X86_SETUP_MCE: {
2090                 u64 mcg_cap;
2091
2092                 r = -EFAULT;
2093                 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
2094                         goto out;
2095                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
2096                 break;
2097         }
2098         case KVM_X86_SET_MCE: {
2099                 struct kvm_x86_mce mce;
2100
2101                 r = -EFAULT;
2102                 if (copy_from_user(&mce, argp, sizeof mce))
2103                         goto out;
2104                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2105                 break;
2106         }
2107         case KVM_GET_VCPU_EVENTS: {
2108                 struct kvm_vcpu_events events;
2109
2110                 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2111
2112                 r = -EFAULT;
2113                 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2114                         break;
2115                 r = 0;
2116                 break;
2117         }
2118         case KVM_SET_VCPU_EVENTS: {
2119                 struct kvm_vcpu_events events;
2120
2121                 r = -EFAULT;
2122                 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2123                         break;
2124
2125                 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2126                 break;
2127         }
2128         default:
2129                 r = -EINVAL;
2130         }
2131 out:
2132         kfree(lapic);
2133         return r;
2134 }
2135
2136 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2137 {
2138         int ret;
2139
2140         if (addr > (unsigned int)(-3 * PAGE_SIZE))
2141                 return -1;
2142         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
2143         return ret;
2144 }
2145
2146 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
2147                                               u64 ident_addr)
2148 {
2149         kvm->arch.ept_identity_map_addr = ident_addr;
2150         return 0;
2151 }
2152
2153 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2154                                           u32 kvm_nr_mmu_pages)
2155 {
2156         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
2157                 return -EINVAL;
2158
2159         down_write(&kvm->slots_lock);
2160         spin_lock(&kvm->mmu_lock);
2161
2162         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
2163         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
2164
2165         spin_unlock(&kvm->mmu_lock);
2166         up_write(&kvm->slots_lock);
2167         return 0;
2168 }
2169
2170 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2171 {
2172         return kvm->arch.n_alloc_mmu_pages;
2173 }
2174
2175 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2176 {
2177         int i;
2178         struct kvm_mem_alias *alias;
2179
2180         for (i = 0; i < kvm->arch.naliases; ++i) {
2181                 alias = &kvm->arch.aliases[i];
2182                 if (gfn >= alias->base_gfn
2183                     && gfn < alias->base_gfn + alias->npages)
2184                         return alias->target_gfn + gfn - alias->base_gfn;
2185         }
2186         return gfn;
2187 }
2188
2189 /*
2190  * Set a new alias region.  Aliases map a portion of guest physical memory
2191  * onto another portion.  This is useful for memory windows, for example the PC
2192  * VGA region.
2193  */
2194 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2195                                          struct kvm_memory_alias *alias)
2196 {
2197         int r, n;
2198         struct kvm_mem_alias *p;
2199
2200         r = -EINVAL;
2201         /* General sanity checks */
2202         if (alias->memory_size & (PAGE_SIZE - 1))
2203                 goto out;
2204         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2205                 goto out;
2206         if (alias->slot >= KVM_ALIAS_SLOTS)
2207                 goto out;
2208         if (alias->guest_phys_addr + alias->memory_size
2209             < alias->guest_phys_addr)
2210                 goto out;
2211         if (alias->target_phys_addr + alias->memory_size
2212             < alias->target_phys_addr)
2213                 goto out;
2214
2215         down_write(&kvm->slots_lock);
2216         spin_lock(&kvm->mmu_lock);
2217
2218         p = &kvm->arch.aliases[alias->slot];
2219         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2220         p->npages = alias->memory_size >> PAGE_SHIFT;
2221         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2222
2223         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2224                 if (kvm->arch.aliases[n - 1].npages)
2225                         break;
2226         kvm->arch.naliases = n;
2227
2228         spin_unlock(&kvm->mmu_lock);
2229         kvm_mmu_zap_all(kvm);
2230
2231         up_write(&kvm->slots_lock);
2232
2233         return 0;
2234
2235 out:
2236         return r;
2237 }
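/*
 * Illustrative use (all values hypothetical): pointing the legacy VGA
 * window at a slice of a larger framebuffer slot might look like
 *
 *     alias.slot             = 0;
 *     alias.guest_phys_addr  = 0xa0000;
 *     alias.memory_size      = 0x20000;
 *     alias.target_phys_addr = fb_base + bank_offset;
 *
 * after which guest accesses to 0xa0000..0xbffff are redirected to the
 * target range; re-running the ioctl moves the window.
 */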
2238
2239 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2240 {
2241         int r;
2242
2243         r = 0;
2244         switch (chip->chip_id) {
2245         case KVM_IRQCHIP_PIC_MASTER:
2246                 memcpy(&chip->chip.pic,
2247                         &pic_irqchip(kvm)->pics[0],
2248                         sizeof(struct kvm_pic_state));
2249                 break;
2250         case KVM_IRQCHIP_PIC_SLAVE:
2251                 memcpy(&chip->chip.pic,
2252                         &pic_irqchip(kvm)->pics[1],
2253                         sizeof(struct kvm_pic_state));
2254                 break;
2255         case KVM_IRQCHIP_IOAPIC:
2256                 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2257                 break;
2258         default:
2259                 r = -EINVAL;
2260                 break;
2261         }
2262         return r;
2263 }
2264
2265 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2266 {
2267         int r;
2268
2269         r = 0;
2270         switch (chip->chip_id) {
2271         case KVM_IRQCHIP_PIC_MASTER:
2272                 spin_lock(&pic_irqchip(kvm)->lock);
2273                 memcpy(&pic_irqchip(kvm)->pics[0],
2274                         &chip->chip.pic,
2275                         sizeof(struct kvm_pic_state));
2276                 spin_unlock(&pic_irqchip(kvm)->lock);
2277                 break;
2278         case KVM_IRQCHIP_PIC_SLAVE:
2279                 spin_lock(&pic_irqchip(kvm)->lock);
2280                 memcpy(&pic_irqchip(kvm)->pics[1],
2281                         &chip->chip.pic,
2282                         sizeof(struct kvm_pic_state));
2283                 spin_unlock(&pic_irqchip(kvm)->lock);
2284                 break;
2285         case KVM_IRQCHIP_IOAPIC:
2286                 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2287                 break;
2288         default:
2289                 r = -EINVAL;
2290                 break;
2291         }
2292         kvm_pic_update_irq(pic_irqchip(kvm));
2293         return r;
2294 }
2295
2296 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2297 {
2298         int r = 0;
2299
2300         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2301         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2302         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2303         return r;
2304 }
2305
2306 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2307 {
2308         int r = 0;
2309
2310         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2311         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
2312         kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2313         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2314         return r;
2315 }
2316
2317 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2318 {
2319         int r = 0;
2320
2321         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2322         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2323                 sizeof(ps->channels));
2324         ps->flags = kvm->arch.vpit->pit_state.flags;
2325         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2326         return r;
2327 }
2328
2329 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2330 {
2331         int r = 0, start = 0;
2332         u32 prev_legacy, cur_legacy;
2333         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2334         prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2335         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2336         if (!prev_legacy && cur_legacy)
2337                 start = 1;
2338         memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2339                sizeof(kvm->arch.vpit->pit_state.channels));
2340         kvm->arch.vpit->pit_state.flags = ps->flags;
2341         kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2342         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2343         return r;
2344 }
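/*
 * Note: the channel-0 reload above is forced (start == 1) only when the
 * HPET legacy-routing flag transitions from clear to set, so re-writing an
 * unchanged PIT state does not restart the timer.
 */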
2345
2346 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2347                                  struct kvm_reinject_control *control)
2348 {
2349         if (!kvm->arch.vpit)
2350                 return -ENXIO;
2351         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2352         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2353         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2354         return 0;
2355 }
2356
2357 /*
2358  * Get (and clear) the dirty memory log for a memory slot.
2359  */
2360 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2361                                       struct kvm_dirty_log *log)
2362 {
2363         int r;
2364         int n;
2365         struct kvm_memory_slot *memslot;
2366         int is_dirty = 0;
2367
2368         down_write(&kvm->slots_lock);
2369
2370         r = kvm_get_dirty_log(kvm, log, &is_dirty);
2371         if (r)
2372                 goto out;
2373
2374         /* If nothing is dirty, don't bother messing with page tables. */
2375         if (is_dirty) {
2376                 spin_lock(&kvm->mmu_lock);
2377                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2378                 spin_unlock(&kvm->mmu_lock);
2379                 memslot = &kvm->memslots[log->slot];
2380                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
2381                 memset(memslot->dirty_bitmap, 0, n);
2382         }
2383         r = 0;
2384 out:
2385         up_write(&kvm->slots_lock);
2386         return r;
2387 }
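/*
 * Illustrative userspace call (not part of this file; vm_fd, slot_id and
 * bitmap are hypothetical): the caller supplies a bitmap of at least
 * ALIGN(npages, BITS_PER_LONG) / 8 bytes, matching the size cleared above:
 *
 *     struct kvm_dirty_log log = {
 *             .slot         = slot_id,
 *             .dirty_bitmap = bitmap,
 *     };
 *
 *     ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */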
2388
2389 long kvm_arch_vm_ioctl(struct file *filp,
2390                        unsigned int ioctl, unsigned long arg)
2391 {
2392         struct kvm *kvm = filp->private_data;
2393         void __user *argp = (void __user *)arg;
2394         int r = -ENOTTY;
2395         /*
2396          * This union makes it completely explicit to gcc-3.x
2397          * that these variables' stack usage should be
2398          * combined, not added together.
2399          */
2400         union {
2401                 struct kvm_pit_state ps;
2402                 struct kvm_pit_state2 ps2;
2403                 struct kvm_memory_alias alias;
2404                 struct kvm_pit_config pit_config;
2405         } u;
2406
2407         switch (ioctl) {
2408         case KVM_SET_TSS_ADDR:
2409                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
2410                 if (r < 0)
2411                         goto out;
2412                 break;
2413         case KVM_SET_IDENTITY_MAP_ADDR: {
2414                 u64 ident_addr;
2415
2416                 r = -EFAULT;
2417                 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
2418                         goto out;
2419                 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
2420                 if (r < 0)
2421                         goto out;
2422                 break;
2423         }
2424         case KVM_SET_MEMORY_REGION: {
2425                 struct kvm_memory_region kvm_mem;
2426                 struct kvm_userspace_memory_region kvm_userspace_mem;
2427
2428                 r = -EFAULT;
2429                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2430                         goto out;
2431                 kvm_userspace_mem.slot = kvm_mem.slot;
2432                 kvm_userspace_mem.flags = kvm_mem.flags;
2433                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2434                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2435                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2436                 if (r)
2437                         goto out;
2438                 break;
2439         }
2440         case KVM_SET_NR_MMU_PAGES:
2441                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2442                 if (r)
2443                         goto out;
2444                 break;
2445         case KVM_GET_NR_MMU_PAGES:
2446                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2447                 break;
2448         case KVM_SET_MEMORY_ALIAS:
2449                 r = -EFAULT;
2450                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2451                         goto out;
2452                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2453                 if (r)
2454                         goto out;
2455                 break;
2456         case KVM_CREATE_IRQCHIP: {
2457                 struct kvm_pic *vpic;
2458
2459                 mutex_lock(&kvm->lock);
2460                 r = -EEXIST;
2461                 if (kvm->arch.vpic)
2462                         goto create_irqchip_unlock;
2463                 r = -ENOMEM;
2464                 vpic = kvm_create_pic(kvm);
2465                 if (vpic) {
2466                         r = kvm_ioapic_init(kvm);
2467                         if (r) {
2468                                 kfree(vpic);
2469                                 goto create_irqchip_unlock;
2470                         }
2471                 } else
2472                         goto create_irqchip_unlock;
2473                 smp_wmb();
2474                 kvm->arch.vpic = vpic;
2475                 smp_wmb();
2476                 r = kvm_setup_default_irq_routing(kvm);
2477                 if (r) {
2478                         mutex_lock(&kvm->irq_lock);
2479                         kfree(kvm->arch.vpic);
2480                         kfree(kvm->arch.vioapic);
2481                         kvm->arch.vpic = NULL;
2482                         kvm->arch.vioapic = NULL;
2483                         mutex_unlock(&kvm->irq_lock);
2484                 }
2485         create_irqchip_unlock:
2486                 mutex_unlock(&kvm->lock);
2487                 break;
2488         }
2489         case KVM_CREATE_PIT:
2490                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2491                 goto create_pit;
2492         case KVM_CREATE_PIT2:
2493                 r = -EFAULT;
2494                 if (copy_from_user(&u.pit_config, argp,
2495                                    sizeof(struct kvm_pit_config)))
2496                         goto out;
2497         create_pit:
2498                 down_write(&kvm->slots_lock);
2499                 r = -EEXIST;
2500                 if (kvm->arch.vpit)
2501                         goto create_pit_unlock;
2502                 r = -ENOMEM;
2503                 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
2504                 if (kvm->arch.vpit)
2505                         r = 0;
2506         create_pit_unlock:
2507                 up_write(&kvm->slots_lock);
2508                 break;
2509         case KVM_IRQ_LINE_STATUS:
2510         case KVM_IRQ_LINE: {
2511                 struct kvm_irq_level irq_event;
2512
2513                 r = -EFAULT;
2514                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2515                         goto out;
2516                 if (irqchip_in_kernel(kvm)) {
2517                         __s32 status;
2518                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2519                                         irq_event.irq, irq_event.level);
2520                         if (ioctl == KVM_IRQ_LINE_STATUS) {
2521                                 irq_event.status = status;
2522                                 if (copy_to_user(argp, &irq_event,
2523                                                         sizeof irq_event))
2524                                         goto out;
2525                         }
2526                         r = 0;
2527                 }
2528                 break;
2529         }
2530         case KVM_GET_IRQCHIP: {
2531                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2532                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2533
2534                 r = -ENOMEM;
2535                 if (!chip)
2536                         goto out;
2537                 r = -EFAULT;
2538                 if (copy_from_user(chip, argp, sizeof *chip))
2539                         goto get_irqchip_out;
2540                 r = -ENXIO;
2541                 if (!irqchip_in_kernel(kvm))
2542                         goto get_irqchip_out;
2543                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
2544                 if (r)
2545                         goto get_irqchip_out;
2546                 r = -EFAULT;
2547                 if (copy_to_user(argp, chip, sizeof *chip))
2548                         goto get_irqchip_out;
2549                 r = 0;
2550         get_irqchip_out:
2551                 kfree(chip);
2552                 if (r)
2553                         goto out;
2554                 break;
2555         }
2556         case KVM_SET_IRQCHIP: {
2557                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2558                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2559
2560                 r = -ENOMEM;
2561                 if (!chip)
2562                         goto out;
2563                 r = -EFAULT;
2564                 if (copy_from_user(chip, argp, sizeof *chip))
2565                         goto set_irqchip_out;
2566                 r = -ENXIO;
2567                 if (!irqchip_in_kernel(kvm))
2568                         goto set_irqchip_out;
2569                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
2570                 if (r)
2571                         goto set_irqchip_out;
2572                 r = 0;
2573         set_irqchip_out:
2574                 kfree(chip);
2575                 if (r)
2576                         goto out;
2577                 break;
2578         }
2579         case KVM_GET_PIT: {
2580                 r = -EFAULT;
2581                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2582                         goto out;
2583                 r = -ENXIO;
2584                 if (!kvm->arch.vpit)
2585                         goto out;
2586                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2587                 if (r)
2588                         goto out;
2589                 r = -EFAULT;
2590                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2591                         goto out;
2592                 r = 0;
2593                 break;
2594         }
2595         case KVM_SET_PIT: {
2596                 r = -EFAULT;
2597                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
2598                         goto out;
2599                 r = -ENXIO;
2600                 if (!kvm->arch.vpit)
2601                         goto out;
2602                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2603                 if (r)
2604                         goto out;
2605                 r = 0;
2606                 break;
2607         }
2608         case KVM_GET_PIT2: {
2609                 r = -ENXIO;
2610                 if (!kvm->arch.vpit)
2611                         goto out;
2612                 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
2613                 if (r)
2614                         goto out;
2615                 r = -EFAULT;
2616                 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
2617                         goto out;
2618                 r = 0;
2619                 break;
2620         }
2621         case KVM_SET_PIT2: {
2622                 r = -EFAULT;
2623                 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
2624                         goto out;
2625                 r = -ENXIO;
2626                 if (!kvm->arch.vpit)
2627                         goto out;
2628                 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
2629                 if (r)
2630                         goto out;
2631                 r = 0;
2632                 break;
2633         }
2634         case KVM_REINJECT_CONTROL: {
2635                 struct kvm_reinject_control control;
2636                 r =  -EFAULT;
2637                 if (copy_from_user(&control, argp, sizeof(control)))
2638                         goto out;
2639                 r = kvm_vm_ioctl_reinject(kvm, &control);
2640                 if (r)
2641                         goto out;
2642                 r = 0;
2643                 break;
2644         }
2645         case KVM_XEN_HVM_CONFIG: {
2646                 r = -EFAULT;
2647                 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2648                                    sizeof(struct kvm_xen_hvm_config)))
2649                         goto out;
2650                 r = -EINVAL;
2651                 if (kvm->arch.xen_hvm_config.flags)
2652                         goto out;
2653                 r = 0;
2654                 break;
2655         }
2656         case KVM_SET_CLOCK: {
2657                 struct timespec now;
2658                 struct kvm_clock_data user_ns;
2659                 u64 now_ns;
2660                 s64 delta;
2661
2662                 r = -EFAULT;
2663                 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2664                         goto out;
2665
2666                 r = -EINVAL;
2667                 if (user_ns.flags)
2668                         goto out;
2669
2670                 r = 0;
2671                 ktime_get_ts(&now);
2672                 now_ns = timespec_to_ns(&now);
2673                 delta = user_ns.clock - now_ns;
2674                 kvm->arch.kvmclock_offset = delta;
2675                 break;
2676         }
2677         case KVM_GET_CLOCK: {
2678                 struct timespec now;
2679                 struct kvm_clock_data user_ns;
2680                 u64 now_ns;
2681
2682                 ktime_get_ts(&now);
2683                 now_ns = timespec_to_ns(&now);
2684                 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2685                 user_ns.flags = 0;
2686
2687                 r = -EFAULT;
2688                 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2689                         goto out;
2690                 r = 0;
2691                 break;
2692         }
2693
2694         default:
2695                 ;
2696         }
2697 out:
2698         return r;
2699 }
2700
2701 static void kvm_init_msr_list(void)
2702 {
2703         u32 dummy[2];
2704         unsigned i, j;
2705
2706         /* skip the first MSRs in the list; they are KVM-specific */
2707         for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2708                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2709                         continue;
2710                 if (j < i)
2711                         msrs_to_save[j] = msrs_to_save[i];
2712                 j++;
2713         }
2714         num_msrs_to_save = j;
2715 }
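/*
 * kvm_init_msr_list() compacts msrs_to_save in place: each MSR is probed
 * with rdmsr_safe() and dropped if the host faults on it, so userspace only
 * ever sees MSRs this host can actually provide.
 */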
2716
2717 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2718                            const void *v)
2719 {
2720         if (vcpu->arch.apic &&
2721             !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2722                 return 0;
2723
2724         return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
2725 }
2726
2727 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2728 {
2729         if (vcpu->arch.apic &&
2730             !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2731                 return 0;
2732
2733         return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
2734 }
2735
2736 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2737                                struct kvm_vcpu *vcpu)
2738 {
2739         void *data = val;
2740         int r = X86EMUL_CONTINUE;
2741
2742         while (bytes) {
2743                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2744                 unsigned offset = addr & (PAGE_SIZE-1);
2745                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2746                 int ret;
2747
2748                 if (gpa == UNMAPPED_GVA) {
2749                         r = X86EMUL_PROPAGATE_FAULT;
2750                         goto out;
2751                 }
2752                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2753                 if (ret < 0) {
2754                         r = X86EMUL_UNHANDLEABLE;
2755                         goto out;
2756                 }
2757
2758                 bytes -= toread;
2759                 data += toread;
2760                 addr += toread;
2761         }
2762 out:
2763         return r;
2764 }
2765
2766 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2767                                 struct kvm_vcpu *vcpu)
2768 {
2769         void *data = val;
2770         int r = X86EMUL_CONTINUE;
2771
2772         while (bytes) {
2773                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2774                 unsigned offset = addr & (PAGE_SIZE-1);
2775                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2776                 int ret;
2777
2778                 if (gpa == UNMAPPED_GVA) {
2779                         r = X86EMUL_PROPAGATE_FAULT;
2780                         goto out;
2781                 }
2782                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2783                 if (ret < 0) {
2784                         r = X86EMUL_UNHANDLEABLE;
2785                         goto out;
2786                 }
2787
2788                 bytes -= towrite;
2789                 data += towrite;
2790                 addr += towrite;
2791         }
2792 out:
2793         return r;
2794 }
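/*
 * Both helpers above chunk the access at page granularity because a
 * gva->gpa translation is only valid within one page.  For example, a
 * 16-byte read at a gva ending in 0xff8 is split into 8 bytes from the
 * first page and 8 bytes from the next, each translated separately.
 */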
2795
2796
2797 static int emulator_read_emulated(unsigned long addr,
2798                                   void *val,
2799                                   unsigned int bytes,
2800                                   struct kvm_vcpu *vcpu)
2801 {
2802         gpa_t                 gpa;
2803
2804         if (vcpu->mmio_read_completed) {
2805                 memcpy(val, vcpu->mmio_data, bytes);
2806                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
2807                                vcpu->mmio_phys_addr, *(u64 *)val);
2808                 vcpu->mmio_read_completed = 0;
2809                 return X86EMUL_CONTINUE;
2810         }
2811
2812         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2813
2814         /* For APIC access vmexit */
2815         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2816                 goto mmio;
2817
2818         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2819                                 == X86EMUL_CONTINUE)
2820                 return X86EMUL_CONTINUE;
2821         if (gpa == UNMAPPED_GVA)
2822                 return X86EMUL_PROPAGATE_FAULT;
2823
2824 mmio:
2825         /*
2826          * Is this MMIO handled locally?
2827          */
2828         if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2829                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
2830                 return X86EMUL_CONTINUE;
2831         }
2832
2833         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
2834
2835         vcpu->mmio_needed = 1;
2836         vcpu->mmio_phys_addr = gpa;
2837         vcpu->mmio_size = bytes;
2838         vcpu->mmio_is_write = 0;
2839
2840         return X86EMUL_UNHANDLEABLE;
2841 }
2842
2843 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2844                           const void *val, int bytes)
2845 {
2846         int ret;
2847
2848         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2849         if (ret < 0)
2850                 return 0;
2851         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2852         return 1;
2853 }
2854
2855 static int emulator_write_emulated_onepage(unsigned long addr,
2856                                            const void *val,
2857                                            unsigned int bytes,
2858                                            struct kvm_vcpu *vcpu)
2859 {
2860         gpa_t                 gpa;
2861
2862         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2863
2864         if (gpa == UNMAPPED_GVA) {
2865                 kvm_inject_page_fault(vcpu, addr, 2);
2866                 return X86EMUL_PROPAGATE_FAULT;
2867         }
2868
2869         /* For APIC access vmexit */
2870         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2871                 goto mmio;
2872
2873         if (emulator_write_phys(vcpu, gpa, val, bytes))
2874                 return X86EMUL_CONTINUE;
2875
2876 mmio:
2877         trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
2878         /*
2879          * Is this MMIO handled locally?
2880          */
2881         if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2882                 return X86EMUL_CONTINUE;
2883
2884         vcpu->mmio_needed = 1;
2885         vcpu->mmio_phys_addr = gpa;
2886         vcpu->mmio_size = bytes;
2887         vcpu->mmio_is_write = 1;
2888         memcpy(vcpu->mmio_data, val, bytes);
2889
2890         return X86EMUL_CONTINUE;
2891 }
2892
2893 int emulator_write_emulated(unsigned long addr,
2894                                    const void *val,
2895                                    unsigned int bytes,
2896                                    struct kvm_vcpu *vcpu)
2897 {
2898         /* Crossing a page boundary? */
2899         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2900                 int rc, now;
2901
2902                 now = -addr & ~PAGE_MASK;
2903                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2904                 if (rc != X86EMUL_CONTINUE)
2905                         return rc;
2906                 addr += now;
2907                 val += now;
2908                 bytes -= now;
2909         }
2910         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2911 }
2912 EXPORT_SYMBOL_GPL(emulator_write_emulated);
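/*
 * Worked example for the split above: with 4K pages, a 4-byte write at
 * addr 0x1ffe yields now = -0x1ffe & 0xfff = 2, so bytes at 0x1ffe-0x1fff
 * go through the first onepage call and 0x2000-0x2001 through the second.
 */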
2913
2914 static int emulator_cmpxchg_emulated(unsigned long addr,
2915                                      const void *old,
2916                                      const void *new,
2917                                      unsigned int bytes,
2918                                      struct kvm_vcpu *vcpu)
2919 {
2920         printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
2921 #ifndef CONFIG_X86_64
2922         /* a guest's cmpxchg8b has to be emulated atomically */
2923         if (bytes == 8) {
2924                 gpa_t gpa;
2925                 struct page *page;
2926                 char *kaddr;
2927                 u64 val;
2928
2929                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2930
2931                 if (gpa == UNMAPPED_GVA ||
2932                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2933                         goto emul_write;
2934
2935                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2936                         goto emul_write;
2937
2938                 val = *(u64 *)new;
2939
2940                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2941
2942                 kaddr = kmap_atomic(page, KM_USER0);
2943                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2944                 kunmap_atomic(kaddr, KM_USER0);
2945                 kvm_release_page_dirty(page);
2946         }
2947 emul_write:
2948 #endif
2949
2950         return emulator_write_emulated(addr, new, bytes, vcpu);
2951 }
2952
2953 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2954 {
2955         return kvm_x86_ops->get_segment_base(vcpu, seg);
2956 }
2957
2958 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2959 {
2960         kvm_mmu_invlpg(vcpu, address);
2961         return X86EMUL_CONTINUE;
2962 }
2963
2964 int emulate_clts(struct kvm_vcpu *vcpu)
2965 {
2966         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2967         return X86EMUL_CONTINUE;
2968 }
2969
2970 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2971 {
2972         struct kvm_vcpu *vcpu = ctxt->vcpu;
2973
2974         switch (dr) {
2975         case 0 ... 3:
2976                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2977                 return X86EMUL_CONTINUE;
2978         default:
2979                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2980                 return X86EMUL_UNHANDLEABLE;
2981         }
2982 }
2983
2984 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2985 {
2986         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2987         int exception;
2988
2989         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2990         if (exception) {
2991                 /* FIXME: better handling */
2992                 return X86EMUL_UNHANDLEABLE;
2993         }
2994         return X86EMUL_CONTINUE;
2995 }
2996
2997 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2998 {
2999         u8 opcodes[4];
3000         unsigned long rip = kvm_rip_read(vcpu);
3001         unsigned long rip_linear;
3002
3003         if (!printk_ratelimit())
3004                 return;
3005
3006         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
3007
3008         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
3009
3010         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3011                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
3012 }
3013 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3014
3015 static struct x86_emulate_ops emulate_ops = {
3016         .read_std            = kvm_read_guest_virt,
3017         .read_emulated       = emulator_read_emulated,
3018         .write_emulated      = emulator_write_emulated,
3019         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
3020 };
3021
3022 static void cache_all_regs(struct kvm_vcpu *vcpu)
3023 {
3024         kvm_register_read(vcpu, VCPU_REGS_RAX);
3025         kvm_register_read(vcpu, VCPU_REGS_RSP);
3026         kvm_register_read(vcpu, VCPU_REGS_RIP);
3027         vcpu->arch.regs_dirty = ~0;
3028 }
3029
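/*
 * Emulate a single guest instruction.
 *
 * Unless EMULTYPE_NO_DECODE is set, the instruction at the current RIP is
 * decoded first; with EMULTYPE_TRAP_UD only VMMCALL, sysenter, sysexit and
 * syscall are allowed through.  EMULTYPE_SKIP merely advances RIP past the
 * decoded instruction.  Returns EMULATE_DONE, EMULATE_FAIL, or
 * EMULATE_DO_MMIO when the access must be completed by userspace.
 */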
3030 int emulate_instruction(struct kvm_vcpu *vcpu,
3031                         unsigned long cr2,
3032                         u16 error_code,
3033                         int emulation_type)
3034 {
3035         int r, shadow_mask;
3036         struct decode_cache *c;
3037         struct kvm_run *run = vcpu->run;
3038
3039         kvm_clear_exception_queue(vcpu);
3040         vcpu->arch.mmio_fault_cr2 = cr2;
3041         /*
3042          * TODO: fix emulate.c to use guest_read/write_register
3043          * instead of direct ->regs accesses; this can save hundreds of
3044          * cycles on Intel for instructions that don't read/change RSP,
3045          * for example.
3046          */
3047         cache_all_regs(vcpu);
3048
3049         vcpu->mmio_is_write = 0;
3050         vcpu->arch.pio.string = 0;
3051
3052         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3053                 int cs_db, cs_l;
3054                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3055
3056                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3057                 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
3058                 vcpu->arch.emulate_ctxt.mode =
3059                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3060                         ? X86EMUL_MODE_REAL : cs_l
3061                         ? X86EMUL_MODE_PROT64 : cs_db
3062                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3063
3064                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3065
3066                 /* Only allow emulation of specific instructions on #UD
3067                  * (namely VMMCALL, sysenter, sysexit, syscall). */
3068                 c = &vcpu->arch.emulate_ctxt.decode;
3069                 if (emulation_type & EMULTYPE_TRAP_UD) {
3070                         if (!c->twobyte)
3071                                 return EMULATE_FAIL;
3072                         switch (c->b) {
3073                         case 0x01: /* VMMCALL */
3074                                 if (c->modrm_mod != 3 || c->modrm_rm != 1)
3075                                         return EMULATE_FAIL;
3076                                 break;
3077                         case 0x34: /* sysenter */
3078                         case 0x35: /* sysexit */
3079                                 if (c->modrm_mod != 0 || c->modrm_rm != 0)
3080                                         return EMULATE_FAIL;
3081                                 break;
3082                         case 0x05: /* syscall */
3083                                 if (c->modrm_mod != 0 || c->modrm_rm != 0)
3084                                         return EMULATE_FAIL;
3085                                 break;
3086                         default:
3087                                 return EMULATE_FAIL;
3088                         }
3089
3090                         if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
3091                                 return EMULATE_FAIL;
3092                 }
3093
3094                 ++vcpu->stat.insn_emulation;
3095                 if (r) {
3096                         ++vcpu->stat.insn_emulation_fail;
3097                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3098                                 return EMULATE_DONE;
3099                         return EMULATE_FAIL;
3100                 }
3101         }
3102
3103         if (emulation_type & EMULTYPE_SKIP) {
3104                 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
3105                 return EMULATE_DONE;
3106         }
3107
3108         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3109         shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3110
3111         if (r == 0)
3112                 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
3113
3114         if (vcpu->arch.pio.string)
3115                 return EMULATE_DO_MMIO;
3116
3117         if ((r || vcpu->mmio_is_write) && run) {
3118                 run->exit_reason = KVM_EXIT_MMIO;
3119                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
3120                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
3121                 run->mmio.len = vcpu->mmio_size;
3122                 run->mmio.is_write = vcpu->mmio_is_write;
3123         }
3124
3125         if (r) {
3126                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3127                         return EMULATE_DONE;
3128                 if (!vcpu->mmio_needed) {
3129                         kvm_report_emulation_failure(vcpu, "mmio");
3130                         return EMULATE_FAIL;
3131                 }
3132                 return EMULATE_DO_MMIO;
3133         }
3134
3135         kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3136
3137         if (vcpu->mmio_is_write) {
3138                 vcpu->mmio_needed = 0;
3139                 return EMULATE_DO_MMIO;
3140         }
3141
3142         return EMULATE_DONE;
3143 }
3144 EXPORT_SYMBOL_GPL(emulate_instruction);
3145
3146 static int pio_copy_data(struct kvm_vcpu *vcpu)
3147 {
3148         void *p = vcpu->arch.pio_data;
3149         gva_t q = vcpu->arch.pio.guest_gva;
3150         unsigned bytes;
3151         int ret;
3152
3153         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3154         if (vcpu->arch.pio.in)
3155                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
3156         else
3157                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
3158         return ret;
3159 }
3160
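/*
 * Finish a port I/O operation once the data is available in pio_data.
 * For non-string "in" accesses the result is copied into RAX; for string
 * accesses the data is copied to/from guest memory and RSI/RDI (plus RCX
 * for REP) are advanced by the number of bytes transferred, honoring the
 * direction flag via io->down.
 */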
3161 int complete_pio(struct kvm_vcpu *vcpu)
3162 {
3163         struct kvm_pio_request *io = &vcpu->arch.pio;
3164         long delta;
3165         int r;
3166         unsigned long val;
3167
3168         if (!io->string) {
3169                 if (io->in) {
3170                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3171                         memcpy(&val, vcpu->arch.pio_data, io->size);
3172                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
3173                 }
3174         } else {
3175                 if (io->in) {
3176                         r = pio_copy_data(vcpu);
3177                         if (r)
3178                                 return r;
3179                 }
3180
3181                 delta = 1;
3182                 if (io->rep) {
3183                         delta *= io->cur_count;
3184                         /*
3185                          * The size of the register should really depend on
3186                          * current address size.
3187                          */
3188                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
3189                         val -= delta;
3190                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
3191                 }
3192                 if (io->down)
3193                         delta = -delta;
3194                 delta *= io->size;
3195                 if (io->in) {
3196                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
3197                         val += delta;
3198                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
3199                 } else {
3200                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
3201                         val += delta;
3202                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3203                 }
3204         }
3205
3206         io->count -= io->cur_count;
3207         io->cur_count = 0;
3208
3209         return 0;
3210 }
3211
3212 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3213 {
3214         /* TODO: string I/O for in-kernel devices */
3215         int r;
3216
3217         if (vcpu->arch.pio.in)
3218                 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
3219                                     vcpu->arch.pio.size, pd);
3220         else
3221                 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
3222                                      vcpu->arch.pio.size, pd);
3223         return r;
3224 }
3225
3226 static int pio_string_write(struct kvm_vcpu *vcpu)
3227 {
3228         struct kvm_pio_request *io = &vcpu->arch.pio;
3229         void *pd = vcpu->arch.pio_data;
3230         int i, r = 0;
3231
3232         for (i = 0; i < io->cur_count; i++) {
3233                 if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
3234                                      io->port, io->size, pd)) {
3235                         r = -EOPNOTSUPP;
3236                         break;
3237                 }
3238                 pd += io->size;
3239         }
3240         return r;
3241 }
3242
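/*
 * Emulate a single (non-string) IN/OUT.  The kvm_run I/O block is filled in
 * so that userspace can service the access; if an in-kernel device on the
 * PIO bus claims the port, the access is completed here and 1 is returned,
 * otherwise 0 is returned and the vcpu exits with KVM_EXIT_IO.
 */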
3243 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3244 {
3245         unsigned long val;
3246
3247         vcpu->run->exit_reason = KVM_EXIT_IO;
3248         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3249         vcpu->run->io.size = vcpu->arch.pio.size = size;
3250         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3251         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3252         vcpu->run->io.port = vcpu->arch.pio.port = port;
3253         vcpu->arch.pio.in = in;
3254         vcpu->arch.pio.string = 0;
3255         vcpu->arch.pio.down = 0;
3256         vcpu->arch.pio.rep = 0;
3257
3258         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3259                       size, 1);
3260
3261         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3262         memcpy(vcpu->arch.pio_data, &val, 4);
3263
3264         if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3265                 complete_pio(vcpu);
3266                 return 1;
3267         }
3268         return 0;
3269 }
3270 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3271
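/*
 * Emulate a string IN/OUT (INS/OUTS).  The request is clipped so it does not
 * cross a page boundary; backwards (direction-flag) string I/O is not
 * supported and injects #GP.  String writes are copied from guest memory and
 * handed to the PIO bus; as with kvm_emulate_pio(), a return value of 1
 * means the access was handled in the kernel, 0 means exit to userspace.
 */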
3272 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3273                   int size, unsigned long count, int down,
3274                   gva_t address, int rep, unsigned port)
3275 {
3276         unsigned now, in_page;
3277         int ret = 0;
3278
3279         vcpu->run->exit_reason = KVM_EXIT_IO;
3280         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3281         vcpu->run->io.size = vcpu->arch.pio.size = size;
3282         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3283         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
3284         vcpu->run->io.port = vcpu->arch.pio.port = port;
3285         vcpu->arch.pio.in = in;
3286         vcpu->arch.pio.string = 1;
3287         vcpu->arch.pio.down = down;
3288         vcpu->arch.pio.rep = rep;
3289
3290         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3291                       size, count);
3292
3293         if (!count) {
3294                 kvm_x86_ops->skip_emulated_instruction(vcpu);
3295                 return 1;
3296         }
3297
3298         if (!down)
3299                 in_page = PAGE_SIZE - offset_in_page(address);
3300         else
3301                 in_page = offset_in_page(address) + size;
3302         now = min(count, (unsigned long)in_page / size);
3303         if (!now)
3304                 now = 1;
3305         if (down) {
3306                 /*
3307                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
3308                  */
3309                 pr_unimpl(vcpu, "guest string pio down\n");
3310                 kvm_inject_gp(vcpu, 0);
3311                 return 1;
3312         }
3313         vcpu->run->io.count = now;
3314         vcpu->arch.pio.cur_count = now;
3315
3316         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3317                 kvm_x86_ops->skip_emulated_instruction(vcpu);
3318
3319         vcpu->arch.pio.guest_gva = address;
3320
3321         if (!vcpu->arch.pio.in) {
3322                 /* string PIO write */
3323                 ret = pio_copy_data(vcpu);
3324                 if (ret == X86EMUL_PROPAGATE_FAULT) {
3325                         kvm_inject_gp(vcpu, 0);
3326                         return 1;
3327                 }
3328                 if (ret == 0 && !pio_string_write(vcpu)) {
3329                         complete_pio(vcpu);
3330                         if (vcpu->arch.pio.count == 0)
3331                                 ret = 1;
3332                 }
3333         }
3334         /* no string PIO read support yet */
3335
3336         return ret;
3337 }
3338 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
3339
3340 static void bounce_off(void *info)
3341 {
3342         /* nothing */
3343 }
3344
3345 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3346                                      void *data)
3347 {
3348         struct cpufreq_freqs *freq = data;
3349         struct kvm *kvm;
3350         struct kvm_vcpu *vcpu;
3351         int i, send_ipi = 0;
3352
3353         if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3354                 return 0;
3355         if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3356                 return 0;
3357         per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3358
3359         spin_lock(&kvm_lock);
3360         list_for_each_entry(kvm, &vm_list, vm_list) {
3361                 kvm_for_each_vcpu(i, vcpu, kvm) {
3362                         if (vcpu->cpu != freq->cpu)
3363                                 continue;
3364                         if (!kvm_request_guest_time_update(vcpu))
3365                                 continue;
3366                         if (vcpu->cpu != smp_processor_id())
3367                                 send_ipi++;
3368                 }
3369         }
3370         spin_unlock(&kvm_lock);
3371
3372         if (freq->old < freq->new && send_ipi) {
3373                 /*
3374                  * We upscale the frequency.  Must make sure the guest
3375                  * doesn't see old kvmclock values while running with
3376                  * the new frequency, otherwise we risk the guest seeing
3377                  * time go backwards.
3378                  *
3379                  * In case we update the frequency for another cpu
3380                  * (which might be in guest context) send an interrupt
3381                  * to kick the cpu out of guest context.  Next time
3382                  * guest context is entered kvmclock will be updated,
3383                  * so the guest will not see stale values.
3384                  */
3385                 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
3386         }
3387         return 0;
3388 }
3389
3390 static struct notifier_block kvmclock_cpufreq_notifier_block = {
3391         .notifier_call  = kvmclock_cpufreq_notifier
3392 };
3393
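/*
 * Seed the per-cpu TSC frequency used for kvmclock.  On hosts without a
 * constant TSC it is taken from cpufreq (falling back to tsc_khz) and kept
 * up to date by the transition notifier above; otherwise every cpu simply
 * uses tsc_khz.
 */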
3394 static void kvm_timer_init(void)
3395 {
3396         int cpu;
3397
3398         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3399                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3400                                           CPUFREQ_TRANSITION_NOTIFIER);
3401                 for_each_online_cpu(cpu) {
3402                         unsigned long khz = cpufreq_get(cpu);
3403                         if (!khz)
3404                                 khz = tsc_khz;
3405                         per_cpu(cpu_tsc_khz, cpu) = khz;
3406                 }
3407         } else {
3408                 for_each_possible_cpu(cpu)
3409                         per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3410         }
3411 }
3412
3413 int kvm_arch_init(void *opaque)
3414 {
3415         int r;
3416         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3417
3418         if (kvm_x86_ops) {
3419                 printk(KERN_ERR "kvm: already loaded the other module\n");
3420                 r = -EEXIST;
3421                 goto out;
3422         }
3423
3424         if (!ops->cpu_has_kvm_support()) {
3425                 printk(KERN_ERR "kvm: no hardware support\n");
3426                 r = -EOPNOTSUPP;
3427                 goto out;
3428         }
3429         if (ops->disabled_by_bios()) {
3430                 printk(KERN_ERR "kvm: disabled by bios\n");
3431                 r = -EOPNOTSUPP;
3432                 goto out;
3433         }
3434
3435         r = kvm_mmu_module_init();
3436         if (r)
3437                 goto out;
3438
3439         kvm_init_msr_list();
3440
3441         kvm_x86_ops = ops;
3442         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3443         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
3444         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3445                         PT_DIRTY_MASK, PT64_NX_MASK, 0);
3446
3447         kvm_timer_init();
3448
3449         return 0;
3450
3451 out:
3452         return r;
3453 }
3454
3455 void kvm_arch_exit(void)
3456 {
3457         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
3458                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
3459                                             CPUFREQ_TRANSITION_NOTIFIER);
3460         kvm_x86_ops = NULL;
3461         kvm_mmu_module_exit();
3462 }
3463
3464 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
3465 {
3466         ++vcpu->stat.halt_exits;
3467         if (irqchip_in_kernel(vcpu->kvm)) {
3468                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
3469                 return 1;
3470         } else {
3471                 vcpu->run->exit_reason = KVM_EXIT_HLT;
3472                 return 0;
3473         }
3474 }
3475 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
3476
3477 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3478                            unsigned long a1)
3479 {
3480         if (is_long_mode(vcpu))
3481                 return a0;
3482         else
3483                 return a0 | ((gpa_t)a1 << 32);
3484 }
3485
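/*
 * Dispatch a guest hypercall.  The hypercall number is taken from RAX and up
 * to four arguments from RBX, RCX, RDX and RSI; the return value is placed
 * back in RAX.  Outside long mode the registers are truncated to 32 bits,
 * and calls from CPL > 0 fail with -KVM_EPERM.
 *
 * Illustrative guest-side sequence (a sketch only; the vendor-specific
 * VMCALL/VMMCALL opcode is patched in by kvm_fix_hypercall() below):
 *
 *	mov $KVM_HC_VAPIC_POLL_IRQ, %rax
 *	vmcall
 *	# result is now in %rax
 */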
3486 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3487 {
3488         unsigned long nr, a0, a1, a2, a3, ret;
3489         int r = 1;
3490
3491         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3492         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3493         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
3494         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
3495         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
3496
3497         trace_kvm_hypercall(nr, a0, a1, a2, a3);
3498
3499         if (!is_long_mode(vcpu)) {
3500                 nr &= 0xFFFFFFFF;
3501                 a0 &= 0xFFFFFFFF;
3502                 a1 &= 0xFFFFFFFF;
3503                 a2 &= 0xFFFFFFFF;
3504                 a3 &= 0xFFFFFFFF;
3505         }
3506
3507         if (kvm_x86_ops->get_cpl(vcpu) != 0) {
3508                 ret = -KVM_EPERM;
3509                 goto out;
3510         }
3511
3512         switch (nr) {
3513         case KVM_HC_VAPIC_POLL_IRQ:
3514                 ret = 0;
3515                 break;
3516         case KVM_HC_MMU_OP:
3517                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
3518                 break;
3519         default:
3520                 ret = -KVM_ENOSYS;
3521                 break;
3522         }
3523 out:
3524         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3525         ++vcpu->stat.hypercalls;
3526         return r;
3527 }
3528 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3529
3530 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3531 {
3532         char instruction[3];
3533         int ret = 0;
3534         unsigned long rip = kvm_rip_read(vcpu);
3535
3536
3537         /*
3538          * Blow out the MMU so that no other VCPU has an active mapping;
3539          * this ensures that the updated hypercall appears atomically
3540          * across all VCPUs.
3541          */
3542         kvm_mmu_zap_all(vcpu->kvm);
3543
3544         kvm_x86_ops->patch_hypercall(vcpu, instruction);
3545         if (emulator_write_emulated(rip, instruction, 3, vcpu)
3546             != X86EMUL_CONTINUE)
3547                 ret = -EFAULT;
3548
3549         return ret;
3550 }
3551
3552 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3553 {
3554         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3555 }
3556
3557 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3558 {
3559         struct descriptor_table dt = { limit, base };
3560
3561         kvm_x86_ops->set_gdt(vcpu, &dt);
3562 }
3563
3564 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3565 {
3566         struct descriptor_table dt = { limit, base };
3567
3568         kvm_x86_ops->set_idt(vcpu, &dt);
3569 }
3570
3571 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3572                    unsigned long *rflags)
3573 {
3574         kvm_lmsw(vcpu, msw);
3575         *rflags = kvm_get_rflags(vcpu);
3576 }
3577
3578 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3579 {
3580         unsigned long value;
3581
3582         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3583         switch (cr) {
3584         case 0:
3585                 value = vcpu->arch.cr0;
3586                 break;
3587         case 2:
3588                 value = vcpu->arch.cr2;
3589                 break;
3590         case 3:
3591                 value = vcpu->arch.cr3;
3592                 break;
3593         case 4:
3594                 value = vcpu->arch.cr4;
3595                 break;
3596         case 8:
3597                 value = kvm_get_cr8(vcpu);
3598                 break;
3599         default:
3600                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3601                 return 0;
3602         }
3603
3604         return value;
3605 }
3606
3607 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3608                      unsigned long *rflags)
3609 {
3610         switch (cr) {
3611         case 0:
3612                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3613                 *rflags = kvm_get_rflags(vcpu);
3614                 break;
3615         case 2:
3616                 vcpu->arch.cr2 = val;
3617                 break;
3618         case 3:
3619                 kvm_set_cr3(vcpu, val);
3620                 break;
3621         case 4:
3622                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3623                 break;
3624         case 8:
3625                 kvm_set_cr8(vcpu, val & 0xfUL);
3626                 break;
3627         default:
3628                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3629         }
3630 }
3631
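/*
 * Some CPUID leaves (classically leaf 2) are stateful: each read returns the
 * next entry in a rotating set.  Clear the "read next" marker on the current
 * entry and move it to the following entry with the same function number.
 */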
3632 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3633 {
3634         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3635         int j, nent = vcpu->arch.cpuid_nent;
3636
3637         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3638         /* when no next entry is found, the current entry[i] is reselected */
3639         for (j = i + 1; ; j = (j + 1) % nent) {
3640                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3641                 if (ej->function == e->function) {
3642                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3643                         return j;
3644                 }
3645         }
3646         return 0; /* silence gcc, even though control never reaches here */
3647 }
3648
3649 /* find an entry with matching function, matching index (if needed), and that
3650  * should be read next (if it's stateful) */
3651 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3652         u32 function, u32 index)
3653 {
3654         if (e->function != function)
3655                 return 0;
3656         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3657                 return 0;
3658         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3659             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3660                 return 0;
3661         return 1;
3662 }
3663
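/*
 * Look up the guest CPUID entry for @function/@index.  If there is no exact
 * match, the entry with the highest function number in the same range
 * (basic vs. extended) is returned instead.
 */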
3664 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3665                                               u32 function, u32 index)
3666 {
3667         int i;
3668         struct kvm_cpuid_entry2 *best = NULL;
3669
3670         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3671                 struct kvm_cpuid_entry2 *e;
3672
3673                 e = &vcpu->arch.cpuid_entries[i];
3674                 if (is_matching_cpuid_entry(e, function, index)) {
3675                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3676                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3677                         best = e;
3678                         break;
3679                 }
3680                 /*
3681                  * Both basic or both extended?
3682                  */
3683                 if (((e->function ^ function) & 0x80000000) == 0)
3684                         if (!best || e->function > best->function)
3685                                 best = e;
3686         }
3687         return best;
3688 }
3689
3690 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3691 {
3692         struct kvm_cpuid_entry2 *best;
3693
3694         best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3695         if (best)
3696                 return best->eax & 0xff;
3697         return 36;
3698 }
3699
3700 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3701 {
3702         u32 function, index;
3703         struct kvm_cpuid_entry2 *best;
3704
3705         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3706         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3707         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3708         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3709         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3710         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3711         best = kvm_find_cpuid_entry(vcpu, function, index);
3712         if (best) {
3713                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3714                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3715                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3716                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3717         }
3718         kvm_x86_ops->skip_emulated_instruction(vcpu);
3719         trace_kvm_cpuid(function,
3720                         kvm_register_read(vcpu, VCPU_REGS_RAX),
3721                         kvm_register_read(vcpu, VCPU_REGS_RBX),
3722                         kvm_register_read(vcpu, VCPU_REGS_RCX),
3723                         kvm_register_read(vcpu, VCPU_REGS_RDX));
3724 }
3725 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3726
3727 /*
3728  * Check if userspace requested an interrupt window, and that the
3729  * interrupt window is open.
3730  *
3731  * No need to exit to userspace if we already have an interrupt queued.
3732  */
3733 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3734 {
3735         return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3736                 vcpu->run->request_interrupt_window &&
3737                 kvm_arch_interrupt_allowed(vcpu));
3738 }
3739
3740 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3741 {
3742         struct kvm_run *kvm_run = vcpu->run;
3743
3744         kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3745         kvm_run->cr8 = kvm_get_cr8(vcpu);
3746         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3747         if (irqchip_in_kernel(vcpu->kvm))
3748                 kvm_run->ready_for_interrupt_injection = 1;
3749         else
3750                 kvm_run->ready_for_interrupt_injection =
3751                         kvm_arch_interrupt_allowed(vcpu) &&
3752                         !kvm_cpu_has_interrupt(vcpu) &&
3753                         !kvm_event_needs_reinjection(vcpu);
3754 }
3755
3756 static void vapic_enter(struct kvm_vcpu *vcpu)
3757 {
3758         struct kvm_lapic *apic = vcpu->arch.apic;
3759         struct page *page;
3760
3761         if (!apic || !apic->vapic_addr)
3762                 return;
3763
3764         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3765
3766         vcpu->arch.apic->vapic_page = page;
3767 }
3768
3769 static void vapic_exit(struct kvm_vcpu *vcpu)
3770 {
3771         struct kvm_lapic *apic = vcpu->arch.apic;
3772
3773         if (!apic || !apic->vapic_addr)
3774                 return;
3775
3776         down_read(&vcpu->kvm->slots_lock);
3777         kvm_release_page_dirty(apic->vapic_page);
3778         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3779         up_read(&vcpu->kvm->slots_lock);
3780 }
3781
3782 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3783 {
3784         int max_irr, tpr;
3785
3786         if (!kvm_x86_ops->update_cr8_intercept)
3787                 return;
3788
3789         if (!vcpu->arch.apic)
3790                 return;
3791
3792         if (!vcpu->arch.apic->vapic_addr)
3793                 max_irr = kvm_lapic_find_highest_irr(vcpu);
3794         else
3795                 max_irr = -1;
3796
3797         if (max_irr != -1)
3798                 max_irr >>= 4;
3799
3800         tpr = kvm_lapic_get_cr8(vcpu);
3801
3802         kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3803 }
3804
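/*
 * (Re)inject at most one event into the guest, in priority order: a pending
 * exception first, then an NMI that was already injected, then an already
 * queued interrupt.  Only if nothing needs reinjection is a new NMI or
 * external interrupt delivered, and only when the corresponding window is
 * open.
 */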
3805 static void inject_pending_event(struct kvm_vcpu *vcpu)
3806 {
3807         /* try to reinject previous events if any */
3808         if (vcpu->arch.exception.pending) {
3809                 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
3810                                           vcpu->arch.exception.has_error_code,
3811                                           vcpu->arch.exception.error_code);
3812                 return;
3813         }
3814
3815         if (vcpu->arch.nmi_injected) {
3816                 kvm_x86_ops->set_nmi(vcpu);
3817                 return;
3818         }
3819
3820         if (vcpu->arch.interrupt.pending) {
3821                 kvm_x86_ops->set_irq(vcpu);
3822                 return;
3823         }
3824
3825         /* try to inject new event if pending */
3826         if (vcpu->arch.nmi_pending) {
3827                 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3828                         vcpu->arch.nmi_pending = false;
3829                         vcpu->arch.nmi_injected = true;
3830                         kvm_x86_ops->set_nmi(vcpu);
3831                 }
3832         } else if (kvm_cpu_has_interrupt(vcpu)) {
3833                 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3834                         kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3835                                             false);
3836                         kvm_x86_ops->set_irq(vcpu);
3837                 }
3838         }
3839 }
3840
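/*
 * One iteration of the guest entry/exit path: service pending requests,
 * disable interrupts and bail out again if new work arrived in the meantime,
 * inject events, switch debug registers if needed, run the guest and hand
 * the exit reason to kvm_x86_ops->handle_exit().  A positive return value
 * keeps __vcpu_run() looping; zero or a negative value drops back to
 * userspace.
 */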
3841 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3842 {
3843         int r;
3844         bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3845                 vcpu->run->request_interrupt_window;
3846
3847         if (vcpu->requests)
3848                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
3849                         kvm_mmu_unload(vcpu);
3850
3851         r = kvm_mmu_reload(vcpu);
3852         if (unlikely(r))
3853                 goto out;
3854
3855         if (vcpu->requests) {
3856                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
3857                         __kvm_migrate_timers(vcpu);
3858                 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3859                         kvm_write_guest_time(vcpu);
3860                 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
3861                         kvm_mmu_sync_roots(vcpu);
3862                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
3863                         kvm_x86_ops->tlb_flush(vcpu);
3864                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3865                                        &vcpu->requests)) {
3866                         vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3867                         r = 0;
3868                         goto out;
3869                 }
3870                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3871                         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3872                         r = 0;
3873                         goto out;
3874                 }
3875         }
3876
3877         preempt_disable();
3878
3879         kvm_x86_ops->prepare_guest_switch(vcpu);
3880         kvm_load_guest_fpu(vcpu);
3881
3882         local_irq_disable();
3883
3884         clear_bit(KVM_REQ_KICK, &vcpu->requests);
3885         smp_mb__after_clear_bit();
3886
3887         if (vcpu->requests || need_resched() || signal_pending(current)) {
3888                 set_bit(KVM_REQ_KICK, &vcpu->requests);
3889                 local_irq_enable();
3890                 preempt_enable();
3891                 r = 1;
3892                 goto out;
3893         }
3894
3895         inject_pending_event(vcpu);
3896
3897         /* enable NMI/IRQ window open exits if needed */
3898         if (vcpu->arch.nmi_pending)
3899                 kvm_x86_ops->enable_nmi_window(vcpu);
3900         else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3901                 kvm_x86_ops->enable_irq_window(vcpu);
3902
3903         if (kvm_lapic_enabled(vcpu)) {
3904                 update_cr8_intercept(vcpu);
3905                 kvm_lapic_sync_to_vapic(vcpu);
3906         }
3907
3908         up_read(&vcpu->kvm->slots_lock);
3909
3910         kvm_guest_enter();
3911
3912         if (unlikely(vcpu->arch.switch_db_regs)) {
3913                 set_debugreg(0, 7);
3914                 set_debugreg(vcpu->arch.eff_db[0], 0);
3915                 set_debugreg(vcpu->arch.eff_db[1], 1);
3916                 set_debugreg(vcpu->arch.eff_db[2], 2);
3917                 set_debugreg(vcpu->arch.eff_db[3], 3);
3918         }
3919
3920         trace_kvm_entry(vcpu->vcpu_id);
3921         kvm_x86_ops->run(vcpu);
3922
3923         if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
3924                 set_debugreg(current->thread.debugreg0, 0);
3925                 set_debugreg(current->thread.debugreg1, 1);
3926                 set_debugreg(current->thread.debugreg2, 2);
3927                 set_debugreg(current->thread.debugreg3, 3);
3928                 set_debugreg(current->thread.debugreg6, 6);
3929                 set_debugreg(current->thread.debugreg7, 7);
3930         }
3931
3932         set_bit(KVM_REQ_KICK, &vcpu->requests);
3933         local_irq_enable();
3934
3935         ++vcpu->stat.exits;
3936
3937         /*
3938          * We must have an instruction between local_irq_enable() and
3939          * kvm_guest_exit(), so the timer interrupt isn't delayed by
3940          * the interrupt shadow.  The stat.exits increment will do nicely.
3941          * But we need to prevent reordering, hence this barrier():
3942          */
3943         barrier();
3944
3945         kvm_guest_exit();
3946
3947         preempt_enable();
3948
3949         down_read(&vcpu->kvm->slots_lock);
3950
3951         /*
3952          * Profile KVM exit RIPs:
3953          */
3954         if (unlikely(prof_on == KVM_PROFILING)) {
3955                 unsigned long rip = kvm_rip_read(vcpu);
3956                 profile_hit(KVM_PROFILING, (void *)rip);
3957         }
3958
3959
3960         kvm_lapic_sync_from_vapic(vcpu);
3961
3962         r = kvm_x86_ops->handle_exit(vcpu);
3963 out:
3964         return r;
3965 }
3966
3967
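/*
 * The main vcpu loop: keep entering the guest while the vcpu is RUNNABLE,
 * otherwise block until it becomes runnable again.  The loop exits with
 * -EINTR when a signal is pending or when userspace asked for an interrupt
 * window, and drops the slots lock around blocking and rescheduling.
 */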
3968 static int __vcpu_run(struct kvm_vcpu *vcpu)
3969 {
3970         int r;
3971
3972         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3973                 pr_debug("vcpu %d received sipi with vector # %x\n",
3974                          vcpu->vcpu_id, vcpu->arch.sipi_vector);
3975                 kvm_lapic_reset(vcpu);
3976                 r = kvm_arch_vcpu_reset(vcpu);
3977                 if (r)
3978                         return r;
3979                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3980         }
3981
3982         down_read(&vcpu->kvm->slots_lock);
3983         vapic_enter(vcpu);
3984
3985         r = 1;
3986         while (r > 0) {
3987                 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3988                         r = vcpu_enter_guest(vcpu);
3989                 else {
3990                         up_read(&vcpu->kvm->slots_lock);
3991                         kvm_vcpu_block(vcpu);
3992                         down_read(&vcpu->kvm->slots_lock);
3993                         if (test_and_clear_bit(KVM_REQ_UNHALT,
3994                                                &vcpu->requests)) {
3995                                 switch (vcpu->arch.mp_state) {
3996                                 case KVM_MP_STATE_HALTED:
3997                                         vcpu->arch.mp_state =
3998                                                 KVM_MP_STATE_RUNNABLE;
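                                        /* fall through */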
3999                                 case KVM_MP_STATE_RUNNABLE:
4000                                         break;
4001                                 case KVM_MP_STATE_SIPI_RECEIVED:
4002                                 default:
4003                                         r = -EINTR;
4004                                         break;
4005                                 }
4006                         }
4007                 }
4008
4009                 if (r <= 0)
4010                         break;
4011
4012                 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
4013                 if (kvm_cpu_has_pending_timer(vcpu))
4014                         kvm_inject_pending_timer_irqs(vcpu);
4015
4016                 if (dm_request_for_irq_injection(vcpu)) {
4017                         r = -EINTR;
4018                         vcpu->run->exit_reason = KVM_EXIT_INTR;
4019                         ++vcpu->stat.request_irq_exits;
4020                 }
4021                 if (signal_pending(current)) {
4022                         r = -EINTR;
4023                         vcpu->run->exit_reason = KVM_EXIT_INTR;
4024                         ++vcpu->stat.signal_exits;
4025                 }
4026                 if (need_resched()) {
4027                         up_read(&vcpu->kvm->slots_lock);
4028                         kvm_resched(vcpu);
4029                         down_read(&vcpu->kvm->slots_lock);
4030                 }
4031         }
4032
4033         up_read(&vcpu->kvm->slots_lock);
4034         post_kvm_run_save(vcpu);
4035
4036         vapic_exit(vcpu);
4037
4038         return r;
4039 }
4040
4041 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4042 {
4043         int r;
4044         sigset_t sigsaved;
4045
4046         vcpu_load(vcpu);
4047
4048         if (vcpu->sigset_active)
4049                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4050
4051         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
4052                 kvm_vcpu_block(vcpu);
4053                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
4054                 r = -EAGAIN;
4055                 goto out;
4056         }
4057
4058         /* re-sync apic's tpr */
4059         if (!irqchip_in_kernel(vcpu->kvm))
4060                 kvm_set_cr8(vcpu, kvm_run->cr8);
4061
4062         if (vcpu->arch.pio.cur_count) {
4063                 r = complete_pio(vcpu);
4064                 if (r)
4065                         goto out;
4066         }
4067         if (vcpu->mmio_needed) {
4068                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4069                 vcpu->mmio_read_completed = 1;
4070                 vcpu->mmio_needed = 0;
4071
4072                 down_read(&vcpu->kvm->slots_lock);
4073                 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
4074                                         EMULTYPE_NO_DECODE);
4075                 up_read(&vcpu->kvm->slots_lock);
4076                 if (r == EMULATE_DO_MMIO) {
4077                         /*
4078                          * Read-modify-write.  Back to userspace.
4079                          */
4080                         r = 0;
4081                         goto out;
4082                 }
4083         }
4084         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
4085                 kvm_register_write(vcpu, VCPU_REGS_RAX,
4086                                      kvm_run->hypercall.ret);
4087
4088         r = __vcpu_run(vcpu);
4089
4090 out:
4091         if (vcpu->sigset_active)
4092                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4093
4094         vcpu_put(vcpu);
4095         return r;
4096 }
4097
4098 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4099 {
4100         vcpu_load(vcpu);
4101
4102         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4103         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4104         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4105         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4106         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4107         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4108         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4109         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4110 #ifdef CONFIG_X86_64
4111         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
4112         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
4113         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
4114         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
4115         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
4116         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
4117         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
4118         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
4119 #endif
4120
4121         regs->rip = kvm_rip_read(vcpu);
4122         regs->rflags = kvm_get_rflags(vcpu);
4123
4124         vcpu_put(vcpu);
4125
4126         return 0;
4127 }
4128
4129 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4130 {
4131         vcpu_load(vcpu);
4132
4133         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4134         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4135         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
4136         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
4137         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
4138         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
4139         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
4140         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
4141 #ifdef CONFIG_X86_64
4142         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
4143         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
4144         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
4145         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
4146         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
4147         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
4148         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
4149         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
4150 #endif
4151
4152         kvm_rip_write(vcpu, regs->rip);
4153         kvm_set_rflags(vcpu, regs->rflags);
4154
4155         vcpu->arch.exception.pending = false;
4156
4157         vcpu_put(vcpu);
4158
4159         return 0;
4160 }
4161
4162 void kvm_get_segment(struct kvm_vcpu *vcpu,
4163                      struct kvm_segment *var, int seg)
4164 {
4165         kvm_x86_ops->get_segment(vcpu, var, seg);
4166 }
4167
4168 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4169 {
4170         struct kvm_segment cs;
4171
4172         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
4173         *db = cs.db;
4174         *l = cs.l;
4175 }
4176 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4177
4178 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4179                                   struct kvm_sregs *sregs)
4180 {
4181         struct descriptor_table dt;
4182
4183         vcpu_load(vcpu);
4184
4185         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4186         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4187         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4188         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4189         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4190         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4191
4192         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4193         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4194
4195         kvm_x86_ops->get_idt(vcpu, &dt);
4196         sregs->idt.limit = dt.limit;
4197         sregs->idt.base = dt.base;
4198         kvm_x86_ops->get_gdt(vcpu, &dt);
4199         sregs->gdt.limit = dt.limit;
4200         sregs->gdt.base = dt.base;
4201
4202         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4203         sregs->cr0 = vcpu->arch.cr0;
4204         sregs->cr2 = vcpu->arch.cr2;
4205         sregs->cr3 = vcpu->arch.cr3;
4206         sregs->cr4 = vcpu->arch.cr4;
4207         sregs->cr8 = kvm_get_cr8(vcpu);
4208         sregs->efer = vcpu->arch.shadow_efer;
4209         sregs->apic_base = kvm_get_apic_base(vcpu);
4210
4211         memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
4212
4213         if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
4214                 set_bit(vcpu->arch.interrupt.nr,
4215                         (unsigned long *)sregs->interrupt_bitmap);
4216
4217         vcpu_put(vcpu);
4218
4219         return 0;
4220 }
4221
4222 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4223                                     struct kvm_mp_state *mp_state)
4224 {
4225         vcpu_load(vcpu);
4226         mp_state->mp_state = vcpu->arch.mp_state;
4227         vcpu_put(vcpu);
4228         return 0;
4229 }
4230
4231 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4232                                     struct kvm_mp_state *mp_state)
4233 {
4234         vcpu_load(vcpu);
4235         vcpu->arch.mp_state = mp_state->mp_state;
4236         vcpu_put(vcpu);
4237         return 0;
4238 }
4239
4240 static void kvm_set_segment(struct kvm_vcpu *vcpu,
4241                         struct kvm_segment *var, int seg)
4242 {
4243         kvm_x86_ops->set_segment(vcpu, var, seg);
4244 }
4245
4246 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4247                                    struct kvm_segment *kvm_desct)
4248 {
4249         kvm_desct->base = get_desc_base(seg_desc);
4250         kvm_desct->limit = get_desc_limit(seg_desc);
4251         if (seg_desc->g) {
4252                 kvm_desct->limit <<= 12;
4253                 kvm_desct->limit |= 0xfff;
4254         }
4255         kvm_desct->selector = selector;
4256         kvm_desct->type = seg_desc->type;
4257         kvm_desct->present = seg_desc->p;
4258         kvm_desct->dpl = seg_desc->dpl;
4259         kvm_desct->db = seg_desc->d;
4260         kvm_desct->s = seg_desc->s;
4261         kvm_desct->l = seg_desc->l;
4262         kvm_desct->g = seg_desc->g;
4263         kvm_desct->avl = seg_desc->avl;
4264         if (!selector)
4265                 kvm_desct->unusable = 1;
4266         else
4267                 kvm_desct->unusable = 0;
4268         kvm_desct->padding = 0;
4269 }
4270
4271 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4272                                           u16 selector,
4273                                           struct descriptor_table *dtable)
4274 {
4275         if (selector & 1 << 2) {
4276                 struct kvm_segment kvm_seg;
4277
4278                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4279
4280                 if (kvm_seg.unusable)
4281                         dtable->limit = 0;
4282                 else
4283                         dtable->limit = kvm_seg.limit;
4284                 dtable->base = kvm_seg.base;
4285         }
4286         else
4287                 kvm_x86_ops->get_gdt(vcpu, dtable);
4288 }
4289
4290 /* allowed just for 8 byte segment descriptors */
4291 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4292                                          struct desc_struct *seg_desc)
4293 {
4294         struct descriptor_table dtable;
4295         u16 index = selector >> 3;
4296
4297         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4298
4299         if (dtable.limit < index * 8 + 7) {
4300                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4301                 return 1;
4302         }
4303         return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4304 }
4305
4306 /* allowed just for 8 byte segment descriptors */
4307 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4308                                          struct desc_struct *seg_desc)
4309 {
4310         struct descriptor_table dtable;
4311         u16 index = selector >> 3;
4312
4313         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4314
4315         if (dtable.limit < index * 8 + 7)
4316                 return 1;
4317         return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4318 }
4319
4320 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
4321                              struct desc_struct *seg_desc)
4322 {
4323         u32 base_addr = get_desc_base(seg_desc);
4324
4325         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
4326 }
4327
4328 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4329 {
4330         struct kvm_segment kvm_seg;
4331
4332         kvm_get_segment(vcpu, &kvm_seg, seg);
4333         return kvm_seg.selector;
4334 }
4335
4336 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4337                                                 u16 selector,
4338                                                 struct kvm_segment *kvm_seg)
4339 {
4340         struct desc_struct seg_desc;
4341
4342         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4343                 return 1;
4344         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4345         return 0;
4346 }
4347
4348 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4349 {
4350         struct kvm_segment segvar = {
4351                 .base = selector << 4,
4352                 .limit = 0xffff,
4353                 .selector = selector,
4354                 .type = 3,
4355                 .present = 1,
4356                 .dpl = 3,
4357                 .db = 0,
4358                 .s = 1,
4359                 .l = 0,
4360                 .g = 0,
4361                 .avl = 0,
4362                 .unusable = 0,
4363         };
4364         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4365         return 0;
4366 }
4367
4368 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4369 {
4370         return (seg != VCPU_SREG_LDTR) &&
4371                 (seg != VCPU_SREG_TR) &&
4372                 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4373 }
4374
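/*
 * Load a segment register from @selector.  In vm86 or real mode the segment
 * base is simply selector << 4; in protected mode the descriptor is fetched
 * from the GDT or LDT and converted to a struct kvm_segment, and for
 * segments other than CS, SS and LDTR a descriptor with the S bit clear is
 * marked unusable.  Returns 0 on success, 1 if the descriptor could not be
 * loaded.
 */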
4375 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4376                                 int type_bits, int seg)
4377 {
4378         struct kvm_segment kvm_seg;
4379
4380         if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
4381                 return kvm_load_realmode_segment(vcpu, selector, seg);
4382         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4383                 return 1;
4384         kvm_seg.type |= type_bits;
4385
4386         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
4387             seg != VCPU_SREG_LDTR)
4388                 if (!kvm_seg.s)
4389                         kvm_seg.unusable = 1;
4390
4391         kvm_set_segment(vcpu, &kvm_seg, seg);
4392         return 0;
4393 }
4394
4395 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4396                                 struct tss_segment_32 *tss)
4397 {
4398         tss->cr3 = vcpu->arch.cr3;
4399         tss->eip = kvm_rip_read(vcpu);
4400         tss->eflags = kvm_get_rflags(vcpu);
4401         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4402         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4403         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4404         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4405         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4406         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4407         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4408         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4409         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4410         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4411         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4412         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4413         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4414         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4415         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4416 }
4417
4418 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4419                                   struct tss_segment_32 *tss)
4420 {
4421         kvm_set_cr3(vcpu, tss->cr3);
4422
4423         kvm_rip_write(vcpu, tss->eip);
4424         kvm_set_rflags(vcpu, tss->eflags | 2);
4425
4426         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4427         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4428         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4429         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4430         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4431         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4432         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4433         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4434
4435         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
4436                 return 1;
4437
4438         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4439                 return 1;
4440
4441         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4442                 return 1;
4443
4444         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4445                 return 1;
4446
4447         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4448                 return 1;
4449
4450         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
4451                 return 1;
4452
4453         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
4454                 return 1;
4455         return 0;
4456 }
4457
4458 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4459                                 struct tss_segment_16 *tss)
4460 {
4461         tss->ip = kvm_rip_read(vcpu);
4462         tss->flag = kvm_get_rflags(vcpu);
4463         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4464         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4465         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4466         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4467         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4468         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4469         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
4470         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
4471
4472         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4473         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4474         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4475         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4476         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4477 }
4478
4479 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4480                                  struct tss_segment_16 *tss)
4481 {
4482         kvm_rip_write(vcpu, tss->ip);
4483         kvm_set_rflags(vcpu, tss->flag | 2);
4484         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4485         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4486         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
4487         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
4488         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
4489         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
4490         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4491         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4492
4493         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
4494                 return 1;
4495
4496         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4497                 return 1;
4498
4499         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4500                 return 1;
4501
4502         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4503                 return 1;
4504
4505         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4506                 return 1;
4507         return 0;
4508 }
4509
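/*
 * Memory side of a switch to a 16-bit TSS: save the current vcpu state
 * into the old TSS, read the new TSS, optionally store the previous
 * task link for a nested task, and load the vcpu state from the new
 * image.  Returns 1 on success, 0 if any guest access or load fails.
 */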
4510 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4511                               u16 old_tss_sel, u32 old_tss_base,
4512                               struct desc_struct *nseg_desc)
4513 {
4514         struct tss_segment_16 tss_segment_16;
4515         int ret = 0;
4516
4517         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4518                            sizeof tss_segment_16))
4519                 goto out;
4520
4521         save_state_to_tss16(vcpu, &tss_segment_16);
4522
4523         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4524                             sizeof tss_segment_16))
4525                 goto out;
4526
4527         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4528                            &tss_segment_16, sizeof tss_segment_16))
4529                 goto out;
4530
4531         if (old_tss_sel != 0xffff) {
4532                 tss_segment_16.prev_task_link = old_tss_sel;
4533
4534                 if (kvm_write_guest(vcpu->kvm,
4535                                     get_tss_base_addr(vcpu, nseg_desc),
4536                                     &tss_segment_16.prev_task_link,
4537                                     sizeof tss_segment_16.prev_task_link))
4538                         goto out;
4539         }
4540
4541         if (load_state_from_tss16(vcpu, &tss_segment_16))
4542                 goto out;
4543
4544         ret = 1;
4545 out:
4546         return ret;
4547 }
4548
4549 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4550                        u16 old_tss_sel, u32 old_tss_base,
4551                        struct desc_struct *nseg_desc)
4552 {
4553         struct tss_segment_32 tss_segment_32;
4554         int ret = 0;
4555
4556         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4557                            sizeof tss_segment_32))
4558                 goto out;
4559
4560         save_state_to_tss32(vcpu, &tss_segment_32);
4561
4562         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4563                             sizeof tss_segment_32))
4564                 goto out;
4565
4566         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4567                            &tss_segment_32, sizeof tss_segment_32))
4568                 goto out;
4569
4570         if (old_tss_sel != 0xffff) {
4571                 tss_segment_32.prev_task_link = old_tss_sel;
4572
4573                 if (kvm_write_guest(vcpu->kvm,
4574                                     get_tss_base_addr(vcpu, nseg_desc),
4575                                     &tss_segment_32.prev_task_link,
4576                                     sizeof tss_segment_32.prev_task_link))
4577                         goto out;
4578         }
4579
4580         if (load_state_from_tss32(vcpu, &tss_segment_32))
4581                 goto out;
4582
4583         ret = 1;
4584 out:
4585         return ret;
4586 }
4587
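/*
 * Emulate a hardware task switch: fetch the old and new TSS
 * descriptors, apply the TSS limit check and (except for IRET) the
 * privilege check, clear the old task's busy bit for IRET/JMP, adjust
 * EFLAGS.NT as the switch reason requires, hand off to the 16- or
 * 32-bit TSS handler, mark the new descriptor busy, set CR0.TS and
 * load TR with the new selector.
 */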
4588 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4589 {
4590         struct kvm_segment tr_seg;
4591         struct desc_struct cseg_desc;
4592         struct desc_struct nseg_desc;
4593         int ret = 0;
4594         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4595         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4596
4597         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
4598
4599         /* FIXME: Handle errors. Failure to read either TSS or its
4600          * descriptor should generate a page fault.
4601          */
4602         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
4603                 goto out;
4604
4605         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
4606                 goto out;
4607
4608         if (reason != TASK_SWITCH_IRET) {
4609                 int cpl;
4610
4611                 cpl = kvm_x86_ops->get_cpl(vcpu);
4612                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
4613                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4614                         return 1;
4615                 }
4616         }
4617
4618         if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
4619                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4620                 return 1;
4621         }
4622
4623         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
4624                 cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
4625                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
4626         }
4627
4628         if (reason == TASK_SWITCH_IRET) {
4629                 u32 eflags = kvm_get_rflags(vcpu);
4630                 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4631         }
4632
4633         /* set the back link to the previous task only if the NT bit is set
4634            in eflags; note that old_tss_sel is not used after this point */
4635         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4636                 old_tss_sel = 0xffff;
4637
4638         if (nseg_desc.type & 8)
4639                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4640                                          old_tss_base, &nseg_desc);
4641         else
4642                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
4643                                          old_tss_base, &nseg_desc);
4644
4645         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4646                 u32 eflags = kvm_get_rflags(vcpu);
4647                 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4648         }
4649
4650         if (reason != TASK_SWITCH_IRET) {
4651                 nseg_desc.type |= (1 << 1);
4652                 save_guest_segment_descriptor(vcpu, tss_selector,
4653                                               &nseg_desc);
4654         }
4655
4656         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
4657         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4658         tr_seg.type = 11;
4659         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
4660 out:
4661         return ret;
4662 }
4663 EXPORT_SYMBOL_GPL(kvm_task_switch);
4664
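/*
 * Install the special register state supplied by userspace via
 * KVM_SET_SREGS: descriptor tables, control registers, EFER, the APIC
 * base and the segment registers.  The MMU context is reset whenever
 * CR0, CR3, CR4 or EFER changed, the PDPTRs are reloaded for PAE
 * paging without long mode, and a pending interrupt from the bitmap
 * is re-queued.
 */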
4665 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4666                                   struct kvm_sregs *sregs)
4667 {
4668         int mmu_reset_needed = 0;
4669         int pending_vec, max_bits;
4670         struct descriptor_table dt;
4671
4672         vcpu_load(vcpu);
4673
4674         dt.limit = sregs->idt.limit;
4675         dt.base = sregs->idt.base;
4676         kvm_x86_ops->set_idt(vcpu, &dt);
4677         dt.limit = sregs->gdt.limit;
4678         dt.base = sregs->gdt.base;
4679         kvm_x86_ops->set_gdt(vcpu, &dt);
4680
4681         vcpu->arch.cr2 = sregs->cr2;
4682         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4683         vcpu->arch.cr3 = sregs->cr3;
4684
4685         kvm_set_cr8(vcpu, sregs->cr8);
4686
4687         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4688         kvm_x86_ops->set_efer(vcpu, sregs->efer);
4689         kvm_set_apic_base(vcpu, sregs->apic_base);
4690
4691         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4692
4693         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4694         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4695         vcpu->arch.cr0 = sregs->cr0;
4696
4697         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4698         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4699         if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4700                 load_pdptrs(vcpu, vcpu->arch.cr3);
4701                 mmu_reset_needed = 1;
4702         }
4703
4704         if (mmu_reset_needed)
4705                 kvm_mmu_reset_context(vcpu);
4706
4707         max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4708         pending_vec = find_first_bit(
4709                 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
4710         if (pending_vec < max_bits) {
4711                 kvm_queue_interrupt(vcpu, pending_vec, false);
4712                 pr_debug("Set back pending irq %d\n", pending_vec);
4713                 if (irqchip_in_kernel(vcpu->kvm))
4714                         kvm_pic_clear_isr_ack(vcpu->kvm);
4715         }
4716
4717         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4718         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4719         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4720         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4721         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4722         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4723
4724         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4725         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4726
4727         update_cr8_intercept(vcpu);
4728
4729         /* Older userspace won't unhalt the vcpu on reset. */
4730         if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4731             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4732             !(vcpu->arch.cr0 & X86_CR0_PE))
4733                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4734
4735         vcpu_put(vcpu);
4736
4737         return 0;
4738 }
4739
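/*
 * Configure guest debugging for KVM_SET_GUEST_DEBUG: optionally inject
 * a #DB or #BP, choose between userspace-provided and guest-owned
 * debug registers, remember CS:RIP for single-step filtering in
 * kvm_set_rflags(), and let the backend update its intercepts.
 */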
4740 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4741                                         struct kvm_guest_debug *dbg)
4742 {
4743         unsigned long rflags;
4744         int i, r;
4745
4746         vcpu_load(vcpu);
4747
4748         if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4749                 r = -EBUSY;
4750                 if (vcpu->arch.exception.pending)
4751                         goto unlock_out;
4752                 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4753                         kvm_queue_exception(vcpu, DB_VECTOR);
4754                 else
4755                         kvm_queue_exception(vcpu, BP_VECTOR);
4756         }
4757
4758         /*
4759          * Read rflags as long as potentially injected trace flags are still
4760          * filtered out.
4761          */
4762         rflags = kvm_get_rflags(vcpu);
4763
4764         vcpu->guest_debug = dbg->control;
4765         if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4766                 vcpu->guest_debug = 0;
4767
4768         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4769                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4770                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4771                 vcpu->arch.switch_db_regs =
4772                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4773         } else {
4774                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4775                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4776                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4777         }
4778
4779         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4780                 vcpu->arch.singlestep_cs =
4781                         get_segment_selector(vcpu, VCPU_SREG_CS);
4782                 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4783         }
4784
4785         /*
4786          * Trigger an rflags update that will inject or remove the trace
4787          * flags.
4788          */
4789         kvm_set_rflags(vcpu, rflags);
4790
4791         kvm_x86_ops->set_guest_debug(vcpu, dbg);
4792
4793         r = 0;
4794
4795 unlock_out:
4796         vcpu_put(vcpu);
4797
4798         return r;
4799 }
4800
4801 /*
4802  * fxsave fpu state.  Taken from x86_64/processor.h.  To be removed once
4803  * asm/x86/processor.h provides it.
4804  */
4805 struct fxsave {
4806         u16     cwd;
4807         u16     swd;
4808         u16     twd;
4809         u16     fop;
4810         u64     rip;
4811         u64     rdp;
4812         u32     mxcsr;
4813         u32     mxcsr_mask;
4814         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4815 #ifdef CONFIG_X86_64
4816         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4817 #else
4818         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4819 #endif
4820 };
4821
4822 /*
4823  * Translate a guest virtual address to a guest physical address.
4824  */
4825 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4826                                     struct kvm_translation *tr)
4827 {
4828         unsigned long vaddr = tr->linear_address;
4829         gpa_t gpa;
4830
4831         vcpu_load(vcpu);
4832         down_read(&vcpu->kvm->slots_lock);
4833         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4834         up_read(&vcpu->kvm->slots_lock);
4835         tr->physical_address = gpa;
4836         tr->valid = gpa != UNMAPPED_GVA;
4837         tr->writeable = 1;
4838         tr->usermode = 0;
4839         vcpu_put(vcpu);
4840
4841         return 0;
4842 }
4843
4844 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4845 {
4846         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4847
4848         vcpu_load(vcpu);
4849
4850         memcpy(fpu->fpr, fxsave->st_space, 128);
4851         fpu->fcw = fxsave->cwd;
4852         fpu->fsw = fxsave->swd;
4853         fpu->ftwx = fxsave->twd;
4854         fpu->last_opcode = fxsave->fop;
4855         fpu->last_ip = fxsave->rip;
4856         fpu->last_dp = fxsave->rdp;
4857         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4858
4859         vcpu_put(vcpu);
4860
4861         return 0;
4862 }
4863
4864 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4865 {
4866         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4867
4868         vcpu_load(vcpu);
4869
4870         memcpy(fxsave->st_space, fpu->fpr, 128);
4871         fxsave->cwd = fpu->fcw;
4872         fxsave->swd = fpu->fsw;
4873         fxsave->twd = fpu->ftwx;
4874         fxsave->fop = fpu->last_opcode;
4875         fxsave->rip = fpu->last_ip;
4876         fxsave->rdp = fpu->last_dp;
4877         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4878
4879         vcpu_put(vcpu);
4880
4881         return 0;
4882 }
4883
4884 void fx_init(struct kvm_vcpu *vcpu)
4885 {
4886         unsigned after_mxcsr_mask;
4887
4888         /*
4889          * Touch the FPU the first time in non-atomic context so that, if
4890          * this is the first FPU instruction, the exception handler fires
4891          * before the instruction returns and can allocate RAM with
4892          * GFP_KERNEL.
4893          */
4894         if (!used_math())
4895                 kvm_fx_save(&vcpu->arch.host_fx_image);
4896
4897         /* Initialize guest FPU by resetting ours and saving into guest's */
4898         preempt_disable();
4899         kvm_fx_save(&vcpu->arch.host_fx_image);
4900         kvm_fx_finit();
4901         kvm_fx_save(&vcpu->arch.guest_fx_image);
4902         kvm_fx_restore(&vcpu->arch.host_fx_image);
4903         preempt_enable();
4904
4905         vcpu->arch.cr0 |= X86_CR0_ET;
4906         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4907         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4908         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4909                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4910 }
4911 EXPORT_SYMBOL_GPL(fx_init);
4912
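/*
 * Load the guest fxsave image onto the CPU, saving the host image
 * first; a no-op if the FPU is not active for this vcpu or the guest
 * image is already loaded.  kvm_put_guest_fpu() below is the reverse
 * operation.
 */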
4913 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4914 {
4915         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4916                 return;
4917
4918         vcpu->guest_fpu_loaded = 1;
4919         kvm_fx_save(&vcpu->arch.host_fx_image);
4920         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4921 }
4922 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4923
4924 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4925 {
4926         if (!vcpu->guest_fpu_loaded)
4927                 return;
4928
4929         vcpu->guest_fpu_loaded = 0;
4930         kvm_fx_save(&vcpu->arch.guest_fx_image);
4931         kvm_fx_restore(&vcpu->arch.host_fx_image);
4932         ++vcpu->stat.fpu_reload;
4933 }
4934 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4935
4936 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4937 {
4938         if (vcpu->arch.time_page) {
4939                 kvm_release_page_dirty(vcpu->arch.time_page);
4940                 vcpu->arch.time_page = NULL;
4941         }
4942
4943         kvm_x86_ops->vcpu_free(vcpu);
4944 }
4945
4946 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4947                                                 unsigned int id)
4948 {
4949         return kvm_x86_ops->vcpu_create(kvm, id);
4950 }
4951
4952 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4953 {
4954         int r;
4955
4956         /* We do fxsave: this must be aligned. */
4957         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4958
4959         vcpu->arch.mtrr_state.have_fixed = 1;
4960         vcpu_load(vcpu);
4961         r = kvm_arch_vcpu_reset(vcpu);
4962         if (r == 0)
4963                 r = kvm_mmu_setup(vcpu);
4964         vcpu_put(vcpu);
4965         if (r < 0)
4966                 goto free_vcpu;
4967
4968         return 0;
4969 free_vcpu:
4970         kvm_x86_ops->vcpu_free(vcpu);
4971         return r;
4972 }
4973
4974 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4975 {
4976         vcpu_load(vcpu);
4977         kvm_mmu_unload(vcpu);
4978         vcpu_put(vcpu);
4979
4980         kvm_x86_ops->vcpu_free(vcpu);
4981 }
4982
4983 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4984 {
4985         vcpu->arch.nmi_pending = false;
4986         vcpu->arch.nmi_injected = false;
4987
4988         vcpu->arch.switch_db_regs = 0;
4989         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4990         vcpu->arch.dr6 = DR6_FIXED_1;
4991         vcpu->arch.dr7 = DR7_FIXED_1;
4992
4993         return kvm_x86_ops->vcpu_reset(vcpu);
4994 }
4995
4996 int kvm_arch_hardware_enable(void *garbage)
4997 {
4998         /*
4999          * Since this may be called from a hotplug notification,
5000          * we can't get the CPU frequency directly.
5001          */
5002         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5003                 int cpu = raw_smp_processor_id();
5004                 per_cpu(cpu_tsc_khz, cpu) = 0;
5005         }
5006
5007         kvm_shared_msr_cpu_online();
5008
5009         return kvm_x86_ops->hardware_enable(garbage);
5010 }
5011
5012 void kvm_arch_hardware_disable(void *garbage)
5013 {
5014         kvm_x86_ops->hardware_disable(garbage);
5015         drop_user_return_notifiers(garbage);
5016 }
5017
5018 int kvm_arch_hardware_setup(void)
5019 {
5020         return kvm_x86_ops->hardware_setup();
5021 }
5022
5023 void kvm_arch_hardware_unsetup(void)
5024 {
5025         kvm_x86_ops->hardware_unsetup();
5026 }
5027
5028 void kvm_arch_check_processor_compat(void *rtn)
5029 {
5030         kvm_x86_ops->check_processor_compatibility(rtn);
5031 }
5032
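/*
 * Architecture-specific vcpu initialization: invalidate the MMU root,
 * pick the initial MP state, allocate the PIO scratch page, create the
 * MMU, create the in-kernel local APIC when the irqchip lives in the
 * kernel, and allocate the MCE bank array.
 */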
5033 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5034 {
5035         struct page *page;
5036         struct kvm *kvm;
5037         int r;
5038
5039         BUG_ON(vcpu->kvm == NULL);
5040         kvm = vcpu->kvm;
5041
5042         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
5043         if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
5044                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5045         else
5046                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
5047
5048         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
5049         if (!page) {
5050                 r = -ENOMEM;
5051                 goto fail;
5052         }
5053         vcpu->arch.pio_data = page_address(page);
5054
5055         r = kvm_mmu_create(vcpu);
5056         if (r < 0)
5057                 goto fail_free_pio_data;
5058
5059         if (irqchip_in_kernel(kvm)) {
5060                 r = kvm_create_lapic(vcpu);
5061                 if (r < 0)
5062                         goto fail_mmu_destroy;
5063         }
5064
5065         vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
5066                                        GFP_KERNEL);
5067         if (!vcpu->arch.mce_banks) {
5068                 r = -ENOMEM;
5069                 goto fail_mmu_destroy;
5070         }
5071         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
5072
5073         return 0;
5074
5075 fail_mmu_destroy:
5076         kvm_mmu_destroy(vcpu);
5077 fail_free_pio_data:
5078         free_page((unsigned long)vcpu->arch.pio_data);
5079 fail:
5080         return r;
5081 }
5082
5083 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5084 {
5085         kvm_free_lapic(vcpu);
5086         down_read(&vcpu->kvm->slots_lock);
5087         kvm_mmu_destroy(vcpu);
5088         up_read(&vcpu->kvm->slots_lock);
5089         free_page((unsigned long)vcpu->arch.pio_data);
5090 }
5091
5092 struct  kvm *kvm_arch_create_vm(void)
5093 {
5094         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5095
5096         if (!kvm)
5097                 return ERR_PTR(-ENOMEM);
5098
5099         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5100         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5101
5102         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
5103         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
5104
5105         rdtscll(kvm->arch.vm_init_tsc);
5106
5107         return kvm;
5108 }
5109
5110 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
5111 {
5112         vcpu_load(vcpu);
5113         kvm_mmu_unload(vcpu);
5114         vcpu_put(vcpu);
5115 }
5116
5117 static void kvm_free_vcpus(struct kvm *kvm)
5118 {
5119         unsigned int i;
5120         struct kvm_vcpu *vcpu;
5121
5122         /*
5123          * Unpin any mmu pages first.
5124          */
5125         kvm_for_each_vcpu(i, vcpu, kvm)
5126                 kvm_unload_vcpu_mmu(vcpu);
5127         kvm_for_each_vcpu(i, vcpu, kvm)
5128                 kvm_arch_vcpu_free(vcpu);
5129
5130         mutex_lock(&kvm->lock);
5131         for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
5132                 kvm->vcpus[i] = NULL;
5133
5134         atomic_set(&kvm->online_vcpus, 0);
5135         mutex_unlock(&kvm->lock);
5136 }
5137
5138 void kvm_arch_sync_events(struct kvm *kvm)
5139 {
5140         kvm_free_all_assigned_devices(kvm);
5141 }
5142
5143 void kvm_arch_destroy_vm(struct kvm *kvm)
5144 {
5145         kvm_iommu_unmap_guest(kvm);
5146         kvm_free_pit(kvm);
5147         kfree(kvm->arch.vpic);
5148         kfree(kvm->arch.vioapic);
5149         kvm_free_vcpus(kvm);
5150         kvm_free_physmem(kvm);
5151         if (kvm->arch.apic_access_page)
5152                 put_page(kvm->arch.apic_access_page);
5153         if (kvm->arch.ept_identity_pagetable)
5154                 put_page(kvm->arch.ept_identity_pagetable);
5155         kfree(kvm);
5156 }
5157
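/*
 * Architecture hook for memory slot updates.  For legacy (!user_alloc)
 * slots the backing anonymous memory is mmap'ed or munmap'ed here on
 * behalf of userspace; afterwards the number of MMU pages is
 * recalculated (unless userspace requested a fixed count) and write
 * access to the slot is removed.
 */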
5158 int kvm_arch_set_memory_region(struct kvm *kvm,
5159                                 struct kvm_userspace_memory_region *mem,
5160                                 struct kvm_memory_slot old,
5161                                 int user_alloc)
5162 {
5163         int npages = mem->memory_size >> PAGE_SHIFT;
5164         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
5165
5166         /* To keep backward compatibility with older userspace,
5167          * x86 needs to handle the !user_alloc case.
5168          */
5169         if (!user_alloc) {
5170                 if (npages && !old.rmap) {
5171                         unsigned long userspace_addr;
5172
5173                         down_write(&current->mm->mmap_sem);
5174                         userspace_addr = do_mmap(NULL, 0,
5175                                                  npages * PAGE_SIZE,
5176                                                  PROT_READ | PROT_WRITE,
5177                                                  MAP_PRIVATE | MAP_ANONYMOUS,
5178                                                  0);
5179                         up_write(&current->mm->mmap_sem);
5180
5181                         if (IS_ERR((void *)userspace_addr))
5182                                 return PTR_ERR((void *)userspace_addr);
5183
5184                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
5185                         spin_lock(&kvm->mmu_lock);
5186                         memslot->userspace_addr = userspace_addr;
5187                         spin_unlock(&kvm->mmu_lock);
5188                 } else {
5189                         if (!old.user_alloc && old.rmap) {
5190                                 int ret;
5191
5192                                 down_write(&current->mm->mmap_sem);
5193                                 ret = do_munmap(current->mm, old.userspace_addr,
5194                                                 old.npages * PAGE_SIZE);
5195                                 up_write(&current->mm->mmap_sem);
5196                                 if (ret < 0)
5197                                         printk(KERN_WARNING
5198                                        "kvm_vm_ioctl_set_memory_region: "
5199                                        "failed to munmap memory\n");
5200                         }
5201                 }
5202         }
5203
5204         spin_lock(&kvm->mmu_lock);
5205         if (!kvm->arch.n_requested_mmu_pages) {
5206                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
5207                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
5208         }
5209
5210         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
5211         spin_unlock(&kvm->mmu_lock);
5212
5213         return 0;
5214 }
5215
5216 void kvm_arch_flush_shadow(struct kvm *kvm)
5217 {
5218         kvm_mmu_zap_all(kvm);
5219         kvm_reload_remote_mmus(kvm);
5220 }
5221
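/*
 * A vcpu is runnable if it is in the RUNNABLE or SIPI_RECEIVED state,
 * has an NMI pending, or has an interrupt pending that it is currently
 * able to take.
 */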
5222 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
5223 {
5224         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
5225                 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
5226                 || vcpu->arch.nmi_pending ||
5227                 (kvm_arch_interrupt_allowed(vcpu) &&
5228                  kvm_cpu_has_interrupt(vcpu));
5229 }
5230
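/*
 * Force a vcpu to leave guest mode or halt: wake it up if it is
 * sleeping on its waitqueue, and send a reschedule IPI to its physical
 * CPU unless KVM_REQ_KICK is already set, in which case the vcpu is
 * either outside guest mode or has already been kicked.
 */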
5231 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5232 {
5233         int me;
5234         int cpu = vcpu->cpu;
5235
5236         if (waitqueue_active(&vcpu->wq)) {
5237                 wake_up_interruptible(&vcpu->wq);
5238                 ++vcpu->stat.halt_wakeup;
5239         }
5240
5241         me = get_cpu();
5242         if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5243                 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
5244                         smp_send_reschedule(cpu);
5245         put_cpu();
5246 }
5247
5248 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5249 {
5250         return kvm_x86_ops->interrupt_allowed(vcpu);
5251 }
5252
5253 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5254 {
5255         unsigned long rflags;
5256
5257         rflags = kvm_x86_ops->get_rflags(vcpu);
5258         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5259                 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5260         return rflags;
5261 }
5262 EXPORT_SYMBOL_GPL(kvm_get_rflags);
5263
5264 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5265 {
5266         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5267             vcpu->arch.singlestep_cs ==
5268                         get_segment_selector(vcpu, VCPU_SREG_CS) &&
5269             vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5270                 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5271         kvm_x86_ops->set_rflags(vcpu, rflags);
5272 }
5273 EXPORT_SYMBOL_GPL(kvm_set_rflags);
5274
5275 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
5276 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
5277 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
5278 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
5279 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5280 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5281 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5282 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5283 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5284 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5285 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);