b364d192896cc381d832a29539405c57bdef28b3
[safe/jmp/linux-2.6] / arch / x86 / kvm / x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  *
8  * Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This work is licensed under the terms of the GNU GPL, version 2.  See
13  * the COPYING file in the top-level directory.
14  *
15  */
16
17 #include <linux/kvm_host.h>
18 #include "irq.h"
19 #include "mmu.h"
20 #include "i8254.h"
21 #include "tss.h"
22
23 #include <linux/clocksource.h>
24 #include <linux/kvm.h>
25 #include <linux/fs.h>
26 #include <linux/vmalloc.h>
27 #include <linux/module.h>
28 #include <linux/mman.h>
29 #include <linux/highmem.h>
30
31 #include <asm/uaccess.h>
32 #include <asm/msr.h>
33 #include <asm/desc.h>
34
/*
 * Bound on the entry count accepted by msr_io(): a request whose nmsrs is
 * greater than or equal to this value is refused with -E2BIG.
 */
35 #define MAX_IO_MSRS 256
/*
 * Every CR0 bit that is NOT one of the flags listed below.  kvm_set_cr0()
 * injects #GP when the new value has any of these bits set.
 */
36 #define CR0_RESERVED_BITS                                               \
37         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
38                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
39                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
/*
 * Every CR4 bit that is NOT one of the flags listed below.  kvm_set_cr4()
 * injects #GP when the new value has any of these bits set.  Note that
 * X86_CR4_VMXE is not part of this mask; kvm_set_cr4() rejects it with a
 * separate check.
 */
40 #define CR4_RESERVED_BITS                                               \
41         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
42                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
43                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
44                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
45
/* Every CR8 bit outside X86_CR8_TPR; kvm_set_cr8() injects #GP on these. */
46 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
47 /* EFER defaults:
48  * - enable syscall by default because it is emulated by KVM
49  * - enable LME and LMA by default on 64 bit KVM
 *
 * A bit set in efer_reserved_bits makes set_efer() inject #GP when the
 * guest tries to set it.  The 64-bit mask leaves bits 0, 8 and 10 clear;
 * the 32-bit mask leaves only bit 0 clear.  kvm_enable_efer_bits() clears
 * further bits at run time.
50  */
51 #ifdef CONFIG_X86_64
52 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
53 #else
54 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
55 #endif
56
/*
 * Helpers for filling debugfs_entries[]: each expands to TWO initializers,
 * the byte offset of counter stat.x inside struct kvm / struct kvm_vcpu
 * followed by the matching KVM_STAT_VM / KVM_STAT_VCPU kind.
 */
57 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
58 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
59
/* Forward declaration; used by kvm_arch_dev_ioctl() before its definition. */
60 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
61                                     struct kvm_cpuid_entry2 __user *entries);
62
/*
 * Table of backend hooks (set_cr0, set_msr, vcpu_load, ...) that the
 * functions in this file call through.  Where it gets assigned is not
 * visible in this part of the file.
 */
63 struct kvm_x86_ops *kvm_x86_ops;
64
/*
 * Statistics table: each entry pairs a name with the location of a counter,
 * given as an offset into struct kvm_vcpu (VCPU_STAT) or struct kvm
 * (VM_STAT) plus the counter kind.  The list ends with a { NULL } sentinel.
 * NOTE(review): presumably consumed by generic KVM code to create debugfs
 * files - the consumer is not visible in this file section.
 */
65 struct kvm_stats_debugfs_item debugfs_entries[] = {
66         { "pf_fixed", VCPU_STAT(pf_fixed) },
67         { "pf_guest", VCPU_STAT(pf_guest) },
68         { "tlb_flush", VCPU_STAT(tlb_flush) },
69         { "invlpg", VCPU_STAT(invlpg) },
70         { "exits", VCPU_STAT(exits) },
71         { "io_exits", VCPU_STAT(io_exits) },
72         { "mmio_exits", VCPU_STAT(mmio_exits) },
73         { "signal_exits", VCPU_STAT(signal_exits) },
74         { "irq_window", VCPU_STAT(irq_window_exits) },
75         { "halt_exits", VCPU_STAT(halt_exits) },
76         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
77         { "hypercalls", VCPU_STAT(hypercalls) },
78         { "request_irq", VCPU_STAT(request_irq_exits) },
79         { "irq_exits", VCPU_STAT(irq_exits) },
80         { "host_state_reload", VCPU_STAT(host_state_reload) },
81         { "efer_reload", VCPU_STAT(efer_reload) },
82         { "fpu_reload", VCPU_STAT(fpu_reload) },
83         { "insn_emulation", VCPU_STAT(insn_emulation) },
84         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
85         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
86         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
87         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
88         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
89         { "mmu_flooded", VM_STAT(mmu_flooded) },
90         { "mmu_recycled", VM_STAT(mmu_recycled) },
91         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
92         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
/* Note: the exported name "largepages" maps to the counter field "lpages". */
93         { "largepages", VM_STAT(lpages) },
94         { NULL }
95 };
96
97
98 unsigned long segment_base(u16 selector)
99 {
100         struct descriptor_table gdt;
101         struct desc_struct *d;
102         unsigned long table_base;
103         unsigned long v;
104
105         if (selector == 0)
106                 return 0;
107
108         asm("sgdt %0" : "=m"(gdt));
109         table_base = gdt.base;
110
111         if (selector & 4) {           /* from ldt */
112                 u16 ldt_selector;
113
114                 asm("sldt %0" : "=g"(ldt_selector));
115                 table_base = segment_base(ldt_selector);
116         }
117         d = (struct desc_struct *)(table_base + (selector & ~7));
118         v = d->base0 | ((unsigned long)d->base1 << 16) |
119                 ((unsigned long)d->base2 << 24);
120 #ifdef CONFIG_X86_64
121         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
122                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
123 #endif
124         return v;
125 }
126 EXPORT_SYMBOL_GPL(segment_base);
127
128 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
129 {
130         if (irqchip_in_kernel(vcpu->kvm))
131                 return vcpu->arch.apic_base;
132         else
133                 return vcpu->arch.apic_base;
134 }
135 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
136
137 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
138 {
139         /* TODO: reserve bits check */
140         if (irqchip_in_kernel(vcpu->kvm))
141                 kvm_lapic_set_base(vcpu, data);
142         else
143                 vcpu->arch.apic_base = data;
144 }
145 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
146
147 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
148 {
149         WARN_ON(vcpu->arch.exception.pending);
150         vcpu->arch.exception.pending = true;
151         vcpu->arch.exception.has_error_code = false;
152         vcpu->arch.exception.nr = nr;
153 }
154 EXPORT_SYMBOL_GPL(kvm_queue_exception);
155
156 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
157                            u32 error_code)
158 {
159         ++vcpu->stat.pf_guest;
160         if (vcpu->arch.exception.pending) {
161                 if (vcpu->arch.exception.nr == PF_VECTOR) {
162                         printk(KERN_DEBUG "kvm: inject_page_fault:"
163                                         " double fault 0x%lx\n", addr);
164                         vcpu->arch.exception.nr = DF_VECTOR;
165                         vcpu->arch.exception.error_code = 0;
166                 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
167                         /* triple fault -> shutdown */
168                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
169                 }
170                 return;
171         }
172         vcpu->arch.cr2 = addr;
173         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
174 }
175
176 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
177 {
178         WARN_ON(vcpu->arch.exception.pending);
179         vcpu->arch.exception.pending = true;
180         vcpu->arch.exception.has_error_code = true;
181         vcpu->arch.exception.nr = nr;
182         vcpu->arch.exception.error_code = error_code;
183 }
184 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
185
/*
 * Pass the exception recorded in vcpu->arch.exception (vector, error-code
 * flag and error code) to the queue_exception hook of kvm_x86_ops.  The
 * pending flag is neither checked nor changed here.
 */
186 static void __queue_exception(struct kvm_vcpu *vcpu)
187 {
188         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
189                                      vcpu->arch.exception.has_error_code,
190                                      vcpu->arch.exception.error_code);
191 }
192
193 /*
194  * Load the pae pdptrs.  Return true is they are all valid.
195  */
196 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
197 {
198         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
199         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
200         int i;
201         int ret;
202         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
203
204         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
205                                   offset * sizeof(u64), sizeof(pdpte));
206         if (ret < 0) {
207                 ret = 0;
208                 goto out;
209         }
210         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
211                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
212                         ret = 0;
213                         goto out;
214                 }
215         }
216         ret = 1;
217
218         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
219 out:
220
221         return ret;
222 }
223 EXPORT_SYMBOL_GPL(load_pdptrs);
224
225 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
226 {
227         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
228         bool changed = true;
229         int r;
230
231         if (is_long_mode(vcpu) || !is_pae(vcpu))
232                 return false;
233
234         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
235         if (r < 0)
236                 goto out;
237         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
238 out:
239
240         return changed;
241 }
242
243 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
244 {
245         if (cr0 & CR0_RESERVED_BITS) {
246                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
247                        cr0, vcpu->arch.cr0);
248                 kvm_inject_gp(vcpu, 0);
249                 return;
250         }
251
252         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
253                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
254                 kvm_inject_gp(vcpu, 0);
255                 return;
256         }
257
258         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
259                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
260                        "and a clear PE flag\n");
261                 kvm_inject_gp(vcpu, 0);
262                 return;
263         }
264
265         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
266 #ifdef CONFIG_X86_64
267                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
268                         int cs_db, cs_l;
269
270                         if (!is_pae(vcpu)) {
271                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
272                                        "in long mode while PAE is disabled\n");
273                                 kvm_inject_gp(vcpu, 0);
274                                 return;
275                         }
276                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
277                         if (cs_l) {
278                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
279                                        "in long mode while CS.L == 1\n");
280                                 kvm_inject_gp(vcpu, 0);
281                                 return;
282
283                         }
284                 } else
285 #endif
286                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
287                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
288                                "reserved bits\n");
289                         kvm_inject_gp(vcpu, 0);
290                         return;
291                 }
292
293         }
294
295         kvm_x86_ops->set_cr0(vcpu, cr0);
296         vcpu->arch.cr0 = cr0;
297
298         kvm_mmu_reset_context(vcpu);
299         return;
300 }
301 EXPORT_SYMBOL_GPL(kvm_set_cr0);
302
303 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
304 {
305         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
306         KVMTRACE_1D(LMSW, vcpu,
307                     (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
308                     handler);
309 }
310 EXPORT_SYMBOL_GPL(kvm_lmsw);
311
312 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
313 {
314         if (cr4 & CR4_RESERVED_BITS) {
315                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
316                 kvm_inject_gp(vcpu, 0);
317                 return;
318         }
319
320         if (is_long_mode(vcpu)) {
321                 if (!(cr4 & X86_CR4_PAE)) {
322                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
323                                "in long mode\n");
324                         kvm_inject_gp(vcpu, 0);
325                         return;
326                 }
327         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
328                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
329                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
330                 kvm_inject_gp(vcpu, 0);
331                 return;
332         }
333
334         if (cr4 & X86_CR4_VMXE) {
335                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
336                 kvm_inject_gp(vcpu, 0);
337                 return;
338         }
339         kvm_x86_ops->set_cr4(vcpu, cr4);
340         vcpu->arch.cr4 = cr4;
341         kvm_mmu_reset_context(vcpu);
342 }
343 EXPORT_SYMBOL_GPL(kvm_set_cr4);
344
345 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
346 {
347         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
348                 kvm_mmu_flush_tlb(vcpu);
349                 return;
350         }
351
352         if (is_long_mode(vcpu)) {
353                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
354                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
355                         kvm_inject_gp(vcpu, 0);
356                         return;
357                 }
358         } else {
359                 if (is_pae(vcpu)) {
360                         if (cr3 & CR3_PAE_RESERVED_BITS) {
361                                 printk(KERN_DEBUG
362                                        "set_cr3: #GP, reserved bits\n");
363                                 kvm_inject_gp(vcpu, 0);
364                                 return;
365                         }
366                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
367                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
368                                        "reserved bits\n");
369                                 kvm_inject_gp(vcpu, 0);
370                                 return;
371                         }
372                 }
373                 /*
374                  * We don't check reserved bits in nonpae mode, because
375                  * this isn't enforced, and VMware depends on this.
376                  */
377         }
378
379         /*
380          * Does the new cr3 value map to physical memory? (Note, we
381          * catch an invalid cr3 even in real-mode, because it would
382          * cause trouble later on when we turn on paging anyway.)
383          *
384          * A real CPU would silently accept an invalid cr3 and would
385          * attempt to use it - with largely undefined (and often hard
386          * to debug) behavior on the guest side.
387          */
388         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
389                 kvm_inject_gp(vcpu, 0);
390         else {
391                 vcpu->arch.cr3 = cr3;
392                 vcpu->arch.mmu.new_cr3(vcpu);
393         }
394 }
395 EXPORT_SYMBOL_GPL(kvm_set_cr3);
396
397 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
398 {
399         if (cr8 & CR8_RESERVED_BITS) {
400                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
401                 kvm_inject_gp(vcpu, 0);
402                 return;
403         }
404         if (irqchip_in_kernel(vcpu->kvm))
405                 kvm_lapic_set_tpr(vcpu, cr8);
406         else
407                 vcpu->arch.cr8 = cr8;
408 }
409 EXPORT_SYMBOL_GPL(kvm_set_cr8);
410
411 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
412 {
413         if (irqchip_in_kernel(vcpu->kvm))
414                 return kvm_lapic_get_cr8(vcpu);
415         else
416                 return vcpu->arch.cr8;
417 }
418 EXPORT_SYMBOL_GPL(kvm_get_cr8);
419
420 /*
421  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
422  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
423  *
424  * This list is modified at module load time to reflect the
425  * capabilities of the host cpu.
426  */
427 static u32 msrs_to_save[] = {
428         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
429         MSR_K6_STAR,
430 #ifdef CONFIG_X86_64
431         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
432 #endif
433         MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
434         MSR_IA32_PERF_STATUS,
435 };
436
/*
 * Number of leading msrs_to_save[] entries that KVM_GET_MSR_INDEX_LIST
 * reports to userspace.  The code that sets it is not in this part of the
 * file.
 */
437 static unsigned num_msrs_to_save;
438
/*
 * Additional MSR indices that KVM_GET_MSR_INDEX_LIST reports after the
 * msrs_to_save[] entries.
 */
439 static u32 emulated_msrs[] = {
440         MSR_IA32_MISC_ENABLE,
441 };
442
443 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
444 {
445         if (efer & efer_reserved_bits) {
446                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
447                        efer);
448                 kvm_inject_gp(vcpu, 0);
449                 return;
450         }
451
452         if (is_paging(vcpu)
453             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
454                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
455                 kvm_inject_gp(vcpu, 0);
456                 return;
457         }
458
459         kvm_x86_ops->set_efer(vcpu, efer);
460
461         efer &= ~EFER_LMA;
462         efer |= vcpu->arch.shadow_efer & EFER_LMA;
463
464         vcpu->arch.shadow_efer = efer;
465 }
466
/*
 * Remove the bits in @mask from efer_reserved_bits, so that set_efer() no
 * longer rejects guest writes that set them.
 */
467 void kvm_enable_efer_bits(u64 mask)
468 {
469        efer_reserved_bits &= ~mask;
470 }
471 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
472
473
474 /*
475  * Writes msr value into the appropriate "register".
 * The work is delegated to the set_msr hook of kvm_x86_ops.
476  * Returns 0 on success, non-0 otherwise.
477  * Assumes vcpu_load() was already called.
478  */
479 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
480 {
481         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
482 }
483
484 /*
485  * Adapt set_msr() to msr_io()'s calling convention
 * (msr_io() callbacks receive the value through a pointer; it is
 * dereferenced here and forwarded to kvm_set_msr()).
486  */
487 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
488 {
489         return kvm_set_msr(vcpu, index, *data);
490 }
491
492 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
493 {
494         static int version;
495         struct kvm_wall_clock wc;
496         struct timespec wc_ts;
497
498         if (!wall_clock)
499                 return;
500
501         version++;
502
503         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
504
505         wc_ts = current_kernel_time();
506         wc.wc_sec = wc_ts.tv_sec;
507         wc.wc_nsec = wc_ts.tv_nsec;
508         wc.wc_version = version;
509
510         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
511
512         version++;
513         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
514 }
515
516 static void kvm_write_guest_time(struct kvm_vcpu *v)
517 {
518         struct timespec ts;
519         unsigned long flags;
520         struct kvm_vcpu_arch *vcpu = &v->arch;
521         void *shared_kaddr;
522
523         if ((!vcpu->time_page))
524                 return;
525
526         /* Keep irq disabled to prevent changes to the clock */
527         local_irq_save(flags);
528         kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
529                           &vcpu->hv_clock.tsc_timestamp);
530         ktime_get_ts(&ts);
531         local_irq_restore(flags);
532
533         /* With all the info we got, fill in the values */
534
535         vcpu->hv_clock.system_time = ts.tv_nsec +
536                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
537         /*
538          * The interface expects us to write an even number signaling that the
539          * update is finished. Since the guest won't see the intermediate
540          * state, we just write "2" at the end
541          */
542         vcpu->hv_clock.version = 2;
543
544         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
545
546         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
547                 sizeof(vcpu->hv_clock));
548
549         kunmap_atomic(shared_kaddr, KM_USER0);
550
551         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
552 }
553
554
555 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
556 {
557         switch (msr) {
558         case MSR_EFER:
559                 set_efer(vcpu, data);
560                 break;
561         case MSR_IA32_MC0_STATUS:
562                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
563                        __func__, data);
564                 break;
565         case MSR_IA32_MCG_STATUS:
566                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
567                         __func__, data);
568                 break;
569         case MSR_IA32_MCG_CTL:
570                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
571                         __func__, data);
572                 break;
573         case MSR_IA32_UCODE_REV:
574         case MSR_IA32_UCODE_WRITE:
575         case 0x200 ... 0x2ff: /* MTRRs */
576                 break;
577         case MSR_IA32_APICBASE:
578                 kvm_set_apic_base(vcpu, data);
579                 break;
580         case MSR_IA32_MISC_ENABLE:
581                 vcpu->arch.ia32_misc_enable_msr = data;
582                 break;
583         case MSR_KVM_WALL_CLOCK:
584                 vcpu->kvm->arch.wall_clock = data;
585                 kvm_write_wall_clock(vcpu->kvm, data);
586                 break;
587         case MSR_KVM_SYSTEM_TIME: {
588                 if (vcpu->arch.time_page) {
589                         kvm_release_page_dirty(vcpu->arch.time_page);
590                         vcpu->arch.time_page = NULL;
591                 }
592
593                 vcpu->arch.time = data;
594
595                 /* we verify if the enable bit is set... */
596                 if (!(data & 1))
597                         break;
598
599                 /* ...but clean it before doing the actual write */
600                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
601
602                 vcpu->arch.hv_clock.tsc_to_system_mul =
603                                         clocksource_khz2mult(tsc_khz, 22);
604                 vcpu->arch.hv_clock.tsc_shift = 22;
605
606                 down_read(&current->mm->mmap_sem);
607                 vcpu->arch.time_page =
608                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
609                 up_read(&current->mm->mmap_sem);
610
611                 if (is_error_page(vcpu->arch.time_page)) {
612                         kvm_release_page_clean(vcpu->arch.time_page);
613                         vcpu->arch.time_page = NULL;
614                 }
615
616                 kvm_write_guest_time(vcpu);
617                 break;
618         }
619         default:
620                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
621                 return 1;
622         }
623         return 0;
624 }
625 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
626
627
628 /*
629  * Reads an msr value (of 'msr_index') into 'pdata'.
 * The work is delegated to the get_msr hook of kvm_x86_ops.
630  * Returns 0 on success, non-0 otherwise.
631  * Assumes vcpu_load() was already called.
632  */
633 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
634 {
635         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
636 }
637
638 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
639 {
640         u64 data;
641
642         switch (msr) {
643         case 0xc0010010: /* SYSCFG */
644         case 0xc0010015: /* HWCR */
645         case MSR_IA32_PLATFORM_ID:
646         case MSR_IA32_P5_MC_ADDR:
647         case MSR_IA32_P5_MC_TYPE:
648         case MSR_IA32_MC0_CTL:
649         case MSR_IA32_MCG_STATUS:
650         case MSR_IA32_MCG_CAP:
651         case MSR_IA32_MCG_CTL:
652         case MSR_IA32_MC0_MISC:
653         case MSR_IA32_MC0_MISC+4:
654         case MSR_IA32_MC0_MISC+8:
655         case MSR_IA32_MC0_MISC+12:
656         case MSR_IA32_MC0_MISC+16:
657         case MSR_IA32_UCODE_REV:
658         case MSR_IA32_EBL_CR_POWERON:
659                 /* MTRR registers */
660         case 0xfe:
661         case 0x200 ... 0x2ff:
662                 data = 0;
663                 break;
664         case 0xcd: /* fsb frequency */
665                 data = 3;
666                 break;
667         case MSR_IA32_APICBASE:
668                 data = kvm_get_apic_base(vcpu);
669                 break;
670         case MSR_IA32_MISC_ENABLE:
671                 data = vcpu->arch.ia32_misc_enable_msr;
672                 break;
673         case MSR_IA32_PERF_STATUS:
674                 /* TSC increment by tick */
675                 data = 1000ULL;
676                 /* CPU multiplier */
677                 data |= (((uint64_t)4ULL) << 40);
678                 break;
679         case MSR_EFER:
680                 data = vcpu->arch.shadow_efer;
681                 break;
682         case MSR_KVM_WALL_CLOCK:
683                 data = vcpu->kvm->arch.wall_clock;
684                 break;
685         case MSR_KVM_SYSTEM_TIME:
686                 data = vcpu->arch.time;
687                 break;
688         default:
689                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
690                 return 1;
691         }
692         *pdata = data;
693         return 0;
694 }
695 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
696
697 /*
698  * Read or write a bunch of msrs. All parameters are kernel addresses.
699  *
700  * @return number of msrs set successfully.
701  */
702 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
703                     struct kvm_msr_entry *entries,
704                     int (*do_msr)(struct kvm_vcpu *vcpu,
705                                   unsigned index, u64 *data))
706 {
707         int i;
708
709         vcpu_load(vcpu);
710
711         down_read(&vcpu->kvm->slots_lock);
712         for (i = 0; i < msrs->nmsrs; ++i)
713                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
714                         break;
715         up_read(&vcpu->kvm->slots_lock);
716
717         vcpu_put(vcpu);
718
719         return i;
720 }
721
722 /*
723  * Read or write a bunch of msrs. Parameters are user addresses.
724  *
725  * @return number of msrs set successfully.
726  */
727 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
728                   int (*do_msr)(struct kvm_vcpu *vcpu,
729                                 unsigned index, u64 *data),
730                   int writeback)
731 {
732         struct kvm_msrs msrs;
733         struct kvm_msr_entry *entries;
734         int r, n;
735         unsigned size;
736
737         r = -EFAULT;
738         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
739                 goto out;
740
741         r = -E2BIG;
742         if (msrs.nmsrs >= MAX_IO_MSRS)
743                 goto out;
744
745         r = -ENOMEM;
746         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
747         entries = vmalloc(size);
748         if (!entries)
749                 goto out;
750
751         r = -EFAULT;
752         if (copy_from_user(entries, user_msrs->entries, size))
753                 goto out_free;
754
755         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
756         if (r < 0)
757                 goto out_free;
758
759         r = -EFAULT;
760         if (writeback && copy_to_user(user_msrs->entries, entries, size))
761                 goto out_free;
762
763         r = n;
764
765 out_free:
766         vfree(entries);
767 out:
768         return r;
769 }
770
771 /*
772  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
773  * cached on it.
774  */
775 void decache_vcpus_on_cpu(int cpu)
776 {
777         struct kvm *vm;
778         struct kvm_vcpu *vcpu;
779         int i;
780
781         spin_lock(&kvm_lock);
782         list_for_each_entry(vm, &vm_list, vm_list)
783                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
784                         vcpu = vm->vcpus[i];
785                         if (!vcpu)
786                                 continue;
787                         /*
788                          * If the vcpu is locked, then it is running on some
789                          * other cpu and therefore it is not cached on the
790                          * cpu in question.
791                          *
792                          * If it's not locked, check the last cpu it executed
793                          * on.
794                          */
795                         if (mutex_trylock(&vcpu->mutex)) {
796                                 if (vcpu->cpu == cpu) {
797                                         kvm_x86_ops->vcpu_decache(vcpu);
798                                         vcpu->cpu = -1;
799                                 }
800                                 mutex_unlock(&vcpu->mutex);
801                         }
802                 }
803         spin_unlock(&kvm_lock);
804 }
805
806 int kvm_dev_ioctl_check_extension(long ext)
807 {
808         int r;
809
810         switch (ext) {
811         case KVM_CAP_IRQCHIP:
812         case KVM_CAP_HLT:
813         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
814         case KVM_CAP_USER_MEMORY:
815         case KVM_CAP_SET_TSS_ADDR:
816         case KVM_CAP_EXT_CPUID:
817         case KVM_CAP_CLOCKSOURCE:
818         case KVM_CAP_PIT:
819         case KVM_CAP_NOP_IO_DELAY:
820                 r = 1;
821                 break;
822         case KVM_CAP_VAPIC:
823                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
824                 break;
825         case KVM_CAP_NR_VCPUS:
826                 r = KVM_MAX_VCPUS;
827                 break;
828         case KVM_CAP_NR_MEMSLOTS:
829                 r = KVM_MEMORY_SLOTS;
830                 break;
831         case KVM_CAP_PV_MMU:
832                 r = !tdp_enabled;
833                 break;
834         default:
835                 r = 0;
836                 break;
837         }
838         return r;
839
840 }
841
842 long kvm_arch_dev_ioctl(struct file *filp,
843                         unsigned int ioctl, unsigned long arg)
844 {
845         void __user *argp = (void __user *)arg;
846         long r;
847
848         switch (ioctl) {
849         case KVM_GET_MSR_INDEX_LIST: {
850                 struct kvm_msr_list __user *user_msr_list = argp;
851                 struct kvm_msr_list msr_list;
852                 unsigned n;
853
854                 r = -EFAULT;
855                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
856                         goto out;
857                 n = msr_list.nmsrs;
858                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
859                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
860                         goto out;
861                 r = -E2BIG;
862                 if (n < num_msrs_to_save)
863                         goto out;
864                 r = -EFAULT;
865                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
866                                  num_msrs_to_save * sizeof(u32)))
867                         goto out;
868                 if (copy_to_user(user_msr_list->indices
869                                  + num_msrs_to_save * sizeof(u32),
870                                  &emulated_msrs,
871                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
872                         goto out;
873                 r = 0;
874                 break;
875         }
876         case KVM_GET_SUPPORTED_CPUID: {
877                 struct kvm_cpuid2 __user *cpuid_arg = argp;
878                 struct kvm_cpuid2 cpuid;
879
880                 r = -EFAULT;
881                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
882                         goto out;
883                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
884                         cpuid_arg->entries);
885                 if (r)
886                         goto out;
887
888                 r = -EFAULT;
889                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
890                         goto out;
891                 r = 0;
892                 break;
893         }
894         default:
895                 r = -EINVAL;
896         }
897 out:
898         return r;
899 }
900
901 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
902 {
903         kvm_x86_ops->vcpu_load(vcpu, cpu);
904         kvm_write_guest_time(vcpu);
905 }
906
907 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
908 {
909         kvm_x86_ops->vcpu_put(vcpu);
910         kvm_put_guest_fpu(vcpu);
911 }
912
913 static int is_efer_nx(void)
914 {
915         u64 efer;
916
917         rdmsrl(MSR_EFER, efer);
918         return efer & EFER_NX;
919 }
920
921 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
922 {
923         int i;
924         struct kvm_cpuid_entry2 *e, *entry;
925
926         entry = NULL;
927         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
928                 e = &vcpu->arch.cpuid_entries[i];
929                 if (e->function == 0x80000001) {
930                         entry = e;
931                         break;
932                 }
933         }
934         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
935                 entry->edx &= ~(1 << 20);
936                 printk(KERN_INFO "kvm: guest NX capability removed\n");
937         }
938 }
939
940 /* when an old userspace process fills a new kernel module */
941 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
942                                     struct kvm_cpuid *cpuid,
943                                     struct kvm_cpuid_entry __user *entries)
944 {
945         int r, i;
946         struct kvm_cpuid_entry *cpuid_entries;
947
948         r = -E2BIG;
949         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
950                 goto out;
951         r = -ENOMEM;
952         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
953         if (!cpuid_entries)
954                 goto out;
955         r = -EFAULT;
956         if (copy_from_user(cpuid_entries, entries,
957                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
958                 goto out_free;
959         for (i = 0; i < cpuid->nent; i++) {
960                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
961                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
962                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
963                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
964                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
965                 vcpu->arch.cpuid_entries[i].index = 0;
966                 vcpu->arch.cpuid_entries[i].flags = 0;
967                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
968                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
969                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
970         }
971         vcpu->arch.cpuid_nent = cpuid->nent;
972         cpuid_fix_nx_cap(vcpu);
973         r = 0;
974
975 out_free:
976         vfree(cpuid_entries);
977 out:
978         return r;
979 }
980
981 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
982                                     struct kvm_cpuid2 *cpuid,
983                                     struct kvm_cpuid_entry2 __user *entries)
984 {
985         int r;
986
987         r = -E2BIG;
988         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
989                 goto out;
990         r = -EFAULT;
991         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
992                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
993                 goto out;
994         vcpu->arch.cpuid_nent = cpuid->nent;
995         return 0;
996
997 out:
998         return r;
999 }
1000
1001 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1002                                     struct kvm_cpuid2 *cpuid,
1003                                     struct kvm_cpuid_entry2 __user *entries)
1004 {
1005         int r;
1006
1007         r = -E2BIG;
1008         if (cpuid->nent < vcpu->arch.cpuid_nent)
1009                 goto out;
1010         r = -EFAULT;
1011         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1012                            vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1013                 goto out;
1014         return 0;
1015
1016 out:
1017         cpuid->nent = vcpu->arch.cpuid_nent;
1018         return r;
1019 }
1020
1021 static inline u32 bit(int bitno)
1022 {
1023         return 1 << (bitno & 31);
1024 }
1025
1026 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1027                           u32 index)
1028 {
1029         entry->function = function;
1030         entry->index = index;
1031         cpuid_count(entry->function, entry->index,
1032                 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1033         entry->flags = 0;
1034 }
1035
1036 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1037                          u32 index, int *nent, int maxnent)
1038 {
1039         const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1040                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1041                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1042                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1043                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1044                 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1045                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1046                 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1047                 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1048                 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1049         const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1050                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1051                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1052                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1053                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1054                 bit(X86_FEATURE_PGE) |
1055                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1056                 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1057                 bit(X86_FEATURE_SYSCALL) |
1058                 (bit(X86_FEATURE_NX) && is_efer_nx()) |
1059 #ifdef CONFIG_X86_64
1060                 bit(X86_FEATURE_LM) |
1061 #endif
1062                 bit(X86_FEATURE_MMXEXT) |
1063                 bit(X86_FEATURE_3DNOWEXT) |
1064                 bit(X86_FEATURE_3DNOW);
1065         const u32 kvm_supported_word3_x86_features =
1066                 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1067         const u32 kvm_supported_word6_x86_features =
1068                 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
1069
1070         /* all func 2 cpuid_count() should be called on the same cpu */
1071         get_cpu();
1072         do_cpuid_1_ent(entry, function, index);
1073         ++*nent;
1074
1075         switch (function) {
1076         case 0:
1077                 entry->eax = min(entry->eax, (u32)0xb);
1078                 break;
1079         case 1:
1080                 entry->edx &= kvm_supported_word0_x86_features;
1081                 entry->ecx &= kvm_supported_word3_x86_features;
1082                 break;
1083         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1084          * may return different values. This forces us to get_cpu() before
1085          * issuing the first command, and also to emulate this annoying behavior
1086          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1087         case 2: {
1088                 int t, times = entry->eax & 0xff;
1089
1090                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1091                 for (t = 1; t < times && *nent < maxnent; ++t) {
1092                         do_cpuid_1_ent(&entry[t], function, 0);
1093                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1094                         ++*nent;
1095                 }
1096                 break;
1097         }
1098         /* function 4 and 0xb have additional index. */
1099         case 4: {
1100                 int i, cache_type;
1101
1102                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1103                 /* read more entries until cache_type is zero */
1104                 for (i = 1; *nent < maxnent; ++i) {
1105                         cache_type = entry[i - 1].eax & 0x1f;
1106                         if (!cache_type)
1107                                 break;
1108                         do_cpuid_1_ent(&entry[i], function, i);
1109                         entry[i].flags |=
1110                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1111                         ++*nent;
1112                 }
1113                 break;
1114         }
1115         case 0xb: {
1116                 int i, level_type;
1117
1118                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1119                 /* read more entries until level_type is zero */
1120                 for (i = 1; *nent < maxnent; ++i) {
1121                         level_type = entry[i - 1].ecx & 0xff;
1122                         if (!level_type)
1123                                 break;
1124                         do_cpuid_1_ent(&entry[i], function, i);
1125                         entry[i].flags |=
1126                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1127                         ++*nent;
1128                 }
1129                 break;
1130         }
1131         case 0x80000000:
1132                 entry->eax = min(entry->eax, 0x8000001a);
1133                 break;
1134         case 0x80000001:
1135                 entry->edx &= kvm_supported_word1_x86_features;
1136                 entry->ecx &= kvm_supported_word6_x86_features;
1137                 break;
1138         }
1139         put_cpu();
1140 }
1141
1142 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1143                                     struct kvm_cpuid_entry2 __user *entries)
1144 {
1145         struct kvm_cpuid_entry2 *cpuid_entries;
1146         int limit, nent = 0, r = -E2BIG;
1147         u32 func;
1148
1149         if (cpuid->nent < 1)
1150                 goto out;
1151         r = -ENOMEM;
1152         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1153         if (!cpuid_entries)
1154                 goto out;
1155
1156         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1157         limit = cpuid_entries[0].eax;
1158         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1159                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1160                                 &nent, cpuid->nent);
1161         r = -E2BIG;
1162         if (nent >= cpuid->nent)
1163                 goto out_free;
1164
1165         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1166         limit = cpuid_entries[nent - 1].eax;
1167         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1168                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1169                                &nent, cpuid->nent);
1170         r = -EFAULT;
1171         if (copy_to_user(entries, cpuid_entries,
1172                         nent * sizeof(struct kvm_cpuid_entry2)))
1173                 goto out_free;
1174         cpuid->nent = nent;
1175         r = 0;
1176
1177 out_free:
1178         vfree(cpuid_entries);
1179 out:
1180         return r;
1181 }
1182
1183 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1184                                     struct kvm_lapic_state *s)
1185 {
1186         vcpu_load(vcpu);
1187         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1188         vcpu_put(vcpu);
1189
1190         return 0;
1191 }
1192
1193 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1194                                     struct kvm_lapic_state *s)
1195 {
1196         vcpu_load(vcpu);
1197         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1198         kvm_apic_post_state_restore(vcpu);
1199         vcpu_put(vcpu);
1200
1201         return 0;
1202 }
1203
1204 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1205                                     struct kvm_interrupt *irq)
1206 {
1207         if (irq->irq < 0 || irq->irq >= 256)
1208                 return -EINVAL;
1209         if (irqchip_in_kernel(vcpu->kvm))
1210                 return -ENXIO;
1211         vcpu_load(vcpu);
1212
1213         set_bit(irq->irq, vcpu->arch.irq_pending);
1214         set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1215
1216         vcpu_put(vcpu);
1217
1218         return 0;
1219 }
1220
1221 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1222                                            struct kvm_tpr_access_ctl *tac)
1223 {
1224         if (tac->flags)
1225                 return -EINVAL;
1226         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1227         return 0;
1228 }
1229
1230 long kvm_arch_vcpu_ioctl(struct file *filp,
1231                          unsigned int ioctl, unsigned long arg)
1232 {
1233         struct kvm_vcpu *vcpu = filp->private_data;
1234         void __user *argp = (void __user *)arg;
1235         int r;
1236
1237         switch (ioctl) {
1238         case KVM_GET_LAPIC: {
1239                 struct kvm_lapic_state lapic;
1240
1241                 memset(&lapic, 0, sizeof lapic);
1242                 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1243                 if (r)
1244                         goto out;
1245                 r = -EFAULT;
1246                 if (copy_to_user(argp, &lapic, sizeof lapic))
1247                         goto out;
1248                 r = 0;
1249                 break;
1250         }
1251         case KVM_SET_LAPIC: {
1252                 struct kvm_lapic_state lapic;
1253
1254                 r = -EFAULT;
1255                 if (copy_from_user(&lapic, argp, sizeof lapic))
1256                         goto out;
1257                 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
1258                 if (r)
1259                         goto out;
1260                 r = 0;
1261                 break;
1262         }
1263         case KVM_INTERRUPT: {
1264                 struct kvm_interrupt irq;
1265
1266                 r = -EFAULT;
1267                 if (copy_from_user(&irq, argp, sizeof irq))
1268                         goto out;
1269                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1270                 if (r)
1271                         goto out;
1272                 r = 0;
1273                 break;
1274         }
1275         case KVM_SET_CPUID: {
1276                 struct kvm_cpuid __user *cpuid_arg = argp;
1277                 struct kvm_cpuid cpuid;
1278
1279                 r = -EFAULT;
1280                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1281                         goto out;
1282                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1283                 if (r)
1284                         goto out;
1285                 break;
1286         }
1287         case KVM_SET_CPUID2: {
1288                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1289                 struct kvm_cpuid2 cpuid;
1290
1291                 r = -EFAULT;
1292                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1293                         goto out;
1294                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1295                                 cpuid_arg->entries);
1296                 if (r)
1297                         goto out;
1298                 break;
1299         }
1300         case KVM_GET_CPUID2: {
1301                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1302                 struct kvm_cpuid2 cpuid;
1303
1304                 r = -EFAULT;
1305                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1306                         goto out;
1307                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1308                                 cpuid_arg->entries);
1309                 if (r)
1310                         goto out;
1311                 r = -EFAULT;
1312                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1313                         goto out;
1314                 r = 0;
1315                 break;
1316         }
1317         case KVM_GET_MSRS:
1318                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1319                 break;
1320         case KVM_SET_MSRS:
1321                 r = msr_io(vcpu, argp, do_set_msr, 0);
1322                 break;
1323         case KVM_TPR_ACCESS_REPORTING: {
1324                 struct kvm_tpr_access_ctl tac;
1325
1326                 r = -EFAULT;
1327                 if (copy_from_user(&tac, argp, sizeof tac))
1328                         goto out;
1329                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1330                 if (r)
1331                         goto out;
1332                 r = -EFAULT;
1333                 if (copy_to_user(argp, &tac, sizeof tac))
1334                         goto out;
1335                 r = 0;
1336                 break;
1337         };
1338         case KVM_SET_VAPIC_ADDR: {
1339                 struct kvm_vapic_addr va;
1340
1341                 r = -EINVAL;
1342                 if (!irqchip_in_kernel(vcpu->kvm))
1343                         goto out;
1344                 r = -EFAULT;
1345                 if (copy_from_user(&va, argp, sizeof va))
1346                         goto out;
1347                 r = 0;
1348                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1349                 break;
1350         }
1351         default:
1352                 r = -EINVAL;
1353         }
1354 out:
1355         return r;
1356 }
1357
1358 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1359 {
1360         int ret;
1361
1362         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1363                 return -1;
1364         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1365         return ret;
1366 }
1367
1368 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1369                                           u32 kvm_nr_mmu_pages)
1370 {
1371         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1372                 return -EINVAL;
1373
1374         down_write(&kvm->slots_lock);
1375
1376         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1377         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1378
1379         up_write(&kvm->slots_lock);
1380         return 0;
1381 }
1382
1383 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1384 {
1385         return kvm->arch.n_alloc_mmu_pages;
1386 }
1387
1388 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1389 {
1390         int i;
1391         struct kvm_mem_alias *alias;
1392
1393         for (i = 0; i < kvm->arch.naliases; ++i) {
1394                 alias = &kvm->arch.aliases[i];
1395                 if (gfn >= alias->base_gfn
1396                     && gfn < alias->base_gfn + alias->npages)
1397                         return alias->target_gfn + gfn - alias->base_gfn;
1398         }
1399         return gfn;
1400 }
1401
1402 /*
1403  * Set a new alias region.  Aliases map a portion of physical memory into
1404  * another portion.  This is useful for memory windows, for example the PC
1405  * VGA region.
1406  */
1407 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1408                                          struct kvm_memory_alias *alias)
1409 {
1410         int r, n;
1411         struct kvm_mem_alias *p;
1412
1413         r = -EINVAL;
1414         /* General sanity checks */
1415         if (alias->memory_size & (PAGE_SIZE - 1))
1416                 goto out;
1417         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1418                 goto out;
1419         if (alias->slot >= KVM_ALIAS_SLOTS)
1420                 goto out;
1421         if (alias->guest_phys_addr + alias->memory_size
1422             < alias->guest_phys_addr)
1423                 goto out;
1424         if (alias->target_phys_addr + alias->memory_size
1425             < alias->target_phys_addr)
1426                 goto out;
1427
1428         down_write(&kvm->slots_lock);
1429
1430         p = &kvm->arch.aliases[alias->slot];
1431         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1432         p->npages = alias->memory_size >> PAGE_SHIFT;
1433         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1434
1435         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1436                 if (kvm->arch.aliases[n - 1].npages)
1437                         break;
1438         kvm->arch.naliases = n;
1439
1440         kvm_mmu_zap_all(kvm);
1441
1442         up_write(&kvm->slots_lock);
1443
1444         return 0;
1445
1446 out:
1447         return r;
1448 }
1449
1450 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1451 {
1452         int r;
1453
1454         r = 0;
1455         switch (chip->chip_id) {
1456         case KVM_IRQCHIP_PIC_MASTER:
1457                 memcpy(&chip->chip.pic,
1458                         &pic_irqchip(kvm)->pics[0],
1459                         sizeof(struct kvm_pic_state));
1460                 break;
1461         case KVM_IRQCHIP_PIC_SLAVE:
1462                 memcpy(&chip->chip.pic,
1463                         &pic_irqchip(kvm)->pics[1],
1464                         sizeof(struct kvm_pic_state));
1465                 break;
1466         case KVM_IRQCHIP_IOAPIC:
1467                 memcpy(&chip->chip.ioapic,
1468                         ioapic_irqchip(kvm),
1469                         sizeof(struct kvm_ioapic_state));
1470                 break;
1471         default:
1472                 r = -EINVAL;
1473                 break;
1474         }
1475         return r;
1476 }
1477
1478 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1479 {
1480         int r;
1481
1482         r = 0;
1483         switch (chip->chip_id) {
1484         case KVM_IRQCHIP_PIC_MASTER:
1485                 memcpy(&pic_irqchip(kvm)->pics[0],
1486                         &chip->chip.pic,
1487                         sizeof(struct kvm_pic_state));
1488                 break;
1489         case KVM_IRQCHIP_PIC_SLAVE:
1490                 memcpy(&pic_irqchip(kvm)->pics[1],
1491                         &chip->chip.pic,
1492                         sizeof(struct kvm_pic_state));
1493                 break;
1494         case KVM_IRQCHIP_IOAPIC:
1495                 memcpy(ioapic_irqchip(kvm),
1496                         &chip->chip.ioapic,
1497                         sizeof(struct kvm_ioapic_state));
1498                 break;
1499         default:
1500                 r = -EINVAL;
1501                 break;
1502         }
1503         kvm_pic_update_irq(pic_irqchip(kvm));
1504         return r;
1505 }
1506
1507 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1508 {
1509         int r = 0;
1510
1511         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1512         return r;
1513 }
1514
1515 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1516 {
1517         int r = 0;
1518
1519         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1520         kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1521         return r;
1522 }
1523
1524 /*
1525  * Get (and clear) the dirty memory log for a memory slot.
1526  */
1527 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1528                                       struct kvm_dirty_log *log)
1529 {
1530         int r;
1531         int n;
1532         struct kvm_memory_slot *memslot;
1533         int is_dirty = 0;
1534
1535         down_write(&kvm->slots_lock);
1536
1537         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1538         if (r)
1539                 goto out;
1540
1541         /* If nothing is dirty, don't bother messing with page tables. */
1542         if (is_dirty) {
1543                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1544                 kvm_flush_remote_tlbs(kvm);
1545                 memslot = &kvm->memslots[log->slot];
1546                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1547                 memset(memslot->dirty_bitmap, 0, n);
1548         }
1549         r = 0;
1550 out:
1551         up_write(&kvm->slots_lock);
1552         return r;
1553 }
1554
1555 long kvm_arch_vm_ioctl(struct file *filp,
1556                        unsigned int ioctl, unsigned long arg)
1557 {
1558         struct kvm *kvm = filp->private_data;
1559         void __user *argp = (void __user *)arg;
1560         int r = -EINVAL;
1561
1562         switch (ioctl) {
1563         case KVM_SET_TSS_ADDR:
1564                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1565                 if (r < 0)
1566                         goto out;
1567                 break;
1568         case KVM_SET_MEMORY_REGION: {
1569                 struct kvm_memory_region kvm_mem;
1570                 struct kvm_userspace_memory_region kvm_userspace_mem;
1571
1572                 r = -EFAULT;
1573                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1574                         goto out;
1575                 kvm_userspace_mem.slot = kvm_mem.slot;
1576                 kvm_userspace_mem.flags = kvm_mem.flags;
1577                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1578                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1579                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1580                 if (r)
1581                         goto out;
1582                 break;
1583         }
1584         case KVM_SET_NR_MMU_PAGES:
1585                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1586                 if (r)
1587                         goto out;
1588                 break;
1589         case KVM_GET_NR_MMU_PAGES:
1590                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1591                 break;
1592         case KVM_SET_MEMORY_ALIAS: {
1593                 struct kvm_memory_alias alias;
1594
1595                 r = -EFAULT;
1596                 if (copy_from_user(&alias, argp, sizeof alias))
1597                         goto out;
1598                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1599                 if (r)
1600                         goto out;
1601                 break;
1602         }
1603         case KVM_CREATE_IRQCHIP:
1604                 r = -ENOMEM;
1605                 kvm->arch.vpic = kvm_create_pic(kvm);
1606                 if (kvm->arch.vpic) {
1607                         r = kvm_ioapic_init(kvm);
1608                         if (r) {
1609                                 kfree(kvm->arch.vpic);
1610                                 kvm->arch.vpic = NULL;
1611                                 goto out;
1612                         }
1613                 } else
1614                         goto out;
1615                 break;
1616         case KVM_CREATE_PIT:
1617                 r = -ENOMEM;
1618                 kvm->arch.vpit = kvm_create_pit(kvm);
1619                 if (kvm->arch.vpit)
1620                         r = 0;
1621                 break;
1622         case KVM_IRQ_LINE: {
1623                 struct kvm_irq_level irq_event;
1624
1625                 r = -EFAULT;
1626                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1627                         goto out;
1628                 if (irqchip_in_kernel(kvm)) {
1629                         mutex_lock(&kvm->lock);
1630                         if (irq_event.irq < 16)
1631                                 kvm_pic_set_irq(pic_irqchip(kvm),
1632                                         irq_event.irq,
1633                                         irq_event.level);
1634                         kvm_ioapic_set_irq(kvm->arch.vioapic,
1635                                         irq_event.irq,
1636                                         irq_event.level);
1637                         mutex_unlock(&kvm->lock);
1638                         r = 0;
1639                 }
1640                 break;
1641         }
1642         case KVM_GET_IRQCHIP: {
1643                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1644                 struct kvm_irqchip chip;
1645
1646                 r = -EFAULT;
1647                 if (copy_from_user(&chip, argp, sizeof chip))
1648                         goto out;
1649                 r = -ENXIO;
1650                 if (!irqchip_in_kernel(kvm))
1651                         goto out;
1652                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1653                 if (r)
1654                         goto out;
1655                 r = -EFAULT;
1656                 if (copy_to_user(argp, &chip, sizeof chip))
1657                         goto out;
1658                 r = 0;
1659                 break;
1660         }
1661         case KVM_SET_IRQCHIP: {
1662                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1663                 struct kvm_irqchip chip;
1664
1665                 r = -EFAULT;
1666                 if (copy_from_user(&chip, argp, sizeof chip))
1667                         goto out;
1668                 r = -ENXIO;
1669                 if (!irqchip_in_kernel(kvm))
1670                         goto out;
1671                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1672                 if (r)
1673                         goto out;
1674                 r = 0;
1675                 break;
1676         }
1677         case KVM_GET_PIT: {
1678                 struct kvm_pit_state ps;
1679                 r = -EFAULT;
1680                 if (copy_from_user(&ps, argp, sizeof ps))
1681                         goto out;
1682                 r = -ENXIO;
1683                 if (!kvm->arch.vpit)
1684                         goto out;
1685                 r = kvm_vm_ioctl_get_pit(kvm, &ps);
1686                 if (r)
1687                         goto out;
1688                 r = -EFAULT;
1689                 if (copy_to_user(argp, &ps, sizeof ps))
1690                         goto out;
1691                 r = 0;
1692                 break;
1693         }
1694         case KVM_SET_PIT: {
1695                 struct kvm_pit_state ps;
1696                 r = -EFAULT;
1697                 if (copy_from_user(&ps, argp, sizeof ps))
1698                         goto out;
1699                 r = -ENXIO;
1700                 if (!kvm->arch.vpit)
1701                         goto out;
1702                 r = kvm_vm_ioctl_set_pit(kvm, &ps);
1703                 if (r)
1704                         goto out;
1705                 r = 0;
1706                 break;
1707         }
1708         default:
1709                 ;
1710         }
1711 out:
1712         return r;
1713 }
1714
1715 static void kvm_init_msr_list(void)
1716 {
1717         u32 dummy[2];
1718         unsigned i, j;
1719
1720         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1721                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1722                         continue;
1723                 if (j < i)
1724                         msrs_to_save[j] = msrs_to_save[i];
1725                 j++;
1726         }
1727         num_msrs_to_save = j;
1728 }
1729
1730 /*
1731  * Only apic need an MMIO device hook, so shortcut now..
1732  */
1733 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1734                                                 gpa_t addr)
1735 {
1736         struct kvm_io_device *dev;
1737
1738         if (vcpu->arch.apic) {
1739                 dev = &vcpu->arch.apic->dev;
1740                 if (dev->in_range(dev, addr))
1741                         return dev;
1742         }
1743         return NULL;
1744 }
1745
1746
1747 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1748                                                 gpa_t addr)
1749 {
1750         struct kvm_io_device *dev;
1751
1752         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1753         if (dev == NULL)
1754                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1755         return dev;
1756 }
1757
1758 int emulator_read_std(unsigned long addr,
1759                              void *val,
1760                              unsigned int bytes,
1761                              struct kvm_vcpu *vcpu)
1762 {
1763         void *data = val;
1764         int r = X86EMUL_CONTINUE;
1765
1766         while (bytes) {
1767                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1768                 unsigned offset = addr & (PAGE_SIZE-1);
1769                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1770                 int ret;
1771
1772                 if (gpa == UNMAPPED_GVA) {
1773                         r = X86EMUL_PROPAGATE_FAULT;
1774                         goto out;
1775                 }
1776                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1777                 if (ret < 0) {
1778                         r = X86EMUL_UNHANDLEABLE;
1779                         goto out;
1780                 }
1781
1782                 bytes -= tocopy;
1783                 data += tocopy;
1784                 addr += tocopy;
1785         }
1786 out:
1787         return r;
1788 }
1789 EXPORT_SYMBOL_GPL(emulator_read_std);
1790
1791 static int emulator_read_emulated(unsigned long addr,
1792                                   void *val,
1793                                   unsigned int bytes,
1794                                   struct kvm_vcpu *vcpu)
1795 {
1796         struct kvm_io_device *mmio_dev;
1797         gpa_t                 gpa;
1798
1799         if (vcpu->mmio_read_completed) {
1800                 memcpy(val, vcpu->mmio_data, bytes);
1801                 vcpu->mmio_read_completed = 0;
1802                 return X86EMUL_CONTINUE;
1803         }
1804
1805         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1806
1807         /* For APIC access vmexit */
1808         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1809                 goto mmio;
1810
1811         if (emulator_read_std(addr, val, bytes, vcpu)
1812                         == X86EMUL_CONTINUE)
1813                 return X86EMUL_CONTINUE;
1814         if (gpa == UNMAPPED_GVA)
1815                 return X86EMUL_PROPAGATE_FAULT;
1816
1817 mmio:
1818         /*
1819          * Is this MMIO handled locally?
1820          */
1821         mutex_lock(&vcpu->kvm->lock);
1822         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1823         if (mmio_dev) {
1824                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1825                 mutex_unlock(&vcpu->kvm->lock);
1826                 return X86EMUL_CONTINUE;
1827         }
1828         mutex_unlock(&vcpu->kvm->lock);
1829
1830         vcpu->mmio_needed = 1;
1831         vcpu->mmio_phys_addr = gpa;
1832         vcpu->mmio_size = bytes;
1833         vcpu->mmio_is_write = 0;
1834
1835         return X86EMUL_UNHANDLEABLE;
1836 }
1837
1838 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1839                           const void *val, int bytes)
1840 {
1841         int ret;
1842
1843         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1844         if (ret < 0)
1845                 return 0;
1846         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1847         return 1;
1848 }
1849
1850 static int emulator_write_emulated_onepage(unsigned long addr,
1851                                            const void *val,
1852                                            unsigned int bytes,
1853                                            struct kvm_vcpu *vcpu)
1854 {
1855         struct kvm_io_device *mmio_dev;
1856         gpa_t                 gpa;
1857
1858         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1859
1860         if (gpa == UNMAPPED_GVA) {
1861                 kvm_inject_page_fault(vcpu, addr, 2);
1862                 return X86EMUL_PROPAGATE_FAULT;
1863         }
1864
1865         /* For APIC access vmexit */
1866         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1867                 goto mmio;
1868
1869         if (emulator_write_phys(vcpu, gpa, val, bytes))
1870                 return X86EMUL_CONTINUE;
1871
1872 mmio:
1873         /*
1874          * Is this MMIO handled locally?
1875          */
1876         mutex_lock(&vcpu->kvm->lock);
1877         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1878         if (mmio_dev) {
1879                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1880                 mutex_unlock(&vcpu->kvm->lock);
1881                 return X86EMUL_CONTINUE;
1882         }
1883         mutex_unlock(&vcpu->kvm->lock);
1884
1885         vcpu->mmio_needed = 1;
1886         vcpu->mmio_phys_addr = gpa;
1887         vcpu->mmio_size = bytes;
1888         vcpu->mmio_is_write = 1;
1889         memcpy(vcpu->mmio_data, val, bytes);
1890
1891         return X86EMUL_CONTINUE;
1892 }
1893
1894 int emulator_write_emulated(unsigned long addr,
1895                                    const void *val,
1896                                    unsigned int bytes,
1897                                    struct kvm_vcpu *vcpu)
1898 {
1899         /* Crossing a page boundary? */
1900         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1901                 int rc, now;
1902
1903                 now = -addr & ~PAGE_MASK;
1904                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1905                 if (rc != X86EMUL_CONTINUE)
1906                         return rc;
1907                 addr += now;
1908                 val += now;
1909                 bytes -= now;
1910         }
1911         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1912 }
1913 EXPORT_SYMBOL_GPL(emulator_write_emulated);
1914
1915 static int emulator_cmpxchg_emulated(unsigned long addr,
1916                                      const void *old,
1917                                      const void *new,
1918                                      unsigned int bytes,
1919                                      struct kvm_vcpu *vcpu)
1920 {
1921         static int reported;
1922
1923         if (!reported) {
1924                 reported = 1;
1925                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1926         }
1927 #ifndef CONFIG_X86_64
1928         /* guests cmpxchg8b have to be emulated atomically */
1929         if (bytes == 8) {
1930                 gpa_t gpa;
1931                 struct page *page;
1932                 char *kaddr;
1933                 u64 val;
1934
1935                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1936
1937                 if (gpa == UNMAPPED_GVA ||
1938                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1939                         goto emul_write;
1940
1941                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1942                         goto emul_write;
1943
1944                 val = *(u64 *)new;
1945
1946                 down_read(&current->mm->mmap_sem);
1947                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1948                 up_read(&current->mm->mmap_sem);
1949
1950                 kaddr = kmap_atomic(page, KM_USER0);
1951                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
1952                 kunmap_atomic(kaddr, KM_USER0);
1953                 kvm_release_page_dirty(page);
1954         }
1955 emul_write:
1956 #endif
1957
1958         return emulator_write_emulated(addr, new, bytes, vcpu);
1959 }
1960
1961 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1962 {
1963         return kvm_x86_ops->get_segment_base(vcpu, seg);
1964 }
1965
1966 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1967 {
1968         return X86EMUL_CONTINUE;
1969 }
1970
1971 int emulate_clts(struct kvm_vcpu *vcpu)
1972 {
1973         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1974         return X86EMUL_CONTINUE;
1975 }
1976
1977 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1978 {
1979         struct kvm_vcpu *vcpu = ctxt->vcpu;
1980
1981         switch (dr) {
1982         case 0 ... 3:
1983                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1984                 return X86EMUL_CONTINUE;
1985         default:
1986                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
1987                 return X86EMUL_UNHANDLEABLE;
1988         }
1989 }
1990
1991 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1992 {
1993         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1994         int exception;
1995
1996         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1997         if (exception) {
1998                 /* FIXME: better handling */
1999                 return X86EMUL_UNHANDLEABLE;
2000         }
2001         return X86EMUL_CONTINUE;
2002 }
2003
2004 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2005 {
2006         static int reported;
2007         u8 opcodes[4];
2008         unsigned long rip = vcpu->arch.rip;
2009         unsigned long rip_linear;
2010
2011         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2012
2013         if (reported)
2014                 return;
2015
2016         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2017
2018         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2019                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2020         reported = 1;
2021 }
2022 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2023
2024 static struct x86_emulate_ops emulate_ops = {
2025         .read_std            = emulator_read_std,
2026         .read_emulated       = emulator_read_emulated,
2027         .write_emulated      = emulator_write_emulated,
2028         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2029 };
2030
2031 int emulate_instruction(struct kvm_vcpu *vcpu,
2032                         struct kvm_run *run,
2033                         unsigned long cr2,
2034                         u16 error_code,
2035                         int emulation_type)
2036 {
2037         int r;
2038         struct decode_cache *c;
2039
2040         vcpu->arch.mmio_fault_cr2 = cr2;
2041         kvm_x86_ops->cache_regs(vcpu);
2042
2043         vcpu->mmio_is_write = 0;
2044         vcpu->arch.pio.string = 0;
2045
2046         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2047                 int cs_db, cs_l;
2048                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2049
2050                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2051                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2052                 vcpu->arch.emulate_ctxt.mode =
2053                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2054                         ? X86EMUL_MODE_REAL : cs_l
2055                         ? X86EMUL_MODE_PROT64 : cs_db
2056                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2057
2058                 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2059                         vcpu->arch.emulate_ctxt.cs_base = 0;
2060                         vcpu->arch.emulate_ctxt.ds_base = 0;
2061                         vcpu->arch.emulate_ctxt.es_base = 0;
2062                         vcpu->arch.emulate_ctxt.ss_base = 0;
2063                 } else {
2064                         vcpu->arch.emulate_ctxt.cs_base =
2065                                         get_segment_base(vcpu, VCPU_SREG_CS);
2066                         vcpu->arch.emulate_ctxt.ds_base =
2067                                         get_segment_base(vcpu, VCPU_SREG_DS);
2068                         vcpu->arch.emulate_ctxt.es_base =
2069                                         get_segment_base(vcpu, VCPU_SREG_ES);
2070                         vcpu->arch.emulate_ctxt.ss_base =
2071                                         get_segment_base(vcpu, VCPU_SREG_SS);
2072                 }
2073
2074                 vcpu->arch.emulate_ctxt.gs_base =
2075                                         get_segment_base(vcpu, VCPU_SREG_GS);
2076                 vcpu->arch.emulate_ctxt.fs_base =
2077                                         get_segment_base(vcpu, VCPU_SREG_FS);
2078
2079                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2080
2081                 /* Reject the instructions other than VMCALL/VMMCALL when
2082                  * try to emulate invalid opcode */
2083                 c = &vcpu->arch.emulate_ctxt.decode;
2084                 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2085                     (!(c->twobyte && c->b == 0x01 &&
2086                       (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2087                        c->modrm_mod == 3 && c->modrm_rm == 1)))
2088                         return EMULATE_FAIL;
2089
2090                 ++vcpu->stat.insn_emulation;
2091                 if (r)  {
2092                         ++vcpu->stat.insn_emulation_fail;
2093                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2094                                 return EMULATE_DONE;
2095                         return EMULATE_FAIL;
2096                 }
2097         }
2098
2099         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2100
2101         if (vcpu->arch.pio.string)
2102                 return EMULATE_DO_MMIO;
2103
2104         if ((r || vcpu->mmio_is_write) && run) {
2105                 run->exit_reason = KVM_EXIT_MMIO;
2106                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2107                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2108                 run->mmio.len = vcpu->mmio_size;
2109                 run->mmio.is_write = vcpu->mmio_is_write;
2110         }
2111
2112         if (r) {
2113                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2114                         return EMULATE_DONE;
2115                 if (!vcpu->mmio_needed) {
2116                         kvm_report_emulation_failure(vcpu, "mmio");
2117                         return EMULATE_FAIL;
2118                 }
2119                 return EMULATE_DO_MMIO;
2120         }
2121
2122         kvm_x86_ops->decache_regs(vcpu);
2123         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2124
2125         if (vcpu->mmio_is_write) {
2126                 vcpu->mmio_needed = 0;
2127                 return EMULATE_DO_MMIO;
2128         }
2129
2130         return EMULATE_DONE;
2131 }
2132 EXPORT_SYMBOL_GPL(emulate_instruction);
2133
2134 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
2135 {
2136         int i;
2137
2138         for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
2139                 if (vcpu->arch.pio.guest_pages[i]) {
2140                         kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
2141                         vcpu->arch.pio.guest_pages[i] = NULL;
2142                 }
2143 }
2144
2145 static int pio_copy_data(struct kvm_vcpu *vcpu)
2146 {
2147         void *p = vcpu->arch.pio_data;
2148         void *q;
2149         unsigned bytes;
2150         int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
2151
2152         q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
2153                  PAGE_KERNEL);
2154         if (!q) {
2155                 free_pio_guest_pages(vcpu);
2156                 return -ENOMEM;
2157         }
2158         q += vcpu->arch.pio.guest_page_offset;
2159         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2160         if (vcpu->arch.pio.in)
2161                 memcpy(q, p, bytes);
2162         else
2163                 memcpy(p, q, bytes);
2164         q -= vcpu->arch.pio.guest_page_offset;
2165         vunmap(q);
2166         free_pio_guest_pages(vcpu);
2167         return 0;
2168 }
2169
2170 int complete_pio(struct kvm_vcpu *vcpu)
2171 {
2172         struct kvm_pio_request *io = &vcpu->arch.pio;
2173         long delta;
2174         int r;
2175
2176         kvm_x86_ops->cache_regs(vcpu);
2177
2178         if (!io->string) {
2179                 if (io->in)
2180                         memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
2181                                io->size);
2182         } else {
2183                 if (io->in) {
2184                         r = pio_copy_data(vcpu);
2185                         if (r) {
2186                                 kvm_x86_ops->cache_regs(vcpu);
2187                                 return r;
2188                         }
2189                 }
2190
2191                 delta = 1;
2192                 if (io->rep) {
2193                         delta *= io->cur_count;
2194                         /*
2195                          * The size of the register should really depend on
2196                          * current address size.
2197                          */
2198                         vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
2199                 }
2200                 if (io->down)
2201                         delta = -delta;
2202                 delta *= io->size;
2203                 if (io->in)
2204                         vcpu->arch.regs[VCPU_REGS_RDI] += delta;
2205                 else
2206                         vcpu->arch.regs[VCPU_REGS_RSI] += delta;
2207         }
2208
2209         kvm_x86_ops->decache_regs(vcpu);
2210
2211         io->count -= io->cur_count;
2212         io->cur_count = 0;
2213
2214         return 0;
2215 }
2216
2217 static void kernel_pio(struct kvm_io_device *pio_dev,
2218                        struct kvm_vcpu *vcpu,
2219                        void *pd)
2220 {
2221         /* TODO: String I/O for in kernel device */
2222
2223         mutex_lock(&vcpu->kvm->lock);
2224         if (vcpu->arch.pio.in)
2225                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2226                                   vcpu->arch.pio.size,
2227                                   pd);
2228         else
2229                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2230                                    vcpu->arch.pio.size,
2231                                    pd);
2232         mutex_unlock(&vcpu->kvm->lock);
2233 }
2234
2235 static void pio_string_write(struct kvm_io_device *pio_dev,
2236                              struct kvm_vcpu *vcpu)
2237 {
2238         struct kvm_pio_request *io = &vcpu->arch.pio;
2239         void *pd = vcpu->arch.pio_data;
2240         int i;
2241
2242         mutex_lock(&vcpu->kvm->lock);
2243         for (i = 0; i < io->cur_count; i++) {
2244                 kvm_iodevice_write(pio_dev, io->port,
2245                                    io->size,
2246                                    pd);
2247                 pd += io->size;
2248         }
2249         mutex_unlock(&vcpu->kvm->lock);
2250 }
2251
2252 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2253                                                gpa_t addr)
2254 {
2255         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2256 }
2257
2258 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2259                   int size, unsigned port)
2260 {
2261         struct kvm_io_device *pio_dev;
2262
2263         vcpu->run->exit_reason = KVM_EXIT_IO;
2264         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2265         vcpu->run->io.size = vcpu->arch.pio.size = size;
2266         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2267         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2268         vcpu->run->io.port = vcpu->arch.pio.port = port;
2269         vcpu->arch.pio.in = in;
2270         vcpu->arch.pio.string = 0;
2271         vcpu->arch.pio.down = 0;
2272         vcpu->arch.pio.guest_page_offset = 0;
2273         vcpu->arch.pio.rep = 0;
2274
2275         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2276                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2277                             handler);
2278         else
2279                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2280                             handler);
2281
2282         kvm_x86_ops->cache_regs(vcpu);
2283         memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2284         kvm_x86_ops->decache_regs(vcpu);
2285
2286         kvm_x86_ops->skip_emulated_instruction(vcpu);
2287
2288         pio_dev = vcpu_find_pio_dev(vcpu, port);
2289         if (pio_dev) {
2290                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2291                 complete_pio(vcpu);
2292                 return 1;
2293         }
2294         return 0;
2295 }
2296 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2297
2298 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2299                   int size, unsigned long count, int down,
2300                   gva_t address, int rep, unsigned port)
2301 {
2302         unsigned now, in_page;
2303         int i, ret = 0;
2304         int nr_pages = 1;
2305         struct page *page;
2306         struct kvm_io_device *pio_dev;
2307
2308         vcpu->run->exit_reason = KVM_EXIT_IO;
2309         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2310         vcpu->run->io.size = vcpu->arch.pio.size = size;
2311         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2312         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2313         vcpu->run->io.port = vcpu->arch.pio.port = port;
2314         vcpu->arch.pio.in = in;
2315         vcpu->arch.pio.string = 1;
2316         vcpu->arch.pio.down = down;
2317         vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2318         vcpu->arch.pio.rep = rep;
2319
2320         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2321                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2322                             handler);
2323         else
2324                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2325                             handler);
2326
2327         if (!count) {
2328                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2329                 return 1;
2330         }
2331
2332         if (!down)
2333                 in_page = PAGE_SIZE - offset_in_page(address);
2334         else
2335                 in_page = offset_in_page(address) + size;
2336         now = min(count, (unsigned long)in_page / size);
2337         if (!now) {
2338                 /*
2339                  * String I/O straddles page boundary.  Pin two guest pages
2340                  * so that we satisfy atomicity constraints.  Do just one
2341                  * transaction to avoid complexity.
2342                  */
2343                 nr_pages = 2;
2344                 now = 1;
2345         }
2346         if (down) {
2347                 /*
2348                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2349                  */
2350                 pr_unimpl(vcpu, "guest string pio down\n");
2351                 kvm_inject_gp(vcpu, 0);
2352                 return 1;
2353         }
2354         vcpu->run->io.count = now;
2355         vcpu->arch.pio.cur_count = now;
2356
2357         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2358                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2359
2360         for (i = 0; i < nr_pages; ++i) {
2361                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2362                 vcpu->arch.pio.guest_pages[i] = page;
2363                 if (!page) {
2364                         kvm_inject_gp(vcpu, 0);
2365                         free_pio_guest_pages(vcpu);
2366                         return 1;
2367                 }
2368         }
2369
2370         pio_dev = vcpu_find_pio_dev(vcpu, port);
2371         if (!vcpu->arch.pio.in) {
2372                 /* string PIO write */
2373                 ret = pio_copy_data(vcpu);
2374                 if (ret >= 0 && pio_dev) {
2375                         pio_string_write(pio_dev, vcpu);
2376                         complete_pio(vcpu);
2377                         if (vcpu->arch.pio.count == 0)
2378                                 ret = 1;
2379                 }
2380         } else if (pio_dev)
2381                 pr_unimpl(vcpu, "no string pio read support yet, "
2382                        "port %x size %d count %ld\n",
2383                         port, size, count);
2384
2385         return ret;
2386 }
2387 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2388
2389 int kvm_arch_init(void *opaque)
2390 {
2391         int r;
2392         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2393
2394         if (kvm_x86_ops) {
2395                 printk(KERN_ERR "kvm: already loaded the other module\n");
2396                 r = -EEXIST;
2397                 goto out;
2398         }
2399
2400         if (!ops->cpu_has_kvm_support()) {
2401                 printk(KERN_ERR "kvm: no hardware support\n");
2402                 r = -EOPNOTSUPP;
2403                 goto out;
2404         }
2405         if (ops->disabled_by_bios()) {
2406                 printk(KERN_ERR "kvm: disabled by bios\n");
2407                 r = -EOPNOTSUPP;
2408                 goto out;
2409         }
2410
2411         r = kvm_mmu_module_init();
2412         if (r)
2413                 goto out;
2414
2415         kvm_init_msr_list();
2416
2417         kvm_x86_ops = ops;
2418         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2419         return 0;
2420
2421 out:
2422         return r;
2423 }
2424
2425 void kvm_arch_exit(void)
2426 {
2427         kvm_x86_ops = NULL;
2428         kvm_mmu_module_exit();
2429 }
2430
2431 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2432 {
2433         ++vcpu->stat.halt_exits;
2434         KVMTRACE_0D(HLT, vcpu, handler);
2435         if (irqchip_in_kernel(vcpu->kvm)) {
2436                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2437                 up_read(&vcpu->kvm->slots_lock);
2438                 kvm_vcpu_block(vcpu);
2439                 down_read(&vcpu->kvm->slots_lock);
2440                 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2441                         return -EINTR;
2442                 return 1;
2443         } else {
2444                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2445                 return 0;
2446         }
2447 }
2448 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2449
2450 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2451                            unsigned long a1)
2452 {
2453         if (is_long_mode(vcpu))
2454                 return a0;
2455         else
2456                 return a0 | ((gpa_t)a1 << 32);
2457 }
2458
2459 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2460 {
2461         unsigned long nr, a0, a1, a2, a3, ret;
2462         int r = 1;
2463
2464         kvm_x86_ops->cache_regs(vcpu);
2465
2466         nr = vcpu->arch.regs[VCPU_REGS_RAX];
2467         a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2468         a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2469         a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2470         a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2471
2472         KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2473
2474         if (!is_long_mode(vcpu)) {
2475                 nr &= 0xFFFFFFFF;
2476                 a0 &= 0xFFFFFFFF;
2477                 a1 &= 0xFFFFFFFF;
2478                 a2 &= 0xFFFFFFFF;
2479                 a3 &= 0xFFFFFFFF;
2480         }
2481
2482         switch (nr) {
2483         case KVM_HC_VAPIC_POLL_IRQ:
2484                 ret = 0;
2485                 break;
2486         case KVM_HC_MMU_OP:
2487                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2488                 break;
2489         default:
2490                 ret = -KVM_ENOSYS;
2491                 break;
2492         }
2493         vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2494         kvm_x86_ops->decache_regs(vcpu);
2495         ++vcpu->stat.hypercalls;
2496         return r;
2497 }
2498 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2499
2500 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2501 {
2502         char instruction[3];
2503         int ret = 0;
2504
2505
2506         /*
2507          * Blow out the MMU to ensure that no other VCPU has an active mapping
2508          * to ensure that the updated hypercall appears atomically across all
2509          * VCPUs.
2510          */
2511         kvm_mmu_zap_all(vcpu->kvm);
2512
2513         kvm_x86_ops->cache_regs(vcpu);
2514         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2515         if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2516             != X86EMUL_CONTINUE)
2517                 ret = -EFAULT;
2518
2519         return ret;
2520 }
2521
2522 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2523 {
2524         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2525 }
2526
2527 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2528 {
2529         struct descriptor_table dt = { limit, base };
2530
2531         kvm_x86_ops->set_gdt(vcpu, &dt);
2532 }
2533
2534 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2535 {
2536         struct descriptor_table dt = { limit, base };
2537
2538         kvm_x86_ops->set_idt(vcpu, &dt);
2539 }
2540
2541 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2542                    unsigned long *rflags)
2543 {
2544         kvm_lmsw(vcpu, msw);
2545         *rflags = kvm_x86_ops->get_rflags(vcpu);
2546 }
2547
2548 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2549 {
2550         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2551         switch (cr) {
2552         case 0:
2553                 return vcpu->arch.cr0;
2554         case 2:
2555                 return vcpu->arch.cr2;
2556         case 3:
2557                 return vcpu->arch.cr3;
2558         case 4:
2559                 return vcpu->arch.cr4;
2560         case 8:
2561                 return kvm_get_cr8(vcpu);
2562         default:
2563                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2564                 return 0;
2565         }
2566 }
2567
2568 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2569                      unsigned long *rflags)
2570 {
2571         switch (cr) {
2572         case 0:
2573                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2574                 *rflags = kvm_x86_ops->get_rflags(vcpu);
2575                 break;
2576         case 2:
2577                 vcpu->arch.cr2 = val;
2578                 break;
2579         case 3:
2580                 kvm_set_cr3(vcpu, val);
2581                 break;
2582         case 4:
2583                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2584                 break;
2585         case 8:
2586                 kvm_set_cr8(vcpu, val & 0xfUL);
2587                 break;
2588         default:
2589                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2590         }
2591 }
2592
2593 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2594 {
2595         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2596         int j, nent = vcpu->arch.cpuid_nent;
2597
2598         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2599         /* when no next entry is found, the current entry[i] is reselected */
2600         for (j = i + 1; j == i; j = (j + 1) % nent) {
2601                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2602                 if (ej->function == e->function) {
2603                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2604                         return j;
2605                 }
2606         }
2607         return 0; /* silence gcc, even though control never reaches here */
2608 }
2609
2610 /* find an entry with matching function, matching index (if needed), and that
2611  * should be read next (if it's stateful) */
2612 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2613         u32 function, u32 index)
2614 {
2615         if (e->function != function)
2616                 return 0;
2617         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2618                 return 0;
2619         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2620                 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2621                 return 0;
2622         return 1;
2623 }
2624
2625 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2626 {
2627         int i;
2628         u32 function, index;
2629         struct kvm_cpuid_entry2 *e, *best;
2630
2631         kvm_x86_ops->cache_regs(vcpu);
2632         function = vcpu->arch.regs[VCPU_REGS_RAX];
2633         index = vcpu->arch.regs[VCPU_REGS_RCX];
2634         vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2635         vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2636         vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2637         vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2638         best = NULL;
2639         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2640                 e = &vcpu->arch.cpuid_entries[i];
2641                 if (is_matching_cpuid_entry(e, function, index)) {
2642                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2643                                 move_to_next_stateful_cpuid_entry(vcpu, i);
2644                         best = e;
2645                         break;
2646                 }
2647                 /*
2648                  * Both basic or both extended?
2649                  */
2650                 if (((e->function ^ function) & 0x80000000) == 0)
2651                         if (!best || e->function > best->function)
2652                                 best = e;
2653         }
2654         if (best) {
2655                 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2656                 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2657                 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2658                 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2659         }
2660         kvm_x86_ops->decache_regs(vcpu);
2661         kvm_x86_ops->skip_emulated_instruction(vcpu);
2662         KVMTRACE_5D(CPUID, vcpu, function,
2663                     (u32)vcpu->arch.regs[VCPU_REGS_RAX],
2664                     (u32)vcpu->arch.regs[VCPU_REGS_RBX],
2665                     (u32)vcpu->arch.regs[VCPU_REGS_RCX],
2666                     (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
2667 }
2668 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2669
2670 /*
2671  * Check if userspace requested an interrupt window, and that the
2672  * interrupt window is open.
2673  *
2674  * No need to exit to userspace if we already have an interrupt queued.
2675  */
2676 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2677                                           struct kvm_run *kvm_run)
2678 {
2679         return (!vcpu->arch.irq_summary &&
2680                 kvm_run->request_interrupt_window &&
2681                 vcpu->arch.interrupt_window_open &&
2682                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2683 }
2684
2685 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2686                               struct kvm_run *kvm_run)
2687 {
2688         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2689         kvm_run->cr8 = kvm_get_cr8(vcpu);
2690         kvm_run->apic_base = kvm_get_apic_base(vcpu);
2691         if (irqchip_in_kernel(vcpu->kvm))
2692                 kvm_run->ready_for_interrupt_injection = 1;
2693         else
2694                 kvm_run->ready_for_interrupt_injection =
2695                                         (vcpu->arch.interrupt_window_open &&
2696                                          vcpu->arch.irq_summary == 0);
2697 }
2698
2699 static void vapic_enter(struct kvm_vcpu *vcpu)
2700 {
2701         struct kvm_lapic *apic = vcpu->arch.apic;
2702         struct page *page;
2703
2704         if (!apic || !apic->vapic_addr)
2705                 return;
2706
2707         down_read(&current->mm->mmap_sem);
2708         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2709         up_read(&current->mm->mmap_sem);
2710
2711         vcpu->arch.apic->vapic_page = page;
2712 }
2713
2714 static void vapic_exit(struct kvm_vcpu *vcpu)
2715 {
2716         struct kvm_lapic *apic = vcpu->arch.apic;
2717
2718         if (!apic || !apic->vapic_addr)
2719                 return;
2720
2721         kvm_release_page_dirty(apic->vapic_page);
2722         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2723 }
2724
2725 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2726 {
2727         int r;
2728
2729         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2730                 pr_debug("vcpu %d received sipi with vector # %x\n",
2731                        vcpu->vcpu_id, vcpu->arch.sipi_vector);
2732                 kvm_lapic_reset(vcpu);
2733                 r = kvm_x86_ops->vcpu_reset(vcpu);
2734                 if (r)
2735                         return r;
2736                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2737         }
2738
2739         down_read(&vcpu->kvm->slots_lock);
2740         vapic_enter(vcpu);
2741
2742 preempted:
2743         if (vcpu->guest_debug.enabled)
2744                 kvm_x86_ops->guest_debug_pre(vcpu);
2745
2746 again:
2747         if (vcpu->requests)
2748                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2749                         kvm_mmu_unload(vcpu);
2750
2751         r = kvm_mmu_reload(vcpu);
2752         if (unlikely(r))
2753                 goto out;
2754
2755         if (vcpu->requests) {
2756                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2757                         __kvm_migrate_apic_timer(vcpu);
2758                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2759                                        &vcpu->requests)) {
2760                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2761                         r = 0;
2762                         goto out;
2763                 }
2764                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
2765                         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2766                         r = 0;
2767                         goto out;
2768                 }
2769         }
2770
2771         kvm_inject_pending_timer_irqs(vcpu);
2772
2773         preempt_disable();
2774
2775         kvm_x86_ops->prepare_guest_switch(vcpu);
2776         kvm_load_guest_fpu(vcpu);
2777
2778         local_irq_disable();
2779
2780         if (need_resched()) {
2781                 local_irq_enable();
2782                 preempt_enable();
2783                 r = 1;
2784                 goto out;
2785         }
2786
2787         if (vcpu->requests)
2788                 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2789                         local_irq_enable();
2790                         preempt_enable();
2791                         r = 1;
2792                         goto out;
2793                 }
2794
2795         if (signal_pending(current)) {
2796                 local_irq_enable();
2797                 preempt_enable();
2798                 r = -EINTR;
2799                 kvm_run->exit_reason = KVM_EXIT_INTR;
2800                 ++vcpu->stat.signal_exits;
2801                 goto out;
2802         }
2803
2804         if (vcpu->arch.exception.pending)
2805                 __queue_exception(vcpu);
2806         else if (irqchip_in_kernel(vcpu->kvm))
2807                 kvm_x86_ops->inject_pending_irq(vcpu);
2808         else
2809                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2810
2811         kvm_lapic_sync_to_vapic(vcpu);
2812
2813         up_read(&vcpu->kvm->slots_lock);
2814
2815         vcpu->guest_mode = 1;
2816         kvm_guest_enter();
2817
2818         if (vcpu->requests)
2819                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2820                         kvm_x86_ops->tlb_flush(vcpu);
2821
2822         KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2823         kvm_x86_ops->run(vcpu, kvm_run);
2824
2825         vcpu->guest_mode = 0;
2826         local_irq_enable();
2827
2828         ++vcpu->stat.exits;
2829
2830         /*
2831          * We must have an instruction between local_irq_enable() and
2832          * kvm_guest_exit(), so the timer interrupt isn't delayed by
2833          * the interrupt shadow.  The stat.exits increment will do nicely.
2834          * But we need to prevent reordering, hence this barrier():
2835          */
2836         barrier();
2837
2838         kvm_guest_exit();
2839
2840         preempt_enable();
2841
2842         down_read(&vcpu->kvm->slots_lock);
2843
2844         /*
2845          * Profile KVM exit RIPs:
2846          */
2847         if (unlikely(prof_on == KVM_PROFILING)) {
2848                 kvm_x86_ops->cache_regs(vcpu);
2849                 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2850         }
2851
2852         if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2853                 vcpu->arch.exception.pending = false;
2854
2855         kvm_lapic_sync_from_vapic(vcpu);
2856
2857         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2858
2859         if (r > 0) {
2860                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2861                         r = -EINTR;
2862                         kvm_run->exit_reason = KVM_EXIT_INTR;
2863                         ++vcpu->stat.request_irq_exits;
2864                         goto out;
2865                 }
2866                 if (!need_resched())
2867                         goto again;
2868         }
2869
2870 out:
2871         up_read(&vcpu->kvm->slots_lock);
2872         if (r > 0) {
2873                 kvm_resched(vcpu);
2874                 down_read(&vcpu->kvm->slots_lock);
2875                 goto preempted;
2876         }
2877
2878         post_kvm_run_save(vcpu, kvm_run);
2879
2880         down_read(&vcpu->kvm->slots_lock);
2881         vapic_exit(vcpu);
2882         up_read(&vcpu->kvm->slots_lock);
2883
2884         return r;
2885 }
2886
2887 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2888 {
2889         int r;
2890         sigset_t sigsaved;
2891
2892         vcpu_load(vcpu);
2893
2894         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2895                 kvm_vcpu_block(vcpu);
2896                 vcpu_put(vcpu);
2897                 return -EAGAIN;
2898         }
2899
2900         if (vcpu->sigset_active)
2901                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2902
2903         /* re-sync apic's tpr */
2904         if (!irqchip_in_kernel(vcpu->kvm))
2905                 kvm_set_cr8(vcpu, kvm_run->cr8);
2906
2907         if (vcpu->arch.pio.cur_count) {
2908                 r = complete_pio(vcpu);
2909                 if (r)
2910                         goto out;
2911         }
2912 #if CONFIG_HAS_IOMEM
2913         if (vcpu->mmio_needed) {
2914                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2915                 vcpu->mmio_read_completed = 1;
2916                 vcpu->mmio_needed = 0;
2917
2918                 down_read(&vcpu->kvm->slots_lock);
2919                 r = emulate_instruction(vcpu, kvm_run,
2920                                         vcpu->arch.mmio_fault_cr2, 0,
2921                                         EMULTYPE_NO_DECODE);
2922                 up_read(&vcpu->kvm->slots_lock);
2923                 if (r == EMULATE_DO_MMIO) {
2924                         /*
2925                          * Read-modify-write.  Back to userspace.
2926                          */
2927                         r = 0;
2928                         goto out;
2929                 }
2930         }
2931 #endif
2932         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2933                 kvm_x86_ops->cache_regs(vcpu);
2934                 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2935                 kvm_x86_ops->decache_regs(vcpu);
2936         }
2937
2938         r = __vcpu_run(vcpu, kvm_run);
2939
2940 out:
2941         if (vcpu->sigset_active)
2942                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2943
2944         vcpu_put(vcpu);
2945         return r;
2946 }
2947
2948 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2949 {
2950         vcpu_load(vcpu);
2951
2952         kvm_x86_ops->cache_regs(vcpu);
2953
2954         regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2955         regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2956         regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2957         regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2958         regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2959         regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2960         regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2961         regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2962 #ifdef CONFIG_X86_64
2963         regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2964         regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2965         regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2966         regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2967         regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2968         regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2969         regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2970         regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2971 #endif
2972
2973         regs->rip = vcpu->arch.rip;
2974         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2975
2976         /*
2977          * Don't leak debug flags in case they were set for guest debugging
2978          */
2979         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2980                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2981
2982         vcpu_put(vcpu);
2983
2984         return 0;
2985 }
2986
2987 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2988 {
2989         vcpu_load(vcpu);
2990
2991         vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2992         vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2993         vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2994         vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2995         vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2996         vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2997         vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2998         vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2999 #ifdef CONFIG_X86_64
3000         vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
3001         vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
3002         vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
3003         vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
3004         vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
3005         vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
3006         vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
3007         vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
3008 #endif
3009
3010         vcpu->arch.rip = regs->rip;
3011         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3012
3013         kvm_x86_ops->decache_regs(vcpu);
3014
3015         vcpu_put(vcpu);
3016
3017         return 0;
3018 }
3019
3020 static void get_segment(struct kvm_vcpu *vcpu,
3021                         struct kvm_segment *var, int seg)
3022 {
3023         kvm_x86_ops->get_segment(vcpu, var, seg);
3024 }
3025
3026 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3027 {
3028         struct kvm_segment cs;
3029
3030         get_segment(vcpu, &cs, VCPU_SREG_CS);
3031         *db = cs.db;
3032         *l = cs.l;
3033 }
3034 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3035
3036 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3037                                   struct kvm_sregs *sregs)
3038 {
3039         struct descriptor_table dt;
3040         int pending_vec;
3041
3042         vcpu_load(vcpu);
3043
3044         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3045         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3046         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3047         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3048         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3049         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3050
3051         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3052         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3053
3054         kvm_x86_ops->get_idt(vcpu, &dt);
3055         sregs->idt.limit = dt.limit;
3056         sregs->idt.base = dt.base;
3057         kvm_x86_ops->get_gdt(vcpu, &dt);
3058         sregs->gdt.limit = dt.limit;
3059         sregs->gdt.base = dt.base;
3060
3061         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3062         sregs->cr0 = vcpu->arch.cr0;
3063         sregs->cr2 = vcpu->arch.cr2;
3064         sregs->cr3 = vcpu->arch.cr3;
3065         sregs->cr4 = vcpu->arch.cr4;
3066         sregs->cr8 = kvm_get_cr8(vcpu);
3067         sregs->efer = vcpu->arch.shadow_efer;
3068         sregs->apic_base = kvm_get_apic_base(vcpu);
3069
3070         if (irqchip_in_kernel(vcpu->kvm)) {
3071                 memset(sregs->interrupt_bitmap, 0,
3072                        sizeof sregs->interrupt_bitmap);
3073                 pending_vec = kvm_x86_ops->get_irq(vcpu);
3074                 if (pending_vec >= 0)
3075                         set_bit(pending_vec,
3076                                 (unsigned long *)sregs->interrupt_bitmap);
3077         } else
3078                 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3079                        sizeof sregs->interrupt_bitmap);
3080
3081         vcpu_put(vcpu);
3082
3083         return 0;
3084 }
3085
3086 static void set_segment(struct kvm_vcpu *vcpu,
3087                         struct kvm_segment *var, int seg)
3088 {
3089         kvm_x86_ops->set_segment(vcpu, var, seg);
3090 }
3091
3092 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3093                                    struct kvm_segment *kvm_desct)
3094 {
3095         kvm_desct->base = seg_desc->base0;
3096         kvm_desct->base |= seg_desc->base1 << 16;
3097         kvm_desct->base |= seg_desc->base2 << 24;
3098         kvm_desct->limit = seg_desc->limit0;
3099         kvm_desct->limit |= seg_desc->limit << 16;
3100         kvm_desct->selector = selector;
3101         kvm_desct->type = seg_desc->type;
3102         kvm_desct->present = seg_desc->p;
3103         kvm_desct->dpl = seg_desc->dpl;
3104         kvm_desct->db = seg_desc->d;
3105         kvm_desct->s = seg_desc->s;
3106         kvm_desct->l = seg_desc->l;
3107         kvm_desct->g = seg_desc->g;
3108         kvm_desct->avl = seg_desc->avl;
3109         if (!selector)
3110                 kvm_desct->unusable = 1;
3111         else
3112                 kvm_desct->unusable = 0;
3113         kvm_desct->padding = 0;
3114 }
3115
3116 static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3117                                            u16 selector,
3118                                            struct descriptor_table *dtable)
3119 {
3120         if (selector & 1 << 2) {
3121                 struct kvm_segment kvm_seg;
3122
3123                 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3124
3125                 if (kvm_seg.unusable)
3126                         dtable->limit = 0;
3127                 else
3128                         dtable->limit = kvm_seg.limit;
3129                 dtable->base = kvm_seg.base;
3130         }
3131         else
3132                 kvm_x86_ops->get_gdt(vcpu, dtable);
3133 }
3134
3135 /* allowed just for 8 bytes segments */
3136 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3137                                          struct desc_struct *seg_desc)
3138 {
3139         struct descriptor_table dtable;
3140         u16 index = selector >> 3;
3141
3142         get_segment_descritptor_dtable(vcpu, selector, &dtable);
3143
3144         if (dtable.limit < index * 8 + 7) {
3145                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3146                 return 1;
3147         }
3148         return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
3149 }
3150
3151 /* allowed just for 8 bytes segments */
3152 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3153                                          struct desc_struct *seg_desc)
3154 {
3155         struct descriptor_table dtable;
3156         u16 index = selector >> 3;
3157
3158         get_segment_descritptor_dtable(vcpu, selector, &dtable);
3159
3160         if (dtable.limit < index * 8 + 7)
3161                 return 1;
3162         return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
3163 }
3164
3165 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3166                              struct desc_struct *seg_desc)
3167 {
3168         u32 base_addr;
3169
3170         base_addr = seg_desc->base0;
3171         base_addr |= (seg_desc->base1 << 16);
3172         base_addr |= (seg_desc->base2 << 24);
3173
3174         return base_addr;
3175 }
3176
3177 static int load_tss_segment32(struct kvm_vcpu *vcpu,
3178                               struct desc_struct *seg_desc,
3179                               struct tss_segment_32 *tss)
3180 {
3181         u32 base_addr;
3182
3183         base_addr = get_tss_base_addr(vcpu, seg_desc);
3184
3185         return kvm_read_guest(vcpu->kvm, base_addr, tss,
3186                               sizeof(struct tss_segment_32));
3187 }
3188
3189 static int save_tss_segment32(struct kvm_vcpu *vcpu,
3190                               struct desc_struct *seg_desc,
3191                               struct tss_segment_32 *tss)
3192 {
3193         u32 base_addr;
3194
3195         base_addr = get_tss_base_addr(vcpu, seg_desc);
3196
3197         return kvm_write_guest(vcpu->kvm, base_addr, tss,
3198                                sizeof(struct tss_segment_32));
3199 }
3200
3201 static int load_tss_segment16(struct kvm_vcpu *vcpu,
3202                               struct desc_struct *seg_desc,
3203                               struct tss_segment_16 *tss)
3204 {
3205         u32 base_addr;
3206
3207         base_addr = get_tss_base_addr(vcpu, seg_desc);
3208
3209         return kvm_read_guest(vcpu->kvm, base_addr, tss,
3210                               sizeof(struct tss_segment_16));
3211 }
3212
3213 static int save_tss_segment16(struct kvm_vcpu *vcpu,
3214                               struct desc_struct *seg_desc,
3215                               struct tss_segment_16 *tss)
3216 {
3217         u32 base_addr;
3218
3219         base_addr = get_tss_base_addr(vcpu, seg_desc);
3220
3221         return kvm_write_guest(vcpu->kvm, base_addr, tss,
3222                                sizeof(struct tss_segment_16));
3223 }
3224
3225 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3226 {
3227         struct kvm_segment kvm_seg;
3228
3229         get_segment(vcpu, &kvm_seg, seg);
3230         return kvm_seg.selector;
3231 }
3232
3233 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3234                                                 u16 selector,
3235                                                 struct kvm_segment *kvm_seg)
3236 {
3237         struct desc_struct seg_desc;
3238
3239         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3240                 return 1;
3241         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3242         return 0;
3243 }
3244
3245 static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3246                                    int type_bits, int seg)
3247 {
3248         struct kvm_segment kvm_seg;
3249
3250         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3251                 return 1;
3252         kvm_seg.type |= type_bits;
3253
3254         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3255             seg != VCPU_SREG_LDTR)
3256                 if (!kvm_seg.s)
3257                         kvm_seg.unusable = 1;
3258
3259         set_segment(vcpu, &kvm_seg, seg);
3260         return 0;
3261 }
3262
3263 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3264                                 struct tss_segment_32 *tss)
3265 {
3266         tss->cr3 = vcpu->arch.cr3;
3267         tss->eip = vcpu->arch.rip;
3268         tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3269         tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
3270         tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3271         tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
3272         tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
3273         tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
3274         tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
3275         tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
3276         tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
3277
3278         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3279         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3280         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3281         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3282         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3283         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3284         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3285         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3286 }
3287
3288 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3289                                   struct tss_segment_32 *tss)
3290 {
3291         kvm_set_cr3(vcpu, tss->cr3);
3292
3293         vcpu->arch.rip = tss->eip;
3294         kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3295
3296         vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
3297         vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
3298         vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
3299         vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
3300         vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
3301         vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
3302         vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3303         vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3304
3305         if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3306                 return 1;
3307
3308         if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3309                 return 1;
3310
3311         if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3312                 return 1;
3313
3314         if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3315                 return 1;
3316
3317         if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3318                 return 1;
3319
3320         if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3321                 return 1;
3322
3323         if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3324                 return 1;
3325         return 0;
3326 }
3327
3328 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3329                                 struct tss_segment_16 *tss)
3330 {
3331         tss->ip = vcpu->arch.rip;
3332         tss->flag = kvm_x86_ops->get_rflags(vcpu);
3333         tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
3334         tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
3335         tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
3336         tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
3337         tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
3338         tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
3339         tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
3340         tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
3341
3342         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3343         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3344         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3345         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3346         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3347         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3348 }
3349
3350 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3351                                  struct tss_segment_16 *tss)
3352 {
3353         vcpu->arch.rip = tss->ip;
3354         kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3355         vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
3356         vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
3357         vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
3358         vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
3359         vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
3360         vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
3361         vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3362         vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3363
3364         if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3365                 return 1;
3366
3367         if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3368                 return 1;
3369
3370         if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3371                 return 1;
3372
3373         if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3374                 return 1;
3375
3376         if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3377                 return 1;
3378         return 0;
3379 }
3380
3381 int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3382                        struct desc_struct *cseg_desc,
3383                        struct desc_struct *nseg_desc)
3384 {
3385         struct tss_segment_16 tss_segment_16;
3386         int ret = 0;
3387
3388         if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16))
3389                 goto out;
3390
3391         save_state_to_tss16(vcpu, &tss_segment_16);
3392         save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3393
3394         if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16))
3395                 goto out;
3396         if (load_state_from_tss16(vcpu, &tss_segment_16))
3397                 goto out;
3398
3399         ret = 1;
3400 out:
3401         return ret;
3402 }
3403
3404 int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3405                        struct desc_struct *cseg_desc,
3406                        struct desc_struct *nseg_desc)
3407 {
3408         struct tss_segment_32 tss_segment_32;
3409         int ret = 0;
3410
3411         if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32))
3412                 goto out;
3413
3414         save_state_to_tss32(vcpu, &tss_segment_32);
3415         save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3416
3417         if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32))
3418                 goto out;
3419         if (load_state_from_tss32(vcpu, &tss_segment_32))
3420                 goto out;
3421
3422         ret = 1;
3423 out:
3424         return ret;
3425 }
3426
3427 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3428 {
3429         struct kvm_segment tr_seg;
3430         struct desc_struct cseg_desc;
3431         struct desc_struct nseg_desc;
3432         int ret = 0;
3433
3434         get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3435
3436         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3437                 goto out;
3438
3439         if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc))
3440                 goto out;
3441
3442
3443         if (reason != TASK_SWITCH_IRET) {
3444                 int cpl;
3445
3446                 cpl = kvm_x86_ops->get_cpl(vcpu);
3447                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3448                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3449                         return 1;
3450                 }
3451         }
3452
3453         if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3454                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3455                 return 1;
3456         }
3457
3458         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3459                 cseg_desc.type &= ~(1 << 8); //clear the B flag
3460                 save_guest_segment_descriptor(vcpu, tr_seg.selector,
3461                                               &cseg_desc);
3462         }
3463
3464         if (reason == TASK_SWITCH_IRET) {
3465                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3466                 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3467         }
3468
3469         kvm_x86_ops->skip_emulated_instruction(vcpu);
3470         kvm_x86_ops->cache_regs(vcpu);
3471
3472         if (nseg_desc.type & 8)
3473                 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc,
3474                                          &nseg_desc);
3475         else
3476                 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc,
3477                                          &nseg_desc);
3478
3479         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3480                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3481                 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
3482         }
3483
3484         if (reason != TASK_SWITCH_IRET) {
3485                 nseg_desc.type |= (1 << 8);
3486                 save_guest_segment_descriptor(vcpu, tss_selector,
3487                                               &nseg_desc);
3488         }
3489
3490         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3491         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3492         tr_seg.type = 11;
3493         set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3494 out:
3495         kvm_x86_ops->decache_regs(vcpu);
3496         return ret;
3497 }
3498 EXPORT_SYMBOL_GPL(kvm_task_switch);
3499
3500 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3501                                   struct kvm_sregs *sregs)
3502 {
3503         int mmu_reset_needed = 0;
3504         int i, pending_vec, max_bits;
3505         struct descriptor_table dt;
3506
3507         vcpu_load(vcpu);
3508
3509         dt.limit = sregs->idt.limit;
3510         dt.base = sregs->idt.base;
3511         kvm_x86_ops->set_idt(vcpu, &dt);
3512         dt.limit = sregs->gdt.limit;
3513         dt.base = sregs->gdt.base;
3514         kvm_x86_ops->set_gdt(vcpu, &dt);
3515
3516         vcpu->arch.cr2 = sregs->cr2;
3517         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3518         vcpu->arch.cr3 = sregs->cr3;
3519
3520         kvm_set_cr8(vcpu, sregs->cr8);
3521
3522         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
3523         kvm_x86_ops->set_efer(vcpu, sregs->efer);
3524         kvm_set_apic_base(vcpu, sregs->apic_base);
3525
3526         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3527
3528         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
3529         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
3530         vcpu->arch.cr0 = sregs->cr0;
3531
3532         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
3533         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
3534         if (!is_long_mode(vcpu) && is_pae(vcpu))
3535                 load_pdptrs(vcpu, vcpu->arch.cr3);
3536
3537         if (mmu_reset_needed)
3538                 kvm_mmu_reset_context(vcpu);
3539
3540         if (!irqchip_in_kernel(vcpu->kvm)) {
3541                 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
3542                        sizeof vcpu->arch.irq_pending);
3543                 vcpu->arch.irq_summary = 0;
3544                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
3545                         if (vcpu->arch.irq_pending[i])
3546                                 __set_bit(i, &vcpu->arch.irq_summary);
3547         } else {
3548                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3549                 pending_vec = find_first_bit(
3550                         (const unsigned long *)sregs->interrupt_bitmap,
3551                         max_bits);
3552                 /* Only pending external irq is handled here */
3553                 if (pending_vec < max_bits) {
3554                         kvm_x86_ops->set_irq(vcpu, pending_vec);
3555                         pr_debug("Set back pending irq %d\n",
3556                                  pending_vec);
3557                 }
3558         }
3559
3560         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3561         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3562         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3563         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3564         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3565         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3566
3567         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3568         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3569
3570         vcpu_put(vcpu);
3571
3572         return 0;
3573 }
3574
3575 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
3576                                     struct kvm_debug_guest *dbg)
3577 {
3578         int r;
3579
3580         vcpu_load(vcpu);
3581
3582         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
3583
3584         vcpu_put(vcpu);
3585
3586         return r;
3587 }
3588
3589 /*
3590  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
3591  * we have asm/x86/processor.h
3592  */
3593 struct fxsave {
3594         u16     cwd;
3595         u16     swd;
3596         u16     twd;
3597         u16     fop;
3598         u64     rip;
3599         u64     rdp;
3600         u32     mxcsr;
3601         u32     mxcsr_mask;
3602         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
3603 #ifdef CONFIG_X86_64
3604         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
3605 #else
3606         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
3607 #endif
3608 };
3609
3610 /*
3611  * Translate a guest virtual address to a guest physical address.
3612  */
3613 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
3614                                     struct kvm_translation *tr)
3615 {
3616         unsigned long vaddr = tr->linear_address;
3617         gpa_t gpa;
3618
3619         vcpu_load(vcpu);
3620         down_read(&vcpu->kvm->slots_lock);
3621         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
3622         up_read(&vcpu->kvm->slots_lock);
3623         tr->physical_address = gpa;
3624         tr->valid = gpa != UNMAPPED_GVA;
3625         tr->writeable = 1;
3626         tr->usermode = 0;
3627         vcpu_put(vcpu);
3628
3629         return 0;
3630 }
3631
3632 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3633 {
3634         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3635
3636         vcpu_load(vcpu);
3637
3638         memcpy(fpu->fpr, fxsave->st_space, 128);
3639         fpu->fcw = fxsave->cwd;
3640         fpu->fsw = fxsave->swd;
3641         fpu->ftwx = fxsave->twd;
3642         fpu->last_opcode = fxsave->fop;
3643         fpu->last_ip = fxsave->rip;
3644         fpu->last_dp = fxsave->rdp;
3645         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
3646
3647         vcpu_put(vcpu);
3648
3649         return 0;
3650 }
3651
3652 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3653 {
3654         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3655
3656         vcpu_load(vcpu);
3657
3658         memcpy(fxsave->st_space, fpu->fpr, 128);
3659         fxsave->cwd = fpu->fcw;
3660         fxsave->swd = fpu->fsw;
3661         fxsave->twd = fpu->ftwx;
3662         fxsave->fop = fpu->last_opcode;
3663         fxsave->rip = fpu->last_ip;
3664         fxsave->rdp = fpu->last_dp;
3665         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3666
3667         vcpu_put(vcpu);
3668
3669         return 0;
3670 }
3671
3672 void fx_init(struct kvm_vcpu *vcpu)
3673 {
3674         unsigned after_mxcsr_mask;
3675
3676         /* Initialize guest FPU by resetting ours and saving into guest's */
3677         preempt_disable();
3678         fx_save(&vcpu->arch.host_fx_image);
3679         fpu_init();
3680         fx_save(&vcpu->arch.guest_fx_image);
3681         fx_restore(&vcpu->arch.host_fx_image);
3682         preempt_enable();
3683
3684         vcpu->arch.cr0 |= X86_CR0_ET;
3685         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3686         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3687         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3688                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3689 }
3690 EXPORT_SYMBOL_GPL(fx_init);
3691
3692 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3693 {
3694         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3695                 return;
3696
3697         vcpu->guest_fpu_loaded = 1;
3698         fx_save(&vcpu->arch.host_fx_image);
3699         fx_restore(&vcpu->arch.guest_fx_image);
3700 }
3701 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3702
3703 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3704 {
3705         if (!vcpu->guest_fpu_loaded)
3706                 return;
3707
3708         vcpu->guest_fpu_loaded = 0;
3709         fx_save(&vcpu->arch.guest_fx_image);
3710         fx_restore(&vcpu->arch.host_fx_image);
3711         ++vcpu->stat.fpu_reload;
3712 }
3713 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3714
3715 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3716 {
3717         kvm_x86_ops->vcpu_free(vcpu);
3718 }
3719
3720 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3721                                                 unsigned int id)
3722 {
3723         return kvm_x86_ops->vcpu_create(kvm, id);
3724 }
3725
3726 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3727 {
3728         int r;
3729
3730         /* We do fxsave: this must be aligned. */
3731         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3732
3733         vcpu_load(vcpu);
3734         r = kvm_arch_vcpu_reset(vcpu);
3735         if (r == 0)
3736                 r = kvm_mmu_setup(vcpu);
3737         vcpu_put(vcpu);
3738         if (r < 0)
3739                 goto free_vcpu;
3740
3741         return 0;
3742 free_vcpu:
3743         kvm_x86_ops->vcpu_free(vcpu);
3744         return r;
3745 }
3746
3747 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3748 {
3749         vcpu_load(vcpu);
3750         kvm_mmu_unload(vcpu);
3751         vcpu_put(vcpu);
3752
3753         kvm_x86_ops->vcpu_free(vcpu);
3754 }
3755
3756 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3757 {
3758         return kvm_x86_ops->vcpu_reset(vcpu);
3759 }
3760
3761 void kvm_arch_hardware_enable(void *garbage)
3762 {
3763         kvm_x86_ops->hardware_enable(garbage);
3764 }
3765
3766 void kvm_arch_hardware_disable(void *garbage)
3767 {
3768         kvm_x86_ops->hardware_disable(garbage);
3769 }
3770
3771 int kvm_arch_hardware_setup(void)
3772 {
3773         return kvm_x86_ops->hardware_setup();
3774 }
3775
3776 void kvm_arch_hardware_unsetup(void)
3777 {
3778         kvm_x86_ops->hardware_unsetup();
3779 }
3780
3781 void kvm_arch_check_processor_compat(void *rtn)
3782 {
3783         kvm_x86_ops->check_processor_compatibility(rtn);
3784 }
3785
3786 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3787 {
3788         struct page *page;
3789         struct kvm *kvm;
3790         int r;
3791
3792         BUG_ON(vcpu->kvm == NULL);
3793         kvm = vcpu->kvm;
3794
3795         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3796         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3797                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3798         else
3799                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
3800
3801         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3802         if (!page) {
3803                 r = -ENOMEM;
3804                 goto fail;
3805         }
3806         vcpu->arch.pio_data = page_address(page);
3807
3808         r = kvm_mmu_create(vcpu);
3809         if (r < 0)
3810                 goto fail_free_pio_data;
3811
3812         if (irqchip_in_kernel(kvm)) {
3813                 r = kvm_create_lapic(vcpu);
3814                 if (r < 0)
3815                         goto fail_mmu_destroy;
3816         }
3817
3818         return 0;
3819
3820 fail_mmu_destroy:
3821         kvm_mmu_destroy(vcpu);
3822 fail_free_pio_data:
3823         free_page((unsigned long)vcpu->arch.pio_data);
3824 fail:
3825         return r;
3826 }
3827
3828 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3829 {
3830         kvm_free_lapic(vcpu);
3831         down_read(&vcpu->kvm->slots_lock);
3832         kvm_mmu_destroy(vcpu);
3833         up_read(&vcpu->kvm->slots_lock);
3834         free_page((unsigned long)vcpu->arch.pio_data);
3835 }
3836
3837 struct  kvm *kvm_arch_create_vm(void)
3838 {
3839         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3840
3841         if (!kvm)
3842                 return ERR_PTR(-ENOMEM);
3843
3844         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3845
3846         return kvm;
3847 }
3848
/* Unload the MMU of @vcpu, loading the vcpu around the call. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}
3855
3856 static void kvm_free_vcpus(struct kvm *kvm)
3857 {
3858         unsigned int i;
3859
3860         /*
3861          * Unpin any mmu pages first.
3862          */
3863         for (i = 0; i < KVM_MAX_VCPUS; ++i)
3864                 if (kvm->vcpus[i])
3865                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3866         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3867                 if (kvm->vcpus[i]) {
3868                         kvm_arch_vcpu_free(kvm->vcpus[i]);
3869                         kvm->vcpus[i] = NULL;
3870                 }
3871         }
3872
3873 }
3874
3875 void kvm_arch_destroy_vm(struct kvm *kvm)
3876 {
3877         kvm_free_pit(kvm);
3878         kfree(kvm->arch.vpic);
3879         kfree(kvm->arch.vioapic);
3880         kvm_free_vcpus(kvm);
3881         kvm_free_physmem(kvm);
3882         if (kvm->arch.apic_access_page)
3883                 put_page(kvm->arch.apic_access_page);
3884         kfree(kvm);
3885 }
3886
3887 int kvm_arch_set_memory_region(struct kvm *kvm,
3888                                 struct kvm_userspace_memory_region *mem,
3889                                 struct kvm_memory_slot old,
3890                                 int user_alloc)
3891 {
3892         int npages = mem->memory_size >> PAGE_SHIFT;
3893         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3894
3895         /*To keep backward compatibility with older userspace,
3896          *x86 needs to hanlde !user_alloc case.
3897          */
3898         if (!user_alloc) {
3899                 if (npages && !old.rmap) {
3900                         down_write(&current->mm->mmap_sem);
3901                         memslot->userspace_addr = do_mmap(NULL, 0,
3902                                                      npages * PAGE_SIZE,
3903                                                      PROT_READ | PROT_WRITE,
3904                                                      MAP_SHARED | MAP_ANONYMOUS,
3905                                                      0);
3906                         up_write(&current->mm->mmap_sem);
3907
3908                         if (IS_ERR((void *)memslot->userspace_addr))
3909                                 return PTR_ERR((void *)memslot->userspace_addr);
3910                 } else {
3911                         if (!old.user_alloc && old.rmap) {
3912                                 int ret;
3913
3914                                 down_write(&current->mm->mmap_sem);
3915                                 ret = do_munmap(current->mm, old.userspace_addr,
3916                                                 old.npages * PAGE_SIZE);
3917                                 up_write(&current->mm->mmap_sem);
3918                                 if (ret < 0)
3919                                         printk(KERN_WARNING
3920                                        "kvm_vm_ioctl_set_memory_region: "
3921                                        "failed to munmap memory\n");
3922                         }
3923                 }
3924         }
3925
3926         if (!kvm->arch.n_requested_mmu_pages) {
3927                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3928                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3929         }
3930
3931         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3932         kvm_flush_remote_tlbs(kvm);
3933
3934         return 0;
3935 }
3936
3937 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3938 {
3939         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
3940                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
3941 }
3942
/*
 * Cross-CPU callback used by kvm_vcpu_kick().  It has no work of its
 * own; with DEBUG defined it logs the vcpu pointer passed in @info.
 */
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = info;

	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}
3950
3951 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3952 {
3953         int ipi_pcpu = vcpu->cpu;
3954
3955         if (waitqueue_active(&vcpu->wq)) {
3956                 wake_up_interruptible(&vcpu->wq);
3957                 ++vcpu->stat.halt_wakeup;
3958         }
3959         if (vcpu->guest_mode)
3960                 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3961 }