KVM: convert custom marker based tracing to event traces
arch/x86/kvm/svm.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * AMD SVM support
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  *
8  * Authors:
9  *   Yaniv Kamay  <yaniv@qumranet.com>
10  *   Avi Kivity   <avi@qumranet.com>
11  *
12  * This work is licensed under the terms of the GNU GPL, version 2.  See
13  * the COPYING file in the top-level directory.
14  *
15  */
16 #include <linux/kvm_host.h>
17
18 #include "irq.h"
19 #include "mmu.h"
20 #include "kvm_cache_regs.h"
21 #include "x86.h"
22
23 #include <linux/module.h>
24 #include <linux/kernel.h>
25 #include <linux/vmalloc.h>
26 #include <linux/highmem.h>
27 #include <linux/sched.h>
28 #include <linux/ftrace_event.h>
29
30 #include <asm/desc.h>
31
32 #include <asm/virtext.h>
33 #include "trace.h"
34
35 #define __ex(x) __kvm_handle_fault_on_reboot(x)
36
37 MODULE_AUTHOR("Qumranet");
38 MODULE_LICENSE("GPL");
39
40 #define IOPM_ALLOC_ORDER 2
41 #define MSRPM_ALLOC_ORDER 1
42
43 #define SEG_TYPE_LDT 2
44 #define SEG_TYPE_BUSY_TSS16 3
45
46 #define SVM_FEATURE_NPT  (1 << 0)
47 #define SVM_FEATURE_LBRV (1 << 1)
48 #define SVM_FEATURE_SVML (1 << 2)
49
50 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
51
52 /* Turn on to get debugging output */
53 /* #define NESTED_DEBUG */
54
55 #ifdef NESTED_DEBUG
56 #define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
57 #else
58 #define nsvm_printk(fmt, args...) do {} while(0)
59 #endif
60
61 static const u32 host_save_user_msrs[] = {
62 #ifdef CONFIG_X86_64
63         MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
64         MSR_FS_BASE,
65 #endif
66         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
67 };
68
69 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
70
71 struct kvm_vcpu;
72
73 struct vcpu_svm {
74         struct kvm_vcpu vcpu;
75         struct vmcb *vmcb;
76         unsigned long vmcb_pa;
77         struct svm_cpu_data *svm_data;
78         uint64_t asid_generation;
79         uint64_t sysenter_esp;
80         uint64_t sysenter_eip;
81
82         u64 next_rip;
83
84         u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
85         u64 host_gs_base;
86
87         u32 *msrpm;
88         struct vmcb *hsave;
89         u64 hsave_msr;
90
91         u64 nested_vmcb;
92
93         /* These are the merged vectors */
94         u32 *nested_msrpm;
95
96         /* gpa pointers to the real vectors */
97         u64 nested_vmcb_msrpm;
98 };
99
100 /* enable NPT for AMD64 and X86 with PAE */
101 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
102 static bool npt_enabled = true;
103 #else
104 static bool npt_enabled = false;
105 #endif
106 static int npt = 1;
107
108 module_param(npt, int, S_IRUGO);
109
110 static int nested = 0;
111 module_param(nested, int, S_IRUGO);
112
113 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
114
115 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
116 static int nested_svm_vmexit(struct vcpu_svm *svm);
117 static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
118                              void *arg2, void *opaque);
119 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
120                                       bool has_error_code, u32 error_code);
121
122 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
123 {
124         return container_of(vcpu, struct vcpu_svm, vcpu);
125 }
126
127 static inline bool is_nested(struct vcpu_svm *svm)
128 {
129         return svm->nested_vmcb;
130 }
131
132 static unsigned long iopm_base;
133
134 struct kvm_ldttss_desc {
135         u16 limit0;
136         u16 base0;
137         unsigned base1 : 8, type : 5, dpl : 2, p : 1;
138         unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
139         u32 base3;
140         u32 zero1;
141 } __attribute__((packed));
142
143 struct svm_cpu_data {
144         int cpu;
145
146         u64 asid_generation;
147         u32 max_asid;
148         u32 next_asid;
149         struct kvm_ldttss_desc *tss_desc;
150
151         struct page *save_area;
152 };
153
154 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
155 static uint32_t svm_features;
156
157 struct svm_init_data {
158         int cpu;
159         int r;
160 };
161
162 static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
163
164 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
165 #define MSRS_RANGE_SIZE 2048
166 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
167
168 #define MAX_INST_SIZE 15
169
170 static inline u32 svm_has(u32 feat)
171 {
172         return svm_features & feat;
173 }
174
175 static inline void clgi(void)
176 {
177         asm volatile (__ex(SVM_CLGI));
178 }
179
180 static inline void stgi(void)
181 {
182         asm volatile (__ex(SVM_STGI));
183 }
184
185 static inline void invlpga(unsigned long addr, u32 asid)
186 {
187         asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
188 }
189
190 static inline void force_new_asid(struct kvm_vcpu *vcpu)
191 {
192         to_svm(vcpu)->asid_generation--;
193 }
194
195 static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
196 {
197         force_new_asid(vcpu);
198 }
199
200 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
201 {
202         if (!npt_enabled && !(efer & EFER_LMA))
203                 efer &= ~EFER_LME;
204
205         to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
206         vcpu->arch.shadow_efer = efer;
207 }
208
209 static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
210                                 bool has_error_code, u32 error_code)
211 {
212         struct vcpu_svm *svm = to_svm(vcpu);
213
214         /* If we are within a nested VM we'd better #VMEXIT and let the
215            guest handle the exception */
216         if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
217                 return;
218
219         svm->vmcb->control.event_inj = nr
220                 | SVM_EVTINJ_VALID
221                 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
222                 | SVM_EVTINJ_TYPE_EXEPT;
223         svm->vmcb->control.event_inj_err = error_code;
224 }
225
226 static int is_external_interrupt(u32 info)
227 {
228         info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
229         return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
230 }
231
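/*
 * SVM has only a single interrupt-shadow bit, so when it is set report
 * both the STI and MOV SS shadows to the common code.
 */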
232 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
233 {
234         struct vcpu_svm *svm = to_svm(vcpu);
235         u32 ret = 0;
236
237         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
238                 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
239         return ret & mask;
240 }
241
242 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
243 {
244         struct vcpu_svm *svm = to_svm(vcpu);
245
246         if (mask == 0)
247                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
248         else
249                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
250
251 }
252
253 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
254 {
255         struct vcpu_svm *svm = to_svm(vcpu);
256
257         if (!svm->next_rip) {
258                 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
259                                 EMULATE_DONE)
260                         printk(KERN_DEBUG "%s: NOP\n", __func__);
261                 return;
262         }
263         if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
264                 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
265                        __func__, kvm_rip_read(vcpu), svm->next_rip);
266
267         kvm_rip_write(vcpu, svm->next_rip);
268         svm_set_interrupt_shadow(vcpu, 0);
269 }
270
271 static int has_svm(void)
272 {
273         const char *msg;
274
275         if (!cpu_has_svm(&msg)) {
276                 printk(KERN_INFO "has_svm: %s\n", msg);
277                 return 0;
278         }
279
280         return 1;
281 }
282
283 static void svm_hardware_disable(void *garbage)
284 {
285         cpu_svm_disable();
286 }
287
288 static void svm_hardware_enable(void *garbage)
289 {
290
291         struct svm_cpu_data *svm_data;
292         uint64_t efer;
293         struct desc_ptr gdt_descr;
294         struct desc_struct *gdt;
295         int me = raw_smp_processor_id();
296
297         if (!has_svm()) {
298                 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
299                 return;
300         }
301         svm_data = per_cpu(svm_data, me);
302
303         if (!svm_data) {
304                 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
305                        me);
306                 return;
307         }
308
309         svm_data->asid_generation = 1;
310         svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
311         svm_data->next_asid = svm_data->max_asid + 1;
312
313         asm volatile ("sgdt %0" : "=m"(gdt_descr));
314         gdt = (struct desc_struct *)gdt_descr.address;
315         svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
316
317         rdmsrl(MSR_EFER, efer);
318         wrmsrl(MSR_EFER, efer | EFER_SVME);
319
320         wrmsrl(MSR_VM_HSAVE_PA,
321                page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
322 }
323
324 static void svm_cpu_uninit(int cpu)
325 {
326         struct svm_cpu_data *svm_data
327                 = per_cpu(svm_data, raw_smp_processor_id());
328
329         if (!svm_data)
330                 return;
331
332         per_cpu(svm_data, raw_smp_processor_id()) = NULL;
333         __free_page(svm_data->save_area);
334         kfree(svm_data);
335 }
336
337 static int svm_cpu_init(int cpu)
338 {
339         struct svm_cpu_data *svm_data;
340         int r;
341
342         svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
343         if (!svm_data)
344                 return -ENOMEM;
345         svm_data->cpu = cpu;
346         svm_data->save_area = alloc_page(GFP_KERNEL);
347         r = -ENOMEM;
348         if (!svm_data->save_area)
349                 goto err_1;
350
351         per_cpu(svm_data, cpu) = svm_data;
352
353         return 0;
354
355 err_1:
356         kfree(svm_data);
357         return r;
358
359 }
360
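/*
 * The MSR permission map uses two bits per MSR: bit 0 intercepts reads
 * and bit 1 intercepts writes; a set bit means the access is trapped.
 * Each of the three ranges in msrpm_ranges[] covers MSRS_IN_RANGE MSRs.
 */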
361 static void set_msr_interception(u32 *msrpm, unsigned msr,
362                                  int read, int write)
363 {
364         int i;
365
366         for (i = 0; i < NUM_MSR_MAPS; i++) {
367                 if (msr >= msrpm_ranges[i] &&
368                     msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
369                         u32 msr_offset = (i * MSRS_IN_RANGE + msr -
370                                           msrpm_ranges[i]) * 2;
371
372                         u32 *base = msrpm + (msr_offset / 32);
373                         u32 msr_shift = msr_offset % 32;
374                         u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
375                         *base = (*base & ~(0x3 << msr_shift)) |
376                                 (mask << msr_shift);
377                         return;
378                 }
379         }
380         BUG();
381 }
382
383 static void svm_vcpu_init_msrpm(u32 *msrpm)
384 {
385         memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
386
387 #ifdef CONFIG_X86_64
388         set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
389         set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
390         set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
391         set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
392         set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
393         set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
394 #endif
395         set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
396         set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
397 }
398
399 static void svm_enable_lbrv(struct vcpu_svm *svm)
400 {
401         u32 *msrpm = svm->msrpm;
402
403         svm->vmcb->control.lbr_ctl = 1;
404         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
405         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
406         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
407         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
408 }
409
410 static void svm_disable_lbrv(struct vcpu_svm *svm)
411 {
412         u32 *msrpm = svm->msrpm;
413
414         svm->vmcb->control.lbr_ctl = 0;
415         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
416         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
417         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
418         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
419 }
420
421 static __init int svm_hardware_setup(void)
422 {
423         int cpu;
424         struct page *iopm_pages;
425         void *iopm_va;
426         int r;
427
428         iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
429
430         if (!iopm_pages)
431                 return -ENOMEM;
432
433         iopm_va = page_address(iopm_pages);
434         memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
435         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
436
437         if (boot_cpu_has(X86_FEATURE_NX))
438                 kvm_enable_efer_bits(EFER_NX);
439
440         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
441                 kvm_enable_efer_bits(EFER_FFXSR);
442
443         if (nested) {
444                 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
445                 kvm_enable_efer_bits(EFER_SVME);
446         }
447
448         for_each_online_cpu(cpu) {
449                 r = svm_cpu_init(cpu);
450                 if (r)
451                         goto err;
452         }
453
454         svm_features = cpuid_edx(SVM_CPUID_FUNC);
455
456         if (!svm_has(SVM_FEATURE_NPT))
457                 npt_enabled = false;
458
459         if (npt_enabled && !npt) {
460                 printk(KERN_INFO "kvm: Nested Paging disabled\n");
461                 npt_enabled = false;
462         }
463
464         if (npt_enabled) {
465                 printk(KERN_INFO "kvm: Nested Paging enabled\n");
466                 kvm_enable_tdp();
467         } else
468                 kvm_disable_tdp();
469
470         return 0;
471
472 err:
473         __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
474         iopm_base = 0;
475         return r;
476 }
477
478 static __exit void svm_hardware_unsetup(void)
479 {
480         int cpu;
481
482         for_each_online_cpu(cpu)
483                 svm_cpu_uninit(cpu);
484
485         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
486         iopm_base = 0;
487 }
488
489 static void init_seg(struct vmcb_seg *seg)
490 {
491         seg->selector = 0;
492         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
493                 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
494         seg->limit = 0xffff;
495         seg->base = 0;
496 }
497
498 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
499 {
500         seg->selector = 0;
501         seg->attrib = SVM_SELECTOR_P_MASK | type;
502         seg->limit = 0xffff;
503         seg->base = 0;
504 }
505
506 static void init_vmcb(struct vcpu_svm *svm)
507 {
508         struct vmcb_control_area *control = &svm->vmcb->control;
509         struct vmcb_save_area *save = &svm->vmcb->save;
510
511         control->intercept_cr_read =    INTERCEPT_CR0_MASK |
512                                         INTERCEPT_CR3_MASK |
513                                         INTERCEPT_CR4_MASK;
514
515         control->intercept_cr_write =   INTERCEPT_CR0_MASK |
516                                         INTERCEPT_CR3_MASK |
517                                         INTERCEPT_CR4_MASK |
518                                         INTERCEPT_CR8_MASK;
519
520         control->intercept_dr_read =    INTERCEPT_DR0_MASK |
521                                         INTERCEPT_DR1_MASK |
522                                         INTERCEPT_DR2_MASK |
523                                         INTERCEPT_DR3_MASK;
524
525         control->intercept_dr_write =   INTERCEPT_DR0_MASK |
526                                         INTERCEPT_DR1_MASK |
527                                         INTERCEPT_DR2_MASK |
528                                         INTERCEPT_DR3_MASK |
529                                         INTERCEPT_DR5_MASK |
530                                         INTERCEPT_DR7_MASK;
531
532         control->intercept_exceptions = (1 << PF_VECTOR) |
533                                         (1 << UD_VECTOR) |
534                                         (1 << MC_VECTOR);
535
536
537         control->intercept =    (1ULL << INTERCEPT_INTR) |
538                                 (1ULL << INTERCEPT_NMI) |
539                                 (1ULL << INTERCEPT_SMI) |
540                                 (1ULL << INTERCEPT_CPUID) |
541                                 (1ULL << INTERCEPT_INVD) |
542                                 (1ULL << INTERCEPT_HLT) |
543                                 (1ULL << INTERCEPT_INVLPG) |
544                                 (1ULL << INTERCEPT_INVLPGA) |
545                                 (1ULL << INTERCEPT_IOIO_PROT) |
546                                 (1ULL << INTERCEPT_MSR_PROT) |
547                                 (1ULL << INTERCEPT_TASK_SWITCH) |
548                                 (1ULL << INTERCEPT_SHUTDOWN) |
549                                 (1ULL << INTERCEPT_VMRUN) |
550                                 (1ULL << INTERCEPT_VMMCALL) |
551                                 (1ULL << INTERCEPT_VMLOAD) |
552                                 (1ULL << INTERCEPT_VMSAVE) |
553                                 (1ULL << INTERCEPT_STGI) |
554                                 (1ULL << INTERCEPT_CLGI) |
555                                 (1ULL << INTERCEPT_SKINIT) |
556                                 (1ULL << INTERCEPT_WBINVD) |
557                                 (1ULL << INTERCEPT_MONITOR) |
558                                 (1ULL << INTERCEPT_MWAIT);
559
560         control->iopm_base_pa = iopm_base;
561         control->msrpm_base_pa = __pa(svm->msrpm);
562         control->tsc_offset = 0;
563         control->int_ctl = V_INTR_MASKING_MASK;
564
565         init_seg(&save->es);
566         init_seg(&save->ss);
567         init_seg(&save->ds);
568         init_seg(&save->fs);
569         init_seg(&save->gs);
570
571         save->cs.selector = 0xf000;
572         /* Executable/Readable Code Segment */
573         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
574                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
575         save->cs.limit = 0xffff;
576         /*
577          * cs.base should really be 0xffff0000, but vmx can't handle that, so
578          * be consistent with it.
579          *
580          * Replace when we have real mode working for vmx.
581          */
582         save->cs.base = 0xf0000;
583
584         save->gdtr.limit = 0xffff;
585         save->idtr.limit = 0xffff;
586
587         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
588         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
589
590         save->efer = EFER_SVME;
591         save->dr6 = 0xffff0ff0;
592         save->dr7 = 0x400;
593         save->rflags = 2;
594         save->rip = 0x0000fff0;
595         svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
596
597         /*
598          * The cr0 value at CPU init should be 0x60000010; we enable the
599          * CPU cache by default. The proper way is to enable the cache in the BIOS.
600          */
601         save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
602         save->cr4 = X86_CR4_PAE;
603         /* rdx = ?? */
604
605         if (npt_enabled) {
606                 /* Setup VMCB for Nested Paging */
607                 control->nested_ctl = 1;
608                 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
609                                         (1ULL << INTERCEPT_INVLPG));
610                 control->intercept_exceptions &= ~(1 << PF_VECTOR);
611                 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
612                                                 INTERCEPT_CR3_MASK);
613                 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
614                                                  INTERCEPT_CR3_MASK);
615                 save->g_pat = 0x0007040600070406ULL;
616                 /* enable caching because the QEMU BIOS doesn't enable it */
617                 save->cr0 = X86_CR0_ET;
618                 save->cr3 = 0;
619                 save->cr4 = 0;
620         }
621         force_new_asid(&svm->vcpu);
622
623         svm->nested_vmcb = 0;
624         svm->vcpu.arch.hflags = HF_GIF_MASK;
625 }
626
627 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
628 {
629         struct vcpu_svm *svm = to_svm(vcpu);
630
631         init_vmcb(svm);
632
633         if (!kvm_vcpu_is_bsp(vcpu)) {
634                 kvm_rip_write(vcpu, 0);
635                 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
636                 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
637         }
638         vcpu->arch.regs_avail = ~0;
639         vcpu->arch.regs_dirty = ~0;
640
641         return 0;
642 }
643
644 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
645 {
646         struct vcpu_svm *svm;
647         struct page *page;
648         struct page *msrpm_pages;
649         struct page *hsave_page;
650         struct page *nested_msrpm_pages;
651         int err;
652
653         svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
654         if (!svm) {
655                 err = -ENOMEM;
656                 goto out;
657         }
658
659         err = kvm_vcpu_init(&svm->vcpu, kvm, id);
660         if (err)
661                 goto free_svm;
662
663         page = alloc_page(GFP_KERNEL);
664         if (!page) {
665                 err = -ENOMEM;
666                 goto uninit;
667         }
668
669         err = -ENOMEM;
670         msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
671         if (!msrpm_pages)
672                 goto uninit;
673
674         nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
675         if (!nested_msrpm_pages)
676                 goto uninit;
677
678         svm->msrpm = page_address(msrpm_pages);
679         svm_vcpu_init_msrpm(svm->msrpm);
680
681         hsave_page = alloc_page(GFP_KERNEL);
682         if (!hsave_page)
683                 goto uninit;
684         svm->hsave = page_address(hsave_page);
685
686         svm->nested_msrpm = page_address(nested_msrpm_pages);
687
688         svm->vmcb = page_address(page);
689         clear_page(svm->vmcb);
690         svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
691         svm->asid_generation = 0;
692         init_vmcb(svm);
693
694         fx_init(&svm->vcpu);
695         svm->vcpu.fpu_active = 1;
696         svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
697         if (kvm_vcpu_is_bsp(&svm->vcpu))
698                 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
699
700         return &svm->vcpu;
701
702 uninit:
703         kvm_vcpu_uninit(&svm->vcpu);
704 free_svm:
705         kmem_cache_free(kvm_vcpu_cache, svm);
706 out:
707         return ERR_PTR(err);
708 }
709
710 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
711 {
712         struct vcpu_svm *svm = to_svm(vcpu);
713
714         __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
715         __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
716         __free_page(virt_to_page(svm->hsave));
717         __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
718         kvm_vcpu_uninit(vcpu);
719         kmem_cache_free(kvm_vcpu_cache, svm);
720 }
721
722 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
723 {
724         struct vcpu_svm *svm = to_svm(vcpu);
725         int i;
726
727         if (unlikely(cpu != vcpu->cpu)) {
728                 u64 tsc_this, delta;
729
730                 /*
731                  * Make sure that the guest sees a monotonically
732                  * increasing TSC.
733                  */
734                 rdtscll(tsc_this);
735                 delta = vcpu->arch.host_tsc - tsc_this;
736                 svm->vmcb->control.tsc_offset += delta;
737                 vcpu->cpu = cpu;
738                 kvm_migrate_timers(vcpu);
739                 svm->asid_generation = 0;
740         }
741
742         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
743                 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
744 }
745
746 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
747 {
748         struct vcpu_svm *svm = to_svm(vcpu);
749         int i;
750
751         ++vcpu->stat.host_state_reload;
752         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
753                 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
754
755         rdtscll(vcpu->arch.host_tsc);
756 }
757
758 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
759 {
760         return to_svm(vcpu)->vmcb->save.rflags;
761 }
762
763 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
764 {
765         to_svm(vcpu)->vmcb->save.rflags = rflags;
766 }
767
768 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
769 {
770         switch (reg) {
771         case VCPU_EXREG_PDPTR:
772                 BUG_ON(!npt_enabled);
773                 load_pdptrs(vcpu, vcpu->arch.cr3);
774                 break;
775         default:
776                 BUG();
777         }
778 }
779
780 static void svm_set_vintr(struct vcpu_svm *svm)
781 {
782         svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
783 }
784
785 static void svm_clear_vintr(struct vcpu_svm *svm)
786 {
787         svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
788 }
789
790 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
791 {
792         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
793
794         switch (seg) {
795         case VCPU_SREG_CS: return &save->cs;
796         case VCPU_SREG_DS: return &save->ds;
797         case VCPU_SREG_ES: return &save->es;
798         case VCPU_SREG_FS: return &save->fs;
799         case VCPU_SREG_GS: return &save->gs;
800         case VCPU_SREG_SS: return &save->ss;
801         case VCPU_SREG_TR: return &save->tr;
802         case VCPU_SREG_LDTR: return &save->ldtr;
803         }
804         BUG();
805         return NULL;
806 }
807
808 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
809 {
810         struct vmcb_seg *s = svm_seg(vcpu, seg);
811
812         return s->base;
813 }
814
815 static void svm_get_segment(struct kvm_vcpu *vcpu,
816                             struct kvm_segment *var, int seg)
817 {
818         struct vmcb_seg *s = svm_seg(vcpu, seg);
819
820         var->base = s->base;
821         var->limit = s->limit;
822         var->selector = s->selector;
823         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
824         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
825         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
826         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
827         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
828         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
829         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
830         var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
831
832         /* AMD's VMCB does not have an explicit unusable field, so emulate it
833          * for cross-vendor migration by treating a not-present segment as unusable.
834          */
835         var->unusable = !var->present || (var->type == 0);
836
837         switch (seg) {
838         case VCPU_SREG_CS:
839                 /*
840                  * SVM always stores 0 for the 'G' bit in the CS selector in
841                  * the VMCB on a VMEXIT. This hurts cross-vendor migration:
842                  * Intel's VMENTRY has a check on the 'G' bit.
843                  */
844                 var->g = s->limit > 0xfffff;
845                 break;
846         case VCPU_SREG_TR:
847                 /*
848                  * Work around a bug where the busy flag in the tr selector
849                  * isn't exposed
850                  */
851                 var->type |= 0x2;
852                 break;
853         case VCPU_SREG_DS:
854         case VCPU_SREG_ES:
855         case VCPU_SREG_FS:
856         case VCPU_SREG_GS:
857                 /*
858                  * The accessed bit must always be set in the segment
859                  * descriptor cache: even if it is cleared in the descriptor
860                  * itself, the cached copy remains 1. Since Intel has a
861                  * check on this bit, set it here to support cross-vendor
862                  * migration.
863                  */
864                 if (!var->unusable)
865                         var->type |= 0x1;
866                 break;
867         case VCPU_SREG_SS:
868                 /* On AMD CPUs sometimes the DB bit in the segment
869                  * descriptor is left as 1, although the whole segment has
870                  * been made unusable. Clear it here to pass an Intel VMX
871                  * entry check when cross vendor migrating.
872                  */
873                 if (var->unusable)
874                         var->db = 0;
875                 break;
876         }
877 }
878
879 static int svm_get_cpl(struct kvm_vcpu *vcpu)
880 {
881         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
882
883         return save->cpl;
884 }
885
886 static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
887 {
888         struct vcpu_svm *svm = to_svm(vcpu);
889
890         dt->limit = svm->vmcb->save.idtr.limit;
891         dt->base = svm->vmcb->save.idtr.base;
892 }
893
894 static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
895 {
896         struct vcpu_svm *svm = to_svm(vcpu);
897
898         svm->vmcb->save.idtr.limit = dt->limit;
899         svm->vmcb->save.idtr.base = dt->base;
900 }
901
902 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
903 {
904         struct vcpu_svm *svm = to_svm(vcpu);
905
906         dt->limit = svm->vmcb->save.gdtr.limit;
907         dt->base = svm->vmcb->save.gdtr.base;
908 }
909
910 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
911 {
912         struct vcpu_svm *svm = to_svm(vcpu);
913
914         svm->vmcb->save.gdtr.limit = dt->limit;
915         svm->vmcb->save.gdtr.base = dt->base;
916 }
917
918 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
919 {
920 }
921
922 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
923 {
924         struct vcpu_svm *svm = to_svm(vcpu);
925
926 #ifdef CONFIG_X86_64
927         if (vcpu->arch.shadow_efer & EFER_LME) {
928                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
929                         vcpu->arch.shadow_efer |= EFER_LMA;
930                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
931                 }
932
933                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
934                         vcpu->arch.shadow_efer &= ~EFER_LMA;
935                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
936                 }
937         }
938 #endif
939         if (npt_enabled)
940                 goto set;
941
942         if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
943                 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
944                 vcpu->fpu_active = 1;
945         }
946
947         vcpu->arch.cr0 = cr0;
948         cr0 |= X86_CR0_PG | X86_CR0_WP;
949         if (!vcpu->fpu_active) {
950                 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
951                 cr0 |= X86_CR0_TS;
952         }
953 set:
954          * Re-enable caching here because the QEMU BIOS does
955          * not do it; leaving the cache disabled causes a
956          * noticeable delay at reboot.
957          * reboot
958          */
959         cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
960         svm->vmcb->save.cr0 = cr0;
961 }
962
963 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
964 {
965         unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
966         unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
967
968         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
969                 force_new_asid(vcpu);
970
971         vcpu->arch.cr4 = cr4;
972         if (!npt_enabled)
973                 cr4 |= X86_CR4_PAE;
974         cr4 |= host_cr4_mce;
975         to_svm(vcpu)->vmcb->save.cr4 = cr4;
976 }
977
978 static void svm_set_segment(struct kvm_vcpu *vcpu,
979                             struct kvm_segment *var, int seg)
980 {
981         struct vcpu_svm *svm = to_svm(vcpu);
982         struct vmcb_seg *s = svm_seg(vcpu, seg);
983
984         s->base = var->base;
985         s->limit = var->limit;
986         s->selector = var->selector;
987         if (var->unusable)
988                 s->attrib = 0;
989         else {
990                 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
991                 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
992                 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
993                 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
994                 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
995                 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
996                 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
997                 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
998         }
999         if (seg == VCPU_SREG_CS)
1000                 svm->vmcb->save.cpl
1001                         = (svm->vmcb->save.cs.attrib
1002                            >> SVM_SELECTOR_DPL_SHIFT) & 3;
1003
1004 }
1005
1006 static void update_db_intercept(struct kvm_vcpu *vcpu)
1007 {
1008         struct vcpu_svm *svm = to_svm(vcpu);
1009
1010         svm->vmcb->control.intercept_exceptions &=
1011                 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1012
1013         if (vcpu->arch.singlestep)
1014                 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1015
1016         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1017                 if (vcpu->guest_debug &
1018                     (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1019                         svm->vmcb->control.intercept_exceptions |=
1020                                 1 << DB_VECTOR;
1021                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1022                         svm->vmcb->control.intercept_exceptions |=
1023                                 1 << BP_VECTOR;
1024         } else
1025                 vcpu->guest_debug = 0;
1026 }
1027
1028 static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1029 {
1030         int old_debug = vcpu->guest_debug;
1031         struct vcpu_svm *svm = to_svm(vcpu);
1032
1033         vcpu->guest_debug = dbg->control;
1034
1035         update_db_intercept(vcpu);
1036
1037         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1038                 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1039         else
1040                 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1041
1042         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1043                 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1044         else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1045                 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1046
1047         return 0;
1048 }
1049
1050 static void load_host_msrs(struct kvm_vcpu *vcpu)
1051 {
1052 #ifdef CONFIG_X86_64
1053         wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1054 #endif
1055 }
1056
1057 static void save_host_msrs(struct kvm_vcpu *vcpu)
1058 {
1059 #ifdef CONFIG_X86_64
1060         rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1061 #endif
1062 }
1063
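/*
 * Hand out a fresh ASID from this CPU's pool; when the pool is
 * exhausted, start a new generation and have the hardware flush
 * all ASIDs on the next VMRUN.
 */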
1064 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
1065 {
1066         if (svm_data->next_asid > svm_data->max_asid) {
1067                 ++svm_data->asid_generation;
1068                 svm_data->next_asid = 1;
1069                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1070         }
1071
1072         svm->asid_generation = svm_data->asid_generation;
1073         svm->vmcb->control.asid = svm_data->next_asid++;
1074 }
1075
1076 static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
1077 {
1078         struct vcpu_svm *svm = to_svm(vcpu);
1079         unsigned long val;
1080
1081         switch (dr) {
1082         case 0 ... 3:
1083                 val = vcpu->arch.db[dr];
1084                 break;
1085         case 6:
1086                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1087                         val = vcpu->arch.dr6;
1088                 else
1089                         val = svm->vmcb->save.dr6;
1090                 break;
1091         case 7:
1092                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1093                         val = vcpu->arch.dr7;
1094                 else
1095                         val = svm->vmcb->save.dr7;
1096                 break;
1097         default:
1098                 val = 0;
1099         }
1100
1101         return val;
1102 }
1103
1104 static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1105                        int *exception)
1106 {
1107         struct vcpu_svm *svm = to_svm(vcpu);
1108
1109         *exception = 0;
1110
1111         switch (dr) {
1112         case 0 ... 3:
1113                 vcpu->arch.db[dr] = value;
1114                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1115                         vcpu->arch.eff_db[dr] = value;
1116                 return;
1117         case 4 ... 5:
1118                 if (vcpu->arch.cr4 & X86_CR4_DE)
1119                         *exception = UD_VECTOR;
1120                 return;
1121         case 6:
1122                 if (value & 0xffffffff00000000ULL) {
1123                         *exception = GP_VECTOR;
1124                         return;
1125                 }
1126                 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1127                 return;
1128         case 7:
1129                 if (value & 0xffffffff00000000ULL) {
1130                         *exception = GP_VECTOR;
1131                         return;
1132                 }
1133                 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1134                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1135                         svm->vmcb->save.dr7 = vcpu->arch.dr7;
1136                         vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1137                 }
1138                 return;
1139         default:
1140                 /* FIXME: Possible case? */
1141                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1142                        __func__, dr);
1143                 *exception = UD_VECTOR;
1144                 return;
1145         }
1146 }
1147
1148 static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1149 {
1150         u64 fault_address;
1151         u32 error_code;
1152
1153         fault_address  = svm->vmcb->control.exit_info_2;
1154         error_code = svm->vmcb->control.exit_info_1;
1155
1156         trace_kvm_page_fault(fault_address, error_code);
1157         /*
1158          * FIXME: This shouldn't be necessary here, but there is a flush
1159          * missing in the MMU code. Until we find this bug, flush the
1160          * complete TLB here on an NPF.
1161          */
1162         if (npt_enabled)
1163                 svm_flush_tlb(&svm->vcpu);
1164         else {
1165                 if (kvm_event_needs_reinjection(&svm->vcpu))
1166                         kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1167         }
1168         return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1169 }
1170
1171 static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1172 {
1173         if (!(svm->vcpu.guest_debug &
1174               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1175                 !svm->vcpu.arch.singlestep) {
1176                 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1177                 return 1;
1178         }
1179
1180         if (svm->vcpu.arch.singlestep) {
1181                 svm->vcpu.arch.singlestep = false;
1182                 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1183                         svm->vmcb->save.rflags &=
1184                                 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1185                 update_db_intercept(&svm->vcpu);
1186         }
1187
1188         if (svm->vcpu.guest_debug &
1189             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1190                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1191                 kvm_run->debug.arch.pc =
1192                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1193                 kvm_run->debug.arch.exception = DB_VECTOR;
1194                 return 0;
1195         }
1196
1197         return 1;
1198 }
1199
1200 static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1201 {
1202         kvm_run->exit_reason = KVM_EXIT_DEBUG;
1203         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1204         kvm_run->debug.arch.exception = BP_VECTOR;
1205         return 0;
1206 }
1207
1208 static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1209 {
1210         int er;
1211
1212         er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
1213         if (er != EMULATE_DONE)
1214                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1215         return 1;
1216 }
1217
1218 static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1219 {
1220         svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1221         if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
1222                 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
1223         svm->vcpu.fpu_active = 1;
1224
1225         return 1;
1226 }
1227
1228 static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1229 {
1230         /*
1231          * On an #MC intercept the MCE handler is not called automatically in
1232          * the host. So do it by hand here.
1233          */
1234         asm volatile (
1235                 "int $0x12\n");
1236         /* not sure if we ever come back to this point */
1237
1238         return 1;
1239 }
1240
1241 static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1242 {
1243         /*
1244          * VMCB is undefined after a SHUTDOWN intercept
1245          * so reinitialize it.
1246          */
1247         clear_page(svm->vmcb);
1248         init_vmcb(svm);
1249
1250         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1251         return 0;
1252 }
1253
1254 static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1255 {
1256         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1257         int size, in, string;
1258         unsigned port;
1259
1260         ++svm->vcpu.stat.io_exits;
1261
1262         svm->next_rip = svm->vmcb->control.exit_info_2;
1263
1264         string = (io_info & SVM_IOIO_STR_MASK) != 0;
1265
1266         if (string) {
1267                 if (emulate_instruction(&svm->vcpu,
1268                                         kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1269                         return 0;
1270                 return 1;
1271         }
1272
1273         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1274         port = io_info >> 16;
1275         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1276
1277         skip_emulated_instruction(&svm->vcpu);
1278         return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1279 }
1280
1281 static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1282 {
1283         return 1;
1284 }
1285
1286 static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1287 {
1288         ++svm->vcpu.stat.irq_exits;
1289         return 1;
1290 }
1291
1292 static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1293 {
1294         return 1;
1295 }
1296
1297 static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1298 {
1299         svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1300         skip_emulated_instruction(&svm->vcpu);
1301         return kvm_emulate_halt(&svm->vcpu);
1302 }
1303
1304 static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1305 {
1306         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1307         skip_emulated_instruction(&svm->vcpu);
1308         kvm_emulate_hypercall(&svm->vcpu);
1309         return 1;
1310 }
1311
1312 static int nested_svm_check_permissions(struct vcpu_svm *svm)
1313 {
1314         if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
1315             || !is_paging(&svm->vcpu)) {
1316                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1317                 return 1;
1318         }
1319
1320         if (svm->vmcb->save.cpl) {
1321                 kvm_inject_gp(&svm->vcpu, 0);
1322                 return 1;
1323         }
1324
1325         return 0;
1326 }
1327
1328 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1329                                       bool has_error_code, u32 error_code)
1330 {
1331         if (is_nested(svm)) {
1332                 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1333                 svm->vmcb->control.exit_code_hi = 0;
1334                 svm->vmcb->control.exit_info_1 = error_code;
1335                 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1336                 if (nested_svm_exit_handled(svm, false)) {
1337                         nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
1338
1339                         nested_svm_vmexit(svm);
1340                         return 1;
1341                 }
1342         }
1343
1344         return 0;
1345 }
1346
1347 static inline int nested_svm_intr(struct vcpu_svm *svm)
1348 {
1349         if (is_nested(svm)) {
1350                 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1351                         return 0;
1352
1353                 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1354                         return 0;
1355
1356                 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1357
1358                 if (nested_svm_exit_handled(svm, false)) {
1359                         nsvm_printk("VMexit -> INTR\n");
1360                         nested_svm_vmexit(svm);
1361                         return 1;
1362                 }
1363         }
1364
1365         return 0;
1366 }
1367
1368 static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
1369 {
1370         struct page *page;
1371
1372         down_read(&current->mm->mmap_sem);
1373         page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1374         up_read(&current->mm->mmap_sem);
1375
1376         if (is_error_page(page)) {
1377                 printk(KERN_INFO "%s: could not find page at 0x%llx\n",
1378                        __func__, gpa);
1379                 kvm_release_page_clean(page);
1380                 kvm_inject_gp(&svm->vcpu, 0);
1381                 return NULL;
1382         }
1383         return page;
1384 }
1385
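/*
 * Map one or two guest-physical pages (e.g. the nested VMCB and its
 * MSR permission map), run the given handler on the mapped pointers,
 * then unmap and release the pages again.
 */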
1386 static int nested_svm_do(struct vcpu_svm *svm,
1387                          u64 arg1_gpa, u64 arg2_gpa, void *opaque,
1388                          int (*handler)(struct vcpu_svm *svm,
1389                                         void *arg1,
1390                                         void *arg2,
1391                                         void *opaque))
1392 {
1393         struct page *arg1_page;
1394         struct page *arg2_page = NULL;
1395         void *arg1;
1396         void *arg2 = NULL;
1397         int retval;
1398
1399         arg1_page = nested_svm_get_page(svm, arg1_gpa);
1400         if (arg1_page == NULL)
1401                 return 1;
1402
1403         if (arg2_gpa) {
1404                 arg2_page = nested_svm_get_page(svm, arg2_gpa);
1405                 if (arg2_page == NULL) {
1406                         kvm_release_page_clean(arg1_page);
1407                         return 1;
1408                 }
1409         }
1410
1411         arg1 = kmap_atomic(arg1_page, KM_USER0);
1412         if (arg2_gpa)
1413                 arg2 = kmap_atomic(arg2_page, KM_USER1);
1414
1415         retval = handler(svm, arg1, arg2, opaque);
1416
1417         kunmap_atomic(arg1, KM_USER0);
1418         if (arg2_gpa)
1419                 kunmap_atomic(arg2, KM_USER1);
1420
1421         kvm_release_page_dirty(arg1_page);
1422         if (arg2_gpa)
1423                 kvm_release_page_dirty(arg2_page);
1424
1425         return retval;
1426 }
1427
1428 static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
1429                                         void *arg1,
1430                                         void *arg2,
1431                                         void *opaque)
1432 {
1433         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1434         bool kvm_overrides = *(bool *)opaque;
1435         u32 exit_code = svm->vmcb->control.exit_code;
1436
1437         if (kvm_overrides) {
1438                 switch (exit_code) {
1439                 case SVM_EXIT_INTR:
1440                 case SVM_EXIT_NMI:
1441                         return 0;
1442                 /* For now we are always handling NPFs when using them */
1443                 case SVM_EXIT_NPF:
1444                         if (npt_enabled)
1445                                 return 0;
1446                         break;
1447                 /* When we're shadowing, trap PFs */
1448                 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1449                         if (!npt_enabled)
1450                                 return 0;
1451                         break;
1452                 default:
1453                         break;
1454                 }
1455         }
1456
1457         switch (exit_code) {
1458         case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1459                 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1460                 if (nested_vmcb->control.intercept_cr_read & cr_bits)
1461                         return 1;
1462                 break;
1463         }
1464         case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
1465                 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
1466                 if (nested_vmcb->control.intercept_cr_write & cr_bits)
1467                         return 1;
1468                 break;
1469         }
1470         case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1471                 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1472                 if (nested_vmcb->control.intercept_dr_read & dr_bits)
1473                         return 1;
1474                 break;
1475         }
1476         case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1477                 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1478                 if (nested_vmcb->control.intercept_dr_write & dr_bits)
1479                         return 1;
1480                 break;
1481         }
1482         case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1483                 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1484                 if (nested_vmcb->control.intercept_exceptions & excp_bits)
1485                         return 1;
1486                 break;
1487         }
1488         default: {
1489                 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1490                 nsvm_printk("exit code: 0x%x\n", exit_code);
1491                 if (nested_vmcb->control.intercept & exit_bits)
1492                         return 1;
1493         }
1494         }
1495
1496         return 0;
1497 }
1498
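/*
 * Consult the nested guest's MSR permission map to decide whether an
 * MSR intercept must be forwarded to the guest hypervisor; bit 0 of
 * exit_info_1 distinguishes a read (0) from a write (1) access.
 */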
1499 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
1500                                        void *arg1, void *arg2,
1501                                        void *opaque)
1502 {
1503         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1504         u8 *msrpm = (u8 *)arg2;
1505         u32 t0, t1;
1506         u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1507         u32 param = svm->vmcb->control.exit_info_1 & 1;
1508
1509         if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1510                 return 0;
1511
1512         switch (msr) {
1513         case 0 ... 0x1fff:
1514                 t0 = (msr * 2) % 8;
1515                 t1 = msr / 8;
1516                 break;
1517         case 0xc0000000 ... 0xc0001fff:
1518                 t0 = (8192 + msr - 0xc0000000) * 2;
1519                 t1 = (t0 / 8);
1520                 t0 %= 8;
1521                 break;
1522         case 0xc0010000 ... 0xc0011fff:
1523                 t0 = (16384 + msr - 0xc0010000) * 2;
1524                 t1 = (t0 / 8);
1525                 t0 %= 8;
1526                 break;
1527         default:
1528                 return 1;
1529                 break;
1530         }
1531         if (msrpm[t1] & ((1 << param) << t0))
1532                 return 1;
1533
1534         return 0;
1535 }
1536
1537 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
1538 {
1539         bool k = kvm_override;
1540
1541         switch (svm->vmcb->control.exit_code) {
1542         case SVM_EXIT_MSR:
1543                 return nested_svm_do(svm, svm->nested_vmcb,
1544                                      svm->nested_vmcb_msrpm, NULL,
1545                                      nested_svm_exit_handled_msr);
1546         default: break;
1547         }
1548
1549         return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
1550                              nested_svm_exit_handled_real);
1551 }
1552
1553 static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
1554                                   void *arg2, void *opaque)
1555 {
1556         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1557         struct vmcb *hsave = svm->hsave;
1558         u64 nested_save[] = { nested_vmcb->save.cr0,
1559                               nested_vmcb->save.cr3,
1560                               nested_vmcb->save.cr4,
1561                               nested_vmcb->save.efer,
1562                               nested_vmcb->control.intercept_cr_read,
1563                               nested_vmcb->control.intercept_cr_write,
1564                               nested_vmcb->control.intercept_dr_read,
1565                               nested_vmcb->control.intercept_dr_write,
1566                               nested_vmcb->control.intercept_exceptions,
1567                               nested_vmcb->control.intercept,
1568                               nested_vmcb->control.msrpm_base_pa,
1569                               nested_vmcb->control.iopm_base_pa,
1570                               nested_vmcb->control.tsc_offset };
1571
1572         /* Give the current vmcb to the guest */
1573         memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
1574         nested_vmcb->save.cr0 = nested_save[0];
1575         if (!npt_enabled)
1576                 nested_vmcb->save.cr3 = nested_save[1];
1577         nested_vmcb->save.cr4 = nested_save[2];
1578         nested_vmcb->save.efer = nested_save[3];
1579         nested_vmcb->control.intercept_cr_read = nested_save[4];
1580         nested_vmcb->control.intercept_cr_write = nested_save[5];
1581         nested_vmcb->control.intercept_dr_read = nested_save[6];
1582         nested_vmcb->control.intercept_dr_write = nested_save[7];
1583         nested_vmcb->control.intercept_exceptions = nested_save[8];
1584         nested_vmcb->control.intercept = nested_save[9];
1585         nested_vmcb->control.msrpm_base_pa = nested_save[10];
1586         nested_vmcb->control.iopm_base_pa = nested_save[11];
1587         nested_vmcb->control.tsc_offset = nested_save[12];
1588
1589         /* We always set V_INTR_MASKING and remember the old value in hflags */
1590         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1591                 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
1592
1593         if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
1594             (nested_vmcb->control.int_vector)) {
1595                 nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
1596                                 nested_vmcb->control.int_vector);
1597         }
1598
1599         /* Restore the original control entries */
1600         svm->vmcb->control = hsave->control;
1601
1602         /* Kill any pending exceptions */
1603         if (svm->vcpu.arch.exception.pending)
1604                 nsvm_printk("WARNING: Pending Exception\n");
1605         kvm_clear_exception_queue(&svm->vcpu);
1606         kvm_clear_interrupt_queue(&svm->vcpu);
1607
1608         /* Restore selected save entries */
1609         svm->vmcb->save.es = hsave->save.es;
1610         svm->vmcb->save.cs = hsave->save.cs;
1611         svm->vmcb->save.ss = hsave->save.ss;
1612         svm->vmcb->save.ds = hsave->save.ds;
1613         svm->vmcb->save.gdtr = hsave->save.gdtr;
1614         svm->vmcb->save.idtr = hsave->save.idtr;
1615         svm->vmcb->save.rflags = hsave->save.rflags;
1616         svm_set_efer(&svm->vcpu, hsave->save.efer);
1617         svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
1618         svm_set_cr4(&svm->vcpu, hsave->save.cr4);
1619         if (npt_enabled) {
1620                 svm->vmcb->save.cr3 = hsave->save.cr3;
1621                 svm->vcpu.arch.cr3 = hsave->save.cr3;
1622         } else {
1623                 kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1624         }
1625         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1626         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
1627         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
1628         svm->vmcb->save.dr7 = 0;
1629         svm->vmcb->save.cpl = 0;
1630         svm->vmcb->control.exit_int_info = 0;
1631
1632         svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1633         /* Exit nested SVM mode */
1634         svm->nested_vmcb = 0;
1635
1636         return 0;
1637 }
1638
1639 static int nested_svm_vmexit(struct vcpu_svm *svm)
1640 {
1641         nsvm_printk("VMexit\n");
1642         if (nested_svm_do(svm, svm->nested_vmcb, 0,
1643                           NULL, nested_svm_vmexit_real))
1644                 return 1;
1645
1646         kvm_mmu_reset_context(&svm->vcpu);
1647         kvm_mmu_load(&svm->vcpu);
1648
1649         return 0;
1650 }
1651
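/*
 * Build the MSR permission bitmap used while the nested guest runs by
 * ORing KVM's own bitmap with the one supplied by the guest hypervisor:
 * an MSR access is intercepted if either party wants it intercepted.
 */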
1652 static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
1653                                   void *arg2, void *opaque)
1654 {
1655         int i;
1656         u32 *nested_msrpm = (u32 *)arg1;
1657         for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
1658                 svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
1659         svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
1660
1661         return 0;
1662 }
1663
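/*
 * Emulated VMRUN: stash the current VMCB in hsave, load the nested
 * guest's save state and control fields into the hardware VMCB (the
 * intercept masks are ORed so KVM keeps its own intercepts), and set
 * GIF so the nested guest can start running.
 */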
1664 static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1665                             void *arg2, void *opaque)
1666 {
1667         struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1668         struct vmcb *hsave = svm->hsave;
1669
1670         /* nested_vmcb is our indicator if nested SVM is activated */
1671         svm->nested_vmcb = svm->vmcb->save.rax;
1672
1673         /* Clear internal status */
1674         kvm_clear_exception_queue(&svm->vcpu);
1675         kvm_clear_interrupt_queue(&svm->vcpu);
1676
1677         /* Save the whole current vmcb so we don't have to pick individual
1678            fields to save; everything can be restored when a #VMEXIT occurs */
1679         memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
1680         /* We need to remember the original CR3 in the SPT case */
1681         if (!npt_enabled)
1682                 hsave->save.cr3 = svm->vcpu.arch.cr3;
1683         hsave->save.cr4 = svm->vcpu.arch.cr4;
1684         hsave->save.rip = svm->next_rip;
1685
1686         if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
1687                 svm->vcpu.arch.hflags |= HF_HIF_MASK;
1688         else
1689                 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
1690
1691         /* Load the nested guest state */
1692         svm->vmcb->save.es = nested_vmcb->save.es;
1693         svm->vmcb->save.cs = nested_vmcb->save.cs;
1694         svm->vmcb->save.ss = nested_vmcb->save.ss;
1695         svm->vmcb->save.ds = nested_vmcb->save.ds;
1696         svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
1697         svm->vmcb->save.idtr = nested_vmcb->save.idtr;
1698         svm->vmcb->save.rflags = nested_vmcb->save.rflags;
1699         svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
1700         svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
1701         svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
1702         if (npt_enabled) {
1703                 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1704                 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1705         } else {
1706                 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1707                 kvm_mmu_reset_context(&svm->vcpu);
1708         }
1709         svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
1710         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1711         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1712         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
1713         /* In case we don't even reach vcpu_run, the fields are not updated */
1714         svm->vmcb->save.rax = nested_vmcb->save.rax;
1715         svm->vmcb->save.rsp = nested_vmcb->save.rsp;
1716         svm->vmcb->save.rip = nested_vmcb->save.rip;
1717         svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
1718         svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1719         svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1720
1721         /* The nested guest must not end up with fewer intercepts than KVM
1722            already enforces for its guest, so the two intercept sets are ORed */
1723         svm->vmcb->control.intercept_cr_read |=
1724                 nested_vmcb->control.intercept_cr_read;
1725         svm->vmcb->control.intercept_cr_write |=
1726                 nested_vmcb->control.intercept_cr_write;
1727         svm->vmcb->control.intercept_dr_read |=
1728                 nested_vmcb->control.intercept_dr_read;
1729         svm->vmcb->control.intercept_dr_write |=
1730                 nested_vmcb->control.intercept_dr_write;
1731         svm->vmcb->control.intercept_exceptions |=
1732                 nested_vmcb->control.intercept_exceptions;
1733
1734         svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1735
1736         svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1737
1738         force_new_asid(&svm->vcpu);
1739         svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1740         svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1741         svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1742         if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1743                 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1744                                 nested_vmcb->control.int_ctl);
1745         }
1746         if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1747                 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1748         else
1749                 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1750
1751         nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1752                         nested_vmcb->control.exit_int_info,
1753                         nested_vmcb->control.int_state);
1754
1755         svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1756         svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1757         svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1758         if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1759                 nsvm_printk("Injecting Event: 0x%x\n",
1760                                 nested_vmcb->control.event_inj);
1761         svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1762         svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1763
1764         svm->vcpu.arch.hflags |= HF_GIF_MASK;
1765
1766         return 0;
1767 }
1768
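/*
 * Common helper for VMLOAD/VMSAVE emulation: copy the "additional state"
 * (FS, GS, TR, LDTR, the SYSCALL/SYSENTER MSRs and KernelGsBase) in the
 * direction chosen by the caller.
 */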
1769 static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1770 {
1771         to_vmcb->save.fs = from_vmcb->save.fs;
1772         to_vmcb->save.gs = from_vmcb->save.gs;
1773         to_vmcb->save.tr = from_vmcb->save.tr;
1774         to_vmcb->save.ldtr = from_vmcb->save.ldtr;
1775         to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
1776         to_vmcb->save.star = from_vmcb->save.star;
1777         to_vmcb->save.lstar = from_vmcb->save.lstar;
1778         to_vmcb->save.cstar = from_vmcb->save.cstar;
1779         to_vmcb->save.sfmask = from_vmcb->save.sfmask;
1780         to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
1781         to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
1782         to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1783
1784         return 1;
1785 }
1786
1787 static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
1788                              void *arg2, void *opaque)
1789 {
1790         return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
1791 }
1792
1793 static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
1794                              void *arg2, void *opaque)
1795 {
1796         return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
1797 }
1798
1799 static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1800 {
1801         if (nested_svm_check_permissions(svm))
1802                 return 1;
1803
1804         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1805         skip_emulated_instruction(&svm->vcpu);
1806
1807         nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
1808
1809         return 1;
1810 }
1811
1812 static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1813 {
1814         if (nested_svm_check_permissions(svm))
1815                 return 1;
1816
1817         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1818         skip_emulated_instruction(&svm->vcpu);
1819
1820         nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
1821
1822         return 1;
1823 }
1824
1825 static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1826 {
1827         nsvm_printk("VMrun\n");
1828         if (nested_svm_check_permissions(svm))
1829                 return 1;
1830
1831         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1832         skip_emulated_instruction(&svm->vcpu);
1833
1834         if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
1835                           NULL, nested_svm_vmrun))
1836                 return 1;
1837
1838         if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
1839                       NULL, nested_svm_vmrun_msrpm))
1840                 return 1;
1841
1842         return 1;
1843 }
1844
1845 static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1846 {
1847         if (nested_svm_check_permissions(svm))
1848                 return 1;
1849
1850         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1851         skip_emulated_instruction(&svm->vcpu);
1852
1853         svm->vcpu.arch.hflags |= HF_GIF_MASK;
1854
1855         return 1;
1856 }
1857
1858 static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1859 {
1860         if (nested_svm_check_permissions(svm))
1861                 return 1;
1862
1863         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1864         skip_emulated_instruction(&svm->vcpu);
1865
1866         svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1867
1868         /* After a CLGI no interrupts should come */
1869         svm_clear_vintr(svm);
1870         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1871
1872         return 1;
1873 }
1874
1875 static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1876 {
1877         struct kvm_vcpu *vcpu = &svm->vcpu;
1878         nsvm_printk("INVLPGA\n");
1879
1880         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1881         kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
1882
1883         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1884         skip_emulated_instruction(&svm->vcpu);
1885         return 1;
1886 }
1887
1888 static int invalid_op_interception(struct vcpu_svm *svm,
1889                                    struct kvm_run *kvm_run)
1890 {
1891         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1892         return 1;
1893 }
1894
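/*
 * Task switch intercept: recover the TSS selector and the switch reason
 * from exit_info_1/2, drop an event that was in flight when the switch
 * came through a task gate (it is delivered by the switch itself), and
 * let the generic kvm_task_switch() emulation do the rest.
 */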
1895 static int task_switch_interception(struct vcpu_svm *svm,
1896                                     struct kvm_run *kvm_run)
1897 {
1898         u16 tss_selector;
1899         int reason;
1900         int int_type = svm->vmcb->control.exit_int_info &
1901                 SVM_EXITINTINFO_TYPE_MASK;
1902         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
1903         uint32_t type =
1904                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
1905         uint32_t idt_v =
1906                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
1907
1908         tss_selector = (u16)svm->vmcb->control.exit_info_1;
1909
1910         if (svm->vmcb->control.exit_info_2 &
1911             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
1912                 reason = TASK_SWITCH_IRET;
1913         else if (svm->vmcb->control.exit_info_2 &
1914                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
1915                 reason = TASK_SWITCH_JMP;
1916         else if (idt_v)
1917                 reason = TASK_SWITCH_GATE;
1918         else
1919                 reason = TASK_SWITCH_CALL;
1920
1921         if (reason == TASK_SWITCH_GATE) {
1922                 switch (type) {
1923                 case SVM_EXITINTINFO_TYPE_NMI:
1924                         svm->vcpu.arch.nmi_injected = false;
1925                         break;
1926                 case SVM_EXITINTINFO_TYPE_EXEPT:
1927                         kvm_clear_exception_queue(&svm->vcpu);
1928                         break;
1929                 case SVM_EXITINTINFO_TYPE_INTR:
1930                         kvm_clear_interrupt_queue(&svm->vcpu);
1931                         break;
1932                 default:
1933                         break;
1934                 }
1935         }
1936
1937         if (reason != TASK_SWITCH_GATE ||
1938             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
1939             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
1940              (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
1941                 skip_emulated_instruction(&svm->vcpu);
1942
1943         return kvm_task_switch(&svm->vcpu, tss_selector, reason);
1944 }
1945
1946 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1947 {
1948         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1949         kvm_emulate_cpuid(&svm->vcpu);
1950         return 1;
1951 }
1952
1953 static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1954 {
1955         ++svm->vcpu.stat.nmi_window_exits;
1956         svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
1957         svm->vcpu.arch.hflags |= HF_IRET_MASK;
1958         return 1;
1959 }
1960
1961 static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1962 {
1963         if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
1964                 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1965         return 1;
1966 }
1967
1968 static int emulate_on_interception(struct vcpu_svm *svm,
1969                                    struct kvm_run *kvm_run)
1970 {
1971         if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1972                 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1973         return 1;
1974 }
1975
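/*
 * CR8 writes are emulated (the emulator ends up in kvm_set_cr8()).  With
 * an in-kernel irqchip the write intercept is dropped again here
 * (update_cr8_intercept() re-arms it when needed); with a userspace
 * irqchip we exit with KVM_EXIT_SET_TPR whenever the TPR is lowered.
 */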
1976 static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1977 {
1978         u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
1979         /* instruction emulation calls kvm_set_cr8() */
1980         emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1981         if (irqchip_in_kernel(svm->vcpu.kvm)) {
1982                 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
1983                 return 1;
1984         }
1985         if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
1986                 return 1;
1987         kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1988         return 0;
1989 }
1990
1991 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1992 {
1993         struct vcpu_svm *svm = to_svm(vcpu);
1994
1995         switch (ecx) {
1996         case MSR_IA32_TSC: {
1997                 u64 tsc;
1998
1999                 rdtscll(tsc);
2000                 *data = svm->vmcb->control.tsc_offset + tsc;
2001                 break;
2002         }
2003         case MSR_K6_STAR:
2004                 *data = svm->vmcb->save.star;
2005                 break;
2006 #ifdef CONFIG_X86_64
2007         case MSR_LSTAR:
2008                 *data = svm->vmcb->save.lstar;
2009                 break;
2010         case MSR_CSTAR:
2011                 *data = svm->vmcb->save.cstar;
2012                 break;
2013         case MSR_KERNEL_GS_BASE:
2014                 *data = svm->vmcb->save.kernel_gs_base;
2015                 break;
2016         case MSR_SYSCALL_MASK:
2017                 *data = svm->vmcb->save.sfmask;
2018                 break;
2019 #endif
2020         case MSR_IA32_SYSENTER_CS:
2021                 *data = svm->vmcb->save.sysenter_cs;
2022                 break;
2023         case MSR_IA32_SYSENTER_EIP:
2024                 *data = svm->sysenter_eip;
2025                 break;
2026         case MSR_IA32_SYSENTER_ESP:
2027                 *data = svm->sysenter_esp;
2028                 break;
2029         /* Nobody will change the following 5 values in the VMCB so
2030            we can safely return them on rdmsr. They will always be 0
2031            until LBRV is implemented. */
2032         case MSR_IA32_DEBUGCTLMSR:
2033                 *data = svm->vmcb->save.dbgctl;
2034                 break;
2035         case MSR_IA32_LASTBRANCHFROMIP:
2036                 *data = svm->vmcb->save.br_from;
2037                 break;
2038         case MSR_IA32_LASTBRANCHTOIP:
2039                 *data = svm->vmcb->save.br_to;
2040                 break;
2041         case MSR_IA32_LASTINTFROMIP:
2042                 *data = svm->vmcb->save.last_excp_from;
2043                 break;
2044         case MSR_IA32_LASTINTTOIP:
2045                 *data = svm->vmcb->save.last_excp_to;
2046                 break;
2047         case MSR_VM_HSAVE_PA:
2048                 *data = svm->hsave_msr;
2049                 break;
2050         case MSR_VM_CR:
2051                 *data = 0;
2052                 break;
2053         case MSR_IA32_UCODE_REV:
2054                 *data = 0x01000065;
2055                 break;
2056         default:
2057                 return kvm_get_msr_common(vcpu, ecx, data);
2058         }
2059         return 0;
2060 }
2061
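/*
 * RDMSR exit: read the MSR, emit the kvm_msr_read trace event and return
 * the value in EDX:EAX; a failed read injects #GP instead.
 */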
2062 static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2063 {
2064         u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2065         u64 data;
2066
2067         if (svm_get_msr(&svm->vcpu, ecx, &data))
2068                 kvm_inject_gp(&svm->vcpu, 0);
2069         else {
2070                 trace_kvm_msr_read(ecx, data);
2071
2072                 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
2073                 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
2074                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2075                 skip_emulated_instruction(&svm->vcpu);
2076         }
2077         return 1;
2078 }
2079
2080 static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2081 {
2082         struct vcpu_svm *svm = to_svm(vcpu);
2083
2084         switch (ecx) {
2085         case MSR_IA32_TSC: {
2086                 u64 tsc;
2087
2088                 rdtscll(tsc);
2089                 svm->vmcb->control.tsc_offset = data - tsc;
2090                 break;
2091         }
2092         case MSR_K6_STAR:
2093                 svm->vmcb->save.star = data;
2094                 break;
2095 #ifdef CONFIG_X86_64
2096         case MSR_LSTAR:
2097                 svm->vmcb->save.lstar = data;
2098                 break;
2099         case MSR_CSTAR:
2100                 svm->vmcb->save.cstar = data;
2101                 break;
2102         case MSR_KERNEL_GS_BASE:
2103                 svm->vmcb->save.kernel_gs_base = data;
2104                 break;
2105         case MSR_SYSCALL_MASK:
2106                 svm->vmcb->save.sfmask = data;
2107                 break;
2108 #endif
2109         case MSR_IA32_SYSENTER_CS:
2110                 svm->vmcb->save.sysenter_cs = data;
2111                 break;
2112         case MSR_IA32_SYSENTER_EIP:
2113                 svm->sysenter_eip = data;
2114                 svm->vmcb->save.sysenter_eip = data;
2115                 break;
2116         case MSR_IA32_SYSENTER_ESP:
2117                 svm->sysenter_esp = data;
2118                 svm->vmcb->save.sysenter_esp = data;
2119                 break;
2120         case MSR_IA32_DEBUGCTLMSR:
2121                 if (!svm_has(SVM_FEATURE_LBRV)) {
2122                         pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2123                                         __func__, data);
2124                         break;
2125                 }
2126                 if (data & DEBUGCTL_RESERVED_BITS)
2127                         return 1;
2128
2129                 svm->vmcb->save.dbgctl = data;
2130                 if (data & (1ULL << 0))
2131                         svm_enable_lbrv(svm);
2132                 else
2133                         svm_disable_lbrv(svm);
2134                 break;
2135         case MSR_VM_HSAVE_PA:
2136                 svm->hsave_msr = data;
2137                 break;
2138         case MSR_VM_CR:
2139         case MSR_VM_IGNNE:
2140         case MSR_K7_HWCR:
2141                 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2142                 break;
2143         default:
2144                 return kvm_set_msr_common(vcpu, ecx, data);
2145         }
2146         return 0;
2147 }
2148
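/*
 * WRMSR exit: trace the attempted write via kvm_msr_write, then apply it;
 * a failed write injects #GP instead of advancing RIP.
 */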
2149 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2150 {
2151         u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2152         u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2153                 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2154
2155         trace_kvm_msr_write(ecx, data);
2156
2157         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2158         if (svm_set_msr(&svm->vcpu, ecx, data))
2159                 kvm_inject_gp(&svm->vcpu, 0);
2160         else
2161                 skip_emulated_instruction(&svm->vcpu);
2162         return 1;
2163 }
2164
2165 static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2166 {
2167         if (svm->vmcb->control.exit_info_1)
2168                 return wrmsr_interception(svm, kvm_run);
2169         else
2170                 return rdmsr_interception(svm, kvm_run);
2171 }
2172
2173 static int interrupt_window_interception(struct vcpu_svm *svm,
2174                                    struct kvm_run *kvm_run)
2175 {
2176         svm_clear_vintr(svm);
2177         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2178         /*
2179          * If user space is waiting to inject interrupts, exit as soon as
2180          * possible
2181          */
2182         if (!irqchip_in_kernel(svm->vcpu.kvm) &&
2183             kvm_run->request_interrupt_window &&
2184             !kvm_cpu_has_interrupt(&svm->vcpu)) {
2185                 ++svm->vcpu.stat.irq_window_exits;
2186                 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2187                 return 0;
2188         }
2189
2190         return 1;
2191 }
2192
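/*
 * Exit handler dispatch table, indexed by SVM exit code.  handle_exit()
 * bounds-checks the exit code against this array before calling through.
 */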
2193 static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2194                                       struct kvm_run *kvm_run) = {
2195         [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
2196         [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
2197         [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
2198         [SVM_EXIT_READ_CR8]                     = emulate_on_interception,
2199         /* for now: */
2200         [SVM_EXIT_WRITE_CR0]                    = emulate_on_interception,
2201         [SVM_EXIT_WRITE_CR3]                    = emulate_on_interception,
2202         [SVM_EXIT_WRITE_CR4]                    = emulate_on_interception,
2203         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
2204         [SVM_EXIT_READ_DR0]                     = emulate_on_interception,
2205         [SVM_EXIT_READ_DR1]                     = emulate_on_interception,
2206         [SVM_EXIT_READ_DR2]                     = emulate_on_interception,
2207         [SVM_EXIT_READ_DR3]                     = emulate_on_interception,
2208         [SVM_EXIT_WRITE_DR0]                    = emulate_on_interception,
2209         [SVM_EXIT_WRITE_DR1]                    = emulate_on_interception,
2210         [SVM_EXIT_WRITE_DR2]                    = emulate_on_interception,
2211         [SVM_EXIT_WRITE_DR3]                    = emulate_on_interception,
2212         [SVM_EXIT_WRITE_DR5]                    = emulate_on_interception,
2213         [SVM_EXIT_WRITE_DR7]                    = emulate_on_interception,
2214         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
2215         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
2216         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
2217         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
2218         [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
2219         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
2220         [SVM_EXIT_INTR]                         = intr_interception,
2221         [SVM_EXIT_NMI]                          = nmi_interception,
2222         [SVM_EXIT_SMI]                          = nop_on_interception,
2223         [SVM_EXIT_INIT]                         = nop_on_interception,
2224         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
2225         /* [SVM_EXIT_CR0_SEL_WRITE]             = emulate_on_interception, */
2226         [SVM_EXIT_CPUID]                        = cpuid_interception,
2227         [SVM_EXIT_IRET]                         = iret_interception,
2228         [SVM_EXIT_INVD]                         = emulate_on_interception,
2229         [SVM_EXIT_HLT]                          = halt_interception,
2230         [SVM_EXIT_INVLPG]                       = invlpg_interception,
2231         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
2232         [SVM_EXIT_IOIO]                         = io_interception,
2233         [SVM_EXIT_MSR]                          = msr_interception,
2234         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
2235         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
2236         [SVM_EXIT_VMRUN]                        = vmrun_interception,
2237         [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
2238         [SVM_EXIT_VMLOAD]                       = vmload_interception,
2239         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
2240         [SVM_EXIT_STGI]                         = stgi_interception,
2241         [SVM_EXIT_CLGI]                         = clgi_interception,
2242         [SVM_EXIT_SKINIT]                       = invalid_op_interception,
2243         [SVM_EXIT_WBINVD]                       = emulate_on_interception,
2244         [SVM_EXIT_MONITOR]                      = invalid_op_interception,
2245         [SVM_EXIT_MWAIT]                        = invalid_op_interception,
2246         [SVM_EXIT_NPF]                          = pf_interception,
2247 };
2248
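/*
 * Main exit handler: emit the kvm_exit trace event, let a nested guest
 * claim the exit (emulated #VMEXIT), resync CR0/CR3 when NPT is active
 * and finally dispatch through svm_exit_handlers[].
 */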
2249 static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2250 {
2251         struct vcpu_svm *svm = to_svm(vcpu);
2252         u32 exit_code = svm->vmcb->control.exit_code;
2253
2254         trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2255
2256         if (is_nested(svm)) {
2257                 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
2258                             exit_code, svm->vmcb->control.exit_info_1,
2259                             svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
2260                 if (nested_svm_exit_handled(svm, true)) {
2261                         nested_svm_vmexit(svm);
2262                         nsvm_printk("-> #VMEXIT\n");
2263                         return 1;
2264                 }
2265         }
2266
2267         if (npt_enabled) {
2268                 int mmu_reload = 0;
2269                 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
2270                         svm_set_cr0(vcpu, svm->vmcb->save.cr0);
2271                         mmu_reload = 1;
2272                 }
2273                 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2274                 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2275                 if (mmu_reload) {
2276                         kvm_mmu_reset_context(vcpu);
2277                         kvm_mmu_load(vcpu);
2278                 }
2279         }
2280
2281
2282         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2283                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2284                 kvm_run->fail_entry.hardware_entry_failure_reason
2285                         = svm->vmcb->control.exit_code;
2286                 return 0;
2287         }
2288
2289         if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2290             exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2291             exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
2292                 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
2293                        "exit_code 0x%x\n",
2294                        __func__, svm->vmcb->control.exit_int_info,
2295                        exit_code);
2296
2297         if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
2298             || !svm_exit_handlers[exit_code]) {
2299                 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2300                 kvm_run->hw.hardware_exit_reason = exit_code;
2301                 return 0;
2302         }
2303
2304         return svm_exit_handlers[exit_code](svm, kvm_run);
2305 }
2306
2307 static void reload_tss(struct kvm_vcpu *vcpu)
2308 {
2309         int cpu = raw_smp_processor_id();
2310
2311         struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
2312         svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
2313         load_TR_desc();
2314 }
2315
2316 static void pre_svm_run(struct vcpu_svm *svm)
2317 {
2318         int cpu = raw_smp_processor_id();
2319
2320         struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
2321
2322         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2323         /* FIXME: handle wraparound of asid_generation */
2324         if (svm->asid_generation != svm_data->asid_generation)
2325                 new_asid(svm, svm_data);
2326 }
2327
2328 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2329 {
2330         struct vcpu_svm *svm = to_svm(vcpu);
2331
2332         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2333         vcpu->arch.hflags |= HF_NMI_MASK;
2334         svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2335         ++vcpu->stat.nmi_injections;
2336 }
2337
2338 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2339 {
2340         struct vmcb_control_area *control;
2341
2342         trace_kvm_inj_virq(irq);
2343
2344         ++svm->vcpu.stat.irq_injections;
2345         control = &svm->vmcb->control;
2346         control->int_vector = irq;
2347         control->int_ctl &= ~V_INTR_PRIO_MASK;
2348         control->int_ctl |= V_IRQ_MASK |
2349                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2350 }
2351
2352 static void svm_set_irq(struct kvm_vcpu *vcpu)
2353 {
2354         struct vcpu_svm *svm = to_svm(vcpu);
2355
2356         BUG_ON(!(svm->vcpu.arch.hflags & HF_GIF_MASK));
2357
2358         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2359                 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2360 }
2361
2362 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2363 {
2364         struct vcpu_svm *svm = to_svm(vcpu);
2365
2366         if (irr == -1)
2367                 return;
2368
2369         if (tpr >= irr)
2370                 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
2371 }
2372
2373 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2374 {
2375         struct vcpu_svm *svm = to_svm(vcpu);
2376         struct vmcb *vmcb = svm->vmcb;
2377         return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2378                 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2379 }
2380
2381 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2382 {
2383         struct vcpu_svm *svm = to_svm(vcpu);
2384         struct vmcb *vmcb = svm->vmcb;
2385         return (vmcb->save.rflags & X86_EFLAGS_IF) &&
2386                 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2387                 (svm->vcpu.arch.hflags & HF_GIF_MASK) &&
2388                 !is_nested(svm);
2389 }
2390
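/*
 * Ask the hardware for a VINTR exit so we notice the moment the guest can
 * accept an interrupt: a dummy virtual interrupt is injected and the
 * VINTR intercept fires once the interrupt window opens.
 */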
2391 static void enable_irq_window(struct kvm_vcpu *vcpu)
2392 {
2393         struct vcpu_svm *svm = to_svm(vcpu);
2394         nsvm_printk("Trying to open IRQ window\n");
2395
2396         nested_svm_intr(svm);
2397
2398         /* In case GIF=0 we can't rely on the CPU to tell us when
2399          * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
2400          * The next time we get that intercept, this function will be
2401          * called again though and we'll get the vintr intercept. */
2402         if (svm->vcpu.arch.hflags & HF_GIF_MASK) {
2403                 svm_set_vintr(svm);
2404                 svm_inject_irq(svm, 0x0);
2405         }
2406 }
2407
2408 static void enable_nmi_window(struct kvm_vcpu *vcpu)
2409 {
2410         struct vcpu_svm *svm = to_svm(vcpu);
2411
2412         if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
2413             == HF_NMI_MASK)
2414                 return; /* IRET will cause a vm exit */
2415
2416         /* Something prevents the NMI from being injected. Single step over
2417            the possible problem (IRET or exception injection or interrupt
2418            shadow) */
2419         vcpu->arch.singlestep = true;
2420         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2421         update_db_intercept(vcpu);
2422 }
2423
2424 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
2425 {
2426         return 0;
2427 }
2428
2429 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
2430 {
2431         force_new_asid(vcpu);
2432 }
2433
2434 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
2435 {
2436 }
2437
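/*
 * Mirror V_TPR back into the emulated local APIC after a guest that is
 * allowed to write CR8 directly (write intercept disabled) may have
 * changed it.
 */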
2438 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2439 {
2440         struct vcpu_svm *svm = to_svm(vcpu);
2441
2442         if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2443                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2444                 kvm_set_cr8(vcpu, cr8);
2445         }
2446 }
2447
2448 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2449 {
2450         struct vcpu_svm *svm = to_svm(vcpu);
2451         u64 cr8;
2452
2453         cr8 = kvm_get_cr8(vcpu);
2454         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2455         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
2456 }
2457
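/*
 * Requeue whatever event was being delivered when the exit occurred
 * (exit_int_info) so it gets reinjected on the next entry; software
 * exceptions are skipped here and simply re-executed by the guest.
 */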
2458 static void svm_complete_interrupts(struct vcpu_svm *svm)
2459 {
2460         u8 vector;
2461         int type;
2462         u32 exitintinfo = svm->vmcb->control.exit_int_info;
2463
2464         if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2465                 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
2466
2467         svm->vcpu.arch.nmi_injected = false;
2468         kvm_clear_exception_queue(&svm->vcpu);
2469         kvm_clear_interrupt_queue(&svm->vcpu);
2470
2471         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
2472                 return;
2473
2474         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
2475         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
2476
2477         switch (type) {
2478         case SVM_EXITINTINFO_TYPE_NMI:
2479                 svm->vcpu.arch.nmi_injected = true;
2480                 break;
2481         case SVM_EXITINTINFO_TYPE_EXEPT:
2482                 /* In case of a software exception do not reinject the exception
2483                    vector, but re-execute the instruction instead */
2484                 if (is_nested(svm))
2485                         break;
2486                 if (kvm_exception_is_soft(vector))
2487                         break;
2488                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2489                         u32 err = svm->vmcb->control.exit_int_info_err;
2490                         kvm_queue_exception_e(&svm->vcpu, vector, err);
2491
2492                 } else
2493                         kvm_queue_exception(&svm->vcpu, vector);
2494                 break;
2495         case SVM_EXITINTINFO_TYPE_INTR:
2496                 kvm_queue_interrupt(&svm->vcpu, vector, false);
2497                 break;
2498         default:
2499                 break;
2500         }
2501 }
2502
2503 #ifdef CONFIG_X86_64
2504 #define R "r"
2505 #else
2506 #define R "e"
2507 #endif
2508
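/*
 * Guest entry path: flush register state into the VMCB, save host
 * segment and MSR state, then enter the guest via VMLOAD/VMRUN/VMSAVE in
 * the asm block below.  CLGI keeps interrupts globally masked while they
 * are architecturally enabled around VMRUN.
 */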
2509 static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2510 {
2511         struct vcpu_svm *svm = to_svm(vcpu);
2512         u16 fs_selector;
2513         u16 gs_selector;
2514         u16 ldt_selector;
2515
2516         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2517         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2518         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
2519
2520         pre_svm_run(svm);
2521
2522         sync_lapic_to_cr8(vcpu);
2523
2524         save_host_msrs(vcpu);
2525         fs_selector = kvm_read_fs();
2526         gs_selector = kvm_read_gs();
2527         ldt_selector = kvm_read_ldt();
2528         if (!is_nested(svm))
2529                 svm->vmcb->save.cr2 = vcpu->arch.cr2;
2530         /* required for live migration with NPT */
2531         if (npt_enabled)
2532                 svm->vmcb->save.cr3 = vcpu->arch.cr3;
2533
2534         clgi();
2535
2536         local_irq_enable();
2537
2538         asm volatile (
2539                 "push %%"R"bp; \n\t"
2540                 "mov %c[rbx](%[svm]), %%"R"bx \n\t"
2541                 "mov %c[rcx](%[svm]), %%"R"cx \n\t"
2542                 "mov %c[rdx](%[svm]), %%"R"dx \n\t"
2543                 "mov %c[rsi](%[svm]), %%"R"si \n\t"
2544                 "mov %c[rdi](%[svm]), %%"R"di \n\t"
2545                 "mov %c[rbp](%[svm]), %%"R"bp \n\t"
2546 #ifdef CONFIG_X86_64
2547                 "mov %c[r8](%[svm]),  %%r8  \n\t"
2548                 "mov %c[r9](%[svm]),  %%r9  \n\t"
2549                 "mov %c[r10](%[svm]), %%r10 \n\t"
2550                 "mov %c[r11](%[svm]), %%r11 \n\t"
2551                 "mov %c[r12](%[svm]), %%r12 \n\t"
2552                 "mov %c[r13](%[svm]), %%r13 \n\t"
2553                 "mov %c[r14](%[svm]), %%r14 \n\t"
2554                 "mov %c[r15](%[svm]), %%r15 \n\t"
2555 #endif
2556
2557                 /* Enter guest mode */
2558                 "push %%"R"ax \n\t"
2559                 "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
2560                 __ex(SVM_VMLOAD) "\n\t"
2561                 __ex(SVM_VMRUN) "\n\t"
2562                 __ex(SVM_VMSAVE) "\n\t"
2563                 "pop %%"R"ax \n\t"
2564
2565                 /* Save guest registers, load host registers */
2566                 "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
2567                 "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
2568                 "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
2569                 "mov %%"R"si, %c[rsi](%[svm]) \n\t"
2570                 "mov %%"R"di, %c[rdi](%[svm]) \n\t"
2571                 "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
2572 #ifdef CONFIG_X86_64
2573                 "mov %%r8,  %c[r8](%[svm]) \n\t"
2574                 "mov %%r9,  %c[r9](%[svm]) \n\t"
2575                 "mov %%r10, %c[r10](%[svm]) \n\t"
2576                 "mov %%r11, %c[r11](%[svm]) \n\t"
2577                 "mov %%r12, %c[r12](%[svm]) \n\t"
2578                 "mov %%r13, %c[r13](%[svm]) \n\t"
2579                 "mov %%r14, %c[r14](%[svm]) \n\t"
2580                 "mov %%r15, %c[r15](%[svm]) \n\t"
2581 #endif
2582                 "pop %%"R"bp"
2583                 :
2584                 : [svm]"a"(svm),
2585                   [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
2586                   [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
2587                   [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
2588                   [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
2589                   [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
2590                   [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
2591                   [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
2592 #ifdef CONFIG_X86_64
2593                   , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
2594                   [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
2595                   [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
2596                   [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
2597                   [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
2598                   [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
2599                   [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
2600                   [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
2601 #endif
2602                 : "cc", "memory"
2603                 , R"bx", R"cx", R"dx", R"si", R"di"
2604 #ifdef CONFIG_X86_64
2605                 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
2606 #endif
2607                 );
2608
2609         vcpu->arch.cr2 = svm->vmcb->save.cr2;
2610         vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
2611         vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
2612         vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
2613
2614         kvm_load_fs(fs_selector);
2615         kvm_load_gs(gs_selector);
2616         kvm_load_ldt(ldt_selector);
2617         load_host_msrs(vcpu);
2618
2619         reload_tss(vcpu);
2620
2621         local_irq_disable();
2622
2623         stgi();
2624
2625         sync_cr8_to_lapic(vcpu);
2626
2627         svm->next_rip = 0;
2628
2629         if (npt_enabled) {
2630                 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
2631                 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
2632         }
2633
2634         svm_complete_interrupts(svm);
2635 }
2636
2637 #undef R
2638
2639 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
2640 {
2641         struct vcpu_svm *svm = to_svm(vcpu);
2642
2643         if (npt_enabled) {
2644                 svm->vmcb->control.nested_cr3 = root;
2645                 force_new_asid(vcpu);
2646                 return;
2647         }
2648
2649         svm->vmcb->save.cr3 = root;
2650         force_new_asid(vcpu);
2651
2652         if (vcpu->fpu_active) {
2653                 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
2654                 svm->vmcb->save.cr0 |= X86_CR0_TS;
2655                 vcpu->fpu_active = 0;
2656         }
2657 }
2658
2659 static int is_disabled(void)
2660 {
2661         u64 vm_cr;
2662
2663         rdmsrl(MSR_VM_CR, vm_cr);
2664         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
2665                 return 1;
2666
2667         return 0;
2668 }
2669
2670 static void
2671 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2672 {
2673         /*
2674          * Patch in the VMMCALL instruction:
2675          */
2676         hypercall[0] = 0x0f;
2677         hypercall[1] = 0x01;
2678         hypercall[2] = 0xd9;
2679 }
2680
2681 static void svm_check_processor_compat(void *rtn)
2682 {
2683         *(int *)rtn = 0;
2684 }
2685
2686 static bool svm_cpu_has_accelerated_tpr(void)
2687 {
2688         return false;
2689 }
2690
2691 static int get_npt_level(void)
2692 {
2693 #ifdef CONFIG_X86_64
2694         return PT64_ROOT_LEVEL;
2695 #else
2696         return PT32E_ROOT_LEVEL;
2697 #endif
2698 }
2699
2700 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2701 {
2702         return 0;
2703 }
2704
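/*
 * Human-readable names for the SVM exit codes, consumed by the kvm_exit
 * trace event so that exit reasons show up symbolically in the trace
 * output (exported via .exit_reasons_str below).
 */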
2705 static const struct trace_print_flags svm_exit_reasons_str[] = {
2706         { SVM_EXIT_READ_CR0,                    "read_cr0" },
2707         { SVM_EXIT_READ_CR3,                    "read_cr3" },
2708         { SVM_EXIT_READ_CR4,                    "read_cr4" },
2709         { SVM_EXIT_READ_CR8,                    "read_cr8" },
2710         { SVM_EXIT_WRITE_CR0,                   "write_cr0" },
2711         { SVM_EXIT_WRITE_CR3,                   "write_cr3" },
2712         { SVM_EXIT_WRITE_CR4,                   "write_cr4" },
2713         { SVM_EXIT_WRITE_CR8,                   "write_cr8" },
2714         { SVM_EXIT_READ_DR0,                    "read_dr0" },
2715         { SVM_EXIT_READ_DR1,                    "read_dr1" },
2716         { SVM_EXIT_READ_DR2,                    "read_dr2" },
2717         { SVM_EXIT_READ_DR3,                    "read_dr3" },
2718         { SVM_EXIT_WRITE_DR0,                   "write_dr0" },
2719         { SVM_EXIT_WRITE_DR1,                   "write_dr1" },
2720         { SVM_EXIT_WRITE_DR2,                   "write_dr2" },
2721         { SVM_EXIT_WRITE_DR3,                   "write_dr3" },
2722         { SVM_EXIT_WRITE_DR5,                   "write_dr5" },
2723         { SVM_EXIT_WRITE_DR7,                   "write_dr7" },
2724         { SVM_EXIT_EXCP_BASE + DB_VECTOR,       "DB excp" },
2725         { SVM_EXIT_EXCP_BASE + BP_VECTOR,       "BP excp" },
2726         { SVM_EXIT_EXCP_BASE + UD_VECTOR,       "UD excp" },
2727         { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" },
2728         { SVM_EXIT_EXCP_BASE + NM_VECTOR,       "NM excp" },
2729         { SVM_EXIT_EXCP_BASE + MC_VECTOR,       "MC excp" },
2730         { SVM_EXIT_INTR,                        "interrupt" },
2731         { SVM_EXIT_NMI,                         "nmi" },
2732         { SVM_EXIT_SMI,                         "smi" },
2733         { SVM_EXIT_INIT,                        "init" },
2734         { SVM_EXIT_VINTR,                       "vintr" },
2735         { SVM_EXIT_CPUID,                       "cpuid" },
2736         { SVM_EXIT_INVD,                        "invd" },
2737         { SVM_EXIT_HLT,                         "hlt" },
2738         { SVM_EXIT_INVLPG,                      "invlpg" },
2739         { SVM_EXIT_INVLPGA,                     "invlpga" },
2740         { SVM_EXIT_IOIO,                        "io" },
2741         { SVM_EXIT_MSR,                         "msr" },
2742         { SVM_EXIT_TASK_SWITCH,                 "task_switch" },
2743         { SVM_EXIT_SHUTDOWN,                    "shutdown" },
2744         { SVM_EXIT_VMRUN,                       "vmrun" },
2745         { SVM_EXIT_VMMCALL,                     "hypercall" },
2746         { SVM_EXIT_VMLOAD,                      "vmload" },
2747         { SVM_EXIT_VMSAVE,                      "vmsave" },
2748         { SVM_EXIT_STGI,                        "stgi" },
2749         { SVM_EXIT_CLGI,                        "clgi" },
2750         { SVM_EXIT_SKINIT,                      "skinit" },
2751         { SVM_EXIT_WBINVD,                      "wbinvd" },
2752         { SVM_EXIT_MONITOR,                     "monitor" },
2753         { SVM_EXIT_MWAIT,                       "mwait" },
2754         { SVM_EXIT_NPF,                         "npf" },
2755         { -1, NULL }
2756 };
2757
2758 static struct kvm_x86_ops svm_x86_ops = {
2759         .cpu_has_kvm_support = has_svm,
2760         .disabled_by_bios = is_disabled,
2761         .hardware_setup = svm_hardware_setup,
2762         .hardware_unsetup = svm_hardware_unsetup,
2763         .check_processor_compatibility = svm_check_processor_compat,
2764         .hardware_enable = svm_hardware_enable,
2765         .hardware_disable = svm_hardware_disable,
2766         .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
2767
2768         .vcpu_create = svm_create_vcpu,
2769         .vcpu_free = svm_free_vcpu,
2770         .vcpu_reset = svm_vcpu_reset,
2771
2772         .prepare_guest_switch = svm_prepare_guest_switch,
2773         .vcpu_load = svm_vcpu_load,
2774         .vcpu_put = svm_vcpu_put,
2775
2776         .set_guest_debug = svm_guest_debug,
2777         .get_msr = svm_get_msr,
2778         .set_msr = svm_set_msr,
2779         .get_segment_base = svm_get_segment_base,
2780         .get_segment = svm_get_segment,
2781         .set_segment = svm_set_segment,
2782         .get_cpl = svm_get_cpl,
2783         .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
2784         .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
2785         .set_cr0 = svm_set_cr0,
2786         .set_cr3 = svm_set_cr3,
2787         .set_cr4 = svm_set_cr4,
2788         .set_efer = svm_set_efer,
2789         .get_idt = svm_get_idt,
2790         .set_idt = svm_set_idt,
2791         .get_gdt = svm_get_gdt,
2792         .set_gdt = svm_set_gdt,
2793         .get_dr = svm_get_dr,
2794         .set_dr = svm_set_dr,
2795         .cache_reg = svm_cache_reg,
2796         .get_rflags = svm_get_rflags,
2797         .set_rflags = svm_set_rflags,
2798
2799         .tlb_flush = svm_flush_tlb,
2800
2801         .run = svm_vcpu_run,
2802         .handle_exit = handle_exit,
2803         .skip_emulated_instruction = skip_emulated_instruction,
2804         .set_interrupt_shadow = svm_set_interrupt_shadow,
2805         .get_interrupt_shadow = svm_get_interrupt_shadow,
2806         .patch_hypercall = svm_patch_hypercall,
2807         .set_irq = svm_set_irq,
2808         .set_nmi = svm_inject_nmi,
2809         .queue_exception = svm_queue_exception,
2810         .interrupt_allowed = svm_interrupt_allowed,
2811         .nmi_allowed = svm_nmi_allowed,
2812         .enable_nmi_window = enable_nmi_window,
2813         .enable_irq_window = enable_irq_window,
2814         .update_cr8_intercept = update_cr8_intercept,
2815
2816         .set_tss_addr = svm_set_tss_addr,
2817         .get_tdp_level = get_npt_level,
2818         .get_mt_mask = svm_get_mt_mask,
2819
2820         .exit_reasons_str = svm_exit_reasons_str,
2821 };
2822
2823 static int __init svm_init(void)
2824 {
2825         return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
2826                               THIS_MODULE);
2827 }
2828
2829 static void __exit svm_exit(void)
2830 {
2831         kvm_exit();
2832 }
2833
2834 module_init(svm_init)
2835 module_exit(svm_exit)