[PATCH] kvm: fix vcpu freeing bug
[safe/jmp/linux-2.6] / drivers / kvm / kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <asm/processor.h>
24 #include <linux/percpu.h>
25 #include <linux/gfp.h>
26 #include <asm/msr.h>
27 #include <linux/mm.h>
28 #include <linux/miscdevice.h>
29 #include <linux/vmalloc.h>
30 #include <asm/uaccess.h>
31 #include <linux/reboot.h>
32 #include <asm/io.h>
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/file.h>
36 #include <asm/desc.h>
37
38 #include "x86_emulate.h"
39 #include "segment_descriptor.h"
40
41 MODULE_AUTHOR("Qumranet");
42 MODULE_LICENSE("GPL");
43
44 struct kvm_arch_ops *kvm_arch_ops;
45 struct kvm_stat kvm_stat;
46 EXPORT_SYMBOL_GPL(kvm_stat);
47
48 static struct kvm_stats_debugfs_item {
49         const char *name;
50         u32 *data;
51         struct dentry *dentry;
52 } debugfs_entries[] = {
53         { "pf_fixed", &kvm_stat.pf_fixed },
54         { "pf_guest", &kvm_stat.pf_guest },
55         { "tlb_flush", &kvm_stat.tlb_flush },
56         { "invlpg", &kvm_stat.invlpg },
57         { "exits", &kvm_stat.exits },
58         { "io_exits", &kvm_stat.io_exits },
59         { "mmio_exits", &kvm_stat.mmio_exits },
60         { "signal_exits", &kvm_stat.signal_exits },
61         { "irq_window", &kvm_stat.irq_window_exits },
62         { "halt_exits", &kvm_stat.halt_exits },
63         { "request_irq", &kvm_stat.request_irq_exits },
64         { "irq_exits", &kvm_stat.irq_exits },
65         { NULL, NULL }
66 };
67
68 static struct dentry *debugfs_dir;
69
70 #define MAX_IO_MSRS 256
71
72 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
73 #define LMSW_GUEST_MASK 0x0eULL
74 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
75 #define CR8_RESEVED_BITS (~0x0fULL)
76 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
77
78 #ifdef CONFIG_X86_64
79 // LDT or TSS descriptor in the GDT. 16 bytes.
80 struct segment_descriptor_64 {
81         struct segment_descriptor s;
82         u32 base_higher;
83         u32 pad_zero;
84 };
85
86 #endif
87
88 unsigned long segment_base(u16 selector)
89 {
90         struct descriptor_table gdt;
91         struct segment_descriptor *d;
92         unsigned long table_base;
93         typedef unsigned long ul;
94         unsigned long v;
95
96         if (selector == 0)
97                 return 0;
98
99         asm ("sgdt %0" : "=m"(gdt));
100         table_base = gdt.base;
101
102         if (selector & 4) {           /* from ldt */
103                 u16 ldt_selector;
104
105                 asm ("sldt %0" : "=g"(ldt_selector));
106                 table_base = segment_base(ldt_selector);
107         }
108         d = (struct segment_descriptor *)(table_base + (selector & ~7));
109         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
110 #ifdef CONFIG_X86_64
111         if (d->system == 0
112             && (d->type == 2 || d->type == 9 || d->type == 11))
113                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
114 #endif
115         return v;
116 }
117 EXPORT_SYMBOL_GPL(segment_base);
118
119 static inline int valid_vcpu(int n)
120 {
121         return likely(n >= 0 && n < KVM_MAX_VCPUS);
122 }
123
124 int kvm_read_guest(struct kvm_vcpu *vcpu,
125                              gva_t addr,
126                              unsigned long size,
127                              void *dest)
128 {
129         unsigned char *host_buf = dest;
130         unsigned long req_size = size;
131
132         while (size) {
133                 hpa_t paddr;
134                 unsigned now;
135                 unsigned offset;
136                 hva_t guest_buf;
137
138                 paddr = gva_to_hpa(vcpu, addr);
139
140                 if (is_error_hpa(paddr))
141                         break;
142
143                 guest_buf = (hva_t)kmap_atomic(
144                                         pfn_to_page(paddr >> PAGE_SHIFT),
145                                         KM_USER0);
146                 offset = addr & ~PAGE_MASK;
147                 guest_buf |= offset;
148                 now = min(size, PAGE_SIZE - offset);
149                 memcpy(host_buf, (void*)guest_buf, now);
150                 host_buf += now;
151                 addr += now;
152                 size -= now;
153                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
154         }
155         return req_size - size;
156 }
157 EXPORT_SYMBOL_GPL(kvm_read_guest);
158
159 int kvm_write_guest(struct kvm_vcpu *vcpu,
160                              gva_t addr,
161                              unsigned long size,
162                              void *data)
163 {
164         unsigned char *host_buf = data;
165         unsigned long req_size = size;
166
167         while (size) {
168                 hpa_t paddr;
169                 unsigned now;
170                 unsigned offset;
171                 hva_t guest_buf;
172
173                 paddr = gva_to_hpa(vcpu, addr);
174
175                 if (is_error_hpa(paddr))
176                         break;
177
178                 guest_buf = (hva_t)kmap_atomic(
179                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
180                 offset = addr & ~PAGE_MASK;
181                 guest_buf |= offset;
182                 now = min(size, PAGE_SIZE - offset);
183                 memcpy((void*)guest_buf, host_buf, now);
184                 host_buf += now;
185                 addr += now;
186                 size -= now;
187                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
188         }
189         return req_size - size;
190 }
191 EXPORT_SYMBOL_GPL(kvm_write_guest);
192
193 static int vcpu_slot(struct kvm_vcpu *vcpu)
194 {
195         return vcpu - vcpu->kvm->vcpus;
196 }
197
198 /*
199  * Switches to specified vcpu, until a matching vcpu_put()
200  */
201 static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
202 {
203         struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];
204
205         mutex_lock(&vcpu->mutex);
206         if (unlikely(!vcpu->vmcs)) {
207                 mutex_unlock(&vcpu->mutex);
208                 return NULL;
209         }
210         return kvm_arch_ops->vcpu_load(vcpu);
211 }
212
213 static void vcpu_put(struct kvm_vcpu *vcpu)
214 {
215         kvm_arch_ops->vcpu_put(vcpu);
216         mutex_unlock(&vcpu->mutex);
217 }
218
219 static int kvm_dev_open(struct inode *inode, struct file *filp)
220 {
221         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
222         int i;
223
224         if (!kvm)
225                 return -ENOMEM;
226
227         spin_lock_init(&kvm->lock);
228         INIT_LIST_HEAD(&kvm->active_mmu_pages);
229         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
230                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
231
232                 mutex_init(&vcpu->mutex);
233                 vcpu->kvm = kvm;
234                 vcpu->mmu.root_hpa = INVALID_PAGE;
235                 INIT_LIST_HEAD(&vcpu->free_pages);
236         }
237         filp->private_data = kvm;
238         return 0;
239 }
240
241 /*
242  * Free any memory in @free but not in @dont.
243  */
244 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
245                                   struct kvm_memory_slot *dont)
246 {
247         int i;
248
249         if (!dont || free->phys_mem != dont->phys_mem)
250                 if (free->phys_mem) {
251                         for (i = 0; i < free->npages; ++i)
252                                 if (free->phys_mem[i])
253                                         __free_page(free->phys_mem[i]);
254                         vfree(free->phys_mem);
255                 }
256
257         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
258                 vfree(free->dirty_bitmap);
259
260         free->phys_mem = NULL;
261         free->npages = 0;
262         free->dirty_bitmap = NULL;
263 }
264
265 static void kvm_free_physmem(struct kvm *kvm)
266 {
267         int i;
268
269         for (i = 0; i < kvm->nmemslots; ++i)
270                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
271 }
272
273 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
274 {
275         if (!vcpu_load(vcpu->kvm, vcpu_slot(vcpu)))
276                 return;
277
278         kvm_mmu_destroy(vcpu);
279         vcpu_put(vcpu);
280         kvm_arch_ops->vcpu_free(vcpu);
281 }
282
283 static void kvm_free_vcpus(struct kvm *kvm)
284 {
285         unsigned int i;
286
287         for (i = 0; i < KVM_MAX_VCPUS; ++i)
288                 kvm_free_vcpu(&kvm->vcpus[i]);
289 }
290
291 static int kvm_dev_release(struct inode *inode, struct file *filp)
292 {
293         struct kvm *kvm = filp->private_data;
294
295         kvm_free_vcpus(kvm);
296         kvm_free_physmem(kvm);
297         kfree(kvm);
298         return 0;
299 }
300
301 static void inject_gp(struct kvm_vcpu *vcpu)
302 {
303         kvm_arch_ops->inject_gp(vcpu, 0);
304 }
305
306 /*
307  * Load the pae pdptrs.  Return true is they are all valid.
308  */
309 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
310 {
311         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
312         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
313         int i;
314         u64 pdpte;
315         u64 *pdpt;
316         int ret;
317         struct kvm_memory_slot *memslot;
318
319         spin_lock(&vcpu->kvm->lock);
320         memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
321         /* FIXME: !memslot - emulate? 0xff? */
322         pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
323
324         ret = 1;
325         for (i = 0; i < 4; ++i) {
326                 pdpte = pdpt[offset + i];
327                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
328                         ret = 0;
329                         goto out;
330                 }
331         }
332
333         for (i = 0; i < 4; ++i)
334                 vcpu->pdptrs[i] = pdpt[offset + i];
335
336 out:
337         kunmap_atomic(pdpt, KM_USER0);
338         spin_unlock(&vcpu->kvm->lock);
339
340         return ret;
341 }
342
343 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
344 {
345         if (cr0 & CR0_RESEVED_BITS) {
346                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
347                        cr0, vcpu->cr0);
348                 inject_gp(vcpu);
349                 return;
350         }
351
352         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
353                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
354                 inject_gp(vcpu);
355                 return;
356         }
357
358         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
359                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
360                        "and a clear PE flag\n");
361                 inject_gp(vcpu);
362                 return;
363         }
364
365         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
366 #ifdef CONFIG_X86_64
367                 if ((vcpu->shadow_efer & EFER_LME)) {
368                         int cs_db, cs_l;
369
370                         if (!is_pae(vcpu)) {
371                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
372                                        "in long mode while PAE is disabled\n");
373                                 inject_gp(vcpu);
374                                 return;
375                         }
376                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
377                         if (cs_l) {
378                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
379                                        "in long mode while CS.L == 1\n");
380                                 inject_gp(vcpu);
381                                 return;
382
383                         }
384                 } else
385 #endif
386                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
387                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
388                                "reserved bits\n");
389                         inject_gp(vcpu);
390                         return;
391                 }
392
393         }
394
395         kvm_arch_ops->set_cr0(vcpu, cr0);
396         vcpu->cr0 = cr0;
397
398         spin_lock(&vcpu->kvm->lock);
399         kvm_mmu_reset_context(vcpu);
400         spin_unlock(&vcpu->kvm->lock);
401         return;
402 }
403 EXPORT_SYMBOL_GPL(set_cr0);
404
405 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
406 {
407         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
408         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
409 }
410 EXPORT_SYMBOL_GPL(lmsw);
411
412 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
413 {
414         if (cr4 & CR4_RESEVED_BITS) {
415                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
416                 inject_gp(vcpu);
417                 return;
418         }
419
420         if (is_long_mode(vcpu)) {
421                 if (!(cr4 & CR4_PAE_MASK)) {
422                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
423                                "in long mode\n");
424                         inject_gp(vcpu);
425                         return;
426                 }
427         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
428                    && !load_pdptrs(vcpu, vcpu->cr3)) {
429                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
430                 inject_gp(vcpu);
431         }
432
433         if (cr4 & CR4_VMXE_MASK) {
434                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
435                 inject_gp(vcpu);
436                 return;
437         }
438         kvm_arch_ops->set_cr4(vcpu, cr4);
439         spin_lock(&vcpu->kvm->lock);
440         kvm_mmu_reset_context(vcpu);
441         spin_unlock(&vcpu->kvm->lock);
442 }
443 EXPORT_SYMBOL_GPL(set_cr4);
444
445 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
446 {
447         if (is_long_mode(vcpu)) {
448                 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
449                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
450                         inject_gp(vcpu);
451                         return;
452                 }
453         } else {
454                 if (cr3 & CR3_RESEVED_BITS) {
455                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
456                         inject_gp(vcpu);
457                         return;
458                 }
459                 if (is_paging(vcpu) && is_pae(vcpu) &&
460                     !load_pdptrs(vcpu, cr3)) {
461                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
462                                "reserved bits\n");
463                         inject_gp(vcpu);
464                         return;
465                 }
466         }
467
468         vcpu->cr3 = cr3;
469         spin_lock(&vcpu->kvm->lock);
470         /*
471          * Does the new cr3 value map to physical memory? (Note, we
472          * catch an invalid cr3 even in real-mode, because it would
473          * cause trouble later on when we turn on paging anyway.)
474          *
475          * A real CPU would silently accept an invalid cr3 and would
476          * attempt to use it - with largely undefined (and often hard
477          * to debug) behavior on the guest side.
478          */
479         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
480                 inject_gp(vcpu);
481         else
482                 vcpu->mmu.new_cr3(vcpu);
483         spin_unlock(&vcpu->kvm->lock);
484 }
485 EXPORT_SYMBOL_GPL(set_cr3);
486
487 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
488 {
489         if ( cr8 & CR8_RESEVED_BITS) {
490                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
491                 inject_gp(vcpu);
492                 return;
493         }
494         vcpu->cr8 = cr8;
495 }
496 EXPORT_SYMBOL_GPL(set_cr8);
497
498 void fx_init(struct kvm_vcpu *vcpu)
499 {
500         struct __attribute__ ((__packed__)) fx_image_s {
501                 u16 control; //fcw
502                 u16 status; //fsw
503                 u16 tag; // ftw
504                 u16 opcode; //fop
505                 u64 ip; // fpu ip
506                 u64 operand;// fpu dp
507                 u32 mxcsr;
508                 u32 mxcsr_mask;
509
510         } *fx_image;
511
512         fx_save(vcpu->host_fx_image);
513         fpu_init();
514         fx_save(vcpu->guest_fx_image);
515         fx_restore(vcpu->host_fx_image);
516
517         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
518         fx_image->mxcsr = 0x1f80;
519         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
520                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
521 }
522 EXPORT_SYMBOL_GPL(fx_init);
523
524 /*
525  * Creates some virtual cpus.  Good luck creating more than one.
526  */
527 static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
528 {
529         int r;
530         struct kvm_vcpu *vcpu;
531
532         r = -EINVAL;
533         if (!valid_vcpu(n))
534                 goto out;
535
536         vcpu = &kvm->vcpus[n];
537
538         mutex_lock(&vcpu->mutex);
539
540         if (vcpu->vmcs) {
541                 mutex_unlock(&vcpu->mutex);
542                 return -EEXIST;
543         }
544
545         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
546                                            FX_IMAGE_ALIGN);
547         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
548
549         vcpu->cpu = -1;  /* First load will set up TR */
550         r = kvm_arch_ops->vcpu_create(vcpu);
551         if (r < 0)
552                 goto out_free_vcpus;
553
554         r = kvm_mmu_create(vcpu);
555         if (r < 0)
556                 goto out_free_vcpus;
557
558         kvm_arch_ops->vcpu_load(vcpu);
559         r = kvm_mmu_setup(vcpu);
560         if (r >= 0)
561                 r = kvm_arch_ops->vcpu_setup(vcpu);
562         vcpu_put(vcpu);
563
564         if (r < 0)
565                 goto out_free_vcpus;
566
567         return 0;
568
569 out_free_vcpus:
570         kvm_free_vcpu(vcpu);
571         mutex_unlock(&vcpu->mutex);
572 out:
573         return r;
574 }
575
576 /*
577  * Allocate some memory and give it an address in the guest physical address
578  * space.
579  *
580  * Discontiguous memory is allowed, mostly for framebuffers.
581  */
582 static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm,
583                                            struct kvm_memory_region *mem)
584 {
585         int r;
586         gfn_t base_gfn;
587         unsigned long npages;
588         unsigned long i;
589         struct kvm_memory_slot *memslot;
590         struct kvm_memory_slot old, new;
591         int memory_config_version;
592
593         r = -EINVAL;
594         /* General sanity checks */
595         if (mem->memory_size & (PAGE_SIZE - 1))
596                 goto out;
597         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
598                 goto out;
599         if (mem->slot >= KVM_MEMORY_SLOTS)
600                 goto out;
601         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
602                 goto out;
603
604         memslot = &kvm->memslots[mem->slot];
605         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
606         npages = mem->memory_size >> PAGE_SHIFT;
607
608         if (!npages)
609                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
610
611 raced:
612         spin_lock(&kvm->lock);
613
614         memory_config_version = kvm->memory_config_version;
615         new = old = *memslot;
616
617         new.base_gfn = base_gfn;
618         new.npages = npages;
619         new.flags = mem->flags;
620
621         /* Disallow changing a memory slot's size. */
622         r = -EINVAL;
623         if (npages && old.npages && npages != old.npages)
624                 goto out_unlock;
625
626         /* Check for overlaps */
627         r = -EEXIST;
628         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
629                 struct kvm_memory_slot *s = &kvm->memslots[i];
630
631                 if (s == memslot)
632                         continue;
633                 if (!((base_gfn + npages <= s->base_gfn) ||
634                       (base_gfn >= s->base_gfn + s->npages)))
635                         goto out_unlock;
636         }
637         /*
638          * Do memory allocations outside lock.  memory_config_version will
639          * detect any races.
640          */
641         spin_unlock(&kvm->lock);
642
643         /* Deallocate if slot is being removed */
644         if (!npages)
645                 new.phys_mem = NULL;
646
647         /* Free page dirty bitmap if unneeded */
648         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
649                 new.dirty_bitmap = NULL;
650
651         r = -ENOMEM;
652
653         /* Allocate if a slot is being created */
654         if (npages && !new.phys_mem) {
655                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
656
657                 if (!new.phys_mem)
658                         goto out_free;
659
660                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
661                 for (i = 0; i < npages; ++i) {
662                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
663                                                      | __GFP_ZERO);
664                         if (!new.phys_mem[i])
665                                 goto out_free;
666                         new.phys_mem[i]->private = 0;
667                 }
668         }
669
670         /* Allocate page dirty bitmap if needed */
671         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
672                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
673
674                 new.dirty_bitmap = vmalloc(dirty_bytes);
675                 if (!new.dirty_bitmap)
676                         goto out_free;
677                 memset(new.dirty_bitmap, 0, dirty_bytes);
678         }
679
680         spin_lock(&kvm->lock);
681
682         if (memory_config_version != kvm->memory_config_version) {
683                 spin_unlock(&kvm->lock);
684                 kvm_free_physmem_slot(&new, &old);
685                 goto raced;
686         }
687
688         r = -EAGAIN;
689         if (kvm->busy)
690                 goto out_unlock;
691
692         if (mem->slot >= kvm->nmemslots)
693                 kvm->nmemslots = mem->slot + 1;
694
695         *memslot = new;
696         ++kvm->memory_config_version;
697
698         spin_unlock(&kvm->lock);
699
700         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
701                 struct kvm_vcpu *vcpu;
702
703                 vcpu = vcpu_load(kvm, i);
704                 if (!vcpu)
705                         continue;
706                 kvm_mmu_reset_context(vcpu);
707                 vcpu_put(vcpu);
708         }
709
710         kvm_free_physmem_slot(&old, &new);
711         return 0;
712
713 out_unlock:
714         spin_unlock(&kvm->lock);
715 out_free:
716         kvm_free_physmem_slot(&new, &old);
717 out:
718         return r;
719 }
720
721 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
722 {
723         spin_lock(&vcpu->kvm->lock);
724         kvm_mmu_slot_remove_write_access(vcpu, slot);
725         spin_unlock(&vcpu->kvm->lock);
726 }
727
728 /*
729  * Get (and clear) the dirty memory log for a memory slot.
730  */
731 static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
732                                        struct kvm_dirty_log *log)
733 {
734         struct kvm_memory_slot *memslot;
735         int r, i;
736         int n;
737         int cleared;
738         unsigned long any = 0;
739
740         spin_lock(&kvm->lock);
741
742         /*
743          * Prevent changes to guest memory configuration even while the lock
744          * is not taken.
745          */
746         ++kvm->busy;
747         spin_unlock(&kvm->lock);
748         r = -EINVAL;
749         if (log->slot >= KVM_MEMORY_SLOTS)
750                 goto out;
751
752         memslot = &kvm->memslots[log->slot];
753         r = -ENOENT;
754         if (!memslot->dirty_bitmap)
755                 goto out;
756
757         n = ALIGN(memslot->npages, 8) / 8;
758
759         for (i = 0; !any && i < n; ++i)
760                 any = memslot->dirty_bitmap[i];
761
762         r = -EFAULT;
763         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
764                 goto out;
765
766
767         if (any) {
768                 cleared = 0;
769                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
770                         struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
771
772                         if (!vcpu)
773                                 continue;
774                         if (!cleared) {
775                                 do_remove_write_access(vcpu, log->slot);
776                                 memset(memslot->dirty_bitmap, 0, n);
777                                 cleared = 1;
778                         }
779                         kvm_arch_ops->tlb_flush(vcpu);
780                         vcpu_put(vcpu);
781                 }
782         }
783
784         r = 0;
785
786 out:
787         spin_lock(&kvm->lock);
788         --kvm->busy;
789         spin_unlock(&kvm->lock);
790         return r;
791 }
792
793 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
794 {
795         int i;
796
797         for (i = 0; i < kvm->nmemslots; ++i) {
798                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
799
800                 if (gfn >= memslot->base_gfn
801                     && gfn < memslot->base_gfn + memslot->npages)
802                         return memslot;
803         }
804         return NULL;
805 }
806 EXPORT_SYMBOL_GPL(gfn_to_memslot);
807
808 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
809 {
810         int i;
811         struct kvm_memory_slot *memslot = NULL;
812         unsigned long rel_gfn;
813
814         for (i = 0; i < kvm->nmemslots; ++i) {
815                 memslot = &kvm->memslots[i];
816
817                 if (gfn >= memslot->base_gfn
818                     && gfn < memslot->base_gfn + memslot->npages) {
819
820                         if (!memslot || !memslot->dirty_bitmap)
821                                 return;
822
823                         rel_gfn = gfn - memslot->base_gfn;
824
825                         /* avoid RMW */
826                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
827                                 set_bit(rel_gfn, memslot->dirty_bitmap);
828                         return;
829                 }
830         }
831 }
832
833 static int emulator_read_std(unsigned long addr,
834                              unsigned long *val,
835                              unsigned int bytes,
836                              struct x86_emulate_ctxt *ctxt)
837 {
838         struct kvm_vcpu *vcpu = ctxt->vcpu;
839         void *data = val;
840
841         while (bytes) {
842                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
843                 unsigned offset = addr & (PAGE_SIZE-1);
844                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
845                 unsigned long pfn;
846                 struct kvm_memory_slot *memslot;
847                 void *page;
848
849                 if (gpa == UNMAPPED_GVA)
850                         return X86EMUL_PROPAGATE_FAULT;
851                 pfn = gpa >> PAGE_SHIFT;
852                 memslot = gfn_to_memslot(vcpu->kvm, pfn);
853                 if (!memslot)
854                         return X86EMUL_UNHANDLEABLE;
855                 page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0);
856
857                 memcpy(data, page + offset, tocopy);
858
859                 kunmap_atomic(page, KM_USER0);
860
861                 bytes -= tocopy;
862                 data += tocopy;
863                 addr += tocopy;
864         }
865
866         return X86EMUL_CONTINUE;
867 }
868
869 static int emulator_write_std(unsigned long addr,
870                               unsigned long val,
871                               unsigned int bytes,
872                               struct x86_emulate_ctxt *ctxt)
873 {
874         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
875                addr, bytes);
876         return X86EMUL_UNHANDLEABLE;
877 }
878
879 static int emulator_read_emulated(unsigned long addr,
880                                   unsigned long *val,
881                                   unsigned int bytes,
882                                   struct x86_emulate_ctxt *ctxt)
883 {
884         struct kvm_vcpu *vcpu = ctxt->vcpu;
885
886         if (vcpu->mmio_read_completed) {
887                 memcpy(val, vcpu->mmio_data, bytes);
888                 vcpu->mmio_read_completed = 0;
889                 return X86EMUL_CONTINUE;
890         } else if (emulator_read_std(addr, val, bytes, ctxt)
891                    == X86EMUL_CONTINUE)
892                 return X86EMUL_CONTINUE;
893         else {
894                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
895                 if (gpa == UNMAPPED_GVA)
896                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
897                 vcpu->mmio_needed = 1;
898                 vcpu->mmio_phys_addr = gpa;
899                 vcpu->mmio_size = bytes;
900                 vcpu->mmio_is_write = 0;
901
902                 return X86EMUL_UNHANDLEABLE;
903         }
904 }
905
906 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
907                                unsigned long val, int bytes)
908 {
909         struct kvm_memory_slot *m;
910         struct page *page;
911         void *virt;
912
913         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
914                 return 0;
915         m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
916         if (!m)
917                 return 0;
918         page = gfn_to_page(m, gpa >> PAGE_SHIFT);
919         kvm_mmu_pre_write(vcpu, gpa, bytes);
920         virt = kmap_atomic(page, KM_USER0);
921         memcpy(virt + offset_in_page(gpa), &val, bytes);
922         kunmap_atomic(virt, KM_USER0);
923         kvm_mmu_post_write(vcpu, gpa, bytes);
924         return 1;
925 }
926
927 static int emulator_write_emulated(unsigned long addr,
928                                    unsigned long val,
929                                    unsigned int bytes,
930                                    struct x86_emulate_ctxt *ctxt)
931 {
932         struct kvm_vcpu *vcpu = ctxt->vcpu;
933         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
934
935         if (gpa == UNMAPPED_GVA)
936                 return X86EMUL_PROPAGATE_FAULT;
937
938         if (emulator_write_phys(vcpu, gpa, val, bytes))
939                 return X86EMUL_CONTINUE;
940
941         vcpu->mmio_needed = 1;
942         vcpu->mmio_phys_addr = gpa;
943         vcpu->mmio_size = bytes;
944         vcpu->mmio_is_write = 1;
945         memcpy(vcpu->mmio_data, &val, bytes);
946
947         return X86EMUL_CONTINUE;
948 }
949
950 static int emulator_cmpxchg_emulated(unsigned long addr,
951                                      unsigned long old,
952                                      unsigned long new,
953                                      unsigned int bytes,
954                                      struct x86_emulate_ctxt *ctxt)
955 {
956         static int reported;
957
958         if (!reported) {
959                 reported = 1;
960                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
961         }
962         return emulator_write_emulated(addr, new, bytes, ctxt);
963 }
964
965 #ifdef CONFIG_X86_32
966
967 static int emulator_cmpxchg8b_emulated(unsigned long addr,
968                                        unsigned long old_lo,
969                                        unsigned long old_hi,
970                                        unsigned long new_lo,
971                                        unsigned long new_hi,
972                                        struct x86_emulate_ctxt *ctxt)
973 {
974         static int reported;
975         int r;
976
977         if (!reported) {
978                 reported = 1;
979                 printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
980         }
981         r = emulator_write_emulated(addr, new_lo, 4, ctxt);
982         if (r != X86EMUL_CONTINUE)
983                 return r;
984         return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
985 }
986
987 #endif
988
989 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
990 {
991         return kvm_arch_ops->get_segment_base(vcpu, seg);
992 }
993
994 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
995 {
996         return X86EMUL_CONTINUE;
997 }
998
999 int emulate_clts(struct kvm_vcpu *vcpu)
1000 {
1001         unsigned long cr0;
1002
1003         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1004         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1005         kvm_arch_ops->set_cr0(vcpu, cr0);
1006         return X86EMUL_CONTINUE;
1007 }
1008
1009 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1010 {
1011         struct kvm_vcpu *vcpu = ctxt->vcpu;
1012
1013         switch (dr) {
1014         case 0 ... 3:
1015                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1016                 return X86EMUL_CONTINUE;
1017         default:
1018                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1019                        __FUNCTION__, dr);
1020                 return X86EMUL_UNHANDLEABLE;
1021         }
1022 }
1023
1024 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1025 {
1026         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1027         int exception;
1028
1029         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1030         if (exception) {
1031                 /* FIXME: better handling */
1032                 return X86EMUL_UNHANDLEABLE;
1033         }
1034         return X86EMUL_CONTINUE;
1035 }
1036
1037 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1038 {
1039         static int reported;
1040         u8 opcodes[4];
1041         unsigned long rip = ctxt->vcpu->rip;
1042         unsigned long rip_linear;
1043
1044         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1045
1046         if (reported)
1047                 return;
1048
1049         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1050
1051         printk(KERN_ERR "emulation failed but !mmio_needed?"
1052                " rip %lx %02x %02x %02x %02x\n",
1053                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1054         reported = 1;
1055 }
1056
1057 struct x86_emulate_ops emulate_ops = {
1058         .read_std            = emulator_read_std,
1059         .write_std           = emulator_write_std,
1060         .read_emulated       = emulator_read_emulated,
1061         .write_emulated      = emulator_write_emulated,
1062         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1063 #ifdef CONFIG_X86_32
1064         .cmpxchg8b_emulated  = emulator_cmpxchg8b_emulated,
1065 #endif
1066 };
1067
1068 int emulate_instruction(struct kvm_vcpu *vcpu,
1069                         struct kvm_run *run,
1070                         unsigned long cr2,
1071                         u16 error_code)
1072 {
1073         struct x86_emulate_ctxt emulate_ctxt;
1074         int r;
1075         int cs_db, cs_l;
1076
1077         kvm_arch_ops->cache_regs(vcpu);
1078
1079         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1080
1081         emulate_ctxt.vcpu = vcpu;
1082         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1083         emulate_ctxt.cr2 = cr2;
1084         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1085                 ? X86EMUL_MODE_REAL : cs_l
1086                 ? X86EMUL_MODE_PROT64 : cs_db
1087                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1088
1089         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1090                 emulate_ctxt.cs_base = 0;
1091                 emulate_ctxt.ds_base = 0;
1092                 emulate_ctxt.es_base = 0;
1093                 emulate_ctxt.ss_base = 0;
1094         } else {
1095                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1096                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1097                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1098                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1099         }
1100
1101         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1102         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1103
1104         vcpu->mmio_is_write = 0;
1105         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1106
1107         if ((r || vcpu->mmio_is_write) && run) {
1108                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1109                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1110                 run->mmio.len = vcpu->mmio_size;
1111                 run->mmio.is_write = vcpu->mmio_is_write;
1112         }
1113
1114         if (r) {
1115                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1116                         return EMULATE_DONE;
1117                 if (!vcpu->mmio_needed) {
1118                         report_emulation_failure(&emulate_ctxt);
1119                         return EMULATE_FAIL;
1120                 }
1121                 return EMULATE_DO_MMIO;
1122         }
1123
1124         kvm_arch_ops->decache_regs(vcpu);
1125         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1126
1127         if (vcpu->mmio_is_write)
1128                 return EMULATE_DO_MMIO;
1129
1130         return EMULATE_DONE;
1131 }
1132 EXPORT_SYMBOL_GPL(emulate_instruction);
1133
1134 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1135 {
1136         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1137 }
1138
1139 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1140 {
1141         struct descriptor_table dt = { limit, base };
1142
1143         kvm_arch_ops->set_gdt(vcpu, &dt);
1144 }
1145
1146 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1147 {
1148         struct descriptor_table dt = { limit, base };
1149
1150         kvm_arch_ops->set_idt(vcpu, &dt);
1151 }
1152
1153 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1154                    unsigned long *rflags)
1155 {
1156         lmsw(vcpu, msw);
1157         *rflags = kvm_arch_ops->get_rflags(vcpu);
1158 }
1159
1160 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1161 {
1162         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1163         switch (cr) {
1164         case 0:
1165                 return vcpu->cr0;
1166         case 2:
1167                 return vcpu->cr2;
1168         case 3:
1169                 return vcpu->cr3;
1170         case 4:
1171                 return vcpu->cr4;
1172         default:
1173                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1174                 return 0;
1175         }
1176 }
1177
1178 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1179                      unsigned long *rflags)
1180 {
1181         switch (cr) {
1182         case 0:
1183                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1184                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1185                 break;
1186         case 2:
1187                 vcpu->cr2 = val;
1188                 break;
1189         case 3:
1190                 set_cr3(vcpu, val);
1191                 break;
1192         case 4:
1193                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1194                 break;
1195         default:
1196                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1197         }
1198 }
1199
1200 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1201 {
1202         u64 data;
1203
1204         switch (msr) {
1205         case 0xc0010010: /* SYSCFG */
1206         case 0xc0010015: /* HWCR */
1207         case MSR_IA32_PLATFORM_ID:
1208         case MSR_IA32_P5_MC_ADDR:
1209         case MSR_IA32_P5_MC_TYPE:
1210         case MSR_IA32_MC0_CTL:
1211         case MSR_IA32_MCG_STATUS:
1212         case MSR_IA32_MCG_CAP:
1213         case MSR_IA32_MC0_MISC:
1214         case MSR_IA32_MC0_MISC+4:
1215         case MSR_IA32_MC0_MISC+8:
1216         case MSR_IA32_MC0_MISC+12:
1217         case MSR_IA32_MC0_MISC+16:
1218         case MSR_IA32_UCODE_REV:
1219         case MSR_IA32_PERF_STATUS:
1220                 /* MTRR registers */
1221         case 0xfe:
1222         case 0x200 ... 0x2ff:
1223                 data = 0;
1224                 break;
1225         case 0xcd: /* fsb frequency */
1226                 data = 3;
1227                 break;
1228         case MSR_IA32_APICBASE:
1229                 data = vcpu->apic_base;
1230                 break;
1231         case MSR_IA32_MISC_ENABLE:
1232                 data = vcpu->ia32_misc_enable_msr;
1233                 break;
1234 #ifdef CONFIG_X86_64
1235         case MSR_EFER:
1236                 data = vcpu->shadow_efer;
1237                 break;
1238 #endif
1239         default:
1240                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1241                 return 1;
1242         }
1243         *pdata = data;
1244         return 0;
1245 }
1246 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1247
1248 /*
1249  * Reads an msr value (of 'msr_index') into 'pdata'.
1250  * Returns 0 on success, non-0 otherwise.
1251  * Assumes vcpu_load() was already called.
1252  */
1253 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1254 {
1255         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1256 }
1257
1258 #ifdef CONFIG_X86_64
1259
1260 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1261 {
1262         if (efer & EFER_RESERVED_BITS) {
1263                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1264                        efer);
1265                 inject_gp(vcpu);
1266                 return;
1267         }
1268
1269         if (is_paging(vcpu)
1270             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1271                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1272                 inject_gp(vcpu);
1273                 return;
1274         }
1275
1276         kvm_arch_ops->set_efer(vcpu, efer);
1277
1278         efer &= ~EFER_LMA;
1279         efer |= vcpu->shadow_efer & EFER_LMA;
1280
1281         vcpu->shadow_efer = efer;
1282 }
1283
1284 #endif
1285
1286 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1287 {
1288         switch (msr) {
1289 #ifdef CONFIG_X86_64
1290         case MSR_EFER:
1291                 set_efer(vcpu, data);
1292                 break;
1293 #endif
1294         case MSR_IA32_MC0_STATUS:
1295                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1296                        __FUNCTION__, data);
1297                 break;
1298         case MSR_IA32_UCODE_REV:
1299         case MSR_IA32_UCODE_WRITE:
1300         case 0x200 ... 0x2ff: /* MTRRs */
1301                 break;
1302         case MSR_IA32_APICBASE:
1303                 vcpu->apic_base = data;
1304                 break;
1305         case MSR_IA32_MISC_ENABLE:
1306                 vcpu->ia32_misc_enable_msr = data;
1307                 break;
1308         default:
1309                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1310                 return 1;
1311         }
1312         return 0;
1313 }
1314 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1315
1316 /*
1317  * Writes msr value into into the appropriate "register".
1318  * Returns 0 on success, non-0 otherwise.
1319  * Assumes vcpu_load() was already called.
1320  */
1321 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1322 {
1323         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1324 }
1325
1326 void kvm_resched(struct kvm_vcpu *vcpu)
1327 {
1328         vcpu_put(vcpu);
1329         cond_resched();
1330         /* Cannot fail -  no vcpu unplug yet. */
1331         vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
1332 }
1333 EXPORT_SYMBOL_GPL(kvm_resched);
1334
1335 void load_msrs(struct vmx_msr_entry *e, int n)
1336 {
1337         int i;
1338
1339         for (i = 0; i < n; ++i)
1340                 wrmsrl(e[i].index, e[i].data);
1341 }
1342 EXPORT_SYMBOL_GPL(load_msrs);
1343
1344 void save_msrs(struct vmx_msr_entry *e, int n)
1345 {
1346         int i;
1347
1348         for (i = 0; i < n; ++i)
1349                 rdmsrl(e[i].index, e[i].data);
1350 }
1351 EXPORT_SYMBOL_GPL(save_msrs);
1352
1353 static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
1354 {
1355         struct kvm_vcpu *vcpu;
1356         int r;
1357
1358         if (!valid_vcpu(kvm_run->vcpu))
1359                 return -EINVAL;
1360
1361         vcpu = vcpu_load(kvm, kvm_run->vcpu);
1362         if (!vcpu)
1363                 return -ENOENT;
1364
1365         /* re-sync apic's tpr */
1366         vcpu->cr8 = kvm_run->cr8;
1367
1368         if (kvm_run->emulated) {
1369                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1370                 kvm_run->emulated = 0;
1371         }
1372
1373         if (kvm_run->mmio_completed) {
1374                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1375                 vcpu->mmio_read_completed = 1;
1376         }
1377
1378         vcpu->mmio_needed = 0;
1379
1380         r = kvm_arch_ops->run(vcpu, kvm_run);
1381
1382         vcpu_put(vcpu);
1383         return r;
1384 }
1385
1386 static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs)
1387 {
1388         struct kvm_vcpu *vcpu;
1389
1390         if (!valid_vcpu(regs->vcpu))
1391                 return -EINVAL;
1392
1393         vcpu = vcpu_load(kvm, regs->vcpu);
1394         if (!vcpu)
1395                 return -ENOENT;
1396
1397         kvm_arch_ops->cache_regs(vcpu);
1398
1399         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1400         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1401         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1402         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1403         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1404         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1405         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1406         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1407 #ifdef CONFIG_X86_64
1408         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1409         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1410         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1411         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1412         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1413         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1414         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1415         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1416 #endif
1417
1418         regs->rip = vcpu->rip;
1419         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1420
1421         /*
1422          * Don't leak debug flags in case they were set for guest debugging
1423          */
1424         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1425                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1426
1427         vcpu_put(vcpu);
1428
1429         return 0;
1430 }
1431
1432 static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs)
1433 {
1434         struct kvm_vcpu *vcpu;
1435
1436         if (!valid_vcpu(regs->vcpu))
1437                 return -EINVAL;
1438
1439         vcpu = vcpu_load(kvm, regs->vcpu);
1440         if (!vcpu)
1441                 return -ENOENT;
1442
1443         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1444         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1445         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1446         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1447         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1448         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1449         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1450         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1451 #ifdef CONFIG_X86_64
1452         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1453         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1454         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1455         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1456         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1457         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1458         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1459         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1460 #endif
1461
1462         vcpu->rip = regs->rip;
1463         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1464
1465         kvm_arch_ops->decache_regs(vcpu);
1466
1467         vcpu_put(vcpu);
1468
1469         return 0;
1470 }
1471
1472 static void get_segment(struct kvm_vcpu *vcpu,
1473                         struct kvm_segment *var, int seg)
1474 {
1475         return kvm_arch_ops->get_segment(vcpu, var, seg);
1476 }
1477
1478 static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1479 {
1480         struct kvm_vcpu *vcpu;
1481         struct descriptor_table dt;
1482
1483         if (!valid_vcpu(sregs->vcpu))
1484                 return -EINVAL;
1485         vcpu = vcpu_load(kvm, sregs->vcpu);
1486         if (!vcpu)
1487                 return -ENOENT;
1488
1489         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1490         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1491         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1492         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1493         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1494         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1495
1496         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1497         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1498
1499         kvm_arch_ops->get_idt(vcpu, &dt);
1500         sregs->idt.limit = dt.limit;
1501         sregs->idt.base = dt.base;
1502         kvm_arch_ops->get_gdt(vcpu, &dt);
1503         sregs->gdt.limit = dt.limit;
1504         sregs->gdt.base = dt.base;
1505
1506         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1507         sregs->cr0 = vcpu->cr0;
1508         sregs->cr2 = vcpu->cr2;
1509         sregs->cr3 = vcpu->cr3;
1510         sregs->cr4 = vcpu->cr4;
1511         sregs->cr8 = vcpu->cr8;
1512         sregs->efer = vcpu->shadow_efer;
1513         sregs->apic_base = vcpu->apic_base;
1514
1515         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1516                sizeof sregs->interrupt_bitmap);
1517
1518         vcpu_put(vcpu);
1519
1520         return 0;
1521 }
1522
1523 static void set_segment(struct kvm_vcpu *vcpu,
1524                         struct kvm_segment *var, int seg)
1525 {
1526         return kvm_arch_ops->set_segment(vcpu, var, seg);
1527 }
1528
1529 static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1530 {
1531         struct kvm_vcpu *vcpu;
1532         int mmu_reset_needed = 0;
1533         int i;
1534         struct descriptor_table dt;
1535
1536         if (!valid_vcpu(sregs->vcpu))
1537                 return -EINVAL;
1538         vcpu = vcpu_load(kvm, sregs->vcpu);
1539         if (!vcpu)
1540                 return -ENOENT;
1541
1542         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1543         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1544         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1545         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1546         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1547         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1548
1549         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1550         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1551
1552         dt.limit = sregs->idt.limit;
1553         dt.base = sregs->idt.base;
1554         kvm_arch_ops->set_idt(vcpu, &dt);
1555         dt.limit = sregs->gdt.limit;
1556         dt.base = sregs->gdt.base;
1557         kvm_arch_ops->set_gdt(vcpu, &dt);
1558
1559         vcpu->cr2 = sregs->cr2;
1560         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1561         vcpu->cr3 = sregs->cr3;
1562
1563         vcpu->cr8 = sregs->cr8;
1564
1565         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1566 #ifdef CONFIG_X86_64
1567         kvm_arch_ops->set_efer(vcpu, sregs->efer);
1568 #endif
1569         vcpu->apic_base = sregs->apic_base;
1570
1571         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1572
1573         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1574         kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
1575
1576         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1577         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
1578         if (!is_long_mode(vcpu) && is_pae(vcpu))
1579                 load_pdptrs(vcpu, vcpu->cr3);
1580
1581         if (mmu_reset_needed)
1582                 kvm_mmu_reset_context(vcpu);
1583
1584         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1585                sizeof vcpu->irq_pending);
1586         vcpu->irq_summary = 0;
1587         for (i = 0; i < NR_IRQ_WORDS; ++i)
1588                 if (vcpu->irq_pending[i])
1589                         __set_bit(i, &vcpu->irq_summary);
1590
1591         vcpu_put(vcpu);
1592
1593         return 0;
1594 }
1595
1596 /*
1597  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1598  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1599  *
1600  * This list is modified at module load time to reflect the
1601  * capabilities of the host cpu.
1602  */
1603 static u32 msrs_to_save[] = {
1604         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1605         MSR_K6_STAR,
1606 #ifdef CONFIG_X86_64
1607         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1608 #endif
1609         MSR_IA32_TIME_STAMP_COUNTER,
1610 };
1611
1612 static unsigned num_msrs_to_save;
1613
1614 static u32 emulated_msrs[] = {
1615         MSR_IA32_MISC_ENABLE,
1616 };
1617
1618 static __init void kvm_init_msr_list(void)
1619 {
1620         u32 dummy[2];
1621         unsigned i, j;
1622
1623         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1624                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1625                         continue;
1626                 if (j < i)
1627                         msrs_to_save[j] = msrs_to_save[i];
1628                 j++;
1629         }
1630         num_msrs_to_save = j;
1631 }
1632
1633 /*
1634  * Adapt set_msr() to msr_io()'s calling convention
1635  */
1636 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1637 {
1638         return set_msr(vcpu, index, *data);
1639 }
1640
1641 /*
1642  * Read or write a bunch of msrs. All parameters are kernel addresses.
1643  *
1644  * @return number of msrs set successfully.
1645  */
1646 static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs,
1647                     struct kvm_msr_entry *entries,
1648                     int (*do_msr)(struct kvm_vcpu *vcpu,
1649                                   unsigned index, u64 *data))
1650 {
1651         struct kvm_vcpu *vcpu;
1652         int i;
1653
1654         if (!valid_vcpu(msrs->vcpu))
1655                 return -EINVAL;
1656
1657         vcpu = vcpu_load(kvm, msrs->vcpu);
1658         if (!vcpu)
1659                 return -ENOENT;
1660
1661         for (i = 0; i < msrs->nmsrs; ++i)
1662                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1663                         break;
1664
1665         vcpu_put(vcpu);
1666
1667         return i;
1668 }
1669
1670 /*
1671  * Read or write a bunch of msrs. Parameters are user addresses.
1672  *
1673  * @return number of msrs set successfully.
1674  */
1675 static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs,
1676                   int (*do_msr)(struct kvm_vcpu *vcpu,
1677                                 unsigned index, u64 *data),
1678                   int writeback)
1679 {
1680         struct kvm_msrs msrs;
1681         struct kvm_msr_entry *entries;
1682         int r, n;
1683         unsigned size;
1684
1685         r = -EFAULT;
1686         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1687                 goto out;
1688
1689         r = -E2BIG;
1690         if (msrs.nmsrs >= MAX_IO_MSRS)
1691                 goto out;
1692
1693         r = -ENOMEM;
1694         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1695         entries = vmalloc(size);
1696         if (!entries)
1697                 goto out;
1698
1699         r = -EFAULT;
1700         if (copy_from_user(entries, user_msrs->entries, size))
1701                 goto out_free;
1702
1703         r = n = __msr_io(kvm, &msrs, entries, do_msr);
1704         if (r < 0)
1705                 goto out_free;
1706
1707         r = -EFAULT;
1708         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1709                 goto out_free;
1710
1711         r = n;
1712
1713 out_free:
1714         vfree(entries);
1715 out:
1716         return r;
1717 }
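
/*
 * A minimal userspace sketch (illustrative only, not part of this driver)
 * of the KVM_SET_MSRS path above: writing a single MSR on vcpu 0.  It
 * assumes the flexible-array layout of struct kvm_msrs/kvm_msr_entry from
 * this era's <linux/kvm.h>, and that kvm_fd is an open descriptor on
 * /dev/kvm (headers: <sys/ioctl.h>, <linux/kvm.h>).
 */
#if 0
static int example_set_one_msr(int kvm_fd)
{
        struct {
                struct kvm_msrs header;
                struct kvm_msr_entry entry;
        } req = {
                .header = { .vcpu = 0, .nmsrs = 1 },
                .entry  = { .index = 0x174 /* MSR_IA32_SYSENTER_CS */, .data = 0 },
        };

        /* ioctl() returns the number of MSRs actually written, here 0 or 1. */
        return ioctl(kvm_fd, KVM_SET_MSRS, &req);
}
#endif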
1718
1719 /*
1720  * Translate a guest virtual address to a guest physical address.
1721  */
1722 static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
1723 {
1724         unsigned long vaddr = tr->linear_address;
1725         struct kvm_vcpu *vcpu;
1726         gpa_t gpa;
1727
1728         vcpu = vcpu_load(kvm, tr->vcpu);
1729         if (!vcpu)
1730                 return -ENOENT;
1731         spin_lock(&kvm->lock);
1732         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1733         tr->physical_address = gpa;
1734         tr->valid = gpa != UNMAPPED_GVA;
1735         tr->writeable = 1;
1736         tr->usermode = 0;
1737         spin_unlock(&kvm->lock);
1738         vcpu_put(vcpu);
1739
1740         return 0;
1741 }
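
/*
 * Userspace sketch (illustrative only) of the KVM_TRANSLATE ioctl handled
 * above: ask KVM to walk vcpu 0's guest page tables for one address.  Field
 * names match the usage above; kvm_fd is assumed to be an open descriptor
 * on /dev/kvm (headers: <stdio.h>, <sys/ioctl.h>, <linux/kvm.h>).
 */
#if 0
static void example_translate(int kvm_fd, unsigned long gva)
{
        struct kvm_translation tr = {
                .vcpu = 0,
                .linear_address = gva,
        };

        if (ioctl(kvm_fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
                printf("gva %#lx -> gpa %#llx\n", gva,
                       (unsigned long long)tr.physical_address);
}
#endif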
1742
1743 static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq)
1744 {
1745         struct kvm_vcpu *vcpu;
1746
1747         if (!valid_vcpu(irq->vcpu))
1748                 return -EINVAL;
1749         if (irq->irq < 0 || irq->irq >= 256)
1750                 return -EINVAL;
1751         vcpu = vcpu_load(kvm, irq->vcpu);
1752         if (!vcpu)
1753                 return -ENOENT;
1754
1755         set_bit(irq->irq, vcpu->irq_pending);
1756         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1757
1758         vcpu_put(vcpu);
1759
1760         return 0;
1761 }
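
/*
 * Userspace sketch (illustrative only): queueing external interrupt vector
 * 32 for vcpu 0 through the KVM_INTERRUPT ioctl above.  kvm_fd is assumed
 * to be an open descriptor on /dev/kvm (headers: <sys/ioctl.h>,
 * <linux/kvm.h>).
 */
#if 0
static int example_inject_irq(int kvm_fd)
{
        struct kvm_interrupt irq = { .vcpu = 0, .irq = 32 };

        return ioctl(kvm_fd, KVM_INTERRUPT, &irq);
}
#endif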
1762
1763 static int kvm_dev_ioctl_debug_guest(struct kvm *kvm,
1764                                      struct kvm_debug_guest *dbg)
1765 {
1766         struct kvm_vcpu *vcpu;
1767         int r;
1768
1769         if (!valid_vcpu(dbg->vcpu))
1770                 return -EINVAL;
1771         vcpu = vcpu_load(kvm, dbg->vcpu);
1772         if (!vcpu)
1773                 return -ENOENT;
1774
1775         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
1776
1777         vcpu_put(vcpu);
1778
1779         return r;
1780 }
1781
1782 static long kvm_dev_ioctl(struct file *filp,
1783                           unsigned int ioctl, unsigned long arg)
1784 {
1785         struct kvm *kvm = filp->private_data;
1786         void __user *argp = (void __user *)arg;
1787         int r = -EINVAL;
1788
1789         switch (ioctl) {
1790         case KVM_GET_API_VERSION:
1791                 r = KVM_API_VERSION;
1792                 break;
1793         case KVM_CREATE_VCPU: {
1794                 r = kvm_dev_ioctl_create_vcpu(kvm, arg);
1795                 if (r)
1796                         goto out;
1797                 break;
1798         }
1799         case KVM_RUN: {
1800                 struct kvm_run kvm_run;
1801
1802                 r = -EFAULT;
1803                 if (copy_from_user(&kvm_run, argp, sizeof kvm_run))
1804                         goto out;
1805                 r = kvm_dev_ioctl_run(kvm, &kvm_run);
1806                 if (r < 0 && r != -EINTR)
1807                         goto out;
1808                 if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) {
1809                         r = -EFAULT;
1810                         goto out;
1811                 }
1812                 break;
1813         }
1814         case KVM_GET_REGS: {
1815                 struct kvm_regs kvm_regs;
1816
1817                 r = -EFAULT;
1818                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1819                         goto out;
1820                 r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs);
1821                 if (r)
1822                         goto out;
1823                 r = -EFAULT;
1824                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
1825                         goto out;
1826                 r = 0;
1827                 break;
1828         }
1829         case KVM_SET_REGS: {
1830                 struct kvm_regs kvm_regs;
1831
1832                 r = -EFAULT;
1833                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1834                         goto out;
1835                 r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs);
1836                 if (r)
1837                         goto out;
1838                 r = 0;
1839                 break;
1840         }
1841         case KVM_GET_SREGS: {
1842                 struct kvm_sregs kvm_sregs;
1843
1844                 r = -EFAULT;
1845                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1846                         goto out;
1847                 r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs);
1848                 if (r)
1849                         goto out;
1850                 r = -EFAULT;
1851                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
1852                         goto out;
1853                 r = 0;
1854                 break;
1855         }
1856         case KVM_SET_SREGS: {
1857                 struct kvm_sregs kvm_sregs;
1858
1859                 r = -EFAULT;
1860                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1861                         goto out;
1862                 r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs);
1863                 if (r)
1864                         goto out;
1865                 r = 0;
1866                 break;
1867         }
1868         case KVM_TRANSLATE: {
1869                 struct kvm_translation tr;
1870
1871                 r = -EFAULT;
1872                 if (copy_from_user(&tr, argp, sizeof tr))
1873                         goto out;
1874                 r = kvm_dev_ioctl_translate(kvm, &tr);
1875                 if (r)
1876                         goto out;
1877                 r = -EFAULT;
1878                 if (copy_to_user(argp, &tr, sizeof tr))
1879                         goto out;
1880                 r = 0;
1881                 break;
1882         }
1883         case KVM_INTERRUPT: {
1884                 struct kvm_interrupt irq;
1885
1886                 r = -EFAULT;
1887                 if (copy_from_user(&irq, argp, sizeof irq))
1888                         goto out;
1889                 r = kvm_dev_ioctl_interrupt(kvm, &irq);
1890                 if (r)
1891                         goto out;
1892                 r = 0;
1893                 break;
1894         }
1895         case KVM_DEBUG_GUEST: {
1896                 struct kvm_debug_guest dbg;
1897
1898                 r = -EFAULT;
1899                 if (copy_from_user(&dbg, argp, sizeof dbg))
1900                         goto out;
1901                 r = kvm_dev_ioctl_debug_guest(kvm, &dbg);
1902                 if (r)
1903                         goto out;
1904                 r = 0;
1905                 break;
1906         }
1907         case KVM_SET_MEMORY_REGION: {
1908                 struct kvm_memory_region kvm_mem;
1909
1910                 r = -EFAULT;
1911                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1912                         goto out;
1913                 r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem);
1914                 if (r)
1915                         goto out;
1916                 break;
1917         }
1918         case KVM_GET_DIRTY_LOG: {
1919                 struct kvm_dirty_log log;
1920
1921                 r = -EFAULT;
1922                 if (copy_from_user(&log, argp, sizeof log))
1923                         goto out;
1924                 r = kvm_dev_ioctl_get_dirty_log(kvm, &log);
1925                 if (r)
1926                         goto out;
1927                 break;
1928         }
1929         case KVM_GET_MSRS:
1930                 r = msr_io(kvm, argp, get_msr, 1);
1931                 break;
1932         case KVM_SET_MSRS:
1933                 r = msr_io(kvm, argp, do_set_msr, 0);
1934                 break;
1935         case KVM_GET_MSR_INDEX_LIST: {
1936                 struct kvm_msr_list __user *user_msr_list = argp;
1937                 struct kvm_msr_list msr_list;
1938                 unsigned n;
1939
1940                 r = -EFAULT;
1941                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1942                         goto out;
1943                 n = msr_list.nmsrs;
1944                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1945                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1946                         goto out;
1947                 r = -E2BIG;
1948                 if (n < msr_list.nmsrs)
1949                         goto out;
1950                 r = -EFAULT;
1951                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1952                                  num_msrs_to_save * sizeof(u32)))
1953                         goto out;
1954                 if (copy_to_user(user_msr_list->indices
1955                                  + num_msrs_to_save,
1956                                  &emulated_msrs,
1957                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1958                         goto out;
1959                 r = 0;
1960                 break;
1961         }
1962         default:
1963                 ;
1964         }
1965 out:
1966         return r;
1967 }
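
/*
 * Userspace sketch (illustrative only) of the two-call pattern the
 * KVM_GET_MSR_INDEX_LIST case above is written for: the first call is sized
 * for zero entries, so it fails with E2BIG but still writes back the real
 * count; the second call then fetches the indices.  Assumes the
 * flexible-array struct kvm_msr_list from <linux/kvm.h> and an open
 * descriptor kvm_fd on /dev/kvm (headers: <stdlib.h>, <sys/ioctl.h>,
 * <linux/kvm.h>).
 */
#if 0
static struct kvm_msr_list *example_get_msr_list(int kvm_fd)
{
        struct kvm_msr_list probe = { .nmsrs = 0 };
        struct kvm_msr_list *list;

        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);  /* expected: -1/E2BIG */

        list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
        if (!list)
                return NULL;
        list->nmsrs = probe.nmsrs;

        if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
                free(list);
                return NULL;
        }
        /* list->indices[0 .. list->nmsrs - 1] now holds the supported MSRs. */
        return list;
}
#endif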
1968
1969 static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
1970                                    unsigned long address,
1971                                    int *type)
1972 {
1973         struct kvm *kvm = vma->vm_file->private_data;
1974         unsigned long pgoff;
1975         struct kvm_memory_slot *slot;
1976         struct page *page;
1977
1978         *type = VM_FAULT_MINOR;
1979         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1980         slot = gfn_to_memslot(kvm, pgoff);
1981         if (!slot)
1982                 return NOPAGE_SIGBUS;
1983         page = gfn_to_page(slot, pgoff);
1984         if (!page)
1985                 return NOPAGE_SIGBUS;
1986         get_page(page);
1987         return page;
1988 }
1989
1990 static struct vm_operations_struct kvm_dev_vm_ops = {
1991         .nopage = kvm_dev_nopage,
1992 };
1993
1994 static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
1995 {
1996         vma->vm_ops = &kvm_dev_vm_ops;
1997         return 0;
1998 }
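
/*
 * Userspace sketch (illustrative only): because kvm_dev_nopage() above
 * treats the file offset as a guest frame number, mmap()ing /dev/kvm at a
 * guest-physical offset exposes that guest RAM to userspace, provided a
 * memory slot covers it.  kvm_fd, gpa and len are assumed inputs (header:
 * <sys/mman.h>).
 */
#if 0
static void *example_map_guest_ram(int kvm_fd, unsigned long gpa, size_t len)
{
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                       kvm_fd, gpa);

        return p == MAP_FAILED ? NULL : p;
}
#endif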
1999
2000 static struct file_operations kvm_chardev_ops = {
2001         .open           = kvm_dev_open,
2002         .release        = kvm_dev_release,
2003         .unlocked_ioctl = kvm_dev_ioctl,
2004         .compat_ioctl   = kvm_dev_ioctl,
2005         .mmap           = kvm_dev_mmap,
2006 };
2007
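/*
 * The misc device below is what appears as /dev/kvm; every ioctl handled in
 * kvm_dev_ioctl() is issued against a file descriptor opened on it.
 */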
2008 static struct miscdevice kvm_dev = {
2009         MISC_DYNAMIC_MINOR,
2010         "kvm",
2011         &kvm_chardev_ops,
2012 };
2013
2014 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2015                        void *v)
2016 {
2017         if (val == SYS_RESTART) {
2018                 /*
2019                  * Some BIOSes (including the author's) hang on reboot if
2020                  * the CPU is still in VMX root mode.
2021                  */
2022                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2023                 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2024         }
2025         return NOTIFY_OK;
2026 }
2027
2028 static struct notifier_block kvm_reboot_notifier = {
2029         .notifier_call = kvm_reboot,
2030         .priority = 0,
2031 };
2032
2033 static __init void kvm_init_debug(void)
2034 {
2035         struct kvm_stats_debugfs_item *p;
2036
2037         debugfs_dir = debugfs_create_dir("kvm", NULL);
2038         for (p = debugfs_entries; p->name; ++p)
2039                 p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
2040                                                p->data);
2041 }
2042
2043 static void kvm_exit_debug(void)
2044 {
2045         struct kvm_stats_debugfs_item *p;
2046
2047         for (p = debugfs_entries; p->name; ++p)
2048                 debugfs_remove(p->dentry);
2049         debugfs_remove(debugfs_dir);
2050 }
2051
2052 hpa_t bad_page_address;
2053
2054 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
2055 {
2056         int r;
2057
2058         if (kvm_arch_ops) {
2059                 printk(KERN_ERR "kvm: already loaded the other module\n");
2060                 return -EEXIST;
2061         }
2062
2063         if (!ops->cpu_has_kvm_support()) {
2064                 printk(KERN_ERR "kvm: no hardware support\n");
2065                 return -EOPNOTSUPP;
2066         }
2067         if (ops->disabled_by_bios()) {
2068                 printk(KERN_ERR "kvm: disabled by bios\n");
2069                 return -EOPNOTSUPP;
2070         }
2071
2072         kvm_arch_ops = ops;
2073
2074         r = kvm_arch_ops->hardware_setup();
2075         if (r < 0)
2076                 return r;
2077
2078         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
2079         register_reboot_notifier(&kvm_reboot_notifier);
2080
2081         kvm_chardev_ops.owner = module;
2082
2083         r = misc_register(&kvm_dev);
2084         if (r) {
2085                 printk(KERN_ERR "kvm: misc device register failed\n");
2086                 goto out_free;
2087         }
2088
2089         return r;
2090
2091 out_free:
2092         unregister_reboot_notifier(&kvm_reboot_notifier);
2093         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2094         kvm_arch_ops->hardware_unsetup();
2095         return r;
2096 }
2097
2098 void kvm_exit_arch(void)
2099 {
2100         misc_deregister(&kvm_dev);
2101
2102         unregister_reboot_notifier(&kvm_reboot_notifier);
2103         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2104         kvm_arch_ops->hardware_unsetup();
2105         kvm_arch_ops = NULL;
2106 }
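
/*
 * Sketch (illustrative only) of how a vendor-specific module is expected to
 * use the two arch registration entry points above; my_arch_ops stands in
 * for that module's struct kvm_arch_ops table and is not defined here.
 */
#if 0
static int __init my_vendor_init(void)
{
        return kvm_init_arch(&my_arch_ops, THIS_MODULE);
}

static void __exit my_vendor_exit(void)
{
        kvm_exit_arch();
}

module_init(my_vendor_init)
module_exit(my_vendor_exit)
#endif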
2107
2108 static __init int kvm_init(void)
2109 {
2110         static struct page *bad_page;
2111         int r = 0;
2112
2113         kvm_init_debug();
2114
2115         kvm_init_msr_list();
2116
2117         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
2118                 r = -ENOMEM;
2119                 goto out;
2120         }
2121
2122         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
2123         memset(__va(bad_page_address), 0, PAGE_SIZE);
2124
2125         return r;
2126
2127 out:
2128         kvm_exit_debug();
2129         return r;
2130 }
2131
2132 static __exit void kvm_exit(void)
2133 {
2134         kvm_exit_debug();
2135         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
2136 }
2137
2138 module_init(kvm_init)
2139 module_exit(kvm_exit)
2140
2141 EXPORT_SYMBOL_GPL(kvm_init_arch);
2142 EXPORT_SYMBOL_GPL(kvm_exit_arch);