KVM: Portability: Make exported debugfs data architecture-specific
[safe/jmp/linux-2.6] drivers/kvm/kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "x86.h"
20 #include "x86_emulate.h"
21 #include "irq.h"
22
23 #include <linux/kvm.h>
24 #include <linux/module.h>
25 #include <linux/errno.h>
26 #include <linux/percpu.h>
27 #include <linux/gfp.h>
28 #include <linux/mm.h>
29 #include <linux/miscdevice.h>
30 #include <linux/vmalloc.h>
31 #include <linux/reboot.h>
32 #include <linux/debugfs.h>
33 #include <linux/highmem.h>
34 #include <linux/file.h>
35 #include <linux/sysdev.h>
36 #include <linux/cpu.h>
37 #include <linux/sched.h>
38 #include <linux/cpumask.h>
39 #include <linux/smp.h>
40 #include <linux/anon_inodes.h>
41 #include <linux/profile.h>
42 #include <linux/kvm_para.h>
43 #include <linux/pagemap.h>
44 #include <linux/mman.h>
45
46 #include <asm/processor.h>
47 #include <asm/msr.h>
48 #include <asm/io.h>
49 #include <asm/uaccess.h>
50 #include <asm/desc.h>
51
52 MODULE_AUTHOR("Qumranet");
53 MODULE_LICENSE("GPL");
54
55 static DEFINE_SPINLOCK(kvm_lock);
56 static LIST_HEAD(vm_list);
57
58 static cpumask_t cpus_hardware_enabled;
59
60 struct kvm_x86_ops *kvm_x86_ops;
61 struct kmem_cache *kvm_vcpu_cache;
62 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
63
64 static __read_mostly struct preempt_ops kvm_preempt_ops;
65
66 static struct dentry *debugfs_dir;
67
68 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
69                            unsigned long arg);
70
71 static inline int valid_vcpu(int n)
72 {
73         return likely(n >= 0 && n < KVM_MAX_VCPUS);
74 }
75
76 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
77 {
78         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
79                 return;
80
81         vcpu->guest_fpu_loaded = 1;
82         fx_save(&vcpu->host_fx_image);
83         fx_restore(&vcpu->guest_fx_image);
84 }
85 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
86
87 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
88 {
89         if (!vcpu->guest_fpu_loaded)
90                 return;
91
92         vcpu->guest_fpu_loaded = 0;
93         fx_save(&vcpu->guest_fx_image);
94         fx_restore(&vcpu->host_fx_image);
95 }
96 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
97
98 /*
99  * Switches to the specified vcpu, until a matching vcpu_put().
100  */
101 void vcpu_load(struct kvm_vcpu *vcpu)
102 {
103         int cpu;
104
105         mutex_lock(&vcpu->mutex);
106         cpu = get_cpu();
107         preempt_notifier_register(&vcpu->preempt_notifier);
108         kvm_arch_vcpu_load(vcpu, cpu);
109         put_cpu();
110 }
111
112 void vcpu_put(struct kvm_vcpu *vcpu)
113 {
114         preempt_disable();
115         kvm_arch_vcpu_put(vcpu);
116         preempt_notifier_unregister(&vcpu->preempt_notifier);
117         preempt_enable();
118         mutex_unlock(&vcpu->mutex);
119 }
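/*
 * Every vcpu ioctl path in this file brackets its work with a
 * vcpu_load()/vcpu_put() pair: the mutex serializes access to the vcpu, and
 * the preempt notifier registered in vcpu_load() lets the architecture
 * load/put hooks be replayed if the calling thread is scheduled out in
 * between.
 */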
120
121 static void ack_flush(void *_completed)
122 {
123 }
124
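/*
 * Ask every vcpu in this VM to flush its TLB: set KVM_REQ_TLB_FLUSH in each
 * vcpu's request bitmap, then IPI the physical CPUs currently running a
 * vcpu.  ack_flush() is intentionally empty; the IPI itself is what kicks
 * the CPU out of guest mode so the request is noticed.  The request bit is
 * consumed in __vcpu_run() below, just before the guest is re-entered.
 */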
125 void kvm_flush_remote_tlbs(struct kvm *kvm)
126 {
127         int i, cpu;
128         cpumask_t cpus;
129         struct kvm_vcpu *vcpu;
130
131         cpus_clear(cpus);
132         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
133                 vcpu = kvm->vcpus[i];
134                 if (!vcpu)
135                         continue;
136                 if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
137                         continue;
138                 cpu = vcpu->cpu;
139                 if (cpu != -1 && cpu != raw_smp_processor_id())
140                         cpu_set(cpu, cpus);
141         }
142         smp_call_function_mask(cpus, ack_flush, NULL, 1);
143 }
144
145 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
146 {
147         struct page *page;
148         int r;
149
150         mutex_init(&vcpu->mutex);
151         vcpu->cpu = -1;
152         vcpu->mmu.root_hpa = INVALID_PAGE;
153         vcpu->kvm = kvm;
154         vcpu->vcpu_id = id;
155         if (!irqchip_in_kernel(kvm) || id == 0)
156                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
157         else
158                 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
159         init_waitqueue_head(&vcpu->wq);
160
161         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
162         if (!page) {
163                 r = -ENOMEM;
164                 goto fail;
165         }
166         vcpu->run = page_address(page);
167
168         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
169         if (!page) {
170                 r = -ENOMEM;
171                 goto fail_free_run;
172         }
173         vcpu->pio_data = page_address(page);
174
175         r = kvm_mmu_create(vcpu);
176         if (r < 0)
177                 goto fail_free_pio_data;
178
179         if (irqchip_in_kernel(kvm)) {
180                 r = kvm_create_lapic(vcpu);
181                 if (r < 0)
182                         goto fail_mmu_destroy;
183         }
184
185         return 0;
186
187 fail_mmu_destroy:
188         kvm_mmu_destroy(vcpu);
189 fail_free_pio_data:
190         free_page((unsigned long)vcpu->pio_data);
191 fail_free_run:
192         free_page((unsigned long)vcpu->run);
193 fail:
194         return r;
195 }
196 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
197
198 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
199 {
200         kvm_free_lapic(vcpu);
201         kvm_mmu_destroy(vcpu);
202         free_page((unsigned long)vcpu->pio_data);
203         free_page((unsigned long)vcpu->run);
204 }
205 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
206
207 static struct kvm *kvm_create_vm(void)
208 {
209         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
210
211         if (!kvm)
212                 return ERR_PTR(-ENOMEM);
213
214         kvm_io_bus_init(&kvm->pio_bus);
215         mutex_init(&kvm->lock);
216         INIT_LIST_HEAD(&kvm->active_mmu_pages);
217         kvm_io_bus_init(&kvm->mmio_bus);
218         spin_lock(&kvm_lock);
219         list_add(&kvm->vm_list, &vm_list);
220         spin_unlock(&kvm_lock);
221         return kvm;
222 }
223
224 /*
225  * Free any memory in @free but not in @dont.
226  */
227 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
228                                   struct kvm_memory_slot *dont)
229 {
230         if (!dont || free->rmap != dont->rmap)
231                 vfree(free->rmap);
232
233         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
234                 vfree(free->dirty_bitmap);
235
236         free->npages = 0;
237         free->dirty_bitmap = NULL;
238         free->rmap = NULL;
239 }
240
241 static void kvm_free_physmem(struct kvm *kvm)
242 {
243         int i;
244
245         for (i = 0; i < kvm->nmemslots; ++i)
246                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
247 }
248
249 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
250 {
251         vcpu_load(vcpu);
252         kvm_mmu_unload(vcpu);
253         vcpu_put(vcpu);
254 }
255
256 static void kvm_free_vcpus(struct kvm *kvm)
257 {
258         unsigned int i;
259
260         /*
261          * Unpin any mmu pages first.
262          */
263         for (i = 0; i < KVM_MAX_VCPUS; ++i)
264                 if (kvm->vcpus[i])
265                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
266         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
267                 if (kvm->vcpus[i]) {
268                         kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
269                         kvm->vcpus[i] = NULL;
270                 }
271         }
272
273 }
274
275 static void kvm_destroy_vm(struct kvm *kvm)
276 {
277         spin_lock(&kvm_lock);
278         list_del(&kvm->vm_list);
279         spin_unlock(&kvm_lock);
280         kvm_io_bus_destroy(&kvm->pio_bus);
281         kvm_io_bus_destroy(&kvm->mmio_bus);
282         kfree(kvm->vpic);
283         kfree(kvm->vioapic);
284         kvm_free_vcpus(kvm);
285         kvm_free_physmem(kvm);
286         kfree(kvm);
287 }
288
289 static int kvm_vm_release(struct inode *inode, struct file *filp)
290 {
291         struct kvm *kvm = filp->private_data;
292
293         kvm_destroy_vm(kvm);
294         return 0;
295 }
296
297 void fx_init(struct kvm_vcpu *vcpu)
298 {
299         unsigned after_mxcsr_mask;
300
301         /* Initialize guest FPU by resetting ours and saving into guest's */
302         preempt_disable();
303         fx_save(&vcpu->host_fx_image);
304         fpu_init();
305         fx_save(&vcpu->guest_fx_image);
306         fx_restore(&vcpu->host_fx_image);
307         preempt_enable();
308
309         vcpu->cr0 |= X86_CR0_ET;
310         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
311         vcpu->guest_fx_image.mxcsr = 0x1f80;
312         memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
313                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
314 }
315 EXPORT_SYMBOL_GPL(fx_init);
316
317 /*
318  * Allocate some memory and give it an address in the guest physical address
319  * space.
320  *
321  * Discontiguous memory is allowed, mostly for framebuffers.
322  *
323  * Must be called holding kvm->lock.
324  */
325 int __kvm_set_memory_region(struct kvm *kvm,
326                             struct kvm_userspace_memory_region *mem,
327                             int user_alloc)
328 {
329         int r;
330         gfn_t base_gfn;
331         unsigned long npages;
332         unsigned long i;
333         struct kvm_memory_slot *memslot;
334         struct kvm_memory_slot old, new;
335
336         r = -EINVAL;
337         /* General sanity checks */
338         if (mem->memory_size & (PAGE_SIZE - 1))
339                 goto out;
340         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
341                 goto out;
342         if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
343                 goto out;
344         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
345                 goto out;
346
347         memslot = &kvm->memslots[mem->slot];
348         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
349         npages = mem->memory_size >> PAGE_SHIFT;
350
351         if (!npages)
352                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
353
354         new = old = *memslot;
355
356         new.base_gfn = base_gfn;
357         new.npages = npages;
358         new.flags = mem->flags;
359
360         /* Disallow changing a memory slot's size. */
361         r = -EINVAL;
362         if (npages && old.npages && npages != old.npages)
363                 goto out_free;
364
365         /* Check for overlaps */
366         r = -EEXIST;
367         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
368                 struct kvm_memory_slot *s = &kvm->memslots[i];
369
370                 if (s == memslot)
371                         continue;
372                 if (!((base_gfn + npages <= s->base_gfn) ||
373                       (base_gfn >= s->base_gfn + s->npages)))
374                         goto out_free;
375         }
376
377         /* Free page dirty bitmap if unneeded */
378         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
379                 new.dirty_bitmap = NULL;
380
381         r = -ENOMEM;
382
383         /* Allocate if a slot is being created */
384         if (npages && !new.rmap) {
385                 new.rmap = vmalloc(npages * sizeof(struct page *));
386
387                 if (!new.rmap)
388                         goto out_free;
389
390                 memset(new.rmap, 0, npages * sizeof(*new.rmap));
391
392                 new.user_alloc = user_alloc;
393                 if (user_alloc)
394                         new.userspace_addr = mem->userspace_addr;
395                 else {
396                         down_write(&current->mm->mmap_sem);
397                         new.userspace_addr = do_mmap(NULL, 0,
398                                                      npages * PAGE_SIZE,
399                                                      PROT_READ | PROT_WRITE,
400                                                      MAP_SHARED | MAP_ANONYMOUS,
401                                                      0);
402                         up_write(&current->mm->mmap_sem);
403
404                         if (IS_ERR((void *)new.userspace_addr))
405                                 goto out_free;
406                 }
407         } else {
408                 if (!old.user_alloc && old.rmap) {
409                         int ret;
410
411                         down_write(&current->mm->mmap_sem);
412                         ret = do_munmap(current->mm, old.userspace_addr,
413                                         old.npages * PAGE_SIZE);
414                         up_write(&current->mm->mmap_sem);
415                         if (ret < 0)
416                                 printk(KERN_WARNING
417                                        "kvm_vm_ioctl_set_memory_region: "
418                                        "failed to munmap memory\n");
419                 }
420         }
421
422         /* Allocate page dirty bitmap if needed */
423         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
424                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
425
426                 new.dirty_bitmap = vmalloc(dirty_bytes);
427                 if (!new.dirty_bitmap)
428                         goto out_free;
429                 memset(new.dirty_bitmap, 0, dirty_bytes);
430         }
431
432         if (mem->slot >= kvm->nmemslots)
433                 kvm->nmemslots = mem->slot + 1;
434
435         if (!kvm->n_requested_mmu_pages) {
436                 unsigned int n_pages;
437
438                 if (npages) {
439                         n_pages = npages * KVM_PERMILLE_MMU_PAGES / 1000;
440                         kvm_mmu_change_mmu_pages(kvm, kvm->n_alloc_mmu_pages +
441                                                  n_pages);
442                 } else {
443                         unsigned int nr_mmu_pages;
444
445                         n_pages = old.npages * KVM_PERMILLE_MMU_PAGES / 1000;
446                         nr_mmu_pages = kvm->n_alloc_mmu_pages - n_pages;
447                         nr_mmu_pages = max(nr_mmu_pages,
448                                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
449                         kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
450                 }
451         }
452
453         *memslot = new;
454
455         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
456         kvm_flush_remote_tlbs(kvm);
457
458         kvm_free_physmem_slot(&old, &new);
459         return 0;
460
461 out_free:
462         kvm_free_physmem_slot(&new, &old);
463 out:
464         return r;
465
466 }
467 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
468
469 int kvm_set_memory_region(struct kvm *kvm,
470                           struct kvm_userspace_memory_region *mem,
471                           int user_alloc)
472 {
473         int r;
474
475         mutex_lock(&kvm->lock);
476         r = __kvm_set_memory_region(kvm, mem, user_alloc);
477         mutex_unlock(&kvm->lock);
478         return r;
479 }
480 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
481
482 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
483                                    struct
484                                    kvm_userspace_memory_region *mem,
485                                    int user_alloc)
486 {
487         if (mem->slot >= KVM_MEMORY_SLOTS)
488                 return -EINVAL;
489         return kvm_set_memory_region(kvm, mem, user_alloc);
490 }
491
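/*
 * Illustrative userspace sketch (not part of this file): memory slots are
 * normally populated with the KVM_SET_USER_MEMORY_REGION ioctl on the VM fd,
 * which reaches kvm_vm_ioctl_set_memory_region() above.  Roughly:
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = ram_size,
 *		.userspace_addr  = (__u64)(unsigned long)ram_ptr,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *
 * ram_size, ram_ptr and vm_fd are placeholders; guest_phys_addr and
 * memory_size must be page aligned, per the checks in
 * __kvm_set_memory_region().
 */
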
492 /*
493  * Get (and clear) the dirty memory log for a memory slot.
494  */
495 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
496                                       struct kvm_dirty_log *log)
497 {
498         struct kvm_memory_slot *memslot;
499         int r, i;
500         int n;
501         unsigned long any = 0;
502
503         mutex_lock(&kvm->lock);
504
505         r = -EINVAL;
506         if (log->slot >= KVM_MEMORY_SLOTS)
507                 goto out;
508
509         memslot = &kvm->memslots[log->slot];
510         r = -ENOENT;
511         if (!memslot->dirty_bitmap)
512                 goto out;
513
514         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
515
516         for (i = 0; !any && i < n/sizeof(long); ++i)
517                 any = memslot->dirty_bitmap[i];
518
519         r = -EFAULT;
520         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
521                 goto out;
522
523         /* If nothing is dirty, don't bother messing with page tables. */
524         if (any) {
525                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
526                 kvm_flush_remote_tlbs(kvm);
527                 memset(memslot->dirty_bitmap, 0, n);
528         }
529
530         r = 0;
531
532 out:
533         mutex_unlock(&kvm->lock);
534         return r;
535 }
536
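/*
 * Illustrative userspace sketch (not part of this file): the bitmap is
 * fetched with the KVM_GET_DIRTY_LOG vm ioctl.  The caller allocates a
 * buffer sized to match the ALIGN(npages, BITS_PER_LONG) / 8 calculation
 * above:
 *
 *	struct kvm_dirty_log log = {
 *		.slot         = 0,
 *		.dirty_bitmap = bitmap_buf,
 *	};
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 *
 * bitmap_buf and vm_fd are placeholders.
 */
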
537 int is_error_page(struct page *page)
538 {
539         return page == bad_page;
540 }
541 EXPORT_SYMBOL_GPL(is_error_page);
542
543 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
544 {
545         int i;
546         struct kvm_mem_alias *alias;
547
548         for (i = 0; i < kvm->naliases; ++i) {
549                 alias = &kvm->aliases[i];
550                 if (gfn >= alias->base_gfn
551                     && gfn < alias->base_gfn + alias->npages)
552                         return alias->target_gfn + gfn - alias->base_gfn;
553         }
554         return gfn;
555 }
556
557 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
558 {
559         int i;
560
561         for (i = 0; i < kvm->nmemslots; ++i) {
562                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
563
564                 if (gfn >= memslot->base_gfn
565                     && gfn < memslot->base_gfn + memslot->npages)
566                         return memslot;
567         }
568         return NULL;
569 }
570
571 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
572 {
573         gfn = unalias_gfn(kvm, gfn);
574         return __gfn_to_memslot(kvm, gfn);
575 }
576
577 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
578 {
579         int i;
580
581         gfn = unalias_gfn(kvm, gfn);
582         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
583                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
584
585                 if (gfn >= memslot->base_gfn
586                     && gfn < memslot->base_gfn + memslot->npages)
587                         return 1;
588         }
589         return 0;
590 }
591 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
592
593 /*
594  * Requires current->mm->mmap_sem to be held
595  */
596 static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
597 {
598         struct kvm_memory_slot *slot;
599         struct page *page[1];
600         int npages;
601
602         might_sleep();
603
604         gfn = unalias_gfn(kvm, gfn);
605         slot = __gfn_to_memslot(kvm, gfn);
606         if (!slot) {
607                 get_page(bad_page);
608                 return bad_page;
609         }
610
611         npages = get_user_pages(current, current->mm,
612                                 slot->userspace_addr
613                                 + (gfn - slot->base_gfn) * PAGE_SIZE, 1,
614                                 1, 1, page, NULL);
615         if (npages != 1) {
616                 get_page(bad_page);
617                 return bad_page;
618         }
619
620         return page[0];
621 }
622
623 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
624 {
625         struct page *page;
626
627         down_read(&current->mm->mmap_sem);
628         page = __gfn_to_page(kvm, gfn);
629         up_read(&current->mm->mmap_sem);
630
631         return page;
632 }
633
634 EXPORT_SYMBOL_GPL(gfn_to_page);
635
636 void kvm_release_page(struct page *page)
637 {
638         if (!PageReserved(page))
639                 SetPageDirty(page);
640         put_page(page);
641 }
642 EXPORT_SYMBOL_GPL(kvm_release_page);
643
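/*
 * Helper for the guest read/write/clear loops below: given 'len' bytes left
 * to transfer and the current 'offset' into a page, return how many bytes
 * can be handled without crossing into the next page.
 */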
644 static int next_segment(unsigned long len, int offset)
645 {
646         if (len > PAGE_SIZE - offset)
647                 return PAGE_SIZE - offset;
648         else
649                 return len;
650 }
651
652 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
653                         int len)
654 {
655         void *page_virt;
656         struct page *page;
657
658         page = gfn_to_page(kvm, gfn);
659         if (is_error_page(page)) {
660                 kvm_release_page(page);
661                 return -EFAULT;
662         }
663         page_virt = kmap_atomic(page, KM_USER0);
664
665         memcpy(data, page_virt + offset, len);
666
667         kunmap_atomic(page_virt, KM_USER0);
668         kvm_release_page(page);
669         return 0;
670 }
671 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
672
673 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
674 {
675         gfn_t gfn = gpa >> PAGE_SHIFT;
676         int seg;
677         int offset = offset_in_page(gpa);
678         int ret;
679
680         while ((seg = next_segment(len, offset)) != 0) {
681                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
682                 if (ret < 0)
683                         return ret;
684                 offset = 0;
685                 len -= seg;
686                 data += seg;
687                 ++gfn;
688         }
689         return 0;
690 }
691 EXPORT_SYMBOL_GPL(kvm_read_guest);
692
693 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
694                          int offset, int len)
695 {
696         void *page_virt;
697         struct page *page;
698
699         page = gfn_to_page(kvm, gfn);
700         if (is_error_page(page)) {
701                 kvm_release_page(page);
702                 return -EFAULT;
703         }
704         page_virt = kmap_atomic(page, KM_USER0);
705
706         memcpy(page_virt + offset, data, len);
707
708         kunmap_atomic(page_virt, KM_USER0);
709         mark_page_dirty(kvm, gfn);
710         kvm_release_page(page);
711         return 0;
712 }
713 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
714
715 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
716                     unsigned long len)
717 {
718         gfn_t gfn = gpa >> PAGE_SHIFT;
719         int seg;
720         int offset = offset_in_page(gpa);
721         int ret;
722
723         while ((seg = next_segment(len, offset)) != 0) {
724                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
725                 if (ret < 0)
726                         return ret;
727                 offset = 0;
728                 len -= seg;
729                 data += seg;
730                 ++gfn;
731         }
732         return 0;
733 }
734
735 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
736 {
737         void *page_virt;
738         struct page *page;
739
740         page = gfn_to_page(kvm, gfn);
741         if (is_error_page(page)) {
742                 kvm_release_page(page);
743                 return -EFAULT;
744         }
745         page_virt = kmap_atomic(page, KM_USER0);
746
747         memset(page_virt + offset, 0, len);
748
749         kunmap_atomic(page_virt, KM_USER0);
750         kvm_release_page(page);
751         return 0;
752 }
753 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
754
755 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
756 {
757         gfn_t gfn = gpa >> PAGE_SHIFT;
758         int seg;
759         int offset = offset_in_page(gpa);
760         int ret;
761
762         while ((seg = next_segment(len, offset)) != 0) {
763                 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
764                 if (ret < 0)
765                         return ret;
766                 offset = 0;
767                 len -= seg;
768                 ++gfn;
769         }
770         return 0;
771 }
772 EXPORT_SYMBOL_GPL(kvm_clear_guest);
773
774 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
775 {
776         struct kvm_memory_slot *memslot;
777
778         gfn = unalias_gfn(kvm, gfn);
779         memslot = __gfn_to_memslot(kvm, gfn);
780         if (memslot && memslot->dirty_bitmap) {
781                 unsigned long rel_gfn = gfn - memslot->base_gfn;
782
783                 /* avoid RMW */
784                 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
785                         set_bit(rel_gfn, memslot->dirty_bitmap);
786         }
787 }
788
789 /*
790  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
791  */
792 static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
793 {
794         DECLARE_WAITQUEUE(wait, current);
795
796         add_wait_queue(&vcpu->wq, &wait);
797
798         /*
799          * We will block until either an interrupt or a signal wakes us up
800          */
801         while (!kvm_cpu_has_interrupt(vcpu)
802                && !signal_pending(current)
803                && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
804                && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
805                 set_current_state(TASK_INTERRUPTIBLE);
806                 vcpu_put(vcpu);
807                 schedule();
808                 vcpu_load(vcpu);
809         }
810
811         __set_current_state(TASK_RUNNING);
812         remove_wait_queue(&vcpu->wq, &wait);
813 }
814
815 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
816 {
817         ++vcpu->stat.halt_exits;
818         if (irqchip_in_kernel(vcpu->kvm)) {
819                 vcpu->mp_state = VCPU_MP_STATE_HALTED;
820                 kvm_vcpu_block(vcpu);
821                 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
822                         return -EINTR;
823                 return 1;
824         } else {
825                 vcpu->run->exit_reason = KVM_EXIT_HLT;
826                 return 0;
827         }
828 }
829 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
830
831 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
832 {
833         unsigned long nr, a0, a1, a2, a3, ret;
834
835         kvm_x86_ops->cache_regs(vcpu);
836
837         nr = vcpu->regs[VCPU_REGS_RAX];
838         a0 = vcpu->regs[VCPU_REGS_RBX];
839         a1 = vcpu->regs[VCPU_REGS_RCX];
840         a2 = vcpu->regs[VCPU_REGS_RDX];
841         a3 = vcpu->regs[VCPU_REGS_RSI];
842
843         if (!is_long_mode(vcpu)) {
844                 nr &= 0xFFFFFFFF;
845                 a0 &= 0xFFFFFFFF;
846                 a1 &= 0xFFFFFFFF;
847                 a2 &= 0xFFFFFFFF;
848                 a3 &= 0xFFFFFFFF;
849         }
850
851         switch (nr) {
852         default:
853                 ret = -KVM_ENOSYS;
854                 break;
855         }
856         vcpu->regs[VCPU_REGS_RAX] = ret;
857         kvm_x86_ops->decache_regs(vcpu);
858         return 0;
859 }
860 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
861
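/*
 * Guest-side view (illustrative sketch; see include/linux/kvm_para.h for the
 * real helpers): the hypercall number goes in RAX and up to four arguments
 * in RBX, RCX, RDX and RSI, the guest executes the 3-byte instruction that
 * ->patch_hypercall() emits (vmcall/vmmcall), and the return value comes
 * back in RAX.  Something along the lines of:
 *
 *	asm volatile("vmcall"
 *		     : "=a" (ret)
 *		     : "a" (nr), "b" (a0), "c" (a1), "d" (a2), "S" (a3)
 *		     : "memory");
 */
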
862 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
863 {
864         char instruction[3];
865         int ret = 0;
866
867         mutex_lock(&vcpu->kvm->lock);
868
869         /*
870          * Blow out the MMU so that no other VCPU has an active mapping; this
871          * ensures that the updated hypercall appears atomically across all
872          * VCPUs.
873          */
874         kvm_mmu_zap_all(vcpu->kvm);
875
876         kvm_x86_ops->cache_regs(vcpu);
877         kvm_x86_ops->patch_hypercall(vcpu, instruction);
878         if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)
879             != X86EMUL_CONTINUE)
880                 ret = -EFAULT;
881
882         mutex_unlock(&vcpu->kvm->lock);
883
884         return ret;
885 }
886
887 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
888 {
889         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
890 }
891
892 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
893 {
894         struct descriptor_table dt = { limit, base };
895
896         kvm_x86_ops->set_gdt(vcpu, &dt);
897 }
898
899 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
900 {
901         struct descriptor_table dt = { limit, base };
902
903         kvm_x86_ops->set_idt(vcpu, &dt);
904 }
905
906 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
907                    unsigned long *rflags)
908 {
909         lmsw(vcpu, msw);
910         *rflags = kvm_x86_ops->get_rflags(vcpu);
911 }
912
913 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
914 {
915         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
916         switch (cr) {
917         case 0:
918                 return vcpu->cr0;
919         case 2:
920                 return vcpu->cr2;
921         case 3:
922                 return vcpu->cr3;
923         case 4:
924                 return vcpu->cr4;
925         default:
926                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
927                 return 0;
928         }
929 }
930
931 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
932                      unsigned long *rflags)
933 {
934         switch (cr) {
935         case 0:
936                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
937                 *rflags = kvm_x86_ops->get_rflags(vcpu);
938                 break;
939         case 2:
940                 vcpu->cr2 = val;
941                 break;
942         case 3:
943                 set_cr3(vcpu, val);
944                 break;
945         case 4:
946                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
947                 break;
948         default:
949                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
950         }
951 }
952
953 void kvm_resched(struct kvm_vcpu *vcpu)
954 {
955         if (!need_resched())
956                 return;
957         cond_resched();
958 }
959 EXPORT_SYMBOL_GPL(kvm_resched);
960
961 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
962 {
963         int i;
964         u32 function;
965         struct kvm_cpuid_entry *e, *best;
966
967         kvm_x86_ops->cache_regs(vcpu);
968         function = vcpu->regs[VCPU_REGS_RAX];
969         vcpu->regs[VCPU_REGS_RAX] = 0;
970         vcpu->regs[VCPU_REGS_RBX] = 0;
971         vcpu->regs[VCPU_REGS_RCX] = 0;
972         vcpu->regs[VCPU_REGS_RDX] = 0;
973         best = NULL;
974         for (i = 0; i < vcpu->cpuid_nent; ++i) {
975                 e = &vcpu->cpuid_entries[i];
976                 if (e->function == function) {
977                         best = e;
978                         break;
979                 }
980                 /*
981                  * Both basic or both extended?
982                  */
983                 if (((e->function ^ function) & 0x80000000) == 0)
984                         if (!best || e->function > best->function)
985                                 best = e;
986         }
987         if (best) {
988                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
989                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
990                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
991                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
992         }
993         kvm_x86_ops->decache_regs(vcpu);
994         kvm_x86_ops->skip_emulated_instruction(vcpu);
995 }
996 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
997
998 /*
999  * Check if userspace requested an interrupt window, and that the
1000  * interrupt window is open.
1001  *
1002  * No need to exit to userspace if we already have an interrupt queued.
1003  */
1004 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1005                                           struct kvm_run *kvm_run)
1006 {
1007         return (!vcpu->irq_summary &&
1008                 kvm_run->request_interrupt_window &&
1009                 vcpu->interrupt_window_open &&
1010                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1011 }
1012
1013 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1014                               struct kvm_run *kvm_run)
1015 {
1016         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
1017         kvm_run->cr8 = get_cr8(vcpu);
1018         kvm_run->apic_base = kvm_get_apic_base(vcpu);
1019         if (irqchip_in_kernel(vcpu->kvm))
1020                 kvm_run->ready_for_interrupt_injection = 1;
1021         else
1022                 kvm_run->ready_for_interrupt_injection =
1023                                         (vcpu->interrupt_window_open &&
1024                                          vcpu->irq_summary == 0);
1025 }
1026
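/*
 * Main execution loop for one vcpu: reload the MMU, deliver pending timer
 * interrupts, disable host interrupts, inject any pending guest interrupt,
 * honour KVM_REQ_TLB_FLUSH, then enter the guest via kvm_x86_ops->run().
 * After the exit is handled, pending signals and userspace interrupt-window
 * requests return to userspace with -EINTR, while need_resched() merely
 * drops through kvm_resched() and re-enters the guest.
 */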
1027 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1028 {
1029         int r;
1030
1031         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1032                 pr_debug("vcpu %d received sipi with vector # %x\n",
1033                        vcpu->vcpu_id, vcpu->sipi_vector);
1034                 kvm_lapic_reset(vcpu);
1035                 r = kvm_x86_ops->vcpu_reset(vcpu);
1036                 if (r)
1037                         return r;
1038                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
1039         }
1040
1041 preempted:
1042         if (vcpu->guest_debug.enabled)
1043                 kvm_x86_ops->guest_debug_pre(vcpu);
1044
1045 again:
1046         r = kvm_mmu_reload(vcpu);
1047         if (unlikely(r))
1048                 goto out;
1049
1050         kvm_inject_pending_timer_irqs(vcpu);
1051
1052         preempt_disable();
1053
1054         kvm_x86_ops->prepare_guest_switch(vcpu);
1055         kvm_load_guest_fpu(vcpu);
1056
1057         local_irq_disable();
1058
1059         if (signal_pending(current)) {
1060                 local_irq_enable();
1061                 preempt_enable();
1062                 r = -EINTR;
1063                 kvm_run->exit_reason = KVM_EXIT_INTR;
1064                 ++vcpu->stat.signal_exits;
1065                 goto out;
1066         }
1067
1068         if (irqchip_in_kernel(vcpu->kvm))
1069                 kvm_x86_ops->inject_pending_irq(vcpu);
1070         else if (!vcpu->mmio_read_completed)
1071                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
1072
1073         vcpu->guest_mode = 1;
1074         kvm_guest_enter();
1075
1076         if (vcpu->requests)
1077                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
1078                         kvm_x86_ops->tlb_flush(vcpu);
1079
1080         kvm_x86_ops->run(vcpu, kvm_run);
1081
1082         vcpu->guest_mode = 0;
1083         local_irq_enable();
1084
1085         ++vcpu->stat.exits;
1086
1087         /*
1088          * We must have an instruction between local_irq_enable() and
1089          * kvm_guest_exit(), so the timer interrupt isn't delayed by
1090          * the interrupt shadow.  The stat.exits increment will do nicely.
1091          * But we need to prevent reordering, hence this barrier():
1092          */
1093         barrier();
1094
1095         kvm_guest_exit();
1096
1097         preempt_enable();
1098
1099         /*
1100          * Profile KVM exit RIPs:
1101          */
1102         if (unlikely(prof_on == KVM_PROFILING)) {
1103                 kvm_x86_ops->cache_regs(vcpu);
1104                 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
1105         }
1106
1107         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
1108
1109         if (r > 0) {
1110                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
1111                         r = -EINTR;
1112                         kvm_run->exit_reason = KVM_EXIT_INTR;
1113                         ++vcpu->stat.request_irq_exits;
1114                         goto out;
1115                 }
1116                 if (!need_resched()) {
1117                         ++vcpu->stat.light_exits;
1118                         goto again;
1119                 }
1120         }
1121
1122 out:
1123         if (r > 0) {
1124                 kvm_resched(vcpu);
1125                 goto preempted;
1126         }
1127
1128         post_kvm_run_save(vcpu, kvm_run);
1129
1130         return r;
1131 }
1132
1133
1134 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1135 {
1136         int r;
1137         sigset_t sigsaved;
1138
1139         vcpu_load(vcpu);
1140
1141         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
1142                 kvm_vcpu_block(vcpu);
1143                 vcpu_put(vcpu);
1144                 return -EAGAIN;
1145         }
1146
1147         if (vcpu->sigset_active)
1148                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1149
1150         /* re-sync apic's tpr */
1151         if (!irqchip_in_kernel(vcpu->kvm))
1152                 set_cr8(vcpu, kvm_run->cr8);
1153
1154         if (vcpu->pio.cur_count) {
1155                 r = complete_pio(vcpu);
1156                 if (r)
1157                         goto out;
1158         }
1159 #ifdef CONFIG_HAS_IOMEM
1160         if (vcpu->mmio_needed) {
1161                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1162                 vcpu->mmio_read_completed = 1;
1163                 vcpu->mmio_needed = 0;
1164                 r = emulate_instruction(vcpu, kvm_run,
1165                                         vcpu->mmio_fault_cr2, 0, 1);
1166                 if (r == EMULATE_DO_MMIO) {
1167                         /*
1168                          * Read-modify-write.  Back to userspace.
1169                          */
1170                         r = 0;
1171                         goto out;
1172                 }
1173         }
1174 #endif
1175         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1176                 kvm_x86_ops->cache_regs(vcpu);
1177                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1178                 kvm_x86_ops->decache_regs(vcpu);
1179         }
1180
1181         r = __vcpu_run(vcpu, kvm_run);
1182
1183 out:
1184         if (vcpu->sigset_active)
1185                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1186
1187         vcpu_put(vcpu);
1188         return r;
1189 }
1190
1191 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1192                                    struct kvm_regs *regs)
1193 {
1194         vcpu_load(vcpu);
1195
1196         kvm_x86_ops->cache_regs(vcpu);
1197
1198         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1199         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1200         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1201         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1202         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1203         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1204         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1205         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1206 #ifdef CONFIG_X86_64
1207         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1208         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1209         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1210         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1211         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1212         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1213         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1214         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1215 #endif
1216
1217         regs->rip = vcpu->rip;
1218         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
1219
1220         /*
1221          * Don't leak debug flags in case they were set for guest debugging
1222          */
1223         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1224                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1225
1226         vcpu_put(vcpu);
1227
1228         return 0;
1229 }
1230
1231 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1232                                    struct kvm_regs *regs)
1233 {
1234         vcpu_load(vcpu);
1235
1236         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1237         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1238         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1239         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1240         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1241         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1242         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1243         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1244 #ifdef CONFIG_X86_64
1245         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1246         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1247         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1248         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1249         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1250         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1251         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1252         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1253 #endif
1254
1255         vcpu->rip = regs->rip;
1256         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
1257
1258         kvm_x86_ops->decache_regs(vcpu);
1259
1260         vcpu_put(vcpu);
1261
1262         return 0;
1263 }
1264
1265 static void get_segment(struct kvm_vcpu *vcpu,
1266                         struct kvm_segment *var, int seg)
1267 {
1268         return kvm_x86_ops->get_segment(vcpu, var, seg);
1269 }
1270
1271 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1272                                     struct kvm_sregs *sregs)
1273 {
1274         struct descriptor_table dt;
1275         int pending_vec;
1276
1277         vcpu_load(vcpu);
1278
1279         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1280         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1281         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1282         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1283         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1284         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1285
1286         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1287         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1288
1289         kvm_x86_ops->get_idt(vcpu, &dt);
1290         sregs->idt.limit = dt.limit;
1291         sregs->idt.base = dt.base;
1292         kvm_x86_ops->get_gdt(vcpu, &dt);
1293         sregs->gdt.limit = dt.limit;
1294         sregs->gdt.base = dt.base;
1295
1296         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1297         sregs->cr0 = vcpu->cr0;
1298         sregs->cr2 = vcpu->cr2;
1299         sregs->cr3 = vcpu->cr3;
1300         sregs->cr4 = vcpu->cr4;
1301         sregs->cr8 = get_cr8(vcpu);
1302         sregs->efer = vcpu->shadow_efer;
1303         sregs->apic_base = kvm_get_apic_base(vcpu);
1304
1305         if (irqchip_in_kernel(vcpu->kvm)) {
1306                 memset(sregs->interrupt_bitmap, 0,
1307                        sizeof sregs->interrupt_bitmap);
1308                 pending_vec = kvm_x86_ops->get_irq(vcpu);
1309                 if (pending_vec >= 0)
1310                         set_bit(pending_vec,
1311                                 (unsigned long *)sregs->interrupt_bitmap);
1312         } else
1313                 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1314                        sizeof sregs->interrupt_bitmap);
1315
1316         vcpu_put(vcpu);
1317
1318         return 0;
1319 }
1320
1321 static void set_segment(struct kvm_vcpu *vcpu,
1322                         struct kvm_segment *var, int seg)
1323 {
1324         return kvm_x86_ops->set_segment(vcpu, var, seg);
1325 }
1326
1327 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1328                                     struct kvm_sregs *sregs)
1329 {
1330         int mmu_reset_needed = 0;
1331         int i, pending_vec, max_bits;
1332         struct descriptor_table dt;
1333
1334         vcpu_load(vcpu);
1335
1336         dt.limit = sregs->idt.limit;
1337         dt.base = sregs->idt.base;
1338         kvm_x86_ops->set_idt(vcpu, &dt);
1339         dt.limit = sregs->gdt.limit;
1340         dt.base = sregs->gdt.base;
1341         kvm_x86_ops->set_gdt(vcpu, &dt);
1342
1343         vcpu->cr2 = sregs->cr2;
1344         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1345         vcpu->cr3 = sregs->cr3;
1346
1347         set_cr8(vcpu, sregs->cr8);
1348
1349         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1350 #ifdef CONFIG_X86_64
1351         kvm_x86_ops->set_efer(vcpu, sregs->efer);
1352 #endif
1353         kvm_set_apic_base(vcpu, sregs->apic_base);
1354
1355         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1356
1357         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1358         vcpu->cr0 = sregs->cr0;
1359         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
1360
1361         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1362         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
1363         if (!is_long_mode(vcpu) && is_pae(vcpu))
1364                 load_pdptrs(vcpu, vcpu->cr3);
1365
1366         if (mmu_reset_needed)
1367                 kvm_mmu_reset_context(vcpu);
1368
1369         if (!irqchip_in_kernel(vcpu->kvm)) {
1370                 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1371                        sizeof vcpu->irq_pending);
1372                 vcpu->irq_summary = 0;
1373                 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
1374                         if (vcpu->irq_pending[i])
1375                                 __set_bit(i, &vcpu->irq_summary);
1376         } else {
1377                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
1378                 pending_vec = find_first_bit(
1379                         (const unsigned long *)sregs->interrupt_bitmap,
1380                         max_bits);
1381                 /* Only pending external irq is handled here */
1382                 if (pending_vec < max_bits) {
1383                         kvm_x86_ops->set_irq(vcpu, pending_vec);
1384                         pr_debug("Set back pending irq %d\n",
1385                                  pending_vec);
1386                 }
1387         }
1388
1389         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1390         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1391         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1392         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1393         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1394         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1395
1396         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1397         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1398
1399         vcpu_put(vcpu);
1400
1401         return 0;
1402 }
1403
1404 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1405 {
1406         struct kvm_segment cs;
1407
1408         get_segment(vcpu, &cs, VCPU_SREG_CS);
1409         *db = cs.db;
1410         *l = cs.l;
1411 }
1412 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
1413
1414 /*
1415  * Translate a guest virtual address to a guest physical address.
1416  */
1417 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
1418                                     struct kvm_translation *tr)
1419 {
1420         unsigned long vaddr = tr->linear_address;
1421         gpa_t gpa;
1422
1423         vcpu_load(vcpu);
1424         mutex_lock(&vcpu->kvm->lock);
1425         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1426         tr->physical_address = gpa;
1427         tr->valid = gpa != UNMAPPED_GVA;
1428         tr->writeable = 1;
1429         tr->usermode = 0;
1430         mutex_unlock(&vcpu->kvm->lock);
1431         vcpu_put(vcpu);
1432
1433         return 0;
1434 }
1435
1436 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1437                                     struct kvm_interrupt *irq)
1438 {
1439         if (irq->irq < 0 || irq->irq >= 256)
1440                 return -EINVAL;
1441         if (irqchip_in_kernel(vcpu->kvm))
1442                 return -ENXIO;
1443         vcpu_load(vcpu);
1444
1445         set_bit(irq->irq, vcpu->irq_pending);
1446         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1447
1448         vcpu_put(vcpu);
1449
1450         return 0;
1451 }
1452
1453 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
1454                                       struct kvm_debug_guest *dbg)
1455 {
1456         int r;
1457
1458         vcpu_load(vcpu);
1459
1460         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
1461
1462         vcpu_put(vcpu);
1463
1464         return r;
1465 }
1466
1467 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
1468                                     unsigned long address,
1469                                     int *type)
1470 {
1471         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1472         unsigned long pgoff;
1473         struct page *page;
1474
1475         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1476         if (pgoff == 0)
1477                 page = virt_to_page(vcpu->run);
1478         else if (pgoff == KVM_PIO_PAGE_OFFSET)
1479                 page = virt_to_page(vcpu->pio_data);
1480         else
1481                 return NOPAGE_SIGBUS;
1482         get_page(page);
1483         if (type != NULL)
1484                 *type = VM_FAULT_MINOR;
1485
1486         return page;
1487 }
1488
1489 static struct vm_operations_struct kvm_vcpu_vm_ops = {
1490         .nopage = kvm_vcpu_nopage,
1491 };
1492
1493 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1494 {
1495         vma->vm_ops = &kvm_vcpu_vm_ops;
1496         return 0;
1497 }
1498
1499 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1500 {
1501         struct kvm_vcpu *vcpu = filp->private_data;
1502
1503         fput(vcpu->kvm->filp);
1504         return 0;
1505 }
1506
1507 static struct file_operations kvm_vcpu_fops = {
1508         .release        = kvm_vcpu_release,
1509         .unlocked_ioctl = kvm_vcpu_ioctl,
1510         .compat_ioctl   = kvm_vcpu_ioctl,
1511         .mmap           = kvm_vcpu_mmap,
1512 };
1513
1514 /*
1515  * Allocates an inode for the vcpu.
1516  */
1517 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1518 {
1519         int fd, r;
1520         struct inode *inode;
1521         struct file *file;
1522
1523         r = anon_inode_getfd(&fd, &inode, &file,
1524                              "kvm-vcpu", &kvm_vcpu_fops, vcpu);
1525         if (r)
1526                 return r;
1527         atomic_inc(&vcpu->kvm->filp->f_count);
1528         return fd;
1529 }
1530
1531 /*
1532  * Creates some virtual cpus.  Good luck creating more than one.
1533  */
1534 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1535 {
1536         int r;
1537         struct kvm_vcpu *vcpu;
1538
1539         if (!valid_vcpu(n))
1540                 return -EINVAL;
1541
1542         vcpu = kvm_x86_ops->vcpu_create(kvm, n);
1543         if (IS_ERR(vcpu))
1544                 return PTR_ERR(vcpu);
1545
1546         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1547
1548         /* We do fxsave: this must be aligned. */
1549         BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
1550
1551         vcpu_load(vcpu);
1552         r = kvm_x86_ops->vcpu_reset(vcpu);
1553         if (r == 0)
1554                 r = kvm_mmu_setup(vcpu);
1555         vcpu_put(vcpu);
1556         if (r < 0)
1557                 goto free_vcpu;
1558
1559         mutex_lock(&kvm->lock);
1560         if (kvm->vcpus[n]) {
1561                 r = -EEXIST;
1562                 mutex_unlock(&kvm->lock);
1563                 goto mmu_unload;
1564         }
1565         kvm->vcpus[n] = vcpu;
1566         mutex_unlock(&kvm->lock);
1567
1568         /* Now it's all set up, let userspace reach it */
1569         r = create_vcpu_fd(vcpu);
1570         if (r < 0)
1571                 goto unlink;
1572         return r;
1573
1574 unlink:
1575         mutex_lock(&kvm->lock);
1576         kvm->vcpus[n] = NULL;
1577         mutex_unlock(&kvm->lock);
1578
1579 mmu_unload:
1580         vcpu_load(vcpu);
1581         kvm_mmu_unload(vcpu);
1582         vcpu_put(vcpu);
1583
1584 free_vcpu:
1585         kvm_x86_ops->vcpu_free(vcpu);
1586         return r;
1587 }
1588
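/*
 * Illustrative userspace sketch (not part of this file): a vcpu is created
 * with the KVM_CREATE_VCPU vm ioctl, and the returned fd is mmap()ed to
 * reach the kvm_run and pio_data pages served by kvm_vcpu_nopage() above
 * (page 0 holds struct kvm_run, page KVM_PIO_PAGE_OFFSET the pio data):
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	long size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 * vm_fd and kvm_fd are placeholders for the VM and /dev/kvm file
 * descriptors.
 */
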
1589 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1590 {
1591         if (sigset) {
1592                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1593                 vcpu->sigset_active = 1;
1594                 vcpu->sigset = *sigset;
1595         } else
1596                 vcpu->sigset_active = 0;
1597         return 0;
1598 }
1599
1600 /*
1601  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
1602  * we have asm/x86/processor.h
1603  */
1604 struct fxsave {
1605         u16     cwd;
1606         u16     swd;
1607         u16     twd;
1608         u16     fop;
1609         u64     rip;
1610         u64     rdp;
1611         u32     mxcsr;
1612         u32     mxcsr_mask;
1613         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
1614 #ifdef CONFIG_X86_64
1615         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
1616 #else
1617         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
1618 #endif
1619 };
1620
1621 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1622 {
1623         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
1624
1625         vcpu_load(vcpu);
1626
1627         memcpy(fpu->fpr, fxsave->st_space, 128);
1628         fpu->fcw = fxsave->cwd;
1629         fpu->fsw = fxsave->swd;
1630         fpu->ftwx = fxsave->twd;
1631         fpu->last_opcode = fxsave->fop;
1632         fpu->last_ip = fxsave->rip;
1633         fpu->last_dp = fxsave->rdp;
1634         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
1635
1636         vcpu_put(vcpu);
1637
1638         return 0;
1639 }
1640
1641 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1642 {
1643         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
1644
1645         vcpu_load(vcpu);
1646
1647         memcpy(fxsave->st_space, fpu->fpr, 128);
1648         fxsave->cwd = fpu->fcw;
1649         fxsave->swd = fpu->fsw;
1650         fxsave->twd = fpu->ftwx;
1651         fxsave->fop = fpu->last_opcode;
1652         fxsave->rip = fpu->last_ip;
1653         fxsave->rdp = fpu->last_dp;
1654         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
1655
1656         vcpu_put(vcpu);
1657
1658         return 0;
1659 }
1660
1661 static long kvm_vcpu_ioctl(struct file *filp,
1662                            unsigned int ioctl, unsigned long arg)
1663 {
1664         struct kvm_vcpu *vcpu = filp->private_data;
1665         void __user *argp = (void __user *)arg;
1666         int r;
1667
1668         switch (ioctl) {
1669         case KVM_RUN:
1670                 r = -EINVAL;
1671                 if (arg)
1672                         goto out;
1673                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
1674                 break;
1675         case KVM_GET_REGS: {
1676                 struct kvm_regs kvm_regs;
1677
1678                 memset(&kvm_regs, 0, sizeof kvm_regs);
1679                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
1680                 if (r)
1681                         goto out;
1682                 r = -EFAULT;
1683                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
1684                         goto out;
1685                 r = 0;
1686                 break;
1687         }
1688         case KVM_SET_REGS: {
1689                 struct kvm_regs kvm_regs;
1690
1691                 r = -EFAULT;
1692                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1693                         goto out;
1694                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
1695                 if (r)
1696                         goto out;
1697                 r = 0;
1698                 break;
1699         }
1700         case KVM_GET_SREGS: {
1701                 struct kvm_sregs kvm_sregs;
1702
1703                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
1704                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
1705                 if (r)
1706                         goto out;
1707                 r = -EFAULT;
1708                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
1709                         goto out;
1710                 r = 0;
1711                 break;
1712         }
1713         case KVM_SET_SREGS: {
1714                 struct kvm_sregs kvm_sregs;
1715
1716                 r = -EFAULT;
1717                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1718                         goto out;
1719                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
1720                 if (r)
1721                         goto out;
1722                 r = 0;
1723                 break;
1724         }
1725         case KVM_TRANSLATE: {
1726                 struct kvm_translation tr;
1727
1728                 r = -EFAULT;
1729                 if (copy_from_user(&tr, argp, sizeof tr))
1730                         goto out;
1731                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
1732                 if (r)
1733                         goto out;
1734                 r = -EFAULT;
1735                 if (copy_to_user(argp, &tr, sizeof tr))
1736                         goto out;
1737                 r = 0;
1738                 break;
1739         }
1740         case KVM_INTERRUPT: {
1741                 struct kvm_interrupt irq;
1742
1743                 r = -EFAULT;
1744                 if (copy_from_user(&irq, argp, sizeof irq))
1745                         goto out;
1746                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1747                 if (r)
1748                         goto out;
1749                 r = 0;
1750                 break;
1751         }
1752         case KVM_DEBUG_GUEST: {
1753                 struct kvm_debug_guest dbg;
1754
1755                 r = -EFAULT;
1756                 if (copy_from_user(&dbg, argp, sizeof dbg))
1757                         goto out;
1758                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
1759                 if (r)
1760                         goto out;
1761                 r = 0;
1762                 break;
1763         }
1764         case KVM_SET_SIGNAL_MASK: {
1765                 struct kvm_signal_mask __user *sigmask_arg = argp;
1766                 struct kvm_signal_mask kvm_sigmask;
1767                 sigset_t sigset, *p;
1768
1769                 p = NULL;
1770                 if (argp) {
1771                         r = -EFAULT;
1772                         if (copy_from_user(&kvm_sigmask, argp,
1773                                            sizeof kvm_sigmask))
1774                                 goto out;
1775                         r = -EINVAL;
1776                         if (kvm_sigmask.len != sizeof sigset)
1777                                 goto out;
1778                         r = -EFAULT;
1779                         if (copy_from_user(&sigset, sigmask_arg->sigset,
1780                                            sizeof sigset))
1781                                 goto out;
1782                         p = &sigset;
1783                 }
1784                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1785                 break;
1786         }
1787         case KVM_GET_FPU: {
1788                 struct kvm_fpu fpu;
1789
1790                 memset(&fpu, 0, sizeof fpu);
1791                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
1792                 if (r)
1793                         goto out;
1794                 r = -EFAULT;
1795                 if (copy_to_user(argp, &fpu, sizeof fpu))
1796                         goto out;
1797                 r = 0;
1798                 break;
1799         }
1800         case KVM_SET_FPU: {
1801                 struct kvm_fpu fpu;
1802
1803                 r = -EFAULT;
1804                 if (copy_from_user(&fpu, argp, sizeof fpu))
1805                         goto out;
1806                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
1807                 if (r)
1808                         goto out;
1809                 r = 0;
1810                 break;
1811         }
1812         default:
1813                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1814         }
1815 out:
1816         return r;
1817 }
1818
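/*
 * Dispatcher for ioctls issued on a VM file descriptor; unhandled ioctls
 * fall through to kvm_arch_vm_ioctl().
 */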
1819 static long kvm_vm_ioctl(struct file *filp,
1820                            unsigned int ioctl, unsigned long arg)
1821 {
1822         struct kvm *kvm = filp->private_data;
1823         void __user *argp = (void __user *)arg;
1824         int r;
1825
1826         switch (ioctl) {
1827         case KVM_CREATE_VCPU:
1828                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1829                 if (r < 0)
1830                         goto out;
1831                 break;
1832         case KVM_SET_USER_MEMORY_REGION: {
1833                 struct kvm_userspace_memory_region kvm_userspace_mem;
1834
1835                 r = -EFAULT;
1836                 if (copy_from_user(&kvm_userspace_mem, argp,
1837                                                 sizeof kvm_userspace_mem))
1838                         goto out;
1839
1840                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1841                 if (r)
1842                         goto out;
1843                 break;
1844         }
1845         case KVM_GET_DIRTY_LOG: {
1846                 struct kvm_dirty_log log;
1847
1848                 r = -EFAULT;
1849                 if (copy_from_user(&log, argp, sizeof log))
1850                         goto out;
1851                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1852                 if (r)
1853                         goto out;
1854                 break;
1855         }
1856         default:
1857                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1858         }
1859 out:
1860         return r;
1861 }
1862
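/*
 * ->nopage handler for mmap()s of a VM fd: the page offset into the
 * mapping is interpreted as a guest frame number, so faulting on the
 * mapping pulls in the corresponding page of guest memory.
 */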
1863 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
1864                                   unsigned long address,
1865                                   int *type)
1866 {
1867         struct kvm *kvm = vma->vm_file->private_data;
1868         unsigned long pgoff;
1869         struct page *page;
1870
1871         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1872         if (!kvm_is_visible_gfn(kvm, pgoff))
1873                 return NOPAGE_SIGBUS;
1874         /* current->mm->mmap_sem is already held so call lockless version */
1875         page = __gfn_to_page(kvm, pgoff);
1876         if (is_error_page(page)) {
1877                 kvm_release_page(page);
1878                 return NOPAGE_SIGBUS;
1879         }
1880         if (type != NULL)
1881                 *type = VM_FAULT_MINOR;
1882
1883         return page;
1884 }
1885
1886 static struct vm_operations_struct kvm_vm_vm_ops = {
1887         .nopage = kvm_vm_nopage,
1888 };
1889
1890 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1891 {
1892         vma->vm_ops = &kvm_vm_vm_ops;
1893         return 0;
1894 }
1895
1896 static struct file_operations kvm_vm_fops = {
1897         .release        = kvm_vm_release,
1898         .unlocked_ioctl = kvm_vm_ioctl,
1899         .compat_ioctl   = kvm_vm_ioctl,
1900         .mmap           = kvm_vm_mmap,
1901 };
1902
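/*
 * Allocate a new VM and wrap it in an anonymous-inode file bound to
 * kvm_vm_fops; the returned file descriptor is what userspace uses for
 * the per-VM ioctls above.
 */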
1903 static int kvm_dev_ioctl_create_vm(void)
1904 {
1905         int fd, r;
1906         struct inode *inode;
1907         struct file *file;
1908         struct kvm *kvm;
1909
1910         kvm = kvm_create_vm();
1911         if (IS_ERR(kvm))
1912                 return PTR_ERR(kvm);
1913         r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
1914         if (r) {
1915                 kvm_destroy_vm(kvm);
1916                 return r;
1917         }
1918
1919         kvm->filp = file;
1920
1921         return fd;
1922 }
1923
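/*
 * ioctls on /dev/kvm itself: API version and extension queries, VM
 * creation and the vcpu mmap size; the rest is left to the architecture
 * code via kvm_arch_dev_ioctl().
 */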
1924 static long kvm_dev_ioctl(struct file *filp,
1925                           unsigned int ioctl, unsigned long arg)
1926 {
1927         void __user *argp = (void __user *)arg;
1928         long r = -EINVAL;
1929
1930         switch (ioctl) {
1931         case KVM_GET_API_VERSION:
1932                 r = -EINVAL;
1933                 if (arg)
1934                         goto out;
1935                 r = KVM_API_VERSION;
1936                 break;
1937         case KVM_CREATE_VM:
1938                 r = -EINVAL;
1939                 if (arg)
1940                         goto out;
1941                 r = kvm_dev_ioctl_create_vm();
1942                 break;
1943         case KVM_CHECK_EXTENSION: {
1944                 int ext = (long)argp;
1945
1946                 switch (ext) {
1947                 case KVM_CAP_IRQCHIP:
1948                 case KVM_CAP_HLT:
1949                 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1950                 case KVM_CAP_USER_MEMORY:
1951                 case KVM_CAP_SET_TSS_ADDR:
1952                         r = 1;
1953                         break;
1954                 default:
1955                         r = 0;
1956                         break;
1957                 }
1958                 break;
1959         }
1960         case KVM_GET_VCPU_MMAP_SIZE:
1961                 r = -EINVAL;
1962                 if (arg)
1963                         goto out;
1964                 r = 2 * PAGE_SIZE;
1965                 break;
1966         default:
1967                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
1968         }
1969 out:
1970         return r;
1971 }
1972
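/*
 * Userspace reaches the ioctls above through the /dev/kvm character
 * device registered below.  A rough, illustrative sketch of the usual
 * sequence (error handling omitted, not part of this file):
 *
 *	kvm  = open("/dev/kvm", O_RDWR);
 *	if (ioctl(kvm, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		abort();
 *	vm   = ioctl(kvm, KVM_CREATE_VM, 0);
 *	ioctl(vm, KVM_SET_USER_MEMORY_REGION, &mem);
 *	vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
 *	size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	run  = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu, 0);
 *	ioctl(vcpu, KVM_RUN, 0);
 */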
1973 static struct file_operations kvm_chardev_ops = {
1974         .unlocked_ioctl = kvm_dev_ioctl,
1975         .compat_ioctl   = kvm_dev_ioctl,
1976 };
1977
1978 static struct miscdevice kvm_dev = {
1979         KVM_MINOR,
1980         "kvm",
1981         &kvm_chardev_ops,
1982 };
1983
1984 /*
1985  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
1986  * cached on it.
1987  */
1988 static void decache_vcpus_on_cpu(int cpu)
1989 {
1990         struct kvm *vm;
1991         struct kvm_vcpu *vcpu;
1992         int i;
1993
1994         spin_lock(&kvm_lock);
1995         list_for_each_entry(vm, &vm_list, vm_list)
1996                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
1997                         vcpu = vm->vcpus[i];
1998                         if (!vcpu)
1999                                 continue;
2000                         /*
2001                          * If the vcpu is locked, then it is running on some
2002                          * other cpu and therefore it is not cached on the
2003                          * cpu in question.
2004                          *
2005                          * If it's not locked, check the last cpu it executed
2006                          * on.
2007                          */
2008                         if (mutex_trylock(&vcpu->mutex)) {
2009                                 if (vcpu->cpu == cpu) {
2010                                         kvm_x86_ops->vcpu_decache(vcpu);
2011                                         vcpu->cpu = -1;
2012                                 }
2013                                 mutex_unlock(&vcpu->mutex);
2014                         }
2015                 }
2016         spin_unlock(&kvm_lock);
2017 }
2018
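/*
 * Enable/disable the virtualization extensions on the current cpu.  These
 * run on each cpu via on_each_cpu()/smp_call_function_single() (or
 * directly on the suspend/resume and cpu-hotplug paths);
 * cpus_hardware_enabled makes the operations idempotent per cpu.
 */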
2019 static void hardware_enable(void *junk)
2020 {
2021         int cpu = raw_smp_processor_id();
2022
2023         if (cpu_isset(cpu, cpus_hardware_enabled))
2024                 return;
2025         cpu_set(cpu, cpus_hardware_enabled);
2026         kvm_x86_ops->hardware_enable(NULL);
2027 }
2028
2029 static void hardware_disable(void *junk)
2030 {
2031         int cpu = raw_smp_processor_id();
2032
2033         if (!cpu_isset(cpu, cpus_hardware_enabled))
2034                 return;
2035         cpu_clear(cpu, cpus_hardware_enabled);
2036         decache_vcpus_on_cpu(cpu);
2037         kvm_x86_ops->hardware_disable(NULL);
2038 }
2039
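/*
 * CPU hotplug notifier.  CPU_DYING notifiers run on the dying cpu itself,
 * so hardware_disable() can be called directly there; the other
 * transitions send an IPI to the target cpu instead.
 */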
2040 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2041                            void *v)
2042 {
2043         int cpu = (long)v;
2044
2045         switch (val) {
2046         case CPU_DYING:
2047         case CPU_DYING_FROZEN:
2048                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2049                        cpu);
2050                 hardware_disable(NULL);
2051                 break;
2052         case CPU_UP_CANCELED:
2053         case CPU_UP_CANCELED_FROZEN:
2054                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2055                        cpu);
2056                 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
2057                 break;
2058         case CPU_ONLINE:
2059         case CPU_ONLINE_FROZEN:
2060                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2061                        cpu);
2062                 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
2063                 break;
2064         }
2065         return NOTIFY_OK;
2066 }
2067
2068 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2069                       void *v)
2070 {
2071         if (val == SYS_RESTART) {
2072                 /*
2073                  * Some (well, at least mine) BIOSes hang on reboot if
2074                  * in vmx root mode.
2075                  */
2076                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2077                 on_each_cpu(hardware_disable, NULL, 0, 1);
2078         }
2079         return NOTIFY_OK;
2080 }
2081
2082 static struct notifier_block kvm_reboot_notifier = {
2083         .notifier_call = kvm_reboot,
2084         .priority = 0,
2085 };
2086
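/*
 * A kvm_io_bus is a small fixed-size array of in-kernel emulated devices;
 * kvm_io_bus_find_dev() returns the first device whose in_range() callback
 * claims the given guest physical address.
 */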
2087 void kvm_io_bus_init(struct kvm_io_bus *bus)
2088 {
2089         memset(bus, 0, sizeof(*bus));
2090 }
2091
2092 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2093 {
2094         int i;
2095
2096         for (i = 0; i < bus->dev_count; i++) {
2097                 struct kvm_io_device *pos = bus->devs[i];
2098
2099                 kvm_iodevice_destructor(pos);
2100         }
2101 }
2102
2103 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
2104 {
2105         int i;
2106
2107         for (i = 0; i < bus->dev_count; i++) {
2108                 struct kvm_io_device *pos = bus->devs[i];
2109
2110                 if (pos->in_range(pos, addr))
2111                         return pos;
2112         }
2113
2114         return NULL;
2115 }
2116
2117 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
2118 {
2119         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
2120
2121         bus->devs[bus->dev_count++] = dev;
2122 }
2123
2124 static struct notifier_block kvm_cpu_notifier = {
2125         .notifier_call = kvm_cpu_hotplug,
2126         .priority = 20, /* must be > scheduler priority */
2127 };
2128
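/*
 * debugfs read callback: _offset is the byte offset of a u32 counter
 * inside struct kvm_vcpu (taken from the architecture's debugfs_entries
 * table); return the sum of that counter over every vcpu of every VM.
 */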
2129 static u64 stat_get(void *_offset)
2130 {
2131         unsigned offset = (long)_offset;
2132         u64 total = 0;
2133         struct kvm *kvm;
2134         struct kvm_vcpu *vcpu;
2135         int i;
2136
2137         spin_lock(&kvm_lock);
2138         list_for_each_entry(kvm, &vm_list, vm_list)
2139                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2140                         vcpu = kvm->vcpus[i];
2141                         if (vcpu)
2142                                 total += *(u32 *)((void *)vcpu + offset);
2143                 }
2144         spin_unlock(&kvm_lock);
2145         return total;
2146 }
2147
2148 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
2149
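/*
 * Create the "kvm" debugfs directory (conventionally mounted at
 * /sys/kernel/debug) with one read-only file per architecture-defined
 * statistic, all backed by stat_fops above.
 */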
2150 static __init void kvm_init_debug(void)
2151 {
2152         struct kvm_stats_debugfs_item *p;
2153
2154         debugfs_dir = debugfs_create_dir("kvm", NULL);
2155         for (p = debugfs_entries; p->name; ++p)
2156                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
2157                                                 (void *)(long)p->offset,
2158                                                 &stat_fops);
2159 }
2160
2161 static void kvm_exit_debug(void)
2162 {
2163         struct kvm_stats_debugfs_item *p;
2164
2165         for (p = debugfs_entries; p->name; ++p)
2166                 debugfs_remove(p->dentry);
2167         debugfs_remove(debugfs_dir);
2168 }
2169
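/*
 * Power management hooks: virtualization must be turned off before the
 * system suspends and re-enabled on resume.
 */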
2170 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2171 {
2172         hardware_disable(NULL);
2173         return 0;
2174 }
2175
2176 static int kvm_resume(struct sys_device *dev)
2177 {
2178         hardware_enable(NULL);
2179         return 0;
2180 }
2181
2182 static struct sysdev_class kvm_sysdev_class = {
2183         .name = "kvm",
2184         .suspend = kvm_suspend,
2185         .resume = kvm_resume,
2186 };
2187
2188 static struct sys_device kvm_sysdev = {
2189         .id = 0,
2190         .cls = &kvm_sysdev_class,
2191 };
2192
2193 struct page *bad_page;
2194
2195 static inline
2196 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2197 {
2198         return container_of(pn, struct kvm_vcpu, preempt_notifier);
2199 }
2200
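/*
 * Preempt notifier hooks: while a task is inside vcpu_load()/vcpu_put(),
 * being scheduled out and back in must save and reload the guest state on
 * whatever cpu the task lands on.
 */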
2201 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2202 {
2203         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2204
2205         kvm_x86_ops->vcpu_load(vcpu, cpu);
2206 }
2207
2208 static void kvm_sched_out(struct preempt_notifier *pn,
2209                           struct task_struct *next)
2210 {
2211         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2212
2213         kvm_x86_ops->vcpu_put(vcpu);
2214 }
2215
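/*
 * Called by a hardware backend module (e.g. the VT-x or SVM implementation)
 * to register its kvm_x86_ops: verify processor support, enable
 * virtualization on all online cpus, and register the notifiers, sysdev,
 * vcpu slab cache and the /dev/kvm misc device.
 */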
2216 int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
2217                   struct module *module)
2218 {
2219         int r;
2220         int cpu;
2221
2222         if (kvm_x86_ops) {
2223                 printk(KERN_ERR "kvm: already loaded the other module\n");
2224                 return -EEXIST;
2225         }
2226
2227         if (!ops->cpu_has_kvm_support()) {
2228                 printk(KERN_ERR "kvm: no hardware support\n");
2229                 return -EOPNOTSUPP;
2230         }
2231         if (ops->disabled_by_bios()) {
2232                 printk(KERN_ERR "kvm: disabled by bios\n");
2233                 return -EOPNOTSUPP;
2234         }
2235
2236         kvm_x86_ops = ops;
2237
2238         r = kvm_x86_ops->hardware_setup();
2239         if (r < 0)
2240                 goto out;
2241
2242         for_each_online_cpu(cpu) {
2243                 smp_call_function_single(cpu,
2244                                 kvm_x86_ops->check_processor_compatibility,
2245                                 &r, 0, 1);
2246                 if (r < 0)
2247                         goto out_free_0;
2248         }
2249
2250         on_each_cpu(hardware_enable, NULL, 0, 1);
2251         r = register_cpu_notifier(&kvm_cpu_notifier);
2252         if (r)
2253                 goto out_free_1;
2254         register_reboot_notifier(&kvm_reboot_notifier);
2255
2256         r = sysdev_class_register(&kvm_sysdev_class);
2257         if (r)
2258                 goto out_free_2;
2259
2260         r = sysdev_register(&kvm_sysdev);
2261         if (r)
2262                 goto out_free_3;
2263
2264         /* A kmem cache lets us meet the alignment requirements of fx_save. */
2265         kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
2266                                            __alignof__(struct kvm_vcpu), 0, NULL);
2267         if (!kvm_vcpu_cache) {
2268                 r = -ENOMEM;
2269                 goto out_free_4;
2270         }
2271
2272         kvm_chardev_ops.owner = module;
2273
2274         r = misc_register(&kvm_dev);
2275         if (r) {
2276                 printk(KERN_ERR "kvm: misc device register failed\n");
2277                 goto out_free;
2278         }
2279
2280         kvm_preempt_ops.sched_in = kvm_sched_in;
2281         kvm_preempt_ops.sched_out = kvm_sched_out;
2282
2283         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2284
2285         return 0;
2286
2287 out_free:
2288         kmem_cache_destroy(kvm_vcpu_cache);
2289 out_free_4:
2290         sysdev_unregister(&kvm_sysdev);
2291 out_free_3:
2292         sysdev_class_unregister(&kvm_sysdev_class);
2293 out_free_2:
2294         unregister_reboot_notifier(&kvm_reboot_notifier);
2295         unregister_cpu_notifier(&kvm_cpu_notifier);
2296 out_free_1:
2297         on_each_cpu(hardware_disable, NULL, 0, 1);
2298 out_free_0:
2299         kvm_x86_ops->hardware_unsetup();
2300 out:
2301         kvm_x86_ops = NULL;
2302         return r;
2303 }
2304 EXPORT_SYMBOL_GPL(kvm_init_x86);
2305
2306 void kvm_exit_x86(void)
2307 {
2308         misc_deregister(&kvm_dev);
2309         kmem_cache_destroy(kvm_vcpu_cache);
2310         sysdev_unregister(&kvm_sysdev);
2311         sysdev_class_unregister(&kvm_sysdev_class);
2312         unregister_reboot_notifier(&kvm_reboot_notifier);
2313         unregister_cpu_notifier(&kvm_cpu_notifier);
2314         on_each_cpu(hardware_disable, NULL, 0, 1);
2315         kvm_x86_ops->hardware_unsetup();
2316         kvm_x86_ops = NULL;
2317 }
2318 EXPORT_SYMBOL_GPL(kvm_exit_x86);
2319
2320 static __init int kvm_init(void)
2321 {
2322         int r;
2323
2324         r = kvm_mmu_module_init();
2325         if (r)
2326                 goto out4;
2327
2328         kvm_init_debug();
2329
2330         kvm_arch_init();
2331
2332         bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2333
2334         if (bad_page == NULL) {
2335                 r = -ENOMEM;
2336                 goto out;
2337         }
2338
2339         return 0;
2340
2341 out:
2342         kvm_exit_debug();
2343         kvm_mmu_module_exit();
2344 out4:
2345         return r;
2346 }
2347
2348 static __exit void kvm_exit(void)
2349 {
2350         kvm_exit_debug();
2351         __free_page(bad_page);
2352         kvm_mmu_module_exit();
2353 }
2354
2355 module_init(kvm_init)
2356 module_exit(kvm_exit)