KVM: MMU: out of sync shadow core
arch/x86/kvm/mmu.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  *
11  * Authors:
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Avi Kivity   <avi@qumranet.com>
14  *
15  * This work is licensed under the terms of the GNU GPL, version 2.  See
16  * the COPYING file in the top-level directory.
17  *
18  */
19
20 #include "vmx.h"
21 #include "mmu.h"
22
23 #include <linux/kvm_host.h>
24 #include <linux/types.h>
25 #include <linux/string.h>
26 #include <linux/mm.h>
27 #include <linux/highmem.h>
28 #include <linux/module.h>
29 #include <linux/swap.h>
30 #include <linux/hugetlb.h>
31 #include <linux/compiler.h>
32
33 #include <asm/page.h>
34 #include <asm/cmpxchg.h>
35 #include <asm/io.h>
36
37 /*
38  * When this variable is set to true it enables Two-Dimensional Paging (TDP),
39  * where the hardware walks two page tables:
40  * 1. the guest-virtual to guest-physical translation
41  * 2. while doing 1., the guest-physical to host-physical translation
42  * If the hardware supports this, we don't need to do shadow paging.
43  */
44 bool tdp_enabled = false;
45
46 #undef MMU_DEBUG
47
48 #undef AUDIT
49
50 #ifdef AUDIT
51 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
52 #else
53 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
54 #endif
55
56 #ifdef MMU_DEBUG
57
58 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
59 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
60
61 #else
62
63 #define pgprintk(x...) do { } while (0)
64 #define rmap_printk(x...) do { } while (0)
65
66 #endif
67
68 #if defined(MMU_DEBUG) || defined(AUDIT)
69 static int dbg = 0;
70 module_param(dbg, bool, 0644);
71 #endif
72
73 #ifndef MMU_DEBUG
74 #define ASSERT(x) do { } while (0)
75 #else
76 #define ASSERT(x)                                                       \
77         if (!(x)) {                                                     \
78                 printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
79                        __FILE__, __LINE__, #x);                         \
80         }
81 #endif
82
83 #define PT_FIRST_AVAIL_BITS_SHIFT 9
84 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
85
86 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
87
88 #define PT64_LEVEL_BITS 9
89
90 #define PT64_LEVEL_SHIFT(level) \
91                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
92
93 #define PT64_LEVEL_MASK(level) \
94                 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
95
96 #define PT64_INDEX(address, level)\
97         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
98
99
100 #define PT32_LEVEL_BITS 10
101
102 #define PT32_LEVEL_SHIFT(level) \
103                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
104
105 #define PT32_LEVEL_MASK(level) \
106                 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
107
108 #define PT32_INDEX(address, level)\
109         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
110
111
112 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
113 #define PT64_DIR_BASE_ADDR_MASK \
114         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
115
116 #define PT32_BASE_ADDR_MASK PAGE_MASK
117 #define PT32_DIR_BASE_ADDR_MASK \
118         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
119
120 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
121                         | PT64_NX_MASK)
122
123 #define PFERR_PRESENT_MASK (1U << 0)
124 #define PFERR_WRITE_MASK (1U << 1)
125 #define PFERR_USER_MASK (1U << 2)
126 #define PFERR_FETCH_MASK (1U << 4)
127
128 #define PT_DIRECTORY_LEVEL 2
129 #define PT_PAGE_TABLE_LEVEL 1
130
131 #define RMAP_EXT 4
132
133 #define ACC_EXEC_MASK    1
134 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
135 #define ACC_USER_MASK    PT_USER_MASK
136 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
137
138 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
139
140 struct kvm_rmap_desc {
141         u64 *shadow_ptes[RMAP_EXT];
142         struct kvm_rmap_desc *more;
143 };
144
145 struct kvm_shadow_walk {
146         int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
147                      u64 addr, u64 *spte, int level);
148 };
149
150 struct kvm_unsync_walk {
151         int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
152 };
153
154 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
155
156 static struct kmem_cache *pte_chain_cache;
157 static struct kmem_cache *rmap_desc_cache;
158 static struct kmem_cache *mmu_page_header_cache;
159
160 static u64 __read_mostly shadow_trap_nonpresent_pte;
161 static u64 __read_mostly shadow_notrap_nonpresent_pte;
162 static u64 __read_mostly shadow_base_present_pte;
163 static u64 __read_mostly shadow_nx_mask;
164 static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
165 static u64 __read_mostly shadow_user_mask;
166 static u64 __read_mostly shadow_accessed_mask;
167 static u64 __read_mostly shadow_dirty_mask;
168
169 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
170 {
171         shadow_trap_nonpresent_pte = trap_pte;
172         shadow_notrap_nonpresent_pte = notrap_pte;
173 }
174 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
175
176 void kvm_mmu_set_base_ptes(u64 base_pte)
177 {
178         shadow_base_present_pte = base_pte;
179 }
180 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
181
182 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
183                 u64 dirty_mask, u64 nx_mask, u64 x_mask)
184 {
185         shadow_user_mask = user_mask;
186         shadow_accessed_mask = accessed_mask;
187         shadow_dirty_mask = dirty_mask;
188         shadow_nx_mask = nx_mask;
189         shadow_x_mask = x_mask;
190 }
191 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
192
193 static int is_write_protection(struct kvm_vcpu *vcpu)
194 {
195         return vcpu->arch.cr0 & X86_CR0_WP;
196 }
197
198 static int is_cpuid_PSE36(void)
199 {
200         return 1;
201 }
202
203 static int is_nx(struct kvm_vcpu *vcpu)
204 {
205         return vcpu->arch.shadow_efer & EFER_NX;
206 }
207
208 static int is_present_pte(unsigned long pte)
209 {
210         return pte & PT_PRESENT_MASK;
211 }
212
213 static int is_shadow_present_pte(u64 pte)
214 {
215         return pte != shadow_trap_nonpresent_pte
216                 && pte != shadow_notrap_nonpresent_pte;
217 }
218
219 static int is_large_pte(u64 pte)
220 {
221         return pte & PT_PAGE_SIZE_MASK;
222 }
223
224 static int is_writeble_pte(unsigned long pte)
225 {
226         return pte & PT_WRITABLE_MASK;
227 }
228
229 static int is_dirty_pte(unsigned long pte)
230 {
231         return pte & shadow_dirty_mask;
232 }
233
234 static int is_rmap_pte(u64 pte)
235 {
236         return is_shadow_present_pte(pte);
237 }
238
239 static pfn_t spte_to_pfn(u64 pte)
240 {
241         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
242 }
243
244 static gfn_t pse36_gfn_delta(u32 gpte)
245 {
246         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
247
248         return (gpte & PT32_DIR_PSE36_MASK) << shift;
249 }
250
251 static void set_shadow_pte(u64 *sptep, u64 spte)
252 {
253 #ifdef CONFIG_X86_64
254         set_64bit((unsigned long *)sptep, spte);
255 #else
256         set_64bit((unsigned long long *)sptep, spte);
257 #endif
258 }
259
260 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
261                                   struct kmem_cache *base_cache, int min)
262 {
263         void *obj;
264
265         if (cache->nobjs >= min)
266                 return 0;
267         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
268                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
269                 if (!obj)
270                         return -ENOMEM;
271                 cache->objects[cache->nobjs++] = obj;
272         }
273         return 0;
274 }
275
276 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
277 {
278         while (mc->nobjs)
279                 kfree(mc->objects[--mc->nobjs]);
280 }
281
282 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
283                                        int min)
284 {
285         struct page *page;
286
287         if (cache->nobjs >= min)
288                 return 0;
289         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
290                 page = alloc_page(GFP_KERNEL);
291                 if (!page)
292                         return -ENOMEM;
293                 set_page_private(page, 0);
294                 cache->objects[cache->nobjs++] = page_address(page);
295         }
296         return 0;
297 }
298
299 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
300 {
301         while (mc->nobjs)
302                 free_page((unsigned long)mc->objects[--mc->nobjs]);
303 }
304
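/*
 * Pre-fill the per-vcpu object caches with GFP_KERNEL allocations here,
 * so that pte chains, rmap descriptors and shadow pages can later be
 * taken from the caches (e.g. in the fault paths, under mmu_lock)
 * without sleeping.
 */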
305 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
306 {
307         int r;
308
309         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
310                                    pte_chain_cache, 4);
311         if (r)
312                 goto out;
313         r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
314                                    rmap_desc_cache, 1);
315         if (r)
316                 goto out;
317         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
318         if (r)
319                 goto out;
320         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
321                                    mmu_page_header_cache, 4);
322 out:
323         return r;
324 }
325
326 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
327 {
328         mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
329         mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
330         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
331         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
332 }
333
334 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
335                                     size_t size)
336 {
337         void *p;
338
339         BUG_ON(!mc->nobjs);
340         p = mc->objects[--mc->nobjs];
341         memset(p, 0, size);
342         return p;
343 }
344
345 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
346 {
347         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
348                                       sizeof(struct kvm_pte_chain));
349 }
350
351 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
352 {
353         kfree(pc);
354 }
355
356 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
357 {
358         return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
359                                       sizeof(struct kvm_rmap_desc));
360 }
361
362 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
363 {
364         kfree(rd);
365 }
366
367 /*
368  * Return the pointer to the largepage write count for a given
369  * gfn, handling slots that are not large page aligned.
370  */
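/*
 * Illustrative example (assuming KVM_PAGES_PER_HPAGE == 512, i.e. 2MB
 * large pages built from 4KB frames): for a slot with base_gfn 0x800,
 * gfn 0xa00 gives idx = (0xa00 / 512) - (0x800 / 512) = 5 - 4 = 1.
 */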
371 static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
372 {
373         unsigned long idx;
374
375         idx = (gfn / KVM_PAGES_PER_HPAGE) -
376               (slot->base_gfn / KVM_PAGES_PER_HPAGE);
377         return &slot->lpage_info[idx].write_count;
378 }
379
380 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
381 {
382         int *write_count;
383
384         write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
385         *write_count += 1;
386 }
387
388 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
389 {
390         int *write_count;
391
392         write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
393         *write_count -= 1;
394         WARN_ON(*write_count < 0);
395 }
396
397 static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
398 {
399         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
400         int *largepage_idx;
401
402         if (slot) {
403                 largepage_idx = slot_largepage_idx(gfn, slot);
404                 return *largepage_idx;
405         }
406
407         return 1;
408 }
409
410 static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
411 {
412         struct vm_area_struct *vma;
413         unsigned long addr;
414         int ret = 0;
415
416         addr = gfn_to_hva(kvm, gfn);
417         if (kvm_is_error_hva(addr))
418                 return ret;
419
420         down_read(&current->mm->mmap_sem);
421         vma = find_vma(current->mm, addr);
422         if (vma && is_vm_hugetlb_page(vma))
423                 ret = 1;
424         up_read(&current->mm->mmap_sem);
425
426         return ret;
427 }
428
429 static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
430 {
431         struct kvm_memory_slot *slot;
432
433         if (has_wrprotected_page(vcpu->kvm, large_gfn))
434                 return 0;
435
436         if (!host_largepage_backed(vcpu->kvm, large_gfn))
437                 return 0;
438
439         slot = gfn_to_memslot(vcpu->kvm, large_gfn);
440         if (slot && slot->dirty_bitmap)
441                 return 0;
442
443         return 1;
444 }
445
446 /*
447  * Take gfn and return the reverse mapping to it.
448  * Note: gfn must be unaliased before this function gets called.
449  */
450
451 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
452 {
453         struct kvm_memory_slot *slot;
454         unsigned long idx;
455
456         slot = gfn_to_memslot(kvm, gfn);
457         if (!lpage)
458                 return &slot->rmap[gfn - slot->base_gfn];
459
460         idx = (gfn / KVM_PAGES_PER_HPAGE) -
461               (slot->base_gfn / KVM_PAGES_PER_HPAGE);
462
463         return &slot->lpage_info[idx].rmap_pde;
464 }
465
466 /*
467  * Reverse mapping data structures:
468  *
469  * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
470  * that points to page_address(page).
471  *
472  * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
473  * containing more mappings.
474  */
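/*
 * In other words, *rmapp is a tagged pointer: the first spte mapping a
 * gfn is stored directly (bit zero clear); once a second mapping is
 * added, rmap_add() allocates a kvm_rmap_desc, moves both sptes into
 * shadow_ptes[] and stores (desc | 1), so readers such as rmap_next()
 * know to follow the descriptor chain instead.
 */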
475 static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
476 {
477         struct kvm_mmu_page *sp;
478         struct kvm_rmap_desc *desc;
479         unsigned long *rmapp;
480         int i;
481
482         if (!is_rmap_pte(*spte))
483                 return;
484         gfn = unalias_gfn(vcpu->kvm, gfn);
485         sp = page_header(__pa(spte));
486         sp->gfns[spte - sp->spt] = gfn;
487         rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
488         if (!*rmapp) {
489                 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
490                 *rmapp = (unsigned long)spte;
491         } else if (!(*rmapp & 1)) {
492                 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
493                 desc = mmu_alloc_rmap_desc(vcpu);
494                 desc->shadow_ptes[0] = (u64 *)*rmapp;
495                 desc->shadow_ptes[1] = spte;
496                 *rmapp = (unsigned long)desc | 1;
497         } else {
498                 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
499                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
500                 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
501                         desc = desc->more;
502                 if (desc->shadow_ptes[RMAP_EXT-1]) {
503                         desc->more = mmu_alloc_rmap_desc(vcpu);
504                         desc = desc->more;
505                 }
506                 for (i = 0; desc->shadow_ptes[i]; ++i)
507                         ;
508                 desc->shadow_ptes[i] = spte;
509         }
510 }
511
512 static void rmap_desc_remove_entry(unsigned long *rmapp,
513                                    struct kvm_rmap_desc *desc,
514                                    int i,
515                                    struct kvm_rmap_desc *prev_desc)
516 {
517         int j;
518
519         for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
520                 ;
521         desc->shadow_ptes[i] = desc->shadow_ptes[j];
522         desc->shadow_ptes[j] = NULL;
523         if (j != 0)
524                 return;
525         if (!prev_desc && !desc->more)
526                 *rmapp = (unsigned long)desc->shadow_ptes[0];
527         else
528                 if (prev_desc)
529                         prev_desc->more = desc->more;
530                 else
531                         *rmapp = (unsigned long)desc->more | 1;
532         mmu_free_rmap_desc(desc);
533 }
534
535 static void rmap_remove(struct kvm *kvm, u64 *spte)
536 {
537         struct kvm_rmap_desc *desc;
538         struct kvm_rmap_desc *prev_desc;
539         struct kvm_mmu_page *sp;
540         pfn_t pfn;
541         unsigned long *rmapp;
542         int i;
543
544         if (!is_rmap_pte(*spte))
545                 return;
546         sp = page_header(__pa(spte));
547         pfn = spte_to_pfn(*spte);
548         if (*spte & shadow_accessed_mask)
549                 kvm_set_pfn_accessed(pfn);
550         if (is_writeble_pte(*spte))
551                 kvm_release_pfn_dirty(pfn);
552         else
553                 kvm_release_pfn_clean(pfn);
554         rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
555         if (!*rmapp) {
556                 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
557                 BUG();
558         } else if (!(*rmapp & 1)) {
559                 rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
560                 if ((u64 *)*rmapp != spte) {
561                         printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
562                                spte, *spte);
563                         BUG();
564                 }
565                 *rmapp = 0;
566         } else {
567                 rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
568                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
569                 prev_desc = NULL;
570                 while (desc) {
571                         for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
572                                 if (desc->shadow_ptes[i] == spte) {
573                                         rmap_desc_remove_entry(rmapp,
574                                                                desc, i,
575                                                                prev_desc);
576                                         return;
577                                 }
578                         prev_desc = desc;
579                         desc = desc->more;
580                 }
581                 BUG();
582         }
583 }
584
585 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
586 {
587         struct kvm_rmap_desc *desc;
588         struct kvm_rmap_desc *prev_desc;
589         u64 *prev_spte;
590         int i;
591
592         if (!*rmapp)
593                 return NULL;
594         else if (!(*rmapp & 1)) {
595                 if (!spte)
596                         return (u64 *)*rmapp;
597                 return NULL;
598         }
599         desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
600         prev_desc = NULL;
601         prev_spte = NULL;
602         while (desc) {
603                 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
604                         if (prev_spte == spte)
605                                 return desc->shadow_ptes[i];
606                         prev_spte = desc->shadow_ptes[i];
607                 }
608                 desc = desc->more;
609         }
610         return NULL;
611 }
612
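/*
 * Remove write access from every spte that maps @gfn; writable 2MB
 * mappings are zapped outright.  The remote TLBs are flushed if any
 * spte was actually changed.
 */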
613 static void rmap_write_protect(struct kvm *kvm, u64 gfn)
614 {
615         unsigned long *rmapp;
616         u64 *spte;
617         int write_protected = 0;
618
619         gfn = unalias_gfn(kvm, gfn);
620         rmapp = gfn_to_rmap(kvm, gfn, 0);
621
622         spte = rmap_next(kvm, rmapp, NULL);
623         while (spte) {
624                 BUG_ON(!spte);
625                 BUG_ON(!(*spte & PT_PRESENT_MASK));
626                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
627                 if (is_writeble_pte(*spte)) {
628                         set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
629                         write_protected = 1;
630                 }
631                 spte = rmap_next(kvm, rmapp, spte);
632         }
633         if (write_protected) {
634                 pfn_t pfn;
635
636                 spte = rmap_next(kvm, rmapp, NULL);
637                 pfn = spte_to_pfn(*spte);
638                 kvm_set_pfn_dirty(pfn);
639         }
640
641         /* check for huge page mappings */
642         rmapp = gfn_to_rmap(kvm, gfn, 1);
643         spte = rmap_next(kvm, rmapp, NULL);
644         while (spte) {
645                 BUG_ON(!spte);
646                 BUG_ON(!(*spte & PT_PRESENT_MASK));
647                 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
648                 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
649                 if (is_writeble_pte(*spte)) {
650                         rmap_remove(kvm, spte);
651                         --kvm->stat.lpages;
652                         set_shadow_pte(spte, shadow_trap_nonpresent_pte);
653                         spte = NULL;
654                         write_protected = 1;
655                 }
656                 spte = rmap_next(kvm, rmapp, spte);
657         }
658
659         if (write_protected)
660                 kvm_flush_remote_tlbs(kvm);
661 }
662
663 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
664 {
665         u64 *spte;
666         int need_tlb_flush = 0;
667
668         while ((spte = rmap_next(kvm, rmapp, NULL))) {
669                 BUG_ON(!(*spte & PT_PRESENT_MASK));
670                 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
671                 rmap_remove(kvm, spte);
672                 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
673                 need_tlb_flush = 1;
674         }
675         return need_tlb_flush;
676 }
677
678 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
679                           int (*handler)(struct kvm *kvm, unsigned long *rmapp))
680 {
681         int i;
682         int retval = 0;
683
684         /*
685          * If mmap_sem isn't taken, we can still walk the memslots with only
686          * the mmu_lock held, by skipping over the slots with userspace_addr == 0.
687          */
688         for (i = 0; i < kvm->nmemslots; i++) {
689                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
690                 unsigned long start = memslot->userspace_addr;
691                 unsigned long end;
692
693                 /* mmu_lock protects userspace_addr */
694                 if (!start)
695                         continue;
696
697                 end = start + (memslot->npages << PAGE_SHIFT);
698                 if (hva >= start && hva < end) {
699                         gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
700                         retval |= handler(kvm, &memslot->rmap[gfn_offset]);
701                         retval |= handler(kvm,
702                                           &memslot->lpage_info[
703                                                   gfn_offset /
704                                                   KVM_PAGES_PER_HPAGE].rmap_pde);
705                 }
706         }
707
708         return retval;
709 }
710
711 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
712 {
713         return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
714 }
715
716 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
717 {
718         u64 *spte;
719         int young = 0;
720
721         /* always return old for EPT */
722         if (!shadow_accessed_mask)
723                 return 0;
724
725         spte = rmap_next(kvm, rmapp, NULL);
726         while (spte) {
727                 int _young;
728                 u64 _spte = *spte;
729                 BUG_ON(!(_spte & PT_PRESENT_MASK));
730                 _young = _spte & PT_ACCESSED_MASK;
731                 if (_young) {
732                         young = 1;
733                         clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
734                 }
735                 spte = rmap_next(kvm, rmapp, spte);
736         }
737         return young;
738 }
739
740 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
741 {
742         return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
743 }
744
745 #ifdef MMU_DEBUG
746 static int is_empty_shadow_page(u64 *spt)
747 {
748         u64 *pos;
749         u64 *end;
750
751         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
752                 if (is_shadow_present_pte(*pos)) {
753                         printk(KERN_ERR "%s: %p %llx\n", __func__,
754                                pos, *pos);
755                         return 0;
756                 }
757         return 1;
758 }
759 #endif
760
761 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
762 {
763         ASSERT(is_empty_shadow_page(sp->spt));
764         list_del(&sp->link);
765         __free_page(virt_to_page(sp->spt));
766         __free_page(virt_to_page(sp->gfns));
767         kfree(sp);
768         ++kvm->arch.n_free_mmu_pages;
769 }
770
771 static unsigned kvm_page_table_hashfn(gfn_t gfn)
772 {
773         return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
774 }
775
776 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
777                                                u64 *parent_pte)
778 {
779         struct kvm_mmu_page *sp;
780
781         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
782         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
783         sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
784         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
785         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
786         ASSERT(is_empty_shadow_page(sp->spt));
787         sp->slot_bitmap = 0;
788         sp->multimapped = 0;
789         sp->parent_pte = parent_pte;
790         --vcpu->kvm->arch.n_free_mmu_pages;
791         return sp;
792 }
793
794 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
795                                     struct kvm_mmu_page *sp, u64 *parent_pte)
796 {
797         struct kvm_pte_chain *pte_chain;
798         struct hlist_node *node;
799         int i;
800
801         if (!parent_pte)
802                 return;
803         if (!sp->multimapped) {
804                 u64 *old = sp->parent_pte;
805
806                 if (!old) {
807                         sp->parent_pte = parent_pte;
808                         return;
809                 }
810                 sp->multimapped = 1;
811                 pte_chain = mmu_alloc_pte_chain(vcpu);
812                 INIT_HLIST_HEAD(&sp->parent_ptes);
813                 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
814                 pte_chain->parent_ptes[0] = old;
815         }
816         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
817                 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
818                         continue;
819                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
820                         if (!pte_chain->parent_ptes[i]) {
821                                 pte_chain->parent_ptes[i] = parent_pte;
822                                 return;
823                         }
824         }
825         pte_chain = mmu_alloc_pte_chain(vcpu);
826         BUG_ON(!pte_chain);
827         hlist_add_head(&pte_chain->link, &sp->parent_ptes);
828         pte_chain->parent_ptes[0] = parent_pte;
829 }
830
831 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
832                                        u64 *parent_pte)
833 {
834         struct kvm_pte_chain *pte_chain;
835         struct hlist_node *node;
836         int i;
837
838         if (!sp->multimapped) {
839                 BUG_ON(sp->parent_pte != parent_pte);
840                 sp->parent_pte = NULL;
841                 return;
842         }
843         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
844                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
845                         if (!pte_chain->parent_ptes[i])
846                                 break;
847                         if (pte_chain->parent_ptes[i] != parent_pte)
848                                 continue;
849                         while (i + 1 < NR_PTE_CHAIN_ENTRIES
850                                 && pte_chain->parent_ptes[i + 1]) {
851                                 pte_chain->parent_ptes[i]
852                                         = pte_chain->parent_ptes[i + 1];
853                                 ++i;
854                         }
855                         pte_chain->parent_ptes[i] = NULL;
856                         if (i == 0) {
857                                 hlist_del(&pte_chain->link);
858                                 mmu_free_pte_chain(pte_chain);
859                                 if (hlist_empty(&sp->parent_ptes)) {
860                                         sp->multimapped = 0;
861                                         sp->parent_pte = NULL;
862                                 }
863                         }
864                         return;
865                 }
866         BUG();
867 }
868
869
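/*
 * Invoke @fn on every shadow page that contains a parent_pte of @sp,
 * recursing upwards until the root pages are reached.
 */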
870 static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
871                             mmu_parent_walk_fn fn)
872 {
873         struct kvm_pte_chain *pte_chain;
874         struct hlist_node *node;
875         struct kvm_mmu_page *parent_sp;
876         int i;
877
878         if (!sp->multimapped && sp->parent_pte) {
879                 parent_sp = page_header(__pa(sp->parent_pte));
880                 fn(vcpu, parent_sp);
881                 mmu_parent_walk(vcpu, parent_sp, fn);
882                 return;
883         }
884         hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
885                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
886                         if (!pte_chain->parent_ptes[i])
887                                 break;
888                         parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
889                         fn(vcpu, parent_sp);
890                         mmu_parent_walk(vcpu, parent_sp, fn);
891                 }
892 }
893
894 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
895                                     struct kvm_mmu_page *sp)
896 {
897         int i;
898
899         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
900                 sp->spt[i] = shadow_trap_nonpresent_pte;
901 }
902
903 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
904                                struct kvm_mmu_page *sp)
905 {
906         return 1;
907 }
908
909 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
910 {
911 }
912
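/*
 * Depth-first walk over @sp's children: recurse into children that have
 * unsync children of their own and call walker->entry() on every child
 * marked unsync.  A nonzero return from the walker aborts the walk and
 * is propagated; after a complete pass sp->unsync_children is cleared.
 */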
913 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
914                            struct kvm_unsync_walk *walker)
915 {
916         int i, ret;
917
918         if (!sp->unsync_children)
919                 return 0;
920
921         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
922                 u64 ent = sp->spt[i];
923
924                 if (is_shadow_present_pte(ent)) {
925                         struct kvm_mmu_page *child;
926                         child = page_header(ent & PT64_BASE_ADDR_MASK);
927
928                         if (child->unsync_children) {
929                                 ret = mmu_unsync_walk(child, walker);
930                                 if (ret)
931                                         return ret;
932                         }
933
934                         if (child->unsync) {
935                                 ret = walker->entry(child, walker);
936                                 if (ret)
937                                         return ret;
938                         }
939                 }
940         }
941
942         if (i == PT64_ENT_PER_PAGE)
943                 sp->unsync_children = 0;
944
945         return 0;
946 }
947
948 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
949 {
950         unsigned index;
951         struct hlist_head *bucket;
952         struct kvm_mmu_page *sp;
953         struct hlist_node *node;
954
955         pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
956         index = kvm_page_table_hashfn(gfn);
957         bucket = &kvm->arch.mmu_page_hash[index];
958         hlist_for_each_entry(sp, node, bucket, hash_link)
959                 if (sp->gfn == gfn && !sp->role.metaphysical
960                     && !sp->role.invalid) {
961                         pgprintk("%s: found role %x\n",
962                                  __func__, sp->role.word);
963                         return sp;
964                 }
965         return NULL;
966 }
967
968 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
969 {
970         WARN_ON(!sp->unsync);
971         sp->unsync = 0;
972         --kvm->stat.mmu_unsync;
973 }
974
975 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
976
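/*
 * Bring an unsync shadow page back in sync with the guest page table it
 * shadows: the gfn is write-protected again and the guest entries are
 * pulled in via ->sync_page().  The page is zapped instead (returning 1)
 * if the guest paging level changed or if ->sync_page() reports that it
 * can no longer be synced; on success the TLB is flushed, the unsync
 * flag is cleared and 0 is returned.
 */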
977 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
978 {
979         if (sp->role.glevels != vcpu->arch.mmu.root_level) {
980                 kvm_mmu_zap_page(vcpu->kvm, sp);
981                 return 1;
982         }
983
984         rmap_write_protect(vcpu->kvm, sp->gfn);
985         if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
986                 kvm_mmu_zap_page(vcpu->kvm, sp);
987                 return 1;
988         }
989
990         kvm_mmu_flush_tlb(vcpu);
991         kvm_unlink_unsync_page(vcpu->kvm, sp);
992         return 0;
993 }
994
995 struct sync_walker {
996         struct kvm_vcpu *vcpu;
997         struct kvm_unsync_walk walker;
998 };
999
1000 static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1001 {
1002         struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
1003                                                      walker);
1004         struct kvm_vcpu *vcpu = sync_walk->vcpu;
1005
1006         kvm_sync_page(vcpu, sp);
1007         return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
1008 }
1009
1010 static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1011 {
1012         struct sync_walker walker = {
1013                 .walker = { .entry = mmu_sync_fn, },
1014                 .vcpu = vcpu,
1015         };
1016
1017         while (mmu_unsync_walk(sp, &walker.walker))
1018                 cond_resched_lock(&vcpu->kvm->mmu_lock);
1019 }
1020
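/*
 * Look up a shadow page for (@gfn, role) in the hash table, or allocate
 * and initialize a new one on a miss.  Unsync pages for the same gfn are
 * synced while scanning the bucket, a hit on a page with unsync children
 * queues a KVM_REQ_MMU_SYNC request, and a new non-metaphysical page
 * write-protects the gfn and is accounted as shadowed.
 */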
1021 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1022                                              gfn_t gfn,
1023                                              gva_t gaddr,
1024                                              unsigned level,
1025                                              int metaphysical,
1026                                              unsigned access,
1027                                              u64 *parent_pte)
1028 {
1029         union kvm_mmu_page_role role;
1030         unsigned index;
1031         unsigned quadrant;
1032         struct hlist_head *bucket;
1033         struct kvm_mmu_page *sp;
1034         struct hlist_node *node, *tmp;
1035
1036         role.word = 0;
1037         role.glevels = vcpu->arch.mmu.root_level;
1038         role.level = level;
1039         role.metaphysical = metaphysical;
1040         role.access = access;
1041         if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1042                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1043                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1044                 role.quadrant = quadrant;
1045         }
1046         pgprintk("%s: looking gfn %lx role %x\n", __func__,
1047                  gfn, role.word);
1048         index = kvm_page_table_hashfn(gfn);
1049         bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1050         hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
1051                 if (sp->gfn == gfn) {
1052                         if (sp->unsync)
1053                                 if (kvm_sync_page(vcpu, sp))
1054                                         continue;
1055
1056                         if (sp->role.word != role.word)
1057                                 continue;
1058
1059                         if (sp->unsync_children)
1060                                 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1061
1062                         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1063                         pgprintk("%s: found\n", __func__);
1064                         return sp;
1065                 }
1066         ++vcpu->kvm->stat.mmu_cache_miss;
1067         sp = kvm_mmu_alloc_page(vcpu, parent_pte);
1068         if (!sp)
1069                 return sp;
1070         pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1071         sp->gfn = gfn;
1072         sp->role = role;
1073         hlist_add_head(&sp->hash_link, bucket);
1074         if (!metaphysical) {
1075                 rmap_write_protect(vcpu->kvm, gfn);
1076                 account_shadowed(vcpu->kvm, gfn);
1077         }
1078         if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1079                 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1080         else
1081                 nonpaging_prefetch_page(vcpu, sp);
1082         return sp;
1083 }
1084
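/*
 * Generic walker over the shadow page table rooted at mmu.root_hpa
 * (handling the PAE quadrant roots), calling walker->entry() with the
 * spte for each level until the walker returns nonzero or the lowest
 * level has been visited.
 */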
1085 static int walk_shadow(struct kvm_shadow_walk *walker,
1086                        struct kvm_vcpu *vcpu, u64 addr)
1087 {
1088         hpa_t shadow_addr;
1089         int level;
1090         int r;
1091         u64 *sptep;
1092         unsigned index;
1093
1094         shadow_addr = vcpu->arch.mmu.root_hpa;
1095         level = vcpu->arch.mmu.shadow_root_level;
1096         if (level == PT32E_ROOT_LEVEL) {
1097                 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1098                 shadow_addr &= PT64_BASE_ADDR_MASK;
1099                 --level;
1100         }
1101
1102         while (level >= PT_PAGE_TABLE_LEVEL) {
1103                 index = SHADOW_PT_INDEX(addr, level);
1104                 sptep = ((u64 *)__va(shadow_addr)) + index;
1105                 r = walker->entry(walker, vcpu, addr, sptep, level);
1106                 if (r)
1107                         return r;
1108                 shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
1109                 --level;
1110         }
1111         return 0;
1112 }
1113
1114 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1115                                          struct kvm_mmu_page *sp)
1116 {
1117         unsigned i;
1118         u64 *pt;
1119         u64 ent;
1120
1121         pt = sp->spt;
1122
1123         if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1124                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1125                         if (is_shadow_present_pte(pt[i]))
1126                                 rmap_remove(kvm, &pt[i]);
1127                         pt[i] = shadow_trap_nonpresent_pte;
1128                 }
1129                 return;
1130         }
1131
1132         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1133                 ent = pt[i];
1134
1135                 if (is_shadow_present_pte(ent)) {
1136                         if (!is_large_pte(ent)) {
1137                                 ent &= PT64_BASE_ADDR_MASK;
1138                                 mmu_page_remove_parent_pte(page_header(ent),
1139                                                            &pt[i]);
1140                         } else {
1141                                 --kvm->stat.lpages;
1142                                 rmap_remove(kvm, &pt[i]);
1143                         }
1144                 }
1145                 pt[i] = shadow_trap_nonpresent_pte;
1146         }
1147 }
1148
1149 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1150 {
1151         mmu_page_remove_parent_pte(sp, parent_pte);
1152 }
1153
1154 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1155 {
1156         int i;
1157
1158         for (i = 0; i < KVM_MAX_VCPUS; ++i)
1159                 if (kvm->vcpus[i])
1160                         kvm->vcpus[i]->arch.last_pte_updated = NULL;
1161 }
1162
1163 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1164 {
1165         u64 *parent_pte;
1166
1167         while (sp->multimapped || sp->parent_pte) {
1168                 if (!sp->multimapped)
1169                         parent_pte = sp->parent_pte;
1170                 else {
1171                         struct kvm_pte_chain *chain;
1172
1173                         chain = container_of(sp->parent_ptes.first,
1174                                              struct kvm_pte_chain, link);
1175                         parent_pte = chain->parent_ptes[0];
1176                 }
1177                 BUG_ON(!parent_pte);
1178                 kvm_mmu_put_page(sp, parent_pte);
1179                 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
1180         }
1181 }
1182
1183 struct zap_walker {
1184         struct kvm_unsync_walk walker;
1185         struct kvm *kvm;
1186         int zapped;
1187 };
1188
1189 static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1190 {
1191         struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
1192                                                      walker);
1193         kvm_mmu_zap_page(zap_walk->kvm, sp);
1194         zap_walk->zapped = 1;
1195         return 0;
1196 }
1197
1198 static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
1199 {
1200         struct zap_walker walker = {
1201                 .walker = { .entry = mmu_zap_fn, },
1202                 .kvm = kvm,
1203                 .zapped = 0,
1204         };
1205
1206         if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1207                 return 0;
1208         mmu_unsync_walk(sp, &walker.walker);
1209         return walker.zapped;
1210 }
1211
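/*
 * Tear down a shadow page: zap its unsync children, unlink it from its
 * children and parents, and either free it or, if it is still in use as
 * a root, mark it invalid and make the vcpus reload their mmu context.
 * Returns nonzero if additional pages were zapped, so callers iterating
 * a hash bucket know to restart their scan.
 */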
1212 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1213 {
1214         int ret;
1215         ++kvm->stat.mmu_shadow_zapped;
1216         ret = mmu_zap_unsync_children(kvm, sp);
1217         kvm_mmu_page_unlink_children(kvm, sp);
1218         kvm_mmu_unlink_parents(kvm, sp);
1219         kvm_flush_remote_tlbs(kvm);
1220         if (!sp->role.invalid && !sp->role.metaphysical)
1221                 unaccount_shadowed(kvm, sp->gfn);
1222         if (sp->unsync)
1223                 kvm_unlink_unsync_page(kvm, sp);
1224         if (!sp->root_count) {
1225                 hlist_del(&sp->hash_link);
1226                 kvm_mmu_free_page(kvm, sp);
1227         } else {
1228                 sp->role.invalid = 1;
1229                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1230                 kvm_reload_remote_mmus(kvm);
1231         }
1232         kvm_mmu_reset_last_pte_updated(kvm);
1233         return ret;
1234 }
1235
1236 /*
1237  * Changing the number of mmu pages allocated to the vm
1238  * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
1239  */
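/*
 * For example, shrinking a guest from 1024 allocated pages with 300 free
 * (724 in use) down to 512 zaps shadow pages from the tail of
 * active_mmu_pages until only 512 remain in use, and leaves
 * n_free_mmu_pages at zero.
 */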
1240 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1241 {
1242         /*
1243          * If we set the number of mmu pages to be smaller than the
1244          * number of active pages, we must free some mmu pages before we
1245          * change the value.
1246          */
1247
1248         if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
1249             kvm_nr_mmu_pages) {
1250                 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
1251                                        - kvm->arch.n_free_mmu_pages;
1252
1253                 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
1254                         struct kvm_mmu_page *page;
1255
1256                         page = container_of(kvm->arch.active_mmu_pages.prev,
1257                                             struct kvm_mmu_page, link);
1258                         kvm_mmu_zap_page(kvm, page);
1259                         n_used_mmu_pages--;
1260                 }
1261                 kvm->arch.n_free_mmu_pages = 0;
1262         } else
1264                 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1265                                          - kvm->arch.n_alloc_mmu_pages;
1266
1267         kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1268 }
1269
1270 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1271 {
1272         unsigned index;
1273         struct hlist_head *bucket;
1274         struct kvm_mmu_page *sp;
1275         struct hlist_node *node, *n;
1276         int r;
1277
1278         pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1279         r = 0;
1280         index = kvm_page_table_hashfn(gfn);
1281         bucket = &kvm->arch.mmu_page_hash[index];
1282         hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1283                 if (sp->gfn == gfn && !sp->role.metaphysical) {
1284                         pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1285                                  sp->role.word);
1286                         r = 1;
1287                         if (kvm_mmu_zap_page(kvm, sp))
1288                                 n = bucket->first;
1289                 }
1290         return r;
1291 }
1292
1293 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1294 {
1295         struct kvm_mmu_page *sp;
1296
1297         while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
1298                 pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
1299                 kvm_mmu_zap_page(kvm, sp);
1300         }
1301 }
1302
1303 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1304 {
1305         int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1306         struct kvm_mmu_page *sp = page_header(__pa(pte));
1307
1308         __set_bit(slot, &sp->slot_bitmap);
1309 }
1310
1311 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1312 {
1313         int i;
1314         u64 *pt = sp->spt;
1315
1316         if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1317                 return;
1318
1319         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1320                 if (pt[i] == shadow_notrap_nonpresent_pte)
1321                         set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
1322         }
1323 }
1324
1325 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1326 {
1327         struct page *page;
1328
1329         gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1330
1331         if (gpa == UNMAPPED_GVA)
1332                 return NULL;
1333
1334         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1335
1336         return page;
1337 }
1338
1339 static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1340 {
1341         sp->unsync_children = 1;
1342         return 1;
1343 }
1344
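/*
 * Mark @sp out of sync, so the guest may write to the page table it
 * shadows without taking write-protection faults.  This is refused
 * (returning 1) if the gfn is shadowed with more than one role.
 * unsync_children is propagated up to the root pages, and notrap sptes
 * are converted back to trap sptes so that accesses through the
 * now-unsynced page exit to KVM again.
 */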
1345 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1346 {
1347         unsigned index;
1348         struct hlist_head *bucket;
1349         struct kvm_mmu_page *s;
1350         struct hlist_node *node, *n;
1351
1352         index = kvm_page_table_hashfn(sp->gfn);
1353         bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1354         /* don't unsync if pagetable is shadowed with multiple roles */
1355         hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1356                 if (s->gfn != sp->gfn || s->role.metaphysical)
1357                         continue;
1358                 if (s->role.word != sp->role.word)
1359                         return 1;
1360         }
1361         mmu_parent_walk(vcpu, sp, unsync_walk_fn);
1362         ++vcpu->kvm->stat.mmu_unsync;
1363         sp->unsync = 1;
1364         mmu_convert_notrap(sp);
1365         return 0;
1366 }
1367
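/*
 * Decide whether @gfn, which is about to receive a writable spte, must
 * stay write-protected: it must if it is shadowed above the last level;
 * it need not if it is already unsync; otherwise try to mark it unsync
 * when @can_unsync allows it.  A nonzero return keeps the spte read-only.
 */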
1368 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1369                                   bool can_unsync)
1370 {
1371         struct kvm_mmu_page *shadow;
1372
1373         shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1374         if (shadow) {
1375                 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1376                         return 1;
1377                 if (shadow->unsync)
1378                         return 0;
1379                 if (can_unsync)
1380                         return kvm_unsync_page(vcpu, shadow);
1381                 return 1;
1382         }
1383         return 0;
1384 }
1385
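/*
 * Build a shadow pte from the guest pte's access rights and the host pfn
 * and install it with set_shadow_pte().  Returns 1 when the mapping had
 * to be downgraded, either dropped because a writable large page would
 * overlap a write-protected gfn, or stripped of write access by
 * mmu_need_write_protect(), so that the caller flushes the TLB and can
 * flag the access for emulation.
 */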
1386 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1387                     unsigned pte_access, int user_fault,
1388                     int write_fault, int dirty, int largepage,
1389                     gfn_t gfn, pfn_t pfn, bool speculative,
1390                     bool can_unsync)
1391 {
1392         u64 spte;
1393         int ret = 0;
1394         /*
1395          * We don't set the accessed bit, since we sometimes want to see
1396          * whether the guest actually used the pte (in order to detect
1397          * demand paging).
1398          */
1399         spte = shadow_base_present_pte | shadow_dirty_mask;
1400         if (!speculative)
1401                 spte |= shadow_accessed_mask;
1402         if (!dirty)
1403                 pte_access &= ~ACC_WRITE_MASK;
1404         if (pte_access & ACC_EXEC_MASK)
1405                 spte |= shadow_x_mask;
1406         else
1407                 spte |= shadow_nx_mask;
1408         if (pte_access & ACC_USER_MASK)
1409                 spte |= shadow_user_mask;
1410         if (largepage)
1411                 spte |= PT_PAGE_SIZE_MASK;
1412
1413         spte |= (u64)pfn << PAGE_SHIFT;
1414
1415         if ((pte_access & ACC_WRITE_MASK)
1416             || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1417
1418                 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
1419                         ret = 1;
1420                         spte = shadow_trap_nonpresent_pte;
1421                         goto set_pte;
1422                 }
1423
1424                 spte |= PT_WRITABLE_MASK;
1425
1426                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1427                         pgprintk("%s: found shadow page for %lx, marking ro\n",
1428                                  __func__, gfn);
1429                         ret = 1;
1430                         pte_access &= ~ACC_WRITE_MASK;
1431                         if (is_writeble_pte(spte))
1432                                 spte &= ~PT_WRITABLE_MASK;
1433                 }
1434         }
1435
1436         if (pte_access & ACC_WRITE_MASK)
1437                 mark_page_dirty(vcpu->kvm, gfn);
1438
1439 set_pte:
1440         set_shadow_pte(shadow_pte, spte);
1441         return ret;
1442 }
1443
1444 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1445                          unsigned pt_access, unsigned pte_access,
1446                          int user_fault, int write_fault, int dirty,
1447                          int *ptwrite, int largepage, gfn_t gfn,
1448                          pfn_t pfn, bool speculative)
1449 {
1450         int was_rmapped = 0;
1451         int was_writeble = is_writeble_pte(*shadow_pte);
1452
1453         pgprintk("%s: spte %llx access %x write_fault %d"
1454                  " user_fault %d gfn %lx\n",
1455                  __func__, *shadow_pte, pt_access,
1456                  write_fault, user_fault, gfn);
1457
1458         if (is_rmap_pte(*shadow_pte)) {
1459                 /*
1460                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1461                  * the parent of the now unreachable PTE.
1462                  */
1463                 if (largepage && !is_large_pte(*shadow_pte)) {
1464                         struct kvm_mmu_page *child;
1465                         u64 pte = *shadow_pte;
1466
1467                         child = page_header(pte & PT64_BASE_ADDR_MASK);
1468                         mmu_page_remove_parent_pte(child, shadow_pte);
1469                 } else if (pfn != spte_to_pfn(*shadow_pte)) {
1470                         pgprintk("hfn old %lx new %lx\n",
1471                                  spte_to_pfn(*shadow_pte), pfn);
1472                         rmap_remove(vcpu->kvm, shadow_pte);
1473                 } else {
1474                         if (largepage)
1475                                 was_rmapped = is_large_pte(*shadow_pte);
1476                         else
1477                                 was_rmapped = 1;
1478                 }
1479         }
1480         if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1481                       dirty, largepage, gfn, pfn, speculative, true)) {
1482                 if (write_fault)
1483                         *ptwrite = 1;
1484                 kvm_x86_ops->tlb_flush(vcpu);
1485         }
1486
1487         pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
1488         pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1489                  is_large_pte(*shadow_pte)? "2MB" : "4kB",
1490                  is_present_pte(*shadow_pte)?"RW":"R", gfn,
1491                  *shadow_pte, shadow_pte);
1492         if (!was_rmapped && is_large_pte(*shadow_pte))
1493                 ++vcpu->kvm->stat.lpages;
1494
1495         page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
1496         if (!was_rmapped) {
1497                 rmap_add(vcpu, shadow_pte, gfn, largepage);
1498                 if (!is_rmap_pte(*shadow_pte))
1499                         kvm_release_pfn_clean(pfn);
1500         } else {
1501                 if (was_writeble)
1502                         kvm_release_pfn_dirty(pfn);
1503                 else
1504                         kvm_release_pfn_clean(pfn);
1505         }
1506         if (speculative) {
1507                 vcpu->arch.last_pte_updated = shadow_pte;
1508                 vcpu->arch.last_pte_gfn = gfn;
1509         }
1510 }
1511
1512 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1513 {
1514 }
1515
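/*
 * The direct map path below maps a guest physical frame straight to its
 * host pfn, without consulting any guest page table; it is used when the
 * guest runs unpaged and, with tdp enabled, for guest-physical mappings.
 * walk_shadow() drives the walk, and missing intermediate levels are
 * filled in with metaphysical shadow pages.
 */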
1516 struct direct_shadow_walk {
1517         struct kvm_shadow_walk walker;
1518         pfn_t pfn;
1519         int write;
1520         int largepage;
1521         int pt_write;
1522 };
1523
1524 static int direct_map_entry(struct kvm_shadow_walk *_walk,
1525                             struct kvm_vcpu *vcpu,
1526                             u64 addr, u64 *sptep, int level)
1527 {
1528         struct direct_shadow_walk *walk =
1529                 container_of(_walk, struct direct_shadow_walk, walker);
1530         struct kvm_mmu_page *sp;
1531         gfn_t pseudo_gfn;
1532         gfn_t gfn = addr >> PAGE_SHIFT;
1533
1534         if (level == PT_PAGE_TABLE_LEVEL
1535             || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1536                 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1537                              0, walk->write, 1, &walk->pt_write,
1538                              walk->largepage, gfn, walk->pfn, false);
1539                 ++vcpu->stat.pf_fixed;
1540                 return 1;
1541         }
1542
1543         if (*sptep == shadow_trap_nonpresent_pte) {
1544                 pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
1545                 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
1546                                       1, ACC_ALL, sptep);
1547                 if (!sp) {
1548                         pgprintk("nonpaging_map: ENOMEM\n");
1549                         kvm_release_pfn_clean(walk->pfn);
1550                         return -ENOMEM;
1551                 }
1552
1553                 set_shadow_pte(sptep,
1554                                __pa(sp->spt)
1555                                | PT_PRESENT_MASK | PT_WRITABLE_MASK
1556                                | shadow_user_mask | shadow_x_mask);
1557         }
1558         return 0;
1559 }
1560
1561 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1562                         int largepage, gfn_t gfn, pfn_t pfn)
1563 {
1564         int r;
1565         struct direct_shadow_walk walker = {
1566                 .walker = { .entry = direct_map_entry, },
1567                 .pfn = pfn,
1568                 .largepage = largepage,
1569                 .write = write,
1570                 .pt_write = 0,
1571         };
1572
1573         r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
1574         if (r < 0)
1575                 return r;
1576         return walker.pt_write;
1577 }
1578
1579 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1580 {
1581         int r;
1582         int largepage = 0;
1583         pfn_t pfn;
1584         unsigned long mmu_seq;
1585
1586         if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1587                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1588                 largepage = 1;
1589         }
1590
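        /*
         * Sample the mmu_notifier sequence count and look up the pfn before
         * taking mmu_lock (gfn_to_pfn may sleep); mmu_notifier_retry() below
         * catches any invalidation that raced with us, in which case the
         * fault is dropped and simply retried by the guest.
         */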
1591         mmu_seq = vcpu->kvm->mmu_notifier_seq;
1592         smp_rmb();
1593         pfn = gfn_to_pfn(vcpu->kvm, gfn);
1594
1595         /* mmio */
1596         if (is_error_pfn(pfn)) {
1597                 kvm_release_pfn_clean(pfn);
1598                 return 1;
1599         }
1600
1601         spin_lock(&vcpu->kvm->mmu_lock);
1602         if (mmu_notifier_retry(vcpu, mmu_seq))
1603                 goto out_unlock;
1604         kvm_mmu_free_some_pages(vcpu);
1605         r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
1606         spin_unlock(&vcpu->kvm->mmu_lock);
1607
1608
1609         return r;
1610
1611 out_unlock:
1612         spin_unlock(&vcpu->kvm->mmu_lock);
1613         kvm_release_pfn_clean(pfn);
1614         return 0;
1615 }
1616
1617
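/*
 * Drop the reference this vcpu holds on its shadow root(s).  With a single
 * 64-bit root the page header hangs off root_hpa; in PAE mode there are four
 * roots, one per pdptr.  A root page is only zapped here if it was already
 * marked invalid and this was its last reference.
 */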
1618 static void mmu_free_roots(struct kvm_vcpu *vcpu)
1619 {
1620         int i;
1621         struct kvm_mmu_page *sp;
1622
1623         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1624                 return;
1625         spin_lock(&vcpu->kvm->mmu_lock);
1626         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1627                 hpa_t root = vcpu->arch.mmu.root_hpa;
1628
1629                 sp = page_header(root);
1630                 --sp->root_count;
1631                 if (!sp->root_count && sp->role.invalid)
1632                         kvm_mmu_zap_page(vcpu->kvm, sp);
1633                 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1634                 spin_unlock(&vcpu->kvm->mmu_lock);
1635                 return;
1636         }
1637         for (i = 0; i < 4; ++i) {
1638                 hpa_t root = vcpu->arch.mmu.pae_root[i];
1639
1640                 if (root) {
1641                         root &= PT64_BASE_ADDR_MASK;
1642                         sp = page_header(root);
1643                         --sp->root_count;
1644                         if (!sp->root_count && sp->role.invalid)
1645                                 kvm_mmu_zap_page(vcpu->kvm, sp);
1646                 }
1647                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1648         }
1649         spin_unlock(&vcpu->kvm->mmu_lock);
1650         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1651 }
1652
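/*
 * Allocate root pages for the current mode: a single 4-level root when
 * shadow_root_level is PT64_ROOT_LEVEL, otherwise four PAE roots, each
 * covering 1GB of address space (hence the i << 30 gaddr).  Roots are
 * "metaphysical" (not backed by a guest page table) when TDP is enabled or
 * the guest is not paging.
 */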
1653 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1654 {
1655         int i;
1656         gfn_t root_gfn;
1657         struct kvm_mmu_page *sp;
1658         int metaphysical = 0;
1659
1660         root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1661
1662         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1663                 hpa_t root = vcpu->arch.mmu.root_hpa;
1664
1665                 ASSERT(!VALID_PAGE(root));
1666                 if (tdp_enabled)
1667                         metaphysical = 1;
1668                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1669                                       PT64_ROOT_LEVEL, metaphysical,
1670                                       ACC_ALL, NULL);
1671                 root = __pa(sp->spt);
1672                 ++sp->root_count;
1673                 vcpu->arch.mmu.root_hpa = root;
1674                 return;
1675         }
1676         metaphysical = !is_paging(vcpu);
1677         if (tdp_enabled)
1678                 metaphysical = 1;
1679         for (i = 0; i < 4; ++i) {
1680                 hpa_t root = vcpu->arch.mmu.pae_root[i];
1681
1682                 ASSERT(!VALID_PAGE(root));
1683                 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1684                         if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1685                                 vcpu->arch.mmu.pae_root[i] = 0;
1686                                 continue;
1687                         }
1688                         root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1689                 } else if (vcpu->arch.mmu.root_level == 0)
1690                         root_gfn = 0;
1691                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1692                                       PT32_ROOT_LEVEL, metaphysical,
1693                                       ACC_ALL, NULL);
1694                 root = __pa(sp->spt);
1695                 ++sp->root_count;
1696                 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1697         }
1698         vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1699 }
1700
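/*
 * Walk the active root(s) and resynchronize any out-of-sync child shadow
 * pages with the guest page tables they shadow.  Caller must hold mmu_lock;
 * kvm_mmu_sync_roots() below is the locked wrapper.
 */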
1701 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1702 {
1703         int i;
1704         struct kvm_mmu_page *sp;
1705
1706         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1707                 return;
1708         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1709                 hpa_t root = vcpu->arch.mmu.root_hpa;
1710                 sp = page_header(root);
1711                 mmu_sync_children(vcpu, sp);
1712                 return;
1713         }
1714         for (i = 0; i < 4; ++i) {
1715                 hpa_t root = vcpu->arch.mmu.pae_root[i];
1716
1717                 if (root) {
1718                         root &= PT64_BASE_ADDR_MASK;
1719                         sp = page_header(root);
1720                         mmu_sync_children(vcpu, sp);
1721                 }
1722         }
1723 }
1724
1725 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1726 {
1727         spin_lock(&vcpu->kvm->mmu_lock);
1728         mmu_sync_roots(vcpu);
1729         spin_unlock(&vcpu->kvm->mmu_lock);
1730 }
1731
1732 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1733 {
1734         return vaddr;
1735 }
1736
1737 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1738                                 u32 error_code)
1739 {
1740         gfn_t gfn;
1741         int r;
1742
1743         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
1744         r = mmu_topup_memory_caches(vcpu);
1745         if (r)
1746                 return r;
1747
1748         ASSERT(vcpu);
1749         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1750
1751         gfn = gva >> PAGE_SHIFT;
1752
1753         return nonpaging_map(vcpu, gva & PAGE_MASK,
1754                              error_code & PFERR_WRITE_MASK, gfn);
1755 }
1756
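/*
 * TDP fault handler: the hardware already walked the guest page tables, so
 * the fault address is a guest physical address and can be mapped straight
 * to a pfn via __direct_map().
 */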
1757 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1758                                 u32 error_code)
1759 {
1760         pfn_t pfn;
1761         int r;
1762         int largepage = 0;
1763         gfn_t gfn = gpa >> PAGE_SHIFT;
1764         unsigned long mmu_seq;
1765
1766         ASSERT(vcpu);
1767         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1768
1769         r = mmu_topup_memory_caches(vcpu);
1770         if (r)
1771                 return r;
1772
1773         if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1774                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1775                 largepage = 1;
1776         }
1777         mmu_seq = vcpu->kvm->mmu_notifier_seq;
1778         smp_rmb();
1779         pfn = gfn_to_pfn(vcpu->kvm, gfn);
1780         if (is_error_pfn(pfn)) {
1781                 kvm_release_pfn_clean(pfn);
1782                 return 1;
1783         }
1784         spin_lock(&vcpu->kvm->mmu_lock);
1785         if (mmu_notifier_retry(vcpu, mmu_seq))
1786                 goto out_unlock;
1787         kvm_mmu_free_some_pages(vcpu);
1788         r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1789                          largepage, gfn, pfn);
1790         spin_unlock(&vcpu->kvm->mmu_lock);
1791
1792         return r;
1793
1794 out_unlock:
1795         spin_unlock(&vcpu->kvm->mmu_lock);
1796         kvm_release_pfn_clean(pfn);
1797         return 0;
1798 }
1799
1800 static void nonpaging_free(struct kvm_vcpu *vcpu)
1801 {
1802         mmu_free_roots(vcpu);
1803 }
1804
1805 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1806 {
1807         struct kvm_mmu *context = &vcpu->arch.mmu;
1808
1809         context->new_cr3 = nonpaging_new_cr3;
1810         context->page_fault = nonpaging_page_fault;
1811         context->gva_to_gpa = nonpaging_gva_to_gpa;
1812         context->free = nonpaging_free;
1813         context->prefetch_page = nonpaging_prefetch_page;
1814         context->sync_page = nonpaging_sync_page;
1815         context->invlpg = nonpaging_invlpg;
1816         context->root_level = 0;
1817         context->shadow_root_level = PT32E_ROOT_LEVEL;
1818         context->root_hpa = INVALID_PAGE;
1819         return 0;
1820 }
1821
1822 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1823 {
1824         ++vcpu->stat.tlb_flush;
1825         kvm_x86_ops->tlb_flush(vcpu);
1826 }
1827
1828 static void paging_new_cr3(struct kvm_vcpu *vcpu)
1829 {
1830         pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
1831         mmu_free_roots(vcpu);
1832 }
1833
1834 static void inject_page_fault(struct kvm_vcpu *vcpu,
1835                               u64 addr,
1836                               u32 err_code)
1837 {
1838         kvm_inject_page_fault(vcpu, addr, err_code);
1839 }
1840
1841 static void paging_free(struct kvm_vcpu *vcpu)
1842 {
1843         nonpaging_free(vcpu);
1844 }
1845
1846 #define PTTYPE 64
1847 #include "paging_tmpl.h"
1848 #undef PTTYPE
1849
1850 #define PTTYPE 32
1851 #include "paging_tmpl.h"
1852 #undef PTTYPE
1853
1854 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1855 {
1856         struct kvm_mmu *context = &vcpu->arch.mmu;
1857
1858         ASSERT(is_pae(vcpu));
1859         context->new_cr3 = paging_new_cr3;
1860         context->page_fault = paging64_page_fault;
1861         context->gva_to_gpa = paging64_gva_to_gpa;
1862         context->prefetch_page = paging64_prefetch_page;
1863         context->sync_page = paging64_sync_page;
1864         context->invlpg = paging64_invlpg;
1865         context->free = paging_free;
1866         context->root_level = level;
1867         context->shadow_root_level = level;
1868         context->root_hpa = INVALID_PAGE;
1869         return 0;
1870 }
1871
1872 static int paging64_init_context(struct kvm_vcpu *vcpu)
1873 {
1874         return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1875 }
1876
1877 static int paging32_init_context(struct kvm_vcpu *vcpu)
1878 {
1879         struct kvm_mmu *context = &vcpu->arch.mmu;
1880
1881         context->new_cr3 = paging_new_cr3;
1882         context->page_fault = paging32_page_fault;
1883         context->gva_to_gpa = paging32_gva_to_gpa;
1884         context->free = paging_free;
1885         context->prefetch_page = paging32_prefetch_page;
1886         context->sync_page = paging32_sync_page;
1887         context->invlpg = paging32_invlpg;
1888         context->root_level = PT32_ROOT_LEVEL;
1889         context->shadow_root_level = PT32E_ROOT_LEVEL;
1890         context->root_hpa = INVALID_PAGE;
1891         return 0;
1892 }
1893
1894 static int paging32E_init_context(struct kvm_vcpu *vcpu)
1895 {
1896         return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1897 }
1898
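/*
 * With TDP the hardware handles guest translations, but emulation still needs
 * a software gva->gpa walker, so pick the one matching the guest's current
 * paging mode (none, 64-bit, PAE or 32-bit).
 */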
1899 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1900 {
1901         struct kvm_mmu *context = &vcpu->arch.mmu;
1902
1903         context->new_cr3 = nonpaging_new_cr3;
1904         context->page_fault = tdp_page_fault;
1905         context->free = nonpaging_free;
1906         context->prefetch_page = nonpaging_prefetch_page;
1907         context->sync_page = nonpaging_sync_page;
1908         context->invlpg = nonpaging_invlpg;
1909         context->shadow_root_level = kvm_x86_ops->get_tdp_level();
1910         context->root_hpa = INVALID_PAGE;
1911
1912         if (!is_paging(vcpu)) {
1913                 context->gva_to_gpa = nonpaging_gva_to_gpa;
1914                 context->root_level = 0;
1915         } else if (is_long_mode(vcpu)) {
1916                 context->gva_to_gpa = paging64_gva_to_gpa;
1917                 context->root_level = PT64_ROOT_LEVEL;
1918         } else if (is_pae(vcpu)) {
1919                 context->gva_to_gpa = paging64_gva_to_gpa;
1920                 context->root_level = PT32E_ROOT_LEVEL;
1921         } else {
1922                 context->gva_to_gpa = paging32_gva_to_gpa;
1923                 context->root_level = PT32_ROOT_LEVEL;
1924         }
1925
1926         return 0;
1927 }
1928
1929 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
1930 {
1931         ASSERT(vcpu);
1932         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1933
1934         if (!is_paging(vcpu))
1935                 return nonpaging_init_context(vcpu);
1936         else if (is_long_mode(vcpu))
1937                 return paging64_init_context(vcpu);
1938         else if (is_pae(vcpu))
1939                 return paging32E_init_context(vcpu);
1940         else
1941                 return paging32_init_context(vcpu);
1942 }
1943
1944 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1945 {
1946         vcpu->arch.update_pte.pfn = bad_pfn;
1947
1948         if (tdp_enabled)
1949                 return init_kvm_tdp_mmu(vcpu);
1950         else
1951                 return init_kvm_softmmu(vcpu);
1952 }
1953
1954 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1955 {
1956         ASSERT(vcpu);
1957         if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1958                 vcpu->arch.mmu.free(vcpu);
1959                 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1960         }
1961 }
1962
1963 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1964 {
1965         destroy_kvm_mmu(vcpu);
1966         return init_kvm_mmu(vcpu);
1967 }
1968 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1969
1970 int kvm_mmu_load(struct kvm_vcpu *vcpu)
1971 {
1972         int r;
1973
1974         r = mmu_topup_memory_caches(vcpu);
1975         if (r)
1976                 goto out;
1977         spin_lock(&vcpu->kvm->mmu_lock);
1978         kvm_mmu_free_some_pages(vcpu);
1979         mmu_alloc_roots(vcpu);
1980         mmu_sync_roots(vcpu);
1981         spin_unlock(&vcpu->kvm->mmu_lock);
1982         kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1983         kvm_mmu_flush_tlb(vcpu);
1984 out:
1985         return r;
1986 }
1987 EXPORT_SYMBOL_GPL(kvm_mmu_load);
1988
1989 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1990 {
1991         mmu_free_roots(vcpu);
1992 }
1993
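/*
 * Remove the spte that shadows a guest pte which is being overwritten: leaf
 * (or large) sptes are dropped from the rmap, non-leaf sptes just detach the
 * child shadow page from this parent.
 */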
1994 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1995                                   struct kvm_mmu_page *sp,
1996                                   u64 *spte)
1997 {
1998         u64 pte;
1999         struct kvm_mmu_page *child;
2000
2001         pte = *spte;
2002         if (is_shadow_present_pte(pte)) {
2003                 if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
2004                     is_large_pte(pte))
2005                         rmap_remove(vcpu->kvm, spte);
2006                 else {
2007                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2008                         mmu_page_remove_parent_pte(child, spte);
2009                 }
2010         }
2011         set_shadow_pte(spte, shadow_trap_nonpresent_pte);
2012         if (is_large_pte(pte))
2013                 --vcpu->kvm->stat.lpages;
2014 }
2015
2016 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2017                                   struct kvm_mmu_page *sp,
2018                                   u64 *spte,
2019                                   const void *new)
2020 {
2021         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2022                 if (!vcpu->arch.update_pte.largepage ||
2023                     sp->role.glevels == PT32_ROOT_LEVEL) {
2024                         ++vcpu->kvm->stat.mmu_pde_zapped;
2025                         return;
2026                 }
2027         }
2028
2029         ++vcpu->kvm->stat.mmu_pte_updated;
2030         if (sp->role.glevels == PT32_ROOT_LEVEL)
2031                 paging32_update_pte(vcpu, sp, spte, new);
2032         else
2033                 paging64_update_pte(vcpu, sp, spte, new);
2034 }
2035
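/*
 * A remote TLB flush is only needed if the old spte was present and the new
 * one drops the mapping, points at a different frame, or takes a permission
 * away.  NX is inverted first so that setting NX (removing execute) also
 * counts as a permission being removed.
 */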
2036 static bool need_remote_flush(u64 old, u64 new)
2037 {
2038         if (!is_shadow_present_pte(old))
2039                 return false;
2040         if (!is_shadow_present_pte(new))
2041                 return true;
2042         if ((old ^ new) & PT64_BASE_ADDR_MASK)
2043                 return true;
2044         old ^= PT64_NX_MASK;
2045         new ^= PT64_NX_MASK;
2046         return (old & ~new & PT64_PERM_MASK) != 0;
2047 }
2048
2049 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
2050 {
2051         if (need_remote_flush(old, new))
2052                 kvm_flush_remote_tlbs(vcpu->kvm);
2053         else
2054                 kvm_mmu_flush_tlb(vcpu);
2055 }
2056
2057 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2058 {
2059         u64 *spte = vcpu->arch.last_pte_updated;
2060
2061         return !!(spte && (*spte & shadow_accessed_mask));
2062 }
2063
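/*
 * Peek at the guest pte being written so the pfn it points to can be looked
 * up before mmu_lock is taken (gfn_to_pfn may sleep).  The mmu_notifier
 * sequence count is sampled here so the prefetched mapping can be revalidated
 * under the lock before it is actually used.
 */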
2064 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2065                                           const u8 *new, int bytes)
2066 {
2067         gfn_t gfn;
2068         int r;
2069         u64 gpte = 0;
2070         pfn_t pfn;
2071
2072         vcpu->arch.update_pte.largepage = 0;
2073
2074         if (bytes != 4 && bytes != 8)
2075                 return;
2076
2077         /*
2078          * Assume that the pte write is on a page table of the same type
2079          * as the current vcpu paging mode.  This is nearly always true
2080          * (it might be false while the guest is changing modes).  Note that
2081          * it is verified later by update_pte().
2082          */
2083         if (is_pae(vcpu)) {
2084                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2085                 if ((bytes == 4) && (gpa % 4 == 0)) {
2086                         r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2087                         if (r)
2088                                 return;
2089                         memcpy((void *)&gpte + (gpa % 8), new, 4);
2090                 } else if ((bytes == 8) && (gpa % 8 == 0)) {
2091                         memcpy((void *)&gpte, new, 8);
2092                 }
2093         } else {
2094                 if ((bytes == 4) && (gpa % 4 == 0))
2095                         memcpy((void *)&gpte, new, 4);
2096         }
2097         if (!is_present_pte(gpte))
2098                 return;
2099         gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2100
2101         if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
2102                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
2103                 vcpu->arch.update_pte.largepage = 1;
2104         }
2105         vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2106         smp_rmb();
2107         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2108
2109         if (is_error_pfn(pfn)) {
2110                 kvm_release_pfn_clean(pfn);
2111                 return;
2112         }
2113         vcpu->arch.update_pte.gfn = gfn;
2114         vcpu->arch.update_pte.pfn = pfn;
2115 }
2116
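/*
 * If the page being written is itself mapped by the spte we last
 * instantiated, the access happened via emulation and never set the accessed
 * bit in hardware, so set it by hand before the spte is zapped; this keeps
 * the write-flood detector (which consults the accessed bit) and page aging
 * from treating the page table as unused.
 */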
2117 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2118 {
2119         u64 *spte = vcpu->arch.last_pte_updated;
2120
2121         if (spte
2122             && vcpu->arch.last_pte_gfn == gfn
2123             && shadow_accessed_mask
2124             && !(*spte & shadow_accessed_mask)
2125             && is_shadow_present_pte(*spte))
2126                 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2127 }
2128
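/*
 * Called when the guest writes to a page that is shadowed as a page table
 * (and is therefore write-protected).  Try to guess the new guest pte and
 * update the affected sptes in place; if the write is misaligned or the page
 * is being written too often ("flooded"), give up and zap the shadow page
 * instead.
 */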
2129 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2130                        const u8 *new, int bytes)
2131 {
2132         gfn_t gfn = gpa >> PAGE_SHIFT;
2133         struct kvm_mmu_page *sp;
2134         struct hlist_node *node, *n;
2135         struct hlist_head *bucket;
2136         unsigned index;
2137         u64 entry, gentry;
2138         u64 *spte;
2139         unsigned offset = offset_in_page(gpa);
2140         unsigned pte_size;
2141         unsigned page_offset;
2142         unsigned misaligned;
2143         unsigned quadrant;
2144         int level;
2145         int flooded = 0;
2146         int npte;
2147         int r;
2148
2149         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2150         mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
2151         spin_lock(&vcpu->kvm->mmu_lock);
2152         kvm_mmu_access_page(vcpu, gfn);
2153         kvm_mmu_free_some_pages(vcpu);
2154         ++vcpu->kvm->stat.mmu_pte_write;
2155         kvm_mmu_audit(vcpu, "pre pte write");
2156         if (gfn == vcpu->arch.last_pt_write_gfn
2157             && !last_updated_pte_accessed(vcpu)) {
2158                 ++vcpu->arch.last_pt_write_count;
2159                 if (vcpu->arch.last_pt_write_count >= 3)
2160                         flooded = 1;
2161         } else {
2162                 vcpu->arch.last_pt_write_gfn = gfn;
2163                 vcpu->arch.last_pt_write_count = 1;
2164                 vcpu->arch.last_pte_updated = NULL;
2165         }
2166         index = kvm_page_table_hashfn(gfn);
2167         bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2168         hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2169                 if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
2170                         continue;
2171                 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
2172                 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2173                 misaligned |= bytes < 4;
2174                 if (misaligned || flooded) {
2175                         /*
2176                          * Misaligned accesses are too much trouble to fix
2177                          * up; also, they usually indicate a page is not used
2178                          * as a page table.
2179                          *
2180                          * If we're seeing too many writes to a page,
2181                          * it may no longer be a page table, or we may be
2182                          * forking, in which case it is better to unmap the
2183                          * page.
2184                          */
2185                         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2186                                  gpa, bytes, sp->role.word);
2187                         if (kvm_mmu_zap_page(vcpu->kvm, sp))
2188                                 n = bucket->first;
2189                         ++vcpu->kvm->stat.mmu_flooded;
2190                         continue;
2191                 }
2192                 page_offset = offset;
2193                 level = sp->role.level;
2194                 npte = 1;
2195                 if (sp->role.glevels == PT32_ROOT_LEVEL) {
2196                         page_offset <<= 1;      /* 32->64 */
2197                         /*
2198                          * A 32-bit pde maps 4MB while the shadow pdes map
2199                          * only 2MB.  So we need to double the offset again
2200                          * and zap two pdes instead of one.
2201                          */
2202                         if (level == PT32_ROOT_LEVEL) {
2203                                 page_offset &= ~7; /* kill rounding error */
2204                                 page_offset <<= 1;
2205                                 npte = 2;
2206                         }
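                        /*
                         * After doubling, page_offset may point past this 4kB
                         * shadow page; the overflow selects which quadrant of
                         * the guest table this shadow page covers, so skip
                         * shadow pages whose role.quadrant doesn't match.
                         */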
2207                         quadrant = page_offset >> PAGE_SHIFT;
2208                         page_offset &= ~PAGE_MASK;
2209                         if (quadrant != sp->role.quadrant)
2210                                 continue;
2211                 }
2212                 spte = &sp->spt[page_offset / sizeof(*spte)];
2213                 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2214                         gentry = 0;
2215                         r = kvm_read_guest_atomic(vcpu->kvm,
2216                                                   gpa & ~(u64)(pte_size - 1),
2217                                                   &gentry, pte_size);
2218                         new = (const void *)&gentry;
2219                         if (r < 0)
2220                                 new = NULL;
2221                 }
2222                 while (npte--) {
2223                         entry = *spte;
2224                         mmu_pte_write_zap_pte(vcpu, sp, spte);
2225                         if (new)
2226                                 mmu_pte_write_new_pte(vcpu, sp, spte, new);
2227                         mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2228                         ++spte;
2229                 }
2230         }
2231         kvm_mmu_audit(vcpu, "post pte write");
2232         spin_unlock(&vcpu->kvm->mmu_lock);
2233         if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2234                 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2235                 vcpu->arch.update_pte.pfn = bad_pfn;
2236         }
2237 }
2238
2239 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2240 {
2241         gpa_t gpa;
2242         int r;
2243
2244         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
2245
2246         spin_lock(&vcpu->kvm->mmu_lock);
2247         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2248         spin_unlock(&vcpu->kvm->mmu_lock);
2249         return r;
2250 }
2251 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2252
2253 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2254 {
2255         while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
2256                 struct kvm_mmu_page *sp;
2257
2258                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2259                                   struct kvm_mmu_page, link);
2260                 kvm_mmu_zap_page(vcpu->kvm, sp);
2261                 ++vcpu->kvm->stat.mmu_recycled;
2262         }
2263 }
2264
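/*
 * Top-level page fault dispatcher.  Returns 1 if the fault was handled and
 * the guest can be resumed, 0 if userspace must complete the access (mmio),
 * and a negative errno on failure.  Faults the mmu cannot fix directly (e.g.
 * a write to a write-protected guest page table or an mmio access) are
 * forwarded to the instruction emulator.
 */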
2265 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2266 {
2267         int r;
2268         enum emulation_result er;
2269
2270         r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
2271         if (r < 0)
2272                 goto out;
2273
2274         if (!r) {
2275                 r = 1;
2276                 goto out;
2277         }
2278
2279         r = mmu_topup_memory_caches(vcpu);
2280         if (r)
2281                 goto out;
2282
2283         er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
2284
2285         switch (er) {
2286         case EMULATE_DONE:
2287                 return 1;
2288         case EMULATE_DO_MMIO:
2289                 ++vcpu->stat.mmio_exits;
2290                 return 0;
2291         case EMULATE_FAIL:
2292                 kvm_report_emulation_failure(vcpu, "pagetable");
2293                 return 1;
2294         default:
2295                 BUG();
2296         }
2297 out:
2298         return r;
2299 }
2300 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2301
2302 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2303 {
2304         spin_lock(&vcpu->kvm->mmu_lock);
2305         vcpu->arch.mmu.invlpg(vcpu, gva);
2306         spin_unlock(&vcpu->kvm->mmu_lock);
2307         kvm_mmu_flush_tlb(vcpu);
2308         ++vcpu->stat.invlpg;
2309 }
2310 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2311
2312 void kvm_enable_tdp(void)
2313 {
2314         tdp_enabled = true;
2315 }
2316 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
2317
2318 void kvm_disable_tdp(void)
2319 {
2320         tdp_enabled = false;
2321 }
2322 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2323
2324 static void free_mmu_pages(struct kvm_vcpu *vcpu)
2325 {
2326         struct kvm_mmu_page *sp;
2327
2328         while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2329                 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
2330                                   struct kvm_mmu_page, link);
2331                 kvm_mmu_zap_page(vcpu->kvm, sp);
2332                 cond_resched();
2333         }
2334         free_page((unsigned long)vcpu->arch.mmu.pae_root);
2335 }
2336
2337 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2338 {
2339         struct page *page;
2340         int i;
2341
2342         ASSERT(vcpu);
2343
2344         if (vcpu->kvm->arch.n_requested_mmu_pages)
2345                 vcpu->kvm->arch.n_free_mmu_pages =
2346                                         vcpu->kvm->arch.n_requested_mmu_pages;
2347         else
2348                 vcpu->kvm->arch.n_free_mmu_pages =
2349                                         vcpu->kvm->arch.n_alloc_mmu_pages;
2350         /*
2351          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2352          * Therefore we need to allocate shadow page tables in the first
2353          * 4GB of memory, which happens to fit the DMA32 zone.
2354          */
2355         page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2356         if (!page)
2357                 goto error_1;
2358         vcpu->arch.mmu.pae_root = page_address(page);
2359         for (i = 0; i < 4; ++i)
2360                 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2361
2362         return 0;
2363
2364 error_1:
2365         free_mmu_pages(vcpu);
2366         return -ENOMEM;
2367 }
2368
2369 int kvm_mmu_create(struct kvm_vcpu *vcpu)
2370 {
2371         ASSERT(vcpu);
2372         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2373
2374         return alloc_mmu_pages(vcpu);
2375 }
2376
2377 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
2378 {
2379         ASSERT(vcpu);
2380         ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2381
2382         return init_kvm_mmu(vcpu);
2383 }
2384
2385 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
2386 {
2387         ASSERT(vcpu);
2388
2389         destroy_kvm_mmu(vcpu);
2390         free_mmu_pages(vcpu);
2391         mmu_free_memory_caches(vcpu);
2392 }
2393
2394 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2395 {
2396         struct kvm_mmu_page *sp;
2397
2398         spin_lock(&kvm->mmu_lock);
2399         list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2400                 int i;
2401                 u64 *pt;
2402
2403                 if (!test_bit(slot, &sp->slot_bitmap))
2404                         continue;
2405
2406                 pt = sp->spt;
2407                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2408                         /* avoid RMW */
2409                         if (pt[i] & PT_WRITABLE_MASK)
2410                                 pt[i] &= ~PT_WRITABLE_MASK;
2411         }
2412         kvm_flush_remote_tlbs(kvm);
2413         spin_unlock(&kvm->mmu_lock);
2414 }
2415
2416 void kvm_mmu_zap_all(struct kvm *kvm)
2417 {
2418         struct kvm_mmu_page *sp, *node;
2419
2420         spin_lock(&kvm->mmu_lock);
2421         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2422                 if (kvm_mmu_zap_page(kvm, sp))
2423                         node = container_of(kvm->arch.active_mmu_pages.next,
2424                                             struct kvm_mmu_page, link);
2425         spin_unlock(&kvm->mmu_lock);
2426
2427         kvm_flush_remote_tlbs(kvm);
2428 }
2429
2430 static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
2431 {
2432         struct kvm_mmu_page *page;
2433
2434         page = container_of(kvm->arch.active_mmu_pages.prev,
2435                             struct kvm_mmu_page, link);
2436         kvm_mmu_zap_page(kvm, page);
2437 }
2438
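/*
 * Shrinker callback invoked under memory pressure: free one shadow page from
 * the first VM that can be locked, move that VM to the tail of vm_list so the
 * pain is spread around, and report the total number of allocated shadow
 * pages as the cache size.
 */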
2439 static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2440 {
2441         struct kvm *kvm;
2442         struct kvm *kvm_freed = NULL;
2443         int cache_count = 0;
2444
2445         spin_lock(&kvm_lock);
2446
2447         list_for_each_entry(kvm, &vm_list, vm_list) {
2448                 int npages;
2449
2450                 if (!down_read_trylock(&kvm->slots_lock))
2451                         continue;
2452                 spin_lock(&kvm->mmu_lock);
2453                 npages = kvm->arch.n_alloc_mmu_pages -
2454                          kvm->arch.n_free_mmu_pages;
2455                 cache_count += npages;
2456                 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2457                         kvm_mmu_remove_one_alloc_mmu_page(kvm);
2458                         cache_count--;
2459                         kvm_freed = kvm;
2460                 }
2461                 nr_to_scan--;
2462
2463                 spin_unlock(&kvm->mmu_lock);
2464                 up_read(&kvm->slots_lock);
2465         }
2466         if (kvm_freed)
2467                 list_move_tail(&kvm_freed->vm_list, &vm_list);
2468
2469         spin_unlock(&kvm_lock);
2470
2471         return cache_count;
2472 }
2473
2474 static struct shrinker mmu_shrinker = {
2475         .shrink = mmu_shrink,
2476         .seeks = DEFAULT_SEEKS * 10,
2477 };
2478
2479 static void mmu_destroy_caches(void)
2480 {
2481         if (pte_chain_cache)
2482                 kmem_cache_destroy(pte_chain_cache);
2483         if (rmap_desc_cache)
2484                 kmem_cache_destroy(rmap_desc_cache);
2485         if (mmu_page_header_cache)
2486                 kmem_cache_destroy(mmu_page_header_cache);
2487 }
2488
2489 void kvm_mmu_module_exit(void)
2490 {
2491         mmu_destroy_caches();
2492         unregister_shrinker(&mmu_shrinker);
2493 }
2494
2495 int kvm_mmu_module_init(void)
2496 {
2497         pte_chain_cache = kmem_cache_create("kvm_pte_chain",
2498                                             sizeof(struct kvm_pte_chain),
2499                                             0, 0, NULL);
2500         if (!pte_chain_cache)
2501                 goto nomem;
2502         rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
2503                                             sizeof(struct kvm_rmap_desc),
2504                                             0, 0, NULL);
2505         if (!rmap_desc_cache)
2506                 goto nomem;
2507
2508         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
2509                                                   sizeof(struct kvm_mmu_page),
2510                                                   0, 0, NULL);
2511         if (!mmu_page_header_cache)
2512                 goto nomem;
2513
2514         register_shrinker(&mmu_shrinker);
2515
2516         return 0;
2517
2518 nomem:
2519         mmu_destroy_caches();
2520         return -ENOMEM;
2521 }
2522
2523 /*
2524  * Calculate the number of mmu pages needed for a kvm guest.
2525  */
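/*
 * For example, assuming KVM_PERMILLE_MMU_PAGES is 20 (i.e. 2%) and
 * KVM_MIN_ALLOC_MMU_PAGES is 64, as in contemporary kernels: a 4GB guest
 * (1048576 4kB pages) gets 1048576 * 20 / 1000 = 20971 shadow pages, while a
 * very small guest is rounded up to the 64-page floor.
 */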
2526 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
2527 {
2528         int i;
2529         unsigned int nr_mmu_pages;
2530         unsigned int  nr_pages = 0;
2531
2532         for (i = 0; i < kvm->nmemslots; i++)
2533                 nr_pages += kvm->memslots[i].npages;
2534
2535         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
2536         nr_mmu_pages = max(nr_mmu_pages,
2537                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
2538
2539         return nr_mmu_pages;
2540 }
2541
2542 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2543                                 unsigned len)
2544 {
2545         if (len > buffer->len)
2546                 return NULL;
2547         return buffer->ptr;
2548 }
2549
2550 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2551                                 unsigned len)
2552 {
2553         void *ret;
2554
2555         ret = pv_mmu_peek_buffer(buffer, len);
2556         if (!ret)
2557                 return ret;
2558         buffer->ptr += len;
2559         buffer->len -= len;
2560         buffer->processed += len;
2561         return ret;
2562 }
2563
2564 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
2565                              gpa_t addr, gpa_t value)
2566 {
2567         int bytes = 8;
2568         int r;
2569
2570         if (!is_long_mode(vcpu) && !is_pae(vcpu))
2571                 bytes = 4;
2572
2573         r = mmu_topup_memory_caches(vcpu);
2574         if (r)
2575                 return r;
2576
2577         if (!emulator_write_phys(vcpu, addr, &value, bytes))
2578                 return -EFAULT;
2579
2580         return 1;
2581 }
2582
2583 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2584 {
2585         kvm_x86_ops->tlb_flush(vcpu);
2586         return 1;
2587 }
2588
2589 static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
2590 {
2591         spin_lock(&vcpu->kvm->mmu_lock);
2592         mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
2593         spin_unlock(&vcpu->kvm->mmu_lock);
2594         return 1;
2595 }
2596
2597 static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
2598                              struct kvm_pv_mmu_op_buffer *buffer)
2599 {
2600         struct kvm_mmu_op_header *header;
2601
2602         header = pv_mmu_peek_buffer(buffer, sizeof *header);
2603         if (!header)
2604                 return 0;
2605         switch (header->op) {
2606         case KVM_MMU_OP_WRITE_PTE: {
2607                 struct kvm_mmu_op_write_pte *wpte;
2608
2609                 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
2610                 if (!wpte)
2611                         return 0;
2612                 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
2613                                         wpte->pte_val);
2614         }
2615         case KVM_MMU_OP_FLUSH_TLB: {
2616                 struct kvm_mmu_op_flush_tlb *ftlb;
2617
2618                 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
2619                 if (!ftlb)
2620                         return 0;
2621                 return kvm_pv_mmu_flush_tlb(vcpu);
2622         }
2623         case KVM_MMU_OP_RELEASE_PT: {
2624                 struct kvm_mmu_op_release_pt *rpt;
2625
2626                 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
2627                 if (!rpt)
2628                         return 0;
2629                 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
2630         }
2631         default: return 0;
2632         }
2633 }
2634
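/*
 * Entry point for the paravirtual MMU hypercall: copy the guest-supplied
 * buffer of packed mmu ops into the per-vcpu scratch buffer, dispatch each op
 * until the buffer is exhausted or an op cannot be decoded, and report how
 * many bytes were consumed.
 */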
2635 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2636                   gpa_t addr, unsigned long *ret)
2637 {
2638         int r;
2639         struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2640
2641         buffer->ptr = buffer->buf;
2642         buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2643         buffer->processed = 0;
2644
2645         r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2646         if (r)
2647                 goto out;
2648
2649         while (buffer->len) {
2650                 r = kvm_pv_mmu_op_one(vcpu, buffer);
2651                 if (r < 0)
2652                         goto out;
2653                 if (r == 0)
2654                         break;
2655         }
2656
2657         r = 1;
2658 out:
2659         *ret = buffer->processed;
2660         return r;
2661 }
2662
2663 #ifdef AUDIT
2664
2665 static const char *audit_msg;
2666
2667 static gva_t canonicalize(gva_t gva)
2668 {
2669 #ifdef CONFIG_X86_64
2670         gva = (long long)(gva << 16) >> 16;
2671 #endif
2672         return gva;
2673 }
2674
2675 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
2676                                 gva_t va, int level)
2677 {
2678         u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
2679         int i;
2680         gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
2681
2682         for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
2683                 u64 ent = pt[i];
2684
2685                 if (ent == shadow_trap_nonpresent_pte)
2686                         continue;
2687
2688                 va = canonicalize(va);
2689                 if (level > 1) {
2690                         if (ent == shadow_notrap_nonpresent_pte)
2691                                 printk(KERN_ERR "audit: (%s) nontrapping pte"
2692                                        " in nonleaf level: levels %d gva %lx"
2693                                        " level %d pte %llx\n", audit_msg,
2694                                        vcpu->arch.mmu.root_level, va, level, ent);
2695
2696                         audit_mappings_page(vcpu, ent, va, level - 1);
2697                 } else {
2698                         gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
2699                         hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
2700
2701                         if (is_shadow_present_pte(ent)
2702                             && (ent & PT64_BASE_ADDR_MASK) != hpa)
2703                                 printk(KERN_ERR "xx audit error: (%s) levels %d"
2704                                        " gva %lx gpa %llx hpa %llx ent %llx %d\n",
2705                                        audit_msg, vcpu->arch.mmu.root_level,
2706                                        va, gpa, hpa, ent,
2707                                        is_shadow_present_pte(ent));
2708                         else if (ent == shadow_notrap_nonpresent_pte
2709                                  && !is_error_hpa(hpa))
2710                                 printk(KERN_ERR "audit: (%s) notrap shadow,"
2711                                        " valid guest gva %lx\n", audit_msg, va);
2712                         kvm_release_pfn_clean(hpa >> PAGE_SHIFT); /* ref taken by gpa_to_pfn() */
2713
2714                 }
2715         }
2716 }
2717
2718 static void audit_mappings(struct kvm_vcpu *vcpu)
2719 {
2720         unsigned i;
2721
2722         if (vcpu->arch.mmu.root_level == 4)
2723                 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
2724         else
2725                 for (i = 0; i < 4; ++i)
2726                         if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
2727                                 audit_mappings_page(vcpu,
2728                                                     vcpu->arch.mmu.pae_root[i],
2729                                                     i << 30,
2730                                                     2);
2731 }
2732
2733 static int count_rmaps(struct kvm_vcpu *vcpu)
2734 {
2735         int nmaps = 0;
2736         int i, j, k;
2737
2738         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
2739                 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
2740                 struct kvm_rmap_desc *d;
2741
2742                 for (j = 0; j < m->npages; ++j) {
2743                         unsigned long *rmapp = &m->rmap[j];
2744
2745                         if (!*rmapp)
2746                                 continue;
2747                         if (!(*rmapp & 1)) {
2748                                 ++nmaps;
2749                                 continue;
2750                         }
2751                         d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
2752                         while (d) {
2753                                 for (k = 0; k < RMAP_EXT; ++k)
2754                                         if (d->shadow_ptes[k])
2755                                                 ++nmaps;
2756                                         else
2757                                                 break;
2758                                 d = d->more;
2759                         }
2760                 }
2761         }
2762         return nmaps;
2763 }
2764
2765 static int count_writable_mappings(struct kvm_vcpu *vcpu)
2766 {
2767         int nmaps = 0;
2768         struct kvm_mmu_page *sp;
2769         int i;
2770
2771         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2772                 u64 *pt = sp->spt;
2773
2774                 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
2775                         continue;
2776
2777                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
2778                         u64 ent = pt[i];
2779
2780                         if (!(ent & PT_PRESENT_MASK))
2781                                 continue;
2782                         if (!(ent & PT_WRITABLE_MASK))
2783                                 continue;
2784                         ++nmaps;
2785                 }
2786         }
2787         return nmaps;
2788 }
2789
2790 static void audit_rmap(struct kvm_vcpu *vcpu)
2791 {
2792         int n_rmap = count_rmaps(vcpu);
2793         int n_actual = count_writable_mappings(vcpu);
2794
2795         if (n_rmap != n_actual)
2796                 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
2797                        __func__, audit_msg, n_rmap, n_actual);
2798 }
2799
2800 static void audit_write_protection(struct kvm_vcpu *vcpu)
2801 {
2802         struct kvm_mmu_page *sp;
2803         struct kvm_memory_slot *slot;
2804         unsigned long *rmapp;
2805         gfn_t gfn;
2806
2807         list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
2808                 if (sp->role.metaphysical)
2809                         continue;
2810
2811                 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
2812                 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
2813                 rmapp = &slot->rmap[gfn - slot->base_gfn];
2814                 if (*rmapp)
2815                         printk(KERN_ERR "%s: (%s) shadow page has writable"
2816                                " mappings: gfn %lx role %x\n",
2817                                __func__, audit_msg, sp->gfn,
2818                                sp->role.word);
2819         }
2820 }
2821
2822 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
2823 {
2824         int olddbg = dbg;
2825
2826         dbg = 0;
2827         audit_msg = msg;
2828         audit_rmap(vcpu);
2829         audit_write_protection(vcpu);
2830         audit_mappings(vcpu);
2831         dbg = olddbg;
2832 }
2833
2834 #endif