X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fkvm%2Fmmu.c;h=22c426cd8cb2a842274b0a005f01c42001fe430f;hb=5263bf65d6342e12ab716db8e529501670979321;hp=6dbd83b866239be935a3ba984240c22c6931e86b;hpb=9b7a032567ee1128daeebebfc14d3acedfe28c8c;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 6dbd83b..22c426c 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -26,8 +26,31 @@ #include "vmx.h" #include "kvm.h" -#define pgprintk(x...) do { printk(x); } while (0) -#define rmap_printk(x...) do { printk(x); } while (0) +#undef MMU_DEBUG + +#undef AUDIT + +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) + +#endif + +#if defined(MMU_DEBUG) || defined(AUDIT) +static int dbg = 1; +#endif #define ASSERT(x) \ if (!(x)) { \ @@ -120,6 +143,7 @@ #define PFERR_PRESENT_MASK (1U << 0) #define PFERR_WRITE_MASK (1U << 1) #define PFERR_USER_MASK (1U << 2) +#define PFERR_FETCH_MASK (1U << 4) #define PT64_ROOT_LEVEL 4 #define PT32_ROOT_LEVEL 2 @@ -145,6 +169,11 @@ static int is_cpuid_PSE36(void) return 1; } +static int is_nx(struct kvm_vcpu *vcpu) +{ + return vcpu->shadow_efer & EFER_NX; +} + static int is_present_pte(unsigned long pte) { return pte & PT_PRESENT_MASK; @@ -166,6 +195,91 @@ static int is_rmap_pte(u64 pte) == (PT_WRITABLE_MASK | PT_PRESENT_MASK); } +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + size_t objsize, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kzalloc(objsize, GFP_NOWAIT); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + kfree(mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain), 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc), 1); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); + mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + memset(p, 0, size); + return p; +} + +static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj) +{ + if (mc->nobjs < KVM_NR_MEM_OBJS) + mc->objects[mc->nobjs++] = obj; + else + kfree(obj); +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_vcpu *vcpu, + struct kvm_pte_chain *pc) +{ + mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu, + struct kvm_rmap_desc *rd) +{ + mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd); +} + /* * Reverse mapping data structures: * @@ -175,7 +289,7 @@ static int is_rmap_pte(u64 pte) * If page->private bit zero is one, (then page->private & ~1) points * to a struct kvm_rmap_desc containing more mappings. */ -static void rmap_add(struct kvm *kvm, u64 *spte) +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) { struct page *page; struct kvm_rmap_desc *desc; @@ -189,9 +303,7 @@ static void rmap_add(struct kvm *kvm, u64 *spte) page->private = (unsigned long)spte; } else if (!(page->private & 1)) { rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); - desc = kzalloc(sizeof *desc, GFP_NOWAIT); - if (!desc) - BUG(); /* FIXME: return error */ + desc = mmu_alloc_rmap_desc(vcpu); desc->shadow_ptes[0] = (u64 *)page->private; desc->shadow_ptes[1] = spte; page->private = (unsigned long)desc | 1; @@ -201,9 +313,7 @@ static void rmap_add(struct kvm *kvm, u64 *spte) while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) desc = desc->more; if (desc->shadow_ptes[RMAP_EXT-1]) { - desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT); - if (!desc->more) - BUG(); /* FIXME: return error */ + desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; } for (i = 0; desc->shadow_ptes[i]; ++i) @@ -212,7 +322,8 @@ static void rmap_add(struct kvm *kvm, u64 *spte) } } -static void rmap_desc_remove_entry(struct page *page, +static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu, + struct page *page, struct kvm_rmap_desc *desc, int i, struct kvm_rmap_desc *prev_desc) @@ -232,10 +343,10 @@ static void rmap_desc_remove_entry(struct page *page, prev_desc->more = desc->more; else page->private = (unsigned long)desc->more | 1; - kfree(desc); + mmu_free_rmap_desc(vcpu, desc); } -static void rmap_remove(struct kvm *kvm, u64 *spte) +static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte) { struct page *page; struct kvm_rmap_desc *desc; @@ -263,7 +374,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) while (desc) { for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) if (desc->shadow_ptes[i] == spte) { - rmap_desc_remove_entry(page, desc, i, + rmap_desc_remove_entry(vcpu, page, + desc, i, prev_desc); return; } @@ -274,8 +386,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } -static void rmap_write_protect(struct kvm *kvm, u64 gfn) +static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { + struct kvm *kvm = vcpu->kvm; struct page *page; struct kvm_memory_slot *slot; struct kvm_rmap_desc *desc; @@ -298,29 +411,36 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - rmap_remove(kvm, spte); + rmap_remove(vcpu, spte); + kvm_arch_ops->tlb_flush(vcpu); *spte &= ~(u64)PT_WRITABLE_MASK; } } +static int is_empty_shadow_page(hpa_t page_hpa) +{ + u64 *pos; + u64 *end; + + for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); + pos != end; pos++) + if (*pos != 0) { + printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + pos, *pos); + return 0; + } + return 1; +} + static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) { struct kvm_mmu_page *page_head = page_header(page_hpa); + ASSERT(is_empty_shadow_page(page_hpa)); list_del(&page_head->link); page_head->page_hpa = page_hpa; list_add(&page_head->link, &vcpu->free_pages); -} - -static int is_empty_shadow_page(hpa_t page_hpa) -{ - u32 *pos; - u32 *end; - for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32); - pos != end; pos++) - if (*pos != 0) - return 0; - return 1; + ++vcpu->kvm->n_free_mmu_pages; } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -344,10 +464,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, page->global = 1; page->multimapped = 0; page->parent_pte = parent_pte; + --vcpu->kvm->n_free_mmu_pages; return page; } -static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte) +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, u64 *parent_pte) { struct kvm_pte_chain *pte_chain; struct hlist_node *node; @@ -363,8 +485,7 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte) return; } page->multimapped = 1; - pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT); - BUG_ON(!pte_chain); + pte_chain = mmu_alloc_pte_chain(vcpu); INIT_HLIST_HEAD(&page->parent_ptes); hlist_add_head(&pte_chain->link, &page->parent_ptes); pte_chain->parent_ptes[0] = old; @@ -378,13 +499,14 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte) return; } } - pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT); + pte_chain = mmu_alloc_pte_chain(vcpu); BUG_ON(!pte_chain); hlist_add_head(&pte_chain->link, &page->parent_ptes); pte_chain->parent_ptes[0] = parent_pte; } -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, +static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, u64 *parent_pte) { struct kvm_pte_chain *pte_chain; @@ -402,12 +524,21 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, break; if (pte_chain->parent_ptes[i] != parent_pte) continue; - while (i + 1 < NR_PTE_CHAIN_ENTRIES) { + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { pte_chain->parent_ptes[i] = pte_chain->parent_ptes[i + 1]; ++i; } pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(vcpu, pte_chain); + if (hlist_empty(&page->parent_ptes)) { + page->multimapped = 0; + page->parent_pte = NULL; + } + } return; } BUG(); @@ -462,7 +593,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, bucket = &vcpu->kvm->mmu_page_hash[index]; hlist_for_each_entry(page, node, bucket, hash_link) if (page->gfn == gfn && page->role.word == role.word) { - mmu_page_add_parent_pte(page, parent_pte); + mmu_page_add_parent_pte(vcpu, page, parent_pte); pgprintk("%s: found\n", __FUNCTION__); return page; } @@ -474,15 +605,96 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, page->role = role; hlist_add_head(&page->hash_link, bucket); if (!metaphysical) - rmap_write_protect(vcpu->kvm, gfn); + rmap_write_protect(vcpu, gfn); return page; } +static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = __va(page->page_hpa); + + if (page->role.level == PT_PAGE_TABLE_LEVEL) { + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] & PT_PRESENT_MASK) + rmap_remove(vcpu, &pt[i]); + pt[i] = 0; + } + kvm_arch_ops->tlb_flush(vcpu); + return; + } + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + pt[i] = 0; + if (!(ent & PT_PRESENT_MASK)) + continue; + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]); + } +} + static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, u64 *parent_pte) { - mmu_page_remove_parent_pte(page, parent_pte); + mmu_page_remove_parent_pte(vcpu, page, parent_pte); +} + +static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page) +{ + u64 *parent_pte; + + while (page->multimapped || page->parent_pte) { + if (!page->multimapped) + parent_pte = page->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(page->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(vcpu, page, parent_pte); + *parent_pte = 0; + } + kvm_mmu_page_unlink_children(vcpu, page); + if (!page->root_count) { + hlist_del(&page->hash_link); + kvm_mmu_free_page(vcpu, page->page_hpa); + } else { + list_del(&page->link); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); + } +} + +static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node, *n; + int r; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + r = 0; + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) + if (page->gfn == gfn && !page->role.metaphysical) { + pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + page->role.word); + kvm_mmu_zap_page(vcpu, page); + r = 1; + } + return r; } static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) @@ -523,35 +735,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) return gpa_to_hpa(vcpu, gpa); } - -static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa, - int level) -{ - u64 *pos; - u64 *end; - - ASSERT(vcpu); - ASSERT(VALID_PAGE(page_hpa)); - ASSERT(level <= PT64_ROOT_LEVEL && level > 0); - - for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE; - pos != end; pos++) { - u64 current_ent = *pos; - - if (is_present_pte(current_ent)) { - if (level != 1) - release_pt_page_64(vcpu, - current_ent & - PT64_BASE_ADDR_MASK, - level - 1); - else - rmap_remove(vcpu->kvm, pos); - } - *pos = 0; - } - kvm_mmu_free_page(vcpu, page_hpa); -} - static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } @@ -577,7 +760,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) page_header_update_slot(vcpu->kvm, table, v); table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - rmap_add(vcpu->kvm, &table[index]); + rmap_add(vcpu, &table[index]); return 0; } @@ -605,12 +788,15 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; + struct kvm_mmu_page *page; #ifdef CONFIG_X86_64 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->mmu.root_hpa; ASSERT(VALID_PAGE(root)); + page = page_header(root); + --page->root_count; vcpu->mmu.root_hpa = INVALID_PAGE; return; } @@ -620,6 +806,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) ASSERT(VALID_PAGE(root)); root &= PT64_BASE_ADDR_MASK; + page = page_header(root); + --page->root_count; vcpu->mmu.pae_root[i] = INVALID_PAGE; } vcpu->mmu.root_hpa = INVALID_PAGE; @@ -629,6 +817,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; + struct kvm_mmu_page *page; + root_gfn = vcpu->cr3 >> PAGE_SHIFT; #ifdef CONFIG_X86_64 @@ -636,8 +826,10 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - root = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, NULL)->page_hpa; + page = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, 0, NULL); + root = page->page_hpa; + ++page->root_count; vcpu->mmu.root_hpa = root; return; } @@ -650,26 +842,16 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; else if (vcpu->mmu.root_level == 0) root_gfn = 0; - root = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), - NULL)->page_hpa; + NULL); + root = page->page_hpa; + ++page->root_count; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); } -static void nonpaging_flush(struct kvm_vcpu *vcpu) -{ - hpa_t root = vcpu->mmu.root_hpa; - - ++kvm_stat.tlb_flush; - pgprintk("nonpaging_flush\n"); - mmu_free_roots(vcpu); - mmu_alloc_roots(vcpu); - kvm_arch_ops->set_cr3(vcpu, root); - kvm_arch_ops->tlb_flush(vcpu); -} - static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -678,32 +860,24 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { - int ret; gpa_t addr = gva; + hpa_t paddr; + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); - for (;;) { - hpa_t paddr; - paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); + paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); - if (is_error_hpa(paddr)) - return 1; - - ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr); - if (ret) { - nonpaging_flush(vcpu); - continue; - } - break; - } - return ret; -} + if (is_error_hpa(paddr)) + return 1; -static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) -{ + return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); } static void nonpaging_free(struct kvm_vcpu *vcpu) @@ -717,7 +891,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; - context->inval_page = nonpaging_inval_page; context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->root_level = 0; @@ -738,6 +911,8 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) { pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); mmu_free_roots(vcpu); + if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); kvm_mmu_flush_tlb(vcpu); kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); @@ -785,7 +960,10 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, pgprintk("%s: found shadow page for %lx, marking ro\n", __FUNCTION__, gfn); access_bits &= ~PT_WRITABLE_MASK; - *shadow_pte &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(*shadow_pte)) { + *shadow_pte &= ~PT_WRITABLE_MASK; + kvm_arch_ops->tlb_flush(vcpu); + } } } @@ -793,7 +971,7 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); - rmap_add(vcpu->kvm, shadow_pte); + rmap_add(vcpu, shadow_pte); } static void inject_page_fault(struct kvm_vcpu *vcpu, @@ -820,52 +998,6 @@ static inline int fix_read_pf(u64 *shadow_ent) return 0; } -static int may_access(u64 pte, int write, int user) -{ - - if (user && !(pte & PT_USER_MASK)) - return 0; - if (write && !(pte & PT_WRITABLE_MASK)) - return 0; - return 1; -} - -/* - * Remove a shadow pte. - */ -static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) -{ - hpa_t page_addr = vcpu->mmu.root_hpa; - int level = vcpu->mmu.shadow_root_level; - - ++kvm_stat.invlpg; - - for (; ; level--) { - u32 index = PT64_INDEX(addr, level); - u64 *table = __va(page_addr); - - if (level == PT_PAGE_TABLE_LEVEL ) { - rmap_remove(vcpu->kvm, &table[index]); - table[index] = 0; - return; - } - - if (!is_present_pte(table[index])) - return; - - page_addr = table[index] & PT64_BASE_ADDR_MASK; - - if (level == PT_DIRECTORY_LEVEL && - (table[index] & PT_SHADOW_PS_MARK)) { - table[index] = 0; - release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL); - - kvm_arch_ops->tlb_flush(vcpu); - return; - } - } -} - static void paging_free(struct kvm_vcpu *vcpu) { nonpaging_free(vcpu); @@ -886,7 +1018,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; - context->inval_page = paging_inval_page; context->gva_to_gpa = paging64_gva_to_gpa; context->free = paging_free; context->root_level = level; @@ -909,7 +1040,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; - context->inval_page = paging_inval_page; context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->root_level = PT32_ROOT_LEVEL; @@ -952,8 +1082,15 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { + int r; + destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); + r = init_kvm_mmu(vcpu); + if (r < 0) + goto out; + r = mmu_topup_memory_caches(vcpu); +out: + return r; } void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) @@ -961,21 +1098,50 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; struct kvm_mmu_page *child; - struct hlist_node *node; + struct hlist_node *node, *n; struct hlist_head *bucket; unsigned index; u64 *spte; u64 pte; unsigned offset = offset_in_page(gpa); + unsigned pte_size; unsigned page_offset; + unsigned misaligned; int level; + int flooded = 0; pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + if (gfn == vcpu->last_pt_write_gfn) { + ++vcpu->last_pt_write_count; + if (vcpu->last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->last_pt_write_gfn = gfn; + vcpu->last_pt_write_count = 1; + } index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; bucket = &vcpu->kvm->mmu_page_hash[index]; - hlist_for_each_entry(page, node, bucket, hash_link) { + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { if (page->gfn != gfn || page->role.metaphysical) continue; + pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, page->role.word); + kvm_mmu_zap_page(vcpu, page); + continue; + } page_offset = offset; level = page->role.level; if (page->role.glevels == PT32_ROOT_LEVEL) { @@ -987,10 +1153,10 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) pte = *spte; if (is_present_pte(pte)) { if (level == PT_PAGE_TABLE_LEVEL) - rmap_remove(vcpu->kvm, spte); + rmap_remove(vcpu, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, spte); + mmu_page_remove_parent_pte(vcpu, child, spte); } } *spte = 0; @@ -1001,11 +1167,35 @@ void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) { } -static void free_mmu_pages(struct kvm_vcpu *vcpu) +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { - while (!list_empty(&vcpu->free_pages)) { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); +} + +void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { struct kvm_mmu_page *page; + page = container_of(vcpu->kvm->active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu, page); + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages); + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + while (!list_empty(&vcpu->kvm->active_mmu_pages)) { + page = container_of(vcpu->kvm->active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu, page); + } + while (!list_empty(&vcpu->free_pages)) { page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_del(&page->link); @@ -1032,6 +1222,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; memset(__va(page_header->page_hpa), 0, PAGE_SIZE); list_add(&page_header->link, &vcpu->free_pages); + ++vcpu->kvm->n_free_mmu_pages; } /* @@ -1077,10 +1268,12 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) destroy_kvm_mmu(vcpu); free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) { + struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *page; list_for_each_entry(page, &kvm->active_mmu_pages, link) { @@ -1094,8 +1287,168 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ if (pt[i] & PT_WRITABLE_MASK) { - rmap_remove(kvm, &pt[i]); + rmap_remove(vcpu, &pt[i]); pt[i] &= ~PT_WRITABLE_MASK; } } } + +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) +{ +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (!ent & PT_PRESENT_MASK) + continue; + + va = canonicalize(va); + if (level > 1) + audit_mappings_page(vcpu, ent, va, level - 1); + else { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + if ((ent & PT_PRESENT_MASK) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx\n", + audit_msg, vcpu->mmu.root_level, + va, gpa, hpa, ent); + } + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + int i; + + if (vcpu->mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->mmu.pae_root[i], + i << 30, + 2); +} + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k; + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + struct page *page = m->phys_mem[j]; + + if (!page->private) + continue; + if (!(page->private & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(page->private & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->shadow_ptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + return nmaps; +} + +static int count_writable_mappings(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + struct kvm_mmu_page *page; + int i; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + u64 *pt = __va(page->page_hpa); + + if (page->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + ++nmaps; + } + } + return nmaps; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + int n_rmap = count_rmaps(vcpu); + int n_actual = count_writable_mappings(vcpu); + + if (n_rmap != n_actual) + printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", + __FUNCTION__, audit_msg, n_rmap, n_actual); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + hfn_t hfn; + struct page *pg; + + if (page->role.metaphysical) + continue; + + hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) + >> PAGE_SHIFT; + pg = pfn_to_page(hfn); + if (pg->private) + printk(KERN_ERR "%s: (%s) shadow page has writable" + " mappings: gfn %lx role %x\n", + __FUNCTION__, audit_msg, page->gfn, + page->role.word); + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + audit_mappings(vcpu); + dbg = olddbg; +} + +#endif