X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fmmu.c;h=aac0499947d864786003b8b993f738b797a2877b;hb=4677a3b693e035f186e2875259b9a0bb94c42fbe;hp=a24da8f2ee9136d603848b04bc24682199989ec4;hpb=bc2d429979451d69d0985c5dbdf908cace2831cc;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a24da8f..aac0499 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -17,7 +17,6 @@ * */ -#include "vmx.h" #include "mmu.h" #include @@ -33,6 +32,7 @@ #include #include #include +#include /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -70,6 +70,9 @@ static int dbg = 0; module_param(dbg, bool, 0644); #endif +static int oos_shadow = 1; +module_param(oos_shadow, bool, 0644); + #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -142,11 +145,26 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; -struct kvm_shadow_walk { - int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, - gva_t addr, u64 *spte, int level); +struct kvm_shadow_walk_iterator { + u64 addr; + hpa_t shadow_addr; + int level; + u64 *sptep; + unsigned index; +}; + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ + for (shadow_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + + +struct kvm_unsync_walk { + int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); }; +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -159,6 +177,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_mt_mask; void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -174,13 +193,14 @@ 
void kvm_mmu_set_base_ptes(u64 base_pte) EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_mt_mask = mt_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -305,7 +325,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 1); + rmap_desc_cache, 4); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -375,7 +395,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count += 1; } @@ -383,16 +405,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count -= 1; WARN_ON(*write_count < 0); } static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + struct kvm_memory_slot *slot; int *largepage_idx; + gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot); return *largepage_idx; @@ -405,16 +431,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr; + int ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return 0; + return ret; + down_read(&current->mm->mmap_sem); vma = find_vma(current->mm, addr); 
if (vma && is_vm_hugetlb_page(vma)) - return 1; + ret = 1; + up_read(&current->mm->mmap_sem); - return 0; + return ret; } static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -601,7 +630,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) return NULL; } -static void rmap_write_protect(struct kvm *kvm, u64 gfn) +static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; @@ -647,10 +676,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) - kvm_flush_remote_tlbs(kvm); - - account_shadowed(kvm, gfn); + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) @@ -776,8 +802,9 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&sp->oos_link); ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; @@ -859,6 +886,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + 
mmu_parent_walk(vcpu, parent_sp, fn); + } +} + +static void kvm_mmu_update_unsync_bitmap(u64 *spte) +{ + unsigned int index; + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + index = spte - sp->spt; + if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) + sp->unsync_children++; + WARN_ON(!sp->unsync_children); +} + +static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->parent_pte) + return; + + if (!sp->multimapped) { + kvm_mmu_update_unsync_bitmap(sp->parent_pte); + return; + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); + } +} + +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + kvm_mmu_update_parents_unsync(sp); + return 1; +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -868,6 +966,96 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, sp->spt[i] = shadow_trap_nonpresent_pte; } +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + +#define KVM_PAGE_ARRAY_NR 16 + +struct kvm_mmu_pages { + struct mmu_page_and_offset { + struct kvm_mmu_page *sp; + unsigned int idx; + } page[KVM_PAGE_ARRAY_NR]; + unsigned int nr; +}; + +#define for_each_unsync_children(bitmap, idx) \ + for (idx = find_first_bit(bitmap, 512); \ + idx < 512; \ + idx = find_next_bit(bitmap, 512, idx+1)) + +int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) +{ + int i; + + if (sp->unsync) + for (i=0; i < pvec->nr; 
i++) + if (pvec->page[i].sp == sp) + return 0; + + pvec->page[pvec->nr].sp = sp; + pvec->page[pvec->nr].idx = idx; + pvec->nr++; + return (pvec->nr == KVM_PAGE_ARRAY_NR); +} + +static int __mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + int i, ret, nr_unsync_leaf = 0; + + for_each_unsync_children(sp->unsync_child_bitmap, i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + __clear_bit(i, sp->unsync_child_bitmap); + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } + + if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } + } + } + + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) + sp->unsync_children = 0; + + return nr_unsync_leaf; +} + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + if (!sp->unsync_children) + return 0; + + mmu_pages_add(pvec, sp, 0); + return __mmu_unsync_walk(sp, pvec); +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -888,6 +1076,127 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + list_del(&sp->oos_link); + --kvm->stat.mmu_unsync_global; +} + +static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + WARN_ON(!sp->unsync); + sp->unsync = 0; + if (sp->global) + kvm_unlink_unsync_global(kvm, sp); + --kvm->stat.mmu_unsync; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + if (sp->role.glevels != vcpu->arch.mmu.root_level) { + 
kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + kvm_mmu_flush_tlb(vcpu); + return 0; +} + +struct mmu_page_path { + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; + unsigned int idx[PT64_ROOT_LEVEL-1]; +}; + +#define for_each_sp(pvec, sp, parents, i) \ + for (i = mmu_pages_next(&pvec, &parents, -1), \ + sp = pvec.page[i].sp; \ + i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i = mmu_pages_next(&pvec, &parents, i)) + +int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, + int i) +{ + int n; + + for (n = i+1; n < pvec->nr; n++) { + struct kvm_mmu_page *sp = pvec->page[n].sp; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + parents->idx[0] = pvec->page[n].idx; + return n; + } + + parents->parent[sp->role.level-2] = sp; + parents->idx[sp->role.level-1] = pvec->page[n].idx; + } + + return n; +} + +void mmu_pages_clear_parents(struct mmu_page_path *parents) +{ + struct kvm_mmu_page *sp; + unsigned int level = 0; + + do { + unsigned int idx = parents->idx[level]; + + sp = parents->parent[level]; + if (!sp) + return; + + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); + level++; + } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); +} + +static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, + struct mmu_page_path *parents, + struct kvm_mmu_pages *pvec) +{ + parents->parent[parent->role.level-1] = NULL; + pvec->nr = 0; +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent) +{ + int i; + struct kvm_mmu_page *sp; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + int protected = 0; + + for_each_sp(pages, sp, 
parents, i) + protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + + if (protected) + kvm_flush_remote_tlbs(vcpu->kvm); + + for_each_sp(pages, sp, parents, i) { + kvm_sync_page(vcpu, sp); + mmu_pages_clear_parents(&parents); + } + cond_resched_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_pages_init(parent, &parents, &pages); + } +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -901,10 +1210,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; + struct hlist_node *node, *tmp; - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; + role = vcpu->arch.mmu.base_role; role.level = level; role.metaphysical = metaphysical; role.access = access; @@ -917,9 +1225,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } pgprintk("%s: found\n", __func__); return sp; } @@ -930,9 +1249,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; + sp->global = role.cr4_pge; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) - rmap_write_protect(vcpu->kvm, gfn); + if (!metaphysical) { + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + account_shadowed(vcpu->kvm, gfn); + } if 
(shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) vcpu->arch.mmu.prefetch_page(vcpu, sp); else @@ -940,33 +1263,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } -static int walk_shadow(struct kvm_shadow_walk *walker, - struct kvm_vcpu *vcpu, gva_t addr) +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) { - hpa_t shadow_addr; - int level; - int r; - u64 *sptep; - unsigned index; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; + iterator->addr = addr; + iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->level = vcpu->arch.mmu.shadow_root_level; + if (iterator->level == PT32E_ROOT_LEVEL) { + iterator->shadow_addr + = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + --iterator->level; + if (!iterator->shadow_addr) + iterator->level = 0; } +} - while (level >= PT_PAGE_TABLE_LEVEL) { - index = SHADOW_PT_INDEX(addr, level); - sptep = ((u64 *)__va(shadow_addr)) + index; - r = walker->entry(walker, vcpu, addr, sptep, level); - if (r) - return r; - shadow_addr = *sptep & PT64_BASE_ADDR_MASK; - --level; - } - return 0; +static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) +{ + if (iterator->level < PT_PAGE_TABLE_LEVEL) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; + return true; +} + +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; + --iterator->level; } static void kvm_mmu_page_unlink_children(struct kvm *kvm, @@ -1038,14 +1363,43 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int mmu_zap_unsync_children(struct kvm *kvm, + struct kvm_mmu_page *parent) +{ + int i, zapped = 0; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + if (parent->role.level == PT_PAGE_TABLE_LEVEL) + return 0; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + struct kvm_mmu_page *sp; + + for_each_sp(pages, sp, parents, i) { + kvm_mmu_zap_page(kvm, sp); + mmu_pages_clear_parents(&parents); + } + zapped += pages.nr; + kvm_mmu_pages_init(parent, &parents, &pages); + } + + return zapped; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { + int ret; ++kvm->stat.mmu_shadow_zapped; + ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.metaphysical) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) + kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); @@ -1055,6 +1409,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); + return ret; } /* @@ -1107,19 +1462,29 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) if (sp->gfn == gfn && !sp->role.metaphysical) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + n = bucket->first; } return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { + unsigned index; + struct hlist_head *bucket; struct kvm_mmu_page *sp; + struct hlist_node *node, *nn; - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + 
hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { + if (sp->gfn == gfn && !sp->role.metaphysical + && !sp->role.invalid) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } } } @@ -1128,7 +1493,21 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); struct kvm_mmu_page *sp = page_header(__pa(pte)); - __set_bit(slot, &sp->slot_bitmap); + __set_bit(slot, sp->slot_bitmap); +} + +static void mmu_convert_notrap(struct kvm_mmu_page *sp) +{ + int i; + u64 *pt = sp->spt; + + if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] == shadow_notrap_nonpresent_pte) + set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + } } struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) @@ -1140,48 +1519,178 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) if (gpa == UNMAPPED_GVA) return NULL; - down_read(&current->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(&current->mm->mmap_sem); return page; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) +/* + * The function is based on mtrr_type_lookup() in + * arch/x86/kernel/cpu/mtrr/generic.c + */ +static int get_mtrr_type(struct mtrr_state_type *mtrr_state, + u64 start, u64 end) { - u64 spte; - int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); + int i; + u64 base, mask; + u8 prev_match, curr_match; + int num_var_ranges = KVM_NR_VAR_MTRR; + + if (!mtrr_state->enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. 
Just return the type as per start */ + if (mtrr_state->have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state->fixed_ranges[idx]; + } + } - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!(mtrr_state->enabled & 2)) + return mtrr_state->def_type; - if (is_rmap_pte(*shadow_pte)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. - */ - if (largepage && !is_large_pte(*shadow_pte)) { - struct kvm_mmu_page *child; - u64 pte = *shadow_pte; + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { - pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; + if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + + (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + + (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) + 
continue; + + curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) + return MTRR_TYPE_UNCACHABLE; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state->def_type; +} + +static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u8 mtrr; + + mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, + (gfn << PAGE_SHIFT) + PAGE_SIZE); + if (mtrr == 0xfe || mtrr == 0xff) + mtrr = MTRR_TYPE_WRBACK; + return mtrr; +} + +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + + index = kvm_page_table_hashfn(sp->gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != sp->gfn || s->role.metaphysical) + continue; + if (s->role.word != sp->role.word) + return 1; + } + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + + if (sp->global) { + list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); + ++vcpu->kvm->stat.mmu_unsync_global; + } else + kvm_mmu_mark_parents_unsync(vcpu, sp); + + mmu_convert_notrap(sp); + return 0; +} + +static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) +{ + struct kvm_mmu_page *shadow; + + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + if (shadow->unsync) + return 0; + if (can_unsync && 
oos_shadow) + return kvm_unsync_page(vcpu, shadow); + return 1; + } + return 0; +} + +static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pte_access, int user_fault, + int write_fault, int dirty, int largepage, + int global, gfn_t gfn, pfn_t pfn, bool speculative, + bool can_unsync) +{ + u64 spte; + int ret = 0; + u64 mt_mask = shadow_mt_mask; + struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); + + if (!global && sp->global) { + sp->global = 0; + if (sp->unsync) { + kvm_unlink_unsync_global(vcpu->kvm, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); } } @@ -1192,7 +1701,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, */ spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) - pte_access |= PT_ACCESSED_MASK; + spte |= shadow_accessed_mask; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (pte_access & ACC_EXEC_MASK) @@ -1203,40 +1712,106 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; + if (mt_mask) { + if (!kvm_is_mmio_pfn(pfn)) { + mt_mask = get_memory_type(vcpu, gfn) << + kvm_x86_ops->get_mt_mask_shift(); + mt_mask |= VMX_EPT_IGMT_BIT; + } else + mt_mask = MTRR_TYPE_UNCACHABLE << + kvm_x86_ops->get_mt_mask_shift(); + spte |= mt_mask; + } spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; + + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + ret = 1; + spte = shadow_trap_nonpresent_pte; + goto set_pte; + } spte |= PT_WRITABLE_MASK; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow || - (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. 
+ */ + if (!can_unsync && is_writeble_pte(*shadow_pte)) + goto set_pte; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); + ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { + if (is_writeble_pte(spte)) spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; } } if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); - pgprintk("%s: setting spte %llx\n", __func__, spte); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", - (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", - (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); +set_pte: set_shadow_pte(shadow_pte, spte); - if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) - && (spte & PT_PRESENT_MASK)) + return ret; +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, int largepage, int global, + gfn_t gfn, pfn_t pfn, bool speculative) +{ + int was_rmapped = 0; + int was_writeble = is_writeble_pte(*shadow_pte); + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __func__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + if (is_rmap_pte(*shadow_pte)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. 
+ */ + if (largepage && !is_large_pte(*shadow_pte)) { + struct kvm_mmu_page *child; + u64 pte = *shadow_pte; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, shadow_pte); + } else if (pfn != spte_to_pfn(*shadow_pte)) { + pgprintk("hfn old %lx new %lx\n", + spte_to_pfn(*shadow_pte), pfn); + rmap_remove(vcpu->kvm, shadow_pte); + } else { + if (largepage) + was_rmapped = is_large_pte(*shadow_pte); + else + was_rmapped = 1; + } + } + if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, + dirty, largepage, global, gfn, pfn, speculative, true)) { + if (write_fault) + *ptwrite = 1; + kvm_x86_ops->tlb_flush(vcpu); + } + + pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + is_large_pte(*shadow_pte)? "2MB" : "4kB", + is_present_pte(*shadow_pte)?"RW":"R", gfn, + *shadow_pte, shadow_pte); + if (!was_rmapped && is_large_pte(*shadow_pte)) ++vcpu->kvm->stat.lpages; page_header_update_slot(vcpu->kvm, shadow_pte, gfn); @@ -1260,67 +1835,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -struct direct_shadow_walk { - struct kvm_shadow_walk walker; - pfn_t pfn; - int write; - int largepage; - int pt_write; -}; - -static int direct_map_entry(struct kvm_shadow_walk *_walk, - struct kvm_vcpu *vcpu, - gva_t addr, u64 *sptep, int level) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) { - struct direct_shadow_walk *walk = - container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + int pt_write = 0; gfn_t pseudo_gfn; - gfn_t gfn = addr >> PAGE_SHIFT; - - if (level == PT_PAGE_TABLE_LEVEL - || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, - 0, walk->write, 1, &walk->pt_write, - walk->largepage, gfn, walk->pfn, false); - ++vcpu->stat.pf_fixed; - return 1; - } - if (*sptep == 
shadow_trap_nonpresent_pte) { - pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1, - 1, ACC_ALL, sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(walk->pfn); - return -ENOMEM; + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { + if (iterator.level == PT_PAGE_TABLE_LEVEL + || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, + largepage, 0, gfn, pfn, false); + ++vcpu->stat.pf_fixed; + break; } - set_shadow_pte(sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - return 0; -} + if (*iterator.sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, + iterator.level - 1, + 1, ACC_ALL, iterator.sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(pfn); + return -ENOMEM; + } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) -{ - int r; - struct direct_shadow_walk walker = { - .walker = { .entry = direct_map_entry, }, - .pfn = pfn, - .largepage = largepage, - .write = write, - .pt_write = 0, - }; - - r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT); - if (r < 0) - return r; - return walker.pt_write; + set_shadow_pte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); + } + } + return pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) @@ -1330,16 +1880,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - down_read(&current->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } 
mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(&current->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -1447,6 +1995,53 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +static void mmu_sync_global(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *sp, *n; + + list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) + kvm_sync_page(vcpu, sp); +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + +void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_global(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -1488,15 +2083,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - down_read(&current->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(&current->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return 1; @@ -1531,6 +2124,8 @@ static int 
nonpaging_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1578,6 +2173,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1599,6 +2196,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1618,6 +2217,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; @@ -1640,17 +2241,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { + int r; + ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); + r = paging64_init_context(vcpu); else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); + r = 
paging32E_init_context(vcpu); else - return paging32_init_context(vcpu); + r = paging32_init_context(vcpu); + + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + + return r; } static int init_kvm_mmu(struct kvm_vcpu *vcpu) @@ -1689,6 +2296,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); @@ -1809,15 +2417,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(&current->mm->mmap_sem); if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(&current->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); @@ -1840,7 +2446,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -1866,15 +2473,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; + if (guest_initiated) { + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { +
++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; @@ -1897,7 +2506,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); + if (kvm_mmu_zap_page(vcpu->kvm, sp)) + n = bucket->first; ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -2011,6 +2621,14 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + vcpu->arch.mmu.invlpg(vcpu, gva); + kvm_mmu_flush_tlb(vcpu); + ++vcpu->stat.invlpg; +} +EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_enable_tdp(void) { tdp_enabled = true; @@ -2097,11 +2715,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; - if (!test_bit(slot, &sp->slot_bitmap)) + if (!test_bit(slot, sp->slot_bitmap)) continue; pt = sp->spt; @@ -2110,6 +2729,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) @@ -2118,7 +2739,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + node = container_of(kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); @@ -2280,6 +2903,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { 
kvm_x86_ops->tlb_flush(vcpu); + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); return 1; } @@ -2505,8 +3129,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->role.metaphysical) continue; - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); gfn = unalias_gfn(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; if (*rmapp) printk(KERN_ERR "%s: (%s) shadow page has writable"