X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fmmu.c;h=4c2585cab18984e7b22be9baa45de35138609761;hb=e58b0f9e0e2c17112e375a3f0ca1ef7e57730f68;hp=c28a36b4cbba0fc4be18ace782b9f98ce2509a38;hpb=7b52345e2c4c7333bf7eba8034ffc4683fa63c91;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c28a36b..4c2585c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -17,8 +17,8 @@ * */ -#include "vmx.h" #include "mmu.h" +#include "kvm_cache_regs.h" #include #include @@ -33,6 +33,7 @@ #include #include #include +#include /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -66,9 +67,13 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} #endif #if defined(MMU_DEBUG) || defined(AUDIT) -static int dbg = 1; +static int dbg = 0; +module_param(dbg, bool, 0644); #endif +static int oos_shadow = 1; +module_param(oos_shadow, bool, 0644); + #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -122,6 +127,7 @@ static int dbg = 1; #define PFERR_PRESENT_MASK (1U << 0) #define PFERR_WRITE_MASK (1U << 1) #define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) #define PT_DIRECTORY_LEVEL 2 @@ -134,18 +140,33 @@ static int dbg = 1; #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); -}; +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; + u64 *sptes[RMAP_EXT]; struct kvm_rmap_desc *more; }; +struct kvm_shadow_walk_iterator { + u64 addr; + hpa_t shadow_addr; + int level; + u64 *sptep; + unsigned index; +}; + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ + for (shadow_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + + +struct kvm_unsync_walk { + int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); +}; + +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -159,6 +180,11 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL << (e - s + 1)) - 1) << s; +} + void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { shadow_trap_nonpresent_pte = trap_pte; @@ -198,11 +224,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.shadow_efer & EFER_NX; } -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - static int is_shadow_present_pte(u64 pte) { return pte != shadow_trap_nonpresent_pte @@ -219,16 +240,25 @@ static int is_writeble_pte(unsigned long pte) return pte & PT_WRITABLE_MASK; } -static int is_dirty_pte(unsigned long pte) +static int is_dirty_gpte(unsigned long pte) { - return pte & shadow_dirty_mask; + return pte & PT_DIRTY_MASK; } -static int is_rmap_pte(u64 pte) +static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); } +static int is_last_spte(u64 pte, int level) +{ + if (level == PT_PAGE_TABLE_LEVEL) + return 1; + if (level == PT_DIRECTORY_LEVEL && is_large_pte(pte)) + return 1; + return 0; +} + static pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -241,7 +271,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) return (gpte & PT32_DIR_PSE36_MASK) << shift; } -static void set_shadow_pte(u64 *sptep, u64 spte) +static void __set_spte(u64 *sptep, u64 spte) { #ifdef CONFIG_X86_64 set_64bit((unsigned long *)sptep, spte); @@ -304,7 +334,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 1); + rmap_desc_cache, 4); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -331,7 +361,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, BUG_ON(!mc->nobjs); p = mc->objects[--mc->nobjs]; - memset(p, 0, size); return p; } @@ -374,25 +403,30 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count += 1; - WARN_ON(*write_count > KVM_PAGES_PER_HPAGE); } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count -= 1; WARN_ON(*write_count < 0); } static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + struct kvm_memory_slot *slot; int *largepage_idx; + gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot); return *largepage_idx; @@ -405,16 +439,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr; + int ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return 0; + return ret; + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); if (vma && is_vm_hugetlb_page(vma)) - return 1; + ret = 1; + up_read(¤t->mm->mmap_sem); - return 0; + return ret; } static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -462,16 +499,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc * containing more mappings. + * + * Returns the number of rmap entries before the spte was added or zero if + * the spte was not added. + * */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; - int i; + int i, count = 0; - if (!is_rmap_pte(*spte)) - return; + if (!is_rmap_spte(*spte)) + return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; @@ -482,22 +523,25 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) } else if (!(*rmapp & 1)) { rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)*rmapp; - desc->shadow_ptes[1] = spte; + desc->sptes[0] = (u64 *)*rmapp; + desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + while (desc->sptes[RMAP_EXT-1] && desc->more) { desc = desc->more; - if (desc->shadow_ptes[RMAP_EXT-1]) { + count += RMAP_EXT; + } + if (desc->sptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; } - for (i = 0; desc->shadow_ptes[i]; ++i) + for (i = 0; desc->sptes[i]; ++i) ; - desc->shadow_ptes[i] = spte; + desc->sptes[i] = spte; } + return count; } static void rmap_desc_remove_entry(unsigned long *rmapp, @@ -507,14 +551,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp, { int j; - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; + desc->sptes[i] = desc->sptes[j]; + desc->sptes[j] = NULL; if (j != 0) return; if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->shadow_ptes[0]; + *rmapp = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; @@ -532,7 +576,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) unsigned long *rmapp; int i; - if (!is_rmap_pte(*spte)) + if (!is_rmap_spte(*spte)) return; sp = page_header(__pa(spte)); pfn = spte_to_pfn(*spte); @@ -559,8 +603,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) + if (desc->sptes[i] == spte) { rmap_desc_remove_entry(rmapp, desc, i, prev_desc); @@ -591,17 +635,17 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) prev_desc = NULL; prev_spte = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { if (prev_spte == spte) - return desc->shadow_ptes[i]; - prev_spte = desc->shadow_ptes[i]; + return desc->sptes[i]; + prev_spte = desc->sptes[i]; } desc = desc->more; } return NULL; } -static void rmap_write_protect(struct kvm *kvm, u64 gfn) +static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; @@ -616,7 +660,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writeble_pte(*spte)) { - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -640,16 +684,109 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) if (is_writeble_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = NULL; write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) - kvm_flush_remote_tlbs(kvm); + return write_protected; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int need_tlb_flush = 0; + + while ((spte = rmap_next(kvm, rmapp, NULL))) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + rmap_remove(kvm, spte); + __set_spte(spte, shadow_trap_nonpresent_pte); + need_tlb_flush = 1; + } + return need_tlb_flush; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp)) +{ + int i; + int retval = 0; + + /* + * If mmap_sem isn't taken, we can look the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. + */ + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + retval |= handler(kvm, &memslot->rmap[gfn_offset]); + retval |= handler(kvm, + &memslot->lpage_info[ + gfn_offset / + KVM_PAGES_PER_HPAGE].rmap_pde); + } + } + + return retval; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int young = 0; + + /* always return old for EPT */ + if (!shadow_accessed_mask) + return 0; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + int _young; + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + _young = _spte & PT_ACCESSED_MASK; + if (_young) { + young = 1; + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + } + spte = rmap_next(kvm, rmapp, spte); + } + return young; +} + +#define RMAP_RECYCLE_THRESHOLD 1000 + +static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +{ + unsigned long *rmapp; + + gfn = unalias_gfn(vcpu->kvm, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + + kvm_unmap_rmapp(vcpu->kvm, rmapp); + kvm_flush_remote_tlbs(vcpu->kvm); +} - account_shadowed(kvm, gfn); +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); } #ifdef MMU_DEBUG @@ -659,7 +796,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *end; for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if (*pos != shadow_trap_nonpresent_pte) { + if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); return 0; @@ -693,8 +830,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; + INIT_LIST_HEAD(&sp->oos_link); + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; @@ -776,6 +913,176 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + } +} + +static void kvm_mmu_update_unsync_bitmap(u64 *spte) +{ + unsigned int index; + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + index = spte - sp->spt; + if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) + sp->unsync_children++; + WARN_ON(!sp->unsync_children); +} + +static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->parent_pte) + return; + + if (!sp->multimapped) { + kvm_mmu_update_unsync_bitmap(sp->parent_pte); + return; + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); + } +} + +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + kvm_mmu_update_parents_unsync(sp); + return 1; +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + +#define KVM_PAGE_ARRAY_NR 16 + +struct kvm_mmu_pages { + struct mmu_page_and_offset { + struct kvm_mmu_page *sp; + unsigned int idx; + } page[KVM_PAGE_ARRAY_NR]; + unsigned int nr; +}; + +#define for_each_unsync_children(bitmap, idx) \ + for (idx = find_first_bit(bitmap, 512); \ + idx < 512; \ + idx = find_next_bit(bitmap, 512, idx+1)) + +static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) +{ + int i; + + if (sp->unsync) + for (i=0; i < pvec->nr; i++) + if (pvec->page[i].sp == sp) + return 0; + + pvec->page[pvec->nr].sp = sp; + pvec->page[pvec->nr].idx = idx; + pvec->nr++; + return (pvec->nr == KVM_PAGE_ARRAY_NR); +} + +static int __mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + int i, ret, nr_unsync_leaf = 0; + + for_each_unsync_children(sp->unsync_child_bitmap, i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + __clear_bit(i, sp->unsync_child_bitmap); + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } + + if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } + } + } + + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) + sp->unsync_children = 0; + + return nr_unsync_leaf; +} + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + if (!sp->unsync_children) + return 0; + + mmu_pages_add(pvec, sp, 0); + return __mmu_unsync_walk(sp, pvec); +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -787,7 +1094,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: found role %x\n", __func__, sp->role.word); @@ -796,11 +1103,125 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + WARN_ON(!sp->unsync); + sp->unsync = 0; + --kvm->stat.mmu_unsync; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + if (sp->role.glevels != vcpu->arch.mmu.root_level) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + kvm_mmu_flush_tlb(vcpu); + return 0; +} + +struct mmu_page_path { + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; + unsigned int idx[PT64_ROOT_LEVEL-1]; +}; + +#define for_each_sp(pvec, sp, parents, i) \ + for (i = mmu_pages_next(&pvec, &parents, -1), \ + sp = pvec.page[i].sp; \ + i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i = mmu_pages_next(&pvec, &parents, i)) + +static int mmu_pages_next(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents, + int i) +{ + int n; + + for (n = i+1; n < pvec->nr; n++) { + struct kvm_mmu_page *sp = pvec->page[n].sp; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + parents->idx[0] = pvec->page[n].idx; + return n; + } + + parents->parent[sp->role.level-2] = sp; + parents->idx[sp->role.level-1] = pvec->page[n].idx; + } + + return n; +} + +static void mmu_pages_clear_parents(struct mmu_page_path *parents) +{ + struct kvm_mmu_page *sp; + unsigned int level = 0; + + do { + unsigned int idx = parents->idx[level]; + + sp = parents->parent[level]; + if (!sp) + return; + + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); + level++; + } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); +} + +static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, + struct mmu_page_path *parents, + struct kvm_mmu_pages *pvec) +{ + parents->parent[parent->role.level-1] = NULL; + pvec->nr = 0; +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent) +{ + int i; + struct kvm_mmu_page *sp; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + int protected = 0; + + for_each_sp(pages, sp, parents, i) + protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + + if (protected) + kvm_flush_remote_tlbs(vcpu->kvm); + + for_each_sp(pages, sp, parents, i) { + kvm_sync_page(vcpu, sp); + mmu_pages_clear_parents(&parents); + } + cond_resched_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_pages_init(parent, &parents, &pages); + } +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, unsigned level, - int metaphysical, + int direct, unsigned access, u64 *parent_pte) { @@ -809,12 +1230,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; + struct hlist_node *node, *tmp; - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; + role = vcpu->arch.mmu.base_role; role.level = level; - role.metaphysical = metaphysical; + role.direct = direct; role.access = access; if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); @@ -825,9 +1245,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } pgprintk("%s: found\n", __func__); return sp; } @@ -839,12 +1270,49 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) - rmap_write_protect(vcpu->kvm, gfn); - vcpu->arch.mmu.prefetch_page(vcpu, sp); + if (!direct) { + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + account_shadowed(vcpu->kvm, gfn); + } + if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) + vcpu->arch.mmu.prefetch_page(vcpu, sp); + else + nonpaging_prefetch_page(vcpu, sp); return sp; } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + iterator->addr = addr; + iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->level = vcpu->arch.mmu.shadow_root_level; + if (iterator->level == PT32E_ROOT_LEVEL) { + iterator->shadow_addr + = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + --iterator->level; + if (!iterator->shadow_addr) + iterator->level = 0; + } +} + +static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) +{ + if (iterator->level < PT_PAGE_TABLE_LEVEL) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; + return true; +} + +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; + --iterator->level; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -854,32 +1322,22 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, pt = sp->spt; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (is_shadow_present_pte(pt[i])) - rmap_remove(kvm, &pt[i]); - pt[i] = shadow_trap_nonpresent_pte; - } - kvm_flush_remote_tlbs(kvm); - return; - } - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { ent = pt[i]; if (is_shadow_present_pte(ent)) { - if (!is_large_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(page_header(ent), &pt[i]); } else { - --kvm->stat.lpages; + if (is_large_pte(ent)) + --kvm->stat.lpages; rmap_remove(kvm, &pt[i]); } } pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); } static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) @@ -890,17 +1348,16 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) { int i; + struct kvm_vcpu *vcpu; - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm->vcpus[i]->arch.last_pte_updated = NULL; + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.last_pte_updated = NULL; } -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; - ++kvm->stat.mmu_shadow_zapped; while (sp->multimapped || sp->parent_pte) { if (!sp->multimapped) parent_pte = sp->parent_pte; @@ -913,20 +1370,57 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) } BUG_ON(!parent_pte); kvm_mmu_put_page(sp, parent_pte); - set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); + __set_spte(parent_pte, shadow_trap_nonpresent_pte); + } +} + +static int mmu_zap_unsync_children(struct kvm *kvm, + struct kvm_mmu_page *parent) +{ + int i, zapped = 0; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + if (parent->role.level == PT_PAGE_TABLE_LEVEL) + return 0; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + struct kvm_mmu_page *sp; + + for_each_sp(pages, sp, parents, i) { + kvm_mmu_zap_page(kvm, sp); + mmu_pages_clear_parents(&parents); + } + zapped += pages.nr; + kvm_mmu_pages_init(parent, &parents, &pages); } + + return zapped; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + int ret; + ++kvm->stat.mmu_shadow_zapped; + ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); + kvm_mmu_unlink_parents(kvm, sp); + kvm_flush_remote_tlbs(kvm); + if (!sp->role.invalid && !sp->role.direct) + unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) + kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { - if (!sp->role.metaphysical) - unaccount_shadowed(kvm, sp->gfn); hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); sp->role.invalid = 1; + list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); + return ret; } /* @@ -935,24 +1429,25 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { + int used_pages; + + used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = max(0, used_pages); + /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { + if (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; + used_pages--; } kvm->arch.n_free_mmu_pages = 0; } @@ -976,86 +1471,227 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { + if (sp->gfn == gfn && !sp->role.direct) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + n = bucket->first; } return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { + unsigned index; + struct hlist_head *bucket; struct kvm_mmu_page *sp; + struct hlist_node *node, *nn; + + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { + if (sp->gfn == gfn && !sp->role.direct + && !sp->role.invalid) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } + } +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + struct kvm_mmu_page *sp = page_header(__pa(pte)); + + __set_bit(slot, sp->slot_bitmap); +} + +static void mmu_convert_notrap(struct kvm_mmu_page *sp) +{ + int i; + u64 *pt = sp->spt; + + if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] == shadow_notrap_nonpresent_pte) + __set_spte(&pt[i], shadow_trap_nonpresent_pte); + } +} + +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct page *page; + + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + + return page; +} + +/* + * The function is based on mtrr_type_lookup() in + * arch/x86/kernel/cpu/mtrr/generic.c + */ +static int get_mtrr_type(struct mtrr_state_type *mtrr_state, + u64 start, u64 end) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + int num_var_ranges = KVM_NR_VAR_MTRR; + + if (!mtrr_state->enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. Just return the type as per start */ + if (mtrr_state->have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state->fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!(mtrr_state->enabled & 2)) + return mtrr_state->def_type; + + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + + (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + + (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) + return MTRR_TYPE_UNCACHABLE; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state->def_type; +} + +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u8 mtrr; + + mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, + (gfn << PAGE_SHIFT) + PAGE_SIZE); + if (mtrr == 0xfe || mtrr == 0xff) + mtrr = MTRR_TYPE_WRBACK; + return mtrr; +} +EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); + +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + + index = kvm_page_table_hashfn(sp->gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != sp->gfn || s->role.direct) + continue; + if (s->role.word != sp->role.word) + return 1; } -} + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); - struct kvm_mmu_page *sp = page_header(__pa(pte)); + kvm_mmu_mark_parents_unsync(vcpu, sp); - __set_bit(slot, &sp->slot_bitmap); + mmu_convert_notrap(sp); + return 0; } -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { - struct page *page; - - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - - if (gpa == UNMAPPED_GVA) - return NULL; - - down_read(¤t->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); + struct kvm_mmu_page *shadow; - return page; + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + if (shadow->unsync) + return 0; + if (can_unsync && oos_shadow) + return kvm_unsync_page(vcpu, shadow); + return 1; + } + return 0; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) +static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned pte_access, int user_fault, + int write_fault, int dirty, int largepage, + gfn_t gfn, pfn_t pfn, bool speculative, + bool can_unsync) { u64 spte; - int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); - - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); - - if (is_rmap_pte(*shadow_pte)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. - */ - if (largepage && !is_large_pte(*shadow_pte)) { - struct kvm_mmu_page *child; - u64 pte = *shadow_pte; - - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { - pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } - } + int ret = 0; /* * We don't set the accessed bit, since we sometimes want to see @@ -1064,7 +1700,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, */ spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) - pte_access |= PT_ACCESSED_MASK; + spte |= shadow_accessed_mask; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (pte_access & ACC_EXEC_MASK) @@ -1075,61 +1711,115 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) + spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; - spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + ret = 1; + spte = shadow_trap_nonpresent_pte; + goto set_pte; } - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow || - (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { + spte |= PT_WRITABLE_MASK; + + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writeble_pte(*sptep)) + goto set_pte; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); + ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { + if (is_writeble_pte(spte)) spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; } } -unshadowed: - if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); - pgprintk("%s: setting spte %llx\n", __func__, spte); - pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", - (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", - (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); - set_shadow_pte(shadow_pte, spte); - if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) - && (spte & PT_PRESENT_MASK)) +set_pte: + __set_spte(sptep, spte); + return ret; +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) +{ + int was_rmapped = 0; + int was_writeble = is_writeble_pte(*sptep); + int rmap_count; + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __func__, *sptep, pt_access, + write_fault, user_fault, gfn); + + if (is_rmap_spte(*sptep)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. + */ + if (largepage && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + u64 pte = *sptep; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, sptep); + } else if (pfn != spte_to_pfn(*sptep)) { + pgprintk("hfn old %lx new %lx\n", + spte_to_pfn(*sptep), pfn); + rmap_remove(vcpu->kvm, sptep); + } else + was_rmapped = 1; + } + if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, + dirty, largepage, gfn, pfn, speculative, true)) { + if (write_fault) + *ptwrite = 1; + kvm_x86_ops->tlb_flush(vcpu); + } + + pgprintk("%s: setting spte %llx\n", __func__, *sptep); + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + is_large_pte(*sptep)? "2MB" : "4kB", + is_present_pte(*sptep)?"RW":"R", gfn, + *shadow_pte, sptep); + if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; - page_header_update_slot(vcpu->kvm, shadow_pte, gfn); + page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn, largepage); - if (!is_rmap_pte(*shadow_pte)) + rmap_count = rmap_add(vcpu, sptep, gfn, largepage); + if (!is_rmap_spte(*sptep)) kvm_release_pfn_clean(pfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, gfn, largepage); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); } - if (!ptwrite || !*ptwrite) - vcpu->arch.last_pte_updated = shadow_pte; + if (speculative) { + vcpu->arch.last_pte_updated = sptep; + vcpu->arch.last_pte_gfn = gfn; + } } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -1137,51 +1827,41 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn, - int level) + int largepage, gfn_t gfn, pfn_t pfn) { - hpa_t table_addr = vcpu->arch.mmu.root_hpa; + struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; int pt_write = 0; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; - - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); - - if (level == 1) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 0, gfn, pfn, false); - return pt_write; - } - - if (largepage && level == 2) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 1, gfn, pfn, false); - return pt_write; + gfn_t pseudo_gfn; + + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { + if (iterator.level == PT_PAGE_TABLE_LEVEL + || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, + largepage, gfn, pfn, false); + ++vcpu->stat.pf_fixed; + break; } - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index]); - if (!new_table) { + if (*iterator.sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, + iterator.level - 1, + 1, ACC_ALL, iterator.sptep); + if (!sp) { pgprintk("nonpaging_map: ENOMEM\n"); kvm_release_pfn_clean(pfn); return -ENOMEM; } - table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | shadow_user_mask; + __set_spte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } - table_addr = table[index] & PT64_BASE_ADDR_MASK; } + return pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) @@ -1189,15 +1869,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int r; int largepage = 0; pfn_t pfn; + unsigned long mmu_seq; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -1206,25 +1887,22 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn, - PT32E_ROOT_LEVEL); + r = __direct_map(vcpu, v, write, largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; -} - - -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = shadow_trap_nonpresent_pte; +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } + static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; @@ -1233,7 +1911,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); -#ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1245,7 +1922,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); return; } -#endif for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1262,54 +1938,102 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; } -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) +{ + int ret = 0; + + if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + ret = 1; + } + + return ret; +} + +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; struct kvm_mmu_page *sp; - int metaphysical = 0; + int direct = 0; + u64 pdptr; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; -#ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); if (tdp_enabled) - metaphysical = 1; + direct = 1; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, metaphysical, + PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; - return; + return 0; } -#endif - metaphysical = !is_paging(vcpu); + direct = !is_paging(vcpu); if (tdp_enabled) - metaphysical = 1; + direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { + pdptr = kvm_pdptr_read(vcpu, i); + if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + root_gfn = pdptr >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, metaphysical, + PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + return 0; +} + +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -1344,6 +2068,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, int r; int largepage = 0; gfn_t gfn = gpa >> PAGE_SHIFT; + unsigned long mmu_seq; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1352,24 +2077,31 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return 1; } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); + largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } static void nonpaging_free(struct kvm_vcpu *vcpu) @@ -1386,6 +2118,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1416,6 +2150,14 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } +static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -1424,6 +2166,59 @@ static void paging_free(struct kvm_vcpu *vcpu) #include "paging_tmpl.h" #undef PTTYPE +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 exb_bit_rsvd = 0; + + if (!is_nx(vcpu)) + exb_bit_rsvd = rsvd_bits(63, 63); + switch (level) { + case PT32_ROOT_LEVEL: + /* no rsvd bits for 2 level 4K page table entries */ + context->rsvd_bits_mask[0][1] = 0; + context->rsvd_bits_mask[0][0] = 0; + if (is_cpuid_PSE36()) + /* 36bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + else + /* 32 bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT32E_ROOT_LEVEL: + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 63) | + rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PDE */ + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PTE */ + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT64_ROOT_LEVEL: + context->rsvd_bits_mask[0][3] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + } +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -1433,6 +2228,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1442,6 +2239,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) static int paging64_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); } @@ -1449,11 +2247,14 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1462,6 +2263,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) static int paging32E_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } @@ -1473,6 +2275,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; @@ -1480,12 +2284,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -1495,17 +2302,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { + int r; + ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); + r = paging64_init_context(vcpu); else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu); else - return paging32_init_context(vcpu); + r = paging32_init_context(vcpu); + + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + + return r; } static int init_kvm_mmu(struct kvm_vcpu *vcpu) @@ -1543,8 +2356,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) goto out; spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); + r = mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); + if (r) + goto out; kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: @@ -1566,15 +2382,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL || - is_large_pte(pte)) + if (is_last_spte(pte, sp->role.level)) rmap_remove(vcpu->kvm, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); } } - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); if (is_large_pte(pte)) --vcpu->kvm->stat.lpages; } @@ -1584,11 +2399,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, u64 *spte, const void *new) { - if ((sp->role.level != PT_PAGE_TABLE_LEVEL) - && !vcpu->arch.update_pte.largepage) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (!vcpu->arch.update_pte.largepage || + sp->role.glevels == PT32_ROOT_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + } ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) @@ -1658,17 +2475,17 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if ((bytes == 4) && (gpa % 4 == 0)) memcpy((void *)&gpte, new, 4); } - if (!is_present_pte(gpte)) + if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(¤t->mm->mmap_sem); if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } + vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); @@ -1678,8 +2495,21 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.update_pte.pfn = pfn; } +static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + if (spte + && vcpu->arch.last_pte_gfn == gfn + && shadow_accessed_mask + && !(*spte & shadow_accessed_mask) + && is_shadow_present_pte(*spte)) + set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); +} + void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -1701,23 +2531,26 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; + if (guest_initiated) { + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) + if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -1735,7 +2568,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); + if (kvm_mmu_zap_page(vcpu->kvm, sp)) + n = bucket->first; ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -1798,6 +2632,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { @@ -1838,8 +2673,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) ++vcpu->stat.mmio_exits; return 0; case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - return 1; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + return 0; default: BUG(); } @@ -1848,21 +2684,28 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + vcpu->arch.mmu.invlpg(vcpu, gva); + kvm_mmu_flush_tlb(vcpu); + ++vcpu->stat.invlpg; +} +EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_enable_tdp(void) { tdp_enabled = true; } EXPORT_SYMBOL_GPL(kvm_enable_tdp); -static void free_mmu_pages(struct kvm_vcpu *vcpu) +void kvm_disable_tdp(void) { - struct kvm_mmu_page *sp; + tdp_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_tdp); - while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - } +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ free_page((unsigned long)vcpu->arch.mmu.pae_root); } @@ -1931,7 +2774,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) int i; u64 *pt; - if (!test_bit(slot, &sp->slot_bitmap)) + if (!test_bit(slot, sp->slot_bitmap)) continue; pt = sp->spt; @@ -1940,6 +2783,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + kvm_flush_remote_tlbs(kvm); } void kvm_mmu_zap_all(struct kvm *kvm) @@ -1948,13 +2792,15 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + node = container_of(kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) +static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) { struct kvm_mmu_page *page; @@ -1974,6 +2820,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages; + if (!down_read_trylock(&kvm->slots_lock)) + continue; spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -1986,6 +2834,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); + up_read(&kvm->slots_lock); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -2000,7 +2849,7 @@ static struct shrinker mmu_shrinker = { .seeks = DEFAULT_SEEKS * 10, }; -void mmu_destroy_caches(void) +static void mmu_destroy_caches(void) { if (pte_chain_cache) kmem_cache_destroy(pte_chain_cache); @@ -2106,7 +2955,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_x86_ops->tlb_flush(vcpu); + kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } @@ -2160,18 +3009,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret) { int r; - struct kvm_pv_mmu_op_buffer buffer; + struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; - buffer.ptr = buffer.buf; - buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); - buffer.processed = 0; + buffer->ptr = buffer->buf; + buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); + buffer->processed = 0; - r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); + r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); if (r) goto out; - while (buffer.len) { - r = kvm_pv_mmu_op_one(vcpu, &buffer); + while (buffer->len) { + r = kvm_pv_mmu_op_one(vcpu, buffer); if (r < 0) goto out; if (r == 0) @@ -2180,7 +3029,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, r = 1; out: - *ret = buffer.processed; + *ret = buffer->processed; return r; } @@ -2196,6 +3045,55 @@ static gva_t canonicalize(gva_t gva) return gva; } + +typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *sptep); + +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, + inspect_spte_fn fn) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + if (sp->role.level > 1 && !is_large_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(kvm, child, fn); + } + if (sp->role.level == 1) + fn(kvm, sp, &sp->spt[i]); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + } + } + return; +} + static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, gva_t va, int level) { @@ -2216,11 +3114,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, " in nonleaf level: levels %d gva %lx" " level %d pte %llx\n", audit_msg, vcpu->arch.mmu.root_level, va, level, ent); - - audit_mappings_page(vcpu, ent, va, level - 1); + else + audit_mappings_page(vcpu, ent, va, level - 1); } else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; + gfn_t gfn = gpa >> PAGE_SHIFT; + pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); + hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) @@ -2275,7 +3175,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); while (d) { for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) + if (d->sptes[k]) ++nmaps; else break; @@ -2286,9 +3186,47 @@ static int count_rmaps(struct kvm_vcpu *vcpu) return nmaps; } -static int count_writable_mappings(struct kvm_vcpu *vcpu) +void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + if (*sptep & PT_WRITABLE_MASK) { + rev_sp = page_header(__pa(sptep)); + gfn = rev_sp->gfns[sptep - rev_sp->spt]; + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn %ld\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", + audit_msg, sptep - rev_sp->spt, + rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 0); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } + } + +} + +void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, inspect_spte_has_rmap); +} + +static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) { - int nmaps = 0; struct kvm_mmu_page *sp; int i; @@ -2305,20 +3243,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) continue; if (!(ent & PT_WRITABLE_MASK)) continue; - ++nmaps; + inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); } } - return nmaps; + return; } static void audit_rmap(struct kvm_vcpu *vcpu) { - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __func__, audit_msg, n_rmap, n_actual); + check_writable_mappings_rmap(vcpu); + count_rmaps(vcpu); } static void audit_write_protection(struct kvm_vcpu *vcpu) @@ -2326,20 +3260,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; struct kvm_memory_slot *slot; unsigned long *rmapp; + u64 *spte; gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.metaphysical) + if (sp->role.direct) + continue; + if (sp->unsync) continue; - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); gfn = unalias_gfn(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; - if (*rmapp) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", + + spte = rmap_next(vcpu->kvm, rmapp, NULL); + while (spte) { + if (*spte & PT_WRITABLE_MASK) + printk(KERN_ERR "%s: (%s) shadow page has " + "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); + spte = rmap_next(vcpu->kvm, rmapp, spte); + } } } @@ -2352,6 +3294,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) audit_rmap(vcpu); audit_write_protection(vcpu); audit_mappings(vcpu); + audit_writable_sptes_have_rmaps(vcpu); dbg = olddbg; }