X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fpaging_tmpl.h;h=258e4591e1ca02246801fc380ae19472b2375b0a;hb=c2d0ee46e6e633a3c23ecbcb9b03ad731906cd79;hp=156fe10288ae330c6d9ac2d3e0b857a9d6c853b5;hpb=35149e2129fe34fc8cb5917e1ecf5156b0fa3415;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 156fe10..258e459 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -29,7 +29,6 @@ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 @@ -46,7 +45,6 @@ #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 @@ -91,14 +89,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, pt_element_t *table; struct page *page; - down_read(&current->mm->mmap_sem); page = gfn_to_page(kvm, table_gfn); - up_read(&current->mm->mmap_sem); table = kmap_atomic(page, KM_USER0); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); kvm_release_page_dirty(page); @@ -129,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; + int rsvd_fault = 0; pgprintk("%s: addr %lx\n", __func__, addr); walk: @@ -163,6 +158,10 @@ walk: if (!is_present_pte(pte)) goto not_present; + rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); + if (rsvd_fault) + goto access_error; + if (write_fault && !is_writeble_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto 
access_error; @@ -215,7 +214,6 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); walker->ptes[walker->level - 1] = pte; } @@ -239,6 +237,8 @@ err: walker->error_code |= PFERR_USER_MASK; if (fetch_fault) walker->error_code |= PFERR_FETCH_MASK; + if (rsvd_fault) + walker->error_code |= PFERR_RSVD_MASK; return 0; } @@ -263,95 +263,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pfn = vcpu->arch.update_pte.pfn; if (is_error_pfn(pfn)) return; + if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) + return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), - pfn, true); + gpte & PT_DIRTY_MASK, NULL, largepage, + gpte_to_gfn(gpte), pfn, true); } /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, + struct guest_walker *gw, int user_fault, int write_fault, int largepage, int *ptwrite, pfn_t pfn) { - hpa_t shadow_addr; + unsigned access = gw->pt_access; + struct kvm_mmu_page *shadow_page; + u64 spte, *sptep; + int direct; + gfn_t table_gfn; + int r; int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; + pt_element_t curr_pte; + struct kvm_shadow_walk_iterator iterator; - if (!is_present_pte(walker->ptes[walker->level - 1])) + if (!is_present_pte(gw->ptes[gw->level - 1])) return NULL; - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; - } - - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (level == 
PT_PAGE_TABLE_LEVEL) - break; - - if (largepage && level == PT_DIRECTORY_LEVEL) + for_each_shadow_entry(vcpu, addr, iterator) { + level = iterator.level; + sptep = iterator.sptep; + if (level == PT_PAGE_TABLE_LEVEL + || (largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, + gw->pte_access & access, + user_fault, write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + ptwrite, largepage, + gw->gfn, pfn, false); break; + } - if (is_shadow_present_pte(*shadow_ent) - && !is_large_pte(*shadow_ent)) { - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) continue; - } - if (is_large_pte(*shadow_ent)) - rmap_remove(vcpu->kvm, shadow_ent); + if (is_large_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) + if (level == PT_DIRECTORY_LEVEL + && gw->level == PT_DIRECTORY_LEVEL) { + direct = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; + direct = 0; + table_gfn = gw->table_gfn[level - 2]; } shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent); - if (!metaphysical) { - int r; - pt_element_t curr_pte; + direct, access, sptep); + if (!direct) { r = kvm_read_guest_atomic(vcpu->kvm, - walker->pte_gpa[level - 2], + gw->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) { + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_mmu_put_page(shadow_page, sptep); kvm_release_pfn_clean(pfn); - return NULL; + sptep = NULL; + break; } } - shadow_addr = __pa(shadow_page->spt); - 
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK + + spte = __pa(shadow_page->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - *shadow_ent = shadow_pte; + *sptep = spte; } - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, walker->gfn, pfn, false); - - return shadow_ent; + return sptep; } /* @@ -380,6 +374,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int r; pfn_t pfn; int largepage = 0; + unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); @@ -389,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return r; /* - * Look up the shadow pte for the faulting address. + * Look up the guest pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); @@ -404,7 +399,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - down_read(&current->mm->mmap_sem); if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); @@ -413,17 +407,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, largepage = 1; } } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); - up_read(&current->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { - pgprintk("gfn %x is mmio\n", walker.gfn); + pgprintk("gfn %lx is mmio\n", walker.gfn); kvm_release_pfn_clean(pfn); return 1; } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, largepage, &write_pt, pfn); @@ -439,6 +436,65 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; + +out_unlock: + 
spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; +} + +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct kvm_shadow_walk_iterator iterator; + pt_element_t gpte; + gpa_t pte_gpa = -1; + int level; + u64 *sptep; + int need_flush = 0; + + spin_lock(&vcpu->kvm->mmu_lock); + + for_each_shadow_entry(vcpu, gva, iterator) { + level = iterator.level; + sptep = iterator.sptep; + + /* FIXME: properly handle invlpg on large guest pages */ + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + pte_gpa = (sp->gfn << PAGE_SHIFT); + pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + need_flush = 1; + } + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + break; + } + + if (!is_shadow_present_pte(*sptep)) + break; + } + + if (need_flush) + kvm_flush_remote_tlbs(vcpu->kvm); + spin_unlock(&vcpu->kvm->mmu_lock); + + if (pte_gpa == -1) + return; + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + return; + if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { + if (mmu_topup_memory_caches(vcpu)) + return; + kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, + sizeof(pt_element_t), 0); + } } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -460,29 +516,85 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - int i, offset = 0, r = 0; - pt_element_t pt; + int i, j, offset, r; + pt_element_t pt[256 / sizeof(pt_element_t)]; + gpa_t pte_gpa; - if (sp->role.metaphysical + if (sp->role.direct || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { nonpaging_prefetch_page(vcpu, sp); return; } + pte_gpa = gfn_to_gpa(sp->gfn); + if (PTTYPE == 32) { + 
offset = sp->role.quadrant << PT64_LEVEL_BITS; + pte_gpa += offset * sizeof(pt_element_t); + } + + for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) { + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); + pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); + for (j = 0; j < ARRAY_SIZE(pt); ++j) + if (r || is_present_pte(pt[j])) + sp->spt[i+j] = shadow_trap_nonpresent_pte; + else + sp->spt[i+j] = shadow_notrap_nonpresent_pte; + } +} + +/* + * Using the cached information from sp->gfns is safe because: + * - The spte has a reference to the struct page, so the pfn for a given gfn + * can't change unless all sptes pointing to it are nuked first. + * - Alias changes zap the entire shadow cache. + */ +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int i, offset, nr_present; + + offset = nr_present = 0; + if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - gpa_t pte_gpa = gfn_to_gpa(sp->gfn); + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn = sp->gfns[i]; + + if (!is_shadow_present_pte(sp->spt[i])) + continue; + + pte_gpa = gfn_to_gpa(sp->gfn); pte_gpa += (i+offset) * sizeof(pt_element_t); - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, - sizeof(pt_element_t)); - if (r || is_present_pte(pt)) - sp->spt[i] = shadow_trap_nonpresent_pte; - else - sp->spt[i] = shadow_notrap_nonpresent_pte; + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -EINVAL; + + if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + !(gpte & PT_ACCESSED_MASK)) { + u64 nonpresent; + + rmap_remove(vcpu->kvm, &sp->spt[i]); + if (is_present_pte(gpte)) + nonpresent = shadow_trap_nonpresent_pte; + else + nonpresent = shadow_notrap_nonpresent_pte; + set_shadow_pte(&sp->spt[i], nonpresent); + continue; + } + + nr_present++; + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + 
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + is_dirty_pte(gpte), 0, gfn, + spte_to_pfn(sp->spt[i]), true, false); } + + return !nr_present; } #undef pt_element_t @@ -490,7 +602,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_LEVEL_BITS