KVM: MMU: Add tracepoint for guest page aging
[safe/jmp/linux-2.6] / arch / x86 / kvm / mmu.c
index 4f5508c..7397932 100644 (file)
@@ -18,6 +18,7 @@
  */
 
 #include "mmu.h"
+#include "x86.h"
 #include "kvm_cache_regs.h"
 
 #include <linux/kvm_host.h>
@@ -150,6 +151,9 @@ module_param(oos_shadow, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+#include <trace/events/kvm.h>
+
+#undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
@@ -226,7 +230,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.cr0 & X86_CR0_WP;
+       return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
 static int is_cpuid_PSE36(void)
@@ -236,7 +240,7 @@ static int is_cpuid_PSE36(void)
 
 static int is_nx(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.shadow_efer & EFER_NX;
+       return vcpu->arch.efer & EFER_NX;
 }
 
 static int is_shadow_present_pte(u64 pte)
@@ -250,7 +254,7 @@ static int is_large_pte(u64 pte)
        return pte & PT_PAGE_SIZE_MASK;
 }
 
-static int is_writeble_pte(unsigned long pte)
+static int is_writable_pte(unsigned long pte)
 {
        return pte & PT_WRITABLE_MASK;
 }
@@ -467,24 +471,10 @@ static int has_wrprotected_page(struct kvm *kvm,
 
 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 {
-       unsigned long page_size = PAGE_SIZE;
-       struct vm_area_struct *vma;
-       unsigned long addr;
+       unsigned long page_size;
        int i, ret = 0;
 
-       addr = gfn_to_hva(kvm, gfn);
-       if (kvm_is_error_hva(addr))
-               return PT_PAGE_TABLE_LEVEL;
-
-       down_read(&current->mm->mmap_sem);
-       vma = find_vma(current->mm, addr);
-       if (!vma)
-               goto out;
-
-       page_size = vma_kernel_pagesize(vma);
-
-out:
-       up_read(&current->mm->mmap_sem);
+       page_size = kvm_host_page_size(kvm, gfn);
 
        for (i = PT_PAGE_TABLE_LEVEL;
             i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -632,7 +622,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
        pfn = spte_to_pfn(*spte);
        if (*spte & shadow_accessed_mask)
                kvm_set_pfn_accessed(pfn);
-       if (is_writeble_pte(*spte))
+       if (is_writable_pte(*spte))
                kvm_set_pfn_dirty(pfn);
        rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
        if (!*rmapp) {
@@ -708,7 +698,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                BUG_ON(!spte);
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-               if (is_writeble_pte(*spte)) {
+               if (is_writable_pte(*spte)) {
                        __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
                        write_protected = 1;
                }
@@ -732,7 +722,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                        BUG_ON(!(*spte & PT_PRESENT_MASK));
                        BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
                        pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
-                       if (is_writeble_pte(*spte)) {
+                       if (is_writable_pte(*spte)) {
                                rmap_remove(kvm, spte);
                                --kvm->stat.lpages;
                                __set_spte(spte, shadow_trap_nonpresent_pte);
@@ -787,7 +777,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
                        new_spte &= ~PT_WRITABLE_MASK;
                        new_spte &= ~SPTE_HOST_WRITEABLE;
-                       if (is_writeble_pte(*spte))
+                       if (is_writable_pte(*spte))
                                kvm_set_pfn_dirty(spte_to_pfn(*spte));
                        __set_spte(spte, new_spte);
                        spte = rmap_next(kvm, rmapp, spte);
@@ -805,6 +795,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                                         unsigned long data))
 {
        int i, j;
+       int ret;
        int retval = 0;
        struct kvm_memslots *slots;
 
@@ -819,16 +810,17 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                if (hva >= start && hva < end) {
                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 
-                       retval |= handler(kvm, &memslot->rmap[gfn_offset],
-                                         data);
+                       ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 
                        for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
                                int idx = gfn_offset;
                                idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
-                               retval |= handler(kvm,
+                               ret |= handler(kvm,
                                        &memslot->lpage_info[j][idx].rmap_pde,
                                        data);
                        }
+                       trace_kvm_age_page(hva, memslot, ret);
+                       retval |= ret;
                }
        }
 
@@ -851,9 +843,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
        u64 *spte;
        int young = 0;
 
-       /* always return old for EPT */
+       /*
+        * Emulate the accessed bit for EPT, by checking if this page has
+        * an EPT mapping, and clearing it if it does. On the next access,
+        * a new EPT mapping will be established.
+        * This has some overhead, but not as much as the cost of swapping
+        * out actively used pages or breaking up actively used hugepages.
+        */
        if (!shadow_accessed_mask)
-               return 0;
+               return kvm_unmap_rmapp(kvm, rmapp, data);
 
        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
@@ -1847,7 +1845,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 * is responsibility of mmu_get_page / kvm_sync_page.
                 * Same reasoning can be applied to dirty page accounting.
                 */
-               if (!can_unsync && is_writeble_pte(*sptep))
+               if (!can_unsync && is_writable_pte(*sptep))
                        goto set_pte;
 
                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1855,7 +1853,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                                 __func__, gfn);
                        ret = 1;
                        pte_access &= ~ACC_WRITE_MASK;
-                       if (is_writeble_pte(spte))
+                       if (is_writable_pte(spte))
                                spte &= ~PT_WRITABLE_MASK;
                }
        }
@@ -1876,7 +1874,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         bool reset_host_protection)
 {
        int was_rmapped = 0;
-       int was_writeble = is_writeble_pte(*sptep);
+       int was_writable = is_writable_pte(*sptep);
        int rmap_count;
 
        pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1927,7 +1925,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                if (rmap_count > RMAP_RECYCLE_THRESHOLD)
                        rmap_recycle(vcpu, sptep, gfn);
        } else {
-               if (was_writeble)
+               if (was_writable)
                        kvm_release_pfn_dirty(pfn);
                else
                        kvm_release_pfn_clean(pfn);
@@ -2842,16 +2840,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
         */
        page = alloc_page(GFP_KERNEL | __GFP_DMA32);
        if (!page)
-               goto error_1;
+               return -ENOMEM;
+
        vcpu->arch.mmu.pae_root = page_address(page);
        for (i = 0; i < 4; ++i)
                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 
        return 0;
-
-error_1:
-       free_mmu_pages(vcpu);
-       return -ENOMEM;
 }
 
 int kvm_mmu_create(struct kvm_vcpu *vcpu)