x86: preallocate and prepopulate separately
author Jeremy Fitzhardinge <jeremy@goop.org>
Wed, 25 Jun 2008 04:19:13 +0000 (00:19 -0400)
committer Ingo Molnar <mingo@elte.hu>
Tue, 8 Jul 2008 11:11:02 +0000 (13:11 +0200)
Jan Beulich points out that vmalloc_sync_all() assumes that the
kernel's pmd is always present in the pgd.  The current
pgd construction code will add the pgd to the pgd_list before its pmds
have been pre-populated, thereby making it visible to
vmalloc_sync_all().

However, because pgd_prepopulate_pmd also does the allocation, it may
block and cannot be done under spinlock.

The solution is to preallocate the pmds out of the spinlock, then
populate them while holding the pgd_list lock.

This patch also pulls the pmd preallocation and mop-up functions out
to be common, assuming that the compiler will generate no code for
them when PREALLOCATED_PMDS is 0.  Also, there's no need for pgd_ctor
to clear the pgd again, since it's allocated as a zeroed page.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/mm/pgtable.c

index 418c443..557b2ab 100644 (file)
@@ -66,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
 static void pgd_ctor(void *p)
 {
        pgd_t *pgd = p;
-       unsigned long flags;
-
-       /* Clear usermode parts of PGD */
-       memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
-       spin_lock_irqsave(&pgd_lock, flags);
 
        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -91,8 +85,6 @@ static void pgd_ctor(void *p)
        /* list required to sync kernel mapping updates */
        if (!SHARED_KERNEL_PMD)
                pgd_list_add(pgd);
-
-       spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 static void pgd_dtor(void *pgd)
@@ -120,6 +112,72 @@ static void pgd_dtor(void *pgd)
 
 #ifdef CONFIG_X86_PAE
 /*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS      UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+       paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+       /* Note: almost everything apart from _PAGE_PRESENT is
+          reserved at the pmd (PDPT) level. */
+       set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+       /*
+        * According to Intel App note "TLBs, Paging-Structure Caches,
+        * and Their Invalidation", April 2007, document 317080-001,
+        * section 8.1: in PAE mode we explicitly have to flush the
+        * TLB via cr3 if the top-level pgd is changed...
+        */
+       if (mm == current->active_mm)
+               write_cr3(read_cr3());
+}
+#else  /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS      0
+
+#endif /* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
+{
+       int i;
+
+       for(i = 0; i < PREALLOCATED_PMDS; i++)
+               if (pmds[i])
+                       free_page((unsigned long)pmds[i]);
+}
+
+static int preallocate_pmds(pmd_t *pmds[])
+{
+       int i;
+       bool failed = false;
+
+       for(i = 0; i < PREALLOCATED_PMDS; i++) {
+               pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+               if (pmd == NULL)
+                       failed = true;
+               pmds[i] = pmd;
+       }
+
+       if (failed) {
+               free_pmds(pmds);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/*
  * Mop up any pmd pages which may still be attached to the pgd.
  * Normally they will be freed by munmap/exit_mmap, but any pmd we
  * preallocate which never got a corresponding vma will need to be
@@ -129,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
        int i;
 
-       for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+       for(i = 0; i < PREALLOCATED_PMDS; i++) {
                pgd_t pgd = pgdp[i];
 
                if (pgd_val(pgd) != 0) {
@@ -143,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
        }
 }
 
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update.  Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
        pud_t *pud;
        unsigned long addr;
        int i;
 
        pud = pud_offset(pgd, 0);
-       for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-            i++, pud++, addr += PUD_SIZE) {
-               pmd_t *pmd = pmd_alloc_one(mm, addr);
 
-               if (!pmd) {
-                       pgd_mop_up_pmds(mm, pgd);
-                       return 0;
-               }
+       for (addr = i = 0; i < PREALLOCATED_PMDS;
+            i++, pud++, addr += PUD_SIZE) {
+               pmd_t *pmd = pmds[i];
 
                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -176,57 +219,47 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 
                pud_populate(mm, pud, pmd);
        }
-
-       return 1;
 }
 
-void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-       paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+       pgd_t *pgd;
+       pmd_t *pmds[PREALLOCATED_PMDS];
+       unsigned long flags;
 
-       /* Note: almost everything apart from _PAGE_PRESENT is
-          reserved at the pmd (PDPT) level. */
-       set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+       pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+       if (pgd == NULL)
+               goto out;
+
+       mm->pgd = pgd;
+
+       if (preallocate_pmds(pmds) != 0)
+               goto out_free_pgd;
+
+       if (paravirt_pgd_alloc(mm) != 0)
+               goto out_free_pmds;
 
        /*
-        * According to Intel App note "TLBs, Paging-Structure Caches,
-        * and Their Invalidation", April 2007, document 317080-001,
-        * section 8.1: in PAE mode we explicitly have to flush the
-        * TLB via cr3 if the top-level pgd is changed...
+        * Make sure that pre-populating the pmds is atomic with
+        * respect to anything walking the pgd_list, so that they
+        * never see a partially populated pgd.
         */
-       if (mm == current->active_mm)
-               write_cr3(read_cr3());
-}
-#else  /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-       return 1;
-}
-
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
-{
-}
-#endif /* CONFIG_X86_PAE */
+       spin_lock_irqsave(&pgd_lock, flags);
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-       pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+       pgd_ctor(pgd);
+       pgd_prepopulate_pmd(mm, pgd, pmds);
 
-       /* so that alloc_pmd can use it */
-       mm->pgd = pgd;
-       if (pgd) {
-               pgd_ctor(pgd);
-
-               if (paravirt_pgd_alloc(mm) != 0 ||
-                   !pgd_prepopulate_pmd(mm, pgd)) {
-                       pgd_dtor(pgd);
-                       free_page((unsigned long)pgd);
-                       pgd = NULL;
-               }
-       }
+       spin_unlock_irqrestore(&pgd_lock, flags);
 
        return pgd;
+
+out_free_pmds:
+       free_pmds(pmds);
+out_free_pgd:
+       free_page((unsigned long)pgd);
+out:
+       return NULL;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)