[PATCH] mm: use symbolic names instead of indices for zone initialisation

[safe/jmp/linux-2.6] / arch / powerpc / mm / hugetlbpage.c
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c

index 0073a04..5615acc 100644 (file)
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -30,13 +30,66 @@
  #define NUM_LOW_AREAS  (0x100000000UL >> SID_SHIFT)
  #define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
  
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE     (PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE     (PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE       (1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE     (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+
+#define HUGEPD_SHIFT           (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE            (1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK            (~(HUGEPD_SIZE-1))
+
+#define huge_pgtable_cache     (pgtable_cache[HUGEPTE_CACHE_NUM])
+
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK      0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)       ((hpd).pd == 0)
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+       BUG_ON(!(hpd.pd & HUGEPD_OK));
+       return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+       unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+       pte_t *dir = hugepd_page(*hpdp);
+
+       return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+                          unsigned long address)
+{
+       pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+                                     GFP_KERNEL|__GFP_REPEAT);
+
+       if (! new)
+               return -ENOMEM;
+
+       spin_lock(&mm->page_table_lock);
+       if (!hugepd_none(*hpdp))
+               kmem_cache_free(huge_pgtable_cache, new);
+       else
+               hpdp->pd = (unsigned long)new | HUGEPD_OK;
+       spin_unlock(&mm->page_table_lock);
+       return 0;
+}
+
  /* Modelled after find_linux_pte() */
  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  {
         pgd_t *pg;
         pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
  
         BUG_ON(! in_hugepage_area(mm->context, addr));
  
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
         if (!pgd_none(*pg)) {
                 pu = pud_offset(pg, addr);
                 if (!pud_none(*pu)) {
-                       pm = pmd_offset(pu, addr);
  #ifdef CONFIG_PPC_64K_PAGES
-                       /* Currently, we use the normal PTE offset within full
-                        * size PTE pages, thus our huge PTEs are scattered in
-                        * the PTE page and we do waste some. We may change
-                        * that in the future, but the current mecanism keeps
-                        * things much simpler
-                        */
-                       if (!pmd_none(*pm)) {
-                               /* Note: pte_offset_* are all equivalent on
-                                * ppc64 as we don't have HIGHMEM
-                                */
-                               pt = pte_offset_kernel(pm, addr);
-                               return pt;
-                       }
-#else /* CONFIG_PPC_64K_PAGES */
-                       /* On 4k pages, we put huge PTEs in the PMD page */
-                       pt = (pte_t *)pm;
-                       return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
+                       pmd_t *pm;
+                       pm = pmd_offset(pu, addr);
+                       if (!pmd_none(*pm))
+                               return hugepte_offset((hugepd_t *)pm, addr);
+#else
+                       return hugepte_offset((hugepd_t *)pu, addr);
+#endif
                 }
         }
  
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
  {
         pgd_t *pg;
         pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
+       hugepd_t *hpdp = NULL;
  
         BUG_ON(! in_hugepage_area(mm->context, addr));
  
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
         pu = pud_alloc(mm, pg, addr);
  
         if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+               pmd_t *pm;
                 pm = pmd_alloc(mm, pu, addr);
-               if (pm) {
+               if (pm)
+                       hpdp = (hugepd_t *)pm;
+#else
+               hpdp = (hugepd_t *)pu;
+#endif
+       }
+
+       if (! hpdp)
+               return NULL;
+
+       if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+               return NULL;
+
+       return hugepte_offset(hpdp, addr);
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+       pte_t *hugepte = hugepd_page(*hpdp);
+
+       hpdp->pd = 0;
+       tlb->need_flush = 1;
+       pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+                                                PGF_CACHENUM_MASK));
+}
+
  #ifdef CONFIG_PPC_64K_PAGES
-                       /* See comment in huge_pte_offset. Note that if we ever
-                        * want to put the page size in the PMD, we would have
-                        * to open code our own pte_alloc* function in order
-                        * to populate and set the size atomically
-                        */
-                       pt = pte_alloc_map(mm, pm, addr);
-#else /* CONFIG_PPC_64K_PAGES */
-                       pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
-                       return pt;
-               }
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+                                  unsigned long addr, unsigned long end,
+                                  unsigned long floor, unsigned long ceiling)
+{
+       pmd_t *pmd;
+       unsigned long next;
+       unsigned long start;
+
+       start = addr;
+       pmd = pmd_offset(pud, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+               if (pmd_none(*pmd))
+                       continue;
+               free_hugepte_range(tlb, (hugepd_t *)pmd);
+       } while (pmd++, addr = next, addr != end);
+
+       start &= PUD_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PUD_MASK;
+               if (!ceiling)
+                       return;
         }
+       if (end - 1 > ceiling - 1)
+               return;
  
-       return NULL;
+       pmd = pmd_offset(pud, start);
+       pud_clear(pud);
+       pmd_free_tlb(tlb, pmd);
+}
+#endif
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+                                  unsigned long addr, unsigned long end,
+                                  unsigned long floor, unsigned long ceiling)
+{
+       pud_t *pud;
+       unsigned long next;
+       unsigned long start;
+
+       start = addr;
+       pud = pud_offset(pgd, addr);
+       do {
+               next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+               if (pud_none_or_clear_bad(pud))
+                       continue;
+               hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+               if (pud_none(*pud))
+                       continue;
+               free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+       } while (pud++, addr = next, addr != end);
+
+       start &= PGDIR_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PGDIR_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               return;
+
+       pud = pud_offset(pgd, start);
+       pgd_clear(pgd);
+       pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+                           unsigned long addr, unsigned long end,
+                           unsigned long floor, unsigned long ceiling)
+{
+       pgd_t *pgd;
+       unsigned long next;
+       unsigned long start;
+
+       /*
+        * Comments below taken from the normal free_pgd_range().  They
+        * apply here too.  The tests against HUGEPD_MASK below are
+        * essential, because we *don't* test for this at the bottom
+        * level.  Without them we'll attempt to free a hugepte table
+        * when we unmap just part of it, even if there are other
+        * active mappings using it.
+        *
+        * The next few lines have given us lots of grief...
+        *
+        * Why are we testing HUGEPD* at this top level?  Because
+        * often there will be no work to do at all, and we'd prefer
+        * not to go all the way down to the bottom just to discover
+        * that.
+        *
+        * Why all these "- 1"s?  Because 0 represents both the bottom
+        * of the address space and the top of it (using -1 for the
+        * top wouldn't help much: the masks would do the wrong thing).
+        * The rule is that addr 0 and floor 0 refer to the bottom of
+        * the address space, but end 0 and ceiling 0 refer to the top.
+        * Comparisons need to use "end - 1" and "ceiling - 1" (though
+        * that end 0 case should be mythical).
+        *
+        * Wherever addr is brought up or ceiling brought down, we
+        * must be careful to reject "the opposite 0" before it
+        * confuses the subsequent tests.  But what about where end is
+        * brought down by HUGEPD_SIZE below? no, end can't go down to
+        * 0 there.
+        *
+        * Whereas we round start (addr) and ceiling down, by different
+        * masks at different levels, in order to test whether a table
+        * now has no other vmas using it, so can be freed, we don't
+        * bother to round floor or end up - the tests don't need that.
+        */
+
+       addr &= HUGEPD_MASK;
+       if (addr < floor) {
+               addr += HUGEPD_SIZE;
+               if (!addr)
+                       return;
+       }
+       if (ceiling) {
+               ceiling &= HUGEPD_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               end -= HUGEPD_SIZE;
+       if (addr > end - 1)
+               return;
+
+       start = addr;
+       pgd = pgd_offset((*tlb)->mm, addr);
+       do {
+               BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+               next = pgd_addr_end(addr, end);
+               if (pgd_none_or_clear_bad(pgd))
+                       continue;
+               hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+       } while (pgd++, addr = next, addr != end);
  }
  
  void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -133,58 +332,63 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
         return __pte(old);
  }
  
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-       if (len & ~HPAGE_MASK)
-               return -EINVAL;
-       if (addr & ~HPAGE_MASK)
-               return -EINVAL;
-       if (! (within_hugepage_low_range(addr, len)
-              || within_hugepage_high_range(addr, len)) )
-               return -EINVAL;
-       return 0;
-}
+struct slb_flush_info {
+       struct mm_struct *mm;
+       u16 newareas;
+};
  
  static void flush_low_segments(void *parm)
  {
-       u16 areas = (unsigned long) parm;
+       struct slb_flush_info *fi = parm;
         unsigned long i;
  
-       asm volatile("isync" : : : "memory");
+       BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
+
+       if (current->active_mm != fi->mm)
+               return;
  
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+       /* Only need to do anything if this CPU is working in the same
+        * mm as the one which has changed */
  
+       /* update the paca copy of the context struct */
+       get_paca()->context = current->active_mm->context;
+
+       asm volatile("isync" : : : "memory");
         for (i = 0; i < NUM_LOW_AREAS; i++) {
-               if (! (areas & (1U << i)))
+               if (! (fi->newareas & (1U << i)))
                         continue;
                 asm volatile("slbie %0"
                              : : "r" ((i << SID_SHIFT) | SLBIE_C));
         }
-
         asm volatile("isync" : : : "memory");
  }
  
  static void flush_high_segments(void *parm)
  {
-       u16 areas = (unsigned long) parm;
+       struct slb_flush_info *fi = parm;
         unsigned long i, j;
  
-       asm volatile("isync" : : : "memory");
  
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+       BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
+
+       if (current->active_mm != fi->mm)
+               return;
+
+       /* Only need to do anything if this CPU is working in the same
+        * mm as the one which has changed */
+
+       /* update the paca copy of the context struct */
+       get_paca()->context = current->active_mm->context;
  
+       asm volatile("isync" : : : "memory");
         for (i = 0; i < NUM_HIGH_AREAS; i++) {
-               if (! (areas & (1U << i)))
+               if (! (fi->newareas & (1U << i)))
                         continue;
                 for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
                         asm volatile("slbie %0"
                                      :: "r" (((i << HTLB_AREA_SHIFT)
-                                            + (j << SID_SHIFT)) | SLBIE_C));
+                                             + (j << SID_SHIFT)) | SLBIE_C));
         }
-
         asm volatile("isync" : : : "memory");
  }
  
@@ -212,6 +416,12 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
  
         BUG_ON(area >= NUM_HIGH_AREAS);
  
+       /* Hack, so that each address is controlled by exactly one
+        * of the high or low area bitmaps, the first high area starts
+        * at 4GB, not 0 */
+       if (start == 0)
+               start = 0x100000000UL;
+
         /* Check no VMAs are in the region */
         vma = find_vma(mm, start);
         if (vma && (vma->vm_start < end))
@@ -223,6 +433,7 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
  static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
  {
         unsigned long i;
+       struct slb_flush_info fi;
  
         BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
         BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
@@ -238,19 +449,20 @@ static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
  
         mm->context.low_htlb_areas |= newareas;
  
-       /* update the paca copy of the context struct */
-       get_paca()->context = mm->context;
-
         /* the context change must make it to memory before the flush,
          * so that further SLB misses do the right thing. */
         mb();
-       on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       fi.mm = mm;
+       fi.newareas = newareas;
+       on_each_cpu(flush_low_segments, &fi, 0, 1);
  
         return 0;
  }
  
  static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
  {
+       struct slb_flush_info fi;
         unsigned long i;
  
         BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
@@ -274,22 +486,25 @@ static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
         /* the context change must make it to memory before the flush,
          * so that further SLB misses do the right thing. */
         mb();
-       on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       fi.mm = mm;
+       fi.newareas = newareas;
+       on_each_cpu(flush_high_segments, &fi, 0, 1);
  
         return 0;
  }
  
  int prepare_hugepage_range(unsigned long addr, unsigned long len)
  {
-       int err;
+       int err = 0;
  
         if ( (addr+len) < addr )
                 return -EINVAL;
  
-       if ((addr + len) < 0x100000000UL)
+       if (addr < 0x100000000UL)
                 err = open_low_hpage_areas(current->mm,
                                           LOW_ESID_MASK(addr, len));
-       else
+       if ((addr + len) > 0x100000000UL)
                 err = open_high_hpage_areas(current->mm,
                                             HTLB_AREA_MASK(addr, len));
         if (err) {
@@ -518,6 +733,17 @@ fail:
         return addr;
  }
  
+static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
+{
+       struct vm_area_struct *vma;
+
+       vma = find_vma(current->mm, addr);
+       if (!vma || ((addr + len) <= vma->vm_start))
+               return 0;
+
+       return -ENOMEM;
+}
+
  static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
  {
         unsigned long addr = 0;
@@ -587,15 +813,28 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                 return -EINVAL;
  
+       /* Paranoia, caller should have dealt with this */
+       BUG_ON((addr + len)  < addr);
+
         if (test_thread_flag(TIF_32BIT)) {
+               /* Paranoia, caller should have dealt with this */
+               BUG_ON((addr + len) > 0x100000000UL);
+
                 curareas = current->mm->context.low_htlb_areas;
  
-               /* First see if we can do the mapping in the existing
-                * low areas */
+               /* First see if we can use the hint address */
+               if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
+                       areamask = LOW_ESID_MASK(addr, len);
+                       if (open_low_hpage_areas(current->mm, areamask) == 0)
+                               return addr;
+               }
+
+               /* Next see if we can map in the existing low areas */
                 addr = htlb_get_low_area(len, curareas);
                 if (addr != -ENOMEM)
                         return addr;
  
+               /* Finally go looking for areas to open */
                 lastshift = 0;
                 for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
                      ! lastshift; areamask >>=1) {
@@ -610,12 +849,22 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
         } else {
                 curareas = current->mm->context.high_htlb_areas;
  
-               /* First see if we can do the mapping in the existing
-                * high areas */
+               /* First see if we can use the hint address */
+               /* We discourage 64-bit processes from doing hugepage
+                * mappings below 4GB (must use MAP_FIXED) */
+               if ((addr >= 0x100000000UL)
+                   && (htlb_check_hinted_area(addr, len) == 0)) {
+                       areamask = HTLB_AREA_MASK(addr, len);
+                       if (open_high_hpage_areas(current->mm, areamask) == 0)
+                               return addr;
+               }
+
+               /* Next see if we can map in the existing high areas */
                 addr = htlb_get_high_area(len, curareas);
                 if (addr != -ENOMEM)
                         return addr;
  
+               /* Finally go looking for areas to open */
                 lastshift = 0;
                 for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
                      ! lastshift; areamask >>=1) {
@@ -633,8 +882,36 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
         return -ENOMEM;
  }
  
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+                                                 pte_t pte, int trap)
+{
+       struct page *page;
+       int i;
+
+       if (!pfn_valid(pte_pfn(pte)))
+               return rflags;
+
+       page = pte_page(pte);
+
+       /* page is dirty: PG_arch_1 clear, i.e. not yet marked as flushed */
+       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+               if (trap == 0x400) {
+                       for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
+                               __flush_dcache_icache(page_address(page+i));
+                       set_bit(PG_arch_1, &page->flags);
+               } else {
+                       rflags |= HPTE_R_N;
+               }
+       }
+       return rflags;
+}
+
  int hash_huge_page(struct mm_struct *mm, unsigned long access,
-                  unsigned long ea, unsigned long vsid, int local)
+                  unsigned long ea, unsigned long vsid, int local,
+                  unsigned long trap)
  {
         pte_t *ptep;
         unsigned long old_pte, new_pte;
@@ -685,6 +962,11 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
         rflags = 0x2 | (!(new_pte & _PAGE_RW));
         /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
         rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+               /* No CPU has hugepages but lacks no execute, so we
+                * don't need to worry about that case */
+               rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+                                                      trap);
  
         /* Check if pte already has an hpte (case 2) */
         if (unlikely(old_pte & _PAGE_HASHPTE)) {
@@ -697,7 +979,8 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                 slot += (old_pte & _PAGE_F_GIX) >> 12;
  
-               if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
+               if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+                                        local) == -1)
                         old_pte &= ~_PAGE_HPTEFLAGS;
         }
  
@@ -748,9 +1031,7 @@ repeat:
         }
  
         /*
-        * No need to use ldarx/stdcx here because all who
-        * might be updating the pte will hold the
-        * page_table_lock
+        * No need to use ldarx/stdcx here
          */
         *ptep = __pte(new_pte & ~_PAGE_BUSY);
  
@@ -759,3 +1040,27 @@ repeat:
   out:
         return err;
  }
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+       memset(addr, 0, kmem_cache_size(cache));
+}
+
+static int __init hugetlbpage_init(void)
+{
+       if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+               return -ENODEV;
+
+       huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+                                              HUGEPTE_TABLE_SIZE,
+                                              HUGEPTE_TABLE_SIZE,
+                                              SLAB_HWCACHE_ALIGN |
+                                              SLAB_MUST_HWCACHE_ALIGN,
+                                              zero_ctor, NULL);
+       if (! huge_pgtable_cache)
+               panic("hugetlbpage_init(): could not create hugepte cache\n");
+
+       return 0;
+}
+
+module_init(hugetlbpage_init);