powerpc/mm: Fix pgtable cache cleanup with CONFIG_PPC_SUBPAGE_PROT
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index bf5b6d7..50f867d 100644
@@ -68,6 +68,7 @@
 
 #define KB (1024)
 #define MB (1024*KB)
+#define GB (1024L*MB)
 
 /*
  * Note:  pte   --> Linux PTE
@@ -91,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 struct hash_pte *htab_address;
 unsigned long htab_size_bytes;
 unsigned long htab_hash_mask;
+EXPORT_SYMBOL_GPL(htab_hash_mask);
 int mmu_linear_psize = MMU_PAGE_4K;
 int mmu_virtual_psize = MMU_PAGE_4K;
 int mmu_vmalloc_psize = MMU_PAGE_4K;
@@ -101,8 +103,8 @@ int mmu_io_psize = MMU_PAGE_4K;
 int mmu_kernel_ssize = MMU_SEGSIZE_256M;
 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
 u16 mmu_slb_size = 64;
+EXPORT_SYMBOL_GPL(mmu_slb_size);
 #ifdef CONFIG_HUGETLB_PAGE
-int mmu_huge_psize = MMU_PAGE_16M;
 unsigned int HPAGE_SHIFT;
 #endif
 #ifdef CONFIG_PPC_64K_PAGES
@@ -151,39 +153,58 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
        },
 };
 
+static unsigned long htab_convert_pte_flags(unsigned long pteflags)
+{
+       unsigned long rflags = pteflags & 0x1fa;
+
+       /* _PAGE_EXEC -> NOEXEC */
+       if ((pteflags & _PAGE_EXEC) == 0)
+               rflags |= HPTE_R_N;
+
+       /* PP bits. PAGE_USER is already PP bit 0x2, so we only
+        * need to add in 0x1 if it's a read-only user page
+        */
+       if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
+                                        (pteflags & _PAGE_DIRTY)))
+               rflags |= 1;
+
+       /* Always add C */
+       return rflags | HPTE_R_C;
+}
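The new helper centralizes the Linux-PTE to HPTE flag conversion that the callers below previously open-coded. A rough truth table of the PP encoding it produces (the PP access semantics are taken from the hash MMU architecture, not from this hunk, so treat the exact meanings as an assumption):

	Linux flags on the PTE                        PP bits   effective access
	kernel only (_PAGE_USER clear)                0b00      kernel RW, no user access
	_PAGE_USER | _PAGE_RW | _PAGE_DIRTY           0b10      user and kernel RW
	_PAGE_USER, read-only or not yet dirty        0b11      read-only

A writable but still-clean user page is deliberately hashed in read-only: the first store then faults and lets the fault path set _PAGE_DIRTY before the mapping is upgraded. HPTE_R_N is added whenever _PAGE_EXEC is clear, and HPTE_R_C is always set so hardware never has to update the changed bit itself.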
 
 int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
-                     unsigned long pstart, unsigned long mode,
+                     unsigned long pstart, unsigned long prot,
                      int psize, int ssize)
 {
        unsigned long vaddr, paddr;
        unsigned int step, shift;
-       unsigned long tmp_mode;
        int ret = 0;
 
        shift = mmu_psize_defs[psize].shift;
        step = 1 << shift;
 
+       prot = htab_convert_pte_flags(prot);
+
+           DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d))\n",
+           vstart, vend, pstart, prot, psize, ssize);
+
        for (vaddr = vstart, paddr = pstart; vaddr < vend;
             vaddr += step, paddr += step) {
                unsigned long hash, hpteg;
                unsigned long vsid = get_kernel_vsid(vaddr, ssize);
                unsigned long va = hpt_va(vaddr, vsid, ssize);
+               unsigned long tprot = prot;
 
-               tmp_mode = mode;
-               
-               /* Make non-kernel text non-executable */
-               if (!in_kernel_text(vaddr))
-                       tmp_mode = mode | HPTE_R_N;
+               /* Make kernel text executable */
+               if (overlaps_kernel_text(vaddr, vaddr + step))
+                       tprot &= ~HPTE_R_N;
 
                hash = hpt_hash(va, shift, ssize);
                hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
 
-               DBG("htab_bolt_mapping: calling %p\n", ppc_md.hpte_insert);
-
                BUG_ON(!ppc_md.hpte_insert);
-               ret = ppc_md.hpte_insert(hpteg, va, paddr,
-                               tmp_mode, HPTE_V_BOLTED, psize, ssize);
+               ret = ppc_md.hpte_insert(hpteg, va, paddr, tprot,
+                                        HPTE_V_BOLTED, psize, ssize);
 
                if (ret < 0)
                        break;
@@ -329,6 +350,48 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
        return 0;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+                                       const char *uname, int depth,
+                                       void *data) {
+       char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+       unsigned long *addr_prop;
+       u32 *page_count_prop;
+       unsigned int expected_pages;
+       unsigned long phys_addr;
+       unsigned long block_size;
+
+       /* We are scanning "memory" nodes only */
+       if (type == NULL || strcmp(type, "memory") != 0)
+               return 0;
+
+       /* This property is the log base 2 of the number of virtual pages that
+        * will represent this memory block. */
+       page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+       if (page_count_prop == NULL)
+               return 0;
+       expected_pages = (1 << page_count_prop[0]);
+       addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+       if (addr_prop == NULL)
+               return 0;
+       phys_addr = addr_prop[0];
+       block_size = addr_prop[1];
+       if (block_size != (16 * GB))
+               return 0;
+       printk(KERN_INFO "Huge page(16GB) memory: "
+                       "addr = 0x%lX size = 0x%lX pages = %d\n",
+                       phys_addr, block_size, expected_pages);
+       if (phys_addr + (16 * GB) <= lmb_end_of_DRAM()) {
+               lmb_reserve(phys_addr, block_size * expected_pages);
+               add_gpage(phys_addr, block_size, expected_pages);
+       }
+       return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
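Worked example of the reservation arithmetic (the device-tree values are illustrative only): a memory node whose "reg" property describes a 16 GB block and whose ibm,expected#pages property is 2 gives expected_pages = 1 << 2 = 4, so lmb_reserve() pulls 4 * 16 GB = 64 GB out of the bootmem allocator starting at the node's base address, and add_gpage() records those four 16 GB pages for the gigantic-page pool that hugetlbfs draws from later.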
 static void __init htab_init_page_sizes(void)
 {
        int rc;
@@ -418,15 +481,8 @@ static void __init htab_init_page_sizes(void)
               );
 
 #ifdef CONFIG_HUGETLB_PAGE
-       /* Init large page size. Currently, we pick 16M or 1M depending
-        * on what is available
-        */
-       if (mmu_psize_defs[MMU_PAGE_16M].shift)
-               set_huge_psize(MMU_PAGE_16M);
-       /* With 4k/4level pagetables, we can't (for now) cope with a
-        * huge page size < PMD_SIZE */
-       else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-               set_huge_psize(MMU_PAGE_1M);
+       /* Reserve 16G huge page memory sections for huge pages */
+       of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
@@ -452,7 +508,7 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
 
 static unsigned long __init htab_get_table_size(void)
 {
-       unsigned long mem_size, rnd_mem_size, pteg_count;
+       unsigned long mem_size, rnd_mem_size, pteg_count, psize;
 
        /* If hash size isn't already provided by the platform, we try to
         * retrieve it from the device-tree. If it's not there neither, we
@@ -470,7 +526,8 @@ static unsigned long __init htab_get_table_size(void)
                rnd_mem_size <<= 1;
 
        /* # pages / 2 */
-       pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11);
+       psize = mmu_psize_defs[mmu_virtual_psize].shift;
+       pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
 
        return pteg_count << 7;
 }
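Worked example of the sizing change (assuming mmu_psize_defs[].shift is 12 for a 4K base page and 16 for 64K): with 1 GB of memory, rnd_mem_size stays at 2^30, so a 4K kernel gets pteg_count = max(2^30 >> 13, 2^11) = 2^17 and a hash table of 2^17 << 7 = 16 MB, while a 64K kernel now gets only 2^30 >> 17 = 2^13 PTEGs, i.e. 1 MB. The << 7 reflects the 128-byte PTE group (8 HPTEs of 16 bytes each, per the architecture rather than this patch); before this change the shift was hard-wired to the 4K page size.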
@@ -478,9 +535,9 @@ static unsigned long __init htab_get_table_size(void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 void create_section_mapping(unsigned long start, unsigned long end)
 {
-               BUG_ON(htab_bolt_mapping(start, end, __pa(start),
-                       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX,
-                       mmu_linear_psize, mmu_kernel_ssize));
+       BUG_ON(htab_bolt_mapping(start, end, __pa(start),
+                                pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+                                mmu_kernel_ssize));
 }
 
 int remove_section_mapping(unsigned long start, unsigned long end)
@@ -525,11 +582,11 @@ static void __init htab_finish_init(void)
        make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
 }
 
-void __init htab_initialize(void)
+static void __init htab_initialize(void)
 {
        unsigned long table;
        unsigned long pteg_count;
-       unsigned long mode_rw;
+       unsigned long prot;
        unsigned long base = 0, size = 0, limit;
        int i;
 
@@ -587,7 +644,7 @@ void __init htab_initialize(void)
                mtspr(SPRN_SDR1, _SDR1);
        }
 
-       mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+       prot = pgprot_val(PAGE_KERNEL);
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
        linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT;
@@ -606,7 +663,8 @@ void __init htab_initialize(void)
                base = (unsigned long)__va(lmb.memory.region[i].base);
                size = lmb.memory.region[i].size;
 
-               DBG("creating mapping for region: %lx : %lx\n", base, size);
+               DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
+                   base, size, prot);
 
 #ifdef CONFIG_U3_DART
                /* Do not map the DART space. Fortunately, it will be aligned
@@ -623,21 +681,21 @@ void __init htab_initialize(void)
                        unsigned long dart_table_end = dart_tablebase + 16 * MB;
                        if (base != dart_tablebase)
                                BUG_ON(htab_bolt_mapping(base, dart_tablebase,
-                                                       __pa(base), mode_rw,
+                                                       __pa(base), prot,
                                                        mmu_linear_psize,
                                                        mmu_kernel_ssize));
                        if ((base + size) > dart_table_end)
                                BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
                                                        base + size,
                                                        __pa(dart_table_end),
-                                                        mode_rw,
+                                                        prot,
                                                         mmu_linear_psize,
                                                         mmu_kernel_ssize));
                        continue;
                }
 #endif /* CONFIG_U3_DART */
                BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
-                               mode_rw, mmu_linear_psize, mmu_kernel_ssize));
+                               prot, mmu_linear_psize, mmu_kernel_ssize));
        }
 
        /*
@@ -655,7 +713,7 @@ void __init htab_initialize(void)
                        tce_alloc_start = base + size + 1;
 
                BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
-                                        __pa(tce_alloc_start), mode_rw,
+                                        __pa(tce_alloc_start), prot,
                                         mmu_linear_psize, mmu_kernel_ssize));
        }
 
@@ -666,11 +724,43 @@ void __init htab_initialize(void)
 #undef KB
 #undef MB
 
-void htab_initialize_secondary(void)
+void __init early_init_mmu(void)
+{
+       /* Setup initial STAB address in the PACA */
+       get_paca()->stab_real = __pa((u64)&initial_stab);
+       get_paca()->stab_addr = (u64)&initial_stab;
+
+       /* Initialize the MMU Hash table and create the linear mapping
+        * of memory. Has to be done before stab/slb initialization as
+        * this is currently where the page size encoding is obtained
+        */
+       htab_initialize();
+
+       /* Initialize stab / SLB management except on iSeries
+        */
+       if (cpu_has_feature(CPU_FTR_SLB))
+               slb_initialize();
+       else if (!firmware_has_feature(FW_FEATURE_ISERIES))
+               stab_initialize(get_paca()->stab_real);
+}
+
+#ifdef CONFIG_SMP
+void __cpuinit early_init_mmu_secondary(void)
 {
+       /* Initialize hash table for that CPU */
        if (!firmware_has_feature(FW_FEATURE_LPAR))
                mtspr(SPRN_SDR1, _SDR1);
+
+       /* Initialize STAB/SLB. We use a virtual address as it works
+        * in real mode on pSeries and we want a virtual address on
+        * iSeries anyway
+        */
+       if (cpu_has_feature(CPU_FTR_SLB))
+               slb_initialize();
+       else
+               stab_initialize(get_paca()->stab_addr);
 }
+#endif /* CONFIG_SMP */
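A sketch of where the new entry points are expected to be wired up (the call sites live in setup_64.c and are assumed here, they are not part of this diff):

	/* boot CPU, from early_setup(): */
	early_init_mmu();
	/* each secondary CPU, from its early startup path under CONFIG_SMP: */
	early_init_mmu_secondary();

This replaces the old split where htab_initialize() was called directly during early setup and each secondary only called htab_initialize_secondary() to set SDR1.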
 
 /*
  * Called by asm hashtable.S for doing lazy icache flush
@@ -687,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
        /* page is dirty */
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
-                       __flush_dcache_icache(page_address(page));
+                       flush_dcache_icache_page(page);
                        set_bit(PG_arch_1, &page->flags);
                } else
                        pp |= HPTE_R_N;
@@ -695,6 +785,28 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
        return pp;
 }
 
+#ifdef CONFIG_PPC_MM_SLICES
+unsigned int get_paca_psize(unsigned long addr)
+{
+       unsigned long index, slices;
+
+       if (addr < SLICE_LOW_TOP) {
+               slices = get_paca()->context.low_slices_psize;
+               index = GET_LOW_SLICE_INDEX(addr);
+       } else {
+               slices = get_paca()->context.high_slices_psize;
+               index = GET_HIGH_SLICE_INDEX(addr);
+       }
+       return (slices >> (index * 4)) & 0xF;
+}
+
+#else
+unsigned int get_paca_psize(unsigned long addr)
+{
+       return get_paca()->context.user_psize;
+}
+#endif
+
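Illustration of the lookup (the slice geometry is taken from the generic slice code, so treat the sizes as an assumption): low_slices_psize packs one 4-bit page-size index per 256 MB slice of the first 4 GB, and high_slices_psize one per slice above that. For ea = 0x30000000, GET_LOW_SLICE_INDEX() yields 3, so the result is (get_paca()->context.low_slices_psize >> 12) & 0xF. The !CONFIG_PPC_MM_SLICES variant simply falls back to the single per-context user_psize.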
 /*
  * Demote a segment to using 4k pages.
  * For now this makes the whole process use 4k pages.
@@ -702,13 +814,13 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 #ifdef CONFIG_PPC_64K_PAGES
 void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 {
-       if (mm->context.user_psize == MMU_PAGE_4K)
+       if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
                return;
-       slice_set_user_psize(mm, MMU_PAGE_4K);
+       slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
 #ifdef CONFIG_SPU_BASE
        spu_flush_all_slbs(mm);
 #endif
-       if (get_paca()->context.user_psize != MMU_PAGE_4K) {
+       if (get_paca_psize(addr) != MMU_PAGE_4K) {
                get_paca()->context = mm->context;
                slb_flush_and_rebolt();
        }
@@ -723,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
  * Result is 0: full permissions, _PAGE_RW: read-only,
  * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
  */
-static int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
-       struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
+       struct subpage_prot_table *spt = &mm->context.spt;
        u32 spp = 0;
        u32 **sbpm, *sbpp;
 
@@ -753,7 +865,7 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea)
 }
 
 #else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
        return 0;
 }
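This hunk is the heart of the fix named in the subject line: the subpage-protection table no longer hangs off the pgd page via pgd_subpage_prot(), it is embedded in the mm context, so the pgtable cache needs no special-cased cleanup when CONFIG_PPC_SUBPAGE_PROT is enabled. A hedged sketch of the corresponding context-structure change (assumed to live in the mmu hash header, not shown in this diff):

	typedef struct {
		/* existing fields (id, user_psize, slice masks, ...) omitted */
	#ifdef CONFIG_PPC_SUBPAGE_PROT
		struct subpage_prot_table spt;	/* owned by the mm, torn down with it */
	#endif
	} mm_context_t;

subpage_protection() can then take the mm_struct directly instead of a pgd_t pointer, as the signature change above shows.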
@@ -771,7 +883,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
        unsigned long vsid;
        struct mm_struct *mm;
        pte_t *ptep;
-       cpumask_t tmp;
+       unsigned hugeshift;
+       const struct cpumask *tmp;
        int rc, user_region = 0, local = 0;
        int psize, ssize;
 
@@ -792,11 +905,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
                        DBG_LOW(" user region with no mm !\n");
                        return 1;
                }
-#ifdef CONFIG_PPC_MM_SLICES
                psize = get_slice_psize(mm, ea);
-#else
-               psize = mm->context.user_psize;
-#endif
                ssize = user_segment_size(ea);
                vsid = get_vsid(mm->context.id, ea, ssize);
                break;
@@ -823,34 +932,35 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
                return 1;
 
        /* Check CPU locality */
-       tmp = cpumask_of_cpu(smp_processor_id());
-       if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+       tmp = cpumask_of(smp_processor_id());
+       if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
                local = 1;
 
-#ifdef CONFIG_HUGETLB_PAGE
-       /* Handle hugepage regions */
-       if (HPAGE_SHIFT && psize == mmu_huge_psize) {
-               DBG_LOW(" -> huge page !\n");
-               return hash_huge_page(mm, access, ea, vsid, local, trap);
-       }
-#endif /* CONFIG_HUGETLB_PAGE */
-
 #ifndef CONFIG_PPC_64K_PAGES
-       /* If we use 4K pages and our psize is not 4K, then we are hitting
-        * a special driver mapping, we need to align the address before
-        * we fetch the PTE
+       /* If we use 4K pages and our psize is not 4K, then we might
+        * be hitting a special driver mapping, and need to align the
+        * address before we fetch the PTE.
+        *
+        * It could also be a hugepage mapping, in which case this is
+        * not necessary, but it's not harmful, either.
         */
        if (psize != MMU_PAGE_4K)
                ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 #endif /* CONFIG_PPC_64K_PAGES */
 
        /* Get PTE and page size from page tables */
-       ptep = find_linux_pte(pgdir, ea);
+       ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
        if (ptep == NULL || !pte_present(*ptep)) {
                DBG_LOW(" no PTE !\n");
                return 1;
        }
 
+#ifdef CONFIG_HUGETLB_PAGE
+       if (hugeshift)
+               return __hash_page_huge(ea, access, vsid, ptep, trap, local,
+                                       ssize, hugeshift, psize);
+#endif /* CONFIG_HUGETLB_PAGE */
+
 #ifndef CONFIG_PPC_64K_PAGES
        DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
 #else
@@ -868,7 +978,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
        /* Do actual hashing */
 #ifdef CONFIG_PPC_64K_PAGES
        /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
-       if (pte_val(*ptep) & _PAGE_4K_PFN) {
+       if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
                demote_segment_4k(mm, ea);
                psize = MMU_PAGE_4K;
        }
@@ -897,7 +1007,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
                }
        }
        if (user_region) {
-               if (psize != get_paca()->context.user_psize) {
+               if (psize != get_paca_psize(ea)) {
                        get_paca()->context = mm->context;
                        slb_flush_and_rebolt();
                }
@@ -940,7 +1050,6 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
        unsigned long vsid;
        void *pgdir;
        pte_t *ptep;
-       cpumask_t mask;
        unsigned long flags;
        int local = 0;
        int ssize;
@@ -983,8 +1092,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
        local_irq_save(flags);
 
        /* Is that local to this CPU ? */
-       mask = cpumask_of_cpu(smp_processor_id());
-       if (cpus_equal(mm->cpu_vm_mask, mask))
+       if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
                local = 1;
 
        /* Hash it in */
@@ -1058,8 +1166,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
        unsigned long hash, hpteg;
        unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
        unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
-       unsigned long mode = _PAGE_ACCESSED | _PAGE_DIRTY |
-               _PAGE_COHERENT | PP_RWXX | HPTE_R_N;
+       unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
        int ret;
 
        hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);