Merge commit 'linus/master' into HEAD
[safe/jmp/linux-2.6] / arch / x86 / mm / init_32.c
index f59e9b8..3cd7711 100644 (file)
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/page_types.h>
+#include <asm/init.h>
 
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
-
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-int after_bootmem;
-
-int direct_gbpages;
+bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-       unsigned long pfn = table_end++;
+       unsigned long pfn = e820_table_end++;
        void *adr;
 
-       if (pfn >= table_top)
+       if (pfn >= e820_table_top)
                panic("alloc_low_page: ran out of memory");
 
        adr = __va(pfn * PAGE_SIZE);
@@ -120,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
                pte_t *page_table = NULL;
 
                if (after_bootmem) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
                        page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
                        if (!page_table)
@@ -137,6 +128,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
        return pte_offset_kernel(pmd, 0);
 }
 
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+       int pgd_idx = pgd_index(vaddr);
+       int pmd_idx = pmd_index(vaddr);
+
+       return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+       int pte_idx = pte_index(vaddr);
+       pmd_t *pmd;
+
+       pmd = populate_extra_pmd(vaddr);
+       return one_page_table_init(pmd) + pte_idx;
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
                                           unsigned long vaddr, pte_t *lastpte)
 {
@@ -153,8 +161,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
        if (pmd_idx_kmap_begin != pmd_idx_kmap_end
            && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
            && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-           && ((__pa(pte) >> PAGE_SHIFT) < table_start
-               || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+           && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+               || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
                pte_t *newpte;
                int i;
 
@@ -227,10 +235,13 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(unsigned long start_pfn,
-                                               unsigned long end_pfn,
-                                               int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+                            unsigned long end,
+                            unsigned long page_size_mask)
 {
+       int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+       unsigned long start_pfn, end_pfn;
        pgd_t *pgd_base = swapper_pg_dir;
        int pgd_idx, pmd_idx, pte_ofs;
        unsigned long pfn;
@@ -240,6 +251,9 @@ static void __init kernel_physical_mapping_init(unsigned long start_pfn,
        unsigned pages_2m, pages_4k;
        int mapping_iter;
 
+       start_pfn = start >> PAGE_SHIFT;
+       end_pfn = end >> PAGE_SHIFT;
+
        /*
         * First iteration will setup identity mapping using large/small pages
         * based on use_pse, with other attributes same as set by
@@ -354,6 +368,7 @@ repeat:
                mapping_iter = 2;
                goto repeat;
        }
+       return 0;
 }
 
 pte_t *kmap_pte;
@@ -509,7 +524,7 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init early_ioremap_page_table_range_init(void)
+void __init early_ioremap_page_table_range_init(void)
 {
        pgd_t *pgd_base = swapper_pg_dir;
        unsigned long vaddr, end;
@@ -549,7 +564,7 @@ static inline void save_pg_dir(void)
 }
 #endif /* !CONFIG_ACPI_SLEEP */
 
-void zap_low_mappings(void)
+void zap_low_mappings(bool early)
 {
        int i;
 
@@ -566,64 +581,16 @@ void zap_low_mappings(void)
                set_pgd(swapper_pg_dir+i, __pgd(0));
 #endif
        }
-       flush_tlb_all();
-}
 
-int nx_enabled;
+       if (early)
+               __flush_tlb();
+       else
+               flush_tlb_all();
+}
 
 pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata;
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on      Enable
- * off     Disable
- */
-static int __init noexec_setup(char *str)
-{
-       if (!str || !strcmp(str, "on")) {
-               if (cpu_has_nx) {
-                       __supported_pte_mask |= _PAGE_NX;
-                       disable_nx = 0;
-               }
-       } else {
-               if (!strcmp(str, "off")) {
-                       disable_nx = 1;
-                       __supported_pte_mask &= ~_PAGE_NX;
-               } else {
-                       return -EINVAL;
-               }
-       }
-
-       return 0;
-}
-early_param("noexec", noexec_setup);
-
-static void __init set_nx(void)
-{
-       unsigned int v[4], l, h;
-
-       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
-               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
-               if ((v[3] & (1 << 20)) && !disable_nx) {
-                       rdmsr(MSR_EFER, l, h);
-                       l |= EFER_NX;
-                       wrmsr(MSR_EFER, l, h);
-                       nx_enabled = 1;
-                       __supported_pte_mask |= _PAGE_NX;
-               }
-       }
-}
-#endif
-
 /* user-defined highmem size */
 static unsigned int highmem_pages = -1;
 
@@ -743,21 +710,23 @@ void __init initmem_init(unsigned long start_pfn,
        highstart_pfn = highend_pfn = max_pfn;
        if (max_pfn > max_low_pfn)
                highstart_pfn = max_low_pfn;
-       memory_present(0, 0, highend_pfn);
        e820_register_active_regions(0, 0, highend_pfn);
+       sparse_memory_present_with_active_regions(0);
        printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
                pages_to_mb(highend_pfn - highstart_pfn));
        num_physpages = highend_pfn;
        high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-       memory_present(0, 0, max_low_pfn);
        e820_register_active_regions(0, 0, max_low_pfn);
+       sparse_memory_present_with_active_regions(0);
        num_physpages = max_low_pfn;
        high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
 #ifdef CONFIG_FLATMEM
        max_mapnr = num_physpages;
 #endif
+       __vmalloc_start_set = true;
+
        printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
                        pages_to_mb(max_low_pfn));
 
@@ -786,11 +755,6 @@ static unsigned long __init setup_node_bootmem(int nodeid,
 {
        unsigned long bootmap_size;
 
-       if (start_pfn > max_low_pfn)
-               return bootmap;
-       if (end_pfn > max_low_pfn)
-               end_pfn = max_low_pfn;
-
        /* don't touch min_low_pfn */
        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap >> PAGE_SHIFT,
@@ -823,300 +787,27 @@ void __init setup_bootmem_allocator(void)
                 max_pfn_mapped<<PAGE_SHIFT);
        printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-       for_each_online_node(nodeid)
-               bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
-                                       node_end_pfn[nodeid], bootmap);
-#else
-       bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
-#endif
-
-       after_bootmem = 1;
-}
-
-static void __init find_early_table_space(unsigned long end, int use_pse,
-                                         int use_gbpages)
-{
-       unsigned long puds, pmds, ptes, tables, start;
-
-       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-
-       if (use_gbpages) {
-               unsigned long extra;
-
-               extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-               pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-       } else
-               pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-
-       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-       if (use_pse) {
-               unsigned long extra;
-
-               extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-               extra += PMD_SIZE;
-#endif
-               ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       } else
-               ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-#ifdef CONFIG_X86_32
-       /* for fixmap */
-       tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-
-       /*
-        * RED-PEN putting page tables only on node 0 could
-        * cause a hotspot and fill up ZONE_DMA. The page tables
-        * need roughly 0.5KB per GB.
-        */
-#ifdef CONFIG_X86_32
-       start = 0x7000;
-       table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-                                       tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
-       start = 0x8000;
-       table_start = find_e820_area(start, end, tables, PAGE_SIZE);
-#endif
-       if (table_start == -1UL)
-               panic("Cannot find space for the kernel page tables");
-
-       table_start >>= PAGE_SHIFT;
-       table_end = table_start;
-       table_top = table_start + (tables >> PAGE_SHIFT);
-
-       printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-               end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-struct map_range {
-       unsigned long start;
-       unsigned long end;
-       unsigned page_size_mask;
-};
-
-#ifdef CONFIG_X86_32
-#define NR_RANGE_MR 3
-#else /* CONFIG_X86_64 */
-#define NR_RANGE_MR 5
-#endif
-
-static int save_mr(struct map_range *mr, int nr_range,
-                  unsigned long start_pfn, unsigned long end_pfn,
-                  unsigned long page_size_mask)
-{
-       if (start_pfn < end_pfn) {
-               if (nr_range >= NR_RANGE_MR)
-                       panic("run out of range for init_memory_mapping\n");
-               mr[nr_range].start = start_pfn<<PAGE_SHIFT;
-               mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
-               mr[nr_range].page_size_mask = page_size_mask;
-               nr_range++;
-       }
+       for_each_online_node(nodeid) {
+                unsigned long start_pfn, end_pfn;
 
-       return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-                                              unsigned long end)
-{
-       unsigned long page_size_mask = 0;
-       unsigned long start_pfn, end_pfn;
-       unsigned long pos;
-       unsigned long ret;
-
-       struct map_range mr[NR_RANGE_MR];
-       int nr_range, i;
-       int use_pse, use_gbpages;
-
-       printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       /*
-        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-        * This will simplify cpa(), which otherwise needs to support splitting
-        * large pages into small in interrupt context, etc.
-        */
-       use_pse = use_gbpages = 0;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+               start_pfn = node_start_pfn[nodeid];
+               end_pfn = node_end_pfn[nodeid];
+               if (start_pfn > max_low_pfn)
+                       continue;
+               if (end_pfn > max_low_pfn)
+                       end_pfn = max_low_pfn;
 #else
-       use_pse = cpu_has_pse;
-       use_gbpages = direct_gbpages;
-#endif
-
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
-       set_nx();
-       if (nx_enabled)
-               printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
-
-       /* Enable PSE if available */
-       if (cpu_has_pse)
-               set_in_cr4(X86_CR4_PSE);
-
-       /* Enable PGE if available */
-       if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
-               __supported_pte_mask |= _PAGE_GLOBAL;
-       }
-#endif
-
-       if (use_gbpages)
-               page_size_mask |= 1 << PG_LEVEL_1G;
-       if (use_pse)
-               page_size_mask |= 1 << PG_LEVEL_2M;
-
-       memset(mr, 0, sizeof(mr));
-       nr_range = 0;
-
-       /* head if not big page alignment ? */
-       start_pfn = start >> PAGE_SHIFT;
-       pos = start_pfn << PAGE_SHIFT;
-#ifdef CONFIG_X86_32
-       /*
-        * Don't use a large page for the first 2/4MB of memory
-        * because there are often fixed size MTRRs in there
-        * and overlapping MTRRs into large pages can cause
-        * slowdowns.
-        */
-       if (pos == 0)
-               end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
-       else
-               end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                                << (PMD_SHIFT - PAGE_SHIFT);
-#else /* CONFIG_X86_64 */
-       end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-                       << (PMD_SHIFT - PAGE_SHIFT);
-#endif
-       if (end_pfn > (end >> PAGE_SHIFT))
-               end_pfn = end >> PAGE_SHIFT;
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-               pos = end_pfn << PAGE_SHIFT;
-       }
-
-       /* big page (2M) range */
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
-#ifdef CONFIG_X86_32
-       end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-#else /* CONFIG_X86_64 */
-       end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-               end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
-#endif
-
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-                               page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
-       }
-
-#ifdef CONFIG_X86_64
-       /* big page (1G) range */
-       start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-                               page_size_mask &
-                                ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-               pos = end_pfn << PAGE_SHIFT;
-       }
-
-       /* tail is not big page (1G) alignment */
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-                               page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
-       }
+               start_pfn = 0;
+               end_pfn = max_low_pfn;
 #endif
-
-       /* tail is not big page (2M) alignment */
-       start_pfn = pos>>PAGE_SHIFT;
-       end_pfn = end>>PAGE_SHIFT;
-       nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
-       /* try to merge same page size and continuous */
-       for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-               unsigned long old_start;
-               if (mr[i].end != mr[i+1].start ||
-                   mr[i].page_size_mask != mr[i+1].page_size_mask)
-                       continue;
-               /* move it */
-               old_start = mr[i].start;
-               memmove(&mr[i], &mr[i+1],
-                       (nr_range - 1 - i) * sizeof(struct map_range));
-               mr[i--].start = old_start;
-               nr_range--;
+               bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+                                                bootmap);
        }
 
-       for (i = 0; i < nr_range; i++)
-               printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-                               mr[i].start, mr[i].end,
-                       (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-                        (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
-       /*
-        * Find space for the kernel direct mapping tables.
-        *
-        * Later we should allocate these tables in the local node of the
-        * memory mapped. Unfortunately this is done currently before the
-        * nodes are discovered.
-        */
-       if (!after_bootmem)
-               find_early_table_space(end, use_pse, use_gbpages);
-
-#ifdef CONFIG_X86_32
-       for (i = 0; i < nr_range; i++)
-               kernel_physical_mapping_init(
-                               mr[i].start >> PAGE_SHIFT,
-                               mr[i].end >> PAGE_SHIFT,
-                               mr[i].page_size_mask == (1<<PG_LEVEL_2M));
-       ret = end;
-#else /* CONFIG_X86_64 */
-       for (i = 0; i < nr_range; i++)
-               ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-                                                  mr[i].page_size_mask);
-#endif
-
-#ifdef CONFIG_X86_32
-       early_ioremap_page_table_range_init();
-
-       load_cr3(swapper_pg_dir);
-#endif
-
-#ifdef CONFIG_X86_64
-       if (!after_bootmem)
-               mmu_cr4_features = read_cr4();
-#endif
-       __flush_tlb_all();
-
-       if (!after_bootmem && table_end > table_start)
-               reserve_early(table_start << PAGE_SHIFT,
-                                table_end << PAGE_SHIFT, "PGTABLE");
-
-       if (!after_bootmem)
-               early_memtest(start, end);
-
-       return ret >> PAGE_SHIFT;
+       after_bootmem = 1;
 }
 
-
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
@@ -1269,7 +960,7 @@ void __init mem_init(void)
                test_wp_bit();
 
        save_pg_dir();
-       zap_low_mappings();
+       zap_low_mappings(true);
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -1312,17 +1003,47 @@ static noinline int do_test_wp_bit(void)
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
+static int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
+{
+       unsigned long start = PFN_ALIGN(_text);
+       unsigned long size = PFN_ALIGN(_etext) - start;
+
+       if (!kernel_set_to_readonly)
+               return;
+
+       pr_debug("Set kernel text: %lx - %lx for read write\n",
+                start, start+size);
+
+       set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+       unsigned long start = PFN_ALIGN(_text);
+       unsigned long size = PFN_ALIGN(_etext) - start;
+
+       if (!kernel_set_to_readonly)
+               return;
+
+       pr_debug("Set kernel text: %lx - %lx for read only\n",
+                start, start+size);
+
+       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
 void mark_rodata_ro(void)
 {
        unsigned long start = PFN_ALIGN(_text);
        unsigned long size = PFN_ALIGN(_etext) - start;
 
-#ifndef CONFIG_DYNAMIC_FTRACE
-       /* Dynamic tracing modifies the kernel text section */
        set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
        printk(KERN_INFO "Write protecting the kernel text: %luk\n",
                size >> 10);
 
+       kernel_set_to_readonly = 1;
+
 #ifdef CONFIG_CPA_DEBUG
        printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
                start, start+size);
@@ -1331,7 +1052,6 @@ void mark_rodata_ro(void)
        printk(KERN_INFO "Testing CPA: write protecting again\n");
        set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
 #endif
-#endif /* CONFIG_DYNAMIC_FTRACE */
 
        start += size;
        size = (unsigned long)__end_rodata - start;