#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/cpumask.h>
#include <asm/asm.h>
+#include <asm/bios_ebda.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
-unsigned int __VMALLOC_RESERVE = 128 << 20;
-
+unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static noinline int do_test_wp_bit(void);
+
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_top;
+
+int after_bootmem;
+
+int direct_gbpages;
+
+static __init void *alloc_low_page(void)
+{
+ unsigned long pfn = table_end++;
+ void *adr;
+
+ if (pfn >= table_top)
+ panic("alloc_low_page: ran out of memory");
+
+ adr = __va(pfn * PAGE_SIZE);
+ memset(adr, 0, PAGE_SIZE);
+ return adr;
+}
+
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
+ if (after_bootmem)
+ pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ else
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+ return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = NULL;
+ if (after_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
- if (!page_table) {
- page_table =
+ if (!page_table)
+ page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- }
+ } else
+ page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
return pte_offset_kernel(pmd, 0);
}
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+ unsigned long vaddr, pte_t *lastpte)
+{
+#ifdef CONFIG_HIGHMEM
+ /*
+ * Something (early fixmap) may already have put a pte
+ * page here, which causes the page table allocation
+ * to become nonlinear. Attempt to fix it, and if it
+ * is still nonlinear then we have to bug.
+ */
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+ if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+ && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
+ && ((__pa(pte) >> PAGE_SHIFT) < table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ pte_t *newpte;
+ int i;
+
+ BUG_ON(after_bootmem);
+ newpte = alloc_low_page();
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ set_pte(newpte + i, pte[i]);
+
+ paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+ BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+ __flush_tlb_all();
+
+ paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+ pte = newpte;
+ }
+ BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+ && vaddr > fix_to_virt(FIX_KMAP_END)
+ && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+ return pte;
+}
+
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
unsigned long vaddr;
pgd_t *pgd;
pmd_t *pmd;
+ pte_t *pte = NULL;
vaddr = start;
pgd_idx = pgd_index(vaddr);
pmd = pmd + pmd_index(vaddr);
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
- one_page_table_init(pmd);
+ pte = page_table_kmap_check(one_page_table_init(pmd),
+ pmd, vaddr, pte);
vaddr += PMD_SIZE;
}
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+static void __init kernel_physical_mapping_init(unsigned long start_pfn,
+ unsigned long end_pfn,
+ int use_pse)
{
+ pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
+ unsigned pages_2m, pages_4k;
+ int mapping_iter;
- pgd_idx = pgd_index(PAGE_OFFSET);
- pgd = pgd_base + pgd_idx;
- pfn = 0;
+ /*
+ * First iteration will setup identity mapping using large/small pages
+ * based on use_pse, with other attributes same as set by
+ * the early code in head_32.S
+ *
+ * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+ * as desired for the kernel identity mapping.
+ *
+ * This two pass mechanism conforms to the TLB app note which says:
+ *
+ * "Software should not write to a paging-structure entry in a way
+ * that would change, for any linear address, both the page size
+ * and either the page frame or attributes."
+ */
+ mapping_iter = 1;
+
+ if (!cpu_has_pse)
+ use_pse = 0;
+repeat:
+ pages_2m = pages_4k = 0;
+ pfn = start_pfn;
+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pgd = pgd_base + pgd_idx;
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
- if (pfn >= max_low_pfn)
- continue;
- for (pmd_idx = 0;
- pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+ if (pfn >= end_pfn)
+ continue;
+#ifdef CONFIG_X86_PAE
+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pmd += pmd_idx;
+#else
+ pmd_idx = 0;
+#endif
+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
pmd++, pmd_idx++) {
unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
/*
* Map with big pages if possible, otherwise
* create normal page tables:
- *
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
*/
- if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+ if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute + _PAGE_PSE.
+ */
+ pgprot_t init_prot =
+ __pgprot(PTE_IDENT_ATTR |
+ _PAGE_PSE);
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
is_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
- set_pmd(pmd, pfn_pmd(pfn, prot));
+ pages_2m++;
+ if (mapping_iter == 1)
+ set_pmd(pmd, pfn_pmd(pfn, init_prot));
+ else
+ set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
- max_pfn_mapped = pfn;
continue;
}
pte = one_page_table_init(pmd);
- for (pte_ofs = 0;
- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute.
+ */
+ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
- set_pte(pte, pfn_pte(pfn, prot));
+ pages_4k++;
+ if (mapping_iter == 1)
+ set_pte(pte, pfn_pte(pfn, init_prot));
+ else
+ set_pte(pte, pfn_pte(pfn, prot));
}
- max_pfn_mapped = pfn;
}
}
-}
+ if (mapping_iter == 1) {
+ /*
+ * update direct mapping page count only in the first
+ * iteration.
+ */
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (!page_is_ram(pagenr))
- return 1;
- return 0;
+ /*
+ * local global flush tlb, which will flush the previous
+ * mappings present in both small and large page TLB's.
+ */
+ __flush_tlb_all();
+
+ /*
+ * Second iteration will set the actual desired PTE attributes.
+ */
+ mapping_iter = 2;
+ goto repeat;
+ }
}
-#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;
kmap_prot = PAGE_KERNEL;
}
+#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
unsigned long vaddr;
unsigned long end_pfn;
};
-static void __init add_highpages_work_fn(unsigned long start_pfn,
+static int __init add_highpages_work_fn(unsigned long start_pfn,
unsigned long end_pfn, void *datax)
{
int node_pfn;
final_start_pfn = max(start_pfn, data->start_pfn);
final_end_pfn = min(end_pfn, data->end_pfn);
if (final_start_pfn >= final_end_pfn)
- return;
+ return 0;
for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
node_pfn++) {
add_one_highpage_init(page, node_pfn);
}
+ return 0;
+
}
void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
work_with_active_regions(nid, add_highpages_work_fn, &data);
}
-#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(void)
+#else
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
- totalram_pages += totalhigh_pages;
}
-#endif /* !CONFIG_NUMA */
-
-#else
-# define kmap_init() do { } while (0)
-# define permanent_kmaps_init(pgd_base) do { } while (0)
-# define set_highmem_pages_init() do { } while (0)
#endif /* CONFIG_HIGHMEM */
-pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
-EXPORT_SYMBOL(__PAGE_KERNEL);
-
-pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
-
void __init native_pagetable_setup_start(pgd_t *base)
{
unsigned long pfn, va;
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init pagetable_init(void)
+static void __init early_ioremap_page_table_range_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
- paravirt_pagetable_setup_start(pgd_base);
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __PAGE_KERNEL |= _PAGE_GLOBAL;
- __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
- }
-
- kernel_physical_mapping_init(pgd_base);
- remap_numa_kva();
-
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
- early_ioremap_clear();
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
early_ioremap_reset();
+}
- permanent_kmaps_init(pgd_base);
+static void __init pagetable_init(void)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
- paravirt_pagetable_setup_done(pgd_base);
+ permanent_kmaps_init(pgd_base);
}
#ifdef CONFIG_ACPI_SLEEP
int nx_enabled;
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
}
#endif
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
/*
- * paging_init() sets up the page tables - note that the first 8MB are
- * already mapped by head.S.
- *
- * This routines also unmaps the page at virtual kernel address 0, so
- * that we can trap those pesky NULL-reference errors in the kernel.
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
*/
-void __init paging_init(void)
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+#define MSG_HIGHMEM_TOO_BIG \
+ "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+ "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
+/*
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
+ */
+void __init lowmem_pfn_init(void)
+{
+ /* max_low_pfn is 0, we already have early_res support */
+ max_low_pfn = max_pfn;
+
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+ pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+ printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+ "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+ "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+void __init highmem_pfn_init(void)
+{
+ max_low_pfn = MAXMEM_PFN;
+
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ if (max_pfn <= MAXMEM_PFN)
+ lowmem_pfn_init();
+ else
+ highmem_pfn_init();
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ memory_present(0, 0, highend_pfn);
+ e820_register_active_regions(0, 0, highend_pfn);
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ memory_present(0, 0, max_low_pfn);
+ e820_register_active_regions(0, 0, max_low_pfn);
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+static void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] =
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
+static unsigned long __init setup_node_bootmem(int nodeid,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long bootmap)
+{
+ unsigned long bootmap_size;
+
+ if (start_pfn > max_low_pfn)
+ return bootmap;
+ if (end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+ bootmap >> PAGE_SHIFT,
+ start_pfn, end_pfn);
+ printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
+ nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
+ nodeid, bootmap, bootmap + bootmap_size);
+ free_bootmem_with_active_regions(nodeid, end_pfn);
+ early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ return bootmap + bootmap_size;
+}
+
+void __init setup_bootmem_allocator(void)
+{
+ int nodeid;
+ unsigned long bootmap_size, bootmap;
+ /*
+ * Initialize the boot-time allocator (with low memory only):
+ */
+ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+
+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ for_each_online_node(nodeid)
+ bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
+ node_end_pfn[nodeid], bootmap);
+#else
+ bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
+#endif
+
+ after_bootmem = 1;
+}
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
+{
+ unsigned long puds, pmds, ptes, tables, start;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+ if (use_gbpages) {
+ unsigned long extra;
+
+ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+ if (use_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+ extra += PMD_SIZE;
+#endif
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+ /* for fixmap */
+ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+#ifdef CONFIG_X86_32
+ start = 0x7000;
+ table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+ start = 0x8000;
+ table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+ if (table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ table_start >>= PAGE_SHIFT;
+ table_end = table_start;
+ table_top = table_start + (tables >> PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
{
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+ unsigned long pos;
+ unsigned long ret;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
+ int use_pse, use_gbpages;
+
+ printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
- pagetable_init();
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+#endif
+
+ if (use_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (use_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pos == 0)
+ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ else
+ end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+ if (end_pfn > (end >> PAGE_SHIFT))
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* big page (2M) range */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+#endif
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = pos>>PAGE_SHIFT;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
+
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+ if (!after_bootmem)
+ find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+ for (i = 0; i < nr_range; i++)
+ kernel_physical_mapping_init(
+ mr[i].start >> PAGE_SHIFT,
+ mr[i].end >> PAGE_SHIFT,
+ mr[i].page_size_mask == (1<<PG_LEVEL_2M));
+ ret = end;
+#else /* CONFIG_X86_64 */
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+ early_ioremap_page_table_range_init();
load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+ if (!after_bootmem)
+ mmu_cr4_features = read_cr4();
+#endif
+ __flush_tlb_all();
+
+ if (!after_bootmem && table_end > table_start)
+ reserve_early(table_start << PAGE_SHIFT,
+ table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start, end);
+
+ return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+ pagetable_init();
__flush_tlb_all();
kmap_init();
+
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+ sparse_init();
+ zone_sizes_init();
}
/*
int codesize, reservedpages, datasize, initsize;
int tmp;
+ pci_iommu_alloc();
+
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
-#ifdef CONFIG_HIGHMEM
- /* check that fixmap and pkmap do not overlap */
- if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
- printk(KERN_ERR
- "fixmap and kmap areas overlap - this will crash\n");
- printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
- PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
- FIXADDR_START);
- BUG();
- }
-#endif
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
);
-#if 1 /* double-sanity-check paranoia */
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
(unsigned long)&_text, (unsigned long)&_etext,
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+ /*
+ * Check boundaries twice: Some fundamental inconsistencies can
+ * be detected at build time already.
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+ BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+ BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
- BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
-#endif /* double-sanity-check paranoia */
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
- cpa_init();
save_pg_dir();
zap_low_mappings();
}
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(zone, start_pfn, nr_pages);
+ return __add_pages(nid, zone, start_pfn, nr_pages);
}
#endif
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
+#ifndef CONFIG_DYNAMIC_FTRACE
+ /* Dynamic tracing modifies the kernel text section */
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
size >> 10);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
}
#endif
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * If debugging page accesses then do not free this memory but
- * mark them not present - any buggy init-section access will
- * create a kernel page fault:
- */
- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, PAGE_ALIGN(end));
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- unsigned long addr;
-
- /*
- * We just marked the kernel text read only above, now that
- * we are going to free part of that, we need to make that
- * writeable first.
- */
- set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
-
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-#endif
-}
-
-void free_initmem(void)
-{
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{