[PATCH] add x86-64 specific support for sparsemem
[safe/jmp/linux-2.6] / arch / x86_64 / mm / numa.c
1 /* 
2  * Generic VM initialization for x86-64 NUMA setups.
3  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4  */ 
5 #include <linux/kernel.h>
6 #include <linux/mm.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
14
15 #include <asm/e820.h>
16 #include <asm/proto.h>
17 #include <asm/dma.h>
18 #include <asm/numa.h>
19 #include <asm/acpi.h>
20
21 #ifndef Dprintk
22 #define Dprintk(x...)
23 #endif
24
25 struct pglist_data *node_data[MAX_NUMNODES];
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28 int memnode_shift;
29 u8  memnodemap[NODEMAPSIZE];
30
31 unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
32 cpumask_t     node_to_cpumask[MAX_NUMNODES];
33
34 int numa_off __initdata;
35
36 int __init compute_hash_shift(struct node *nodes, int numnodes)
37 {
38         int i; 
39         int shift = 24;
40         u64 addr;
41         
42         /* When in doubt use brute force. */
43         while (shift < 48) { 
44                 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); 
45                 for (i = 0; i < numnodes; i++) {
46                         if (nodes[i].start == nodes[i].end) 
47                                 continue;
48                         for (addr = nodes[i].start; 
49                              addr < nodes[i].end; 
50                              addr += (1UL << shift)) {
51                                 if (memnodemap[addr >> shift] != 0xff && 
52                                     memnodemap[addr >> shift] != i) { 
53                                         printk(KERN_INFO 
54                                             "node %d shift %d addr %Lx conflict %d\n", 
55                                                i, shift, addr, memnodemap[addr>>shift]);
56                                         goto next; 
57                                 } 
58                                 memnodemap[addr >> shift] = i; 
59                         } 
60                 } 
61                 return shift; 
62         next:
63                 shift++; 
64         } 
65         memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); 
66         return -1; 
67 }
68
#ifdef CONFIG_SPARSEMEM
/*
 * Translate a page frame number into a node id by converting it to a
 * physical address and looking that address up with phys_to_nid().
 */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys_addr = pfn << PAGE_SHIFT;

	return phys_to_nid(phys_addr);
}
#endif
75
76 /* Initialize bootmem allocator for a node */
77 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
78
79         unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
80         unsigned long nodedata_phys;
81         const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
82
83         start = round_up(start, ZONE_ALIGN); 
84
85         printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
86
87         start_pfn = start >> PAGE_SHIFT;
88         end_pfn = end >> PAGE_SHIFT;
89
90         memory_present(nodeid, start_pfn, end_pfn);
91         nodedata_phys = find_e820_area(start, end, pgdat_size); 
92         if (nodedata_phys == -1L) 
93                 panic("Cannot find memory pgdat in node %d\n", nodeid);
94
95         Dprintk("nodedata_phys %lx\n", nodedata_phys); 
96
97         node_data[nodeid] = phys_to_virt(nodedata_phys);
98         memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
99         NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
100         NODE_DATA(nodeid)->node_start_pfn = start_pfn;
101         NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
102
103         /* Find a place for the bootmem map */
104         bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
105         bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
106         bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
107         if (bootmap_start == -1L) 
108                 panic("Not enough continuous space for bootmap on node %d", nodeid); 
109         Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
110         
111         bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
112                                          bootmap_start >> PAGE_SHIFT, 
113                                          start_pfn, end_pfn); 
114
115         e820_bootmem_free(NODE_DATA(nodeid), start, end);
116
117         reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
118         reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
119         node_set_online(nodeid);
120
121
122 /* Initialize final allocator for a zone */
123 void __init setup_node_zones(int nodeid)
124
125         unsigned long start_pfn, end_pfn; 
126         unsigned long zones[MAX_NR_ZONES];
127         unsigned long dma_end_pfn;
128
129         memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 
130
131         start_pfn = node_start_pfn(nodeid);
132         end_pfn = node_end_pfn(nodeid);
133
134         Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
135         
136         /* All nodes > 0 have a zero length zone DMA */ 
137         dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 
138         if (start_pfn < dma_end_pfn) { 
139                 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
140                 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 
141         } else { 
142                 zones[ZONE_NORMAL] = end_pfn - start_pfn; 
143         } 
144     
145         free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
146                             start_pfn, NULL); 
147
148
149 void __init numa_init_array(void)
150 {
151         int rr, i;
152         /* There are unfortunately some poorly designed mainboards around
153            that only connect memory to a single CPU. This breaks the 1:1 cpu->node
154            mapping. To avoid this fill in the mapping for all possible
155            CPUs, as the number of CPUs is not known yet. 
156            We round robin the existing nodes. */
157         rr = 0;
158         for (i = 0; i < NR_CPUS; i++) {
159                 if (cpu_to_node[i] != NUMA_NO_NODE)
160                         continue;
161                 rr = next_node(rr, node_online_map);
162                 if (rr == MAX_NUMNODES)
163                         rr = first_node(node_online_map);
164                 cpu_to_node[i] = rr;
165                 rr++; 
166         }
167
168         set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
169 }
170
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested through the "fake=" option; 0 means none. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: carve [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * The caller must only invoke this with numa_fake != 0 (it is used as a
 * divisor).  Marked __init because it reads __initdata and calls __init
 * helpers, so it must not outlive the init sections.
 *
 * Returns 0 on success, -1 if no memnodemap hash shift could be found.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: shrink sz to a power of two */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node absorbs whatever the rounding left over. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
217
218 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
219
220         int i;
221
222 #ifdef CONFIG_NUMA_EMU
223         if (numa_fake && !numa_emulation(start_pfn, end_pfn))
224                 return;
225 #endif
226
227 #ifdef CONFIG_ACPI_NUMA
228         if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
229                                           end_pfn << PAGE_SHIFT))
230                 return;
231 #endif
232
233 #ifdef CONFIG_K8_NUMA
234         if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
235                 return;
236 #endif
237         printk(KERN_INFO "%s\n",
238                numa_off ? "NUMA turned off" : "No NUMA configuration found");
239
240         printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
241                start_pfn << PAGE_SHIFT,
242                end_pfn << PAGE_SHIFT); 
243                 /* setup dummy node covering all memory */ 
244         memnode_shift = 63; 
245         memnodemap[0] = 0;
246         nodes_clear(node_online_map);
247         node_set_online(0);
248         for (i = 0; i < NR_CPUS; i++)
249                 cpu_to_node[i] = 0;
250         node_to_cpumask[0] = cpumask_of_cpu(0);
251         setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
252 }
253
254 __init void numa_add_cpu(int cpu)
255 {
256         /* BP is initialized elsewhere */
257         if (cpu) 
258                 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
259
260
261 unsigned long __init numa_free_all_bootmem(void) 
262
263         int i;
264         unsigned long pages = 0;
265         for_each_online_node(i) {
266                 pages += free_all_bootmem_node(NODE_DATA(i));
267         }
268         return pages;
269
270
271 void __init paging_init(void)
272
273         int i;
274         for_each_online_node(i) {
275                 setup_node_zones(i); 
276         }
277
278
279 /* [numa=off] */
280 __init int numa_setup(char *opt) 
281
282         if (!strncmp(opt,"off",3))
283                 numa_off = 1;
284 #ifdef CONFIG_NUMA_EMU
285         if(!strncmp(opt, "fake=", 5)) {
286                 numa_fake = simple_strtoul(opt+5,NULL,0); ;
287                 if (numa_fake >= MAX_NUMNODES)
288                         numa_fake = MAX_NUMNODES;
289         }
290 #endif
291 #ifdef CONFIG_ACPI_NUMA
292         if (!strncmp(opt,"noacpi",6))
293                 acpi_numa = -1;
294 #endif
295         return 1;
296
297
/* NUMA lookup tables exported for use by modules. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);