x86: have set_memory_array_{uc,wb} coalesce memtypes, fix
arch/x86/mm/pageattr.c
1 /*
2  * Copyright 2002 Andi Kleen, SuSE Labs.
3  * Thanks to Ben LaHaise for precious feedback.
4  */
5 #include <linux/highmem.h>
6 #include <linux/bootmem.h>
7 #include <linux/module.h>
8 #include <linux/sched.h>
9 #include <linux/slab.h>
10 #include <linux/mm.h>
11 #include <linux/interrupt.h>
12 #include <linux/seq_file.h>
13 #include <linux/debugfs.h>
14
15 #include <asm/e820.h>
16 #include <asm/processor.h>
17 #include <asm/tlbflush.h>
18 #include <asm/sections.h>
19 #include <asm/uaccess.h>
20 #include <asm/pgalloc.h>
21 #include <asm/proto.h>
22 #include <asm/pat.h>
23
24 /*
25  * The current flushing context - we pass it instead of 5 arguments:
26  */
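/*
 * vaddr points either at a single virtual address or, when CPA_ARRAY is
 * set, at an array of numpages addresses indexed by curpage. pfn records
 * the physical frame of the page just processed; cpa_process_alias() uses
 * it to fix up the direct and high kernel mappings.
 */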
27 struct cpa_data {
28         unsigned long   *vaddr;
29         pgprot_t        mask_set;
30         pgprot_t        mask_clr;
31         int             numpages;
32         int             flags;
33         unsigned long   pfn;
34         unsigned        force_split : 1;
35         int             curpage;
36 };
37
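/*
 * CPA_FLUSHTLB: at least one PTE was actually changed, so the TLBs (and
 * possibly the caches) must be flushed. CPA_ARRAY: cpa->vaddr points to
 * an array of addresses rather than to a single address.
 */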
38 #define CPA_FLUSHTLB 1
39 #define CPA_ARRAY 2
40
41 #ifdef CONFIG_PROC_FS
42 static unsigned long direct_pages_count[PG_LEVEL_NUM];
43
44 void update_page_count(int level, unsigned long pages)
45 {
46         unsigned long flags;
47
48         /* Protect against CPA */
49         spin_lock_irqsave(&pgd_lock, flags);
50         direct_pages_count[level] += pages;
51         spin_unlock_irqrestore(&pgd_lock, flags);
52 }
53
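/*
 * Called with pgd_lock held when a large page is split: move the
 * accounting of one large page to PTRS_PER_PTE pages of the next
 * smaller level.
 */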
54 static void split_page_count(int level)
55 {
56         direct_pages_count[level]--;
57         direct_pages_count[level - 1] += PTRS_PER_PTE;
58 }
59
60 int arch_report_meminfo(char *page)
61 {
62         int n = sprintf(page, "DirectMap4k:  %8lu kB\n",
63                         direct_pages_count[PG_LEVEL_4K] << 2);
64 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
65         n += sprintf(page + n, "DirectMap2M:  %8lu kB\n",
66                         direct_pages_count[PG_LEVEL_2M] << 11);
67 #else
68         n += sprintf(page + n, "DirectMap4M:  %8lu kB\n",
69                         direct_pages_count[PG_LEVEL_2M] << 12);
70 #endif
71 #ifdef CONFIG_X86_64
72         if (direct_gbpages)
73                 n += sprintf(page + n, "DirectMap1G:  %8lu kB\n",
74                         direct_pages_count[PG_LEVEL_1G] << 20);
75 #endif
76         return n;
77 }
78 #else
79 static inline void split_page_count(int level) { }
80 #endif
81
82 #ifdef CONFIG_X86_64
83
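/*
 * PFN range covered by the kernel image in the high __START_KERNEL_map
 * mapping; cpa_process_alias() uses it to decide whether the high alias
 * needs to be updated as well.
 */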
84 static inline unsigned long highmap_start_pfn(void)
85 {
86         return __pa(_text) >> PAGE_SHIFT;
87 }
88
89 static inline unsigned long highmap_end_pfn(void)
90 {
91         return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
92 }
93
94 #endif
95
96 #ifdef CONFIG_DEBUG_PAGEALLOC
97 # define debug_pagealloc 1
98 #else
99 # define debug_pagealloc 0
100 #endif
101
102 static inline int
103 within(unsigned long addr, unsigned long start, unsigned long end)
104 {
105         return addr >= start && addr < end;
106 }
107
108 /*
109  * Flushing functions
110  */
111
112 /**
113  * clflush_cache_range - flush a cache range with clflush
114  * @addr:       virtual start address
115  * @size:       number of bytes to flush
116  *
117  * clflush is an unordered instruction which needs fencing with mfence
118  * to avoid ordering issues.
119  */
120 void clflush_cache_range(void *vaddr, unsigned int size)
121 {
122         void *vend = vaddr + size - 1;
123
124         mb();
125
126         for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
127                 clflush(vaddr);
128         /*
129          * Flush any possible final partial cacheline:
130          */
131         clflush(vend);
132
133         mb();
134 }
135
136 static void __cpa_flush_all(void *arg)
137 {
138         unsigned long cache = (unsigned long)arg;
139
140         /*
141          * Flush everything to work around an erratum in early Athlons
142          * regarding large page flushing.
143          */
144         __flush_tlb_all();
145
146         if (cache && boot_cpu_data.x86_model >= 4)
147                 wbinvd();
148 }
149
150 static void cpa_flush_all(unsigned long cache)
151 {
152         BUG_ON(irqs_disabled());
153
154         on_each_cpu(__cpa_flush_all, (void *) cache, 1);
155 }
156
157 static void __cpa_flush_range(void *arg)
158 {
159         /*
160          * We could optimize this further and do individual per-page
161          * TLB invalidates for a low number of pages. Caveat: we must
162          * flush the high aliases on 64bit as well.
163          */
164         __flush_tlb_all();
165 }
166
167 static void cpa_flush_range(unsigned long start, int numpages, int cache)
168 {
169         unsigned int i, level;
170         unsigned long addr;
171
172         BUG_ON(irqs_disabled());
173         WARN_ON(PAGE_ALIGN(start) != start);
174
175         on_each_cpu(__cpa_flush_range, NULL, 1);
176
177         if (!cache)
178                 return;
179
180         /*
181          * We only need to flush on one CPU:
182          * clflush is a MESI-coherent instruction that
183          * will cause all other CPUs to flush the same
184          * cachelines:
185          */
186         for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
187                 pte_t *pte = lookup_address(addr, &level);
188
189                 /*
190                  * Only flush present addresses:
191                  */
192                 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
193                         clflush_cache_range((void *) addr, PAGE_SIZE);
194         }
195 }
196
197 static void cpa_flush_array(unsigned long *start, int numpages, int cache)
198 {
199         unsigned int i, level;
200         unsigned long *addr;
201
202         BUG_ON(irqs_disabled());
203
204         on_each_cpu(__cpa_flush_range, NULL, 1);
205
206         if (!cache)
207                 return;
208
209         /* Above the 4M threshold (1024 pages) a full wbinvd is cheaper than clflushing each page: */
210         if (numpages >= 1024) {
211                 if (boot_cpu_data.x86_model >= 4)
212                         wbinvd();
213                 return;
214         }
215         /*
216          * We only need to flush on one CPU:
217          * clflush is a MESI-coherent instruction that
218          * will cause all other CPUs to flush the same
219          * cachelines:
220          */
221         for (i = 0, addr = start; i < numpages; i++, addr++) {
222                 pte_t *pte = lookup_address(*addr, &level);
223
224                 /*
225                  * Only flush present addresses:
226                  */
227                 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
228                         clflush_cache_range((void *) *addr, PAGE_SIZE);
229         }
230 }
231
232 /*
233  * Certain areas of memory on x86 require very specific protection flags,
234  * for example the BIOS area or kernel text. Callers don't always get this
235  * right (ioremap() on BIOS memory, for instance, is not uncommon), so this function
236  * checks and fixes these known static required protection bits.
237  */
238 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
239                                    unsigned long pfn)
240 {
241         pgprot_t forbidden = __pgprot(0);
242
243         /*
244          * The BIOS area between 640KB and 1MB needs to be executable for
245          * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
246          */
247         if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
248                 pgprot_val(forbidden) |= _PAGE_NX;
249
250         /*
251          * The kernel text needs to be executable for obvious reasons.
252          * This does not cover __inittext since that is gone later on. On
253          * 64bit we do not enforce !NX on the low mapping.
254          */
255         if (within(address, (unsigned long)_text, (unsigned long)_etext))
256                 pgprot_val(forbidden) |= _PAGE_NX;
257
258         /*
259          * The .rodata section needs to be read-only. Using the pfn
260          * catches all aliases.
261          */
262         if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
263                    __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
264                 pgprot_val(forbidden) |= _PAGE_RW;
265
266         prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
267
268         return prot;
269 }
270
271 /*
272  * Lookup the page table entry for a virtual address. Return a pointer
273  * to the entry and the level of the mapping.
274  *
275  * Note: we return the pud or pmd either when the entry is marked large
276  * or when the present bit is not set. Otherwise we would return a
277  * pointer to a nonexistent mapping.
278  */
279 pte_t *lookup_address(unsigned long address, unsigned int *level)
280 {
281         pgd_t *pgd = pgd_offset_k(address);
282         pud_t *pud;
283         pmd_t *pmd;
284
285         *level = PG_LEVEL_NONE;
286
287         if (pgd_none(*pgd))
288                 return NULL;
289
290         pud = pud_offset(pgd, address);
291         if (pud_none(*pud))
292                 return NULL;
293
294         *level = PG_LEVEL_1G;
295         if (pud_large(*pud) || !pud_present(*pud))
296                 return (pte_t *)pud;
297
298         pmd = pmd_offset(pud, address);
299         if (pmd_none(*pmd))
300                 return NULL;
301
302         *level = PG_LEVEL_2M;
303         if (pmd_large(*pmd) || !pmd_present(*pmd))
304                 return (pte_t *)pmd;
305
306         *level = PG_LEVEL_4K;
307
308         return pte_offset_kernel(pmd, address);
309 }
310 EXPORT_SYMBOL_GPL(lookup_address);
311
312 /*
313  * Set the new pmd in all the pgds we know about:
314  */
315 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
316 {
317         /* change init_mm */
318         set_pte_atomic(kpte, pte);
319 #ifdef CONFIG_X86_32
320         if (!SHARED_KERNEL_PMD) {
321                 struct page *page;
322
323                 list_for_each_entry(page, &pgd_list, lru) {
324                         pgd_t *pgd;
325                         pud_t *pud;
326                         pmd_t *pmd;
327
328                         pgd = (pgd_t *)page_address(page) + pgd_index(address);
329                         pud = pud_offset(pgd, address);
330                         pmd = pmd_offset(pud, address);
331                         set_pte_atomic((pte_t *)pmd, pte);
332                 }
333         }
334 #endif
335 }
336
337 static int
338 try_preserve_large_page(pte_t *kpte, unsigned long address,
339                         struct cpa_data *cpa)
340 {
341         unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
342         pte_t new_pte, old_pte, *tmp;
343         pgprot_t old_prot, new_prot;
344         int i, do_split = 1;
345         unsigned int level;
346
347         if (cpa->force_split)
348                 return 1;
349
350         spin_lock_irqsave(&pgd_lock, flags);
351         /*
352          * Check for races: another CPU might have split this page
353          * up already:
354          */
355         tmp = lookup_address(address, &level);
356         if (tmp != kpte)
357                 goto out_unlock;
358
359         switch (level) {
360         case PG_LEVEL_2M:
361                 psize = PMD_PAGE_SIZE;
362                 pmask = PMD_PAGE_MASK;
363                 break;
364 #ifdef CONFIG_X86_64
365         case PG_LEVEL_1G:
366                 psize = PUD_PAGE_SIZE;
367                 pmask = PUD_PAGE_MASK;
368                 break;
369 #endif
370         default:
371                 do_split = -EINVAL;
372                 goto out_unlock;
373         }
374
375         /*
376          * Calculate the number of pages that fit into this large
377          * page, starting at address:
378          */
379         nextpage_addr = (address + psize) & pmask;
380         numpages = (nextpage_addr - address) >> PAGE_SHIFT;
381         if (numpages < cpa->numpages)
382                 cpa->numpages = numpages;
383
384         /*
385          * We are safe now. Check whether the new pgprot is the same:
386          */
387         old_pte = *kpte;
388         old_prot = new_prot = pte_pgprot(old_pte);
389
390         pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
391         pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
392
393         /*
394          * old_pte points to the large page base address. So we need
395          * to add the offset of the virtual address:
396          */
397         pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
398         cpa->pfn = pfn;
399
400         new_prot = static_protections(new_prot, address, pfn);
401
402         /*
403          * We need to check the full range, i.e. whether
404          * static_protections() requires a different pgprot for one of
405          * the pages in the range we try to preserve:
406          */
407         addr = address + PAGE_SIZE;
408         pfn++;
409         for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
410                 pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
411
412                 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
413                         goto out_unlock;
414         }
415
416         /*
417          * If there are no changes, return. cpa->numpages has been updated
418          * above:
419          */
420         if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
421                 do_split = 0;
422                 goto out_unlock;
423         }
424
425         /*
426          * We need to change the attributes. Check whether we can
427          * change the large page in one go. We request a split when
428          * the address is not aligned or the number of pages is
429          * smaller than the number of pages in the large page. Note
430          * that we limited the number of possible pages already to
431          * the number of pages in the large page.
432          */
433         if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
434                 /*
435                  * The address is aligned and the number of pages
436                  * covers the full page.
437                  */
438                 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
439                 __set_pmd_pte(kpte, address, new_pte);
440                 cpa->flags |= CPA_FLUSHTLB;
441                 do_split = 0;
442         }
443
444 out_unlock:
445         spin_unlock_irqrestore(&pgd_lock, flags);
446
447         return do_split;
448 }
449
450 static LIST_HEAD(page_pool);
451 static unsigned long pool_size, pool_pages, pool_low;
452 static unsigned long pool_used, pool_failed;
453
454 static void cpa_fill_pool(struct page **ret)
455 {
456         gfp_t gfp = GFP_KERNEL;
457         unsigned long flags;
458         struct page *p;
459
460         /*
461          * Avoid recursion (on debug-pagealloc) and also signal
462          * our priority to get to these pagetables:
463          */
464         if (current->flags & PF_MEMALLOC)
465                 return;
466         current->flags |= PF_MEMALLOC;
467
468         /*
469          * Allocate atomically from atomic contexts:
470          */
471         if (in_atomic() || irqs_disabled() || debug_pagealloc)
472                 gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
473
474         while (pool_pages < pool_size || (ret && !*ret)) {
475                 p = alloc_pages(gfp, 0);
476                 if (!p) {
477                         pool_failed++;
478                         break;
479                 }
480                 /*
481                  * If the call site needs a page right now, provide it:
482                  */
483                 if (ret && !*ret) {
484                         *ret = p;
485                         continue;
486                 }
487                 spin_lock_irqsave(&pgd_lock, flags);
488                 list_add(&p->lru, &page_pool);
489                 pool_pages++;
490                 spin_unlock_irqrestore(&pgd_lock, flags);
491         }
492
493         current->flags &= ~PF_MEMALLOC;
494 }
495
496 #define SHIFT_MB                (20 - PAGE_SHIFT)
497 #define ROUND_MB_GB             ((1 << 10) - 1)
498 #define SHIFT_MB_GB             10
499 #define POOL_PAGES_PER_GB       16
500
501 void __init cpa_init(void)
502 {
503         struct sysinfo si;
504         unsigned long gb;
505
506         si_meminfo(&si);
507         /*
508          * Calculate the number of pool pages:
509          *
510          * Convert totalram (nr of pages) to MiB and round to the next
511          * GiB. Shift MiB to GiB and multiply the result by
512          * POOL_PAGES_PER_GB:
513          */
514         if (debug_pagealloc) {
515                 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
516                 pool_size = POOL_PAGES_PER_GB * gb;
517         } else {
518                 pool_size = 1;
519         }
520         pool_low = pool_size;
521
522         cpa_fill_pool(NULL);
523         printk(KERN_DEBUG
524                "CPA: page pool initialized %lu of %lu pages preallocated\n",
525                pool_pages, pool_size);
526 }
527
528 static int split_large_page(pte_t *kpte, unsigned long address)
529 {
530         unsigned long flags, pfn, pfninc = 1;
531         unsigned int i, level;
532         pte_t *pbase, *tmp;
533         pgprot_t ref_prot;
534         struct page *base;
535
536         /*
537          * Get a page from the pool. The pool list is protected by the
538          * pgd_lock, which we have to take anyway for the split
539          * operation:
540          */
541         spin_lock_irqsave(&pgd_lock, flags);
542         if (list_empty(&page_pool)) {
543                 spin_unlock_irqrestore(&pgd_lock, flags);
544                 base = NULL;
545                 cpa_fill_pool(&base);
546                 if (!base)
547                         return -ENOMEM;
548                 spin_lock_irqsave(&pgd_lock, flags);
549         } else {
550                 base = list_first_entry(&page_pool, struct page, lru);
551                 list_del(&base->lru);
552                 pool_pages--;
553
554                 if (pool_pages < pool_low)
555                         pool_low = pool_pages;
556         }
557
558         /*
559          * Check for races: another CPU might have split this page
560          * up for us already:
561          */
562         tmp = lookup_address(address, &level);
563         if (tmp != kpte)
564                 goto out_unlock;
565
566         pbase = (pte_t *)page_address(base);
567         paravirt_alloc_pte(&init_mm, page_to_pfn(base));
568         ref_prot = pte_pgprot(pte_clrhuge(*kpte));
569
570 #ifdef CONFIG_X86_64
571         if (level == PG_LEVEL_1G) {
572                 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
573                 pgprot_val(ref_prot) |= _PAGE_PSE;
574         }
575 #endif
576
577         /*
578          * Get the target pfn from the original entry:
579          */
580         pfn = pte_pfn(*kpte);
581         for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
582                 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
583
584         if (address >= (unsigned long)__va(0) &&
585                 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
586                 split_page_count(level);
587
588 #ifdef CONFIG_X86_64
589         if (address >= (unsigned long)__va(1UL<<32) &&
590                 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
591                 split_page_count(level);
592 #endif
593
594         /*
595          * Install the new, split up pagetable. Important details here:
596          *
597          * On Intel the NX bit of all levels must be cleared to make a
598          * page executable. (See section 4.13.2 of the Intel 64 and IA-32
599          * Architectures Software Developer's Manual.)
600          *
601          * Mark the entry present. The current mapping might be
602          * set to not present, which we preserved above.
603          */
604         ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
605         pgprot_val(ref_prot) |= _PAGE_PRESENT;
606         __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
607         base = NULL;
608
609 out_unlock:
610         /*
611          * If we dropped out via the lookup_address check under
612          * pgd_lock then stick the page back into the pool:
613          */
614         if (base) {
615                 list_add(&base->lru, &page_pool);
616                 pool_pages++;
617         } else
618                 pool_used++;
619         spin_unlock_irqrestore(&pgd_lock, flags);
620
621         return 0;
622 }
623
624 static int __change_page_attr(struct cpa_data *cpa, int primary)
625 {
626         unsigned long address;
627         int do_split, err;
628         unsigned int level;
629         pte_t *kpte, old_pte;
630
631         if (cpa->flags & CPA_ARRAY)
632                 address = cpa->vaddr[cpa->curpage];
633         else
634                 address = *cpa->vaddr;
635
636 repeat:
637         kpte = lookup_address(address, &level);
638         if (!kpte)
639                 return 0;
640
641         old_pte = *kpte;
642         if (!pte_val(old_pte)) {
643                 if (!primary)
644                         return 0;
645                 WARN(1, KERN_WARNING "CPA: called for zero pte. "
646                        "vaddr = %lx cpa->vaddr = %lx\n", address,
647                        *cpa->vaddr);
648                 return -EINVAL;
649         }
650
651         if (level == PG_LEVEL_4K) {
652                 pte_t new_pte;
653                 pgprot_t new_prot = pte_pgprot(old_pte);
654                 unsigned long pfn = pte_pfn(old_pte);
655
656                 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
657                 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
658
659                 new_prot = static_protections(new_prot, address, pfn);
660
661                 /*
662          * We need to keep the pfn from the existing PTE;
663          * after all we're only going to change its attributes,
664          * not the memory it points to.
665                  */
666                 new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
667                 cpa->pfn = pfn;
668                 /*
669                  * Do we really change anything?
670                  */
671                 if (pte_val(old_pte) != pte_val(new_pte)) {
672                         set_pte_atomic(kpte, new_pte);
673                         cpa->flags |= CPA_FLUSHTLB;
674                 }
675                 cpa->numpages = 1;
676                 return 0;
677         }
678
679         /*
680          * Check, whether we can keep the large page intact
681          * and just change the pte:
682          */
683         do_split = try_preserve_large_page(kpte, address, cpa);
684         /*
685          * When the range fits into the existing large page,
686          * return. cpa->numpages and the CPA_FLUSHTLB flag have been
687          * updated in try_preserve_large_page():
688          */
689         if (do_split <= 0)
690                 return do_split;
691
692         /*
693          * We have to split the large page:
694          */
695         err = split_large_page(kpte, address);
696         if (!err) {
697                 cpa->flags |= CPA_FLUSHTLB;
698                 goto repeat;
699         }
700
701         return err;
702 }
703
704 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
705
706 static int cpa_process_alias(struct cpa_data *cpa)
707 {
708         struct cpa_data alias_cpa;
709         int ret = 0;
710         unsigned long temp_cpa_vaddr, vaddr;
711
712         if (cpa->pfn >= max_pfn_mapped)
713                 return 0;
714
715 #ifdef CONFIG_X86_64
716         if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
717                 return 0;
718 #endif
719         /*
720          * No need to redo when the primary call already touched the
721          * direct mapping:
722          */
723         if (cpa->flags & CPA_ARRAY)
724                 vaddr = cpa->vaddr[cpa->curpage];
725         else
726                 vaddr = *cpa->vaddr;
727
728         if (!(within(vaddr, PAGE_OFFSET,
729                     PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
730 #ifdef CONFIG_X86_64
731                 || within(vaddr, PAGE_OFFSET + (1UL<<32),
732                     PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
733 #endif
734         )) {
735
736                 alias_cpa = *cpa;
737                 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
738                 alias_cpa.vaddr = &temp_cpa_vaddr;
739                 alias_cpa.flags &= ~CPA_ARRAY;
740
741
742                 ret = __change_page_attr_set_clr(&alias_cpa, 0);
743         }
744
745 #ifdef CONFIG_X86_64
746         if (ret)
747                 return ret;
748         /*
749          * No need to redo when the primary call already touched the
750          * high mapping:
751          */
752         if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
753                 return 0;
754
755         /*
756          * If the physical address is inside the kernel map, we need
757          * to touch the high mapped kernel as well:
758          */
759         if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
760                 return 0;
761
762         alias_cpa = *cpa;
763         temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
764         alias_cpa.vaddr = &temp_cpa_vaddr;
765         alias_cpa.flags &= ~CPA_ARRAY;
766
767         /*
768          * The high mapping range is imprecise, so ignore the return value.
769          */
770         __change_page_attr_set_clr(&alias_cpa, 0);
771 #endif
772         return ret;
773 }
774
775 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
776 {
777         int ret, numpages = cpa->numpages;
778
779         while (numpages) {
780                 /*
781                  * Store the remaining nr of pages for the large page
782                  * preservation check.
783                  */
784                 cpa->numpages = numpages;
785                 /* for array changes, we can't use large page */
786                 if (cpa->flags & CPA_ARRAY)
787                         cpa->numpages = 1;
788
789                 ret = __change_page_attr(cpa, checkalias);
790                 if (ret)
791                         return ret;
792
793                 if (checkalias) {
794                         ret = cpa_process_alias(cpa);
795                         if (ret)
796                                 return ret;
797                 }
798
799                 /*
800                  * Adjust the number of pages with the result of the
801                  * CPA operation. Either a large page has been
802                  * preserved or a single page update happened.
803                  */
804                 BUG_ON(cpa->numpages > numpages);
805                 numpages -= cpa->numpages;
806                 if (cpa->flags & CPA_ARRAY)
807                         cpa->curpage++;
808                 else
809                         *cpa->vaddr += cpa->numpages * PAGE_SIZE;
810
811         }
812         return 0;
813 }
814
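/*
 * True when the pgprot touches any of the PAT/cache-control bits, i.e.
 * when the change may require a cache flush in addition to the TLB flush.
 */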
815 static inline int cache_attr(pgprot_t attr)
816 {
817         return pgprot_val(attr) &
818                 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
819 }
820
821 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
822                                     pgprot_t mask_set, pgprot_t mask_clr,
823                                     int force_split, int array)
824 {
825         struct cpa_data cpa;
826         int ret, cache, checkalias;
827
828         /*
829          * Check whether we are asked to change an unsupported
830          * feature:
831          */
832         mask_set = canon_pgprot(mask_set);
833         mask_clr = canon_pgprot(mask_clr);
834         if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
835                 return 0;
836
837         /* Ensure we are PAGE_SIZE aligned */
838         if (!array) {
839                 if (*addr & ~PAGE_MASK) {
840                         *addr &= PAGE_MASK;
841                         /*
842                          * People should not be passing in unaligned addresses:
843                          */
844                         WARN_ON_ONCE(1);
845                 }
846         } else {
847                 int i;
848                 for (i = 0; i < numpages; i++) {
849                         if (addr[i] & ~PAGE_MASK) {
850                                 addr[i] &= PAGE_MASK;
851                                 WARN_ON_ONCE(1);
852                         }
853                 }
854         }
855
856         /* Must avoid aliasing mappings in the highmem code */
857         kmap_flush_unused();
858
859         cpa.vaddr = addr;
860         cpa.numpages = numpages;
861         cpa.mask_set = mask_set;
862         cpa.mask_clr = mask_clr;
863         cpa.flags = 0;
864         cpa.curpage = 0;
865         cpa.force_split = force_split;
866
867         if (array)
868                 cpa.flags |= CPA_ARRAY;
869
870         /* No alias checking for _NX bit modifications */
871         checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
872
873         ret = __change_page_attr_set_clr(&cpa, checkalias);
874
875         /*
876          * Check whether we really changed something:
877          */
878         if (!(cpa.flags & CPA_FLUSHTLB))
879                 goto out;
880
881         /*
882          * No need to flush when we did not set any of the caching
883          * attributes:
884          */
885         cache = cache_attr(mask_set);
886
887         /*
888          * On success, and when the CPU supports it, we use clflush to
889          * avoid the wbinvd. If the CPU does not support clflush, or in
890          * the error case, we fall back to cpa_flush_all() (which uses
891          * wbinvd):
892          */
893         if (!ret && cpu_has_clflush) {
894                 if (cpa.flags & CPA_ARRAY)
895                         cpa_flush_array(addr, numpages, cache);
896                 else
897                         cpa_flush_range(*addr, numpages, cache);
898         } else
899                 cpa_flush_all(cache);
900
901 out:
902         cpa_fill_pool(NULL);
903
904         return ret;
905 }
906
907 static inline int change_page_attr_set(unsigned long *addr, int numpages,
908                                        pgprot_t mask, int array)
909 {
910         return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
911                 array);
912 }
913
914 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
915                                          pgprot_t mask, int array)
916 {
917         return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
918                 array);
919 }
920
921 int _set_memory_uc(unsigned long addr, int numpages)
922 {
923         /*
924          * For now UC MINUS; see the comments in ioremap_nocache().
925          */
926         return change_page_attr_set(&addr, numpages,
927                                     __pgprot(_PAGE_CACHE_UC_MINUS), 0);
928 }
929
930 int set_memory_uc(unsigned long addr, int numpages)
931 {
932         /*
933          * For now UC MINUS; see the comments in ioremap_nocache().
934          */
935         if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
936                             _PAGE_CACHE_UC_MINUS, NULL))
937                 return -EINVAL;
938
939         return _set_memory_uc(addr, numpages);
940 }
941 EXPORT_SYMBOL(set_memory_uc);
942
943 int set_memory_array_uc(unsigned long *addr, int addrinarray)
944 {
945         unsigned long start;
946         unsigned long end;
947         int i;
948         /*
949          * For now UC MINUS; see the comments in ioremap_nocache().
950          */
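        /*
         * Reserve the memtype for each run of physically contiguous
         * pages in one go, so that PAT tracks one coalesced range
         * instead of addrinarray single-page entries:
         */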
951         for (i = 0; i < addrinarray; i++) {
952                 start = __pa(addr[i]);
953                 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
954                         if (end != __pa(addr[i + 1]))
955                                 break;
956                         i++;
957                 }
958                 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
959                         goto out;
960         }
961
962         return change_page_attr_set(addr, addrinarray,
963                                     __pgprot(_PAGE_CACHE_UC_MINUS), 1);
964 out:
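        /*
         * Roll back: free every memtype reserved above, walking the same
         * coalesced ranges; 'start' marks the range whose reservation
         * failed and which therefore must not be freed:
         */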
965         for (i = 0; i < addrinarray; i++) {
966                 unsigned long tmp = __pa(addr[i]);
967
968                 if (tmp == start)
969                         break;
970                 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
971                         if (end != __pa(addr[i + 1]))
972                                 break;
973                         i++;
974                 }
975                 free_memtype(tmp, end);
976         }
977         return -EINVAL;
978 }
979 EXPORT_SYMBOL(set_memory_array_uc);
980
981 int _set_memory_wc(unsigned long addr, int numpages)
982 {
983         return change_page_attr_set(&addr, numpages,
984                                     __pgprot(_PAGE_CACHE_WC), 0);
985 }
986
987 int set_memory_wc(unsigned long addr, int numpages)
988 {
989         if (!pat_enabled)
990                 return set_memory_uc(addr, numpages);
991
992         if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
993                 _PAGE_CACHE_WC, NULL))
994                 return -EINVAL;
995
996         return _set_memory_wc(addr, numpages);
997 }
998 EXPORT_SYMBOL(set_memory_wc);
999
1000 int _set_memory_wb(unsigned long addr, int numpages)
1001 {
1002         return change_page_attr_clear(&addr, numpages,
1003                                       __pgprot(_PAGE_CACHE_MASK), 0);
1004 }
1005
1006 int set_memory_wb(unsigned long addr, int numpages)
1007 {
1008         free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1009
1010         return _set_memory_wb(addr, numpages);
1011 }
1012 EXPORT_SYMBOL(set_memory_wb);
1013
1014 int set_memory_array_wb(unsigned long *addr, int addrinarray)
1015 {
1016         int i;
1017
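        /*
         * Free the memtype reservations in the same coalesced, physically
         * contiguous ranges in which set_memory_array_uc() reserved them:
         */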
1018         for (i = 0; i < addrinarray; i++) {
1019                 unsigned long start = __pa(addr[i]);
1020                 unsigned long end;
1021
1022                 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
1023                         if (end != __pa(addr[i + 1]))
1024                                 break;
1025                         i++;
1026                 }
1027                 free_memtype(start, end);
1028         }
1029         return change_page_attr_clear(addr, addrinarray,
1030                                       __pgprot(_PAGE_CACHE_MASK), 1);
1031 }
1032 EXPORT_SYMBOL(set_memory_array_wb);
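/*
 * Usage sketch (illustrative only, not part of this file): a caller that
 * owns an array of page-aligned kernel virtual addresses - the names
 * my_addrs[] and MY_NPAGES below are hypothetical - could batch the
 * attribute changes like this:
 *
 *	if (set_memory_array_uc(my_addrs, MY_NPAGES))
 *		goto fail;		(memtype reservations were rolled back)
 *	... use the now-uncached pages ...
 *	set_memory_array_wb(my_addrs, MY_NPAGES);	(restores write-back)
 */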
1033
1034 int set_memory_x(unsigned long addr, int numpages)
1035 {
1036         return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1037 }
1038 EXPORT_SYMBOL(set_memory_x);
1039
1040 int set_memory_nx(unsigned long addr, int numpages)
1041 {
1042         return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1043 }
1044 EXPORT_SYMBOL(set_memory_nx);
1045
1046 int set_memory_ro(unsigned long addr, int numpages)
1047 {
1048         return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1049 }
1050
1051 int set_memory_rw(unsigned long addr, int numpages)
1052 {
1053         return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1054 }
1055
1056 int set_memory_np(unsigned long addr, int numpages)
1057 {
1058         return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1059 }
1060
1061 int set_memory_4k(unsigned long addr, int numpages)
1062 {
1063         return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1064                                         __pgprot(0), 1, 0);
1065 }
1066
1067 int set_pages_uc(struct page *page, int numpages)
1068 {
1069         unsigned long addr = (unsigned long)page_address(page);
1070
1071         return set_memory_uc(addr, numpages);
1072 }
1073 EXPORT_SYMBOL(set_pages_uc);
1074
1075 int set_pages_wb(struct page *page, int numpages)
1076 {
1077         unsigned long addr = (unsigned long)page_address(page);
1078
1079         return set_memory_wb(addr, numpages);
1080 }
1081 EXPORT_SYMBOL(set_pages_wb);
1082
1083 int set_pages_x(struct page *page, int numpages)
1084 {
1085         unsigned long addr = (unsigned long)page_address(page);
1086
1087         return set_memory_x(addr, numpages);
1088 }
1089 EXPORT_SYMBOL(set_pages_x);
1090
1091 int set_pages_nx(struct page *page, int numpages)
1092 {
1093         unsigned long addr = (unsigned long)page_address(page);
1094
1095         return set_memory_nx(addr, numpages);
1096 }
1097 EXPORT_SYMBOL(set_pages_nx);
1098
1099 int set_pages_ro(struct page *page, int numpages)
1100 {
1101         unsigned long addr = (unsigned long)page_address(page);
1102
1103         return set_memory_ro(addr, numpages);
1104 }
1105
1106 int set_pages_rw(struct page *page, int numpages)
1107 {
1108         unsigned long addr = (unsigned long)page_address(page);
1109
1110         return set_memory_rw(addr, numpages);
1111 }
1112
1113 #ifdef CONFIG_DEBUG_PAGEALLOC
1114
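/*
 * DEBUG_PAGEALLOC helpers: set or clear _PAGE_PRESENT (and _PAGE_RW) on
 * the mappings of pages being allocated or freed, so that stray accesses
 * to freed pages fault immediately.
 */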
1115 static int __set_pages_p(struct page *page, int numpages)
1116 {
1117         unsigned long tempaddr = (unsigned long) page_address(page);
1118         struct cpa_data cpa = { .vaddr = &tempaddr,
1119                                 .numpages = numpages,
1120                                 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1121                                 .mask_clr = __pgprot(0),
1122                                 .flags = 0};
1123
1124         return __change_page_attr_set_clr(&cpa, 1);
1125 }
1126
1127 static int __set_pages_np(struct page *page, int numpages)
1128 {
1129         unsigned long tempaddr = (unsigned long) page_address(page);
1130         struct cpa_data cpa = { .vaddr = &tempaddr,
1131                                 .numpages = numpages,
1132                                 .mask_set = __pgprot(0),
1133                                 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1134                                 .flags = 0};
1135
1136         return __change_page_attr_set_clr(&cpa, 1);
1137 }
1138
1139 void kernel_map_pages(struct page *page, int numpages, int enable)
1140 {
1141         if (PageHighMem(page))
1142                 return;
1143         if (!enable) {
1144                 debug_check_no_locks_freed(page_address(page),
1145                                            numpages * PAGE_SIZE);
1146         }
1147
1148         /*
1149          * If the page allocator is not up yet, do not call c_p_a():
1150          */
1151         if (!debug_pagealloc_enabled)
1152                 return;
1153
1154         /*
1155          * The return value is ignored; a failure here is harmless.
1156          * Large pages are kept enabled at boot time, and are
1157          * split up quickly with DEBUG_PAGEALLOC. If a splitup
1158          * fails here (due to temporary memory shortage) no damage
1159          * is done because we just keep the large page intact up
1160          * to the next attempt when it will likely be split up:
1161          */
1162         if (enable)
1163                 __set_pages_p(page, numpages);
1164         else
1165                 __set_pages_np(page, numpages);
1166
1167         /*
1168          * We should perform an IPI and flush all TLBs,
1169          * but that can deadlock, so flush only the current CPU:
1170          */
1171         __flush_tlb_all();
1172
1173         /*
1174          * Try to refill the page pool here. We can do this only after
1175          * the tlb flush.
1176          */
1177         cpa_fill_pool(NULL);
1178 }
1179
1180 #ifdef CONFIG_DEBUG_FS
1181 static int dpa_show(struct seq_file *m, void *v)
1182 {
1183         seq_puts(m, "DEBUG_PAGEALLOC\n");
1184         seq_printf(m, "pool_size     : %lu\n", pool_size);
1185         seq_printf(m, "pool_pages    : %lu\n", pool_pages);
1186         seq_printf(m, "pool_low      : %lu\n", pool_low);
1187         seq_printf(m, "pool_used     : %lu\n", pool_used);
1188         seq_printf(m, "pool_failed   : %lu\n", pool_failed);
1189
1190         return 0;
1191 }
1192
1193 static int dpa_open(struct inode *inode, struct file *filp)
1194 {
1195         return single_open(filp, dpa_show, NULL);
1196 }
1197
1198 static const struct file_operations dpa_fops = {
1199         .open           = dpa_open,
1200         .read           = seq_read,
1201         .llseek         = seq_lseek,
1202         .release        = single_release,
1203 };
1204
1205 static int __init debug_pagealloc_proc_init(void)
1206 {
1207         struct dentry *de;
1208
1209         de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1210                                  &dpa_fops);
1211         if (!de)
1212                 return -ENOMEM;
1213
1214         return 0;
1215 }
1216 __initcall(debug_pagealloc_proc_init);
1217 #endif
1218
1219 #ifdef CONFIG_HIBERNATION
1220
1221 bool kernel_page_present(struct page *page)
1222 {
1223         unsigned int level;
1224         pte_t *pte;
1225
1226         if (PageHighMem(page))
1227                 return false;
1228
1229         pte = lookup_address((unsigned long)page_address(page), &level);
1230         return (pte_val(*pte) & _PAGE_PRESENT);
1231 }
1232
1233 #endif /* CONFIG_HIBERNATION */
1234
1235 #endif /* CONFIG_DEBUG_PAGEALLOC */
1236
1237 /*
1238  * The testcases use internal knowledge of the implementation that shouldn't
1239  * be exposed to the rest of the kernel. Include these directly here.
1240  */
1241 #ifdef CONFIG_CPA_DEBUG
1242 #include "pageattr-test.c"
1243 #endif