/*
 * Handle caching attributes in page tables (PAT)
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 */
#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>
#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;

static inline void pat_disable(const char *reason)
{
	pat_enabled = 0;
	printk(KERN_INFO "%s\n", reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled.");
	return 0;
}
early_param("nopat", nopat);
#else	/* !CONFIG_X86_PAT */
static inline void pat_disable(const char *reason)
{
	(void)reason;
}
#endif
static int debug_enable;

static int __init pat_debug_setup(char *str)
{
	debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);

#define dprintk(fmt, arg...) \
	do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
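/*
 * Usage note (illustrative, not part of the original file): PAT handling
 * is controlled from the kernel command line, and the dprintk() tracing
 * above is switched on the same way, e.g.:
 *
 *	linux ... nopat		(disable PAT support entirely)
 *	linux ... debugpat	(log reserve/free_memtype activity)
 *
 * Tracing in this file is then emitted with, for example:
 *
 *	dprintk("reserve_memtype added 0x%Lx-0x%Lx\n", start, end);
 */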
static u64 __read_mostly boot_pat_state;
enum {
	PAT_UC		= 0,	/* uncached */
	PAT_WC		= 1,	/* Write combining */
	PAT_WT		= 4,	/* Write Through */
	PAT_WP		= 5,	/* Write Protected */
	PAT_WB		= 6,	/* Write Back (default) */
	PAT_UC_MINUS	= 7,	/* UC, but can be overridden by MTRR */
};

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
	if (!cpu_has_pat) {
		if (!boot_pat_state) {
			pat_disable("PAT not supported by CPU.");
			return;
		} else {
			/*
			 * If this happens we are on a secondary CPU, but
			 * switched to PAT on the boot CPU. We have no way to
			 * undo PAT.
			 */
			printk(KERN_ERR "PAT enabled, "
			       "but not supported by secondary CPU\n");
			BUG();
		}
	}
	/* Set PWT to Write-Combining. All other bits stay the same */

	/*
	 * PTE encoding used in Linux:
	 *      PAT
	 *      |PCD
	 *      ||PWT
	 *      |||
	 *      000 WB		_PAGE_CACHE_WB
	 *      001 WC		_PAGE_CACHE_WC
	 *      010 UC-		_PAGE_CACHE_UC_MINUS
	 *      011 UC		_PAGE_CACHE_UC
	 * PAT bit unused
	 */
	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
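	/*
	 * Worked example (derived from the enum and the PAT() macro above):
	 * entry i occupies bits 8*i..8*i+7 of the MSR, so the value written
	 * below is
	 *
	 *	PAT7..PAT0 = UC, UC-, WC, WB, UC, UC-, WC, WB
	 *	           = 00  07   01  06  00  07   01  06  (hex bytes)
	 *	           = 0x0007010600070106
	 */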
	/* Boot CPU check */
	if (!boot_pat_state)
		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);

	wrmsrl(MSR_IA32_CR_PAT, pat);
	printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
	       smp_processor_id(), boot_pat_state, pat);
static char *cattr_name(unsigned long flags)
{
	switch (flags & _PAGE_CACHE_MASK) {
	case _PAGE_CACHE_UC:		return "uncached";
	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
	case _PAGE_CACHE_WB:		return "write-back";
	case _PAGE_CACHE_WC:		return "write-combining";
	default:			return "broken";
	}
}
/*
 * The global memtype list keeps track of memory type for specific
 * physical memory areas. Conflicting memory types in different
 * mappings can cause CPU cache corruption. To avoid this we keep track.
 *
 * The list is sorted based on starting address and can contain multiple
 * entries for each address (this allows reference counting for overlapping
 * areas). All the aliases have the same cache attributes of course.
 * Zero attributes are represented as holes.
 *
 * The data structure is a list that is also organized as an rbtree
 * sorted on the start address of memtype range.
 *
 * memtype_lock protects both the linear list and rbtree.
 */

struct memtype {
	u64			start;
	u64			end;
	unsigned long		type;
	struct list_head	nd;
	struct rb_node		rb;
};

static struct rb_root memtype_rbroot = RB_ROOT;
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list and rbtree */
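/*
 * Illustration (hypothetical entries): two drivers that reserve an
 * overlapping write-combining range simply add two aliasing entries,
 * which is how overlapping areas are reference counted:
 *
 *	0xd0000000-0xd0100000 write-combining
 *	0xd0000000-0xd0100000 write-combining
 *	0xfed00000-0xfed01000 uncached-minus
 *
 * Entries are kept sorted by start address, in both the list and the
 * rbtree.
 */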
static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
{
	struct rb_node *node = root->rb_node;
	struct memtype *last_lower = NULL;

	while (node) {
		struct memtype *data = container_of(node, struct memtype, rb);

		if (data->start < start) {
			last_lower = data;
			node = node->rb_right;
		} else if (data->start > start) {
			node = node->rb_left;
		} else
			return data;
	}

	/* Will return NULL if there is no entry with its start <= start */
	return last_lower;
}
static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
{
	struct rb_node **new = &(root->rb_node);
	struct rb_node *parent = NULL;

	while (*new) {
		struct memtype *this = container_of(*new, struct memtype, rb);

		parent = *new;
		if (data->start <= this->start)
			new = &((*new)->rb_left);
		else if (data->start > this->start)
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->rb, parent, new);
	rb_insert_color(&data->rb, root);
}
/*
 * Does the intersection of the PAT memory type and the MTRR memory type
 * and returns the resulting memory type as PAT understands it.
 * (The numeric type encodings used by PAT and MTRR are not the same.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM, vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
{
	/*
	 * Look for MTRR hint to get the effective type in case where PAT
	 * request is for WB.
	 */
	if (req_type == _PAGE_CACHE_WB) {
		u8 mtrr_type;

		mtrr_type = mtrr_type_lookup(start, end);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_UC_MINUS;

		return _PAGE_CACHE_WB;
	}

	return req_type;
}
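/*
 * Example (hypothetical range): a WB request for a region that an MTRR
 * covers as WC or UC gets downgraded:
 *
 *	pat_x_mtrr_type(0xd0000000, 0xd0100000, _PAGE_CACHE_WB)
 *		returns _PAGE_CACHE_UC_MINUS unless the MTRR type for that
 *		range is MTRR_TYPE_WRBACK.
 *
 * Non-WB requests are passed through unchanged.
 */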
static int
chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
{
	if (new->type != entry->type) {
		if (type) {
			new->type = entry->type;
			*type = entry->type;
		} else
			goto conflict;
	}

	/* check overlaps with more than one entry in the list */
	list_for_each_entry_continue(entry, &memtype_list, nd) {
		if (new->end <= entry->start)
			break;
		else if (new->type != entry->type)
			goto conflict;
	}
	return 0;

conflict:
	printk(KERN_INFO "%s:%d conflicting memory types "
	       "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
	       new->end, cattr_name(new->type), cattr_name(entry->type));
	return -EBUSY;
}
static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
{
	int ram_page = 0, not_rampage = 0;
	unsigned long page_nr;

	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
	     ++page_nr) {
		/*
		 * For legacy reasons, physical address range in the legacy ISA
		 * region is tracked as non-RAM. This will allow users of
		 * /dev/mem to map portions of legacy ISA region, even when
		 * some of those portions are listed (or not even listed) with
		 * different e820 types (RAM/reserved/...)
		 */
		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
		    page_is_ram(page_nr))
			ram_page = 1;
		else
			not_rampage = 1;

		if (ram_page == not_rampage)
			return -1;
	}

	return ram_page;
}
/*
 * For RAM pages, mark the pages as non WB memory type using
 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
 * set_memory_wc() on a RAM page at a time before marking it as WB again.
 * This is ok, because only one driver will be owning the page and
 * doing set_memory_*() calls.
 *
 * For now, we use PageNonWB to track that the RAM page is being mapped
 * as non WB. In the future, we will have to use one more flag
 * (or some other mechanism in struct page) to distinguish between
 * UC, WC and WB memory types.
 */
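/*
 * Illustrative sketch (hypothetical driver, not part of this file): the
 * single-owner rule above corresponds to the usual driver pattern of
 * switching a kernel-mapped RAM buffer away from WB and back again:
 *
 *	unsigned long vaddr = (unsigned long)page_address(page);
 *
 *	set_memory_uc(vaddr, nr_pages);	(reserve path marks pages PageNonWB)
 *	...use the buffer uncached...
 *	set_memory_wb(vaddr, nr_pages);	(free path clears PageNonWB again)
 */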
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
				  unsigned long *new_type)
{
	struct page *page;
	u64 pfn, end_pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		if (page_mapped(page) || PageNonWB(page))
			goto out;

		SetPageNonWB(page);
	}
	return 0;

out:
	end_pfn = pfn;
	for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
		page = pfn_to_page(pfn);
		ClearPageNonWB(page);
	}

	return -EINVAL;
}
static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn, end_pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		if (page_mapped(page) || !PageNonWB(page))
			goto out;

		ClearPageNonWB(page);
	}
	return 0;

out:
	end_pfn = pfn;
	for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
		page = pfn_to_page(pfn);
		SetPageNonWB(page);
	}

	return -EINVAL;
}
/*
 * req_type typically has one of:
 * - _PAGE_CACHE_WB
 * - _PAGE_CACHE_WC
 * - _PAGE_CACHE_UC_MINUS
 * - _PAGE_CACHE_UC
 *
 * req_type will have a special case value '-1', when the requester wants to
 * inherit the memory type from mtrr (if WB), existing PAT, defaulting to
 * UC_MINUS.
 *
 * If new_type is NULL, the function returns an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, the function returns the
 * available type in new_type in case of no error. In case of any error
 * it returns a negative value.
 */
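/*
 * Illustrative sketch (hypothetical driver and addresses, not part of this
 * file): a caller that wants a write-combining view of an MMIO region
 * reserves the type first and accepts whatever compatible type comes back:
 *
 *	unsigned long got;
 *
 *	if (reserve_memtype(0xd0000000ULL, 0xd0100000ULL,
 *			    _PAGE_CACHE_WC, &got))
 *		return -EBUSY;
 *	...map and use the region with memory type 'got'...
 *	free_memtype(0xd0000000ULL, 0xd0100000ULL);
 */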
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
		    unsigned long *new_type)
{
	struct memtype *new, *entry;
	unsigned long actual_type;
	struct list_head *where;
	int is_range_ram;
	int err = 0;

	BUG_ON(start >= end); /* end is exclusive */
	if (!pat_enabled) {
		/* This is identical to page table setting without PAT */
		if (new_type) {
			if (req_type == -1)
				*new_type = _PAGE_CACHE_WB;
			else if (req_type == _PAGE_CACHE_WC)
				*new_type = _PAGE_CACHE_UC_MINUS;
			else
				*new_type = req_type & _PAGE_CACHE_MASK;
		}
		return 0;
	}
	/* Low ISA region is always mapped WB in page table. No need to track */
	if (is_ISA_range(start, end - 1)) {
		if (new_type)
			*new_type = _PAGE_CACHE_WB;
		return 0;
	}
	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);

	if (new_type)
		*new_type = actual_type;
	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1)
		return reserve_ram_pages_type(start, end, req_type,
					      new_type);
	else if (is_range_ram < 0)
		return -EINVAL;

	new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->start	= start;
	new->end	= end;
	new->type	= actual_type;
	spin_lock(&memtype_lock);

	entry = memtype_rb_search(&memtype_rbroot, new->start);
	if (likely(entry != NULL)) {
		/* To work correctly with list_for_each_entry_continue */
		entry = list_entry(entry->nd.prev, struct memtype, nd);
	} else {
		entry = list_entry(&memtype_list, struct memtype, nd);
	}
	/* Search for existing mapping that overlaps the current range */
	where = NULL;
	list_for_each_entry_continue(entry, &memtype_list, nd) {
		if (end <= entry->start) {
			where = entry->nd.prev;
			break;
		} else if (start <= entry->start) { /* end > entry->start */
			err = chk_conflict(new, entry, new_type);
			if (!err) {
				dprintk("Overlap at 0x%Lx-0x%Lx\n",
					entry->start, entry->end);
				where = entry->nd.prev;
			}
			break;
		} else if (start < entry->end) { /* start > entry->start */
			err = chk_conflict(new, entry, new_type);
			if (!err) {
				dprintk("Overlap at 0x%Lx-0x%Lx\n",
					entry->start, entry->end);

				/*
				 * Move to right position in the linked
				 * list to add this new entry
				 */
				list_for_each_entry_continue(entry,
							&memtype_list, nd) {
					if (start <= entry->start) {
						where = entry->nd.prev;
						break;
					}
				}
			}
			break;
		}
	}

	if (err) {
		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
		       "track %s, req %s\n",
		       start, end, cattr_name(new->type), cattr_name(req_type));
		kfree(new);
		spin_unlock(&memtype_lock);

		return err;
	}

	if (where)
		list_add(&new->nd, where);
	else
		list_add_tail(&new->nd, &memtype_list);

	memtype_rb_insert(&memtype_rbroot, new);

	spin_unlock(&memtype_lock);

	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
		start, end, cattr_name(new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}
int free_memtype(u64 start, u64 end)
{
	struct memtype *entry, *saved_entry;
	int err = -EINVAL;
	int is_range_ram;

	if (!pat_enabled)
		return 0;

	/* Low ISA region is always mapped WB. No need to track */
	if (is_ISA_range(start, end - 1))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1)
		return free_ram_pages_type(start, end);
	else if (is_range_ram < 0)
		return -EINVAL;
	spin_lock(&memtype_lock);

	entry = memtype_rb_search(&memtype_rbroot, start);
	if (unlikely(entry == NULL))
		goto unlock_ret;

	/*
	 * Saved entry points to an entry with start same or less than what
	 * we searched for. Now go through the list in both directions to look
	 * for the entry that matches with both start and end, with the list
	 * sorted by start address.
	 */
	list_for_each_entry(entry, &memtype_list, nd) {
		if (entry->start == start && entry->end == end) {
			rb_erase(&entry->rb, &memtype_rbroot);
			list_del(&entry->nd);
			kfree(entry);
			err = 0;
			break;
		} else if (entry->start > start) {
			break;
		}
	}

	if (!err)
		goto unlock_ret;

	list_for_each_entry_reverse(entry, &memtype_list, nd) {
		if (entry->start == start && entry->end == end) {
			rb_erase(&entry->rb, &memtype_rbroot);
			list_del(&entry->nd);
			kfree(entry);
			err = 0;
			break;
		} else if (entry->start < start) {
			break;
		}
	}

unlock_ret:
	spin_unlock(&memtype_lock);

	if (err) {
		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
		       current->comm, current->pid, start, end);
	}

	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);

	return err;
}
/**
 * io_reserve_memtype - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, requested
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
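/*
 * Illustrative sketch (hypothetical resource, not part of this file):
 * callers such as the ioremap paths pass the type they would like and then
 * continue with whatever compatible type comes back:
 *
 *	unsigned long type = _PAGE_CACHE_WC;
 *
 *	if (io_reserve_memtype(res->start, res->end + 1, &type))
 *		return -EBUSY;
 *	...map the region with 'type'; later io_free_memtype() the range...
 */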
int io_reserve_memtype(resource_size_t start, resource_size_t end,
		       unsigned long *type)
{
	unsigned long req_type = *type;
	unsigned long new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, end - start));

	ret = reserve_memtype(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(req_type, new_type))
		goto out_free;

	if (kernel_map_sync_memtype(start, end - start, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	free_memtype(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}
/**
 * io_free_memtype - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void io_free_memtype(resource_size_t start, resource_size_t end)
{
	free_memtype(start, end);
}
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	return vma_prot;
}
#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled)
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn)) {
			printk(KERN_INFO
		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
				current->comm, from, to);
			return 0;
		}
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				 unsigned long size, pgprot_t *vma_prot)
{
	unsigned long flags = _PAGE_CACHE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_SYNC) {
		flags = _PAGE_CACHE_UC_MINUS;
	}

#ifdef CONFIG_X86_32
	/*
	 * On the PPro and successors, the MTRRs are used to set
	 * memory types for physical addresses outside main memory,
	 * so blindly setting UC or PWT on those pages is wrong.
	 * For Pentiums and earlier, the surround logic should disable
	 * caching for the high addresses through the KEN pin, but
	 * we maintain the tradition of paranoia in this code.
	 */
	if (!pat_enabled &&
	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
		flags = _PAGE_CACHE_UC;
	}
#endif

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     flags);
	return 1;
}
/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of identity map.
 */
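/*
 * For instance (hypothetical range): after a low, RAM-backed region has
 * been reserved as write-combining, its kernel identity mapping must be
 * switched to the same type to avoid aliasing:
 *
 *	kernel_map_sync_memtype(0x1000000ULL, 0x10000, _PAGE_CACHE_WC);
 *
 * Ranges whose physical base is at or above __pa(high_memory) have no
 * identity mapping and are ignored below.
 */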
int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
	unsigned long id_sz;

	if (base >= __pa(high_memory))
		return 0;

	id_sz = (__pa(high_memory) < base + size) ?
				__pa(high_memory) - base :
				size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
		printk(KERN_INFO
			"%s:%d ioremap_change_attr failed %s "
			"for %Lx-%Lx\n",
			current->comm, current->pid,
			cattr_name(flags),
			base, (unsigned long long)(base + size));
		return -EINVAL;
	}
	return 0;
}
/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only, and after a successful reserve_memtype,
 * this func also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
			     int strict_prot)
{
	int is_ram = 0;
	int ret;
	unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
	unsigned long flags = want_flags;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() doesn't support RAM pages. Maintain the current
	 * behavior with RAM pages by returning success.
	 */
	if (is_ram != 0)
		return 0;

	ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
	if (ret)
		return ret;

	if (flags != want_flags) {
		if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) {
			free_memtype(paddr, paddr + size);
			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
				" for %Lx-%Lx, got %s\n",
				current->comm, current->pid,
				cattr_name(want_flags),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size),
				cattr_name(flags));
			return -EINVAL;
		}
		/*
		 * We allow returning different type than the one requested in
		 * non strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     flags);
	}

	if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
		free_memtype(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}
/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		free_memtype(paddr, paddr + size);
}
/*
 * track_pfn_vma_copy is called when a vma covering a pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range() call.
 */
int track_pfn_vma_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	/*
	 * For now, only handle remap_pfn_range() vmas where
	 * is_linear_pfn_mapping() == TRUE. Handling of
	 * vm_insert_pfn() is TBD.
	 */
	if (is_linear_pfn_mapping(vma)) {
		/*
		 * reserve the whole chunk covered by vma. We need the
		 * starting address and protection from pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}
/*
 * track_pfn_vma_new is called when a _new_ pfn mapping is being established
 * for the physical range indicated by pfn and size.
 *
 * prot is passed in as a parameter for the new mapping. If the vma has a
 * linear pfn mapping for the entire range, reserve the entire vma range with
 * a single reserve_pfn_range() call.
 */
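/*
 * Illustrative sketch (hypothetical driver mmap handler, not part of this
 * file): a linear pfn mapping set up with remap_pfn_range() is what ends up
 * being tracked by track_pfn_vma_new()/untrack_pfn_vma():
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */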
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
		      unsigned long pfn, unsigned long size)
{
	resource_size_t paddr;
	unsigned long vma_size = vma->vm_end - vma->vm_start;

	/*
	 * For now, only handle remap_pfn_range() vmas where
	 * is_linear_pfn_mapping() == TRUE. Handling of
	 * vm_insert_pfn() is TBD.
	 */
	if (is_linear_pfn_mapping(vma)) {
		/* reserve the whole chunk starting from vm_pgoff */
		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
		return reserve_pfn_range(paddr, vma_size, prot, 0);
	}

	return 0;
}
/*
 * untrack_pfn_vma is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case size can be zero).
 */
void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
		     unsigned long size)
{
	resource_size_t paddr;
	unsigned long vma_size = vma->vm_end - vma->vm_start;

	/*
	 * For now, only handle remap_pfn_range() vmas where
	 * is_linear_pfn_mapping() == TRUE. Handling of
	 * vm_insert_pfn() is TBD.
	 */
	if (is_linear_pfn_mapping(vma)) {
		/* free the whole chunk starting from vm_pgoff */
		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
		free_pfn_range(paddr, vma_size);
	}
}
pgprot_t pgprot_writecombine(pgprot_t prot)
{
	if (pat_enabled)
		return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
	else
		return pgprot_noncached(prot);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
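/*
 * Typical use (hypothetical driver, not part of this file): request a
 * write-combining userspace mapping of a frame-buffer style BAR; when PAT
 * is disabled this falls back to an uncached mapping via pgprot_noncached():
 *
 *	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *	err = io_remap_pfn_range(vma, vma->vm_start, pfn,
 *				 vma->vm_end - vma->vm_start,
 *				 vma->vm_page_prot);
 */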
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *list_node, *print_entry;
	int i = 1;

	print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!print_entry)
		return NULL;

	spin_lock(&memtype_lock);
	list_for_each_entry(list_node, &memtype_list, nd) {
		if (pos == i) {
			*print_entry = *list_node;
			spin_unlock(&memtype_lock);
			return print_entry;
		}
		++i;
	}
	spin_unlock(&memtype_lock);
	kfree(print_entry);

	return NULL;
}
static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_printf(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}
static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *print_entry = (struct memtype *)v;

	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
		   print_entry->start, print_entry->end);
	kfree(print_entry);

	return 0;
}
static struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
static int __init pat_memtype_list_init(void)
{
	debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
			    NULL, &memtype_fops);
	return 0;
}

late_initcall(pat_memtype_list_init);
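/*
 * Example output (hypothetical entries) as read from
 * /sys/kernel/debug/x86/pat_memtype_list, in the format produced by
 * memtype_seq_show() above:
 *
 *	PAT memtype list:
 *	write-combining @ 0xd0000000-0xd1000000
 *	uncached-minus @ 0xfed00000-0xfed01000
 */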
#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */