x86: PAT: use reserve/free memtype in mmap of /dev/mem
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 6fe7b6c..e83623e 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -8,7 +8,6 @@
  *  Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
  */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
 #include <linux/raw.h>
 #include <linux/tty.h>
 #include <linux/capability.h>
-#include <linux/smp_lock.h>
 #include <linux/ptrace.h>
 #include <linux/device.h>
 #include <linux/highmem.h>
 #include <linux/crash_dump.h>
 #include <linux/backing-dev.h>
 #include <linux/bootmem.h>
-#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
+#include <linux/pfn.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
  */
 static inline int uncached_access(struct file *file, unsigned long addr)
 {
-#if defined(__i386__)
-       /*
-        * On the PPro and successors, the MTRRs are used to set
-        * memory types for physical addresses outside main memory,
-        * so blindly setting PCD or PWT on those pages is wrong.
-        * For Pentiums and earlier, the surround logic should disable
-        * caching for the high addresses through the KEN pin, but
-        * we maintain the tradition of paranoia in this code.
-        */
-       if (file->f_flags & O_SYNC)
-               return 1;
-       return !( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
-                 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
-                 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
-                 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability) )
-         && addr >= __pa(high_memory);
-#elif defined(__x86_64__)
-       /* 
-        * This is broken because it can generate memory type aliases,
-        * which can cause cache corruptions
-        * But it is only available for root and we have to be bug-to-bug
-        * compatible with i386.
-        */
-       if (file->f_flags & O_SYNC)
-               return 1;
-       /* same behaviour as i386. PAT always set to cached and MTRRs control the
-          caching behaviour. 
-          Hopefully a full PAT implementation will fix that soon. */      
-       return 0;
-#elif defined(CONFIG_IA64)
+#if defined(CONFIG_IA64)
        /*
         * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases.
         */
        return !(efi_mem_attributes(addr) & EFI_MEMORY_WB);
+#elif defined(CONFIG_MIPS)
+       {
+               extern int __uncached_access(struct file *file,
+                                            unsigned long addr);
+
+               return __uncached_access(file, addr);
+       }
 #else
        /*
         * Accessing memory above the top the kernel knows about or through a file pointer
@@ -96,12 +73,42 @@ static inline int valid_phys_addr_range(unsigned long addr, size_t count)
        return 1;
 }
 
-static inline int valid_mmap_phys_addr_range(unsigned long addr, size_t size)
+static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
 {
        return 1;
 }
 #endif
 
+#ifdef CONFIG_NONPROMISC_DEVMEM
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+       u64 from = ((u64)pfn) << PAGE_SHIFT;
+       u64 to = from + size;
+       u64 cursor = from;
+
+       while (cursor < to) {
+               if (!devmem_is_allowed(pfn)) {
+                       printk(KERN_INFO
+               "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+                               current->comm, from, to);
+                       return 0;
+               }
+               cursor += PAGE_SIZE;
+               pfn++;
+       }
+       return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+       return 1;
+}
+#endif
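
range_is_allowed() walks the range a page at a time and defers the actual
policy decision to the architecture's devmem_is_allowed(). For illustration
(not part of this diff), the x86 policy from the same series permits the low
1MB for legacy tools and anything that is not kernel RAM, roughly:

    /* Sketch of the arch policy hook; the x86 version lives in
     * arch/x86/mm/init_32.c / init_64.c. Low memory stays open for
     * legacy users (X servers, dosemu); non-RAM pages (MMIO) are
     * fine; kernel RAM is refused.
     */
    int devmem_is_allowed(unsigned long pagenr)
    {
            if (pagenr <= 256)              /* first 1MB, 256 4K pages */
                    return 1;
            if (!page_is_ram(pagenr))       /* not RAM: device memory */
                    return 1;
            return 0;
    }
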
+
+void __attribute__((weak)) unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+}
+
 /*
  * This function reads the *physical* memory. The f_pos points directly to the
  * memory location. 
@@ -144,15 +151,25 @@ static ssize_t read_mem(struct file * file, char __user * buf,
 
                sz = min_t(unsigned long, sz, count);
 
+               if (!range_is_allowed(p >> PAGE_SHIFT, count))
+                       return -EPERM;
+
                /*
                 * On ia64 if a page has been mapped somewhere as
                 * uncached, then it must also be accessed uncached
                 * by the kernel or data corruption may occur
                 */
                ptr = xlate_dev_mem_ptr(p);
+               if (!ptr)
+                       return -EFAULT;
 
-               if (copy_to_user(buf, ptr, sz))
+               if (copy_to_user(buf, ptr, sz)) {
+                       unxlate_dev_mem_ptr(p, ptr);
                        return -EFAULT;
+               }
+
+               unxlate_dev_mem_ptr(p, ptr);
+
                buf += sz;
                p += sz;
                count -= sz;
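
read_mem() now brackets each chunk with xlate_dev_mem_ptr() and the new
unxlate_dev_mem_ptr(). The pairing matters on x86 with PAT, where non-RAM
pages are ioremap'd on demand and the transient mapping must be torn down
again. Abridged from the companion x86 change (arch/x86/mm/ioremap.c),
shown here only as a sketch:

    void *xlate_dev_mem_ptr(unsigned long phys)
    {
            unsigned long start = phys & PAGE_MASK;
            void *addr;

            /* RAM is reachable through the direct mapping */
            if (page_is_ram(start >> PAGE_SHIFT))
                    return __va(phys);

            /* anything else gets a one-page transient ioremap */
            addr = (void __force *)ioremap(start, PAGE_SIZE);
            if (addr)
                    addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
            return addr;
    }

    void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
    {
            if (page_is_ram(phys >> PAGE_SHIFT))
                    return;         /* direct mapping, nothing to undo */
            iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
    }
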
@@ -201,20 +218,32 @@ static ssize_t write_mem(struct file * file, const char __user * buf,
 
                sz = min_t(unsigned long, sz, count);
 
+               if (!range_is_allowed(p >> PAGE_SHIFT, sz))
+                       return -EPERM;
+
                /*
                 * On ia64 if a page has been mapped somewhere as
                 * uncached, then it must also be accessed uncached
                 * by the kernel or data corruption may occur
                 */
                ptr = xlate_dev_mem_ptr(p);
+               if (!ptr) {
+                       if (written)
+                               break;
+                       return -EFAULT;
+               }
 
                copied = copy_from_user(ptr, buf, sz);
                if (copied) {
                        written += sz - copied;
+                       unxlate_dev_mem_ptr(p, ptr);
                        if (written)
                                break;
                        return -EFAULT;
                }
+
+               unxlate_dev_mem_ptr(p, ptr);
+
                buf += sz;
                p += sz;
                count -= sz;
@@ -225,6 +254,12 @@ static ssize_t write_mem(struct file * file, const char __user * buf,
        return written;
 }
 
+int __attribute__((weak)) phys_mem_access_prot_allowed(struct file *file,
+       unsigned long pfn, unsigned long size, pgprot_t *vma_prot)
+{
+       return 1;
+}
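
This weak stub is the hook the PAT work keys off: the x86 override reserves
a memtype for the range before the mapping is created and refuses requests
that would alias an existing mapping with a conflicting cache attribute.
A simplified sketch, assuming the reserve_memtype() interface from
arch/x86/mm/pat.c (the real version also falls back to whatever type an
earlier reservation established):

    int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                                     unsigned long size, pgprot_t *vma_prot)
    {
            u64 offset = (u64)pfn << PAGE_SHIFT;
            unsigned long want = (file->f_flags & O_SYNC) ?
                                 _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
            unsigned long got;

            if (reserve_memtype(offset, offset + size, want, &got))
                    return 0;       /* conflicting alias: refuse the mmap */

            /* install the granted cache type in the vma's protections */
            *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK)
                                 | got);
            return 1;
    }
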
+
 #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
 static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                     unsigned long size, pgprot_t vma_prot)
@@ -239,24 +274,93 @@ static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 }
 #endif
 
+#ifndef CONFIG_MMU
+static unsigned long get_unmapped_area_mem(struct file *file,
+                                          unsigned long addr,
+                                          unsigned long len,
+                                          unsigned long pgoff,
+                                          unsigned long flags)
+{
+       if (!valid_mmap_phys_addr_range(pgoff, len))
+               return (unsigned long) -EINVAL;
+       return pgoff << PAGE_SHIFT;
+}
+
+/* can't do an in-place private mapping if there's no MMU */
+static inline int private_mapping_ok(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_MAYSHARE;
+}
+#else
+#define get_unmapped_area_mem  NULL
+
+static inline int private_mapping_ok(struct vm_area_struct *vma)
+{
+       return 1;
+}
+#endif
+
+void __attribute__((weak))
+map_devmem(unsigned long pfn, unsigned long len, pgprot_t prot)
+{
+       /* nothing. architectures can override. */
+}
+
+void __attribute__((weak))
+unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t prot)
+{
+       /* nothing. architectures can override. */
+}
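
These weak no-ops are the reserve/free pairing the commit title refers to:
on x86 the overrides in arch/x86/mm/pat.c take a memtype reservation for
the lifetime of the VMA and drop it again when the last mapping goes away.
A simplified sketch:

    void map_devmem(unsigned long pfn, unsigned long size, pgprot_t prot)
    {
            u64 addr = (u64)pfn << PAGE_SHIFT;
            unsigned long want = pgprot_val(prot) & _PAGE_CACHE_MASK;
            unsigned long got;

            reserve_memtype(addr, addr + size, want, &got);
    }

    void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t prot)
    {
            u64 addr = (u64)pfn << PAGE_SHIFT;

            free_memtype(addr, addr + size);
    }
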
+
+static void mmap_mem_open(struct vm_area_struct *vma)
+{
+       map_devmem(vma->vm_pgoff,  vma->vm_end - vma->vm_start,
+                       vma->vm_page_prot);
+}
+
+static void mmap_mem_close(struct vm_area_struct *vma)
+{
+       unmap_devmem(vma->vm_pgoff,  vma->vm_end - vma->vm_start,
+                       vma->vm_page_prot);
+}
+
+static struct vm_operations_struct mmap_mem_ops = {
+       .open  = mmap_mem_open,
+       .close = mmap_mem_close
+};
+
 static int mmap_mem(struct file * file, struct vm_area_struct * vma)
 {
        size_t size = vma->vm_end - vma->vm_start;
 
-       if (!valid_mmap_phys_addr_range(vma->vm_pgoff << PAGE_SHIFT, size))
+       if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size))
+               return -EINVAL;
+
+       if (!private_mapping_ok(vma))
+               return -ENOSYS;
+
+       if (!range_is_allowed(vma->vm_pgoff, size))
+               return -EPERM;
+
+       if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
+                                               &vma->vm_page_prot))
                return -EINVAL;
 
        vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
                                                 size,
                                                 vma->vm_page_prot);
 
+       vma->vm_ops = &mmap_mem_ops;
+
        /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */
        if (remap_pfn_range(vma,
                            vma->vm_start,
                            vma->vm_pgoff,
                            size,
-                           vma->vm_page_prot))
+                           vma->vm_page_prot)) {
+               unmap_devmem(vma->vm_pgoff, size, vma->vm_page_prot);
                return -EAGAIN;
+       }
        return 0;
 }
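
From userspace nothing changes syntactically: the usual open() plus mmap()
sequence is simply vetted (range_is_allowed) and tracked (reserve/free
memtype) by the kernel now. A minimal consumer mapping the legacy VGA
window, purely as an illustration:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            /* O_SYNC asks for an uncached mapping */
            int fd = open("/dev/mem", O_RDWR | O_SYNC);
            volatile unsigned int *p;

            if (fd < 0)
                    return 1;
            /* 0xA0000 (VGA) sits in the low 1MB that strict
             * /dev/mem policies still permit */
            p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_SHARED, fd, 0xA0000);
            if (p == MAP_FAILED)    /* e.g. -EPERM, or a memtype conflict */
                    return 1;
            printf("first word: %#x\n", p[0]);
            munmap((void *)p, 4096);
            close(fd);
            return 0;
    }
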
 
@@ -523,7 +627,7 @@ static ssize_t write_kmem(struct file * file, const char __user * buf,
        return virtr + wrote;
 }
 
-#if defined(CONFIG_ISA) || !defined(__mc68000__)
+#ifdef CONFIG_DEVPORT
 static ssize_t read_port(struct file * file, char __user * buf,
                         size_t count, loff_t *ppos)
 {
@@ -590,64 +694,10 @@ static ssize_t splice_write_null(struct pipe_inode_info *pipe,struct file *out,
        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
 }
 
-#ifdef CONFIG_MMU
-/*
- * For fun, we are using the MMU for this.
- */
-static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
-{
-       struct mm_struct *mm;
-       struct vm_area_struct * vma;
-       unsigned long addr=(unsigned long)buf;
-
-       mm = current->mm;
-       /* Oops, this was forgotten before. -ben */
-       down_read(&mm->mmap_sem);
-
-       /* For private mappings, just map in zero pages. */
-       for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
-               unsigned long count;
-
-               if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
-                       goto out_up;
-               if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
-                       break;
-               count = vma->vm_end - addr;
-               if (count > size)
-                       count = size;
-
-               zap_page_range(vma, addr, count, NULL);
-               zeromap_page_range(vma, addr, count, PAGE_COPY);
-
-               size -= count;
-               buf += count;
-               addr += count;
-               if (size == 0)
-                       goto out_up;
-       }
-
-       up_read(&mm->mmap_sem);
-       
-       /* The shared case is hard. Let's do the conventional zeroing. */ 
-       do {
-               unsigned long unwritten = clear_user(buf, PAGE_SIZE);
-               if (unwritten)
-                       return size + unwritten - PAGE_SIZE;
-               cond_resched();
-               buf += PAGE_SIZE;
-               size -= PAGE_SIZE;
-       } while (size);
-
-       return size;
-out_up:
-       up_read(&mm->mmap_sem);
-       return size;
-}
-
 static ssize_t read_zero(struct file * file, char __user * buf, 
                         size_t count, loff_t *ppos)
 {
-       unsigned long left, unwritten, written = 0;
+       size_t written;
 
        if (!count)
                return 0;
@@ -655,66 +705,33 @@ static ssize_t read_zero(struct file * file, char __user * buf,
        if (!access_ok(VERIFY_WRITE, buf, count))
                return -EFAULT;
 
-       left = count;
-
-       /* do we want to be clever? Arbitrary cut-off */
-       if (count >= PAGE_SIZE*4) {
-               unsigned long partial;
+       written = 0;
+       while (count) {
+               unsigned long unwritten;
+               size_t chunk = count;
 
-               /* How much left of the page? */
-               partial = (PAGE_SIZE-1) & -(unsigned long) buf;
-               unwritten = clear_user(buf, partial);
-               written = partial - unwritten;
+               if (chunk > PAGE_SIZE)
+                       chunk = PAGE_SIZE;      /* Just for latency reasons */
+               unwritten = clear_user(buf, chunk);
+               written += chunk - unwritten;
                if (unwritten)
-                       goto out;
-               left -= partial;
-               buf += partial;
-               unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
-               written += (left & PAGE_MASK) - unwritten;
-               if (unwritten)
-                       goto out;
-               buf += left & PAGE_MASK;
-               left &= ~PAGE_MASK;
+                       break;
+               buf += chunk;
+               count -= chunk;
+               cond_resched();
        }
-       unwritten = clear_user(buf, left);
-       written += left - unwritten;
-out:
        return written ? written : -EFAULT;
 }
 
 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+#ifndef CONFIG_MMU
+       return -ENOSYS;
+#endif
        if (vma->vm_flags & VM_SHARED)
                return shmem_zero_setup(vma);
-       if (zeromap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
-               return -EAGAIN;
        return 0;
 }
-#else /* CONFIG_MMU */
-static ssize_t read_zero(struct file * file, char * buf, 
-                        size_t count, loff_t *ppos)
-{
-       size_t todo = count;
-
-       while (todo) {
-               size_t chunk = todo;
-
-               if (chunk > 4096)
-                       chunk = 4096;   /* Just for latency reasons */
-               if (clear_user(buf, chunk))
-                       return -EFAULT;
-               buf += chunk;
-               todo -= chunk;
-               cond_resched();
-       }
-       return count;
-}
-
-static int mmap_zero(struct file * file, struct vm_area_struct * vma)
-{
-       return -ENOSYS;
-}
-#endif /* CONFIG_MMU */
 
 static ssize_t write_full(struct file * file, const char __user * buf,
                          size_t count, loff_t *ppos)
@@ -745,7 +762,7 @@ static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
 {
        loff_t ret;
 
-       mutex_lock(&file->f_dentry->d_inode->i_mutex);
+       mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
        switch (orig) {
                case 0:
                        file->f_pos = offset;
@@ -760,7 +777,7 @@ static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
                default:
                        ret = -EINVAL;
        }
-       mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+       mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
        return ret;
 }
 
@@ -777,31 +794,33 @@ static int open_port(struct inode * inode, struct file * filp)
 #define open_kmem      open_mem
 #define open_oldmem    open_mem
 
-static struct file_operations mem_fops = {
+static const struct file_operations mem_fops = {
        .llseek         = memory_lseek,
        .read           = read_mem,
        .write          = write_mem,
        .mmap           = mmap_mem,
        .open           = open_mem,
+       .get_unmapped_area = get_unmapped_area_mem,
 };
 
-static struct file_operations kmem_fops = {
+static const struct file_operations kmem_fops = {
        .llseek         = memory_lseek,
        .read           = read_kmem,
        .write          = write_kmem,
        .mmap           = mmap_kmem,
        .open           = open_kmem,
+       .get_unmapped_area = get_unmapped_area_mem,
 };
 
-static struct file_operations null_fops = {
+static const struct file_operations null_fops = {
        .llseek         = null_lseek,
        .read           = read_null,
        .write          = write_null,
        .splice_write   = splice_write_null,
 };
 
-#if defined(CONFIG_ISA) || !defined(__mc68000__)
-static struct file_operations port_fops = {
+#ifdef CONFIG_DEVPORT
+static const struct file_operations port_fops = {
        .llseek         = memory_lseek,
        .read           = read_port,
        .write          = write_port,
@@ -809,25 +828,29 @@ static struct file_operations port_fops = {
 };
 #endif
 
-static struct file_operations zero_fops = {
+static const struct file_operations zero_fops = {
        .llseek         = zero_lseek,
        .read           = read_zero,
        .write          = write_zero,
        .mmap           = mmap_zero,
 };
 
+/*
+ * capabilities for /dev/zero
+ * - permits private mappings, "copies" are taken of the source of zeros
+ */
 static struct backing_dev_info zero_bdi = {
        .capabilities   = BDI_CAP_MAP_COPY,
 };
 
-static struct file_operations full_fops = {
+static const struct file_operations full_fops = {
        .llseek         = full_lseek,
        .read           = read_full,
        .write          = write_full,
 };
 
 #ifdef CONFIG_CRASH_DUMP
-static struct file_operations oldmem_fops = {
+static const struct file_operations oldmem_fops = {
        .read   = read_oldmem,
        .open   = open_oldmem,
 };
@@ -854,7 +877,7 @@ static ssize_t kmsg_write(struct file * file, const char __user * buf,
        return ret;
 }
 
-static struct file_operations kmsg_fops = {
+static const struct file_operations kmsg_fops = {
        .write =        kmsg_write,
 };
 
@@ -863,14 +886,18 @@ static int memory_open(struct inode * inode, struct file * filp)
        switch (iminor(inode)) {
                case 1:
                        filp->f_op = &mem_fops;
+                       filp->f_mapping->backing_dev_info =
+                               &directly_mappable_cdev_bdi;
                        break;
                case 2:
                        filp->f_op = &kmem_fops;
+                       filp->f_mapping->backing_dev_info =
+                               &directly_mappable_cdev_bdi;
                        break;
                case 3:
                        filp->f_op = &null_fops;
                        break;
-#if defined(CONFIG_ISA) || !defined(__mc68000__)
+#ifdef CONFIG_DEVPORT
                case 4:
                        filp->f_op = &port_fops;
                        break;
@@ -904,7 +931,7 @@ static int memory_open(struct inode * inode, struct file * filp)
        return 0;
 }
 
-static struct file_operations memory_fops = {
+static const struct file_operations memory_fops = {
        .open           = memory_open,  /* just a selector for the real open */
 };
 
@@ -917,7 +944,7 @@ static const struct {
        {1, "mem",     S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops},
        {2, "kmem",    S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops},
        {3, "null",    S_IRUGO | S_IWUGO,           &null_fops},
-#if defined(CONFIG_ISA) || !defined(__mc68000__)
+#ifdef CONFIG_DEVPORT
        {4, "port",    S_IRUSR | S_IWUSR | S_IRGRP, &port_fops},
 #endif
        {5, "zero",    S_IRUGO | S_IWUGO,           &zero_fops},
@@ -935,16 +962,21 @@ static struct class *mem_class;
 static int __init chr_dev_init(void)
 {
        int i;
+       int err;
+
+       err = bdi_init(&zero_bdi);
+       if (err)
+               return err;
 
        if (register_chrdev(MEM_MAJOR,"mem",&memory_fops))
                printk("unable to get major %d for memory devs\n", MEM_MAJOR);
 
        mem_class = class_create(THIS_MODULE, "mem");
        for (i = 0; i < ARRAY_SIZE(devlist); i++)
-               class_device_create(mem_class, NULL,
-                                       MKDEV(MEM_MAJOR, devlist[i].minor),
-                                       NULL, devlist[i].name);
-       
+               device_create(mem_class, NULL,
+                             MKDEV(MEM_MAJOR, devlist[i].minor),
+                             devlist[i].name);
+
        return 0;
 }