nfsd41: Backchannel: Implement cb_recall over NFSv4.1

[safe/jmp/linux-2.6] / mm / mmap.c
diff --git a/mm/mmap.c b/mm/mmap.c

index 5d09d08..34579b2 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3,7 +3,7 @@
   *
   * Written by obz.
   *
- * Address space accounting code       <alan@redhat.com>
+ * Address space accounting code       <alan@lxorguk.ukuu.org.uk>
   */
  
  #include <linux/slab.h>
@@ -20,6 +20,7 @@
  #include <linux/fs.h>
  #include <linux/personality.h>
  #include <linux/security.h>
+#include <linux/ima.h>
  #include <linux/hugetlb.h>
  #include <linux/profile.h>
  #include <linux/module.h>
@@ -27,6 +28,7 @@
  #include <linux/mempolicy.h>
  #include <linux/rmap.h>
  #include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
  
  #include <asm/uaccess.h>
  #include <asm/cacheflush.h>
@@ -84,7 +86,10 @@ EXPORT_SYMBOL(vm_get_page_prot);
  int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
  int sysctl_overcommit_ratio = 50;      /* default is 50% */
  int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
+
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
  
  /*
   * Check that a process has enough memory to allocate a new virtual
@@ -175,13 +180,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
  
         /* Don't let a single process grow too big:
            leave 3% of the size of this process for other processes */
-       allowed -= mm->total_vm / 32;
+       if (mm)
+               allowed -= mm->total_vm / 32;
  
-       /*
-        * cast `allowed' as a signed long because vm_committed_space
-        * sometimes has a negative value
-        */
-       if (atomic_long_read(&vm_committed_space) < (long)allowed)
+       if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                 return 0;
  error:
         vm_unacct_memory(pages);
@@ -244,7 +246,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
         return next;
  }
  
-asmlinkage unsigned long sys_brk(unsigned long brk)
+SYSCALL_DEFINE1(brk, unsigned long, brk)
  {
         unsigned long rlim, retval;
         unsigned long newbrk, oldbrk;
@@ -370,7 +372,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
                 if (vma_tmp->vm_end > addr) {
                         vma = vma_tmp;
                         if (vma_tmp->vm_start <= addr)
-                               return vma;
+                               break;
                         __rb_link = &__rb_parent->rb_left;
                 } else {
                         rb_prev = __rb_parent;
@@ -410,9 +412,9 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
         rb_insert_color(&vma->vm_rb, &mm->mm_rb);
  }
  
-static inline void __vma_link_file(struct vm_area_struct *vma)
+static void __vma_link_file(struct vm_area_struct *vma)
  {
-       struct file * file;
+       struct file *file;
  
         file = vma->vm_file;
         if (file) {
@@ -473,11 +475,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
   * insert vm structure into list and rbtree and anon_vma,
   * but it has already been inserted into prio_tree earlier.
   */
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
  {
-       struct vm_area_struct * __vma, * prev;
-       struct rb_node ** rb_link, * rb_parent;
+       struct vm_area_struct *__vma, *prev;
+       struct rb_node **rb_link, *rb_parent;
  
         __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
         BUG_ON(__vma && __vma->vm_start < vma->vm_end);
@@ -658,16 +659,17 @@ again:                    remove_next = 1 + (end > next->vm_end);
         validate_mm(mm);
  }
  
+/* Flags that can be inherited from an existing mapping when merging */
+#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
+
  /*
   * If the vma has a ->close operation then the driver probably needs to release
   * per-vma resources, so we don't attempt to merge those.
   */
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
-
  static inline int is_mergeable_vma(struct vm_area_struct *vma,
                         struct file *file, unsigned long vm_flags)
  {
-       if (vma->vm_flags != vm_flags)
+       if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
                 return 0;
         if (vma->vm_file != file)
                 return 0;
@@ -909,7 +911,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
   * The caller must hold down_write(current->mm->mmap_sem).
   */
  
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                         unsigned long len, unsigned long prot,
                         unsigned long flags, unsigned long pgoff)
  {
@@ -917,7 +919,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
         struct inode *inode;
         unsigned int vm_flags;
         int error;
-       int accountable = 1;
         unsigned long reqprot = prot;
  
         /*
@@ -972,6 +973,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
                         return -EPERM;
                 vm_flags |= VM_LOCKED;
         }
+
         /* mlock MCL_FUTURE? */
         if (vm_flags & VM_LOCKED) {
                 unsigned long locked, lock_limit;
@@ -1017,8 +1019,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
                                         return -EPERM;
                                 vm_flags &= ~VM_MAYEXEC;
                         }
-                       if (is_file_hugepages(file))
-                               accountable = 0;
  
                         if (!file->f_op || !file->f_op->mmap)
                                 return -ENODEV;
@@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
         } else {
                 switch (flags & MAP_TYPE) {
                 case MAP_SHARED:
+                       /*
+                        * Ignore pgoff.
+                        */
+                       pgoff = 0;
                         vm_flags |= VM_SHARED | VM_MAYSHARE;
                         break;
                 case MAP_PRIVATE:
@@ -1046,9 +1050,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
         error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
         if (error)
                 return error;
+       error = ima_file_mmap(file, prot);
+       if (error)
+               return error;
  
-       return mmap_region(file, addr, len, flags, vm_flags, pgoff,
-                          accountable);
+       return mmap_region(file, addr, len, flags, vm_flags, pgoff);
  }
  EXPORT_SYMBOL(do_mmap_pgoff);
  
@@ -1084,10 +1090,25 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
                 mapping_cap_account_dirty(vma->vm_file->f_mapping);
  }
  
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+{
+       /*
+        * hugetlb has its own accounting separate from the core VM
+        * VM_HUGETLB may not be set yet so we cannot check for that flag.
+        */
+       if (file && is_file_hugepages(file))
+               return 0;
+
+       return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
  unsigned long mmap_region(struct file *file, unsigned long addr,
                           unsigned long len, unsigned long flags,
-                         unsigned int vm_flags, unsigned long pgoff,
-                         int accountable)
+                         unsigned int vm_flags, unsigned long pgoff)
  {
         struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma, *prev;
@@ -1111,33 +1132,35 @@ munmap_back:
         if (!may_expand_vm(mm, len >> PAGE_SHIFT))
                 return -ENOMEM;
  
-       if (flags & MAP_NORESERVE)
-               vm_flags |= VM_NORESERVE;
+       /*
+        * Set 'VM_NORESERVE' if we should not account for the
+        * memory use of this mapping.
+        */
+       if ((flags & MAP_NORESERVE)) {
+               /* We honor MAP_NORESERVE if allowed to overcommit */
+               if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+                       vm_flags |= VM_NORESERVE;
  
-       if (accountable && (!(flags & MAP_NORESERVE) ||
-                           sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
-               if (vm_flags & VM_SHARED) {
-                       /* Check memory availability in shmem_file_setup? */
-                       vm_flags |= VM_ACCOUNT;
-               } else if (vm_flags & VM_WRITE) {
-                       /*
-                        * Private writable mapping: check memory availability
-                        */
-                       charged = len >> PAGE_SHIFT;
-                       if (security_vm_enough_memory(charged))
-                               return -ENOMEM;
-                       vm_flags |= VM_ACCOUNT;
-               }
+               /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+               if (file && is_file_hugepages(file))
+                       vm_flags |= VM_NORESERVE;
         }
  
         /*
-        * Can we just expand an old private anonymous mapping?
-        * The VM_SHARED test is necessary because shmem_zero_setup
-        * will create the file object for a shared anonymous map below.
+        * Private writable mapping: check memory availability
          */
-       if (!file && !(vm_flags & VM_SHARED) &&
-           vma_merge(mm, prev, addr, addr + len, vm_flags,
-                                       NULL, NULL, pgoff, NULL))
+       if (accountable_mapping(file, vm_flags)) {
+               charged = len >> PAGE_SHIFT;
+               if (security_vm_enough_memory(charged))
+                       return -ENOMEM;
+               vm_flags |= VM_ACCOUNT;
+       }
+
+       /*
+        * Can we just expand an old mapping?
+        */
+       vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+       if (vma)
                 goto out;
  
         /*
@@ -1181,14 +1204,6 @@ munmap_back:
                         goto free_vma;
         }
  
-       /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
-        * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
-        * that memory reservation must be checked; but that reservation
-        * belongs to shared memory object, not to vma: so now clear it.
-        */
-       if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
-               vma->vm_flags &= ~VM_ACCOUNT;
-
         /* Can addr have changed??
          *
          * Answer: Yes, several device drivers can do it in their
@@ -1201,29 +1216,26 @@ munmap_back:
         if (vma_wants_writenotify(vma))
                 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
  
-       if (file && vma_merge(mm, prev, addr, vma->vm_end,
-                       vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
-               mpol_put(vma_policy(vma));
-               kmem_cache_free(vm_area_cachep, vma);
-               fput(file);
-               if (vm_flags & VM_EXECUTABLE)
-                       removed_exe_file_vma(mm);
-       } else {
-               vma_link(mm, vma, prev, rb_link, rb_parent);
-               file = vma->vm_file;
-       }
+       vma_link(mm, vma, prev, rb_link, rb_parent);
+       file = vma->vm_file;
  
         /* Once vma denies write, undo our temporary denial count */
         if (correct_wcount)
                 atomic_inc(&inode->i_writecount);
  out:
+       perf_counter_mmap(vma);
+
         mm->total_vm += len >> PAGE_SHIFT;
         vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
         if (vm_flags & VM_LOCKED) {
-               mm->locked_vm += len >> PAGE_SHIFT;
-               make_pages_present(addr, addr + len);
-       }
-       if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+               /*
+                * makes pages present; downgrades, drops, reacquires mmap_sem
+                */
+               long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
+               if (nr_pages < 0)
+                       return nr_pages;        /* vma gone! */
+               mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
+       } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
                 make_pages_present(addr, addr + len);
         return addr;
  
@@ -1454,7 +1466,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
  EXPORT_SYMBOL(get_unmapped_area);
  
  /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
  {
         struct vm_area_struct *vma = NULL;
  
@@ -1497,7 +1509,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
                         struct vm_area_struct **pprev)
  {
         struct vm_area_struct *vma = NULL, *prev = NULL;
-       struct rb_node * rb_node;
+       struct rb_node *rb_node;
         if (!mm)
                 goto out;
  
@@ -1531,7 +1543,7 @@ out:
   * update accounting. This is shared with both the
   * grow-up and grow-down cases.
   */
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
  {
         struct mm_struct *mm = vma->vm_mm;
         struct rlimit *rlim = current->signal->rlim;
@@ -1565,7 +1577,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
          * Overcommit..  This must be the final test, as it will
          * update security statistics.
          */
-       if (security_vm_enough_memory(grow))
+       if (security_vm_enough_memory_mm(mm, grow))
                 return -ENOMEM;
  
         /* Ok, everything looks good - let it rip */
@@ -1582,7 +1594,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
   * vma is the last one with address > vma->vm_end.  Have to extend vma.
   */
  #ifndef CONFIG_IA64
-static inline
+static
  #endif
  int expand_upwards(struct vm_area_struct *vma, unsigned long address)
  {
@@ -1632,7 +1644,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
  /*
   * vma is the first one with address < vma->vm_start.  Have to extend vma.
   */
-static inline int expand_downwards(struct vm_area_struct *vma,
+static int expand_downwards(struct vm_area_struct *vma,
                                    unsigned long address)
  {
         int error;
@@ -1696,8 +1708,10 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
                 return vma;
         if (!prev || expand_stack(prev, addr))
                 return NULL;
-       if (prev->vm_flags & VM_LOCKED)
-               make_pages_present(addr, prev->vm_end);
+       if (prev->vm_flags & VM_LOCKED) {
+               if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
+                       return NULL;    /* vma gone! */
+       }
         return prev;
  }
  #else
@@ -1723,8 +1737,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
         start = vma->vm_start;
         if (expand_stack(vma, addr))
                 return NULL;
-       if (vma->vm_flags & VM_LOCKED)
-               make_pages_present(addr, start);
+       if (vma->vm_flags & VM_LOCKED) {
+               if (mlock_vma_pages_range(vma, addr, start) < 0)
+                       return NULL;    /* vma gone! */
+       }
         return vma;
  }
  #endif
@@ -1743,8 +1759,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
                 long nrpages = vma_pages(vma);
  
                 mm->total_vm -= nrpages;
-               if (vma->vm_flags & VM_LOCKED)
-                       mm->locked_vm -= nrpages;
                 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
                 vma = remove_vma(vma);
         } while (vma);
@@ -1910,6 +1924,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
         vma = prev? prev->vm_next: mm->mmap;
  
         /*
+        * unlock any mlock()ed ranges before detaching vmas
+        */
+       if (mm->locked_vm) {
+               struct vm_area_struct *tmp = vma;
+               while (tmp && tmp->vm_start < end) {
+                       if (tmp->vm_flags & VM_LOCKED) {
+                               mm->locked_vm -= vma_pages(tmp);
+                               munlock_vma_pages_all(tmp);
+                       }
+                       tmp = tmp->vm_next;
+               }
+       }
+
+       /*
          * Remove the vma's, and unmap the actual pages
          */
         detach_vmas_to_be_unmapped(mm, vma, prev, end);
@@ -1923,7 +1951,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
  
  EXPORT_SYMBOL(do_munmap);
  
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
  {
         int ret;
         struct mm_struct *mm = current->mm;
@@ -2021,8 +2049,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
                 return -ENOMEM;
  
         /* Can we just expand an old private anonymous mapping? */
-       if (vma_merge(mm, prev, addr, addr + len, flags,
-                                       NULL, NULL, pgoff, NULL))
+       vma = vma_merge(mm, prev, addr, addr + len, flags,
+                                       NULL, NULL, pgoff, NULL);
+       if (vma)
                 goto out;
  
         /*
@@ -2044,8 +2073,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
  out:
         mm->total_vm += len >> PAGE_SHIFT;
         if (flags & VM_LOCKED) {
-               mm->locked_vm += len >> PAGE_SHIFT;
-               make_pages_present(addr, addr + len);
+               if (!mlock_vma_pages_range(vma, addr, addr + len))
+                       mm->locked_vm += (len >> PAGE_SHIFT);
         }
         return addr;
  }
@@ -2056,18 +2085,32 @@ EXPORT_SYMBOL(do_brk);
  void exit_mmap(struct mm_struct *mm)
  {
         struct mmu_gather *tlb;
-       struct vm_area_struct *vma = mm->mmap;
+       struct vm_area_struct *vma;
         unsigned long nr_accounted = 0;
         unsigned long end;
  
         /* mm's last user has gone, and its about to be pulled down */
-       arch_exit_mmap(mm);
         mmu_notifier_release(mm);
  
+       if (mm->locked_vm) {
+               vma = mm->mmap;
+               while (vma) {
+                       if (vma->vm_flags & VM_LOCKED)
+                               munlock_vma_pages_all(vma);
+                       vma = vma->vm_next;
+               }
+       }
+
+       arch_exit_mmap(mm);
+
+       vma = mm->mmap;
+       if (!vma)       /* Can happen if dup_mmap() received an OOM */
+               return;
+
         lru_add_drain();
         flush_cache_mm(mm);
         tlb = tlb_gather_mmu(mm, 1);
-       /* Don't update_hiwater_rss(mm) here, do_exit already did */
+       /* update_hiwater_rss(mm) here? but nobody should be looking */
         /* Use -1 here to ensure all VMAs in the mm are unmapped */
         end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
@@ -2268,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm,
  
         mm->total_vm += len >> PAGE_SHIFT;
  
+       perf_counter_mmap(vma);
+
         return 0;
  }
  
@@ -2358,11 +2403,17 @@ int mm_take_all_locks(struct mm_struct *mm)
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 if (signal_pending(current))
                         goto out_unlock;
-               if (vma->anon_vma)
-                       vm_lock_anon_vma(mm, vma->anon_vma);
                 if (vma->vm_file && vma->vm_file->f_mapping)
                         vm_lock_mapping(mm, vma->vm_file->f_mapping);
         }
+
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               if (signal_pending(current))
+                       goto out_unlock;
+               if (vma->anon_vma)
+                       vm_lock_anon_vma(mm, vma->anon_vma);
+       }
+
         ret = 0;
  
  out_unlock:
@@ -2428,3 +2479,14 @@ void mm_drop_all_locks(struct mm_struct *mm)
  
         mutex_unlock(&mm_all_locks_mutex);
  }
+
+/*
+ * initialise the VMA slab
+ */
+void __init mmap_init(void)
+{
+       int ret;
+
+       ret = percpu_counter_init(&vm_committed_as, 0);
+       VM_BUG_ON(ret);
+}