Merge branches 'core/futexes' and 'core/iommu' into core/urgent

[safe/jmp/linux-2.6] / mm / mmap.c
diff --git a/mm/mmap.c b/mm/mmap.c

index 214b6a2..ee22989 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -27,6 +27,7 @@
  #include <linux/mempolicy.h>
  #include <linux/rmap.h>
  #include <linux/mmu_notifier.h>
+#include <linux/perf_event.h>
  
  #include <asm/uaccess.h>
  #include <asm/cacheflush.h>
@@ -84,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
  int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
  int sysctl_overcommit_ratio = 50;      /* default is 50% */
  int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
  
  /*
   * Check that a process has enough memory to allocate a new virtual
@@ -178,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
         if (mm)
                 allowed -= mm->total_vm / 32;
  
-       /*
-        * cast `allowed' as a signed long because vm_committed_space
-        * sometimes has a negative value
-        */
-       if (atomic_long_read(&vm_committed_space) < (long)allowed)
+       if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                 return 0;
  error:
         vm_unacct_memory(pages);
@@ -572,9 +569,9 @@ again:                      remove_next = 1 + (end > next->vm_end);
  
         /*
          * When changing only vma->vm_end, we don't really need
-        * anon_vma lock: but is that case worth optimizing out?
+        * anon_vma lock.
          */
-       if (vma->anon_vma)
+       if (vma->anon_vma && (insert || importer || start != vma->vm_start))
                 anon_vma = vma->anon_vma;
         if (anon_vma) {
                 spin_lock(&anon_vma->lock);
@@ -658,9 +655,6 @@ again:                      remove_next = 1 + (end > next->vm_end);
         validate_mm(mm);
  }
  
-/* Flags that can be inherited from an existing mapping when merging */
-#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
-
  /*
   * If the vma has a ->close operation then the driver probably needs to release
   * per-vma resources, so we don't attempt to merge those.
@@ -668,7 +662,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
  static inline int is_mergeable_vma(struct vm_area_struct *vma,
                         struct file *file, unsigned long vm_flags)
  {
-       if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
+       /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
+       if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
                 return 0;
         if (vma->vm_file != file)
                 return 0;
@@ -907,7 +902,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
  #endif /* CONFIG_PROC_FS */
  
  /*
- * The caller must hold down_write(current->mm->mmap_sem).
+ * The caller must hold down_write(&current->mm->mmap_sem).
   */
  
  unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
@@ -918,7 +913,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
         struct inode *inode;
         unsigned int vm_flags;
         int error;
-       int accountable = 1;
         unsigned long reqprot = prot;
  
         /*
@@ -937,13 +931,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
         if (!(flags & MAP_FIXED))
                 addr = round_hint_to_min(addr);
  
-       error = arch_mmap_check(addr, len, flags);
-       if (error)
-               return error;
-
         /* Careful about overflows.. */
         len = PAGE_ALIGN(len);
-       if (!len || len > TASK_SIZE)
+       if (!len)
                 return -ENOMEM;
  
         /* offset overflow? */
@@ -968,11 +958,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
         vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
                         mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  
-       if (flags & MAP_LOCKED) {
+       if (flags & MAP_LOCKED)
                 if (!can_do_mlock())
                         return -EPERM;
-               vm_flags |= VM_LOCKED;
-       }
  
         /* mlock MCL_FUTURE? */
         if (vm_flags & VM_LOCKED) {
@@ -1019,8 +1007,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                                         return -EPERM;
                                 vm_flags &= ~VM_MAYEXEC;
                         }
-                       if (is_file_hugepages(file))
-                               accountable = 0;
  
                         if (!file->f_op || !file->f_op->mmap)
                                 return -ENODEV;
@@ -1053,11 +1039,50 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
         if (error)
                 return error;
  
-       return mmap_region(file, addr, len, flags, vm_flags, pgoff,
-                          accountable);
+       return mmap_region(file, addr, len, flags, vm_flags, pgoff);
  }
  EXPORT_SYMBOL(do_mmap_pgoff);
  
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+               unsigned long, prot, unsigned long, flags,
+               unsigned long, fd, unsigned long, pgoff)
+{
+       struct file *file = NULL;
+       unsigned long retval = -EBADF;
+
+       if (!(flags & MAP_ANONYMOUS)) {
+               if (unlikely(flags & MAP_HUGETLB))
+                       return -EINVAL;
+               file = fget(fd);
+               if (!file)
+                       goto out;
+       } else if (flags & MAP_HUGETLB) {
+               struct user_struct *user = NULL;
+               /*
+                * VM_NORESERVE is used because the reservations will be
+                * taken when f_op->mmap() is called.
+                * A dummy user value is used because we are not locking
+                * memory, so no accounting is necessary.
+                */
+               len = ALIGN(len, huge_page_size(&default_hstate));
+               file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+                                               &user, HUGETLB_ANONHUGE_INODE);
+               if (IS_ERR(file))
+                       return PTR_ERR(file);
+       }
+
+       flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+       down_write(&current->mm->mmap_sem);
+       retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+       up_write(&current->mm->mmap_sem);
+
+       if (file)
+               fput(file);
+out:
+       return retval;
+}
+
  /*
   * Some shared mappings will want the pages marked read-only
   * to track write events. If so, we'll downgrade vm_page_prot
@@ -1092,17 +1117,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
  
  /*
   * We account for memory if it's a private writeable mapping,
- * and VM_NORESERVE wasn't set.
+ * not hugepages and VM_NORESERVE wasn't set.
   */
-static inline int accountable_mapping(unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
  {
+       /*
+        * hugetlb has its own accounting, separate from the core VM.
+        * VM_HUGETLB may not be set yet, so we cannot check for that flag.
+        */
+       if (file && is_file_hugepages(file))
+               return 0;
+
         return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
  }
  
  unsigned long mmap_region(struct file *file, unsigned long addr,
                           unsigned long len, unsigned long flags,
-                         unsigned int vm_flags, unsigned long pgoff,
-                         int accountable)
+                         unsigned int vm_flags, unsigned long pgoff)
  {
         struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma, *prev;
@@ -1128,18 +1159,22 @@ munmap_back:
  
         /*
          * Set 'VM_NORESERVE' if we should not account for the
-        * memory use of this mapping. We only honor MAP_NORESERVE
-        * if we're allowed to overcommit memory.
+        * memory use of this mapping.
          */
-       if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-               vm_flags |= VM_NORESERVE;
-       if (!accountable)
-               vm_flags |= VM_NORESERVE;
+       if ((flags & MAP_NORESERVE)) {
+               /* We honor MAP_NORESERVE if allowed to overcommit */
+               if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+                       vm_flags |= VM_NORESERVE;
+
+               /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+               if (file && is_file_hugepages(file))
+                       vm_flags |= VM_NORESERVE;
+       }
  
         /*
          * Private writable mapping: check memory availability
          */
-       if (accountable_mapping(vm_flags)) {
+       if (accountable_mapping(file, vm_flags)) {
                 charged = len >> PAGE_SHIFT;
                 if (security_vm_enough_memory(charged))
                         return -ENOMEM;
@@ -1188,23 +1223,35 @@ munmap_back:
                         goto unmap_and_free_vma;
                 if (vm_flags & VM_EXECUTABLE)
                         added_exe_file_vma(mm);
+
+               /* Can addr have changed??
+                *
+                * Answer: Yes, several device drivers can do it in their
+                *         f_op->mmap method. -DaveM
+                */
+               addr = vma->vm_start;
+               pgoff = vma->vm_pgoff;
+               vm_flags = vma->vm_flags;
         } else if (vm_flags & VM_SHARED) {
                 error = shmem_zero_setup(vma);
                 if (error)
                         goto free_vma;
         }
  
-       /* Can addr have changed??
-        *
-        * Answer: Yes, several device drivers can do it in their
-        *         f_op->mmap method. -DaveM
-        */
-       addr = vma->vm_start;
-       pgoff = vma->vm_pgoff;
-       vm_flags = vma->vm_flags;
+       if (vma_wants_writenotify(vma)) {
+               pgprot_t pprot = vma->vm_page_prot;
  
-       if (vma_wants_writenotify(vma))
+               /* Can vma->vm_page_prot have changed??
+                *
+                * Answer: Yes, drivers may have changed it in their
+                *         f_op->mmap method.
+                *
+                * Ensures that vmas marked as uncached stay that way.
+                */
                 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
+               if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
+                       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+       }
  
         vma_link(mm, vma, prev, rb_link, rb_parent);
         file = vma->vm_file;
@@ -1213,6 +1260,8 @@ munmap_back:
         if (correct_wcount)
                 atomic_inc(&inode->i_writecount);
  out:
+       perf_event_mmap(vma);
+
         mm->total_vm += len >> PAGE_SHIFT;
         vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
         if (vm_flags & VM_LOCKED) {
@@ -1436,6 +1485,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
         unsigned long (*get_area)(struct file *, unsigned long,
                                   unsigned long, unsigned long, unsigned long);
  
+       unsigned long error = arch_mmap_check(addr, len, flags);
+       if (error)
+               return error;
+
+       /* Careful about overflows.. */
+       if (len > TASK_SIZE)
+               return -ENOMEM;
+
         get_area = current->mm->get_unmapped_area;
         if (file && file->f_op && file->f_op->get_unmapped_area)
                 get_area = file->f_op->get_unmapped_area;
@@ -1565,7 +1622,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
          * Overcommit..  This must be the final test, as it will
          * update security statistics.
          */
-       if (security_vm_enough_memory(grow))
+       if (security_vm_enough_memory_mm(mm, grow))
                 return -ENOMEM;
  
         /* Ok, everything looks good - let it rip */
@@ -1806,10 +1863,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
  }
  
  /*
- * Split a vma into two pieces at address 'addr', a new vma is allocated
- * either for the first part or the tail.
+ * __split_vma() bypasses sysctl_max_map_count checking.  We use this on the
+ * munmap path where it doesn't make sense to fail.
   */
-int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
               unsigned long addr, int new_below)
  {
         struct mempolicy *pol;
@@ -1819,9 +1876,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
                                         ~(huge_page_mask(hstate_vma(vma)))))
                 return -EINVAL;
  
-       if (mm->map_count >= sysctl_max_map_count)
-               return -ENOMEM;
-
         new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
         if (!new)
                 return -ENOMEM;
@@ -1861,6 +1915,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
         return 0;
  }
  
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ */
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+             unsigned long addr, int new_below)
+{
+       if (mm->map_count >= sysctl_max_map_count)
+               return -ENOMEM;
+
+       return __split_vma(mm, vma, addr, new_below);
+}
+
  /* Munmap is split into 2 main parts -- this part which finds
   * what needs doing, and the areas themselves, which do the
   * work.  This now handles partial unmappings.
@@ -1896,7 +1963,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
          * places tmp vma above, and higher split_vma places tmp vma below.
          */
         if (start > vma->vm_start) {
-               int error = split_vma(mm, vma, start, 0);
+               int error;
+
+               /*
+                * Make sure that map_count on return from munmap() will
+                * not exceed its limit; but let map_count go just above
+                * its limit temporarily, to help free resources as expected.
+                */
+               if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+                       return -ENOMEM;
+
+               error = __split_vma(mm, vma, start, 0);
                 if (error)
                         return error;
                 prev = vma;
@@ -1905,7 +1982,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
         /* Does it split the last one? */
         last = find_vma(mm, end);
         if (last && end > last->vm_start) {
-               int error = split_vma(mm, last, end, 1);
+               int error = __split_vma(mm, last, end, 1);
                 if (error)
                         return error;
         }
@@ -1980,20 +2057,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
         if (!len)
                 return addr;
  
-       if ((addr + len) > TASK_SIZE || (addr + len) < addr)
-               return -EINVAL;
-
-       if (is_hugepage_only_range(mm, addr, len))
-               return -EINVAL;
-
         error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
         if (error)
                 return error;
  
         flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
  
-       error = arch_mmap_check(addr, len, flags);
-       if (error)
+       error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+       if (error & ~PAGE_MASK)
                 return error;
  
         /*
@@ -2078,12 +2149,8 @@ void exit_mmap(struct mm_struct *mm)
         unsigned long end;
  
         /* mm's last user has gone, and it's about to be pulled down */
-       arch_exit_mmap(mm);
         mmu_notifier_release(mm);
  
-       if (!mm->mmap)  /* Can happen if dup_mmap() received an OOM */
-               return;
-
         if (mm->locked_vm) {
                 vma = mm->mmap;
                 while (vma) {
@@ -2092,7 +2159,13 @@ void exit_mmap(struct mm_struct *mm)
                         vma = vma->vm_next;
                 }
         }
+
+       arch_exit_mmap(mm);
+
         vma = mm->mmap;
+       if (!vma)       /* Can happen if dup_mmap() received an OOM */
+               return;
+
         lru_add_drain();
         flush_cache_mm(mm);
         tlb = tlb_gather_mmu(mm, 1);
@@ -2100,6 +2173,7 @@ void exit_mmap(struct mm_struct *mm)
         /* Use -1 here to ensure all VMAs in the mm are unmapped */
         end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
+
         free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
         tlb_finish_mmu(tlb, 0, end);
  
@@ -2256,7 +2330,7 @@ static void special_mapping_close(struct vm_area_struct *vma)
  {
  }
  
-static struct vm_operations_struct special_mapping_vmops = {
+static const struct vm_operations_struct special_mapping_vmops = {
         .close = special_mapping_close,
         .fault = special_mapping_fault,
  };
@@ -2297,6 +2371,8 @@ int install_special_mapping(struct mm_struct *mm,
  
         mm->total_vm += len >> PAGE_SHIFT;
  
+       perf_event_mmap(vma);
+
         return 0;
  }
  
@@ -2469,7 +2545,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
   */
  void __init mmap_init(void)
  {
-       vm_area_cachep = kmem_cache_create("vm_area_struct",
-                       sizeof(struct vm_area_struct), 0,
-                       SLAB_PANIC, NULL);
+       int ret;
+
+       ret = percpu_counter_init(&vm_committed_as, 0);
+       VM_BUG_ON(ret);
  }