diff --git a/mm/mremap.c b/mm/mremap.c
index 0d1c1b9..8451908 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -3,7 +3,7 @@
  *
  *     (C) Copyright 1996 Linus Torvalds
  *
- *     Address space accounting code   <alan@redhat.com>
+ *     Address space accounting code   <alan@lxorguk.ukuu.org.uk>
  *     (C) Copyright 2002 Red Hat Inc, All Rights Reserved
  */
 
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
 #include <linux/shm.h>
+#include <linux/ksm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/capability.h>
 #include <linux/fs.h>
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte = NULL;
-
-       pgd = pgd_offset(mm, addr);
-       if (pgd_none_or_clear_bad(pgd))
-               goto end;
-
-       pud = pud_offset(pgd, addr);
-       if (pud_none_or_clear_bad(pud))
-               goto end;
+#include "internal.h"
 
-       pmd = pmd_offset(pud, addr);
-       if (pmd_none_or_clear_bad(pmd))
-               goto end;
-
-       pte = pte_offset_map_nested(pmd, addr);
-       if (pte_none(*pte)) {
-               pte_unmap_nested(pte);
-               pte = NULL;
-       }
-end:
-       return pte;
-}
-
-static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -68,41 +45,49 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
        if (pmd_none_or_clear_bad(pmd))
                return NULL;
 
-       return pte_offset_map(pmd, addr);
+       return pmd;
 }
 
-static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte = NULL;
 
        pgd = pgd_offset(mm, addr);
-
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
                return NULL;
+
        pmd = pmd_alloc(mm, pud, addr);
-       if (pmd)
-               pte = pte_alloc_map(mm, pmd, addr);
-       return pte;
+       if (!pmd)
+               return NULL;
+
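+       /* Allocate the pte page now, so move_ptes() never allocates under its pte locks. */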
+       if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+               return NULL;
+
+       return pmd;
 }
 
-static int
-move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
-               struct vm_area_struct *new_vma, unsigned long new_addr)
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+               unsigned long old_addr, unsigned long old_end,
+               struct vm_area_struct *new_vma, pmd_t *new_pmd,
+               unsigned long new_addr)
 {
        struct address_space *mapping = NULL;
        struct mm_struct *mm = vma->vm_mm;
-       int error = 0;
-       pte_t *src, *dst;
+       pte_t *old_pte, *new_pte, pte;
+       spinlock_t *old_ptl, *new_ptl;
+       unsigned long old_start;
 
+       old_start = old_addr;
+       mmu_notifier_invalidate_range_start(vma->vm_mm,
+                                           old_start, old_end);
        if (vma->vm_file) {
                /*
                 * Subtle point from Rajesh Venkatasubramanian: before
-                * moving file-based ptes, we must lock vmtruncate out,
-                * since it might clean the dst vma before the src vma,
+                * moving file-based ptes, we must lock truncate_pagecache
+                * out, since it might clean the dst vma before the src vma,
                 * and we propagate stale pages into the dst afterward.
                 */
                mapping = vma->vm_file->f_mapping;
@@ -111,70 +96,71 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
                    new_vma->vm_truncate_count != vma->vm_truncate_count)
                        new_vma->vm_truncate_count = 0;
        }
-       spin_lock(&mm->page_table_lock);
 
-       src = get_one_pte_map_nested(mm, old_addr);
-       if (src) {
-               /*
-                * Look to see whether alloc_one_pte_map needs to perform a
-                * memory allocation.  If it does then we need to drop the
-                * atomic kmap
-                */
-               dst = get_one_pte_map(mm, new_addr);
-               if (unlikely(!dst)) {
-                       pte_unmap_nested(src);
-                       if (mapping)
-                               spin_unlock(&mapping->i_mmap_lock);
-                       dst = alloc_one_pte_map(mm, new_addr);
-                       if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
-                               spin_unlock(&mm->page_table_lock);
-                               spin_lock(&mapping->i_mmap_lock);
-                               spin_lock(&mm->page_table_lock);
-                       }
-                       src = get_one_pte_map_nested(mm, old_addr);
-               }
-               /*
-                * Since alloc_one_pte_map can drop and re-acquire
-                * page_table_lock, we should re-check the src entry...
-                */
-               if (src) {
-                       if (dst) {
-                               pte_t pte;
-                               pte = ptep_clear_flush(vma, old_addr, src);
-                               set_pte_at(mm, new_addr, dst, pte);
-                       } else
-                               error = -ENOMEM;
-                       pte_unmap_nested(src);
-               }
-               if (dst)
-                       pte_unmap(dst);
+       /*
+        * We don't have to worry about the ordering of src and dst
+        * pte locks because exclusive mmap_sem prevents deadlock.
+        */
+       old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+       new_pte = pte_offset_map_nested(new_pmd, new_addr);
+       new_ptl = pte_lockptr(mm, new_pmd);
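+       /* both pte locks are in the same class; tell lockdep the nesting is deliberate */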
+       if (new_ptl != old_ptl)
+               spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
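+       /* let the arch (e.g. paravirtualized MMUs) batch the pte updates in the loop below */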
+       arch_enter_lazy_mmu_mode();
+
+       for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+                                  new_pte++, new_addr += PAGE_SIZE) {
+               if (pte_none(*old_pte))
+                       continue;
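+               /*
+                * Clear and flush the old entry, let the arch hook
+                * move_pte() adjust it, then install it at the new address.
+                */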
+               pte = ptep_clear_flush(vma, old_addr, old_pte);
+               pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+               set_pte_at(mm, new_addr, new_pte, pte);
        }
-       spin_unlock(&mm->page_table_lock);
+
+       arch_leave_lazy_mmu_mode();
+       if (new_ptl != old_ptl)
+               spin_unlock(new_ptl);
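+       /* the loop left both pte pointers one entry past the end */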
+       pte_unmap_nested(new_pte - 1);
+       pte_unmap_unlock(old_pte - 1, old_ptl);
        if (mapping)
                spin_unlock(&mapping->i_mmap_lock);
-       return error;
+       mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 
-static unsigned long move_page_tables(struct vm_area_struct *vma,
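+/* Cap each move_ptes() batch, so the pte locks are never held for too long. */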
+#define LATENCY_LIMIT  (64 * PAGE_SIZE)
+
+unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len)
 {
-       unsigned long offset;
+       unsigned long extent, next, old_end;
+       pmd_t *old_pmd, *new_pmd;
 
-       flush_cache_range(vma, old_addr, old_addr + len);
+       old_end = old_addr + len;
+       flush_cache_range(vma, old_addr, old_end);
 
-       /*
-        * This is not the clever way to do this, but we're taking the
-        * easy way out on the assumption that most remappings will be
-        * only a few pages.. This also makes error recovery easier.
-        */
-       for (offset = 0; offset < len; offset += PAGE_SIZE) {
-               if (move_one_page(vma, old_addr + offset,
-                               new_vma, new_addr + offset) < 0)
-                       break;
+       for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                cond_resched();
+               next = (old_addr + PMD_SIZE) & PMD_MASK;
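+               /* "next - 1" also clamps a next that wrapped to 0 at the top of the address space */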
+               if (next - 1 > old_end)
+                       next = old_end;
+               extent = next - old_addr;
+               old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+               if (!old_pmd)
+                       continue;
+               new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+               if (!new_pmd)
+                       break;
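+               /* respect the pmd boundary on the destination side too */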
+               next = (new_addr + PMD_SIZE) & PMD_MASK;
+               if (extent > next - new_addr)
+                       extent = next - new_addr;
+               if (extent > LATENCY_LIMIT)
+                       extent = LATENCY_LIMIT;
+               move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+                               new_vma, new_pmd, new_addr);
        }
-       return offset;
+
+       return len + old_addr - old_end;        /* how much done */
 }
 
 static unsigned long move_vma(struct vm_area_struct *vma,
@@ -187,7 +173,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        unsigned long new_pgoff;
        unsigned long moved_len;
        unsigned long excess = 0;
+       unsigned long hiwater_vm;
        int split = 0;
+       int err;
 
        /*
         * We'd prefer to avoid failure later on in do_munmap:
@@ -196,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (mm->map_count >= sysctl_max_map_count - 3)
                return -ENOMEM;
 
+       /*
+        * Advise KSM to break any KSM pages in the area to be moved:
+        * it would be confusing if they were to turn up at the new
+        * location, where they happen to coincide with different KSM
+        * pages recently unmapped.  But leave vma->vm_flags as it was,
+        * so KSM can come around to merge on vma and new_vma afterwards.
+        */
+       err = ksm_madvise(vma, old_addr, old_addr + old_len,
+                                               MADV_UNMERGEABLE, &vm_flags);
+       if (err)
+               return err;
+
        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
        if (!new_vma)
@@ -224,11 +224,25 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                        split = 1;
        }
 
+       /*
+        * If we failed to move page tables we still do total_vm increment
+        * since do_munmap() will decrement it by old_len == new_len.
+        *
+        * Since total_vm is about to be raised artificially high for a
+        * moment, we need to restore high watermark afterwards: if stats
+        * are taken meanwhile, total_vm and hiwater_vm appear too high.
+        * If this were a serious issue, we'd add a flag to do_munmap().
+        */
+       hiwater_vm = mm->hiwater_vm;
+       mm->total_vm += new_len >> PAGE_SHIFT;
+       vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+
        if (do_munmap(mm, old_addr, old_len) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
        }
+       mm->hiwater_vm = hiwater_vm;
 
        /* Restore VM_ACCOUNT if one or two pieces of vma left */
        if (excess) {
@@ -237,18 +251,147 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                        vma->vm_next->vm_flags |= VM_ACCOUNT;
        }
 
-       mm->total_vm += new_len >> PAGE_SHIFT;
-       __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                if (new_len > old_len)
-                       make_pages_present(new_addr + old_len,
-                                          new_addr + new_len);
+                       mlock_vma_pages_range(new_vma, new_addr + old_len,
+                                                      new_addr + new_len);
        }
 
        return new_addr;
 }
 
+static struct vm_area_struct *vma_to_resize(unsigned long addr,
+       unsigned long old_len, unsigned long new_len, unsigned long *p)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma = find_vma(mm, addr);
+
+       if (!vma || vma->vm_start > addr)
+               goto Efault;
+
+       if (is_vm_hugetlb_page(vma))
+               goto Einval;
+
+       /* We can't remap across vm area boundaries */
+       if (old_len > vma->vm_end - addr)
+               goto Efault;
+
+       if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
+               if (new_len > old_len)
+                       goto Efault;
+       }
+
+       if (vma->vm_flags & VM_LOCKED) {
+               unsigned long locked, lock_limit;
+               locked = mm->locked_vm << PAGE_SHIFT;
+               lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+               locked += new_len - old_len;
+               if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+                       goto Eagain;
+       }
+
+       if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+               goto Enomem;
+
+       if (vma->vm_flags & VM_ACCOUNT) {
+               unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+               if (security_vm_enough_memory(charged))
+                       goto Efault;
+               *p = charged;
+       }
+
+       return vma;
+
+Efault:        /* very odd choice for most of the cases, but... */
+       return ERR_PTR(-EFAULT);
+Einval:
+       return ERR_PTR(-EINVAL);
+Enomem:
+       return ERR_PTR(-ENOMEM);
+Eagain:
+       return ERR_PTR(-EAGAIN);
+}
+
+static unsigned long mremap_to(unsigned long addr,
+       unsigned long old_len, unsigned long new_addr,
+       unsigned long new_len)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       unsigned long ret = -EINVAL;
+       unsigned long charged = 0;
+       unsigned long map_flags;
+
+       if (new_addr & ~PAGE_MASK)
+               goto out;
+
+       if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+               goto out;
+
+       /* Check if the location we're moving into overlaps the
+        * old location at all, and fail if it does.
+        */
+       if ((new_addr <= addr) && (new_addr+new_len) > addr)
+               goto out;
+
+       if ((addr <= new_addr) && (addr+old_len) > new_addr)
+               goto out;
+
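+       /* with addr_only set, this only vets the address itself (e.g. mmap_min_addr) */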
+       ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+       if (ret)
+               goto out;
+
+       ret = do_munmap(mm, new_addr, new_len);
+       if (ret)
+               goto out;
+
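+       /* if shrinking as well as moving, unmap the tail first so only new_len bytes are moved */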
+       if (old_len >= new_len) {
+               ret = do_munmap(mm, addr+new_len, old_len - new_len);
+               if (ret && old_len != new_len)
+                       goto out;
+               old_len = new_len;
+       }
+
+       vma = vma_to_resize(addr, old_len, new_len, &charged);
+       if (IS_ERR(vma)) {
+               ret = PTR_ERR(vma);
+               goto out;
+       }
+
+       map_flags = MAP_FIXED;
+       if (vma->vm_flags & VM_MAYSHARE)
+               map_flags |= MAP_SHARED;
+
+       ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
+                               ((addr - vma->vm_start) >> PAGE_SHIFT),
+                               map_flags);
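+       /* failure comes back as a negative errno, which is never page-aligned */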
+       if (ret & ~PAGE_MASK)
+               goto out1;
+
+       ret = move_vma(vma, addr, old_len, new_len, new_addr);
+       if (!(ret & ~PAGE_MASK))
+               goto out;
+out1:
+       vm_unacct_memory(charged);
+
+out:
+       return ret;
+}
+
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+       unsigned long end = vma->vm_end + delta;
+       if (end < vma->vm_end) /* overflow */
+               return 0;
+       if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+               return 0;
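+       /* finally let get_unmapped_area() veto the enlarged range */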
+       if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+                             0, MAP_FIXED) & ~PAGE_MASK)
+               return 0;
+       return 1;
+}
+
 /*
  * Expand (or shrink) an existing mapping, potentially moving it at the
  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -260,6 +403,7 @@ unsigned long do_mremap(unsigned long addr,
        unsigned long old_len, unsigned long new_len,
        unsigned long flags, unsigned long new_addr)
 {
+       struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
@@ -281,28 +425,10 @@ unsigned long do_mremap(unsigned long addr,
        if (!new_len)
                goto out;
 
-       /* new_addr is only valid if MREMAP_FIXED is specified */
        if (flags & MREMAP_FIXED) {
-               if (new_addr & ~PAGE_MASK)
-                       goto out;
-               if (!(flags & MREMAP_MAYMOVE))
-                       goto out;
-
-               if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
-                       goto out;
-
-               /* Check if the location we're moving into overlaps the
-                * old location at all, and fail if it does.
-                */
-               if ((new_addr <= addr) && (new_addr+new_len) > addr)
-                       goto out;
-
-               if ((addr <= new_addr) && (addr+old_len) > new_addr)
-                       goto out;
-
-               ret = do_munmap(current->mm, new_addr, new_len);
-               if (ret)
-                       goto out;
+               if (flags & MREMAP_MAYMOVE)
+                       ret = mremap_to(addr, old_len, new_addr, new_len);
+               goto out;
        }
 
        /*
@@ -311,75 +437,37 @@ unsigned long do_mremap(unsigned long addr,
         * do_munmap does all the needed commit accounting
         */
        if (old_len >= new_len) {
-               ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+               ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
-               if (!(flags & MREMAP_FIXED) || (new_addr == addr))
-                       goto out;
-               old_len = new_len;
+               goto out;
        }
 
        /*
-        * Ok, we need to grow..  or relocate.
+        * Ok, we need to grow..
         */
-       ret = -EFAULT;
-       vma = find_vma(current->mm, addr);
-       if (!vma || vma->vm_start > addr)
-               goto out;
-       if (is_vm_hugetlb_page(vma)) {
-               ret = -EINVAL;
-               goto out;
-       }
-       /* We can't remap across vm area boundaries */
-       if (old_len > vma->vm_end - addr)
-               goto out;
-       if (vma->vm_flags & VM_DONTEXPAND) {
-               if (new_len > old_len)
-                       goto out;
-       }
-       if (vma->vm_flags & VM_LOCKED) {
-               unsigned long locked, lock_limit;
-               locked = current->mm->locked_vm << PAGE_SHIFT;
-               lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-               locked += new_len - old_len;
-               ret = -EAGAIN;
-               if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-                       goto out;
-       }
-       ret = -ENOMEM;
-       if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
-           > current->signal->rlim[RLIMIT_AS].rlim_cur)
+       vma = vma_to_resize(addr, old_len, new_len, &charged);
+       if (IS_ERR(vma)) {
+               ret = PTR_ERR(vma);
                goto out;
-
-       if (vma->vm_flags & VM_ACCOUNT) {
-               charged = (new_len - old_len) >> PAGE_SHIFT;
-               if (security_vm_enough_memory(charged))
-                       goto out_nc;
        }
 
        /* old_len exactly to the end of the area..
-        * And we're not relocating the area.
         */
-       if (old_len == vma->vm_end - addr &&
-           !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
-           (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
-               unsigned long max_addr = TASK_SIZE;
-               if (vma->vm_next)
-                       max_addr = vma->vm_next->vm_start;
+       if (old_len == vma->vm_end - addr) {
                /* can we just expand the current mapping? */
-               if (max_addr - addr >= new_len) {
+               if (vma_expandable(vma, new_len - old_len)) {
                        int pages = (new_len - old_len) >> PAGE_SHIFT;
 
                        vma_adjust(vma, vma->vm_start,
                                addr + new_len, vma->vm_pgoff, NULL);
 
-                       current->mm->total_vm += pages;
-                       __vm_stat_account(vma->vm_mm, vma->vm_flags,
-                                                       vma->vm_file, pages);
+                       mm->total_vm += pages;
+                       vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
                        if (vma->vm_flags & VM_LOCKED) {
-                               current->mm->locked_vm += pages;
-                               make_pages_present(addr + old_len,
+                               mm->locked_vm += pages;
+                               mlock_vma_pages_range(vma, addr + old_len,
                                                   addr + new_len);
                        }
                        ret = addr;
@@ -393,29 +481,33 @@ unsigned long do_mremap(unsigned long addr,
         */
        ret = -ENOMEM;
        if (flags & MREMAP_MAYMOVE) {
-               if (!(flags & MREMAP_FIXED)) {
-                       unsigned long map_flags = 0;
-                       if (vma->vm_flags & VM_MAYSHARE)
-                               map_flags |= MAP_SHARED;
-
-                       new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
-                                               vma->vm_pgoff, map_flags);
+               unsigned long map_flags = 0;
+               if (vma->vm_flags & VM_MAYSHARE)
+                       map_flags |= MAP_SHARED;
+
+               new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+                                       vma->vm_pgoff +
+                                       ((addr - vma->vm_start) >> PAGE_SHIFT),
+                                       map_flags);
+               if (new_addr & ~PAGE_MASK) {
                        ret = new_addr;
-                       if (new_addr & ~PAGE_MASK)
-                               goto out;
+                       goto out;
                }
+
+               ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+               if (ret)
+                       goto out;
                ret = move_vma(vma, addr, old_len, new_len, new_addr);
        }
 out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
-out_nc:
        return ret;
 }
 
-asmlinkage unsigned long sys_mremap(unsigned long addr,
-       unsigned long old_len, unsigned long new_len,
-       unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+               unsigned long, new_len, unsigned long, flags,
+               unsigned long, new_addr)
 {
        unsigned long ret;