vmscan: properly account for the number of page cache pages zone_reclaim() can reclaim
diff --git a/mm/nommu.c b/mm/nommu.c
index 0d363df..2fd2ad5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
  *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
- *  Copyright (c) 2007      Paul Mundt <lethal@linux-sh.org>
+ *  Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
  */
 
 #include <linux/module.h>
@@ -62,13 +62,17 @@ void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
 unsigned long num_physpages;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
-atomic_t mmap_pages_allocated;
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+
+atomic_long_t mmap_pages_allocated;
 
 EXPORT_SYMBOL(mem_map);
 EXPORT_SYMBOL(num_physpages);
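
vm_committed_space was a single counter bounced between CPUs on every mmap(), brk() and fork(), so the commit accounting moves to a percpu_counter, which batches updates per CPU and folds them into the shared total only occasionally. mmap_pages_allocated widens from atomic_t to atomic_long_t at the same time. For orientation, the usual percpu_counter pattern looks roughly like this (illustrative sketch; signatures as of this kernel era):

    struct percpu_counter ctr;
    s64 total;

    percpu_counter_init(&ctr, 0);          /* can fail: allocates per-CPU slots */
    percpu_counter_add(&ctr, 32);          /* cheap: usually touches only this CPU */
    total = percpu_counter_read_positive(&ctr); /* approximate, never negative */
    percpu_counter_destroy(&ctr);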
@@ -148,6 +152,20 @@ unsigned int kobjsize(const void *objp)
                return ksize(objp);
 
        /*
+        * If it's not a compound page, see if we have a matching VMA
+        * region. This test is intentionally done in reverse order,
+        * so if there's no VMA, we still fall through and hand back
+        * PAGE_SIZE for 0-order pages.
+        */
+       if (!PageCompound(page)) {
+               struct vm_area_struct *vma;
+
+               vma = find_vma(current->mm, (unsigned long)objp);
+               if (vma)
+                       return vma->vm_end - vma->vm_start;
+       }
+
+       /*
         * The ksize() function is only guaranteed to work for pointers
         * returned by kmalloc(). So handle arbitrary pointers here.
         */
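
Pieced together with the unchanged tail of kobjsize(), which in this era's source ends in "return PAGE_SIZE << compound_order(page);", the resolution order becomes (a sketch, not verbatim code):

    /*
     * PageSlab(page)                 -> ksize(objp)
     * !PageCompound(page), VMA found -> vma->vm_end - vma->vm_start
     * otherwise                      -> PAGE_SIZE << compound_order(page)
     *                                   (PAGE_SIZE for 0-order pages)
     */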
@@ -379,6 +397,24 @@ void vunmap(const void *addr)
 }
 EXPORT_SYMBOL(vunmap);
 
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+{
+       BUG();
+       return NULL;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+       BUG();
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+void vm_unmap_aliases(void)
+{
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
 /*
  * Implement a stub for vmalloc_sync_all() if the architecture chose not to
  * have one.
@@ -401,7 +437,7 @@ EXPORT_SYMBOL(vm_insert_page);
  *  to a regular file.  in this case, the unmapping will need
  *  to invoke file system routines that need the global lock.
  */
-asmlinkage unsigned long sys_brk(unsigned long brk)
+SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
        struct mm_struct *mm = current->mm;
 
@@ -430,12 +466,11 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
  */
 void __init mmap_init(void)
 {
-       vm_region_jar = kmem_cache_create("vm_region_jar",
-                                         sizeof(struct vm_region), 0,
-                                         SLAB_PANIC, NULL);
-       vm_area_cachep = kmem_cache_create("vm_area_struct",
-                                          sizeof(struct vm_area_struct), 0,
-                                          SLAB_PANIC, NULL);
+       int ret;
+
+       ret = percpu_counter_init(&vm_committed_as, 0);
+       VM_BUG_ON(ret);
+       vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
 
 /*
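
KMEM_CACHE() is the slab shorthand that names the cache after the struct and supplies its natural alignment, so the open-coded kmem_cache_create() collapses to a single line. Note the visible cache name changes from "vm_region_jar" to "vm_region" as a side effect. The macro expands to approximately:

    vm_region_jar = kmem_cache_create("vm_region",
                                      sizeof(struct vm_region),
                                      __alignof__(struct vm_region),
                                      SLAB_PANIC, NULL);

percpu_counter_init() allocates the per-CPU slots and can therefore fail, hence the ret/VM_BUG_ON pair. The vm_area_struct cache is no longer created here; its initialization presumably moves to common code.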
@@ -453,23 +488,24 @@ static noinline void validate_nommu_regions(void)
                return;
 
        last = rb_entry(lastp, struct vm_region, vm_rb);
-       if (unlikely(last->vm_end <= last->vm_start))
-               BUG();
+       BUG_ON(unlikely(last->vm_end <= last->vm_start));
+       BUG_ON(unlikely(last->vm_top < last->vm_end));
 
        while ((p = rb_next(lastp))) {
                region = rb_entry(p, struct vm_region, vm_rb);
                last = rb_entry(lastp, struct vm_region, vm_rb);
 
-               if (unlikely(region->vm_end <= region->vm_start))
-                       BUG();
-               if (unlikely(region->vm_start < last->vm_end))
-                       BUG();
+               BUG_ON(unlikely(region->vm_end <= region->vm_start));
+               BUG_ON(unlikely(region->vm_top < region->vm_end));
+               BUG_ON(unlikely(region->vm_start < last->vm_top));
 
                lastp = p;
        }
 }
 #else
-#define validate_nommu_regions() do {} while(0)
+static void validate_nommu_regions(void)
+{
+}
 #endif
 
 /*
@@ -482,8 +518,6 @@ static void add_nommu_region(struct vm_region *region)
 
        validate_nommu_regions();
 
-       BUG_ON(region->vm_start & ~PAGE_MASK);
-
        parent = NULL;
        p = &nommu_region_tree.rb_node;
        while (*p) {
@@ -526,17 +560,18 @@ static void free_page_series(unsigned long from, unsigned long to)
                struct page *page = virt_to_page(from);
 
                kdebug("- free %lx", from);
-               atomic_dec(&mmap_pages_allocated);
+               atomic_long_dec(&mmap_pages_allocated);
                if (page_count(page) != 1)
-                       kdebug("free page %p [%d]", page, page_count(page));
+                       kdebug("free page %p: refcount not one: %d",
+                              page, page_count(page));
                put_page(page);
        }
 }
 
 /*
  * release a reference to a region
- * - the caller must hold the region semaphore, which this releases
- * - the region may not have been added to the tree yet, in which case vm_end
+ * - the caller must hold the region semaphore for writing, which this releases
+ * - the region may not have been added to the tree yet, in which case vm_top
  *   will equal vm_start
  */
 static void __put_nommu_region(struct vm_region *region)
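
vm_top is the thread running through the rest of the patch. The invariant, pieced together from these hunks: vm_end is what the mapping asked for, while vm_top is the end of the underlying allocation, which may be larger because backing pages come from a power-of-2 allocator.

    /*
     *  vm_start               vm_end              vm_top
     *     |---- mapped range ----|--- slack pages ---|
     *
     * vm_start <= vm_end <= vm_top, with vm_top == vm_end for
     * direct (file- or device-backed) mappings that carry no slack.
     */

Hence __put_nommu_region() tests vm_top to decide whether the region ever made it into the tree, and free_page_series() runs to vm_top rather than vm_end, so the slack pages are not leaked.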
@@ -547,7 +582,7 @@ static void __put_nommu_region(struct vm_region *region)
        BUG_ON(!nommu_region_tree.rb_node);
 
        if (atomic_dec_and_test(&region->vm_usage)) {
-               if (region->vm_end > region->vm_start)
+               if (region->vm_top > region->vm_start)
                        delete_nommu_region(region);
                up_write(&nommu_region_sem);
 
@@ -558,7 +593,7 @@ static void __put_nommu_region(struct vm_region *region)
                 * from ramfs/tmpfs mustn't be released here */
                if (region->vm_flags & VM_MAPPED_COPY) {
                        kdebug("free series");
-                       free_page_series(region->vm_start, region->vm_end);
+                       free_page_series(region->vm_start, region->vm_top);
                }
                kmem_cache_free(vm_region_jar, region);
        } else {
@@ -999,6 +1034,10 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
        int ret;
 
        ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+       if (ret == 0) {
+               vma->vm_region->vm_top = vma->vm_region->vm_end;
+               return ret;
+       }
        if (ret != -ENOSYS)
                return ret;
 
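A direct mapping established by the backing object's own ->mmap() has no allocator slack, so on success vm_top is simply pinned to vm_end before returning. The next hunk applies the same success-path bookkeeping to do_mmap_private(), where the old ret != -ENOSYS test lumped a successful direct map together with real errors; it is now split into an explicit success case and an error case.
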
@@ -1027,11 +1066,14 @@ static int do_mmap_private(struct vm_area_struct *vma,
         */
        if (vma->vm_file) {
                ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
-               if (ret != -ENOSYS) {
+               if (ret == 0) {
                        /* shouldn't return success if we're not sharing */
-                       BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
-                       return ret; /* success or a real error */
+                       BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
+                       vma->vm_region->vm_top = vma->vm_region->vm_end;
+                       return ret;
                }
+               if (ret != -ENOSYS)
+                       return ret;
 
                /* getting an ENOSYS error indicates that direct mmap isn't
                 * possible (as opposed to tried but failed) so we'll try to
@@ -1051,23 +1093,25 @@ static int do_mmap_private(struct vm_area_struct *vma,
        if (!pages)
                goto enomem;
 
-       /* we allocated a power-of-2 sized page set, so we need to trim off the
-        * excess */
        total = 1 << order;
-       atomic_add(total, &mmap_pages_allocated);
+       atomic_long_add(total, &mmap_pages_allocated);
 
        point = rlen >> PAGE_SHIFT;
-       while (total > point) {
-               order = ilog2(total - point);
-               n = 1 << order;
-               kdebug("shave %lu/%lu @%lu", n, total - point, total);
-               atomic_sub(n, &mmap_pages_allocated);
-               total -= n;
-               set_page_refcounted(pages + total);
-               __free_pages(pages + total, order);
+
+       /* we allocated a power-of-2 sized page set, so we may want to trim off
+        * the excess */
+       if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
+               while (total > point) {
+                       order = ilog2(total - point);
+                       n = 1 << order;
+                       kdebug("shave %lu/%lu @%lu", n, total - point, total);
+                       atomic_long_sub(n, &mmap_pages_allocated);
+                       total -= n;
+                       set_page_refcounted(pages + total);
+                       __free_pages(pages + total, order);
+               }
        }
 
-       total = rlen >> PAGE_SHIFT;
        for (point = 1; point < total; point++)
                set_page_refcounted(&pages[point]);
 
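A worked example of the now-conditional trim: a 5-page request allocates an order-3 block, so total = 8 and point = 5, leaving 3 excess pages. With sysctl_nr_trim_pages at its Kconfig default of 1, 3 >= 1 and the loop shaves an order-1 pair (pages 6-7, total drops to 6) and then an order-0 page (page 5, total drops to 5). With the sysctl set to 0, or to anything above 3, nothing is freed and total stays 8. That is also why the old "total = rlen >> PAGE_SHIFT;" reset had to go: the refcounting loop must now cover every page actually kept, slack included, so free_page_series() can release them up to vm_top later.
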
@@ -1075,6 +1119,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
        region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
        region->vm_start = (unsigned long) base;
        region->vm_end   = region->vm_start + rlen;
+       region->vm_top   = region->vm_start + (total << PAGE_SHIFT);
 
        vma->vm_start = region->vm_start;
        vma->vm_end   = region->vm_start + len;
@@ -1110,11 +1155,12 @@ error_free:
        free_page_series(region->vm_start, region->vm_end);
        region->vm_start = vma->vm_start = 0;
        region->vm_end   = vma->vm_end = 0;
+       region->vm_top   = 0;
        return ret;
 
 enomem:
-       printk("Allocation of length %lu from process %d failed\n",
-              len, current->pid);
+       printk("Allocation of length %lu from process %d (%s) failed\n",
+              len, current->pid, current->comm);
        show_free_areas();
        return -ENOMEM;
 }
@@ -1401,7 +1447,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        npages = (addr - vma->vm_start) >> PAGE_SHIFT;
 
        if (new_below) {
-               region->vm_end = new->vm_end = addr;
+               region->vm_top = region->vm_end = new->vm_end = addr;
        } else {
                region->vm_start = new->vm_start = addr;
                region->vm_pgoff = new->vm_pgoff += npages;
@@ -1418,6 +1464,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
                vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
        } else {
                vma->vm_region->vm_end = vma->vm_end = addr;
+               vma->vm_region->vm_top = addr;
        }
        add_nommu_region(vma->vm_region);
        add_nommu_region(new->vm_region);
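
In both halves of the split, only the piece that keeps the top of the original allocation inherits the old vm_top: with new_below, the new lower region gets vm_top == vm_end == addr and the surviving vma keeps the slack; otherwise the surviving vma is clipped to vm_top == vm_end == addr and the new upper region carries the slack. Either way the vm_start <= vm_end <= vm_top invariant holds on both pieces.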
@@ -1454,10 +1501,12 @@ static int shrink_vma(struct mm_struct *mm,
 
        down_write(&nommu_region_sem);
        delete_nommu_region(region);
-       if (from > region->vm_start)
-               region->vm_end = from;
-       else
+       if (from > region->vm_start) {
+               to = region->vm_top;
+               region->vm_top = region->vm_end = from;
+       } else {
                region->vm_start = to;
+       }
        add_nommu_region(region);
        up_write(&nommu_region_sem);
 
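Note the detail in the shrink-from-the-top branch: "to" is widened to the old vm_top before the region is clipped, so the free_page_series(from, to) call in the unshown remainder of the function releases the slack pages together with the truncated tail instead of leaking them above the new end.
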
@@ -1485,10 +1534,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
        /* find the first potentially overlapping VMA */
        vma = find_vma(mm, start);
        if (!vma) {
-               printk(KERN_WARNING
-                      "munmap of memory not mmapped by process %d (%s):"
-                      " 0x%lx-0x%lx\n",
-                      current->pid, current->comm, start, start + len - 1);
+               static int limit = 0;
+               if (limit < 5) {
+                       printk(KERN_WARNING
+                              "munmap of memory not mmapped by process %d"
+                              " (%s): 0x%lx-0x%lx\n",
+                              current->pid, current->comm,
+                              start, start + len - 1);
+                       limit++;
+               }
                return -EINVAL;
        }
 
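The warning is now throttled by an open-coded five-shot limiter: an unlocked static counter (the race is benign, at worst a few extra lines) that silences the message for the remainder of the system's lifetime after five hits. For contrast, the stock helper throttles by time rather than capping the lifetime total; a sketch, not what the patch uses:

    if (printk_ratelimit())
            printk(KERN_WARNING
                   "munmap of memory not mmapped by process %d (%s):"
                   " 0x%lx-0x%lx\n",
                   current->pid, current->comm, start, start + len - 1);
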
@@ -1540,7 +1594,7 @@ erase_whole_vma:
 }
 EXPORT_SYMBOL(do_munmap);
 
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 {
        int ret;
        struct mm_struct *mm = current->mm;
@@ -1624,10 +1678,9 @@ unsigned long do_mremap(unsigned long addr,
 }
 EXPORT_SYMBOL(do_mremap);
 
-asmlinkage
-unsigned long sys_mremap(unsigned long addr,
-                        unsigned long old_len, unsigned long new_len,
-                        unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+               unsigned long, new_len, unsigned long, flags,
+               unsigned long, new_addr)
 {
        unsigned long ret;
 
@@ -1799,12 +1852,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        if (mm)
                allowed -= mm->total_vm / 32;
 
-       /*
-        * cast `allowed' as a signed long because vm_committed_space
-        * sometimes has a negative value
-        */
-       if (atomic_long_read(&vm_committed_space) < (long)allowed)
+       if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
+
 error:
        vm_unacct_memory(pages);
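
percpu_counter_read_positive() reads only the batched central count (outstanding per-CPU deltas are not folded in, so the value is approximate) and never returns a negative number. That is what lets the old signed cast of allowed, needed because the shared atomic could transiently dip below zero, disappear along with its explanatory comment.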