*
* Written by obz.
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
+#include <linux/ima.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+
+#include "internal.h"
+
+#ifndef arch_mmap_check
+#define arch_mmap_check(addr, len, flags) (0)
+#endif
+
+#ifndef arch_rebalance_pgtables
+#define arch_rebalance_pgtables(addr, len) (addr)
+#endif
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
+pgprot_t vm_get_page_prot(unsigned long vm_flags)
+{
+ return __pgprot(pgprot_val(protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
+ pgprot_val(arch_vm_get_page_prot(vm_flags)));
+}
+EXPORT_SYMBOL(vm_get_page_prot);
+
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+struct percpu_counter vm_committed_as;
+
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
/*
* Check that a process has enough memory to allocate a new virtual
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
unsigned long free, allowed;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
unsigned long n;
- free = get_page_cache_size();
+ free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
- free += atomic_read(&slab_reclaim_pages);
+ free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
* Leave the last 3% for root
* only call if we're about to fail.
*/
n = nr_free_pages();
+
+ /*
+ * Leave reserved pages. The pages are not for anonymous pages.
+ */
+ if (n <= totalreserve_pages)
+ goto error;
+ else
+ n -= totalreserve_pages;
+
+ /*
+ * Leave the last 3% for root
+ */
if (!cap_sys_admin)
n -= n / 32;
free += n;
if (free > pages)
return 0;
- vm_unacct_memory(pages);
- return -ENOMEM;
+
+ goto error;
}
allowed = (totalram_pages - hugetlb_total_pages())
/* Don't let a single process grow too big:
leave 3% of the size of this process for other processes */
- allowed -= current->mm->total_vm / 32;
+ if (mm)
+ allowed -= mm->total_vm / 32;
- /*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
- */
- if (atomic_read(&vm_committed_space) < (long)allowed)
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
-
+error:
vm_unacct_memory(pages);
return -ENOMEM;
}
-EXPORT_SYMBOL(__vm_enough_memory);
-
/*
* Requires inode->i_mapping->i_mmap_lock
*/
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file->f_dentry->d_inode->i_writecount);
+ atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable--;
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
- mpol_free(vma_policy(vma));
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(vma->vm_mm);
+ }
+ mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
}
-asmlinkage unsigned long sys_brk(unsigned long brk)
+SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ unsigned long min_brk;
down_write(&mm->mmap_sem);
- if (brk < mm->end_code)
+#ifdef CONFIG_COMPAT_BRK
+ min_brk = mm->end_code;
+#else
+ min_brk = mm->start_brk;
+#endif
+ if (brk < min_brk)
+ goto out;
+
+ /*
+ * Check against rlimit here. If this check is done later after the test
+ * of oldbrk with newbrk then it can escape the test and let the data
+ * segment grow beyond its set limit the in case where the limit is
+ * not page aligned -Ram Gupta
+ */
+ rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
+ (mm->end_data - mm->start_data) > rlim)
goto out;
+
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk)
goto out;
}
- /* Check against rlimit.. */
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
- if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
- goto out;
-
/* Check against existing mmap mappings. */
if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
goto out;
printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
i++;
pn = nd;
+ prev = vma->vm_start;
+ pend = vma->vm_end;
}
j = 0;
for (nd = pn; nd; nd = rb_prev(nd)) {
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count)
printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
- if (bug)
- BUG();
+ BUG_ON(bug);
}
#else
#define validate_mm(mm) do { } while (0)
if (vma_tmp->vm_end > addr) {
vma = vma_tmp;
if (vma_tmp->vm_start <= addr)
- return vma;
+ break;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}
-static inline void __vma_link_file(struct vm_area_struct *vma)
+static void __vma_link_file(struct vm_area_struct *vma)
{
- struct file * file;
+ struct file *file;
file = vma->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file->f_dentry->d_inode->i_writecount);
+ atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
* insert vm structure into list and rbtree and anon_vma,
* but it has already been inserted into prio_tree earlier.
*/
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *__vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
- if (__vma && __vma->vm_start < vma->vm_end)
- BUG();
+ BUG_ON(__vma && __vma->vm_start < vma->vm_end);
__vma_link(mm, vma, prev, rb_link, rb_parent);
mm->map_count++;
}
spin_unlock(&mapping->i_mmap_lock);
if (remove_next) {
- if (file)
+ if (file) {
fput(file);
+ if (next->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ }
mm->map_count--;
- mpol_free(vma_policy(next));
+ mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
validate_mm(mm);
}
+/* Flags that can be inherited from an existing mapping when merging */
+#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
+
/*
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
*/
-#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
-
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if (vma->vm_flags != vm_flags)
+ if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
return 0;
if (vma->vm_file != file)
return 0;
* (e.g. stash info in next's anon_vma_node when assigning
* an anon_vma, or when trying vma_merge). Another time.
*/
- if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
- BUG();
+ BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
if (!near)
goto none;
const unsigned long stack_flags
= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-#ifdef CONFIG_HUGETLB
- if (flags & VM_HUGETLB) {
- if (!(flags & VM_DONTCOPY))
- mm->shared_vm += pages;
- return;
- }
-#endif /* CONFIG_HUGETLB */
-
if (file) {
mm->shared_vm += pages;
if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
* The caller must hold down_write(current->mm->mmap_sem).
*/
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
struct inode *inode;
unsigned int vm_flags;
- int correct_wcount = 0;
int error;
- struct rb_node ** rb_link, * rb_parent;
- int accountable = 1;
- unsigned long charged = 0, reqprot = prot;
-
- if (file) {
- if (is_file_hugepages(file))
- accountable = 0;
+ unsigned long reqprot = prot;
- if (!file->f_op || !file->f_op->mmap)
- return -ENODEV;
-
- if ((prot & PROT_EXEC) &&
- (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
- return -EPERM;
- }
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+ if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
if (!len)
return -EINVAL;
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
+
+ error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
if (!len || len > TASK_SIZE)
return -EPERM;
vm_flags |= VM_LOCKED;
}
+
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
return -EAGAIN;
}
- inode = file ? file->f_dentry->d_inode : NULL;
+ inode = file ? file->f_path.dentry->d_inode : NULL;
if (file) {
switch (flags & MAP_TYPE) {
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (vm_flags & VM_EXEC)
+ return -EPERM;
+ vm_flags &= ~VM_MAYEXEC;
+ }
+
+ if (!file->f_op || !file->f_op->mmap)
+ return -ENODEV;
break;
default:
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ /*
+ * Ignore pgoff.
+ */
+ pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
}
}
- error = security_file_mmap(file, reqprot, prot, flags);
+ error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+ if (error)
+ return error;
+ error = ima_file_mmap(file, prot);
if (error)
return error;
-
+
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+/*
+ * Some shared mappigns will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+ unsigned int vm_flags = vma->vm_flags;
+
+ /* If it was private or non-writable, the write bit is already clear */
+ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+ return 0;
+
+ /* The backer wishes to know when pages are first written to? */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+ return 1;
+
+ /* The open routine did something to the protections already? */
+ if (pgprot_val(vma->vm_page_prot) !=
+ pgprot_val(vm_get_page_prot(vm_flags)))
+ return 0;
+
+ /* Specialty mapping? */
+ if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+ return 0;
+
+ /* Can the mapping track the dirty pages? */
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+{
+ /*
+ * hugetlb has its own accounting separate from the core VM
+ * VM_HUGETLB may not be set yet so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return 0;
+
+ return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ unsigned int vm_flags, unsigned long pgoff)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ int correct_wcount = 0;
+ int error;
+ struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = 0;
+ struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- if (accountable && (!(flags & MAP_NORESERVE) ||
- sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
- if (vm_flags & VM_SHARED) {
- /* Check memory availability in shmem_file_setup? */
- vm_flags |= VM_ACCOUNT;
- } else if (vm_flags & VM_WRITE) {
- /*
- * Private writable mapping: check memory availability
- */
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
+ /*
+ * Set 'VM_NORESERVE' if we should not account for the
+ * memory use of this mapping.
+ */
+ if ((flags & MAP_NORESERVE)) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
+
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
}
/*
- * Can we just expand an old private anonymous mapping?
- * The VM_SHARED test is necessary because shmem_zero_setup
- * will create the file object for a shared anonymous map below.
+ * Private writable mapping: check memory availability
*/
- if (!file && !(vm_flags & VM_SHARED) &&
- vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, NULL, pgoff, NULL))
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+
+ /*
+ * Can we just expand an old mapping?
+ */
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ if (vma)
goto out;
/*
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
- memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
- vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
- == (VM_WRITE | VM_RESERVED)) {
- printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
- "PROT_WRITE mmap of VM_RESERVED memory, which "
- "is deprecated. Please report this to "
- "linux-kernel@vger.kernel.org\n",current->comm);
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
- error = -EACCES;
- goto unmap_and_free_vma;
- }
+ if (vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
- /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
- * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
- * that memory reservation must be checked; but that reservation
- * belongs to shared memory object, not to vma: so now clear it.
- */
- if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
- vma->vm_flags &= ~VM_ACCOUNT;
-
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
- if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
- vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
- file = vma->vm_file;
- vma_link(mm, vma, prev, rb_link, rb_parent);
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- } else {
- if (file) {
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- fput(file);
- }
- mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
- }
-out:
+ if (vma_wants_writenotify(vma))
+ vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
+
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ file = vma->vm_file;
+
+ /* Once vma denies write, undo our temporary denial count */
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+out:
+ perf_counter_mmap(vma);
+
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- mm->locked_vm += len >> PAGE_SHIFT;
+ /*
+ * makes pages present; downgrades, drops, reacquires mmap_sem
+ */
+ long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
+ if (nr_pages < 0)
+ return nr_pages; /* vma gone! */
+ mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
+ } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
make_pages_present(addr, addr + len);
- }
- if (flags & MAP_POPULATE) {
- up_write(&mm->mmap_sem);
- sys_remap_file_pages(addr, len, 0,
- pgoff, flags & MAP_NONBLOCK);
- down_write(&mm->mmap_sem);
- }
return addr;
unmap_and_free_vma:
return error;
}
-EXPORT_SYMBOL(do_mmap_pgoff);
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
if (len > TASK_SIZE)
return -ENOMEM;
+ if (flags & MAP_FIXED)
+ return addr;
+
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (len > TASK_SIZE)
return -ENOMEM;
+ if (flags & MAP_FIXED)
+ return addr;
+
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- unsigned long ret;
-
- if (!(flags & MAP_FIXED)) {
- unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-
- get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
- get_area = file->f_op->get_unmapped_area;
- addr = get_area(file, addr, len, pgoff, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
- }
+ unsigned long (*get_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+
+ get_area = current->mm->get_unmapped_area;
+ if (file && file->f_op && file->f_op->get_unmapped_area)
+ get_area = file->f_op->get_unmapped_area;
+ addr = get_area(file, addr, len, pgoff, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (addr & ~PAGE_MASK)
return -EINVAL;
- if (file && is_file_hugepages(file)) {
- /*
- * Check if the given range is hugepage aligned, and
- * can be made suitable for hugepages.
- */
- ret = prepare_hugepage_range(addr, len);
- } else {
- /*
- * Ensure that a normal request is not falling in a
- * reserved hugepage range. For some archs like IA-64,
- * there is a separate region for hugepages.
- */
- ret = is_hugepage_only_range(current->mm, addr, len);
- }
- if (ret)
- return -EINVAL;
- return addr;
+
+ return arch_rebalance_pgtables(addr, len);
}
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma = NULL, *prev = NULL;
- struct rb_node * rb_node;
+ struct rb_node *rb_node;
if (!mm)
goto out;
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
+ unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
return -ENOMEM;
}
+ /* Check to ensure the stack will not grow into a hugetlb-only region */
+ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+ vma->vm_end - size;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ return -EFAULT;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory(grow))
+ if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
/* Ok, everything looks good - let it rip */
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-#ifdef CONFIG_STACK_GROWSUP
-static inline
+#ifndef CONFIG_IA64
+static
#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
+ * Also guard against wrapping around to address 0.
*/
- address += 4 + PAGE_SIZE - 1;
- address &= PAGE_MASK;
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else {
+ anon_vma_unlock(vma);
+ return -ENOMEM;
+ }
error = 0;
/* Somebody else might have raced and expanded it already */
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
-#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
-{
- return expand_upwards(vma, address);
-}
-
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
- struct vm_area_struct *vma, *prev;
-
- addr &= PAGE_MASK;
- vma = find_vma_prev(mm, addr, &prev);
- if (vma && (vma->vm_start <= addr))
- return vma;
- if (!prev || expand_stack(prev, addr))
- return NULL;
- if (prev->vm_flags & VM_LOCKED) {
- make_pages_present(addr, prev->vm_end);
- }
- return prev;
-}
-#else
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+static int expand_downwards(struct vm_area_struct *vma,
+ unsigned long address)
{
int error;
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
+
+ address &= PAGE_MASK;
+ error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+ if (error)
+ return error;
+
anon_vma_lock(vma);
/*
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
- address &= PAGE_MASK;
- error = 0;
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
return error;
}
+int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address);
+}
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_upwards(vma, address);
+}
+
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma, *prev;
+
+ addr &= PAGE_MASK;
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+ if (!prev || expand_stack(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED) {
+ if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
+ return NULL; /* vma gone! */
+ }
+ return prev;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address);
+}
+
struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED) {
- make_pages_present(addr, start);
+ if (mlock_vma_pages_range(vma, addr, start) < 0)
+ return NULL; /* vma gone! */
}
return vma;
}
long nrpages = vma_pages(vma);
mm->total_vm -= nrpages;
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+ free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
}
/*
* Split a vma into two pieces at address 'addr', a new vma is allocated
- * either for the first part or the the tail.
+ * either for the first part or the tail.
*/
int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long addr, int new_below)
struct mempolicy *pol;
struct vm_area_struct *new;
- if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+ if (is_vm_hugetlb_page(vma) && (addr &
+ ~(huge_page_mask(hstate_vma(vma)))))
return -EINVAL;
if (mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
- new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return -ENOMEM;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
- pol = mpol_copy(vma_policy(vma));
+ pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
kmem_cache_free(vm_area_cachep, new);
return PTR_ERR(pol);
}
vma_set_policy(new, pol);
- if (new->vm_file)
+ if (new->vm_file) {
get_file(new->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
vma = prev? prev->vm_next: mm->mmap;
/*
+ * unlock any mlock()ed ranges before detaching vmas
+ */
+ if (mm->locked_vm) {
+ struct vm_area_struct *tmp = vma;
+ while (tmp && tmp->vm_start < end) {
+ if (tmp->vm_flags & VM_LOCKED) {
+ mm->locked_vm -= vma_pages(tmp);
+ munlock_vma_pages_all(tmp);
+ }
+ tmp = tmp->vm_next;
+ }
+ }
+
+ /*
* Remove the vma's, and unmap the actual pages
*/
detach_vmas_to_be_unmapped(mm, vma, prev, end);
EXPORT_SYMBOL(do_munmap);
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
int ret;
struct mm_struct *mm = current->mm;
unsigned long flags;
struct rb_node ** rb_link, * rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
+ int error;
len = PAGE_ALIGN(len);
if (!len)
if ((addr + len) > TASK_SIZE || (addr + len) < addr)
return -EINVAL;
+ if (is_hugepage_only_range(mm, addr, len))
+ return -EINVAL;
+
+ error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
+ if (error)
+ return error;
+
+ flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+
+ error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
/*
* mlock MCL_FUTURE?
*/
if (security_vm_enough_memory(len >> PAGE_SHIFT))
return -ENOMEM;
- flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
/* Can we just expand an old private anonymous mapping? */
- if (vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL))
+ vma = vma_merge(mm, prev, addr, addr + len, flags,
+ NULL, NULL, pgoff, NULL);
+ if (vma)
goto out;
/*
* create a vma struct for an anonymous mapping
*/
- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}
- memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
- vma->vm_page_prot = protection_map[flags & 0x0f];
+ vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
- mm->locked_vm += len >> PAGE_SHIFT;
- make_pages_present(addr, addr + len);
+ if (!mlock_vma_pages_range(vma, addr, addr + len))
+ mm->locked_vm += (len >> PAGE_SHIFT);
}
return addr;
}
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
unsigned long end;
+ /* mm's last user has gone, and its about to be pulled down */
+ mmu_notifier_release(mm);
+
+ if (mm->locked_vm) {
+ vma = mm->mmap;
+ while (vma) {
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_pages_all(vma);
+ vma = vma->vm_next;
+ }
+ }
+
+ arch_exit_mmap(mm);
+
+ vma = mm->mmap;
+ if (!vma) /* Can happen if dup_mmap() received an OOM */
+ return;
+
lru_add_drain();
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
- /* Don't update_hiwater_rss(mm) here, do_exit already did */
+ /* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
/*
if (__vma && __vma->vm_start < vma->vm_end)
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory(vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
vma_start < new_vma->vm_end)
*vmap = new_vma;
} else {
- new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
- pol = mpol_copy(vma_policy(vma));
+ pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
kmem_cache_free(vm_area_cachep, new_vma);
return NULL;
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
- if (new_vma->vm_file)
+ if (new_vma->vm_file) {
get_file(new_vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
return 0;
return 1;
}
+
+
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ pgoff_t pgoff;
+ struct page **pages;
+
+ /*
+ * special mappings have no vm_file, and in that case, the mm
+ * uses vm_pgoff internally. So we have to subtract it from here.
+ * We are allowed to do this because we are the mm; do not copy
+ * this code into drivers!
+ */
+ pgoff = vmf->pgoff - vma->vm_pgoff;
+
+ for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
+ pgoff--;
+
+ if (*pages) {
+ struct page *page = *pages;
+ get_page(page);
+ vmf->page = page;
+ return 0;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+};
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+int install_special_mapping(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, struct page **pages)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ if (unlikely(vma == NULL))
+ return -ENOMEM;
+
+ vma->vm_mm = mm;
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ vma->vm_ops = &special_mapping_vmops;
+ vma->vm_private_data = pages;
+
+ if (unlikely(insert_vm_struct(mm, vma))) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return -ENOMEM;
+ }
+
+ mm->total_vm += len >> PAGE_SHIFT;
+
+ perf_counter_mmap(vma);
+
+ return 0;
+}
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+ if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * The LSB of head.next can't change from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ /*
+ * We can safely modify head.next after taking the
+ * anon_vma->lock. If some other vma in this mm shares
+ * the same anon_vma we won't take it again.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us thanks to the
+ * anon_vma->lock.
+ */
+ if (__test_and_set_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ }
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+ if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change from under us because
+ * we hold the mm_all_locks_mutex.
+ *
+ * Operations on ->flags have to be atomic because
+ * even if AS_MM_ALL_LOCKS is stable thanks to the
+ * mm_all_locks_mutex, there may be other cpus
+ * changing other bitflags in parallel to us.
+ */
+ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+ BUG();
+ spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+ }
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid to take the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousand of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int ret = -EINTR;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+ mutex_lock(&mm_all_locks_mutex);
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ vm_lock_anon_vma(mm, vma->anon_vma);
+ }
+
+ ret = 0;
+
+out_unlock:
+ if (ret)
+ mm_drop_all_locks(mm);
+
+ return ret;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * The LSB of head.next can't change to 0 from under
+ * us because we hold the mm_all_locks_mutex.
+ *
+ * We must however clear the bitflag before unlocking
+ * the vma so the users using the anon_vma->head will
+ * never see our bitflag.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us until we release the
+ * anon_vma->lock.
+ */
+ if (!__test_and_clear_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ spin_unlock(&anon_vma->lock);
+ }
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+ if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change to 0 from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_unlock(&mapping->i_mmap_lock);
+ if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+ &mapping->flags))
+ BUG();
+ }
+}
+
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma)
+ vm_unlock_anon_vma(vma->anon_vma);
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
+
+ mutex_unlock(&mm_all_locks_mutex);
+}
+
+/*
+ * initialise the VMA slab
+ */
+void __init mmap_init(void)
+{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0);
+ VM_BUG_ON(ret);
+}