*
* Written by obz.
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
+#include <linux/ima.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
+#include "internal.h"
+
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags) (0)
#endif
+#ifndef arch_rebalance_pgtables
+#define arch_rebalance_pgtables(addr, len) (addr)
+#endif
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- return protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ return __pgprot(pgprot_val(protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
+ pgprot_val(arch_vm_get_page_prot(vm_flags)));
}
EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+struct percpu_counter vm_committed_as;
+
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
/*
* Check that a process has enough memory to allocate a new virtual
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
unsigned long free, allowed;
/* Don't let a single process grow too big:
leave 3% of the size of this process for other processes */
- allowed -= current->mm->total_vm / 32;
+ if (mm)
+ allowed -= mm->total_vm / 32;
- /*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
- */
- if (atomic_read(&vm_committed_space) < (long)allowed)
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
-EXPORT_SYMBOL(__vm_enough_memory);
-
/*
* Requires inode->i_mapping->i_mmap_lock
*/
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
- mpol_free(vma_policy(vma));
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(vma->vm_mm);
+ }
+ mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
}
-asmlinkage unsigned long sys_brk(unsigned long brk)
+SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ unsigned long min_brk;
down_write(&mm->mmap_sem);
- if (brk < mm->end_code)
+#ifdef CONFIG_COMPAT_BRK
+ min_brk = mm->end_code;
+#else
+ min_brk = mm->start_brk;
+#endif
+ if (brk < min_brk)
goto out;
/*
* not page aligned -Ram Gupta
*/
rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
- if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
+ if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
+ (mm->end_data - mm->start_data) > rlim)
goto out;
newbrk = PAGE_ALIGN(brk);
if (vma_tmp->vm_end > addr) {
vma = vma_tmp;
if (vma_tmp->vm_start <= addr)
- return vma;
+ break;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}
-static inline void __vma_link_file(struct vm_area_struct *vma)
+static void __vma_link_file(struct vm_area_struct *vma)
{
- struct file * file;
+ struct file *file;
file = vma->vm_file;
if (file) {
* insert vm structure into list and rbtree and anon_vma,
* but it has already been inserted into prio_tree earlier.
*/
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *__vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
BUG_ON(__vma && __vma->vm_start < vma->vm_end);
spin_unlock(&mapping->i_mmap_lock);
if (remove_next) {
- if (file)
+ if (file) {
fput(file);
+ if (next->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ }
mm->map_count--;
- mpol_free(vma_policy(next));
+ mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
validate_mm(mm);
}
+/* Flags that can be inherited from an existing mapping when merging */
+#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
+
/*
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
*/
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
-
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if (vma->vm_flags != vm_flags)
+ if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
return 0;
if (vma->vm_file != file)
return 0;
* The caller must hold down_write(current->mm->mmap_sem).
*/
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
struct inode *inode;
unsigned int vm_flags;
- int correct_wcount = 0;
int error;
- struct rb_node ** rb_link, * rb_parent;
- int accountable = 1;
- unsigned long charged = 0, reqprot = prot;
+ unsigned long reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
if (!len)
return -EINVAL;
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
+
error = arch_mmap_check(addr, len, flags);
if (error)
return error;
return -EPERM;
vm_flags |= VM_LOCKED;
}
+
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
- if (is_file_hugepages(file))
- accountable = 0;
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ /*
+ * Ignore pgoff.
+ */
+ pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
}
}
- error = security_file_mmap(file, reqprot, prot, flags);
+ error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (error)
return error;
-
+ error = ima_file_mmap(file, prot);
+ if (error)
+ return error;
+
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+/*
+ * Some shared mappigns will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+ unsigned int vm_flags = vma->vm_flags;
+
+ /* If it was private or non-writable, the write bit is already clear */
+ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+ return 0;
+
+ /* The backer wishes to know when pages are first written to? */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+ return 1;
+
+ /* The open routine did something to the protections already? */
+ if (pgprot_val(vma->vm_page_prot) !=
+ pgprot_val(vm_get_page_prot(vm_flags)))
+ return 0;
+
+ /* Specialty mapping? */
+ if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+ return 0;
+
+ /* Can the mapping track the dirty pages? */
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+{
+ /*
+ * hugetlb has its own accounting separate from the core VM
+ * VM_HUGETLB may not be set yet so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return 0;
+
+ return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ unsigned int vm_flags, unsigned long pgoff)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ int correct_wcount = 0;
+ int error;
+ struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = 0;
+ struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- if (accountable && (!(flags & MAP_NORESERVE) ||
- sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
- if (vm_flags & VM_SHARED) {
- /* Check memory availability in shmem_file_setup? */
- vm_flags |= VM_ACCOUNT;
- } else if (vm_flags & VM_WRITE) {
- /*
- * Private writable mapping: check memory availability
- */
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
+ /*
+ * Set 'VM_NORESERVE' if we should not account for the
+ * memory use of this mapping.
+ */
+ if ((flags & MAP_NORESERVE)) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
+
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
+ }
+
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
}
/*
- * Can we just expand an old private anonymous mapping?
- * The VM_SHARED test is necessary because shmem_zero_setup
- * will create the file object for a shared anonymous map below.
+ * Can we just expand an old mapping?
*/
- if (!file && !(vm_flags & VM_SHARED) &&
- vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, NULL, pgoff, NULL))
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ if (vma)
goto out;
/*
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
- vma->vm_page_prot = protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
+ if (vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
- /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
- * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
- * that memory reservation must be checked; but that reservation
- * belongs to shared memory object, not to vma: so now clear it.
- */
- if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
- vma->vm_flags &= ~VM_ACCOUNT;
-
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
vm_flags = vma->vm_flags;
if (vma_wants_writenotify(vma))
- vma->vm_page_prot =
- protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
- if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
- vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
- file = vma->vm_file;
- vma_link(mm, vma, prev, rb_link, rb_parent);
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- } else {
- if (file) {
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- fput(file);
- }
- mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
- }
-out:
+ vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
+
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ file = vma->vm_file;
+
+ /* Once vma denies write, undo our temporary denial count */
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+out:
+ perf_counter_mmap(vma);
+
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- mm->locked_vm += len >> PAGE_SHIFT;
+ /*
+ * makes pages present; downgrades, drops, reacquires mmap_sem
+ */
+ long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
+ if (nr_pages < 0)
+ return nr_pages; /* vma gone! */
+ mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
+ } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
make_pages_present(addr, addr + len);
- }
- if (flags & MAP_POPULATE) {
- up_write(&mm->mmap_sem);
- sys_remap_file_pages(addr, len, 0,
- pgoff, flags & MAP_NONBLOCK);
- down_write(&mm->mmap_sem);
- }
return addr;
unmap_and_free_vma:
return error;
}
-EXPORT_SYMBOL(do_mmap_pgoff);
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
if (addr & ~PAGE_MASK)
return -EINVAL;
- return addr;
+ return arch_rebalance_pgtables(addr, len);
}
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma = NULL, *prev = NULL;
- struct rb_node * rb_node;
+ struct rb_node *rb_node;
if (!mm)
goto out;
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory(grow))
+ if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
/* Ok, everything looks good - let it rip */
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
#ifndef CONFIG_IA64
-static inline
+static
#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
+ * Also guard against wrapping around to address 0.
*/
- address += 4 + PAGE_SIZE - 1;
- address &= PAGE_MASK;
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else {
+ anon_vma_unlock(vma);
+ return -ENOMEM;
+ }
error = 0;
/* Somebody else might have raced and expanded it already */
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
-#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
-{
- return expand_upwards(vma, address);
-}
-
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
- struct vm_area_struct *vma, *prev;
-
- addr &= PAGE_MASK;
- vma = find_vma_prev(mm, addr, &prev);
- if (vma && (vma->vm_start <= addr))
- return vma;
- if (!prev || expand_stack(prev, addr))
- return NULL;
- if (prev->vm_flags & VM_LOCKED) {
- make_pages_present(addr, prev->vm_end);
- }
- return prev;
-}
-#else
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+static int expand_downwards(struct vm_area_struct *vma,
+ unsigned long address)
{
int error;
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
+
+ address &= PAGE_MASK;
+ error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+ if (error)
+ return error;
+
anon_vma_lock(vma);
/*
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
- address &= PAGE_MASK;
- error = 0;
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
return error;
}
+int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address);
+}
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_upwards(vma, address);
+}
+
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma, *prev;
+
+ addr &= PAGE_MASK;
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+ if (!prev || expand_stack(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED) {
+ if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
+ return NULL; /* vma gone! */
+ }
+ return prev;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address);
+}
+
struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED) {
- make_pages_present(addr, start);
+ if (mlock_vma_pages_range(vma, addr, start) < 0)
+ return NULL; /* vma gone! */
}
return vma;
}
long nrpages = vma_pages(vma);
mm->total_vm -= nrpages;
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+ free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
}
struct mempolicy *pol;
struct vm_area_struct *new;
- if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+ if (is_vm_hugetlb_page(vma) && (addr &
+ ~(huge_page_mask(hstate_vma(vma)))))
return -EINVAL;
if (mm->map_count >= sysctl_max_map_count)
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
- pol = mpol_copy(vma_policy(vma));
+ pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
kmem_cache_free(vm_area_cachep, new);
return PTR_ERR(pol);
}
vma_set_policy(new, pol);
- if (new->vm_file)
+ if (new->vm_file) {
get_file(new->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
vma = prev? prev->vm_next: mm->mmap;
/*
+ * unlock any mlock()ed ranges before detaching vmas
+ */
+ if (mm->locked_vm) {
+ struct vm_area_struct *tmp = vma;
+ while (tmp && tmp->vm_start < end) {
+ if (tmp->vm_flags & VM_LOCKED) {
+ mm->locked_vm -= vma_pages(tmp);
+ munlock_vma_pages_all(tmp);
+ }
+ tmp = tmp->vm_next;
+ }
+ }
+
+ /*
* Remove the vma's, and unmap the actual pages
*/
detach_vmas_to_be_unmapped(mm, vma, prev, end);
EXPORT_SYMBOL(do_munmap);
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
int ret;
struct mm_struct *mm = current->mm;
if (is_hugepage_only_range(mm, addr, len))
return -EINVAL;
+ error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
+ if (error)
+ return error;
+
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = arch_mmap_check(addr, len, flags);
return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
- if (vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL))
+ vma = vma_merge(mm, prev, addr, addr + len, flags,
+ NULL, NULL, pgoff, NULL);
+ if (vma)
goto out;
/*
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
- vma->vm_page_prot = protection_map[flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
- mm->locked_vm += len >> PAGE_SHIFT;
- make_pages_present(addr, addr + len);
+ if (!mlock_vma_pages_range(vma, addr, addr + len))
+ mm->locked_vm += (len >> PAGE_SHIFT);
}
return addr;
}
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
unsigned long end;
/* mm's last user has gone, and its about to be pulled down */
+ mmu_notifier_release(mm);
+
+ if (mm->locked_vm) {
+ vma = mm->mmap;
+ while (vma) {
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_pages_all(vma);
+ vma = vma->vm_next;
+ }
+ }
+
arch_exit_mmap(mm);
+ vma = mm->mmap;
+ if (!vma) /* Can happen if dup_mmap() received an OOM */
+ return;
+
lru_add_drain();
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
- /* Don't update_hiwater_rss(mm) here, do_exit already did */
+ /* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
/*
if (__vma && __vma->vm_start < vma->vm_end)
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory(vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
- pol = mpol_copy(vma_policy(vma));
+ pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
kmem_cache_free(vm_area_cachep, new_vma);
return NULL;
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
- if (new_vma->vm_file)
+ if (new_vma->vm_file) {
get_file(new_vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
}
-static struct page *special_mapping_nopage(struct vm_area_struct *vma,
- unsigned long address, int *type)
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
+ pgoff_t pgoff;
struct page **pages;
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /*
+ * special mappings have no vm_file, and in that case, the mm
+ * uses vm_pgoff internally. So we have to subtract it from here.
+ * We are allowed to do this because we are the mm; do not copy
+ * this code into drivers!
+ */
+ pgoff = vmf->pgoff - vma->vm_pgoff;
- address -= vma->vm_start;
- for (pages = vma->vm_private_data; address > 0 && *pages; ++pages)
- address -= PAGE_SIZE;
+ for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
+ pgoff--;
if (*pages) {
struct page *page = *pages;
get_page(page);
- return page;
+ vmf->page = page;
+ return 0;
}
- return NOPAGE_SIGBUS;
+ return VM_FAULT_SIGBUS;
}
/*
static struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
- .nopage = special_mapping_nopage,
+ .fault = special_mapping_fault,
};
/*
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags;
- vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
mm->total_vm += len >> PAGE_SHIFT;
+ perf_counter_mmap(vma);
+
return 0;
}
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+ if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * The LSB of head.next can't change from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ /*
+ * We can safely modify head.next after taking the
+ * anon_vma->lock. If some other vma in this mm shares
+ * the same anon_vma we won't take it again.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us thanks to the
+ * anon_vma->lock.
+ */
+ if (__test_and_set_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ }
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+ if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change from under us because
+ * we hold the mm_all_locks_mutex.
+ *
+ * Operations on ->flags have to be atomic because
+ * even if AS_MM_ALL_LOCKS is stable thanks to the
+ * mm_all_locks_mutex, there may be other cpus
+ * changing other bitflags in parallel to us.
+ */
+ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+ BUG();
+ spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+ }
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid to take the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousand of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int ret = -EINTR;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+ mutex_lock(&mm_all_locks_mutex);
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ vm_lock_anon_vma(mm, vma->anon_vma);
+ }
+
+ ret = 0;
+
+out_unlock:
+ if (ret)
+ mm_drop_all_locks(mm);
+
+ return ret;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * The LSB of head.next can't change to 0 from under
+ * us because we hold the mm_all_locks_mutex.
+ *
+ * We must however clear the bitflag before unlocking
+ * the vma so the users using the anon_vma->head will
+ * never see our bitflag.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us until we release the
+ * anon_vma->lock.
+ */
+ if (!__test_and_clear_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ spin_unlock(&anon_vma->lock);
+ }
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+ if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change to 0 from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_unlock(&mapping->i_mmap_lock);
+ if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+ &mapping->flags))
+ BUG();
+ }
+}
+
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma)
+ vm_unlock_anon_vma(vma->anon_vma);
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
+
+ mutex_unlock(&mm_all_locks_mutex);
+}
+
+/*
+ * initialise the VMA slab
+ */
+void __init mmap_init(void)
+{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0);
+ VM_BUG_ON(ret);
+}