*
* Written by obz.
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
+#include <linux/ima.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
+#include <linux/perf_event.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
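+/*
+ * Batched per-CPU counter: replaces the global atomic, whose
+ * reads could transiently see a negative value.
+ */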
+struct percpu_counter vm_committed_as;
/*
* Check that a process has enough memory to allocate a new virtual
/* Don't let a single process grow too big:
leave 3% of the size of this process for other processes */
- allowed -= mm->total_vm / 32;
+ if (mm)
+ allowed -= mm->total_vm / 32;
- /*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
- */
- if (atomic_long_read(&vm_committed_space) < (long)allowed)
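+ /* percpu_counter_read_positive() clamps the approximate count at zero */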
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
-asmlinkage unsigned long sys_brk(unsigned long brk)
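+/*
+ * SYSCALL_DEFINEn generates the syscall entry point; on some 64-bit
+ * architectures it also emits wrappers that sign-extend 32-bit arguments.
+ */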
+SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}
-static inline void __vma_link_file(struct vm_area_struct *vma)
+static void __vma_link_file(struct vm_area_struct *vma)
{
- struct file * file;
+ struct file *file;
file = vma->vm_file;
if (file) {
* insert vm structure into list and rbtree and anon_vma,
* but it has already been inserted into prio_tree earlier.
*/
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *__vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
BUG_ON(__vma && __vma->vm_start < vma->vm_end);
/*
* When changing only vma->vm_end, we don't really need
- * anon_vma lock: but is that case worth optimizing out?
+ * anon_vma lock.
*/
- if (vma->anon_vma)
+ if (vma->anon_vma && (insert || importer || start != vma->vm_start))
anon_vma = vma->anon_vma;
if (anon_vma) {
spin_lock(&anon_vma->lock);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if (vma->vm_flags != vm_flags)
+ /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
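+ /* XOR yields the differing flag bits; all but VM_CAN_NONLINEAR must match */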
+ if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
return 0;
if (vma->vm_file != file)
return 0;
#endif /* CONFIG_PROC_FS */
/*
- * The caller must hold down_write(current->mm->mmap_sem).
+ * The caller must hold down_write(&current->mm->mmap_sem).
*/
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
struct inode *inode;
unsigned int vm_flags;
int error;
- int accountable = 1;
unsigned long reqprot = prot;
/*
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (flags & MAP_HUGETLB) {
+ struct user_struct *user = NULL;
+ if (file)
+ return -EINVAL;
+
+ /*
+ * VM_NORESERVE is used because the reservations will be
+ * taken when vm_ops->mmap() is called.
+ * A dummy user value is used because we are not locking
+ * memory, so no accounting is necessary.
+ */
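+ /* round up to the huge page size and back the mapping with an
+  * unlinked hugetlbfs file, so it takes the file-backed path below */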
+ len = ALIGN(len, huge_page_size(&default_hstate));
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ }
+
/* Obtain the address to map to. We verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (flags & MAP_LOCKED) {
+ if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
- vm_flags |= VM_LOCKED;
- }
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
- if (is_file_hugepages(file))
- accountable = 0;
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (error)
return error;
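+ /* IMA hook: measure the file for integrity appraisal before mapping */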
+ error = ima_file_mmap(file, prot);
+ if (error)
+ return error;
- return mmap_region(file, addr, len, flags, vm_flags, pgoff,
- accountable);
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
EXPORT_SYMBOL(do_mmap_pgoff);
mapping_cap_account_dirty(vma->vm_file->f_mapping);
}
+/*
+ * We account for memory if it's a private writable mapping that is
+ * not hugepage-backed and does not have VM_NORESERVE set.
+ */
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+{
+ /*
+ * hugetlb has its own accounting, separate from the core VM;
+ * VM_HUGETLB may not be set yet, so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return 0;
+
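+ /* accountable: writable, private (no VM_SHARED) and no VM_NORESERVE */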
+ return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
- unsigned int vm_flags, unsigned long pgoff,
- int accountable)
+ unsigned int vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- if (flags & MAP_NORESERVE)
- vm_flags |= VM_NORESERVE;
+ /*
+ * Set 'VM_NORESERVE' if we should not account for the
+ * memory use of this mapping.
+ */
+ if (flags & MAP_NORESERVE) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
- if (accountable && (!(flags & MAP_NORESERVE) ||
- sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
- if (vm_flags & VM_SHARED) {
- /* Check memory availability in shmem_file_setup? */
- vm_flags |= VM_ACCOUNT;
- } else if (vm_flags & VM_WRITE) {
- /*
- * Private writable mapping: check memory availability
- */
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
}
/*
- * Can we just expand an old private anonymous mapping?
- * The VM_SHARED test is necessary because shmem_zero_setup
- * will create the file object for a shared anonymous map below.
+ * Private writable mapping: check memory availability
*/
- if (!file && !(vm_flags & VM_SHARED)) {
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, NULL, pgoff, NULL);
- if (vma)
- goto out;
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
}
/*
+ * Can we just expand an old mapping?
+ */
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file,
+		pgoff, NULL);
+ if (vma)
+ goto out;
+
+ /*
* Determine the object being mapped and call the appropriate
* specific mapper. The address has already been validated, but
* not unmapped; the maps, however, are removed from the list.
goto unmap_and_free_vma;
if (vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
+
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ */
+ addr = vma->vm_start;
+ pgoff = vma->vm_pgoff;
+ vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
- /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
- * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
- * that memory reservation must be checked; but that reservation
- * belongs to shared memory object, not to vma: so now clear it.
- */
- if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
- vma->vm_flags &= ~VM_ACCOUNT;
-
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- */
- addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
-
if (vma_wants_writenotify(vma))
vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
- if (file && vma_merge(mm, prev, addr, vma->vm_end,
- vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
- mpol_put(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
- fput(file);
- if (vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
- } else {
- vma_link(mm, vma, prev, rb_link, rb_parent);
- file = vma->vm_file;
- }
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ file = vma->vm_file;
/* Once vma denies write, undo our temporary denial count */
if (correct_wcount)
atomic_inc(&inode->i_writecount);
out:
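+ /* emit PERF_RECORD_MMAP so profilers can resolve addresses in this vma */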
+ perf_event_mmap(vma);
+
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma = NULL, *prev = NULL;
- struct rb_node * rb_node;
+ struct rb_node *rb_node;
if (!mm)
goto out;
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory(grow))
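+ /* charge the mm that owns the vma; it need not be current->mm */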
+ if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
/* Ok, everything looks good - let it rip */
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
#ifndef CONFIG_IA64
-static inline
+static
#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-static inline int expand_downwards(struct vm_area_struct *vma,
+static int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
int error;
EXPORT_SYMBOL(do_munmap);
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
int ret;
struct mm_struct *mm = current->mm;
unsigned long end;
/* mm's last user has gone, and it's about to be pulled down */
- arch_exit_mmap(mm);
mmu_notifier_release(mm);
if (mm->locked_vm) {
vma = vma->vm_next;
}
}
+
+ arch_exit_mmap(mm);
+
vma = mm->mmap;
+ if (!vma) /* Can happen if dup_mmap() received an OOM */
+ return;
+
lru_add_drain();
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
- /* Don't update_hiwater_rss(mm) here, do_exit already did */
+ /* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
+
free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
{
}
-static struct vm_operations_struct special_mapping_vmops = {
+static const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
};
mm->total_vm += len >> PAGE_SHIFT;
+ perf_event_mmap(vma);
+
return 0;
}
mutex_unlock(&mm_all_locks_mutex);
}
+
+/*
+ * initialise the percpu counter for VM
+ */
+void __init mmap_init(void)
+{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0);
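+ /* the counter allocation can fail only under early-boot OOM */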
+ VM_BUG_ON(ret);
+}