X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fnommu.c;h=17773862619b5334832b76a062411c830e90135b;hb=de3fab39348dff18c69a0cd04efee9c276a02f51;hp=23fb033e596d40e3d2bea57de77fb1e97bd87501;hpb=e9536ae7205d255bc94616b72910fc6e16c861fe;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/nommu.c b/mm/nommu.c index 23fb033..1777386 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -6,12 +6,14 @@ * * See Documentation/nommu-mmap.txt * - * Copyright (c) 2004-2005 David Howells + * Copyright (c) 2004-2008 David Howells * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer + * Copyright (c) 2007-2009 Paul Mundt */ +#include #include #include #include @@ -20,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -31,74 +33,55 @@ #include #include #include +#include +#include "internal.h" + +static inline __attribute__((format(printf, 1, 2))) +void no_printk(const char *fmt, ...) +{ +} + +#if 0 +#define kenter(FMT, ...) \ + printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) +#else +#define kenter(FMT, ...) \ + no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) +#endif void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; -unsigned long askedalloc, realalloc; -atomic_t vm_committed_space = ATOMIC_INIT(0); +unsigned long highest_memmap_pfn; +struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; +atomic_long_t mmap_pages_allocated; + EXPORT_SYMBOL(mem_map); -EXPORT_SYMBOL(__vm_enough_memory); +EXPORT_SYMBOL(num_physpages); -/* list of shareable VMAs */ -struct rb_root nommu_vma_tree = RB_ROOT; -DECLARE_RWSEM(nommu_vma_sem); +/* list of mapped, potentially shareable regions */ +static struct kmem_cache *vm_region_jar; +struct rb_root nommu_region_tree = RB_ROOT; +DECLARE_RWSEM(nommu_region_sem); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { }; -EXPORT_SYMBOL(vfree); -EXPORT_SYMBOL(vmalloc_to_page); -EXPORT_SYMBOL(vmalloc_32); -EXPORT_SYMBOL(vmap); -EXPORT_SYMBOL(vunmap); - -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - i_size_write(inode, offset); - - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op && inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -EXPORT_SYMBOL(vmtruncate); - /* * Return the total memory allocated for this pointer, not * just what the caller asked for. @@ -109,46 +92,66 @@ unsigned int kobjsize(const void *objp) { struct page *page; - if (!objp || !((page = virt_to_page(objp)))) + /* + * If the object we have should not have ksize performed on it, + * return size of 0 + */ + if (!objp || !virt_addr_valid(objp)) return 0; + page = virt_to_head_page(objp); + + /* + * If the allocator sets PageSlab, we know the pointer came from + * kmalloc(). + */ if (PageSlab(page)) return ksize(objp); - BUG_ON(page->index < 0); - BUG_ON(page->index >= MAX_ORDER); + /* + * If it's not a compound page, see if we have a matching VMA + * region. This test is intentionally done in reverse order, + * so if there's no VMA, we still fall through and hand back + * PAGE_SIZE for 0-order pages. + */ + if (!PageCompound(page)) { + struct vm_area_struct *vma; - return (PAGE_SIZE << page->index); + vma = find_vma(current->mm, (unsigned long)objp); + if (vma) + return vma->vm_end - vma->vm_start; + } + + /* + * The ksize() function is only guaranteed to work for pointers + * returned by kmalloc(). So handle arbitrary pointers here. + */ + return PAGE_SIZE << compound_order(page); } -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int nr_pages, unsigned int foll_flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; /* calculate required read or write permissions. - * - if 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - for (i = 0; i < len; i++) { + for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) goto finish_or_fault; /* protect what we can, including chardevs */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP) || + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) goto finish_or_fault; @@ -168,34 +171,99 @@ finish_or_fault: return i ? : -EFAULT; } +/* + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int nr_pages, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); +} EXPORT_SYMBOL(get_user_pages); +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; -void vfree(void *addr) +void vfree(const void *addr) { kfree(addr); } +EXPORT_SYMBOL(vfree); void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { /* - * kmalloc doesn't like __GFP_HIGHMEM for some reason + * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() + * returns only a logical address. */ return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } +EXPORT_SYMBOL(__vmalloc); -struct page * vmalloc_to_page(void *addr) +void *vmalloc_user(unsigned long size) +{ + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (ret) { + struct vm_area_struct *vma; + + down_write(¤t->mm->mmap_sem); + vma = find_vma(current->mm, (unsigned long)ret); + if (vma) + vma->vm_flags |= VM_USERMAP; + up_write(¤t->mm->mmap_sem); + } + + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + +struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); } +EXPORT_SYMBOL(vmalloc_to_page); -unsigned long vmalloc_to_pfn(void *addr) +unsigned long vmalloc_to_pfn(const void *addr) { return page_to_pfn(virt_to_page(addr)); } - +EXPORT_SYMBOL(vmalloc_to_pfn); long vread(char *buf, char *addr, unsigned long count) { @@ -236,9 +304,29 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); -/* - * vmalloc_32 - allocate virtually continguos memory (32bit addressable) +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); +} + +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the @@ -248,17 +336,73 @@ void *vmalloc_32(unsigned long size) { return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); } +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + * + * VM_USERMAP is set on the corresponding VMA so that subsequent calls to + * remap_vmalloc_range() are permissible. + */ +void *vmalloc_32_user(unsigned long size) +{ + /* + * We'll have to sort out the ZONE_DMA bits for 64-bit, + * but for now this can simply use vmalloc_user() directly. + */ + return vmalloc_user(size); +} +EXPORT_SYMBOL(vmalloc_32_user); void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { BUG(); return NULL; } +EXPORT_SYMBOL(vmap); + +void vunmap(const void *addr) +{ + BUG(); +} +EXPORT_SYMBOL(vunmap); + +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vm_map_ram); -void vunmap(void *addr) +void vm_unmap_ram(const void *mem, unsigned int count) { BUG(); } +EXPORT_SYMBOL(vm_unmap_ram); + +void vm_unmap_aliases(void) +{ +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/* + * Implement a stub for vmalloc_sync_all() if the architecture chose not to + * have one. + */ +void __attribute__((weak)) vmalloc_sync_all(void) +{ +} + +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_page); /* * sys_brk() for the most part doesn't need the global kernel @@ -267,7 +411,7 @@ void vunmap(void *addr) * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { struct mm_struct *mm = current->mm; @@ -288,127 +432,195 @@ asmlinkage unsigned long sys_brk(unsigned long brk) /* * Ok, looks good - let it rip. */ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } -#ifdef DEBUG -static void show_process_blocks(void) +/* + * initialise the VMA and region record slabs + */ +void __init mmap_init(void) { - struct vm_list_struct *vml; - - printk("Process blocks %d:", current->pid); + int ret; - for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { - printk(" %p: %p", vml, vml->vma); - if (vml->vma) - printk(" (%d @%lx #%d)", - kobjsize((void *) vml->vma->vm_start), - vml->vma->vm_start, - atomic_read(&vml->vma->vm_usage)); - printk(vml->next ? " ->" : ".\n"); - } + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } -#endif /* DEBUG */ /* - * add a VMA into a process's mm_struct in the appropriate place in the list - * - should be called with mm->mmap_sem held writelocked + * validate the region tree + * - the caller must hold the region lock */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +#ifdef CONFIG_DEBUG_NOMMU_REGIONS +static noinline void validate_nommu_regions(void) { - struct vm_list_struct **ppv; + struct vm_region *region, *last; + struct rb_node *p, *lastp; - for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) - if ((*ppv)->vma->vm_start > vml->vma->vm_start) - break; + lastp = rb_first(&nommu_region_tree); + if (!lastp) + return; + + last = rb_entry(lastp, struct vm_region, vm_rb); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); - vml->next = *ppv; - *ppv = vml; + while ((p = rb_next(lastp))) { + region = rb_entry(p, struct vm_region, vm_rb); + last = rb_entry(lastp, struct vm_region, vm_rb); + + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); + + lastp = p; + } } +#else +static void validate_nommu_regions(void) +{ +} +#endif /* - * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_sem at least held readlocked + * add a region into the global tree */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +static void add_nommu_region(struct vm_region *region) { - struct vm_list_struct *loop, *vml; + struct vm_region *pregion; + struct rb_node **p, *parent; - /* search the vm_start ordered list */ - vml = NULL; - for (loop = mm->context.vmlist; loop; loop = loop->next) { - if (loop->vma->vm_start > addr) - break; - vml = loop; + validate_nommu_regions(); + + parent = NULL; + p = &nommu_region_tree.rb_node; + while (*p) { + parent = *p; + pregion = rb_entry(parent, struct vm_region, vm_rb); + if (region->vm_start < pregion->vm_start) + p = &(*p)->rb_left; + else if (region->vm_start > pregion->vm_start) + p = &(*p)->rb_right; + else if (pregion == region) + return; + else + BUG(); } - if (vml && vml->vma->vm_end > addr) - return vml->vma; + rb_link_node(®ion->vm_rb, parent, p); + rb_insert_color(®ion->vm_rb, &nommu_region_tree); - return NULL; + validate_nommu_regions(); } -EXPORT_SYMBOL(find_vma); /* - * find a VMA - * - we don't extend stack VMAs under NOMMU conditions + * delete a region from the global tree */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +static void delete_nommu_region(struct vm_region *region) { - return find_vma(mm, addr); + BUG_ON(!nommu_region_tree.rb_node); + + validate_nommu_regions(); + rb_erase(®ion->vm_rb, &nommu_region_tree); + validate_nommu_regions(); } /* - * look up the first VMA exactly that exactly matches addr - * - should be called with mm->mmap_sem at least held readlocked + * free a contiguous series of pages */ -static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, - unsigned long addr) +static void free_page_series(unsigned long from, unsigned long to) { - struct vm_list_struct *vml; - - /* search the vm_start ordered list */ - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (vml->vma->vm_start == addr) - return vml->vma; - if (vml->vma->vm_start > addr) - break; + for (; from < to; from += PAGE_SIZE) { + struct page *page = virt_to_page(from); + + kdebug("- free %lx", from); + atomic_long_dec(&mmap_pages_allocated); + if (page_count(page) != 1) + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); + put_page(page); } - - return NULL; } /* - * find a VMA in the global tree + * release a reference to a region + * - the caller must hold the region semaphore for writing, which this releases + * - the region may not have been added to the tree yet, in which case vm_top + * will equal vm_start */ -static inline struct vm_area_struct *find_nommu_vma(unsigned long start) +static void __put_nommu_region(struct vm_region *region) + __releases(nommu_region_sem) { - struct vm_area_struct *vma; - struct rb_node *n = nommu_vma_tree.rb_node; + kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); - while (n) { - vma = rb_entry(n, struct vm_area_struct, vm_rb); + BUG_ON(!nommu_region_tree.rb_node); - if (start < vma->vm_start) - n = n->rb_left; - else if (start > vma->vm_start) - n = n->rb_right; - else - return vma; + if (atomic_dec_and_test(®ion->vm_usage)) { + if (region->vm_top > region->vm_start) + delete_nommu_region(region); + up_write(&nommu_region_sem); + + if (region->vm_file) + fput(region->vm_file); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ + if (region->vm_flags & VM_MAPPED_COPY) { + kdebug("free series"); + free_page_series(region->vm_start, region->vm_top); + } + kmem_cache_free(vm_region_jar, region); + } else { + up_write(&nommu_region_sem); } +} - return NULL; +/* + * release a reference to a region + */ +static void put_nommu_region(struct vm_region *region) +{ + down_write(&nommu_region_sem); + __put_nommu_region(region); } /* - * add a VMA in the global tree + * update protection on a vma + */ +static void protect_vma(struct vm_area_struct *vma, unsigned long flags) +{ +#ifdef CONFIG_MPU + struct mm_struct *mm = vma->vm_mm; + long start = vma->vm_start & PAGE_MASK; + while (start < vma->vm_end) { + protect_page(mm, start, flags); + start += PAGE_SIZE; + } + update_protections(mm); +#endif +} + +/* + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_sem held writelocked */ -static void add_nommu_vma(struct vm_area_struct *vma) +static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma; + struct vm_area_struct *pvma, **pp; struct address_space *mapping; - struct rb_node **p = &nommu_vma_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p, *parent; + + kenter(",%p", vma); + + BUG_ON(!vma->vm_region); + + mm->map_count++; + vma->vm_mm = mm; + + protect_vma(vma, vma->vm_flags); /* add the VMA to the mapping */ if (vma->vm_file) { @@ -419,42 +631,64 @@ static void add_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* add the VMA to the master list */ + /* add the VMA to the tree */ + parent = NULL; + p = &mm->mm_rb.rb_node; while (*p) { parent = *p; pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - if (vma->vm_start < pvma->vm_start) { + /* sort by: start addr, end addr, VMA struct addr in that order + * (the latter is necessary as we may get identical VMAs) */ + if (vma->vm_start < pvma->vm_start) p = &(*p)->rb_left; - } - else if (vma->vm_start > pvma->vm_start) { + else if (vma->vm_start > pvma->vm_start) p = &(*p)->rb_right; - } - else { - /* mappings are at the same address - this can only - * happen for shared-mem chardevs and shared file - * mappings backed by ramfs/tmpfs */ - BUG_ON(!(pvma->vm_flags & VM_SHARED)); - - if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) - p = &(*p)->rb_right; - else - BUG(); - } + else if (vma->vm_end < pvma->vm_end) + p = &(*p)->rb_left; + else if (vma->vm_end > pvma->vm_end) + p = &(*p)->rb_right; + else if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) + p = &(*p)->rb_right; + else + BUG(); } rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &nommu_vma_tree); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); + + /* add VMA to the VMA list also */ + for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { + if (pvma->vm_start > vma->vm_start) + break; + if (pvma->vm_start < vma->vm_start) + continue; + if (pvma->vm_end < vma->vm_end) + break; + } + + vma->vm_next = *pp; + *pp = vma; } /* - * delete a VMA from the global list + * delete a VMA from its owning mm_struct and address space */ -static void delete_nommu_vma(struct vm_area_struct *vma) +static void delete_vma_from_mm(struct vm_area_struct *vma) { + struct vm_area_struct **pp; struct address_space *mapping; + struct mm_struct *mm = vma->vm_mm; + + kenter("%p", vma); + + protect_vma(vma, 0); + + mm->map_count--; + if (mm->mmap_cache == vma) + mm->mmap_cache = NULL; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -465,8 +699,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* remove from the master list */ - rb_erase(&vma->vm_rb, &nommu_vma_tree); + /* remove from the MM's tree and list */ + rb_erase(&vma->vm_rb, &mm->mm_rb); + for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { + if (*pp == vma) { + *pp = vma->vm_next; + break; + } + } + + vma->vm_mm = NULL; +} + +/* + * destroy a VMA record + */ +static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + kenter("%p", vma); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } + put_nommu_region(vma->vm_region); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start <= addr && vma->vm_end > addr) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end > addr) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + unsigned long end = addr + len; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start == addr && vma->vm_end == end) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start < addr) + continue; + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end == end) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; } /* @@ -481,12 +822,12 @@ static int validate_mmap_request(struct file *file, unsigned long pgoff, unsigned long *_capabilities) { - unsigned long capabilities; + unsigned long capabilities, rlen; unsigned long reqprot = prot; int ret; /* do the simple checks first */ - if (flags & MAP_FIXED || addr) { + if (flags & MAP_FIXED) { printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n", current->pid); @@ -501,12 +842,12 @@ static int validate_mmap_request(struct file *file, return -EINVAL; /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + rlen = PAGE_ALIGN(len); + if (!rlen || rlen > TASK_SIZE) return -ENOMEM; /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; if (file) { @@ -557,6 +898,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. */ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && @@ -630,7 +975,7 @@ static int validate_mmap_request(struct file *file, } /* allow the security API to have its say */ - ret = security_file_mmap(file, reqprot, prot, flags); + ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (ret < 0) return ret; @@ -673,20 +1018,25 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) + if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) vm_flags &= ~VM_MAYSHARE; return vm_flags; } /* - * set up a shared mapping on a file + * set up a shared mapping on a file (the driver or filesystem provides and + * pins the storage) */ -static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret == 0) { + vma->vm_region->vm_top = vma->vm_region->vm_end; + return 0; + } if (ret != -ENOSYS) return ret; @@ -700,45 +1050,79 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) /* * set up a private mapping or an anonymous shared mapping */ -static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_private(struct vm_area_struct *vma, + struct vm_region *region, + unsigned long len, + unsigned long capabilities) { + struct page *pages; + unsigned long total, point, n, rlen; void *base; - int ret; + int ret, order; /* invoke the file's mapping function so that it can keep track of * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (vma->vm_file) { + if (capabilities & BDI_CAP_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); - if (ret != -ENOSYS) { + if (ret == 0) { /* shouldn't return success if we're not sharing */ - BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); - return ret; /* success or a real error */ + BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); + vma->vm_region->vm_top = vma->vm_region->vm_end; + return 0; } + if (ret != -ENOSYS) + return ret; /* getting an ENOSYS error indicates that direct mmap isn't * possible (as opposed to tried but failed) so we'll try to * make a private copy of the data and map that instead */ } + rlen = PAGE_ALIGN(len); + /* allocate some memory to hold the mapping * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL|__GFP_COMP); - if (!base) + order = get_order(rlen); + kdebug("alloc order %d for %lx", order, len); + + pages = alloc_pages(GFP_KERNEL, order); + if (!pages) goto enomem; - vma->vm_start = (unsigned long) base; - vma->vm_end = vma->vm_start + len; - vma->vm_flags |= VM_MAPPED_COPY; + total = 1 << order; + atomic_long_add(total, &mmap_pages_allocated); + + point = rlen >> PAGE_SHIFT; + + /* we allocated a power-of-2 sized page set, so we may want to trim off + * the excess */ + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_long_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } + } + + for (point = 1; point < total; point++) + set_page_refcounted(&pages[point]); -#ifdef WARN_ON_SLACK - if (len + WARN_ON_SLACK <= kobjsize(result)) - printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", - len, current->pid, kobjsize(result) - len); -#endif + base = page_address(pages); + region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + region->vm_start = (unsigned long) base; + region->vm_end = region->vm_start + rlen; + region->vm_top = region->vm_start + (total << PAGE_SHIFT); + + vma->vm_start = region->vm_start; + vma->vm_end = region->vm_start + len; if (vma->vm_file) { /* read the contents of a file into the copy */ @@ -750,31 +1134,30 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) old_fs = get_fs(); set_fs(KERNEL_DS); - ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); set_fs(old_fs); if (ret < 0) goto error_free; /* clear the last little bit */ - if (ret < len) - memset(base + ret, 0, len - ret); + if (ret < rlen) + memset(base + ret, 0, rlen - ret); - } else { - /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, len); } return 0; error_free: - kfree(base); - vma->vm_start = 0; + free_page_series(region->vm_start, region->vm_end); + region->vm_start = vma->vm_start = 0; + region->vm_end = vma->vm_end = 0; + region->vm_top = 0; return ret; enomem: - printk("Allocation of length %lu from process %d failed\n", - len, current->pid); + printk("Allocation of length %lu from process %d (%s) failed\n", + len, current->pid, current->comm); show_free_areas(); return -ENOMEM; } @@ -789,273 +1172,450 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long flags, unsigned long pgoff) { - struct vm_list_struct *vml = NULL; - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma; + struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags; - void *result; + unsigned long capabilities, vm_flags, result; int ret; + kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) + if (ret < 0) { + kleave(" = %d [val]", ret); return ret; + } + + /* we ignore the address hint */ + addr = 0; /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); - /* we're going to need to record the mapping if it works */ - vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); - if (!vml) - goto error_getting_vml; + /* we're going to need to record the mapping */ + region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); + if (!region) + goto error_getting_region; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + atomic_set(®ion->vm_usage, 1); + region->vm_flags = vm_flags; + region->vm_pgoff = pgoff; - down_write(&nommu_vma_sem); + INIT_LIST_HEAD(&vma->anon_vma_node); + vma->vm_flags = vm_flags; + vma->vm_pgoff = pgoff; - /* if we want to share, we need to check for VMAs created by other + if (file) { + region->vm_file = file; + get_file(file); + vma->vm_file = file; + get_file(file); + if (vm_flags & VM_EXECUTABLE) { + added_exe_file_vma(current->mm); + vma->vm_mm = current->mm; + } + } + + down_write(&nommu_region_sem); + + /* if we want to share, we need to check for regions created by other * mmap() calls that overlap with our proposed mapping - * - we can only share with an exact match on most regular files + * - we can only share with a superset match on most regular files * - shared mappings on character devices and memory backed files are * permitted to overlap inexactly as far as we are concerned for in * these cases, sharing is handled in the driver or filesystem rather * than here */ if (vm_flags & VM_MAYSHARE) { - unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vmpglen; + struct vm_region *pregion; + unsigned long pglen, rpglen, pgend, rpgend, start; - for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { - vma = rb_entry(rb, struct vm_area_struct, vm_rb); + pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgend = pgoff + pglen; + + for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { + pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (!(pregion->vm_flags & VM_MAYSHARE)) continue; /* search for overlapping mappings on the same file */ - if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) + if (pregion->vm_file->f_path.dentry->d_inode != + file->f_path.dentry->d_inode) continue; - if (vma->vm_pgoff >= pgoff + pglen) + if (pregion->vm_pgoff >= pgend) continue; - vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; - vmpglen >>= PAGE_SHIFT; - if (pgoff >= vma->vm_pgoff + vmpglen) + rpglen = pregion->vm_end - pregion->vm_start; + rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; + rpgend = pregion->vm_pgoff + rpglen; + if (pgoff >= rpgend) continue; - /* handle inexactly overlapping matches between mappings */ - if (vma->vm_pgoff != pgoff || vmpglen != pglen) { + /* handle inexactly overlapping matches between + * mappings */ + if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && + !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { + /* new mapping is not a subset of the region */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) goto sharing_violation; continue; } - /* we've found a VMA we can share */ - atomic_inc(&vma->vm_usage); - - vml->vma = vma; - result = (void *) vma->vm_start; - goto shared; + /* we've found a region we can share */ + atomic_inc(&pregion->vm_usage); + vma->vm_region = pregion; + start = pregion->vm_start; + start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; + vma->vm_start = start; + vma->vm_end = start + len; + + if (pregion->vm_flags & VM_MAPPED_COPY) { + kdebug("share copy"); + vma->vm_flags |= VM_MAPPED_COPY; + } else { + kdebug("share mmap"); + ret = do_mmap_shared_file(vma); + if (ret < 0) { + vma->vm_region = NULL; + vma->vm_start = 0; + vma->vm_end = 0; + atomic_dec(&pregion->vm_usage); + pregion = NULL; + goto error_just_free; + } + } + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; + goto share; } - vma = NULL; - /* obtain the address at which to make a shared mapping * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (file && file->f_op->get_unmapped_area) { + if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR((void *) addr)) { ret = addr; if (ret != (unsigned long) -ENOSYS) - goto error; + goto error_just_free; /* the driver refused to tell us where to site * the mapping so we'll have to attempt to copy * it */ ret = (unsigned long) -ENODEV; if (!(capabilities & BDI_CAP_MAP_COPY)) - goto error; + goto error_just_free; capabilities &= ~BDI_CAP_MAP_DIRECT; + } else { + vma->vm_start = region->vm_start = addr; + vma->vm_end = region->vm_end = addr + len; } } } - /* we're going to need a VMA struct as well */ - vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); - if (!vma) - goto error_getting_vma; + vma->vm_region = region; - INIT_LIST_HEAD(&vma->anon_vma_node); - atomic_set(&vma->vm_usage, 1); - if (file) - get_file(file); - vma->vm_file = file; - vma->vm_flags = vm_flags; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; - - vml->vma = vma; - - /* set up the mapping */ + /* set up the mapping + * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + */ if (file && vma->vm_flags & VM_SHARED) - ret = do_mmap_shared_file(vma, len); + ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, len); + ret = do_mmap_private(vma, region, len, capabilities); if (ret < 0) - goto error; - - /* okay... we have a mapping; now we have to register it */ - result = (void *) vma->vm_start; + goto error_just_free; + add_nommu_region(region); - if (vma->vm_flags & VM_MAPPED_COPY) { - realalloc += kobjsize(result); - askedalloc += len; - } + /* clear anonymous mappings that don't ask for uninitialized data */ + if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) + memset((void *)region->vm_start, 0, + region->vm_end - region->vm_start); - realalloc += kobjsize(vma); - askedalloc += sizeof(*vma); + /* okay... we have a mapping; now we have to register it */ + result = vma->vm_start; current->mm->total_vm += len >> PAGE_SHIFT; - add_nommu_vma(vma); - - shared: - realalloc += kobjsize(vml); - askedalloc += sizeof(*vml); - - add_vma_to_mm(current->mm, vml); +share: + add_vma_to_mm(current->mm, vma); - up_write(&nommu_vma_sem); - - if (prot & PROT_EXEC) - flush_icache_range((unsigned long) result, - (unsigned long) result + len); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } -#ifdef DEBUG - printk("do_mmap:\n"); - show_process_blocks(); -#endif + up_write(&nommu_region_sem); - return (unsigned long) result; + kleave(" = %lx", result); + return result; - error: - up_write(&nommu_vma_sem); - kfree(vml); - if (vma) { - if (vma->vm_file) - fput(vma->vm_file); - kfree(vma); - } +error_just_free: + up_write(&nommu_region_sem); +error: + if (region->vm_file) + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + if (vma->vm_file) + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + kmem_cache_free(vm_area_cachep, vma); + kleave(" = %d", ret); return ret; - sharing_violation: - up_write(&nommu_vma_sem); - printk("Attempt to share mismatched mappings\n"); - kfree(vml); - return -EINVAL; +sharing_violation: + up_write(&nommu_region_sem); + printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + ret = -EINVAL; + goto error; - error_getting_vma: - up_write(&nommu_vma_sem); - kfree(vml); - printk("Allocation of vma for %lu byte allocation from process %d failed\n", +error_getting_vma: + kmem_cache_free(vm_region_jar, region); + printk(KERN_WARNING "Allocation of vma for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; - error_getting_vml: - printk("Allocation of vml for %lu byte allocation from process %d failed\n", +error_getting_region: + printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; } +EXPORT_SYMBOL(do_mmap_pgoff); + +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} /* - * handle mapping disposal for uClinux + * split a vma into two pieces at address 'addr', a new vma is allocated either + * for the first part or the tail. */ -static void put_vma(struct vm_area_struct *vma) +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { - if (vma) { - down_write(&nommu_vma_sem); + struct vm_area_struct *new; + struct vm_region *region; + unsigned long npages; - if (atomic_dec_and_test(&vma->vm_usage)) { - delete_nommu_vma(vma); + kenter(""); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + /* we're only permitted to split anonymous regions that have a single + * owner */ + if (vma->vm_file || + atomic_read(&vma->vm_region->vm_usage) != 1) + return -ENOMEM; - /* IO memory and memory shared directly out of the pagecache from - * ramfs/tmpfs mustn't be released here */ - if (vma->vm_flags & VM_MAPPED_COPY) { - realalloc -= kobjsize((void *) vma->vm_start); - askedalloc -= vma->vm_end - vma->vm_start; - kfree((void *) vma->vm_start); - } + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; - realalloc -= kobjsize(vma); - askedalloc -= sizeof(*vma); + region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); + if (!region) + return -ENOMEM; - if (vma->vm_file) - fput(vma->vm_file); - kfree(vma); - } + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) { + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; + } + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + *region = *vma->vm_region; + new->vm_region = region; + + npages = (addr - vma->vm_start) >> PAGE_SHIFT; - up_write(&nommu_vma_sem); + if (new_below) { + region->vm_top = region->vm_end = new->vm_end = addr; + } else { + region->vm_start = new->vm_start = addr; + region->vm_pgoff = new->vm_pgoff += npages; + } + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + delete_vma_from_mm(vma); + down_write(&nommu_region_sem); + delete_nommu_region(vma->vm_region); + if (new_below) { + vma->vm_region->vm_start = vma->vm_start = addr; + vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; + } else { + vma->vm_region->vm_end = vma->vm_end = addr; + vma->vm_region->vm_top = addr; } + add_nommu_region(vma->vm_region); + add_nommu_region(new->vm_region); + up_write(&nommu_region_sem); + add_vma_to_mm(mm, vma); + add_vma_to_mm(mm, new); + return 0; } /* - * release a mapping - * - under NOMMU conditions the parameters must match exactly to the mapping to - * be removed + * shrink a VMA by removing the specified chunk from either the beginning or + * the end */ -int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +static int shrink_vma(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long from, unsigned long to) { - struct vm_list_struct *vml, **parent; - unsigned long end = addr + len; + struct vm_region *region; -#ifdef DEBUG - printk("do_munmap:\n"); -#endif + kenter(""); - for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { - if ((*parent)->vma->vm_start > addr) - break; - if ((*parent)->vma->vm_start == addr && - ((len == 0) || ((*parent)->vma->vm_end == end))) - goto found; + /* adjust the VMA's pointers, which may reposition it in the MM's tree + * and list */ + delete_vma_from_mm(vma); + if (from > vma->vm_start) + vma->vm_end = from; + else + vma->vm_start = to; + add_vma_to_mm(mm, vma); + + /* cut the backing region down to size */ + region = vma->vm_region; + BUG_ON(atomic_read(®ion->vm_usage) != 1); + + down_write(&nommu_region_sem); + delete_nommu_region(region); + if (from > region->vm_start) { + to = region->vm_top; + region->vm_top = region->vm_end = from; + } else { + region->vm_start = to; } + add_nommu_region(region); + up_write(&nommu_region_sem); - printk("munmap of non-mmaped memory by process %d (%s): %p\n", - current->pid, current->comm, (void *) addr); - return -EINVAL; + free_page_series(from, to); + return 0; +} - found: - vml = *parent; +/* + * release a mapping + * - under NOMMU conditions the chunk to be unmapped must be backed by a single + * VMA, though it need not cover the whole VMA + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + struct rb_node *rb; + unsigned long end = start + len; + int ret; - put_vma(vml->vma); + kenter(",%lx,%zx", start, len); - *parent = vml->next; - realalloc -= kobjsize(vml); - askedalloc -= sizeof(*vml); - kfree(vml); + if (len == 0) + return -EINVAL; - update_hiwater_vm(mm); - mm->total_vm -= len >> PAGE_SHIFT; + /* find the first potentially overlapping VMA */ + vma = find_vma(mm, start); + if (!vma) { + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } + return -EINVAL; + } -#ifdef DEBUG - show_process_blocks(); -#endif + /* we're allowed to split an anonymous VMA but not a file-backed one */ + if (vma->vm_file) { + do { + if (start > vma->vm_start) { + kleave(" = -EINVAL [miss]"); + return -EINVAL; + } + if (end == vma->vm_end) + goto erase_whole_vma; + rb = rb_next(&vma->vm_rb); + vma = rb_entry(rb, struct vm_area_struct, vm_rb); + } while (rb); + kleave(" = -EINVAL [split file]"); + return -EINVAL; + } else { + /* the chunk must be a subset of the VMA found */ + if (start == vma->vm_start && end == vma->vm_end) + goto erase_whole_vma; + if (start < vma->vm_start || end > vma->vm_end) { + kleave(" = -EINVAL [superset]"); + return -EINVAL; + } + if (start & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned start]"); + return -EINVAL; + } + if (end != vma->vm_end && end & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned split]"); + return -EINVAL; + } + if (start != vma->vm_start && end != vma->vm_end) { + ret = split_vma(mm, vma, start, 1); + if (ret < 0) { + kleave(" = %d [split]", ret); + return ret; + } + } + return shrink_vma(mm, vma, start, end); + } +erase_whole_vma: + delete_vma_from_mm(vma); + delete_vma(mm, vma); + kleave(" = 0"); return 0; } +EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; @@ -1067,32 +1627,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) } /* - * Release all mappings + * release all the mappings made in a process's VM space */ -void exit_mmap(struct mm_struct * mm) +void exit_mmap(struct mm_struct *mm) { - struct vm_list_struct *tmp; - - if (mm) { -#ifdef DEBUG - printk("Exit_mmap:\n"); -#endif + struct vm_area_struct *vma; - mm->total_vm = 0; + if (!mm) + return; - while ((tmp = mm->context.vmlist)) { - mm->context.vmlist = tmp->next; - put_vma(tmp->vma); + kenter(""); - realalloc -= kobjsize(tmp); - askedalloc -= sizeof(*tmp); - kfree(tmp); - } + mm->total_vm = 0; -#ifdef DEBUG - show_process_blocks(); -#endif + while ((vma = mm->mmap)) { + mm->mmap = vma->vm_next; + delete_vma_from_mm(vma); + delete_vma(mm, vma); } + + kleave(""); } unsigned long do_brk(unsigned long addr, unsigned long len) @@ -1105,8 +1659,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) * time (controlled by the MREMAP_MAYMOVE flag and available VM space) * * under NOMMU conditions, we only permit changing a mapping's size, and only - * as long as it stays within the hole allocated by the kmalloc() call in - * do_mmap_pgoff() and the block is not shareable + * as long as it stays within the region allocated by do_mmap_private() and the + * block is not shareable * * MREMAP_FIXED is not supported under NOMMU conditions */ @@ -1117,13 +1671,16 @@ unsigned long do_mremap(unsigned long addr, struct vm_area_struct *vma; /* insanity checks first */ - if (new_len == 0) + if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (flags & MREMAP_FIXED && new_addr != addr) return (unsigned long) -EINVAL; - vma = find_vma_exact(current->mm, addr); + vma = find_vma_exact(current->mm, addr, old_len); if (!vma) return (unsigned long) -EINVAL; @@ -1133,21 +1690,18 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_MAYSHARE) return (unsigned long) -EPERM; - if (new_len > kobjsize((void *) addr)) + if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) return (unsigned long) -ENOMEM; /* all checks complete - do it */ vma->vm_end = vma->vm_start + new_len; - - askedalloc -= old_len; - askedalloc += new_len; - return vma->vm_start; } +EXPORT_SYMBOL(do_mremap); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; @@ -1171,6 +1725,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, } EXPORT_SYMBOL(remap_pfn_range); +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + unsigned int size = vma->vm_end - vma->vm_start; + + if (!(vma->vm_flags & VM_USERMAP)) + return -EINVAL; + + vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); + vma->vm_end = vma->vm_start + size; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range); + void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { } @@ -1193,6 +1762,27 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* + * ask for an unmapped area at which to create a mapping on a file + */ +unsigned long get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long (*get_area)(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + + if (!get_area) + return -ENOSYS; + + return get_area(file, addr, len, pgoff, flags); +} +EXPORT_SYMBOL(get_unmapped_area); + +/* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. @@ -1208,7 +1798,7 @@ EXPORT_SYMBOL(unmap_mapping_range); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(long pages, int cap_sys_admin) +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed; @@ -1280,14 +1870,12 @@ int __vm_enough_memory(long pages, int cap_sys_admin) /* Don't let a single process grow too big: leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; + if (mm) + allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; + error: vm_unacct_memory(pages); @@ -1299,12 +1887,12 @@ int in_gate_area_no_task(unsigned long addr) return 0; } -struct page *filemap_nopage(struct vm_area_struct *area, - unsigned long address, int *type) +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); - return NULL; + return 0; } +EXPORT_SYMBOL(filemap_fault); /* * Access another process' address space. @@ -1333,9 +1921,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in /* only read or write mappings where it is permitted */ if (write && vma->vm_flags & VM_MAYWRITE) - len -= copy_to_user((void *) addr, buf, len); + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); else if (!write && vma->vm_flags & VM_MAYREAD) - len -= copy_from_user(buf, (void *) addr, len); + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); else len = 0; } else {