X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmadvise.c;h=35b1479b7c9d080ed97b772da44603c4f757093c;hb=caed0f486e582eeeb6e3546417fd758230fe4ad9;hp=77916e9fc52bc466abb08d55457b8c79b0e05854;hpb=00e9fa2d6421fbbefb4c02821a1e779a3ce47781;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/madvise.c b/mm/madvise.c index 77916e9..35b1479 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -10,6 +10,26 @@ #include #include #include +#include +#include + +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} /* * We can potentially split a vm area into separate @@ -22,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags; + unsigned long new_flags = vma->vm_flags; switch (behavior) { case MADV_NORMAL: @@ -38,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } new_flags &= ~VM_DONTCOPY; break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -93,7 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; - if (file->f_mapping->a_ops->get_xip_page) { + if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ return 0; } @@ -104,8 +134,7 @@ static long madvise_willneed(struct vm_area_struct * vma, end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - force_page_cache_readahead(file->f_mapping, - file, start, max_sane_readahead(end - start)); + force_page_cache_readahead(file->f_mapping, file, start, end - start); return 0; } @@ -113,10 +142,10 @@ static long madvise_willneed(struct vm_area_struct * vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for refill_inactive to actually free + * zap_page_range call sets things up for shrink_active_list to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for - * refill_inactive to pick up before reclaiming other pages. + * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for @@ -159,9 +188,10 @@ static long madvise_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct address_space *mapping; - loff_t offset, endoff; + loff_t offset, endoff; + int error; - *prev = vma; + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) return -EINVAL; @@ -180,44 +210,77 @@ static long madvise_remove(struct vm_area_struct *vma, + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - return vmtruncate_range(mapping->host, offset, endoff); + + /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + up_read(¤t->mm->mmap_sem); + error = vmtruncate_range(mapping->host, offset, endoff); + down_read(¤t->mm->mmap_sem); + return error; +} + +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages(current, current->mm, start, 1, + 0, 0, &p, NULL); + if (ret != 1) + return ret; + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, 1); + put_page(p); + } + return ret; } +#endif static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - long error; + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_DONTNEED: + return madvise_dontneed(vma, prev, start, end); + default: + return madvise_behavior(vma, prev, start, end, behavior); + } +} +static int +madvise_behavior_valid(int behavior) +{ switch (behavior) { case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - break; - } case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: - error = madvise_behavior(vma, prev, start, end, behavior); - break; case MADV_REMOVE: - error = madvise_remove(vma, prev, start, end); - break; - case MADV_WILLNEED: - error = madvise_willneed(vma, prev, start, end); - break; - case MADV_DONTNEED: - error = madvise_dontneed(vma, prev, start, end); - break; +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif + return 1; default: - error = -EINVAL; - break; + return 0; } - return error; } /* @@ -244,6 +307,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * so the kernel can free resources associated with it. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * * return values: * zero - success @@ -256,15 +325,27 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; struct vm_area_struct * vma, *prev; int unmapped_error = 0; int error = -EINVAL; + int write; size_t len; - down_write(¤t->mm->mmap_sem); +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON) + return madvise_hwpoison(start, start+len_in); +#endif + if (!madvise_behavior_valid(behavior)) + return error; + + write = madvise_need_mmap_write(behavior); + if (write) + down_write(¤t->mm->mmap_sem); + else + down_read(¤t->mm->mmap_sem); if (start & ~PAGE_MASK) goto out; @@ -315,14 +396,21 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) if (error) goto out; start = tmp; - if (start < prev->vm_end) + if (prev && start < prev->vm_end) start = prev->vm_end; error = unmapped_error; if (start >= end) goto out; - vma = prev->vm_next; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); } out: - up_write(¤t->mm->mmap_sem); + if (write) + up_write(¤t->mm->mmap_sem); + else + up_read(¤t->mm->mmap_sem); + return error; }