X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmadvise.c;h=319528b8db74c8d0c7cd10c8468acb5539aea7dd;hb=84bb671dc46d77d665d2b5e74539e81b2129bb3e;hp=20e075d1c64c9c64674e5fc418c6643418931661;hpb=1bef40032992320dd25a266fc166bfb8fa3f2f59;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/madvise.c b/mm/madvise.c index 20e075d..319528b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -9,7 +9,28 @@ #include #include #include +#include #include +#include +#include + +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} /* * We can potentially split a vm area into separate @@ -22,16 +43,33 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags & ~VM_READHINTMASK; + unsigned long new_flags = vma->vm_flags; switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; case MADV_SEQUENTIAL: - new_flags |= VM_SEQ_READ; + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: - new_flags |= VM_RAND_READ; + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; - default: + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTCOPY; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; break; } @@ -86,7 +124,7 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; - if (file->f_mapping->a_ops->get_xip_page) { + if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ return 0; } @@ -97,8 +135,7 @@ static long madvise_willneed(struct vm_area_struct * vma, end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - force_page_cache_readahead(file->f_mapping, - file, start, max_sane_readahead(end - start)); + force_page_cache_readahead(file->f_mapping, file, start, end - start); return 0; } @@ -106,10 +143,10 @@ static long madvise_willneed(struct vm_area_struct * vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for refill_inactive to actually free + * zap_page_range call sets things up for shrink_active_list to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for - * refill_inactive to pick up before reclaiming other pages. + * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for @@ -126,7 +163,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { @@ -140,32 +177,117 @@ static long madvise_dontneed(struct vm_area_struct * vma, return 0; } +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + int error; + + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + if (!vma->vm_file || !vma->vm_file->f_mapping + || !vma->vm_file->f_mapping->host) { + return -EINVAL; + } + + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + up_read(¤t->mm->mmap_sem); + error = vmtruncate_range(mapping->host, offset, endoff); + down_read(¤t->mm->mmap_sem); + return error; +} + +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages_fast(start, 1, 0, &p); + if (ret != 1) + return ret; + if (bhv == MADV_SOFT_OFFLINE) { + printk(KERN_INFO "Soft offlining page %lx at %lx\n", + page_to_pfn(p), start); + ret = soft_offline_page(p, MF_COUNT_INCREASED); + if (ret) + break; + continue; + } + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + } + return ret; +} +#endif + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - long error; + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_DONTNEED: + return madvise_dontneed(vma, prev, start, end); + default: + return madvise_behavior(vma, prev, start, end, behavior); + } +} +static int +madvise_behavior_valid(int behavior) +{ switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: - error = madvise_behavior(vma, prev, start, end, behavior); - break; - + case MADV_REMOVE: case MADV_WILLNEED: - error = madvise_willneed(vma, prev, start, end); - break; - case MADV_DONTNEED: - error = madvise_dontneed(vma, prev, start, end); - break; +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif + return 1; default: - error = -EINVAL; - break; + return 0; } - return error; } /* @@ -190,6 +312,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * * return values: * zero - success @@ -202,15 +332,27 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; struct vm_area_struct * vma, *prev; int unmapped_error = 0; int error = -EINVAL; + int write; size_t len; - down_write(¤t->mm->mmap_sem); +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_hwpoison(behavior, start, start+len_in); +#endif + if (!madvise_behavior_valid(behavior)) + return error; + + write = madvise_need_mmap_write(behavior); + if (write) + down_write(¤t->mm->mmap_sem); + else + down_read(¤t->mm->mmap_sem); if (start & ~PAGE_MASK) goto out; @@ -261,14 +403,21 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) if (error) goto out; start = tmp; - if (start < prev->vm_end) + if (prev && start < prev->vm_end) start = prev->vm_end; error = unmapped_error; if (start >= end) goto out; - vma = prev->vm_next; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); } out: - up_write(¤t->mm->mmap_sem); + if (write) + up_write(¤t->mm->mmap_sem); + else + up_read(¤t->mm->mmap_sem); + return error; }