X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmadvise.c;h=b9ce574827c8a2a48b972f0261decc234edf9e27;hb=0eb253e223c88b982461e59154fcad1b82597592;hp=e3108054733c2530fd28114b4814e27eda106ac0;hpb=05b7438475ddbac47e75506913d44550f0e75938;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/madvise.c b/mm/madvise.c index e310805..b9ce574 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -10,6 +10,25 @@ #include #include #include +#include + +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} /* * We can potentially split a vm area into separate @@ -22,22 +41,29 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags & ~VM_READHINTMASK; + int new_flags = vma->vm_flags; switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; case MADV_SEQUENTIAL: - new_flags |= VM_SEQ_READ; + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: - new_flags |= VM_RAND_READ; + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; - default: + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + new_flags &= ~VM_DONTCOPY; break; } if (new_flags == vma->vm_flags) { *prev = vma; - goto success; + goto out; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -62,16 +88,15 @@ static long madvise_behavior(struct vm_area_struct * vma, goto out; } +success: /* * vm_flags is protected by the mmap_sem held in write mode. 
*/ - VM_ClearReadHint(vma); vma->vm_flags = new_flags; out: if (error == -ENOMEM) error = -EAGAIN; -success: return error; } @@ -87,6 +112,11 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; + if (file->f_mapping->a_ops->get_xip_mem) { + /* no bad return value, but ignore advice */ + return 0; + } + *prev = vma; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) @@ -102,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for refill_inactive to actually free + * zap_page_range call sets things up for shrink_active_list to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for - * refill_inactive to pick up before reclaiming other pages. + * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for @@ -122,7 +152,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { @@ -136,17 +166,69 @@ static long madvise_dontneed(struct vm_area_struct * vma, return 0; } -static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. 
+ * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + int error; + + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + if (!vma->vm_file || !vma->vm_file->f_mapping + || !vma->vm_file->f_mapping->host) { + return -EINVAL; + } + + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + up_read(&current->mm->mmap_sem); + error = vmtruncate_range(mapping->host, offset, endoff); + down_read(&current->mm->mmap_sem); + return error; +} + +static long +madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) { - long error = -EBADF; + long error; switch (behavior) { + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + break; + } + case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: error = madvise_behavior(vma, prev, start, end, behavior); break; + case MADV_REMOVE: + error = madvise_remove(vma, prev, start, end); + break; case MADV_WILLNEED: error = madvise_willneed(vma, prev, start, end); @@ -160,7 +242,6 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev error = -EINVAL; break; } - return error; } @@ -186,6 +267,8 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev * some pages ahead.
* MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. * * return values: * zero - success @@ -198,15 +281,20 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; struct vm_area_struct * vma, *prev; int unmapped_error = 0; int error = -EINVAL; + int write; size_t len; - down_write(&current->mm->mmap_sem); + write = madvise_need_mmap_write(behavior); + if (write) + down_write(&current->mm->mmap_sem); + else + down_read(&current->mm->mmap_sem); if (start & ~PAGE_MASK) goto out; @@ -230,8 +318,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) * - different from the way of handling in mlock etc. */ vma = find_vma_prev(current->mm, start, &prev); - if (!vma && prev) - vma = prev->vm_next; + if (vma && start > vma->vm_start) + prev = vma; + for (;;) { /* Still start < end. */ error = -ENOMEM; @@ -256,14 +345,21 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) if (error) goto out; start = tmp; - if (start < prev->vm_end) + if (prev && start < prev->vm_end) start = prev->vm_end; error = unmapped_error; if (start >= end) goto out; - vma = prev->vm_next; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); } out: - up_write(&current->mm->mmap_sem); + if (write) + up_write(&current->mm->mmap_sem); + else + up_read(&current->mm->mmap_sem); + return error; }