X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmlock.c;h=bd6f0e466f6c4f1c51b4bbbe1e605140bb3e6493;hb=75b08038ceb62f3bd8935346679920f97c3cf9f6;hp=4d3fea267e0dabde457091a2618a0efe8dba864d;hpb=e8edc6e03a5c8562dc70a6d969f732bdb355a7e7;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/mlock.c b/mm/mlock.c index 4d3fea2..bd6f0e4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -8,10 +8,18 @@ #include #include #include +#include +#include +#include #include #include #include #include +#include +#include +#include + +#include "internal.h" int can_do_mlock(void) { @@ -23,17 +31,347 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. + * + * When lazy mlocking via vmscan, it is important to ensure that the + * vma's VM_LOCKED status is not concurrently being modified, otherwise we + * may have mlocked a page that is being munlocked. So lazy mlock must take + * the mmap_sem for read, and verify that the vma really is locked + * (see mm/rmap.c). + */ + +/* + * LRU accounting for clear_page_mlock() + */ +void __clear_page_mlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + + if (!page->mapping) { /* truncated ? */ + return; + } + + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + if (!isolate_lru_page(page)) { + putback_lru_page(page); + } else { + /* + * We lost the race. the page already moved to evictable list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + } +} + +/* + * Mark page as mlocked if not already. + * If page on LRU, isolate and putback to move to unevictable list. + */ +void mlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + if (!isolate_lru_page(page)) + putback_lru_page(page); + } +} + +/* + * called from munlock()/munmap() path with page supposedly on the LRU. + * + * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked + * [in try_to_munlock()] and then attempt to isolate the page. We must + * isolate the page to keep others from messing with its unevictable + * and mlocked state while trying to munlock. However, we pre-clear the + * mlocked state anyway as we might lose the isolation race and we might + * not get another chance to clear PageMlocked. If we successfully + * isolate the page and try_to_munlock() detects other VM_LOCKED vmas + * mapping the page, it will restore the PageMlocked state, unless the page + * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), + * perhaps redundantly. + * If we lose the isolation race, and the page is mapped by other VM_LOCKED + * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() + * either of which will restore the PageMlocked state by calling + * mlock_vma_page() above, if it can grab the vma's mmap sem. + */ +static void munlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (TestClearPageMlocked(page)) { + dec_zone_page_state(page, NR_MLOCK); + if (!isolate_lru_page(page)) { + int ret = try_to_munlock(page); + /* + * did try_to_unlock() succeed or punt? + */ + if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); + } else { + /* + * We lost the race. let try_to_unmap() deal + * with it. At least we get the page state and + * mlock stats right. However, page is still on + * the noreclaim list. We'll fix that up when + * the page is eventually freed or we scan the + * noreclaim list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + } + } +} + +/** + * __mlock_vma_pages_range() - mlock a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * + * This takes care of making the pages present too. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held for at least read. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + struct page *pages[16]; /* 16 gives a reasonable batch */ + int nr_pages = (end - start) / PAGE_SIZE; + int ret = 0; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(start < vma->vm_start); + VM_BUG_ON(end > vma->vm_end); + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + gup_flags = FOLL_TOUCH | FOLL_GET; + if (vma->vm_flags & VM_WRITE) + gup_flags |= FOLL_WRITE; + + while (nr_pages > 0) { + int i; + + cond_resched(); + + /* + * get_user_pages makes pages present if we are + * setting mlock. and this extra reference count will + * disable migration of this page. However, page may + * still be truncated out from under us. + */ + ret = __get_user_pages(current, mm, addr, + min_t(int, nr_pages, ARRAY_SIZE(pages)), + gup_flags, pages, NULL); + /* + * This can happen for, e.g., VM_NONLINEAR regions before + * a page has been allocated and mapped at a given offset, + * or for addresses that map beyond end of a file. + * We'll mlock the pages if/when they get faulted in. + */ + if (ret < 0) + break; + + lru_add_drain(); /* push cached pages to LRU */ + + for (i = 0; i < ret; i++) { + struct page *page = pages[i]; + + if (page->mapping) { + /* + * That preliminary check is mainly to avoid + * the pointless overhead of lock_page on the + * ZERO_PAGE: which might bounce very badly if + * there is contention. However, we're still + * dirtying its cacheline with get/put_page: + * we'll add another __get_user_pages flag to + * avoid it if that case turns out to matter. + */ + lock_page(page); + /* + * Because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + put_page(page); /* ref from get_user_pages() */ + } + + addr += ret * PAGE_SIZE; + nr_pages -= ret; + ret = 0; + } + + return ret; /* 0 or negative error code */ +} + +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + +/** + * mlock_vma_pages_range() - mlock pages in specified vma range. + * @vma - the vma containing the specfied address range + * @start - starting address in @vma to mlock + * @end - end address [+1] in @vma to mlock + * + * For mmap()/mremap()/expansion of mlocked vma. + * + * return 0 on success for "normal" vmas. + * + * return number of pages [> 0] to be removed from locked_vm on success + * of "special" vmas. + */ +long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int nr_pages = (end - start) / PAGE_SIZE; + BUG_ON(!(vma->vm_flags & VM_LOCKED)); + + /* + * filter unlockable vmas + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + goto no_mlock; + + if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current))) { + + __mlock_vma_pages_range(vma, start, end); + + /* Hide errors from mmap() and other callers */ + return 0; + } + + /* + * User mapped kernel pages or huge pages: + * make these pages present to populate the ptes, but + * fall thru' to reset VM_LOCKED--no need to unlock, and + * return nr_pages so these don't get counted against task's + * locked limit. huge pages are already counted against + * locked vm limit. + */ + make_pages_present(start, end); + +no_mlock: + vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ + return nr_pages; /* error or pages NOT mlocked */ +} + +/* + * munlock_vma_pages_range() - munlock all pages in the vma range.' + * @vma - vma containing range to be munlock()ed. + * @start - start address in @vma of the range + * @end - end of range in @vma. + * + * For mremap(), munmap() and exit(). + * + * Called with @vma VM_LOCKED. + * + * Returns with VM_LOCKED cleared. Callers must be prepared to + * deal with this. + * + * We don't save and restore VM_LOCKED here because pages are + * still on lru. In unmap path, pages might be scanned by reclaim + * and re-mlocked by try_to_{munlock|unmap} before we unmap and + * free them. This will result in freeing mlocked pages. + */ +void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + + lru_add_drain(); + vma->vm_flags &= ~VM_LOCKED; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + struct page *page; + /* + * Although FOLL_DUMP is intended for get_dump_page(), + * it just so happens that its special treatment of the + * ZERO_PAGE (returning an error instead of doing get_page) + * suits munlock very well (and if somehow an abnormal page + * has sneaked into the range, we won't oops here: great). + */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + if (page && !IS_ERR(page)) { + lock_page(page); + /* + * Like in __mlock_vma_pages_range(), + * because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } + cond_resched(); + } +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. However, for some special vmas, we go ahead and + * populate the ptes via make_pages_present(). + * + * For vmas that pass the filters, merge/split as appropriate. + */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned int newflags) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; - int pages; + int nr_pages; int ret = 0; - - if (newflags == vma->vm_flags) { - *prev = vma; - goto out; + int lock = newflags & VM_LOCKED; + + if (newflags == vma->vm_flags || + (vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; /* don't set VM_LOCKED, don't count */ + + if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current)) { + if (lock) + make_pages_present(start, end); + goto out; /* don't set VM_LOCKED, don't count */ } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -44,8 +382,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto success; } - *prev = vma; - if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) @@ -60,26 +396,30 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, success: /* - * vm_flags is protected by the mmap_sem held in write mode. - * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, make_pages_present below will bring it back. + * Keep track of amount of locked VM. */ - vma->vm_flags = newflags; + nr_pages = (end - start) >> PAGE_SHIFT; + if (!lock) + nr_pages = -nr_pages; + mm->locked_vm += nr_pages; /* - * Keep track of amount of locked VM. + * vm_flags is protected by the mmap_sem held in write mode. + * It's okay if try_to_unmap_one unmaps a page just after we + * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - pages = (end - start) >> PAGE_SHIFT; - if (newflags & VM_LOCKED) { - pages = -pages; - if (!(newflags & VM_IO)) - ret = make_pages_present(start, end); + + if (lock) { + vma->vm_flags = newflags; + ret = __mlock_vma_pages_range(vma, start, end); + if (ret < 0) + ret = __mlock_posix_error_return(ret); + } else { + munlock_vma_pages_range(vma, start, end); } - mm->locked_vm -= pages; out: - if (ret == -ENOMEM) - ret = -EAGAIN; + *prev = vma; return ret; } @@ -132,7 +472,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -asmlinkage long sys_mlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; unsigned long lock_limit; @@ -141,6 +481,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) if (!can_do_mlock()) return -EPERM; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; @@ -158,7 +500,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) return error; } -asmlinkage long sys_munlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; @@ -195,7 +537,7 @@ out: return 0; } -asmlinkage long sys_mlockall(int flags) +SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret = -EINVAL; @@ -207,6 +549,8 @@ asmlinkage long sys_mlockall(int flags) if (!can_do_mlock()) goto out; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; @@ -221,7 +565,7 @@ out: return ret; } -asmlinkage long sys_munlockall(void) +SYSCALL_DEFINE0(munlockall) { int ret; @@ -244,9 +588,12 @@ int user_shm_lock(size_t size, struct user_struct *user) locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + if (lock_limit == RLIM_INFINITY) + allowed = 1; lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); - if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + if (!allowed && + locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) goto out; get_uid(user); user->locked_shm += locked; @@ -263,3 +610,44 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } + +int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size) +{ + unsigned long lim, vm, pgsz; + int error = -ENOMEM; + + pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + down_write(&mm->mmap_sem); + + lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = mm->total_vm + pgsz; + if (lim < vm) + goto out; + + lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = mm->locked_vm + pgsz; + if (lim < vm) + goto out; + + mm->total_vm += pgsz; + mm->locked_vm += pgsz; + + error = 0; + out: + up_write(&mm->mmap_sem); + return error; +} + +void refund_locked_memory(struct mm_struct *mm, size_t size) +{ + unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + down_write(&mm->mmap_sem); + + mm->total_vm -= pgsz; + mm->locked_vm -= pgsz; + + up_write(&mm->mmap_sem); +}