X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmemory.c;h=fb5608a120edc77cffac3050ce7e5ebbcff94b8f;hb=a190901c6b5f1f4a31681e8c69d811a4f9426e2b;hp=7e2a4b1580e31474ed3adb1136cc1f6a1ea4c650;hpb=f8891e5e1f93a128c3900f82035e8541357896a7;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/memory.c b/mm/memory.c index 7e2a4b1..fb5608a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -47,7 +47,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -76,13 +79,22 @@ unsigned long num_physpages; * and ZONE_HIGHMEM. */ void * high_memory; -unsigned long vmalloc_earlyreserve; EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); -EXPORT_SYMBOL(vmalloc_earlyreserve); -int randomize_va_space __read_mostly = 1; +/* + * Randomize the address space (stacks, mmaps, brk, etc.). + * + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, + * as ancient (libc5 based) binaries can segfault. ) + */ +int randomize_va_space __read_mostly = +#ifdef CONFIG_COMPAT_BRK + 1; +#else + 2; +#endif static int __init disable_randmaps(char *s) { @@ -122,11 +134,9 @@ void pmd_clear_bad(pmd_t *pmd) */ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) { - struct page *page = pmd_page(*pmd); + pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_lock_deinit(page); - pte_free_tlb(tlb, page); - dec_zone_page_state(page, NR_PAGETABLE); + pte_free_tlb(tlb, token); tlb->mm->nr_ptes--; } @@ -259,9 +269,6 @@ void free_pgd_range(struct mmu_gather **tlb, continue; free_pud_range(*tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); - - if (!(*tlb)->fullmm) - flush_tlb_pgtables((*tlb)->mm, start, end); } void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, @@ -300,21 +307,34 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { - struct page *new = pte_alloc_one(mm, address); + pgtable_t new = pte_alloc_one(mm, address); if (!new) return -ENOMEM; - pte_lock_init(new); + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_read_barrier_depends() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ + spin_lock(&mm->page_table_lock); - if (pmd_present(*pmd)) { /* Another has populated it */ - pte_lock_deinit(new); - pte_free(new); - } else { + if (!pmd_present(*pmd)) { /* Has another populated it ? */ mm->nr_ptes++; - inc_zone_page_state(new, NR_PAGETABLE); pmd_populate(mm, pmd, new); + new = NULL; } spin_unlock(&mm->page_table_lock); + if (new) + pte_free(mm, new); return 0; } @@ -324,12 +344,16 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + spin_lock(&init_mm.page_table_lock); - if (pmd_present(*pmd)) /* Another has populated it */ - pte_free_kernel(new); - else + if (!pmd_present(*pmd)) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } spin_unlock(&init_mm.page_table_lock); + if (new) + pte_free_kernel(&init_mm, new); return 0; } @@ -364,55 +388,93 @@ static inline int is_cow_mapping(unsigned int flags) } /* - * This function gets the "struct page" associated with a pte. + * vm_normal_page -- This function gets the "struct page" associated with a pte. + * + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. + * + * There are 2 broad cases. Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. * - * NOTE! Some mappings do not have "struct pages". A raw PFN mapping - * will have each page table entry just pointing to a raw page frame - * number, and as far as the VM layer is concerned, those do not have - * pages associated with them - even if the PFN might point to memory - * that otherwise is perfectly fine and has a "struct page". + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. * - * The way we recognize those mappings is through the rules set up - * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, - * and the vm_pgoff will point to the first PFN mapped: thus every - * page that is a raw mapping will always honor the rule + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * - * and if that isn't true, the page has been COW'ed (in which case it - * _does_ have a "struct page" associated with it even if it is in a - * VM_PFNMAP range). + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). + * + * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. + * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. + * */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +#ifdef __HAVE_ARCH_PTE_SPECIAL +# define HAVE_PTE_SPECIAL 1 +#else +# define HAVE_PTE_SPECIAL 0 +#endif +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { - unsigned long pfn = pte_pfn(pte); + unsigned long pfn; - if (unlikely(vma->vm_flags & VM_PFNMAP)) { - unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; + if (HAVE_PTE_SPECIAL) { + if (likely(!pte_special(pte))) { + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + return pte_page(pte); + } + VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + return NULL; } - /* - * Add some anal sanity checks for now. Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - */ - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - return NULL; + /* !HAVE_PTE_SPECIAL case follows: */ + + pfn = pte_pfn(pte); + + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } } + VM_BUG_ON(!pfn_valid(pfn)); + /* - * NOTE! We still have PageReserved() pages in the page - * tables. + * NOTE! We still have PageReserved() pages in the page tables. * - * The PAGE_ZERO() pages and various VDSO mappings can - * cause them to exist. + * eg. VDSO mappings can cause them to exist. */ +out: return pfn_to_page(pfn); } @@ -465,7 +527,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); - pte = *src_pte; + pte = pte_wrprotect(pte); } /* @@ -479,7 +541,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page); + page_dup_rmap(page, vma, addr); rss[!!PageAnon(page)]++; } @@ -503,7 +565,8 @@ again: return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); - spin_lock(src_ptl); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); do { /* @@ -513,8 +576,7 @@ again: if (progress >= 32) { progress = 0; if (need_resched() || - need_lockbreak(src_ptl) || - need_lockbreak(dst_ptl)) + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { @@ -525,6 +587,7 @@ again: progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); @@ -626,6 +689,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, int anon_rss = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { @@ -673,10 +737,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent)) - mark_page_accessed(page); + SetPageReferenced(page); file_rss--; } - page_remove_rmap(page); + page_remove_rmap(page, vma); tlb_remove_page(tlb, page); continue; } @@ -688,10 +752,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return addr; @@ -850,7 +915,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || - (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + (i_mmap_lock && spin_needbreak(i_mmap_lock))) { if (i_mmap_lock) { *tlbp = NULL; goto out; @@ -921,7 +986,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page_table; pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd)) { @@ -930,6 +995,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto out; } + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) goto out; @@ -962,7 +1030,7 @@ no_page_table: * has touched so far, we don't want to allocate page tables. */ if (flags & FOLL_ANON) { - page = ZERO_PAGE(address); + page = ZERO_PAGE(0); if (flags & FOLL_GET) get_page(page); BUG_ON(flags & FOLL_WRITE); @@ -977,6 +1045,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int i; unsigned int vm_flags; + if (len <= 0) + return 0; /* * Require read or write permissions. * If 'force' is set, we only require the "MAY" flags. @@ -1035,7 +1105,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i); + &start, &len, i, write); continue; } @@ -1043,48 +1113,56 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) foll_flags |= FOLL_GET; if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->nopage)) + (!vma->vm_ops || !vma->vm_ops->fault)) foll_flags |= FOLL_ANON; do { struct page *page; + /* + * If tsk is ooming, cut off its access to large memory + * allocations. It has a pending SIGKILL, but it can't + * be processed until returning to user space. + */ + if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) + return -ENOMEM; + if (write) foll_flags |= FOLL_WRITE; cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; - ret = __handle_mm_fault(mm, vma, start, + ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + else if (ret & VM_FAULT_SIGBUS) + return i ? i : -EFAULT; + BUG(); + } + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + /* - * The VM_FAULT_WRITE bit tells us that do_wp_page has - * broken COW when necessary, even if maybe_mkwrite - * decided not to set pte_write. We can thus safely do - * subsequent page lookups as if they were reads. + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. */ if (ret & VM_FAULT_WRITE) foll_flags &= ~FOLL_WRITE; - - switch (ret & ~VM_FAULT_WRITE) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - return i ? i : -EFAULT; - case VM_FAULT_OOM: - return i ? i : -ENOMEM; - default: - BUG(); - } + + cond_resched(); } if (pages) { pages[i] = page; - flush_anon_page(page, start); + flush_anon_page(vma, page, start); flush_dcache_page(page); } if (vmas) @@ -1098,84 +1176,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); -static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t prot) -{ - pte_t *pte; - spinlock_t *ptl; - - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (!pte) - return -ENOMEM; - do { - struct page *page = ZERO_PAGE(addr); - pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); - page_cache_get(page); - page_add_file_rmap(page); - inc_mm_counter(mm, file_rss); - BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, zero_pte); - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); - return 0; -} - -static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t prot) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - if (zeromap_pte_range(mm, pmd, addr, next, prot)) - return -ENOMEM; - } while (pmd++, addr = next, addr != end); - return 0; -} - -static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t prot) -{ - pud_t *pud; - unsigned long next; - - pud = pud_alloc(mm, pgd, addr); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - if (zeromap_pmd_range(mm, pud, addr, next, prot)) - return -ENOMEM; - } while (pud++, addr = next, addr != end); - return 0; -} - -int zeromap_page_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgprot_t prot) -{ - pgd_t *pgd; - unsigned long next; - unsigned long end = addr + size; - struct mm_struct *mm = vma->vm_mm; - int err; - - BUG_ON(addr >= end); - pgd = pgd_offset(mm, addr); - flush_cache_range(vma, addr, end); - do { - next = pgd_addr_end(addr, end); - err = zeromap_pud_range(mm, pgd, addr, next, prot); - if (err) - break; - } while (pgd++, addr = next, addr != end); - return err; -} - -pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); pud_t * pud = pud_alloc(mm, pgd, addr); @@ -1194,20 +1196,26 @@ pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlo * old drivers should use this, and they needed to mark their * pages reserved for the old functions anyway. */ -static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) +static int insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) { + struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; - spinlock_t *ptl; + spinlock_t *ptl; + + retval = mem_cgroup_charge(page, mm, GFP_KERNEL); + if (retval) + goto out; retval = -EINVAL; if (PageAnon(page)) - goto out; + goto out_uncharge; retval = -ENOMEM; flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out; + goto out_uncharge; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; @@ -1219,13 +1227,22 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa set_pte_at(mm, addr, pte, mk_pte(page, prot)); retval = 0; + pte_unmap_unlock(pte, ptl); + return retval; out_unlock: pte_unmap_unlock(pte, ptl); +out_uncharge: + mem_cgroup_uncharge_page(page); out: return retval; } -/* +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * * This allows drivers to insert individual pages they've allocated * into a user vma. * @@ -1242,17 +1259,103 @@ out: * * The page does not need to be reserved. */ -int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!page_count(page)) return -EINVAL; vma->vm_flags |= VM_INSERTPAGE; - return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte, entry; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + entry = pte_mkspecial(pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, entry); + update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/** + * vm_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_inert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return NULL. + */ +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* + * refcount the page if pfn_valid is true (hence insert_page rather + * than insert_pfn). + */ + if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + struct page *page; + + page = pfn_to_page(pfn); + return insert_page(vma, addr, page, vma->vm_page_prot); + } + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_mixed); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results @@ -1268,11 +1371,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1317,7 +1422,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/* Note: this is only safe if the mm semaphore is held when called. */ +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -1368,6 +1482,100 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + pgtable_t token; + spinlock_t *uninitialized_var(ptl); + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + token = pmd_pgtable(*pmd); + + do { + err = fn(pte, token, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + /* * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on @@ -1406,7 +1614,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* * If the source page was a PFN mapping, we don't have @@ -1427,10 +1635,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) memset(kaddr, 0, PAGE_SIZE); kunmap_atomic(kaddr, KM_USER0); - return; - - } - copy_user_highpage(dst, src, va); + flush_dcache_page(dst); + } else + copy_user_highpage(dst, src, va, vma); } /* @@ -1457,14 +1664,30 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse, ret = VM_FAULT_MINOR; + int reuse = 0, ret = 0; + int page_mkwrite = 0; + struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) goto gotten; - if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == - (VM_SHARED|VM_WRITE))) { + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page)) { + if (!TestSetPageLocked(old_page)) { + reuse = can_share_swap_page(old_page); + unlock_page(old_page); + } + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { /* * Notify the address space that the page is about to @@ -1480,8 +1703,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) goto unwritable_page; - page_cache_release(old_page); - /* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If @@ -1490,25 +1711,23 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + page_cache_release(old_page); if (!pte_same(*page_table, orig_pte)) goto unlock; - } + page_mkwrite = 1; + } + dirty_page = old_page; + get_page(dirty_page); reuse = 1; - } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { - reuse = can_share_swap_page(old_page); - unlock_page(old_page); - } else { - reuse = 0; } if (reuse) { flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_set_access_flags(vma, address, page_table, entry, 1); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); + if (ptep_set_access_flags(vma, address, page_table, entry,1)) + update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; goto unlock; } @@ -1522,16 +1741,15 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; - if (old_page == ZERO_PAGE(address)) { - new_page = alloc_zeroed_user_highpage(vma, address); - if (!new_page) - goto oom; - } else { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!new_page) - goto oom; - cow_user_page(new_page, old_page, address); - } + VM_BUG_ON(old_page == ZERO_PAGE(0)); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); + + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) + goto oom_free_new; /* * Re-check the pte - we dropped the lock @@ -1539,7 +1757,7 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page); + page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1549,23 +1767,49 @@ gotten: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_establish(vma, address, page_table, entry); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. + */ + ptep_clear_flush(vma, address, page_table); + set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; - } + } else + mem_cgroup_uncharge_page(new_page); + if (new_page) page_cache_release(new_page); if (old_page) page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + put_page(dirty_page); + } return ret; +oom_free_new: + page_cache_release(new_page); oom: if (old_page) page_cache_release(old_page); @@ -1628,6 +1872,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long restart_addr; int need_break; + /* + * files that support invalidating or truncating portions of the + * file from under mmaped areas must have their ->fault function + * return a locked page (and set VM_FAULT_LOCKED in the return). + * This provides synchronisation against concurrent unmapping here. + */ + again: restart_addr = vma->vm_truncate_count; if (is_restart_addr(restart_addr) && start_addr < restart_addr) { @@ -1641,8 +1892,7 @@ again: restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || - need_lockbreak(details->i_mmap_lock); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock); if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ @@ -1718,9 +1968,7 @@ restart: } /** - * unmap_mapping_range - unmap the portion of all mmaps - * in the specified address_space corresponding to the specified - * page range in the underlying file. + * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE @@ -1758,17 +2006,8 @@ void unmap_mapping_range(struct address_space *mapping, spin_lock(&mapping->i_mmap_lock); - /* serialize i_size write against truncate_count write */ - smp_wmb(); - /* Protect against page faults, and endless unmapping loops */ + /* Protect against endless unmapping loops */ mapping->truncate_count++; - /* - * For archs where spin_lock has inclusive semantics like ia64 - * this smp_mb() will prevent to read pagetable contents - * before the truncate_count increment is visible to - * other cpus. - */ - smp_mb(); if (unlikely(is_restart_addr(mapping->truncate_count))) { if (mapping->truncate_count == 0) reset_vma_truncate_counts(mapping); @@ -1784,9 +2023,10 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating * * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last @@ -1794,40 +2034,49 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int vmtruncate(struct inode * inode, loff_t offset) { - struct address_space *mapping = inode->i_mapping; - unsigned long limit; + if (inode->i_size < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + i_size_write(inode, offset); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } - if (inode->i_size < offset) - goto do_expand; - /* - * truncation of in-use swapfiles is disallowed - it would cause - * subsequent swapout to scribble on the now-freed blocks. - */ - if (IS_SWAPFILE(inode)) - goto out_busy; - i_size_write(inode, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - -out_truncate: if (inode->i_op && inode->i_op->truncate) inode->i_op->truncate(inode); return 0; + out_sig: send_sig(SIGXFSZ, current, 0); out_big: return -EFBIG; -out_busy: - return -ETXTBSY; } EXPORT_SYMBOL(vmtruncate); @@ -1847,69 +2096,13 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); + unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); return 0; } -EXPORT_SYMBOL(vmtruncate_range); - -/* - * Primitive swap readahead code. We simply read an aligned block of - * (1 << page_cluster) entries in the swap area. This method is chosen - * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... - * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold down_read on the vma->vm_mm if vma is not NULL. - */ -void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) -{ -#ifdef CONFIG_NUMA - struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; -#endif - int i, num; - struct page *new_page; - unsigned long offset; - - /* - * Get the number of handles we should do readahead io to. - */ - num = valid_swaphandles(entry, &offset); - for (i = 0; i < num; offset++, i++) { - /* Ok, do the async read-ahead now */ - new_page = read_swap_cache_async(swp_entry(swp_type(entry), - offset), vma, addr); - if (!new_page) - break; - page_cache_release(new_page); -#ifdef CONFIG_NUMA - /* - * Find the next applicable VMA for the NUMA policy. - */ - addr += PAGE_SIZE; - if (addr == 0) - vma = NULL; - if (vma) { - if (addr >= vma->vm_end) { - vma = next_vma; - next_vma = vma ? vma->vm_next : NULL; - } - if (vma && addr < vma->vm_start) - vma = NULL; - } else { - if (next_vma && addr >= next_vma->vm_start) { - vma = next_vma; - next_vma = vma->vm_next; - } - } -#endif - } - lru_add_drain(); /* Push any new pages onto the LRU now */ -} /* * We enter with non-exclusive mmap_sem (to exclude vma changes, @@ -1924,7 +2117,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; swp_entry_t entry; pte_t pte; - int ret = VM_FAULT_MINOR; + int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; @@ -1934,10 +2127,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, pmd, address); goto out; } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - swapin_readahead(entry, address, vma); - page = read_swap_cache_async(entry, vma, address); + grab_swap_token(); /* Contend for token _before_ read-in */ + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { /* * Back out if somebody else faulted in this pte @@ -1946,17 +2141,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - grab_swap_token(); + } + + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + ret = VM_FAULT_OOM; + goto out; } mark_page_accessed(page); lock_page(page); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); /* * Back out if somebody else already faulted in this pte. @@ -1989,20 +2191,20 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(page); if (write_access) { - if (do_wp_page(mm, vma, address, - page_table, pmd, ptl, pte) == VM_FAULT_OOM) - ret = VM_FAULT_OOM; + ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); + if (ret & VM_FAULT_ERROR) + ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); - lazy_mmu_prot_update(pte); unlock: pte_unmap_unlock(page_table, ptl); out: return ret; out_nomap: + mem_cgroup_uncharge_page(page); pte_unmap_unlock(page_table, ptl); unlock_page(page); page_cache_release(page); @@ -2022,148 +2224,150 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; - if (write_access) { - /* Allocate our own private page. */ - pte_unmap(page_table); - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_zeroed_user_highpage(vma, address); - if (!page) - goto oom; + /* Allocate our own private page. */ + pte_unmap(page_table); - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage_movable(vma, address); + if (!page) + goto oom; + __SetPageUptodate(page); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) - goto release; - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - page_add_new_anon_rmap(page, vma, address); - } else { - /* Map the ZERO_PAGE - vm_page_prot is readonly */ - page = ZERO_PAGE(address); - page_cache_get(page); - entry = mk_pte(page, vma->vm_page_prot); + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) + goto oom_free_page; - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (!pte_none(*page_table)) - goto release; - inc_mm_counter(mm, file_rss); - page_add_file_rmap(page); - } + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); - return VM_FAULT_MINOR; + return 0; release: + mem_cgroup_uncharge_page(page); page_cache_release(page); goto unlock; +oom_free_page: + page_cache_release(page); oom: return VM_FAULT_OOM; } /* - * do_no_page() tries to create a new page mapping. It aggressively + * __do_fault() tries to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if - * the "write_access" parameter is true in order to avoid the next - * page fault. + * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid + * the next page fault. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. + * but allow concurrent faults), and pte neither mapped nor locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) +static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { + pte_t *page_table; spinlock_t *ptl; - struct page *new_page; - struct address_space *mapping = NULL; + struct page *page; pte_t entry; - unsigned int sequence = 0; - int ret = VM_FAULT_MINOR; int anon = 0; + struct page *dirty_page = NULL; + struct vm_fault vmf; + int ret; + int page_mkwrite = 0; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = NULL; - pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - sequence = mapping->truncate_count; - smp_rmb(); /* serializes i_size against truncate_count */ - } -retry: - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + /* - * No smp_rmb is needed here as long as there's a full - * spin_lock/unlock sequence inside the ->nopage callback - * (for the pagecache lookup) that acts as an implicit - * smp_mb() and prevents the i_size read to happen - * after the next truncate_count read. + * For consistency in subsequent calls, make the faulted page always + * locked. */ - - /* no page was available -- either SIGBUS or OOM */ - if (new_page == NOPAGE_SIGBUS) - return VM_FAULT_SIGBUS; - if (new_page == NOPAGE_OOM) - return VM_FAULT_OOM; + if (unlikely(!(ret & VM_FAULT_LOCKED))) + lock_page(vmf.page); + else + VM_BUG_ON(!PageLocked(vmf.page)); /* * Should we do an early C-O-W break? */ - if (write_access) { + page = vmf.page; + if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { - struct page *page; - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - new_page = page; anon = 1; - + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out; + } + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + copy_user_highpage(page, vmf.page, address, vma); + __SetPageUptodate(page); } else { - /* if the page will be shareable, see if the backing + /* + * If the page will be shareable, see if the backing * address space wants to know that the page is about - * to become writable */ - if (vma->vm_ops->page_mkwrite && - vma->vm_ops->page_mkwrite(vma, new_page) < 0 - ) { - page_cache_release(new_page); - return VM_FAULT_SIGBUS; + * to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(page); + if (vma->vm_ops->page_mkwrite(vma, page) < 0) { + ret = VM_FAULT_SIGBUS; + anon = 1; /* no anon but release vmf.page */ + goto out_unlocked; + } + lock_page(page); + /* + * XXX: this is not quite right (racy vs + * invalidate) to unlock and relock the page + * like this, however a better fix requires + * reworking page_mkwrite locking API, which + * is better done later. + */ + if (!page->mapping) { + ret = 0; + anon = 1; /* no anon but release vmf.page */ + goto out; + } + page_mkwrite = 1; } } + } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - /* - * For a file-backed vma, someone could have truncated or otherwise - * invalidated this page. If unmap_mapping_range got called, - * retry getting the page. - */ - if (mapping && unlikely(sequence != mapping->truncate_count)) { - pte_unmap_unlock(page_table, ptl); - page_cache_release(new_page); - cond_resched(); - sequence = mapping->truncate_count; - smp_rmb(); - goto retry; + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + goto out; } + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* * This silly early PAGE_DIRTY setting removes a race * due to the bad i386 page protection. But it's valid @@ -2175,35 +2379,116 @@ retry: * handle that later. */ /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); - if (write_access) + if (likely(pte_same(*page_table, orig_pte))) { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(new_page); - page_add_new_anon_rmap(new_page, vma, address); + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); - page_add_file_rmap(new_page); + page_add_file_rmap(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page = page; + get_page(dirty_page); + } } + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); } else { - /* One of our sibling threads was faster, back out. */ - page_cache_release(new_page); - goto unlock; + mem_cgroup_uncharge_page(page); + if (anon) + page_cache_release(page); + else + anon = 1; /* no anon but release faulted_page */ } - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); -unlock: pte_unmap_unlock(page_table, ptl); + +out: + unlock_page(vmf.page); +out_unlocked: + if (anon) + page_cache_release(vmf.page); + else if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + + set_page_dirty_balance(dirty_page, page_mkwrite); + put_page(dirty_page); + } + return ret; -oom: - page_cache_release(new_page); - return VM_FAULT_OOM; +} + +static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + pgoff_t pgoff = (((address & PAGE_MASK) + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); + + pte_unmap(page_table); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + + +/* + * do_no_pfn() tries to create a new page mapping for a page without + * a struct_page backing it + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + * + * It is expected that the ->nopfn handler always returns the same pfn + * for a given virtual mapping. + * + * Mark this `noinline' to prevent it from bloating the main pagefault code. + */ +static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + spinlock_t *ptl; + pte_t entry; + unsigned long pfn; + + pte_unmap(page_table); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + + pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (unlikely(pfn == NOPFN_OOM)) + return VM_FAULT_OOM; + else if (unlikely(pfn == NOPFN_SIGBUS)) + return VM_FAULT_SIGBUS; + else if (unlikely(pfn == NOPFN_REFAULT)) + return 0; + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + entry = pfn_pte(pfn, vma->vm_page_prot); + if (write_access) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + } + pte_unmap_unlock(page_table, ptl); + return 0; } /* @@ -2215,33 +2500,28 @@ oom: * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access, pte_t orig_pte) { + unsigned int flags = FAULT_FLAG_NONLINEAR | + (write_access ? FAULT_FLAG_WRITE : 0); pgoff_t pgoff; - int err; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return VM_FAULT_MINOR; + return 0; - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || + !(vma->vm_flags & VM_CAN_NONLINEAR))) { /* * Page table corrupted: show pte and kill process. */ print_bad_pte(vma, orig_pte, address); return VM_FAULT_OOM; } - /* We can then assume vm->vm_ops && vma->vm_ops->populate */ pgoff = pte_to_pgoff(orig_pte); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, - vma->vm_page_prot, pgoff, 0); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err) - return VM_FAULT_SIGBUS; - return VM_FAULT_MAJOR; + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -2262,20 +2542,24 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte_t *pte, pmd_t *pmd, int write_access) { pte_t entry; - pte_t old_entry; spinlock_t *ptl; - old_entry = entry = *pte; + entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); - return do_no_page(mm, vma, address, - pte, pmd, write_access); + if (vma->vm_ops) { + if (likely(vma->vm_ops->fault)) + return do_linear_fault(mm, vma, address, + pte, pmd, write_access, entry); + if (unlikely(vma->vm_ops->nopfn)) + return do_no_pfn(mm, vma, address, pte, + pmd, write_access); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); } if (pte_file(entry)) - return do_file_page(mm, vma, address, + return do_nonlinear_fault(mm, vma, address, pte, pmd, write_access, entry); return do_swap_page(mm, vma, address, pte, pmd, write_access, entry); @@ -2292,10 +2576,8 @@ static inline int handle_pte_fault(struct mm_struct *mm, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (!pte_same(old_entry, entry)) { - ptep_set_access_flags(vma, address, pte, entry, write_access); + if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); } else { /* * This is needed only for protection faults but the arch code @@ -2308,13 +2590,13 @@ static inline int handle_pte_fault(struct mm_struct *mm, } unlock: pte_unmap_unlock(pte, ptl); - return VM_FAULT_MINOR; + return 0; } /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2343,8 +2625,6 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } -EXPORT_SYMBOL_GPL(__handle_mm_fault); - #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. @@ -2356,20 +2636,16 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ - pud_free(new); + pud_free(mm, new); else pgd_populate(mm, pgd, new); spin_unlock(&mm->page_table_lock); return 0; } -#else -/* Workaround for gcc 2.96 */ -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - return 0; -} #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED @@ -2383,27 +2659,23 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) /* Another has populated it */ - pmd_free(new); + pmd_free(mm, new); else pud_populate(mm, pud, new); #else if (pgd_present(*pud)) /* Another has populated it */ - pmd_free(new); + pmd_free(mm, new); else pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } -#else -/* Workaround for gcc 2.96 */ -int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - return 0; -} #endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) @@ -2417,7 +2689,7 @@ int make_pages_present(unsigned long addr, unsigned long end) write = (vma->vm_flags & VM_WRITE) != 0; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); if (ret < 0) @@ -2425,46 +2697,6 @@ int make_pages_present(unsigned long addr, unsigned long end) return ret == len ? 0 : -1; } -/* - * Map a vmalloc()-space virtual address to the physical page. - */ -struct page * vmalloc_to_page(void * vmalloc_addr) -{ - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - - if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - } - } - } - return page; -} - -EXPORT_SYMBOL(vmalloc_to_page); - -/* - * Map a vmalloc()-space virtual address to the physical page frame number. - */ -unsigned long vmalloc_to_pfn(void * vmalloc_addr) -{ - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); -} - -EXPORT_SYMBOL(vmalloc_to_pfn); - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) @@ -2475,8 +2707,15 @@ static int __init gate_vma_init(void) gate_vma.vm_mm = NULL; gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = 0; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; return 0; } __initcall(gate_vma_init); @@ -2501,3 +2740,94 @@ int in_gate_area_no_task(unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; + void *maddr; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} + +/* + * Print the name of a VMA. + */ +void print_vma_addr(char *prefix, unsigned long ip) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + /* + * Do not print if we are in atomic + * contexts (in exception stacks, etc.): + */ + if (preempt_count()) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma && vma->vm_file) { + struct file *f = vma->vm_file; + char *buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) { + char *p, *s; + + p = d_path(&f->f_path, buf, PAGE_SIZE); + if (IS_ERR(p)) + p = "?"; + s = strrchr(p, '/'); + if (s) + p = s+1; + printk("%s%s[%lx+%lx]", prefix, p, + vma->vm_start, + vma->vm_end - vma->vm_start); + free_page((unsigned long)buf); + } + } + up_read(¤t->mm->mmap_sem); +}