X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmemory.c;h=164951c473058a25c081d5e47260d872068cdbb7;hb=79683f2d685cfb6ef9c97c5194e3ce3319e80cac;hp=87350321e66ffa026d47d22f054a429ae7977590;hpb=28b2ee20c7cba812b6f2ccf6d722cf86d00a84dc;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/memory.c b/mm/memory.c index 8735032..164951c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -61,6 +62,8 @@ #include #include +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. */ -void free_pgd_range(struct mmu_gather **tlb, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { @@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, + unsigned long vaddr) { printk(KERN_ERR "Bad pte = %08llx, process = %s, " "vm_flags = %lx, vaddr = %lx\n", @@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + int ret; /* * Don't copy ptes where a page fault will fill them correctly. @@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true. + */ + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_start(src_mm, addr, end); + + ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next)) - return -ENOMEM; + if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next))) { + ret = -ENOMEM; + break; + } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - return 0; + + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_end(src_mm, + vma->vm_start, end); + return ret; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; int fullmm = (*tlbp)->fullmm; + struct mm_struct *mm = vma->vm_mm; + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end); - zap_work -= (end - start) / - (HPAGE_SIZE / PAGE_SIZE); + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + unmap_hugepage_range(vma, start, end, NULL); + zap_work -= (end - start) / + pages_per_huge_page(hstate_vma(vma)); + } + start = end; } else start = unmap_page_range(*tlbp, vma, @@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } } out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ } @@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, return end; } +/** + * zap_vma_ptes - remove ptes mapping the vma + * @vma: vm_area_struct holding ptes to be zapped + * @address: starting address of pages to zap + * @size: number of bytes to zap + * + * This function only unmaps ptes assigned to VM_PFNMAP vmas. + * + * The entire address range must be fully contained within the vma. + * + * Returns 0 if successful. + */ +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + if (address < vma->vm_start || address + size > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + return -1; + zap_page_range(vma, address, size, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(zap_vma_ptes); + /* * Do a quick page-table lookup for a single page. */ @@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page_table; pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) goto no_page_table; - + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } - if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1063,12 +1129,17 @@ static inline int use_zero_page(struct vm_area_struct *vma) return !vma->vm_ops || !vma->vm_ops->fault; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags; + unsigned int vm_flags = 0; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); if (len <= 0) return 0; @@ -1092,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pud_t *pud; pmd_t *pmd; pte_t *pte; - if (write) /* user gate pages are read-only */ + + /* user gate pages are read-only */ + if (!ignore && write) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1124,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) - || !(vm_flags & vma->vm_flags)) + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + (!ignore && !(vm_flags & vma->vm_flags))) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -1200,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (len); return i; } + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} + EXPORT_SYMBOL(get_user_pages); pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, @@ -1230,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, pte_t *pte; spinlock_t *ptl; - retval = mem_cgroup_charge(page, mm, GFP_KERNEL); - if (retval) - goto out; - retval = -EINVAL; if (PageAnon(page)) - goto out_uncharge; + goto out; retval = -ENOMEM; flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out_uncharge; + goto out; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; @@ -1257,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, return retval; out_unlock: pte_unmap_unlock(pte, ptl); -out_uncharge: - mem_cgroup_uncharge_page(page); out: return retval; } @@ -1551,6 +1636,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long next; int err; + BUG_ON(pud_huge(*pud)); + pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; @@ -1592,10 +1679,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long end = addr + size; + unsigned long start = addr, end = addr + size; int err; BUG_ON(addr >= end); + mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1603,6 +1691,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); + mmu_notifier_invalidate_range_end(mm, start, end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -1719,7 +1808,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page)) { - if (!TestSetPageLocked(old_page)) { + if (trylock_page(old_page)) { reuse = can_share_swap_page(old_page); unlock_page(old_page); } @@ -1788,6 +1877,15 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) { + lock_page(old_page); /* for LRU manipulation */ + clear_page_mlock(old_page); + unlock_page(old_page); + } cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -1815,12 +1913,14 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush(vma, address, page_table); - set_pte_at(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lru_cache_add_active(new_page); + ptep_clear_flush_notify(vma, address, page_table); + SetPageSwapBacked(new_page); + lru_cache_add_active_or_unevictable(new_page, vma); page_add_new_anon_rmap(new_page, vma, address); +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, entry); if (old_page) { /* * Only after switching the pte to the new page may @@ -2218,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGMAJFAULT); } + mark_page_accessed(page); + + lock_page(page); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); ret = VM_FAULT_OOM; + unlock_page(page); goto out; } - mark_page_accessed(page); - lock_page(page); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - /* * Back out if somebody else already faulted in this pte. */ @@ -2254,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_anon_rmap(page, vma, address); swap_free(entry); - if (vm_swap_full()) + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) remove_exclusive_swap_page(page); unlock_page(page); @@ -2312,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2353,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t entry; int anon = 0; + int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; @@ -2393,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + page_cache_release(page); + goto out; + } + charged = 1; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) + clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -2427,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - goto out; - } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* @@ -2450,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - page_add_new_anon_rmap(page, vma, address); + inc_mm_counter(mm, anon_rss); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(page); @@ -2463,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); } else { - mem_cgroup_uncharge_page(page); + if (charged) + mem_cgroup_uncharge_page(page); if (anon) page_cache_release(page); else @@ -2695,7 +2808,7 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) - return -1; + return -ENOMEM; write = (vma->vm_flags & VM_WRITE) != 0; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); @@ -2704,7 +2817,7 @@ int make_pages_present(unsigned long addr, unsigned long end) len, write, 0, NULL, NULL); if (ret < 0) return ret; - return ret == len ? 0 : -1; + return ret == len ? 0 : -EFAULT; } #if !defined(__HAVE_ARCH_GATE_AREA)