X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmremap.c;h=97bff2547719e702150e1cdc4d4a3f6b31a23213;hb=d327cf7449e6fd5cbac784c641770e9366faa386;hp=a32fed454bd7c60e15da0c720b1631b55a3f8835;hpb=9a61c349b28ec5aef7e929236571fd770fdef0bb;p=safe%2Fjmp%2Flinux-2.6

diff --git a/mm/mremap.c b/mm/mremap.c
index a32fed4..97bff25 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -3,7 +3,7 @@
  *
  *	(C) Copyright 1996 Linus Torvalds
  *
- *	Address space accounting code	<alan@redhat.com>
+ *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
  */
 
@@ -11,46 +11,23 @@
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
 #include <linux/shm.h>
+#include <linux/ksm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/capability.h>
 #include <linux/fs.h>
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte = NULL;
-
-	pgd = pgd_offset(mm, addr);
-	if (pgd_none_or_clear_bad(pgd))
-		goto end;
-
-	pud = pud_offset(pgd, addr);
-	if (pud_none_or_clear_bad(pud))
-		goto end;
-
-	pmd = pmd_offset(pud, addr);
-	if (pmd_none_or_clear_bad(pmd))
-		goto end;
+#include "internal.h"
 
-	pte = pte_offset_map_nested(pmd, addr);
-	if (pte_none(*pte)) {
-		pte_unmap_nested(pte);
-		pte = NULL;
-	}
-end:
-	return pte;
-}
-
-static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -68,41 +45,49 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
 	if (pmd_none_or_clear_bad(pmd))
 		return NULL;
 
-	return pte_offset_map(pmd, addr);
+	return pmd;
 }
 
-static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte = NULL;
 
 	pgd = pgd_offset(mm, addr);
-
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
 		return NULL;
+
 	pmd = pmd_alloc(mm, pud, addr);
-	if (pmd)
-		pte = pte_alloc_map(mm, pmd, addr);
-	return pte;
+	if (!pmd)
+		return NULL;
+
+	if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+		return NULL;
+
+	return pmd;
 }
 
-static int
-move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
-		struct vm_area_struct *new_vma, unsigned long new_addr)
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+		unsigned long old_addr, unsigned long old_end,
+		struct vm_area_struct *new_vma, pmd_t *new_pmd,
+		unsigned long new_addr)
 {
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
-	int error = 0;
-	pte_t *src, *dst;
+	pte_t *old_pte, *new_pte, pte;
+	spinlock_t *old_ptl, *new_ptl;
+	unsigned long old_start;
 
+	old_start = old_addr;
+	mmu_notifier_invalidate_range_start(vma->vm_mm,
+					    old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock vmtruncate out,
-		 * since it might clean the dst vma before the src vma,
+		 * moving file-based ptes, we must lock truncate_pagecache
+		 * out, since it might clean the dst vma before the src vma,
 		 * and we propagate stale pages into the dst afterward.
 		 */
 		mapping = vma->vm_file->f_mapping;
@@ -111,74 +96,71 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
 		    new_vma->vm_truncate_count != vma->vm_truncate_count)
 			new_vma->vm_truncate_count = 0;
 	}
-	spin_lock(&mm->page_table_lock);
 
-	src = get_one_pte_map_nested(mm, old_addr);
-	if (src) {
-		/*
-		 * Look to see whether alloc_one_pte_map needs to perform a
-		 * memory allocation.  If it does then we need to drop the
-		 * atomic kmap
-		 */
-		dst = get_one_pte_map(mm, new_addr);
-		if (unlikely(!dst)) {
-			pte_unmap_nested(src);
-			if (mapping)
-				spin_unlock(&mapping->i_mmap_lock);
-			dst = alloc_one_pte_map(mm, new_addr);
-			if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
-				spin_unlock(&mm->page_table_lock);
-				spin_lock(&mapping->i_mmap_lock);
-				spin_lock(&mm->page_table_lock);
-			}
-			src = get_one_pte_map_nested(mm, old_addr);
-		}
-		/*
-		 * Since alloc_one_pte_map can drop and re-acquire
-		 * page_table_lock, we should re-check the src entry...
-		 */
-		if (src) {
-			if (dst) {
-				pte_t pte;
-				pte = ptep_clear_flush(vma, old_addr, src);
-				/* ZERO_PAGE can be dependant on virtual addr */
-				if (pfn_valid(pte_pfn(pte)) &&
-					pte_page(pte) == ZERO_PAGE(old_addr))
-					pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr), new_vma->vm_page_prot));
-				set_pte_at(mm, new_addr, dst, pte);
-			} else
-				error = -ENOMEM;
-			pte_unmap_nested(src);
-		}
-		if (dst)
-			pte_unmap(dst);
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 */
+	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ 	new_pte = pte_offset_map_nested(new_pmd, new_addr);
+	new_ptl = pte_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	arch_enter_lazy_mmu_mode();
+
+	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+				   new_pte++, new_addr += PAGE_SIZE) {
+		if (pte_none(*old_pte))
+			continue;
+		pte = ptep_clear_flush(vma, old_addr, old_pte);
+		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+		set_pte_at(mm, new_addr, new_pte, pte);
 	}
-	spin_unlock(&mm->page_table_lock);
+
+	arch_leave_lazy_mmu_mode();
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	pte_unmap_nested(new_pte - 1);
+	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
-	return error;
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 
-static unsigned long move_page_tables(struct vm_area_struct *vma,
+#define LATENCY_LIMIT	(64 * PAGE_SIZE)
+
+unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len)
 {
-	unsigned long offset;
+	unsigned long extent, next, old_end;
+	pmd_t *old_pmd, *new_pmd;
 
-	flush_cache_range(vma, old_addr, old_addr + len);
+	old_end = old_addr + len;
+	flush_cache_range(vma, old_addr, old_end);
 
-	/*
-	 * This is not the clever way to do this, but we're taking the
-	 * easy way out on the assumption that most remappings will be
-	 * only a few pages.. This also makes error recovery easier.
-	 */
-	for (offset = 0; offset < len; offset += PAGE_SIZE) {
-		if (move_one_page(vma, old_addr + offset,
-				new_vma, new_addr + offset) < 0)
-			break;
+	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
+		next = (old_addr + PMD_SIZE) & PMD_MASK;
+		if (next - 1 > old_end)
+			next = old_end;
+		extent = next - old_addr;
+		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+		if (!old_pmd)
+			continue;
+		new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+		if (!new_pmd)
+			break;
+		next = (new_addr + PMD_SIZE) & PMD_MASK;
+		if (extent > next - new_addr)
+			extent = next - new_addr;
+		if (extent > LATENCY_LIMIT)
+			extent = LATENCY_LIMIT;
+		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+				new_vma, new_pmd, new_addr);
 	}
-	return offset;
+
+	return len + old_addr - old_end;	/* how much done */
 }
 
 static unsigned long move_vma(struct vm_area_struct *vma,
@@ -191,7 +173,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	unsigned long new_pgoff;
 	unsigned long moved_len;
 	unsigned long excess = 0;
+	unsigned long hiwater_vm;
 	int split = 0;
+	int err;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -200,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (mm->map_count >= sysctl_max_map_count - 3)
 		return -ENOMEM;
 
+	/*
+	 * Advise KSM to break any KSM pages in the area to be moved:
+	 * it would be confusing if they were to turn up at the new
+	 * location, where they happen to coincide with different KSM
+	 * pages recently unmapped.  But leave vma->vm_flags as it was,
+	 * so KSM can come around to merge on vma and new_vma afterwards.
+	 */
+	err = ksm_madvise(vma, old_addr, old_addr + old_len,
+						MADV_UNMERGEABLE, &vm_flags);
+	if (err)
+		return err;
+
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
 	if (!new_vma)
@@ -229,17 +225,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	}
 
 	/*
-	 * if we failed to move page tables we still do total_vm increment
-	 * since do_munmap() will decrement it by old_len == new_len
+	 * If we failed to move page tables we still do total_vm increment
+	 * since do_munmap() will decrement it by old_len == new_len.
+	 *
+	 * Since total_vm is about to be raised artificially high for a
+	 * moment, we need to restore high watermark afterwards: if stats
+	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
+	 * If this were a serious issue, we'd add a flag to do_munmap().
 	 */
+	hiwater_vm = mm->hiwater_vm;
 	mm->total_vm += new_len >> PAGE_SHIFT;
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
 	if (excess) {
@@ -251,8 +254,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		if (new_len > old_len)
-			make_pages_present(new_addr + old_len,
-					   new_addr + new_len);
+			mlock_vma_pages_range(new_vma, new_addr + old_len,
+						       new_addr + new_len);
 	}
 
 	return new_addr;
@@ -269,6 +272,7 @@ unsigned long do_mremap(unsigned long addr,
 	unsigned long old_len, unsigned long new_len,
 	unsigned long flags, unsigned long new_addr)
 {
+	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
@@ -309,7 +313,11 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;
 
-		ret = do_munmap(current->mm, new_addr, new_len);
+		ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+		if (ret)
+			goto out;
+
+		ret = do_munmap(mm, new_addr, new_len);
 		if (ret)
 			goto out;
 	}
@@ -320,7 +328,7 @@ unsigned long do_mremap(unsigned long addr,
 	 * do_munmap does all the needed commit accounting
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+		ret = do_munmap(mm, addr+new_len, old_len - new_len);
 		if (ret && old_len != new_len)
 			goto out;
 		ret = addr;
@@ -333,7 +341,7 @@ unsigned long do_mremap(unsigned long addr,
 	 * Ok, we need to grow..  or relocate.
 	 */
 	ret = -EFAULT;
-	vma = find_vma(current->mm, addr);
+	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
 	if (is_vm_hugetlb_page(vma)) {
@@ -343,20 +351,20 @@ unsigned long do_mremap(unsigned long addr,
 	/* We can't remap across vm area boundaries */
 	if (old_len > vma->vm_end - addr)
 		goto out;
-	if (vma->vm_flags & VM_DONTEXPAND) {
+	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
 		if (new_len > old_len)
 			goto out;
 	}
 	if (vma->vm_flags & VM_LOCKED) {
 		unsigned long locked, lock_limit;
-		locked = current->mm->locked_vm << PAGE_SHIFT;
+		locked = mm->locked_vm << PAGE_SHIFT;
 		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 		locked += new_len - old_len;
 		ret = -EAGAIN;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 			goto out;
 	}
-	if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) {
+	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -383,12 +391,11 @@ unsigned long do_mremap(unsigned long addr,
 			vma_adjust(vma, vma->vm_start,
 				addr + new_len, vma->vm_pgoff, NULL);
 
-			current->mm->total_vm += pages;
-			__vm_stat_account(vma->vm_mm, vma->vm_flags,
-							vma->vm_file, pages);
+			mm->total_vm += pages;
+			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
-				current->mm->locked_vm += pages;
-				make_pages_present(addr + old_len,
+				mm->locked_vm += pages;
+				mlock_vma_pages_range(vma, addr + old_len,
 						   addr + new_len);
 			}
 			ret = addr;
@@ -409,8 +416,13 @@ unsigned long do_mremap(unsigned long addr,
 
 			new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
 						vma->vm_pgoff, map_flags);
-			ret = new_addr;
-			if (new_addr & ~PAGE_MASK)
+			if (new_addr & ~PAGE_MASK) {
+				ret = new_addr;
+				goto out;
+			}
+
+			ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+			if (ret)
 				goto out;
 		}
 		ret = move_vma(vma, addr, old_len, new_len, new_addr);
@@ -422,9 +434,9 @@ out_nc:
 	return ret;
 }
 
-asmlinkage unsigned long sys_mremap(unsigned long addr,
-	unsigned long old_len, unsigned long new_len,
-	unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+		unsigned long, new_len, unsigned long, flags,
+		unsigned long, new_addr)
 {
 	unsigned long ret;