page-allocator: clear N_HIGH_MEMORY map before we set it again
[safe/jmp/linux-2.6] mm/memory.c
index 5b4ad5e..d5d1653 100644
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        return i;
 }
 
+/**
+ * get_user_pages() - pin user pages in memory
+ * @tsk:       task_struct of target task
+ * @mm:                mm_struct of target mm
+ * @start:     starting user address
+ * @len:       number of pages from start to pin
+ * @write:     whether pages will be written to by the caller
+ * @force:     whether to force write access even if user mapping is
+ *             readonly. This will result in the page being COWed even
+ *             in MAP_SHARED mappings. You do not want this.
+ * @pages:     array that receives pointers to the pages pinned.
+ *             Should be at least @len pages long. Or NULL, if caller
+ *             only intends to ensure the pages are faulted in.
+ * @vmas:      array of pointers to vmas corresponding to each page.
+ *             Or NULL if the caller does not require them.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If len is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (e.g. if mmapped pagecache has been invalidated
+ * and subsequently re-faulted). However, it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If write=0, the page must not be written to. If the page is written to,
+ * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
+ * after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ */
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long start, int len, int write, int force,
                struct page **pages, struct vm_area_struct **vmas)
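
As an aside to the comment block added above: the rules it spells out (call with
mmap_sem held, release every page with put_page(), and dirty any page you wrote
to before releasing it) translate into the following hypothetical caller. This
sketch is not part of the patch; the function name and the buf/nr_pages
parameters are invented for illustration, and it assumes the get_user_pages()
prototype shown in this hunk plus a normal process context (current).

    /* Hypothetical example only; not part of this patch. */
    static int example_pin_user_buffer(unsigned long buf, int nr_pages,
                                       struct page **pages)
    {
            int i, got;

            down_read(&current->mm->mmap_sem);
            got = get_user_pages(current, current->mm,
                                 buf & PAGE_MASK,   /* start         */
                                 nr_pages,          /* len, in pages */
                                 1,                 /* write         */
                                 0,                 /* force         */
                                 pages, NULL);
            up_read(&current->mm->mmap_sem);

            if (got <= 0)
                    return got ? got : -EFAULT;

            /* ... access the pinned pages via kmap() or DMA here ... */

            for (i = 0; i < got; i++) {
                    set_page_dirty_lock(pages[i]);  /* pages were written */
                    put_page(pages[i]);
            }
            return got;
    }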
@@ -1945,6 +1995,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * get_user_pages(.write=1, .force=1).
                 */
                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+                       struct vm_fault vmf;
+                       int tmp;
+
+                       vmf.virtual_address = (void __user *)(address &
+                                                               PAGE_MASK);
+                       vmf.pgoff = old_page->index;
+                       vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+                       vmf.page = old_page;
+
                        /*
                         * Notify the address space that the page is about to
                         * become writable so that it can prohibit this or wait
@@ -1956,8 +2015,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
 
-                       if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+                       tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+                       if (unlikely(tmp &
+                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+                               ret = tmp;
                                goto unwritable_page;
+                       }
+                       if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                               lock_page(old_page);
+                               if (!old_page->mapping) {
+                                       ret = 0; /* retry the fault */
+                                       unlock_page(old_page);
+                                       goto unwritable_page;
+                               }
+                       } else
+                               VM_BUG_ON(!PageLocked(old_page));
 
                        /*
                         * Since we dropped the lock we need to revalidate
@@ -1967,9 +2039,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         */
                        page_table = pte_offset_map_lock(mm, pmd, address,
                                                         &ptl);
-                       page_cache_release(old_page);
-                       if (!pte_same(*page_table, orig_pte))
+                       if (!pte_same(*page_table, orig_pte)) {
+                               unlock_page(old_page);
+                               page_cache_release(old_page);
                                goto unlock;
+                       }
 
                        page_mkwrite = 1;
                }
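
For reference, the calling convention do_wp_page() now expects from
->page_mkwrite() (vmf->page passed in, VM_FAULT_LOCKED returned when the page
is left locked, VM_FAULT_NOPAGE or a VM_FAULT_ERROR bit on failure) could be
satisfied by a handler along the lines of the hypothetical sketch below. It is
illustrative only and not taken from this patch; a real filesystem would also
reserve blocks or journal credits before returning.

    /* Hypothetical example only; not part of this patch. */
    static int example_page_mkwrite(struct vm_area_struct *vma,
                                    struct vm_fault *vmf)
    {
            struct page *page = vmf->page;
            struct address_space *mapping = vma->vm_file->f_mapping;

            lock_page(page);
            /* Lost a race with truncate/invalidate: have the fault retried. */
            if (page->mapping != mapping) {
                    unlock_page(page);
                    return VM_FAULT_NOPAGE;
            }

            /* ... allocate blocks / reserve space for the coming write ... */

            /* Tell the caller the page is being returned locked. */
            return VM_FAULT_LOCKED;
    }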
@@ -2081,9 +2155,6 @@ gotten:
 unlock:
        pte_unmap_unlock(page_table, ptl);
        if (dirty_page) {
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
-
                /*
                 * Yes, Virginia, this is actually required to prevent a race
                 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2092,21 +2163,46 @@ unlock:
                 *
                 * do_no_page is protected similarly.
                 */
-               wait_on_page_locked(dirty_page);
-               set_page_dirty_balance(dirty_page, page_mkwrite);
+               if (!page_mkwrite) {
+                       wait_on_page_locked(dirty_page);
+                       set_page_dirty_balance(dirty_page, page_mkwrite);
+               }
                put_page(dirty_page);
+               if (page_mkwrite) {
+                       struct address_space *mapping = dirty_page->mapping;
+
+                       set_page_dirty(dirty_page);
+                       unlock_page(dirty_page);
+                       page_cache_release(dirty_page);
+                       if (mapping) {
+                               /*
+                                * Some device drivers do not set page.mapping
+                                * but still dirty their pages
+                                */
+                               balance_dirty_pages_ratelimited(mapping);
+                       }
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
        }
        return ret;
 oom_free_new:
        page_cache_release(new_page);
 oom:
-       if (old_page)
+       if (old_page) {
+               if (page_mkwrite) {
+                       unlock_page(old_page);
+                       page_cache_release(old_page);
+               }
                page_cache_release(old_page);
+       }
        return VM_FAULT_OOM;
 
 unwritable_page:
        page_cache_release(old_page);
-       return VM_FAULT_SIGBUS;
+       return ret;
 }
 
 /*
@@ -2445,8 +2541,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                ret = VM_FAULT_OOM;
-               unlock_page(page);
-               goto out;
+               goto out_page;
        }
 
        /*
@@ -2508,6 +2603,7 @@ out:
 out_nomap:
        mem_cgroup_cancel_charge_swapin(ptr);
        pte_unmap_unlock(page_table, ptl);
+out_page:
        unlock_page(page);
        page_cache_release(page);
        return ret;
@@ -2648,25 +2744,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         * to become writable
                         */
                        if (vma->vm_ops->page_mkwrite) {
+                               int tmp;
+
                                unlock_page(page);
-                               if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
-                                       ret = VM_FAULT_SIGBUS;
-                                       anon = 1; /* no anon but release vmf.page */
-                                       goto out_unlocked;
-                               }
-                               lock_page(page);
-                               /*
-                                * XXX: this is not quite right (racy vs
-                                * invalidate) to unlock and relock the page
-                                * like this, however a better fix requires
-                                * reworking page_mkwrite locking API, which
-                                * is better done later.
-                                */
-                               if (!page->mapping) {
-                                       ret = 0;
-                                       anon = 1; /* no anon but release vmf.page */
-                                       goto out;
+                               vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+                               tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+                               if (unlikely(tmp &
+                                         (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+                                       ret = tmp;
+                                       goto unwritable_page;
                                }
+                               if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                                       lock_page(page);
+                                       if (!page->mapping) {
+                                               ret = 0; /* retry the fault */
+                                               unlock_page(page);
+                                               goto unwritable_page;
+                                       }
+                               } else
+                                       VM_BUG_ON(!PageLocked(page));
                                page_mkwrite = 1;
                        }
                }
@@ -2718,19 +2814,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_unmap_unlock(page_table, ptl);
 
 out:
-       unlock_page(vmf.page);
-out_unlocked:
-       if (anon)
-               page_cache_release(vmf.page);
-       else if (dirty_page) {
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
+       if (dirty_page) {
+               struct address_space *mapping = page->mapping;
 
-               set_page_dirty_balance(dirty_page, page_mkwrite);
+               if (set_page_dirty(dirty_page))
+                       page_mkwrite = 1;
+               unlock_page(dirty_page);
                put_page(dirty_page);
+               if (page_mkwrite && mapping) {
+                       /*
+                        * Some device drivers do not set page.mapping but still
+                        * dirty their pages
+                        */
+                       balance_dirty_pages_ratelimited(mapping);
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
+       } else {
+               unlock_page(vmf.page);
+               if (anon)
+                       page_cache_release(vmf.page);
        }
 
        return ret;
+
+unwritable_page:
+       page_cache_release(page);
+       return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -2991,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif /* __HAVE_ARCH_GATE_AREA */
 
-#ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
-               unsigned long address, unsigned int flags,
-               unsigned long *prot, resource_size_t *phys)
+static int follow_pte(struct mm_struct *mm, unsigned long address,
+               pte_t **ptepp, spinlock_t **ptlp)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *ptep, pte;
-       spinlock_t *ptl;
-       resource_size_t phys_addr = 0;
-       struct mm_struct *mm = vma->vm_mm;
-       int ret = -EINVAL;
-
-       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-               goto out;
+       pte_t *ptep;
 
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3024,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
        if (pmd_huge(*pmd))
                goto out;
 
-       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
        if (!ptep)
                goto out;
+       if (!pte_present(*ptep))
+               goto unlock;
+       *ptepp = ptep;
+       return 0;
+unlock:
+       pte_unmap_unlock(ptep, *ptlp);
+out:
+       return -EINVAL;
+}
+
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only IO mappings and raw PFN mappings are allowed.
+ *
+ * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+       unsigned long *pfn)
+{
+       int ret = -EINVAL;
+       spinlock_t *ptl;
+       pte_t *ptep;
 
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+               return ret;
+
+       ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+       if (ret)
+               return ret;
+       *pfn = pte_pfn(*ptep);
+       pte_unmap_unlock(ptep, ptl);
+       return 0;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+int follow_phys(struct vm_area_struct *vma,
+               unsigned long address, unsigned int flags,
+               unsigned long *prot, resource_size_t *phys)
+{
+       int ret = -EINVAL;
+       pte_t *ptep, pte;
+       spinlock_t *ptl;
+
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+               goto out;
+
+       if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+               goto out;
        pte = *ptep;
-       if (!pte_present(pte))
-               goto unlock;
+
        if ((flags & FOLL_WRITE) && !pte_write(pte))
                goto unlock;
-       phys_addr = pte_pfn(pte);
-       phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
 
        *prot = pgprot_val(pte_pgprot(pte));
-       *phys = phys_addr;
-       ret = 0;
+       *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
 
+       ret = 0;
 unlock:
        pte_unmap_unlock(ptep, ptl);
 out:
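
The newly exported follow_pfn() above is meant for callers that hold mmap_sem
and already have a VM_IO or VM_PFNMAP vma in hand. A hypothetical driver-side
lookup, not taken from this patch (the function name and error handling are
illustrative), might use it like this:

    /* Hypothetical example only; not part of this patch. */
    static int example_user_addr_to_pfn(struct mm_struct *mm,
                                        unsigned long addr,
                                        unsigned long *pfn)
    {
            struct vm_area_struct *vma;
            int ret = -EINVAL;

            down_read(&mm->mmap_sem);
            vma = find_vma(mm, addr);
            if (vma && addr >= vma->vm_start)
                    ret = follow_pfn(vma, addr, pfn);
            up_read(&mm->mmap_sem);

            return ret;
    }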