page-allocator: clear N_HIGH_MEMORY map before we set it again
[safe/jmp/linux-2.6] mm/memory.c
index 5b4ad5e..d5d1653 100644
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        return i;
 }
 
+/**
+ * get_user_pages() - pin user pages in memory
+ * @tsk:       task_struct of target task
+ * @mm:                mm_struct of target mm
+ * @start:     starting user address
+ * @len:       number of pages from start to pin
+ * @write:     whether pages will be written to by the caller
+ * @force:     whether to force write access even if user mapping is
+ *             readonly. This will result in the page being COWed even
+ *             in MAP_SHARED mappings. You do not want this.
+ * @pages:     array that receives pointers to the pages pinned.
+ *             Should be at least @len pages long. Or NULL, if caller
+ *             only intends to ensure the pages are faulted in.
+ * @vmas:      array of pointers to vmas corresponding to each page.
+ *             Or NULL if the caller does not require them.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If len is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (e.g. if mmapped pagecache has been invalidated
+ * and subsequently re-faulted). However, it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If write=0, the page must not be written to. If the page is written to,
+ * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
+ * after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ */
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long start, int len, int write, int force,
                struct page **pages, struct vm_area_struct **vmas)
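
As an aside to the comment block added above: the rules it spells out (call with
mmap_sem held, release every page with put_page(), and dirty any page you wrote
to before releasing it) translate into the following hypothetical caller. This
sketch is not part of the patch; the function name and the buf/nr_pages
parameters are invented for illustration, and it assumes the get_user_pages()
prototype shown in this hunk plus a normal process context (current).

    /* Hypothetical example only; not part of this patch. */
    static int example_pin_user_buffer(unsigned long buf, int nr_pages,
                                       struct page **pages)
    {
            int i, got;

            down_read(&current->mm->mmap_sem);
            got = get_user_pages(current, current->mm,
                                 buf & PAGE_MASK,   /* start         */
                                 nr_pages,          /* len, in pages */
                                 1,                 /* write         */
                                 0,                 /* force         */
                                 pages, NULL);
            up_read(&current->mm->mmap_sem);

            if (got <= 0)
                    return got ? got : -EFAULT;

            /* ... access the pinned pages via kmap() or DMA here ... */

            for (i = 0; i < got; i++) {
                    set_page_dirty_lock(pages[i]);  /* pages were written */
                    put_page(pages[i]);
            }
            return got;
    }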
@@ -1945,6 +1995,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * get_user_pages(.write=1, .force=1).
                 */
                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+                       struct vm_fault vmf;
+                       int tmp;
+
+                       vmf.virtual_address = (void __user *)(address &
+                                                               PAGE_MASK);
+                       vmf.pgoff = old_page->index;
+                       vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+                       vmf.page = old_page;
+
                        /*
                         * Notify the address space that the page is about to
                         * become writable so that it can prohibit this or wait
@@ -1956,8 +2015,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
 
-                       if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+                       tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+                       if (unlikely(tmp &
+                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+                               ret = tmp;
                                goto unwritable_page;
+                       }
+                       if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                               lock_page(old_page);
+                               if (!old_page->mapping) {
+                                       ret = 0; /* retry the fault */
+                                       unlock_page(old_page);
+                                       goto unwritable_page;
+                               }
+                       } else
+                               VM_BUG_ON(!PageLocked(old_page));
 
                        /*
                         * Since we dropped the lock we need to revalidate
@@ -1967,9 +2039,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         */
                        page_table = pte_offset_map_lock(mm, pmd, address,
                                                         &ptl);
-                       page_cache_release(old_page);
-                       if (!pte_same(*page_table, orig_pte))
+                       if (!pte_same(*page_table, orig_pte)) {
+                               unlock_page(old_page);
+                               page_cache_release(old_page);
                                goto unlock;
+                       }
 
                        page_mkwrite = 1;
                }
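
For reference, the calling convention do_wp_page() now expects from
->page_mkwrite() (vmf->page passed in, VM_FAULT_LOCKED returned when the page
is left locked, VM_FAULT_NOPAGE or a VM_FAULT_ERROR bit on failure) could be
satisfied by a handler along the lines of the hypothetical sketch below. It is
illustrative only and not taken from this patch; a real filesystem would also
reserve blocks or journal credits before returning.

    /* Hypothetical example only; not part of this patch. */
    static int example_page_mkwrite(struct vm_area_struct *vma,
                                    struct vm_fault *vmf)
    {
            struct page *page = vmf->page;
            struct address_space *mapping = vma->vm_file->f_mapping;

            lock_page(page);
            /* Lost a race with truncate/invalidate: have the fault retried. */
            if (page->mapping != mapping) {
                    unlock_page(page);
                    return VM_FAULT_NOPAGE;
            }

            /* ... allocate blocks / reserve space for the coming write ... */

            /* Tell the caller the page is being returned locked. */
            return VM_FAULT_LOCKED;
    }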
@@ -2081,9 +2155,6 @@ gotten:
 unlock:
        pte_unmap_unlock(page_table, ptl);
        if (dirty_page) {
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
-
                /*
                 * Yes, Virginia, this is actually required to prevent a race
                 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2092,21 +2163,46 @@ unlock:
                 *
                 * do_no_page is protected similarly.
                 */
-               wait_on_page_locked(dirty_page);
-               set_page_dirty_balance(dirty_page, page_mkwrite);
+               if (!page_mkwrite) {
+                       wait_on_page_locked(dirty_page);
+                       set_page_dirty_balance(dirty_page, page_mkwrite);
+               }
                put_page(dirty_page);
+               if (page_mkwrite) {
+                       struct address_space *mapping = dirty_page->mapping;
+
+                       set_page_dirty(dirty_page);
+                       unlock_page(dirty_page);
+                       page_cache_release(dirty_page);
+                       if (mapping) {
+                               /*
+                                * Some device drivers do not set page.mapping
+                                * but still dirty their pages
+                                */
+                               balance_dirty_pages_ratelimited(mapping);
+                       }
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
        }
        return ret;
 oom_free_new:
        page_cache_release(new_page);
 oom:
-       if (old_page)
+       if (old_page) {
+               if (page_mkwrite) {
+                       unlock_page(old_page);
+                       page_cache_release(old_page);
+               }
                page_cache_release(old_page);
+       }
        return VM_FAULT_OOM;
 
 unwritable_page:
        page_cache_release(old_page);
-       return VM_FAULT_SIGBUS;
+       return ret;
 }
 
 /*
@@ -2445,8 +2541,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                ret = VM_FAULT_OOM;
-               unlock_page(page);
-               goto out;
+               goto out_page;
        }
 
        /*
@@ -2508,6 +2603,7 @@ out:
 out_nomap:
        mem_cgroup_cancel_charge_swapin(ptr);
        pte_unmap_unlock(page_table, ptl);
+out_page:
        unlock_page(page);
        page_cache_release(page);
        return ret;
@@ -2648,25 +2744,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         * to become writable
                         */
                        if (vma->vm_ops->page_mkwrite) {
+                               int tmp;
+
                                unlock_page(page);
-                               if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
-                                       ret = VM_FAULT_SIGBUS;
-                                       anon = 1; /* no anon but release vmf.page */
-                                       goto out_unlocked;
-                               }
-                               lock_page(page);
-                               /*
-                                * XXX: this is not quite right (racy vs
-                                * invalidate) to unlock and relock the page
-                                * like this, however a better fix requires
-                                * reworking page_mkwrite locking API, which
-                                * is better done later.
-                                */
-                               if (!page->mapping) {
-                                       ret = 0;
-                                       anon = 1; /* no anon but release vmf.page */
-                                       goto out;
+                               vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+                               tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+                               if (unlikely(tmp &
+                                         (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+                                       ret = tmp;
+                                       goto unwritable_page;
                                }
+                               if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                                       lock_page(page);
+                                       if (!page->mapping) {
+                                               ret = 0; /* retry the fault */
+                                               unlock_page(page);
+                                               goto unwritable_page;
+                                       }
+                               } else
+                                       VM_BUG_ON(!PageLocked(page));
                                page_mkwrite = 1;
                        }
                }
@@ -2718,19 +2814,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_unmap_unlock(page_table, ptl);
 
 out:
-       unlock_page(vmf.page);
-out_unlocked:
-       if (anon)
-               page_cache_release(vmf.page);
-       else if (dirty_page) {
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
+       if (dirty_page) {
+               struct address_space *mapping = page->mapping;
 
-               set_page_dirty_balance(dirty_page, page_mkwrite);
+               if (set_page_dirty(dirty_page))
+                       page_mkwrite = 1;
+               unlock_page(dirty_page);
                put_page(dirty_page);
+               if (page_mkwrite && mapping) {
+                       /*
+                        * Some device drivers do not set page.mapping but still
+                        * dirty their pages
+                        */
+                       balance_dirty_pages_ratelimited(mapping);
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
+       } else {
+               unlock_page(vmf.page);
+               if (anon)
+                       page_cache_release(vmf.page);
        }
 
        return ret;
+
+unwritable_page:
+       page_cache_release(page);
+       return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -2991,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif /* __HAVE_ARCH_GATE_AREA */
 
-#ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
-               unsigned long address, unsigned int flags,
-               unsigned long *prot, resource_size_t *phys)
+static int follow_pte(struct mm_struct *mm, unsigned long address,
+               pte_t **ptepp, spinlock_t **ptlp)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *ptep, pte;
-       spinlock_t *ptl;
-       resource_size_t phys_addr = 0;
-       struct mm_struct *mm = vma->vm_mm;
-       int ret = -EINVAL;
-
-       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-               goto out;
+       pte_t *ptep;
 
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3024,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
        if (pmd_huge(*pmd))
                goto out;
 
-       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
        if (!ptep)
                goto out;
+       if (!pte_present(*ptep))
+               goto unlock;
+       *ptepp = ptep;
+       return 0;
+unlock:
+       pte_unmap_unlock(ptep, *ptlp);
+out:
+       return -EINVAL;
+}
+
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only IO mappings and raw PFN mappings are allowed.
+ *
+ * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+       unsigned long *pfn)
+{
+       int ret = -EINVAL;
+       spinlock_t *ptl;
+       pte_t *ptep;
 
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+               return ret;
+
+       ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+       if (ret)
+               return ret;
+       *pfn = pte_pfn(*ptep);
+       pte_unmap_unlock(ptep, ptl);
+       return 0;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+int follow_phys(struct vm_area_struct *vma,
+               unsigned long address, unsigned int flags,
+               unsigned long *prot, resource_size_t *phys)
+{
+       int ret = -EINVAL;
+       pte_t *ptep, pte;
+       spinlock_t *ptl;
+
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+               goto out;
+
+       if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+               goto out;
        pte = *ptep;
-       if (!pte_present(pte))
-               goto unlock;
+
        if ((flags & FOLL_WRITE) && !pte_write(pte))
                goto unlock;
-       phys_addr = pte_pfn(pte);
-       phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
 
        *prot = pgprot_val(pte_pgprot(pte));
-       *phys = phys_addr;
-       ret = 0;
+       *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
 
+       ret = 0;
 unlock:
        pte_unmap_unlock(ptep, ptl);
 out:
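
The newly exported follow_pfn() above is meant for callers that hold mmap_sem
and already have a VM_IO or VM_PFNMAP vma in hand. A hypothetical driver-side
lookup, not taken from this patch (the function name and error handling are
illustrative), might use it like this:

    /* Hypothetical example only; not part of this patch. */
    static int example_user_addr_to_pfn(struct mm_struct *mm,
                                        unsigned long addr,
                                        unsigned long *pfn)
    {
            struct vm_area_struct *vma;
            int ret = -EINVAL;

            down_read(&mm->mmap_sem);
            vma = find_vma(mm, addr);
            if (vma && addr >= vma->vm_start)
                    ret = follow_pfn(vma, addr, pfn);
            up_read(&mm->mmap_sem);

            return ret;
    }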