HWPOISON: The high level memory error handler in the VM v7
[safe/jmp/linux-2.6] / mm / rmap.c
index 7e90beb..09c3d0b 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -14,7 +14,7 @@
  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
- * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
+ * Contributions by Hugh Dickins 2003, 2004
  */
 
 /*
  *                 mapping->tree_lock (widely used, in set_page_dirty,
  *                           in arch-dependent flush_dcache_mmap_lock,
  *                           within inode_lock in __sync_single_inode)
+ *
+ * (code doesn't rely on that order so it could be switched around)
+ * ->tasklist_lock
+ *   anon_vma->lock      (memory_failure, collect_procs_anon)
+ *     pte map lock
  */
 
 #include <linux/mm.h>
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
-#include <linux/kallsyms.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
 
 #include <asm/tlbflush.h>
 
 #include "internal.h"
 
-struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_cachep;
+
+static inline struct anon_vma *anon_vma_alloc(void)
+{
+       return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+}
+
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+       kmem_cache_free(anon_vma_cachep, anon_vma);
+}
 
 /**
  * anon_vma_prepare - attach an anon_vma to a memory region
@@ -301,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  * if the page is not mapped into the page tables of this VMA.  Only
  * valid for normal file or anonymous VMAs.
  */
-static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 {
        unsigned long address;
        pte_t *pte;
@@ -323,7 +338,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
  * repeatedly from either page_referenced_anon or page_referenced_file.
  */
 static int page_referenced_one(struct page *page,
-       struct vm_area_struct *vma, unsigned int *mapcount)
+                              struct vm_area_struct *vma,
+                              unsigned int *mapcount,
+                              unsigned long *vm_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -346,11 +363,21 @@ static int page_referenced_one(struct page *page,
         */
        if (vma->vm_flags & VM_LOCKED) {
                *mapcount = 1;  /* break early from loop */
+               *vm_flags |= VM_LOCKED;
                goto out_unmap;
        }
 
-       if (ptep_clear_flush_young_notify(vma, address, pte))
-               referenced++;
+       if (ptep_clear_flush_young_notify(vma, address, pte)) {
+               /*
+                * Don't treat a reference through a sequentially read
+                * mapping as such.  If the page has been used in
+                * another mapping, we will catch it; if this other
+                * mapping is already gone, the unmap path will have
+                * set PG_referenced or activated the page.
+                */
+               if (likely(!VM_SequentialReadHint(vma)))
+                       referenced++;
+       }
 
        /* Pretend the page is referenced if the task has the
           swap token and is in the middle of a page fault. */
@@ -362,11 +389,14 @@ out_unmap:
        (*mapcount)--;
        pte_unmap_unlock(pte, ptl);
 out:
+       if (referenced)
+               *vm_flags |= vma->vm_flags;
        return referenced;
 }
 
 static int page_referenced_anon(struct page *page,
-                               struct mem_cgroup *mem_cont)
+                               struct mem_cgroup *mem_cont,
+                               unsigned long *vm_flags)
 {
        unsigned int mapcount;
        struct anon_vma *anon_vma;
@@ -386,7 +416,8 @@ static int page_referenced_anon(struct page *page,
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
-               referenced += page_referenced_one(page, vma, &mapcount);
+               referenced += page_referenced_one(page, vma,
+                                                 &mapcount, vm_flags);
                if (!mapcount)
                        break;
        }
@@ -399,6 +430,7 @@ static int page_referenced_anon(struct page *page,
  * page_referenced_file - referenced check for object-based rmap
  * @page: the page we're checking references on.
  * @mem_cont: target memory controller
+ * @vm_flags: collect the vm_flags of the vmas that actually referenced the page
  *
  * For an object-based mapped page, find all the places it is mapped and
  * check/clear the referenced flag.  This is done by following the page->mapping
@@ -408,7 +440,8 @@ static int page_referenced_anon(struct page *page,
  * This function is only called from page_referenced for object-based pages.
  */
 static int page_referenced_file(struct page *page,
-                               struct mem_cgroup *mem_cont)
+                               struct mem_cgroup *mem_cont,
+                               unsigned long *vm_flags)
 {
        unsigned int mapcount;
        struct address_space *mapping = page->mapping;
@@ -448,7 +481,8 @@ static int page_referenced_file(struct page *page,
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
-               referenced += page_referenced_one(page, vma, &mapcount);
+               referenced += page_referenced_one(page, vma,
+                                                 &mapcount, vm_flags);
                if (!mapcount)
                        break;
        }
@@ -462,29 +496,35 @@ static int page_referenced_file(struct page *page,
  * @page: the page to test
  * @is_locked: caller holds lock on the page
  * @mem_cont: target memory controller
+ * @vm_flags: collect the vm_flags of the vmas that actually referenced the page
  *
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of ptes which referenced the page.
  */
-int page_referenced(struct page *page, int is_locked,
-                       struct mem_cgroup *mem_cont)
+int page_referenced(struct page *page,
+                   int is_locked,
+                   struct mem_cgroup *mem_cont,
+                   unsigned long *vm_flags)
 {
        int referenced = 0;
 
        if (TestClearPageReferenced(page))
                referenced++;
 
+       *vm_flags = 0;
        if (page_mapped(page) && page->mapping) {
                if (PageAnon(page))
-                       referenced += page_referenced_anon(page, mem_cont);
+                       referenced += page_referenced_anon(page, mem_cont,
+                                                               vm_flags);
                else if (is_locked)
-                       referenced += page_referenced_file(page, mem_cont);
+                       referenced += page_referenced_file(page, mem_cont,
+                                                               vm_flags);
                else if (!trylock_page(page))
                        referenced++;
                else {
                        if (page->mapping)
-                               referenced +=
-                                       page_referenced_file(page, mem_cont);
+                               referenced += page_referenced_file(page,
+                                                       mem_cont, vm_flags);
                        unlock_page(page);
                }
        }
@@ -651,9 +691,14 @@ void page_add_anon_rmap(struct page *page,
 void page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
 {
-       BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-       atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+       VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+       SetPageSwapBacked(page);
+       atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
        __page_set_anon_rmap(page, vma, address);
+       if (page_evictable(page, vma))
+               lru_cache_add_lru(page, LRU_ACTIVE_ANON);
+       else
+               add_page_to_unevictable_list(page);
 }
 
 /**
@@ -664,8 +709,10 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-       if (atomic_inc_and_test(&page->_mapcount))
+       if (atomic_inc_and_test(&page->_mapcount)) {
                __inc_zone_page_state(page, NR_FILE_MAPPED);
+               mem_cgroup_update_mapped_file_stat(page, 1);
+       }
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -683,7 +730,6 @@ void page_add_file_rmap(struct page *page)
  */
 void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
 {
-       BUG_ON(page_mapcount(page) == 0);
        if (PageAnon(page))
                __page_check_anon_rmap(page, vma, address);
        atomic_inc(&page->_mapcount);
@@ -693,28 +739,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
- * @vma: the vm area in which the mapping is removed
  *
  * The caller needs to hold the pte lock.
  */
-void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
+void page_remove_rmap(struct page *page)
 {
        if (atomic_add_negative(-1, &page->_mapcount)) {
-               if (unlikely(page_mapcount(page) < 0)) {
-                       printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
-                       printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
-                       printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
-                       printk (KERN_EMERG "  page->count = %x\n", page_count(page));
-                       printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
-                       print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
-                       if (vma->vm_ops) {
-                               print_symbol (KERN_EMERG "  vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
-                       }
-                       if (vma->vm_file && vma->vm_file->f_op)
-                               print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
-                       BUG();
-               }
-
                /*
                 * Now that the last pte has gone, s390 must transfer dirty
                 * flag from storage key to struct page.  We can usually skip
@@ -727,10 +757,11 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
                        page_clear_dirty(page);
                        set_page_dirty(page);
                }
-
-               mem_cgroup_uncharge_page(page);
+               if (PageAnon(page))
+                       mem_cgroup_uncharge_page(page);
                __dec_zone_page_state(page,
                        PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+               mem_cgroup_update_mapped_file_stat(page, -1);
                /*
                 * It would be tidy to reset the PageAnon mapping here,
                 * but that might overwrite a racing page_add_anon_rmap
@@ -748,7 +779,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-                               int migration)
+                               enum ttu_flags flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -770,11 +801,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
-       if (!migration) {
+       if (!(flags & TTU_IGNORE_MLOCK)) {
                if (vma->vm_flags & VM_LOCKED) {
                        ret = SWAP_MLOCK;
                        goto out_unmap;
                }
+       }
+       if (!(flags & TTU_IGNORE_ACCESS)) {
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
                        ret = SWAP_FAIL;
                        goto out_unmap;
@@ -792,7 +825,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);
 
-       if (PageAnon(page)) {
+       if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+               if (PageAnon(page))
+                       dec_mm_counter(mm, anon_rss);
+               else
+                       dec_mm_counter(mm, file_rss);
+               set_pte_at(mm, address, pte,
+                               swp_entry_to_pte(make_hwpoison_entry(page)));
+       } else if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };
 
                if (PageSwapCache(page)) {
@@ -808,32 +848,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, anon_rss);
-#ifdef CONFIG_MIGRATION
-               } else {
+               } else if (PAGE_MIGRATION) {
                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
-                       BUG_ON(!migration);
+                       BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
                        entry = make_migration_entry(page, pte_write(pteval));
-#endif
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
-       } else
-#ifdef CONFIG_MIGRATION
-       if (migration) {
+       } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
        } else
-#endif
                dec_mm_counter(mm, file_rss);
 
 
-       page_remove_rmap(page, vma);
+       page_remove_rmap(page);
        page_cache_release(page);
 
 out_unmap:
@@ -948,7 +983,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
                if (pte_dirty(pteval))
                        set_page_dirty(page);
 
-               page_remove_rmap(page, vma);
+               page_remove_rmap(page);
                page_cache_release(page);
                dec_mm_counter(mm, file_rss);
                (*mapcount)--;
@@ -993,12 +1028,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        unsigned int mlocked = 0;
        int ret = SWAP_AGAIN;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
@@ -1014,7 +1050,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
                                continue;  /* must visit all unlocked vmas */
                        ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                break;
                }
@@ -1038,8 +1074,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
- * @unlock:  request for unlock rather than unmap [unlikely]
- * @migration:  unmapping for migration - ignored if @unlock
+ * @flags: action and flags
  *
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
@@ -1051,7 +1086,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, int unlock, int migration)
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 {
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1063,6 +1098,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
        unsigned long max_nl_size = 0;
        unsigned int mapcount;
        unsigned int mlocked = 0;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
@@ -1070,11 +1106,12 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if (MLOCK_PAGES && unlikely(unlock)) {
-                       if (!(vma->vm_flags & VM_LOCKED))
+                       if (!((vma->vm_flags & VM_LOCKED) &&
+                                               page_mapped_in_vma(page, vma)))
                                continue;       /* must visit all vmas */
                        ret = SWAP_MLOCK;
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                goto out;
                }
@@ -1099,7 +1136,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
                        ret = SWAP_MLOCK;       /* leave mlocked == 0 */
                        goto out;               /* no need to look further */
                }
-               if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
+               if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
+                       (vma->vm_flags & VM_LOCKED))
                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
@@ -1133,7 +1171,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
-                       if (!MLOCK_PAGES && !migration &&
+                       if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
                            (vma->vm_flags & VM_LOCKED))
                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
@@ -1173,7 +1211,7 @@ out:
 /**
  * try_to_unmap - try to remove all page table mappings to a page
  * @page: the page to get unmapped
- * @migration: migration flag
+ * @flags: action and flags
  *
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock.
@@ -1184,22 +1222,21 @@ out:
  * SWAP_FAIL   - the page is unswappable
  * SWAP_MLOCK  - page is mlocked.
  */
-int try_to_unmap(struct page *page, int migration)
+int try_to_unmap(struct page *page, enum ttu_flags flags)
 {
        int ret;
 
        BUG_ON(!PageLocked(page));
 
        if (PageAnon(page))
-               ret = try_to_unmap_anon(page, 0, migration);
+               ret = try_to_unmap_anon(page, flags);
        else
-               ret = try_to_unmap_file(page, 0, migration);
+               ret = try_to_unmap_file(page, flags);
        if (ret != SWAP_MLOCK && !page_mapped(page))
                ret = SWAP_SUCCESS;
        return ret;
 }
 
-#ifdef CONFIG_UNEVICTABLE_LRU
 /**
  * try_to_munlock - try to munlock a page
  * @page: the page to be munlocked
@@ -1219,8 +1256,8 @@ int try_to_munlock(struct page *page)
        VM_BUG_ON(!PageLocked(page) || PageLRU(page));
 
        if (PageAnon(page))
-               return try_to_unmap_anon(page, 1, 0);
+               return try_to_unmap_anon(page, TTU_MUNLOCK);
        else
-               return try_to_unmap_file(page, 1, 0);
+               return try_to_unmap_file(page, TTU_MUNLOCK);
 }
-#endif
+