swap_info: swap count continuations

author Hugh Dickins <hugh.dickins@tiscali.co.uk>

Tue, 15 Dec 2009 01:58:46 +0000 (17:58 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 15 Dec 2009 16:53:15 +0000 (08:53 -0800)
author Hugh Dickins <hugh.dickins@tiscali.co.uk>
Tue, 15 Dec 2009 01:58:46 +0000 (17:58 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Dec 2009 16:53:15 +0000 (08:53 -0800)
diff --git a/include/linux/swap.h b/include/linux/swap.h

index f733deb..389e7bd 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -145,15 +145,18 @@ enum {
         SWP_DISCARDABLE = (1 << 2),     /* blkdev supports discard */
         SWP_DISCARDING  = (1 << 3),     /* now discarding a free cluster */
         SWP_SOLIDSTATE  = (1 << 4),     /* blkdev seeks are cheap */
+       SWP_CONTINUED   = (1 << 5),     /* swap_map has count continuation */
                                         /* add others here before... */
         SWP_SCANNING    = (1 << 8),     /* refcount in scan_swap_map */
  };
  
  #define SWAP_CLUSTER_MAX 32
  
-#define SWAP_MAP_MAX   0x7e
-#define SWAP_MAP_BAD   0x7f
-#define SWAP_HAS_CACHE 0x80            /* There is a swap cache of entry. */
+#define SWAP_MAP_MAX   0x3e    /* Max duplication count, in first swap_map */
+#define SWAP_MAP_BAD   0x3f    /* Note pageblock is bad, in first swap_map */
+#define SWAP_HAS_CACHE 0x40    /* Flag page is cached, in first swap_map */
+#define SWAP_CONT_MAX  0x7f    /* Max count, in each swap_map continuation */
+#define COUNT_CONTINUED        0x80    /* See swap_map continuation for full count */
  
  /*
   * The in-memory structure used to track swap areas.
@@ -311,9 +314,10 @@ extern long total_swap_pages;
  extern void si_swapinfo(struct sysinfo *);
  extern swp_entry_t get_swap_page(void);
  extern swp_entry_t get_swap_page_of_type(int);
-extern void swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
  extern int valid_swaphandles(swp_entry_t, unsigned long *);
+extern int add_swap_count_continuation(swp_entry_t, gfp_t);
+extern int swap_duplicate(swp_entry_t);
+extern int swapcache_prepare(swp_entry_t);
  extern void swap_free(swp_entry_t);
  extern void swapcache_free(swp_entry_t, struct page *page);
  extern int free_swap_and_cache(swp_entry_t);
@@ -385,8 +389,14 @@ static inline void show_swap_cache_info(void)
  #define free_swap_and_cache(swp)       is_migration_entry(swp)
  #define swapcache_prepare(swp)         is_migration_entry(swp)
  
-static inline void swap_duplicate(swp_entry_t swp)
+static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
  {
+       return 0;
+}
+
+static inline int swap_duplicate(swp_entry_t swp)
+{
+       return 0;
  }
  
  static inline void swap_free(swp_entry_t swp)
diff --git a/mm/memory.c b/mm/memory.c

index 6ab19dd..543c446 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
   * covered by this vma.
   */
  
-static inline void
+static inline unsigned long
  copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
                 unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 if (!pte_file(pte)) {
                         swp_entry_t entry = pte_to_swp_entry(pte);
  
-                       swap_duplicate(entry);
+                       if (swap_duplicate(entry) < 0)
+                               return entry.val;
+
                         /* make sure dst_mm is on swapoff's mmlist. */
                         if (unlikely(list_empty(&dst_mm->mmlist))) {
                                 spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  
  out_set_pte:
         set_pte_at(dst_mm, addr, dst_pte, pte);
+       return 0;
  }
  
  static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         spinlock_t *src_ptl, *dst_ptl;
         int progress = 0;
         int rss[2];
+       swp_entry_t entry = (swp_entry_t){0};
  
  again:
         rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
                         progress++;
                         continue;
                 }
-               copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+               entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+                                                       vma, addr, rss);
+               if (entry.val)
+                       break;
                 progress += 8;
         } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
  
@@ -684,6 +691,12 @@ again:
         add_mm_rss(dst_mm, rss[0], rss[1]);
         pte_unmap_unlock(orig_dst_pte, dst_ptl);
         cond_resched();
+
+       if (entry.val) {
+               if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+                       return -ENOMEM;
+               progress = 0;
+       }
         if (addr != end)
                 goto again;
         return 0;
diff --git a/mm/rmap.c b/mm/rmap.c

index dd43373..710bb4b 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                          * Store the swap location in the pte.
                          * See handle_pte_fault() ...
                          */
-                       swap_duplicate(entry);
+                       if (swap_duplicate(entry) < 0) {
+                               set_pte_at(mm, address, pte, pteval);
+                               ret = SWAP_FAIL;
+                               goto out_unmap;
+                       }
                         if (list_empty(&mm->mmlist)) {
                                 spin_lock(&mmlist_lock);
                                 if (list_empty(&mm->mmlist))
diff --git a/mm/swapfile.c b/mm/swapfile.c

index c0d7b9e..cc5e7eb 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,11 +35,14 @@
  #include <linux/swapops.h>
  #include <linux/page_cgroup.h>
  
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+                                unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+
  static DEFINE_SPINLOCK(swap_lock);
  static unsigned int nr_swapfiles;
  long nr_swap_pages;
  long total_swap_pages;
-static int swap_overflow;
  static int least_priority;
  
  static const char Bad_file[] = "Bad swap file entry ";
@@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex);
  
  static inline unsigned char swap_count(unsigned char ent)
  {
-       return ent & ~SWAP_HAS_CACHE;
+       return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
  }
  
  /* returns 1 if swap entry is freed */
@@ -545,8 +548,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
         if (usage == SWAP_HAS_CACHE) {
                 VM_BUG_ON(!has_cache);
                 has_cache = 0;
-       } else if (count < SWAP_MAP_MAX)
-               count--;
+       } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+               if (count == COUNT_CONTINUED) {
+                       if (swap_count_continued(p, offset, count))
+                               count = SWAP_MAP_MAX | COUNT_CONTINUED;
+                       else
+                               count = SWAP_MAP_MAX;
+               } else
+                       count--;
+       }
  
         if (!count)
                 mem_cgroup_uncharge_swap(entry);
@@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, struct page *page)
  
  /*
   * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
   */
  static inline int page_swapcount(struct page *page)
  {
@@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int type)
         swp_entry_t entry;
         unsigned int i = 0;
         int retval = 0;
-       int reset_overflow = 0;
         int shmem;
  
         /*
@@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int type)
          * together, child after parent.  If we race with dup_mmap(), we
          * prefer to resolve parent before child, lest we miss entries
          * duplicated after we scanned child: using last mm would invert
-        * that.  Though it's only a serious concern when an overflowed
-        * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+        * that.
          */
         start_mm = &init_mm;
         atomic_inc(&init_mm.mm_users);
@@ -1165,36 +1175,6 @@ static int try_to_unuse(unsigned int type)
                 }
  
                 /*
-                * How could swap count reach 0x7ffe ?
-                * There's no way to repeat a swap page within an mm
-                * (except in shmem, where it's the shared object which takes
-                * the reference count)?
-                * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-                * short is too small....)
-                * If that's wrong, then we should worry more about
-                * exit_mmap() and do_munmap() cases described above:
-                * we might be resetting SWAP_MAP_MAX too early here.
-                *
-                * Yes, that's wrong: though very unlikely, swap count 0x7ffe
-                * could surely occur if pid_max raised from PID_MAX_DEFAULT;
-                * and we are now lowering SWAP_MAP_MAX to 0x7e, making it
-                * much easier to reach.  But the next patch will fix that.
-                *
-                * We know "Undead"s can happen, they're okay, so don't
-                * report them; but do report if we reset SWAP_MAP_MAX.
-                */
-               /* We might release the lock_page() in unuse_mm(). */
-               if (!PageSwapCache(page) || page_private(page) != entry.val)
-                       goto retry;
-
-               if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-                       spin_lock(&swap_lock);
-                       *swap_map = SWAP_HAS_CACHE;
-                       spin_unlock(&swap_lock);
-                       reset_overflow = 1;
-               }
-
-               /*
                  * If a reference remains (rare), we would like to leave
                  * the page in the swap cache; but try_to_unmap could
                  * then re-duplicate the entry once we drop page lock,
@@ -1235,7 +1215,6 @@ static int try_to_unuse(unsigned int type)
                  * mark page dirty so shrink_page_list will preserve it.
                  */
                 SetPageDirty(page);
-retry:
                 unlock_page(page);
                 page_cache_release(page);
  
@@ -1247,10 +1226,6 @@ retry:
         }
  
         mmput(start_mm);
-       if (reset_overflow) {
-               printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-               swap_overflow = 0;
-       }
         return retval;
  }
  
@@ -1593,6 +1568,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         up_write(&swap_unplug_sem);
  
         destroy_swap_extents(p);
+       if (p->flags & SWP_CONTINUED)
+               free_swap_count_continuations(p);
+
         mutex_lock(&swapon_mutex);
         spin_lock(&swap_lock);
         drain_mmlist();
@@ -2079,14 +2057,13 @@ void si_swapinfo(struct sysinfo *val)
  /*
   * Verify that a swap entry is valid and increment its swap map count.
   *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
   * Returns error code in following case.
   * - success -> 0
   * - swp_entry is invalid -> EINVAL
   * - swp_entry is migration entry -> EINVAL
   * - swap-cache reference is requested but there is already one. -> EEXIST
   * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
   */
  static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
  {
@@ -2126,15 +2103,14 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
  
         } else if (count || has_cache) {
  
-               if (count < SWAP_MAP_MAX - 1)
-                       count++;
-               else if (count <= SWAP_MAP_MAX) {
-                       if (swap_overflow++ < 5)
-                               printk(KERN_WARNING
-                                      "swap_dup: swap entry overflow\n");
-                       count = SWAP_MAP_MAX;
-               } else
+               if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+                       count += usage;
+               else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
                         err = -EINVAL;
+               else if (swap_count_continued(p, offset, count))
+                       count = COUNT_CONTINUED;
+               else
+                       err = -ENOMEM;
         } else
                 err = -ENOENT;                  /* unused swap entry */
  
@@ -2153,9 +2129,13 @@ bad_file:
  /*
   * increase reference count of swap entry by 1.
   */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
  {
-       __swap_duplicate(entry, 1);
+       int err = 0;
+
+       while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+               err = add_swap_count_continuation(entry, GFP_ATOMIC);
+       return err;
  }
  
  /*
@@ -2222,3 +2202,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
         *offset = ++toff;
         return nr_pages? ++nr_pages: 0;
  }
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX; it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+       struct swap_info_struct *si;
+       struct page *head;
+       struct page *page;
+       struct page *list_page;
+       pgoff_t offset;
+       unsigned char count;
+
+       /*
+        * When debugging, it's easier to use __GFP_ZERO here; but it's better
+        * for latency not to zero a page while GFP_ATOMIC and holding locks.
+        */
+       page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+       si = swap_info_get(entry);
+       if (!si) {
+               /*
+                * An acceptable race has occurred since the failing
+                * __swap_duplicate(): the swap entry has been freed,
+                * perhaps even the whole swap_map cleared for swapoff.
+                */
+               goto outer;
+       }
+
+       offset = swp_offset(entry);
+       count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+       if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+               /*
+                * The higher the swap count, the more likely it is that tasks
+                * will race to add swap count continuation: we need to avoid
+                * over-provisioning.
+                */
+               goto out;
+       }
+
+       if (!page) {
+               spin_unlock(&swap_lock);
+               return -ENOMEM;
+       }
+
+       /*
+        * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+        * no architecture is using highmem pages for kernel pagetables: so it
+        * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+        */
+       head = vmalloc_to_page(si->swap_map + offset);
+       offset &= ~PAGE_MASK;
+
+       /*
+        * Page allocation does not initialize the page's lru field,
+        * but it does always reset its private field.
+        */
+       if (!page_private(head)) {
+               BUG_ON(count & COUNT_CONTINUED);
+               INIT_LIST_HEAD(&head->lru);
+               set_page_private(head, SWP_CONTINUED);
+               si->flags |= SWP_CONTINUED;
+       }
+
+       list_for_each_entry(list_page, &head->lru, lru) {
+               unsigned char *map;
+
+               /*
+                * If the previous map said no continuation, but we've found
+                * a continuation page, free our allocation and use this one.
+                */
+               if (!(count & COUNT_CONTINUED))
+                       goto out;
+
+               map = kmap_atomic(list_page, KM_USER0) + offset;
+               count = *map;
+               kunmap_atomic(map, KM_USER0);
+
+               /*
+                * If this continuation count now has some space in it,
+                * free our allocation and use this one.
+                */
+               if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+                       goto out;
+       }
+
+       list_add_tail(&page->lru, &head->lru);
+       page = NULL;                    /* now it's attached, don't free it */
+out:
+       spin_unlock(&swap_lock);
+outer:
+       if (page)
+               __free_page(page);
+       return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+                                pgoff_t offset, unsigned char count)
+{
+       struct page *head;
+       struct page *page;
+       unsigned char *map;
+
+       head = vmalloc_to_page(si->swap_map + offset);
+       if (page_private(head) != SWP_CONTINUED) {
+               BUG_ON(count & COUNT_CONTINUED);
+               return false;           /* need to add count continuation */
+       }
+
+       offset &= ~PAGE_MASK;
+       page = list_entry(head->lru.next, struct page, lru);
+       map = kmap_atomic(page, KM_USER0) + offset;
+
+       if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
+               goto init_map;          /* jump over SWAP_CONT_MAX checks */
+
+       if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+               /*
+                * Think of how you add 1 to 999
+                */
+               while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+                       kunmap_atomic(map, KM_USER0);
+                       page = list_entry(page->lru.next, struct page, lru);
+                       BUG_ON(page == head);
+                       map = kmap_atomic(page, KM_USER0) + offset;
+               }
+               if (*map == SWAP_CONT_MAX) {
+                       kunmap_atomic(map, KM_USER0);
+                       page = list_entry(page->lru.next, struct page, lru);
+                       if (page == head)
+                               return false;   /* add count continuation */
+                       map = kmap_atomic(page, KM_USER0) + offset;
+init_map:              *map = 0;               /* we didn't zero the page */
+               }
+               *map += 1;
+               kunmap_atomic(map, KM_USER0);
+               page = list_entry(page->lru.prev, struct page, lru);
+               while (page != head) {
+                       map = kmap_atomic(page, KM_USER0) + offset;
+                       *map = COUNT_CONTINUED;
+                       kunmap_atomic(map, KM_USER0);
+                       page = list_entry(page->lru.prev, struct page, lru);
+               }
+               return true;                    /* incremented */
+
+       } else {                                /* decrementing */
+               /*
+                * Think of how you subtract 1 from 1000
+                */
+               BUG_ON(count != COUNT_CONTINUED);
+               while (*map == COUNT_CONTINUED) {
+                       kunmap_atomic(map, KM_USER0);
+                       page = list_entry(page->lru.next, struct page, lru);
+                       BUG_ON(page == head);
+                       map = kmap_atomic(page, KM_USER0) + offset;
+               }
+               BUG_ON(*map == 0);
+               *map -= 1;
+               if (*map == 0)
+                       count = 0;
+               kunmap_atomic(map, KM_USER0);
+               page = list_entry(page->lru.prev, struct page, lru);
+               while (page != head) {
+                       map = kmap_atomic(page, KM_USER0) + offset;
+                       *map = SWAP_CONT_MAX | count;
+                       count = COUNT_CONTINUED;
+                       kunmap_atomic(map, KM_USER0);
+                       page = list_entry(page->lru.prev, struct page, lru);
+               }
+               return count == COUNT_CONTINUED;
+       }
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+       pgoff_t offset;
+
+       for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+               struct page *head;
+               head = vmalloc_to_page(si->swap_map + offset);
+               if (page_private(head)) {
+                       struct list_head *this, *next;
+                       list_for_each_safe(this, next, &head->lru) {
+                               struct page *page;
+                               page = list_entry(this, struct page, lru);
+                               list_del(this);
+                               __free_page(page);
+                       }
+               }
+       }
+}
author	Hugh Dickins <hugh.dickins@tiscali.co.uk>
	Tue, 15 Dec 2009 01:58:46 +0000 (17:58 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 15 Dec 2009 16:53:15 +0000 (08:53 -0800)
include/linux/swap.h		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history
mm/swapfile.c		patch \| blob \| history