Merge branch 'topic/misc' into for-linus
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cc5e7eb..6cd0a8f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
@@ -38,6 +39,7 @@
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -548,6 +550,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
        if (usage == SWAP_HAS_CACHE) {
                VM_BUG_ON(!has_cache);
                has_cache = 0;
+       } else if (count == SWAP_MAP_SHMEM) {
+               /*
+                * Or we could insist on shmem.c using a special
+                * swap_shmem_free() and free_shmem_swap_and_cache()...
+                */
+               count = 0;
        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
                if (count == COUNT_CONTINUED) {
                        if (swap_count_continued(p, offset, count))
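The new SWAP_MAP_SHMEM case relies on the one-byte swap_map encoding; the sketch below restates the relevant constants and helper for context (values as in this kernel's swap headers, not part of the change itself). Note that swap_count() masks only SWAP_HAS_CACHE, so SWAP_MAP_SHMEM (0xbf) survives the mask and can be compared against a swap count directly.

#define SWAP_MAP_MAX	0x3e	/* max duplication count, in first swap_map */
#define SWAP_MAP_BAD	0x3f	/* bad page slot, in first swap_map */
#define SWAP_HAS_CACHE	0x40	/* page is in swap cache */
#define COUNT_CONTINUED	0x80	/* full count in swap_count_continuation */
#define SWAP_MAP_SHMEM	0xbf	/* owned by shmem/tmpfs, never incremented */

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* 0xbf & ~0x40 == 0xbf: SHMEM survives */
}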
@@ -643,6 +651,8 @@ int reuse_swap_page(struct page *page)
        int count;
 
        VM_BUG_ON(!PageLocked(page));
+       if (unlikely(PageKsm(page)))
+               return 0;
        count = page_mapcount(page);
        if (count <= 1 && PageSwapCache(page)) {
                count += page_swapcount(page);
@@ -651,7 +661,7 @@ int reuse_swap_page(struct page *page)
                        SetPageDirty(page);
                }
        }
-       return count == 1;
+       return count <= 1;
 }
 
 /*
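Two fixes land in reuse_swap_page(): a KSM page is by definition shared across address spaces, so it must never be reused in place for a private write; and the relaxed final test treats a count of 0 (no mapper left and no swap reference) as exclusive ownership too. A hypothetical write-fault caller, to show what the return value governs (the function name and control flow are illustrative, not the exact do_wp_page code):

/* Illustrative caller: choose between in-place reuse and copy-on-write. */
static int wp_fault_reuse_sketch(struct page *old_page)
{
	int reuse = 0;

	if (PageAnon(old_page) && trylock_page(old_page)) {
		reuse = reuse_swap_page(old_page);	/* always 0 for KSM pages */
		unlock_page(old_page);
	}
	return reuse;	/* nonzero: make the pte writable without copying */
}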
@@ -713,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
        return p != NULL;
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_count_swap_user - count the users of a swap entry
+ * @ent: the swap entry to be checked
+ * @pagep: where to store the swap cache page of the entry, if any
+ *
+ * Returns the number of users of the swap entry.  The number is valid
+ * only for swaps of anonymous pages.
+ * If the entry is found in the swap cache, the page is stored to @pagep
+ * with its refcount incremented.
+ */
+int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
+{
+       struct page *page;
+       struct swap_info_struct *p;
+       int count = 0;
+
+       page = find_get_page(&swapper_space, ent.val);
+       if (page)
+               count += page_mapcount(page);
+       p = swap_info_get(ent);
+       if (p) {
+               count += swap_count(p->swap_map[swp_offset(ent)]);
+               spin_unlock(&swap_lock);
+       }
+
+       *pagep = page;
+       return count;
+}
+#endif
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Find the swap type that corresponds to given device (if any).
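mem_cgroup_count_swap_user() is intended for the memcg charge-moving path; a minimal caller sketch (the helper name is hypothetical, but the reference-dropping pattern is what the kernel-doc above requires):

/* Hypothetical helper: is this swap entry used by exactly one task? */
static bool swap_entry_is_private(swp_entry_t ent)
{
	struct page *page = NULL;
	int count = mem_cgroup_count_swap_user(ent, &page);

	if (page)
		page_cache_release(page);	/* drop find_get_page()'s reference */
	return count == 1;
}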
@@ -776,7 +817,7 @@ sector_t swapdev_block(int type, pgoff_t offset)
                return 0;
        if (!(swap_info[type]->flags & SWP_WRITEOK))
                return 0;
-       return map_swap_page(swp_entry(type, offset), &bdev);
+       return map_swap_entry(swp_entry(type, offset), &bdev);
 }
 
 /*
@@ -830,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                goto out;
        }
 
-       inc_mm_counter(vma->vm_mm, anon_rss);
+       dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        get_page(page);
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
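The old single anon_rss bump becomes two counter updates: swap-in moves one unit of accounting from MM_SWAPENTS (swap entries held in ptes) to MM_ANONPAGES (resident anonymous pages), so the sum of the two always tracks total anonymous usage. The swap-out path performs the mirror image; a sketch of that inverse, modeled on the try_to_unmap_one() side of the same series:

/* Illustrative inverse: accounting when an anonymous pte is swapped out. */
static inline void account_swap_out_sketch(struct mm_struct *mm)
{
	dec_mm_counter(mm, MM_ANONPAGES);	/* page leaves the resident set */
	inc_mm_counter(mm, MM_SWAPENTS);	/* pte now holds a swap entry */
}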
@@ -931,7 +973,7 @@ static int unuse_vma(struct vm_area_struct *vma,
        unsigned long addr, end, next;
        int ret;
 
-       if (page->mapping) {
+       if (page_anon_vma(page)) {
                addr = page_address_in_vma(page, vma);
                if (addr == -EFAULT)
                        return 0;
@@ -1031,7 +1073,6 @@ static int try_to_unuse(unsigned int type)
        swp_entry_t entry;
        unsigned int i = 0;
        int retval = 0;
-       int shmem;
 
        /*
         * When searching mms for an entry, a good strategy is to
@@ -1107,17 +1148,18 @@ static int try_to_unuse(unsigned int type)
 
                /*
                 * Remove all references to entry.
-                * Whenever we reach init_mm, there's no address space
-                * to search, but use it as a reminder to search shmem.
                 */
-               shmem = 0;
                swcount = *swap_map;
-               if (swap_count(swcount)) {
-                       if (start_mm == &init_mm)
-                               shmem = shmem_unuse(entry, page);
-                       else
-                               retval = unuse_mm(start_mm, entry, page);
+               if (swap_count(swcount) == SWAP_MAP_SHMEM) {
+                       retval = shmem_unuse(entry, page);
+                       /* page has already been unlocked and released */
+                       if (retval < 0)
+                               break;
+                       continue;
                }
+               if (swap_count(swcount) && start_mm != &init_mm)
+                       retval = unuse_mm(start_mm, entry, page);
+
                if (swap_count(*swap_map)) {
                        int set_start_mm = (*swap_map >= swcount);
                        struct list_head *p = &start_mm->mmlist;
@@ -1128,7 +1170,7 @@ static int try_to_unuse(unsigned int type)
                        atomic_inc(&new_start_mm->mm_users);
                        atomic_inc(&prev_mm->mm_users);
                        spin_lock(&mmlist_lock);
-                       while (swap_count(*swap_map) && !retval && !shmem &&
+                       while (swap_count(*swap_map) && !retval &&
                                        (p = p->next) != &start_mm->mmlist) {
                                mm = list_entry(p, struct mm_struct, mmlist);
                                if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1142,10 +1184,9 @@ static int try_to_unuse(unsigned int type)
                                swcount = *swap_map;
                                if (!swap_count(swcount)) /* any usage ? */
                                        ;
-                               else if (mm == &init_mm) {
+                               else if (mm == &init_mm)
                                        set_start_mm = 1;
-                                       shmem = shmem_unuse(entry, page);
-                               } else
+                               else
                                        retval = unuse_mm(mm, entry, page);
 
                                if (set_start_mm && *swap_map < swcount) {
@@ -1161,13 +1202,6 @@ static int try_to_unuse(unsigned int type)
                        mmput(start_mm);
                        start_mm = new_start_mm;
                }
-               if (shmem) {
-                       /* page has already been unlocked and released */
-                       if (shmem > 0)
-                               continue;
-                       retval = shmem;
-                       break;
-               }
                if (retval) {
                        unlock_page(page);
                        page_cache_release(page);
@@ -1186,6 +1220,12 @@ static int try_to_unuse(unsigned int type)
                 * read from disk into another page.  Splitting into two
                 * pages would be incorrect if swap supported "shared
                 * private" pages, but they are handled by tmpfs files.
+                *
+                * Given how unuse_vma() targets one particular offset
+                * in an anon_vma, once the anon_vma has been determined,
+                * this splitting happens to be just what is needed to
+                * handle the case where KSM pages have been swapped out:
+                * re-reading is unnecessarily slow, but we can fix that later on.
                 */
                if (swap_count(*swap_map) &&
                     PageDirty(page) && PageSwapCache(page)) {
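With SWAP_MAP_SHMEM available, try_to_unuse() no longer abuses init_mm as a reminder to search shmem: ownership is read straight from the swap count. Condensed, the new per-entry dispatch amounts to the following (a simplified restatement of the hunks above, error handling elided):

unsigned char sc = swap_count(*swap_map);

if (sc == SWAP_MAP_SHMEM)
	retval = shmem_unuse(entry, page);	/* tmpfs-owned: no mm walk needed */
else if (sc && start_mm != &init_mm)
	retval = unuse_mm(start_mm, entry, page);	/* then scan the mmlist */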
@@ -1251,10 +1291,11 @@ static void drain_mmlist(void)
 
 /*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset `offset'.  Note that the type of this function
- * is sector_t, but it returns page offset into the bdev, not sector offset.
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
  */
-sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev)
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
        struct swap_info_struct *sis;
        struct swap_extent *start_se;
@@ -1283,6 +1324,16 @@ sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev)
 }
 
 /*
+ * Returns the page offset into bdev for the specified page's swap entry.
+ */
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
+{
+       swp_entry_t entry;
+       entry.val = page_private(page);
+       return map_swap_entry(entry, bdev);
+}
+
+/*
  * Free all of a swapdev's extent information
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
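The exported map_swap_page() now takes the page itself and reads the swap entry from page_private(). A hypothetical caller converting the returned page offset into a 512-byte sector number (the function name is illustrative):

static sector_t swap_page_sector_sketch(struct page *page)
{
	struct block_device *bdev;
	sector_t page_off = map_swap_page(page, &bdev);

	return page_off << (PAGE_SHIFT - 9);	/* page offset -> sector number */
}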
@@ -1740,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        unsigned int type;
        int i, prev;
        int error;
-       union swap_header *swap_header = NULL;
-       unsigned int nr_good_pages = 0;
+       union swap_header *swap_header;
+       unsigned int nr_good_pages;
        int nr_extents = 0;
        sector_t span;
-       unsigned long maxpages = 1;
+       unsigned long maxpages;
        unsigned long swapfilepages;
        unsigned char *swap_map = NULL;
        struct page *page = NULL;
@@ -1903,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         * swap pte.
         */
        maxpages = swp_offset(pte_to_swp_entry(
-                       swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
-       if (maxpages > swap_header->info.last_page)
-               maxpages = swap_header->info.last_page;
+                       swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+       if (maxpages > swap_header->info.last_page) {
+               maxpages = swap_header->info.last_page + 1;
+               /* p->max is an unsigned int: don't overflow it */
+               if ((unsigned int)maxpages == 0)
+                       maxpages = UINT_MAX;
+       }
        p->highest_bit = maxpages - 1;
 
        error = -EINVAL;
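After the off-by-one fix, maxpages consistently means "one more than the highest usable offset". For illustration: if the architecture's swap pte can encode offsets up to 0xfffff, maxpages becomes 0x100000; and a header last_page of 0xffffffff would make last_page + 1 wrap an unsigned int to 0, which is what the UINT_MAX clamp catches. A stand-alone restatement of the clamp (user-space sketch):

#include <limits.h>

unsigned long clamp_maxpages(unsigned long encodable, unsigned int last_page)
{
	unsigned long maxpages = encodable;	/* highest encodable offset + 1 */

	if (maxpages > last_page) {
		maxpages = (unsigned long)last_page + 1;
		/* p->max is an unsigned int: don't overflow it */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	return maxpages;
}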
@@ -1929,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        }
 
        memset(swap_map, 0, maxpages);
+       nr_good_pages = maxpages - 1;   /* omit header page */
+
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
-               int page_nr = swap_header->info.badpages[i];
-               if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
+               unsigned int page_nr = swap_header->info.badpages[i];
+               if (page_nr == 0 || page_nr > swap_header->info.last_page) {
                        error = -EINVAL;
                        goto bad_swap;
                }
-               swap_map[page_nr] = SWAP_MAP_BAD;
+               if (page_nr < maxpages) {
+                       swap_map[page_nr] = SWAP_MAP_BAD;
+                       nr_good_pages--;
+               }
        }
 
        error = swap_cgroup_swapon(type, maxpages);
        if (error)
                goto bad_swap;
 
-       nr_good_pages = swap_header->info.last_page -
-                       swap_header->info.nr_badpages -
-                       1 /* header page */;
-
        if (nr_good_pages) {
                swap_map[0] = SWAP_MAP_BAD;
                p->max = maxpages;
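Worked example of the new bad-page handling: with last_page = 1000 in the header but only maxpages = 500 offsets encodable by the architecture, a bad page at offset 700 still passes the header validity check (0 < 700 <= 1000) but lies outside the usable range, so it is neither marked SWAP_MAP_BAD nor subtracted from nr_good_pages; only reachable slots affect the count. This also keeps nr_good_pages correct when maxpages truncates the device, which the old last_page-based subtraction got wrong.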
@@ -2127,7 +2183,20 @@ bad_file:
 }
 
 /*
- * increase reference count of swap entry by 1.
+ * Help swapoff by noting that a swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+       __swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
+/*
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated.  Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
  */
 int swap_duplicate(swp_entry_t entry)
 {