Merge branch 'topic/misc' into for-linus
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cc5e7eb..6cd0a8f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
@@ -38,6 +39,7 @@
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -548,6 +550,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
        if (usage == SWAP_HAS_CACHE) {
                VM_BUG_ON(!has_cache);
                has_cache = 0;
+       } else if (count == SWAP_MAP_SHMEM) {
+               /*
+                * Or we could insist on shmem.c using a special
+                * swap_shmem_free() and free_shmem_swap_and_cache()...
+                */
+               count = 0;
        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
                if (count == COUNT_CONTINUED) {
                        if (swap_count_continued(p, offset, count))
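The new SWAP_MAP_SHMEM case relies on the one-byte swap_map encoding; the sketch below restates the relevant constants and helper for context (values as in this kernel's swap headers, not part of the change itself). Note that swap_count() masks only SWAP_HAS_CACHE, so SWAP_MAP_SHMEM (0xbf) survives the mask and can be compared against a swap count directly.

#define SWAP_MAP_MAX	0x3e	/* max duplication count, in first swap_map */
#define SWAP_MAP_BAD	0x3f	/* bad page slot, in first swap_map */
#define SWAP_HAS_CACHE	0x40	/* page is in swap cache */
#define COUNT_CONTINUED	0x80	/* full count in swap_count_continuation */
#define SWAP_MAP_SHMEM	0xbf	/* owned by shmem/tmpfs, never incremented */

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* 0xbf & ~0x40 == 0xbf: SHMEM survives */
}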
@@ -643,6 +651,8 @@ int reuse_swap_page(struct page *page)
        int count;
 
        VM_BUG_ON(!PageLocked(page));
+       if (unlikely(PageKsm(page)))
+               return 0;
        count = page_mapcount(page);
        if (count <= 1 && PageSwapCache(page)) {
                count += page_swapcount(page);
@@ -651,7 +661,7 @@ int reuse_swap_page(struct page *page)
                        SetPageDirty(page);
                }
        }
-       return count == 1;
+       return count <= 1;
 }
 
 /*
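Two fixes land in reuse_swap_page(): a KSM page is by definition shared across address spaces, so it must never be reused in place for a private write; and the relaxed final test treats a count of 0 (no mapper left and no swap reference) as exclusive ownership too. A hypothetical write-fault caller, to show what the return value governs (the function name and control flow are illustrative, not the exact do_wp_page code):

/* Illustrative caller: choose between in-place reuse and copy-on-write. */
static int wp_fault_reuse_sketch(struct page *old_page)
{
	int reuse = 0;

	if (PageAnon(old_page) && trylock_page(old_page)) {
		reuse = reuse_swap_page(old_page);	/* always 0 for KSM pages */
		unlock_page(old_page);
	}
	return reuse;	/* nonzero: make the pte writable without copying */
}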
@@ -713,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
        return p != NULL;
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_count_swap_user - count the users of a swap entry
+ * @ent: the swap entry to be checked
+ * @pagep: where to store the swap cache page of the entry, if any
+ *
+ * Returns the number of users of the swap entry.  The number is valid
+ * only for swaps of anonymous pages.
+ * If the entry is found in the swap cache, the page is stored to @pagep
+ * with its refcount incremented.
+ */
+int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
+{
+       struct page *page;
+       struct swap_info_struct *p;
+       int count = 0;
+
+       page = find_get_page(&swapper_space, ent.val);
+       if (page)
+               count += page_mapcount(page);
+       p = swap_info_get(ent);
+       if (p) {
+               count += swap_count(p->swap_map[swp_offset(ent)]);
+               spin_unlock(&swap_lock);
+       }
+
+       *pagep = page;
+       return count;
+}
+#endif
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Find the swap type that corresponds to given device (if any).
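mem_cgroup_count_swap_user() is intended for the memcg charge-moving path; a minimal caller sketch (the helper name is hypothetical, but the reference-dropping pattern is what the kernel-doc above requires):

/* Hypothetical helper: is this swap entry used by exactly one task? */
static bool swap_entry_is_private(swp_entry_t ent)
{
	struct page *page = NULL;
	int count = mem_cgroup_count_swap_user(ent, &page);

	if (page)
		page_cache_release(page);	/* drop find_get_page()'s reference */
	return count == 1;
}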
@@ -776,7 +817,7 @@ sector_t swapdev_block(int type, pgoff_t offset)
                return 0;
        if (!(swap_info[type]->flags & SWP_WRITEOK))
                return 0;
-       return map_swap_page(swp_entry(type, offset), &bdev);
+       return map_swap_entry(swp_entry(type, offset), &bdev);
 }
 
 /*
@@ -830,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                goto out;
        }
 
-       inc_mm_counter(vma->vm_mm, anon_rss);
+       dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        get_page(page);
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
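The old single anon_rss bump becomes two counter updates: swap-in moves one unit of accounting from MM_SWAPENTS (swap entries held in ptes) to MM_ANONPAGES (resident anonymous pages), so the sum of the two always tracks total anonymous usage. The swap-out path performs the mirror image; a sketch of that inverse, modeled on the try_to_unmap_one() side of the same series:

/* Illustrative inverse: accounting when an anonymous pte is swapped out. */
static inline void account_swap_out_sketch(struct mm_struct *mm)
{
	dec_mm_counter(mm, MM_ANONPAGES);	/* page leaves the resident set */
	inc_mm_counter(mm, MM_SWAPENTS);	/* pte now holds a swap entry */
}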
@@ -931,7 +973,7 @@ static int unuse_vma(struct vm_area_struct *vma,
        unsigned long addr, end, next;
        int ret;
 
-       if (page->mapping) {
+       if (page_anon_vma(page)) {
                addr = page_address_in_vma(page, vma);
                if (addr == -EFAULT)
                        return 0;
@@ -1031,7 +1073,6 @@ static int try_to_unuse(unsigned int type)
        swp_entry_t entry;
        unsigned int i = 0;
        int retval = 0;
-       int shmem;
 
        /*
         * When searching mms for an entry, a good strategy is to
@@ -1107,17 +1148,18 @@ static int try_to_unuse(unsigned int type)
 
                /*
                 * Remove all references to entry.
-                * Whenever we reach init_mm, there's no address space
-                * to search, but use it as a reminder to search shmem.
                 */
-               shmem = 0;
                swcount = *swap_map;
-               if (swap_count(swcount)) {
-                       if (start_mm == &init_mm)
-                               shmem = shmem_unuse(entry, page);
-                       else
-                               retval = unuse_mm(start_mm, entry, page);
+               if (swap_count(swcount) == SWAP_MAP_SHMEM) {
+                       retval = shmem_unuse(entry, page);
+                       /* page has already been unlocked and released */
+                       if (retval < 0)
+                               break;
+                       continue;
                }
+               if (swap_count(swcount) && start_mm != &init_mm)
+                       retval = unuse_mm(start_mm, entry, page);
+
                if (swap_count(*swap_map)) {
                        int set_start_mm = (*swap_map >= swcount);
                        struct list_head *p = &start_mm->mmlist;
@@ -1128,7 +1170,7 @@ static int try_to_unuse(unsigned int type)
                        atomic_inc(&new_start_mm->mm_users);
                        atomic_inc(&prev_mm->mm_users);
                        spin_lock(&mmlist_lock);
-                       while (swap_count(*swap_map) && !retval && !shmem &&
+                       while (swap_count(*swap_map) && !retval &&
                                        (p = p->next) != &start_mm->mmlist) {
                                mm = list_entry(p, struct mm_struct, mmlist);
                                if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1142,10 +1184,9 @@ static int try_to_unuse(unsigned int type)
                                swcount = *swap_map;
                                if (!swap_count(swcount)) /* any usage ? */
                                        ;
-                               else if (mm == &init_mm) {
+                               else if (mm == &init_mm)
                                        set_start_mm = 1;
-                                       shmem = shmem_unuse(entry, page);
-                               } else
+                               else
                                        retval = unuse_mm(mm, entry, page);
 
                                if (set_start_mm && *swap_map < swcount) {
@@ -1161,13 +1202,6 @@ static int try_to_unuse(unsigned int type)
                        mmput(start_mm);
                        start_mm = new_start_mm;
                }
-               if (shmem) {
-                       /* page has already been unlocked and released */
-                       if (shmem > 0)
-                               continue;
-                       retval = shmem;
-                       break;
-               }
                if (retval) {
                        unlock_page(page);
                        page_cache_release(page);
@@ -1186,6 +1220,12 @@ static int try_to_unuse(unsigned int type)
                 * read from disk into another page.  Splitting into two
                 * pages would be incorrect if swap supported "shared
                 * private" pages, but they are handled by tmpfs files.
+                *
+                * Given how unuse_vma() targets one particular offset
+                * in an anon_vma, once the anon_vma has been determined,
+                * this splitting happens to be just what is needed to
+                * handle the case where KSM pages have been swapped out:
+                * re-reading is unnecessarily slow, but we can fix that later on.
                 */
                if (swap_count(*swap_map) &&
                     PageDirty(page) && PageSwapCache(page)) {
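With SWAP_MAP_SHMEM available, try_to_unuse() no longer abuses init_mm as a reminder to search shmem: ownership is read straight from the swap count. Condensed, the new per-entry dispatch amounts to the following (a simplified restatement of the hunks above, error handling elided):

unsigned char sc = swap_count(*swap_map);

if (sc == SWAP_MAP_SHMEM)
	retval = shmem_unuse(entry, page);	/* tmpfs-owned: no mm walk needed */
else if (sc && start_mm != &init_mm)
	retval = unuse_mm(start_mm, entry, page);	/* then scan the mmlist */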
@@ -1251,10 +1291,11 @@ static void drain_mmlist(void)
 
 /*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset `offset'.  Note that the type of this function
- * is sector_t, but it returns page offset into the bdev, not sector offset.
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
  */
-sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev)
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
        struct swap_info_struct *sis;
        struct swap_extent *start_se;
@@ -1283,6 +1324,16 @@ sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev)
 }
 
 /*
+ * Returns the page offset into bdev for the specified page's swap entry.
+ */
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
+{
+       swp_entry_t entry;
+       entry.val = page_private(page);
+       return map_swap_entry(entry, bdev);
+}
+
+/*
  * Free all of a swapdev's extent information
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
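The exported map_swap_page() now takes the page itself and reads the swap entry from page_private(). A hypothetical caller converting the returned page offset into a 512-byte sector number (the function name is illustrative):

static sector_t swap_page_sector_sketch(struct page *page)
{
	struct block_device *bdev;
	sector_t page_off = map_swap_page(page, &bdev);

	return page_off << (PAGE_SHIFT - 9);	/* page offset -> sector number */
}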
@@ -1740,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        unsigned int type;
        int i, prev;
        int error;
-       union swap_header *swap_header = NULL;
-       unsigned int nr_good_pages = 0;
+       union swap_header *swap_header;
+       unsigned int nr_good_pages;
        int nr_extents = 0;
        sector_t span;
-       unsigned long maxpages = 1;
+       unsigned long maxpages;
        unsigned long swapfilepages;
        unsigned char *swap_map = NULL;
        struct page *page = NULL;
@@ -1903,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         * swap pte.
         */
        maxpages = swp_offset(pte_to_swp_entry(
-                       swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
-       if (maxpages > swap_header->info.last_page)
-               maxpages = swap_header->info.last_page;
+                       swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+       if (maxpages > swap_header->info.last_page) {
+               maxpages = swap_header->info.last_page + 1;
+               /* p->max is an unsigned int: don't overflow it */
+               if ((unsigned int)maxpages == 0)
+                       maxpages = UINT_MAX;
+       }
        p->highest_bit = maxpages - 1;
 
        error = -EINVAL;
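After the off-by-one fix, maxpages consistently means "one more than the highest usable offset". For illustration: if the architecture's swap pte can encode offsets up to 0xfffff, maxpages becomes 0x100000; and a header last_page of 0xffffffff would make last_page + 1 wrap an unsigned int to 0, which is what the UINT_MAX clamp catches. A stand-alone restatement of the clamp (user-space sketch):

#include <limits.h>

unsigned long clamp_maxpages(unsigned long encodable, unsigned int last_page)
{
	unsigned long maxpages = encodable;	/* highest encodable offset + 1 */

	if (maxpages > last_page) {
		maxpages = (unsigned long)last_page + 1;
		/* p->max is an unsigned int: don't overflow it */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	return maxpages;
}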
@@ -1929,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        }
 
        memset(swap_map, 0, maxpages);
+       nr_good_pages = maxpages - 1;   /* omit header page */
+
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
-               int page_nr = swap_header->info.badpages[i];
-               if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
+               unsigned int page_nr = swap_header->info.badpages[i];
+               if (page_nr == 0 || page_nr > swap_header->info.last_page) {
                        error = -EINVAL;
                        goto bad_swap;
                }
-               swap_map[page_nr] = SWAP_MAP_BAD;
+               if (page_nr < maxpages) {
+                       swap_map[page_nr] = SWAP_MAP_BAD;
+                       nr_good_pages--;
+               }
        }
 
        error = swap_cgroup_swapon(type, maxpages);
        if (error)
                goto bad_swap;
 
-       nr_good_pages = swap_header->info.last_page -
-                       swap_header->info.nr_badpages -
-                       1 /* header page */;
-
        if (nr_good_pages) {
                swap_map[0] = SWAP_MAP_BAD;
                p->max = maxpages;
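Worked example of the new bad-page handling: with last_page = 1000 in the header but only maxpages = 500 offsets encodable by the architecture, a bad page at offset 700 still passes the header validity check (0 < 700 <= 1000) but lies outside the usable range, so it is neither marked SWAP_MAP_BAD nor subtracted from nr_good_pages; only reachable slots affect the count. This also keeps nr_good_pages correct when maxpages truncates the device, which the old last_page-based subtraction got wrong.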
@@ -2127,7 +2183,20 @@ bad_file:
 }
 
 /*
- * increase reference count of swap entry by 1.
+ * Help swapoff by noting that a swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+       __swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
+/*
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated.  Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
  */
 int swap_duplicate(swp_entry_t entry)
 {