#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
+#include <linux/page_cgroup.h>
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
static DEFINE_MUTEX(swapon_mutex);
+/* For reference count accounting in swap_map */
+/* enum for swap_map[] handling. internal use only */
+enum {
+ SWAP_MAP = 0, /* ops for reference from swap users */
+ SWAP_CACHE, /* ops for reference from swap cache */
+};
+
+static inline int swap_count(unsigned short ent)
+{
+ return ent & SWAP_COUNT_MASK;
+}
+
+static inline bool swap_has_cache(unsigned short ent)
+{
+ return !!(ent & SWAP_HAS_CACHE);
+}
+
+static inline unsigned short encode_swapmap(int count, bool has_cache)
+{
+ unsigned short ret = count;
+
+ if (has_cache)
+ return SWAP_HAS_CACHE | ret;
+ return ret;
+}
+
+/* returnes 1 if swap entry is freed */
+static int
+__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+{
+ int type = si - swap_info;
+ swp_entry_t entry = swp_entry(type, offset);
+ struct page *page;
+ int ret = 0;
+
+ page = find_get_page(&swapper_space, entry.val);
+ if (!page)
+ return 0;
+ /*
+ * This function is called from scan_swap_map() and it's called
+ * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
+ * We have to use trylock for avoiding deadlock. This is a special
+ * case and you should use try_to_free_swap() with explicit lock_page()
+ * in usual operations.
+ */
+ if (trylock_page(page)) {
+ ret = try_to_free_swap(page);
+ unlock_page(page);
+ }
+ page_cache_release(page);
+ return ret;
+}
+
/*
* We need this because the bdev->unplug_fn can sleep and we cannot
* hold swap_lock while calling the unplug_fn. And swap_lock
list_for_each_entry(se, &si->extent_list, list) {
sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
- pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);
+ sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
if (se->start_page == 0) {
/* Do not discard the swap header page! */
}
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL);
+ nr_blocks, GFP_KERNEL,
+ DISCARD_FL_BARRIER);
if (err)
break;
start_page < se->start_page + se->nr_pages) {
pgoff_t offset = start_page - se->start_page;
sector_t start_block = se->start_block + offset;
- pgoff_t nr_blocks = se->nr_pages - offset;
+ sector_t nr_blocks = se->nr_pages - offset;
if (nr_blocks > nr_pages)
nr_blocks = nr_pages;
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO))
+ nr_blocks, GFP_NOIO,
+ DISCARD_FL_BARRIER))
break;
}
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
-static inline unsigned long scan_swap_map(struct swap_info_struct *si)
+static inline unsigned long scan_swap_map(struct swap_info_struct *si,
+ int cache)
{
unsigned long offset;
unsigned long scan_base;
goto no_page;
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
+
+ /* reuse swap entry of cache-only swap if not busy. */
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ int swap_was_freed;
+ spin_unlock(&swap_lock);
+ swap_was_freed = __try_to_reclaim_swap(si, offset);
+ spin_lock(&swap_lock);
+ /* entry was freed successfully, try to use this again */
+ if (swap_was_freed)
+ goto checks;
+ goto scan; /* check next one */
+ }
+
if (si->swap_map[offset])
goto scan;
si->lowest_bit = si->max;
si->highest_bit = 0;
}
- si->swap_map[offset] = 1;
+ if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
+ si->swap_map[offset] = encode_swapmap(0, true);
+ else /* at suspend */
+ si->swap_map[offset] = encode_swapmap(1, false);
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;
spin_lock(&swap_lock);
goto checks;
}
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ spin_lock(&swap_lock);
+ goto checks;
+ }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
spin_lock(&swap_lock);
goto checks;
}
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ spin_lock(&swap_lock);
+ goto checks;
+ }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
continue;
swap_list.next = next;
- offset = scan_swap_map(si);
+ /* This is called for allocating swap entry for cache */
+ offset = scan_swap_map(si, SWAP_CACHE);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
return (swp_entry_t) {0};
}
+/* The only caller of this function is now susupend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
si = swap_info + type;
if (si->flags & SWP_WRITEOK) {
nr_swap_pages--;
- offset = scan_swap_map(si);
+ /* This is called for allocating swap entry, not cache */
+ offset = scan_swap_map(si, SWAP_MAP);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
return NULL;
}
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+static int swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t ent, int cache)
{
- int count = p->swap_map[offset];
-
- if (count < SWAP_MAP_MAX) {
- count--;
- p->swap_map[offset] = count;
- if (!count) {
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
- if (p->prio > swap_info[swap_list.next].prio)
- swap_list.next = p - swap_info;
- nr_swap_pages++;
- p->inuse_pages--;
+ unsigned long offset = swp_offset(ent);
+ int count = swap_count(p->swap_map[offset]);
+ bool has_cache;
+
+ has_cache = swap_has_cache(p->swap_map[offset]);
+
+ if (cache == SWAP_MAP) { /* dropping usage count of swap */
+ if (count < SWAP_MAP_MAX) {
+ count--;
+ p->swap_map[offset] = encode_swapmap(count, has_cache);
}
+ } else { /* dropping swap cache flag */
+ VM_BUG_ON(!has_cache);
+ p->swap_map[offset] = encode_swapmap(count, false);
+
+ }
+ /* return code. */
+ count = p->swap_map[offset];
+ /* free if no reference */
+ if (!count) {
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+ p->highest_bit = offset;
+ if (p->prio > swap_info[swap_list.next].prio)
+ swap_list.next = p - swap_info;
+ nr_swap_pages++;
+ p->inuse_pages--;
}
+ if (!swap_count(count))
+ mem_cgroup_uncharge_swap(ent);
return count;
}
p = swap_info_get(entry);
if (p) {
- swap_entry_free(p, swp_offset(entry));
+ swap_entry_free(p, entry, SWAP_MAP);
spin_unlock(&swap_lock);
}
}
/*
+ * Called after dropping swapcache to decrease refcnt to swap entries.
+ */
+void swapcache_free(swp_entry_t entry, struct page *page)
+{
+ struct swap_info_struct *p;
+ int ret;
+
+ p = swap_info_get(entry);
+ if (p) {
+ ret = swap_entry_free(p, entry, SWAP_CACHE);
+ if (page) {
+ bool swapout;
+ if (ret)
+ swapout = true; /* the end of swap out */
+ else
+ swapout = false; /* no more swap users! */
+ mem_cgroup_uncharge_swapcache(page, entry, swapout);
+ }
+ spin_unlock(&swap_lock);
+ }
+ return;
+}
+
+/*
* How many references to page are currently swapped out?
*/
static inline int page_swapcount(struct page *page)
entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
- /* Subtract the 1 for the swap cache itself */
- count = p->swap_map[swp_offset(entry)] - 1;
+ count = swap_count(p->swap_map[swp_offset(entry)]);
spin_unlock(&swap_lock);
}
return count;
* Free the swap entry like above, but also try to
* free the page cache entry if it is the last user.
*/
-void free_swap_and_cache(swp_entry_t entry)
+int free_swap_and_cache(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
struct page *page = NULL;
- if (is_migration_entry(entry))
- return;
+ if (non_swap_entry(entry))
+ return 1;
p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, swp_offset(entry)) == 1) {
+ if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
page = find_get_page(&swapper_space, entry.val);
if (page && !trylock_page(page)) {
page_cache_release(page);
unlock_page(page);
page_cache_release(page);
}
+ return p != NULL;
}
#ifdef CONFIG_HIBERNATION
if (!bdev) {
if (bdev_p)
- *bdev_p = sis->bdev;
+ *bdev_p = bdgrab(sis->bdev);
spin_unlock(&swap_lock);
return i;
struct swap_extent, list);
if (se->start_block == offset) {
if (bdev_p)
- *bdev_p = sis->bdev;
+ *bdev_p = bdgrab(sis->bdev);
spin_unlock(&swap_lock);
bdput(bdev);
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
+ struct mem_cgroup *ptr = NULL;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
- if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
ret = -ENOMEM;
+ goto out_nolock;
+ }
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
if (ret > 0)
- mem_cgroup_uncharge_page(page);
+ mem_cgroup_cancel_charge_swapin(ptr);
ret = 0;
goto out;
}
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
page_add_anon_rmap(page, vma, addr);
+ mem_cgroup_commit_charge_swapin(page, ptr);
swap_free(entry);
/*
* Move the page to the active list so it is not
activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
+out_nolock:
return ret;
}
i = 1;
}
count = si->swap_map[i];
- if (count && count != SWAP_MAP_BAD)
+ if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
return i;
*/
shmem = 0;
swcount = *swap_map;
- if (swcount > 1) {
+ if (swap_count(swcount)) {
if (start_mm == &init_mm)
shmem = shmem_unuse(entry, page);
else
retval = unuse_mm(start_mm, entry, page);
}
- if (*swap_map > 1) {
+ if (swap_count(*swap_map)) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
struct mm_struct *new_start_mm = start_mm;
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (*swap_map > 1 && !retval && !shmem &&
+ while (swap_count(*swap_map) && !retval && !shmem &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
if (!atomic_inc_not_zero(&mm->mm_users))
cond_resched();
swcount = *swap_map;
- if (swcount <= 1)
+ if (!swap_count(swcount)) /* any usage ? */
;
else if (mm == &init_mm) {
set_start_mm = 1;
shmem = shmem_unuse(entry, page);
} else
retval = unuse_mm(mm, entry, page);
- if (set_start_mm && *swap_map < swcount) {
+
+ if (set_start_mm &&
+ swap_count(*swap_map) < swcount) {
mmput(new_start_mm);
atomic_inc(&mm->mm_users);
new_start_mm = mm;
}
/*
- * How could swap count reach 0x7fff when the maximum
- * pid is 0x7fff, and there's no way to repeat a swap
- * page within an mm (except in shmem, where it's the
- * shared object which takes the reference count)?
- * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
- *
+ * How could swap count reach 0x7ffe ?
+ * There's no way to repeat a swap page within an mm
+ * (except in shmem, where it's the shared object which takes
+ * the reference count)?
+ * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
+ * short is too small....)
* If that's wrong, then we should worry more about
* exit_mmap() and do_munmap() cases described above:
* we might be resetting SWAP_MAP_MAX too early here.
* We know "Undead"s can happen, they're okay, so don't
* report them; but do report if we reset SWAP_MAP_MAX.
*/
- if (*swap_map == SWAP_MAP_MAX) {
+ /* We might release the lock_page() in unuse_mm(). */
+ if (!PageSwapCache(page) || page_private(page) != entry.val)
+ goto retry;
+
+ if (swap_count(*swap_map) == SWAP_MAP_MAX) {
spin_lock(&swap_lock);
- *swap_map = 1;
+ *swap_map = encode_swapmap(0, true);
spin_unlock(&swap_lock);
reset_overflow = 1;
}
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
*/
- if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+ if (swap_count(*swap_map) &&
+ PageDirty(page) && PageSwapCache(page)) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
* mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
+retry:
unlock_page(page);
page_cache_release(page);
return ret;
}
-#if 0 /* We don't need this yet */
-#include <linux/backing-dev.h>
-int page_queue_congested(struct page *page)
-{
- struct backing_dev_info *bdi;
-
- VM_BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
-
- if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page_private(page) };
- struct swap_info_struct *sis;
-
- sis = get_swap_info_struct(swp_type(entry));
- bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
- } else
- bdi = page->mapping->backing_dev_info;
- return bdi_write_congested(bdi);
-}
-#endif
-
-asmlinkage long sys_swapoff(const char __user * specialfile)
+SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct * p = NULL;
unsigned short *swap_map;
p->flags &= ~SWP_WRITEOK;
spin_unlock(&swap_lock);
- current->flags |= PF_SWAPOFF;
+ current->flags |= PF_OOM_ORIGIN;
err = try_to_unuse(type);
- current->flags &= ~PF_SWAPOFF;
+ current->flags &= ~PF_OOM_ORIGIN;
if (err) {
/* re-insert swap space back into swap_list */
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ /* Destroy swap account informatin */
+ swap_cgroup_swapoff(type);
+
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
*
* The swapon system call
*/
-asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct * p;
char *name = NULL;
}
swap_map[page_nr] = SWAP_MAP_BAD;
}
+
+ error = swap_cgroup_swapon(type, maxpages);
+ if (error)
+ goto bad_swap;
+
nr_good_pages = swap_header->info.last_page -
swap_header->info.nr_badpages -
1 /* header page */;
goto bad_swap;
}
- if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
- p->flags |= SWP_SOLIDSTATE;
- srandom32((u32)get_seconds());
- p->cluster_next = 1 + (random32() % p->highest_bit);
+ if (p->bdev) {
+ if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next = 1 + (random32() % p->highest_bit);
+ }
+ if (discard_swap(p) == 0)
+ p->flags |= SWP_DISCARDABLE;
}
- if (discard_swap(p) == 0)
- p->flags |= SWP_DISCARDABLE;
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
bd_release(bdev);
}
destroy_swap_extents(p);
+ swap_cgroup_swapoff(type);
bad_swap_2:
spin_lock(&swap_lock);
p->swap_file = NULL;
*
* Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
* "permanent", but will be reclaimed by the next swapoff.
+ * Returns error code in following case.
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
*/
-int swap_duplicate(swp_entry_t entry)
+static int __swap_duplicate(swp_entry_t entry, bool cache)
{
struct swap_info_struct * p;
unsigned long offset, type;
- int result = 0;
+ int result = -EINVAL;
+ int count;
+ bool has_cache;
- if (is_migration_entry(entry))
- return 1;
+ if (non_swap_entry(entry))
+ return -EINVAL;
type = swp_type(entry);
if (type >= nr_swapfiles)
offset = swp_offset(entry);
spin_lock(&swap_lock);
- if (offset < p->max && p->swap_map[offset]) {
- if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
- p->swap_map[offset]++;
- result = 1;
- } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
+
+ if (unlikely(offset >= p->max))
+ goto unlock_out;
+
+ count = swap_count(p->swap_map[offset]);
+ has_cache = swap_has_cache(p->swap_map[offset]);
+
+ if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+
+ /* set SWAP_HAS_CACHE if there is no cache and entry is used */
+ if (!has_cache && count) {
+ p->swap_map[offset] = encode_swapmap(count, true);
+ result = 0;
+ } else if (has_cache) /* someone added cache */
+ result = -EEXIST;
+ else if (!count) /* no users */
+ result = -ENOENT;
+
+ } else if (count || has_cache) {
+ if (count < SWAP_MAP_MAX - 1) {
+ p->swap_map[offset] = encode_swapmap(count + 1,
+ has_cache);
+ result = 0;
+ } else if (count <= SWAP_MAP_MAX) {
if (swap_overflow++ < 5)
- printk(KERN_WARNING "swap_dup: swap entry overflow\n");
- p->swap_map[offset] = SWAP_MAP_MAX;
- result = 1;
+ printk(KERN_WARNING
+ "swap_dup: swap entry overflow\n");
+ p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
+ has_cache);
+ result = 0;
}
- }
+ } else
+ result = -ENOENT; /* unused swap entry */
+unlock_out:
spin_unlock(&swap_lock);
out:
return result;
printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
+/*
+ * increase reference count of swap entry by 1.
+ */
+void swap_duplicate(swp_entry_t entry)
+{
+ __swap_duplicate(entry, SWAP_MAP);
+}
+
+/*
+ * @entry: swap entry for which we allocate swap cache.
+ *
+ * Called when allocating swap cache for exising swap entry,
+ * This can return error codes. Returns 0 at success.
+ * -EBUSY means there is a swap cache.
+ * Note: return code is different from swap_duplicate().
+ */
+int swapcache_prepare(swp_entry_t entry)
+{
+ return __swap_duplicate(entry, SWAP_CACHE);
+}
+
struct swap_info_struct *
get_swap_info_struct(unsigned type)
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
- if (si->swap_map[toff] == SWAP_MAP_BAD)
+ if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
/* Count contiguous allocated slots below our target */
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
- if (si->swap_map[toff] == SWAP_MAP_BAD)
+ if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
spin_unlock(&swap_lock);