X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fswapfile.c;h=312fafe0ab6ed4815ac02da3f712aca18bacbbad;hb=0a0dd05dd7e1a800241888cbf515bf8d3dc2e59c;hp=c46c83d6aab028a24d69fa9bb7d9e2d8652b45b8;hpb=886bb7e9c3ed0bb3e4a2b1f336d8c6a6e5a4b782;p=safe%2Fjmp%2Flinux-2.6

diff --git a/mm/swapfile.c b/mm/swapfile.c
index c46c83d..312fafe 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
 #include <linux/namei.h>
 #include <linux/shm.h>
 #include <linux/blkdev.h>
+#include <linux/random.h>
 #include <linux/writeback.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -32,6 +33,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
+#include <linux/page_cgroup.h>
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -84,13 +86,94 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	up_read(&swap_unplug_sem);
 }
 
+/*
+ * swapon tells the device that all the old swap contents can be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static int discard_swap(struct swap_info_struct *si)
+{
+	struct swap_extent *se;
+	int err = 0;
+
+	list_for_each_entry(se, &si->extent_list, list) {
+		sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
+		sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+
+		if (se->start_page == 0) {
+			/* Do not discard the swap header page! */
+			start_block += 1 << (PAGE_SHIFT - 9);
+			nr_blocks -= 1 << (PAGE_SHIFT - 9);
+			if (!nr_blocks)
+				continue;
+		}
+
+		err = blkdev_issue_discard(si->bdev, start_block,
+						nr_blocks, GFP_KERNEL);
+		if (err)
+			break;
+
+		cond_resched();
+	}
+	return err;		/* That will often be -EOPNOTSUPP */
+}
+
+/*
+ * swap allocation tells the device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+				 pgoff_t start_page, pgoff_t nr_pages)
+{
+	struct swap_extent *se = si->curr_swap_extent;
+	int found_extent = 0;
+
+	while (nr_pages) {
+		struct list_head *lh;
+
+		if (se->start_page <= start_page &&
+		    start_page < se->start_page + se->nr_pages) {
+			pgoff_t offset = start_page - se->start_page;
+			sector_t start_block = se->start_block + offset;
+			sector_t nr_blocks = se->nr_pages - offset;
+
+			if (nr_blocks > nr_pages)
+				nr_blocks = nr_pages;
+			start_page += nr_blocks;
+			nr_pages -= nr_blocks;
+
+			if (!found_extent++)
+				si->curr_swap_extent = se;
+
+			start_block <<= PAGE_SHIFT - 9;
+			nr_blocks <<= PAGE_SHIFT - 9;
+			if (blkdev_issue_discard(si->bdev, start_block,
+							nr_blocks, GFP_NOIO))
+				break;
+		}
+
+		lh = se->list.next;
+		if (lh == &si->extent_list)
+			lh = lh->next;
+		se = list_entry(lh, struct swap_extent, list);
+	}
+}
+
+static int wait_for_discard(void *word)
+{
+	schedule();
+	return 0;
+}
+
 #define SWAPFILE_CLUSTER	256
 #define LATENCY_LIMIT		256
 
 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 {
-	unsigned long offset, last_in_cluster;
+	unsigned long offset;
+	unsigned long scan_base;
+	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
+	int found_free_cluster = 0;
 
 	/*
 	 * We try to cluster swap pages by allocating them sequentially
@@ -100,16 +183,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 	 * all over the entire swap partition, so that we reduce
 	 * overall disk seek times between swap pages.  -- sct
 	 * But we do now try to find an empty cluster. -Andrea
+	 * And we let swap pages go all over an SSD partition.
Hugh */ si->flags += SWP_SCANNING; - if (unlikely(!si->cluster_nr)) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) - goto lowest; + scan_base = offset = si->cluster_next; + + if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; + } + if (si->flags & SWP_DISCARDABLE) { + /* + * Start range check on racing allocations, in case + * they overlap the cluster we eventually decide on + * (we scan without swap_lock to allow preemption). + * It's hardly conceivable that cluster_nr could be + * wrapped during our scan, but don't depend on it. + */ + if (si->lowest_alloc) + goto checks; + si->lowest_alloc = si->max; + si->highest_alloc = 0; + } spin_unlock(&swap_lock); - offset = si->lowest_bit; + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. + * But if seek is cheap, search from our current position, so + * that swap is allocated from all over the partition: if the + * Flash Translation Layer only remaps within limited zones, + * we don't want to wear out the first zone too quickly. + */ + if (!(si->flags & SWP_SOLIDSTATE)) + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -118,43 +227,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER+1; - goto cluster; + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + + offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster < scan_base; offset++) { + if (si->swap_map[offset]) + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + spin_lock(&swap_lock); + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } } + + offset = scan_base; spin_lock(&swap_lock); - goto lowest; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + si->lowest_alloc = 0; } - si->cluster_nr--; -cluster: - offset = si->cluster_next; - if (offset > si->highest_bit) -lowest: offset = si->lowest_bit; -checks: if (!(si->flags & SWP_WRITEOK)) +checks: + if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) goto no_page; - if (!si->swap_map[offset]) { - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + if (si->swap_map[offset]) + goto scan; + + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + si->inuse_pages++; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; + + if (si->lowest_alloc) { 
+ /* + * Only set when SWP_DISCARDABLE, and there's a scan + * for a free cluster in progress or just completed. + */ + if (found_free_cluster) { + /* + * To optimize wear-levelling, discard the + * old data of the cluster, taking care not to + * discard any of its pages that have already + * been allocated by racing tasks (offset has + * already stepped over any at the beginning). + */ + if (offset < si->highest_alloc && + si->lowest_alloc <= last_in_cluster) + last_in_cluster = si->lowest_alloc - 1; + si->flags |= SWP_DISCARDING; + spin_unlock(&swap_lock); + + if (offset < last_in_cluster) + discard_swap_cluster(si, offset, + last_in_cluster - offset + 1); + + spin_lock(&swap_lock); + si->lowest_alloc = 0; + si->flags &= ~SWP_DISCARDING; + + smp_mb(); /* wake_up_bit advises this */ + wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); + + } else if (si->flags & SWP_DISCARDING) { + /* + * Delay using pages allocated by racing tasks + * until the whole discard has been issued. We + * could defer that delay until swap_writepage, + * but it's easier to keep this self-contained. + */ + spin_unlock(&swap_lock); + wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), + wait_for_discard, TASK_UNINTERRUPTIBLE); + spin_lock(&swap_lock); + } else { + /* + * Note pages allocated by racing tasks while + * scan for a free cluster is in progress, so + * that its final discard can exclude them. + */ + if (offset < si->lowest_alloc) + si->lowest_alloc = offset; + if (offset > si->highest_alloc) + si->highest_alloc = offset; } - si->swap_map[offset] = 1; - si->cluster_next = offset + 1; - si->flags -= SWP_SCANNING; - return offset; } + return offset; +scan: spin_unlock(&swap_lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { @@ -166,8 +356,18 @@ checks: if (!(si->flags & SWP_WRITEOK)) latency_ration = LATENCY_LIMIT; } } + offset = si->lowest_bit; + while (++offset < scan_base) { + if (!si->swap_map[offset]) { + spin_lock(&swap_lock); + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } spin_lock(&swap_lock); - goto lowest; no_page: si->flags -= SWP_SCANNING; @@ -271,8 +471,9 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) { + unsigned long offset = swp_offset(ent); int count = p->swap_map[offset]; if (count < SWAP_MAP_MAX) { @@ -287,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) swap_list.next = p - swap_info; nr_swap_pages++; p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } } return count; @@ -302,7 +504,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, swp_offset(entry)); + swap_entry_free(p, entry); spin_unlock(&swap_lock); } } @@ -372,17 +574,17 @@ int try_to_free_swap(struct page *page) * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. 
 */
-void free_swap_and_cache(swp_entry_t entry)
+int free_swap_and_cache(swp_entry_t entry)
 {
-	struct swap_info_struct * p;
+	struct swap_info_struct *p;
 	struct page *page = NULL;
 
 	if (is_migration_entry(entry))
-		return;
+		return 1;
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+		if (swap_entry_free(p, entry) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -404,6 +606,7 @@ void free_swap_and_cache(swp_entry_t entry)
 		unlock_page(page);
 		page_cache_release(page);
 	}
+	return p != NULL;
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -432,7 +635,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 		if (!bdev) {
 			if (bdev_p)
-				*bdev_p = sis->bdev;
+				*bdev_p = bdget(sis->bdev->bd_dev);
 
 			spin_unlock(&swap_lock);
 			return i;
@@ -444,7 +647,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 					struct swap_extent, list);
 				if (se->start_block == offset) {
 					if (bdev_p)
-						*bdev_p = sis->bdev;
+						*bdev_p = bdget(sis->bdev->bd_dev);
 
 					spin_unlock(&swap_lock);
 					bdput(bdev);
@@ -490,17 +693,20 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	struct mem_cgroup *ptr = NULL;
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
 		ret = -ENOMEM;
+		goto out_nolock;
+	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
 		if (ret > 0)
-			mem_cgroup_uncharge_page(page);
+			mem_cgroup_cancel_charge_swapin(ptr);
 		ret = 0;
 		goto out;
 	}
@@ -510,6 +716,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	page_add_anon_rmap(page, vma, addr);
+	mem_cgroup_commit_charge_swapin(page, ptr);
 	swap_free(entry);
 	/*
 	 * Move the page to the active list so it is not
@@ -518,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	activate_page(page);
 out:
 	pte_unmap_unlock(pte, ptl);
+out_nolock:
 	return ret;
 }
 
@@ -1172,27 +1380,7 @@ out:
 	return ret;
 }
 
-#if 0 /* We don't need this yet */
-#include <linux/backing-dev.h>
-int page_queue_congested(struct page *page)
-{
-	struct backing_dev_info *bdi;
-
-	VM_BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
-
-	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page_private(page) };
-		struct swap_info_struct *sis;
-
-		sis = get_swap_info_struct(swp_type(entry));
-		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
-	} else
-		bdi = page->mapping->backing_dev_info;
-	return bdi_write_congested(bdi);
-}
-#endif
-
-asmlinkage long sys_swapoff(const char __user * specialfile)
+SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct * p = NULL;
 	unsigned short *swap_map;
@@ -1312,6 +1500,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+	/* Destroy swap account information */
+	swap_cgroup_swapoff(type);
+
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1445,7 +1636,7 @@ late_initcall(max_swapfiles_check);
  *
  * The swapon system call
  */
-asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
 	struct swap_info_struct * p;
 	char *name = NULL;
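
The two conversions above, sys_swapoff() becoming SYSCALL_DEFINE1 and sys_swapon() becoming SYSCALL_DEFINE2, are part of the tree-wide move to the syscall definition macros in include/linux/syscalls.h. On most architectures the macro expands to the same old asmlinkage definition; on architectures that select CONFIG_HAVE_SYSCALL_WRAPPERS (s390, for example) it emits a stub that receives every argument as long and casts it back down, so values narrowed by a 32-bit userspace are truncated or sign-extended explicitly. A minimal userspace analogue of that stub, with invented names (real_swapon, wrapped_swapon), illustrative only:

	#include <stdio.h>

	/* stands in for the real syscall body (SYSC_swapon in the kernel) */
	static long real_swapon(const char *specialfile, int swap_flags)
	{
		printf("swap_flags = %d\n", swap_flags);
		return 0;
	}

	/* in the style of the generated SyS_swapon: all arguments arrive
	 * as long, and the wrapper casts each back to its declared type */
	static long wrapped_swapon(long specialfile, long swap_flags)
	{
		return real_swapon((const char *)specialfile, (int)swap_flags);
	}

	int main(void)
	{
		long dirty = (long)0xdeadbeef00000007UL;  /* upper 32 bits are junk */
		return (int)wrapped_swapon(0L, dirty);    /* prints 7, not garbage */
	}

The cast (int)swap_flags is the whole point: without it, a stale upper register half could reach the function body on 64-bit machines.
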
@@ -1456,7 +1647,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	int i, prev;
 	int error;
 	union swap_header *swap_header = NULL;
-	int swap_header_version;
 	unsigned int nr_good_pages = 0;
 	int nr_extents = 0;
 	sector_t span;
@@ -1553,102 +1743,92 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = PTR_ERR(page);
 		goto bad_swap;
 	}
-	kmap(page);
-	swap_header = page_address(page);
+	swap_header = kmap(page);
 
-	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
-		swap_header_version = 1;
-	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
-		swap_header_version = 2;
-	else {
+	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
 		printk(KERN_ERR "Unable to find swap-space signature\n");
 		error = -EINVAL;
 		goto bad_swap;
 	}
 
-	switch (swap_header_version) {
-	case 1:
-		printk(KERN_ERR "version 0 swap is no longer supported. "
-			"Use mkswap -v1 %s\n", name);
+	/* swap partition endianness hack... */
+	if (swab32(swap_header->info.version) == 1) {
+		swab32s(&swap_header->info.version);
+		swab32s(&swap_header->info.last_page);
+		swab32s(&swap_header->info.nr_badpages);
+		for (i = 0; i < swap_header->info.nr_badpages; i++)
+			swab32s(&swap_header->info.badpages[i]);
+	}
+	/* Check the swap header's sub-version */
+	if (swap_header->info.version != 1) {
+		printk(KERN_WARNING
+		       "Unable to handle swap header version %d\n",
+		       swap_header->info.version);
 		error = -EINVAL;
 		goto bad_swap;
-	case 2:
-		/* swap partition endianess hack... */
-		if (swab32(swap_header->info.version) == 1) {
-			swab32s(&swap_header->info.version);
-			swab32s(&swap_header->info.last_page);
-			swab32s(&swap_header->info.nr_badpages);
-			for (i = 0; i < swap_header->info.nr_badpages; i++)
-				swab32s(&swap_header->info.badpages[i]);
-		}
-		/* Check the swap header's sub-version and the size of
-		   the swap file and bad block lists */
-		if (swap_header->info.version != 1) {
-			printk(KERN_WARNING
-			       "Unable to handle swap header version %d\n",
-			       swap_header->info.version);
-			error = -EINVAL;
-			goto bad_swap;
-		}
+	}
 
-		p->lowest_bit  = 1;
-		p->cluster_next = 1;
+	p->lowest_bit = 1;
+	p->cluster_next = 1;
 
-		/*
-		 * Find out how many pages are allowed for a single swap
-		 * device. There are two limiting factors: 1) the number of
-		 * bits for the swap offset in the swp_entry_t type and
-		 * 2) the number of bits in the a swap pte as defined by
-		 * the different architectures. In order to find the
-		 * largest possible bit mask a swap entry with swap type 0
-		 * and swap offset ~0UL is created, encoded to a swap pte,
-		 * decoded to a swp_entry_t again and finally the swap
-		 * offset is extracted. This will mask all the bits from
-		 * the initial ~0UL mask that can't be encoded in either
-		 * the swp_entry_t or the architecture definition of a
-		 * swap pte.
-		 */
-		maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
-		if (maxpages > swap_header->info.last_page)
-			maxpages = swap_header->info.last_page;
-		p->highest_bit = maxpages - 1;
+	/*
+	 * Find out how many pages are allowed for a single swap
+	 * device. There are two limiting factors: 1) the number of
+	 * bits for the swap offset in the swp_entry_t type and
+	 * 2) the number of bits in a swap pte as defined by
+	 * the different architectures.
In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; - error = -EINVAL; - if (!maxpages) - goto bad_swap; - if (swapfilepages && maxpages > swapfilepages) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - goto bad_swap; - } - if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; - if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + error = -EINVAL; + if (!maxpages) + goto bad_swap; + if (swapfilepages && maxpages > swapfilepages) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + goto bad_swap; + } + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap; - } + /* OK, set up the swap map and apply the bad block list */ + swap_map = vmalloc(maxpages * sizeof(short)); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } - error = 0; - memset(swap_map, 0, maxpages * sizeof(short)); - for (i = 0; i < swap_header->info.nr_badpages; i++) { - int page_nr = swap_header->info.badpages[i]; - if (page_nr <= 0 || page_nr >= swap_header->info.last_page) - error = -EINVAL; - else - swap_map[page_nr] = SWAP_MAP_BAD; - } - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (error) + memset(swap_map, 0, maxpages * sizeof(short)); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { + error = -EINVAL; goto bad_swap; + } + swap_map[page_nr] = SWAP_MAP_BAD; } + error = swap_cgroup_swapon(type, maxpages); + if (error) + goto bad_swap; + + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; + if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; @@ -1666,6 +1846,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); if (swap_flags & SWAP_FLAG_PREFER) @@ -1679,9 +1866,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) total_swap_pages += nr_good_pages; printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk\n", + "Priority:%d extents:%d across:%lluk %s%s\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + (p->flags & SWP_SOLIDSTATE) ? "SS" : "", + (p->flags & SWP_DISCARDABLE) ? 
"D" : ""); /* insert swap space into swap_list: */ prev = -1; @@ -1707,6 +1896,7 @@ bad_swap: bd_release(bdev); } destroy_swap_extents(p); + swap_cgroup_swapoff(type); bad_swap_2: spin_lock(&swap_lock); p->swap_file = NULL;