X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fksm.c;h=56a0da1f9979d7eaa9d85cbc0b20ef76215e4abf;hb=84c95c9acf088c99d8793d78036b67faa5d0b851;hp=f7d121c42d0107af0d828fcc968dfbbdde9af73f;hpb=db114b83ab6064d9b1d6ec5650e096c89bd95e25;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/ksm.c b/mm/ksm.c index f7d121c..56a0da1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -108,14 +109,14 @@ struct ksm_scan { /** * struct stable_node - node of the stable rbtree - * @page: pointer to struct page of the ksm page * @node: rb node of this ksm page in the stable tree * @hlist: hlist head of rmap_items using this ksm page + * @kpfn: page frame number of this ksm page */ struct stable_node { - struct page *page; struct rb_node node; struct hlist_head hlist; + unsigned long kpfn; }; /** @@ -178,9 +179,6 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; -/* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages; - /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -413,6 +411,12 @@ static void break_cow(struct rmap_item *rmap_item) unsigned long addr = rmap_item->address; struct vm_area_struct *vma; + /* + * It is not an accident that whenever we want to break COW + * to undo, we also need to drop a reference to the anon_vma. + */ + drop_anon_vma(rmap_item); + down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) goto out; @@ -456,6 +460,79 @@ out: page = NULL; return page; } +static void remove_node_from_stable_tree(struct stable_node *stable_node) +{ + struct rmap_item *rmap_item; + struct hlist_node *hlist; + + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + if (rmap_item->hlist.next) + ksm_pages_sharing--; + else + ksm_pages_shared--; + drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; + cond_resched(); + } + + rb_erase(&stable_node->node, &root_stable_tree); + free_stable_node(stable_node); +} + +/* + * get_ksm_page: checks if the page indicated by the stable node + * is still its ksm page, despite having held no reference to it. + * In which case we can trust the content of the page, and it + * returns the gotten page; but if the page has now been zapped, + * remove the stale node from the stable tree and return NULL. + * + * You would expect the stable_node to hold a reference to the ksm page. + * But if it increments the page's count, swapping out has to wait for + * ksmd to come around again before it can free the page, which may take + * seconds or even minutes: much too unresponsive. So instead we use a + * "keyhole reference": access to the ksm page from the stable node peeps + * out through its keyhole to see if that page still holds the right key, + * pointing back to this stable node. This relies on freeing a PageAnon + * page to reset its page->mapping to NULL, and relies on no other use of + * a page to put something that might look like our key in page->mapping. + * + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, + * but this is different - made simpler by ksm_thread_mutex being held, but + * interesting for assuming that no other use of the struct page could ever + * put our expected_mapping into page->mapping (or a field of the union which + * coincides with page->mapping). The RCU calls are not for KSM at all, but + * to keep the page_count protocol described with page_cache_get_speculative. + * + * Note: it is possible that get_ksm_page() will return NULL one moment, + * then page the next, if the page is in between page_freeze_refs() and + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page + * is on its way to being freed; but it is an anomaly to bear in mind. + */ +static struct page *get_ksm_page(struct stable_node *stable_node) +{ + struct page *page; + void *expected_mapping; + + page = pfn_to_page(stable_node->kpfn); + expected_mapping = (void *)stable_node + + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + rcu_read_lock(); + if (page->mapping != expected_mapping) + goto stale; + if (!get_page_unless_zero(page)) + goto stale; + if (page->mapping != expected_mapping) { + put_page(page); + goto stale; + } + rcu_read_unlock(); + return page; +stale: + rcu_read_unlock(); + remove_node_from_stable_tree(stable_node); + return NULL; +} + /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. @@ -467,22 +544,19 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = stable_node->page; - lock_page(page); + page = get_ksm_page(stable_node); + if (!page) + goto out; + lock_page(page); hlist_del(&rmap_item->hlist); - if (stable_node->hlist.first) { - unlock_page(page); - ksm_pages_sharing--; - } else { - set_page_stable_node(page, NULL); - unlock_page(page); - put_page(page); + unlock_page(page); + put_page(page); - rb_erase(&stable_node->node, &root_stable_tree); - free_stable_node(stable_node); + if (stable_node->hlist.first) + ksm_pages_sharing--; + else ksm_pages_shared--; - } drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; @@ -504,7 +578,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } - +out: cond_resched(); /* we're called from many long loops */ } @@ -755,7 +829,8 @@ out: * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage - * @kpage: the PageKsm page that we want to map instead of page + * @kpage: the PageKsm page that we want to map instead of page, + * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ @@ -788,15 +863,24 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, page, &orig_pte) == 0 && - pages_identical(page, kpage)) - err = replace_page(vma, page, kpage, orig_pte); + if (write_protect_page(vma, page, &orig_pte) == 0) { + if (!kpage) { + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. + */ + set_page_stable_node(page, NULL); + mark_page_accessed(page); + err = 0; + } else if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); + } - if ((vma->vm_flags & VM_LOCKED) && !err) { + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { munlock_vma_page(page); if (!PageMlocked(kpage)) { unlock_page(page); - lru_add_drain(); lock_page(kpage); mlock_vma_page(kpage); page = kpage; /* for final unlock */ @@ -846,7 +930,7 @@ out: * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * - * Note that this function allocates a new kernel page: if one of the pages + * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, @@ -854,64 +938,20 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, struct rmap_item *tree_rmap_item, struct page *tree_page) { - struct mm_struct *mm = rmap_item->mm; - struct vm_area_struct *vma; - struct page *kpage; - int err = -EFAULT; - - /* - * The number of nodes in the stable tree - * is the number of kernel pages that we hold. - */ - if (ksm_max_kernel_pages && - ksm_max_kernel_pages <= ksm_pages_shared) - return NULL; - - kpage = alloc_page(GFP_HIGHUSER); - if (!kpage) - return NULL; - - down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto up; - vma = find_vma(mm, rmap_item->address); - if (!vma || vma->vm_start > rmap_item->address) - goto up; - - copy_user_highpage(kpage, page, rmap_item->address, vma); - - SetPageDirty(kpage); - __SetPageUptodate(kpage); - SetPageSwapBacked(kpage); - set_page_stable_node(kpage, NULL); /* mark it PageKsm */ - lru_cache_add_lru(kpage, LRU_ACTIVE_ANON); - - err = try_to_merge_one_page(vma, page, kpage); - if (err) - goto up; - - /* Must get reference to anon_vma while still holding mmap_sem */ - hold_anon_vma(rmap_item, vma->anon_vma); -up: - up_read(&mm->mmap_sem); + int err; + err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, - tree_page, kpage); + tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ - if (err) { - drop_anon_vma(rmap_item); + if (err) break_cow(rmap_item); - } - } - if (err) { - put_page(kpage); - kpage = NULL; } - return kpage; + return err ? NULL : page; } /* @@ -923,7 +963,7 @@ up: * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct stable_node *stable_tree_search(struct page *page) +static struct page *stable_tree_search(struct page *page) { struct rb_node *node = root_stable_tree.rb_node; struct stable_node *stable_node; @@ -931,25 +971,29 @@ static struct stable_node *stable_tree_search(struct page *page) stable_node = page_stable_node(page); if (stable_node) { /* ksm page forked */ get_page(page); - return stable_node; + return page; } while (node) { + struct page *tree_page; int ret; cond_resched(); stable_node = rb_entry(node, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) + return NULL; - ret = memcmp_pages(page, stable_node->page); + ret = memcmp_pages(page, tree_page); - if (ret < 0) + if (ret < 0) { + put_page(tree_page); node = node->rb_left; - else if (ret > 0) + } else if (ret > 0) { + put_page(tree_page); node = node->rb_right; - else { - get_page(stable_node->page); - return stable_node; - } + } else + return tree_page; } return NULL; @@ -969,12 +1013,17 @@ static struct stable_node *stable_tree_insert(struct page *kpage) struct stable_node *stable_node; while (*new) { + struct page *tree_page; int ret; cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) + return NULL; - ret = memcmp_pages(kpage, stable_node->page); + ret = memcmp_pages(kpage, tree_page); + put_page(tree_page); parent = *new; if (ret < 0) @@ -1000,8 +1049,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) INIT_HLIST_HEAD(&stable_node->hlist); - get_page(kpage); - stable_node->page = kpage; + stable_node->kpfn = page_to_pfn(kpage); set_page_stable_node(kpage, stable_node); return stable_node; @@ -1112,9 +1160,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) remove_rmap_item_from_tree(rmap_item); /* We first start with searching the page inside the stable tree */ - stable_node = stable_tree_search(page); - if (stable_node) { - kpage = stable_node->page; + kpage = stable_tree_search(page); + if (kpage) { err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* @@ -1122,7 +1169,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * add its rmap_item to the stable tree. */ lock_page(kpage); - stable_tree_append(rmap_item, stable_node); + stable_tree_append(rmap_item, page_stable_node(kpage)); unlock_page(kpage); } put_page(kpage); @@ -1130,19 +1177,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) } /* - * A ksm page might have got here by fork, but its other - * references have already been removed from the stable tree. - * Or it might be left over from a break_ksm which failed - * when the mem_cgroup had reached its limit: try again now. - */ - if (PageKsm(page)) - break_cow(rmap_item); - - /* - * In case the hash value of the page was changed from the last time we - * have calculated it, this page to be changed frequely, therefore we - * don't want to insert it to the unstable tree, and we don't want to - * waste our time to search if there is something identical to it there. + * If the hash value of the page has changed from the last time + * we calculated it, this page is changing frequently: therefore we + * don't want to insert it in the unstable tree, and we don't want + * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { @@ -1171,7 +1209,6 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) stable_tree_append(rmap_item, stable_node); } unlock_page(kpage); - put_page(kpage); /* * If we fail to insert the page into the stable tree, @@ -1180,9 +1217,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * in which case we need to break_cow on both. */ if (!stable_node) { - drop_anon_vma(tree_rmap_item); break_cow(tree_rmap_item); - drop_anon_vma(rmap_item); break_cow(rmap_item); } } @@ -1610,6 +1645,122 @@ out: return ret; } +#ifdef CONFIG_MIGRATION +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return ret; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = rmap_one(page, vma, rmap_item->address, arg); + if (ret != SWAP_AGAIN) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +void ksm_migrate_page(struct page *newpage, struct page *oldpage) +{ + struct stable_node *stable_node; + + VM_BUG_ON(!PageLocked(oldpage)); + VM_BUG_ON(!PageLocked(newpage)); + VM_BUG_ON(newpage->mapping != oldpage->mapping); + + stable_node = page_stable_node(newpage); + if (stable_node) { + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + stable_node->kpfn = page_to_pfn(newpage); + } +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct rb_node *node; + + for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } + return NULL; +} + +static int ksm_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + struct stable_node *stable_node; + + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Keep it very simple for now: just lock out ksmd and + * MADV_UNMERGEABLE while any memory is going offline. + */ + mutex_lock(&ksm_thread_mutex); + break; + + case MEM_OFFLINE: + /* + * Most of the work is done by page migration; but there might + * be a few stable_nodes left over, still pointing to struct + * pages which have been offlined: prune those from the tree. + */ + while ((stable_node = ksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages)) != NULL) + remove_node_from_stable_tree(stable_node); + /* fallthrough */ + + case MEM_CANCEL_OFFLINE: + mutex_unlock(&ksm_thread_mutex); + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. @@ -1688,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, - * breaking COW to free the unswappable pages_shared (but leaves - * mm_slots on the list for when ksmd may be set running again). + * breaking COW to free the pages_shared (but leaves mm_slots + * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); @@ -1714,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); -static ssize_t max_kernel_pages_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long nr_pages; - - err = strict_strtoul(buf, 10, &nr_pages); - if (err) - return -EINVAL; - - ksm_max_kernel_pages = nr_pages; - - return count; -} - -static ssize_t max_kernel_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", ksm_max_kernel_pages); -} -KSM_ATTR(max_kernel_pages); - static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1786,7 +1914,6 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, - &max_kernel_pages_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, @@ -1806,8 +1933,6 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_max_kernel_pages = totalram_pages / 4; - err = ksm_slab_init(); if (err) goto out; @@ -1835,6 +1960,13 @@ static int __init ksm_init(void) #endif /* CONFIG_SYSFS */ +#ifdef CONFIG_MEMORY_HOTREMOVE + /* + * Choose a high priority since the callback takes ksm_thread_mutex: + * later callbacks could only be taking locks which nest within that. + */ + hotplug_memory_notifier(ksm_memory_callback, 100); +#endif return 0; out_free2: