X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fksm.c;h=6c3e99b4ae7c0726851ea5281fbb39f90f7b9ea2;hb=0e950fa686d53a57ee6c47f477ecfc681670c6a9;hp=9b7af2eb4280c42df709c73c251b5f549b92aa62;hpb=7b6ba2c7d3baf8cd9f888e05563dcc32e368baab;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/ksm.c b/mm/ksm.c index 9b7af2e..6c3e99b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -29,11 +29,13 @@ #include #include #include +#include #include #include #include #include +#include "internal.h" /* * A few notes about the KSM scanning process, @@ -109,16 +111,18 @@ struct ksm_scan { * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @hlist: hlist head of rmap_items using this ksm page + * @kpfn: page frame number of this ksm page */ struct stable_node { struct rb_node node; struct hlist_head hlist; + unsigned long kpfn; }; /** * struct rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list - * @filler: unused space we're making available in this patch + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address @@ -128,7 +132,7 @@ struct stable_node { */ struct rmap_item { struct rmap_item *rmap_list; - unsigned long filler; + struct anon_vma *anon_vma; /* when stable */ struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ @@ -175,9 +179,6 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; -/* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages; - /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -313,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) return rmap_item->address & STABLE_FLAG; } +static void hold_anon_vma(struct rmap_item *rmap_item, + struct anon_vma *anon_vma) +{ + rmap_item->anon_vma = anon_vma; + atomic_inc(&anon_vma->external_refcount); +} + +static void drop_anon_vma(struct rmap_item *rmap_item) +{ + struct anon_vma *anon_vma = rmap_item->anon_vma; + + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } +} + /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -345,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); page = follow_page(vma, addr, FOLL_GET); - if (!page) + if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma->vm_mm, vma, addr, @@ -391,6 +411,12 @@ static void break_cow(struct rmap_item *rmap_item) unsigned long addr = rmap_item->address; struct vm_area_struct *vma; + /* + * It is not an accident that whenever we want to break COW + * to undo, we also need to drop a reference to the anon_vma. + */ + drop_anon_vma(rmap_item); + down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) goto out; @@ -421,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (!page) + if (IS_ERR_OR_NULL(page)) goto out; if (PageAnon(page)) { flush_anon_page(vma, page, addr); @@ -434,21 +460,77 @@ out: page = NULL; return page; } +static void remove_node_from_stable_tree(struct stable_node *stable_node) +{ + struct rmap_item *rmap_item; + struct hlist_node *hlist; + + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + if (rmap_item->hlist.next) + ksm_pages_sharing--; + else + ksm_pages_shared--; + drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; + cond_resched(); + } + + rb_erase(&stable_node->node, &root_stable_tree); + free_stable_node(stable_node); +} + /* - * get_ksm_page: checks if the page at the virtual address in rmap_item - * is still PageKsm, in which case we can trust the content of the page, - * and it returns the gotten page; but NULL if the page has been zapped. + * get_ksm_page: checks if the page indicated by the stable node + * is still its ksm page, despite having held no reference to it. + * In which case we can trust the content of the page, and it + * returns the gotten page; but if the page has now been zapped, + * remove the stale node from the stable tree and return NULL. + * + * You would expect the stable_node to hold a reference to the ksm page. + * But if it increments the page's count, swapping out has to wait for + * ksmd to come around again before it can free the page, which may take + * seconds or even minutes: much too unresponsive. So instead we use a + * "keyhole reference": access to the ksm page from the stable node peeps + * out through its keyhole to see if that page still holds the right key, + * pointing back to this stable node. This relies on freeing a PageAnon + * page to reset its page->mapping to NULL, and relies on no other use of + * a page to put something that might look like our key in page->mapping. + * + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, + * but this is different - made simpler by ksm_thread_mutex being held, but + * interesting for assuming that no other use of the struct page could ever + * put our expected_mapping into page->mapping (or a field of the union which + * coincides with page->mapping). The RCU calls are not for KSM at all, but + * to keep the page_count protocol described with page_cache_get_speculative. + * + * Note: it is possible that get_ksm_page() will return NULL one moment, + * then page the next, if the page is in between page_freeze_refs() and + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page + * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct rmap_item *rmap_item) +static struct page *get_ksm_page(struct stable_node *stable_node) { struct page *page; - - page = get_mergeable_page(rmap_item); - if (page && !PageKsm(page)) { + void *expected_mapping; + + page = pfn_to_page(stable_node->kpfn); + expected_mapping = (void *)stable_node + + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + rcu_read_lock(); + if (page->mapping != expected_mapping) + goto stale; + if (!get_page_unless_zero(page)) + goto stale; + if (page->mapping != expected_mapping) { put_page(page); - page = NULL; + goto stale; } + rcu_read_unlock(); return page; +stale: + rcu_read_unlock(); + remove_node_from_stable_tree(stable_node); + return NULL; } /* @@ -459,17 +541,24 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct stable_node *stable_node; + struct page *page; stable_node = rmap_item->head; + page = get_ksm_page(stable_node); + if (!page) + goto out; + + lock_page(page); hlist_del(&rmap_item->hlist); + unlock_page(page); + put_page(page); + if (stable_node->hlist.first) ksm_pages_sharing--; - else { - rb_erase(&stable_node->node, &root_stable_tree); - free_stable_node(stable_node); + else ksm_pages_shared--; - } + drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -489,7 +578,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } - +out: cond_resched(); /* we're called from many long loops */ } @@ -662,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * page */ if (page_mapcount(page) + 1 + swapped != page_count(page)) { - set_pte_at_notify(mm, addr, ptep, entry); + set_pte_at(mm, addr, ptep, entry); goto out_unlock; } entry = pte_wrprotect(entry); @@ -721,7 +810,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } get_page(kpage); - page_add_ksm_rmap(kpage); + page_add_anon_rmap(kpage, vma, addr); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush(vma, addr, ptep); @@ -740,8 +829,8 @@ out: * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage - * @kpage: the PageKsm page (or newly allocated page which page_add_ksm_rmap - * will make PageKsm) that we want to map instead of page + * @kpage: the PageKsm page that we want to map instead of page, + * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ @@ -751,6 +840,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, pte_t orig_pte = __pte(0); int err = -EFAULT; + if (page == kpage) /* ksm page forked */ + return 0; + if (!(vma->vm_flags & VM_MERGEABLE)) goto out; if (!PageAnon(page)) @@ -771,9 +863,29 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, page, &orig_pte) == 0 && - pages_identical(page, kpage)) - err = replace_page(vma, page, kpage, orig_pte); + if (write_protect_page(vma, page, &orig_pte) == 0) { + if (!kpage) { + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. + */ + set_page_stable_node(page, NULL); + mark_page_accessed(page); + err = 0; + } else if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); + } + + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { + munlock_vma_page(page); + if (!PageMlocked(kpage)) { + unlock_page(page); + lock_page(kpage); + mlock_vma_page(kpage); + page = kpage; /* for final unlock */ + } + } unlock_page(page); out: @@ -801,6 +913,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, goto out; err = try_to_merge_one_page(vma, page, kpage); + if (err) + goto out; + + /* Must get reference to anon_vma while still holding mmap_sem */ + hold_anon_vma(rmap_item, vma->anon_vma); out: up_read(&mm->mmap_sem); return err; @@ -813,7 +930,7 @@ out: * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * - * Note that this function allocates a new kernel page: if one of the pages + * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, @@ -821,38 +938,12 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, struct rmap_item *tree_rmap_item, struct page *tree_page) { - struct mm_struct *mm = rmap_item->mm; - struct vm_area_struct *vma; - struct page *kpage; - int err = -EFAULT; - - /* - * The number of nodes in the stable tree - * is the number of kernel pages that we hold. - */ - if (ksm_max_kernel_pages && - ksm_max_kernel_pages <= ksm_pages_shared) - return NULL; - - kpage = alloc_page(GFP_HIGHUSER); - if (!kpage) - return NULL; - - down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto up; - vma = find_vma(mm, rmap_item->address); - if (!vma || vma->vm_start > rmap_item->address) - goto up; - - copy_user_highpage(kpage, page, rmap_item->address, vma); - err = try_to_merge_one_page(vma, page, kpage); -up: - up_read(&mm->mmap_sem); + int err; + err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, - tree_page, kpage); + tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. @@ -860,11 +951,7 @@ up: if (err) break_cow(rmap_item); } - if (err) { - put_page(kpage); - kpage = NULL; - } - return kpage; + return err ? NULL : page; } /* @@ -876,29 +963,25 @@ up: * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct stable_node *stable_tree_search(struct page *page, - struct page **tree_pagep) +static struct page *stable_tree_search(struct page *page) { struct rb_node *node = root_stable_tree.rb_node; struct stable_node *stable_node; + stable_node = page_stable_node(page); + if (stable_node) { /* ksm page forked */ + get_page(page); + return page; + } + while (node) { - struct hlist_node *hlist, *hnext; - struct rmap_item *tree_rmap_item; struct page *tree_page; int ret; + cond_resched(); stable_node = rb_entry(node, struct stable_node, node); - hlist_for_each_entry_safe(tree_rmap_item, hlist, hnext, - &stable_node->hlist, hlist) { - BUG_ON(!in_stable_tree(tree_rmap_item)); - cond_resched(); - tree_page = get_ksm_page(tree_rmap_item); - if (tree_page) - break; - remove_rmap_item_from_tree(tree_rmap_item); - } - if (!hlist) + tree_page = get_ksm_page(stable_node); + if (!tree_page) return NULL; ret = memcmp_pages(page, tree_page); @@ -909,10 +992,8 @@ static struct stable_node *stable_tree_search(struct page *page, } else if (ret > 0) { put_page(tree_page); node = node->rb_right; - } else { - *tree_pagep = tree_page; - return stable_node; - } + } else + return tree_page; } return NULL; @@ -932,22 +1013,13 @@ static struct stable_node *stable_tree_insert(struct page *kpage) struct stable_node *stable_node; while (*new) { - struct hlist_node *hlist, *hnext; - struct rmap_item *tree_rmap_item; struct page *tree_page; int ret; + cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - hlist_for_each_entry_safe(tree_rmap_item, hlist, hnext, - &stable_node->hlist, hlist) { - BUG_ON(!in_stable_tree(tree_rmap_item)); - cond_resched(); - tree_page = get_ksm_page(tree_rmap_item); - if (tree_page) - break; - remove_rmap_item_from_tree(tree_rmap_item); - } - if (!hlist) + tree_page = get_ksm_page(stable_node); + if (!tree_page) return NULL; ret = memcmp_pages(kpage, tree_page); @@ -977,6 +1049,9 @@ static struct stable_node *stable_tree_insert(struct page *kpage) INIT_HLIST_HEAD(&stable_node->hlist); + stable_node->kpfn = page_to_pfn(kpage); + set_page_stable_node(kpage, stable_node); + return stable_node; } @@ -1011,7 +1086,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); - if (!tree_page) + if (IS_ERR_OR_NULL(tree_page)) return NULL; /* @@ -1085,39 +1160,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) remove_rmap_item_from_tree(rmap_item); /* We first start with searching the page inside the stable tree */ - stable_node = stable_tree_search(page, &tree_page); - if (stable_node) { - kpage = tree_page; - if (page == kpage) /* forked */ - err = 0; - else - err = try_to_merge_with_ksm_page(rmap_item, - page, kpage); + kpage = stable_tree_search(page); + if (kpage) { + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ - stable_tree_append(rmap_item, stable_node); + lock_page(kpage); + stable_tree_append(rmap_item, page_stable_node(kpage)); + unlock_page(kpage); } put_page(kpage); return; } /* - * A ksm page might have got here by fork, but its other - * references have already been removed from the stable tree. - * Or it might be left over from a break_ksm which failed - * when the mem_cgroup had reached its limit: try again now. - */ - if (PageKsm(page)) - break_cow(rmap_item); - - /* - * In case the hash value of the page was changed from the last time we - * have calculated it, this page to be changed frequely, therefore we - * don't want to insert it to the unstable tree, and we don't want to - * waste our time to search if there is something identical to it there. + * If the hash value of the page has changed from the last time + * we calculated it, this page is changing frequently: therefore we + * don't want to insert it in the unstable tree, and we don't want + * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { @@ -1139,12 +1202,13 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) if (kpage) { remove_rmap_item_from_tree(tree_rmap_item); + lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node); stable_tree_append(rmap_item, stable_node); } - put_page(kpage); + unlock_page(kpage); /* * If we fail to insert the page into the stable tree, @@ -1230,7 +1294,7 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (*page && PageAnon(*page)) { + if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1244,7 +1308,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (*page) + if (!IS_ERR_OR_NULL(*page)) put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); @@ -1303,7 +1367,7 @@ next_mm: static void ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; - struct page *page; + struct page *uninitialized_var(page); while (scan_npages--) { cond_resched(); @@ -1312,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) return; if (!PageKsm(page) || !in_stable_tree(rmap_item)) cmp_and_merge_page(page, rmap_item); - else if (page_mapcount(page) == 1) { - /* - * Replace now-unshared ksm page by ordinary page. - */ - break_cow(rmap_item); - remove_rmap_item_from_tree(rmap_item); - rmap_item->oldchecksum = calc_checksum(page); - } put_page(page); } } @@ -1364,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) return 0; /* just ignore the advice */ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { @@ -1462,6 +1518,255 @@ void __ksm_exit(struct mm_struct *mm) } } +struct page *ksm_does_need_to_copy(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct page *new_page; + + unlock_page(page); /* any racers will COW it, not modify it */ + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page) { + copy_user_highpage(new_page, page, address, vma); + + SetPageDirty(new_page); + __SetPageUptodate(new_page); + SetPageSwapBacked(new_page); + __set_page_locked(new_page); + + if (page_evictable(new_page, vma)) + lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); + else + add_page_to_unevictable_list(new_page); + } + + page_cache_release(page); + return new_page; +} + +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, + unsigned long *vm_flags) +{ + struct stable_node *stable_node; + struct rmap_item *rmap_item; + struct hlist_node *hlist; + unsigned int mapcount = page_mapcount(page); + int referenced = 0; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return 0; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + continue; + + referenced += page_referenced_one(page, vma, + rmap_item->address, &mapcount, vm_flags); + if (!search_new_forks || !mapcount) + break; + } + spin_unlock(&anon_vma->lock); + if (!mapcount) + goto out; + } + if (!search_new_forks++) + goto again; +out: + return referenced; +} + +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return SWAP_FAIL; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = try_to_unmap_one(page, vma, + rmap_item->address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +#ifdef CONFIG_MIGRATION +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return ret; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = rmap_one(page, vma, rmap_item->address, arg); + if (ret != SWAP_AGAIN) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +void ksm_migrate_page(struct page *newpage, struct page *oldpage) +{ + struct stable_node *stable_node; + + VM_BUG_ON(!PageLocked(oldpage)); + VM_BUG_ON(!PageLocked(newpage)); + VM_BUG_ON(newpage->mapping != oldpage->mapping); + + stable_node = page_stable_node(newpage); + if (stable_node) { + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + stable_node->kpfn = page_to_pfn(newpage); + } +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct rb_node *node; + + for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } + return NULL; +} + +static int ksm_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + struct stable_node *stable_node; + + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Keep it very simple for now: just lock out ksmd and + * MADV_UNMERGEABLE while any memory is going offline. + */ + mutex_lock(&ksm_thread_mutex); + break; + + case MEM_OFFLINE: + /* + * Most of the work is done by page migration; but there might + * be a few stable_nodes left over, still pointing to struct + * pages which have been offlined: prune those from the tree. + */ + while ((stable_node = ksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages)) != NULL) + remove_node_from_stable_tree(stable_node); + /* fallthrough */ + + case MEM_CANCEL_OFFLINE: + mutex_unlock(&ksm_thread_mutex); + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. @@ -1540,8 +1845,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, - * breaking COW to free the unswappable pages_shared (but leaves - * mm_slots on the list for when ksmd may be set running again). + * breaking COW to free the pages_shared (but leaves mm_slots + * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); @@ -1566,29 +1871,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); -static ssize_t max_kernel_pages_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long nr_pages; - - err = strict_strtoul(buf, 10, &nr_pages); - if (err) - return -EINVAL; - - ksm_max_kernel_pages = nr_pages; - - return count; -} - -static ssize_t max_kernel_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", ksm_max_kernel_pages); -} -KSM_ATTR(max_kernel_pages); - static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1638,7 +1920,6 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, - &max_kernel_pages_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, @@ -1658,8 +1939,6 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_max_kernel_pages = totalram_pages / 4; - err = ksm_slab_init(); if (err) goto out; @@ -1687,6 +1966,13 @@ static int __init ksm_init(void) #endif /* CONFIG_SYSFS */ +#ifdef CONFIG_MEMORY_HOTREMOVE + /* + * Choose a high priority since the callback takes ksm_thread_mutex: + * later callbacks could only be taking locks which nest within that. + */ + hotplug_memory_notifier(ksm_memory_callback, 100); +#endif return 0; out_free2: