#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
+#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
/**
* struct stable_node - node of the stable rbtree
- * @page: pointer to struct page of the ksm page
* @node: rb node of this ksm page in the stable tree
* @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page
*/
struct stable_node {
- struct page *page;
struct rb_node node;
struct hlist_head hlist;
+ unsigned long kpfn;
};
/**
/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;
-/* Limit on the number of unswappable pages used */
-static unsigned long ksm_max_kernel_pages;
-
/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;
unsigned long addr = rmap_item->address;
struct vm_area_struct *vma;
+ /*
+ * It is not an accident that whenever we want to break COW
+ * to undo, we also need to drop a reference to the anon_vma.
+ */
+ drop_anon_vma(rmap_item);
+
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
goto out;
return page;
}
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+ struct rmap_item *rmap_item;
+ struct hlist_node *hlist;
+
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ if (rmap_item->hlist.next)
+ ksm_pages_sharing--;
+ else
+ ksm_pages_shared--;
+ drop_anon_vma(rmap_item);
+ rmap_item->address &= PAGE_MASK;
+ cond_resched();
+ }
+
+ rb_erase(&stable_node->node, &root_stable_tree);
+ free_stable_node(stable_node);
+}
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by ksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_ksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node)
+{
+ struct page *page;
+ void *expected_mapping;
+
+ page = pfn_to_page(stable_node->kpfn);
+ expected_mapping = (void *)stable_node +
+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+ rcu_read_lock();
+ if (page->mapping != expected_mapping)
+ goto stale;
+ if (!get_page_unless_zero(page))
+ goto stale;
+ if (page->mapping != expected_mapping) {
+ put_page(page);
+ goto stale;
+ }
+ rcu_read_unlock();
+ return page;
+stale:
+ rcu_read_unlock();
+ remove_node_from_stable_tree(stable_node);
+ return NULL;
+}
+
/*
* Removing rmap_item from stable or unstable tree.
* This function will clean the information from the stable/unstable tree.
struct page *page;
stable_node = rmap_item->head;
- page = stable_node->page;
- lock_page(page);
+ page = get_ksm_page(stable_node);
+ if (!page)
+ goto out;
+ lock_page(page);
hlist_del(&rmap_item->hlist);
- if (stable_node->hlist.first) {
- unlock_page(page);
- ksm_pages_sharing--;
- } else {
- set_page_stable_node(page, NULL);
- unlock_page(page);
- put_page(page);
+ unlock_page(page);
+ put_page(page);
- rb_erase(&stable_node->node, &root_stable_tree);
- free_stable_node(stable_node);
+ if (stable_node->hlist.first)
+ ksm_pages_sharing--;
+ else
ksm_pages_shared--;
- }
drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
ksm_pages_unshared--;
rmap_item->address &= PAGE_MASK;
}
-
+out:
cond_resched(); /* we're called from many long loops */
}
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
* @page: the PageAnon page that we want to replace with kpage
- * @kpage: the PageKsm page that we want to map instead of page
+ * @kpage: the PageKsm page that we want to map instead of page,
+ * or NULL the first time when we want to use page as kpage.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
- if (write_protect_page(vma, page, &orig_pte) == 0 &&
- pages_identical(page, kpage))
- err = replace_page(vma, page, kpage, orig_pte);
+ if (write_protect_page(vma, page, &orig_pte) == 0) {
+ if (!kpage) {
+ /*
+ * While we hold page lock, upgrade page from
+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
+ * stable_tree_insert() will update stable_node.
+ */
+ set_page_stable_node(page, NULL);
+ mark_page_accessed(page);
+ err = 0;
+ } else if (pages_identical(page, kpage))
+ err = replace_page(vma, page, kpage, orig_pte);
+ }
- if ((vma->vm_flags & VM_LOCKED) && !err) {
+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
munlock_vma_page(page);
if (!PageMlocked(kpage)) {
unlock_page(page);
- lru_add_drain();
lock_page(kpage);
mlock_vma_page(kpage);
page = kpage; /* for final unlock */
* This function returns the kpage if we successfully merged two identical
* pages into one ksm page, NULL otherwise.
*
- * Note that this function allocates a new kernel page: if one of the pages
+ * Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
struct rmap_item *tree_rmap_item,
struct page *tree_page)
{
- struct mm_struct *mm = rmap_item->mm;
- struct vm_area_struct *vma;
- struct page *kpage;
- int err = -EFAULT;
-
- /*
- * The number of nodes in the stable tree
- * is the number of kernel pages that we hold.
- */
- if (ksm_max_kernel_pages &&
- ksm_max_kernel_pages <= ksm_pages_shared)
- return NULL;
-
- kpage = alloc_page(GFP_HIGHUSER);
- if (!kpage)
- return NULL;
-
- down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto up;
- vma = find_vma(mm, rmap_item->address);
- if (!vma || vma->vm_start > rmap_item->address)
- goto up;
-
- copy_user_highpage(kpage, page, rmap_item->address, vma);
-
- SetPageDirty(kpage);
- __SetPageUptodate(kpage);
- SetPageSwapBacked(kpage);
- set_page_stable_node(kpage, NULL); /* mark it PageKsm */
- lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
-
- err = try_to_merge_one_page(vma, page, kpage);
- if (err)
- goto up;
-
- /* Must get reference to anon_vma while still holding mmap_sem */
- hold_anon_vma(rmap_item, vma->anon_vma);
-up:
- up_read(&mm->mmap_sem);
+ int err;
+ err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
if (!err) {
err = try_to_merge_with_ksm_page(tree_rmap_item,
- tree_page, kpage);
+ tree_page, page);
/*
* If that fails, we have a ksm page with only one pte
* pointing to it: so break it.
*/
- if (err) {
- drop_anon_vma(rmap_item);
+ if (err)
break_cow(rmap_item);
- }
- }
- if (err) {
- put_page(kpage);
- kpage = NULL;
}
- return kpage;
+ return err ? NULL : page;
}
/*
* This function returns the stable tree node of identical content if found,
* NULL otherwise.
*/
-static struct stable_node *stable_tree_search(struct page *page)
+static struct page *stable_tree_search(struct page *page)
{
struct rb_node *node = root_stable_tree.rb_node;
struct stable_node *stable_node;
stable_node = page_stable_node(page);
if (stable_node) { /* ksm page forked */
get_page(page);
- return stable_node;
+ return page;
}
while (node) {
+ struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(node, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node);
+ if (!tree_page)
+ return NULL;
- ret = memcmp_pages(page, stable_node->page);
+ ret = memcmp_pages(page, tree_page);
- if (ret < 0)
+ if (ret < 0) {
+ put_page(tree_page);
node = node->rb_left;
- else if (ret > 0)
+ } else if (ret > 0) {
+ put_page(tree_page);
node = node->rb_right;
- else {
- get_page(stable_node->page);
- return stable_node;
- }
+ } else
+ return tree_page;
}
return NULL;
struct stable_node *stable_node;
while (*new) {
+ struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node);
+ if (!tree_page)
+ return NULL;
- ret = memcmp_pages(kpage, stable_node->page);
+ ret = memcmp_pages(kpage, tree_page);
+ put_page(tree_page);
parent = *new;
if (ret < 0)
INIT_HLIST_HEAD(&stable_node->hlist);
- get_page(kpage);
- stable_node->page = kpage;
+ stable_node->kpfn = page_to_pfn(kpage);
set_page_stable_node(kpage, stable_node);
return stable_node;
remove_rmap_item_from_tree(rmap_item);
/* We first start with searching the page inside the stable tree */
- stable_node = stable_tree_search(page);
- if (stable_node) {
- kpage = stable_node->page;
+ kpage = stable_tree_search(page);
+ if (kpage) {
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
* add its rmap_item to the stable tree.
*/
lock_page(kpage);
- stable_tree_append(rmap_item, stable_node);
+ stable_tree_append(rmap_item, page_stable_node(kpage));
unlock_page(kpage);
}
put_page(kpage);
}
/*
- * A ksm page might have got here by fork, but its other
- * references have already been removed from the stable tree.
- * Or it might be left over from a break_ksm which failed
- * when the mem_cgroup had reached its limit: try again now.
- */
- if (PageKsm(page))
- break_cow(rmap_item);
-
- /*
- * In case the hash value of the page was changed from the last time we
- * have calculated it, this page to be changed frequely, therefore we
- * don't want to insert it to the unstable tree, and we don't want to
- * waste our time to search if there is something identical to it there.
+ * If the hash value of the page has changed from the last time
+ * we calculated it, this page is changing frequently: therefore we
+ * don't want to insert it in the unstable tree, and we don't want
+ * to waste our time searching for something identical to it there.
*/
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
stable_tree_append(rmap_item, stable_node);
}
unlock_page(kpage);
- put_page(kpage);
/*
* If we fail to insert the page into the stable tree,
* in which case we need to break_cow on both.
*/
if (!stable_node) {
- drop_anon_vma(tree_rmap_item);
break_cow(tree_rmap_item);
- drop_anon_vma(rmap_item);
break_cow(rmap_item);
}
}
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
spin_lock(&anon_vma->lock);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
return ret;
}
+#ifdef CONFIG_MIGRATION
+int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+ struct stable_node *stable_node;
+ struct hlist_node *hlist;
+ struct rmap_item *rmap_item;
+ int ret = SWAP_AGAIN;
+ int search_new_forks = 0;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return ret;
+again:
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
+ struct vm_area_struct *vma;
+
+ spin_lock(&anon_vma->lock);
+ list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma = vmac->vma;
+ if (rmap_item->address < vma->vm_start ||
+ rmap_item->address >= vma->vm_end)
+ continue;
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ ret = rmap_one(page, vma, rmap_item->address, arg);
+ if (ret != SWAP_AGAIN) {
+ spin_unlock(&anon_vma->lock);
+ goto out;
+ }
+ }
+ spin_unlock(&anon_vma->lock);
+ }
+ if (!search_new_forks++)
+ goto again;
+out:
+ return ret;
+}
+
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+ struct stable_node *stable_node;
+
+ VM_BUG_ON(!PageLocked(oldpage));
+ VM_BUG_ON(!PageLocked(newpage));
+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
+
+ stable_node = page_stable_node(newpage);
+ if (stable_node) {
+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+ stable_node->kpfn = page_to_pfn(newpage);
+ }
+}
+#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct rb_node *node;
+
+ for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
+ struct stable_node *stable_node;
+
+ stable_node = rb_entry(node, struct stable_node, node);
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn)
+ return stable_node;
+ }
+ return NULL;
+}
+
+static int ksm_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+ struct stable_node *stable_node;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Keep it very simple for now: just lock out ksmd and
+ * MADV_UNMERGEABLE while any memory is going offline.
+ */
+ mutex_lock(&ksm_thread_mutex);
+ break;
+
+ case MEM_OFFLINE:
+ /*
+ * Most of the work is done by page migration; but there might
+ * be a few stable_nodes left over, still pointing to struct
+ * pages which have been offlined: prune those from the tree.
+ */
+ while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
+ mn->start_pfn + mn->nr_pages)) != NULL)
+ remove_node_from_stable_tree(stable_node);
+ /* fallthrough */
+
+ case MEM_CANCEL_OFFLINE:
+ mutex_unlock(&ksm_thread_mutex);
+ break;
+ }
+ return NOTIFY_OK;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
#ifdef CONFIG_SYSFS
/*
* This all compiles without CONFIG_SYSFS, but is a waste of space.
/*
* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
- * breaking COW to free the unswappable pages_shared (but leaves
- * mm_slots on the list for when ksmd may be set running again).
+ * breaking COW to free the pages_shared (but leaves mm_slots
+ * on the list for when ksmd may be set running again).
*/
mutex_lock(&ksm_thread_mutex);
}
KSM_ATTR(run);
-static ssize_t max_kernel_pages_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- int err;
- unsigned long nr_pages;
-
- err = strict_strtoul(buf, 10, &nr_pages);
- if (err)
- return -EINVAL;
-
- ksm_max_kernel_pages = nr_pages;
-
- return count;
-}
-
-static ssize_t max_kernel_pages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
-}
-KSM_ATTR(max_kernel_pages);
-
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
&sleep_millisecs_attr.attr,
&pages_to_scan_attr.attr,
&run_attr.attr,
- &max_kernel_pages_attr.attr,
&pages_shared_attr.attr,
&pages_sharing_attr.attr,
&pages_unshared_attr.attr,
struct task_struct *ksm_thread;
int err;
- ksm_max_kernel_pages = totalram_pages / 4;
-
err = ksm_slab_init();
if (err)
goto out;
#endif /* CONFIG_SYSFS */
+#ifdef CONFIG_MEMORY_HOTREMOVE
+ /*
+ * Choose a high priority since the callback takes ksm_thread_mutex:
+ * later callbacks could only be taking locks which nest within that.
+ */
+ hotplug_memory_notifier(ksm_memory_callback, 100);
+#endif
return 0;
out_free2: