X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fksm.c;h=5575f8628fef5306611055092e47ed9f472f4027;hb=0bb38a5cdeb39f543657ec6fb9950343d2de6918;hp=46b4c522ea4fe79c159bdfb034d5f527392d077d;hpb=b4028260334e1ecf63fb5e0a95d65bb2db02c1ec;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/ksm.c b/mm/ksm.c index 46b4c52..5575f86 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -152,22 +153,28 @@ static struct kmem_cache *mm_slot_cache; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; -/* The number of page slots sharing those nodes */ +/* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; +/* The number of nodes in the unstable tree */ +static unsigned long ksm_pages_unshared; + +/* The number of rmap_items in use: to calculate pages_volatile */ +static unsigned long ksm_rmap_items; + /* Limit on the number of unswappable pages used */ static unsigned long ksm_max_kernel_pages; /* Number of pages ksmd should scan in one batch */ -static unsigned int ksm_thread_pages_to_scan; +static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ -static unsigned int ksm_thread_sleep_millisecs; +static unsigned int ksm_thread_sleep_millisecs = 20; #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 -static unsigned int ksm_run; +static unsigned int ksm_run = KSM_RUN_STOP; static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DEFINE_MUTEX(ksm_thread_mutex); @@ -204,11 +211,17 @@ static void __init ksm_slab_free(void) static inline struct rmap_item *alloc_rmap_item(void) { - return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + struct rmap_item *rmap_item; + + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + if (rmap_item) + ksm_rmap_items++; + return rmap_item; } static inline void free_rmap_item(struct rmap_item *rmap_item) { + ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } @@ -272,6 +285,19 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) } /* + * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's + * page tables after it has passed through ksm_exit() - which, if necessary, + * takes mmap_sem briefly to serialize against them. ksm_exit() does not set + * a special flag: they can just back out as soon as mm_users goes to zero. + * ksm_test_exit() is used throughout to make this test for exit: in some + * places for correctness, in some places just to avoid unnecessary work. + */ +static inline bool ksm_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +/* * We use break_ksm to break COW on a ksm page: it's a stripped down * * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) @@ -282,10 +308,10 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. */ -static void break_ksm(struct vm_area_struct *vma, unsigned long addr) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { struct page *page; - int ret; + int ret = 0; do { cond_resched(); @@ -298,27 +324,52 @@ static void break_ksm(struct vm_area_struct *vma, unsigned long addr) else ret = VM_FAULT_WRITE; put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS))); - - /* Which leaves us looping there if VM_FAULT_OOM: hmmm... */ + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. + * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but ksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } -static void __break_cow(struct mm_struct *mm, unsigned long addr) +static void break_cow(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) - return; + goto out; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) - return; + goto out; break_ksm(vma, addr); -} - -static void break_cow(struct mm_struct *mm, unsigned long addr) -{ - down_read(&mm->mmap_sem); - __break_cow(mm, addr); +out: up_read(&mm->mmap_sem); } @@ -330,6 +381,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) struct page *page; down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) goto out; @@ -382,6 +435,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) &next_item->node, &root_stable_tree); next_item->address |= NODE_FLAG; + ksm_pages_sharing--; } else { rb_erase(&rmap_item->node, &root_stable_tree); ksm_pages_shared--; @@ -395,29 +449,25 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) BUG_ON(next_item->prev != rmap_item); next_item->prev = rmap_item->prev; } + ksm_pages_sharing--; } rmap_item->next = NULL; - ksm_pages_sharing--; } else if (rmap_item->address & NODE_FLAG) { unsigned char age; /* - * ksm_thread can and must skip the rb_erase, because + * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. - * But __ksm_exit has to be careful: do the rb_erase - * if it's interrupting a scan, and this rmap_item was - * inserted by this scan rather than left from before. - * - * Because of the case in which remove_mm_from_lists - * increments seqnr before removing rmaps, unstable_nr - * may even be 2 behind seqnr, but should never be - * further behind. Yes, I did have trouble with this! + * But be careful when an mm is exiting: do the rb_erase + * if this rmap_item was inserted by this scan, rather + * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); - BUG_ON(age > 2); + BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, &root_unstable_tree); + ksm_pages_unshared--; } rmap_item->address &= PAGE_MASK; @@ -425,17 +475,6 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) cond_resched(); /* we're called from many long loops */ } -static void remove_all_slot_rmap_items(struct mm_slot *mm_slot) -{ - struct rmap_item *rmap_item, *node; - - list_for_each_entry_safe(rmap_item, node, &mm_slot->rmap_list, link) { - remove_rmap_item_from_tree(rmap_item); - list_del(&rmap_item->link); - free_rmap_item(rmap_item); - } -} - static void remove_trailing_rmap_items(struct mm_slot *mm_slot, struct list_head *cur) { @@ -457,72 +496,91 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. */ -static void unmerge_ksm_pages(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static int unmerge_ksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { unsigned long addr; + int err = 0; - for (addr = start; addr < end; addr += PAGE_SIZE) - break_ksm(vma, addr); + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (ksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; } -static void unmerge_and_remove_all_rmap_items(void) +#ifdef CONFIG_SYSFS +/* + * Only called through the sysfs control interface: + */ +static int unmerge_and_remove_all_rmap_items(void) { struct mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; + int err = 0; + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, + struct mm_slot, mm_list); + spin_unlock(&ksm_mmlist_lock); - list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) { + for (mm_slot = ksm_scan.mm_slot; + mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { mm = mm_slot->mm; down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_test_exit(mm)) + break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; - unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end); + err = unmerge_ksm_pages(vma, + vma->vm_start, vma->vm_end); + if (err) + goto error; } - remove_all_slot_rmap_items(mm_slot); - up_read(&mm->mmap_sem); - } - spin_lock(&ksm_mmlist_lock); - if (ksm_scan.mm_slot != &ksm_mm_head) { - ksm_scan.mm_slot = &ksm_mm_head; - ksm_scan.seqnr++; + remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_test_exit(mm)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } } - spin_unlock(&ksm_mmlist_lock); -} -static void remove_mm_from_lists(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; + ksm_scan.seqnr = 0; + return 0; +error: + up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); - mm_slot = get_mm_slot(mm); - - /* - * This mm_slot is always at the scanning cursor when we're - * called from scan_get_next_rmap_item; but it's a special - * case when we're called from __ksm_exit. - */ - if (ksm_scan.mm_slot == mm_slot) { - ksm_scan.mm_slot = list_entry( - mm_slot->mm_list.next, struct mm_slot, mm_list); - ksm_scan.address = 0; - ksm_scan.rmap_item = list_entry( - &ksm_scan.mm_slot->rmap_list, struct rmap_item, link); - if (ksm_scan.mm_slot == &ksm_mm_head) - ksm_scan.seqnr++; - } - - hlist_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); - - remove_all_slot_rmap_items(mm_slot); - free_mm_slot(mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); + return err; } +#endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { @@ -726,6 +784,32 @@ out: } /* + * try_to_merge_with_ksm_page - like try_to_merge_two_pages, + * but no new kernel page is allocated: kpage must already be a ksm page. + */ +static int try_to_merge_with_ksm_page(struct mm_struct *mm1, + unsigned long addr1, + struct page *page1, + struct page *kpage) +{ + struct vm_area_struct *vma; + int err = -EFAULT; + + down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) + goto out; + + vma = find_vma(mm1, addr1); + if (!vma || vma->vm_start > addr1) + goto out; + + err = try_to_merge_one_page(vma, page1, kpage); +out: + up_read(&mm1->mmap_sem); + return err; +} + +/* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * @@ -756,11 +840,14 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, return err; down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) { + up_read(&mm1->mmap_sem); + goto out; + } vma = find_vma(mm1, addr1); if (!vma || vma->vm_start > addr1) { - put_page(kpage); up_read(&mm1->mmap_sem); - return err; + goto out; } copy_user_highpage(kpage, page1, addr1, vma); @@ -768,61 +855,20 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, up_read(&mm1->mmap_sem); if (!err) { - down_read(&mm2->mmap_sem); - vma = find_vma(mm2, addr2); - if (!vma || vma->vm_start > addr2) { - put_page(kpage); - up_read(&mm2->mmap_sem); - break_cow(mm1, addr1); - return -EFAULT; - } - - err = try_to_merge_one_page(vma, page2, kpage); - up_read(&mm2->mmap_sem); - + err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); /* - * If the second try_to_merge_one_page failed, we have a - * ksm page with just one pte pointing to it, so break it. + * If that fails, we have a ksm page with only one pte + * pointing to it: so break it. */ if (err) break_cow(mm1, addr1); - else - ksm_pages_sharing += 2; } - +out: put_page(kpage); return err; } /* - * try_to_merge_with_ksm_page - like try_to_merge_two_pages, - * but no new kernel page is allocated: kpage must already be a ksm page. - */ -static int try_to_merge_with_ksm_page(struct mm_struct *mm1, - unsigned long addr1, - struct page *page1, - struct page *kpage) -{ - struct vm_area_struct *vma; - int err = -EFAULT; - - down_read(&mm1->mmap_sem); - vma = find_vma(mm1, addr1); - if (!vma || vma->vm_start > addr1) { - up_read(&mm1->mmap_sem); - return err; - } - - err = try_to_merge_one_page(vma, page1, kpage); - up_read(&mm1->mmap_sem); - - if (!err) - ksm_pages_sharing++; - - return err; -} - -/* * stable_tree_search - search page inside the stable tree * @page: the page that we are searching identical pages to. * @page2: pointer into identical page that we are holding inside the stable @@ -928,13 +974,12 @@ static struct rmap_item *stable_tree_insert(struct page *page, } } - ksm_pages_shared++; - rmap_item->address |= NODE_FLAG | STABLE_FLAG; rmap_item->next = NULL; rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, &root_stable_tree); + ksm_pages_shared++; return rmap_item; } @@ -967,6 +1012,7 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, struct rmap_item *tree_rmap_item; int ret; + cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); page2[0] = get_mergeable_page(tree_rmap_item); if (!page2[0]) @@ -1000,6 +1046,7 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, &root_unstable_tree); + ksm_pages_unshared++; return NULL; } @@ -1019,13 +1066,15 @@ static void stable_tree_append(struct rmap_item *rmap_item, tree_rmap_item->next = rmap_item; rmap_item->address |= STABLE_FLAG; + + ksm_pages_sharing++; } /* - * cmp_and_merge_page - take a page computes its hash value and check if there - * is similar hash value to different page, - * in case we find that there is similar hash to different page we call to - * try_to_merge_two_pages(). + * cmp_and_merge_page - first see if page can be merged into the stable tree; + * if not, compare checksum to previous and if it's the same, see if page can + * be inserted into the unstable tree, or merged with a page already there and + * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page @@ -1043,10 +1092,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) /* We first start with searching the page inside the stable tree */ tree_rmap_item = stable_tree_search(page, page2, rmap_item); if (tree_rmap_item) { - if (page == page2[0]) { /* forked */ - ksm_pages_sharing++; + if (page == page2[0]) /* forked */ err = 0; - } else + else err = try_to_merge_with_ksm_page(rmap_item->mm, rmap_item->address, page, page2[0]); @@ -1065,6 +1113,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) /* * A ksm page might have got here by fork, but its other * references have already been removed from the stable tree. + * Or it might be left over from a break_ksm which failed + * when the mem_cgroup had reached its limit: try again now. */ if (PageKsm(page)) break_cow(rmap_item->mm, rmap_item->address); @@ -1095,6 +1145,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) if (!err) { rb_erase(&tree_rmap_item->node, &root_unstable_tree); tree_rmap_item->address &= ~NODE_FLAG; + ksm_pages_unshared--; + /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing @@ -1107,7 +1159,6 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) break_cow(tree_rmap_item->mm, tree_rmap_item->address); break_cow(rmap_item->mm, rmap_item->address); - ksm_pages_sharing -= 2; } } @@ -1172,7 +1223,12 @@ next_mm: mm = slot->mm; down_read(&mm->mmap_sem); - for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) { + if (ksm_test_exit(mm)) + vma = NULL; + else + vma = find_vma(mm, ksm_scan.address); + + for (; vma; vma = vma->vm_next) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) @@ -1181,6 +1237,8 @@ next_mm: ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { + if (ksm_test_exit(mm)) + break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); if (*page && PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); @@ -1203,40 +1261,48 @@ next_mm: } } - if (!ksm_scan.address) { - /* - * We've completed a full scan of all vmas, holding mmap_sem - * throughout, and found no VM_MERGEABLE: so do the same as - * __ksm_exit does to remove this mm from all our lists now. - */ - remove_mm_from_lists(mm); - up_read(&mm->mmap_sem); - slot = ksm_scan.mm_slot; - if (slot != &ksm_mm_head) - goto next_mm; - return NULL; + if (ksm_test_exit(mm)) { + ksm_scan.address = 0; + ksm_scan.rmap_item = list_entry(&slot->rmap_list, + struct rmap_item, link); } - /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); - up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); - ksm_scan.mm_slot = slot; - spin_unlock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_scan.address == 0) { + /* + * We've completed a full scan of all vmas, holding mmap_sem + * throughout, and found no VM_MERGEABLE: so do the same as + * __ksm_exit does to remove this mm from all our lists now. + * This applies either when cleaning up after __ksm_exit + * (but beware: we can reach here even before __ksm_exit), + * or when all VM_MERGEABLE areas have been unmapped (and + * mmap_sem then protects against race with MADV_MERGEABLE). + */ + hlist_del(&slot->link); + list_del(&slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } /* Repeat until we've completed scanning the whole list */ + slot = ksm_scan.mm_slot; if (slot != &ksm_mm_head) goto next_mm; - /* - * Bump seqnr here rather than at top, so that __ksm_exit - * can skip rb_erase on unstable tree until we run again. - */ ksm_scan.seqnr++; return NULL; } @@ -1257,25 +1323,39 @@ static void ksm_do_scan(unsigned int scan_npages) return; if (!PageKsm(page) || !in_stable_tree(rmap_item)) cmp_and_merge_page(page, rmap_item); + else if (page_mapcount(page) == 1) { + /* + * Replace now-unshared ksm page by ordinary page. + */ + break_cow(rmap_item->mm, rmap_item->address); + remove_rmap_item_from_tree(rmap_item); + rmap_item->oldchecksum = calc_checksum(page); + } put_page(page); } } +static int ksmd_should_run(void) +{ + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); +} + static int ksm_scan_thread(void *nothing) { set_user_nice(current, 5); while (!kthread_should_stop()) { - if (ksm_run & KSM_RUN_MERGE) { - mutex_lock(&ksm_thread_mutex); + mutex_lock(&ksm_thread_mutex); + if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); - mutex_unlock(&ksm_thread_mutex); + mutex_unlock(&ksm_thread_mutex); + + if (ksmd_should_run()) { schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { wait_event_interruptible(ksm_thread_wait, - (ksm_run & KSM_RUN_MERGE) || - kthread_should_stop()); + ksmd_should_run() || kthread_should_stop()); } } return 0; @@ -1285,6 +1365,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; + int err; switch (advice) { case MADV_MERGEABLE: @@ -1297,9 +1378,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, VM_MIXEDMAP | VM_SAO)) return 0; /* just ignore the advice */ - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) - if (__ksm_enter(mm) < 0) - return -EAGAIN; + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } *vm_flags |= VM_MERGEABLE; break; @@ -1308,8 +1391,11 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ - if (vma->anon_vma) - unmerge_ksm_pages(vma, start, end); + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } *vm_flags &= ~VM_MERGEABLE; break; @@ -1320,10 +1406,16 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, int __ksm_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot = alloc_mm_slot(); + struct mm_slot *mm_slot; + int needs_wakeup; + + mm_slot = alloc_mm_slot(); if (!mm_slot) return -ENOMEM; + /* Check ksm_run too? Would need tighter locking */ + needs_wakeup = list_empty(&ksm_mm_head.mm_list); + spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); /* @@ -1335,21 +1427,57 @@ int __ksm_enter(struct mm_struct *mm) spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); + atomic_inc(&mm->mm_count); + + if (needs_wakeup) + wake_up_interruptible(&ksm_thread_wait); + return 0; } void __ksm_exit(struct mm_struct *mm) { + struct mm_slot *mm_slot; + int easy_to_free = 0; + /* - * This process is exiting: doesn't hold and doesn't need mmap_sem; - * but we do need to exclude ksmd and other exiters while we modify - * the various lists and trees. + * This process is exiting: if it's straightforward (as is the + * case when ksmd was never running), free mm_slot immediately. + * But if it's at the cursor or has rmap_items linked to it, use + * mmap_sem to synchronize with any break_cows before pagetables + * are freed, and leave the mm_slot on the list for ksmd to free. + * Beware: ksm may already have noticed it exiting and freed the slot. */ - mutex_lock(&ksm_thread_mutex); - remove_mm_from_lists(mm); - mutex_unlock(&ksm_thread_mutex); + + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && ksm_scan.mm_slot != mm_slot) { + if (list_empty(&mm_slot->rmap_list)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + easy_to_free = 1; + } else { + list_move(&mm_slot->mm_list, + &ksm_scan.mm_slot->mm_list); + } + } + spin_unlock(&ksm_mmlist_lock); + + if (easy_to_free) { + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mmdrop(mm); + } else if (mm_slot) { + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } } +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. + */ + #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ @@ -1430,8 +1558,15 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&ksm_thread_mutex); if (ksm_run != flags) { ksm_run = flags; - if (flags & KSM_RUN_UNMERGE) - unmerge_and_remove_all_rmap_items(); + if (flags & KSM_RUN_UNMERGE) { + current->flags |= PF_OOM_ORIGIN; + err = unmerge_and_remove_all_rmap_items(); + current->flags &= ~PF_OOM_ORIGIN; + if (err) { + ksm_run = KSM_RUN_STOP; + count = err; + } + } } mutex_unlock(&ksm_thread_mutex); @@ -1475,11 +1610,41 @@ KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", - ksm_pages_sharing - ksm_pages_shared); + return sprintf(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_unshared); +} +KSM_ATTR_RO(pages_unshared); + +static ssize_t pages_volatile_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + long ksm_pages_volatile; + + ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared + - ksm_pages_sharing - ksm_pages_unshared; + /* + * It was not worth any locking to calculate that statistic, + * but it might therefore sometimes be negative: conceal that. + */ + if (ksm_pages_volatile < 0) + ksm_pages_volatile = 0; + return sprintf(buf, "%ld\n", ksm_pages_volatile); +} +KSM_ATTR_RO(pages_volatile); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_scan.seqnr); +} +KSM_ATTR_RO(full_scans); + static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, @@ -1487,6 +1652,9 @@ static struct attribute *ksm_attrs[] = { &max_kernel_pages_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, + &full_scans_attr.attr, NULL, }; @@ -1494,12 +1662,15 @@ static struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; +#endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; + ksm_max_kernel_pages = totalram_pages / 4; + err = ksm_slab_init(); if (err) goto out; @@ -1515,16 +1686,20 @@ static int __init ksm_init(void) goto out_free2; } +#ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { printk(KERN_ERR "ksm: register sysfs failed\n"); - goto out_free3; + kthread_stop(ksm_thread); + goto out_free2; } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + +#endif /* CONFIG_SYSFS */ return 0; -out_free3: - kthread_stop(ksm_thread); out_free2: mm_slots_hash_free(); out_free1: