X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fksm.c;h=f7edac356f465275031110db70c1e57aafbc5cda;hb=2c6854fdadf940678fd54779b778f6faafb870bb;hp=d9e3cfcc150c4fd8fc13fa34bffc024925011c26;hpb=d952b79136a6c32a3f97e0628ca78340f1d5c6f9;p=safe%2Fjmp%2Flinux-2.6

diff --git a/mm/ksm.c b/mm/ksm.c
index d9e3cfc..f7edac3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/mmu_notifier.h>
+#include <linux/swap.h>
 #include <linux/ksm.h>
 
 #include <asm/tlbflush.h>
@@ -165,15 +166,15 @@ static unsigned long ksm_rmap_items;
 static unsigned long ksm_max_kernel_pages;
 
 /* Number of pages ksmd should scan in one batch */
-static unsigned int ksm_thread_pages_to_scan;
+static unsigned int ksm_thread_pages_to_scan = 100;
 
 /* Milliseconds ksmd should sleep between batches */
-static unsigned int ksm_thread_sleep_millisecs;
+static unsigned int ksm_thread_sleep_millisecs = 20;
 
 #define KSM_RUN_STOP	0
 #define KSM_RUN_MERGE	1
 #define KSM_RUN_UNMERGE	2
-static unsigned int ksm_run;
+static unsigned int ksm_run = KSM_RUN_STOP;
 
 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
@@ -183,6 +184,11 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL)
 
+static void __init ksm_init_max_kernel_pages(void)
+{
+	ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
+}
+
 static int __init ksm_slab_init(void)
 {
 	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -284,6 +290,19 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
 }
 
 /*
+ * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
+ * page tables after it has passed through ksm_exit() - which, if necessary,
+ * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
+ * a special flag: they can just back out as soon as mm_users goes to zero.
+ * ksm_test_exit() is used throughout to make this test for exit: in some
+ * places for correctness, in some places just to avoid unnecessary work.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+	return atomic_read(&mm->mm_users) == 0;
+}
+
+/*
  * We use break_ksm to break COW on a ksm page: it's a stripped down
  *
  *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
@@ -347,6 +366,8 @@ static void break_cow(struct mm_struct *mm, unsigned long addr)
 	struct vm_area_struct *vma;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -365,6 +386,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 	struct page *page;
 
 	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
 	vma = find_vma(mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
@@ -439,19 +462,14 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 	} else if (rmap_item->address & NODE_FLAG) {
 		unsigned char age;
 		/*
-		 * ksm_thread can and must skip the rb_erase, because
+		 * Usually ksmd can and must skip the rb_erase, because
 		 * root_unstable_tree was already reset to RB_ROOT.
-		 * But __ksm_exit has to be careful: do the rb_erase
-		 * if it's interrupting a scan, and this rmap_item was
-		 * inserted by this scan rather than left from before.
-		 *
-		 * Because of the case in which remove_mm_from_lists
-		 * increments seqnr before removing rmaps, unstable_nr
-		 * may even be 2 behind seqnr, but should never be
-		 * further behind. Yes, I did have trouble with this!
+		 * But be careful when an mm is exiting: do the rb_erase
+		 * if this rmap_item was inserted by this scan, rather
+		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
-		BUG_ON(age > 2);
+		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node, &root_unstable_tree);
		ksm_pages_unshared--;
@@ -496,6 +514,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
	int err = 0;
 
	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+		if (ksm_test_exit(vma->vm_mm))
+			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
@@ -504,6 +524,10 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
	return err;
 }
 
+#ifdef CONFIG_SYSFS
+/*
+ * Only called through the sysfs control interface:
+ */
 static int unmerge_and_remove_all_rmap_items(void)
 {
	struct mm_slot *mm_slot;
@@ -512,70 +536,56 @@ static int unmerge_and_remove_all_rmap_items(void)
	int err = 0;
 
	spin_lock(&ksm_mmlist_lock);
-	mm_slot = list_entry(ksm_mm_head.mm_list.next,
+	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);
 
-	while (mm_slot != &ksm_mm_head) {
+	for (mm_slot = ksm_scan.mm_slot;
+			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (ksm_test_exit(mm))
+				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
-			if (err) {
-				up_read(&mm->mmap_sem);
-				goto out;
-			}
+			if (err)
+				goto error;
		}
+
		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-		up_read(&mm->mmap_sem);
 
		spin_lock(&ksm_mmlist_lock);
-		mm_slot = list_entry(mm_slot->mm_list.next,
+		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
-		spin_unlock(&ksm_mmlist_lock);
+		if (ksm_test_exit(mm)) {
+			hlist_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			spin_unlock(&ksm_mmlist_lock);
+
+			free_mm_slot(mm_slot);
+			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+			up_read(&mm->mmap_sem);
+			mmdrop(mm);
+		} else {
+			spin_unlock(&ksm_mmlist_lock);
+			up_read(&mm->mmap_sem);
+		}
	}
 
	ksm_scan.seqnr = 0;
-out:
+	return 0;
+
+error:
+	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
 }
-
-static void remove_mm_from_lists(struct mm_struct *mm)
-{
-	struct mm_slot *mm_slot;
-
-	spin_lock(&ksm_mmlist_lock);
-	mm_slot = get_mm_slot(mm);
-
-	/*
-	 * This mm_slot is always at the scanning cursor when we're
-	 * called from scan_get_next_rmap_item; but it's a special
-	 * case when we're called from __ksm_exit.
-	 */
-	if (ksm_scan.mm_slot == mm_slot) {
-		ksm_scan.mm_slot = list_entry(
-			mm_slot->mm_list.next, struct mm_slot, mm_list);
-		ksm_scan.address = 0;
-		ksm_scan.rmap_item = list_entry(
-			&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
-		if (ksm_scan.mm_slot == &ksm_mm_head)
-			ksm_scan.seqnr++;
-	}
-
-	hlist_del(&mm_slot->link);
-	list_del(&mm_slot->mm_list);
-	spin_unlock(&ksm_mmlist_lock);
-
-	remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
-	free_mm_slot(mm_slot);
-	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
-}
+#endif /* CONFIG_SYSFS */
 
 static u32 calc_checksum(struct page *page)
 {
@@ -791,6 +801,9 @@ static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
	int err = -EFAULT;
 
	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1))
+		goto out;
+
	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1)
		goto out;
@@ -832,6 +845,10 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
		return err;
 
	down_read(&mm1->mmap_sem);
+	if (ksm_test_exit(mm1)) {
+		up_read(&mm1->mmap_sem);
+		goto out;
+	}
	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1) {
		up_read(&mm1->mmap_sem);
@@ -1210,7 +1227,12 @@ next_mm:
 
	mm = slot->mm;
	down_read(&mm->mmap_sem);
-	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
+	if (ksm_test_exit(mm))
+		vma = NULL;
+	else
+		vma = find_vma(mm, ksm_scan.address);
+
+	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
@@ -1219,6 +1241,8 @@ next_mm:
			ksm_scan.address = vma->vm_end;
 
		while (ksm_scan.address < vma->vm_end) {
+			if (ksm_test_exit(mm))
+				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (*page && PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
@@ -1241,40 +1265,48 @@ next_mm:
		}
	}
 
-	if (!ksm_scan.address) {
-		/*
-		 * We've completed a full scan of all vmas, holding mmap_sem
-		 * throughout, and found no VM_MERGEABLE: so do the same as
-		 * __ksm_exit does to remove this mm from all our lists now.
-		 */
-		remove_mm_from_lists(mm);
-		up_read(&mm->mmap_sem);
-		slot = ksm_scan.mm_slot;
-		if (slot != &ksm_mm_head)
-			goto next_mm;
-		return NULL;
+	if (ksm_test_exit(mm)) {
+		ksm_scan.address = 0;
+		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}
-
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
-	up_read(&mm->mmap_sem);
 
	spin_lock(&ksm_mmlist_lock);
-	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
-	ksm_scan.mm_slot = slot;
-	spin_unlock(&ksm_mmlist_lock);
+	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
+						struct mm_slot, mm_list);
+	if (ksm_scan.address == 0) {
+		/*
+		 * We've completed a full scan of all vmas, holding mmap_sem
+		 * throughout, and found no VM_MERGEABLE: so do the same as
+		 * __ksm_exit does to remove this mm from all our lists now.
+		 * This applies either when cleaning up after __ksm_exit
+		 * (but beware: we can reach here even before __ksm_exit),
+		 * or when all VM_MERGEABLE areas have been unmapped (and
+		 * mmap_sem then protects against race with MADV_MERGEABLE).
+ */ + hlist_del(&slot->link); + list_del(&slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } /* Repeat until we've completed scanning the whole list */ + slot = ksm_scan.mm_slot; if (slot != &ksm_mm_head) goto next_mm; - /* - * Bump seqnr here rather than at top, so that __ksm_exit - * can skip rb_erase on unstable tree until we run again. - */ ksm_scan.seqnr++; return NULL; } @@ -1399,6 +1431,7 @@ int __ksm_enter(struct mm_struct *mm) spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); + atomic_inc(&mm->mm_count); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); @@ -1408,16 +1441,47 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { + struct mm_slot *mm_slot; + int easy_to_free = 0; + /* - * This process is exiting: doesn't hold and doesn't need mmap_sem; - * but we do need to exclude ksmd and other exiters while we modify - * the various lists and trees. + * This process is exiting: if it's straightforward (as is the + * case when ksmd was never running), free mm_slot immediately. + * But if it's at the cursor or has rmap_items linked to it, use + * mmap_sem to synchronize with any break_cows before pagetables + * are freed, and leave the mm_slot on the list for ksmd to free. + * Beware: ksm may already have noticed it exiting and freed the slot. */ - mutex_lock(&ksm_thread_mutex); - remove_mm_from_lists(mm); - mutex_unlock(&ksm_thread_mutex); + + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && ksm_scan.mm_slot != mm_slot) { + if (list_empty(&mm_slot->rmap_list)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + easy_to_free = 1; + } else { + list_move(&mm_slot->mm_list, + &ksm_scan.mm_slot->mm_list); + } + } + spin_unlock(&ksm_mmlist_lock); + + if (easy_to_free) { + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mmdrop(mm); + } else if (mm_slot) { + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } } +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. + */ + #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ @@ -1499,7 +1563,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { + current->flags |= PF_OOM_ORIGIN; err = unmerge_and_remove_all_rmap_items(); + current->flags &= ~PF_OOM_ORIGIN; if (err) { ksm_run = KSM_RUN_STOP; count = err; @@ -1600,12 +1666,15 @@ static struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; +#endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; + ksm_init_max_kernel_pages(); + err = ksm_slab_init(); if (err) goto out; @@ -1621,16 +1690,17 @@ static int __init ksm_init(void) goto out_free2; } +#ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { printk(KERN_ERR "ksm: register sysfs failed\n"); - goto out_free3; + kthread_stop(ksm_thread); + goto out_free2; } +#endif /* CONFIG_SYSFS */ return 0; -out_free3: - kthread_stop(ksm_thread); out_free2: mm_slots_hash_free(); out_free1:
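
Usage note (illustrative, not part of the patch): the paths patched above are
driven from userspace by madvise(MADV_MERGEABLE), which sets VM_MERGEABLE and
enters the mm through __ksm_enter(), and by the sysfs files under
/sys/kernel/mm/ksm/ (run, sleep_millisecs, pages_to_scan, max_kernel_pages)
handled in this file. Below is a minimal sketch that exercises the merge and
break_cow() paths; the file name ksm_demo.c, the 64-page size and the
10-second wait are arbitrary choices for the example, and it assumes
CONFIG_KSM=y with ksmd started via "echo 1 > /sys/kernel/mm/ksm/run".

/* ksm_demo.c - illustrative sketch, not part of the patch.
 * Build: gcc -O2 -o ksm_demo ksm_demo.c
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	const size_t len = 64 * 4096;	/* 64 pages; assumes 4K pages */
	char *a, *b;

	a = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	b = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (a == MAP_FAILED || b == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Identical contents make every page a merge candidate */
	memset(a, 0x5a, len);
	memset(b, 0x5a, len);

	/* Sets VM_MERGEABLE; fails with EINVAL if CONFIG_KSM is off */
	if (madvise(a, len, MADV_MERGEABLE) ||
	    madvise(b, len, MADV_MERGEABLE)) {
		perror("madvise(MADV_MERGEABLE)");
		return 1;
	}

	/* Give ksmd a few batches (pages_to_scan pages every
	 * sleep_millisecs), then inspect the merge count via
	 * /sys/kernel/mm/ksm/pages_sharing from a shell. */
	sleep(10);

	/* Writing to a merged KSM page goes through break_cow() above */
	a[0] = 0;

	/* On process exit, ksm_exit() above unhooks this mm from ksmd */
	return 0;
}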