1 /*
2  * Memory merging support.
3  *
4  * This code enables dynamic sharing of identical pages found in different
5  * memory areas, even if they are not shared by fork()
6  *
7  * Copyright (C) 2008 Red Hat, Inc.
8  * Authors:
9  *      Izik Eidus
10  *      Andrea Arcangeli
11  *      Chris Wright
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.
14  */
15
16 #include <linux/errno.h>
17 #include <linux/mm.h>
18 #include <linux/fs.h>
19 #include <linux/mman.h>
20 #include <linux/sched.h>
21 #include <linux/rwsem.h>
22 #include <linux/pagemap.h>
23 #include <linux/rmap.h>
24 #include <linux/spinlock.h>
25 #include <linux/jhash.h>
26 #include <linux/delay.h>
27 #include <linux/kthread.h>
28 #include <linux/wait.h>
29 #include <linux/slab.h>
30 #include <linux/rbtree.h>
31 #include <linux/mmu_notifier.h>
32 #include <linux/ksm.h>
33
34 #include <asm/tlbflush.h>
35
36 /*
37  * A few notes about the KSM scanning process,
38  * to make it easier to understand the data structures below:
39  *
40  * In order to reduce excessive scanning, KSM sorts the memory pages by their
41  * contents into a data structure that holds pointers to the pages' locations.
42  *
43  * Since the contents of the pages may change at any moment, KSM cannot just
44  * insert the pages into a normal sorted tree and expect it to find anything.
45  * Therefore KSM uses two data structures - the stable and the unstable tree.
46  *
47  * The stable tree holds pointers to all the merged pages (ksm pages), sorted
48  * by their contents.  Because each such page is write-protected, searching on
49  * this tree is guaranteed to work correctly (except when pages are unmapped),
50  * and therefore this tree is called the stable tree.
51  *
52  * In addition to the stable tree, KSM uses a second data structure called the
53  * unstable tree: this tree holds pointers to pages which have been found to
54  * be "unchanged for a period of time".  The unstable tree sorts these pages
55  * by their contents, but since they are not write-protected, KSM cannot rely
56  * upon the unstable tree to work correctly - the unstable tree is liable to
57  * be corrupted as its contents are modified, and so it is called unstable.
58  *
59  * KSM solves this problem by several techniques:
60  *
61  * 1) The unstable tree is flushed every time KSM completes scanning all
62  *    memory areas, and then the tree is rebuilt again from the beginning.
63  * 2) KSM will only insert into the unstable tree pages whose hash value
64  *    has not changed since the previous scan of all memory areas.
65  * 3) The unstable tree is a red-black tree - so its balancing is based on the
66  *    colors of the nodes and not on their contents, ensuring that even when
67  *    the tree gets "corrupted" it won't get out of balance, so scanning time
68  *    remains the same (also, searching and inserting nodes in an rbtree uses
69  *    the same algorithm, so we have no overhead when we flush and rebuild).
70  * 4) KSM never flushes the stable tree, which means that even if it were to
71  *    take 10 attempts to find a page in the unstable tree, once it is found,
72  *    it is secured in the stable tree.  (When we scan a new page, we first
73  *    compare it against the stable tree, and then against the unstable tree.)
74  */
75
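/*
 * So, for each page that ksmd scans, the flow in cmp_and_merge_page() is
 * roughly: search the stable tree, and if an identical ksm page is found,
 * merge into it and append this rmap_item to that node's list; otherwise,
 * if the page's checksum changed since the last scan, just record the new
 * checksum and move on; otherwise search the unstable tree, and if an
 * identical page is found there, merge both into a new ksm page and move
 * that node over to the stable tree, else insert this page into the
 * unstable tree.
 */
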
76 /**
77  * struct mm_slot - ksm information per mm that is being scanned
78  * @link: link to the mm_slots hash list
79  * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
80  * @rmap_list: head for this mm_slot's list of rmap_items
81  * @mm: the mm that this information is valid for
82  */
83 struct mm_slot {
84         struct hlist_node link;
85         struct list_head mm_list;
86         struct list_head rmap_list;
87         struct mm_struct *mm;
88 };
89
90 /**
91  * struct ksm_scan - cursor for scanning
92  * @mm_slot: the current mm_slot we are scanning
93  * @address: the next address inside that to be scanned
94  * @rmap_item: the current rmap that we are scanning inside the rmap_list
95  * @seqnr: count of completed full scans (needed when removing unstable node)
96  *
97  * There is only the one ksm_scan instance of this cursor structure.
98  */
99 struct ksm_scan {
100         struct mm_slot *mm_slot;
101         unsigned long address;
102         struct rmap_item *rmap_item;
103         unsigned long seqnr;
104 };
105
106 /**
107  * struct rmap_item - reverse mapping item for virtual addresses
108  * @link: link into mm_slot's rmap_list (rmap_list is per mm)
109  * @mm: the memory structure this rmap_item is pointing into
110  * @address: the virtual address this rmap_item tracks (+ flags in low bits)
111  * @oldchecksum: previous checksum of the page at that virtual address
112  * @node: rb_node of this rmap_item in either unstable or stable tree
113  * @next: next rmap_item hanging off the same node of the stable tree
114  * @prev: previous rmap_item hanging off the same node of the stable tree
115  */
116 struct rmap_item {
117         struct list_head link;
118         struct mm_struct *mm;
119         unsigned long address;          /* + low bits used for flags below */
120         union {
121                 unsigned int oldchecksum;               /* when unstable */
122                 struct rmap_item *next;                 /* when stable */
123         };
124         union {
125                 struct rb_node node;                    /* when tree node */
126                 struct rmap_item *prev;                 /* in stable list */
127         };
128 };
129
130 #define SEQNR_MASK      0x0ff   /* low bits of unstable tree seqnr */
131 #define NODE_FLAG       0x100   /* is a node of unstable or stable tree */
132 #define STABLE_FLAG     0x200   /* is a node or list item of stable tree */
133
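/*
 * rmap_item->address is page aligned, so its low bits are free to hold
 * the flags above, plus (while on the unstable tree) the low bits of the
 * seqnr of the scan which inserted it; remove_rmap_item_from_tree()
 * clears them all with PAGE_MASK.
 */
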
134 /* The stable and unstable tree heads */
135 static struct rb_root root_stable_tree = RB_ROOT;
136 static struct rb_root root_unstable_tree = RB_ROOT;
137
138 #define MM_SLOTS_HASH_HEADS 1024
139 static struct hlist_head *mm_slots_hash;
140
141 static struct mm_slot ksm_mm_head = {
142         .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
143 };
144 static struct ksm_scan ksm_scan = {
145         .mm_slot = &ksm_mm_head,
146 };
147
148 static struct kmem_cache *rmap_item_cache;
149 static struct kmem_cache *mm_slot_cache;
150
151 /* The number of nodes in the stable tree */
152 static unsigned long ksm_kernel_pages_allocated;
153
154 /* The number of page slots sharing those nodes */
155 static unsigned long ksm_pages_shared;
156
157 /* Limit on the number of unswappable pages used */
158 static unsigned long ksm_max_kernel_pages;
159
160 /* Number of pages ksmd should scan in one batch */
161 static unsigned int ksm_thread_pages_to_scan;
162
163 /* Milliseconds ksmd should sleep between batches */
164 static unsigned int ksm_thread_sleep_millisecs;
165
166 #define KSM_RUN_STOP    0
167 #define KSM_RUN_MERGE   1
168 #define KSM_RUN_UNMERGE 2
169 static unsigned int ksm_run;
170
171 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
172 static DEFINE_MUTEX(ksm_thread_mutex);
173 static DEFINE_SPINLOCK(ksm_mmlist_lock);
174
175 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
176                 sizeof(struct __struct), __alignof__(struct __struct),\
177                 (__flags), NULL)
178
179 static int __init ksm_slab_init(void)
180 {
181         rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
182         if (!rmap_item_cache)
183                 goto out;
184
185         mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
186         if (!mm_slot_cache)
187                 goto out_free;
188
189         return 0;
190
191 out_free:
192         kmem_cache_destroy(rmap_item_cache);
193 out:
194         return -ENOMEM;
195 }
196
197 static void __init ksm_slab_free(void)
198 {
199         kmem_cache_destroy(mm_slot_cache);
200         kmem_cache_destroy(rmap_item_cache);
201         mm_slot_cache = NULL;
202 }
203
204 static inline struct rmap_item *alloc_rmap_item(void)
205 {
206         return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
207 }
208
209 static inline void free_rmap_item(struct rmap_item *rmap_item)
210 {
211         rmap_item->mm = NULL;   /* debug safety */
212         kmem_cache_free(rmap_item_cache, rmap_item);
213 }
214
215 static inline struct mm_slot *alloc_mm_slot(void)
216 {
217         if (!mm_slot_cache)     /* initialization failed */
218                 return NULL;
219         return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
220 }
221
222 static inline void free_mm_slot(struct mm_slot *mm_slot)
223 {
224         kmem_cache_free(mm_slot_cache, mm_slot);
225 }
226
227 static int __init mm_slots_hash_init(void)
228 {
229         mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
230                                 GFP_KERNEL);
231         if (!mm_slots_hash)
232                 return -ENOMEM;
233         return 0;
234 }
235
236 static void __init mm_slots_hash_free(void)
237 {
238         kfree(mm_slots_hash);
239 }
240
241 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
242 {
243         struct mm_slot *mm_slot;
244         struct hlist_head *bucket;
245         struct hlist_node *node;
246
247         bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
248                                 % MM_SLOTS_HASH_HEADS];
249         hlist_for_each_entry(mm_slot, node, bucket, link) {
250                 if (mm == mm_slot->mm)
251                         return mm_slot;
252         }
253         return NULL;
254 }
255
256 static void insert_to_mm_slots_hash(struct mm_struct *mm,
257                                     struct mm_slot *mm_slot)
258 {
259         struct hlist_head *bucket;
260
261         bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
262                                 % MM_SLOTS_HASH_HEADS];
263         mm_slot->mm = mm;
264         INIT_LIST_HEAD(&mm_slot->rmap_list);
265         hlist_add_head(&mm_slot->link, bucket);
266 }
267
268 static inline int in_stable_tree(struct rmap_item *rmap_item)
269 {
270         return rmap_item->address & STABLE_FLAG;
271 }
272
273 /*
274  * We use break_ksm to break COW on a ksm page: it's a stripped down
275  *
276  *      if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
277  *              put_page(page);
278  *
279  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
280  * in case the application has unmapped and remapped mm,addr meanwhile.
281  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
282  * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
283  */
284 static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
285 {
286         struct page *page;
287         int ret;
288
289         do {
290                 cond_resched();
291                 page = follow_page(vma, addr, FOLL_GET);
292                 if (!page)
293                         break;
294                 if (PageKsm(page))
295                         ret = handle_mm_fault(vma->vm_mm, vma, addr,
296                                                         FAULT_FLAG_WRITE);
297                 else
298                         ret = VM_FAULT_WRITE;
299                 put_page(page);
300         } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS)));
301
302         /* Which leaves us looping there if VM_FAULT_OOM: hmmm... */
303 }
304
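/*
 * break_cow() is used when merging at one mm,addr needs to be undone
 * (e.g. when the second half of a merge failed, or a forked ksm page is
 * found with no stable node): it takes mmap_sem for read and re-checks
 * that the vma is still a mergeable anonymous vma before calling
 * break_ksm() on the address.
 */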
305 static void __break_cow(struct mm_struct *mm, unsigned long addr)
306 {
307         struct vm_area_struct *vma;
308
309         vma = find_vma(mm, addr);
310         if (!vma || vma->vm_start > addr)
311                 return;
312         if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
313                 return;
314         break_ksm(vma, addr);
315 }
316
317 static void break_cow(struct mm_struct *mm, unsigned long addr)
318 {
319         down_read(&mm->mmap_sem);
320         __break_cow(mm, addr);
321         up_read(&mm->mmap_sem);
322 }
323
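/*
 * get_mergeable_page: under mmap_sem, look up what is currently mapped at
 * rmap_item's mm,address.  Returns the anonymous page with a reference
 * held (the caller must put_page it), or NULL if the vma is no longer a
 * mergeable anonymous vma, or if what is mapped there is not anonymous.
 */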
324 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
325 {
326         struct mm_struct *mm = rmap_item->mm;
327         unsigned long addr = rmap_item->address;
328         struct vm_area_struct *vma;
329         struct page *page;
330
331         down_read(&mm->mmap_sem);
332         vma = find_vma(mm, addr);
333         if (!vma || vma->vm_start > addr)
334                 goto out;
335         if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
336                 goto out;
337
338         page = follow_page(vma, addr, FOLL_GET);
339         if (!page)
340                 goto out;
341         if (PageAnon(page)) {
342                 flush_anon_page(vma, page, addr);
343                 flush_dcache_page(page);
344         } else {
345                 put_page(page);
346 out:            page = NULL;
347         }
348         up_read(&mm->mmap_sem);
349         return page;
350 }
351
352 /*
353  * get_ksm_page: checks if the page at the virtual address in rmap_item
354  * is still PageKsm, in which case we can trust the content of the page,
355  * and returns that page with its reference held; NULL if it has been zapped.
356  */
357 static struct page *get_ksm_page(struct rmap_item *rmap_item)
358 {
359         struct page *page;
360
361         page = get_mergeable_page(rmap_item);
362         if (page && !PageKsm(page)) {
363                 put_page(page);
364                 page = NULL;
365         }
366         return page;
367 }
368
369 /*
370  * Removing rmap_item from stable or unstable tree.
371  * This function will clean the information from the stable/unstable tree.
372  */
373 static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
374 {
375         if (in_stable_tree(rmap_item)) {
376                 struct rmap_item *next_item = rmap_item->next;
377
378                 if (rmap_item->address & NODE_FLAG) {
379                         if (next_item) {
380                                 rb_replace_node(&rmap_item->node,
381                                                 &next_item->node,
382                                                 &root_stable_tree);
383                                 next_item->address |= NODE_FLAG;
384                         } else {
385                                 rb_erase(&rmap_item->node, &root_stable_tree);
386                                 ksm_kernel_pages_allocated--;
387                         }
388                 } else {
389                         struct rmap_item *prev_item = rmap_item->prev;
390
391                         BUG_ON(prev_item->next != rmap_item);
392                         prev_item->next = next_item;
393                         if (next_item) {
394                                 BUG_ON(next_item->prev != rmap_item);
395                                 next_item->prev = rmap_item->prev;
396                         }
397                 }
398
399                 rmap_item->next = NULL;
400                 ksm_pages_shared--;
401
402         } else if (rmap_item->address & NODE_FLAG) {
403                 unsigned char age;
404                 /*
405                  * ksm_thread can and must skip the rb_erase, because
406                  * root_unstable_tree was already reset to RB_ROOT.
407                  * But __ksm_exit has to be careful: do the rb_erase
408                  * if it's interrupting a scan, and this rmap_item was
409                  * inserted by this scan rather than left from before.
410                  *
411                  * Because of the case in which remove_mm_from_lists
412                  * increments seqnr before removing rmaps, unstable_nr
413                  * may even be 2 behind seqnr, but should never be
414                  * further behind.  Yes, I did have trouble with this!
415                  */
416                 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
417                 BUG_ON(age > 2);
418                 if (!age)
419                         rb_erase(&rmap_item->node, &root_unstable_tree);
420         }
421
422         rmap_item->address &= PAGE_MASK;
423
424         cond_resched();         /* we're called from many long loops */
425 }
426
427 static void remove_all_slot_rmap_items(struct mm_slot *mm_slot)
428 {
429         struct rmap_item *rmap_item, *node;
430
431         list_for_each_entry_safe(rmap_item, node, &mm_slot->rmap_list, link) {
432                 remove_rmap_item_from_tree(rmap_item);
433                 list_del(&rmap_item->link);
434                 free_rmap_item(rmap_item);
435         }
436 }
437
438 static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
439                                        struct list_head *cur)
440 {
441         struct rmap_item *rmap_item;
442
443         while (cur != &mm_slot->rmap_list) {
444                 rmap_item = list_entry(cur, struct rmap_item, link);
445                 cur = cur->next;
446                 remove_rmap_item_from_tree(rmap_item);
447                 list_del(&rmap_item->link);
448                 free_rmap_item(rmap_item);
449         }
450 }
451
452 /*
453  * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
454  * than check every pte of a given vma, the locking doesn't quite work for
455  * that - an rmap_item is assigned to the stable tree after inserting ksm
456  * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
457  * rmap_items from parent to child at fork time (so as not to waste time
458  * if exit comes before the next scan reaches it).
459  */
460 static void unmerge_ksm_pages(struct vm_area_struct *vma,
461                               unsigned long start, unsigned long end)
462 {
463         unsigned long addr;
464
465         for (addr = start; addr < end; addr += PAGE_SIZE)
466                 break_ksm(vma, addr);
467 }
468
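/*
 * Called under ksm_thread_mutex when KSM_RUN_UNMERGE is requested: break
 * COW on every page of every mergeable vma in every registered mm, free
 * all of their rmap_items, and park the scan cursor back at ksm_mm_head.
 */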
469 static void unmerge_and_remove_all_rmap_items(void)
470 {
471         struct mm_slot *mm_slot;
472         struct mm_struct *mm;
473         struct vm_area_struct *vma;
474
475         list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) {
476                 mm = mm_slot->mm;
477                 down_read(&mm->mmap_sem);
478                 for (vma = mm->mmap; vma; vma = vma->vm_next) {
479                         if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
480                                 continue;
481                         unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
482                 }
483                 remove_all_slot_rmap_items(mm_slot);
484                 up_read(&mm->mmap_sem);
485         }
486
487         spin_lock(&ksm_mmlist_lock);
488         if (ksm_scan.mm_slot != &ksm_mm_head) {
489                 ksm_scan.mm_slot = &ksm_mm_head;
490                 ksm_scan.seqnr++;
491         }
492         spin_unlock(&ksm_mmlist_lock);
493 }
494
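/*
 * remove_mm_from_lists: unhash this mm's mm_slot, unlink it from the
 * mm_list, free its rmap_items (removing them from the trees), and clear
 * MMF_VM_MERGEABLE.  Called with ksm_thread_mutex held, either from
 * __ksm_exit or when a full scan finds no mergeable vmas left in the mm.
 */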
495 static void remove_mm_from_lists(struct mm_struct *mm)
496 {
497         struct mm_slot *mm_slot;
498
499         spin_lock(&ksm_mmlist_lock);
500         mm_slot = get_mm_slot(mm);
501
502         /*
503          * This mm_slot is always at the scanning cursor when we're
504          * called from scan_get_next_rmap_item; but it's a special
505          * case when we're called from __ksm_exit.
506          */
507         if (ksm_scan.mm_slot == mm_slot) {
508                 ksm_scan.mm_slot = list_entry(
509                         mm_slot->mm_list.next, struct mm_slot, mm_list);
510                 ksm_scan.address = 0;
511                 ksm_scan.rmap_item = list_entry(
512                         &ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
513                 if (ksm_scan.mm_slot == &ksm_mm_head)
514                         ksm_scan.seqnr++;
515         }
516
517         hlist_del(&mm_slot->link);
518         list_del(&mm_slot->mm_list);
519         spin_unlock(&ksm_mmlist_lock);
520
521         remove_all_slot_rmap_items(mm_slot);
522         free_mm_slot(mm_slot);
523         clear_bit(MMF_VM_MERGEABLE, &mm->flags);
524 }
525
526 static u32 calc_checksum(struct page *page)
527 {
528         u32 checksum;
529         void *addr = kmap_atomic(page, KM_USER0);
530         checksum = jhash2(addr, PAGE_SIZE / 4, 17);
531         kunmap_atomic(addr, KM_USER0);
532         return checksum;
533 }
534
535 static int memcmp_pages(struct page *page1, struct page *page2)
536 {
537         char *addr1, *addr2;
538         int ret;
539
540         addr1 = kmap_atomic(page1, KM_USER0);
541         addr2 = kmap_atomic(page2, KM_USER1);
542         ret = memcmp(addr1, addr2, PAGE_SIZE);
543         kunmap_atomic(addr2, KM_USER1);
544         kunmap_atomic(addr1, KM_USER0);
545         return ret;
546 }
547
548 static inline int pages_identical(struct page *page1, struct page *page2)
549 {
550         return !memcmp_pages(page1, page2);
551 }
552
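/*
 * write_protect_page: ensure the pte mapping this anonymous page is
 * read-only, but only if no one else (e.g. O_DIRECT) holds an unexpected
 * extra reference on the page.  On success the old pte value is saved in
 * *orig_pte, so that replace_page() can check nothing changed meanwhile.
 * Returns 0 on success, -EFAULT otherwise.
 */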
553 static int write_protect_page(struct vm_area_struct *vma, struct page *page,
554                               pte_t *orig_pte)
555 {
556         struct mm_struct *mm = vma->vm_mm;
557         unsigned long addr;
558         pte_t *ptep;
559         spinlock_t *ptl;
560         int swapped;
561         int err = -EFAULT;
562
563         addr = page_address_in_vma(page, vma);
564         if (addr == -EFAULT)
565                 goto out;
566
567         ptep = page_check_address(page, mm, addr, &ptl, 0);
568         if (!ptep)
569                 goto out;
570
571         if (pte_write(*ptep)) {
572                 pte_t entry;
573
574                 swapped = PageSwapCache(page);
575                 flush_cache_page(vma, addr, page_to_pfn(page));
576                 /*
577                  * Ok this is tricky: when get_user_pages_fast() runs it doesn't
578                  * take any lock, therefore the check that we are going to make
579                  * with the pagecount against the mapcount is racy and
580                  * O_DIRECT can happen right after the check.
581                  * So we clear the pte and flush the tlb before the check;
582                  * this assures us that no O_DIRECT can happen after the check
583                  * or in the middle of the check.
584                  */
585                 entry = ptep_clear_flush(vma, addr, ptep);
586                 /*
587                  * Check that no O_DIRECT or similar I/O is in progress on the
588                  * page
589                  */
590                 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
591                         set_pte_at_notify(mm, addr, ptep, entry);
592                         goto out_unlock;
593                 }
594                 entry = pte_wrprotect(entry);
595                 set_pte_at_notify(mm, addr, ptep, entry);
596         }
597         *orig_pte = *ptep;
598         err = 0;
599
600 out_unlock:
601         pte_unmap_unlock(ptep, ptl);
602 out:
603         return err;
604 }
605
606 /**
607  * replace_page - replace page in vma by new ksm page
608  * @vma:      vma that holds the pte pointing to oldpage
609  * @oldpage:  the page we are replacing by newpage
610  * @newpage:  the ksm page we replace oldpage by
611  * @orig_pte: the original value of the pte
612  *
613  * Returns 0 on success, -EFAULT on failure.
614  */
615 static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
616                         struct page *newpage, pte_t orig_pte)
617 {
618         struct mm_struct *mm = vma->vm_mm;
619         pgd_t *pgd;
620         pud_t *pud;
621         pmd_t *pmd;
622         pte_t *ptep;
623         spinlock_t *ptl;
624         unsigned long addr;
625         pgprot_t prot;
626         int err = -EFAULT;
627
628         prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
629
630         addr = page_address_in_vma(oldpage, vma);
631         if (addr == -EFAULT)
632                 goto out;
633
634         pgd = pgd_offset(mm, addr);
635         if (!pgd_present(*pgd))
636                 goto out;
637
638         pud = pud_offset(pgd, addr);
639         if (!pud_present(*pud))
640                 goto out;
641
642         pmd = pmd_offset(pud, addr);
643         if (!pmd_present(*pmd))
644                 goto out;
645
646         ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
647         if (!pte_same(*ptep, orig_pte)) {
648                 pte_unmap_unlock(ptep, ptl);
649                 goto out;
650         }
651
652         get_page(newpage);
653         page_add_ksm_rmap(newpage);
654
655         flush_cache_page(vma, addr, pte_pfn(*ptep));
656         ptep_clear_flush(vma, addr, ptep);
657         set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
658
659         page_remove_rmap(oldpage);
660         put_page(oldpage);
661
662         pte_unmap_unlock(ptep, ptl);
663         err = 0;
664 out:
665         return err;
666 }
667
668 /*
669  * try_to_merge_one_page - take two pages and merge them into one
670  * @vma: the vma that holds the pte pointing into oldpage
671  * @oldpage: the page that we want to replace with newpage
672  * @newpage: the page that we want to map instead of oldpage
673  *
674  * Note:
675  * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
676  * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
677  *
678  * This function returns 0 if the pages were merged, -EFAULT otherwise.
679  */
680 static int try_to_merge_one_page(struct vm_area_struct *vma,
681                                  struct page *oldpage,
682                                  struct page *newpage)
683 {
684         pte_t orig_pte = __pte(0);
685         int err = -EFAULT;
686
687         if (!(vma->vm_flags & VM_MERGEABLE))
688                 goto out;
689
690         if (!PageAnon(oldpage))
691                 goto out;
692
693         get_page(newpage);
694         get_page(oldpage);
695
696         /*
697          * We need the page lock to read a stable PageSwapCache in
698          * write_protect_page().  We use trylock_page() instead of
699          * lock_page() because we don't want to wait here - we
700          * prefer to continue scanning and merging different pages,
701          * then come back to this page when it is unlocked.
702          */
703         if (!trylock_page(oldpage))
704                 goto out_putpage;
705         /*
706          * If this anonymous page is mapped only here, its pte may need
707          * to be write-protected.  If it's mapped elsewhere, all of its
708          * ptes are necessarily already write-protected.  But in either
709          * case, we need to lock and check page_count is not raised.
710          */
711         if (write_protect_page(vma, oldpage, &orig_pte)) {
712                 unlock_page(oldpage);
713                 goto out_putpage;
714         }
715         unlock_page(oldpage);
716
717         if (pages_identical(oldpage, newpage))
718                 err = replace_page(vma, oldpage, newpage, orig_pte);
719
720 out_putpage:
721         put_page(oldpage);
722         put_page(newpage);
723 out:
724         return err;
725 }
726
727 /*
728  * try_to_merge_two_pages - take two identical pages and prepare them
729  * to be merged into one page.
730  *
731  * This function returns 0 if we successfully mapped two identical pages
732  * into one page, -EFAULT otherwise.
733  *
734  * Note that this function allocates a new kernel page: if one of the pages
735  * is already a ksm page, try_to_merge_with_ksm_page should be used.
736  */
737 static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
738                                   struct page *page1, struct mm_struct *mm2,
739                                   unsigned long addr2, struct page *page2)
740 {
741         struct vm_area_struct *vma;
742         struct page *kpage;
743         int err = -EFAULT;
744
745         /*
746          * The number of nodes in the stable tree
747          * is the number of kernel pages that we hold.
748          */
749         if (ksm_max_kernel_pages &&
750             ksm_max_kernel_pages <= ksm_kernel_pages_allocated)
751                 return err;
752
753         kpage = alloc_page(GFP_HIGHUSER);
754         if (!kpage)
755                 return err;
756
757         down_read(&mm1->mmap_sem);
758         vma = find_vma(mm1, addr1);
759         if (!vma || vma->vm_start > addr1) {
760                 put_page(kpage);
761                 up_read(&mm1->mmap_sem);
762                 return err;
763         }
764
765         copy_user_highpage(kpage, page1, addr1, vma);
766         err = try_to_merge_one_page(vma, page1, kpage);
767         up_read(&mm1->mmap_sem);
768
769         if (!err) {
770                 down_read(&mm2->mmap_sem);
771                 vma = find_vma(mm2, addr2);
772                 if (!vma || vma->vm_start > addr2) {
773                         put_page(kpage);
774                         up_read(&mm2->mmap_sem);
775                         break_cow(mm1, addr1);
776                         return -EFAULT;
777                 }
778
779                 err = try_to_merge_one_page(vma, page2, kpage);
780                 up_read(&mm2->mmap_sem);
781
782                 /*
783                  * If the second try_to_merge_one_page failed, we have a
784                  * ksm page with just one pte pointing to it, so break it.
785                  */
786                 if (err)
787                         break_cow(mm1, addr1);
788                 else
789                         ksm_pages_shared += 2;
790         }
791
792         put_page(kpage);
793         return err;
794 }
795
796 /*
797  * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
798  * but no new kernel page is allocated: kpage must already be a ksm page.
799  */
800 static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
801                                       unsigned long addr1,
802                                       struct page *page1,
803                                       struct page *kpage)
804 {
805         struct vm_area_struct *vma;
806         int err = -EFAULT;
807
808         down_read(&mm1->mmap_sem);
809         vma = find_vma(mm1, addr1);
810         if (!vma || vma->vm_start > addr1) {
811                 up_read(&mm1->mmap_sem);
812                 return err;
813         }
814
815         err = try_to_merge_one_page(vma, page1, kpage);
816         up_read(&mm1->mmap_sem);
817
818         if (!err)
819                 ksm_pages_shared++;
820
821         return err;
822 }
823
824 /*
825  * stable_tree_search - search page inside the stable tree
826  * @page: the page that we are searching for identical pages to.
827  * @page2: on success, points to the identical page that we found inside
828  *         the stable tree (returned with its reference held).
829  * @rmap_item: the reverse mapping item
830  *
831  * This function checks if there is a page inside the stable tree
832  * with identical content to the page that we are scanning right now.
833  *
834  * This function returns a pointer to the rmap_item of the identical page
835  * if found, NULL otherwise.
836  */
837 static struct rmap_item *stable_tree_search(struct page *page,
838                                             struct page **page2,
839                                             struct rmap_item *rmap_item)
840 {
841         struct rb_node *node = root_stable_tree.rb_node;
842
843         while (node) {
844                 struct rmap_item *tree_rmap_item, *next_rmap_item;
845                 int ret;
846
847                 tree_rmap_item = rb_entry(node, struct rmap_item, node);
848                 while (tree_rmap_item) {
849                         BUG_ON(!in_stable_tree(tree_rmap_item));
850                         cond_resched();
851                         page2[0] = get_ksm_page(tree_rmap_item);
852                         if (page2[0])
853                                 break;
854                         next_rmap_item = tree_rmap_item->next;
855                         remove_rmap_item_from_tree(tree_rmap_item);
856                         tree_rmap_item = next_rmap_item;
857                 }
858                 if (!tree_rmap_item)
859                         return NULL;
860
861                 ret = memcmp_pages(page, page2[0]);
862
863                 if (ret < 0) {
864                         put_page(page2[0]);
865                         node = node->rb_left;
866                 } else if (ret > 0) {
867                         put_page(page2[0]);
868                         node = node->rb_right;
869                 } else {
870                         return tree_rmap_item;
871                 }
872         }
873
874         return NULL;
875 }
876
877 /*
878  * stable_tree_insert - insert rmap_item pointing to new ksm page
879  * into the stable tree.
880  *
881  * @page: the page that we are searching for an identical page to inside
882  *        the stable tree.
883  * @rmap_item: pointer to the reverse mapping item.
884  *
885  * This function returns rmap_item on success, NULL otherwise.
886  */
887 static struct rmap_item *stable_tree_insert(struct page *page,
888                                             struct rmap_item *rmap_item)
889 {
890         struct rb_node **new = &root_stable_tree.rb_node;
891         struct rb_node *parent = NULL;
892
893         while (*new) {
894                 struct rmap_item *tree_rmap_item, *next_rmap_item;
895                 struct page *tree_page;
896                 int ret;
897
898                 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
899                 while (tree_rmap_item) {
900                         BUG_ON(!in_stable_tree(tree_rmap_item));
901                         cond_resched();
902                         tree_page = get_ksm_page(tree_rmap_item);
903                         if (tree_page)
904                                 break;
905                         next_rmap_item = tree_rmap_item->next;
906                         remove_rmap_item_from_tree(tree_rmap_item);
907                         tree_rmap_item = next_rmap_item;
908                 }
909                 if (!tree_rmap_item)
910                         return NULL;
911
912                 ret = memcmp_pages(page, tree_page);
913                 put_page(tree_page);
914
915                 parent = *new;
916                 if (ret < 0)
917                         new = &parent->rb_left;
918                 else if (ret > 0)
919                         new = &parent->rb_right;
920                 else {
921                         /*
922                          * It is not a bug that stable_tree_search() didn't
923                          * find this node: because at that time our page was
924                          * not yet write-protected, so may have changed since.
925                          */
926                         return NULL;
927                 }
928         }
929
930         ksm_kernel_pages_allocated++;
931
932         rmap_item->address |= NODE_FLAG | STABLE_FLAG;
933         rmap_item->next = NULL;
934         rb_link_node(&rmap_item->node, parent, new);
935         rb_insert_color(&rmap_item->node, &root_stable_tree);
936
937         return rmap_item;
938 }
939
940 /*
941  * unstable_tree_search_insert - search and insert items into the unstable tree.
942  *
943  * @page: the page that we are going to search for identical page or to insert
944  *        into the unstable tree
945  * @page2: pointer into identical page that was found inside the unstable tree
946  * @rmap_item: the reverse mapping item of page
947  *
948  * This function searches for a page in the unstable tree identical to the
949  * page currently being scanned; and if no identical page is found in the
950  * tree, we insert rmap_item as a new object into the unstable tree.
951  *
952  * This function returns a pointer to the rmap_item whose page was found
953  * identical to the currently scanned page, NULL otherwise.
954  *
955  * This function does both searching and inserting, because they share
956  * the same walking algorithm in an rbtree.
957  */
958 static struct rmap_item *unstable_tree_search_insert(struct page *page,
959                                                 struct page **page2,
960                                                 struct rmap_item *rmap_item)
961 {
962         struct rb_node **new = &root_unstable_tree.rb_node;
963         struct rb_node *parent = NULL;
964
965         while (*new) {
966                 struct rmap_item *tree_rmap_item;
967                 int ret;
968
969                 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
970                 page2[0] = get_mergeable_page(tree_rmap_item);
971                 if (!page2[0])
972                         return NULL;
973
974                 /*
975                  * Don't substitute an unswappable ksm page
976                  * just for one good swappable forked page.
977                  */
978                 if (page == page2[0]) {
979                         put_page(page2[0]);
980                         return NULL;
981                 }
982
983                 ret = memcmp_pages(page, page2[0]);
984
985                 parent = *new;
986                 if (ret < 0) {
987                         put_page(page2[0]);
988                         new = &parent->rb_left;
989                 } else if (ret > 0) {
990                         put_page(page2[0]);
991                         new = &parent->rb_right;
992                 } else {
993                         return tree_rmap_item;
994                 }
995         }
996
997         rmap_item->address |= NODE_FLAG;
998         rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
999         rb_link_node(&rmap_item->node, parent, new);
1000         rb_insert_color(&rmap_item->node, &root_unstable_tree);
1001
1002         return NULL;
1003 }
1004
1005 /*
1006  * stable_tree_append - add another rmap_item to the linked list of
1007  * rmap_items hanging off a given node of the stable tree, all sharing
1008  * the same ksm page.
1009  */
1010 static void stable_tree_append(struct rmap_item *rmap_item,
1011                                struct rmap_item *tree_rmap_item)
1012 {
1013         rmap_item->next = tree_rmap_item->next;
1014         rmap_item->prev = tree_rmap_item;
1015
1016         if (tree_rmap_item->next)
1017                 tree_rmap_item->next->prev = rmap_item;
1018
1019         tree_rmap_item->next = rmap_item;
1020         rmap_item->address |= STABLE_FLAG;
1021 }
1022
1023 /*
1024  * cmp_and_merge_page - search the stable tree, and then (if the page's
1025  * checksum has been stable since the last scan) the unstable tree, for
1026  * a page with content identical to this one; if an identical page is
1027  * found in the unstable tree, we call try_to_merge_two_pages().
1028  *
1029  * @page: the page that we are searching for an identical page to.
1030  * @rmap_item: the reverse mapping into the virtual address of this page
1031  */
1032 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1033 {
1034         struct page *page2[1];
1035         struct rmap_item *tree_rmap_item;
1036         unsigned int checksum;
1037         int err;
1038
1039         if (in_stable_tree(rmap_item))
1040                 remove_rmap_item_from_tree(rmap_item);
1041
1042         /* We first start with searching the page inside the stable tree */
1043         tree_rmap_item = stable_tree_search(page, page2, rmap_item);
1044         if (tree_rmap_item) {
1045                 if (page == page2[0]) {                 /* forked */
1046                         ksm_pages_shared++;
1047                         err = 0;
1048                 } else
1049                         err = try_to_merge_with_ksm_page(rmap_item->mm,
1050                                                          rmap_item->address,
1051                                                          page, page2[0]);
1052                 put_page(page2[0]);
1053
1054                 if (!err) {
1055                         /*
1056                          * The page was successfully merged:
1057                          * add its rmap_item to the stable tree.
1058                          */
1059                         stable_tree_append(rmap_item, tree_rmap_item);
1060                 }
1061                 return;
1062         }
1063
1064         /*
1065          * A ksm page might have got here by fork, but its other
1066          * references have already been removed from the stable tree.
1067          */
1068         if (PageKsm(page))
1069                 break_cow(rmap_item->mm, rmap_item->address);
1070
1071         /*
1072          * If the hash value of the page has changed since the last time we
1073          * calculated it, this page is being changed frequently; therefore we
1074          * don't want to insert it into the unstable tree, and we don't want
1075          * to waste our time searching for something identical to it there.
1076          */
1077         checksum = calc_checksum(page);
1078         if (rmap_item->oldchecksum != checksum) {
1079                 rmap_item->oldchecksum = checksum;
1080                 return;
1081         }
1082
1083         tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
1084         if (tree_rmap_item) {
1085                 err = try_to_merge_two_pages(rmap_item->mm,
1086                                              rmap_item->address, page,
1087                                              tree_rmap_item->mm,
1088                                              tree_rmap_item->address, page2[0]);
1089                 /*
1090                  * As soon as we merge this page, we want to remove the
1091                  * rmap_item of the page we have merged with from the unstable
1092                  * tree, and insert it instead as new node in the stable tree.
1093                  */
1094                 if (!err) {
1095                         rb_erase(&tree_rmap_item->node, &root_unstable_tree);
1096                         tree_rmap_item->address &= ~NODE_FLAG;
1097                         /*
1098                          * If we fail to insert the page into the stable tree,
1099                          * we will have 2 virtual addresses that are pointing
1100                          * to a ksm page left outside the stable tree,
1101                          * in which case we need to break_cow on both.
1102                          */
1103                         if (stable_tree_insert(page2[0], tree_rmap_item))
1104                                 stable_tree_append(rmap_item, tree_rmap_item);
1105                         else {
1106                                 break_cow(tree_rmap_item->mm,
1107                                                 tree_rmap_item->address);
1108                                 break_cow(rmap_item->mm, rmap_item->address);
1109                                 ksm_pages_shared -= 2;
1110                         }
1111                 }
1112
1113                 put_page(page2[0]);
1114         }
1115 }
1116
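/*
 * get_next_rmap_item: find or create the rmap_item for this address.
 * Walk the mm_slot's address-ordered rmap_list from *cur*, freeing stale
 * items for addresses the scan has already passed; return a matching item
 * if one exists, otherwise allocate a fresh one and link it in before cur.
 */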
1117 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1118                                             struct list_head *cur,
1119                                             unsigned long addr)
1120 {
1121         struct rmap_item *rmap_item;
1122
1123         while (cur != &mm_slot->rmap_list) {
1124                 rmap_item = list_entry(cur, struct rmap_item, link);
1125                 if ((rmap_item->address & PAGE_MASK) == addr) {
1126                         if (!in_stable_tree(rmap_item))
1127                                 remove_rmap_item_from_tree(rmap_item);
1128                         return rmap_item;
1129                 }
1130                 if (rmap_item->address > addr)
1131                         break;
1132                 cur = cur->next;
1133                 remove_rmap_item_from_tree(rmap_item);
1134                 list_del(&rmap_item->link);
1135                 free_rmap_item(rmap_item);
1136         }
1137
1138         rmap_item = alloc_rmap_item();
1139         if (rmap_item) {
1140                 /* It has already been zeroed */
1141                 rmap_item->mm = mm_slot->mm;
1142                 rmap_item->address = addr;
1143                 list_add_tail(&rmap_item->link, cur);
1144         }
1145         return rmap_item;
1146 }
1147
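/*
 * scan_get_next_rmap_item: advance the single ksm_scan cursor to the next
 * anonymous page in a VM_MERGEABLE vma of any registered mm, returning
 * its rmap_item with the page (reference held) in *page.  Returns NULL
 * when there is nothing more to scan; ksm_scan.seqnr is bumped once the
 * whole mm_list has been traversed.
 */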
1148 static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1149 {
1150         struct mm_struct *mm;
1151         struct mm_slot *slot;
1152         struct vm_area_struct *vma;
1153         struct rmap_item *rmap_item;
1154
1155         if (list_empty(&ksm_mm_head.mm_list))
1156                 return NULL;
1157
1158         slot = ksm_scan.mm_slot;
1159         if (slot == &ksm_mm_head) {
1160                 root_unstable_tree = RB_ROOT;
1161
1162                 spin_lock(&ksm_mmlist_lock);
1163                 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1164                 ksm_scan.mm_slot = slot;
1165                 spin_unlock(&ksm_mmlist_lock);
1166 next_mm:
1167                 ksm_scan.address = 0;
1168                 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1169                                                 struct rmap_item, link);
1170         }
1171
1172         mm = slot->mm;
1173         down_read(&mm->mmap_sem);
1174         for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
1175                 if (!(vma->vm_flags & VM_MERGEABLE))
1176                         continue;
1177                 if (ksm_scan.address < vma->vm_start)
1178                         ksm_scan.address = vma->vm_start;
1179                 if (!vma->anon_vma)
1180                         ksm_scan.address = vma->vm_end;
1181
1182                 while (ksm_scan.address < vma->vm_end) {
1183                         *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1184                         if (*page && PageAnon(*page)) {
1185                                 flush_anon_page(vma, *page, ksm_scan.address);
1186                                 flush_dcache_page(*page);
1187                                 rmap_item = get_next_rmap_item(slot,
1188                                         ksm_scan.rmap_item->link.next,
1189                                         ksm_scan.address);
1190                                 if (rmap_item) {
1191                                         ksm_scan.rmap_item = rmap_item;
1192                                         ksm_scan.address += PAGE_SIZE;
1193                                 } else
1194                                         put_page(*page);
1195                                 up_read(&mm->mmap_sem);
1196                                 return rmap_item;
1197                         }
1198                         if (*page)
1199                                 put_page(*page);
1200                         ksm_scan.address += PAGE_SIZE;
1201                         cond_resched();
1202                 }
1203         }
1204
1205         if (!ksm_scan.address) {
1206                 /*
1207                  * We've completed a full scan of all vmas, holding mmap_sem
1208                  * throughout, and found no VM_MERGEABLE: so do the same as
1209                  * __ksm_exit does to remove this mm from all our lists now.
1210                  */
1211                 remove_mm_from_lists(mm);
1212                 up_read(&mm->mmap_sem);
1213                 slot = ksm_scan.mm_slot;
1214                 if (slot != &ksm_mm_head)
1215                         goto next_mm;
1216                 return NULL;
1217         }
1218
1219         /*
1220          * Nuke all the rmap_items that are above this current rmap:
1221          * because there were no VM_MERGEABLE vmas with such addresses.
1222          */
1223         remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
1224         up_read(&mm->mmap_sem);
1225
1226         spin_lock(&ksm_mmlist_lock);
1227         slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1228         ksm_scan.mm_slot = slot;
1229         spin_unlock(&ksm_mmlist_lock);
1230
1231         /* Repeat until we've completed scanning the whole list */
1232         if (slot != &ksm_mm_head)
1233                 goto next_mm;
1234
1235         /*
1236          * Bump seqnr here rather than at top, so that __ksm_exit
1237          * can skip rb_erase on unstable tree until we run again.
1238          */
1239         ksm_scan.seqnr++;
1240         return NULL;
1241 }
1242
1243 /**
1244  * ksm_do_scan - the ksm scanner main worker function.
1245  * @scan_npages: number of pages we want to scan before we return.
1246  */
1247 static void ksm_do_scan(unsigned int scan_npages)
1248 {
1249         struct rmap_item *rmap_item;
1250         struct page *page;
1251
1252         while (scan_npages--) {
1253                 cond_resched();
1254                 rmap_item = scan_get_next_rmap_item(&page);
1255                 if (!rmap_item)
1256                         return;
1257                 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1258                         cmp_and_merge_page(page, rmap_item);
1259                 put_page(page);
1260         }
1261 }
1262
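/*
 * ksmd main loop: while KSM_RUN_MERGE is set, scan a batch of pages under
 * ksm_thread_mutex, then sleep for sleep_millisecs; otherwise wait on
 * ksm_thread_wait until run_store() wakes us or we are told to stop.
 */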
1263 static int ksm_scan_thread(void *nothing)
1264 {
1265         set_user_nice(current, 0);
1266
1267         while (!kthread_should_stop()) {
1268                 if (ksm_run & KSM_RUN_MERGE) {
1269                         mutex_lock(&ksm_thread_mutex);
1270                         ksm_do_scan(ksm_thread_pages_to_scan);
1271                         mutex_unlock(&ksm_thread_mutex);
1272                         schedule_timeout_interruptible(
1273                                 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1274                 } else {
1275                         wait_event_interruptible(ksm_thread_wait,
1276                                         (ksm_run & KSM_RUN_MERGE) ||
1277                                         kthread_should_stop());
1278                 }
1279         }
1280         return 0;
1281 }
1282
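/*
 * ksm_madvise: the madvise(MADV_MERGEABLE / MADV_UNMERGEABLE) hook.
 * MADV_MERGEABLE marks the vma VM_MERGEABLE and registers the mm with
 * ksmd (unless the vma is of a kind we refuse to touch); MADV_UNMERGEABLE
 * breaks COW on any merged pages in the range and clears VM_MERGEABLE.
 *
 * A minimal userspace sketch (illustrative only, not part of this file):
 *
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (madvise(buf, len, MADV_MERGEABLE) != 0)
 *		perror("madvise(MADV_MERGEABLE)");
 */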
1283 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1284                 unsigned long end, int advice, unsigned long *vm_flags)
1285 {
1286         struct mm_struct *mm = vma->vm_mm;
1287
1288         switch (advice) {
1289         case MADV_MERGEABLE:
1290                 /*
1291                  * Be somewhat over-protective for now!
1292                  */
1293                 if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
1294                                  VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
1295                                  VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
1296                                  VM_MIXEDMAP  | VM_SAO))
1297                         return 0;               /* just ignore the advice */
1298
1299                 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
1300                         if (__ksm_enter(mm) < 0)
1301                                 return -EAGAIN;
1302
1303                 *vm_flags |= VM_MERGEABLE;
1304                 break;
1305
1306         case MADV_UNMERGEABLE:
1307                 if (!(*vm_flags & VM_MERGEABLE))
1308                         return 0;               /* just ignore the advice */
1309
1310                 if (vma->anon_vma)
1311                         unmerge_ksm_pages(vma, start, end);
1312
1313                 *vm_flags &= ~VM_MERGEABLE;
1314                 break;
1315         }
1316
1317         return 0;
1318 }
1319
1320 int __ksm_enter(struct mm_struct *mm)
1321 {
1322         struct mm_slot *mm_slot = alloc_mm_slot();
1323         if (!mm_slot)
1324                 return -ENOMEM;
1325
1326         spin_lock(&ksm_mmlist_lock);
1327         insert_to_mm_slots_hash(mm, mm_slot);
1328         /*
1329          * Insert just behind the scanning cursor, to let the area settle
1330          * down a little; when fork is followed by immediate exec, we don't
1331          * want ksmd to waste time setting up and tearing down an rmap_list.
1332          */
1333         list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1334         spin_unlock(&ksm_mmlist_lock);
1335
1336         set_bit(MMF_VM_MERGEABLE, &mm->flags);
1337         return 0;
1338 }
1339
1340 void __ksm_exit(struct mm_struct *mm)
1341 {
1342         /*
1343          * This process is exiting: doesn't hold and doesn't need mmap_sem;
1344          * but we do need to exclude ksmd and other exiters while we modify
1345          * the various lists and trees.
1346          */
1347         mutex_lock(&ksm_thread_mutex);
1348         remove_mm_from_lists(mm);
1349         mutex_unlock(&ksm_thread_mutex);
1350 }
1351
1352 #define KSM_ATTR_RO(_name) \
1353         static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1354 #define KSM_ATTR(_name) \
1355         static struct kobj_attribute _name##_attr = \
1356                 __ATTR(_name, 0644, _name##_show, _name##_store)
1357
1358 static ssize_t sleep_millisecs_show(struct kobject *kobj,
1359                                     struct kobj_attribute *attr, char *buf)
1360 {
1361         return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
1362 }
1363
1364 static ssize_t sleep_millisecs_store(struct kobject *kobj,
1365                                      struct kobj_attribute *attr,
1366                                      const char *buf, size_t count)
1367 {
1368         unsigned long msecs;
1369         int err;
1370
1371         err = strict_strtoul(buf, 10, &msecs);
1372         if (err || msecs > UINT_MAX)
1373                 return -EINVAL;
1374
1375         ksm_thread_sleep_millisecs = msecs;
1376
1377         return count;
1378 }
1379 KSM_ATTR(sleep_millisecs);
1380
1381 static ssize_t pages_to_scan_show(struct kobject *kobj,
1382                                   struct kobj_attribute *attr, char *buf)
1383 {
1384         return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
1385 }
1386
1387 static ssize_t pages_to_scan_store(struct kobject *kobj,
1388                                    struct kobj_attribute *attr,
1389                                    const char *buf, size_t count)
1390 {
1391         int err;
1392         unsigned long nr_pages;
1393
1394         err = strict_strtoul(buf, 10, &nr_pages);
1395         if (err || nr_pages > UINT_MAX)
1396                 return -EINVAL;
1397
1398         ksm_thread_pages_to_scan = nr_pages;
1399
1400         return count;
1401 }
1402 KSM_ATTR(pages_to_scan);
1403
1404 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1405                         char *buf)
1406 {
1407         return sprintf(buf, "%u\n", ksm_run);
1408 }
1409
1410 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1411                          const char *buf, size_t count)
1412 {
1413         int err;
1414         unsigned long flags;
1415
1416         err = strict_strtoul(buf, 10, &flags);
1417         if (err || flags > UINT_MAX)
1418                 return -EINVAL;
1419         if (flags > KSM_RUN_UNMERGE)
1420                 return -EINVAL;
1421
1422         /*
1423          * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1424          * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1425          * breaking COW to free the kernel_pages_allocated (but leaves
1426          * mm_slots on the list for when ksmd may be set running again).
1427          */
1428
1429         mutex_lock(&ksm_thread_mutex);
1430         if (ksm_run != flags) {
1431                 ksm_run = flags;
1432                 if (flags & KSM_RUN_UNMERGE)
1433                         unmerge_and_remove_all_rmap_items();
1434         }
1435         mutex_unlock(&ksm_thread_mutex);
1436
1437         if (flags & KSM_RUN_MERGE)
1438                 wake_up_interruptible(&ksm_thread_wait);
1439
1440         return count;
1441 }
1442 KSM_ATTR(run);
1443
1444 static ssize_t pages_shared_show(struct kobject *kobj,
1445                                  struct kobj_attribute *attr, char *buf)
1446 {
1447         return sprintf(buf, "%lu\n",
1448                         ksm_pages_shared - ksm_kernel_pages_allocated);
1449 }
1450 KSM_ATTR_RO(pages_shared);
1451
1452 static ssize_t kernel_pages_allocated_show(struct kobject *kobj,
1453                                            struct kobj_attribute *attr,
1454                                            char *buf)
1455 {
1456         return sprintf(buf, "%lu\n", ksm_kernel_pages_allocated);
1457 }
1458 KSM_ATTR_RO(kernel_pages_allocated);
1459
1460 static ssize_t max_kernel_pages_store(struct kobject *kobj,
1461                                       struct kobj_attribute *attr,
1462                                       const char *buf, size_t count)
1463 {
1464         int err;
1465         unsigned long nr_pages;
1466
1467         err = strict_strtoul(buf, 10, &nr_pages);
1468         if (err)
1469                 return -EINVAL;
1470
1471         ksm_max_kernel_pages = nr_pages;
1472
1473         return count;
1474 }
1475
1476 static ssize_t max_kernel_pages_show(struct kobject *kobj,
1477                                      struct kobj_attribute *attr, char *buf)
1478 {
1479         return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1480 }
1481 KSM_ATTR(max_kernel_pages);
1482
1483 static struct attribute *ksm_attrs[] = {
1484         &sleep_millisecs_attr.attr,
1485         &pages_to_scan_attr.attr,
1486         &run_attr.attr,
1487         &pages_shared_attr.attr,
1488         &kernel_pages_allocated_attr.attr,
1489         &max_kernel_pages_attr.attr,
1490         NULL,
1491 };
1492
1493 static struct attribute_group ksm_attr_group = {
1494         .attrs = ksm_attrs,
1495         .name = "ksm",
1496 };
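
/*
 * The attributes above appear under /sys/kernel/mm/ksm/ once ksm_init()
 * has registered ksm_attr_group on mm_kobj.  A typical usage sketch
 * (assuming sysfs is mounted at /sys):
 *
 *	echo 100 > /sys/kernel/mm/ksm/pages_to_scan
 *	echo 20  > /sys/kernel/mm/ksm/sleep_millisecs
 *	echo 1   > /sys/kernel/mm/ksm/run
 *	cat /sys/kernel/mm/ksm/pages_shared
 */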
1497
1498 static int __init ksm_init(void)
1499 {
1500         struct task_struct *ksm_thread;
1501         int err;
1502
1503         err = ksm_slab_init();
1504         if (err)
1505                 goto out;
1506
1507         err = mm_slots_hash_init();
1508         if (err)
1509                 goto out_free1;
1510
1511         ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1512         if (IS_ERR(ksm_thread)) {
1513                 printk(KERN_ERR "ksm: creating kthread failed\n");
1514                 err = PTR_ERR(ksm_thread);
1515                 goto out_free2;
1516         }
1517
1518         err = sysfs_create_group(mm_kobj, &ksm_attr_group);
1519         if (err) {
1520                 printk(KERN_ERR "ksm: register sysfs failed\n");
1521                 goto out_free3;
1522         }
1523
1524         return 0;
1525
1526 out_free3:
1527         kthread_stop(ksm_thread);
1528 out_free2:
1529         mm_slots_hash_free();
1530 out_free1:
1531         ksm_slab_free();
1532 out:
1533         return err;
1534 }
1535 module_init(ksm_init)