mm: lockless pagecache
authorNick Piggin <npiggin@suse.de>
Sat, 26 Jul 2008 02:45:31 +0000 (19:45 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sat, 26 Jul 2008 19:00:06 +0000 (12:00 -0700)
Combine page_cache_get_speculative with lockless radix tree lookups to
introduce lockless page cache lookups (ie.  no mapping->tree_lock on the
read-side).

The only atomicity changes this introduces is that the gang pagecache
lookup functions now behave as if they are implemented with multiple
find_get_page calls, rather than operating on a snapshot of the pages.  In
practice, this atomicity guarantee is not used anyway, and it is to
replace individual lookups, so these semantics are natural.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/filemap.c

index 4e182a9..feb8448 100644 (file)
@@ -637,15 +637,35 @@ void __lock_page_nosync(struct page *page)
  * Is there a pagecache struct page at the given (mapping, offset) tuple?
  * If yes, increment its refcount and return it; if no, return NULL.
  */
-struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 {
+       void **pagep;
        struct page *page;
 
-       read_lock_irq(&mapping->tree_lock);
-       page = radix_tree_lookup(&mapping->page_tree, offset);
-       if (page)
-               page_cache_get(page);
-       read_unlock_irq(&mapping->tree_lock);
+       rcu_read_lock();
+repeat:
+       page = NULL;
+       pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+       if (pagep) {
+               page = radix_tree_deref_slot(pagep);
+               if (unlikely(!page || page == RADIX_TREE_RETRY))
+                       goto repeat;
+
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               /*
+                * Has the page moved?
+                * This is part of the lockless pagecache protocol. See
+                * include/linux/pagemap.h for details.
+                */
+               if (unlikely(page != *pagep)) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+       }
+       rcu_read_unlock();
+
        return page;
 }
 EXPORT_SYMBOL(find_get_page);
@@ -660,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
  *
  * Returns zero if the page was not present. find_lock_page() may sleep.
  */
-struct page *find_lock_page(struct address_space *mapping,
-                               pgoff_t offset)
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 {
        struct page *page;
 
 repeat:
-       read_lock_irq(&mapping->tree_lock);
-       page = radix_tree_lookup(&mapping->page_tree, offset);
+       page = find_get_page(mapping, offset);
        if (page) {
-               page_cache_get(page);
-               if (TestSetPageLocked(page)) {
-                       read_unlock_irq(&mapping->tree_lock);
-                       __lock_page(page);
-
-                       /* Has the page been truncated while we slept? */
-                       if (unlikely(page->mapping != mapping)) {
-                               unlock_page(page);
-                               page_cache_release(page);
-                               goto repeat;
-                       }
-                       VM_BUG_ON(page->index != offset);
-                       goto out;
+               lock_page(page);
+               /* Has the page been truncated? */
+               if (unlikely(page->mapping != mapping)) {
+                       unlock_page(page);
+                       page_cache_release(page);
+                       goto repeat;
                }
+               VM_BUG_ON(page->index != offset);
        }
-       read_unlock_irq(&mapping->tree_lock);
-out:
        return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -751,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
        unsigned int i;
        unsigned int ret;
+       unsigned int nr_found;
+
+       rcu_read_lock();
+restart:
+       nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+                               (void ***)pages, start, nr_pages);
+       ret = 0;
+       for (i = 0; i < nr_found; i++) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot((void **)pages[i]);
+               if (unlikely(!page))
+                       continue;
+               /*
+                * this can only trigger if nr_found == 1, making livelock
+                * a non issue.
+                */
+               if (unlikely(page == RADIX_TREE_RETRY))
+                       goto restart;
+
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
 
-       read_lock_irq(&mapping->tree_lock);
-       ret = radix_tree_gang_lookup(&mapping->page_tree,
-                               (void **)pages, start, nr_pages);
-       for (i = 0; i < ret; i++)
-               page_cache_get(pages[i]);
-       read_unlock_irq(&mapping->tree_lock);
+               /* Has the page moved? */
+               if (unlikely(page != *((void **)pages[i]))) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+
+               pages[ret] = page;
+               ret++;
+       }
+       rcu_read_unlock();
        return ret;
 }
 
@@ -778,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 {
        unsigned int i;
        unsigned int ret;
+       unsigned int nr_found;
+
+       rcu_read_lock();
+restart:
+       nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+                               (void ***)pages, index, nr_pages);
+       ret = 0;
+       for (i = 0; i < nr_found; i++) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot((void **)pages[i]);
+               if (unlikely(!page))
+                       continue;
+               /*
+                * this can only trigger if nr_found == 1, making livelock
+                * a non issue.
+                */
+               if (unlikely(page == RADIX_TREE_RETRY))
+                       goto restart;
 
-       read_lock_irq(&mapping->tree_lock);
-       ret = radix_tree_gang_lookup(&mapping->page_tree,
-                               (void **)pages, index, nr_pages);
-       for (i = 0; i < ret; i++) {
-               if (pages[i]->mapping == NULL || pages[i]->index != index)
+               if (page->mapping == NULL || page->index != index)
                        break;
 
-               page_cache_get(pages[i]);
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               /* Has the page moved? */
+               if (unlikely(page != *((void **)pages[i]))) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+
+               pages[ret] = page;
+               ret++;
                index++;
        }
-       read_unlock_irq(&mapping->tree_lock);
-       return i;
+       rcu_read_unlock();
+       return ret;
 }
 EXPORT_SYMBOL(find_get_pages_contig);
 
@@ -810,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 {
        unsigned int i;
        unsigned int ret;
+       unsigned int nr_found;
+
+       rcu_read_lock();
+restart:
+       nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+                               (void ***)pages, *index, nr_pages, tag);
+       ret = 0;
+       for (i = 0; i < nr_found; i++) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot((void **)pages[i]);
+               if (unlikely(!page))
+                       continue;
+               /*
+                * this can only trigger if nr_found == 1, making livelock
+                * a non issue.
+                */
+               if (unlikely(page == RADIX_TREE_RETRY))
+                       goto restart;
+
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               /* Has the page moved? */
+               if (unlikely(page != *((void **)pages[i]))) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+
+               pages[ret] = page;
+               ret++;
+       }
+       rcu_read_unlock();
 
-       read_lock_irq(&mapping->tree_lock);
-       ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-                               (void **)pages, *index, nr_pages, tag);
-       for (i = 0; i < ret; i++)
-               page_cache_get(pages[i]);
        if (ret)
                *index = pages[ret - 1]->index + 1;
-       read_unlock_irq(&mapping->tree_lock);
+
        return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);