X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fbtrfs%2Fextent_io.c;h=d2d03684fab261fb9663f1d18ef19d9f23d98b78;hb=e7874c996b8591f59d78efa519031dab5b58723b;hp=314041fdfa43a25d5df9cc865b6831563a2c4b64;hpb=c8b978188c9a0fd3d535c13debd19d522b726f1f;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 314041f..d2d0368 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2,14 +2,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include "extent_io.h" @@ -18,21 +16,15 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; static LIST_HEAD(buffers); static LIST_HEAD(states); -#define LEAK_DEBUG 1 -#ifdef LEAK_DEBUG -static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED; +#define LEAK_DEBUG 0 +#if LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); #endif #define BUFFER_LRU_MAX 64 @@ -47,19 +39,27 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; + + /* tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; }; int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; @@ -76,7 +76,11 @@ void extent_io_exit(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs)); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); @@ -84,7 +88,9 @@ void extent_io_exit(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs)); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -97,20 +103,19 @@ void extent_io_exit(void) void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping, gfp_t mask) { - tree->state.rb_node = NULL; - tree->buffer.rb_node = NULL; + tree->state = RB_ROOT; + tree->buffer = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); spin_lock_init(&tree->buffer_lock); tree->mapping = mapping; } -EXPORT_SYMBOL(extent_io_tree_init); -struct extent_state *alloc_extent_state(gfp_t mask) +static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif @@ -120,7 +125,7 @@ struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->private = 0; state->tree = NULL; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_add(&state->leak_list, &states); spin_unlock_irqrestore(&leak_lock, flags); @@ -129,18 +134,17 @@ struct extent_state *alloc_extent_state(gfp_t mask) init_waitqueue_head(&state->wq); return state; } -EXPORT_SYMBOL(alloc_extent_state); -void free_extent_state(struct extent_state *state) +static void free_extent_state(struct extent_state *state) { if (!state) return; if (atomic_dec_and_test(&state->refs)) { -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif WARN_ON(state->tree); -#ifdef LEAK_DEBUG +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); @@ -148,16 +152,15 @@ void free_extent_state(struct extent_state *state) kmem_cache_free(extent_state_cache, state); } } -EXPORT_SYMBOL(free_extent_state); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) { - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; struct tree_entry *entry; - while(*p) { + while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -180,13 +183,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, struct rb_node **next_ret) { struct rb_root *root = &tree->state; - struct rb_node * n = root->rb_node; + struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; - while(n) { + while (n) { entry = rb_entry(n, struct tree_entry, rb_node); prev = n; prev_entry = entry; @@ -195,14 +198,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, n = n->rb_left; else if (offset > entry->end) n = n->rb_right; - else { + else return n; - } } if (prev_ret) { orig_prev = prev; - while(prev && offset > prev_entry->end) { + while (prev && offset > prev_entry->end) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } @@ -212,7 +214,7 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, if (next_ret) { prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while(prev && offset < prev_entry->start) { + while (prev && offset < prev_entry->start) { prev = rb_prev(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } @@ -228,9 +230,8 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, struct rb_node *ret; ret = __etree_search(tree, offset, &prev, NULL); - if (!ret) { + if (!ret) return prev; - } return ret; } @@ -238,11 +239,11 @@ static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, u64 offset, struct rb_node *node) { struct rb_root *root = &tree->buffer; - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; struct extent_buffer *eb; - while(*p) { + while (*p) { parent = *p; eb = rb_entry(parent, struct extent_buffer, rb_node); @@ -263,10 +264,10 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, u64 offset) { struct rb_root *root = &tree->buffer; - struct rb_node * n = root->rb_node; + struct rb_node *n = root->rb_node; struct extent_buffer *eb; - while(n) { + while (n) { eb = rb_entry(n, struct extent_buffer, rb_node); if (offset < eb->start) n = n->rb_left; @@ -278,6 +279,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, return NULL; } +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -301,6 +310,7 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { + merge_cb(tree, state, other); state->start = other->start; other->tree = NULL; rb_erase(&other->rb_node, &tree->state); @@ -312,33 +322,37 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { + merge_cb(tree, state, other); other->start = state->start; state->tree = NULL; rb_erase(&state->rb_node, &tree->state); free_extent_state(state); + state = NULL; } } + return 0; } -static void set_state_cb(struct extent_io_tree *tree, +static int set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { if (tree->ops && tree->ops->set_bit_hook) { - tree->ops->set_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); + return tree->ops->set_bit_hook(tree->mapping->host, + state->start, state->end, + state->state, bits); } + + return 0; } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { - if (tree->ops && tree->ops->set_bit_hook) { - tree->ops->clear_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); - } + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } /* @@ -356,22 +370,31 @@ static int insert_state(struct extent_io_tree *tree, int bits) { struct rb_node *node; + int ret; if (end < start) { - printk("end < start %Lu %Lu\n", end, start); + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); WARN_ON(1); } + state->start = start; + state->end = end; + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if (bits & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - set_state_cb(tree, state, bits); state->state |= bits; - state->start = start; - state->end = end; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); free_extent_state(state); return -EEXIST; } @@ -380,6 +403,15 @@ static int insert_state(struct extent_io_tree *tree, return 0; } +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -398,6 +430,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { struct rb_node *node; + + split_cb(tree, orig, split); + prealloc->start = orig->start; prealloc->end = split - 1; prealloc->state = orig->state; @@ -405,9 +440,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); free_extent_state(prealloc); return -EEXIST; } @@ -427,7 +459,8 @@ static int clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int bits, int wake, int delete) { - int ret = state->state & bits; + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int ret = state->state & bits_to_clear; if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -435,7 +468,7 @@ static int clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - state->state &= ~bits; + state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); if (delete || state->state == 0) { @@ -467,15 +500,22 @@ static int clear_state_bit(struct extent_io_tree *tree, * bits were already set, or zero if none of the bits were already set. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask) + int bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; + struct extent_state *cached; struct extent_state *prealloc = NULL; + struct rb_node *next_node; struct rb_node *node; - unsigned long flags; + u64 last_end; int err; int set = 0; + int clear = 0; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + clear = 1; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -483,7 +523,24 @@ again: return -ENOMEM; } - spin_lock_irqsave(&tree->lock, flags); + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && cached->tree && cached->start == start) { + if (clear) + atomic_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } /* * this search will find the extents that end after * our range starts @@ -492,9 +549,11 @@ again: if (!node) goto out; state = rb_entry(node, struct extent_state, rb_node); +hit_next: if (state->start > end) goto out; WARN_ON(state->end < start); + last_end = state->end; /* * | ---- desired range ---- | @@ -521,11 +580,11 @@ again: if (err) goto out; if (state->end <= end) { - start = state->end + 1; - set |= clear_state_bit(tree, state, bits, - wake, delete); - } else { - start = state->start; + set |= clear_state_bit(tree, state, bits, wake, + delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } goto search_again; } @@ -540,21 +599,34 @@ again: prealloc = alloc_extent_state(GFP_ATOMIC); err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, - wake, delete); + + set |= clear_state_bit(tree, prealloc, bits, wake, delete); + prealloc = NULL; goto out; } - start = state->end + 1; + if (state->end < end && prealloc && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } goto search_again; out: - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock(&tree->lock); if (prealloc) free_extent_state(prealloc); @@ -563,21 +635,22 @@ out: search_again: if (start > end) goto out; - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock(&tree->lock); if (mask & __GFP_WAIT) cond_resched(); goto again; } -EXPORT_SYMBOL(clear_extent_bit); static int wait_on_state(struct extent_io_tree *tree, struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) { DEFINE_WAIT(wait); prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); schedule(); - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); finish_wait(&state->wq, &wait); return 0; } @@ -592,7 +665,7 @@ int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) struct extent_state *state; struct rb_node *node; - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); again: while (1) { /* @@ -621,51 +694,69 @@ again: break; if (need_resched()) { - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); cond_resched(); - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); } } out: - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); return 0; } -EXPORT_SYMBOL(wait_extent_bit); -static void set_state_bits(struct extent_io_tree *tree, +static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int bits) { + int ret; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - set_state_cb(tree, state, bits); state->state |= bits; + + return 0; +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + if (cached_ptr && !(*cached_ptr)) { + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } } /* - * set some bits on a range in the tree. This may require allocations - * or sleeping, so the gfp mask is used to indicate what is allowed. + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. * - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the - * range already has the desired bits set. The start of the existing - * range is returned in failed_start in this case. + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. * - * [start, end] is inclusive - * This takes the tree lock. + * [start, end] is inclusive This takes the tree lock. */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, - int exclusive, u64 *failed_start, gfp_t mask) + +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; - unsigned long flags; int err = 0; - int set; u64 last_start; u64 last_end; + again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -673,7 +764,14 @@ again: return -ENOMEM; } - spin_lock_irqsave(&tree->lock, flags); + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start == start && state->tree) { + node = &state->rb_node; + goto hit_next; + } + } /* * this search will find all the extents that end after * our range starts. @@ -685,8 +783,8 @@ again: BUG_ON(err == -EEXIST); goto out; } - state = rb_entry(node, struct extent_state, rb_node); +hit_next: last_start = state->start; last_end = state->end; @@ -697,15 +795,32 @@ again: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set = state->state & bits; - if (set && exclusive) { + struct rb_node *next_node; + if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; goto out; } - set_state_bits(tree, state, bits); - start = state->end + 1; + + err = set_state_bits(tree, state, bits); + if (err) + goto out; + + cache_state(state, cached_state); merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + if (start < end && prealloc && !need_resched()) { + next_node = rb_next(node); + if (next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + } goto search_again; } @@ -726,8 +841,7 @@ again: * desired bit on it. */ if (state->start < start) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -738,11 +852,14 @@ again: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, bits); - start = state->end + 1; + err = set_state_bits(tree, state, bits); + if (err) + goto out; + cache_state(state, cached_state); merge_state(tree, state); - } else { - start = state->start; + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } goto search_again; } @@ -758,13 +875,16 @@ again: if (end < last_start) this_end = end; else - this_end = last_start -1; + this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, bits); - prealloc = NULL; BUG_ON(err == -EEXIST); - if (err) + if (err) { + prealloc = NULL; goto out; + } + cache_state(prealloc, cached_state); + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -775,8 +895,7 @@ again: * on the first half */ if (state->start <= end && state->end > end) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -784,7 +903,12 @@ again: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, bits); + if (err) { + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; goto out; @@ -793,7 +917,7 @@ again: goto search_again; out: - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock(&tree->lock); if (prealloc) free_extent_state(prealloc); @@ -802,130 +926,97 @@ out: search_again: if (start > end) goto out; - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock(&tree->lock); if (mask & __GFP_WAIT) cond_resched(); goto again; } -EXPORT_SYMBOL(set_extent_bit); /* wrappers around set/clear extent bit */ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, - mask); + NULL, mask); } -EXPORT_SYMBOL(set_extent_dirty); - -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); -} -EXPORT_SYMBOL(set_extent_ordered); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, 0, NULL, - mask); + NULL, mask); } -EXPORT_SYMBOL(set_extent_bits); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, mask); + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } -EXPORT_SYMBOL(clear_extent_bits); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) + struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY, - 0, NULL, mask); + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + 0, NULL, cached_state, mask); } -EXPORT_SYMBOL(set_extent_delalloc); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_dirty); - -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, + NULL, mask); } -EXPORT_SYMBOL(clear_extent_ordered); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, - mask); + NULL, mask); } -EXPORT_SYMBOL(set_extent_new); -int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, +static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, + NULL, mask); } -EXPORT_SYMBOL(clear_extent_new); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_uptodate); - -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); + NULL, mask); } -EXPORT_SYMBOL(clear_extent_uptodate); -int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, + gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, - 0, NULL, mask); + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); } -EXPORT_SYMBOL(set_extent_writeback); - -int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); -} -EXPORT_SYMBOL(clear_extent_writeback); int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) { return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); } -EXPORT_SYMBOL(wait_on_extent_writeback); /* * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached_state, gfp_t mask) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, mask); if (err == -EEXIST && (mask & __GFP_WAIT)) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -936,14 +1027,42 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) } return err; } -EXPORT_SYMBOL(lock_extent); + +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return lock_extent_bits(tree, start, end, 0, NULL, mask); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; +} + +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + mask); } -EXPORT_SYMBOL(unlock_extent); /* * helper function to set pages and extents in the tree dirty @@ -961,15 +1080,13 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_dirty(tree, start, end, GFP_NOFS); return 0; } -EXPORT_SYMBOL(set_range_dirty); /* * helper function to set both pages and extents in the tree writeback */ -int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -982,10 +1099,8 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_writeback(tree, start, end, GFP_NOFS); return 0; } -EXPORT_SYMBOL(set_range_writeback); /* * find the first offset in the io tree with 'bits' set. zero is @@ -1001,17 +1116,16 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state *state; int ret = 1; - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); /* * this search will find all the extents that end after * our range starts. */ node = tree_search(tree, start); - if (!node) { + if (!node) goto out; - } - while(1) { + while (1) { state = rb_entry(node, struct extent_state, rb_node); if (state->end >= start && (state->state & bits)) { *start_ret = state->start; @@ -1024,10 +1138,9 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, break; } out: - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); return ret; } -EXPORT_SYMBOL(find_first_extent_bit); /* find the first state struct with 'bits' set after 'start', and * return it. tree->lock must be held. NULL will returned if @@ -1044,15 +1157,14 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, * our range starts. */ node = tree_search(tree, start); - if (!node) { + if (!node) goto out; - } - while(1) { + while (1) { state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) { + if (state->end >= start && (state->state & bits)) return state; - } + node = rb_next(node); if (!node) break; @@ -1060,7 +1172,6 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, out: return NULL; } -EXPORT_SYMBOL(find_first_extent_bit_state); /* * find a contiguous range of bytes in the file marked as delalloc, not @@ -1069,7 +1180,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state); * 1 is returned if we find something, 0 if nothing was in the tree */ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes) + u64 *start, u64 *end, u64 max_bytes, + struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; @@ -1077,7 +1189,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, u64 found = 0; u64 total_bytes = 0; - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); /* * this search will find all the extents that end after @@ -1090,7 +1202,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, goto out; } - while(1) { + while (1) { state = rb_entry(node, struct extent_state, rb_node); if (found && (state->start != cur_start || (state->state & EXTENT_BOUNDARY))) { @@ -1101,8 +1213,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *end = state->end; goto out; } - if (!found) + if (!found) { *start = state->start; + *cached_state = state; + atomic_inc(&state->refs); + } found++; *end = state->end; cur_start = state->end + 1; @@ -1114,7 +1229,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, break; } out: - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); return found; } @@ -1132,9 +1247,10 @@ static noinline int __unlock_for_delalloc(struct inode *inode, if (index == locked_page->index && end_index == index) return 0; - while(nr_pages > 0) { + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, - min(nr_pages, ARRAY_SIZE(pages)), pages); + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { if (pages[i] != locked_page) unlock_page(pages[i]); @@ -1167,9 +1283,10 @@ static noinline int lock_delalloc_pages(struct inode *inode, /* skip the page at the start index */ nrpages = end_index - index + 1; - while(nrpages > 0) { + while (nrpages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, - min(nrpages, ARRAY_SIZE(pages)), pages); + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); if (ret == 0) { ret = -EAGAIN; goto done; @@ -1180,11 +1297,19 @@ static noinline int lock_delalloc_pages(struct inode *inode, * the caller is taking responsibility for * locked_page */ - if (pages[i] != locked_page) + if (pages[i] != locked_page) { lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } page_cache_release(pages[i]); + pages_locked++; } - pages_locked += ret; nrpages -= ret; index += ret; cond_resched(); @@ -1215,6 +1340,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, u64 delalloc_start; u64 delalloc_end; u64 found; + struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1223,21 +1349,29 @@ again: delalloc_start = *start; delalloc_end = 0; found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, - max_bytes); - if (!found) { + max_bytes, &cached_state); + if (!found || delalloc_end <= *start) { *start = delalloc_start; *end = delalloc_end; + free_extent_state(cached_state); return found; } /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* * make sure to limit the number of pages we try to lock down * if we're looping. */ - if (delalloc_end + 1 - delalloc_start > max_bytes && loops) { - delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) & - ~((u64)PAGE_CACHE_SIZE - 1); - } + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); @@ -1245,6 +1379,7 @@ again: /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ + free_extent_state(cached_state); if (!loops) { unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); max_bytes = PAGE_CACHE_SIZE - offset; @@ -1258,18 +1393,21 @@ again: BUG_ON(ret); /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, + 0, &cached_state, GFP_NOFS); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1); + EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); goto again; } + free_extent_state(cached_state); *start = delalloc_start; *end = delalloc_end; out_failed: @@ -1279,8 +1417,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int clear_dirty, int set_writeback, - int end_writeback) + unsigned long op) { int ret; struct page *pages[16]; @@ -1288,28 +1425,46 @@ int extent_clear_unlock_delalloc(struct inode *inode, unsigned long end_index = end >> PAGE_CACHE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; - int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + int clear_bits = 0; - if (clear_dirty) + if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; + if (op & EXTENT_CLEAR_DIRTY) clear_bits |= EXTENT_DIRTY; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + + if (op & EXTENT_CLEAR_ACCOUNTING) + clear_bits |= EXTENT_DO_ACCOUNTING; - while(nr_pages > 0) { + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | + EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, - min(nr_pages, ARRAY_SIZE(pages)), pages); + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { + + if (op & EXTENT_SET_PRIVATE2) + SetPagePrivate2(pages[i]); + if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (set_writeback) + if (op & EXTENT_SET_WRITEBACK) set_page_writeback(pages[i]); - if (end_writeback) + if (op & EXTENT_END_WRITEBACK) end_page_writeback(pages[i]); - unlock_page(pages[i]); + if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); page_cache_release(pages[i]); } nr_pages -= ret; @@ -1318,7 +1473,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, } return 0; } -EXPORT_SYMBOL(extent_clear_unlock_delalloc); /* * count the number of bytes in the tree that have a given bit(s) @@ -1336,12 +1490,11 @@ u64 count_range_bits(struct extent_io_tree *tree, int found = 0; if (search_end <= cur_start) { - printk("search_end %Lu start %Lu\n", search_end, cur_start); WARN_ON(1); return 0; } - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); if (cur_start == 0 && bits == EXTENT_DIRTY) { total_bytes = tree->dirty_bytes; goto out; @@ -1351,11 +1504,10 @@ u64 count_range_bits(struct extent_io_tree *tree, * our range starts. */ node = tree_search(tree, cur_start); - if (!node) { + if (!node) goto out; - } - while(1) { + while (1) { state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; @@ -1374,71 +1526,9 @@ u64 count_range_bits(struct extent_io_tree *tree, break; } out: - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); return total_bytes; } -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} -EXPORT_SYMBOL(lock_range); - -/* - * helper function to unlock both pages and extents in the tree. - */ -int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(unlock_range); /* * set the private field for a given byte offset in the tree. If there isn't @@ -1450,7 +1540,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) struct extent_state *state; int ret = 0; - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); /* * this search will find all the extents that end after * our range starts. @@ -1467,7 +1557,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) } state->private = private; out: - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); return ret; } @@ -1477,7 +1567,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) struct extent_state *state; int ret = 0; - spin_lock_irq(&tree->lock); + spin_lock(&tree->lock); /* * this search will find all the extents that end after * our range starts. @@ -1494,7 +1584,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) } *private = state->private; out: - spin_unlock_irq(&tree->lock); + spin_unlock(&tree->lock); return ret; } @@ -1505,15 +1595,17 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled) + int bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; int bitset = 0; - unsigned long flags; - spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, start); + spin_lock(&tree->lock); + if (cached && cached->tree && cached->start == start) + node = &cached->rb_node; + else + node = tree_search(tree, start); while (node && start <= end) { state = rb_entry(node, struct extent_state, rb_node); @@ -1533,6 +1625,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, bitset = 0; break; } + + if (state->end == (u64)-1) + break; + start = state->end + 1; if (start > end) break; @@ -1543,10 +1639,9 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, break; } } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock(&tree->lock); return bitset; } -EXPORT_SYMBOL(test_range_bit); /* * helper function to set a given page up to date if all the @@ -1557,7 +1652,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); return 0; } @@ -1571,7 +1666,7 @@ static int check_page_locked(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); return 0; } @@ -1583,10 +1678,7 @@ static int check_page_locked(struct extent_io_tree *tree, static int check_page_writeback(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) - end_page_writeback(page); + end_page_writeback(page); return 0; } @@ -1644,13 +1736,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) } if (!uptodate) { - clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } - clear_extent_writeback(tree, start, end, GFP_ATOMIC); - if (whole_page) end_page_writeback(page); else @@ -1674,13 +1764,17 @@ static void end_bio_extent_writepage(struct bio *bio, int err) static void end_bio_extent_readpage(struct bio *bio, int err) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; struct extent_io_tree *tree; u64 start; u64 end; int whole_page; int ret; + if (err) + uptodate = 0; + do { struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -1694,7 +1788,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) else whole_page = 0; - if (--bvec >= bio->bi_io_vec) + if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { @@ -1710,13 +1804,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; continue; } } - if (uptodate) + if (uptodate) { set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } unlock_extent(tree, start, end, GFP_ATOMIC); if (whole_page) { @@ -1736,7 +1833,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } check_page_locked(tree, page); } - } while (bvec >= bio->bi_io_vec); + } while (bvec <= bvec_end); bio_put(bio); } @@ -1845,7 +1942,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, int contig = 0; int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; - size_t page_size = min(size, PAGE_CACHE_SIZE); + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); if (bio_ret && *bio_ret) { bio = *bio_ret; @@ -1873,19 +1970,15 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, nr = bio_get_nr_vecs(bdev); bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); - if (!bio) { - printk("failed to allocate bio nr %d\n", nr); - } bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; - if (bio_ret) { + if (bio_ret) *bio_ret = bio; - } else { + else ret = submit_one_bio(rw, bio, mirror_num, bio_flags); - } return ret; } @@ -1899,7 +1992,7 @@ void set_page_extent_mapped(struct page *page) } } -void set_page_extent_head(struct page *page, unsigned long len) +static void set_page_extent_head(struct page *page, unsigned long len) { set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } @@ -1973,13 +2066,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, break; } extent_offset = cur - em->start; - if (extent_map_end(em) <= cur) { -printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur); - } BUG_ON(extent_map_end(em) <= cur); - if (end < cur) { -printk("2bad mapping end %Lu cur %Lu\n", end, cur); - } BUG_ON(end < cur); if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) @@ -1997,6 +2084,8 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); } bdev = em->bdev; block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; free_extent_map(em); em = NULL; @@ -2016,7 +2105,8 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); cur = cur + iosize; @@ -2077,7 +2167,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, submit_one_bio(READ, bio, 0, bio_flags); return ret; } -EXPORT_SYMBOL(extent_read_full_page); + +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} /* * the writepage semantics are similar to regular writepage. extent @@ -2102,6 +2201,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 iosize; u64 unlock_start; sector_t sector; + struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; int ret; @@ -2114,6 +2214,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 delalloc_end; int page_started; int compressed; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); @@ -2140,48 +2247,80 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = start; delalloc_end = 0; page_started = 0; - while(delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, + if (!epd->extent_locked) { + u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, page, &delalloc_start, &delalloc_end, 128 * 1024 * 1024); - if (nr_delalloc == 0) { + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; delalloc_start = delalloc_end + 1; - continue; } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started); - delalloc_start = delalloc_end + 1; - } - - /* did the fill delalloc function already unlock and start the IO? */ - if (page_started) { - return 0; - } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; - lock_extent(tree, start, page_end, GFP_NOFS); - unlock_start = start; + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + ret = 0; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; + } + } if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); if (ret == -EAGAIN) { - unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); - return 0; + ret = 0; + goto done_unlocked; } } - end = page_end; - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { - printk("found delalloc bits after lock_extent\n"); - } + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); + end = page_end; if (last_byte <= start) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - unlock_extent(tree, start, page_end, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -2189,13 +2328,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - set_extent_uptodate(tree, start, page_end, GFP_NOFS); blocksize = inode->i_sb->s_blocksize; while (cur <= end) { if (cur >= last_byte) { - clear_extent_dirty(tree, cur, page_end, GFP_NOFS); - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -2227,12 +2363,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - clear_extent_dirty(tree, cur, - cur + iosize - 1, GFP_NOFS); - - unlock_extent(tree, unlock_start, cur + iosize -1, - GFP_NOFS); - /* * end_io notification does not happen here for * compressed extents @@ -2257,13 +2387,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } /* leave this out until we have a page_mkwrite call */ if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0)) { + EXTENT_DIRTY, 0, NULL)) { cur = cur + iosize; pg_offset += iosize; continue; } - clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1); @@ -2277,15 +2406,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - printk("warning page %lu not writeback, " - "cur %llu end %llu\n", page->index, - (unsigned long long)cur, + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, (unsigned long long)end); } - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, pg_offset, bdev, - &epd->bio, max_nr, + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0); if (ret) @@ -2301,9 +2430,12 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (unlock_start <= page_end) - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); return 0; } @@ -2322,14 +2454,15 @@ done: * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. */ -int extent_write_cache_pages(struct extent_io_tree *tree, +static int extent_write_cache_pages(struct extent_io_tree *tree, struct address_space *mapping, struct writeback_control *wbc, - writepage_t writepage, void *data) + writepage_t writepage, void *data, + void (*flush_fn)(void *)) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; + int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; @@ -2337,11 +2470,6 @@ int extent_write_cache_pages(struct extent_io_tree *tree, int scanned = 0; int range_whole = 0; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ @@ -2354,10 +2482,10 @@ int extent_write_cache_pages(struct extent_io_tree *tree, scanned = 1; } retry: - while (!done && (index <= end) && + while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; @@ -2387,8 +2515,11 @@ retry: continue; } - if (wbc->sync_mode != WB_SYNC_NONE) + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); wait_on_page_writeback(page); + } if (PageWriteback(page) || !clear_page_dirty_for_io(page)) { @@ -2402,12 +2533,15 @@ retry: unlock_page(page); ret = 0; } - if (ret || (--(wbc->nr_to_write) <= 0)) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; + if (ret) done = 1; - } + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; } pagevec_release(&pvec); cond_resched(); @@ -2421,14 +2555,25 @@ retry: index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; - - if (wbc->range_cont) - wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } -EXPORT_SYMBOL(extent_write_cache_pages); + +static void flush_epd_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} int extent_write_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, @@ -2440,28 +2585,70 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .bio = NULL, .tree = tree, .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, + .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, }; - ret = __extent_writepage(page, wbc, &epd); extent_write_cache_pages(tree, mapping, &wbc_writepages, - __extent_writepage, &epd); - if (epd.bio) { - submit_one_bio(WRITE, epd.bio, 0, 0); - } + __extent_writepage, &epd, flush_write_bio); + flush_epd_write_bio(&epd); return ret; } -EXPORT_SYMBOL(extent_write_full_page); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, + }; + struct writeback_control wbc_writepages = { + .bdi = inode->i_mapping->backing_dev_info, + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + flush_epd_write_bio(&epd); + return ret; +} int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, @@ -2473,16 +2660,16 @@ int extent_writepages(struct extent_io_tree *tree, .bio = NULL, .tree = tree, .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(tree, mapping, wbc, - __extent_writepage, &epd); - if (epd.bio) { - submit_one_bio(WRITE, epd.bio, 0, 0); - } + __extent_writepage, &epd, + flush_write_bio); + flush_epd_write_bio(&epd); return ret; } -EXPORT_SYMBOL(extent_writepages); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, @@ -2491,39 +2678,25 @@ int extent_readpages(struct extent_io_tree *tree, { struct bio *bio = NULL; unsigned page_idx; - struct pagevec pvec; unsigned long bio_flags = 0; - pagevec_init(&pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); list_del(&page->lru); - /* - * what we want to do here is call add_to_page_cache_lru, - * but that isn't exported, so we reproduce it here - */ - if (!add_to_page_cache(page, mapping, + if (!add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { - - /* open coding of lru_cache_add, also not exported */ - page_cache_get(page); - if (!pagevec_add(&pvec, page)) - __pagevec_lru_add(&pvec); __extent_read_full_page(tree, page, get_extent, &bio, 0, &bio_flags); } page_cache_release(page); } - if (pagevec_count(&pvec)) - __pagevec_lru_add(&pvec); BUG_ON(!list_empty(pages)); if (bio) submit_one_bio(READ, bio, 0, bio_flags); return 0; } -EXPORT_SYMBOL(extent_readpages); /* * basic invalidatepage code, this waits on any locked or writeback @@ -2533,22 +2706,23 @@ EXPORT_SYMBOL(extent_readpages); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset) { + struct extent_state *cached_state = NULL; u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; - start += (offset + blocksize -1) & ~(blocksize - 1); + start += (offset + blocksize - 1) & ~(blocksize - 1); if (start > end) return 0; - lock_extent(tree, start, end, GFP_NOFS); - wait_on_extent_writeback(tree, start, end); + lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + wait_on_page_writeback(page); clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + 1, 1, &cached_state, GFP_NOFS); return 0; } -EXPORT_SYMBOL(extent_invalidatepage); /* * simple commit_write call, set_range_dirty is used to mark both @@ -2569,7 +2743,6 @@ int extent_commit_write(struct extent_io_tree *tree, } return 0; } -EXPORT_SYMBOL(extent_commit_write); int extent_prepare_write(struct extent_io_tree *tree, struct inode *inode, struct page *page, @@ -2598,12 +2771,12 @@ int extent_prepare_write(struct extent_io_tree *tree, orig_block_start = block_start; lock_extent(tree, page_start, page_end, GFP_NOFS); - while(block_start <= block_end) { + while (block_start <= block_end) { em = get_extent(inode, page, page_offset, block_start, block_end - block_start + 1, 1); - if (IS_ERR(em) || !em) { + if (IS_ERR(em) || !em) goto err; - } + cur_end = min(block_end, extent_map_end(em) - 1); block_off_start = block_start & (PAGE_CACHE_SIZE - 1); block_off_end = block_off_start + blocksize; @@ -2627,7 +2800,7 @@ int extent_prepare_write(struct extent_io_tree *tree, !isnew && !PageUptodate(page) && (block_off_end > to || block_off_start < from) && !test_range_bit(tree, block_start, cur_end, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { u64 sector; u64 extent_offset = block_start - em->start; size_t iosize; @@ -2641,7 +2814,7 @@ int extent_prepare_write(struct extent_io_tree *tree, */ set_extent_bit(tree, block_start, block_start + iosize - 1, - EXTENT_LOCKED, 0, NULL, GFP_NOFS); + EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, NULL, 1, @@ -2667,7 +2840,6 @@ err: /* FIXME, zero out newly allocated blocks on error */ return err; } -EXPORT_SYMBOL(extent_prepare_write); /* * a helper for releasepage, this tests for areas of the page that @@ -2683,17 +2855,21 @@ int try_release_extent_state(struct extent_map_tree *map, int ret = 1; if (test_range_bit(tree, start, end, - EXTENT_IOBITS | EXTENT_ORDERED, 0)) + EXTENT_IOBITS, 0, NULL)) ret = 0; else { if ((mask & GFP_NOFS) == GFP_NOFS) mask = GFP_NOFS; - clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - 1, 1, mask); + /* + * at this point we can safely clear everything except the + * locked bit and the nodatasum bit + */ + clear_extent_bit(tree, start, end, + ~(EXTENT_LOCKED | EXTENT_NODATASUM), + 0, 0, NULL, mask); } return ret; } -EXPORT_SYMBOL(try_release_extent_state); /* * a helper for releasepage. As long as there are no locked extents @@ -2713,29 +2889,28 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 len; while (start <= end) { len = end - start + 1; - spin_lock(&map->lock); + write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); if (!em || IS_ERR(em)) { - spin_unlock(&map->lock); + write_unlock(&map->lock); break; } if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || em->start != start) { - spin_unlock(&map->lock); + write_unlock(&map->lock); free_extent_map(em); break; } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK | - EXTENT_ORDERED, - 0)) { + EXTENT_LOCKED | EXTENT_WRITEBACK, + 0, NULL)) { remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); } start = extent_map_end(em); - spin_unlock(&map->lock); + write_unlock(&map->lock); /* once for us */ free_extent_map(em); @@ -2743,22 +2918,26 @@ int try_release_extent_mapping(struct extent_map_tree *map, } return try_release_extent_state(map, tree, page, mask); } -EXPORT_SYMBOL(try_release_extent_mapping); sector_t extent_bmap(struct address_space *mapping, sector_t iblock, get_extent_t *get_extent) { struct inode *inode = mapping->host; + struct extent_state *cached_state = NULL; u64 start = iblock << inode->i_blkbits; sector_t sector = 0; + size_t blksize = (1 << inode->i_blkbits); struct extent_map *em; - em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + 0, &cached_state, GFP_NOFS); + em = get_extent(inode, NULL, 0, start, blksize, 0); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, + start + blksize - 1, &cached_state, GFP_NOFS); if (!em || IS_ERR(em)) return 0; - if (em->block_start == EXTENT_MAP_INLINE || - em->block_start == EXTENT_MAP_HOLE) + if (em->block_start > EXTENT_MAP_LAST_BYTE) goto out; sector = (em->block_start + start - em->start) >> inode->i_blkbits; @@ -2767,6 +2946,93 @@ out: return sector; } +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent) +{ + int ret; + u64 off = start; + u64 max = start + len; + u32 flags = 0; + u64 disko = 0; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + int end = 0; + u64 em_start = 0, em_len = 0; + unsigned long emflags; + ret = 0; + + if (len == 0) + return -EINVAL; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + &cached_state, GFP_NOFS); + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + while (!end) { + off = em->start + em->len; + if (off >= max) + end = 1; + + em_start = em->start; + em_len = em->len; + + disko = 0; + flags = 0; + + if (em->block_start == EXTENT_MAP_LAST_BYTE) { + end = 1; + flags |= FIEMAP_EXTENT_LAST; + } else if (em->block_start == EXTENT_MAP_HOLE) { + flags |= FIEMAP_EXTENT_UNWRITTEN; + } else if (em->block_start == EXTENT_MAP_INLINE) { + flags |= (FIEMAP_EXTENT_DATA_INLINE | + FIEMAP_EXTENT_NOT_ALIGNED); + } else if (em->block_start == EXTENT_MAP_DELALLOC) { + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + } else { + disko = em->block_start; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + flags |= FIEMAP_EXTENT_ENCODED; + + emflags = em->flags; + free_extent_map(em); + em = NULL; + + if (!end) { + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + emflags = em->flags; + } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } +out_free: + free_extent_map(em); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + &cached_state, GFP_NOFS); + return ret; +} + static inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { @@ -2804,15 +3070,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, gfp_t mask) { struct extent_buffer *eb = NULL; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); eb->start = start; eb->len = len; - mutex_init(&eb->mutex); -#ifdef LEAK_DEBUG + spin_lock_init(&eb->lock); + init_waitqueue_head(&eb->lock_wq); + +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); @@ -2824,7 +3092,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, static void __free_extent_buffer(struct extent_buffer *eb) { -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; spin_lock_irqsave(&leak_lock, flags); list_del(&eb->leak_list); @@ -2892,8 +3160,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, unlock_page(p); } if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); spin_lock(&tree->buffer_lock); exists = buffer_tree_insert(tree, start, &eb->rb_node); @@ -2903,10 +3170,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, spin_unlock(&tree->buffer_lock); goto free_eb; } - spin_unlock(&tree->buffer_lock); - /* add one reference for the tree */ atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); return eb; free_eb: @@ -2918,7 +3184,6 @@ free_eb: __free_extent_buffer(eb); return exists; } -EXPORT_SYMBOL(alloc_extent_buffer); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -2937,7 +3202,6 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return eb; } -EXPORT_SYMBOL(find_extent_buffer); void free_extent_buffer(struct extent_buffer *eb) { @@ -2949,46 +3213,27 @@ void free_extent_buffer(struct extent_buffer *eb) WARN_ON(1); } -EXPORT_SYMBOL(free_extent_buffer); int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { - int set; unsigned long i; unsigned long num_pages; struct page *page; - u64 start = eb->start; - u64 end = start + eb->len - 1; - - set = clear_extent_dirty(tree, start, end, GFP_NOFS); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); + if (!PageDirty(page)) + continue; + lock_page(page); if (i == 0) set_page_extent_head(page, eb->len); else set_page_private(page, EXTENT_PAGE_PRIVATE); - /* - * if we're on the last page or the first page and the - * block isn't aligned on a page boundary, do extra checks - * to make sure we don't clean page that is partially dirty - */ - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - start = (u64)page->index << PAGE_CACHE_SHIFT; - end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, - EXTENT_DIRTY, 0)) { - unlock_page(page); - continue; - } - } clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3001,7 +3246,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, } return 0; } -EXPORT_SYMBOL(clear_extent_buffer_dirty); int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, struct extent_buffer *eb) @@ -3009,51 +3253,34 @@ int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, return wait_on_extent_writeback(tree, eb->start, eb->start + eb->len - 1); } -EXPORT_SYMBOL(wait_on_extent_buffer_writeback); int set_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; + int was_dirty = 0; + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *page = extent_buffer_page(eb, i); - /* writepage may need to do something special for the - * first page, we have to make sure page->private is - * properly set. releasepage may drop page->private - * on us if the page isn't already dirty. - */ - lock_page(page); - if (i == 0) { - set_page_extent_head(page, eb->len); - } else if (PagePrivate(page) && - page->private != EXTENT_PAGE_PRIVATE) { - set_page_extent_mapped(page); - } + for (i = 0; i < num_pages; i++) __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - set_extent_dirty(tree, page_offset(page), - page_offset(page) + PAGE_CACHE_SIZE -1, - GFP_NOFS); - unlock_page(page); - } - return 0; + return was_dirty; } -EXPORT_SYMBOL(set_extent_buffer_dirty); int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) + struct extent_buffer *eb, + struct extent_state **cached_state) { unsigned long i; struct page *page; unsigned long num_pages; num_pages = num_extent_pages(eb->start, eb->len); - eb->flags &= ~EXTENT_UPTODATE; + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3085,7 +3312,6 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, } return 0; } -EXPORT_SYMBOL(set_extent_buffer_uptodate); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end) @@ -3096,10 +3322,10 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) return 1; - while(start <= end) { + while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); uptodate = PageUptodate(page); @@ -3114,7 +3340,8 @@ int extent_range_uptodate(struct extent_io_tree *tree, } int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) + struct extent_buffer *eb, + struct extent_state *cached_state) { int ret = 0; unsigned long num_pages; @@ -3122,11 +3349,11 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, struct page *page; int pg_uptodate = 1; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1); + EXTENT_UPTODATE, 1, cached_state); if (ret) return ret; @@ -3140,7 +3367,6 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } return pg_uptodate; } -EXPORT_SYMBOL(extent_buffer_uptodate); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, @@ -3159,11 +3385,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned long bio_flags = 0; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { return 0; } @@ -3185,16 +3411,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) { + if (!PageUptodate(page)) all_uptodate = 0; - } } if (all_uptodate) { if (start_i == 0) - eb->flags |= EXTENT_UPTODATE; - if (ret) { - printk("all up to date but ret is %d\n", ret); - } + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); goto unlock_exit; } @@ -3209,10 +3431,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); - if (err) { + if (err) ret = err; - printk("err %d from __extent_read_full_page\n", ret); - } } else { unlock_page(page); } @@ -3221,26 +3441,23 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) { - if (ret) - printk("ret %d wait %d returning\n", ret, wait); + if (ret || !wait) return ret; - } + for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); wait_on_page_locked(page); - if (!PageUptodate(page)) { - printk("page not uptodate after wait_on_page_locked\n"); + if (!PageUptodate(page)) ret = -EIO; - } } + if (!ret) - eb->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: i = start_i; - while(locked_pages > 0) { + while (locked_pages > 0) { page = extent_buffer_page(eb, i); i++; unlock_page(page); @@ -3248,7 +3465,6 @@ unlock_exit: } return ret; } -EXPORT_SYMBOL(read_extent_buffer_pages); void read_extent_buffer(struct extent_buffer *eb, void *dstv, unsigned long start, @@ -3267,7 +3483,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - while(len > 0) { + while (len > 0) { page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); @@ -3281,7 +3497,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, i++; } } -EXPORT_SYMBOL(read_extent_buffer); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **token, char **map, @@ -3306,8 +3521,11 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, offset = 0; *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; } + if (start + min_len > eb->len) { -printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); WARN_ON(1); } @@ -3318,7 +3536,6 @@ printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, *map_len = PAGE_CACHE_SIZE - offset; return 0; } -EXPORT_SYMBOL(map_private_extent_buffer); int map_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, @@ -3343,13 +3560,11 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, } return err; } -EXPORT_SYMBOL(map_extent_buffer); void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) { kunmap_atomic(token, km); } -EXPORT_SYMBOL(unmap_extent_buffer); int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, @@ -3369,7 +3584,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - while(len > 0) { + while (len > 0) { page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); @@ -3387,7 +3602,6 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, } return ret; } -EXPORT_SYMBOL(memcmp_extent_buffer); void write_extent_buffer(struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len) @@ -3405,7 +3619,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - while(len > 0) { + while (len > 0) { page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); @@ -3420,7 +3634,6 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, i++; } } -EXPORT_SYMBOL(write_extent_buffer); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len) @@ -3437,7 +3650,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - while(len > 0) { + while (len > 0) { page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); @@ -3451,7 +3664,6 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, i++; } } -EXPORT_SYMBOL(memset_extent_buffer); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, @@ -3470,7 +3682,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, offset = (start_offset + dst_offset) & ((unsigned long)PAGE_CACHE_SIZE - 1); - while(len > 0) { + while (len > 0) { page = extent_buffer_page(dst, i); WARN_ON(!PageUptodate(page)); @@ -3486,7 +3698,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, i++; } } -EXPORT_SYMBOL(copy_extent_buffer); static void move_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, @@ -3537,17 +3748,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk("memmove bogus src_offset %lu move len %lu len %lu\n", - src_offset, len, dst->len); + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk("memmove bogus dst_offset %lu move len %lu len %lu\n", - dst_offset, len, dst->len); + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - while(len > 0) { + while (len > 0) { dst_off_in_page = (start_offset + dst_offset) & ((unsigned long)PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_offset) & @@ -3570,7 +3781,6 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, len -= cur; } } -EXPORT_SYMBOL(memcpy_extent_buffer); void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) @@ -3585,20 +3795,20 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk("memmove bogus src_offset %lu move len %lu len %lu\n", - src_offset, len, dst->len); + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk("memmove bogus dst_offset %lu move len %lu len %lu\n", - dst_offset, len, dst->len); + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } - while(len > 0) { + while (len > 0) { dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; @@ -3619,7 +3829,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, len -= cur; } } -EXPORT_SYMBOL(memmove_extent_buffer); int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) { @@ -3638,6 +3847,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ret = 0; goto out; } + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } /* at this point we can safely release the extent buffer */ num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) @@ -3648,4 +3861,3 @@ out: spin_unlock(&tree->buffer_lock); return ret; } -EXPORT_SYMBOL(try_release_extent_buffer);