X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fbtrfs%2Fextent-tree.c;h=559f72489b3bf02b4477369da854bf371cbbd0e4;hb=f958a1320ff7a1e0e861d3c90de6da12a88839dc;hp=9bcb9c09c3b81271b87bf3c043e61df2ff6a8cb4;hpb=11833d66be94b514652466802100378046c16b72;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bcb9c0..559f724 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -81,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + /* * this adds the block group to the fs_info rb tree for the block group * cache @@ -154,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } } if (ret) - atomic_inc(&ret->count); + btrfs_get_block_group(ret); spin_unlock(&info->block_group_cache_lock); return ret; @@ -193,6 +206,14 @@ static int exclude_super_stripes(struct btrfs_root *root, int stripe_len; int i, nr, ret; + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + BUG_ON(ret); + } + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); ret = btrfs_rmap_block(&root->fs_info->mapping_tree, @@ -201,6 +222,7 @@ static int exclude_super_stripes(struct btrfs_root *root, BUG_ON(ret); while (nr--) { + cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); BUG_ON(ret); @@ -252,7 +274,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, if (ret) break; - if (extent_start == start) { + if (extent_start <= start) { start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; @@ -295,6 +317,9 @@ static int caching_kthread(void *data) return -ENOMEM; exclude_super_stripes(extent_root, block_group); + spin_lock(&block_group->space_info->lock); + block_group->space_info->bytes_super += block_group->bytes_super; + spin_unlock(&block_group->space_info->lock); last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -393,6 +418,8 @@ err: put_caching_control(caching_ctl); atomic_dec(&block_group->space_info->caching_threads); + btrfs_put_block_group(block_group); + return 0; } @@ -433,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache) up_write(&fs_info->extent_commit_sem); atomic_inc(&cache->space_info->caching_threads); + btrfs_get_block_group(cache); tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", cache->key.objectid); @@ -472,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) - kfree(cache); -} - static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -1562,22 +1584,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, + DISCARD_FL_BARRIER); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); @@ -1597,9 +1620,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2570,7 +2590,7 @@ next_block_group(struct btrfs_root *root, if (node) { cache = rb_entry(node, struct btrfs_block_group_cache, cache_node); - atomic_inc(&cache->count); + btrfs_get_block_group(cache); } else cache = NULL; spin_unlock(&root->fs_info->block_group_cache_lock); @@ -2760,60 +2780,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) alloc_target); } +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +{ + u64 num_bytes; + int level; + + level = BTRFS_MAX_LEVEL - 2; + /* + * NOTE: these calculations are absolutely the worst possible case. + * This assumes that _every_ item we insert will require a new leaf, and + * that the tree has grown to its maximum level size. + */ + + /* + * for every item we insert we could insert both an extent item and a + * extent ref item. Then for ever item we insert, we will need to cow + * both the original leaf, plus the leaf to the left and right of it. + * + * Unless we are talking about the extent root, then we just want the + * number of items * 2, since we just need the extent item plus its ref. + */ + if (root == root->fs_info->extent_root) + num_bytes = num_items * 2; + else + num_bytes = (num_items + (2 * num_items)) * 3; + + /* + * num_bytes is total number of leaves we could need times the leaf + * size, and then for every leaf we could end up cow'ing 2 nodes per + * level, down to the leaf level. + */ + num_bytes = (num_bytes * root->leafsize) + + (num_bytes * (level * 2)) * root->nodesize; + + return num_bytes; +} + /* - * for now this just makes sure we have at least 5% of our metadata space free - * for use. + * Unreserve metadata space for delalloc. If we have less reserved credits than + * we have extents, this function does nothing. */ -int btrfs_check_metadata_free_space(struct btrfs_root *root) +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; - u64 alloc_target, thresh; - int committed = 0, ret; + u64 num_bytes; + u64 alloc_target; + bool bug = false; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); -again: + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); + spin_lock(&meta_sinfo->lock); - if (!meta_sinfo->full) - thresh = meta_sinfo->total_bytes * 80; - else - thresh = meta_sinfo->total_bytes * 95; + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&meta_sinfo->lock); + return 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); + + if (meta_sinfo->bytes_delalloc < num_bytes) { + bug = true; + meta_sinfo->bytes_delalloc = 0; + } else { + meta_sinfo->bytes_delalloc -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +{ + u64 thresh; + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use; + + thresh = meta_sinfo->total_bytes - thresh; + thresh *= 80; do_div(thresh, 100); + if (thresh <= meta_sinfo->bytes_delalloc) + meta_sinfo->force_delalloc = 1; + else + meta_sinfo->force_delalloc = 0; +} - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { - struct btrfs_trans_handle *trans; - if (!meta_sinfo->full) { - meta_sinfo->force_alloc = 1; - spin_unlock(&meta_sinfo->lock); +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, alloc_target, 0); - btrfs_end_transaction(trans, root); + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root, 0); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct async_flush *async; + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_on_flush(info); + return; + } + + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + +static int maybe_allocate_chunk(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + bool wait = false; + int ret = 0; + u64 min_metadata; + u64 free_space; + + free_space = btrfs_super_total_bytes(disk_super); + /* + * we allow the metadata to grow to a max of either 10gb or 5% of the + * space in the volume. + */ + min_metadata = min((u64)10 * 1024 * 1024 * 1024, + div64_u64(free_space * 5, 100)); + if (info->total_bytes >= min_metadata) { + spin_unlock(&info->lock); + return 0; + } + + if (info->full) { + spin_unlock(&info->lock); + return 0; + } + + if (!info->allocating_chunk) { + info->force_alloc = 1; + info->allocating_chunk = 1; + init_waitqueue_head(&info->allocate_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->allocate_wait, + !info->allocating_chunk); + return 1; + } + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 4096 + 2 * 1024 * 1024, + info->flags, 0); + btrfs_end_transaction(trans, root); + if (ret) + goto out; +out: + spin_lock(&info->lock); + info->allocating_chunk = 0; + spin_unlock(&info->lock); + wake_up(&info->allocate_wait); + + if (ret) + return 0; + return 1; +} + +/* + * Reserve metadata space for delalloc. + */ +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int flushed = 0; + int force_delalloc; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); +again: + spin_lock(&meta_sinfo->lock); + + force_delalloc = meta_sinfo->force_delalloc; + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!flushed) + meta_sinfo->bytes_delalloc += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + flushed++; + + if (flushed == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + flushed++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (flushed == 2) { + filemap_flush(inode->i_mapping); + goto again; + } else if (flushed == 3) { + flush_delalloc(root, meta_sinfo); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); + printk(KERN_ERR "enospc, has %d, reserved %d\n", + BTRFS_I(inode)->outstanding_extents, + BTRFS_I(inode)->reserved_extents); + dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } - if (!committed) { - committed = 1; - trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; + BTRFS_I(inode)->reserved_extents++; + check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + if (!flushed && force_delalloc) + filemap_flush(inode->i_mapping); + + return 0; +} + +/* + * unreserve num_items number of items worth of metadata space. This needs to + * be paired with btrfs_reserve_metadata_space. + * + * NOTE: if you have the option, run this _AFTER_ you do a + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref + * oprations which will result in more used metadata, so we want to make sure we + * can do that without issue. + */ +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 alloc_target; + bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); + + spin_lock(&meta_sinfo->lock); + if (meta_sinfo->bytes_may_use < num_bytes) { + bug = true; + meta_sinfo->bytes_may_use = 0; + } else { + meta_sinfo->bytes_may_use -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +/* + * Reserve some metadata space for use. We'll calculate the worste case number + * of bytes that would be needed to modify num_items number of items. If we + * have space, fantastic, if not, you get -ENOSPC. Please call + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of + * items you reserved, since whatever metadata you needed should have already + * been allocated. + * + * This will commit the transaction to make more space if we don't have enough + * metadata space. THe only time we don't do this is if we're reserving space + * inside of a transaction, then we will just return -ENOSPC and it is the + * callers responsibility to handle it properly. + */ +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int retries = 0; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); +again: + spin_lock(&meta_sinfo->lock); + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!retries) + meta_sinfo->bytes_may_use += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + retries++; + if (retries == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + retries++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (retries == 2) { + flush_delalloc(root, meta_sinfo); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_may_use -= num_bytes; + spin_unlock(&meta_sinfo->lock); + + dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } + + check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); return 0; @@ -2833,13 +3241,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; + again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); if (data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - - data_sinfo->bytes_may_use < bytes) { + data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { struct btrfs_trans_handle *trans; /* @@ -2851,7 +3262,7 @@ again: data_sinfo->force_alloc = 1; spin_unlock(&data_sinfo->lock); - +alloc: alloc_target = btrfs_get_alloc_profile(root, 1); trans = btrfs_start_transaction(root, 1); if (!trans) @@ -2863,12 +3274,17 @@ again: btrfs_end_transaction(trans, root); if (ret) return ret; + + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } goto again; } spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ - if (!committed) { + if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); if (!trans) @@ -2896,7 +3312,7 @@ again: BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); - return btrfs_check_metadata_free_space(root); + return 0; } /* @@ -2995,17 +3411,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(!space_info); spin_lock(&space_info->lock); - if (space_info->force_alloc) { + if (space_info->force_alloc) force = 1; - space_info->force_alloc = 0; - } if (space_info->full) { spin_unlock(&space_info->lock); goto out; } thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 6); + thresh = div_factor(thresh, 8); if (!force && (space_info->bytes_used + space_info->bytes_pinned + space_info->bytes_reserved + alloc_bytes) < thresh) { @@ -3019,7 +3433,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, * we keep a reasonable number of metadata chunks allocated in the * FS as well. */ - if (flags & BTRFS_BLOCK_GROUP_DATA) { + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { fs_info->data_chunk_allocations++; if (!(fs_info->data_chunk_allocations % fs_info->metadata_ratio)) @@ -3027,8 +3441,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); if (ret) space_info->full = 1; + space_info->force_alloc = 0; + spin_unlock(&space_info->lock); out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; @@ -3053,14 +3470,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, else old_val -= num_bytes; btrfs_set_super_bytes_used(&info->super_copy, old_val); - - /* block accounting for root item */ - old_val = btrfs_root_used(&root->root_item); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_root_used(&root->root_item, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -3286,6 +3695,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, if (is_data) goto pinit; + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (!buf) goto pinit; @@ -3640,6 +4057,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level) +{ + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) - blocksize; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + + return btrfs_free_extent(trans, root, bytenr, blocksize, + parent, root_objectid, level, 0); +} + static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -3693,7 +4125,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHED_ONLY = 0, + LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, LOOP_CACHING_WAIT = 2, LOOP_ALLOC_CHUNK = 3, @@ -3722,10 +4154,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group = NULL; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; + int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; bool found_uncached_bg = false; + bool failed_cluster_refill = false; + bool failed_alloc = false; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3761,14 +4198,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { +ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, data) && - block_group_cache_done(block_group)) { + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -3780,50 +4222,76 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, */ btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); - } else + } else { goto have_block_group; + } } else if (block_group) { btrfs_put_block_group(block_group); } } - search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { u64 offset; int cached; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); search_start = block_group->key.objectid; have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + u64 free_percent; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + /* - * we want to start caching kthreads, but not too many - * right off the bat so we don't overwhelm the system, - * so only start them if there are less than 2 and we're - * in the initial allocation phase. + * We only want to start kthread caching if we are at + * the point where we will wait for caching to make + * progress, or if our ideal search is over and we've + * found somebody to start caching. */ if (loop > LOOP_CACHING_NOWAIT || - atomic_read(&space_info->caching_threads) < 2) { + (loop > LOOP_FIND_IDEAL && + atomic_read(&space_info->caching_threads) < 2)) { ret = cache_block_group(block_group); BUG_ON(ret); } - } - - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { found_uncached_bg = true; - /* if we only want cached bgs, loop */ - if (loop == LOOP_CACHED_ONLY) + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) goto loop; } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) + found_uncached_bg = true; + if (unlikely(block_group->ro)) goto loop; - if (last_ptr) { + /* + * Ok we want to try and use the cluster allocator, so lets look + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will + * have tried the cluster allocator plenty of times at this + * point and not have found anything, so we are likely way too + * fragmented for the clustering stuff to find anything, so lets + * just skip it and let the allocator find whatever block it can + * find + */ + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { /* * the refill lock keeps out other * people trying to start a new cluster @@ -3855,7 +4323,7 @@ have_block_group: btrfs_put_block_group(block_group); block_group = last_ptr->block_group; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&last_ptr->lock); spin_unlock(&last_ptr->refill_lock); @@ -3898,9 +4366,11 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { spin_unlock(&last_ptr->refill_lock); + failed_cluster_refill = true; wait_block_group_cache_progress(block_group, num_bytes + empty_cluster + empty_size); goto have_block_group; @@ -3912,25 +4382,30 @@ refill_cluster: * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < LOOP_NO_EMPTY_SIZE) { - btrfs_return_cluster_to_free_space(NULL, - last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } + btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); + goto loop; } offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset && (cached || (!cached && - loop == LOOP_CACHING_NOWAIT))) { - goto loop; - } else if (!offset && (!cached && - loop > LOOP_CACHING_NOWAIT)) { + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { wait_block_group_cache_progress(block_group, - num_bytes + empty_size); + num_bytes + empty_size); + failed_alloc = true; goto have_block_group; + } else if (!offset) { + goto loop; } checks: search_start = stripe_align(root, offset); @@ -3977,13 +4452,17 @@ checks: /* we are all good, lets return */ break; loop: + failed_cluster_refill = false; + failed_alloc = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); - /* LOOP_CACHED_ONLY, only search fully cached block groups - * LOOP_CACHING_NOWAIT, search partially cached block groups, but - * dont wait foR them to finish caching + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try @@ -3992,12 +4471,47 @@ loop: if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { - if (found_uncached_bg) { + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; - if (loop < LOOP_CACHING_WAIT) { - loop++; + loop++; + if (!ideal_cache_percent && + atomic_read(&space_info->caching_threads)) goto search; - } + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. + * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. + */ + loop = LOOP_CACHING_WAIT; + goto search; + } + + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; } if (loop == LOOP_ALLOC_CHUNK) { @@ -4009,7 +4523,8 @@ loop: ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, 1); allowed_chunk_alloc = 0; - } else { + done_chunk_alloc = 1; + } else if (!done_chunk_alloc) { space_info->force_alloc = 1; } @@ -4034,21 +4549,32 @@ loop: return ret; } -static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) { struct btrfs_block_group_cache *cache; + spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved), + info->bytes_pinned - info->bytes_reserved - + info->bytes_super), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" + "\n", (unsigned long long)info->total_bytes, (unsigned long long)info->bytes_pinned, (unsigned long long)info->bytes_delalloc, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used); + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_root, + (unsigned long long)info->bytes_super, + (unsigned long long)info->bytes_reserved); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -4075,7 +4601,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - struct btrfs_fs_info *info = root->fs_info; data = btrfs_get_alloc_profile(root, data); again: @@ -4083,17 +4608,9 @@ again: * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations */ - if (empty_size || root->ref_cows) { - if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - BTRFS_BLOCK_GROUP_METADATA | - (info->metadata_alloc_profile & - info->avail_metadata_alloc_bits), 0); - } + if (empty_size || root->ref_cows) ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, 0); - } WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, @@ -4116,7 +4633,7 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes); + dump_space_info(sinfo, num_bytes, 1); } return ret; @@ -4394,6 +4911,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, extent_op); BUG_ON(ret); } + + if (root_objectid == root->root_key.objectid) { + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) + num_bytes; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + } return ret; } @@ -4416,8 +4941,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (root->log_transid % 2 == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); } else { set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); @@ -4453,452 +4986,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } -#if 0 -int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *leaf) -{ - u64 disk_bytenr; - u64 num_bytes; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - u32 nritems; - int i; - int ret; - - BUG_ON(!btrfs_is_leaf(leaf)); - nritems = btrfs_header_nritems(leaf); - - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); - - /* only extents have references, skip everything else */ - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - - /* inline extents live in the btree, they don't have refs */ - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - - /* holes don't have refs */ - if (disk_bytenr == 0) - continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, - leaf->start, 0, key.objectid, 0); - BUG_ON(ret); - } - return 0; -} - -static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_leaf_ref *ref) -{ - int i; - int ret; - struct btrfs_extent_info *info; - struct refsort *sorted; - - if (ref->nritems == 0) - return 0; - - sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); - for (i = 0; i < ref->nritems; i++) { - sorted[i].bytenr = ref->extents[i].bytenr; - sorted[i].slot = i; - } - sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); - - /* - * the items in the ref were sorted when the ref was inserted - * into the ref cache, so this is already in order - */ - for (i = 0; i < ref->nritems; i++) { - info = ref->extents + sorted[i].slot; - ret = btrfs_free_extent(trans, root, info->bytenr, - info->num_bytes, ref->bytenr, - ref->owner, ref->generation, - info->objectid, 0); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - BUG_ON(ret); - info++; - } - - kfree(sorted); - return 0; -} - - -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 start, - u64 len, u32 *refs) -{ - int ret; - - ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); - BUG_ON(ret); - -#if 0 /* some debugging code in case we see problems here */ - /* if the refs count is one, it won't get increased again. But - * if the ref count is > 1, someone may be decreasing it at - * the same time we are. - */ - if (*refs != 1) { - struct extent_buffer *eb = NULL; - eb = btrfs_find_create_tree_block(root, start, len); - if (eb) - btrfs_tree_lock(eb); - - mutex_lock(&root->fs_info->alloc_mutex); - ret = lookup_extent_ref(NULL, root, start, len, refs); - BUG_ON(ret); - mutex_unlock(&root->fs_info->alloc_mutex); - - if (eb) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - if (*refs == 1) { - printk(KERN_ERR "btrfs block %llu went down to one " - "during drop_snap\n", (unsigned long long)start); - } - - } -#endif - - cond_resched(); - return ret; -} +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; +}; +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 -/* - * this is used while deleting old snapshots, and it drops the refs - * on a whole subtree starting from a level 1 node. - * - * The idea is to sort all the leaf pointers, and then drop the - * ref on all the leaves in order. Most of the time the leaves - * will have ref cache entries, so no leaf IOs will be required to - * find the extents they have references on. - * - * For each leaf, any references it has are also dropped in order - * - * This ends up dropping the references in something close to optimal - * order for reading and modifying the extent allocation tree. - */ -static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) { u64 bytenr; - u64 root_owner; - u64 root_gen; - struct extent_buffer *eb = path->nodes[1]; - struct extent_buffer *leaf; - struct btrfs_leaf_ref *ref; - struct refsort *sorted = NULL; - int nritems = btrfs_header_nritems(eb); + u64 generation; + u64 refs; + u64 flags; + u64 last = 0; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; int ret; - int i; - int refi = 0; - int slot = path->slots[1]; - u32 blocksize = btrfs_level_size(root, 0); - u32 refs; - - if (nritems == 0) - goto out; - - root_owner = btrfs_header_owner(eb); - root_gen = btrfs_header_generation(eb); - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + int slot; + int nread = 0; - /* - * step one, sort all the leaf pointers so we don't scribble - * randomly into the extent allocation tree - */ - for (i = slot; i < nritems; i++) { - sorted[refi].bytenr = btrfs_node_blockptr(eb, i); - sorted[refi].slot = i; - refi++; + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); } - /* - * nritems won't be zero, but if we're picking up drop_snapshot - * after a crash, slot might be > 0, so double check things - * just in case. - */ - if (refi == 0) - goto out; + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = btrfs_level_size(root, wc->level - 1); - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; - /* - * the first loop frees everything the leaves point to - */ - for (i = 0; i < refi; i++) { - u64 ptr_gen; + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); - bytenr = sorted[i].bytenr; + if (slot == path->slots[wc->level]) + goto reada; - /* - * check the reference count on this leaf. If it is > 1 - * we just decrement it below and don't update any - * of the refs the leaf points to. - */ - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - if (refs != 1) + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) continue; - ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); - - /* - * the leaf only had one reference, which means the - * only thing pointing to this leaf is the snapshot - * we're deleting. It isn't possible for the reference - * count to increase again later - * - * The reference cache is checked for the leaf, - * and if found we'll be able to drop any refs held by - * the leaf without needing to read it in. - */ - ref = btrfs_lookup_leaf_ref(root, bytenr); - if (ref && ref->generation != ptr_gen) { - btrfs_free_leaf_ref(root, ref); - ref = NULL; - } - if (ref) { - ret = cache_drop_leaf_ref(trans, root, ref); - BUG_ON(ret); - btrfs_remove_leaf_ref(root, ref); - btrfs_free_leaf_ref(root, ref); - } else { - /* - * the leaf wasn't in the reference cache, so - * we have to read it. - */ - leaf = read_tree_block(root, bytenr, blocksize, - ptr_gen); - ret = btrfs_drop_leaf_ref(trans, root, leaf); - BUG_ON(ret); - free_extent_buffer(leaf); - } - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } - - /* - * run through the loop again to free the refs on the leaves. - * This is faster than doing it in the loop above because - * the leaves are likely to be clustered together. We end up - * working in nice chunks on the extent allocation tree. - */ - for (i = 0; i < refi; i++) { - bytenr = sorted[i].bytenr; - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, eb->start, - root_owner, root_gen, 0, 1); + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); BUG_ON(ret); + BUG_ON(refs == 0); - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } -out: - kfree(sorted); - - /* - * update the path to show we've processed the entire level 1 - * node. This will get saved into the root's drop_snapshot_progress - * field so these drops are not repeated again if this transaction - * commits. - */ - path->slots[1] = nritems; - return 0; -} - -/* - * helper function for drop_snapshot, this walks down the tree dropping ref - * counts as it goes. - */ -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) -{ - u64 root_owner; - u64 root_gen; - u64 bytenr; - u64 ptr_gen; - struct extent_buffer *next; - struct extent_buffer *cur; - struct extent_buffer *parent; - u32 blocksize; - int ret; - u32 refs; - - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, - path->nodes[*level]->len, &refs); - BUG_ON(ret); - if (refs > 1) - goto out; - - /* - * walk down to the last node level and free all the leaves - */ - while (*level >= 0) { - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - cur = path->nodes[*level]; - - if (btrfs_header_level(cur) != *level) - WARN_ON(1); - - if (path->slots[*level] >= - btrfs_header_nritems(cur)) - break; + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; - /* the new code goes down to level 1 and does all the - * leaves pointed to that node in bulk. So, this check - * for level 0 will always be false. - * - * But, the disk format allows the drop_snapshot_progress - * field in the root to leave things in a state where - * a leaf will need cleaning up here. If someone crashes - * with the old code and then boots with the new code, - * we might find a leaf here. - */ - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, cur); - BUG_ON(ret); - break; + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; } - - /* - * once we get to level one, process the whole node - * at once, including everything below it. - */ - if (*level == 1) { - ret = drop_level_one_refs(trans, root, path); - BUG_ON(ret); +reada: + ret = readahead_tree_block(root, bytenr, blocksize, + generation); + if (ret) break; - } - - bytenr = btrfs_node_blockptr(cur, path->slots[*level]); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); - - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - - /* - * if there is more than one reference, we don't need - * to read that node to drop any references it has. We - * just drop the ref we hold on that node and move on to the - * next slot in this level. - */ - if (refs != 1) { - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - path->slots[*level]++; - - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - root_owner, root_gen, - *level - 1, 1); - BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - continue; - } - - /* - * we need to keep freeing things in the next level down. - * read the block and loop around to process it - */ - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - WARN_ON(*level <= 0); - if (path->nodes[*level-1]) - free_extent_buffer(path->nodes[*level-1]); - path->nodes[*level-1] = next; - *level = btrfs_header_level(next); - path->slots[*level] = 0; - cond_resched(); - } -out: - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - - if (path->nodes[*level] == root->node) { - parent = path->nodes[*level]; - bytenr = path->nodes[*level]->start; - } else { - parent = path->nodes[*level + 1]; - bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); + last = bytenr + blocksize; + nread++; } - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - /* - * cleanup and free the reference on the last node - * we processed - */ - ret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, root_owner, root_gen, - *level, 1); - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - - *level += 1; - BUG_ON(ret); - - cond_resched(); - return 0; + wc->reada_slot = slot; } -#endif - -struct walk_control { - u64 refs[BTRFS_MAX_LEVEL]; - u64 flags[BTRFS_MAX_LEVEL]; - struct btrfs_key update_progress; - int stage; - int level; - int shared_level; - int update_ref; - int keep_locks; -}; - -#define DROP_REFERENCE 1 -#define UPDATE_BACKREF 2 /* * hepler to process tree block while walking down the tree. * - * when wc->stage == DROP_REFERENCE, this function checks - * reference count of the block. if the block is shared and - * we need update back refs for the subtree rooted at the - * block, this function changes wc->stage to UPDATE_BACKREF - * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. * @@ -4907,11 +5096,10 @@ struct walk_control { static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int lookup_info) { int level = wc->level; struct extent_buffer *eb = path->nodes[level]; - struct btrfs_key key; u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; @@ -4923,30 +5111,16 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, &wc->refs[level], - &wc->flags[level]); - BUG_ON(ret); - BUG_ON(wc->refs[level] == 0); - } - - if (wc->stage == DROP_REFERENCE && - wc->update_ref && wc->refs[level] > 1) { - BUG_ON(eb == root->node); - BUG_ON(path->slots[level] > 0); - if (level == 0) - btrfs_item_key_to_cpu(eb, &key, path->slots[level]); - else - btrfs_node_key_to_cpu(eb, &key, path->slots[level]); - if (btrfs_header_owner(eb) == root->root_key.objectid && - btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { - wc->stage = UPDATE_BACKREF; - wc->shared_level = level; - } + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); } if (wc->stage == DROP_REFERENCE) { @@ -4985,6 +5159,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* + * hepler to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); + } + btrfs_tree_unlock(next); + free_extent_buffer(next); + *lookup_info = 1; + return 1; +} + +/* * hepler to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops @@ -5011,7 +5315,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (level < wc->shared_level) goto out; - BUG_ON(wc->refs[level] <= 1); ret = find_next_key(path, level + 1, &wc->update_progress); if (ret > 0) wc->update_ref = 0; @@ -5042,8 +5345,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = 0; return 1; } - } else { - BUG_ON(level != 0); } } @@ -5096,39 +5397,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - struct extent_buffer *next; - struct extent_buffer *cur; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; int level = wc->level; + int lookup_info = 1; int ret; while (level >= 0) { - cur = path->nodes[level]; - BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); - - ret = walk_down_proc(trans, root, path, wc); + ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; if (level == 0) break; - bytenr = btrfs_node_blockptr(cur, path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); - - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; - level--; - BUG_ON(level != btrfs_header_level(next)); - path->nodes[level] = next; - path->slots[level] = 0; - path->locks[level] = 1; - wc->level = level; + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } + level = wc->level; } return 0; } @@ -5218,9 +5508,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) err = ret; goto out; } - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); + WARN_ON(ret > 0); /* * unlock our path, this is safe because only this @@ -5255,6 +5543,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { ret = walk_down_tree(trans, root, path, wc); @@ -5307,9 +5596,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) ret = btrfs_del_root(trans, tree_root, &root->root_key); BUG_ON(ret); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root); + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_find_last_root(tree_root, root->root_key.objectid, + NULL, NULL); + BUG_ON(ret < 0); + if (ret > 0) { + ret = btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + } + + if (root->in_radix) { + btrfs_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + } out: btrfs_end_transaction(trans, tree_root); kfree(wc); @@ -5361,6 +5665,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { wret = walk_down_tree(trans, root, path, wc); @@ -6948,287 +7253,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, return 0; } -#if 0 -static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) -{ - struct btrfs_path *path; - struct btrfs_inode_item *item; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_insert_empty_inode(trans, root, path, objectid); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); -out: - btrfs_free_path(path); - return ret; -} - -static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. + */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) { - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key root_key; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; - int err = 0; + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + int full = 0; + int ret = 0; - root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(root)) - return ERR_CAST(root); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; - err = btrfs_find_free_objectid(trans, root, objectid, &objectid); - if (err) + /* no bytes used, we're good */ + if (!btrfs_block_group_used(&block_group->item)) goto out; - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); - BUG_ON(err); - - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - BUG_ON(is_bad_inode(inode)); - } else { - BUG_ON(1); - } - BTRFS_I(inode)->index_cnt = group->key.objectid; - - err = btrfs_orphan_add(trans, inode); -out: - btrfs_end_transaction(trans, root); - if (err) { - if (inode) - iput(inode); - inode = ERR_PTR(err); - } - return inode; -} - -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) -{ - - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct list_head list; - size_t offset; - int ret; - u64 disk_bytenr; - - INIT_LIST_HEAD(&list); - - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); - - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list); - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del_init(&sums->list); - - sector_sum = sums->sums; - sums->bytenr = ordered->start; + space_info = block_group->space_info; + spin_lock(&space_info->lock); - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } + full = space_info->full; - btrfs_add_ordered_sum(inode, ordered, sums); + /* + * if this is the last block group we have in this space, we can't + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. If we can, we're good + */ + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + btrfs_block_group_used(&block_group->item) < + space_info->total_bytes)) { + spin_unlock(&space_info->lock); + goto out; } - btrfs_put_ordered_extent(ordered); - return 0; -} - -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) -{ - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_fs_info *info = root->fs_info; - struct extent_buffer *leaf; - struct inode *reloc_inode; - struct btrfs_block_group_cache *block_group; - struct btrfs_key key; - u64 skipped; - u64 cur_byte; - u64 total_found; - u32 nritems; - int ret; - int progress; - int pass = 0; - - root = root->fs_info->extent_root; - - block_group = btrfs_lookup_block_group(info, group_start); - BUG_ON(!block_group); - - printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->flags); - - path = btrfs_alloc_path(); - BUG_ON(!path); - - reloc_inode = create_reloc_inode(info, block_group); - BUG_ON(IS_ERR(reloc_inode)); - - __alloc_chunk_for_shrink(root, block_group, 1); - set_block_group_readonly(block_group); - - btrfs_start_delalloc_inodes(info->tree_root); - btrfs_wait_ordered_extents(info->tree_root, 0); -again: - skipped = 0; - total_found = 0; - progress = 0; - key.objectid = block_group->key.objectid; - key.offset = 0; - key.type = 0; - cur_byte = key.objectid; - - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); + spin_unlock(&space_info->lock); - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(info->tree_root); - btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); - mutex_unlock(&root->fs_info->cleaner_mutex); + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. However, if we + * were marked as full, then we know there aren't enough chunks, and we + * can just return. + */ + ret = -1; + if (full) + goto out; - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 min_free = btrfs_block_group_used(&block_group->item); + u64 dev_offset, max_avail; - while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; -next: - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free) { + ret = find_free_dev_extent(NULL, device, min_free, + &dev_offset, &max_avail); + if (!ret) break; - } - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; - - if (progress && need_resched()) { - btrfs_release_path(root, path); - cond_resched(); - progress = 0; - continue; - } - progress = 1; - - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || - key.objectid + key.offset <= cur_byte) { - path->slots[0]++; - goto next; - } - - total_found++; - cur_byte = key.objectid + key.offset; - btrfs_release_path(root, path); - - __alloc_chunk_for_shrink(root, block_group, 0); - ret = relocate_one_extent(root, path, &key, block_group, - reloc_inode, pass); - BUG_ON(ret < 0); - if (ret > 0) - skipped++; - - key.objectid = cur_byte; - key.type = 0; - key.offset = 0; - } - - btrfs_release_path(root, path); - - if (pass == 0) { - btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); - invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); - } - - if (total_found > 0) { - printk(KERN_INFO "btrfs found %llu extents in pass %d\n", - (unsigned long long)total_found, pass); - pass++; - if (total_found == skipped && pass > 2) { - iput(reloc_inode); - reloc_inode = create_reloc_inode(info, block_group); - pass = 0; + ret = -1; } - goto again; } - - /* delete reloc_inode */ - iput(reloc_inode); - - /* unpin extents in this range */ - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - - spin_lock(&block_group->lock); - WARN_ON(block_group->pinned > 0); - WARN_ON(block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&block_group->item) > 0); - spin_unlock(&block_group->lock); - btrfs_put_block_group(block_group); - ret = 0; + mutex_unlock(&root->fs_info->chunk_mutex); out: - btrfs_free_path(path); + btrfs_put_block_group(block_group); return ret; } -#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) @@ -7299,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); - - WARN_ON(atomic_read(&block_group->count) != 1); - kfree(block_group); + btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); } @@ -7395,8 +7497,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { + exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; @@ -7413,6 +7517,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) &space_info); BUG_ON(ret); cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&space_info->groups_sem); list_add_tail(&cache->list, &space_info->block_groups); up_write(&space_info->groups_sem); @@ -7482,6 +7590,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&cache->space_info->groups_sem); list_add_tail(&cache->list, &cache->space_info->block_groups); up_write(&cache->space_info->groups_sem);