Btrfs: Link block groups of different raid types
authorYan, Zheng <zheng.yan@oracle.com>
Sun, 16 May 2010 14:46:24 +0000 (10:46 -0400)
committerChris Mason <chris.mason@oracle.com>
Tue, 25 May 2010 14:34:47 +0000 (10:34 -0400)
The size of reserved space is stored in space_info. If block groups
of different raid types are linked to separate space_info, changing
allocation profile will corrupt reserved space accounting.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/super.c

index 746a724..6132088 100644 (file)
@@ -663,6 +663,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP     (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_NR_RAID_TYPES       5
 
 struct btrfs_block_group_item {
        __le64 used;
@@ -674,7 +675,8 @@ struct btrfs_space_info {
        u64 flags;
 
        u64 total_bytes;        /* total bytes in the space */
-       u64 bytes_used;         /* total bytes used on disk */
+       u64 bytes_used;         /* total bytes used,
+                                  this does't take mirrors into account */
        u64 bytes_pinned;       /* total bytes pinned, will be freed when the
                                   transaction finishes */
        u64 bytes_reserved;     /* total bytes the allocator has reserved for
@@ -687,6 +689,7 @@ struct btrfs_space_info {
                                   delalloc/allocations */
        u64 bytes_delalloc;     /* number of bytes currently reserved for
                                   delayed allocation */
+       u64 disk_used;          /* total bytes used on disk */
 
        int full;               /* indicates that we cannot allocate any more
                                   chunks for this space */
@@ -704,7 +707,7 @@ struct btrfs_space_info {
        int flushing;
 
        /* for block groups in our same type */
-       struct list_head block_groups;
+       struct list_head block_groups[BTRFS_NR_RAID_TYPES];
        spinlock_t lock;
        struct rw_semaphore groups_sem;
        atomic_t caching_threads;
index b34d32f..f5db036 100644 (file)
@@ -507,6 +507,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
 
+       flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+                BTRFS_BLOCK_GROUP_METADATA;
+
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
@@ -2660,12 +2663,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                             struct btrfs_space_info **space_info)
 {
        struct btrfs_space_info *found;
+       int i;
+       int factor;
+
+       if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                    BTRFS_BLOCK_GROUP_RAID10))
+               factor = 2;
+       else
+               factor = 1;
 
        found = __find_space_info(info, flags);
        if (found) {
                spin_lock(&found->lock);
                found->total_bytes += total_bytes;
                found->bytes_used += bytes_used;
+               found->disk_used += bytes_used * factor;
                found->full = 0;
                spin_unlock(&found->lock);
                *space_info = found;
@@ -2675,14 +2687,18 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        if (!found)
                return -ENOMEM;
 
-       INIT_LIST_HEAD(&found->block_groups);
+       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+               INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
        init_waitqueue_head(&found->flush_wait);
        init_waitqueue_head(&found->allocate_wait);
        spin_lock_init(&found->lock);
-       found->flags = flags;
+       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+                               BTRFS_BLOCK_GROUP_SYSTEM |
+                               BTRFS_BLOCK_GROUP_METADATA);
        found->total_bytes = total_bytes;
        found->bytes_used = bytes_used;
+       found->disk_used = bytes_used * factor;
        found->bytes_pinned = 0;
        found->bytes_reserved = 0;
        found->bytes_readonly = 0;
@@ -2752,26 +2768,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        return flags;
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-       struct btrfs_fs_info *info = root->fs_info;
-       u64 alloc_profile;
-
-       if (data) {
-               alloc_profile = info->avail_data_alloc_bits &
-                       info->data_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-       } else if (root == root->fs_info->chunk_root) {
-               alloc_profile = info->avail_system_alloc_bits &
-                       info->system_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-       } else {
-               alloc_profile = info->avail_metadata_alloc_bits &
-                       info->metadata_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-       }
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               flags |= root->fs_info->avail_data_alloc_bits &
+                        root->fs_info->data_alloc_profile;
+       else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               flags |= root->fs_info->avail_system_alloc_bits &
+                        root->fs_info->system_alloc_profile;
+       else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               flags |= root->fs_info->avail_metadata_alloc_bits &
+                        root->fs_info->metadata_alloc_profile;
+       return btrfs_reduce_alloc_profile(root, flags);
+}
+
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+{
+       u64 flags;
 
-       return btrfs_reduce_alloc_profile(root, data);
+       if (data)
+               flags = BTRFS_BLOCK_GROUP_DATA;
+       else if (root == root->fs_info->chunk_root)
+               flags = BTRFS_BLOCK_GROUP_SYSTEM;
+       else
+               flags = BTRFS_BLOCK_GROUP_METADATA;
+
+       return get_alloc_profile(root, flags);
 }
 
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
@@ -3468,6 +3490,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 {
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *info = root->fs_info;
+       int factor;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
@@ -3486,6 +3509,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                cache = btrfs_lookup_block_group(info, bytenr);
                if (!cache)
                        return -1;
+               if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+                                   BTRFS_BLOCK_GROUP_RAID1 |
+                                   BTRFS_BLOCK_GROUP_RAID10))
+                       factor = 2;
+               else
+                       factor = 1;
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
 
@@ -3498,18 +3527,20 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        old_val += num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->reserved -= num_bytes;
-                       cache->space_info->bytes_used += num_bytes;
                        cache->space_info->bytes_reserved -= num_bytes;
+                       cache->space_info->bytes_used += num_bytes;
+                       cache->space_info->disk_used += num_bytes * factor;
                        if (cache->ro)
                                cache->space_info->bytes_readonly -= num_bytes;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
                } else {
                        old_val -= num_bytes;
+                       btrfs_set_block_group_used(&cache->item, old_val);
                        cache->space_info->bytes_used -= num_bytes;
+                       cache->space_info->disk_used -= num_bytes * factor;
                        if (cache->ro)
                                cache->space_info->bytes_readonly += num_bytes;
-                       btrfs_set_block_group_used(&cache->item, old_val);
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
                        if (mark_free) {
@@ -4134,6 +4165,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return 0;
 }
 
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+       int index;
+       if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+               index = 0;
+       else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+               index = 1;
+       else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+               index = 2;
+       else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+               index = 3;
+       else
+               index = 4;
+       return index;
+}
+
 enum btrfs_loop_type {
        LOOP_FIND_IDEAL = 0,
        LOOP_CACHING_NOWAIT = 1,
@@ -4167,6 +4214,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        int done_chunk_alloc = 0;
        struct btrfs_space_info *space_info;
        int last_ptr_loop = 0;
+       int index = 0;
        int loop = 0;
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
@@ -4237,6 +4285,7 @@ ideal_cache:
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
+                               index = get_block_group_index(block_group);
                                goto have_block_group;
                        }
                } else if (block_group) {
@@ -4245,7 +4294,8 @@ ideal_cache:
        }
 search:
        down_read(&space_info->groups_sem);
-       list_for_each_entry(block_group, &space_info->block_groups, list) {
+       list_for_each_entry(block_group, &space_info->block_groups[index],
+                           list) {
                u64 offset;
                int cached;
 
@@ -4468,10 +4518,14 @@ checks:
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
+               BUG_ON(index != get_block_group_index(block_group));
                btrfs_put_block_group(block_group);
        }
        up_read(&space_info->groups_sem);
 
+       if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+               goto search;
+
        /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
         *                      for them to make caching progress.  Also
         *                      determine the best possible bg to cache
@@ -4485,6 +4539,7 @@ loop:
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
            (found_uncached_bg || empty_size || empty_cluster ||
             allowed_chunk_alloc)) {
+               index = 0;
                if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
                        found_uncached_bg = false;
                        loop++;
@@ -4567,6 +4622,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups)
 {
        struct btrfs_block_group_cache *cache;
+       int index = 0;
 
        spin_lock(&info->lock);
        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
@@ -4591,7 +4647,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                return;
 
        down_read(&info->groups_sem);
-       list_for_each_entry(cache, &info->block_groups, list) {
+again:
+       list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
                printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
                       "%llu pinned %llu reserved\n",
@@ -4603,6 +4660,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
+       if (++index < BTRFS_NR_RAID_TYPES)
+               goto again;
        up_read(&info->groups_sem);
 }
 
@@ -7447,6 +7506,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        return 0;
 }
 
+static void __link_block_group(struct btrfs_space_info *space_info,
+                              struct btrfs_block_group_cache *cache)
+{
+       int index = get_block_group_index(cache);
+
+       down_write(&space_info->groups_sem);
+       list_add_tail(&cache->list, &space_info->block_groups[index]);
+       up_write(&space_info->groups_sem);
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
        struct btrfs_path *path;
@@ -7468,10 +7537,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
        while (1) {
                ret = find_first_block_group(root, path, &key);
-               if (ret > 0) {
-                       ret = 0;
-                       goto error;
-               }
+               if (ret > 0)
+                       break;
                if (ret != 0)
                        goto error;
 
@@ -7540,9 +7607,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                cache->space_info->bytes_super += cache->bytes_super;
                spin_unlock(&cache->space_info->lock);
 
-               down_write(&space_info->groups_sem);
-               list_add_tail(&cache->list, &space_info->block_groups);
-               up_write(&space_info->groups_sem);
+               __link_block_group(space_info, cache);
 
                ret = btrfs_add_block_group_cache(root->fs_info, cache);
                BUG_ON(ret);
@@ -7551,6 +7616,22 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                if (btrfs_chunk_readonly(root, cache->key.objectid))
                        set_block_group_readonly(cache);
        }
+
+       list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+               if (!(get_alloc_profile(root, space_info->flags) &
+                     (BTRFS_BLOCK_GROUP_RAID10 |
+                      BTRFS_BLOCK_GROUP_RAID1 |
+                      BTRFS_BLOCK_GROUP_DUP)))
+                       continue;
+               /*
+                * avoid allocating from un-mirrored block group if there are
+                * mirrored block groups.
+                */
+               list_for_each_entry(cache, &space_info->block_groups[3], list)
+                       set_block_group_readonly(cache);
+               list_for_each_entry(cache, &space_info->block_groups[4], list)
+                       set_block_group_readonly(cache);
+       }
        ret = 0;
 error:
        btrfs_free_path(path);
@@ -7614,9 +7695,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->space_info->bytes_super += cache->bytes_super;
        spin_unlock(&cache->space_info->lock);
 
-       down_write(&cache->space_info->groups_sem);
-       list_add_tail(&cache->list, &cache->space_info->block_groups);
-       up_write(&cache->space_info->groups_sem);
+       __link_block_group(cache->space_info, cache);
 
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
        BUG_ON(ret);
index 1866dff..55265c2 100644 (file)
@@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
-       u64 data_used = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
 
        rcu_read_lock();
-       list_for_each_entry_rcu(found, head, list) {
-               if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
-                                   BTRFS_BLOCK_GROUP_RAID10|
-                                   BTRFS_BLOCK_GROUP_RAID1)) {
-                       total_used += found->bytes_used;
-                       if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-                               data_used += found->bytes_used;
-                       else
-                               data_used += found->total_bytes;
-               }
-
-               total_used += found->bytes_used;
-               if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-                       data_used += found->bytes_used;
-               else
-                       data_used += found->total_bytes;
-       }
+       list_for_each_entry_rcu(found, head, list)
+               total_used += found->disk_used;
        rcu_read_unlock();
 
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-       buf->f_bavail = buf->f_blocks - (data_used >> bits);
+       buf->f_bavail = buf->f_bfree;
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;