Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 21 May 2010 14:20:17 +0000 (07:20 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 21 May 2010 14:20:17 +0000 (07:20 -0700)
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (47 commits)
  ocfs2: Silence a gcc warning.
  ocfs2: Don't retry xattr set in case value extension fails.
  ocfs2:dlm: avoid dlm->ast_lock lockres->spinlock dependency break
  ocfs2: Reset xattr value size after xa_cleanup_value_truncate().
  fs/ocfs2/dlm: Use kstrdup
  fs/ocfs2/dlm: Drop memory allocation cast
  Ocfs2: Optimize punching-hole code.
  Ocfs2: Make ocfs2_find_cpos_for_left_leaf() public.
  Ocfs2: Fix hole punching to correctly do CoW during cluster zeroing.
  Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead.
  ocfs2: Block signals for mkdir/link/symlink/O_CREAT.
  ocfs2: Wrap signal blocking in void functions.
  ocfs2/dlm: Increase o2dlm lockres hash size
  ocfs2: Make ocfs2_extend_trans() really extend.
  ocfs2/trivial: Code cleanup for allocation reservation.
  ocfs2: make ocfs2_adjust_resv_from_alloc simple.
  ocfs2: Make nointr a default mount option
  ocfs2/dlm: Make o2dlm domain join/leave messages KERN_NOTICE
  o2net: log socket state changes
  ocfs2: print node # when tcp fails
  ...

41 files changed:
Documentation/filesystems/ocfs2.txt
fs/ocfs2/Makefile
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/cluster/masklog.c
fs/ocfs2/cluster/masklog.h
fs/ocfs2/cluster/tcp.c
fs/ocfs2/dir.c
fs/ocfs2/dlm/dlmast.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmconvert.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmlock.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlm/dlmthread.c
fs/ocfs2/dlm/dlmunlock.c
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/localalloc.c
fs/ocfs2/localalloc.h
fs/ocfs2/mmap.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/quota_global.c
fs/ocfs2/quota_local.c
fs/ocfs2/refcounttree.c
fs/ocfs2/refcounttree.h
fs/ocfs2/reservations.c [new file with mode: 0644]
fs/ocfs2/reservations.h [new file with mode: 0644]
fs/ocfs2/resize.c
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/ocfs2/super.h
fs/ocfs2/xattr.c

index c58b9f5..1f7ae14 100644 (file)
@@ -80,3 +80,10 @@ user_xattr   (*)     Enables Extended User Attributes.
 nouser_xattr           Disables Extended User Attributes.
 acl                    Enables POSIX Access Control Lists support.
 noacl          (*)     Disables POSIX Access Control Lists support.
+resv_level=2   (*)     Set how aggressive allocation reservations will be.
+                       Valid values range from 0 (reservations off) to 8
+                       (maximum space for reservations).
+dir_resv_level=        (*)     By default, directory reservations will scale with file
+                       reservations - users should rarely need to change this
+                       value. If allocation reservations are turned off, this
+                       option will have no effect.
index 791c088..07d9fd8 100644 (file)
@@ -29,6 +29,7 @@ ocfs2-objs := \
        mmap.o                  \
        namei.o                 \
        refcounttree.o          \
+       reservations.o          \
        resize.o                \
        slot_map.o              \
        suballoc.o              \
index 9f8bd91..215e12c 100644 (file)
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
        int count, status, i;
        u16 suballoc_bit_start;
        u32 num_got;
-       u64 first_blkno;
+       u64 suballoc_loc, first_blkno;
        struct ocfs2_super *osb =
                OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
        struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 
        count = 0;
        while (count < wanted) {
-               status = ocfs2_claim_metadata(osb,
-                                             handle,
+               status = ocfs2_claim_metadata(handle,
                                              meta_ac,
                                              wanted - count,
+                                             &suballoc_loc,
                                              &suballoc_bit_start,
                                              &num_got,
                                              &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
                        eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
                        eb->h_suballoc_slot =
                                cpu_to_le16(meta_ac->ac_alloc_slot);
+                       eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
                        eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
                        eb->h_list.l_count =
                                cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 
                        /* We'll also be dirtied by the caller, so
                         * this isn't absolutely necessary. */
-                       status = ocfs2_journal_dirty(handle, bhs[i]);
-                       if (status < 0) {
-                               mlog_errno(status);
-                               goto bail;
-                       }
+                       ocfs2_journal_dirty(handle, bhs[i]);
                }
 
                count += num_got;
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
                goto out;
        }
 
-       status = ocfs2_extend_trans(handle, path_num_items(path) +
-                                   handle->h_buffer_credits);
+       status = ocfs2_extend_trans(handle, path_num_items(path));
        if (status < 0) {
                mlog_errno(status);
                goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
                if (!eb_el->l_tree_depth)
                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
-               status = ocfs2_journal_dirty(handle, bh);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-
+               ocfs2_journal_dirty(handle, bh);
                next_blkno = le64_to_cpu(eb->h_blkno);
        }
 
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
        eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
 
-       status = ocfs2_journal_dirty(handle, *last_eb_bh);
-       if (status < 0)
-               mlog_errno(status);
-       status = ocfs2_journal_dirty(handle, et->et_root_bh);
-       if (status < 0)
-               mlog_errno(status);
-       if (eb_bh) {
-               status = ocfs2_journal_dirty(handle, eb_bh);
-               if (status < 0)
-                       mlog_errno(status);
-       }
+       ocfs2_journal_dirty(handle, *last_eb_bh);
+       ocfs2_journal_dirty(handle, et->et_root_bh);
+       if (eb_bh)
+               ocfs2_journal_dirty(handle, eb_bh);
 
        /*
         * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
        for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
                eb_el->l_recs[i] = root_el->l_recs[i];
 
-       status = ocfs2_journal_dirty(handle, new_eb_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, new_eb_bh);
 
        status = ocfs2_et_root_journal_access(handle, et,
                                              OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
        if (root_el->l_tree_depth == cpu_to_le16(1))
                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
-       status = ocfs2_journal_dirty(handle, et->et_root_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, et->et_root_bh);
 
        *ret_new_eb_bh = new_eb_bh;
        new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
                                       struct ocfs2_path *right_path,
                                       int subtree_index)
 {
-       int ret, i, idx;
+       int i, idx;
        struct ocfs2_extent_list *el, *left_el, *right_el;
        struct ocfs2_extent_rec *left_rec, *right_rec;
        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
                ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
                                              right_el);
 
-               ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
-               if (ret)
-                       mlog_errno(ret);
-
-               ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
-               if (ret)
-                       mlog_errno(ret);
+               ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+               ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
 
                /*
                 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
 
        root_bh = left_path->p_node[subtree_index].bh;
 
-       ret = ocfs2_journal_dirty(handle, root_bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, root_bh);
 }
 
 static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
 
        ocfs2_create_empty_extent(right_el);
 
-       ret = ocfs2_journal_dirty(handle, right_leaf_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
+       ocfs2_journal_dirty(handle, right_leaf_bh);
 
        /* Do the copy now. */
        i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
        memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
        le16_add_cpu(&left_el->l_next_free_rec, 1);
 
-       ret = ocfs2_journal_dirty(handle, left_leaf_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
+       ocfs2_journal_dirty(handle, left_leaf_bh);
 
        ocfs2_complete_edge_insert(handle, left_path, right_path,
                                   subtree_index);
@@ -2249,8 +2210,8 @@ out:
  *
  * Will return zero if the path passed in is already the leftmost path.
  */
-static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
-                                        struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+                                 struct ocfs2_path *path, u32 *cpos)
 {
        int i, j, ret = 0;
        u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
                                           int op_credits,
                                           struct ocfs2_path *path)
 {
-       int ret;
+       int ret = 0;
        int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
 
-       if (handle->h_buffer_credits < credits) {
+       if (handle->h_buffer_credits < credits)
                ret = ocfs2_extend_trans(handle,
                                         credits - handle->h_buffer_credits);
-               if (ret)
-                       return ret;
 
-               if (unlikely(handle->h_buffer_credits < credits))
-                       return ocfs2_extend_trans(handle, credits);
-       }
-
-       return 0;
+       return ret;
 }
 
 /*
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
         * records for all the bh in the path.
         * So we have to allocate extra credits and access them.
         */
-       ret = ocfs2_extend_trans(handle,
-                                handle->h_buffer_credits + subtree_index);
+       ret = ocfs2_extend_trans(handle, subtree_index);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
                ocfs2_remove_empty_extent(right_leaf_el);
        }
 
-       ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
-       if (ret)
-               mlog_errno(ret);
-       ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+       ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
 
        if (del_right_subtree) {
                ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
                if (right_has_empty)
                        ocfs2_remove_empty_extent(left_leaf_el);
 
-               ret = ocfs2_journal_dirty(handle, et_root_bh);
-               if (ret)
-                       mlog_errno(ret);
+               ocfs2_journal_dirty(handle, et_root_bh);
 
                *deleted = 1;
        } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
        }
 
        ocfs2_remove_empty_extent(el);
-
-       ret = ocfs2_journal_dirty(handle, bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, bh);
 
 out:
        return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
 
        ocfs2_cleanup_merge(el, index);
 
-       ret = ocfs2_journal_dirty(handle, bh);
-       if (ret)
-               mlog_errno(ret);
-
+       ocfs2_journal_dirty(handle, bh);
        if (right_path) {
-               ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
-               if (ret)
-                       mlog_errno(ret);
-
+               ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
                ocfs2_complete_edge_insert(handle, left_path, right_path,
                                           subtree_index);
        }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
 
        ocfs2_cleanup_merge(el, index);
 
-       ret = ocfs2_journal_dirty(handle, bh);
-       if (ret)
-               mlog_errno(ret);
-
+       ocfs2_journal_dirty(handle, bh);
        if (left_path) {
-               ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
-               if (ret)
-                       mlog_errno(ret);
+               ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
 
                /*
                 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
                le32_add_cpu(&rec->e_int_clusters,
                             -le32_to_cpu(rec->e_cpos));
 
-               ret = ocfs2_journal_dirty(handle, bh);
-               if (ret)
-                       mlog_errno(ret);
-
+               ocfs2_journal_dirty(handle, bh);
        }
 }
 
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
        struct buffer_head *leaf_bh = path_leaf_bh(right_path);
 
        if (left_path) {
-               int credits = handle->h_buffer_credits;
-
                /*
                 * There's a chance that left_path got passed back to
                 * us without being accounted for in the
                 * journal. Extend our transaction here to be sure we
                 * can change those blocks.
                 */
-               credits += left_path->p_tree_depth;
-
-               ret = ocfs2_extend_trans(handle, credits);
+               ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
                 * dirty this for us.
                 */
                if (left_path)
-                       ret = ocfs2_journal_dirty(handle,
-                                                 path_leaf_bh(left_path));
-                       if (ret)
-                               mlog_errno(ret);
+                       ocfs2_journal_dirty(handle,
+                                           path_leaf_bh(left_path));
        } else
                ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
                                     insert);
 
-       ret = ocfs2_journal_dirty(handle, leaf_bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, leaf_bh);
 
        if (left_path) {
                /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
                ocfs2_et_update_clusters(et,
                                         le16_to_cpu(insert_rec->e_leaf_clusters));
 
-       ret = ocfs2_journal_dirty(handle, et->et_root_bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, et->et_root_bh);
 
 out:
        ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
                goto leave;
        }
 
-       status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+       status = __ocfs2_claim_clusters(handle, data_ac, 1,
                                        clusters_to_add, &bit_off, &num_bits);
        if (status < 0) {
                if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
                goto leave;
        }
 
-       status = ocfs2_journal_dirty(handle, et->et_root_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
+       ocfs2_journal_dirty(handle, et->et_root_bh);
 
        clusters_to_add -= num_bits;
        *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
                            int index, u32 new_range,
                            struct ocfs2_alloc_context *meta_ac)
 {
-       int ret, depth, credits = handle->h_buffer_credits;
+       int ret, depth, credits;
        struct buffer_head *last_eb_bh = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
        } else
                rightmost_el = path_leaf_el(path);
 
-       credits += path->p_tree_depth +
-                  ocfs2_extend_meta_needed(et->et_root_el);
+       credits = path->p_tree_depth +
+                 ocfs2_extend_meta_needed(et->et_root_el);
        ret = ocfs2_extend_trans(handle, credits);
        if (ret) {
                mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
        return ret;
 }
 
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() looks basically the same as
+ * ocfs2_lock_allocators(), except that it accepts a number of extra
+ * blocks to reserve, and it only handles metadata allocations.
+ * Returns 0 or a negative error code; *ac is left NULL on failure.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */
+static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
+                                             struct ocfs2_extent_tree *et,
+                                             u32 extents_to_split,
+                                             struct ocfs2_alloc_context **ac,
+                                             int extra_blocks)
+{
+       int ret = 0, num_free_extents;
+       unsigned int max_recs_needed = 2 * extents_to_split;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       *ac = NULL;
+
+       num_free_extents = ocfs2_num_free_extents(osb, et);
+       if (num_free_extents < 0) {
+               ret = num_free_extents;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (!num_free_extents ||
+           (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+               extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+       if (extra_blocks) {
+               ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
+               if (ret < 0) {
+                       if (ret != -ENOSPC)
+                               mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+out:
+       if (ret) {
+               if (*ac) {
+                       ocfs2_free_alloc_context(*ac);
+                       *ac = NULL;
+               }
+       }
+
+       return ret;
+}
+
 int ocfs2_remove_btree_range(struct inode *inode,
                             struct ocfs2_extent_tree *et,
-                            u32 cpos, u32 phys_cpos, u32 len,
-                            struct ocfs2_cached_dealloc_ctxt *dealloc)
+                            u32 cpos, u32 phys_cpos, u32 len, int flags,
+                            struct ocfs2_cached_dealloc_ctxt *dealloc,
+                            u64 refcount_loc)
 {
-       int ret;
+       int ret, credits = 0, extra_blocks = 0;
        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct inode *tl_inode = osb->osb_tl_inode;
        handle_t *handle;
        struct ocfs2_alloc_context *meta_ac = NULL;
+       struct ocfs2_refcount_tree *ref_tree = NULL;
+
+       if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
+               BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+                        OCFS2_HAS_REFCOUNT_FL));
+
+               ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+                                              &ref_tree, NULL);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
 
-       ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+               ret = ocfs2_prepare_refcount_change_for_del(inode,
+                                                           refcount_loc,
+                                                           phys_blkno,
+                                                           len,
+                                                           &credits,
+                                                           &extra_blocks);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
+                                                extra_blocks);
        if (ret) {
                mlog_errno(ret);
                return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
                }
        }
 
-       handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+       handle = ocfs2_start_trans(osb,
+                       ocfs2_remove_extent_credits(osb->sb) + credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
 
        ocfs2_et_update_clusters(et, -len);
 
-       ret = ocfs2_journal_dirty(handle, et->et_root_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, et->et_root_bh);
 
-       ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
-       if (ret)
-               mlog_errno(ret);
+       if (phys_blkno) {
+               if (flags & OCFS2_EXT_REFCOUNTED)
+                       ret = ocfs2_decrease_refcount(inode, handle,
+                                       ocfs2_blocks_to_clusters(osb->sb,
+                                                                phys_blkno),
+                                       len, meta_ac,
+                                       dealloc, 1);
+               else
+                       ret = ocfs2_truncate_log_append(osb, handle,
+                                                       phys_blkno, len);
+               if (ret)
+                       mlog_errno(ret);
+
+       }
 
 out_commit:
        ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
        if (meta_ac)
                ocfs2_free_alloc_context(meta_ac);
 
+       if (ref_tree)
+               ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
        return ret;
 }
 
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
        }
        tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
 
-       status = ocfs2_journal_dirty(handle, tl_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, tl_bh);
 
 bail:
        mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 
                tl->tl_used = cpu_to_le16(i);
 
-               status = ocfs2_journal_dirty(handle, tl_bh);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
+               ocfs2_journal_dirty(handle, tl_bh);
 
                /* TODO: Perhaps we can calculate the bulk of the
                 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  */
 struct ocfs2_cached_block_free {
        struct ocfs2_cached_block_free          *free_next;
+       u64                                     free_bg;
        u64                                     free_blk;
        unsigned int                            free_bit;
 };
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
        }
 
        while (head) {
-               bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
-                                                     head->free_bit);
+               if (head->free_bg)
+                       bg_blkno = head->free_bg;
+               else
+                       bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+                                                             head->free_bit);
                mlog(0, "Free bit: (bit %u, blkno %llu)\n",
                     head->free_bit, (unsigned long long)head->free_blk);
 
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
        int ret = 0;
        struct ocfs2_cached_block_free *item;
 
-       item = kmalloc(sizeof(*item), GFP_NOFS);
+       item = kzalloc(sizeof(*item), GFP_NOFS);
        if (item == NULL) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
 }
 
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-                             int type, int slot, u64 blkno,
-                             unsigned int bit)
+                             int type, int slot, u64 suballoc,
+                             u64 blkno, unsigned int bit)
 {
        int ret;
        struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
                goto out;
        }
 
-       item = kmalloc(sizeof(*item), GFP_NOFS);
+       item = kzalloc(sizeof(*item), GFP_NOFS);
        if (item == NULL) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
        mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
             type, slot, bit, (unsigned long long)blkno);
 
+       item->free_bg = suballoc;
        item->free_blk = blkno;
        item->free_bit = bit;
        item->free_next = fl->f_first;
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 {
        return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
                                         le16_to_cpu(eb->h_suballoc_slot),
+                                        le64_to_cpu(eb->h_suballoc_loc),
                                         le64_to_cpu(eb->h_blkno),
                                         le16_to_cpu(eb->h_suballoc_bit));
 }
 
-/* This function will figure out whether the currently last extent
- * block will be deleted, and if it will, what the new last extent
- * block will be so we can update his h_next_leaf_blk field, as well
- * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct inode *inode,
-                                      unsigned int clusters_to_del,
-                                      struct ocfs2_path *path,
-                                      struct buffer_head **new_last_eb)
-{
-       int next_free, ret = 0;
-       u32 cpos;
-       struct ocfs2_extent_rec *rec;
-       struct ocfs2_extent_block *eb;
-       struct ocfs2_extent_list *el;
-       struct buffer_head *bh = NULL;
-
-       *new_last_eb = NULL;
-
-       /* we have no tree, so of course, no last_eb. */
-       if (!path->p_tree_depth)
-               goto out;
-
-       /* trunc to zero special case - this makes tree_depth = 0
-        * regardless of what it is.  */
-       if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
-               goto out;
-
-       el = path_leaf_el(path);
-       BUG_ON(!el->l_next_free_rec);
-
-       /*
-        * Make sure that this extent list will actually be empty
-        * after we clear away the data. We can shortcut out if
-        * there's more than one non-empty extent in the
-        * list. Otherwise, a check of the remaining extent is
-        * necessary.
-        */
-       next_free = le16_to_cpu(el->l_next_free_rec);
-       rec = NULL;
-       if (ocfs2_is_empty_extent(&el->l_recs[0])) {
-               if (next_free > 2)
-                       goto out;
-
-               /* We may have a valid extent in index 1, check it. */
-               if (next_free == 2)
-                       rec = &el->l_recs[1];
-
-               /*
-                * Fall through - no more nonempty extents, so we want
-                * to delete this leaf.
-                */
-       } else {
-               if (next_free > 1)
-                       goto out;
-
-               rec = &el->l_recs[0];
-       }
-
-       if (rec) {
-               /*
-                * Check it we'll only be trimming off the end of this
-                * cluster.
-                */
-               if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
-                       goto out;
-       }
-
-       ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
-
-       ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
-
-       eb = (struct ocfs2_extent_block *) bh->b_data;
-       el = &eb->h_list;
-
-       /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
-        * Any corruption is a code bug. */
-       BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-
-       *new_last_eb = bh;
-       get_bh(*new_last_eb);
-       mlog(0, "returning block %llu, (cpos: %u)\n",
-            (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
-out:
-       brelse(bh);
-
-       return ret;
-}
-
-/*
- * Trim some clusters off the rightmost edge of a tree. Only called
- * during truncate.
- *
- * The caller needs to:
- *   - start journaling of each path component.
- *   - compute and fully set up any new last ext block
- */
-static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
-                          handle_t *handle, struct ocfs2_truncate_context *tc,
-                          u32 clusters_to_del, u64 *delete_start, u8 *flags)
-{
-       int ret, i, index = path->p_tree_depth;
-       u32 new_edge = 0;
-       u64 deleted_eb = 0;
-       struct buffer_head *bh;
-       struct ocfs2_extent_list *el;
-       struct ocfs2_extent_rec *rec;
-
-       *delete_start = 0;
-       *flags = 0;
-
-       while (index >= 0) {
-               bh = path->p_node[index].bh;
-               el = path->p_node[index].el;
-
-               mlog(0, "traveling tree (index = %d, block = %llu)\n",
-                    index,  (unsigned long long)bh->b_blocknr);
-
-               BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
-
-               if (index !=
-                   (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
-                       ocfs2_error(inode->i_sb,
-                                   "Inode %lu has invalid ext. block %llu",
-                                   inode->i_ino,
-                                   (unsigned long long)bh->b_blocknr);
-                       ret = -EROFS;
-                       goto out;
-               }
-
-find_tail_record:
-               i = le16_to_cpu(el->l_next_free_rec) - 1;
-               rec = &el->l_recs[i];
-
-               mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
-                    "next = %u\n", i, le32_to_cpu(rec->e_cpos),
-                    ocfs2_rec_clusters(el, rec),
-                    (unsigned long long)le64_to_cpu(rec->e_blkno),
-                    le16_to_cpu(el->l_next_free_rec));
-
-               BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
-
-               if (le16_to_cpu(el->l_tree_depth) == 0) {
-                       /*
-                        * If the leaf block contains a single empty
-                        * extent and no records, we can just remove
-                        * the block.
-                        */
-                       if (i == 0 && ocfs2_is_empty_extent(rec)) {
-                               memset(rec, 0,
-                                      sizeof(struct ocfs2_extent_rec));
-                               el->l_next_free_rec = cpu_to_le16(0);
-
-                               goto delete;
-                       }
-
-                       /*
-                        * Remove any empty extents by shifting things
-                        * left. That should make life much easier on
-                        * the code below. This condition is rare
-                        * enough that we shouldn't see a performance
-                        * hit.
-                        */
-                       if (ocfs2_is_empty_extent(&el->l_recs[0])) {
-                               le16_add_cpu(&el->l_next_free_rec, -1);
-
-                               for(i = 0;
-                                   i < le16_to_cpu(el->l_next_free_rec); i++)
-                                       el->l_recs[i] = el->l_recs[i + 1];
-
-                               memset(&el->l_recs[i], 0,
-                                      sizeof(struct ocfs2_extent_rec));
-
-                               /*
-                                * We've modified our extent list. The
-                                * simplest way to handle this change
-                                * is to being the search from the
-                                * start again.
-                                */
-                               goto find_tail_record;
-                       }
-
-                       le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
-
-                       /*
-                        * We'll use "new_edge" on our way back up the
-                        * tree to know what our rightmost cpos is.
-                        */
-                       new_edge = le16_to_cpu(rec->e_leaf_clusters);
-                       new_edge += le32_to_cpu(rec->e_cpos);
-
-                       /*
-                        * The caller will use this to delete data blocks.
-                        */
-                       *delete_start = le64_to_cpu(rec->e_blkno)
-                               + ocfs2_clusters_to_blocks(inode->i_sb,
-                                       le16_to_cpu(rec->e_leaf_clusters));
-                       *flags = rec->e_flags;
-
-                       /*
-                        * If it's now empty, remove this record.
-                        */
-                       if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
-                               memset(rec, 0,
-                                      sizeof(struct ocfs2_extent_rec));
-                               le16_add_cpu(&el->l_next_free_rec, -1);
-                       }
-               } else {
-                       if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
-                               memset(rec, 0,
-                                      sizeof(struct ocfs2_extent_rec));
-                               le16_add_cpu(&el->l_next_free_rec, -1);
-
-                               goto delete;
-                       }
-
-                       /* Can this actually happen? */
-                       if (le16_to_cpu(el->l_next_free_rec) == 0)
-                               goto delete;
-
-                       /*
-                        * We never actually deleted any clusters
-                        * because our leaf was empty. There's no
-                        * reason to adjust the rightmost edge then.
-                        */
-                       if (new_edge == 0)
-                               goto delete;
-
-                       rec->e_int_clusters = cpu_to_le32(new_edge);
-                       le32_add_cpu(&rec->e_int_clusters,
-                                    -le32_to_cpu(rec->e_cpos));
-
-                        /*
-                         * A deleted child record should have been
-                         * caught above.
-                         */
-                        BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
-               }
-
-delete:
-               ret = ocfs2_journal_dirty(handle, bh);
-               if (ret) {
-                       mlog_errno(ret);
-                       goto out;
-               }
-
-               mlog(0, "extent list container %llu, after: record %d: "
-                    "(%u, %u, %llu), next = %u.\n",
-                    (unsigned long long)bh->b_blocknr, i,
-                    le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
-                    (unsigned long long)le64_to_cpu(rec->e_blkno),
-                    le16_to_cpu(el->l_next_free_rec));
-
-               /*
-                * We must be careful to only attempt delete of an
-                * extent block (and not the root inode block).
-                */
-               if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
-                       struct ocfs2_extent_block *eb =
-                               (struct ocfs2_extent_block *)bh->b_data;
-
-                       /*
-                        * Save this for use when processing the
-                        * parent block.
-                        */
-                       deleted_eb = le64_to_cpu(eb->h_blkno);
-
-                       mlog(0, "deleting this extent block.\n");
-
-                       ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
-
-                       BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
-                       BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
-                       BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
-
-                       ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
-                       /* An error here is not fatal. */
-                       if (ret < 0)
-                               mlog_errno(ret);
-               } else {
-                       deleted_eb = 0;
-               }
-
-               index--;
-       }
-
-       ret = 0;
-out:
-       return ret;
-}
-
-static int ocfs2_do_truncate(struct ocfs2_super *osb,
-                            unsigned int clusters_to_del,
-                            struct inode *inode,
-                            struct buffer_head *fe_bh,
-                            handle_t *handle,
-                            struct ocfs2_truncate_context *tc,
-                            struct ocfs2_path *path,
-                            struct ocfs2_alloc_context *meta_ac)
-{
-       int status;
-       struct ocfs2_dinode *fe;
-       struct ocfs2_extent_block *last_eb = NULL;
-       struct ocfs2_extent_list *el;
-       struct buffer_head *last_eb_bh = NULL;
-       u64 delete_blk = 0;
-       u8 rec_flags;
-
-       fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
-       status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
-                                            path, &last_eb_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
-
-       /*
-        * Each component will be touched, so we might as well journal
-        * here to avoid having to handle errors later.
-        */
-       status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
-
-       if (last_eb_bh) {
-               status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
-                                                OCFS2_JOURNAL_ACCESS_WRITE);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-
-               last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-       }
-
-       el = &(fe->id2.i_list);
-
-       /*
-        * Lower levels depend on this never happening, but it's best
-        * to check it up here before changing the tree.
-        */
-       if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
-               ocfs2_error(inode->i_sb,
-                           "Inode %lu has an empty extent record, depth %u\n",
-                           inode->i_ino, le16_to_cpu(el->l_tree_depth));
-               status = -EROFS;
-               goto bail;
-       }
-
-       dquot_free_space_nodirty(inode,
-                       ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
-       spin_lock(&OCFS2_I(inode)->ip_lock);
-       OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
-                                     clusters_to_del;
-       spin_unlock(&OCFS2_I(inode)->ip_lock);
-       le32_add_cpu(&fe->i_clusters, -clusters_to_del);
-       inode->i_blocks = ocfs2_inode_sector_count(inode);
-
-       status = ocfs2_trim_tree(inode, path, handle, tc,
-                                clusters_to_del, &delete_blk, &rec_flags);
-       if (status) {
-               mlog_errno(status);
-               goto bail;
-       }
-
-       if (le32_to_cpu(fe->i_clusters) == 0) {
-               /* trunc to zero is a special case. */
-               el->l_tree_depth = 0;
-               fe->i_last_eb_blk = 0;
-       } else if (last_eb)
-               fe->i_last_eb_blk = last_eb->h_blkno;
-
-       status = ocfs2_journal_dirty(handle, fe_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
-
-       if (last_eb) {
-               /* If there will be a new last extent block, then by
-                * definition, there cannot be any leaves to the right of
-                * him. */
-               last_eb->h_next_leaf_blk = 0;
-               status = ocfs2_journal_dirty(handle, last_eb_bh);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-       }
-
-       if (delete_blk) {
-               if (rec_flags & OCFS2_EXT_REFCOUNTED)
-                       status = ocfs2_decrease_refcount(inode, handle,
-                                       ocfs2_blocks_to_clusters(osb->sb,
-                                                                delete_blk),
-                                       clusters_to_del, meta_ac,
-                                       &tc->tc_dealloc, 1);
-               else
-                       status = ocfs2_truncate_log_append(osb, handle,
-                                                          delete_blk,
-                                                          clusters_to_del);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-       }
-       status = 0;
-bail:
-       brelse(last_eb_bh);
-       mlog_exit(status);
-       return status;
-}
-
 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 {
        set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                        goto out_commit;
                did_quota = 1;
 
-               ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+               data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+               ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
                                           &num);
                if (ret) {
                        mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
  */
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
                          struct inode *inode,
-                         struct buffer_head *fe_bh,
-                         struct ocfs2_truncate_context *tc)
+                         struct buffer_head *di_bh)
 {
-       int status, i, credits, tl_sem = 0;
-       u32 clusters_to_del, new_highest_cpos, range;
+       int status = 0, i, flags = 0;
+       u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
        u64 blkno = 0;
        struct ocfs2_extent_list *el;
-       handle_t *handle = NULL;
-       struct inode *tl_inode = osb->osb_tl_inode;
+       struct ocfs2_extent_rec *rec;
        struct ocfs2_path *path = NULL;
-       struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
-       struct ocfs2_alloc_context *meta_ac = NULL;
-       struct ocfs2_refcount_tree *ref_tree = NULL;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+       u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+       struct ocfs2_extent_tree et;
+       struct ocfs2_cached_dealloc_ctxt dealloc;
 
        mlog_entry_void();
 
+       ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+       ocfs2_init_dealloc_ctxt(&dealloc);
+
        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
                                                     i_size_read(inode));
 
-       path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+       path = ocfs2_new_path(di_bh, &di->id2.i_list,
                              ocfs2_journal_access_di);
        if (!path) {
                status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
                goto bail;
        }
 
-       credits = 0;
-
        /*
         * Truncate always works against the rightmost tree branch.
         */
@@ -7480,101 +7064,62 @@ start:
        }
 
        i = le16_to_cpu(el->l_next_free_rec) - 1;
-       range = le32_to_cpu(el->l_recs[i].e_cpos) +
-               ocfs2_rec_clusters(el, &el->l_recs[i]);
-       if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
-               clusters_to_del = 0;
-       } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
-               clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
-               blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+       rec = &el->l_recs[i];
+       flags = rec->e_flags;
+       range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+       if (i == 0 && ocfs2_is_empty_extent(rec)) {
+               /*
+                * Lower levels depend on this never happening, but it's best
+                * to check it up here before changing the tree.
+               */
+               if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+                       ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+                                   "extent record, depth %u\n", inode->i_ino,
+                                   le16_to_cpu(root_el->l_tree_depth));
+                       status = -EROFS;
+                       goto bail;
+               }
+               trunc_cpos = le32_to_cpu(rec->e_cpos);
+               trunc_len = 0;
+               blkno = 0;
+       } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+               /*
+                * Truncate entire record.
+                */
+               trunc_cpos = le32_to_cpu(rec->e_cpos);
+               trunc_len = ocfs2_rec_clusters(el, rec);
+               blkno = le64_to_cpu(rec->e_blkno);
        } else if (range > new_highest_cpos) {
-               clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
-                                  le32_to_cpu(el->l_recs[i].e_cpos)) -
-                                 new_highest_cpos;
-               blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
-                       ocfs2_clusters_to_blocks(inode->i_sb,
-                               ocfs2_rec_clusters(el, &el->l_recs[i]) -
-                               clusters_to_del);
+               /*
+                * Partial truncate. it also should be
+                * the last truncate we're doing.
+                */
+               trunc_cpos = new_highest_cpos;
+               trunc_len = range - new_highest_cpos;
+               coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+               blkno = le64_to_cpu(rec->e_blkno) +
+                               ocfs2_clusters_to_blocks(inode->i_sb, coff);
        } else {
+               /*
+                * Truncate completed, leave happily.
+                */
                status = 0;
                goto bail;
        }
 
-       mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
-            clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-
-       if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
-               BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
-                        OCFS2_HAS_REFCOUNT_FL));
-
-               status = ocfs2_lock_refcount_tree(osb,
-                                               le64_to_cpu(di->i_refcount_loc),
-                                               1, &ref_tree, NULL);
-               if (status) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-
-               status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
-                                                              blkno,
-                                                              clusters_to_del,
-                                                              &credits,
-                                                              &meta_ac);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-       }
-
-       mutex_lock(&tl_inode->i_mutex);
-       tl_sem = 1;
-       /* ocfs2_truncate_log_needs_flush guarantees us at least one
-        * record is free for use. If there isn't any, we flush to get
-        * an empty truncate log.  */
-       if (ocfs2_truncate_log_needs_flush(osb)) {
-               status = __ocfs2_flush_truncate_log(osb);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-       }
+       phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
 
-       credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
-                                               (struct ocfs2_dinode *)fe_bh->b_data,
-                                               el);
-       handle = ocfs2_start_trans(osb, credits);
-       if (IS_ERR(handle)) {
-               status = PTR_ERR(handle);
-               handle = NULL;
-               mlog_errno(status);
-               goto bail;
-       }
-
-       status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
-                                  tc, path, meta_ac);
+       status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+                                         phys_cpos, trunc_len, flags, &dealloc,
+                                         refcount_loc);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
 
-       mutex_unlock(&tl_inode->i_mutex);
-       tl_sem = 0;
-
-       ocfs2_commit_trans(osb, handle);
-       handle = NULL;
-
        ocfs2_reinit_path(path, 1);
 
-       if (meta_ac) {
-               ocfs2_free_alloc_context(meta_ac);
-               meta_ac = NULL;
-       }
-
-       if (ref_tree) {
-               ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-               ref_tree = NULL;
-       }
-
        /*
         * The check above will catch the case where we've truncated
         * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
 
        ocfs2_schedule_truncate_log_flush(osb, 1);
 
-       if (tl_sem)
-               mutex_unlock(&tl_inode->i_mutex);
-
-       if (handle)
-               ocfs2_commit_trans(osb, handle);
-
-       if (meta_ac)
-               ocfs2_free_alloc_context(meta_ac);
-
-       if (ref_tree)
-               ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-
-       ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+       ocfs2_run_deallocs(osb, &dealloc);
 
        ocfs2_free_path(path);
 
-       /* This will drop the ext_alloc cluster lock for us */
-       ocfs2_free_truncate_context(tc);
-
        mlog_exit(status);
        return status;
 }
index 1db4359..55762b5 100644 (file)
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
                        struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_remove_btree_range(struct inode *inode,
                             struct ocfs2_extent_tree *et,
-                            u32 cpos, u32 phys_cpos, u32 len,
-                            struct ocfs2_cached_dealloc_ctxt *dealloc);
+                            u32 cpos, u32 phys_cpos, u32 len, int flags,
+                            struct ocfs2_cached_dealloc_ctxt *dealloc,
+                            u64 refcount_loc);
 
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
                           struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
                                u64 blkno, unsigned int bit);
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-                             int type, int slot, u64 blkno,
+                             int type, int slot, u64 suballoc, u64 blkno,
                              unsigned int bit);
 static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
 {
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                           struct ocfs2_truncate_context **tc);
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
                          struct inode *inode,
-                         struct buffer_head *fe_bh,
-                         struct ocfs2_truncate_context *tc);
+                         struct buffer_head *di_bh);
 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
                          unsigned int start, unsigned int end, int trunc);
 
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
                              struct ocfs2_path *path);
 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
                                   struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+                                 struct ocfs2_path *path, u32 *cpos);
 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
                            struct ocfs2_path *left,
                            struct ocfs2_path *right);
index 21441dd..3623ca2 100644 (file)
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                        goto out;
                }
 
+               if (data_ac)
+                       data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
                credits = ocfs2_calc_extend_credits(inode->i_sb,
                                                    &di->id2.i_list,
                                                    clusters_to_alloc);
index 3bb928a..c7fba39 100644 (file)
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
        define_mask(ERROR),
        define_mask(NOTICE),
        define_mask(KTHREAD),
+       define_mask(RESERVATIONS),
 };
 
 static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
index 3dfddbe..fd96e2a 100644 (file)
 #define ML_ERROR       0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE      0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_KTHREAD     0x0000000400000000ULL /* kernel thread activity */
+#define        ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
index 73e743e..aa75ca3 100644 (file)
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
                        o2net_sc_queue_work(sc, &sc->sc_connect_work);
                        break;
                default:
+                       printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+                             " shutdown, state %d\n",
+                             SC_NODEF_ARGS(sc), sk->sk_state);
                        o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
                        break;
        }
index efd77d0..f04ebcf 100644 (file)
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                        else
                                de->inode = 0;
                        dir->i_version++;
-                       status = ocfs2_journal_dirty(handle, bh);
+                       ocfs2_journal_dirty(handle, bh);
                        goto bail;
                }
                i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
                                ocfs2_recalc_free_list(dir, handle, lookup);
 
                        dir->i_version++;
-                       status = ocfs2_journal_dirty(handle, insert_bh);
+                       ocfs2_journal_dirty(handle, insert_bh);
                        retval = 0;
                        goto bail;
                }
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
        }
 
        ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
-
        ocfs2_journal_dirty(handle, di_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
 
        i_size_write(inode, size);
        inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                ocfs2_init_dir_trailer(inode, new_bh, size);
        }
 
-       status = ocfs2_journal_dirty(handle, new_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, new_bh);
 
        i_size_write(inode, inode->i_sb->s_blocksize);
        inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
        int ret;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
        u16 dr_suballoc_bit;
-       u64 dr_blkno;
+       u64 suballoc_loc, dr_blkno;
        unsigned int num_bits;
        struct buffer_head *dx_root_bh = NULL;
        struct ocfs2_dx_root_block *dx_root;
        struct ocfs2_dir_block_trailer *trailer =
                ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
 
-       ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
-                                  &num_bits, &dr_blkno);
+       ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+                                  &dr_suballoc_bit, &num_bits, &dr_blkno);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
        memset(dx_root, 0, osb->sb->s_blocksize);
        strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
        dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+       dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
        dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
        dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
        dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
                dx_root->dr_list.l_count =
                        cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
        }
-
-       ret = ocfs2_journal_dirty(handle, dx_root_bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, dx_root_bh);
 
        ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
                                      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
        OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
 
-       ret = ocfs2_journal_dirty(handle, di_bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, di_bh);
 
        *ret_dx_root_bh = dx_root_bh;
        dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
         * chance of contiguousness as the directory grows in number
         * of entries.
         */
-       ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+       ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         * if we only get one now, that's enough to continue. The rest
         * will be claimed after the conversion to extents.
         */
-       ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+       if (ocfs2_dir_resv_allowed(osb))
+               data_ac->ac_resv = &oi->ip_la_data_resv;
+       ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                ocfs2_init_dir_trailer(dir, dirdata_bh, i);
        }
 
-       ret = ocfs2_journal_dirty(handle, dirdata_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, dirdata_bh);
 
        if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
                /*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         */
        dir->i_blocks = ocfs2_inode_sector_count(dir);
 
-       ret = ocfs2_journal_dirty(handle, di_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, di_bh);
 
        if (ocfs2_supports_indexed_dirs(osb)) {
                ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         * pass. Claim the 2nd cluster as a separate extent.
         */
        if (alloc > len) {
-               ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+               ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
                                           &len);
                if (ret) {
                        mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
                        goto bail;
                }
 
+               if (ocfs2_dir_resv_allowed(osb))
+                       data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
                credits = ocfs2_calc_extend_credits(sb, el, 1);
        } else {
                spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
        } else {
                de->rec_len = cpu_to_le16(sb->s_blocksize);
        }
-       status = ocfs2_journal_dirty(handle, new_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, new_bh);
 
        dir_i_size += dir->i_sb->s_blocksize;
        i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
             sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
             dx_leaf_sort_swap);
 
-       ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, dx_leaf_bh);
 
        ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
                                           &split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
 
        blk = le64_to_cpu(dx_root->dr_blkno);
        bit = le16_to_cpu(dx_root->dr_suballoc_bit);
-       bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+       if (dx_root->dr_suballoc_loc)
+               bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+       else
+               bg_blkno = ocfs2_which_suballoc_group(blk, bit);
        ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
                                       bit, bg_blkno, 1);
        if (ret)
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
 
                p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
 
-               ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
-                                              &dealloc);
+               ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+                                              &dealloc, 0);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
index 12d5eb7..f449991 100644 (file)
@@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
        return 0;
 }
 
-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
        mlog_entry_void();
 
@@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 }
 
 
-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
        mlog_entry_void();
 
@@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
                                     lock->ml.node, &status);
        if (ret < 0)
-               mlog_errno(ret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+                    lock->ml.node);
        else {
                if (status == DLM_RECOVERING) {
                        mlog(ML_ERROR, "sent AST to node %u, it thinks this "
index 0102be3..4b6ae2c 100644 (file)
@@ -37,7 +37,7 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
 #define DLM_THREAD_MS                  200   // flush at least every 200 ms
 
-#define DLM_HASH_SIZE_DEFAULT  (1 << 14)
+#define DLM_HASH_SIZE_DEFAULT  (1 << 17)
 #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
 # define DLM_HASH_PAGES                1
 #else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_do_local_ast(struct dlm_ctxt *dlm,
                      struct dlm_lock_resource *res,
                      struct dlm_lock *lock);
index 90803b4..9f30491 100644 (file)
@@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
                } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
                        dlm_error(ret);
        } else {
-               mlog_errno(tmpret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+                    res->owner);
                if (dlm_is_host_down(tmpret)) {
                        /* instead of logging the same network error over
                         * and over, sleep here and wait for the heartbeat
index 988c905..6b5a492 100644 (file)
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
 
        assert_spin_locked(&dlm->spinlock);
 
-       printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
+       printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
 
        while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
                                     node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 
        node = exit_msg->node_idx;
 
-       printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
+       printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
 
        spin_lock(&dlm->spinlock);
        clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
        status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
                                    &leave_msg, sizeof(leave_msg), node,
                                    NULL);
-
+       if (status < 0)
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
        mlog(0, "status return %d from o2net_send_message\n", status);
 
        return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
                set_bit(assert->node_idx, dlm->domain_map);
                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
 
-               printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+               printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
                       assert->node_idx, dlm->name);
                __dlm_print_nodes(dlm);
 
@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
                                    &cancel_msg, sizeof(cancel_msg), node,
                                    NULL);
        if (status < 0) {
-               mlog_errno(status);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+                    node);
                goto bail;
        }
 
@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
        byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
 
        status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
-                                   sizeof(join_msg), node,
-                                   &join_resp);
+                                   sizeof(join_msg), node, &join_resp);
        if (status < 0 && status != -ENOPROTOOPT) {
-               mlog_errno(status);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+                    node);
                goto bail;
        }
        dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
                                    &assert_msg, sizeof(assert_msg), node,
                                    NULL);
        if (status < 0)
-               mlog_errno(status);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+                    node);
 
        return status;
 }
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
                goto leave;
        }
 
-       dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+       dlm->name = kstrdup(domain, GFP_KERNEL);
        if (dlm->name == NULL) {
                mlog_errno(-ENOMEM);
                kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        for (i = 0; i < DLM_HASH_BUCKETS; i++)
                INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
 
-       strcpy(dlm->name, domain);
        dlm->key = key;
        dlm->node_num = o2nm_this_node();
 
index 7333377..69cf369 100644 (file)
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
                        BUG();
                }
        } else {
-               mlog_errno(tmpret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
+                    res->owner);
                if (dlm_is_host_down(tmpret)) {
                        ret = DLM_RECOVERING;
                        mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
        struct dlm_lock *lock;
        int kernel_allocated = 0;
 
-       lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+       lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
        if (!lock)
                return NULL;
 
index 9289b43..4a7506a 100644 (file)
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 {
        struct dlm_lock_resource *res = NULL;
 
-       res = (struct dlm_lock_resource *)
-                               kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+       res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
        if (!res)
                goto error;
 
-       res->lockname.name = (char *)
-                               kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+       res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
        if (!res->lockname.name)
                goto error;
 
@@ -757,8 +755,7 @@ lookup:
                spin_unlock(&dlm->spinlock);
                mlog(0, "allocating a new resource\n");
                /* nothing found and we need to allocate one. */
-               alloc_mle = (struct dlm_master_list_entry *)
-                       kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+               alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
                if (!alloc_mle)
                        goto leave;
                res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
                        spin_unlock(&dlm->master_lock);
                        spin_unlock(&dlm->spinlock);
 
-                       mle = (struct dlm_master_list_entry *)
-                               kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+                       mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
                        if (!mle) {
                                response = DLM_MASTER_RESP_ERROR;
                                mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
                tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
                                            &assert, sizeof(assert), to, &r);
                if (tmpret < 0) {
-                       mlog(0, "assert_master returned %d!\n", tmpret);
+                       mlog(ML_ERROR, "Error %d when sending message %u (key "
+                            "0x%x) to node %u\n", tmpret,
+                            DLM_ASSERT_MASTER_MSG, dlm->key, to);
                        if (!dlm_is_host_down(tmpret)) {
                                mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
                                BUG();
@@ -2205,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
        ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
                                 &deref, sizeof(deref), res->owner, &r);
        if (ret < 0)
-               mlog_errno(ret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
+                    res->owner);
        else if (r < 0) {
                /* BAD.  other node says I did not have a ref. */
                mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2452,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
                goto leave;
        }
 
-       mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-                                                               GFP_NOFS);
+       mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
        if (!mle) {
                mlog_errno(ret);
                goto leave;
@@ -2975,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                                         &migrate, sizeof(migrate), nodenum,
                                         &status);
                if (ret < 0) {
-                       mlog(0, "migrate_request returned %d!\n", ret);
+                       mlog(ML_ERROR, "Error %d when sending message %u (key "
+                            "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
+                            dlm->key, nodenum);
                        if (!dlm_is_host_down(ret)) {
                                mlog(ML_ERROR, "unhandled error=%d!\n", ret);
                                BUG();
@@ -3033,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
        hash = dlm_lockid_hash(name, namelen);
 
        /* preallocate.. if this fails, abort */
-       mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-                                                        GFP_NOFS);
+       mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 
        if (!mle) {
                ret = -ENOMEM;
index b4f99de..f8b75ce 100644 (file)
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 
        /* negative status is handled by caller */
        if (ret < 0)
-               mlog_errno(ret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key "
+                    "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
+                    dlm->key, request_from);
 
        // return from here, then
        // sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
        ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
                                 sizeof(done_msg), send_to, &tmpret);
        if (ret < 0) {
+               mlog(ML_ERROR, "Error %d when sending message %u (key "
+                    "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
+                    dlm->key, send_to);
                if (!dlm_is_host_down(ret)) {
-                       mlog_errno(ret);
-                       mlog(ML_ERROR, "%s: unknown error sending data-done "
-                            "to %u\n", dlm->name, send_to);
                        BUG();
                }
        } else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
        if (ret < 0) {
                /* XXX: negative status is not handled.
                 * this will end up killing this node. */
-               mlog_errno(ret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key "
+                    "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
+                    dlm->key, send_to);
        } else {
                /* might get an -ENOMEM back here */
                ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                                 &req, sizeof(req), nodenum, &status);
        /* XXX: negative status not handled properly here. */
        if (ret < 0)
-               mlog_errno(ret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key "
+                    "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+                    dlm->key, nodenum);
        else {
                BUG_ON(status < 0);
                BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
                if (dlm_is_host_down(ret)) {
                        /* node is down.  not involved in recovery
                         * so just keep going */
-                       mlog(0, "%s: node %u was down when sending "
+                       mlog(ML_NOTICE, "%s: node %u was down when sending "
                             "begin reco msg (%d)\n", dlm->name, nodenum, ret);
                        ret = 0;
                }
@@ -2660,11 +2666,12 @@ retry:
                }
                if (ret < 0) {
                        struct dlm_lock_resource *res;
+
                        /* this is now a serious problem, possibly ENOMEM
                         * in the network stack.  must retry */
                        mlog_errno(ret);
                        mlog(ML_ERROR, "begin reco of dlm %s to node %u "
-                           returned %d\n", dlm->name, nodenum, ret);
+                            "returned %d\n", dlm->name, nodenum, ret);
                        res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
                                                 DLM_RECOVERY_LOCK_NAME_LEN);
                        if (res) {
@@ -2789,7 +2796,9 @@ stage2:
                if (ret >= 0)
                        ret = status;
                if (ret < 0) {
-                       mlog_errno(ret);
+                       mlog(ML_ERROR, "Error %d when sending message %u (key "
+                            "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+                            dlm->key, nodenum);
                        if (dlm_is_host_down(ret)) {
                                /* this has no effect on this recovery
                                 * session, so set the status to zero to
index 11a6d1f..d4f73ca 100644 (file)
@@ -309,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
         * spinlock, and because we know that it is not migrating/
         * recovering/in-progress, it is fine to reserve asts and
         * basts right before queueing them all throughout */
+       assert_spin_locked(&dlm->ast_lock);
        assert_spin_locked(&res->spinlock);
        BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
                              DLM_LOCK_RES_RECOVERING|
@@ -337,7 +338,7 @@ converting:
                        /* queue the BAST if not already */
                        if (lock->ml.highest_blocked == LKM_IVMODE) {
                                __dlm_lockres_reserve_ast(res);
-                               dlm_queue_bast(dlm, lock);
+                               __dlm_queue_bast(dlm, lock);
                        }
                        /* update the highest_blocked if needed */
                        if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -355,7 +356,7 @@ converting:
                        can_grant = 0;
                        if (lock->ml.highest_blocked == LKM_IVMODE) {
                                __dlm_lockres_reserve_ast(res);
-                               dlm_queue_bast(dlm, lock);
+                               __dlm_queue_bast(dlm, lock);
                        }
                        if (lock->ml.highest_blocked < target->ml.convert_type)
                                lock->ml.highest_blocked =
@@ -383,7 +384,7 @@ converting:
                spin_unlock(&target->spinlock);
 
                __dlm_lockres_reserve_ast(res);
-               dlm_queue_ast(dlm, target);
+               __dlm_queue_ast(dlm, target);
                /* go back and check for more */
                goto converting;
        }
@@ -402,7 +403,7 @@ blocked:
                        can_grant = 0;
                        if (lock->ml.highest_blocked == LKM_IVMODE) {
                                __dlm_lockres_reserve_ast(res);
-                               dlm_queue_bast(dlm, lock);
+                               __dlm_queue_bast(dlm, lock);
                        }
                        if (lock->ml.highest_blocked < target->ml.type)
                                lock->ml.highest_blocked = target->ml.type;
@@ -418,7 +419,7 @@ blocked:
                        can_grant = 0;
                        if (lock->ml.highest_blocked == LKM_IVMODE) {
                                __dlm_lockres_reserve_ast(res);
-                               dlm_queue_bast(dlm, lock);
+                               __dlm_queue_bast(dlm, lock);
                        }
                        if (lock->ml.highest_blocked < target->ml.type)
                                lock->ml.highest_blocked = target->ml.type;
@@ -444,7 +445,7 @@ blocked:
                spin_unlock(&target->spinlock);
 
                __dlm_lockres_reserve_ast(res);
-               dlm_queue_ast(dlm, target);
+               __dlm_queue_ast(dlm, target);
                /* go back and check for more */
                goto converting;
        }
@@ -674,6 +675,7 @@ static int dlm_thread(void *data)
                        /* lockres can be re-dirtied/re-added to the
                         * dirty_list in this gap, but that is ok */
 
+                       spin_lock(&dlm->ast_lock);
                        spin_lock(&res->spinlock);
                        if (res->owner != dlm->node_num) {
                                __dlm_print_one_lock_resource(res);
@@ -694,6 +696,7 @@ static int dlm_thread(void *data)
                                /* move it to the tail and keep going */
                                res->state &= ~DLM_LOCK_RES_DIRTY;
                                spin_unlock(&res->spinlock);
+                               spin_unlock(&dlm->ast_lock);
                                mlog(0, "delaying list shuffling for in-"
                                     "progress lockres %.*s, state=%d\n",
                                     res->lockname.len, res->lockname.name,
@@ -715,6 +718,7 @@ static int dlm_thread(void *data)
                        dlm_shuffle_lists(dlm, res);
                        res->state &= ~DLM_LOCK_RES_DIRTY;
                        spin_unlock(&res->spinlock);
+                       spin_unlock(&dlm->ast_lock);
 
                        dlm_lockres_calc_usage(dlm, res);
 
index b47c1b9..817287c 100644 (file)
@@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
                        mlog(0, "master was in-progress.  retry\n");
                ret = status;
        } else {
-               mlog_errno(tmpret);
+               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+                    "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
                if (dlm_is_host_down(tmpret)) {
                        /* NOTE: this seems strange, but it is what we want.
                         * when the master goes down during a cancel or
index a5fbd9c..f74f140 100644 (file)
@@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
        inode->i_atime = CURRENT_TIME;
        di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
        di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-
-       ret = ocfs2_journal_dirty(handle, bh);
-       if (ret < 0)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, bh);
 
 out_commit:
        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
-       status = ocfs2_journal_dirty(handle, fe_bh);
-       if (status < 0)
-               mlog_errno(status);
+       ocfs2_journal_dirty(handle, fe_bh);
 
 out_commit:
        ocfs2_commit_trans(osb, handle);
@@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode,
        int status = 0;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       struct ocfs2_truncate_context *tc = NULL;
 
        mlog_entry("(inode = %llu, new_i_size = %llu\n",
                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
+       ocfs2_resv_discard(&osb->osb_la_resmap,
+                          &OCFS2_I(inode)->ip_la_data_resv);
+
        /*
         * The inode lock forced other nodes to sync and drop their
         * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode,
                goto bail_unlock_sem;
        }
 
-       status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail_unlock_sem;
-       }
-
-       status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+       status = ocfs2_commit_truncate(osb, inode, di_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail_unlock_sem;
@@ -666,11 +657,7 @@ restarted_transaction:
                goto leave;
        }
 
-       status = ocfs2_journal_dirty(handle, bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
+       ocfs2_journal_dirty(handle, bh);
 
        spin_lock(&OCFS2_I(inode)->ip_lock);
        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -1195,9 +1182,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
        di = (struct ocfs2_dinode *) bh->b_data;
        di->i_mode = cpu_to_le16(inode->i_mode);
 
-       ret = ocfs2_journal_dirty(handle, bh);
-       if (ret < 0)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, bh);
 
 out_trans:
        ocfs2_commit_trans(osb, handle);
@@ -1434,16 +1419,90 @@ out:
        return ret;
 }
 
+/*
+ * Scan @el from right to left, starting at l_next_free_rec - 1, and
+ * return the index of the first record whose e_cpos is below @pos.
+ * Returns -1 when no such record exists in this extent list.
+ */
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+       int idx = le16_to_cpu(el->l_next_free_rec) - 1;
+
+       /* Step left past every record that starts at or beyond pos. */
+       while (idx >= 0 && le32_to_cpu(el->l_recs[idx].e_cpos) >= pos)
+               idx--;
+
+       return idx;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+ *
+ * Fills *trunc_cpos, *trunc_len and *blkno, lowers *trunc_end to the start
+ * of the range to remove, and sets *done to 1 once nothing is left (else 0).
+ */
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+                                struct ocfs2_extent_list *el,
+                                struct ocfs2_extent_rec *rec,
+                                u32 trunc_start, u32 *trunc_cpos,
+                                u32 *trunc_len, u32 *trunc_end,
+                                u64 *blkno, int *done)
+{
+       int ret = 0;
+       u32 coff, range;
+
+       range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+       if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+               *trunc_cpos = le32_to_cpu(rec->e_cpos);
+               /* Skip holes if any. */
+               if (range < *trunc_end)
+                       *trunc_end = range;
+               *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+               *blkno = le64_to_cpu(rec->e_blkno);
+               *trunc_end = le32_to_cpu(rec->e_cpos);
+       } else if (range > trunc_start) {
+               *trunc_cpos = trunc_start;
+               /* The record may end before trunc_end; never go past it. */
+               if (range < *trunc_end)
+                       *trunc_end = range;
+               *trunc_len = *trunc_end - trunc_start;
+               coff = trunc_start - le32_to_cpu(rec->e_cpos);
+               *blkno = le64_to_cpu(rec->e_blkno) +
+                               ocfs2_clusters_to_blocks(inode->i_sb, coff);
+               *trunc_end = trunc_start;
+       } else {
+               /*
+                * Either the last record has been removed or trunc_start
+                * was within a hole; both mean hole punching is complete.
+                */
+               ret = 1;
+       }
+
+       *done = ret;
+}
+
 static int ocfs2_remove_inode_range(struct inode *inode,
                                    struct buffer_head *di_bh, u64 byte_start,
                                    u64 byte_len)
 {
-       int ret = 0;
-       u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+       int ret = 0, flags = 0, done = 0, i;
+       u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+       u32 cluster_in_el;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct address_space *mapping = inode->i_mapping;
        struct ocfs2_extent_tree et;
+       struct ocfs2_path *path = NULL;
+       struct ocfs2_extent_list *el = NULL;
+       struct ocfs2_extent_rec *rec = NULL;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
 
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
        ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1469,17 +1528,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
                goto out;
        }
 
+       /*
+        * For reflinks, we may need to CoW 2 clusters which might be
+        * partially zero'd later, if the hole's start and end offsets are
+        * not exactly aligned to the cluster size.
+        */
+
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+               ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
-       trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
-       if (trunc_len >= trunc_start)
-               trunc_len -= trunc_start;
-       else
-               trunc_len = 0;
+       trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+       cluster_in_el = trunc_end;
 
-       mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+       mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno,
             (unsigned long long)byte_start,
-            (unsigned long long)byte_len, trunc_start, trunc_len);
+            (unsigned long long)byte_len, trunc_start, trunc_end);
 
        ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
        if (ret) {
@@ -1487,31 +1564,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
                goto out;
        }
 
-       cpos = trunc_start;
-       while (trunc_len) {
-               ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
-                                        &alloc_size, NULL);
+       path = ocfs2_new_path_from_et(&et);
+       if (!path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       while (trunc_end > trunc_start) {
+
+               ret = ocfs2_find_path(INODE_CACHE(inode), path,
+                                     cluster_in_el);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
 
-               if (alloc_size > trunc_len)
-                       alloc_size = trunc_len;
+               el = path_leaf_el(path);
 
-               /* Only do work for non-holes */
-               if (phys_cpos != 0) {
-                       ret = ocfs2_remove_btree_range(inode, &et, cpos,
-                                                      phys_cpos, alloc_size,
-                                                      &dealloc);
+               i = ocfs2_find_rec(el, trunc_end);
+               /*
+                * Need to go to previous extent block.
+                */
+               if (i < 0) {
+                       if (path->p_tree_depth == 0)
+                               break;
+
+                       ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+                                                           path,
+                                                           &cluster_in_el);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
+
+                       /*
+                        * We've reached the leftmost extent block,
+                        * it's safe to leave.
+                        */
+                       if (cluster_in_el == 0)
+                               break;
+
+                       /*
+                        * The 'pos' searched for previous extent block is
+                        * always one cluster less than actual trunc_end.
+                        */
+                       trunc_end = cluster_in_el + 1;
+
+                       ocfs2_reinit_path(path, 1);
+
+                       continue;
+
+               } else
+                       rec = &el->l_recs[i];
+
+               ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+                                    &trunc_len, &trunc_end, &blkno, &done);
+               if (done)
+                       break;
+
+               flags = rec->e_flags;
+               phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+               ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+                                              phys_cpos, trunc_len, flags,
+                                              &dealloc, refcount_loc);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
                }
 
-               cpos += alloc_size;
-               trunc_len -= alloc_size;
+               cluster_in_el = trunc_end;
+
+               ocfs2_reinit_path(path, 1);
        }
 
        ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
index af18988..abb0a95 100644 (file)
@@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
        OCFS2_I(inode)->ip_last_used_slot = 0;
        OCFS2_I(inode)->ip_last_used_group = 0;
+
+       if (S_ISDIR(inode->i_mode))
+               ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+                                   OCFS2_RESV_FLAG_DIR);
        mlog_exit_void();
 }
 
@@ -539,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                                     struct buffer_head *fe_bh)
 {
        int status = 0;
-       struct ocfs2_truncate_context *tc = NULL;
        struct ocfs2_dinode *fe;
        handle_t *handle = NULL;
 
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                ocfs2_commit_trans(osb, handle);
                handle = NULL;
 
-               status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto out;
-               }
-
-               status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+               status = ocfs2_commit_truncate(osb, inode, fe_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto out;
@@ -659,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 
        di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
        di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
-
-       status = ocfs2_journal_dirty(handle, di_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail_commit;
-       }
+       ocfs2_journal_dirty(handle, di_bh);
 
        ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
        dquot_free_inode(inode);
@@ -980,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
 void ocfs2_delete_inode(struct inode *inode)
 {
        int wipe, status;
-       sigset_t blocked, oldset;
+       sigset_t oldset;
        struct buffer_head *di_bh = NULL;
 
        mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -1007,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
         * messaging paths may return us -ERESTARTSYS. Which would
         * cause us to exit early, resulting in inodes being orphaned
         * forever. */
-       sigfillset(&blocked);
-       status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-       if (status < 0) {
-               mlog_errno(status);
-               ocfs2_cleanup_delete_inode(inode, 1);
-               goto bail;
-       }
+       ocfs2_block_signals(&oldset);
 
        /*
         * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1087,9 +1073,7 @@ bail_unlock_nfs_sync:
        ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
 
 bail_unblock:
-       status = sigprocmask(SIG_SETMASK, &oldset, NULL);
-       if (status < 0)
-               mlog_errno(status);
+       ocfs2_unblock_signals(&oldset);
 bail:
        clear_inode(inode);
        mlog_exit_void();
@@ -1123,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
        ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
+       ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+                          &oi->ip_la_data_resv);
+       ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
        /* We very well may get a clear_inode before all an inodes
         * metadata has hit disk. Of course, we can't drop any cluster
         * locks until the journal has finished with it. The only
@@ -1298,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
        fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
        fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
-       status = ocfs2_journal_dirty(handle, bh);
-       if (status < 0)
-               mlog_errno(status);
-
-       status = 0;
+       ocfs2_journal_dirty(handle, bh);
 leave:
-
        mlog_exit(status);
        return status;
 }
index 0b28e19..9f5f5fc 100644 (file)
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
        /* Only valid if the inode is the dir. */
        u32                             ip_last_used_slot;
        u64                             ip_last_used_group;
+
+       struct ocfs2_alloc_reservation  ip_la_data_resv;
 };
 
 /*
index 9336c60..47878cf 100644 (file)
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 }
 
 /*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
  *
  * This might call jbd2_journal_restart() which will commit dirty buffers
  * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
  */
 int ocfs2_extend_trans(handle_t *handle, int nblocks)
 {
-       int status;
+       int status, old_nblocks;
 
        BUG_ON(!handle);
-       BUG_ON(!nblocks);
+       BUG_ON(nblocks < 0);
+
+       if (!nblocks)
+               return 0;
 
+       old_nblocks = handle->h_buffer_credits;
        mlog_entry_void();
 
        mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
                mlog(0,
                     "jbd2_journal_extend failed, trying "
                     "jbd2_journal_restart\n");
-               status = jbd2_journal_restart(handle, nblocks);
+               status = jbd2_journal_restart(handle,
+                                             old_nblocks + nblocks);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
        return __ocfs2_journal_access(handle, ci, bh, NULL, type);
 }
 
-int ocfs2_journal_dirty(handle_t *handle,
-                       struct buffer_head *bh)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 {
        int status;
 
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
                   (unsigned long long)bh->b_blocknr);
 
        status = jbd2_journal_dirty_metadata(handle, bh);
-       if (status < 0)
-               mlog(ML_ERROR, "Could not dirty metadata buffer. "
-                    "(bh->b_blocknr=%llu)\n",
-                    (unsigned long long)bh->b_blocknr);
+       BUG_ON(status);
 
-       mlog_exit(status);
-       return status;
+       mlog_exit_void();
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL  (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
index 3f74e09..b5baaa8 100644 (file)
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
  *     <modify the bh>
  *     ocfs2_journal_dirty(handle, bh);
  */
-int                  ocfs2_journal_dirty(handle_t *handle,
-                                        struct buffer_head *bh);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
 
 /*
  *  Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
        return blocks;
 }
 
+/*
+ * Extra credits for allocating a discontiguous block group, on top of
+ * ocfs2_calc_group_alloc_credits(): enough to fill the group descriptor's
+ * extent list.  The caller extends its running transaction by this amount.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+       int gd_recs = ocfs2_extent_recs_per_gd(sb);
+
+       return gd_recs;
+}
+
 static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
                                                unsigned int clusters_to_del,
                                                struct ocfs2_dinode *fe,
index c983715..3d74196 100644 (file)
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
                                             struct ocfs2_dinode *alloc,
-                                            u32 numbits);
+                                            u32 *numbits,
+                                            struct ocfs2_alloc_reservation *resv);
 
 static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
 
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
                                          struct inode *local_alloc_inode);
 
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ *   group descriptors for other allocations (such as block groups,
+ *   etc). Picking default sizes which are a multiple of 4 could help
+ *   - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ *   file systems. This can easily be taken care of by limiting our
+ *   default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ *   particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K   group: 126M     la: 121M
+ * csize: 8K   group: 252M     la: 243M
+ * csize: 16K  group: 504M     la: 486M
+ * csize: 32K  group: 1008M    la: 972M
+ * csize: 64K  group: 2016M    la: 1944M
+ * csize: 128K group: 4032M    la: 3888M
+ * csize: 256K group: 8064M    la: 7776M
+ * csize: 512K group: 16128M   la: 15552M
+ * csize: 1024K        group: 32256M   la: 31104M
+ */
+#define        OCFS2_LA_MAX_DEFAULT_MB 256
+#define        OCFS2_LA_OLD_DEFAULT    8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+       unsigned int la_mb;
+       unsigned int gd_mb;
+       unsigned int megs_per_slot;
+       struct super_block *sb = osb->sb;
+
+       gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+               8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+       /*
+        * This takes care of file systems with very small group
+        * descriptors - 512 byte blocksize at cluster sizes lower
+        * than 16K and also 1k blocksize with 4k cluster size.
+        */
+       if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+           || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+               return OCFS2_LA_OLD_DEFAULT;
+
+       /*
+        * Leave enough room for some block groups and round the final
+        * value we work from down to a multiple of 4 (low two bits clear).
+        */
+       gd_mb -= 16;
+       gd_mb &= 0xFFFFFFFC;
+
+       la_mb = gd_mb;
+
+       /*
+        * Keep window sizes down to a reasonable default
+        */
+       if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+               /*
+                * Some clustersize / blocksize combinations will have
+                * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+                * default size, but get poor distribution when
+                * limited to exactly 256 megabytes.
+                *
+                * As an example, 16K clustersize at 4K blocksize
+                * gives us a cluster group size of 504M. Paring the
+                * local alloc size down to 256 however, would give us
+                * only one window and around 200MB left in the
+                * cluster group. Instead, find the first size below
+                * 256 which would give us an even distribution.
+                *
+                * Larger cluster group sizes actually work out pretty
+                * well when pared to 256, so we don't have to do this
+                * for any group that fits more than two
+                * OCFS2_LA_MAX_DEFAULT_MB windows.
+                */
+               if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+                       la_mb = OCFS2_LA_MAX_DEFAULT_MB;
+               else {
+                       unsigned int gd_mult = gd_mb;
+
+                       while (gd_mult > OCFS2_LA_MAX_DEFAULT_MB)
+                               gd_mult = gd_mult >> 1;
+
+                       la_mb = gd_mult;
+               }
+       }
+
+       megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+       megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+       /* Too many nodes, too few disk clusters. */
+       if (megs_per_slot < la_mb)
+               la_mb = megs_per_slot;
+
+       return la_mb;
+}
+
+/*
+ * Choose the local alloc size from @requested_mb (-1 means no user
+ * request), capped at the maximum available, and store it in clusters.
+ */
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+       struct super_block *sb = osb->sb;
+       unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+       unsigned int la_max_mb;
+       unsigned int la_mb;
+
+       la_max_mb = ocfs2_clusters_to_megabytes(sb,
+                                               ocfs2_local_alloc_size(sb) * 8);
+
+       mlog(0, "requested: %dM, max: %uM, default: %uM\n",
+            requested_mb, la_max_mb, la_default_mb);
+
+       if (requested_mb == -1)
+               la_mb = la_default_mb;
+       else if (requested_mb > la_max_mb)
+               la_mb = la_max_mb;
+       else
+               la_mb = requested_mb;
+
+       osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, la_mb);
+       osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
        return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
                     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
                osb->local_alloc_bits =
                        ocfs2_megabytes_to_clusters(osb->sb,
-                                                   OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+                                                   ocfs2_la_default_mb(osb));
        }
 
        /* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
        osb->local_alloc_state = OCFS2_LA_DISABLED;
 
+       ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
        main_bm_inode = ocfs2_get_system_file_inode(osb,
                                                    GLOBAL_BITMAP_SYSTEM_INODE,
                                                    OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        }
 
        ocfs2_clear_local_alloc(alloc);
-
-       status = ocfs2_journal_dirty(handle, bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, bh);
 
        brelse(bh);
        osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
        return status;
 }
 
-/* Check to see if the local alloc window is within ac->ac_max_block */
-static int ocfs2_local_alloc_in_range(struct inode *inode,
-                                     struct ocfs2_alloc_context *ac,
-                                     u32 bits_wanted)
-{
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       struct ocfs2_dinode *alloc;
-       struct ocfs2_local_alloc *la;
-       int start;
-       u64 block_off;
-
-       if (!ac->ac_max_block)
-               return 1;
-
-       alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
-       la = OCFS2_LOCAL_ALLOC(alloc);
-
-       start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
-       if (start == -1) {
-               mlog_errno(-ENOSPC);
-               return 0;
-       }
-
-       /*
-        * Converting (bm_off + start + bits_wanted) to blocks gives us
-        * the blkno just past our actual allocation.  This is perfect
-        * to compare with ac_max_block.
-        */
-       block_off = ocfs2_clusters_to_blocks(inode->i_sb,
-                                            le32_to_cpu(la->la_bm_off) +
-                                            start + bits_wanted);
-       mlog(0, "Checking %llu against %llu\n",
-            (unsigned long long)block_off,
-            (unsigned long long)ac->ac_max_block);
-       if (block_off > ac->ac_max_block)
-               return 0;
-
-       return 1;
-}
-
 /*
  * make sure we've got at least bits_wanted contiguous bits in the
  * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
                mlog(0, "Calling in_range for max block %llu\n",
                     (unsigned long long)ac->ac_max_block);
 
-       if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
-                                       bits_wanted)) {
-               /*
-                * The window is outside ac->ac_max_block.
-                * This errno tells the caller to keep localalloc enabled
-                * but to get the allocation from the main bitmap.
-                */
-               status = -EFBIG;
-               goto bail;
-       }
-
        ac->ac_inode = local_alloc_inode;
        /* We should never use localalloc from another slot */
        ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
        alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
        la = OCFS2_LOCAL_ALLOC(alloc);
 
-       start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+       start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+                                                 ac->ac_resv);
        if (start == -1) {
                /* TODO: Shouldn't we just BUG here? */
                status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 
        bitmap = la->la_bitmap;
        *bit_off = le32_to_cpu(la->la_bm_off) + start;
-       /* local alloc is always contiguous by nature -- we never
-        * delete bits from it! */
        *num_bits = bits_wanted;
 
        status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
                goto bail;
        }
 
+       ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+                                 bits_wanted);
+
        while(bits_wanted--)
                ocfs2_set_bit(start++, bitmap);
 
        le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+       ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
-       status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
-
-       status = 0;
 bail:
        mlog_exit(status);
        return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 }
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
-                                            struct ocfs2_dinode *alloc,
-                                            u32 numbits)
+                                    struct ocfs2_dinode *alloc,
+                                    u32 *numbits,
+                                    struct ocfs2_alloc_reservation *resv)
 {
        int numfound, bitoff, left, startoff, lastzero;
+       int local_resv = 0;
+       struct ocfs2_alloc_reservation r;
        void *bitmap = NULL;
+       struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
-       mlog_entry("(numbits wanted = %u)\n", numbits);
+       mlog_entry("(numbits wanted = %u)\n", *numbits);
 
        if (!alloc->id1.bitmap1.i_total) {
                mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
                goto bail;
        }
 
+       if (!resv) {
+               local_resv = 1;
+               ocfs2_resv_init_once(&r);
+               ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+               resv = &r;
+       }
+
+       numfound = *numbits;
+       if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+               if (numfound < *numbits)
+                       *numbits = numfound;
+               goto bail;
+       }
+
+       /*
+        * Code error. While reservations are enabled, local
+        * allocation should _always_ go through them.
+        */
+       BUG_ON(osb->osb_resv_level != 0);
+
+       /*
+        * Reservations are disabled. Handle this the old way.
+        */
+
        bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
 
        numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
                        startoff = bitoff+1;
                }
                /* we got everything we needed */
-               if (numfound == numbits) {
+               if (numfound == *numbits) {
                        /* mlog(0, "Found it all!\n"); */
                        break;
                }
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
        mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
             numfound);
 
-       if (numfound == numbits)
+       if (numfound == *numbits)
                bitoff = startoff - numfound;
        else
                bitoff = -1;
 
 bail:
+       if (local_resv)
+               ocfs2_resv_discard(resmap, resv);
+
        mlog_exit(bitoff);
        return bitoff;
 }
@@ -1049,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
        /* we used the generic suballoc reserve function, but we set
         * everything up nicely, so there's no reason why we can't use
         * the more specific cluster api to claim bits. */
-       status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
+       status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
                                      &cluster_off, &cluster_count);
        if (status == -ENOSPC) {
 retry_enospc:
@@ -1063,7 +1175,7 @@ retry_enospc:
                        goto bail;
 
                ac->ac_bits_wanted = osb->local_alloc_default_bits;
-               status = ocfs2_claim_clusters(osb, handle, ac,
+               status = ocfs2_claim_clusters(handle, ac,
                                              osb->local_alloc_bits,
                                              &cluster_off,
                                              &cluster_count);
@@ -1098,6 +1210,9 @@ retry_enospc:
        memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
               le16_to_cpu(la->la_size));
 
+       ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+                            OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
        mlog(0, "New window allocated:\n");
        mlog(0, "window la_bm_off = %u\n",
             OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1169,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
        }
 
        ocfs2_clear_local_alloc(alloc);
-
-       status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
        status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
                                          main_bm_inode, main_bm_bh);
@@ -1192,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
        atomic_inc(&osb->alloc_stats.moves);
 
-       status = 0;
 bail:
        if (handle)
                ocfs2_commit_trans(osb, handle);
index ac5ea9f..1be9b58 100644 (file)
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
 
 void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
 
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
 int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
                                     int node_num,
                                     struct ocfs2_dinode **alloc_copy);
index 7898bd3..af2b8fe 100644 (file)
 #include "file.h"
 #include "inode.h"
 #include "mmap.h"
+#include "super.h"
 
-static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
-{
-       /* The best way to deal with signals in the vm path is
-        * to block them upfront, rather than allowing the
-        * locking paths to return -ERESTARTSYS. */
-       sigfillset(blocked);
-
-       /* We should technically never get a bad return value
-        * from sigprocmask */
-       return sigprocmask(SIG_BLOCK, blocked, oldset);
-}
-
-static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
-{
-       return sigprocmask(SIG_SETMASK, oldset, NULL);
-}
 
 static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
-       sigset_t blocked, oldset;
-       int error, ret;
+       sigset_t oldset;
+       int ret;
 
        mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
 
-       error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-       if (error < 0) {
-               mlog_errno(error);
-               ret = VM_FAULT_SIGBUS;
-               goto out;
-       }
-
+       ocfs2_block_signals(&oldset);
        ret = filemap_fault(area, vmf);
+       ocfs2_unblock_signals(&oldset);
 
-       error = ocfs2_vm_op_unblock_sigs(&oldset);
-       if (error < 0)
-               mlog_errno(error);
-out:
        mlog_exit_ptr(vmf->page);
        return ret;
 }
@@ -158,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        struct buffer_head *di_bh = NULL;
-       sigset_t blocked, oldset;
-       int ret, ret2;
+       sigset_t oldset;
+       int ret;
 
-       ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-       if (ret < 0) {
-               mlog_errno(ret);
-               return ret;
-       }
+       ocfs2_block_signals(&oldset);
 
        /*
         * The cluster locks taken will block a truncate from another
@@ -193,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        ocfs2_inode_unlock(inode, 1);
 
 out:
-       ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
-       if (ret2 < 0)
-               mlog_errno(ret2);
+       ocfs2_unblock_signals(&oldset);
        if (ret)
                ret = VM_FAULT_SIGBUS;
        return ret;
index 4cbb18f..db5dd3e 100644 (file)
@@ -239,6 +239,8 @@ static int ocfs2_mknod(struct inode *dir,
        };
        int did_quota_inode = 0;
        struct ocfs2_dir_lookup_result lookup = { NULL, };
+       sigset_t oldset;
+       int did_block_signals = 0;
 
        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
                   (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +352,10 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
 
+       /* Starting to change things, restart is no longer possible. */
+       ocfs2_block_signals(&oldset);
+       did_block_signals = 1;
+
        status = dquot_alloc_inode(inode);
        if (status)
                goto leave;
@@ -384,11 +390,7 @@ static int ocfs2_mknod(struct inode *dir,
                        goto leave;
                }
                ocfs2_add_links_count(dirfe, 1);
-               status = ocfs2_journal_dirty(handle, parent_fe_bh);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto leave;
-               }
+               ocfs2_journal_dirty(handle, parent_fe_bh);
                inc_nlink(dir);
        }
 
@@ -439,6 +441,8 @@ leave:
                ocfs2_commit_trans(osb, handle);
 
        ocfs2_inode_unlock(dir, 1);
+       if (did_block_signals)
+               ocfs2_unblock_signals(&oldset);
 
        if (status == -ENOSPC)
                mlog(0, "Disk is full\n");
@@ -487,14 +491,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        int status = 0;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_extent_list *fel;
-       u64 fe_blkno = 0;
+       u64 suballoc_loc, fe_blkno = 0;
        u16 suballoc_bit;
        u16 feat;
 
        *new_fe_bh = NULL;
 
-       status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
-                                      inode_ac, &suballoc_bit, &fe_blkno);
+       status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+                                      inode_ac, &suballoc_loc,
+                                      &suballoc_bit, &fe_blkno);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -531,6 +536,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_generation = cpu_to_le32(inode->i_generation);
        fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
        fe->i_blkno = cpu_to_le64(fe_blkno);
+       fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
        fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
        fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
        fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -567,11 +573,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
        }
 
-       status = ocfs2_journal_dirty(handle, *new_fe_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
+       ocfs2_journal_dirty(handle, *new_fe_bh);
 
        ocfs2_populate_inode(inode, fe, 1);
        ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -637,6 +639,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
        struct ocfs2_dir_lookup_result lookup = { NULL, };
+       sigset_t oldset;
 
        mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
                   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -693,6 +696,9 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_unlock_inode;
        }
 
+       /* Starting to change things, restart is no longer possible. */
+       ocfs2_block_signals(&oldset);
+
        err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (err < 0) {
@@ -705,14 +711,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        ocfs2_set_links_count(fe, inode->i_nlink);
        fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
        fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
-
-       err = ocfs2_journal_dirty(handle, fe_bh);
-       if (err < 0) {
-               ocfs2_add_links_count(fe, -1);
-               drop_nlink(inode);
-               mlog_errno(err);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, fe_bh);
 
        err = ocfs2_add_entry(handle, dentry, inode,
                              OCFS2_I(inode)->ip_blkno,
@@ -736,6 +735,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 out_commit:
        ocfs2_commit_trans(osb, handle);
+       ocfs2_unblock_signals(&oldset);
 out_unlock_inode:
        ocfs2_inode_unlock(inode, 1);
 
@@ -909,12 +909,7 @@ static int ocfs2_unlink(struct inode *dir,
                drop_nlink(inode);
        drop_nlink(inode);
        ocfs2_set_links_count(fe, inode->i_nlink);
-
-       status = ocfs2_journal_dirty(handle, fe_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
+       ocfs2_journal_dirty(handle, fe_bh);
 
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        if (S_ISDIR(inode->i_mode))
@@ -1332,12 +1327,7 @@ static int ocfs2_rename(struct inode *old_dir,
                        ocfs2_set_links_count(newfe, 0);
                else
                        ocfs2_add_links_count(newfe, -1);
-
-               status = ocfs2_journal_dirty(handle, newfe_bh);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
+               ocfs2_journal_dirty(handle, newfe_bh);
        } else {
                /* if the name was not found in new_dir, add it now */
                status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1356,10 +1346,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
                old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
                old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
-
-               status = ocfs2_journal_dirty(handle, old_inode_bh);
-               if (status < 0)
-                       mlog_errno(status);
+               ocfs2_journal_dirty(handle, old_inode_bh);
        } else
                mlog_errno(status);
 
@@ -1431,7 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
                                                         OCFS2_JOURNAL_ACCESS_WRITE);
                        fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
                        ocfs2_set_links_count(fe, old_dir->i_nlink);
-                       status = ocfs2_journal_dirty(handle, old_dir_bh);
+                       ocfs2_journal_dirty(handle, old_dir_bh);
                }
        }
        ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1563,11 +1550,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
                       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
                       bytes_left);
 
-               status = ocfs2_journal_dirty(handle, bhs[virtual]);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
+               ocfs2_journal_dirty(handle, bhs[virtual]);
 
                virtual++;
                p_blkno++;
@@ -1611,6 +1594,8 @@ static int ocfs2_symlink(struct inode *dir,
        };
        int did_quota = 0, did_quota_inode = 0;
        struct ocfs2_dir_lookup_result lookup = { NULL, };
+       sigset_t oldset;
+       int did_block_signals = 0;
 
        mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
                   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1706,6 +1691,10 @@ static int ocfs2_symlink(struct inode *dir,
                goto bail;
        }
 
+       /* Starting to change things, restart is no longer possible. */
+       ocfs2_block_signals(&oldset);
+       did_block_signals = 1;
+
        status = dquot_alloc_inode(inode);
        if (status)
                goto bail;
@@ -1814,6 +1803,8 @@ bail:
                ocfs2_commit_trans(osb, handle);
 
        ocfs2_inode_unlock(dir, 1);
+       if (did_block_signals)
+               ocfs2_unblock_signals(&oldset);
 
        brelse(new_fe_bh);
        brelse(parent_fe_bh);
@@ -1961,12 +1952,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        if (S_ISDIR(inode->i_mode))
                ocfs2_add_links_count(orphan_fe, 1);
        orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
-       status = ocfs2_journal_dirty(handle, orphan_dir_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
+       ocfs2_journal_dirty(handle, orphan_dir_bh);
 
        status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
                                   OCFS2_ORPHAN_NAMELEN, inode,
@@ -2065,12 +2051,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
        if (S_ISDIR(inode->i_mode))
                ocfs2_add_links_count(orphan_fe, -1);
        orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
-       status = ocfs2_journal_dirty(handle, orphan_dir_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
+       ocfs2_journal_dirty(handle, orphan_dir_bh);
 
 leave:
        ocfs2_free_dir_lookup_result(&lookup);
index adf5e2e..c67003b 100644 (file)
@@ -47,6 +47,7 @@
 /* For struct ocfs2_blockcheck_stats */
 #include "blockcheck.h"
 
+#include "reservations.h"
 
 /* Caching of metadata buffers */
 
@@ -341,6 +342,9 @@ struct ocfs2_super
         */
        unsigned int local_alloc_bits;
        unsigned int local_alloc_default_bits;
+       /* osb_clusters_at_boot can become stale! Do not trust it to
+        * be up to date. */
+       unsigned int osb_clusters_at_boot;
 
        enum ocfs2_local_alloc_state local_alloc_state; /* protected
                                                         * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
 
        u64 la_last_gd;
 
+       struct ocfs2_reservation_map    osb_la_resmap;
+
+       unsigned int    osb_resv_level;
+       unsigned int    osb_dir_resv_level;
+
        /* Next three fields are for local node slot recovery during
         * mount. */
        int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
        return 0;
 }
 
+static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
+{
+       if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+               return 1;
+       return 0;
+}
+
 static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
 {
        if (ocfs2_supports_indexed_dirs(osb))
@@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
        return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
 }
 
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+                                                      unsigned int clusters)
+{
+       return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
 static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
 {
        ext2_set_bit(bit, bitmap);
index bb37218..33f1c9a 100644 (file)
                                         | OCFS2_FEATURE_INCOMPAT_XATTR \
                                         | OCFS2_FEATURE_INCOMPAT_META_ECC \
                                         | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
-                                        | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+                                        | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
+                                        | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP   (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
                                         | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
                                         | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 /* Refcount tree support */
 #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE   0x1000
 
+/* Discontiguous block groups */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG    0x2000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
 #define OCFS2_MIN_JOURNAL_SIZE         (4 * 1024 * 1024)
 
 /*
- * Default local alloc size (in megabytes)
- *
- * The value chosen should be such that most allocations, including new
- * block groups, use local alloc.
- */
-#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
-
-/*
  * Inline extended attribute size (in bytes)
  * The value chosen should be aligned to 16 byte boundaries.
  */
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
                                           block group */
        __le32 h_fs_generation;         /* Must match super block */
        __le64 h_blkno;                 /* Offset on disk, in blocks */
-/*20*/ __le64 h_reserved3;
+/*20*/ __le64 h_suballoc_loc;          /* Suballocator block group this
+                                          eb belongs to.  Only valid
+                                          if allocated from a
+                                          discontiguous block group */
        __le64 h_next_leaf_blk;         /* Offset on disk, in blocks,
                                           of next leaf header pointing
                                           to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
 /*80*/ struct ocfs2_block_check i_check;       /* Error checking */
 /*88*/ __le64 i_dx_root;               /* Pointer to dir index root block */
 /*90*/ __le64 i_refcount_loc;
-       __le64 i_reserved2[4];
+       __le64 i_suballoc_loc;          /* Suballocator block group this
+                                          inode belongs to.  Only valid
+                                          if allocated from a
+                                          discontiguous block group */
+/*A0*/ __le64 i_reserved2[3];
 /*B8*/ union {
                __le64 i_pad1;          /* Generic way to refer to this
                                           64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
        __le32          dr_reserved2;
        __le64          dr_free_blk;            /* Pointer to head of free
                                                 * unindexed block list. */
-       __le64          dr_reserved3[15];
+       __le64          dr_suballoc_loc;        /* Suballocator block group
+                                                  this root belongs to.
+                                                  Only valid if allocated
+                                                  from a discontiguous
+                                                  block group */
+       __le64          dr_reserved3[14];
        union {
                struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
                                                   * bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
 };
 
 /*
+ * Largest bitmap for a block (suballocator) group in bytes.  This limit
+ * does not affect cluster groups (global allocator).  Cluster group
+ * bitmaps run to the end of the block.
+ */
+#define OCFS2_MAX_BG_BITMAP_SIZE       256
+
+/*
  * On disk allocator group structure for OCFS2
  */
 struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
        __le64   bg_blkno;               /* Offset on disk, in blocks */
 /*30*/ struct ocfs2_block_check bg_check;      /* Error checking */
        __le64   bg_reserved2;
-/*40*/ __u8    bg_bitmap[0];
+/*40*/ union {
+               __u8    bg_bitmap[0];
+               struct {
+                       /*
+                        * Block groups may be discontiguous when
+                        * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
+                        * The extents of a discontiguous block group are
+                        * stored in bg_list.  It is a flat list.
+                        * l_tree_depth must always be zero.  A
+                        * discontiguous group is signified by a non-zero
+                        * bg_list->l_next_free_rec.  Only block groups
+                        * can be discontiguous; Cluster groups cannot.
+                        * We've never made a block group with more than
+                        * 2048 blocks (256 bytes of bg_bitmap).  This
+                        * codifies that limit so that we can fit bg_list.
+                        * bg_size of a discontiguous block group will
+                        * be 256 to match bg_bitmap_filler.
+                        */
+                       __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
+/*140*/                        struct ocfs2_extent_list bg_list;
+               };
+       };
+/* Actual on-disk size is one block */
 };
 
 struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
 /*40*/ __le32 rf_generation;           /* generation number. all be the same
                                         * for the same refcount tree. */
        __le32 rf_reserved0;
-       __le64 rf_reserved1[7];
+       __le64 rf_suballoc_loc;         /* Suballocator block group this
+                                          refcount block belongs to. Only
+                                          valid if allocated from a
+                                          discontiguous block group */
+/*50*/ __le64 rf_reserved1[6];
 /*80*/ union {
                struct ocfs2_refcount_list rf_records;  /* List of refcount
                                                          records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
                                        real xattr or a xattr tree. */
        __le16  xb_reserved0;
        __le32  xb_reserved1;
-       __le64  xb_reserved2;
+       __le64  xb_suballoc_loc;        /* Suballocator block group this
+                                          xattr block belongs to. Only
+                                          valid if allocated from a
+                                          discontiguous block group */
 /*30*/ union {
                struct ocfs2_xattr_header xb_header; /* xattr header if this
                                                        block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
        return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
+{
+       int size;
+
+       size = sb->s_blocksize -
+               offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+       return size / sizeof(struct ocfs2_extent_rec);
+}
+
 static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
 {
        int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
        return size;
 }
 
-static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+static inline int ocfs2_group_bitmap_size(struct super_block *sb,
+                                         int suballocator,
+                                         u32 feature_incompat)
 {
-       int size;
-
-       size = sb->s_blocksize -
+       int size = sb->s_blocksize -
                offsetof(struct ocfs2_group_desc, bg_bitmap);
 
+       /*
+        * The cluster allocator uses the entire block.  Suballocators have
+        * never used more than OCFS2_MAX_BG_BITMAP_SIZE.  Unfortunately, older
+        * code expects bg_size set to the maximum.  Thus we must keep
+        * bg_size as-is unless discontig_bg is enabled.
+        */
+       if (suballocator &&
+           (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+               size = OCFS2_MAX_BG_BITMAP_SIZE;
+
        return size;
 }
 
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
        return size / sizeof(struct ocfs2_extent_rec);
 }
 
-static inline int ocfs2_local_alloc_size(int blocksize)
+static inline int ocfs2_extent_recs_per_gd(int blocksize)
 {
        int size;
 
        size = blocksize -
-               offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+               offsetof(struct ocfs2_group_desc, bg_list.l_recs);
 
-       return size;
+       return size / sizeof(struct ocfs2_extent_rec);
 }
 
-static inline int ocfs2_group_bitmap_size(int blocksize)
+static inline int ocfs2_local_alloc_size(int blocksize)
 {
        int size;
 
        size = blocksize -
+               offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+       return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize,
+                                         int suballocator,
+                                         uint32_t feature_incompat)
+{
+       int size = blocksize -
                offsetof(struct ocfs2_group_desc, bg_bitmap);
 
+       /*
+        * The cluster allocator uses the entire block.  Suballocators have
+        * never used more than OCFS2_MAX_BG_BITMAP_SIZE.  Unfortunately, older
+        * code expects bg_size set to the maximum.  Thus we must keep
+        * bg_size as-is unless discontig_bg is enabled.
+        */
+       if (suballocator &&
+           (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+               size = OCFS2_MAX_BG_BITMAP_SIZE;
+
        return size;
 }
 
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
        de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
+static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+{
+       if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
+            le16_to_cpu(gd->bg_size)) !=
+           offsetof(struct ocfs2_group_desc, bg_list))
+               return 0;
+       /*
+        * Only valid to check l_next_free_rec if
+        * bg_bitmap + bg_size == bg_list.
+        */
+       if (!gd->bg_list.l_next_free_rec)
+               return 0;
+       return 1;
+}
 #endif  /* _OCFS2_FS_H */
 
index ab42a74..04ae76d 100644 (file)
@@ -261,10 +261,8 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
                brelse(bh);
                goto out;
        }
-       err = ocfs2_journal_dirty(handle, bh);
+       ocfs2_journal_dirty(handle, bh);
        brelse(bh);
-       if (err < 0)
-               goto out;
 out:
        if (err) {
                mutex_unlock(&gqinode->i_mutex);
index 9ad4930..884b641 100644 (file)
@@ -119,12 +119,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
        lock_buffer(bh);
        modify(bh, private);
        unlock_buffer(bh);
-       status = ocfs2_journal_dirty(handle, bh);
-       if (status < 0) {
-               mlog_errno(status);
-               ocfs2_commit_trans(OCFS2_SB(sb), handle);
-               return status;
-       }
+       ocfs2_journal_dirty(handle, bh);
+
        status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
        if (status < 0) {
                mlog_errno(status);
@@ -523,9 +519,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
                        ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
                        le32_add_cpu(&dchunk->dqc_free, 1);
                        unlock_buffer(qbh);
-                       status = ocfs2_journal_dirty(handle, qbh);
-                       if (status < 0)
-                               mlog_errno(status);
+                       ocfs2_journal_dirty(handle, qbh);
 out_commit:
                        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
                        ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -631,9 +625,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
                lock_buffer(bh);
                ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
                unlock_buffer(bh);
-               status = ocfs2_journal_dirty(handle, bh);
-               if (status < 0)
-                       mlog_errno(status);
+               ocfs2_journal_dirty(handle, bh);
 out_trans:
                ocfs2_commit_trans(osb, handle);
 out_bh:
@@ -1009,11 +1001,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
               sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
               OCFS2_QBLK_RESERVED_SPACE);
        unlock_buffer(bh);
-       status = ocfs2_journal_dirty(handle, bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_trans;
-       }
+       ocfs2_journal_dirty(handle, bh);
 
        /* Initialize new block with structures */
        down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
@@ -1040,11 +1028,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
        lock_buffer(dbh);
        memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
        unlock_buffer(dbh);
-       status = ocfs2_journal_dirty(handle, dbh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_trans;
-       }
+       ocfs2_journal_dirty(handle, dbh);
 
        /* Update local quotafile info */
        oinfo->dqi_blocks += 2;
@@ -1155,11 +1139,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
        lock_buffer(bh);
        memset(bh->b_data, 0, sb->s_blocksize);
        unlock_buffer(bh);
-       status = ocfs2_journal_dirty(handle, bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_trans;
-       }
+       ocfs2_journal_dirty(handle, bh);
+
        /* Update chunk header */
        status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
                                         chunk->qc_headerbh,
@@ -1173,11 +1154,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
        lock_buffer(chunk->qc_headerbh);
        le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
        unlock_buffer(chunk->qc_headerbh);
-       status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_trans;
-       }
+       ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+
        /* Update file header */
        oinfo->dqi_blocks++;
        status = ocfs2_local_write_info(sb, type);
@@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
        ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
        le32_add_cpu(&dchunk->dqc_free, 1);
        unlock_buffer(od->dq_chunk->qc_headerbh);
-       status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out;
-       }
-       status = 0;
+       ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+
 out:
        /* Clear the read bit so that next time someone uses this
         * dquot he reads fresh info from disk and allocates local
index 5cbcd0f..4793f36 100644 (file)
@@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
        struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
        u16 suballoc_bit_start;
        u32 num_got;
-       u64 first_blkno;
+       u64 suballoc_loc, first_blkno;
 
        BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
 
@@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
                goto out_commit;
        }
 
-       ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+       ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
                                   &suballoc_bit_start, &num_got,
                                   &first_blkno);
        if (ret) {
@@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
        memset(rb, 0, inode->i_sb->s_blocksize);
        strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
        rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+       rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
        rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
        rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
        rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
        if (le32_to_cpu(rb->rf_count) == 1) {
                blk = le64_to_cpu(rb->rf_blkno);
                bit = le16_to_cpu(rb->rf_suballoc_bit);
-               bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+               if (rb->rf_suballoc_loc)
+                       bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
+               else
+                       bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
                alloc_inode = ocfs2_get_system_file_inode(osb,
                                        EXTENT_ALLOC_SYSTEM_INODE,
@@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
        } else if (merge)
                ocfs2_refcount_rec_merge(rb, index);
 
-       ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, ref_leaf_bh);
 out:
        return ret;
 }
@@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
        int ret;
        u16 suballoc_bit_start;
        u32 num_got;
-       u64 blkno;
+       u64 suballoc_loc, blkno;
        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
        struct buffer_head *new_bh = NULL;
        struct ocfs2_refcount_block *new_rb;
@@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
                goto out;
        }
 
-       ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+       ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
                                   &suballoc_bit_start, &num_got,
                                   &blkno);
        if (ret) {
@@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 
        new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
        new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+       new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
        new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
        new_rb->rf_blkno = cpu_to_le64(blkno);
        new_rb->rf_cpos = cpu_to_le32(0);
@@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
        int ret;
        u16 suballoc_bit_start;
        u32 num_got, new_cpos;
-       u64 blkno;
+       u64 suballoc_loc, blkno;
        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
        struct ocfs2_refcount_block *root_rb =
                        (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
                goto out;
        }
 
-       ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+       ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
                                   &suballoc_bit_start, &num_got,
                                   &blkno);
        if (ret) {
@@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
        memset(new_rb, 0, sb->s_blocksize);
        strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
        new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+       new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
        new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
        new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
        new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
         * 2 more credits, one for the leaf refcount block, one for
         * the extent block contains the extent rec.
         */
-       ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
+       ret = ocfs2_extend_trans(handle, 2);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
        if (merge)
                ocfs2_refcount_rec_merge(rb, index);
 
-       ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
+       ocfs2_journal_dirty(handle, ref_leaf_bh);
 
        if (index == 0) {
                ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
                        ocfs2_refcount_rec_merge(rb, index);
        }
 
-       ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
-       if (ret)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, ref_leaf_bh);
 
 out:
        brelse(new_bh);
@@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
         */
        ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
                                        le16_to_cpu(rb->rf_suballoc_slot),
+                                       le64_to_cpu(rb->rf_suballoc_loc),
                                        le64_to_cpu(rb->rf_blkno),
                                        le16_to_cpu(rb->rf_suballoc_bit));
        if (ret) {
@@ -2516,20 +2515,19 @@ out:
  *
  * Normally the refcount blocks store these refcount should be
  * contiguous also, so that we can get the number easily.
- * As for meta_ac, we will at most add split 2 refcount record and
- * 2 more refcount block, so just check it in a rough way.
+ * We will at most add split 2 refcount records and 2 more
+ * refcount blocks, so just check it in a rough way.
  *
  * Caller must hold refcount tree lock.
  */
 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
-                                         struct buffer_head *di_bh,
+                                         u64 refcount_loc,
                                          u64 phys_blkno,
                                          u32 clusters,
                                          int *credits,
-                                         struct ocfs2_alloc_context **meta_ac)
+                                         int *ref_blocks)
 {
-       int ret, ref_blocks = 0;
-       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       int ret;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *tree;
@@ -2546,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
        BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 
        ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
-                                     le64_to_cpu(di->i_refcount_loc), &tree);
+                                     refcount_loc, &tree);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
-       ret = ocfs2_read_refcount_block(&tree->rf_ci,
-                                       le64_to_cpu(di->i_refcount_loc),
+       ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
                                        &ref_root_bh);
        if (ret) {
                mlog_errno(ret);
@@ -2564,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
                                               &tree->rf_ci,
                                               ref_root_bh,
                                               start_cpos, clusters,
-                                              &ref_blocks, credits);
+                                              ref_blocks, credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
-       mlog(0, "reserve new metadata %d, credits = %d\n",
-            ref_blocks, *credits);
-
-       if (ref_blocks) {
-               ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
-                                                       ref_blocks, meta_ac);
-               if (ret)
-                       mlog_errno(ret);
-       }
+       mlog(0, "reserve new metadata %d blocks, credits = %d\n",
+            *ref_blocks, *credits);
 
 out:
        brelse(ref_root_bh);
@@ -3040,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
                }
 
                memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
-               ret = ocfs2_journal_dirty(handle, new_bh);
-               if (ret) {
-                       mlog_errno(ret);
-                       break;
-               }
+               ocfs2_journal_dirty(handle, new_bh);
 
                brelse(new_bh);
                brelse(old_bh);
@@ -3282,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
                } else {
                        delete = 1;
 
-                       ret = __ocfs2_claim_clusters(osb, handle,
+                       ret = __ocfs2_claim_clusters(handle,
                                                     context->data_ac,
                                                     1, set_len,
                                                     &new_bit, &new_len);
index c1d19b1..9983ba1 100644 (file)
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
                            struct ocfs2_cached_dealloc_ctxt *dealloc,
                            int delete);
 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
-                                         struct buffer_head *di_bh,
+                                         u64 refcount_loc,
                                          u64 phys_blkno,
                                          u32 clusters,
                                          int *credits,
-                                         struct ocfs2_alloc_context **meta_ac);
+                                         int *ref_blocks);
 int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
                       u32 cpos, u32 write_len, u32 max_cpos);
 
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644 (file)
index 0000000..4065002
--- /dev/null
@@ -0,0 +1,847 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.c
+ *
+ * Allocation reservations implementation
+ *
+ * Some code borrowed from fs/ext3/balloc.c and is:
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * The rest is copyright (C) 2010 Novell.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+
+#define MLOG_MASK_PREFIX ML_RESERVATIONS
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+#define OCFS2_CHECK_RESERVATIONS
+#endif
+
+DEFINE_SPINLOCK(resv_lock);
+
+#define        OCFS2_MIN_RESV_WINDOW_BITS      8
+#define        OCFS2_MAX_RESV_WINDOW_BITS      1024
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+       return (osb->osb_resv_level && osb->osb_dir_resv_level);
+}
+
+static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
+                                          struct ocfs2_alloc_reservation *resv)
+{
+       struct ocfs2_super *osb = resmap->m_osb;
+       unsigned int bits;
+
+       if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
+               /* 8, 16, 32, 64, 128, 256, 512, 1024 */
+               bits = 4 << osb->osb_resv_level;
+       } else {
+               bits = 4 << osb->osb_dir_resv_level;
+       }
+       return bits;
+}
+
+static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
+{
+       if (resv->r_len)
+               return resv->r_start + resv->r_len - 1;
+       return resv->r_start;
+}
+
+static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
+{
+       return !!(resv->r_len == 0);
+}
+
+static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
+{
+       if (resmap->m_osb->osb_resv_level == 0)
+               return 1;
+       return 0;
+}
+
+static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
+{
+       struct ocfs2_super *osb = resmap->m_osb;
+       struct rb_node *node;
+       struct ocfs2_alloc_reservation *resv;
+       int i = 0;
+
+       mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
+            osb->dev_str, resmap->m_bitmap_len);
+
+       node = rb_first(&resmap->m_reservations);
+       while (node) {
+               resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+               mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
+                    "\tlast_len: %u\n", resv->r_start,
+                    ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+                    resv->r_last_len);
+
+               node = rb_next(node);
+               i++;
+       }
+
+       mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
+
+       i = 0;
+       list_for_each_entry(resv, &resmap->m_lru, r_lru) {
+               mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
+                    "last_start: %u\tlast_len: %u\n", i, resv->r_start,
+                    ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+                    resv->r_last_len);
+
+               i++;
+       }
+}
+
+#ifdef OCFS2_CHECK_RESERVATIONS
+static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
+                                     int i,
+                                     struct ocfs2_alloc_reservation *resv)
+{
+       char *disk_bitmap = resmap->m_disk_bitmap;
+       unsigned int start = resv->r_start;
+       unsigned int end = ocfs2_resv_end(resv);
+
+       while (start <= end) {
+               if (ocfs2_test_bit(start, disk_bitmap)) {
+                       mlog(ML_ERROR,
+                            "reservation %d covers an allocated area "
+                            "starting at bit %u!\n", i, start);
+                       return 1;
+               }
+
+               start++;
+       }
+       return 0;
+}
+
+static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+       unsigned int off = 0;
+       int i = 0;
+       struct rb_node *node;
+       struct ocfs2_alloc_reservation *resv;
+
+       node = rb_first(&resmap->m_reservations);
+       while (node) {
+               resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+               if (i > 0 && resv->r_start <= off) {
+                       mlog(ML_ERROR, "reservation %d has bad start off!\n",
+                            i);
+                       goto bad;
+               }
+
+               if (resv->r_len == 0) {
+                       mlog(ML_ERROR, "reservation %d has no length!\n",
+                            i);
+                       goto bad;
+               }
+
+               if (resv->r_start > ocfs2_resv_end(resv)) {
+                       mlog(ML_ERROR, "reservation %d has invalid range!\n",
+                            i);
+                       goto bad;
+               }
+
+               if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
+                       mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
+                            i);
+                       goto bad;
+               }
+
+               if (ocfs2_validate_resmap_bits(resmap, i, resv))
+                       goto bad;
+
+               off = ocfs2_resv_end(resv);
+               node = rb_next(node);
+
+               i++;
+       }
+       return;
+
+bad:
+       ocfs2_dump_resv(resmap);
+       BUG();
+}
+#else
+static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+
+}
+#endif
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
+{
+       memset(resv, 0, sizeof(*resv));
+       INIT_LIST_HEAD(&resv->r_lru);
+}
+
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+                        unsigned int flags)
+{
+       BUG_ON(flags & ~OCFS2_RESV_TYPES);
+
+       resv->r_flags |= flags;
+}
+
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+                     struct ocfs2_reservation_map *resmap)
+{
+       memset(resmap, 0, sizeof(*resmap));
+
+       resmap->m_osb = osb;
+       resmap->m_reservations = RB_ROOT;
+       /* m_bitmap_len is initialized to zero by the above memset. */
+       INIT_LIST_HEAD(&resmap->m_lru);
+
+       return 0;
+}
+
+static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
+                               struct ocfs2_alloc_reservation *resv)
+{
+       assert_spin_locked(&resv_lock);
+
+       if (!list_empty(&resv->r_lru))
+               list_del_init(&resv->r_lru);
+
+       list_add_tail(&resv->r_lru, &resmap->m_lru);
+}
+
+static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
+{
+       resv->r_len = 0;
+       resv->r_start = 0;
+}
+
+static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
+                             struct ocfs2_alloc_reservation *resv)
+{
+       if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
+               list_del_init(&resv->r_lru);
+               rb_erase(&resv->r_node, &resmap->m_reservations);
+               resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
+       }
+}
+
+static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+                                struct ocfs2_alloc_reservation *resv)
+{
+       assert_spin_locked(&resv_lock);
+
+       __ocfs2_resv_trunc(resv);
+       /*
+        * last_len and last_start no longer make sense if
+        * we're changing the range of our allocations.
+        */
+       resv->r_last_len = resv->r_last_start = 0;
+
+       ocfs2_resv_remove(resmap, resv);
+}
+
+/* does nothing if 'resv' is null */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+                       struct ocfs2_alloc_reservation *resv)
+{
+       if (resv) {
+               spin_lock(&resv_lock);
+               __ocfs2_resv_discard(resmap, resv);
+               spin_unlock(&resv_lock);
+       }
+}
+
+static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
+{
+       struct rb_node *node;
+       struct ocfs2_alloc_reservation *resv;
+
+       assert_spin_locked(&resv_lock);
+
+       while ((node = rb_last(&resmap->m_reservations)) != NULL) {
+               resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+               __ocfs2_resv_discard(resmap, resv);
+       }
+}
+
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+                         unsigned int clen, char *disk_bitmap)
+{
+       if (ocfs2_resmap_disabled(resmap))
+               return;
+
+       spin_lock(&resv_lock);
+
+       ocfs2_resmap_clear_all_resv(resmap);
+       resmap->m_bitmap_len = clen;
+       resmap->m_disk_bitmap = disk_bitmap;
+
+       spin_unlock(&resv_lock);
+}
+
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
+{
+       /* Does nothing for now. Keep this around for API symmetry */
+}
+
+static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
+                             struct ocfs2_alloc_reservation *new)
+{
+       struct rb_root *root = &resmap->m_reservations;
+       struct rb_node *parent = NULL;
+       struct rb_node **p = &root->rb_node;
+       struct ocfs2_alloc_reservation *tmp;
+
+       assert_spin_locked(&resv_lock);
+
+       mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
+            new->r_len);
+
+       while (*p) {
+               parent = *p;
+
+               tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
+
+               if (new->r_start < tmp->r_start) {
+                       p = &(*p)->rb_left;
+
+                       /*
+                        * This is a good place to check for
+                        * overlapping reservations.
+                        */
+                       BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
+               } else if (new->r_start > ocfs2_resv_end(tmp)) {
+                       p = &(*p)->rb_right;
+               } else {
+                       /* This should never happen! */
+                       mlog(ML_ERROR, "Duplicate reservation window!\n");
+                       BUG();
+               }
+       }
+
+       rb_link_node(&new->r_node, parent, p);
+       rb_insert_color(&new->r_node, root);
+       new->r_flags |= OCFS2_RESV_FLAG_INUSE;
+
+       ocfs2_resv_mark_lru(resmap, new);
+
+       ocfs2_check_resmap(resmap);
+}
+
+/**
+ * ocfs2_find_resv_lhs() - find the window which contains goal
+ * @resmap: reservation map to search
+ * @goal: which bit to search for
+ *
+ * If a window containing that goal is not found, we return the window
+ * which comes before goal. Returns NULL on empty rbtree or no window
+ * before goal.
+ */
+static struct ocfs2_alloc_reservation *
+ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
+{
+       struct ocfs2_alloc_reservation *resv = NULL;
+       struct ocfs2_alloc_reservation *prev_resv = NULL;
+       struct rb_node *node = resmap->m_reservations.rb_node;
+
+       assert_spin_locked(&resv_lock);
+
+       if (!node)
+               return NULL;
+
+       node = rb_first(&resmap->m_reservations);
+       while (node) {
+               resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+               if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
+                       break;
+
+               /* Check if we overshot the reservation just before goal? */
+               if (resv->r_start > goal) {
+                       resv = prev_resv;
+                       break;
+               }
+
+               prev_resv = resv;
+               node = rb_next(node);
+       }
+
+       return resv;
+}
+
+/*
+ * We are given a range within the bitmap, which corresponds to a gap
+ * inside the reservations tree (search_start, search_len). The range
+ * can be anything from the whole bitmap, to a gap between
+ * reservations.
+ *
+ * The start value of *rstart is insignificant.
+ *
+ * This function searches the bitmap range starting at search_start
+ * with length search_len for a set of contiguous free bits. We try
+ * to find up to 'wanted' bits, but can sometimes return less.
+ *
+ * Returns the length of allocation, 0 if no free bits are found.
+ *
+ * On success, *rstart and *rlen are populated with the result.
+ */
+static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
+                                      unsigned int wanted,
+                                      unsigned int search_start,
+                                      unsigned int search_len,
+                                      unsigned int *rstart,
+                                      unsigned int *rlen)
+{
+       void *bitmap = resmap->m_disk_bitmap;
+       unsigned int best_start, best_len = 0;
+       int offset, start, found;
+
+       mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
+            wanted, search_start, search_len, resmap->m_bitmap_len);
+
+       found = best_start = best_len = 0;
+
+       start = search_start;
+       while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
+                                                start)) != -1) {
+               /* Search reached end of the region */
+               if (offset >= (search_start + search_len))
+                       break;
+
+               if (offset == start) {
+                       /* we found a zero */
+                       found++;
+                       /* move start to the next bit to test */
+                       start++;
+               } else {
+                       /* got a zero after some ones */
+                       found = 1;
+                       start = offset + 1;
+               }
+               if (found > best_len) {
+                       best_len = found;
+                       best_start = start - found;
+               }
+
+               if (found >= wanted)
+                       break;
+       }
+
+       if (best_len == 0)
+               return 0;
+
+       if (best_len >= wanted)
+               best_len = wanted;
+
+       *rlen = best_len;
+       *rstart = best_start;
+
+       mlog(0, "Found start: %u len: %u\n", best_start, best_len);
+
+       return *rlen;
+}
+
+static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+                                    struct ocfs2_alloc_reservation *resv,
+                                    unsigned int goal, unsigned int wanted)
+{
+       struct rb_root *root = &resmap->m_reservations;
+       unsigned int gap_start, gap_end, gap_len;
+       struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+       struct rb_node *prev, *next;
+       unsigned int cstart, clen;
+       unsigned int best_start = 0, best_len = 0;
+
+       /*
+        * Nasty cases to consider:
+        *
+        * - rbtree is empty
+        * - our window should be first in all reservations
+        * - our window should be last in all reservations
+        * - need to make sure we don't go past end of bitmap
+        */
+
+       mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
+            resv->r_start, ocfs2_resv_end(resv), goal, wanted);
+
+       assert_spin_locked(&resv_lock);
+
+       if (RB_EMPTY_ROOT(root)) {
+               /*
+                * Easiest case - empty tree. We can just take
+                * whatever window of free bits we want.
+                */
+
+               mlog(0, "Empty root\n");
+
+               clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+                                                  resmap->m_bitmap_len - goal,
+                                                  &cstart, &clen);
+
+               /*
+                * This should never happen - the local alloc window
+                * will always have free bits when we're called.
+                */
+               BUG_ON(goal == 0 && clen == 0);
+
+               if (clen == 0)
+                       return;
+
+               resv->r_start = cstart;
+               resv->r_len = clen;
+
+               ocfs2_resv_insert(resmap, resv);
+               return;
+       }
+
+       prev_resv = ocfs2_find_resv_lhs(resmap, goal);
+
+       if (prev_resv == NULL) {
+               mlog(0, "Goal on LHS of leftmost window\n");
+
+               /*
+                * A NULL here means that the search code couldn't
+                * find a window that starts before goal.
+                *
+                * However, we can take the first window after goal,
+                * which is also by definition, the leftmost window in
+                * the entire tree. If we can find free bits in the
+                * gap between goal and the LHS window, then the
+                * reservation can safely be placed there.
+                *
+                * Otherwise we fall back to a linear search, checking
+                * the gaps in between windows for a place to
+                * allocate.
+                */
+
+               next = rb_first(root);
+               next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
+                                    r_node);
+
+               /*
+                * The search should never return such a window (see
+                * comment above).
+                */
+               if (next_resv->r_start <= goal) {
+                       mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
+                            goal, next_resv->r_start, next_resv->r_len);
+                       ocfs2_dump_resv(resmap);
+                       BUG();
+               }
+
+               clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+                                                  next_resv->r_start - goal,
+                                                  &cstart, &clen);
+               if (clen) {
+                       best_len = clen;
+                       best_start = cstart;
+                       if (best_len == wanted)
+                               goto out_insert;
+               }
+
+               prev_resv = next_resv;
+               next_resv = NULL;
+       }
+
+       prev = &prev_resv->r_node;
+
+       /* Now we do a linear search for a window, starting at 'prev_resv' */
+       while (1) {
+               next = rb_next(prev);
+               if (next) {
+                       mlog(0, "One more resv found in linear search\n");
+                       next_resv = rb_entry(next,
+                                            struct ocfs2_alloc_reservation,
+                                            r_node);
+
+                       gap_start = ocfs2_resv_end(prev_resv) + 1;
+                       gap_end = next_resv->r_start - 1;
+                       gap_len = gap_end - gap_start + 1;
+               } else {
+                       mlog(0, "No next node\n");
+                       /*
+                        * We're at the rightmost edge of the
+                        * tree. See if a reservation between this
+                        * window and the end of the bitmap will work.
+                        */
+                       gap_start = ocfs2_resv_end(prev_resv) + 1;
+                       gap_len = resmap->m_bitmap_len - gap_start;
+                       gap_end = resmap->m_bitmap_len - 1;
+               }
+
+               /*
+                * No need to check this gap if we have already found
+                * a larger region of free bits.
+                */
+               if (gap_len <= best_len)
+                       goto next_resv;
+
+               clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
+                                                  gap_len, &cstart, &clen);
+               if (clen == wanted) {
+                       best_len = clen;
+                       best_start = cstart;
+                       goto out_insert;
+               } else if (clen > best_len) {
+                       best_len = clen;
+                       best_start = cstart;
+               }
+
+next_resv:
+               if (!next)
+                       break;
+
+               prev = next;
+               prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
+                                    r_node);
+       }
+
+out_insert:
+       if (best_len) {
+               resv->r_start = best_start;
+               resv->r_len = best_len;
+               ocfs2_resv_insert(resmap, resv);
+       }
+}
+
+/*
+ * Build a window for @resv out of the reservation at the head of
+ * resmap->m_lru.
+ *
+ * If that reservation is no longer than min_bits it is discarded and
+ * @resv takes over its whole range; otherwise it is shrunk from its
+ * end and @resv is given the bits that were cut off. @resv is then
+ * inserted into the map with ocfs2_resv_insert().
+ *
+ * NOTE(review): list_first_entry() is used without checking that
+ * m_lru is non-empty - presumably callers guarantee at least one
+ * other reservation exists; confirm.
+ */
+static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
+                                  struct ocfs2_alloc_reservation *resv,
+                                  unsigned int wanted)
+{
+       struct ocfs2_alloc_reservation *lru_resv;
+       int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
+       unsigned int min_bits;
+
+       if (!tmpwindow)
+               min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
+       else
+               min_bits = wanted; /* We know the temp window will use all
+                                   * of these bits */
+
+       /*
+        * Take the first reservation off the LRU as our 'target'. We
+        * don't try to be smart about it. There might be a case for
+        * searching based on size but I don't have enough data to be
+        * sure. --Mark (3/16/2010)
+        */
+       lru_resv = list_first_entry(&resmap->m_lru,
+                                   struct ocfs2_alloc_reservation, r_lru);
+
+       mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
+            lru_resv->r_len, ocfs2_resv_end(lru_resv));
+
+       /*
+        * Cannibalize (some or all) of the target reservation and
+        * feed it to the current window.
+        */
+       if (lru_resv->r_len <= min_bits) {
+               /*
+                * Discard completely if size is less than or equal to a
+                * reasonable threshold - 50% of window bits for non temporary
+                * windows.
+                */
+               resv->r_start = lru_resv->r_start;
+               resv->r_len = lru_resv->r_len;
+
+               __ocfs2_resv_discard(resmap, lru_resv);
+       } else {
+               /*
+                * Target is larger than the threshold: take 'shrink'
+                * bits off its end and hand exactly those to resv.
+                */
+               unsigned int shrink;
+               if (tmpwindow)
+                       shrink = min_bits;
+               else
+                       shrink = lru_resv->r_len / 2;
+
+               lru_resv->r_len -= shrink;
+
+               resv->r_start = ocfs2_resv_end(lru_resv) + 1;
+               resv->r_len = shrink;
+       }
+
+       mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
+            "r_len: %u r_last_start: %u r_last_len: %u\n",
+            resv->r_start, ocfs2_resv_end(resv), resv->r_len,
+            resv->r_last_start, resv->r_last_len);
+
+       ocfs2_resv_insert(resmap, resv);
+}
+
+/*
+ * Find a window for the (empty) reservation @resv.
+ *
+ * Searches from just past the most recent allocation, then from the
+ * start of the bitmap, and finally falls back to cannibalizing the
+ * reservation at the head of the LRU. BUGs if @resv is not empty on
+ * entry or is still empty on exit.
+ */
+static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+                                  struct ocfs2_alloc_reservation *resv,
+                                  unsigned int wanted)
+{
+       unsigned int goal = 0;
+
+       BUG_ON(!ocfs2_resv_empty(resv));
+
+       /*
+        * Begin by trying to get a window as close to the previous
+        * one as possible. Using the most recent allocation as a
+        * start goal makes sense.
+        */
+       if (resv->r_last_len) {
+               goal = resv->r_last_start + resv->r_last_len;
+               /* A goal past the end of the bitmap wraps to the start. */
+               if (goal >= resmap->m_bitmap_len)
+                       goal = 0;
+       }
+
+       __ocfs2_resv_find_window(resmap, resv, goal, wanted);
+
+       /* Search from last alloc didn't work, try once more from beginning. */
+       if (ocfs2_resv_empty(resv) && goal != 0)
+               __ocfs2_resv_find_window(resmap, resv, 0, wanted);
+
+       if (ocfs2_resv_empty(resv)) {
+               /*
+                * Still empty? Pull oldest one off the LRU, remove it from
+                * tree, put this one in its place.
+                */
+               ocfs2_cannibalize_resv(resmap, resv, wanted);
+       }
+
+       BUG_ON(ocfs2_resv_empty(resv));
+}
+
+/*
+ * Report the current window of @resv in *cstart / *clen, finding a new
+ * window first if @resv is empty. On entry *clen is the number of bits
+ * the caller would like. Returns -ENOSPC without touching the outputs
+ * when @resv is NULL or the map is disabled, 0 otherwise.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+                          struct ocfs2_alloc_reservation *resv,
+                          int *cstart, int *clen)
+{
+       unsigned int wanted = *clen;    /* recomputed below under the lock */
+
+       if (resv == NULL || ocfs2_resmap_disabled(resmap))
+               return -ENOSPC;
+
+       spin_lock(&resv_lock);
+
+       /*
+        * We don't want to over-allocate for temporary
+        * windows. Otherwise, we run the risk of fragmenting the
+        * allocation space.
+        */
+       wanted = ocfs2_resv_window_bits(resmap, resv);
+       if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+               wanted = *clen;
+
+       if (ocfs2_resv_empty(resv)) {
+               mlog(0, "empty reservation, find new window\n");
+
+               /*
+                * Try to get a window here. If it works, we must fall
+                * through and test the bitmap. This avoids some
+                * ping-ponging of windows due to non-reserved space
+                * being allocated before we initialize a window for
+                * that inode.
+                */
+               ocfs2_resv_find_window(resmap, resv, wanted);
+       }
+
+       BUG_ON(ocfs2_resv_empty(resv));
+
+       *cstart = resv->r_start;
+       *clen = resv->r_len;
+
+       spin_unlock(&resv_lock);
+       return 0;
+}
+
+/*
+ * Trim @resv after bits [@start, @end] were allocated from the front
+ * of its window.
+ *
+ * @start must equal resv->r_start and @end must not lie beyond the
+ * window's end (both enforced with BUG_ON). If the allocation used the
+ * whole window the reservation is discarded; otherwise the window
+ * becomes [@end + 1, old end].
+ */
+static void
+       ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
+                                    struct ocfs2_alloc_reservation *resv,
+                                    unsigned int start, unsigned int end)
+{
+       unsigned int rhs = 0;
+       unsigned int old_end = ocfs2_resv_end(resv);
+
+       BUG_ON(start != resv->r_start || old_end < end);
+
+       /*
+        * Completely used? We can remove it then.
+        */
+       if (old_end == end) {
+               __ocfs2_resv_discard(resmap, resv);
+               return;
+       }
+
+       /* Number of bits left over to the right of the allocation. */
+       rhs = old_end - end;
+
+       /*
+        * This should have been trapped above.
+        */
+       BUG_ON(rhs == 0);
+
+       resv->r_start = end + 1;
+       resv->r_len = old_end - resv->r_start + 1;
+}
+
+/*
+ * Record that clusters [@cstart, @cstart + @clen - 1] were allocated
+ * from the front of @resv's window.
+ *
+ * Does nothing when @resmap is NULL or disabled, or when @resv is
+ * NULL. Otherwise the window is trimmed (or discarded when fully
+ * used), the allocation is remembered in r_last_start / r_last_len,
+ * and a still-live reservation is passed to ocfs2_resv_mark_lru().
+ * BUGs if the claimed range does not start at the window start or
+ * runs past its end.
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+                              struct ocfs2_alloc_reservation *resv,
+                              u32 cstart, u32 clen)
+{
+       unsigned int cend = cstart + clen - 1;
+
+       if (resmap == NULL || ocfs2_resmap_disabled(resmap))
+               return;
+
+       if (resv == NULL)
+               return;
+
+       spin_lock(&resv_lock);
+
+       mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
+            "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
+            cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
+            resv->r_len, resv->r_last_start, resv->r_last_len);
+
+       /*
+        * Reservation windows are modified under resv_lock (e.g. when
+        * another reservation cannibalizes one), so only validate
+        * against resv's fields once the lock is held.
+        */
+       BUG_ON(cstart != resv->r_start);
+       BUG_ON(cstart > ocfs2_resv_end(resv));
+       BUG_ON(cend > ocfs2_resv_end(resv));
+
+       ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
+       resv->r_last_start = cstart;
+       resv->r_last_len = clen;
+
+       /*
+        * May have been discarded above from
+        * ocfs2_adjust_resv_from_alloc().
+        */
+       if (!ocfs2_resv_empty(resv))
+               ocfs2_resv_mark_lru(resmap, resv);
+
+       mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
+            "r_len: %u r_last_start: %u r_last_len: %u\n",
+            resv->r_start, ocfs2_resv_end(resv), resv->r_len,
+            resv->r_last_start, resv->r_last_len);
+
+       ocfs2_check_resmap(resmap);
+
+       spin_unlock(&resv_lock);
+}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644 (file)
index 0000000..1e49cc2
--- /dev/null
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.h
+ *
+ * Allocation reservations function prototypes and structures.
+ *
+ * Copyright (C) 2010 Novell.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef        OCFS2_RESERVATIONS_H
+#define        OCFS2_RESERVATIONS_H
+
+#include <linux/rbtree.h>
+
+#define OCFS2_DEFAULT_RESV_LEVEL       2
+#define OCFS2_MAX_RESV_LEVEL   9
+#define OCFS2_MIN_RESV_LEVEL   0
+
+struct ocfs2_alloc_reservation {
+       struct rb_node  r_node;
+
+       unsigned int    r_start;        /* Beginning of current window */
+       unsigned int    r_len;          /* Length of the window */
+
+       unsigned int    r_last_len;     /* Length of most recent alloc */
+       unsigned int    r_last_start;   /* Start of most recent alloc */
+       struct list_head        r_lru;  /* LRU list head */
+
+       unsigned int    r_flags;        /* OCFS2_RESV_FLAG_* bits */
+};
+
+#define        OCFS2_RESV_FLAG_INUSE   0x01    /* Set when r_node is part of a btree */
+#define        OCFS2_RESV_FLAG_TMP     0x02    /* Temporary reservation, will be
+                                        * destroyed immediately after use */
+#define        OCFS2_RESV_FLAG_DIR     0x04    /* Reservation is for an unindexed
+                                        * directory btree */
+
+struct ocfs2_reservation_map {
+       struct rb_root          m_reservations;
+       char                    *m_disk_bitmap; /* Disk bitmap this map
+                                                * refers to, see
+                                                * ocfs2_resmap_restart() */
+
+       struct ocfs2_super      *m_osb;
+
+       /* The following are not initialized to meaningful values until a disk
+        * bitmap is provided. */
+       u32                     m_bitmap_len;   /* Number of valid
+                                                * bits available */
+
+       struct list_head        m_lru;          /* LRU of reservations
+                                                * structures. */
+
+};
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
+
+#define OCFS2_RESV_TYPES       (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+                        unsigned int flags);
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
+/**
+ * ocfs2_resv_discard() - truncate a reservation
+ * @resmap: reservations bitmap that @resv belongs to
+ * @resv: the reservation to truncate.
+ *
+ * After this function is called, the reservation will be empty, and
+ * unlinked from the rbtree.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+                       struct ocfs2_alloc_reservation *resv);
+
+
+/**
+ * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @osb: struct ocfs2_super the reservation map is being set up for
+ * @resmap: struct ocfs2_reservation_map to initialize
+ *
+ * Only possible return value other than '0' is -ENOMEM for failure to
+ * allocate mirror bitmap.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+                     struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_restart() - "restart" a reservation bitmap
+ * @resmap: reservations bitmap
+ * @clen: Number of valid bits in the bitmap
+ * @disk_bitmap: the disk bitmap this resmap should refer to.
+ *
+ * Re-initialize the parameters of a reservation bitmap. This is
+ * useful for local alloc window slides.
+ *
+ * This function will call ocfs2_trunc_resv against all existing
+ * reservations. A future version will recalculate existing
+ * reservations based on the new bitmap.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+                         unsigned int clen, char *disk_bitmap);
+
+/**
+ * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
+ * @resmap: the struct ocfs2_reservation_map to uninitialize
+ */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
+ * @resmap: reservations bitmap
+ * @resv: reservation to base search from
+ * @cstart: start of proposed allocation
+ * @clen: length (in clusters) of proposed allocation
+ *
+ * Using the reservation data from resv, this function will compare
+ * resmap and resmap->m_disk_bitmap to determine what part (if any) of
+ * the reservation window is still clear to use. If resv is empty,
+ * this function will try to allocate a window for it.
+ *
+ * On success, zero is returned and the valid allocation area is set in cstart
+ * and clen.
+ *
+ * Returns -ENOSPC if resv is NULL or reservations are disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+                          struct ocfs2_alloc_reservation *resv,
+                          int *cstart, int *clen);
+
+/**
+ * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
+ * @resmap: reservations bitmap
+ * @resv: optional reservation to recalculate based on new bitmap
+ * @cstart: start of allocation in clusters
+ * @clen: length of allocation in clusters.
+ *
+ * Tell the reservation code that bits were used to fulfill allocation in
+ * resmap. The bits don't have to have been part of any existing
+ * reservation. But we must always call this function when bits are claimed.
+ * Internally, the reservations code will use this information to mark the
+ * reservations bitmap. If resv is passed, its next allocation window will be
+ * calculated. It also expects that 'cstart' is the same as we passed back
+ * from ocfs2_resmap_resv_bits().
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+                              struct ocfs2_alloc_reservation *resv,
+                              u32 cstart, u32 clen);
+
+#endif /* OCFS2_RESERVATIONS_H */
index 3c3d673..dacd553 100644 (file)
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
                le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
        }
 
-       ret = ocfs2_journal_dirty(handle, group_bh);
-       if (ret < 0) {
-               mlog_errno(ret);
-               goto out_rollback;
-       }
+       ocfs2_journal_dirty(handle, group_bh);
 
        /* update the inode accordingly. */
        ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 
        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
-                                ocfs2_group_bitmap_size(osb->sb) * 8) {
+               ocfs2_group_bitmap_size(osb->sb, 0,
+                                       osb->s_feature_incompat) * 8) {
                mlog(ML_ERROR, "The disk is too old and small. "
                     "Force to do offline resize.");
                ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
        fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
-                                ocfs2_group_bitmap_size(osb->sb) * 8) {
+               ocfs2_group_bitmap_size(osb->sb, 0,
+                                       osb->s_feature_incompat) * 8) {
                mlog(ML_ERROR, "The disk is too old and small."
                     " Force to do offline resize.");
                ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 
        group = (struct ocfs2_group_desc *)group_bh->b_data;
        group->bg_next_group = cr->c_blkno;
-
-       ret = ocfs2_journal_dirty(handle, group_bh);
-       if (ret < 0) {
-               mlog_errno(ret);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, group_bh);
 
        ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
                                      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
index 19ba00f..f4c2a9e 100644 (file)
 
 #define OCFS2_MAX_TO_STEAL             1024
 
+/*
+ * Describes the bits handed back by a suballocator claim; filled in
+ * through the 'res' argument of the group search / claim helpers.
+ */
+struct ocfs2_suballoc_result {
+       u64             sr_bg_blkno;    /* The bg we allocated from.  Set
+                                          to 0 when a block group is
+                                          contiguous. */
+       u64             sr_blkno;       /* The first allocated block */
+       unsigned int    sr_bit_offset;  /* The bit in the bg */
+       unsigned int    sr_bits;        /* How many bits we claimed */
+};
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
                                  struct inode *alloc_inode,
                                  struct buffer_head *bg_bh,
                                  u64 group_blkno,
+                                 unsigned int group_clusters,
                                  u16 my_chain,
                                  struct ocfs2_chain_list *cl);
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
                                      struct buffer_head *group_bh,
                                      u32 bits_wanted, u32 min_bits,
                                      u64 max_block,
-                                     u16 *bit_off, u16 *bits_found);
+                                     struct ocfs2_suballoc_result *res);
 static int ocfs2_block_group_search(struct inode *inode,
                                    struct buffer_head *group_bh,
                                    u32 bits_wanted, u32 min_bits,
                                    u64 max_block,
-                                   u16 *bit_off, u16 *bits_found);
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
-                                    struct ocfs2_alloc_context *ac,
+                                   struct ocfs2_suballoc_result *res);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
                                     handle_t *handle,
                                     u32 bits_wanted,
                                     u32 min_bits,
-                                    u16 *bit_off,
-                                    unsigned int *num_bits,
-                                    u64 *bg_blkno);
+                                    struct ocfs2_suballoc_result *res);
 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
                                         int nr);
 static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -130,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
        }
        brelse(ac->ac_bh);
        ac->ac_bh = NULL;
+       ac->ac_resv = NULL;
 }
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -325,14 +333,38 @@ out:
        return rc;
 }
 
+/*
+ * Append one extent record covering @clusters clusters starting at
+ * block @p_blkno to the extent list embedded in group descriptor @bg,
+ * and grow bg_bits / bg_free_bits_count by the bits those clusters hold.
+ *
+ * The first call on a descriptor (l_next_free_rec == 0) also sets
+ * l_count. No bounds check is made against l_count here; callers must
+ * make sure a free record slot is available.
+ */
+static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
+                                         struct ocfs2_group_desc *bg,
+                                         struct ocfs2_chain_list *cl,
+                                         u64 p_blkno, u32 clusters)
+{
+       struct ocfs2_extent_list *el = &bg->bg_list;
+       struct ocfs2_extent_rec *rec;
+
+       BUG_ON(!ocfs2_supports_discontig_bg(osb));
+       if (!el->l_next_free_rec)
+               el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
+       rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
+       rec->e_blkno = cpu_to_le64(p_blkno);
+       /* Logical offset of this extent = clusters already in the group. */
+       rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
+                                 le16_to_cpu(cl->cl_bpc));
+       /*
+        * e_leaf_clusters is a 16-bit little-endian on-disk field (see
+        * ocfs2_fs.h); a 32-bit conversion would store the wrong half
+        * on big-endian hosts.
+        */
+       rec->e_leaf_clusters = cpu_to_le16(clusters);
+       le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
+       le16_add_cpu(&bg->bg_free_bits_count,
+                    clusters * le16_to_cpu(cl->cl_bpc));
+       le16_add_cpu(&el->l_next_free_rec, 1);
+}
+
 static int ocfs2_block_group_fill(handle_t *handle,
                                  struct inode *alloc_inode,
                                  struct buffer_head *bg_bh,
                                  u64 group_blkno,
+                                 unsigned int group_clusters,
                                  u16 my_chain,
                                  struct ocfs2_chain_list *cl)
 {
        int status = 0;
+       struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
        struct super_block * sb = alloc_inode->i_sb;
 
@@ -359,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
        memset(bg, 0, sb->s_blocksize);
        strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
        bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
-       bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
-       bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+       bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
+                                               osb->s_feature_incompat));
        bg->bg_chain = cpu_to_le16(my_chain);
        bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
        bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
        bg->bg_blkno = cpu_to_le64(group_blkno);
+       if (group_clusters == le16_to_cpu(cl->cl_cpg))
+               bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+       else
+               ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
+                                             group_clusters);
+
        /* set the 1st bit in the bitmap to account for the descriptor block */
        ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
        bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
 
-       status = ocfs2_journal_dirty(handle, bg_bh);
-       if (status < 0)
-               mlog_errno(status);
+       ocfs2_journal_dirty(handle, bg_bh);
 
        /* There is no need to zero out or otherwise initialize the
         * other blocks in a group - All valid FS metadata in a block
@@ -397,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
        return best;
 }
 
+/*
+ * Allocate a new block group as one contiguous run of cl_cpg clusters
+ * and initialise its descriptor in the chain picked by
+ * ocfs2_find_smallest_chain().
+ *
+ * Returns the descriptor's buffer_head on success, or an ERR_PTR()
+ * carrying the error from ocfs2_claim_clusters(), -ENOMEM when
+ * sb_getblk() cannot provide a buffer, or the error from
+ * ocfs2_block_group_fill().
+ *
+ * NOTE(review): clusters already claimed are not released on the
+ * later failure paths - confirm whether that is intended.
+ */
+static struct buffer_head *
+ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
+                              struct inode *alloc_inode,
+                              struct ocfs2_alloc_context *ac,
+                              struct ocfs2_chain_list *cl)
+{
+       int status;
+       u32 bit_off, num_bits;
+       u64 bg_blkno;
+       struct buffer_head *bg_bh = NULL;
+       unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+
+       status = ocfs2_claim_clusters(handle, ac,
+                                     le16_to_cpu(cl->cl_cpg), &bit_off,
+                                     &num_bits);
+       if (status < 0) {
+               if (status != -ENOSPC)
+                       mlog_errno(status);
+               goto bail;
+       }
+
+       /* setup the group */
+       bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+       mlog(0, "new descriptor, record %u, at block %llu\n",
+            alloc_rec, (unsigned long long)bg_blkno);
+
+       bg_bh = sb_getblk(osb->sb, bg_blkno);
+       if (!bg_bh) {
+               /* sb_getblk() does no I/O; NULL means no buffer memory. */
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+       ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+       status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+                                       bg_blkno, num_bits, alloc_rec, cl);
+       if (status < 0) {
+               brelse(bg_bh);
+               mlog_errno(status);
+       }
+
+bail:
+       return status ? ERR_PTR(status) : bg_bh;
+}
+
+/*
+ * Claim a run of clusters for a discontiguous block group, asking for
+ * @min_bits first and halving the request each time
+ * ocfs2_claim_clusters() reports -ENOSPC.
+ *
+ * Returns 0 with *bit_off / *num_bits set on success, -ENOSPC when no
+ * request size down to one cluster could be satisfied (including
+ * @min_bits == 0 on entry, where nothing is attempted), or the first
+ * other error from ocfs2_claim_clusters().
+ */
+static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
+                                       handle_t *handle,
+                                       struct ocfs2_alloc_context *ac,
+                                       unsigned int min_bits,
+                                       u32 *bit_off, u32 *num_bits)
+{
+       /*
+        * Start from -ENOSPC so that a zero-sized request does not
+        * report success while leaving the outputs unset.
+        */
+       int status = -ENOSPC;
+
+       while (min_bits) {
+               status = ocfs2_claim_clusters(handle, ac, min_bits,
+                                             bit_off, num_bits);
+               if (status != -ENOSPC)
+                       break;
+
+               min_bits >>= 1;
+       }
+
+       return status;
+}
+
+/*
+ * Add further extents to the discontiguous group described by @bg_bh
+ * until it covers cl_cpg clusters or its extent list is full.
+ *
+ * Each round asks ocfs2_block_group_claim_bits() for at most the
+ * number of clusters still needed, starting from @min_bits and then
+ * from the size of the previous claim. Returns 0 once the group is
+ * complete, -ENOSPC if the extent records run out first, or the error
+ * from journal access / cluster claiming. On failure the extents
+ * already added are left in @bg_bh and the buffer is not dirtied.
+ */
+static int ocfs2_block_group_grow_discontig(handle_t *handle,
+                                           struct inode *alloc_inode,
+                                           struct buffer_head *bg_bh,
+                                           struct ocfs2_alloc_context *ac,
+                                           struct ocfs2_chain_list *cl,
+                                           unsigned int min_bits)
+{
+       int status;
+       struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+       struct ocfs2_group_desc *bg =
+               (struct ocfs2_group_desc *)bg_bh->b_data;
+       /* Clusters still missing before the group reaches cl_cpg. */
+       unsigned int needed = le16_to_cpu(cl->cl_cpg) -
+                        le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+       u32 p_cpos, clusters;
+       u64 p_blkno;
+       struct ocfs2_extent_list *el = &bg->bg_list;
+
+       status = ocfs2_journal_access_gd(handle,
+                                        INODE_CACHE(alloc_inode),
+                                        bg_bh,
+                                        OCFS2_JOURNAL_ACCESS_CREATE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
+                               le16_to_cpu(el->l_count))) {
+               /* Never ask for more than is still missing. */
+               if (min_bits > needed)
+                       min_bits = needed;
+               status = ocfs2_block_group_claim_bits(osb, handle, ac,
+                                                     min_bits, &p_cpos,
+                                                     &clusters);
+               if (status < 0) {
+                       if (status != -ENOSPC)
+                               mlog_errno(status);
+                       goto bail;
+               }
+               p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
+               ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
+                                             clusters);
+
+               /* Next request starts at the size we just obtained. */
+               min_bits = clusters;
+               needed = le16_to_cpu(cl->cl_cpg) -
+                        le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+       }
+
+       if (needed > 0) {
+               /*
+                * We have used up all the extent rec but can't fill up
+                * the cpg. So bail out.
+                */
+               status = -ENOSPC;
+               goto bail;
+       }
+
+       ocfs2_journal_dirty(handle, bg_bh);
+
+bail:
+       return status;
+}
+
+/*
+ * Undo a failed discontiguous block group allocation: free every
+ * extent recorded in the descriptor's extent list back through
+ * @cluster_ac, then drop @bg_bh from the inode's metadata cache and
+ * release it. A NULL @bg_bh is a no-op. Errors from
+ * ocfs2_free_clusters() are logged and the remaining extents are
+ * still attempted.
+ */
+static void ocfs2_bg_alloc_cleanup(handle_t *handle,
+                                  struct ocfs2_alloc_context *cluster_ac,
+                                  struct inode *alloc_inode,
+                                  struct buffer_head *bg_bh)
+{
+       int i, ret;
+       struct ocfs2_group_desc *bg;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_extent_rec *rec;
+
+       if (!bg_bh)
+               return;
+
+       bg = (struct ocfs2_group_desc *)bg_bh->b_data;
+       el = &bg->bg_list;
+       for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+               rec = &el->l_recs[i];
+               /*
+                * e_leaf_clusters is a 16-bit little-endian field (see
+                * ocfs2_fs.h), so it must be read with le16_to_cpu().
+                */
+               ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
+                                         cluster_ac->ac_bh,
+                                         le64_to_cpu(rec->e_blkno),
+                                         le16_to_cpu(rec->e_leaf_clusters));
+               if (ret)
+                       mlog_errno(ret);
+               /* Try all the clusters to free */
+       }
+
+       ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
+       brelse(bg_bh);
+}
+
+/*
+ * Allocate a new block group out of several smaller cluster runs.
+ *
+ * Fails with -ENOSPC straight away unless the filesystem supports
+ * discontiguous block groups. Otherwise the transaction is extended,
+ * a first region of at most cl_cpg / 2 clusters is claimed and turned
+ * into the group descriptor, and ocfs2_block_group_grow_discontig()
+ * adds extents until the group is full.
+ *
+ * Returns the descriptor's buffer_head on success or an ERR_PTR() on
+ * failure; on failure ocfs2_bg_alloc_cleanup() is run on whatever
+ * descriptor buffer had been obtained.
+ */
+static struct buffer_head *
+ocfs2_block_group_alloc_discontig(handle_t *handle,
+                                 struct inode *alloc_inode,
+                                 struct ocfs2_alloc_context *ac,
+                                 struct ocfs2_chain_list *cl)
+{
+       int status;
+       u32 bit_off, num_bits;
+       u64 bg_blkno;
+       unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
+       struct buffer_head *bg_bh = NULL;
+       unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+       struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+
+       if (!ocfs2_supports_discontig_bg(osb)) {
+               status = -ENOSPC;
+               goto bail;
+       }
+
+       status = ocfs2_extend_trans(handle,
+                                   ocfs2_calc_bg_discontig_credits(osb->sb));
+       if (status) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       /*
+        * We're going to be grabbing from multiple cluster groups.
+        * We don't have enough credits to relink them all, and the
+        * cluster groups will be staying in cache for the duration of
+        * this operation.
+        */
+       ac->ac_allow_chain_relink = 0;
+
+       /* Claim the first region */
+       status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
+                                             &bit_off, &num_bits);
+       if (status < 0) {
+               if (status != -ENOSPC)
+                       mlog_errno(status);
+               goto bail;
+       }
+       /* Later claims start from the size we actually got. */
+       min_bits = num_bits;
+
+       /* setup the group */
+       bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+       mlog(0, "new descriptor, record %u, at block %llu\n",
+            alloc_rec, (unsigned long long)bg_blkno);
+
+       bg_bh = sb_getblk(osb->sb, bg_blkno);
+       if (!bg_bh) {
+               /* sb_getblk() does no I/O; NULL means no buffer memory. */
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+       ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+       status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+                                       bg_blkno, num_bits, alloc_rec, cl);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
+                                                 bg_bh, ac, cl, min_bits);
+       if (status)
+               mlog_errno(status);
+
+bail:
+       if (status)
+               ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
+       return status ? ERR_PTR(status) : bg_bh;
+}
+
 /*
  * We expect the block group allocator to already be locked.
  */
@@ -412,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        struct ocfs2_chain_list *cl;
        struct ocfs2_alloc_context *ac = NULL;
        handle_t *handle = NULL;
-       u32 bit_off, num_bits;
        u16 alloc_rec;
-       u64 bg_blkno;
        struct buffer_head *bg_bh = NULL;
        struct ocfs2_group_desc *bg;
 
@@ -447,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                     (unsigned long long)*last_alloc_group);
                ac->ac_last_group = *last_alloc_group;
        }
-       status = ocfs2_claim_clusters(osb,
-                                     handle,
-                                     ac,
-                                     le16_to_cpu(cl->cl_cpg),
-                                     &bit_off,
-                                     &num_bits);
-       if (status < 0) {
+
+       bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
+                                              ac, cl);
+       if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+               bg_bh = ocfs2_block_group_alloc_discontig(handle,
+                                                         alloc_inode,
+                                                         ac, cl);
+       if (IS_ERR(bg_bh)) {
+               status = PTR_ERR(bg_bh);
+               bg_bh = NULL;
                if (status != -ENOSPC)
                        mlog_errno(status);
                goto bail;
        }
-
-       alloc_rec = ocfs2_find_smallest_chain(cl);
-
-       /* setup the group */
-       bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-       mlog(0, "new descriptor, record %u, at block %llu\n",
-            alloc_rec, (unsigned long long)bg_blkno);
-
-       bg_bh = sb_getblk(osb->sb, bg_blkno);
-       if (!bg_bh) {
-               status = -EIO;
-               mlog_errno(status);
-               goto bail;
-       }
-       ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
-
-       status = ocfs2_block_group_fill(handle,
-                                       alloc_inode,
-                                       bg_bh,
-                                       bg_blkno,
-                                       alloc_rec,
-                                       cl);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
-
        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -494,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                goto bail;
        }
 
+       alloc_rec = le16_to_cpu(bg->bg_chain);
        le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
                     le16_to_cpu(bg->bg_free_bits_count));
-       le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
-       cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
+       le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
+                    le16_to_cpu(bg->bg_bits));
+       cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg->bg_blkno);
        if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
                le16_add_cpu(&cl->cl_next_free_rec, 1);
 
@@ -506,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
        le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
 
-       status = ocfs2_journal_dirty(handle, bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, bh);
 
        spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
        OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -760,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
                                             EXTENT_ALLOC_SYSTEM_INODE,
                                             (u32)osb->slot_num, NULL,
-                                            ALLOC_NEW_GROUP);
+                                            ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
 
 
        if (status >= 0) {
@@ -946,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
                status = ocfs2_reserve_local_alloc_bits(osb,
                                                        bits_wanted,
                                                        *ac);
-               if (status == -EFBIG) {
-                       /* The local alloc window is outside ac_max_block.
-                        * use the main bitmap. */
-                       status = -ENOSPC;
-               } else if ((status < 0) && (status != -ENOSPC)) {
+               if ((status < 0) && (status != -ENOSPC)) {
                        mlog_errno(status);
                        goto bail;
                }
@@ -1033,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
                                             struct buffer_head *bg_bh,
                                             unsigned int bits_wanted,
                                             unsigned int total_bits,
-                                            u16 *bit_off,
-                                            u16 *bits_found)
+                                            struct ocfs2_suballoc_result *res)
 {
        void *bitmap;
        u16 best_offset, best_size;
@@ -1078,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
                }
        }
 
-       /* XXX: I think the first clause is equivalent to the second
-        *      - jlbec */
-       if (found == bits_wanted) {
-               *bit_off = start - found;
-               *bits_found = found;
-       } else if (best_size) {
-               *bit_off = best_offset;
-               *bits_found = best_size;
+       if (best_size) {
+               res->sr_bit_offset = best_offset;
+               res->sr_bits = best_size;
        } else {
                status = -ENOSPC;
                /* No error log here -- see the comment above
@@ -1129,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
        }
 
        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-
        while(num_bits--)
                ocfs2_set_bit(bit_off++, bitmap);
 
-       status = ocfs2_journal_dirty(handle,
-                                    group_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, group_bh);
 
 bail:
        mlog_exit(status);
@@ -1202,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
        }
 
        prev_bg->bg_next_group = bg->bg_next_group;
-
-       status = ocfs2_journal_dirty(handle, prev_bg_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_rollback;
-       }
+       ocfs2_journal_dirty(handle, prev_bg_bh);
 
        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
                                         bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1217,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
        }
 
        bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
-
-       status = ocfs2_journal_dirty(handle, bg_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_rollback;
-       }
+       ocfs2_journal_dirty(handle, bg_bh);
 
        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
                                         fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1232,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
        }
 
        fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+       ocfs2_journal_dirty(handle, fe_bh);
 
-       status = ocfs2_journal_dirty(handle, fe_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto out_rollback;
-       }
-
-       status = 0;
 out_rollback:
        if (status < 0) {
                fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1263,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
                                      struct buffer_head *group_bh,
                                      u32 bits_wanted, u32 min_bits,
                                      u64 max_block,
-                                     u16 *bit_off, u16 *bits_found)
+                                     struct ocfs2_suballoc_result *res)
 {
        int search = -ENOSPC;
        int ret;
        u64 blkoff;
        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       u16 tmp_off, tmp_found;
        unsigned int max_bits, gd_cluster_off;
 
        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1297,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 
                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
                                                        group_bh, bits_wanted,
-                                                       max_bits,
-                                                       &tmp_off, &tmp_found);
+                                                       max_bits, res);
                if (ret)
                        return ret;
 
                if (max_block) {
                        blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
                                                          gd_cluster_off +
-                                                         tmp_off + tmp_found);
+                                                         res->sr_bit_offset +
+                                                         res->sr_bits);
                        mlog(0, "Checking %llu against %llu\n",
                             (unsigned long long)blkoff,
                             (unsigned long long)max_block);
@@ -1317,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
                 * return success, but we still want to return
                 * -ENOSPC unless it found the minimum number
                 * of bits. */
-               if (min_bits <= tmp_found) {
-                       *bit_off = tmp_off;
-                       *bits_found = tmp_found;
+               if (min_bits <= res->sr_bits)
                        search = 0; /* success */
-               } else if (tmp_found) {
+               else if (res->sr_bits) {
                        /*
                         * Don't show bits which we'll be returning
                         * for allocation to the local alloc bitmap.
                         */
-                       ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
+                       ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
                }
        }
 
@@ -1337,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
                                    struct buffer_head *group_bh,
                                    u32 bits_wanted, u32 min_bits,
                                    u64 max_block,
-                                   u16 *bit_off, u16 *bits_found)
+                                   struct ocfs2_suballoc_result *res)
 {
        int ret = -ENOSPC;
        u64 blkoff;
@@ -1350,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
                                                        group_bh, bits_wanted,
                                                        le16_to_cpu(bg->bg_bits),
-                                                       bit_off, bits_found);
+                                                       res);
                if (!ret && max_block) {
-                       blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
-                               *bits_found;
+                       blkoff = le64_to_cpu(bg->bg_blkno) +
+                               res->sr_bit_offset + res->sr_bits;
                        mlog(0, "Checking %llu against %llu\n",
                             (unsigned long long)blkoff,
                             (unsigned long long)max_block);
@@ -1386,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
        di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
        le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-
-       ret = ocfs2_journal_dirty(handle, di_bh);
-       if (ret < 0)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, di_bh);
 
 out:
        return ret;
 }
 
+/*
+ * Try to map an allocation result onto one extent record of a
+ * discontiguous block group.
+ *
+ * @rec covers the group bit range
+ *     [e_cpos * cl_bpc, (e_cpos + e_leaf_clusters) * cl_bpc).
+ * If res->sr_bit_offset lies in that range, res->sr_blkno is set to the
+ * matching block inside the extent, res->sr_bits is trimmed so that the
+ * result does not run past the end of the extent, and 1 is returned.
+ * Otherwise @res is left untouched and 0 is returned.
+ */
+static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
+                                        struct ocfs2_extent_rec *rec,
+                                        struct ocfs2_chain_list *cl)
+{
+       unsigned int bpc = le16_to_cpu(cl->cl_bpc);
+       unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
+       /*
+        * e_leaf_clusters is a 16-bit little-endian on-disk field, so it
+        * must be read with le16_to_cpu(); le32_to_cpu() would yield a
+        * wrong count on big-endian hosts.
+        */
+       unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
+
+       /* The result starts before this extent. */
+       if (res->sr_bit_offset < bitoff)
+               return 0;
+       /* The result starts at or after the end of this extent. */
+       if (res->sr_bit_offset >= (bitoff + bitcount))
+               return 0;
+       res->sr_blkno = le64_to_cpu(rec->e_blkno) +
+               (res->sr_bit_offset - bitoff);
+       /* Clamp the run so it never crosses into the next extent. */
+       if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
+               res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
+       return 1;
+}
+
+/*
+ * Finish filling in @res after a group search has chosen a bit range.
+ *
+ * - Cluster bitmap allocator: res->sr_blkno is set to 0 and nothing else
+ *   is changed.
+ * - Otherwise res->sr_blkno is first computed as
+ *   sr_bg_blkno + sr_bit_offset and res->sr_bg_blkno is cleared.  If the
+ *   filesystem does not support discontiguous block groups, or @bg has
+ *   no extent records, that is the final result.
+ * - Otherwise the extent records of @bg are walked; the first one that
+ *   contains sr_bit_offset rewrites sr_blkno (and possibly sr_bits) via
+ *   ocfs2_bg_discontig_fix_by_rec(), and sr_bg_blkno is restored to the
+ *   value it had on entry.  If no record matches, sr_bg_blkno stays 0.
+ */
+static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
+                                         struct ocfs2_group_desc *bg,
+                                         struct ocfs2_suballoc_result *res)
+{
+       int i;
+       u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
+       struct ocfs2_extent_rec *rec;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+       struct ocfs2_chain_list *cl = &di->id2.i_chain;
+
+       /* Cluster bitmap results carry no block number. */
+       if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
+               res->sr_blkno = 0;
+               return;
+       }
+
+       /* Default: block number is the group block plus the bit offset. */
+       res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
+       res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
+       if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
+           !bg->bg_list.l_next_free_rec)
+               return;
+
+       /* Find the extent record that holds the chosen bit offset. */
+       for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
+               rec = &bg->bg_list.l_recs[i];
+               if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
+                       res->sr_bg_blkno = bg_blkno;  /* Restore */
+                       break;
+               }
+       }
+}
+
 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
                                  handle_t *handle,
                                  u32 bits_wanted,
                                  u32 min_bits,
-                                 u16 *bit_off,
-                                 unsigned int *num_bits,
-                                 u64 gd_blkno,
+                                 struct ocfs2_suballoc_result *res,
                                  u16 *bits_left)
 {
        int ret;
-       u16 found;
        struct buffer_head *group_bh = NULL;
        struct ocfs2_group_desc *gd;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
        struct inode *alloc_inode = ac->ac_inode;
 
-       ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
-                                         &group_bh);
+       ret = ocfs2_read_group_descriptor(alloc_inode, di,
+                                         res->sr_bg_blkno, &group_bh);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
@@ -1420,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 
        gd = (struct ocfs2_group_desc *) group_bh->b_data;
        ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
-                                 ac->ac_max_block, bit_off, &found);
+                                 ac->ac_max_block, res);
        if (ret < 0) {
                if (ret != -ENOSPC)
                        mlog_errno(ret);
                goto out;
        }
 
-       *num_bits = found;
+       if (!ret)
+               ocfs2_bg_discontig_fix_result(ac, gd, res);
 
        ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
-                                              *num_bits,
+                                              res->sr_bits,
                                               le16_to_cpu(gd->bg_chain));
        if (ret < 0) {
                mlog_errno(ret);
@@ -1438,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
        }
 
        ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
-                                        *bit_off, *num_bits);
+                                        res->sr_bit_offset, res->sr_bits);
        if (ret < 0)
                mlog_errno(ret);
 
@@ -1454,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
                              handle_t *handle,
                              u32 bits_wanted,
                              u32 min_bits,
-                             u16 *bit_off,
-                             unsigned int *num_bits,
-                             u64 *bg_blkno,
+                             struct ocfs2_suballoc_result *res,
                              u16 *bits_left)
 {
        int status;
-       u16 chain, tmp_bits;
+       u16 chain;
        u32 tmp_used;
        u64 next_group;
        struct inode *alloc_inode = ac->ac_inode;
@@ -1489,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
         * the 1st group with any empty bits. */
        while ((status = ac->ac_group_search(alloc_inode, group_bh,
                                             bits_wanted, min_bits,
-                                            ac->ac_max_block, bit_off,
-                                            &tmp_bits)) == -ENOSPC) {
+                                            ac->ac_max_block,
+                                            res)) == -ENOSPC) {
                if (!bg->bg_next_group)
                        break;
 
@@ -1515,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
        }
 
        mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
-            tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
+            res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
 
-       *num_bits = tmp_bits;
+       res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
+
+       BUG_ON(res->sr_bits == 0);
+       if (!status)
+               ocfs2_bg_discontig_fix_result(ac, bg, res);
 
-       BUG_ON(*num_bits == 0);
 
        /*
         * Keep track of previous block descriptor read. When
@@ -1536,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
         */
        if (ac->ac_allow_chain_relink &&
            (prev_group_bh) &&
-           (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+           (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
                status = ocfs2_relink_block_group(handle, alloc_inode,
                                                  ac->ac_bh, group_bh,
                                                  prev_group_bh, chain);
@@ -1558,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
        }
 
        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
-       fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
-       le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
-
-       status = ocfs2_journal_dirty(handle,
-                                    ac->ac_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
+       le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
+       ocfs2_journal_dirty(handle, ac->ac_bh);
 
        status = ocfs2_block_group_set_bits(handle,
                                            alloc_inode,
                                            bg,
                                            group_bh,
-                                           *bit_off,
-                                           *num_bits);
+                                           res->sr_bit_offset,
+                                           res->sr_bits);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
 
-       mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
+       mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
             (unsigned long long)le64_to_cpu(fe->i_blkno));
 
-       *bg_blkno = le64_to_cpu(bg->bg_blkno);
        *bits_left = le16_to_cpu(bg->bg_free_bits_count);
 bail:
        brelse(group_bh);
@@ -1593,19 +1836,15 @@ bail:
 }
 
 /* will give out up to bits_wanted contiguous bits. */
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
-                                    struct ocfs2_alloc_context *ac,
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
                                     handle_t *handle,
                                     u32 bits_wanted,
                                     u32 min_bits,
-                                    u16 *bit_off,
-                                    unsigned int *num_bits,
-                                    u64 *bg_blkno)
+                                    struct ocfs2_suballoc_result *res)
 {
        int status;
        u16 victim, i;
        u16 bits_left = 0;
-       u64 hint_blkno = ac->ac_last_group;
        struct ocfs2_chain_list *cl;
        struct ocfs2_dinode *fe;
 
@@ -1623,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 
        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
            le32_to_cpu(fe->id1.bitmap1.i_total)) {
-               ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
+               ocfs2_error(ac->ac_inode->i_sb,
+                           "Chain allocator dinode %llu has %u used "
                            "bits but only %u total.",
                            (unsigned long long)le64_to_cpu(fe->i_blkno),
                            le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1632,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
                goto bail;
        }
 
-       if (hint_blkno) {
+       res->sr_bg_blkno = ac->ac_last_group;
+       if (res->sr_bg_blkno) {
                /* Attempt to short-circuit the usual search mechanism
                 * by jumping straight to the most recently used
                 * allocation group. This helps us mantain some
                 * contiguousness across allocations. */
                status = ocfs2_search_one_group(ac, handle, bits_wanted,
-                                               min_bits, bit_off, num_bits,
-                                               hint_blkno, &bits_left);
-               if (!status) {
-                       /* Be careful to update *bg_blkno here as the
-                        * caller is expecting it to be filled in, and
-                        * ocfs2_search_one_group() won't do that for
-                        * us. */
-                       *bg_blkno = hint_blkno;
+                                               min_bits, res, &bits_left);
+               if (!status)
                        goto set_hint;
-               }
                if (status < 0 && status != -ENOSPC) {
                        mlog_errno(status);
                        goto bail;
@@ -1660,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
        ac->ac_chain = victim;
        ac->ac_allow_chain_relink = 1;
 
-       status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
-                                   num_bits, bg_blkno, &bits_left);
+       status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+                                   res, &bits_left);
        if (!status)
                goto set_hint;
        if (status < 0 && status != -ENOSPC) {
@@ -1685,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 
                ac->ac_chain = i;
                status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
-                                           bit_off, num_bits, bg_blkno,
-                                           &bits_left);
+                                           res, &bits_left);
                if (!status)
                        break;
                if (status < 0 && status != -ENOSPC) {
@@ -1703,7 +1936,7 @@ set_hint:
                if (bits_left < min_bits)
                        ac->ac_last_group = 0;
                else
-                       ac->ac_last_group = *bg_blkno;
+                       ac->ac_last_group = res->sr_bg_blkno;
        }
 
 bail:
@@ -1711,37 +1944,37 @@ bail:
        return status;
 }
 
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
-                        handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
                         struct ocfs2_alloc_context *ac,
                         u32 bits_wanted,
+                        u64 *suballoc_loc,
                         u16 *suballoc_bit_start,
                         unsigned int *num_bits,
                         u64 *blkno_start)
 {
        int status;
-       u64 bg_blkno;
+       struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
 
        BUG_ON(!ac);
        BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
        BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
 
-       status = ocfs2_claim_suballoc_bits(osb,
-                                          ac,
+       status = ocfs2_claim_suballoc_bits(ac,
                                           handle,
                                           bits_wanted,
                                           1,
-                                          suballoc_bit_start,
-                                          num_bits,
-                                          &bg_blkno);
+                                          &res);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
-       atomic_inc(&osb->alloc_stats.bg_allocs);
+       atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
 
-       *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
-       ac->ac_bits_given += (*num_bits);
+       *suballoc_loc = res.sr_bg_blkno;
+       *suballoc_bit_start = res.sr_bit_offset;
+       *blkno_start = res.sr_blkno;
+       ac->ac_bits_given += res.sr_bits;
+       *num_bits = res.sr_bits;
        status = 0;
 bail:
        mlog_exit(status);
@@ -1749,10 +1982,10 @@ bail:
 }
 
 static void ocfs2_init_inode_ac_group(struct inode *dir,
-                                     struct buffer_head *parent_fe_bh,
+                                     struct buffer_head *parent_di_bh,
                                      struct ocfs2_alloc_context *ac)
 {
-       struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
        /*
         * Try to allocate inodes from some specific group.
         *
@@ -1766,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
        if (OCFS2_I(dir)->ip_last_used_group &&
            OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
                ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
-       else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
-               ac->ac_last_group = ocfs2_which_suballoc_group(
-                                       le64_to_cpu(fe->i_blkno),
-                                       le16_to_cpu(fe->i_suballoc_bit));
+       else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
+               if (di->i_suballoc_loc)
+                       ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
+               else
+                       ac->ac_last_group = ocfs2_which_suballoc_group(
+                                       le64_to_cpu(di->i_blkno),
+                                       le16_to_cpu(di->i_suballoc_bit));
+       }
 }
 
 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1779,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
        OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
 }
 
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
-                         handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
                          struct inode *dir,
                          struct buffer_head *parent_fe_bh,
                          struct ocfs2_alloc_context *ac,
+                         u64 *suballoc_loc,
                          u16 *suballoc_bit,
                          u64 *fe_blkno)
 {
        int status;
-       unsigned int num_bits;
-       u64 bg_blkno;
+       struct ocfs2_suballoc_result res;
 
        mlog_entry_void();
 
@@ -1800,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
 
        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
 
-       status = ocfs2_claim_suballoc_bits(osb,
-                                          ac,
+       status = ocfs2_claim_suballoc_bits(ac,
                                           handle,
                                           1,
                                           1,
-                                          suballoc_bit,
-                                          &num_bits,
-                                          &bg_blkno);
+                                          &res);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
-       atomic_inc(&osb->alloc_stats.bg_allocs);
+       atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
 
-       BUG_ON(num_bits != 1);
+       BUG_ON(res.sr_bits != 1);
 
-       *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+       *suballoc_loc = res.sr_bg_blkno;
+       *suballoc_bit = res.sr_bit_offset;
+       *fe_blkno = res.sr_blkno;
        ac->ac_bits_given++;
        ocfs2_save_inode_ac_group(dir, ac);
        status = 0;
@@ -1886,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
  * contig. allocation, set to '1' to indicate we can deal with extents
  * of any size.
  */
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
-                          handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
                           struct ocfs2_alloc_context *ac,
                           u32 min_clusters,
                           u32 max_clusters,
@@ -1896,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
 {
        int status;
        unsigned int bits_wanted = max_clusters;
-       u64 bg_blkno = 0;
-       u16 bg_bit_off;
+       struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+       struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
 
        mlog_entry_void();
 
@@ -1907,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
               && ac->ac_which != OCFS2_AC_USE_MAIN);
 
        if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+               WARN_ON(min_clusters > 1);
+
                status = ocfs2_claim_local_alloc_bits(osb,
                                                      handle,
                                                      ac,
@@ -1929,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
                if (bits_wanted > (osb->bitmap_cpg - 1))
                        bits_wanted = osb->bitmap_cpg - 1;
 
-               status = ocfs2_claim_suballoc_bits(osb,
-                                                  ac,
+               status = ocfs2_claim_suballoc_bits(ac,
                                                   handle,
                                                   bits_wanted,
                                                   min_clusters,
-                                                  &bg_bit_off,
-                                                  num_clusters,
-                                                  &bg_blkno);
+                                                  &res);
                if (!status) {
+                       BUG_ON(res.sr_blkno); /* cluster alloc can't set */
                        *cluster_start =
                                ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
-                                                                bg_blkno,
-                                                                bg_bit_off);
+                                                                res.sr_bg_blkno,
+                                                                res.sr_bit_offset);
                        atomic_inc(&osb->alloc_stats.bitmap_data);
+                       *num_clusters = res.sr_bits;
                }
        }
        if (status < 0) {
@@ -1958,8 +2193,7 @@ bail:
        return status;
 }
 
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
-                        handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
                         struct ocfs2_alloc_context *ac,
                         u32 min_clusters,
                         u32 *cluster_start,
@@ -1967,7 +2201,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
 {
        unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
 
-       return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
+       return __ocfs2_claim_clusters(handle, ac, min_clusters,
                                      bits_wanted, cluster_start, num_clusters);
 }
 
@@ -2023,9 +2257,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
        if (undo_fn)
                jbd_unlock_bh_state(group_bh);
 
-       status = ocfs2_journal_dirty(handle, group_bh);
-       if (status < 0)
-               mlog_errno(status);
+       ocfs2_journal_dirty(handle, group_bh);
 bail:
        return status;
 }
@@ -2092,12 +2324,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
                     count);
        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
        fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
-
-       status = ocfs2_journal_dirty(handle, alloc_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+       ocfs2_journal_dirty(handle, alloc_bh);
 
 bail:
        brelse(group_bh);
@@ -2126,6 +2353,8 @@ int ocfs2_free_dinode(handle_t *handle,
        u16 bit = le16_to_cpu(di->i_suballoc_bit);
        u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
+       if (di->i_suballoc_loc)
+               bg_blkno = le64_to_cpu(di->i_suballoc_loc);
        return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
                                        inode_alloc_bh, bit, bg_blkno, 1);
 }
@@ -2395,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
                                   struct buffer_head *alloc_bh, u64 blkno,
                                   u16 bit, int *res)
 {
-       struct ocfs2_dinode *alloc_fe;
+       struct ocfs2_dinode *alloc_di;
        struct ocfs2_group_desc *group;
        struct buffer_head *group_bh = NULL;
        u64 bg_blkno;
@@ -2404,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
        mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
                   (unsigned int)bit);
 
-       alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
-       if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+       alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
+       if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
                mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
                     (unsigned int)bit,
-                    ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+                    ocfs2_bits_per_group(&alloc_di->id2.i_chain));
                status = -EINVAL;
                goto bail;
        }
 
-       bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
-       status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+       if (alloc_di->i_suballoc_loc)
+               bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
+       else
+               bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+       status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
                                             &group_bh);
        if (status < 0) {
                mlog(ML_ERROR, "read group %llu failed %d\n",
index e0f46df..a017dd3 100644 (file)
 #ifndef _CHAINALLOC_H_
 #define _CHAINALLOC_H_
 
+struct ocfs2_suballoc_result;
 typedef int (group_search_t)(struct inode *,
                             struct buffer_head *,
                             u32,                       /* bits_wanted */
                             u32,                       /* min_bits */
                             u64,                       /* max_block */
-                            u16 *,                     /* *bit_off */
-                            u16 *);                    /* *bits_found */
+                            struct ocfs2_suballoc_result *);
+                                                       /* found bits */
 
 struct ocfs2_alloc_context {
        struct inode *ac_inode;    /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
        u64    ac_last_group;
        u64    ac_max_block;  /* Highest block number to allocate. 0 is
                                 is the same as ~0 - unlimited */
+
+       struct ocfs2_alloc_reservation  *ac_resv;
 };
 
 void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
                           u32 bits_wanted,
                           struct ocfs2_alloc_context **ac);
 
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
-                        handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
                         struct ocfs2_alloc_context *ac,
                         u32 bits_wanted,
+                        u64 *suballoc_loc,
                         u16 *suballoc_bit_start,
                         u32 *num_bits,
                         u64 *blkno_start);
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
-                         handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
                          struct inode *dir,
                          struct buffer_head *parent_fe_bh,
                          struct ocfs2_alloc_context *ac,
+                         u64 *suballoc_loc,
                          u16 *suballoc_bit,
                          u64 *fe_blkno);
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
-                        handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
                         struct ocfs2_alloc_context *ac,
                         u32 min_clusters,
                         u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
  * Use this variant of ocfs2_claim_clusters to specify a maxiumum
  * number of clusters smaller than the allocation reserved.
  */
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
-                          handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
                           struct ocfs2_alloc_context *ac,
                           u32 min_clusters,
                           u32 max_clusters,
index dee0319..1c2c39f 100644 (file)
@@ -94,7 +94,9 @@ struct mount_options
        unsigned long   mount_opt;
        unsigned int    atime_quantum;
        signed short    slot;
-       unsigned int    localalloc_opt;
+       int             localalloc_opt;
+       unsigned int    resv_level;
+       int             dir_resv_level;
        char            cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 };
 
@@ -176,6 +178,8 @@ enum {
        Opt_noacl,
        Opt_usrquota,
        Opt_grpquota,
+       Opt_resv_level,
+       Opt_dir_resv_level,
        Opt_err,
 };
 
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
        {Opt_noacl, "noacl"},
        {Opt_usrquota, "usrquota"},
        {Opt_grpquota, "grpquota"},
+       {Opt_resv_level, "resv_level=%u"},
+       {Opt_dir_resv_level, "dir_resv_level=%u"},
        {Opt_err, NULL}
 };
 
@@ -1028,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        osb->s_atime_quantum = parsed_options.atime_quantum;
        osb->preferred_slot = parsed_options.slot;
        osb->osb_commit_interval = parsed_options.commit_interval;
-       osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
-       osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+       ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
+       osb->osb_resv_level = parsed_options.resv_level;
+       osb->osb_dir_resv_level = parsed_options.resv_level;
+       if (parsed_options.dir_resv_level == -1)
+               osb->osb_dir_resv_level = parsed_options.resv_level;
+       else
+               osb->osb_dir_resv_level = parsed_options.dir_resv_level;
 
        status = ocfs2_verify_userspace_stack(osb, &parsed_options);
        if (status)
@@ -1285,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb,
                   options ? options : "(none)");
 
        mopt->commit_interval = 0;
-       mopt->mount_opt = 0;
+       mopt->mount_opt = OCFS2_MOUNT_NOINTR;
        mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
        mopt->slot = OCFS2_INVALID_SLOT;
-       mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+       mopt->localalloc_opt = -1;
        mopt->cluster_stack[0] = '\0';
+       mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+       mopt->dir_resv_level = -1;
 
        if (!options) {
                status = 1;
@@ -1380,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb,
                                status = 0;
                                goto bail;
                        }
-                       if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+                       if (option >= 0)
                                mopt->localalloc_opt = option;
                        break;
                case Opt_localflocks:
@@ -1433,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb,
                        mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
                        mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
                        break;
+               case Opt_resv_level:
+                       if (is_remount)
+                               break;
+                       if (match_int(&args[0], &option)) {
+                               status = 0;
+                               goto bail;
+                       }
+                       if (option >= OCFS2_MIN_RESV_LEVEL &&
+                           option < OCFS2_MAX_RESV_LEVEL)
+                               mopt->resv_level = option;
+                       break;
+               case Opt_dir_resv_level:
+                       if (is_remount)
+                               break;
+                       if (match_int(&args[0], &option)) {
+                               status = 0;
+                               goto bail;
+                       }
+                       if (option >= OCFS2_MIN_RESV_LEVEL &&
+                           option < OCFS2_MAX_RESV_LEVEL)
+                               mopt->dir_resv_level = option;
+                       break;
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -1487,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
                           (unsigned) (osb->osb_commit_interval / HZ));
 
        local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
-       if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+       if (local_alloc_megs != ocfs2_la_default_mb(osb))
                seq_printf(s, ",localalloc=%d", local_alloc_megs);
 
        if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        else
                seq_printf(s, ",noacl");
 
+       if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+               seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+       if (osb->osb_dir_resv_level != osb->osb_resv_level) /* overridden */
+               seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
+
        return 0;
 }
 
@@ -1688,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data)
        oi->ip_blkno = 0ULL;
        oi->ip_clusters = 0;
 
+       ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
        ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
        ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
        ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
        init_waitqueue_head(&osb->osb_mount_event);
 
+       status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+       if (status) {
+               mlog_errno(status);
+               goto bail;
+       }
+
        osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
        if (!osb->vol_label) {
                mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
 
        osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+       osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
        iput(inode);
 
-       osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
+       osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+                                osb->s_feature_incompat) * 8;
 
        status = ocfs2_init_slot_info(osb);
        if (status < 0) {
@@ -2509,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb,
        ocfs2_handle_error(sb);
 }
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+       int rc;
+       sigset_t blocked;
+
+       sigfillset(&blocked);   /* build a mask containing every signal */
+       rc = sigprocmask(SIG_BLOCK, &blocked, oldset); /* old mask -> *oldset */
+       BUG_ON(rc);     /* per the note above: only a bad SIG_* can fail */
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+       int rc = sigprocmask(SIG_SETMASK, oldset, NULL); /* reinstall *oldset */
+       BUG_ON(rc);     /* SIG_SETMASK is a valid constant, so rc must be 0 */
+}
+
 module_init(ocfs2_init);
 module_exit(ocfs2_exit);
index 783f527..40c7de0 100644 (file)
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
 
 #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
 #endif /* OCFS2_SUPER_H */
index 3e77730..98ee6c4 100644 (file)
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
        struct ocfs2_alloc_context *meta_ac;
        struct ocfs2_alloc_context *data_ac;
        struct ocfs2_cached_dealloc_ctxt dealloc;
+       int set_abort;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE  (sizeof(struct ocfs2_xattr_def_value_root))
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
                goto leave;
        }
 
-       status = ocfs2_journal_dirty(handle, vb->vb_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
+       ocfs2_journal_dirty(handle, vb->vb_bh);
 
        clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
 
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
        }
 
        le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-
-       ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
+       ocfs2_journal_dirty(handle, vb->vb_bh);
 
        if (ext_flags & OCFS2_EXT_REFCOUNTED)
                ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
                                memset(bh->b_data + cp_len, 0,
                                       blocksize - cp_len);
 
-                       ret = ocfs2_journal_dirty(handle, bh);
-                       if (ret < 0) {
-                               mlog_errno(ret);
-                               goto out;
-                       }
+                       ocfs2_journal_dirty(handle, bh);
                        brelse(bh);
                        bh = NULL;
 
@@ -2148,15 +2136,19 @@ alloc_value:
                orig_clusters = ocfs2_xa_value_clusters(loc);
                rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
                if (rc < 0) {
-                       /*
-                        * If we tried to grow an existing external value,
-                        * ocfs2_xa_cleanuP-value_truncate() is going to
-                        * let it stand.  We have to restore its original
-                        * value size.
-                        */
-                       loc->xl_entry->xe_value_size = orig_value_size;
+                       ctxt->set_abort = 1;
                        ocfs2_xa_cleanup_value_truncate(loc, "growing",
                                                        orig_clusters);
+                       /*
+                        * If we were growing an existing value,
+                        * ocfs2_xa_cleanup_value_truncate() won't remove
+                        * the entry. We need to restore the original value
+                        * size.
+                        */
+                       if (loc->xl_entry) {
+                               BUG_ON(!orig_value_size);
+                               loc->xl_entry->xe_value_size = orig_value_size;
+                       }
                        mlog_errno(rc);
                }
        }
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        blk = le64_to_cpu(xb->xb_blkno);
        bit = le16_to_cpu(xb->xb_suballoc_bit);
-       bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+       if (xb->xb_suballoc_loc)
+               bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+       else
+               bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
        xb_alloc_inode = ocfs2_get_system_file_inode(osb,
                                EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
        spin_unlock(&oi->ip_lock);
 
-       ret = ocfs2_journal_dirty(handle, di_bh);
-       if (ret < 0)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, di_bh);
 out_commit:
        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
        spin_unlock(&oi->ip_lock);
 
-       ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
-       if (ret < 0)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(ctxt->handle, di_bh);
 
 out:
        return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
        int ret;
        u16 suballoc_bit_start;
        u32 num_got;
-       u64 first_blkno;
+       u64 suballoc_loc, first_blkno;
        struct ocfs2_dinode *di =  (struct ocfs2_dinode *)inode_bh->b_data;
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct buffer_head *new_bh = NULL;
        struct ocfs2_xattr_block *xblk;
 
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
                goto end;
        }
 
-       ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
-                                  &suballoc_bit_start, &num_got,
-                                  &first_blkno);
+       ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+                                  &suballoc_loc, &suballoc_bit_start,
+                                  &num_got, &first_blkno);
        if (ret < 0) {
                mlog_errno(ret);
                goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
        memset(xblk, 0, inode->i_sb->s_blocksize);
        strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
        xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+       xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
        xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
-       xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+       xblk->xb_fs_generation =
+               cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
        xblk->xb_blkno = cpu_to_le64(first_blkno);
        if (indexed) {
                struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
                ret = ocfs2_xa_set(&loc, xi, ctxt);
                if (!ret)
                        xs->here = loc.xl_entry;
-               else if (ret != -ENOSPC)
+               else if ((ret != -ENOSPC) || ctxt->set_abort)
                        goto end;
                else {
                        ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
                                goto out;
                        }
 
-                       ret = ocfs2_extend_trans(ctxt->handle, credits +
-                                       ctxt->handle->h_buffer_credits);
+                       ret = ocfs2_extend_trans(ctxt->handle, credits);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
                        ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
-               } else if (ret == -ENOSPC) {
+               } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
                        if (di->i_xattr_loc && !xbs->xattr_bh) {
                                ret = ocfs2_xattr_block_find(inode,
                                                             xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
                                        goto out;
                                }
 
-                               ret = ocfs2_extend_trans(ctxt->handle, credits +
-                                       ctxt->handle->h_buffer_credits);
+                               ret = ocfs2_extend_trans(ctxt->handle, credits);
                                if (ret) {
                                        mlog_errno(ret);
                                        goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
                                        goto out;
                                }
 
-                               ret = ocfs2_extend_trans(ctxt->handle, credits +
-                                               ctxt->handle->h_buffer_credits);
+                               ret = ocfs2_extend_trans(ctxt->handle, credits);
                                if (ret) {
                                        mlog_errno(ret);
                                        goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
        u32 bit_off, len;
        u64 blkno;
        handle_t *handle = ctxt->handle;
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct buffer_head *xb_bh = xs->xattr_bh;
        struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
                goto out;
        }
 
-       ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+       ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
                                     1, 1, &bit_off, &len);
        if (ret) {
                mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
         * We need to update the first bucket of the old extent and all
         * the buckets going to the new extent.
         */
-       credits = ((num_buckets + 1) * blks_per_bucket) +
-               handle->h_buffer_credits;
+       credits = ((num_buckets + 1) * blks_per_bucket);
        ret = ocfs2_extend_trans(handle, credits);
        if (ret) {
                mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
                                      u32 *first_hash)
 {
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-       int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+       int ret, credits = 2 * blk_per_bucket;
 
        BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
                goto leave;
        }
 
-       ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+       ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
                                     clusters_to_add, &bit_off, &num_bits);
        if (ret < 0) {
                if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
                goto leave;
        }
 
-       ret = ocfs2_journal_dirty(handle, root_bh);
-       if (ret < 0)
-               mlog_errno(ret);
+       ocfs2_journal_dirty(handle, root_bh);
 
 leave:
        return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
         * existing bucket.  Then we add the last existing bucket, the
         * new bucket, and the first bucket (3 * blk_per_bucket).
         */
-       credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-                 handle->h_buffer_credits;
+       credits = (end_blk - target_blk) + (3 * blk_per_bucket);
        ret = ocfs2_extend_trans(handle, credits);
        if (ret) {
                mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
        }
 
        le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
-       ret = ocfs2_journal_dirty(handle, root_bh);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_commit;
-       }
+       ocfs2_journal_dirty(handle, root_bh);
 
        ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
        if (ret)
@@ -6935,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
                goto out;
        }
 
-       ret = ocfs2_claim_clusters(osb, handle, data_ac,
+       ret = ocfs2_claim_clusters(handle, data_ac,
                                   len, &p_cluster, &num_clusters);
        if (ret) {
                mlog_errno(ret);