+ if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
+ /*
+ * Left-right contig implies this.
+ */
+ BUG_ON(!ctxt->c_split_covers_rec);
+
+ /*
+ * Since the leftright insert always covers the entire
+ * extent, this call will delete the insert record
+ * entirely, resulting in an empty extent record added to
+ * the extent block.
+ *
+ * Since the adding of an empty extent shifts
+ * everything back to the right, there's no need to
+ * update split_index here.
+ *
+ * When the split_index is zero, we need to merge it to the
+ * previous extent block. It is more efficient and easier
+ * if we do merge_right first and merge_left later.
+ */
+ ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We can only get this from logic error above.
+ */
+ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+ /* The merge left us with an empty extent, remove it. */
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ rec = &el->l_recs[split_index];
+
+ /*
+ * Note that we don't pass split_rec here on purpose -
+ * we've merged it into the rec already.
+ */
+ ret = ocfs2_merge_rec_left(path, handle, et, rec,
+ dealloc, split_index);
+
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ /*
+ * Error from this last rotate is not critical, so
+ * print but don't bubble it up.
+ */
+ if (ret)
+ mlog_errno(ret);
+ ret = 0;
+ } else {
+ /*
+ * Merge a record to the left or right.
+ *
+ * 'contig_type' is relative to the existing record,
+ * so for example, if we're "right contig", it's to
+ * the record on the left (hence the left merge).
+ */
+ if (ctxt->c_contig_type == CONTIG_RIGHT) {
+ ret = ocfs2_merge_rec_left(path, handle, et,
+ split_rec, dealloc,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ ret = ocfs2_merge_rec_right(path, handle,
+ et, split_rec,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (ctxt->c_split_covers_rec) {
+ /*
+ * The merge may have left an empty extent in
+ * our leaf. Try to rotate it away.
+ */
+ ret = ocfs2_rotate_tree_left(handle, et, path,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+ ret = 0;
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Shrink @rec by the number of clusters described in @split_rec.
+ *
+ * For SPLIT_LEFT the removed region sits at the start of @rec, so the
+ * record's starting cpos and block number are advanced past it before
+ * the length is reduced.  For every other split type the region sits
+ * at the end of @rec and only the length is reduced.
+ */
+static void ocfs2_subtract_from_rec(struct super_block *sb,
+				    enum ocfs2_split_type split,
+				    struct ocfs2_extent_rec *rec,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	u16 split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+
+	if (split == SPLIT_LEFT) {
+		/*
+		 * The split region covers the left edge of the
+		 * existing record: move its start forward.
+		 */
+		le32_add_cpu(&rec->e_cpos, split_clusters);
+		le64_add_cpu(&rec->e_blkno,
+			     ocfs2_clusters_to_blocks(sb, split_clusters));
+	}
+
+	/* Left and right splits both shorten the record by the same amount. */
+	le16_add_cpu(&rec->e_leaf_clusters, -split_clusters);
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ *
+ * @el must be a leaf list (l_tree_depth == 0). The cases are tried in
+ * this order:
+ * - split insert: trim the record covering insert_rec's cpos, then
+ * rotate insert_rec into the leaf;
+ * - contiguous insert: grow the record at ins_contig_index in place;
+ * - empty leaf: store insert_rec as record 0;
+ * - tail append: store insert_rec right after the last record;
+ * - anything else: rotate insert_rec into the leaf.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_insert_type *insert)
+{
+ int i = insert->ins_contig_index;
+ unsigned int range;
+ struct ocfs2_extent_rec *rec;
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ /*
+ * Split insert - shrink the record that currently covers the
+ * start of insert_rec, then fall through to the rotate below so
+ * insert_rec gets a slot of its own.
+ */
+ if (insert->ins_split != SPLIT_NONE) {
+ i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
+ BUG_ON(i == -1);
+ rec = &el->l_recs[i];
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ insert->ins_split, rec,
+ insert_rec);
+ goto rotate;
+ }
+
+ /*
+ * Contiguous insert - either left or right.
+ *
+ * For CONTIG_LEFT the existing record takes over the start
+ * (cpos and blkno) of insert_rec; in both cases its length grows
+ * by insert_rec's cluster count.
+ */
+ if (insert->ins_contig != CONTIG_NONE) {
+ rec = &el->l_recs[i];
+ if (insert->ins_contig == CONTIG_LEFT) {
+ rec->e_blkno = insert_rec->e_blkno;
+ rec->e_cpos = insert_rec->e_cpos;
+ }
+ le16_add_cpu(&rec->e_leaf_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ return;
+ }
+
+ /*
+ * Handle insert into an empty leaf: either no records at all, or
+ * a single record that is an empty extent.
+ */
+ if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+ ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+ ocfs2_is_empty_extent(&el->l_recs[0]))) {
+ el->l_recs[0] = *insert_rec;
+ el->l_next_free_rec = cpu_to_le16(1);
+ return;
+ }
+
+ /*
+ * Appending insert.
+ */
+ if (insert->ins_appending == APPEND_TAIL) {
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[i];
+ range = le32_to_cpu(rec->e_cpos)
+ + le16_to_cpu(rec->e_leaf_clusters);
+ /* The new record must start at or after the end of the last one. */
+ BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+
+ /* There must be a free slot left in this leaf. */
+ mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+ le16_to_cpu(el->l_count),
+ "owner %llu, depth %u, count %u, next free %u, "
+ "rec.cpos %u, rec.clusters %u, "
+ "insert.cpos %u, insert.clusters %u\n",
+ ocfs2_metadata_cache_owner(et->et_ci),
+ le16_to_cpu(el->l_tree_depth),
+ le16_to_cpu(el->l_count),
+ le16_to_cpu(el->l_next_free_rec),
+ le32_to_cpu(el->l_recs[i].e_cpos),
+ le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+ le32_to_cpu(insert_rec->e_cpos),
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ i++;
+ el->l_recs[i] = *insert_rec;
+ le16_add_cpu(&el->l_next_free_rec, 1);
+ return;
+ }
+
+rotate:
+ /*
+ * Ok, we have to rotate.
+ *
+ * At this point, it is safe to assume that inserting into an
+ * empty leaf and appending to a leaf have both been handled
+ * above.
+ *
+ * This leaf needs to have space, either by the empty 1st
+ * extent record, or by virtue of an l_next_free_rec < l_count.
+ */
+ ocfs2_rotate_leaf(el, insert_rec);
+}
+
+/*
+ * Walk every interior node of @path (all levels above the leaf) and
+ * stretch the last record of each so that it ends where @insert_rec
+ * ends.  Each modified buffer is marked dirty in @handle; a failure to
+ * dirty is logged but not reported to the caller.
+ *
+ * An interior list with no records is reported through ocfs2_error()
+ * and ends the walk early.
+ */
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+					   struct ocfs2_extent_tree *et,
+					   struct ocfs2_path *path,
+					   struct ocfs2_extent_rec *insert_rec)
+{
+	int level, status;
+
+	/*
+	 * Update everything except the leaf block.
+	 */
+	for (level = 0; level < path->p_tree_depth; level++) {
+		struct ocfs2_extent_list *node_el = path->p_node[level].el;
+		struct ocfs2_extent_rec *last;
+		int used = le16_to_cpu(node_el->l_next_free_rec);
+
+		if (used == 0) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu has a bad extent list",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+			return;
+		}
+
+		last = &node_el->l_recs[used - 1];
+
+		/*
+		 * New length = (end of insert_rec) - (start of this
+		 * record), built up in place.
+		 */
+		last->e_int_clusters = insert_rec->e_cpos;
+		le32_add_cpu(&last->e_int_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		le32_add_cpu(&last->e_int_clusters,
+			     -le32_to_cpu(last->e_cpos));
+
+		status = ocfs2_journal_dirty(handle, path->p_node[level].bh);
+		if (status)
+			mlog_errno(status);
+	}
+}
+
+/*
+ * Prepare @right_path for a tail append of @insert_rec.
+ *
+ * The whole of @right_path is passed to the journal and the last
+ * record of each of its interior nodes is extended to cover
+ * @insert_rec (see ocfs2_adjust_rightmost_records()).
+ *
+ * If the leaf of @right_path is empty (no records, or only an empty
+ * extent) and ocfs2_find_cpos_for_left_leaf() reports a non-zero cpos,
+ * the path to that left leaf is looked up and handed back through
+ * @ret_left_path; otherwise *@ret_left_path is NULL.
+ *
+ * Returns 0 on success. On failure the error of the failing helper (or
+ * -ENOMEM) is returned, any left path allocated here is freed and
+ * *@ret_left_path stays NULL.
+ */
+static int ocfs2_append_rec_to_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_path *right_path,
+ struct ocfs2_path **ret_left_path)
+{
+ int ret, next_free;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_path *left_path = NULL;
+
+ *ret_left_path = NULL;
+
+ /*
+ * This shouldn't happen for non-trees. The extent rec cluster
+ * count manipulation below only works for interior nodes.
+ */
+ BUG_ON(right_path->p_tree_depth == 0);
+
+ /*
+ * If our appending insert is at the leftmost edge of a leaf,
+ * then we might need to update the rightmost records of the
+ * neighboring path.
+ */
+ el = path_leaf_el(right_path);
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0 ||
+ (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+ u32 left_cpos;
+
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ right_path, &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "Append may need a left path update. cpos: %u, "
+ "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
+ left_cpos);
+
+ /*
+ * No need to worry if the append is already in the
+ * leftmost leaf.
+ */
+ if (left_cpos) {
+ left_path = ocfs2_new_path_from_path(right_path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * ocfs2_insert_path() will pass the left_path to the
+ * journal for us.
+ */
+ }
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
+
+ /* Success: ownership of left_path (possibly NULL) moves to the caller. */
+ *ret_left_path = left_path;
+ ret = 0;
+out:
+ /* On failure the left path never reached the caller, so drop it here. */
+ if (ret != 0)
+ ocfs2_free_path(left_path);
+
+ return ret;
+}
+
+/*
+ * Carve @split_rec out of the existing record that covers its cpos and
+ * insert it as a record of its own.
+ *
+ * The covering record is searched for in the leaf of @right_path first
+ * and, if not found there, in the leaf of @left_path (which must then
+ * exist). The covering record is trimmed with
+ * ocfs2_subtract_from_rec() and @split_rec is rotated into whichever
+ * leaf was picked for the insert. @left_path may be NULL.
+ */
+static void ocfs2_split_record(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ struct ocfs2_extent_rec *split_rec,
+ enum ocfs2_split_type split)
+{
+ int index;
+ u32 cpos = le32_to_cpu(split_rec->e_cpos);
+ struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
+ struct ocfs2_extent_rec *rec, *tmprec;
+
+ right_el = path_leaf_el(right_path);
+ if (left_path)
+ left_el = path_leaf_el(left_path);
+
+ /*
+ * 'el' is the leaf holding the record to trim, 'insert_el' is the
+ * leaf that receives split_rec. Both default to the right leaf.
+ */
+ el = right_el;
+ insert_el = right_el;
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index != -1) {
+ if (index == 0 && left_path) {
+ BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+
+ /*
+ * This typically means that the record
+ * started in the left path but moved to the
+ * right as a result of rotation. We either
+ * move the existing record to the left, or we
+ * do the later insert there.
+ *
+ * In this case, the left path should always
+ * exist as the rotate code will have passed
+ * it back for a post-insert update.
+ */
+
+ if (split == SPLIT_LEFT) {
+ /*
+ * It's a left split. Since we know
+ * that the rotate code gave us an
+ * empty extent in the left path, we
+ * can just do the insert there.
+ */
+ insert_el = left_el;
+ } else {
+ /*
+ * Right split - we have to move the
+ * existing record over to the left
+ * leaf. The insert will be into the
+ * newly created empty extent in the
+ * right leaf.
+ */
+ tmprec = &right_el->l_recs[index];
+ ocfs2_rotate_leaf(left_el, tmprec);
+ el = left_el;
+
+ /* Zero the old slot and find the record again in the left leaf. */
+ memset(tmprec, 0, sizeof(*tmprec));
+ index = ocfs2_search_extent_list(left_el, cpos);
+ BUG_ON(index == -1);
+ }
+ }
+ } else {
+ /* Not in the right leaf, so it has to be in the left one. */
+ BUG_ON(!left_path);
+ BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
+ /*
+ * Left path is easy - we can just allow the insert to
+ * happen.
+ */
+ el = left_el;
+ insert_el = left_el;
+ index = ocfs2_search_extent_list(el, cpos);
+ BUG_ON(index == -1);
+ }
+
+ /* Trim the covering record, then give split_rec its own slot. */
+ rec = &el->l_recs[index];
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ split, rec, split_rec);
+ ocfs2_rotate_leaf(insert_el, split_rec);
+}
+
+/*
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ *
+ * Returns 0 on success, or the error from extending the transaction
+ * or from passing either path to the journal. Failures to dirty a
+ * leaf buffer are only logged; the function still returns 0.
+ */
+static int ocfs2_insert_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_insert_type *insert)
+{
+ int ret, subtree_index;
+ struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+
+ if (left_path) {
+ int credits = handle->h_buffer_credits;
+
+ /*
+ * There's a chance that left_path got passed back to
+ * us without being accounted for in the
+ * journal. Extend our transaction here to be sure we
+ * can change those blocks.
+ */
+ credits += left_path->p_tree_depth;
+
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /*
+ * Pass both paths to the journal. The majority of inserts
+ * will be touching all components anyway.
+ */
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (insert->ins_split != SPLIT_NONE) {
+ /*
+ * We could call ocfs2_insert_at_leaf() for some types
+ * of splits, but it's easier to just let one separate
+ * function sort it all out.
+ */
+ ocfs2_split_record(et, left_path, right_path,
+ insert_rec, insert->ins_split);
+
+ /*
+ * Split might have modified either leaf and we don't
+ * have a guarantee that the later edge insert will
+ * dirty this for us.
+ *
+ * When left_path is NULL, ret still holds the result of
+ * the journal access on right_path above.
+ */
+ if (left_path)
+ ret = ocfs2_journal_dirty(handle,
+ path_leaf_bh(left_path));
+ if (ret)
+ mlog_errno(ret);
+ } else
+ ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+ insert);
+
+ /* The right leaf is always dirtied; a failure is logged only. */
+ ret = ocfs2_journal_dirty(handle, leaf_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ if (left_path) {
+ /*
+ * The rotate code has indicated that we need to fix
+ * up portions of the tree after the insert.
+ *
+ * XXX: Should we extend the transaction here?
+ */
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
+ }
+
+ /* Any error from the dirty calls above is deliberately dropped here. */
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * Insert @insert_rec into the tree rooted at @et, as described by
+ * @type.
+ *
+ * For a depth-0 tree the record goes straight into the root list via
+ * ocfs2_insert_at_leaf(). Otherwise a path to the target leaf is
+ * looked up, the tree is rotated right or prepared for an append when
+ * @type asks for it, and ocfs2_insert_path() does the insert.
+ *
+ * Unless this is a split insert, the cluster count of @et is updated
+ * afterwards. The root buffer is dirtied last, and the result of that
+ * call is what a successful insert returns.
+ *
+ * Both paths allocated here are freed before returning.
+ */
+static int ocfs2_do_insert_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_insert_type *type)
+{
+ int ret, rotate = 0;
+ u32 cpos;
+ struct ocfs2_path *right_path = NULL;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *el;
+
+ el = et->et_root_el;
+
+ ret = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* No tree below the root: the root list itself is the leaf. */
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ ocfs2_insert_at_leaf(et, insert_rec, el, type);
+ goto out_update_clusters;
+ }
+
+ right_path = ocfs2_new_path_from_et(et);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Determine the path to start with. Rotations need the
+ * rightmost path, everything else can go directly to the
+ * target leaf.
+ */
+ cpos = le32_to_cpu(insert_rec->e_cpos);
+ if (type->ins_appending == APPEND_NONE &&
+ type->ins_contig == CONTIG_NONE) {
+ rotate = 1;
+ cpos = UINT_MAX;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, right_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Rotations and appends need special treatment - they modify
+ * parts of the tree above them.
+ *
+ * Both might pass back a path immediately to the left of the
+ * one being inserted to. This will cause
+ * ocfs2_insert_path() to modify the rightmost records of
+ * left_path to account for an edge insert.
+ *
+ * XXX: When modifying this code, keep in mind that an insert
+ * can wind up skipping both of these two special cases...
+ */
+ if (rotate) {
+ ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
+ le32_to_cpu(insert_rec->e_cpos),
+ right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * ocfs2_rotate_tree_right() might have extended the
+ * transaction without re-journaling our tree root.
+ */
+ ret = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else if (type->ins_appending == APPEND_TAIL
+ && type->ins_contig != CONTIG_LEFT) {
+ ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
+ right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_insert_path(handle, et, left_path, right_path,
+ insert_rec, type);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out_update_clusters:
+ /* A split only re-arranges existing clusters, so the count stays put. */
+ if (type->ins_split == SPLIT_NONE)
+ ocfs2_et_update_clusters(et,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+
+ ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_free_path(left_path);
+ ocfs2_free_path(right_path);
+
+ return ret;
+}
+
+/*
+ * Work out whether @split_rec could be merged with the neighbours of
+ * the record at @index in leaf list @el (the leaf of @path).
+ *
+ * The left neighbour is el->l_recs[index - 1] or, for index 0 in a
+ * tree with depth, the last record of the leaf to the left. The right
+ * neighbour is el->l_recs[index + 1] or, when @index is the last record
+ * of a full leaf in a tree with depth, the first non-empty record of
+ * the leaf to the right.
+ *
+ * Returns CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT or CONTIG_LEFTRIGHT.
+ * Lookup and allocation failures are not reported: the function just
+ * stops looking and returns whatever it has determined so far.
+ */
+static enum ocfs2_contig_type
+ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_extent_list *el, int index,
+ struct ocfs2_extent_rec *split_rec)
+{
+ int status;
+ enum ocfs2_contig_type ret = CONTIG_NONE;
+ u32 left_cpos, right_cpos;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_extent_list *new_el;
+ struct ocfs2_path *left_path = NULL, *right_path = NULL;
+ struct buffer_head *bh;
+ struct ocfs2_extent_block *eb;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+
+ /* Step 1: find the record to the left of 'index', if there is one. */
+ if (index > 0) {
+ rec = &el->l_recs[index - 1];
+ } else if (path->p_tree_depth > 0) {
+ status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
+ if (status)
+ goto out;
+
+ if (left_cpos != 0) {
+ left_path = ocfs2_new_path_from_path(path);
+ if (!left_path)
+ goto out;
+
+ status = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
+ if (status)
+ goto out;
+
+ new_el = path_leaf_el(left_path);
+
+ /* A leaf with a right neighbour is expected to be full. */
+ if (le16_to_cpu(new_el->l_next_free_rec) !=
+ le16_to_cpu(new_el->l_count)) {
+ bh = path_leaf_bh(left_path);
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ ocfs2_error(sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of "
+ "%d. It should have "
+ "matched the l_count of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec),
+ le16_to_cpu(new_el->l_count));
+ status = -EINVAL;
+ goto out;
+ }
+ rec = &new_el->l_recs[
+ le16_to_cpu(new_el->l_next_free_rec) - 1];
+ }
+ }
+
+ /*
+ * We're careful to check for an empty extent record here -
+ * the merge code will know what to do if it sees one.
+ */
+ if (rec) {
+ if (index == 1 && ocfs2_is_empty_extent(rec)) {
+ if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+ ret = CONTIG_RIGHT;
+ } else {
+ ret = ocfs2_et_extent_contig(et, rec, split_rec);
+ }
+ }
+
+ /* Step 2: find the record to the right of 'index', if there is one. */
+ rec = NULL;
+ if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+ rec = &el->l_recs[index + 1];
+ else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+ path->p_tree_depth > 0) {
+ status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
+ if (status)
+ goto out;
+
+ if (right_cpos == 0)
+ goto out;
+
+ right_path = ocfs2_new_path_from_path(path);
+ if (!right_path)
+ goto out;
+
+ status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+ if (status)
+ goto out;
+
+ new_el = path_leaf_el(right_path);
+ rec = &new_el->l_recs[0];
+ /* Skip a leading empty extent; a real record must follow it. */
+ if (ocfs2_is_empty_extent(rec)) {
+ if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+ bh = path_leaf_bh(right_path);
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ ocfs2_error(sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec));
+ status = -EINVAL;
+ goto out;
+ }
+ rec = &new_el->l_recs[1];
+ }
+ }
+
+ /*
+ * Step 3: combine both sides. Contiguity on both sides becomes
+ * CONTIG_LEFTRIGHT; the right-hand result is only used on its own
+ * when the left side found nothing.
+ */
+ if (rec) {
+ enum ocfs2_contig_type contig_type;
+
+ contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
+
+ if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+ ret = CONTIG_LEFTRIGHT;
+ else if (ret == CONTIG_NONE)
+ ret = contig_type;
+ }
+
+out:
+ if (left_path)
+ ocfs2_free_path(left_path);
+ if (right_path)
+ ocfs2_free_path(right_path);
+
+ return ret;
+}
+
+/*
+ * Scan leaf list @el for the first record that @insert_rec is
+ * contiguous with and record the outcome in @insert.
+ *
+ * insert->ins_contig is always set.  insert->ins_contig_index is only
+ * written when a contiguous record was found.  If the tree defines a
+ * maximum leaf extent size and the merged record would exceed it, the
+ * insert is reported as CONTIG_NONE (ins_contig_index stays set).
+ */
+static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
+				     struct ocfs2_insert_type *insert,
+				     struct ocfs2_extent_list *el,
+				     struct ocfs2_extent_rec *insert_rec)
+{
+	int idx = 0;
+	unsigned int merged_len;
+	enum ocfs2_contig_type found = CONTIG_NONE;
+	struct ocfs2_extent_rec *neighbor;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	while (idx < le16_to_cpu(el->l_next_free_rec)) {
+		found = ocfs2_et_extent_contig(et, &el->l_recs[idx],
+					       insert_rec);
+		if (found != CONTIG_NONE) {
+			insert->ins_contig_index = idx;
+			break;
+		}
+		idx++;
+	}
+	insert->ins_contig = found;
+
+	if (insert->ins_contig == CONTIG_NONE)
+		return;
+
+	/*
+	 * Caller might want us to limit the size of extents, don't
+	 * report contiguousness if merging would exceed that limit.
+	 */
+	neighbor = &el->l_recs[insert->ins_contig_index];
+	merged_len = le16_to_cpu(neighbor->e_leaf_clusters) +
+		     le16_to_cpu(insert_rec->e_leaf_clusters);
+
+	if (et->et_max_leaf_clusters &&
+	    (merged_len > et->et_max_leaf_clusters))
+		insert->ins_contig = CONTIG_NONE;
+}
+
+/*
+ * This should only be called against the rightmost leaf extent list.
+ *
+ * Decide whether @insert_rec has to go at the tail of that leaf and
+ * store the answer in insert->ins_appending: APPEND_TAIL when the leaf
+ * holds no real records (none at all, or a single empty extent) or
+ * when @insert_rec starts at or after the end of the last record,
+ * APPEND_NONE otherwise.
+ *
+ * This also works against the root extent list of a tree with depth
+ * 0, if the root extent list is viewed as the rightmost leaf node.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+					struct ocfs2_extent_list *el,
+					struct ocfs2_extent_rec *insert_rec)
+{
+	u16 next_free;
+	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
+	struct ocfs2_extent_rec *last;
+
+	insert->ins_appending = APPEND_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+
+	/*
+	 * Only a leaf with at least one real record can rule out a
+	 * tail append; an unused leaf, or one holding just an empty
+	 * extent, always appends.
+	 */
+	if (next_free != 0 &&
+	    !(next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		last = &el->l_recs[next_free - 1];
+
+		/* Starts before the end of the last record: not an append. */
+		if (insert_cpos <
+		    (le32_to_cpu(last->e_cpos) +
+		     le16_to_cpu(last->e_leaf_clusters)))
+			return;
+	}
+
+	insert->ins_appending = APPEND_TAIL;
+}
+
+/*
+ * Helper function called at the beginning of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ *   - Whether the new extent is contiguous with an existing one.
+ *   - The current tree depth.
+ *   - Whether the insert is an appending one.
+ *   - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure, except the free record count, which is returned through
+ * @free_records.
+ *
+ * For a tree with depth, the rightmost extent block is read and, on
+ * success, handed back through @last_eb_bh (the caller releases it).
+ * For a depth-0 tree *@last_eb_bh is left untouched.
+ *
+ * Returns 0 on success, otherwise the error from reading the extent
+ * block, from the path lookup, or -ENOMEM.
+ */
+static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
+				    struct buffer_head **last_eb_bh,
+				    struct ocfs2_extent_rec *insert_rec,
+				    int *free_records,
+				    struct ocfs2_insert_type *insert)
+{
+	int ret;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *path = NULL;
+	struct buffer_head *bh = NULL;
+
+	insert->ins_split = SPLIT_NONE;
+
+	el = et->et_root_el;
+	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+
+	if (el->l_tree_depth) {
+		/*
+		 * If we have tree depth, we read in the
+		 * rightmost extent block ahead of time as
+		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
+		 * may want it later.
+		 */
+		ret = ocfs2_read_extent_block(et->et_ci,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &bh);
+		if (ret) {
+			/*
+			 * This is an error path, not the function exit:
+			 * log it like every other failure in here.
+			 */
+			mlog_errno(ret);
+			goto out;
+		}
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+	}
+
+	/*
+	 * Unless we have a contiguous insert, we'll need to know if
+	 * there is room left in our allocation tree for another
+	 * extent record.
+	 *
+	 * XXX: This test is simplistic, we can search for empty
+	 * extent records too.
+	 */
+	*free_records = le16_to_cpu(el->l_count) -
+		le16_to_cpu(el->l_next_free_rec);
+
+	if (!insert->ins_tree_depth) {
+		/* Depth 0: the root list is the only leaf, no path needed. */
+		ocfs2_figure_contig_type(et, insert, el, insert_rec);
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+		return 0;
+	}
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * In the case that we're inserting past what the tree
+	 * currently accounts for, ocfs2_find_path() will return for
+	 * us the rightmost tree path. This is accounted for below in
+	 * the appending code.
+	 */
+	ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	/*
+	 * Now that we have the path, there's two things we want to determine:
+	 * 1) Contiguousness (also set contig_index if this is so)
+	 *
+	 * 2) Are we doing an append? We can trivially break this up
+	 *     into two types of appends: simple record append, or a
+	 *     rotate inside the tail leaf.
+	 */
+	ocfs2_figure_contig_type(et, insert, el, insert_rec);
+
+	/*
+	 * The insert code isn't quite ready to deal with all cases of
+	 * left contiguousness. Specifically, if it's an insert into
+	 * the 1st record in a leaf, it will require the adjustment of
+	 * cluster count on the last record of the path directly to its
+	 * left. For now, just catch that case and fool the layers
+	 * above us. This works just fine for tree_depth == 0, which
+	 * is why we allow that above.
+	 */
+	if (insert->ins_contig == CONTIG_LEFT &&
+	    insert->ins_contig_index == 0)
+		insert->ins_contig = CONTIG_NONE;
+
+	/*
+	 * Ok, so we can simply compare against last_eb to figure out
+	 * whether the path doesn't exist. This will only happen in
+	 * the case that we're doing a tail append, so maybe we can
+	 * take advantage of that information somehow.
+	 */
+	if (ocfs2_et_get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
+		/*
+		 * Ok, ocfs2_find_path() returned us the rightmost
+		 * tree path. This might be an appending insert. There are
+		 * two cases:
+		 *    1) We're doing a true append at the tail:
+		 *	-This might even be off the end of the leaf
+		 *    2) We're "appending" by rotating in the tail
+		 */
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+	}
+
+out:
+	ocfs2_free_path(path);
+
+	/* Hand the rightmost block to the caller only on success. */
+	if (ret == 0)
+		*last_eb_bh = bh;
+	else
+		brelse(bh);
+	return ret;
+}
+
+/*
+ * Insert an extent into a btree.
+ *
+ * Builds an extent record from @cpos, @start_blk, @new_clusters and
+ * @flags, lets the tree validate it (ocfs2_et_insert_check()), works
+ * out the kind of insert, grows the tree via @meta_ac if the insert is
+ * not contiguous and no free record is left, and then does the insert.
+ * On success the record is also passed to ocfs2_et_extent_map_insert().
+ *
+ * Returns 0 on success or the error of the failing step.
+ *
+ * The caller needs to update the owning btree's cluster count.
+ */
+int ocfs2_insert_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos,
+ u64 start_blk,
+ u32 new_clusters,
+ u8 flags,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int status;
+ int uninitialized_var(free_records);
+ struct buffer_head *last_eb_bh = NULL;
+ struct ocfs2_insert_type insert = {0, };
+ struct ocfs2_extent_rec rec;
+
+ mlog(0, "add %u clusters at position %u to owner %llu\n",
+ new_clusters, cpos,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+
+ /* Build the on-disk (little-endian) record describing the new extent. */
+ memset(&rec, 0, sizeof(rec));
+ rec.e_cpos = cpu_to_le32(cpos);
+ rec.e_blkno = cpu_to_le64(start_blk);
+ /* NOTE(review): new_clusters is u32 but stored as 16 bits - confirm callers bound it. */
+ rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+ rec.e_flags = flags;
+ status = ocfs2_et_insert_check(et, &rec);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
+ &free_records, &insert);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
+ "Insert.contig_index: %d, Insert.free_records: %d, "
+ "Insert.tree_depth: %d\n",
+ insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
+ free_records, insert.ins_tree_depth);
+
+ /* A new record is needed but none is free: grow the tree first. */
+ if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
+ status = ocfs2_grow_tree(handle, et,
+ &insert.ins_tree_depth, &last_eb_bh,
+ meta_ac);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ /* Finally, we can add clusters. This might rotate the tree for us. */
+ status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
+ if (status < 0)
+ mlog_errno(status);
+ else
+ ocfs2_et_extent_map_insert(et, &rec);
+
+bail:
+ brelse(last_eb_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Allocate and add clusters into the extent b-tree.
+ * The new clusters (clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ *
+ * On success *logical_offset is advanced by the number of clusters
+ * that were actually inserted.
+ *
+ * Returns 0 when everything requested was added. Returns -EAGAIN when
+ * the caller has to restart: *reason_ret is then RESTART_META (more
+ * metadata must be reserved) or RESTART_TRANS (only part of the
+ * request was allocated). Any other error is returned as is.
+ * *reason_ret, if non-NULL, is always written.
+ */
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret)
+{
+ int status = 0;
+ int free_extents;
+ enum ocfs2_alloc_restarted reason = RESTART_NONE;
+ u32 bit_off, num_bits;
+ u64 block;
+ u8 flags = 0;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+ BUG_ON(!clusters_to_add);
+
+ if (mark_unwritten)
+ flags = OCFS2_EXT_UNWRITTEN;
+
+ free_extents = ocfs2_num_free_extents(osb, et);
+ if (free_extents < 0) {
+ status = free_extents;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* there are two cases which could cause us to EAGAIN in the
+ * we-need-more-metadata case:
+ * 1) we haven't reserved *any*
+ * 2) we are so fragmented, we've needed to add metadata too
+ * many times. */
+ if (!free_extents && !meta_ac) {
+ mlog(0, "we haven't reserved any metadata!\n");
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ } else if ((!free_extents)
+ && (ocfs2_alloc_context_bits_left(meta_ac)
+ < ocfs2_extend_meta_needed(et->et_root_el))) {
+ mlog(0, "filesystem is really fragmented...\n");
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ }
+
+ /* Claim between 1 and clusters_to_add clusters in one go. */
+ status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ clusters_to_add, &bit_off, &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ BUG_ON(num_bits > clusters_to_add);
+
+ /* reserve our write early -- insert_extent may update the tree root */
+ status = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ /* NOTE(review): the message says "block" but prints bit_off, the cluster offset - confirm intent. */
+ mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
+ num_bits, bit_off,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+ status = ocfs2_insert_extent(handle, et, *logical_offset, block,
+ num_bits, flags, meta_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ clusters_to_add -= num_bits;
+ *logical_offset += num_bits;
+
+ /* Fewer clusters than requested were claimed: ask the caller to come back. */
+ if (clusters_to_add) {
+ mlog(0, "need to alloc once more, wanted = %u\n",
+ clusters_to_add);
+ status = -EAGAIN;
+ reason = RESTART_TRANS;
+ }
+
+leave:
+ mlog_exit(status);
+ if (reason_ret)
+ *reason_ret = reason;
+ return status;
+}
+
+/*
+ * Fill @split_rec with a record describing the part of @rec that
+ * starts at @cpos and runs to the end of @rec.
+ *
+ * The block number is @rec's block number advanced by the blocks that
+ * correspond to the clusters between @rec's start and @cpos; the flags
+ * are copied from @rec.
+ */
+static void ocfs2_make_right_split_rec(struct super_block *sb,
+				       struct ocfs2_extent_rec *split_rec,
+				       u32 cpos,
+				       struct ocfs2_extent_rec *rec)
+{
+	u32 start = le32_to_cpu(rec->e_cpos);
+	u32 end = start + le16_to_cpu(rec->e_leaf_clusters);
+	u64 skipped_blocks = ocfs2_clusters_to_blocks(sb, cpos - start);
+
+	memset(split_rec, 0, sizeof(*split_rec));
+
+	split_rec->e_cpos = cpu_to_le32(cpos);
+	split_rec->e_leaf_clusters = cpu_to_le16(end - cpos);
+	split_rec->e_flags = rec->e_flags;
+	split_rec->e_blkno = cpu_to_le64(le64_to_cpu(rec->e_blkno) +
+					 skipped_blocks);
+}
+
+/*
+ * Split the record at @split_index in the leaf of @path so that
+ * @orig_split_rec becomes a record of its own.
+ *
+ * If @orig_split_rec touches the left or right edge of the existing
+ * record, a single split insert is done.  If it lies strictly inside,
+ * two passes are made: first the part to the right of @orig_split_rec
+ * is split off as a right split, then @path is looked up again and
+ * @orig_split_rec itself is inserted on the second pass.
+ *
+ * The tree is grown via @meta_ac whenever the rightmost leaf is full.
+ * @last_eb_bh must point at the rightmost extent block buffer for
+ * trees with depth; it is handed to ocfs2_grow_tree().
+ *
+ * Returns 0 on success or the error of the failing step.  -EIO is
+ * returned if the record can no longer be found after the first pass.
+ */
+static int ocfs2_split_and_insert(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
+				  struct ocfs2_path *path,
+				  struct buffer_head **last_eb_bh,
+				  int split_index,
+				  struct ocfs2_extent_rec *orig_split_rec,
+				  struct ocfs2_alloc_context *meta_ac)
+{
+	int ret = 0, depth;
+	unsigned int insert_range, rec_range, do_leftright = 0;
+	struct ocfs2_extent_rec tmprec;
+	struct ocfs2_extent_list *rightmost_el;
+	struct ocfs2_extent_rec rec;
+	struct ocfs2_extent_rec split_rec = *orig_split_rec;
+	struct ocfs2_insert_type insert;
+	struct ocfs2_extent_block *eb;
+
+leftright:
+	/*
+	 * Store a copy of the record on the stack - it might move
+	 * around as the tree is manipulated below.
+	 */
+	rec = path_leaf_el(path)->l_recs[split_index];
+
+	rightmost_el = et->et_root_el;
+
+	depth = le16_to_cpu(rightmost_el->l_tree_depth);
+	if (depth) {
+		BUG_ON(!(*last_eb_bh));
+		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
+		rightmost_el = &eb->h_list;
+	}
+
+	/* The split adds a record; make room if the rightmost leaf is full. */
+	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+	    le16_to_cpu(rightmost_el->l_count)) {
+		ret = ocfs2_grow_tree(handle, et,
+				      &depth, last_eb_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+	insert.ins_appending = APPEND_NONE;
+	insert.ins_contig = CONTIG_NONE;
+	insert.ins_tree_depth = depth;
+
+	insert_range = le32_to_cpu(split_rec.e_cpos) +
+		le16_to_cpu(split_rec.e_leaf_clusters);
+	rec_range = le32_to_cpu(rec.e_cpos) +
+		le16_to_cpu(rec.e_leaf_clusters);
+
+	if (split_rec.e_cpos == rec.e_cpos) {
+		insert.ins_split = SPLIT_LEFT;
+	} else if (insert_range == rec_range) {
+		insert.ins_split = SPLIT_RIGHT;
+	} else {
+		/*
+		 * Left/right split. We fake this as a right split
+		 * first and then make a second pass as a left split.
+		 */
+		insert.ins_split = SPLIT_RIGHT;
+
+		ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+					   &tmprec, insert_range, &rec);
+
+		split_rec = tmprec;
+
+		/* A second pass must never need another left/right split. */
+		BUG_ON(do_leftright);
+		do_leftright = 1;
+	}
+
+	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (do_leftright == 1) {
+		u32 cpos;
+		struct ocfs2_extent_list *el;
+
+		do_leftright++;
+		split_rec = *orig_split_rec;
+
+		/* The first pass may have moved things; look the path up again. */
+		ocfs2_reinit_path(path, 1);
+
+		cpos = le32_to_cpu(split_rec.e_cpos);
+		ret = ocfs2_find_path(et->et_ci, path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+		split_index = ocfs2_search_extent_list(el, cpos);
+		if (split_index == -1) {
+			/*
+			 * Never index l_recs[] with -1 at the top of the
+			 * second pass: treat a vanished record as a
+			 * corrupted tree.
+			 */
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu has an extent at cpos %u "
+				    "which can no longer be found",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos);
+			ret = -EIO;
+			goto out;
+		}
+		goto leftright;
+	}
+out:
+
+	return ret;
+}
+
+/*
+ * Overwrite the record at split_index in el with *split_rec.
+ *
+ * Journal access is taken on the last item of path before the record is
+ * changed, and the leaf buffer of path is marked dirty afterwards.
+ * Returns 0 on success or the error from the journal access call.
+ */
+static int ocfs2_replace_extent_rec(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    struct ocfs2_path *path,
+				    struct ocfs2_extent_list *el,
+				    int split_index,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	int status = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
+						  path_num_items(path) - 1);
+
+	if (status) {
+		mlog_errno(status);
+		return status;
+	}
+
+	el->l_recs[split_index] = *split_rec;
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+
+	return status;
+}
+
+/*
+ * Split part or all of the extent record at split_index in the leaf
+ * pointed to by path. Merge with the contiguous extent record if needed.
+ *
+ * Care is taken to handle contiguousness so as to not grow the tree.
+ *
+ * meta_ac is not strictly necessary - we only truly need it if growth
+ * of the tree is required. All other cases will degrade into a less
+ * optimal tree layout.
+ *
+ * The rightmost leaf block is read here (when the tree has depth) and
+ * released before returning. Since a split may grow the tree or a merge
+ * might shrink it, the caller cannot trust a previously cached copy of
+ * that block after this call.
+ *
+ * This code is optimized for readability - several passes might be
+ * made over certain portions of the tree. All of those blocks will
+ * have been brought into cache (and pinned via the journal), so the
+ * extra overhead is not expressed in terms of disk reads.
+ *
+ * Returns 0 on success, -EIO if split_rec is not fully contained in the
+ * record at split_index, or the error from a failing helper.
+ */
+int ocfs2_split_extent(handle_t *handle,
+		       struct ocfs2_extent_tree *et,
+		       struct ocfs2_path *path,
+		       int split_index,
+		       struct ocfs2_extent_rec *split_rec,
+		       struct ocfs2_alloc_context *meta_ac,
+		       struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	struct ocfs2_extent_list *el = path_leaf_el(path);
+	struct buffer_head *last_eb_bh = NULL;
+	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+	struct ocfs2_merge_ctxt ctxt;
+
+	/* The split range has to lie inside the existing record. */
+	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
+	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
+	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
+							    split_index,
+							    split_rec);
+
+	/*
+	 * ocfs2_split_and_insert() looks at the rightmost extent block to
+	 * see how much room is left in this allocation tree, so read it
+	 * up front when the tree has depth.
+	 */
+	if (path->p_tree_depth) {
+		ret = ocfs2_read_extent_block(et->et_ci,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &last_eb_bh);
+		if (ret) {
+			/* This is an error path, not a function-exit trace. */
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (rec->e_cpos == split_rec->e_cpos &&
+	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
+		ctxt.c_split_covers_rec = 1;
+	else
+		ctxt.c_split_covers_rec = 0;
+
+	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
+	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
+	     ctxt.c_split_covers_rec);
+
+	if (ctxt.c_contig_type == CONTIG_NONE) {
+		/* Nothing to merge with: replace in place or split. */
+		if (ctxt.c_split_covers_rec)
+			ret = ocfs2_replace_extent_rec(handle, et, path, el,
+						       split_index, split_rec);
+		else
+			ret = ocfs2_split_and_insert(handle, et, path,
+						     &last_eb_bh, split_index,
+						     split_rec, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+	} else {
+		ret = ocfs2_try_to_merge_extent(handle, et, path,
+						split_index, split_rec,
+						dealloc, &ctxt);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	brelse(last_eb_bh);
+	return ret;
+}
+
+/*
+ * Set and/or clear flags on the extent that already exists at cpos, for
+ * len clusters.
+ *
+ * new_flags:   flags to turn on; it is an error if the record already has
+ *              any of them.
+ * clear_flags: flags to turn off; it is an error if the record has none
+ *              of them.
+ * phys:        the physical cluster offset the changed extent starts at.
+ *
+ * When the existing extent is larger than the request it is split, and a
+ * merge with adjacent extents is attempted.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_change_extent_flag(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 len, u32 phys,
+			     struct ocfs2_alloc_context *meta_ac,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     int new_flags, int clear_flags)
+{
+	int ret, rec_idx;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 first_blkno = ocfs2_clusters_to_blocks(sb, phys);
+	struct ocfs2_extent_rec split_rec;
+	struct ocfs2_extent_rec *cur;
+	struct ocfs2_extent_list *leaf;
+	struct ocfs2_path *tree_path = ocfs2_new_path_from_et(et);
+
+	if (!tree_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, tree_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	leaf = path_leaf_el(tree_path);
+	rec_idx = ocfs2_search_extent_list(leaf, cpos);
+	if (rec_idx == -1 || rec_idx >= le16_to_cpu(leaf->l_next_free_rec)) {
+		ocfs2_error(sb,
+			    "Owner %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			     (unsigned long long)
+			     ocfs2_metadata_cache_owner(et->et_ci), cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	cur = &leaf->l_recs[rec_idx];
+
+	/* Refuse to set flags the record already carries. */
+	if (new_flags && (cur->e_flags & new_flags)) {
+		mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
+		     "extent that already had them",
+		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		     new_flags);
+		ret = -EIO;
+		goto out;
+	}
+
+	/* Refuse to clear flags the record does not carry. */
+	if (clear_flags && !(cur->e_flags & clear_flags)) {
+		mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
+		     "extent that didn't have them",
+		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		     clear_flags);
+		ret = -EIO;
+		goto out;
+	}
+
+	/* Describe the requested range with the adjusted flag set. */
+	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
+	split_rec.e_cpos = cpu_to_le32(cpos);
+	split_rec.e_leaf_clusters = cpu_to_le16(len);
+	split_rec.e_blkno = cpu_to_le64(first_blkno);
+	split_rec.e_flags = (cur->e_flags | new_flags) & ~clear_flags;
+
+	ret = ocfs2_split_extent(handle, et, tree_path,
+				 rec_idx, &split_rec, meta_ac,
+				 dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(tree_path);
+	return ret;
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters,
+ * i.e. clear OCFS2_EXT_UNWRITTEN on it.
+ *
+ * If the existing extent is larger than the request a split is started,
+ * and a merge with adjacent extents is attempted.
+ *
+ * Fails with -EROFS when the super block does not advertise support for
+ * unwritten extents. The caller is responsible for passing down meta_ac
+ * if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
+			      handle_t *handle, u32 cpos, u32 len, u32 phys,
+			      struct ocfs2_alloc_context *meta_ac,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int status;
+
+	mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
+	     inode->i_ino, cpos, len, phys);
+
+	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+			    "that are being written to, but the feature bit "
+			    "is not set in the super block.",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		return -EROFS;
+	}
+
+	/*
+	 * XXX: This should be fixed up so that we just re-insert the
+	 * next extent records.
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	status = ocfs2_change_extent_flag(handle, et, cpos,
+					  len, phys, meta_ac, dealloc,
+					  0, OCFS2_EXT_UNWRITTEN);
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * Split the record at index in the leaf of path so that a new record
+ * starts at new_range: the right-hand part is built with
+ * ocfs2_make_right_split_rec() and inserted as a SPLIT_RIGHT insert.
+ *
+ * The transaction is extended first, and the tree is grown when the
+ * rightmost extent list has no free record. The rightmost extent block,
+ * if one was read, is released before returning.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int index, u32 new_range,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret, depth, credits = handle->h_buffer_credits;
+ struct buffer_head *last_eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *rightmost_el, *el;
+ struct ocfs2_extent_rec split_rec;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_insert_type insert;
+
+ /*
+ * Setup the record to split before we grow the tree.
+ */
+ el = path_leaf_el(path);
+ rec = &el->l_recs[index];
+ ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ &split_rec, new_range, rec);
+
+ /*
+ * Find the rightmost extent list: the last extent block when the
+ * tree has depth, otherwise the leaf list of the path itself.
+ */
+ depth = path->p_tree_depth;
+ if (depth > 0) {
+ ret = ocfs2_read_extent_block(et->et_ci,
+ ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ rightmost_el = &eb->h_list;
+ } else
+ rightmost_el = path_leaf_el(path);
+
+ /*
+ * Ask for extra credits on top of what the handle currently has.
+ */
+ credits += path->p_tree_depth +
+ ocfs2_extend_meta_needed(et->et_root_el);
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Rightmost list is full - grow the tree before inserting. */
+ if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+ le16_to_cpu(rightmost_el->l_count)) {
+ ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+ insert.ins_appending = APPEND_NONE;
+ insert.ins_contig = CONTIG_NONE;
+ insert.ins_split = SPLIT_RIGHT;
+ insert.ins_tree_depth = depth;
+
+ ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(last_eb_bh);
+ return ret;
+}
+
+/*
+ * Remove len clusters starting at cpos from the extent record at index
+ * in the leaf of path.
+ *
+ * The range must either cover the whole record or share one of its
+ * edges with it; any other range hits BUG(). The function finishes by
+ * calling ocfs2_rotate_tree_left() on the path.
+ *
+ * Returns 0 on success or the error from the failing helper.
+ */
+static int ocfs2_truncate_rec(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path, int index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u32 cpos, u32 len)
+{
+ int ret;
+ u32 left_cpos, rec_range, trunc_range;
+ int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *el = path_leaf_el(path);
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_block *eb;
+
+ /*
+ * Get rid of a leading empty extent first; our record then sits
+ * one slot further left.
+ */
+ if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ index--;
+ }
+
+ if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
+ path->p_tree_depth) {
+ /*
+ * Check whether this is the rightmost tree record. If
+ * we remove all of this record or part of its right
+ * edge then an update of the record lengths above it
+ * will be required.
+ */
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ if (eb->h_next_leaf_blk == 0)
+ is_rightmost_tree_rec = 1;
+ }
+
+ rec = &el->l_recs[index];
+ if (index == 0 && path->p_tree_depth &&
+ le32_to_cpu(rec->e_cpos) == cpos) {
+ /*
+ * Changing the leftmost offset (via partial or whole
+ * record truncate) of an interior (or rightmost) path
+ * means we have to update the subtree that is formed
+ * by this leaf and the one to it's left.
+ *
+ * There are two cases we can skip:
+ * 1) Path is the leftmost one in our btree.
+ * 2) The leaf is rightmost and will be empty after
+ * we remove the extent record - the rotate code
+ * knows how to update the newly formed edge.
+ */
+
+ ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
+ left_path = ocfs2_new_path_from_path(path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ }
+
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * NOTE(review): left_path is still NULL here unless the leftmost-edge
+ * case above set it up; presumably ocfs2_journal_access_path()
+ * tolerates a NULL path - confirm against its definition.
+ */
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ trunc_range = cpos + len;
+
+ if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
+ /* The whole record goes away. */
+ int next_free;
+
+ memset(rec, 0, sizeof(*rec));
+ ocfs2_cleanup_merge(el, index);
+ /* NOTE(review): wants_rotate is assigned but never read in this function. */
+ wants_rotate = 1;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (is_rightmost_tree_rec && next_free > 1) {
+ /*
+ * We skip the edge update if this path will
+ * be deleted by the rotate code.
+ */
+ rec = &el->l_recs[next_free - 1];
+ ocfs2_adjust_rightmost_records(handle, et, path,
+ rec);
+ }
+ } else if (le32_to_cpu(rec->e_cpos) == cpos) {
+ /* Remove leftmost portion of the record. */
+ le32_add_cpu(&rec->e_cpos, len);
+ le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
+ le16_add_cpu(&rec->e_leaf_clusters, -len);
+ } else if (rec_range == trunc_range) {
+ /* Remove rightmost portion of the record */
+ le16_add_cpu(&rec->e_leaf_clusters, -len);
+ if (is_rightmost_tree_rec)
+ ocfs2_adjust_rightmost_records(handle, et, path, rec);
+ } else {
+ /* Caller should have trapped this. */
+ mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
+ "(%u, %u)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ le32_to_cpu(rec->e_cpos),
+ le16_to_cpu(rec->e_leaf_clusters), cpos, len);
+ BUG();
+ }
+
+ /*
+ * left_path was only looked up when the leftmost offset of this
+ * leaf changed; in that case fix up the edge shared with it.
+ */
+ if (left_path) {
+ int subtree_index;
+
+ subtree_index = ocfs2_find_subtree_root(et, left_path, path);
+ ocfs2_complete_edge_insert(handle, left_path, path,
+ subtree_index);
+ }
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(path));
+
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+/*
+ * Remove the range (cpos, len) from the extent record that contains it.
+ *
+ * The range must lie within a single existing record (enforced with
+ * BUG_ON). A range in the middle of a record is first turned into an
+ * edge case by splitting the record at the end of the range, after which
+ * the record is looked up again and truncated.
+ *
+ * Returns 0 on success, -EROFS if the record cannot be found or the
+ * split left the tree in an unexpected state, or the error from a
+ * failing helper.
+ */
+int ocfs2_remove_extent(handle_t *handle,
+			struct ocfs2_extent_tree *et,
+			u32 cpos, u32 len,
+			struct ocfs2_alloc_context *meta_ac,
+			struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	u32 rec_range, trunc_range;
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *path = NULL;
+
+	/*
+	 * XXX: Why are we truncating to 0 instead of wherever this
+	 * affects us?
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+			    "Owner %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			    cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * We have 3 cases of extent removal:
+	 *   1) Range covers the entire extent rec
+	 *   2) Range begins or ends on one edge of the extent rec
+	 *   3) Range is in the middle of the extent rec (no shared edges)
+	 *
+	 * For case 1 we remove the extent rec and left rotate to
+	 * fill the hole.
+	 *
+	 * For case 2 we just shrink the existing extent rec, with a
+	 * tree update if the shrinking edge is also the edge of an
+	 * extent block.
+	 *
+	 * For case 3 we do a right split to turn the extent rec into
+	 * something case 2 can handle.
+	 */
+	rec = &el->l_recs[index];
+	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+	trunc_range = cpos + len;
+
+	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
+
+	mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
+	     "(cpos %u, len %u)\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+	     cpos, len, index,
+	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+
+	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+		/* Cases 1 and 2: the range shares an edge with the record. */
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		/* Case 3: split at the end of the range first. */
+		ret = ocfs2_split_tree(handle, et, path, index,
+				       trunc_range, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * The split could have manipulated the tree enough to
+		 * move the record location, so we have to look for it again.
+		 */
+		ocfs2_reinit_path(path, 1);
+
+		ret = ocfs2_find_path(et->et_ci, path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+		index = ocfs2_search_extent_list(el, cpos);
+		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: split at cpos %u lost record.",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos);
+			ret = -EROFS;
+			goto out;
+		}
+
+		/*
+		 * Double check our values here. If anything is fishy,
+		 * it's easier to catch it at the top level.
+		 */
+		rec = &el->l_recs[index];
+		rec_range = le32_to_cpu(rec->e_cpos) +
+			ocfs2_rec_clusters(el, rec);
+		if (rec_range != trunc_range) {
+			/*
+			 * The two literals are concatenated, so the first
+			 * one needs its trailing space.
+			 */
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: error after split at cpos %u "
+				    "trunc len %u, existing record is (%u,%u)",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos, len, le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+/*
+ * Remove len clusters at logical offset cpos (physical cluster phys_cpos)
+ * from the tree described by et, in a transaction of its own.
+ *
+ * With the truncate log inode's i_mutex held, the truncate log is flushed
+ * if it is full, a transaction is started, the extent is removed, the
+ * tree's cluster count is lowered by len and the freed range is appended
+ * to the truncate log. Quota for the range is released before the extent
+ * is removed.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_et_root_journal_access(handle, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		/* The transaction is already running - it must be committed. */
+		goto out_commit;
+	}
+
+	vfs_dq_free_space_nodirty(inode,
+				  ocfs2_clusters_to_bytes(inode->i_sb, len));
+
+	ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_et_update_clusters(et, -len);
+
+	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Hand the freed physical range to the truncate log. */
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Report whether the truncate log of osb is full, i.e. whether its used
+ * record count has reached its total record count. A used count larger
+ * than the total trips mlog_bug_on_msg().
+ */
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+	struct ocfs2_dinode *tl_di =
+		(struct ocfs2_dinode *) osb->osb_tl_bh->b_data;
+	struct ocfs2_truncate_log *log = &tl_di->id2.i_dealloc;
+	unsigned int used = le16_to_cpu(log->tl_used);
+	unsigned int count = le16_to_cpu(log->tl_count);
+
+	mlog_bug_on_msg(used > count,
+			"slot %d, invalid truncate log parameters: used = "
+			"%u, count = %u\n", osb->slot_num,
+			used, count);
+
+	return used == count;
+}
+
+/*
+ * Return non-zero when a range starting at cluster new_start begins
+ * exactly where the last used record of tl ends, so the two could be
+ * folded into one record. An empty log never coalesces.
+ */
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+					   unsigned int new_start)
+{
+	unsigned int used = le16_to_cpu(tl->tl_used);
+	unsigned int tail_end;
+
+	/* No records, nothing to coalesce */
+	if (used == 0)
+		return 0;
+
+	tail_end = le32_to_cpu(tl->tl_recs[used - 1].t_start) +
+		   le32_to_cpu(tl->tl_recs[used - 1].t_clusters);
+
+	return tail_end == new_start;
+}
+
+/*
+ * Record num_clusters clusters starting at block start_blk in the
+ * truncate log of osb. If the range starts exactly where the last logged
+ * record ends, that record is extended instead of using a new slot.
+ *
+ * The truncate log inode's i_mutex must already be held (enforced with
+ * BUG_ON). Returns 0 on success, -ENOSPC if the log has no free record,
+ * or a negative error from the journal calls.
+ */
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+ handle_t *handle,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ int status, index;
+ unsigned int start_cluster, tl_count;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct buffer_head *tl_bh = osb->osb_tl_bh;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+
+ mlog_entry("start_blk = %llu, num_clusters = %u\n",
+ (unsigned long long)start_blk, num_clusters);
+
+ /* trylock succeeding means the caller did not hold the mutex. */
+ BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+
+ start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+ di = (struct ocfs2_dinode *) tl_bh->b_data;
+
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
+ tl_count = le16_to_cpu(tl->tl_count);
+ mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+ tl_count == 0,
+ "Truncate record count on #%llu invalid "
+ "wanted %u, actual %u\n",
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ ocfs2_truncate_recs_per_inode(osb->sb),
+ le16_to_cpu(tl->tl_count));
+
+ /* Caller should have known to flush before calling us. */
+ index = le16_to_cpu(tl->tl_used);
+ if (index >= tl_count) {
+ status = -ENOSPC;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+ "%llu (index = %d)\n", num_clusters, start_cluster,
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
+
+ if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+ /*
+ * Move index back to the record we are coalescing with.
+ * ocfs2_truncate_log_can_coalesce() guarantees nonzero
+ */
+ index--;
+
+ /* The merged record keeps its start; only its length grows. */
+ num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+ /* NOTE(review): index is an int but is printed with %u below. */
+ mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+ index, le32_to_cpu(tl->tl_recs[index].t_start),
+ num_clusters);
+ } else {
+ /* Start a new record in the first unused slot. */
+ tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+ tl->tl_used = cpu_to_le16(index + 1);
+ }
+ tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+ status = ocfs2_journal_dirty(handle, tl_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+bail:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Walk the truncate log of osb from its last used record down to the
+ * first. For each record, tl_used is lowered in the journaled log dinode
+ * and the record's clusters are then freed with ocfs2_free_clusters()
+ * against data_alloc_inode / data_alloc_bh.
+ *
+ * Stops at the first failure. Returns 0 on success or that negative
+ * error code.
+ */
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *data_alloc_inode,
+ struct buffer_head *data_alloc_bh)
+{
+ int status = 0;
+ int i;
+ unsigned int num_clusters;
+ u64 start_blk;
+ struct ocfs2_truncate_rec rec;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct buffer_head *tl_bh = osb->osb_tl_bh;
+
+ mlog_entry_void();
+
+ di = (struct ocfs2_dinode *) tl_bh->b_data;
+ tl = &di->id2.i_dealloc;
+ /* Start with the most recently added record. */
+ i = le16_to_cpu(tl->tl_used) - 1;
+ while (i >= 0) {
+ /* Caller has given us at least enough credits to
+ * update the truncate log dinode */
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* Drop record i from the log before its clusters are freed. */
+ tl->tl_used = cpu_to_le16(i);
+
+ status = ocfs2_journal_dirty(handle, tl_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* TODO: Perhaps we can calculate the bulk of the
+ * credits up front rather than extending like
+ * this. */
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* Work on a stack copy of the record. */
+ rec = tl->tl_recs[i];
+ start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+ le32_to_cpu(rec.t_start));
+ num_clusters = le32_to_cpu(rec.t_clusters);
+
+ /* if start_blk is not set, we ignore the record as
+ * invalid. */
+ if (start_blk) {
+ mlog(0, "free record %d, start = %u, clusters = %u\n",
+ i, le32_to_cpu(rec.t_start), num_clusters);
+
+ status = ocfs2_free_clusters(handle, data_alloc_inode,
+ data_alloc_bh, start_blk,
+ num_clusters);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+ i--;
+ }
+
+bail:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Flush every used record of the truncate log of osb by replaying it
+ * against the global bitmap inode.
+ *
+ * Expects you to already be holding tl_inode->i_mutex (enforced with
+ * BUG_ON). Takes the global bitmap inode's i_mutex and inode lock, runs
+ * ocfs2_replay_truncate_records() inside a transaction of its own, and
+ * unwinds in reverse order.
+ *
+ * Returns 0 when the log is empty or was flushed, otherwise a negative
+ * error code.
+ */
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+ int status;
+ unsigned int num_to_flush;
+ handle_t *handle;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct inode *data_alloc_inode = NULL;
+ struct buffer_head *tl_bh = osb->osb_tl_bh;
+ struct buffer_head *data_alloc_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+
+ mlog_entry_void();
+
+ /* trylock succeeding means the caller did not hold the mutex. */
+ BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+
+ di = (struct ocfs2_dinode *) tl_bh->b_data;
+
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
+ num_to_flush = le16_to_cpu(tl->tl_used);
+ mlog(0, "Flush %u records from truncate log #%llu\n",
+ num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
+ /* Nothing logged - nothing to do. */
+ if (!num_to_flush) {
+ status = 0;
+ goto out;
+ }
+
+ data_alloc_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!data_alloc_inode) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Could not get bitmap inode!\n");
+ goto out;
+ }
+
+ mutex_lock(&data_alloc_inode->i_mutex);
+
+ status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_mutex;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_unlock;
+ }
+
+ status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+ data_alloc_bh);
+ if (status < 0)
+ mlog_errno(status);
+
+ /* The transaction is committed whether or not the replay failed. */
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+ brelse(data_alloc_bh);
+ ocfs2_inode_unlock(data_alloc_inode, 1);
+
+out_mutex:
+ mutex_unlock(&data_alloc_inode->i_mutex);
+ iput(data_alloc_inode);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Locked wrapper around __ocfs2_flush_truncate_log(): takes the truncate
+ * log inode's i_mutex around the flush and passes its result through.
+ */
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	struct inode *log_inode = osb->osb_tl_inode;
+	int ret;
+
+	mutex_lock(&log_inode->i_mutex);
+	ret = __ocfs2_flush_truncate_log(osb);
+	mutex_unlock(&log_inode->i_mutex);
+
+	return ret;
+}
+
+/*
+ * Work item for osb_truncate_log_wq: flush the truncate log and, if that
+ * worked, call ocfs2_init_inode_steal_slot(). A failed flush is only
+ * logged.
+ */
+static void ocfs2_truncate_log_worker(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       osb_truncate_log_wq.work);
+	int ret;
+
+	mlog_entry_void();
+
+	ret = ocfs2_flush_truncate_log(osb);
+	if (ret >= 0)
+		ocfs2_init_inode_steal_slot(osb);
+	else
+		mlog_errno(ret);
+
+	mlog_exit(ret);
+}
+
+#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
+/*
+ * Queue the delayed truncate log flush on ocfs2_wq, to run after
+ * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL. With cancel set, an already pending
+ * flush is cancelled first so the delay starts over. Does nothing when
+ * osb has no truncate log inode.
+ */
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel)
+{
+	if (!osb->osb_tl_inode)
+		return;
+
+	/* We want to push off log flushes while truncates are
+	 * still running. */
+	if (cancel)
+		cancel_delayed_work(&osb->osb_truncate_log_wq);
+
+	queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+			   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
+}
+
+/*
+ * Look up the truncate log system inode for slot_num and read its inode
+ * block.
+ *
+ * On success *tl_inode and *tl_bh receive the inode and its buffer; on
+ * failure neither is written and the inode reference, if one was taken,
+ * is dropped. Returns the status of the read, or -EINVAL when the system
+ * inode cannot be obtained.
+ */
+static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
+				       int slot_num,
+				       struct inode **tl_inode,
+				       struct buffer_head **tl_bh)
+{
+	int status;
+	struct buffer_head *di_bh = NULL;
+	struct inode *log_inode;
+
+	log_inode = ocfs2_get_system_file_inode(osb,
+						TRUNCATE_LOG_SYSTEM_INODE,
+						slot_num);
+	if (log_inode == NULL) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
+	} else {
+		status = ocfs2_read_inode_block(log_inode, &di_bh);
+		if (status < 0) {
+			iput(log_inode);
+			mlog_errno(status);
+		} else {
+			*tl_inode = log_inode;
+			*tl_bh = di_bh;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* called during the 1st stage of node recovery. we stamp a clean
+ * truncate log and pass back a copy for processing later. if the
+ * truncate log does not require processing, *tl_copy is set to
+ * NULL. */
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+ int slot_num,
+ struct ocfs2_dinode **tl_copy)
+{