X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Focfs2%2Frefcounttree.c;h=4793f36f6518b25f312274d2fcdc84492de26340;hb=03e62303cf56e87337115f14842321043df2b4bb;hp=2c7974cccaf8077765f76d82c4b9b810f8a0a346;hpb=1823cb0b9fe5e6d48017ee3f92428f69c0235d87;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 2c7974c..4793f36 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -31,6 +31,44 @@ #include "sysfile.h" #include "dlmglue.h" #include "extent_map.h" +#include "aops.h" +#include "xattr.h" +#include "namei.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ocfs2_cow_context { + struct inode *inode; + u32 cow_start; + u32 cow_len; + struct ocfs2_extent_tree data_et; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + void *cow_object; + struct ocfs2_post_refcount *post_refcount; + int extra_credits; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +}; static inline struct ocfs2_refcount_tree * cache_info_to_refcount(struct ocfs2_caching_info *ci) @@ -237,7 +275,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } -void ocfs2_kref_remove_refcount_tree(struct kref *kref) +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) { struct ocfs2_refcount_tree *tree = container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); @@ -485,23 +523,6 @@ out: return ret; } -int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, - struct ocfs2_refcount_tree **ret_tree, - struct buffer_head **ref_bh) -{ - int ret; - u64 ref_blkno; - - ret = ocfs2_get_refcount_block(inode, &ref_blkno); - if (ret) { - mlog_errno(ret); - return ret; - } - - return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, - rw, ret_tree, ref_bh); -} - void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw) { @@ -549,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); @@ -575,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, goto out_commit; } - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &first_blkno); if (ret) { @@ -604,7 +625,8 @@ static int ocfs2_create_refcount_tree(struct inode *inode, rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(rb, 0, inode->i_sb->s_blocksize); strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); - rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); + rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); rb->rf_blkno = cpu_to_le64(first_blkno); @@ -769,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) if (le32_to_cpu(rb->rf_count) == 1) { blk = le64_to_cpu(rb->rf_blkno); bit = le16_to_cpu(rb->rf_suballoc_bit); - bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (rb->rf_suballoc_loc) + bg_blkno = le64_to_cpu(rb->rf_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, @@ -894,6 +919,139 @@ out: } /* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. + */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. + */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + +/* * Given a cpos and len, try to find the refcount record which contains cpos. * 1. If cpos can be found in one refcount record, return the record. * 2. If cpos can't be found, return a fake record which start from cpos @@ -908,10 +1066,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head **ret_bh) { int ret = 0, i, found; - u32 low_cpos; + u32 low_cpos, uninitialized_var(cpos_end); struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *tmp, *rec = NULL; - struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = @@ -959,12 +1117,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, } } - /* adjust len when we have ocfs2_extent_rec after it. */ - if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { - tmp = &el->l_recs[i+1]; + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } - if (le32_to_cpu(tmp->e_cpos) < cpos + len) - len = le32_to_cpu(tmp->e_cpos) - cpos; + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; } ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), @@ -1078,7 +1240,7 @@ static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, static int ocfs2_change_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_leaf_bh, - int index, int change) + int index, int merge, int change) { int ret; struct ocfs2_refcount_block *rb = @@ -1107,12 +1269,10 @@ static int ocfs2_change_refcount_rec(handle_t *handle, } le16_add_cpu(&rl->rl_used, -1); - } else + } else if (merge) ocfs2_refcount_rec_merge(rb, index); - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, ref_leaf_bh); out: return ret; } @@ -1126,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, int ret; u16 suballoc_bit_start; u32 num_got; - u64 blkno; + u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *new_rb; @@ -1140,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, goto out; } - ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { @@ -1171,7 +1331,8 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; - new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_blkno = cpu_to_le64(blkno); new_rb->rf_cpos = cpu_to_le32(0); @@ -1343,7 +1504,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, /* change old and new rl_used accordingly. */ le16_add_cpu(&rl->rl_used, -num_moved); - new_rl->rl_used = cpu_to_le32(num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), @@ -1366,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, int ret; u16 suballoc_bit_start; u32 num_got, new_cpos; - u64 blkno; + u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *root_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; @@ -1390,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, goto out; } - ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { @@ -1417,7 +1578,8 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(new_rb, 0, sb->s_blocksize); strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); - new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); new_rb->rf_blkno = cpu_to_le64(blkno); @@ -1536,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle, * 2 more credits, one for the leaf refcount block, one for * the extent block contains the extent rec. */ - ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); + ret = ocfs2_extend_trans(handle, 2); if (ret < 0) { mlog_errno(ret); goto out; @@ -1583,7 +1745,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *rec, - int index, + int index, int merge, struct ocfs2_alloc_context *meta_ac) { int ret; @@ -1641,13 +1803,10 @@ static int ocfs2_insert_refcount_rec(handle_t *handle, le16_add_cpu(&rf_list->rl_used, 1); - ocfs2_refcount_rec_merge(rb, index); + if (merge) + ocfs2_refcount_rec_merge(rb, index); - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, ref_leaf_bh); if (index == 0) { ret = ocfs2_adjust_refcount_rec(handle, ci, @@ -1675,7 +1834,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *split_rec, - int index, + int index, int merge, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { @@ -1721,7 +1880,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle, recs_need++; /* If the leaf block don't have enough record, expand it. */ - if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { struct ocfs2_refcount_rec tmp_rec; u64 cpos = le64_to_cpu(orig_rec->r_cpos); len = le32_to_cpu(orig_rec->r_clusters); @@ -1783,7 +1943,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); le64_add_cpu(&tail_rec->r_cpos, le32_to_cpu(tail_rec->r_clusters) - len); - tail_rec->r_clusters = le32_to_cpu(len); + tail_rec->r_clusters = cpu_to_le32(len); } /* @@ -1813,12 +1973,11 @@ static int ocfs2_split_refcount_rec(handle_t *handle, le32_to_cpu(split_rec->r_refcount), (unsigned long long)ref_leaf_bh->b_blocknr, index); - ocfs2_refcount_rec_merge(rb, index); + if (merge) + ocfs2_refcount_rec_merge(rb, index); } - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, ref_leaf_bh); out: brelse(new_bh); @@ -1828,7 +1987,7 @@ out: static int __ocfs2_increase_refcount(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, - u64 cpos, u32 len, + u64 cpos, u32 len, int merge, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { @@ -1868,7 +2027,8 @@ static int __ocfs2_increase_refcount(handle_t *handle, "count %u\n", (unsigned long long)cpos, set_len, le32_to_cpu(rec.r_refcount)); ret = ocfs2_change_refcount_rec(handle, ci, - ref_leaf_bh, index, 1); + ref_leaf_bh, index, + merge, 1); if (ret) { mlog_errno(ret); goto out; @@ -1881,7 +2041,8 @@ static int __ocfs2_increase_refcount(handle_t *handle, set_len); ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, - &rec, index, meta_ac); + &rec, index, + merge, meta_ac); if (ret) { mlog_errno(ret); goto out; @@ -1899,7 +2060,7 @@ static int __ocfs2_increase_refcount(handle_t *handle, set_len, le32_to_cpu(rec.r_refcount)); ret = ocfs2_split_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, - &rec, index, + &rec, index, merge, meta_ac, dealloc); if (ret) { mlog_errno(ret); @@ -1949,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle, */ ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_suballoc_loc), le64_to_cpu(rb->rf_blkno), le16_to_cpu(rb->rf_suballoc_bit)); if (ret) { @@ -1992,6 +2154,18 @@ out: return ret; } +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + return __ocfs2_increase_refcount(handle, ci, ref_root_bh, + cpos, len, 1, + meta_ac, dealloc); +} + static int ocfs2_decrease_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, @@ -2012,7 +2186,7 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle, if (cpos == le64_to_cpu(rec->r_cpos) && len == le32_to_cpu(rec->r_clusters)) ret = ocfs2_change_refcount_rec(handle, ci, - ref_leaf_bh, index, -1); + ref_leaf_bh, index, 1, -1); else { struct ocfs2_refcount_rec split = *rec; split.r_cpos = cpu_to_le64(cpos); @@ -2028,7 +2202,7 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle, le32_to_cpu(rec->r_clusters)); ret = ocfs2_split_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, - &split, index, + &split, index, 1, meta_ac, dealloc); } @@ -2055,7 +2229,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle, struct buffer_head *ref_root_bh, u64 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) { int ret = 0, index = 0; struct ocfs2_refcount_rec rec; @@ -2063,9 +2238,10 @@ static int __ocfs2_decrease_refcount(handle_t *handle, struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct buffer_head *ref_leaf_bh = NULL; - mlog(0, "Tree owner %llu, decrease refcount start %llu, len %u\n", + mlog(0, "Tree owner %llu, decrease refcount start %llu, " + "len %u, delete %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)cpos, len); + (unsigned long long)cpos, len, delete); while (len) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, @@ -2078,6 +2254,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle, r_count = le32_to_cpu(rec.r_refcount); BUG_ON(r_count == 0); + if (!delete) + BUG_ON(r_count > 1); r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; @@ -2091,7 +2269,7 @@ static int __ocfs2_decrease_refcount(handle_t *handle, goto out; } - if (le32_to_cpu(rec.r_refcount) == 1) { + if (le32_to_cpu(rec.r_refcount) == 1 && delete) { ret = ocfs2_cache_cluster_dealloc(dealloc, ocfs2_clusters_to_blocks(sb, cpos), r_len); @@ -2116,7 +2294,8 @@ out: int ocfs2_decrease_refcount(struct inode *inode, handle_t *handle, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) { int ret; u64 ref_blkno; @@ -2146,10 +2325,2125 @@ int ocfs2_decrease_refcount(struct inode *inode, } ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, - cpos, len, meta_ac, dealloc); + cpos, len, meta_ac, dealloc, delete); + if (ret) + mlog_errno(ret); +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Mark the already-existing extent at cpos as refcounted for len clusters. + * This adds the refcount extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +static int ocfs2_mark_extent_refcounted(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, + u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", + inode->i_ino, cpos, len, phys); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + OCFS2_EXT_REFCOUNTED, 0); if (ret) mlog_errno(ret); + +out: + return ret; +} + +/* + * Given some contiguous physical clusters, calculate what we need + * for modifying their refcount. + */ +static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 start_cpos, + u32 clusters, + int *meta_add, + int *credits) +{ + int ret = 0, index, ref_blocks = 0, recs_add = 0; + u64 cpos = start_cpos; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; + u32 len; + + mlog(0, "start_cpos %llu, clusters %u\n", + (unsigned long long)start_cpos, clusters); + while (clusters) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, clusters, &rec, + &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ref_leaf_bh != prev_bh) { + /* + * Now we encounter a new leaf block, so calculate + * whether we need to extend the old leaf. + */ + if (prev_bh) { + rb = (struct ocfs2_refcount_block *) + prev_bh->b_data; + + if (le64_to_cpu(rb->rf_records.rl_used) + + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + } + + recs_add = 0; + *credits += 1; + brelse(prev_bh); + prev_bh = ref_leaf_bh; + get_bh(prev_bh); + } + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," + "rec->r_clusters %u, rec->r_refcount %u, index %d\n", + recs_add, (unsigned long long)cpos, clusters, + (unsigned long long)le64_to_cpu(rec.r_cpos), + le32_to_cpu(rec.r_clusters), + le32_to_cpu(rec.r_refcount), index); + + len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + /* + * If the refcount rec already exist, cool. We just need + * to check whether there is a split. Otherwise we just need + * to increase the refcount. + * If we will insert one, increases recs_add. + * + * We record all the records which will be inserted to the + * same refcount block, so that we can tell exactly whether + * we need a new refcount block or not. + */ + if (rec.r_refcount) { + /* Check whether we need a split at the beginning. */ + if (cpos == start_cpos && + cpos != le64_to_cpu(rec.r_cpos)) + recs_add++; + + /* Check whether we need a split in the end. */ + if (cpos + clusters < le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) + recs_add++; + } else + recs_add++; + + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + clusters -= len; + cpos += len; + } + + if (prev_bh) { + rb = (struct ocfs2_refcount_block *)prev_bh->b_data; + + if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + } + + if (!ref_blocks) + goto out; + + mlog(0, "we need ref_blocks %d\n", ref_blocks); + *meta_add += ref_blocks; + *credits += ref_blocks; + + /* + * So we may need ref_blocks to insert into the tree. + * That also means we need to change the b-tree and add that number + * of records since we never merge them. + * We need one more block for expansion since the new created leaf + * block is also full and needs split. + */ + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + *meta_add += ocfs2_extend_meta_needed(et.et_root_el); + *credits += ocfs2_calc_extend_credits(sb, + et.et_root_el, + ref_blocks); + } else { + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + *meta_add += 1; + } + +out: + brelse(ref_leaf_bh); + brelse(prev_bh); + return ret; +} + +/* + * For refcount tree, we will decrease some contiguous clusters + * refcount count, so just go through it to see how many blocks + * we gonna touch and whether we need to create new blocks. + * + * Normally the refcount blocks store these refcount should be + * contiguous also, so that we can get the number easily. + * We will at most add split 2 refcount records and 2 more + * refcount blocks, so just check it in a rough way. + * + * Caller must hold refcount tree lock. + */ +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + u64 refcount_loc, + u64 phys_blkno, + u32 clusters, + int *credits, + int *ref_blocks) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + refcount_loc, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + &tree->rf_ci, + ref_root_bh, + start_cpos, clusters, + ref_blocks, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d blocks, credits = %d\n", + *ref_blocks, *credits); + out: brelse(ref_root_bh); return ret; } + +#define MAX_CONTIG_BYTES 1048576 + +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) +{ + return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); +} + +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) +{ + return ~(ocfs2_cow_contig_clusters(sb) - 1); +} + +/* + * Given an extent that starts at 'start' and an I/O that starts at 'cpos', + * find an offset (start + (n * contig_clusters)) that is closest to cpos + * while still being less than or equal to it. + * + * The goal is to break the extent at a multiple of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, + unsigned int start, + unsigned int cpos) +{ + BUG_ON(start > cpos); + + return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); +} + +/* + * Given a cluster count of len, pad it out so that it is a multiple + * of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, + unsigned int len) +{ + unsigned int padded = + (len + (ocfs2_cow_contig_clusters(sb) - 1)) & + ocfs2_cow_contig_mask(sb); + + /* Did we wrap? */ + if (padded < len) + padded = UINT_MAX; + + return padded; +} + +/* + * Calculate out the start and number of virtual clusters we need to to CoW. + * + * cpos is vitual start cluster position we want to do CoW in a + * file and write_len is the cluster length. + * max_cpos is the place where we want to stop CoW intentionally. + * + * Normal we will start CoW from the beginning of extent record cotaining cpos. + * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we + * get good I/O from the resulting extent tree. + */ +static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + u32 cpos, + u32 write_len, + u32 max_cpos, + u32 *cow_start, + u32 *cow_len) +{ + int ret = 0; + int tree_height = le16_to_cpu(el->l_tree_depth), i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb = NULL; + struct ocfs2_extent_rec *rec; + unsigned int want_clusters, rec_end = 0; + int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); + int leaf_clusters; + + BUG_ON(cpos + write_len > max_cpos); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + *cow_len = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + "index %d\n", inode->i_ino, i); + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + + if (*cow_len == 0) { + /* + * We should find a refcounted record in the + * first pass. + */ + BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); + *cow_start = le32_to_cpu(rec->e_cpos); + } + + /* + * If we encounter a hole, a non-refcounted record or + * pass the max_cpos, stop the search. + */ + if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || + (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || + (max_cpos <= le32_to_cpu(rec->e_cpos))) + break; + + leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); + rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; + if (rec_end > max_cpos) { + rec_end = max_cpos; + leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); + } + + /* + * How many clusters do we actually need from + * this extent? First we see how many we actually + * need to complete the write. If that's smaller + * than contig_clusters, we try for contig_clusters. + */ + if (!*cow_len) + want_clusters = write_len; + else + want_clusters = (cpos + write_len) - + (*cow_start + *cow_len); + if (want_clusters < contig_clusters) + want_clusters = contig_clusters; + + /* + * If the write does not cover the whole extent, we + * need to calculate how we're going to split the extent. + * We try to do it on contig_clusters boundaries. + * + * Any extent smaller than contig_clusters will be + * CoWed in its entirety. + */ + if (leaf_clusters <= contig_clusters) + *cow_len += leaf_clusters; + else if (*cow_len || (*cow_start == cpos)) { + /* + * This extent needs to be CoW'd from its + * beginning, so all we have to do is compute + * how many clusters to grab. We align + * want_clusters to the edge of contig_clusters + * to get better I/O. + */ + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + + if (leaf_clusters < want_clusters) + *cow_len += leaf_clusters; + else + *cow_len += want_clusters; + } else if ((*cow_start + contig_clusters) >= + (cpos + write_len)) { + /* + * Breaking off contig_clusters at the front + * of the extent will cover our write. That's + * easy. + */ + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= contig_clusters) { + /* + * Breaking off contig_clusters at the tail of + * this extent will cover cpos. + */ + *cow_start = rec_end - contig_clusters; + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= want_clusters) { + /* + * While we can't fit the entire write in this + * extent, we know that the write goes from cpos + * to the end of the extent. Break that off. + * We try to break it at some multiple of + * contig_clusters from the front of the extent. + * Failing that (ie, cpos is within + * contig_clusters of the front), we'll CoW the + * entire extent. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + *cow_len = rec_end - *cow_start; + } else { + /* + * Ok, the entire write lives in the middle of + * this extent. Let's try to slice the extent up + * nicely. Optimally, our CoW region starts at + * m*contig_clusters from the beginning of the + * extent and goes for n*contig_clusters, + * covering the entire write. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + + want_clusters = (cpos + write_len) - *cow_start; + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + if (*cow_start + want_clusters <= rec_end) + *cow_len = want_clusters; + else + *cow_len = rec_end - *cow_start; + } + + /* Have we covered our entire write yet? */ + if ((*cow_start + *cow_len) >= (cpos + write_len)) + break; + + /* + * If we reach the end of the extent block and don't get enough + * clusters, continue with the next extent block if possible. + */ + if (i + 1 == le16_to_cpu(el->l_next_free_rec) && + eb && eb->h_next_leaf_blk) { + brelse(eb_bh); + eb_bh = NULL; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(eb->h_next_leaf_blk), + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + i = -1; + } + } + +out: + brelse(eb_bh); + return ret; +} + +/* + * Prepare meta_ac, data_ac and calculate credits when we want to add some + * num_clusters in data_tree "et" and change the refcount for the old + * clusters(starting form p_cluster) in the refcount tree. + * + * Note: + * 1. since we may split the old tree, so we at most will need num_clusters + 2 + * more new leaf records. + * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so + * just give data_ac = NULL. + */ +static int ocfs2_lock_refcount_allocators(struct super_block *sb, + u32 p_cluster, u32 num_clusters, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int *credits) +{ + int ret = 0, meta_add = 0; + int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < num_clusters + 2) + meta_add = + ocfs2_extend_meta_needed(et->et_root_el); + + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, + num_clusters + 2); + + ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, + p_cluster, num_clusters, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", + meta_add, num_clusters, *credits); + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, + data_ac); + if (ret) + mlog_errno(ret); + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUG_ON(buffer_dirty(bh)); + + clear_buffer_mapped(bh); + + return 0; +} + +static int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0, partial; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct page *page; + pgoff_t page_index; + unsigned int from, to; + loff_t offset, end, map_end; + struct address_space *mapping = context->inode->i_mapping; + + mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, + new_cluster, new_len, cpos); + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + /* from, to is the offset within the page. */ + from = offset & (PAGE_CACHE_SIZE - 1); + to = PAGE_CACHE_SIZE; + if (map_end & (PAGE_CACHE_SIZE - 1)) + to = map_end & (PAGE_CACHE_SIZE - 1); + + page = grab_cache_page(mapping, page_index); + + /* + * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * can't be dirtied before we CoW it out. + */ + if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + BUG_ON(PageDirty(page)); + + if (!PageUptodate(page)) { + ret = block_read_full_page(page, ocfs2_get_block); + if (ret) { + mlog_errno(ret); + goto unlock; + } + lock_page(page); + } + + if (page_has_buffers(page)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_clear_cow_buffer); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + ocfs2_map_and_dirty_page(context->inode, + handle, from, to, + page, 0, &new_block); + mark_page_accessed(page); +unlock: + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0; + struct super_block *sb = context->inode->i_sb; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); + u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *old_bh = NULL; + struct buffer_head *new_bh = NULL; + + mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, + new_cluster, new_len); + + for (i = 0; i < blocks; i++, old_block++, new_block++) { + new_bh = sb_getblk(osb->sb, new_block); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_journal_access(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); + ocfs2_journal_dirty(handle, new_bh); + + brelse(new_bh); + brelse(old_bh); + new_bh = NULL; + old_bh = NULL; + } + + brelse(new_bh); + brelse(old_bh); + return ret; +} + +static int ocfs2_clear_ext_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 p_cluster, u32 len, + unsigned int ext_flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + struct ocfs2_extent_rec replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 ino = ocfs2_metadata_cache_owner(et->et_ci); + + mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", + (unsigned long long)ino, cpos, len, p_cluster, ext_flags); + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, + p_cluster)); + replace_rec.e_flags = ext_flags; + replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + ret = ocfs2_split_extent(handle, et, path, index, + &replace_rec, meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_replace_clusters(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old, + u32 new, u32 len, + unsigned int ext_flags) +{ + int ret; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + u64 ino = ocfs2_metadata_cache_owner(ci); + + mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", + (unsigned long long)ino, cpos, old, new, len, ext_flags); + + /*If the old clusters is unwritten, no need to duplicate. */ + if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = context->cow_duplicate_clusters(handle, context, cpos, + old, new, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_clear_ext_refcount(handle, &context->data_et, + cpos, new, len, ext_flags, + context->meta_ac, &context->dealloc); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static int ocfs2_cow_sync_writeback(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 num_clusters) +{ + int ret = 0; + loff_t offset, end, map_end; + pgoff_t page_index; + struct page *page; + + if (ocfs2_should_order_data(context->inode)) + return 0; + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + + ret = filemap_fdatawrite_range(context->inode->i_mapping, + offset, end - 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + page = grab_cache_page(context->inode->i_mapping, page_index); + BUG_ON(!page); + + wait_on_page_writeback(page); + if (PageError(page)) { + ret = -EIO; + mlog_errno(ret); + } else + mark_page_accessed(page); + + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, + num_clusters, extent_flags); +} + +static int ocfs2_make_clusters_writable(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 p_cluster, + u32 num_clusters, unsigned int e_flags) +{ + int ret, delete, index, credits = 0; + u32 new_bit, new_len; + unsigned int set_len; + struct ocfs2_super *osb = OCFS2_SB(sb); + handle_t *handle; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; + struct ocfs2_refcount_rec rec; + + mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", + cpos, p_cluster, num_clusters, e_flags); + + ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, + &context->data_et, + ref_ci, + context->ref_root_bh, + &context->meta_ac, + &context->data_ac, &credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (context->post_refcount) + credits += context->post_refcount->credits; + + credits += context->extra_credits; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, + p_cluster, num_clusters, + &rec, &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + BUG_ON(!rec.r_refcount); + set_len = min((u64)p_cluster + num_clusters, + le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - p_cluster; + + /* + * There are many different situation here. + * 1. If refcount == 1, remove the flag and don't COW. + * 2. If refcount > 1, allocate clusters. + * Here we may not allocate r_len once at a time, so continue + * until we reach num_clusters. + */ + if (le32_to_cpu(rec.r_refcount) == 1) { + delete = 0; + ret = ocfs2_clear_ext_refcount(handle, + &context->data_et, + cpos, p_cluster, + set_len, e_flags, + context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } else { + delete = 1; + + ret = __ocfs2_claim_clusters(handle, + context->data_ac, + 1, set_len, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + set_len = new_len; + } + + ret = __ocfs2_decrease_refcount(handle, ref_ci, + context->ref_root_bh, + p_cluster, set_len, + context->meta_ac, + &context->dealloc, delete); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos += set_len; + p_cluster += set_len; + num_clusters -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + + /* handle any post_cow action. */ + if (context->post_refcount && context->post_refcount->func) { + ret = context->post_refcount->func(context->inode, handle, + context->post_refcount->para); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + if (context->get_clusters == ocfs2_di_get_clusters) { + ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + brelse(ref_leaf_bh); + + return ret; +} + +static int ocfs2_replace_cow(struct ocfs2_cow_context *context) +{ + int ret = 0; + struct inode *inode = context->inode; + u32 cow_start = context->cow_start, cow_len = context->cow_len; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + return -EROFS; + } + + ocfs2_init_dealloc_ctxt(&context->dealloc); + + while (cow_len) { + ret = context->get_clusters(context, cow_start, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); + + if (cow_len < num_clusters) + num_clusters = cow_len; + + ret = ocfs2_make_clusters_writable(inode->i_sb, context, + cow_start, p_cluster, + num_clusters, ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cow_len -= num_clusters; + cow_start += num_clusters; + } + + if (ocfs2_dealloc_has_cluster(&context->dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + } + + return ret; +} + +/* + * Starting at cpos, try to CoW write_len clusters. Don't CoW + * past max_cpos. This will stop when it runs into a hole or an + * unrefcounted extent. + */ +static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret; + u32 cow_start = 0, cow_len = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_cow_context *context = NULL; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, + cpos, write_len, max_cpos, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " + "cow_len %u\n", inode->i_ino, + cpos, write_len, cow_start, cow_len); + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; + context->get_clusters = ocfs2_di_get_clusters; + + ocfs2_init_dinode_extent_tree(&context->data_et, + INODE_CACHE(inode), di_bh); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + + /* + * truncate the extent map here since no matter whether we meet with + * any error during the action, we shouldn't trust cached extent map + * any more. + */ + ocfs2_extent_map_trunc(inode, cow_start); + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + kfree(context); + return ret; +} + +/* + * CoW any and all clusters between cpos and cpos+write_len. + * Don't CoW past max_cpos. If this returns successfully, all + * clusters between cpos and cpos+write_len are safe to modify. + */ +int ocfs2_refcount_cow(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + while (write_len) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + if (write_len < num_clusters) + num_clusters = write_len; + + if (ext_flags & OCFS2_EXT_REFCOUNTED) { + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + num_clusters, max_cpos); + if (ret) { + mlog_errno(ret); + break; + } + } + + write_len -= num_clusters; + cpos += num_clusters; + } + + return ret; +} + +static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + struct ocfs2_xattr_value_root *xv = context->cow_object; + + return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, + num_clusters, &xv->xr_list, + extent_flags); +} + +/* + * Given a xattr value root, calculate the most meta/credits we need for + * refcount tree change if we truncate it to 0. + */ +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits) +{ + int ret = 0, index, ref_blocks = 0; + u32 p_cluster, num_clusters; + u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL; + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, + p_cluster, num_clusters, + &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!rec.r_refcount); + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + /* + * We really don't know whether the other clusters is in + * this refcount block or not, so just take the worst + * case that all the clusters are in this block and each + * one will split a refcount rec, so totally we need + * clusters * 2 new refcount rec. + */ + if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + + if (num_clusters <= le32_to_cpu(rec.r_clusters)) + break; + else + num_clusters -= le32_to_cpu(rec.r_clusters); + p_cluster += num_clusters; + } + } + + *meta_add += ref_blocks; + if (!ref_blocks) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + else { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); + *credits += ocfs2_calc_extend_credits(inode->i_sb, + et.et_root_el, + ref_blocks); + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* + * Do CoW for xattr. + */ +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post) +{ + int ret; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_cow_context *context = NULL; + u32 cow_start, cow_len; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, + cpos, write_len, UINT_MAX, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh;; + context->cow_object = xv; + + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; + /* We need the extra credits for duplicate_clusters by jbd. */ + context->extra_credits = + ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; + context->get_clusters = ocfs2_xattr_value_get_clusters; + context->post_refcount = post; + + ocfs2_init_xattr_value_extent_tree(&context->data_et, + INODE_CACHE(inode), vb); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + +out: + kfree(context); + return ret; +} + +/* + * Insert a new extent into refcount tree and mark a extent rec + * as refcounted in the dinode tree. + */ +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post) +{ + int ret; + handle_t *handle; + int credits = 1, ref_blocks = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + ref_ci, ref_root_bh, + p_cluster, num_clusters, + &ref_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, credits = %d\n", + ref_blocks, credits); + + if (ref_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ref_blocks, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (post) + credits += post->credits; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, + cpos, num_clusters, p_cluster, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, 0, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (post && post->func) { + ret = post->func(inode, handle, post->para); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_change_ctime(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_attach_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, data_changed = 0; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_tree *ref_tree; + unsigned int ext_flags; + loff_t size; + u32 cpos, num_clusters, clusters, p_cluster; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree di_et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + ret = ocfs2_create_refcount_tree(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + BUG_ON(!di->i_refcount_loc); + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto attach_xattr; + + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); + + size = i_size_read(inode); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(inode, &di_et, + &ref_tree->rf_ci, + ref_root_bh, cpos, + p_cluster, num_clusters, + &dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + data_changed = 1; + } + cpos += num_clusters; + } + +attach_xattr: + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, + &ref_tree->rf_ci, + ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + if (data_changed) { + ret = ocfs2_change_ctime(inode, di_bh); + if (ret) + mlog_errno(ret); + } + +unlock: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } +out: + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode, 0); + + return ret; +} + +static int ocfs2_add_refcounted_extent(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + unsigned int ext_flags, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + handle_t *handle; + int credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_refcount_allocators(inode->i_sb, + p_cluster, num_clusters, + et, ref_ci, + ref_root_bh, &meta_ac, + NULL, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), + num_clusters, ext_flags, meta_ac); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, + meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_duplicate_inline_data(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; + + BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; + memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, + le16_to_cpu(s_di->id2.i_data.id_count)); + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_duplicate_extent_list(struct inode *s_inode, + struct inode *t_inode, + struct buffer_head *t_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + u32 p_cluster, num_clusters, clusters, cpos; + loff_t size; + unsigned int ext_flags; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); + + size = i_size_read(s_inode); + clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster) { + ret = ocfs2_add_refcounted_extent(t_inode, &et, + ref_ci, ref_root_bh, + cpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += num_clusters; + } + +out: + return ret; +} + +/* + * change the new file's attributes to the src. + * + * reflink creates a snapshot of a file, that means the attributes + * must be identical except for three exceptions - nlink, ino, and ctime. + */ +static int ocfs2_complete_reflink(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; + loff_t size = i_size_read(s_inode); + + handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; + OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; + OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + i_size_write(t_inode, size); + t_inode->i_blocks = s_inode->i_blocks; + + di->i_xattr_inline_size = s_di->i_xattr_inline_size; + di->i_clusters = s_di->i_clusters; + di->i_size = s_di->i_size; + di->i_dyn_features = s_di->i_dyn_features; + di->i_attr = s_di->i_attr; + + if (preserve) { + t_inode->i_uid = s_inode->i_uid; + t_inode->i_gid = s_inode->i_gid; + t_inode->i_mode = s_inode->i_mode; + di->i_uid = s_di->i_uid; + di->i_gid = s_di->i_gid; + di->i_mode = s_di->i_mode; + + /* + * update time. + * we want mtime to appear identical to the source and + * update ctime. + */ + t_inode->i_ctime = CURRENT_TIME; + + di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + + t_inode->i_mtime = s_inode->i_mtime; + di->i_mtime = s_di->i_mtime; + di->i_mtime_nsec = s_di->i_mtime_nsec; + } + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); + return ret; +} + +static int ocfs2_create_reflink_node(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_refcount_block *rb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_refcount_tree *ref_tree; + + ocfs2_init_dealloc_ctxt(&dealloc); + + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(di->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, + t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, + &ref_tree->rf_ci, ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +static int __ocfs2_reflink(struct dentry *old_dentry, + struct buffer_head *old_bh, + struct inode *new_inode, + bool preserve) +{ + int ret; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *new_bh = NULL; + + ret = filemap_fdatawrite(inode->i_mapping); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_attach_refcount_tree(inode, old_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + mutex_lock(&new_inode->i_mutex); + ret = ocfs2_inode_lock(new_inode, &new_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_create_reflink_node(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_reflink_xattrs(inode, old_bh, + new_inode, new_bh, + preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + } + + ret = ocfs2_complete_reflink(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) + mlog_errno(ret); + +inode_unlock: + ocfs2_inode_unlock(new_inode, 1); + brelse(new_bh); +out_unlock: + mutex_unlock(&new_inode->i_mutex); +out: + if (!ret) { + ret = filemap_fdatawait(inode->i_mapping); + if (ret) + mlog_errno(ret); + } + return ret; +} + +static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + int error; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *old_bh = NULL; + struct inode *new_orphan_inode = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + &new_orphan_inode); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_inode_lock(inode, &old_bh, 1); + if (error) { + mlog_errno(error); + goto out; + } + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + down_write(&OCFS2_I(inode)->ip_alloc_sem); + error = __ocfs2_reflink(old_dentry, old_bh, + new_orphan_inode, preserve); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 1); + brelse(old_bh); + + if (error) { + mlog_errno(error); + goto out; + } + + /* If the security isn't preserved, we need to re-initialize them. */ + if (!preserve) { + error = ocfs2_init_security_and_acl(dir, new_orphan_inode); + if (error) + mlog_errno(error); + } +out: + if (!error) { + error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + new_dentry); + if (error) + mlog_errno(error); + } + + if (new_orphan_inode) { + /* + * We need to open_unlock the inode no matter whether we + * succeed or not, so that other nodes can delete it later. + */ + ocfs2_open_unlock(new_orphan_inode); + if (error) + iput(new_orphan_inode); + } + + return error; +} + +/* + * Below here are the bits used by OCFS2_IOC_REFLINK() to fake + * sys_reflink(). This will go away when vfs_reflink() exists in + * fs/namei.c. + */ + +/* copied from may_create in VFS. */ +static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* copied from user_path_parent. */ +static int ocfs2_user_path_parent(const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = path_lookup(s, LOOKUP_PARENT, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + +/** + * ocfs2_vfs_reflink - Create a reference-counted link + * + * @old_dentry: source dentry + inode + * @dir: directory to create the target + * @new_dentry: target dentry + * @preserve: if true, preserve all file attributes + */ +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + if (!inode) + return -ENOENT; + + error = ocfs2_may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A reflink to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + /* Only regular files can be reflinked. */ + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + /* + * If the caller wants to preserve ownership, they require the + * rights to do so. + */ + if (preserve) { + if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) + return -EPERM; + } + + /* + * If the caller is modifying any aspect of the attributes, they + * are not creating a snapshot. They need read permission on the + * file. + */ + if (!preserve) { + error = inode_permission(inode, MAY_READ); + if (error) + return error; + } + + mutex_lock(&inode->i_mutex); + dquot_initialize(dir); + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_create(dir, new_dentry); + return error; +} +/* + * Most codes are copied from sys_linkat. + */ +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int error; + char *to = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = user_path_at(AT_FDCWD, oldname, 0, &old_path); + if (error) { + mlog_errno(error); + return error; + } + + error = ocfs2_user_path_parent(newname, &nd, &to); + if (error) { + mlog_errno(error); + goto out; + } + + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { + mlog_errno(error); + goto out_unlock; + } + + error = mnt_want_write(nd.path.mnt); + if (error) { + mlog_errno(error); + goto out_dput; + } + + error = ocfs2_vfs_reflink(old_path.dentry, + nd.path.dentry->d_inode, + new_dentry, preserve); + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +}