X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fext4%2Fextents.c;h=441716f33b4a9c70be602ed88dc20d5eb8449608;hb=109f55651954def97fa41ee71c464d268c512ab0;hp=7844bbb2bac0ed700d25ecf13267aa6696ef2af2;hpb=9ddfc3dc75b5cc55ff3cfa586e962d252f1db9d3;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7844bbb..441716f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "ext4_extents.h" @@ -48,7 +49,7 @@ * ext_pblock: * combine low and high parts of physical block number into ext4_fsblk_t */ -static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) +ext4_fsblk_t ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; @@ -92,16 +93,27 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static int ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) { int err; + if (!ext4_handle_valid(handle)) + return 0; if (handle->h_buffer_credits > needed) return 0; err = ext4_journal_extend(handle, needed); - if (err) + if (err <= 0) return err; - return ext4_journal_restart(handle, needed); + err = ext4_truncate_restart_trans(handle, inode, needed); + /* + * We have dropped i_data_sem so someone might have cached again + * an extent we are going to truncate. + */ + ext4_ext_invalidate_cache(inode); + + return err; } /* @@ -133,7 +145,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode, int err; if (path->p_bh) { /* path points to block */ - err = ext4_journal_dirty_metadata(handle, path->p_bh); + err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -149,6 +161,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -167,10 +181,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -190,64 +225,102 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_block(handle, inode, goal, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); return newblock; } -static int ext4_ext_space_block(struct inode *inode) +static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 6) - size = 6; + if (size > 6) + size = 6; #endif + } return size; } -static int ext4_ext_space_block_idx(struct inode *inode) +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 5) - size = 5; + if (size > 5) + size = 5; #endif + } return size; } -static int ext4_ext_space_root(struct inode *inode) +static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 3) - size = 3; + if (size > 3) + size = 3; #endif + } return size; } -static int ext4_ext_space_root_idx(struct inode *inode) +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 4) - size = 4; + if (size > 4) + size = 4; #endif + } return size; } +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +{ + int lcap, icap, rcap, leafs, idxs, num; + int newextents = blocks; + + rcap = ext4_ext_space_root_idx(inode, 0); + lcap = ext4_ext_space_block(inode, 0); + icap = ext4_ext_space_block_idx(inode, 0); + + /* number of new leaf blocks needed */ + num = leafs = (newextents + lcap - 1) / lcap; + + /* + * Worse case, we need separate index block(s) + * to link all new leaf blocks + */ + idxs = (leafs + icap - 1) / icap; + do { + num += idxs; + idxs = (idxs + icap - 1) / icap; + } while (idxs > rcap); + + return num; +} + static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -255,20 +328,69 @@ ext4_ext_max_entries(struct inode *inode, int depth) if (depth == ext_depth(inode)) { if (depth == 0) - max = ext4_ext_space_root(inode); + max = ext4_ext_space_root(inode, 1); else - max = ext4_ext_space_root_idx(inode); + max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) - max = ext4_ext_space_block(inode); + max = ext4_ext_space_block(inode, 1); else - max = ext4_ext_space_block_idx(inode); + max = ext4_ext_space_block_idx(inode, 1); } return max; } -static int __ext4_ext_check_header(const char *function, struct inode *inode, +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = idx_pblock(ext_idx); + + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + struct ext4_extent *ext; + struct ext4_extent_idx *ext_idx; + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + ext = EXT_FIRST_EXTENT(eh); + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + ext++; + entries--; + } + } else { + ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, struct inode *inode, struct ext4_extent_header *eh, int depth) { @@ -296,11 +418,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode, error_msg = "invalid eh_entries"; goto corrupted; } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } return 0; corrupted: ext4_error(inode->i_sb, function, - "bad header in inode #%lu: %s - magic %x, " + "bad header/extent in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), @@ -309,8 +435,13 @@ corrupted: return -EIO; } -#define ext4_ext_check_header(inode, eh, depth) \ - __ext4_ext_check_header(__func__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth) \ + __ext4_ext_check(__func__, inode, eh, depth) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -323,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), idx_pblock(path->p_idx)); } else if (path->p_ext) { - ext_debug(" %d:%d:%llu ", + ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext_pblock(path->p_ext)); } else @@ -346,15 +478,18 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), ext4_ext_get_actual_len(ex), ext_pblock(ex)); } ext_debug("\n"); } #else -#define ext4_ext_show_path(inode,path) -#define ext4_ext_show_leaf(inode,path) +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -410,9 +545,10 @@ ext4_ext_binsearch_idx(struct inode *inode, for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { - printk("k=%d, ix=0x%p, first=0x%p\n", k, - ix, EXT_FIRST_INDEX(eh)); - printk("%u <= %u\n", + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } @@ -465,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode, } path->p_ext = l - 1; - ext_debug(" -> %d:%llu:%d ", + ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext_pblock(path->p_ext), + ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -497,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; - eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); ext4_ext_invalidate_cache(inode); return 0; @@ -513,9 +650,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); - if (ext4_ext_check_header(inode, eh, depth)) - return ERR_PTR(-EIO); - /* account possible depth increase */ if (!path) { @@ -531,6 +665,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { + int need_to_validate = 0; + ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -539,10 +675,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_bread(inode->i_sb, path[ppos].p_block); - if (!bh) + bh = sb_getblk(inode->i_sb, path[ppos].p_block); + if (unlikely(!bh)) goto err; - + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto err; + } + /* validate the extent entries */ + need_to_validate = 1; + } eh = ext_block_hdr(bh); ppos++; BUG_ON(ppos > depth); @@ -550,7 +693,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (ext4_ext_check_header(inode, eh, i)) + if (need_to_validate && ext4_ext_check(inode, eh, i)) goto err; } @@ -580,7 +723,7 @@ err: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, +int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { @@ -716,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = 0; - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; ex = EXT_FIRST_EXTENT(neh); @@ -729,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, path[depth].p_ext++; while (path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:%d in new leaf %llu\n", + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), ext_pblock(path[depth].p_ext), + ext4_ext_is_uninitialized(path[depth].p_ext), ext4_ext_get_actual_len(path[depth].p_ext), newblock); /*memmove(ex++, path[depth].p_ext++, @@ -748,7 +892,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -791,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; @@ -827,7 +971,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -916,14 +1060,14 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; @@ -933,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, goto out; curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); + curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); curp->p_hdr->eh_entries = cpu_to_le16(1); curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); @@ -1088,7 +1232,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; - int depth, ee_len; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; BUG_ON(path == NULL); depth = path->p_depth; @@ -1128,15 +1273,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) - break; + goto got_index; } - if (depth < 0) { - /* we've gone up to the root and - * found no index to the right */ - return 0; - } + /* we've gone up to the root and found no index to the right */ + return 0; +got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ @@ -1147,7 +1290,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, depth)) { + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -1160,7 +1304,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -1169,7 +1313,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, *phys = ext_pblock(ex); put_bh(bh); return 0; - } /* @@ -1297,7 +1440,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -static int +int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { @@ -1411,7 +1554,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, /* * get the next allocated block if the extent in the path - * is before the requested block(s) + * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); @@ -1443,9 +1586,9 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_extent *newext, int flag) { - struct ext4_extent_header * eh; + struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; @@ -1459,10 +1602,13 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, BUG_ON(path[depth].p_hdr == NULL); /* try to insert block into found extent and return */ - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append %d block to %d:%d (from %llu)\n", + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + && ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), ext4_ext_get_actual_len(ex), ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -1531,9 +1677,10 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:%d\n", + ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); } else if (le32_to_cpu(newext->ee_block) @@ -1543,10 +1690,11 @@ has_space: len = EXT_MAX_EXTENT(eh) - nearex; len = (len - 1) * sizeof(struct ext4_extent); len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:%d after: nearest 0x%p, " + ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 2, nearex + 1, len); @@ -1556,10 +1704,11 @@ has_space: BUG_ON(newext->ee_block == nearex->ee_block); len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:%d before: nearest 0x%p, " + ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 1, nearex, len); @@ -1574,7 +1723,8 @@ has_space: merge: /* try to merge extents to the right */ - ext4_ext_try_to_merge(inode, path, nearex); + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -1590,22 +1740,130 @@ cleanup: ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_tree_changed(inode); ext4_ext_invalidate_cache(inode); return err; } +int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext4_ext_find_extent(inode, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT4_EXT_CACHE_GAP; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext_pblock(ex); + cbex.ec_type = EXT4_EXT_CACHE_EXTENT; + } + + BUG_ON(cbex.ec_len == 0); + err = func(inode, path, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, __u32 len, ext4_fsblk_t start, int type) { struct ext4_ext_cache *cex; BUG_ON(len == 0); + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_type = type; cex->ec_block = block; cex->ec_len = len; cex->ec_start = start; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } /* @@ -1662,12 +1920,17 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; + int ret = EXT4_EXT_CACHE_NO; + /* + * We borrow i_block_reservation_lock to protect i_cached_extent + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; /* has cache valid data? */ if (cex->ec_type == EXT4_EXT_CACHE_NO) - return EXT4_EXT_CACHE_NO; + goto errout; BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && cex->ec_type != EXT4_EXT_CACHE_EXTENT); @@ -1678,11 +1941,11 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); - return cex->ec_type; + ret = cex->ec_type; } - - /* not in cache */ - return EXT4_EXT_CACHE_NO; +errout: + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return ret; } /* @@ -1717,54 +1980,62 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, } /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. - * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { - int depth, needed; - if (path) { + int depth = ext_depth(inode); + int ret = 0; + /* probably there is space in leaf? */ - depth = ext_depth(inode); if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) - return 1; - } + < le16_to_cpu(path[depth].p_hdr->eh_max)) { - /* - * given 32-bit logical block (4294967296 blocks), max. tree - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. - * Let's also add one more level for imbalance. - */ - depth = 5; - - /* allocation of new data block(s) */ - needed = 2; + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; + } + } - /* - * tree can be full, so it would need to grow in depth: - * we need one credit to modify old root, credits for - * new root will be added in split accounting - */ - needed += 1; + return ext4_chunk_trans_blocks(inode, nrblocks); +} - /* - * Index split can happen, we would need: - * allocate intermediate indexes (bitmap + group) - * + change two blocks at each level, but root (already included) - */ - needed += (depth * 2) + (depth * 2); +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); - /* any allocation modifies superblock */ - needed += 1; + if (chunk) + index = depth * 2; + else + index = depth * 3; - return needed; + return index; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, @@ -1843,13 +2114,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; ex_ee_len = ext4_ext_get_actual_len(ex); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); + + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + else + uninitialized = 0; + + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + uninitialized, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -1880,18 +2156,20 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, BUG_ON(b != ex_ee_block + ex_ee_len - 1); } - /* at present, extent can't cross block group: */ - /* leaf + bitmap + group desc + sb + inode */ - credits = 5; + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } -#ifdef CONFIG_QUOTA credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - err = ext4_ext_journal_restart(handle, credits); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) goto out; @@ -1989,7 +2267,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } @@ -2043,7 +2321,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) err = -EIO; break; } - if (ext4_ext_check_header(inode, ext_block_hdr(bh), + if (ext4_ext_check(inode, ext_block_hdr(bh), depth - i - 1)) { err = -EIO; break; @@ -2080,12 +2358,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root(inode)); + cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: - ext4_ext_tree_changed(inode); ext4_ext_drop_refs(path); kfree(path); ext4_journal_stop(handle); @@ -2102,8 +2379,9 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (test_opt(sb, EXTENTS)) { - printk("EXT4-fs: file extents enabled"); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); #endif @@ -2114,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb) printk(", stats"); #endif printk("\n"); +#endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; @@ -2127,7 +2406,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!test_opt(sb, EXTENTS)) + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) return; #ifdef EXTENTS_STATS @@ -2174,8 +2453,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) len = ee_len; bio = bio_alloc(GFP_NOIO, len); - if (!bio) - return -ENOMEM; bio->bi_sector = ee_pblock; bio->bi_bdev = inode->i_sb->s_bdev; @@ -2217,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } #define EXT4_EXT_ZERO_LEN 7 - /* * This function is called by ext4_ext_get_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -2232,7 +2508,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t iblock, - unsigned long max_blocks) + unsigned int max_blocks) { struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; @@ -2293,7 +2569,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, unsigned int newdepth; /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ if (allocated <= EXT4_EXT_ZERO_LEN) { - /* Mark first half uninitialized. + /* + * iblock == ee_block is handled by the zerouout + * at the beginning. + * Mark first half uninitialized. * Mark second half initialized and zero out the * initialized extent */ @@ -2307,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex3->ee_block = cpu_to_le32(iblock); ext4_ext_store_pblock(ex3, newblock); ex3->ee_len = cpu_to_le16(allocated); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, + ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2316,7 +2596,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_len = orig_ex.ee_len; ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ + /* blocks available from iblock */ return allocated; } else if (err) @@ -2344,6 +2624,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = PTR_ERR(path); return err; } + /* get the second half extent details */ ex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); @@ -2362,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex3, newblock + max_blocks); ex3->ee_len = cpu_to_le16(allocated - max_blocks); ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2373,6 +2654,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ + /* blocks available from iblock */ return allocated; } else if (err) @@ -2383,28 +2665,27 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ newdepth = ext_depth(inode); /* - * update the extent length after successfull insert of the + * update the extent length after successful insert of the * split extent */ orig_ex.ee_len = cpu_to_le16(ee_len - ext4_ext_get_actual_len(ex3)); - if (newdepth != depth) { - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + allocated = max_blocks; /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying @@ -2422,6 +2703,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ + /* blocks available from iblock */ return allocated; } } @@ -2479,7 +2761,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + depth); goto out; insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex); + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; +out: + ext4_ext_show_leaf(inode, path); + return err ? err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * This function is called by ext4_ext_get_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an uninitialized extent. + * + * Writing to an uninitized extent may result in splitting the uninitialized + * extent into multiple /intialized unintialized extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be uninitialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * One of more index blocks maybe needed if the extent tree grow after + * the unintialized extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the uninitialized extent before DIO submit + * the IO. The uninitilized extent called at this time will be split + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + */ +static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t iblock, + unsigned int max_blocks, + int flags) +{ + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; + ext4_lblk_t ee_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int ret = 0; + + ext_debug("ext4_split_unwritten_extents: inode %lu," + "iblock %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)iblock, max_blocks); + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + + /* + * if the entire unintialized extent length less than + * the size of extent to write, there is no need to split + * uninitialized extent + */ + if (allocated <= max_blocks) + return ret; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to iblock - 1 : uninitialized */ + if (iblock > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. + */ + if (!ex1 && allocated > max_blocks) + ex2->ee_len = cpu_to_le16(max_blocks); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > max_blocks) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock + max_blocks); + ext4_ext_store_pblock(ex3, newblock + max_blocks); + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from iblock */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + orig_ex.ee_len = cpu_to_le16(ee_len - + ext4_ext_get_actual_len(ex3)); + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + allocated = max_blocks; + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, + * uninitialised still. + */ + ex2->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + ext4_ext_mark_uninitialized(ex2); + if (ex2 != ex) + goto insert; + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + ext_debug("out here\n"); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2494,6 +2960,7 @@ insert: } else if (err) goto fix_extent_len; out: + ext4_ext_show_leaf(inode, path); return err ? err : allocated; fix_extent_len: @@ -2504,7 +2971,141 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + depth); return err; } +static int ext4_convert_unwritten_extents_dio(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + struct ext4_extent_header *eh; + int depth; + int err = 0; + int ret = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex--; + } + } + /* + * Try to Merge towards right. + */ + ret = ext4_ext_try_to_merge(inode, path, ex); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static int +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int max_blocks, + struct ext4_ext_path *path, int flags, + unsigned int allocated, struct buffer_head *bh_result, + ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" + "block %llu, max_blocks %u, flags %d, allocated %u", + inode->i_ino, (unsigned long long)iblock, max_blocks, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + /* DIO get_block() before submit the IO, split the extent */ + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + ret = ext4_split_unwritten_extents(handle, + inode, path, iblock, + max_blocks, flags); + /* flag the io_end struct that we need convert when IO done */ + if (io) + io->flag = DIO_AIO_UNWRITTEN; + goto out; + } + /* DIO end_io complete, convert the filled extent to written */ + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { + ret = ext4_convert_unwritten_extents_dio(handle, inode, + path); + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + goto map_out; + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + set_buffer_unwritten(bh_result); + goto out1; + } + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, + path, iblock, + max_blocks); +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + set_buffer_new(bh_result); +map_out: + set_buffer_mapped(bh_result); +out1: + if (allocated > max_blocks) + allocated = max_blocks; + ext4_ext_show_leaf(inode, path); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} /* * Block allocation/map/preallocation routine for extents based files * @@ -2525,26 +3126,27 @@ fix_extent_len: */ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize) + unsigned int max_blocks, struct buffer_head *bh_result, + int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; struct ext4_extent newex, *ex; - ext4_fsblk_t goal, newblock; - int err = 0, depth, ret; - unsigned long allocated = 0; + ext4_fsblk_t newblock; + int err = 0, depth, ret, cache_type; + unsigned int allocated = 0; struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; __clear_bit(BH_New, &bh_result->b_state); - ext_debug("blocks %u/%lu requested for inode %u\n", + ext_debug("blocks %u/%u requested for inode %lu\n", iblock, max_blocks, inode->i_ino); /* check in cache */ - goal = ext4_ext_in_cache(inode, iblock, &newex); - if (goal) { - if (goal == EXT4_EXT_CACHE_GAP) { - if (!create) { + cache_type = ext4_ext_in_cache(inode, iblock, &newex); + if (cache_type) { + if (cache_type == EXT4_EXT_CACHE_GAP) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and * user doesn't want to allocate it @@ -2552,7 +3154,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (goal == EXT4_EXT_CACHE_EXTENT) { + } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { /* block is already allocated */ newblock = iblock - le32_to_cpu(newex.ee_block) @@ -2600,7 +3202,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = iblock - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (iblock - ee_block); - ext_debug("%u fit into %lu:%d -> %llu\n", iblock, + ext_debug("%u fit into %u:%d -> %llu\n", iblock, ee_block, ee_len, newblock); /* Do not put uninitialized extent in the cache */ @@ -2610,31 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (create == EXT4_CREATE_UNINITIALIZED_EXT) - goto out; - if (!create) { - /* - * We have blocks reserved already. We - * return allocated blocks so that delalloc - * won't do block reservation for us. But - * the buffer head will be unmapped so that - * a read from the block returns 0s. - */ - if (allocated > max_blocks) - allocated = max_blocks; - set_buffer_unwritten(bh_result); - goto out2; - } - - ret = ext4_ext_convert_to_initialized(handle, inode, - path, iblock, - max_blocks); - if (ret <= 0) { - err = ret; - goto out2; - } else - allocated = ret; - goto outnew; + ret = ext4_ext_handle_uninitialized_extents(handle, + inode, iblock, max_blocks, path, + flags, allocated, bh_result, newblock); + return ret; } } @@ -2642,7 +3223,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * put just found gap into cache to speed up * subsequent requests @@ -2651,11 +3232,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary. + * Okay, we need to do block allocation. */ - if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) - ext4_init_block_alloc_info(inode); /* find neighbour allocated blocks */ ar.lleft = iblock; @@ -2674,10 +3252,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * EXT_UNINIT_MAX_LEN. */ if (max_blocks > EXT_INIT_MAX_LEN && - create != EXT4_CREATE_UNINITIALIZED_EXT) + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_INIT_MAX_LEN; else if (max_blocks > EXT_UNINIT_MAX_LEN && - create == EXT4_CREATE_UNINITIALIZED_EXT) + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ @@ -2702,20 +3280,32 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%lu\n", - goal, newblock, allocated); + ext_debug("allocate new block: goal %llu, found %llu/%u\n", + ar.goal, newblock, allocated); /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); - err = ext4_ext_insert_extent(handle, inode, path, &newex); + /* + * io_end structure was created for every async + * direct IO write to the middle of the file. + * To avoid unecessary convertion for every aio dio rewrite + * to the mid of file, here we flag the IO that is really + * need the convertion. + * + */ + if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) + io->flag = DIO_AIO_UNWRITTEN; + } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, ext_pblock(&newex), ext4_ext_get_actual_len(&newex), 0); goto out2; @@ -2724,14 +3314,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); -outnew: - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ - if (create != EXT4_CREATE_UNINITIALIZED_EXT) + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); out: @@ -2760,7 +3346,7 @@ void ext4_ext_truncate(struct inode *inode) /* * probably first extent we're gonna free will be last in block */ - err = ext4_writepage_trans_blocks(inode) + 3; + err = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, err); if (IS_ERR(handle)) return; @@ -2774,7 +3360,7 @@ void ext4_ext_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_preallocations(inode); /* * TODO: optimization is possible here. @@ -2794,7 +3380,7 @@ void ext4_ext_truncate(struct inode *inode) * transaction synchronous. */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); out_stop: up_write(&EXT4_I(inode)->i_data_sem); @@ -2813,27 +3399,6 @@ out_stop: ext4_journal_stop(handle); } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) -{ - int needed; - - needed = ext4_ext_calc_credits_for_insert(inode, NULL); - - /* caller wants to allocate num blocks, but note it includes sb */ - needed = needed * num - (num - 1); - -#ifdef CONFIG_QUOTA - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - - return needed; -} - static void ext4_falloc_update_inode(struct inode *inode, int mode, loff_t new_size, int update_ctime) { @@ -2848,10 +3413,11 @@ static void ext4_falloc_update_inode(struct inode *inode, * Update only when preallocation was requested beyond * the file size. */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > i_size_read(inode)) { - i_size_write(inode, new_size); - EXT4_I(inode)->i_disksize = new_size; + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); } } @@ -2868,7 +3434,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) handle_t *handle; ext4_lblk_t block; loff_t new_size; - unsigned long max_blocks; + unsigned int max_blocks; int ret = 0; int ret2 = 0; int retries = 0; @@ -2894,10 +3460,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - block; /* - * credits to insert 1 extent into extent tree + buffers to be able to - * modify 1 super block, 1 block bitmap and 1 group descriptor. + * credits to insert 1 extent into extent tree */ - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; + credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { @@ -2908,15 +3473,16 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_get_blocks_wrap(handle, inode, block, - max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0); + map_bh.b_state = 0; + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); printk(KERN_ERR "%s: ext4_ext_get_blocks " "returned error inode#%lu, block=%u, " - "max_blocks=%lu", __func__, + "max_blocks=%u", __func__, inode->i_ino, block, max_blocks); #endif ext4_mark_inode_dirty(handle, inode); @@ -2944,3 +3510,209 @@ retry: mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } + +/* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + ext4_lblk_t block; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct buffer_head map_bh; + unsigned int credits, blkbits = inode->i_blkbits; + + block = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - block; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + while (ret >= 0 && ret < max_blocks) { + block = block + ret; + max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + map_bh.b_state = 0; + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_GET_BLOCKS_DIO_CONVERT_EXT); + if (ret <= 0) { + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, block, max_blocks); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2 ) + break; + } + return ret > 0 ? ret2 : ret; +} +/* + * Callback function called for each extent to gather FIEMAP information. + */ +static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) +{ + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int error; + + logical = (__u64)newex->ec_block << blksize_bits; + + if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + pgoff_t offset; + struct page *page; + struct buffer_head *bh = NULL; + + offset = logical >> PAGE_SHIFT; + page = find_get_page(inode->i_mapping, offset); + if (!page || !page_has_buffers(page)) + return EXT_CONTINUE; + + bh = page_buffers(page); + + if (!bh) + return EXT_CONTINUE; + + if (buffer_delay(bh)) { + flags |= FIEMAP_EXTENT_DELALLOC; + page_cache_release(page); + } else { + page_cache_release(page); + return EXT_CONTINUE; + } + } + + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + /* + * If this extent reaches EXT_MAX_BLOCK, it must be last. + * + * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, + * this also indicates no more allocated blocks. + * + * XXX this might miss a single-block extent at EXT_MAX_BLOCK + */ + if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { + loff_t size = i_size_read(inode); + loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); + + flags |= FIEMAP_EXTENT_LAST; + if ((flags & FIEMAP_EXTENT_DELALLOC) && + logical+length > size) + length = (size - logical + bs - 1) & ~(bs-1); + } + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (error < 0) + return error; + if (error == 1) + return EXT_BREAK; + + return EXT_CONTINUE; +} + +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? */ + if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + ext4_lblk_t len_blks; + int error = 0; + + /* fallback to generic here if not in extents fmt */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + start_blk = start >> inode->i_sb->s_blocksize_bits; + len_blks = len >> inode->i_sb->s_blocksize_bits; + + /* + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. + */ + down_read(&EXT4_I(inode)->i_data_sem); + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); + up_read(&EXT4_I(inode)->i_data_sem); + } + + return error; +} +