X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;ds=sidebyside;f=fs%2Fext4%2Finode.c;h=3e0f6af9d08d9a6faf3bea7c25087b94b8b883b0;hb=0993dbedf2cc2f5fd0701fa3b27afdd303536b87;hp=0c0ddc1401e4cde5f3b043b82dfe4f21dea21846;hpb=1032988c71f3f85483b2b4319684d1205a704c02;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c0ddc1..3e0f6af 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,8 @@ #include #include #include +#include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -71,59 +73,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * The ext4 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. - * - * If the handle isn't valid we're not journaling, but we still need to - * call into ext4_journal_revoke() to put the buffer head. - */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - trace_ext4_forget(inode, is_metadata, blocknr); - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext4_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return ext4_journal_forget(handle, bh); - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext4_journal_revoke"); - err = ext4_journal_revoke(handle, blocknr, bh); - if (err) - ext4_abort(inode->i_sb, __func__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; -} - -/* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ @@ -223,6 +172,9 @@ void ext4_delete_inode(struct inode *inode) handle_t *handle; int err; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); @@ -247,7 +199,7 @@ void ext4_delete_inode(struct inode *inode) inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } @@ -265,7 +217,7 @@ void ext4_delete_inode(struct inode *inode) if (err > 0) err = ext4_journal_restart(handle, 3); if (err != 0) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't extend journal (err %d)", err); stop_handle: ext4_journal_stop(handle); @@ -376,8 +328,7 @@ static int ext4_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max in inode %lu", + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } @@ -397,7 +348,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - ext4_error(inode->i_sb, function, + __ext4_error(inode->i_sb, function, "invalid block reference %u " "in inode #%lu", blk, inode->i_ino); return -EIO; @@ -660,7 +611,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (*err) goto failed_out; - BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } target -= count; /* allocate blocks for indirect blocks */ @@ -696,7 +654,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; current_block = ext4_mb_new_blocks(handle, &ar, err); - BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } if (*err && (target == blks)) { /* @@ -722,7 +687,7 @@ allocated: return ret; failed_out: for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); return ret; } @@ -818,14 +783,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return err; failed: /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, branch[i].bh); + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); } - for (i = 0; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); return err; } @@ -904,12 +875,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, where[i].bh); - ext4_free_blocks(handle, inode, - le32_to_cpu(where[i-1].key), 1, 0); + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), + blks, 0); return err; } @@ -1022,10 +997,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - else + if (err) goto cleanup; set_buffer_new(bh_result); + + ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -1044,92 +1021,121 @@ out: return err; } -qsize_t ext4_get_reserved_space(struct inode *inode) +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) { - unsigned long long total; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + - EXT4_I(inode)->i_reserved_meta_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - return total; + return &EXT4_I(inode)->i_reserved_quota; } +#endif + /* * Calculate the number of metadata blocks need to reserve - * to allocate @blocks for non extent file based file + * to allocate a new block at @lblocks for non extent file based file */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_indirect_calc_metadata_amount(struct inode *inode, + sector_t lblock) { - int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ind_blks, dind_blks, tind_blks; - - /* number of new indirect blocks needed */ - ind_blks = (blocks + icap - 1) / icap; + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; - dind_blks = (ind_blks + icap - 1) / icap; + if (lblock < EXT4_NDIR_BLOCKS) + return 0; - tind_blks = 1; + lblock -= EXT4_NDIR_BLOCKS; - return ind_blks + dind_blks + tind_blks; + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } /* * Calculate the number of metadata blocks need to reserve - * to allocate given number of blocks + * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) { - if (!blocks) - return 0; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_calc_metadata_amount(inode, blocks); + return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, blocks); + return ext4_indirect_calc_metadata_amount(inode, lblock); } -static void ext4_da_update_reserve_space(struct inode *inode, int used) +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - used; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - if (mdb_free) { - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - - /* update fs dirty blocks counter */ + struct ext4_inode_info *ei = EXT4_I(inode); + int mdb_free = 0, allocated_meta_blocks = 0; + + spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + "with only %d reserved data blocks\n", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + used += ei->i_allocated_meta_blocks; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + allocated_meta_blocks = ei->i_allocated_meta_blocks; + ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + mdb_free = ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); - EXT4_I(inode)->i_allocated_meta_blocks = 0; - EXT4_I(inode)->i_reserved_meta_blocks = mdb; } - - /* update per-inode reservations */ - BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= used; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - /* - * free those over-booking quota for metadata blocks - */ - if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + /* Update quota subsystem */ + if (quota_claim) { + dquot_claim_block(inode, used); + if (mdb_free) + dquot_release_reservation_block(inode, mdb_free); + } else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not update the quota for allocated blocks. But then + * converting an fallocate region to initialized region would + * have caused a metadata allocation. So claim quota for + * that + */ + if (allocated_meta_blocks) + dquot_claim_block(inode, allocated_meta_blocks); + dquot_release_reservation_block(inode, mdb_free + used); + } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ - if (!total && (atomic_read(&inode->i_writecount) == 0)) + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) ext4_discard_preallocations(inode); } @@ -1137,7 +1143,7 @@ static int check_block_validity(struct inode *inode, const char *msg, sector_t logical, sector_t phys, int len) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - ext4_error(inode->i_sb, msg, + __ext4_error(inode->i_sb, msg, "inode #%lu logical block %llu mapped to %llu " "(size %d)", inode->i_ino, (unsigned long long) logical, @@ -1319,20 +1325,22 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } - } + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. - */ - if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) - ext4_da_update_reserve_space(inode, retval); - up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { int ret = check_block_validity(inode, "file system " @@ -1535,6 +1543,18 @@ static int do_journal_get_write_access(handle_t *handle, return ext4_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1576,8 +1596,12 @@ retry: } *pagep = page; - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + if (ext4_should_dioread_nolock(inode)) + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block_write); + else + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -1600,7 +1624,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to @@ -1710,7 +1734,7 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1752,7 +1776,7 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1794,7 +1818,7 @@ static int ext4_journalled_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1815,7 +1839,7 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1828,11 +1852,16 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } -static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +/* + * Reserve a single block located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned long md_needed, mdblocks, total = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long md_needed, md_reserved; + int ret; /* * recalculate the amount of metadata blocks to reserve @@ -1840,86 +1869,80 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) * worse case is one extent per block */ repeat: - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; - mdblocks = ext4_calc_metadata_amount(inode, total); - BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); - - md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; - total = md_needed + nrblocks; + spin_lock(&ei->i_block_reservation_lock); + md_reserved = ei->i_reserved_meta_blocks; + md_needed = ext4_calc_metadata_amount(inode, lblock); + trace_ext4_da_reserve_space(inode, md_needed); + spin_unlock(&ei->i_block_reservation_lock); /* * Make quota reservation here to prevent quota overflow * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return -EDQUOT; - } + ret = dquot_reserve_block(inode, md_needed + 1); + if (ret) + return ret; - if (ext4_claim_free_blocks(sbi, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, total); + if (ext4_claim_free_blocks(sbi, md_needed + 1)) { + dquot_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; } return -ENOSPC; } - EXT4_I(inode)->i_reserved_data_blocks += nrblocks; - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return 0; /* success */ } static void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free, release; + struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - if (!EXT4_I(inode)->i_reserved_data_blocks) { + if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* - * if there is no reserved blocks, but we try to free some - * then the counter is messed up somewhere. - * but since this function is called from invalidate - * page, it's harmless to return without any action + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. */ - printk(KERN_INFO "ext4 delalloc try to release %d reserved " - "blocks for inode %lu, but there is no reserved " - "data blocks\n", to_free, inode->i_ino); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return; + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks\n", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; } + ei->i_reserved_data_blocks -= to_free; - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - to_free; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - release = to_free + mdb_free; - - /* update fs dirty blocks counter for truncate case */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + to_free += ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } - /* update per-inode reservations */ - BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= to_free; + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - EXT4_I(inode)->i_reserved_meta_blocks = mdb; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, release); + dquot_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -2096,6 +2119,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); + if (buffer_uninit(exbh)) + set_buffer_uninit(bh); cur_logical++; pblock++; } while ((bh = bh->b_this_page) != head); @@ -2138,17 +2163,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) + if (page->index > end) break; - index++; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); block_invalidatepage(page, 0); ClearPageUptodate(page); unlock_page(page); } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); } return; } @@ -2224,10 +2248,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * variables are updated after the blocks have been allocated. */ new.b_state = 0; - get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_DELALLOC_RESERVE); + get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (ext4_should_dioread_nolock(mpd->inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, &new, get_blocks_flags); if (blks < 0) { @@ -2525,7 +2551,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - ret = ext4_da_reserve_space(inode, 1); + ret = ext4_da_reserve_space(inode, iblock); if (ret) /* not enough space to reserve */ return ret; @@ -2601,7 +2627,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) } static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc, unsigned int len) { struct address_space *mapping = page->mapping; @@ -2636,11 +2661,14 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: return ret; } +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -2688,7 +2716,7 @@ static int ext4_writepage(struct page *page, int ret = 0; loff_t size; unsigned int len; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; trace_ext4_writepage(inode, page); @@ -2759,12 +2787,16 @@ static int ext4_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc, len); + return __ext4_journalled_writepage(page, len); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); - else + else if (page_bufs && buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + ret = block_write_full_page_endio(page, noalloc_get_block_write, + wbc, ext4_end_io_buffer_write); + } else ret = block_write_full_page(page, noalloc_get_block_write, wbc); @@ -2934,7 +2966,7 @@ retry: ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); /* - * If we have a contigous extent of pages and we + * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit * them for I/O. */ @@ -3000,8 +3032,7 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - if (wbc->nr_to_write > nr_to_writebump) - wbc->nr_to_write -= nr_to_writebump; + wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; @@ -3026,11 +3057,18 @@ static int ext4_nonda_switch(struct super_block *sb) if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { /* - * free block count is less that 150% of dirty blocks - * or free blocks is less that watermark + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark */ return 1; } + /* + * Even if we don't switch but are nearing capacity, + * start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (free_blocks < 2 * dirty_blocks) + writeback_inodes_sb_if_idle(sb); + return 0; } @@ -3038,7 +3076,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret, retries = 0; + int ret, retries = 0, quota_retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -3092,11 +3130,27 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - ext4_truncate(inode); + ext4_truncate_failed_write(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + + if ((ret == -EDQUOT) && + EXT4_I(inode)->i_reserved_meta_blocks && + (quota_retries++ < 3)) { + /* + * Since we often over-estimate the number of meta + * data blocks required, we may sometimes get a + * spurios out of quota error even though there would + * be enough space once we write the data blocks and + * find out how many meta data blocks were _really_ + * required. So try forcing the inode write to see if + * that helps. + */ + write_inode_now(inode, (quota_retries == 3)); + goto retry; + } out: return ret; } @@ -3285,7 +3339,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -3304,7 +3359,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); journal = EXT4_JOURNAL(inode); jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal); @@ -3329,11 +3384,45 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } +static void ext4_free_io_end(ext4_io_end_t *io) +{ + BUG_ON(!io); + if (io->page) + put_page(io->page); + iput(io->inode); + kfree(io); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + if (!page_has_buffers(page)) + return; + head = bh = page_buffers(page); + do { + if (offset <= curr_off && test_clear_buffer_uninit(bh) + && bh->b_private) { + ext4_free_io_end(bh->b_private); + bh->b_private = NULL; + bh->b_end_io = NULL; + } + curr_off = curr_off + bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +} + static void ext4_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); /* + * free any io_end structure allocated for buffers to be discarded + */ + if (ext4_should_dioread_nolock(page->mapping->host)) + ext4_invalidatepage_free_endio(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0) @@ -3404,7 +3493,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + if (rw == READ && ext4_should_dioread_nolock(inode)) + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + else + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3420,6 +3516,9 @@ retry: * but cannot extend i_size. Bail out and pretend * the write failed... */ ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + goto out; } if (inode->i_nlink) @@ -3447,75 +3546,63 @@ out: return ret; } -static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = NULL; + handle_t *handle = ext4_journal_current_handle(); int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; + int started = 0; - ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", inode->i_ino, create); /* - * DIO VFS code passes create = 0 flag for write to - * the middle of file. It does this to avoid block - * allocation for holes, to prevent expose stale data - * out when there is parallel buffered read (which does - * not hold the i_mutex lock) while direct IO write has - * not completed. DIO request on holes finally falls back - * to buffered IO for this reason. - * - * For ext4 extent based file, since we support fallocate, - * new allocated extent as uninitialized, for holes, we - * could fallocate blocks for holes, thus parallel - * buffered IO read will zero out the page when read on - * a hole while parallel DIO write to the hole has not completed. - * - * when we come here, we know it's a direct IO write to - * to the middle of file ( DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + if (!handle) { + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; } + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; } - ext4_journal_stop(handle); + if (started) + ext4_journal_stop(handle); out: return ret; } -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - iput(io->inode); - kfree(io); -} -static void dump_aio_dio_list(struct inode * inode) +static void dump_completed_IO(struct inode * inode) { #ifdef EXT4_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; + unsigned long flags; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); return; } - ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ cur = &io->list; before = cur->prev; io0 = container_of(before, ext4_io_end_t, list); @@ -3525,32 +3612,31 @@ static void dump_aio_dio_list(struct inode * inode) ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", io, inode->i_ino, io0, io1); } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); #endif } /* * check a range of space and convert unwritten extents to written. */ -static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) +static int ext4_end_io_nolock(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; - size_t size = io->size; + ssize_t size = io->size; int ret = 0; - ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); if (list_empty(&io->list)) return ret; - if (io->flag != DIO_AIO_UNWRITTEN) + if (io->flag != EXT4_IO_UNWRITTEN) return ret; - if (offset + size <= i_size_read(inode)) - ret = ext4_convert_unwritten_extents(inode, offset, size); - + ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { printk(KERN_EMERG "%s: failed to convert unwritten" "extents to written extents, error is %d" @@ -3563,50 +3649,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) io->flag = 0; return ret; } + /* * work on completed aio dio IO, to convert unwritten extents to extents */ -static void ext4_end_aio_dio_work(struct work_struct *work) +static void ext4_end_io_work(struct work_struct *work) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - int ret = 0; + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; mutex_lock(&inode->i_mutex); - ret = ext4_end_aio_dio_nolock(io); - if (ret >= 0) { - if (!list_empty(&io->list)) - list_del_init(&io->list); - ext4_free_io_end(io); + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); } + /* * This function is called from ext4_sync_file(). * - * When AIO DIO IO is completed, the work to convert unwritten - * extents to written is queued on workqueue but may not get immediately + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately * scheduled. When fsync is called, we need to ensure the * conversion is complete before fsync returns. - * The inode keeps track of a list of completed AIO from DIO path - * that might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents to written. + * The inode keeps track of a list of pending/completed IO that + * might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents for completed IO + * to written. + * The function return the number of pending IOs on success. */ -int flush_aio_dio_completed_IO(struct inode *inode) +int flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; int ret = 0; int ret2 = 0; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + if (list_empty(&ei->i_completed_io_list)) return ret; - dump_aio_dio_list(inode); - while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); /* - * Calling ext4_end_aio_dio_nolock() to convert completed + * Calling ext4_end_io_nolock() to convert completed * IO to written. * * When ext4_sync_file() is called, run_queue() may already @@ -3619,20 +3719,23 @@ int flush_aio_dio_completed_IO(struct inode *inode) * avoid double converting from both fsync and background work * queue work. */ - ret = ext4_end_aio_dio_nolock(io); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; else list_del_init(&io->list); } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode) +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) { ext4_io_end_t *io = NULL; - io = kmalloc(sizeof(*io), GFP_NOFS); + io = kmalloc(sizeof(*io), flags); if (io) { igrab(inode); @@ -3640,8 +3743,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode) io->flag = 0; io->offset = 0; io->size = 0; - io->error = 0; - INIT_WORK(&io->work, ext4_end_aio_dio_work); + io->page = NULL; + INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3653,6 +3756,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + unsigned long flags; + struct ext4_inode_info *ei; /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) @@ -3664,7 +3769,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != DIO_AIO_UNWRITTEN){ + if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; return; @@ -3672,16 +3777,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, io_end->offset = offset; io_end->size = size; + io_end->flag = EXT4_IO_UNWRITTEN; wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); /* Add the io_end to per-inode completed aio dio list*/ - list_add_tail(&io_end->list, - &EXT4_I(io_end->inode)->i_aio_dio_complete_list); + ei = EXT4_I(io_end->inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; } + +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +{ + ext4_io_end_t *io_end = bh->b_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + + if (!test_clear_buffer_uninit(bh) || !io_end) + goto out; + + if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { + printk("sb umounted, discard end_io request for inode %lu\n", + io_end->inode->i_ino); + ext4_free_io_end(io_end); + goto out; + } + + io_end->flag = EXT4_IO_UNWRITTEN; + inode = io_end->inode; + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +out: + bh->b_private = NULL; + bh->b_end_io = NULL; + clear_buffer_uninit(bh); + end_buffer_async_write(bh, uptodate); +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; + size_t size = bh->b_size; + +retry: + io_end = ext4_init_io_end(inode, GFP_ATOMIC); + if (!io_end) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: allocation fail\n", __func__); + schedule(); + goto retry; + } + io_end->offset = offset; + io_end->size = size; + /* + * We need to hold a reference to the page to make sure it + * doesn't get evicted before ext4_end_io_work() has a chance + * to convert the extent from written to unwritten. + */ + io_end->page = page; + get_page(io_end->page); + + bh->b_private = io_end; + bh->b_end_io = ext4_end_io_buffer_write; + return 0; +} + /* * For ext4 extent files, ext4 will do direct-io write to holes, * preallocated extents, and those write extend the file, no need to @@ -3735,7 +3909,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; EXT4_I(inode)->cur_aio_dio = NULL; if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode); + iocb->private = ext4_init_io_end(inode, GFP_NOFS); if (!iocb->private) return -ENOMEM; /* @@ -3751,7 +3925,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, - ext4_get_block_dio_write, + ext4_get_block_write, ext4_end_io_dio); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; @@ -3772,8 +3946,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0 && (EXT4_I(inode)->i_state & - EXT4_STATE_DIO_UNWRITTEN)) { + } else if (ret > 0 && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3783,7 +3957,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; - EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } return ret; } @@ -4065,7 +4239,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth, int k, err; *top = 0; - /* Make k index the deepest non-null offest + 1 */ + /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext4_get_branch(inode, k, offsets, chain, &err); @@ -4114,14 +4288,26 @@ no_top: * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. */ -static void ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) { __le32 *p; - int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + ext4_error(inode->i_sb, "inode #%lu: " + "attempt to clear blocks %llu len %lu, invalid", + inode->i_ino, (unsigned long long) block_to_free, + count); + return 1; + } if (try_to_extend_transaction(handle, inode)) { if (bh) { @@ -4137,27 +4323,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } } - /* - * Any buffers which are on the journal will be in memory. We - * find them on the hash table so jbd2_journal_revoke() will - * run jbd2_journal_forget() on them. We've already detached - * each block from the file, so bforget() in - * jbd2_journal_forget() should be safe. - * - * AKPM: turn on bforget in jbd2_journal_forget()!!! - */ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *tbh; - - *p = 0; - tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, is_metadata, inode, tbh, nr); - } - } + for (p = first; p < last; p++) + *p = 0; - ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); + return 0; } /** @@ -4213,9 +4383,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - ext4_clear_blocks(handle, inode, this_bh, - block_to_free, - count, block_to_free_p, p); + if (ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p)) + break; block_to_free = nr; block_to_free_p = p; count = 1; @@ -4239,7 +4410,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "circular indirect block detected, " "inode=%lu, block=%llu", inode->i_ino, @@ -4279,6 +4450,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + ext4_error(inode->i_sb, + "indirect mapped block in inode " + "#%lu invalid (level %d, blk #%lu)", + inode->i_ino, depth, + (unsigned long) nr); + break; + } + /* Go read the buffer for the next level down */ bh = sb_bread(inode->i_sb, nr); @@ -4287,7 +4468,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - ext4_error(inode->i_sb, "ext4_free_branches", + ext4_error(inode->i_sb, "Read failure, inode=%lu, block=%llu", inode->i_ino, nr); continue; @@ -4345,7 +4526,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } - ext4_free_blocks(handle, inode, nr, 1, 1); + ext4_free_blocks(handle, inode, 0, nr, 1, + EXT4_FREE_BLOCKS_METADATA); if (parent_bh) { /* @@ -4430,8 +4612,10 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); @@ -4601,9 +4785,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - ext4_error(sb, "ext4_get_inode_loc", "unable to read " - "inode block - inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "unable to read inode block - " + "inode=%lu, block=%llu", inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4701,9 +4884,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(sb, __func__, - "unable to read inode block - inode=%lu, " - "block=%llu", inode->i_ino, block); + ext4_error(sb, "unable to read inode block - inode=%lu," + " block=%llu", inode->i_ino, block); brelse(bh); return -EIO; } @@ -4717,7 +4899,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext4_get_inode_loc(inode, iloc, - !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } void ext4_set_inode_flags(struct inode *inode) @@ -4785,6 +4967,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; @@ -4810,7 +4993,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state = 0; + ei->i_state_flags = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -4838,6 +5021,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; @@ -4849,6 +5035,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > @@ -4865,7 +5076,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ei->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -4885,8 +5096,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - ext4_error(sb, __func__, - "bad extended attribute block %llu in inode #%lu", + ext4_error(sb, "bad extended attribute block %llu inode #%lu", ei->i_file_acl, inode->i_ino); ret = -EIO; goto bad_inode; @@ -4932,8 +5142,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { ret = -EIO; - ext4_error(inode->i_sb, __func__, - "bogus i_mode (%o) for inode=%lu", + ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", inode->i_mode, inode->i_ino); goto bad_inode; } @@ -5005,7 +5214,7 @@ static int ext4_do_update_inode(handle_t *handle, /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT4_STATE_NEW) + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_get_inode_flags(ei); @@ -5069,7 +5278,7 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); sb->s_dirt = 1; ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, inode, + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } @@ -5098,11 +5307,12 @@ static int ext4_do_update_inode(handle_t *handle, } BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, inode, bh); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; - ei->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); @@ -5144,7 +5354,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext4_write_inode(struct inode *inode, int wait) +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; @@ -5158,26 +5368,25 @@ int ext4_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; err = ext4_force_commit(inode->i_sb); } else { struct ext4_iloc iloc; - err = ext4_get_inode_loc(inode, &iloc); + err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wait) + if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_error(inode->i_sb, __func__, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, + ext4_error(inode->i_sb, "IO error syncing inode, " + "inode=%lu, block=%llu", inode->i_ino, (unsigned long long)iloc.bh->b_blocknr); err = -EIO; } + brelse(iloc.bh); } return err; } @@ -5216,19 +5425,21 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext4_journal_stop(handle); return error; @@ -5255,7 +5466,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size || + (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5286,6 +5499,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } + /* ext4_truncate will clear the flag */ + if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) + ext4_truncate(inode); } rc = inode_setattr(inode, attr); @@ -5371,7 +5587,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * worse case, the indexs blocks spread over different block groups * * If datablocks are discontiguous, they are possible to spread over - * different block groups too. If they are contiugous, with flexbg, + * different block groups too. If they are contiuguous, with flexbg, * they could still across block group boundary. * * Also account for superblock, inode, quota and xattr blocks @@ -5447,7 +5663,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. + * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. @@ -5524,8 +5740,8 @@ static int ext4_expand_extra_isize(struct inode *inode, entry = IFIRST(header); /* No extended attributes present */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || - header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, new_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; @@ -5569,7 +5785,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block * with this same handle. If journal_extend fails, then it will @@ -5583,10 +5799,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete" " some EAs or run e2fsck.", inode->i_ino); @@ -5608,7 +5825,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_block() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing @@ -5650,7 +5867,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) err = ext4_handle_dirty_metadata(handle, - inode, + NULL, iloc.bh); brelse(iloc.bh); }