Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs into for-2.6.34-incoming
[safe/jmp/linux-2.6] / fs / ext4 / inode.c
index a31980d..e119524 100644 (file)
@@ -1009,81 +1009,114 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
        return &EXT4_I(inode)->i_reserved_quota;
 }
 #endif
+
 /*
  * Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblocks for non extent file based file
  */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+                                             sector_t lblock)
 {
-       int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-       int ind_blks, dind_blks, tind_blks;
-
-       /* number of new indirect blocks needed */
-       ind_blks = (blocks + icap - 1) / icap;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+       int blk_bits;
 
-       dind_blks = (ind_blks + icap - 1) / icap;
+       if (lblock < EXT4_NDIR_BLOCKS)
+               return 0;
 
-       tind_blks = 1;
+       lblock -= EXT4_NDIR_BLOCKS;
 
-       return ind_blks + dind_blks + tind_blks;
+       if (ei->i_da_metadata_calc_len &&
+           (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+               ei->i_da_metadata_calc_len++;
+               return 0;
+       }
+       ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+       ei->i_da_metadata_calc_len = 1;
+       blk_bits = roundup_pow_of_two(lblock + 1);
+       return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
 /*
  * Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
  */
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-       if (!blocks)
-               return 0;
-
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-               return ext4_ext_calc_metadata_amount(inode, blocks);
+               return ext4_ext_calc_metadata_amount(inode, lblock);
 
-       return ext4_indirect_calc_metadata_amount(inode, blocks);
+       return ext4_indirect_calc_metadata_amount(inode, lblock);
 }
 
-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+                                       int used, int quota_claim)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       int total, mdb, mdb_free;
-
-       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-       /* recalculate the number of metablocks still need to be reserved */
-       total = EXT4_I(inode)->i_reserved_data_blocks - used;
-       mdb = ext4_calc_metadata_amount(inode, total);
-
-       /* figure out how many metablocks to release */
-       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-       mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-       if (mdb_free) {
-               /* Account for allocated meta_blocks */
-               mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
-               /* update fs dirty blocks counter */
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int mdb_free = 0, allocated_meta_blocks = 0;
+
+       spin_lock(&ei->i_block_reservation_lock);
+       if (unlikely(used > ei->i_reserved_data_blocks)) {
+               ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+                        "with only %d reserved data blocks\n",
+                        __func__, inode->i_ino, used,
+                        ei->i_reserved_data_blocks);
+               WARN_ON(1);
+               used = ei->i_reserved_data_blocks;
+       }
+
+       /* Update per-inode reservations */
+       ei->i_reserved_data_blocks -= used;
+       used += ei->i_allocated_meta_blocks;
+       ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+       allocated_meta_blocks = ei->i_allocated_meta_blocks;
+       ei->i_allocated_meta_blocks = 0;
+       percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+       if (ei->i_reserved_data_blocks == 0) {
+               /*
+                * We can release all of the reserved metadata blocks
+                * only when we have written all of the delayed
+                * allocation blocks.
+                */
+               mdb_free = ei->i_reserved_meta_blocks;
+               ei->i_reserved_meta_blocks = 0;
+               ei->i_da_metadata_calc_len = 0;
                percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
-               EXT4_I(inode)->i_allocated_meta_blocks = 0;
-               EXT4_I(inode)->i_reserved_meta_blocks = mdb;
        }
-
-       /* update per-inode reservations */
-       BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
-       EXT4_I(inode)->i_reserved_data_blocks -= used;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-       /*
-        * free those over-booking quota for metadata blocks
-        */
-       if (mdb_free)
-               vfs_dq_release_reservation_block(inode, mdb_free);
+       /* Update quota subsystem */
+       if (quota_claim) {
+               vfs_dq_claim_block(inode, used);
+               if (mdb_free)
+                       vfs_dq_release_reservation_block(inode, mdb_free);
+       } else {
+               /*
+                * We did fallocate with an offset that is already delayed
+                * allocated. So on delayed allocated writeback we should
+                * not update the quota for allocated blocks. But then
+                * converting an fallocate region to initialized region would
+                * have caused a metadata allocation. So claim quota for
+                * that
+                */
+               if (allocated_meta_blocks)
+                       vfs_dq_claim_block(inode, allocated_meta_blocks);
+               vfs_dq_release_reservation_block(inode, mdb_free + used);
+       }
 
        /*
         * If we have done all the pending block allocations and if
         * there aren't any writers on the inode, we can discard the
         * inode's preallocations.
         */
-       if (!total && (atomic_read(&inode->i_writecount) == 0))
+       if ((ei->i_reserved_data_blocks == 0) &&
+           (atomic_read(&inode->i_writecount) == 0))
                ext4_discard_preallocations(inode);
 }
 
@@ -1275,18 +1308,20 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                         */
                        EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
                }
-       }
 
+               /*
+                * Update reserved blocks/metadata blocks after successful
+                * block allocation which had been deferred till now. We don't
+                * support fallocate for non extent files. So we can update
+                * reserve space here.
+                */
+               if ((retval > 0) &&
+                       (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+                       ext4_da_update_reserve_space(inode, retval, 1);
+       }
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
 
-       /*
-        * Update reserved blocks/metadata blocks after successful
-        * block allocation which had been deferred till now.
-        */
-       if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
-               ext4_da_update_reserve_space(inode, retval);
-
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && buffer_mapped(bh)) {
                int ret = check_block_validity(inode, "file system "
@@ -1792,11 +1827,15 @@ static int ext4_journalled_write_end(struct file *file,
        return ret ? ret : copied;
 }
 
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 {
        int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned long md_needed, mdblocks, total = 0;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long md_needed, md_reserved;
 
        /*
         * recalculate the amount of metadata blocks to reserve
@@ -1804,86 +1843,78 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
         * worse case is one extent per block
         */
 repeat:
-       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-       total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
-       mdblocks = ext4_calc_metadata_amount(inode, total);
-       BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
-       md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
-       total = md_needed + nrblocks;
+       spin_lock(&ei->i_block_reservation_lock);
+       md_reserved = ei->i_reserved_meta_blocks;
+       md_needed = ext4_calc_metadata_amount(inode, lblock);
+       spin_unlock(&ei->i_block_reservation_lock);
 
        /*
         * Make quota reservation here to prevent quota overflow
         * later. Real quota accounting is done at pages writeout
         * time.
         */
-       if (vfs_dq_reserve_block(inode, total)) {
-               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+       if (vfs_dq_reserve_block(inode, md_needed + 1))
                return -EDQUOT;
-       }
 
-       if (ext4_claim_free_blocks(sbi, total)) {
-               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-               vfs_dq_release_reservation_block(inode, total);
+       if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+               vfs_dq_release_reservation_block(inode, md_needed + 1);
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                        yield();
                        goto repeat;
                }
                return -ENOSPC;
        }
-       EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
-       EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+       spin_lock(&ei->i_block_reservation_lock);
+       ei->i_reserved_data_blocks++;
+       ei->i_reserved_meta_blocks += md_needed;
+       spin_unlock(&ei->i_block_reservation_lock);
 
-       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        return 0;       /* success */
 }
 
 static void ext4_da_release_space(struct inode *inode, int to_free)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       int total, mdb, mdb_free, release;
+       struct ext4_inode_info *ei = EXT4_I(inode);
 
        if (!to_free)
                return;         /* Nothing to release, exit */
 
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 
-       if (!EXT4_I(inode)->i_reserved_data_blocks) {
+       if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
-                * if there is no reserved blocks, but we try to free some
-                * then the counter is messed up somewhere.
-                * but since this function is called from invalidate
-                * page, it's harmless to return without any action
+                * if there aren't enough reserved blocks, then the
+                * counter is messed up somewhere.  Since this
+                * function is called from invalidate page, it's
+                * harmless to return without any action.
                 */
-               printk(KERN_INFO "ext4 delalloc try to release %d reserved "
-                           "blocks for inode %lu, but there is no reserved "
-                           "data blocks\n", to_free, inode->i_ino);
-               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-               return;
+               ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+                        "ino %lu, to_free %d with only %d reserved "
+                        "data blocks\n", inode->i_ino, to_free,
+                        ei->i_reserved_data_blocks);
+               WARN_ON(1);
+               to_free = ei->i_reserved_data_blocks;
        }
+       ei->i_reserved_data_blocks -= to_free;
 
-       /* recalculate the number of metablocks still need to be reserved */
-       total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
-       mdb = ext4_calc_metadata_amount(inode, total);
-
-       /* figure out how many metablocks to release */
-       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-       mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-       release = to_free + mdb_free;
-
-       /* update fs dirty blocks counter for truncate case */
-       percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+       if (ei->i_reserved_data_blocks == 0) {
+               /*
+                * We can release all of the reserved metadata blocks
+                * only when we have written all of the delayed
+                * allocation blocks.
+                */
+               to_free += ei->i_reserved_meta_blocks;
+               ei->i_reserved_meta_blocks = 0;
+               ei->i_da_metadata_calc_len = 0;
+       }
 
-       /* update per-inode reservations */
-       BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
-       EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+       /* update fs dirty blocks counter */
+       percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
 
-       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-       EXT4_I(inode)->i_reserved_meta_blocks = mdb;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-       vfs_dq_release_reservation_block(inode, release);
+       vfs_dq_release_reservation_block(inode, to_free);
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -2188,10 +2219,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         * variables are updated after the blocks have been allocated.
         */
        new.b_state = 0;
-       get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
-                           EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+       get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
        if (mpd->b_state & (1 << BH_Delay))
-               get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
        blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
                               &new, get_blocks_flags);
        if (blks < 0) {
@@ -2489,7 +2520,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
                 */
-               ret = ext4_da_reserve_space(inode, 1);
+               ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
                        /* not enough space to reserve */
                        return ret;
@@ -2963,8 +2994,7 @@ retry:
 out_writepages:
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
-       if (wbc->nr_to_write > nr_to_writebump)
-               wbc->nr_to_write -= nr_to_writebump;
+       wbc->nr_to_write -= nr_to_writebump;
        wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
        return ret;
@@ -2989,11 +3019,18 @@ static int ext4_nonda_switch(struct super_block *sb)
        if (2 * free_blocks < 3 * dirty_blocks ||
                free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
                /*
-                * free block count is less that 150% of dirty blocks
-                * or free blocks is less that watermark
+                * free block count is less than 150% of dirty blocks
+                * or free blocks is less than watermark
                 */
                return 1;
        }
+       /*
+        * Even if we don't switch but are nearing capacity,
+        * start pushing delalloc when 1/2 of free blocks are dirty.
+        */
+       if (free_blocks < 2 * dirty_blocks)
+               writeback_inodes_sb_if_idle(sb);
+
        return 0;
 }
 
@@ -3001,7 +3038,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
 {
-       int ret, retries = 0;
+       int ret, retries = 0, quota_retries = 0;
        struct page *page;
        pgoff_t index;
        unsigned from, to;
@@ -3060,6 +3097,22 @@ retry:
 
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
+
+       if ((ret == -EDQUOT) &&
+           EXT4_I(inode)->i_reserved_meta_blocks &&
+           (quota_retries++ < 3)) {
+               /*
+                * Since we often over-estimate the number of meta
+                * data blocks required, we may sometimes get a
+                * spurios out of quota error even though there would
+                * be enough space once we write the data blocks and
+                * find out how many meta data blocks were _really_
+                * required.  So try forcing the inode write to see if
+                * that helps.
+                */
+               write_inode_now(inode, (quota_retries == 3));
+               goto retry;
+       }
 out:
        return ret;
 }