Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs into for-2.6.34-incoming

[safe/jmp/linux-2.6] / fs / ext4 / inode.c
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 1b1b7d9..e119524 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
                 goto cleanup;
  
         set_buffer_new(bh_result);
+
+       ext4_update_inode_fsync_trans(handle, inode, 1);
  got_it:
         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
         if (count > blocks_to_boundary)
@@ -1001,92 +1003,120 @@ out:
         return err;
  }
  
-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
  {
-       unsigned long long total;
-
-       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-       total = EXT4_I(inode)->i_reserved_data_blocks +
-               EXT4_I(inode)->i_reserved_meta_blocks;
-       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
-       return (total << inode->i_blkbits);
+       return &EXT4_I(inode)->i_reserved_quota;
  }
+#endif
+
  /*
   * Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblocks for non extent file based file
   */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+                                             sector_t lblock)
  {
-       int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-       int ind_blks, dind_blks, tind_blks;
-
-       /* number of new indirect blocks needed */
-       ind_blks = (blocks + icap - 1) / icap;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+       int blk_bits;
  
-       dind_blks = (ind_blks + icap - 1) / icap;
+       if (lblock < EXT4_NDIR_BLOCKS)
+               return 0;
  
-       tind_blks = 1;
+       lblock -= EXT4_NDIR_BLOCKS;
  
-       return ind_blks + dind_blks + tind_blks;
+       if (ei->i_da_metadata_calc_len &&
+           (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+               ei->i_da_metadata_calc_len++;
+               return 0;
+       }
+       ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+       ei->i_da_metadata_calc_len = 1;
+       blk_bits = roundup_pow_of_two(lblock + 1);
+       return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
  }
  
  /*
   * Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
   */
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
  {
-       if (!blocks)
-               return 0;
-
         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-               return ext4_ext_calc_metadata_amount(inode, blocks);
+               return ext4_ext_calc_metadata_amount(inode, lblock);
  
-       return ext4_indirect_calc_metadata_amount(inode, blocks);
+       return ext4_indirect_calc_metadata_amount(inode, lblock);
  }
  
-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+                                       int used, int quota_claim)
  {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       int total, mdb, mdb_free;
-
-       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-       /* recalculate the number of metablocks still need to be reserved */
-       total = EXT4_I(inode)->i_reserved_data_blocks - used;
-       mdb = ext4_calc_metadata_amount(inode, total);
-
-       /* figure out how many metablocks to release */
-       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-       mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-       if (mdb_free) {
-               /* Account for allocated meta_blocks */
-               mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
-               /* update fs dirty blocks counter */
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int mdb_free = 0, allocated_meta_blocks = 0;
+
+       spin_lock(&ei->i_block_reservation_lock);
+       if (unlikely(used > ei->i_reserved_data_blocks)) {
+               ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+                        "with only %d reserved data blocks\n",
+                        __func__, inode->i_ino, used,
+                        ei->i_reserved_data_blocks);
+               WARN_ON(1);
+               used = ei->i_reserved_data_blocks;
+       }
+
+       /* Update per-inode reservations */
+       ei->i_reserved_data_blocks -= used;
+       used += ei->i_allocated_meta_blocks;
+       ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+       allocated_meta_blocks = ei->i_allocated_meta_blocks;
+       ei->i_allocated_meta_blocks = 0;
+       percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+       if (ei->i_reserved_data_blocks == 0) {
+               /*
+                * We can release all of the reserved metadata blocks
+                * only when we have written all of the delayed
+                * allocation blocks.
+                */
+               mdb_free = ei->i_reserved_meta_blocks;
+               ei->i_reserved_meta_blocks = 0;
+               ei->i_da_metadata_calc_len = 0;
                 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
-               EXT4_I(inode)->i_allocated_meta_blocks = 0;
-               EXT4_I(inode)->i_reserved_meta_blocks = mdb;
         }
-
-       /* update per-inode reservations */
-       BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
-       EXT4_I(inode)->i_reserved_data_blocks -= used;
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
  
-       /*
-        * free those over-booking quota for metadata blocks
-        */
-       if (mdb_free)
-               vfs_dq_release_reservation_block(inode, mdb_free);
+       /* Update quota subsystem */
+       if (quota_claim) {
+               vfs_dq_claim_block(inode, used);
+               if (mdb_free)
+                       vfs_dq_release_reservation_block(inode, mdb_free);
+       } else {
+               /*
+                * We did fallocate with an offset that is already delayed
+                * allocated. So on delayed allocated writeback we should
+                * not update the quota for allocated blocks. But then
+                * converting an fallocate region to initialized region would
+                * have caused a metadata allocation. So claim quota for
+                * that
+                */
+               if (allocated_meta_blocks)
+                       vfs_dq_claim_block(inode, allocated_meta_blocks);
+               vfs_dq_release_reservation_block(inode, mdb_free + used);
+       }
  
         /*
          * If we have done all the pending block allocations and if
          * there aren't any writers on the inode, we can discard the
          * inode's preallocations.
          */
-       if (!total && (atomic_read(&inode->i_writecount) == 0))
+       if ((ei->i_reserved_data_blocks == 0) &&
+           (atomic_read(&inode->i_writecount) == 0))
                 ext4_discard_preallocations(inode);
  }
  
@@ -1278,18 +1308,20 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                          */
                         EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
                 }
-       }
  
+               /*
+                * Update reserved blocks/metadata blocks after successful
+                * block allocation which had been deferred till now. We don't
+                * support fallocate for non extent files. So we can update
+                * reserve space here.
+                */
+               if ((retval > 0) &&
+                       (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+                       ext4_da_update_reserve_space(inode, retval, 1);
+       }
         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
  
-       /*
-        * Update reserved blocks/metadata blocks after successful
-        * block allocation which had been deferred till now.
-        */
-       if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
-               ext4_da_update_reserve_space(inode, retval);
-
         up_write((&EXT4_I(inode)->i_data_sem));
         if (retval > 0 && buffer_mapped(bh)) {
                 int ret = check_block_validity(inode, "file system "
@@ -1795,11 +1827,15 @@ static int ext4_journalled_write_end(struct file *file,
         return ret ? ret : copied;
  }
  
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
  {
         int retries = 0;
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned long md_needed, mdblocks, total = 0;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long md_needed, md_reserved;
  
         /*
          * recalculate the amount of metadata blocks to reserve
@@ -1807,86 +1843,78 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
          * worse case is one extent per block
          */
  repeat:
-       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-       total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
-       mdblocks = ext4_calc_metadata_amount(inode, total);
-       BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
-       md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
-       total = md_needed + nrblocks;
+       spin_lock(&ei->i_block_reservation_lock);
+       md_reserved = ei->i_reserved_meta_blocks;
+       md_needed = ext4_calc_metadata_amount(inode, lblock);
+       spin_unlock(&ei->i_block_reservation_lock);
  
         /*
          * Make quota reservation here to prevent quota overflow
          * later. Real quota accounting is done at pages writeout
          * time.
          */
-       if (vfs_dq_reserve_block(inode, total)) {
-               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+       if (vfs_dq_reserve_block(inode, md_needed + 1))
                 return -EDQUOT;
-       }
  
-       if (ext4_claim_free_blocks(sbi, total)) {
-               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-               vfs_dq_release_reservation_block(inode, total);
+       if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+               vfs_dq_release_reservation_block(inode, md_needed + 1);
                 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                         yield();
                         goto repeat;
                 }
                 return -ENOSPC;
         }
-       EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
-       EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+       spin_lock(&ei->i_block_reservation_lock);
+       ei->i_reserved_data_blocks++;
+       ei->i_reserved_meta_blocks += md_needed;
+       spin_unlock(&ei->i_block_reservation_lock);
  
-       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
         return 0;       /* success */
  }
  
  static void ext4_da_release_space(struct inode *inode, int to_free)
  {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       int total, mdb, mdb_free, release;
+       struct ext4_inode_info *ei = EXT4_I(inode);
  
         if (!to_free)
                 return;         /* Nothing to release, exit */
  
         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
  
-       if (!EXT4_I(inode)->i_reserved_data_blocks) {
+       if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                 /*
-                * if there is no reserved blocks, but we try to free some
-                * then the counter is messed up somewhere.
-                * but since this function is called from invalidate
-                * page, it's harmless to return without any action
+                * if there aren't enough reserved blocks, then the
+                * counter is messed up somewhere.  Since this
+                * function is called from invalidate page, it's
+                * harmless to return without any action.
                  */
-               printk(KERN_INFO "ext4 delalloc try to release %d reserved "
-                           "blocks for inode %lu, but there is no reserved "
-                           "data blocks\n", to_free, inode->i_ino);
-               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-               return;
+               ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+                        "ino %lu, to_free %d with only %d reserved "
+                        "data blocks\n", inode->i_ino, to_free,
+                        ei->i_reserved_data_blocks);
+               WARN_ON(1);
+               to_free = ei->i_reserved_data_blocks;
         }
+       ei->i_reserved_data_blocks -= to_free;
  
-       /* recalculate the number of metablocks still need to be reserved */
-       total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
-       mdb = ext4_calc_metadata_amount(inode, total);
-
-       /* figure out how many metablocks to release */
-       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-       mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-       release = to_free + mdb_free;
-
-       /* update fs dirty blocks counter for truncate case */
-       percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+       if (ei->i_reserved_data_blocks == 0) {
+               /*
+                * We can release all of the reserved metadata blocks
+                * only when we have written all of the delayed
+                * allocation blocks.
+                */
+               to_free += ei->i_reserved_meta_blocks;
+               ei->i_reserved_meta_blocks = 0;
+               ei->i_da_metadata_calc_len = 0;
+       }
  
-       /* update per-inode reservations */
-       BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
-       EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+       /* update fs dirty blocks counter */
+       percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
  
-       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-       EXT4_I(inode)->i_reserved_meta_blocks = mdb;
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
  
-       vfs_dq_release_reservation_block(inode, release);
+       vfs_dq_release_reservation_block(inode, to_free);
  }
  
  static void ext4_da_page_release_reservation(struct page *page,
@@ -2191,10 +2219,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
          * variables are updated after the blocks have been allocated.
          */
         new.b_state = 0;
-       get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
-                           EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+       get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
         if (mpd->b_state & (1 << BH_Delay))
-               get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
         blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
                                &new, get_blocks_flags);
         if (blks < 0) {
@@ -2492,7 +2520,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                  * XXX: __block_prepare_write() unmaps passed block,
                  * is it OK?
                  */
-               ret = ext4_da_reserve_space(inode, 1);
+               ret = ext4_da_reserve_space(inode, iblock);
                 if (ret)
                         /* not enough space to reserve */
                         return ret;
@@ -2900,7 +2928,7 @@ retry:
                 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
                                         &mpd);
                 /*
-                * If we have a contigous extent of pages and we
+                * If we have a contiguous extent of pages and we
                  * haven't done the I/O yet, map the blocks and submit
                  * them for I/O.
                  */
@@ -2966,8 +2994,7 @@ retry:
  out_writepages:
         if (!no_nrwrite_index_update)
                 wbc->no_nrwrite_index_update = 0;
-       if (wbc->nr_to_write > nr_to_writebump)
-               wbc->nr_to_write -= nr_to_writebump;
+       wbc->nr_to_write -= nr_to_writebump;
         wbc->range_start = range_start;
         trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
         return ret;
@@ -2992,11 +3019,18 @@ static int ext4_nonda_switch(struct super_block *sb)
         if (2 * free_blocks < 3 * dirty_blocks ||
                 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
                 /*
-                * free block count is less that 150% of dirty blocks
-                * or free blocks is less that watermark
+                * free block count is less than 150% of dirty blocks
+                * or free blocks is less than watermark
                  */
                 return 1;
         }
+       /*
+        * Even if we don't switch but are nearing capacity,
+        * start pushing delalloc when 1/2 of free blocks are dirty.
+        */
+       if (free_blocks < 2 * dirty_blocks)
+               writeback_inodes_sb_if_idle(sb);
+
         return 0;
  }
  
@@ -3004,7 +3038,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
  {
-       int ret, retries = 0;
+       int ret, retries = 0, quota_retries = 0;
         struct page *page;
         pgoff_t index;
         unsigned from, to;
@@ -3063,6 +3097,22 @@ retry:
  
         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                 goto retry;
+
+       if ((ret == -EDQUOT) &&
+           EXT4_I(inode)->i_reserved_meta_blocks &&
+           (quota_retries++ < 3)) {
+               /*
+                * Since we often over-estimate the number of meta
+                * data blocks required, we may sometimes get a
+                * spurios out of quota error even though there would
+                * be enough space once we write the data blocks and
+                * find out how many meta data blocks were _really_
+                * required.  So try forcing the inode write to see if
+                * that helps.
+                */
+               write_inode_now(inode, (quota_retries == 3));
+               goto retry;
+       }
  out:
         return ret;
  }
@@ -4031,7 +4081,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
         int k, err;
  
         *top = 0;
-       /* Make k index the deepest non-null offest + 1 */
+       /* Make k index the deepest non-null offset + 1 */
         for (k = depth; k > 1 && !offsets[k-1]; k--)
                 ;
         partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4738,6 +4788,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         struct ext4_inode *raw_inode;
         struct ext4_inode_info *ei;
         struct inode *inode;
+       journal_t *journal = EXT4_SB(sb)->s_journal;
         long ret;
         int block;
  
@@ -4791,6 +4842,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
         inode->i_size = ext4_isize(raw_inode);
         ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+       ei->i_reserved_quota = 0;
+#endif
         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
         ei->i_block_group = iloc.block_group;
         ei->i_last_alloc_group = ~0;
@@ -4802,6 +4856,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                 ei->i_data[block] = raw_inode->i_block[block];
         INIT_LIST_HEAD(&ei->i_orphan);
  
+       /*
+        * Set transaction id's of transactions that have to be committed
+        * to finish f[data]sync. We set them to currently running transaction
+        * as we cannot be sure that the inode or some of its metadata isn't
+        * part of the transaction - the inode could have been reclaimed and
+        * now it is reread from disk.
+        */
+       if (journal) {
+               transaction_t *transaction;
+               tid_t tid;
+
+               spin_lock(&journal->j_state_lock);
+               if (journal->j_running_transaction)
+                       transaction = journal->j_running_transaction;
+               else
+                       transaction = journal->j_committing_transaction;
+               if (transaction)
+                       tid = transaction->t_tid;
+               else
+                       tid = journal->j_commit_sequence;
+               spin_unlock(&journal->j_state_lock);
+               ei->i_sync_tid = tid;
+               ei->i_datasync_tid = tid;
+       }
+
         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5056,6 +5135,7 @@ static int ext4_do_update_inode(handle_t *handle,
                 err = rc;
         ei->i_state &= ~EXT4_STATE_NEW;
  
+       ext4_update_inode_fsync_trans(handle, inode, 0);
  out_brelse:
         brelse(bh);
         ext4_std_error(inode->i_sb, err);
@@ -5176,7 +5256,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                 /* (user+group)*(old+new) structure, inode write (sb,
                  * inode block, ? - but truncate inode update has it) */
                 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-                                       EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+                                       EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
                 if (IS_ERR(handle)) {
                         error = PTR_ERR(handle);
                         goto err_out;
@@ -5324,7 +5404,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
   * worse case, the indexs blocks spread over different block groups
   *
   * If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiuguous, with flexbg,
   * they could still across block group boundary.
   *
   * Also account for superblock, inode, quota and xattr blocks
@@ -5400,7 +5480,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
   * Calculate the journal credits for a chunk of data modification.
   *
   * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
   *
   * journal buffers for data blocks are not included here, as DIO
   * and fallocate do no need to journal data buffers.