Merge branch 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6
[safe/jmp/linux-2.6] / fs / ext4 / balloc.c
index a711898..95b7594 100644 (file)
@@ -19,7 +19,7 @@
 #include <linux/buffer_head.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "group.h"
+#include "mballoc.h"
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -54,7 +54,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-                               ext4_group_t block_group)
+                                      ext4_group_t block_group,
+                                      struct ext4_group_desc *gdp)
 {
        ext4_fsblk_t tmp;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -62,10 +63,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
        int used_blocks = sbi->s_itb_per_group + 2;
 
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-               struct ext4_group_desc *gdp;
-               struct buffer_head *bh;
-
-               gdp = ext4_get_group_desc(sb, block_group, &bh);
                if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
                                        block_group))
                        used_blocks--;
@@ -90,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
        int bit, bit_max;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        unsigned free_blocks, group_blocks;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -99,11 +97,11 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                /* If checksum is bad mark all blocks used to prevent allocation
                 * essentially implementing a per-group read-only flag. */
                if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-                       ext4_error(sb, __func__,
-                                 "Checksum bad for group %u", block_group);
-                       gdp->bg_free_blocks_count = 0;
-                       gdp->bg_free_inodes_count = 0;
-                       gdp->bg_itable_unused = 0;
+                       ext4_error(sb, "Checksum bad for group %u",
+                                       block_group);
+                       ext4_free_blks_set(sb, gdp, 0);
+                       ext4_free_inodes_set(sb, gdp, 0);
+                       ext4_itable_unused_set(sb, gdp, 0);
                        memset(bh->b_data, 0xff, sb->s_blocksize);
                        return 0;
                }
@@ -125,15 +123,14 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                bit_max += ext4_bg_num_gdb(sb, block_group);
        }
 
-       if (block_group == sbi->s_groups_count - 1) {
+       if (block_group == ngroups - 1) {
                /*
                 * Even though mke2fs always initialize first and last group
                 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
                 * to make sure we calculate the right free blocks
                 */
                group_blocks = ext4_blocks_count(sbi->s_es) -
-                       le32_to_cpu(sbi->s_es->s_first_data_block) -
-                       (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+                       ext4_group_first_block_no(sb, ngroups - 1);
        } else {
                group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
        }
@@ -176,7 +173,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 */
                mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
        }
-       return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+       return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -191,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * when a file system is mounted (see ext4_fill_super).
  */
 
-
-#define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
-
 /**
  * ext4_get_group_desc() -- load group descriptor from disk
  * @sb:                        super block
@@ -205,27 +199,24 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
                                             ext4_group_t block_group,
                                             struct buffer_head **bh)
 {
-       unsigned long group_desc;
-       unsigned long offset;
+       unsigned int group_desc;
+       unsigned int offset;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (block_group >= sbi->s_groups_count) {
-               ext4_error(sb, "ext4_get_group_desc",
-                          "block_group >= groups_count - "
-                          "block_group = %u, groups_count = %u",
-                          block_group, sbi->s_groups_count);
+       if (block_group >= ngroups) {
+               ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+                          " groups_count = %u", block_group, ngroups);
 
                return NULL;
        }
-       smp_rmb();
 
        group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
        if (!sbi->s_group_desc[group_desc]) {
-               ext4_error(sb, "ext4_get_group_desc",
-                          "Group descriptor not loaded - "
-                          "block_group = %u, group_desc = %lu, desc = %lu",
+               ext4_error(sb, "Group descriptor not loaded - "
+                          "block_group = %u, group_desc = %u, desc = %u",
                           block_group, group_desc, offset);
                return NULL;
        }
@@ -284,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
                return 1;
 
 err_out:
-       ext4_error(sb, __func__,
-                       "Invalid block bitmap - "
-                       "block_group = %d, block = %llu",
+       ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
                        block_group, bitmap_blk);
        return 0;
 }
@@ -313,30 +302,49 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
        bitmap_blk = ext4_block_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
-               ext4_error(sb, __func__,
-                           "Cannot read block bitmap - "
+               ext4_error(sb, "Cannot read block bitmap - "
                            "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
-       if (buffer_uptodate(bh) &&
-           !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+
+       if (bitmap_uptodate(bh))
                return bh;
 
        lock_buffer(bh);
-       spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       if (bitmap_uptodate(bh)) {
+               unlock_buffer(bh);
+               return bh;
+       }
+       ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                ext4_init_block_bitmap(sb, bh, block_group, desc);
+               set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
+               ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
-               spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
                return bh;
        }
-       spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       ext4_unlock_group(sb, block_group);
+       if (buffer_uptodate(bh)) {
+               /*
+                * if the group is not uninit and bh is uptodate,
+                * the bitmap is also uptodate
+                */
+               set_bitmap_uptodate(bh);
+               unlock_buffer(bh);
+               return bh;
+       }
+       /*
+        * submit the buffer_head for read. We can
+        * safely mark the bitmap as uptodate now.
+        * We do it here so the bitmap uptodate bit
+        * gets set with the buffer lock held.
+        */
+       set_bitmap_uptodate(bh);
        if (bh_submit_read(bh) < 0) {
                put_bh(bh);
-               ext4_error(sb, __func__,
-                           "Cannot read block bitmap - "
+               ext4_error(sb, "Cannot read block bitmap - "
                            "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
@@ -350,62 +358,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
  * @handle:                    handle to this transaction
  * @sb:                                super block
- * @block:                     start physcial block to free
+ * @block:                     start physical block to add to the block group
  * @count:                     number of blocks to free
- * @pdquot_freed_blocks:       pointer to quota
  *
- * XXX This function is only used by the on-line resizing code, which
- * should probably be fixed up to call the mballoc variant.  There
- * this needs to be cleaned up later; in fact, I'm not convinced this
- * is 100% correct in the face of the mballoc code.  The online resizing
- * code needs to be fixed up to more tightly (and correctly) interlock
- * with the mballoc code.
+ * This marks the blocks as free in the bitmap. We ask the
+ * mballoc to reload the buddy after this by setting group
+ * EXT4_GROUP_INFO_NEED_INIT_BIT flag
  */
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-                        ext4_fsblk_t block, unsigned long count,
-                        unsigned long *pdquot_freed_blocks)
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+                        ext4_fsblk_t block, unsigned long count)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gd_bh;
        ext4_group_t block_group;
        ext4_grpblk_t bit;
-       unsigned long i;
-       unsigned long overflow;
+       unsigned int i;
        struct ext4_group_desc *desc;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
-       int err = 0, ret;
-       ext4_grpblk_t group_freed;
+       int err = 0, ret, blk_free_count;
+       ext4_grpblk_t blocks_freed;
+       struct ext4_group_info *grp;
 
-       *pdquot_freed_blocks = 0;
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
-       if (block < le32_to_cpu(es->s_first_data_block) ||
-           block + count < block ||
-           block + count > ext4_blocks_count(es)) {
-               ext4_error(sb, "ext4_free_blocks",
-                          "Freeing blocks not in datazone - "
-                          "block = %llu, count = %lu", block, count);
-               goto error_return;
-       }
+       ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
 
-       ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
-
-do_more:
-       overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+       grp = ext4_get_group_info(sb, block_group);
        /*
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-               overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
-               count -= overflow;
+               goto error_return;
        }
-       brelse(bitmap_bh);
        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;
@@ -418,18 +408,16 @@ do_more:
            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
            in_range(block + count - 1, ext4_inode_table(sb, desc),
                     sbi->s_itb_per_group)) {
-               ext4_error(sb, "ext4_free_blocks",
-                          "Freeing blocks in system zones - "
+               ext4_error(sb, "Adding blocks in system zones - "
                           "Block = %llu, count = %lu",
                           block, count);
                goto error_return;
        }
 
        /*
-        * We are about to start releasing blocks in the bitmap,
+        * We are about to add blocks to the bitmap,
         * so we need undo access.
         */
-       /* @@@ check errors */
        BUFFER_TRACE(bitmap_bh, "getting undo access");
        err = ext4_journal_get_undo_access(handle, bitmap_bh);
        if (err)
@@ -444,90 +432,41 @@ do_more:
        err = ext4_journal_get_write_access(handle, gd_bh);
        if (err)
                goto error_return;
-
-       jbd_lock_bh_state(bitmap_bh);
-
-       for (i = 0, group_freed = 0; i < count; i++) {
-               /*
-                * An HJ special.  This is expensive...
-                */
-#ifdef CONFIG_JBD2_DEBUG
-               jbd_unlock_bh_state(bitmap_bh);
-               {
-                       struct buffer_head *debug_bh;
-                       debug_bh = sb_find_get_block(sb, block + i);
-                       if (debug_bh) {
-                               BUFFER_TRACE(debug_bh, "Deleted!");
-                               if (!bh2jh(bitmap_bh)->b_committed_data)
-                                       BUFFER_TRACE(debug_bh,
-                                               "No commited data in bitmap");
-                               BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
-                               __brelse(debug_bh);
-                       }
-               }
-               jbd_lock_bh_state(bitmap_bh);
-#endif
-               if (need_resched()) {
-                       jbd_unlock_bh_state(bitmap_bh);
-                       cond_resched();
-                       jbd_lock_bh_state(bitmap_bh);
-               }
-               /* @@@ This prevents newly-allocated data from being
-                * freed and then reallocated within the same
-                * transaction.
-                *
-                * Ideally we would want to allow that to happen, but to
-                * do so requires making jbd2_journal_forget() capable of
-                * revoking the queued write of a data block, which
-                * implies blocking on the journal lock.  *forget()
-                * cannot block due to truncate races.
-                *
-                * Eventually we can fix this by making jbd2_journal_forget()
-                * return a status indicating whether or not it was able
-                * to revoke the buffer.  On successful revoke, it is
-                * safe not to set the allocation bit in the committed
-                * bitmap, because we know that there is no outstanding
-                * activity on the buffer any more and so it is safe to
-                * reallocate it.
-                */
-               BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
-               J_ASSERT_BH(bitmap_bh,
-                               bh2jh(bitmap_bh)->b_committed_data != NULL);
-               ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
-                               bh2jh(bitmap_bh)->b_committed_data);
-
-               /*
-                * We clear the bit in the bitmap after setting the committed
-                * data bit, because this is the reverse order to that which
-                * the allocator uses.
-                */
+       /*
+        * make sure we don't allow a parallel init on other groups in the
+        * same buddy cache
+        */
+       down_write(&grp->alloc_sem);
+       for (i = 0, blocks_freed = 0; i < count; i++) {
                BUFFER_TRACE(bitmap_bh, "clear bit");
-               if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+               if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
                                                bit + i, bitmap_bh->b_data)) {
-                       jbd_unlock_bh_state(bitmap_bh);
-                       ext4_error(sb, __func__,
-                                  "bit already cleared for block %llu",
+                       ext4_error(sb, "bit already cleared for block %llu",
                                   (ext4_fsblk_t)(block + i));
-                       jbd_lock_bh_state(bitmap_bh);
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
                } else {
-                       group_freed++;
+                       blocks_freed++;
                }
        }
-       jbd_unlock_bh_state(bitmap_bh);
-
-       spin_lock(sb_bgl_lock(sbi, block_group));
-       le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+       ext4_lock_group(sb, block_group);
+       blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+       ext4_free_blks_set(sb, desc, blk_free_count);
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-       spin_unlock(sb_bgl_lock(sbi, block_group));
-       percpu_counter_add(&sbi->s_freeblocks_counter, count);
+       ext4_unlock_group(sb, block_group);
+       percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               spin_lock(sb_bgl_lock(sbi, flex_group));
-               sbi->s_flex_groups[flex_group].free_blocks += count;
-               spin_unlock(sb_bgl_lock(sbi, flex_group));
+               atomic_add(blocks_freed,
+                          &sbi->s_flex_groups[flex_group].free_blocks);
        }
+       /*
+        * request to reload the buddy with the
+        * new bitmap information
+        */
+       set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+       grp->bb_free += blocks_freed;
+       up_write(&grp->alloc_sem);
 
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -536,15 +475,10 @@ do_more:
        /* And the group descriptor block */
        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-       if (!err) err = ret;
-       *pdquot_freed_blocks += group_freed;
-
-       if (overflow && !err) {
-               block += count;
-               count = overflow;
-               goto do_more;
-       }
+       if (!err)
+               err = ret;
        sb->s_dirt = 1;
+
 error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
@@ -552,44 +486,6 @@ error_return:
 }
 
 /**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle:            handle for this transaction
- * @inode:             inode
- * @block:             start physical block to free
- * @count:             number of blocks to count
- * @metadata:          Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t block, unsigned long count,
-                       int metadata)
-{
-       struct super_block *sb;
-       unsigned long dquot_freed_blocks;
-
-       /* this isn't the right place to decide whether block is metadata
-        * inode.c/extents.c knows better, but for safety ... */
-       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-               metadata = 1;
-
-       /* We need to make sure we don't reuse
-        * block released untill the transaction commit.
-        * writeback mode have weak data consistency so
-        * don't force data as metadata when freeing block
-        * for writeback mode.
-        */
-       if (metadata == 0 && !ext4_should_writeback_data(inode))
-               metadata = 1;
-
-       sb = inode->i_sb;
-
-       ext4_mb_free_blocks(handle, inode, block, count,
-                           metadata, &dquot_freed_blocks);
-       if (dquot_freed_blocks)
-               DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-       return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:       in-core super block structure.
  * @nblocks:   number of needed blocks
@@ -658,7 +554,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-       if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
+       if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+           (*retries)++ > 3 ||
+           !EXT4_SB(sb)->s_journal)
                return 0;
 
        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -693,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
        ret = ext4_mb_new_blocks(handle, &ar, errp);
        if (count)
                *count = ar.len;
-
        /*
-        * Account for the allocated meta blocks
+        * Account for the allocated meta blocks.  We will never
+        * fail EDQUOT for metadata, but we do account for it.
         */
        if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
                EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+               dquot_alloc_block_nofail(inode, ar.len);
        }
        return ret;
 }
@@ -716,11 +615,11 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
        ext4_group_t i;
-       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
-       unsigned long x;
+       unsigned int x;
        struct buffer_head *bitmap_bh = NULL;
 
        es = EXT4_SB(sb)->s_es;
@@ -728,20 +627,19 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        bitmap_count = 0;
        gdp = NULL;
 
-       smp_rmb();
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+               desc_count += ext4_free_blks_count(sb, gdp);
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (bitmap_bh == NULL)
                        continue;
 
                x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-               printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-                       i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+               printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
+                       i, ext4_free_blks_count(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
@@ -751,12 +649,11 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        return bitmap_count;
 #else
        desc_count = 0;
-       smp_rmb();
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+               desc_count += ext4_free_blks_count(sb, gdp);
        }
 
        return desc_count;
@@ -814,7 +711,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
                                        ext4_group_t group)
 {
-       return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+       if (!ext4_bg_has_super(sb, group))
+               return 0;
+
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+               return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+       else
+               return EXT4_SB(sb)->s_gdb_count;
 }
 
 /**