Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 9 Jan 2009 01:14:59 +0000 (17:14 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 9 Jan 2009 01:14:59 +0000 (17:14 -0800)
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (57 commits)
  jbd2: Fix oops in jbd2_journal_init_inode() on corrupted fs
  ext4: Remove "extents" mount option
  block: Add Kconfig help which notes that ext4 needs CONFIG_LBD
  ext4: Make printk's consistently prefixed with "EXT4-fs: "
  ext4: Add sanity checks for the superblock before mounting the filesystem
  ext4: Add mount option to set kjournald's I/O priority
  jbd2: Submit writes to the journal using WRITE_SYNC
  jbd2: Add pid and journal device name to the "kjournald2 starting" message
  ext4: Add markers for better debuggability
  ext4: Remove code to create the journal inode
  ext4: provide function to release metadata pages under memory pressure
  ext3: provide function to release metadata pages under memory pressure
  add releasepage hooks to block devices which can be used by file systems
  ext4: Fix s_dirty_blocks_counter if block allocation failed with nodelalloc
  ext4: Init the complete page while building buddy cache
  ext4: Don't allow new groups to be added during block allocation
  ext4: mark the blocks/inode bitmap beyond end of group as used
  ext4: Use new buffer_head flag to check uninit group bitmaps initialization
  ext4: Fix the race between read_inode_bitmap() and ext4_new_inode()
  ext4: code cleanup
  ...

39 files changed:
Documentation/filesystems/ext4.txt
block/Kconfig
fs/block_dev.c
fs/ext3/hash.c
fs/ext3/namei.c
fs/ext3/super.c
fs/ext4/balloc.c
fs/ext4/bitmap.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_i.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/ext4_sb.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/hash.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/migrate.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/ioprio.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/super.c
include/linux/ext3_fs.h
include/linux/ext3_fs_sb.h
include/linux/fs.h
include/linux/ioprio.h
include/linux/jbd2.h

index 174eaff..cec829b 100644 (file)
@@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be
 
        # mount -t ext4 /dev/hda1 /wherever
 
-  - When comparing performance with other filesystems, remember that
-    ext3/4 by default offers higher data integrity guarantees than most.
-    So when comparing with a metadata-only journalling filesystem, such
-    as ext3, use `mount -o data=writeback'.  And you might as well use
-    `mount -o nobh' too along with it.  Making the journal larger than
-    the mke2fs default often helps performance with metadata-intensive
-    workloads.
+  - When comparing performance with other filesystems, it's always
+    important to try multiple workloads; very often a subtle change in a
+    workload parameter can completely change the ranking of which
+    filesystems do well compared to others.  When comparing versus ext3,
+    note that ext4 enables write barriers by default, while ext3 does
+    not enable write barriers by default.  So it is useful to use
+    explicitly specify whether barriers are enabled or not when via the
+    '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems
+    for a fair comparison.  When tuning ext3 for best benchmark numbers,
+    it is often worthwhile to try changing the data journaling mode; '-o
+    data=writeback,nobh' can be faster for some workloads.  (Note
+    however that running mounted with data=writeback can potentially
+    leave stale data exposed in recently written files in case of an
+    unclean shutdown, which could be a security exposure in some
+    situations.)  Configuring the filesystem with a large journal can
+    also be helpful for metadata-intensive workloads.
 
 2. Features
 ===========
@@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be
 * ability to use filesystems > 16TB (e2fsprogs support not available yet)
 * extent format reduces metadata overhead (RAM, IO for access, transactions)
 * extent format more robust in face of on-disk corruption due to magics,
-* internal redunancy in tree
+* internal redundancy in tree
 * improved file allocation (multi-block alloc)
 * fix 32000 subdirectory limit
 * nsec timestamps for mtime, atime, ctime, create time
@@ -116,10 +125,11 @@ grouping of bitmaps and inode tables.  Some test results available here:
 When mounting an ext4 filesystem, the following option are accepted:
 (*) == default
 
-extents                (*)     ext4 will use extents to address file data.  The
-                       file system will no longer be mountable by ext3.
-
-noextents              ext4 will not use extents for newly created files
+ro                     Mount filesystem read only. Note that ext4 will
+                       replay the journal (and thus write to the
+                       partition) even when mounted "read only". The
+                       mount options "ro,noload" can be used to prevent
+                       writes to the filesystem.
 
 journal_checksum       Enable checksumming of the journal transactions.
                        This will allow the recovery code in e2fsck and the
@@ -134,17 +144,17 @@ journal_async_commit      Commit block can be written to disk without waiting
 journal=update         Update the ext4 file system's journal to the current
                        format.
 
-journal=inum           When a journal already exists, this option is ignored.
-                       Otherwise, it specifies the number of the inode which
-                       will represent the ext4 file system's journal file.
-
 journal_dev=devnum     When the external journal device's major/minor numbers
                        have changed, this option allows the user to specify
                        the new journal location.  The journal device is
                        identified through its new major/minor numbers encoded
                        in devnum.
 
-noload                 Don't load the journal on mounting.
+noload                 Don't load the journal on mounting.  Note that
+                       if the filesystem was not unmounted cleanly,
+                       skipping the journal replay will lead to the
+                       filesystem containing inconsistencies that can
+                       lead to any number of problems.
 
 data=journal           All data are committed into the journal prior to being
                        written into the main file system.
@@ -219,9 +229,12 @@ minixdf                    Make 'df' act like Minix.
 
 debug                  Extra debugging information is sent to syslog.
 
-errors=remount-ro(*)   Remount the filesystem read-only on an error.
+errors=remount-ro      Remount the filesystem read-only on an error.
 errors=continue                Keep going on a filesystem error.
 errors=panic           Panic and halt the machine if an error occurs.
+                        (These mount options override the errors behavior
+                        specified in the superblock, which can be configured
+                        using tune2fs)
 
 data_err=ignore(*)     Just print an error message if an error occurs
                        in a file data buffer in ordered mode.
@@ -261,6 +274,42 @@ delalloc   (*)     Deferring block allocation until write-out time.
 nodelalloc             Disable delayed allocation. Blocks are allocation
                        when data is copied from user to page cache.
 
+max_batch_time=usec    Maximum amount of time ext4 should wait for
+                       additional filesystem operations to be batch
+                       together with a synchronous write operation.
+                       Since a synchronous write operation is going to
+                       force a commit and then a wait for the I/O
+                       complete, it doesn't cost much, and can be a
+                       huge throughput win, we wait for a small amount
+                       of time to see if any other transactions can
+                       piggyback on the synchronous write.   The
+                       algorithm used is designed to automatically tune
+                       for the speed of the disk, by measuring the
+                       amount of time (on average) that it takes to
+                       finish committing a transaction.  Call this time
+                       the "commit time".  If the time that the
+                       transactoin has been running is less than the
+                       commit time, ext4 will try sleeping for the
+                       commit time to see if other operations will join
+                       the transaction.   The commit time is capped by
+                       the max_batch_time, which defaults to 15000us
+                       (15ms).   This optimization can be turned off
+                       entirely by setting max_batch_time to 0.
+
+min_batch_time=usec    This parameter sets the commit time (as
+                       described above) to be at least min_batch_time.
+                       It defaults to zero microseconds.  Increasing
+                       this parameter may improve the throughput of
+                       multi-threaded, synchronous workloads on very
+                       fast disks, at the cost of increasing latency.
+
+journal_ioprio=prio    The I/O priority (from 0 to 7, where 0 is the
+                       highest priorty) which should be used for I/O
+                       operations submitted by kjournald2 during a
+                       commit operation.  This defaults to 3, which is
+                       a slightly higher priority than the default I/O
+                       priority.
+
 Data Mode
 =========
 There are 3 different data modes:
index ac0956f..0cbb3b8 100644 (file)
@@ -36,6 +36,12 @@ config LBD
          This option also enables support for single files larger than
          2TB.
 
+         The ext4 filesystem requires that this feature be enabled in
+         order to support filesystems that have the huge_file feature
+         enabled.    Otherwise, it will refuse to mount any filesystems
+         that use the huge_file feature, which is enabled by default
+         by mke2fs.ext4.   The GFS2 filesystem also requires this feature.
+
          If unsure, say N.
 
 config BLK_DEV_IO_TRACE
index 8ebbfdf..ac7031f 100644 (file)
@@ -1234,6 +1234,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+/*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+ */
+static int blkdev_releasepage(struct page *page, gfp_t wait)
+{
+       struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
+
+       if (super && super->s_op->bdev_try_to_free_page)
+               return super->s_op->bdev_try_to_free_page(super, page, wait);
+
+       return try_to_free_buffers(page);
+}
+
 static const struct address_space_operations def_blk_aops = {
        .readpage       = blkdev_readpage,
        .writepage      = blkdev_writepage,
@@ -1241,6 +1255,7 @@ static const struct address_space_operations def_blk_aops = {
        .write_begin    = blkdev_write_begin,
        .write_end      = blkdev_write_end,
        .writepages     = generic_writepages,
+       .releasepage    = blkdev_releasepage,
        .direct_IO      = blkdev_direct_IO,
 };
 
index c30e149..7d215b4 100644 (file)
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-       __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+       __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+       const unsigned char *ucp = (const unsigned char *) name;
+
+       while (len--) {
+               hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+               if (hash & 0x80000000)
+                       hash -= 0x7fffffff;
+               hash1 = hash0;
+               hash0 = hash;
+       }
+       return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+       __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+       const signed char *scp = (const signed char *) name;
+
        while (len--) {
-               __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+               hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
 
-               if (hash & 0x80000000) hash -= 0x7fffffff;
+               if (hash & 0x80000000)
+                       hash -= 0x7fffffff;
                hash1 = hash0;
                hash0 = hash;
        }
-       return (hash0 << 1);
+       return hash0 << 1;
 }
 
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 {
        __u32   pad, val;
        int     i;
+       const signed char *scp = (const signed char *) msg;
+
+       pad = (__u32)len | ((__u32)len << 8);
+       pad |= pad << 16;
+
+       val = pad;
+       if (len > num*4)
+               len = num * 4;
+       for (i = 0; i < len; i++) {
+               if ((i % 4) == 0)
+                       val = pad;
+               val = ((int) scp[i]) + (val << 8);
+               if ((i % 4) == 3) {
+                       *buf++ = val;
+                       val = pad;
+                       num--;
+               }
+       }
+       if (--num >= 0)
+               *buf++ = val;
+       while (--num >= 0)
+               *buf++ = pad;
+}
+
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+       __u32   pad, val;
+       int     i;
+       const unsigned char *ucp = (const unsigned char *) msg;
 
        pad = (__u32)len | ((__u32)len << 8);
        pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
        for (i=0; i < len; i++) {
                if ((i % 4) == 0)
                        val = pad;
-               val = msg[i] + (val << 8);
+               val = ((int) ucp[i]) + (val << 8);
                if ((i % 4) == 3) {
                        *buf++ = val;
                        val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        const char      *p;
        int             i;
        __u32           in[8], buf[4];
+       void            (*str2hashbuf)(const char *, int, __u32 *, int) =
+                               str2hashbuf_signed;
 
        /* Initialize the default seed for the hash checksum functions */
        buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        }
 
        switch (hinfo->hash_version) {
+       case DX_HASH_LEGACY_UNSIGNED:
+               hash = dx_hack_hash_unsigned(name, len);
+               break;
        case DX_HASH_LEGACY:
-               hash = dx_hack_hash(name, len);
+               hash = dx_hack_hash_signed(name, len);
                break;
+       case DX_HASH_HALF_MD4_UNSIGNED:
+               str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_HALF_MD4:
                p = name;
                while (len > 0) {
-                       str2hashbuf(p, len, in, 8);
+                       (*str2hashbuf)(p, len, in, 8);
                        half_md4_transform(buf, in);
                        len -= 32;
                        p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
                minor_hash = buf[2];
                hash = buf[1];
                break;
+       case DX_HASH_TEA_UNSIGNED:
+               str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_TEA:
                p = name;
                while (len > 0) {
-                       str2hashbuf(p, len, in, 4);
+                       (*str2hashbuf)(p, len, in, 4);
                        TEA_transform(buf, in);
                        len -= 16;
                        p += 16;
index 8d6f965..69a3d19 100644 (file)
@@ -364,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
                goto fail;
        }
        hinfo->hash_version = root->info.hash_version;
+       if (hinfo->hash_version <= DX_HASH_TEA)
+               hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
        hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
        if (entry)
                ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -632,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        dir = dir_file->f_path.dentry->d_inode;
        if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
                hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+               if (hinfo.hash_version <= DX_HASH_TEA)
+                       hinfo.hash_version +=
+                               EXT3_SB(dir->i_sb)->s_hash_unsigned;
                hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
                count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
                                               start_hash, start_minor_hash);
@@ -1152,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        u32 hash2;
        struct dx_map_entry *map;
        char *data1 = (*bh)->b_data, *data2;
-       unsigned split, move, size, i;
+       unsigned split, move, size;
        struct ext3_dir_entry_2 *de = NULL, *de2;
-       int     err = 0;
+       int     err = 0, i;
 
        bh2 = ext3_append (handle, dir, &newblock, &err);
        if (!(bh2)) {
@@ -1394,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
        /* Initialize as for dx_probe */
        hinfo.hash_version = root->info.hash_version;
+       if (hinfo.hash_version <= DX_HASH_TEA)
+               hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
        hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
        ext3fs_dirhash(name, namelen, &hinfo);
        frame = frames;
index 01c235b..5d047a0 100644 (file)
@@ -683,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
                                    ext3_nfs_get_inode);
 }
 
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+                                gfp_t wait)
+{
+       journal_t *journal = EXT3_SB(sb)->s_journal;
+
+       WARN_ON(PageChecked(page));
+       if (!page_has_buffers(page))
+               return 0;
+       if (journal)
+               return journal_try_to_free_buffers(journal, page, 
+                                                  wait & ~__GFP_WAIT);
+       return try_to_free_buffers(page);
+}
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -749,6 +769,7 @@ static const struct super_operations ext3_sops = {
        .quota_read     = ext3_quota_read,
        .quota_write    = ext3_quota_write,
 #endif
+       .bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
 static const struct export_operations ext3_export_ops = {
@@ -1750,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        for (i=0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
+       i = le32_to_cpu(es->s_flags);
+       if (i & EXT2_FLAGS_UNSIGNED_HASH)
+               sbi->s_hash_unsigned = 3;
+       else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+               es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+               sbi->s_hash_unsigned = 3;
+#else
+               es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+               sb->s_dirt = 1;
+       }
 
        if (sbi->s_blocks_per_group > blocksize * 8) {
                printk (KERN_ERR
index 38b3acf..6bba06b 100644 (file)
@@ -20,6 +20,7 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "group.h"
+#include "mballoc.h"
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 * essentially implementing a per-group read-only flag. */
                if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
                        ext4_error(sb, __func__,
-                                 "Checksum bad for group %lu\n", block_group);
-                       gdp->bg_free_blocks_count = 0;
-                       gdp->bg_free_inodes_count = 0;
-                       gdp->bg_itable_unused = 0;
+                                 "Checksum bad for group %u", block_group);
+                       ext4_free_blks_set(sb, gdp, 0);
+                       ext4_free_inodes_set(sb, gdp, 0);
+                       ext4_itable_unused_set(sb, gdp, 0);
                        memset(bh->b_data, 0xff, sb->s_blocksize);
                        return 0;
                }
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
                                             ext4_group_t block_group,
                                             struct buffer_head **bh)
 {
-       unsigned long group_desc;
-       unsigned long offset;
+       unsigned int group_desc;
+       unsigned int offset;
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
        if (block_group >= sbi->s_groups_count) {
                ext4_error(sb, "ext4_get_group_desc",
                           "block_group >= groups_count - "
-                          "block_group = %lu, groups_count = %lu",
+                          "block_group = %u, groups_count = %u",
                           block_group, sbi->s_groups_count);
 
                return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
        if (!sbi->s_group_desc[group_desc]) {
                ext4_error(sb, "ext4_get_group_desc",
                           "Group descriptor not loaded - "
-                          "block_group = %lu, group_desc = %lu, desc = %lu",
+                          "block_group = %u, group_desc = %u, desc = %u",
                           block_group, group_desc, offset);
                return NULL;
        }
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
        if (unlikely(!bh)) {
                ext4_error(sb, __func__,
                            "Cannot read block bitmap - "
-                           "block_group = %lu, block_bitmap = %llu",
+                           "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
-       if (buffer_uptodate(bh) &&
-           !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+
+       if (bitmap_uptodate(bh))
                return bh;
 
        lock_buffer(bh);
+       if (bitmap_uptodate(bh)) {
+               unlock_buffer(bh);
+               return bh;
+       }
        spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                ext4_init_block_bitmap(sb, bh, block_group, desc);
+               set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-               unlock_buffer(bh);
                spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+               unlock_buffer(bh);
                return bh;
        }
        spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       if (buffer_uptodate(bh)) {
+               /*
+                * if not uninit if bh is uptodate,
+                * bitmap is also uptodate
+                */
+               set_bitmap_uptodate(bh);
+               unlock_buffer(bh);
+               return bh;
+       }
+       /*
+        * submit the buffer_head for read. We can
+        * safely mark the bitmap as uptodate now.
+        * We do it here so the bitmap uptodate bit
+        * get set with buffer lock held.
+        */
+       set_bitmap_uptodate(bh);
        if (bh_submit_read(bh) < 0) {
                put_bh(bh);
                ext4_error(sb, __func__,
                            "Cannot read block bitmap - "
-                           "block_group = %lu, block_bitmap = %llu",
+                           "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
  * @handle:                    handle to this transaction
  * @sb:                                super block
- * @block:                     start physcial block to free
+ * @block:                     start physcial block to add to the block group
  * @count:                     number of blocks to free
- * @pdquot_freed_blocks:       pointer to quota
  *
- * XXX This function is only used by the on-line resizing code, which
- * should probably be fixed up to call the mballoc variant.  There
- * this needs to be cleaned up later; in fact, I'm not convinced this
- * is 100% correct in the face of the mballoc code.  The online resizing
- * code needs to be fixed up to more tightly (and correctly) interlock
- * with the mballoc code.
+ * This marks the blocks as free in the bitmap. We ask the
+ * mballoc to reload the buddy after this by setting group
+ * EXT4_GROUP_INFO_NEED_INIT_BIT flag
  */
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-                        ext4_fsblk_t block, unsigned long count,
-                        unsigned long *pdquot_freed_blocks)
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+                        ext4_fsblk_t block, unsigned long count)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gd_bh;
        ext4_group_t block_group;
        ext4_grpblk_t bit;
-       unsigned long i;
-       unsigned long overflow;
+       unsigned int i;
        struct ext4_group_desc *desc;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
-       int err = 0, ret;
-       ext4_grpblk_t group_freed;
+       int err = 0, ret, blk_free_count;
+       ext4_grpblk_t blocks_freed;
+       struct ext4_group_info *grp;
 
-       *pdquot_freed_blocks = 0;
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
-       if (block < le32_to_cpu(es->s_first_data_block) ||
-           block + count < block ||
-           block + count > ext4_blocks_count(es)) {
-               ext4_error(sb, "ext4_free_blocks",
-                          "Freeing blocks not in datazone - "
-                          "block = %llu, count = %lu", block, count);
-               goto error_return;
-       }
-
-       ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
+       ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
 
-do_more:
-       overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+       grp = ext4_get_group_info(sb, block_group);
        /*
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-               overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
-               count -= overflow;
+               goto error_return;
        }
-       brelse(bitmap_bh);
        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;
@@ -418,18 +422,17 @@ do_more:
            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
            in_range(block + count - 1, ext4_inode_table(sb, desc),
                     sbi->s_itb_per_group)) {
-               ext4_error(sb, "ext4_free_blocks",
-                          "Freeing blocks in system zones - "
+               ext4_error(sb, __func__,
+                          "Adding blocks in system zones - "
                           "Block = %llu, count = %lu",
                           block, count);
                goto error_return;
        }
 
        /*
-        * We are about to start releasing blocks in the bitmap,
+        * We are about to add blocks to the bitmap,
         * so we need undo access.
         */
-       /* @@@ check errors */
        BUFFER_TRACE(bitmap_bh, "getting undo access");
        err = ext4_journal_get_undo_access(handle, bitmap_bh);
        if (err)
@@ -444,107 +447,55 @@ do_more:
        err = ext4_journal_get_write_access(handle, gd_bh);
        if (err)
                goto error_return;
-
-       jbd_lock_bh_state(bitmap_bh);
-
-       for (i = 0, group_freed = 0; i < count; i++) {
-               /*
-                * An HJ special.  This is expensive...
-                */
-#ifdef CONFIG_JBD2_DEBUG
-               jbd_unlock_bh_state(bitmap_bh);
-               {
-                       struct buffer_head *debug_bh;
-                       debug_bh = sb_find_get_block(sb, block + i);
-                       if (debug_bh) {
-                               BUFFER_TRACE(debug_bh, "Deleted!");
-                               if (!bh2jh(bitmap_bh)->b_committed_data)
-                                       BUFFER_TRACE(debug_bh,
-                                               "No commited data in bitmap");
-                               BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
-                               __brelse(debug_bh);
-                       }
-               }
-               jbd_lock_bh_state(bitmap_bh);
-#endif
-               if (need_resched()) {
-                       jbd_unlock_bh_state(bitmap_bh);
-                       cond_resched();
-                       jbd_lock_bh_state(bitmap_bh);
-               }
-               /* @@@ This prevents newly-allocated data from being
-                * freed and then reallocated within the same
-                * transaction.
-                *
-                * Ideally we would want to allow that to happen, but to
-                * do so requires making jbd2_journal_forget() capable of
-                * revoking the queued write of a data block, which
-                * implies blocking on the journal lock.  *forget()
-                * cannot block due to truncate races.
-                *
-                * Eventually we can fix this by making jbd2_journal_forget()
-                * return a status indicating whether or not it was able
-                * to revoke the buffer.  On successful revoke, it is
-                * safe not to set the allocation bit in the committed
-                * bitmap, because we know that there is no outstanding
-                * activity on the buffer any more and so it is safe to
-                * reallocate it.
-                */
-               BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
-               J_ASSERT_BH(bitmap_bh,
-                               bh2jh(bitmap_bh)->b_committed_data != NULL);
-               ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
-                               bh2jh(bitmap_bh)->b_committed_data);
-
-               /*
-                * We clear the bit in the bitmap after setting the committed
-                * data bit, because this is the reverse order to that which
-                * the allocator uses.
-                */
+       /*
+        * make sure we don't allow a parallel init on other groups in the
+        * same buddy cache
+        */
+       down_write(&grp->alloc_sem);
+       for (i = 0, blocks_freed = 0; i < count; i++) {
                BUFFER_TRACE(bitmap_bh, "clear bit");
                if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
                                                bit + i, bitmap_bh->b_data)) {
-                       jbd_unlock_bh_state(bitmap_bh);
                        ext4_error(sb, __func__,
                                   "bit already cleared for block %llu",
                                   (ext4_fsblk_t)(block + i));
-                       jbd_lock_bh_state(bitmap_bh);
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
                } else {
-                       group_freed++;
+                       blocks_freed++;
                }
        }
-       jbd_unlock_bh_state(bitmap_bh);
-
        spin_lock(sb_bgl_lock(sbi, block_group));
-       le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+       blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+       ext4_free_blks_set(sb, desc, blk_free_count);
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
        spin_unlock(sb_bgl_lock(sbi, block_group));
-       percpu_counter_add(&sbi->s_freeblocks_counter, count);
+       percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
                spin_lock(sb_bgl_lock(sbi, flex_group));
-               sbi->s_flex_groups[flex_group].free_blocks += count;
+               sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
+       /*
+        * request to reload the buddy with the
+        * new bitmap information
+        */
+       set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+       ext4_mb_update_group_info(grp, blocks_freed);
+       up_write(&grp->alloc_sem);
 
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-       err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 
        /* And the group descriptor block */
        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-       ret = ext4_journal_dirty_metadata(handle, gd_bh);
-       if (!err) err = ret;
-       *pdquot_freed_blocks += group_freed;
-
-       if (overflow && !err) {
-               block += count;
-               count = overflow;
-               goto do_more;
-       }
+       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+       if (!err)
+               err = ret;
        sb->s_dirt = 1;
+
 error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
                if (dirty_blocks < 0) {
                        printk(KERN_CRIT "Dirty block accounting "
                                        "went wrong %lld\n",
-                                       dirty_blocks);
+                                       (long long)dirty_blocks);
                }
        }
        /* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
        return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
 
-#define EXT4_META_BLOCK 0x1
-
-static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
-                               ext4_lblk_t iblock, ext4_fsblk_t goal,
-                               unsigned long *count, int *errp, int flags)
-{
-       struct ext4_allocation_request ar;
-       ext4_fsblk_t ret;
-
-       memset(&ar, 0, sizeof(ar));
-       /* Fill with neighbour allocated blocks */
-
-       ar.inode = inode;
-       ar.goal = goal;
-       ar.len = *count;
-       ar.logical = iblock;
-
-       if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
-               /* enable in-core preallocation for data block allocation */
-               ar.flags = EXT4_MB_HINT_DATA;
-       else
-               /* disable in-core preallocation for non-regular files */
-               ar.flags = 0;
-
-       ret = ext4_mb_new_blocks(handle, &ar, errp);
-       *count = ar.len;
-       return ret;
-}
-
 /*
  * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
  *
  * @handle:             handle to this transaction
  * @inode:              file inode
  * @goal:               given target block(filesystem wide)
- * @count:             total number of blocks need
+ * @count:             pointer to total number of blocks needed
  * @errp:               error code
  *
- * Return 1st allocated block numberon success, *count stores total account
+ * Return 1st allocated block number on success, *count stores total account
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
+       struct ext4_allocation_request ar;
        ext4_fsblk_t ret;
-       ret = do_blk_alloc(handle, inode, 0, goal,
-                               count, errp, EXT4_META_BLOCK);
+
+       memset(&ar, 0, sizeof(ar));
+       /* Fill with neighbour allocated blocks */
+       ar.inode = inode;
+       ar.goal = goal;
+       ar.len = count ? *count : 1;
+
+       ret = ext4_mb_new_blocks(handle, &ar, errp);
+       if (count)
+               *count = ar.len;
+
        /*
         * Account for the allocated meta blocks
         */
        if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-               EXT4_I(inode)->i_allocated_meta_blocks += *count;
+               EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        }
        return ret;
 }
 
-/*
- * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @errp:               error code
- *
- * Return allocated block number on success
- */
-ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-               ext4_fsblk_t goal, int *errp)
-{
-       unsigned long count = 1;
-       return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
-}
-
-/*
- * ext4_new_blocks() -- allocate data blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @count:             total number of blocks need
- * @errp:               error code
- *
- * Return 1st allocated block numberon success, *count stores total account
- * error stores in errp pointer
- */
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-                               ext4_lblk_t iblock, ext4_fsblk_t goal,
-                               unsigned long *count, int *errp)
-{
-       return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
-}
-
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
  * @sb:                superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
-       unsigned long x;
+       unsigned int x;
        struct buffer_head *bitmap_bh = NULL;
 
        es = EXT4_SB(sb)->s_es;
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                        continue;
 
                x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-               printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
+               printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
                        i, le16_to_cpu(gdp->bg_free_blocks_count), x);
                bitmap_count += x;
        }
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+               desc_count += ext4_free_blks_count(sb, gdp);
        }
 
        return desc_count;
index 0a7a666..fa3af81 100644 (file)
 
 static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
-unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
+unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
 {
-       unsigned int i;
-       unsigned long sum = 0;
+       unsigned int i, sum = 0;
 
        if (!map)
                return 0;
index fed5b61..2df2e40 100644 (file)
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 int ext4_check_dir_entry(const char *function, struct inode *dir,
                         struct ext4_dir_entry_2 *de,
                         struct buffer_head *bh,
-                        unsigned long offset)
+                        unsigned int offset)
 {
        const char *error_msg = NULL;
        const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
        if (error_msg != NULL)
                ext4_error(dir->i_sb, function,
                        "bad entry in directory #%lu: %s - "
-                       "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+                       "offset=%u, inode=%u, rec_len=%d, name_len=%d",
                        dir->i_ino, error_msg, offset,
-                       (unsigned long) le32_to_cpu(de->inode),
+                       le32_to_cpu(de->inode),
                        rlen, de->name_len);
        return error_msg == NULL ? 1 : 0;
 }
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
                         void *dirent, filldir_t filldir)
 {
        int error = 0;
-       unsigned long offset;
+       unsigned int offset;
        int i, stored;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
        sb = inode->i_sb;
 
        if (!fname) {
-               printk(KERN_ERR "ext4: call_filldir: called with "
+               printk(KERN_ERR "EXT4-fs: call_filldir: called with "
                       "null fname?!?\n");
                return 0;
        }
index 6c46c64..c668e43 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <linux/blkdev.h>
 #include <linux/magic.h>
+#include <linux/jbd2.h>
 #include "ext4_i.h"
 
 /*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
        /* phys. block for ^^^ */
        ext4_fsblk_t pright;
        /* how many blocks we want to allocate */
-       unsigned long len;
+       unsigned int len;
        /* flags. see above EXT4_MB_HINT_* */
-       unsigned long flags;
+       unsigned int flags;
 };
 
 /*
@@ -156,12 +157,12 @@ struct ext4_group_desc
        __le32  bg_block_bitmap_lo;     /* Blocks bitmap block */
        __le32  bg_inode_bitmap_lo;     /* Inodes bitmap block */
        __le32  bg_inode_table_lo;      /* Inodes table block */
-       __le16  bg_free_blocks_count;   /* Free blocks count */
-       __le16  bg_free_inodes_count;   /* Free inodes count */
-       __le16  bg_used_dirs_count;     /* Directories count */
+       __le16  bg_free_blocks_count_lo;/* Free blocks count */
+       __le16  bg_free_inodes_count_lo;/* Free inodes count */
+       __le16  bg_used_dirs_count_lo;  /* Directories count */
        __le16  bg_flags;               /* EXT4_BG_flags (INODE_UNINIT, etc) */
        __u32   bg_reserved[2];         /* Likely block/inode bitmap checksum */
-       __le16  bg_itable_unused;       /* Unused inodes count */
+       __le16  bg_itable_unused_lo;    /* Unused inodes count */
        __le16  bg_checksum;            /* crc16(sb_uuid+group+desc) */
        __le32  bg_block_bitmap_hi;     /* Blocks bitmap block MSB */
        __le32  bg_inode_bitmap_hi;     /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
        __le16  bg_free_blocks_count_hi;/* Free blocks count MSB */
        __le16  bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16  bg_used_dirs_count_hi;  /* Directories count MSB */
-       __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
+       __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
        __u32   bg_reserved2[3];
 };
 
@@ -328,6 +329,7 @@ struct ext4_mount_options {
        uid_t s_resuid;
        gid_t s_resgid;
        unsigned long s_commit_interval;
+       u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
        int s_jquota_fmt;
        char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do {                                                                               \
 #define EXT4_MOUNT_QUOTA               0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA            0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA            0x200000 /* "old" group quota */
-#define EXT4_MOUNT_EXTENTS             0x400000 /* Extents support */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM    0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
  */
 
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)                       \
-       (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
+       ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)                    \
-       (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
+       ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)                     \
-       (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
+       ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
 #define EXT4_SET_COMPAT_FEATURE(sb,mask)                       \
        EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
 #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)                    \
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEFM_JMODE_WBACK  0x0060
 
 /*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME        0
+#define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */
+
+/*
  * Structure of a directory entry
  */
 #define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
 #define DX_HASH_LEGACY         0
 #define DX_HASH_HALF_MD4       1
 #define DX_HASH_TEA            2
+#define DX_HASH_LEGACY_UNSIGNED        3
+#define DX_HASH_HALF_MD4_UNSIGNED      4
+#define DX_HASH_TEA_UNSIGNED           5
 
 #ifdef __KERNEL__
 
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 #define ERR_BAD_DX_DIR -75000
 
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-                       unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
+                       ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
 extern struct proc_dir_entry *ext4_proc_root;
 
@@ -987,6 +997,9 @@ do {                                                                        \
 # define ATTRIB_NORET  __attribute__((noreturn))
 # define NORET_AND     noreturn,
 
+/* bitmap.c */
+extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+
 /* balloc.c */
 extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
-extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t goal, int *errp);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-                                       ext4_lblk_t iblock, ext4_fsblk_t goal,
-                                       unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-                               ext4_fsblk_t block, unsigned long count,
-                               unsigned long *pdquot_freed_blocks);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+                               ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
                                struct ext4_dir_entry_2 *,
-                               struct buffer_head *, unsigned long);
+                               struct buffer_head *, unsigned int);
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                    __u32 minor_hash,
                                    struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
                unsigned long, unsigned long, int, unsigned long *);
-extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
                ext4_grpblk_t add);
-
-
+extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
+                                               ext4_group_t, int);
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                                struct buffer_head *bh_result, int create);
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-                               ext4_lblk_t iblock, unsigned long maxblocks,
-                               struct buffer_head *bh_result,
-                               int create, int extend_disksize);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
+extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
+                               const char *, const char *, ...)
+       __attribute__ ((format (printf, 4, 5)));
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
                                        __u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
 extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
+extern __u32 ext4_free_blks_count(struct super_block *sb,
+                               struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+                                struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+                               struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+                                  struct ext4_group_desc *bg);
 extern void ext4_block_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_blks_set(struct super_block *sb,
+                              struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+                               struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+                               struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+                                  struct ext4_group_desc *bg, __u32 count);
 
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
        return ;
 }
 
+struct ext4_group_info {
+       unsigned long   bb_state;
+       struct rb_root  bb_free_root;
+       unsigned short  bb_first_free;
+       unsigned short  bb_free;
+       unsigned short  bb_fragments;
+       struct          list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+       void            *bb_bitmap;
+#endif
+       struct rw_semaphore alloc_sem;
+       unsigned short  bb_counters[];
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT  0
+#define EXT4_GROUP_INFO_LOCKED_BIT     1
+
+#define EXT4_MB_GRP_NEED_INIT(grp)     \
+       (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+       bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+                                       ext4_group_t group)
+{
+       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+       bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline int ext4_is_group_locked(struct super_block *sb,
+                                       ext4_group_t group)
+{
+       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+       return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+                                               &(grinfo->bb_state));
+}
+
 /*
  * Inodes and files operations
  */
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
                                       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-                       ext4_lblk_t iblock,
-                       unsigned long max_blocks, struct buffer_head *bh_result,
-                       int create, int extend_disksize);
+                              ext4_lblk_t iblock, unsigned int max_blocks,
+                              struct buffer_head *bh_result,
+                              int create, int extend_disksize);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
                          loff_t len);
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
-                       sector_t block, unsigned long max_blocks,
+                       sector_t block, unsigned int max_blocks,
                        struct buffer_head *bh, int create,
                        int extend_disksize, int flag);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+                       __u64 start, __u64 len);
+
+/*
+ * Add new method to test wether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+       return (buffer_uptodate(bh) &&
+                       test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+       set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _EXT4_H */
index bec7ce5..18cb67b 100644 (file)
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
        return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
 }
 
-static inline void ext4_ext_tree_changed(struct inode *inode)
-{
-       EXT4_I(inode)->i_ext_generation++;
-}
-
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
index 5c124c0..e69acc1 100644 (file)
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
 typedef __u32 ext4_lblk_t;
 
 /* data type for block group number */
-typedef unsigned long ext4_group_t;
+typedef unsigned int ext4_group_t;
 
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
         */
        loff_t  i_disksize;
 
-       /* on-disk additional length */
-       __u16 i_extra_isize;
-
        /*
         * i_data_sem is for serialising ext4_truncate() against
         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
        struct inode vfs_inode;
        struct jbd2_inode jinode;
 
-       unsigned long i_ext_generation;
        struct ext4_ext_cache i_cached_extent;
        /*
         * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
        spinlock_t i_prealloc_lock;
 
        /* allocation reservation info for delalloc */
-       unsigned long i_reserved_data_blocks;
-       unsigned long i_reserved_meta_blocks;
-       unsigned long i_allocated_meta_blocks;
+       unsigned int i_reserved_data_blocks;
+       unsigned int i_reserved_meta_blocks;
+       unsigned int i_allocated_meta_blocks;
        unsigned short i_delalloc_reserved_flag;
+
+       /* on-disk additional length */
+       __u16 i_extra_isize;
+
        spinlock_t i_block_reservation_lock;
 };
 
index c75384b..ad13a84 100644 (file)
@@ -7,53 +7,96 @@
 int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
-       int err = jbd2_journal_get_undo_access(handle, bh);
-       if (err)
-               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       int err = 0;
+
+       if (ext4_handle_valid(handle)) {
+               err = jbd2_journal_get_undo_access(handle, bh);
+               if (err)
+                       ext4_journal_abort_handle(where, __func__, bh,
+                                                 handle, err);
+       }
        return err;
 }
 
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
-       int err = jbd2_journal_get_write_access(handle, bh);
-       if (err)
-               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       int err = 0;
+
+       if (ext4_handle_valid(handle)) {
+               err = jbd2_journal_get_write_access(handle, bh);
+               if (err)
+                       ext4_journal_abort_handle(where, __func__, bh,
+                                                 handle, err);
+       }
        return err;
 }
 
 int __ext4_journal_forget(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
-       int err = jbd2_journal_forget(handle, bh);
-       if (err)
-               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       int err = 0;
+
+       if (ext4_handle_valid(handle)) {
+               err = jbd2_journal_forget(handle, bh);
+               if (err)
+                       ext4_journal_abort_handle(where, __func__, bh,
+                                                 handle, err);
+       }
        return err;
 }
 
 int __ext4_journal_revoke(const char *where, handle_t *handle,
                                ext4_fsblk_t blocknr, struct buffer_head *bh)
 {
-       int err = jbd2_journal_revoke(handle, blocknr, bh);
-       if (err)
-               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       int err = 0;
+
+       if (ext4_handle_valid(handle)) {
+               err = jbd2_journal_revoke(handle, blocknr, bh);
+               if (err)
+                       ext4_journal_abort_handle(where, __func__, bh,
+                                                 handle, err);
+       }
        return err;
 }
 
 int __ext4_journal_get_create_access(const char *where,
                                handle_t *handle, struct buffer_head *bh)
 {
-       int err = jbd2_journal_get_create_access(handle, bh);
-       if (err)
-               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       int err = 0;
+
+       if (ext4_handle_valid(handle)) {
+               err = jbd2_journal_get_create_access(handle, bh);
+               if (err)
+                       ext4_journal_abort_handle(where, __func__, bh,
+                                                 handle, err);
+       }
        return err;
 }
 
-int __ext4_journal_dirty_metadata(const char *where,
-                               handle_t *handle, struct buffer_head *bh)
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+                                struct inode *inode, struct buffer_head *bh)
 {
-       int err = jbd2_journal_dirty_metadata(handle, bh);
-       if (err)
-               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       int err = 0;
+
+       if (ext4_handle_valid(handle)) {
+               err = jbd2_journal_dirty_metadata(handle, bh);
+               if (err)
+                       ext4_journal_abort_handle(where, __func__, bh,
+                                                 handle, err);
+       } else {
+               mark_buffer_dirty(bh);
+               if (inode && inode_needs_sync(inode)) {
+                       sync_dirty_buffer(bh);
+                       if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                               ext4_error(inode->i_sb, __func__,
+                                          "IO error syncing inode, "
+                                          "inode=%lu, block=%llu",
+                                          inode->i_ino,
+                                          (unsigned long long) bh->b_blocknr);
+                               err = -EIO;
+                       }
+               }
+       }
        return err;
 }
index b455c68..be2f426 100644 (file)
@@ -32,8 +32,8 @@
  * 5 levels of tree + root which are stored in the inode. */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                               \
-       (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-               || test_opt(sb, EXTENTS) ? 27U : 8U)
+       (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
+        ? 27U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
  * been done yet.
  */
 
-static inline void ext4_journal_release_buffer(handle_t *handle,
-                                               struct buffer_head *bh)
-{
-       jbd2_journal_release_buffer(handle, bh);
-}
-
 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
                struct buffer_head *bh, handle_t *handle, int err);
 
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
 int __ext4_journal_get_create_access(const char *where,
                                handle_t *handle, struct buffer_head *bh);
 
-int __ext4_journal_dirty_metadata(const char *where,
-                               handle_t *handle, struct buffer_head *bh);
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+                                struct inode *inode, struct buffer_head *bh);
 
 #define ext4_journal_get_undo_access(handle, bh) \
        __ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
        __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
        __ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_dirty_metadata(handle, bh) \
-       __ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
        __ext4_journal_forget(__func__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+       __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
 
+#define EXT4_NOJOURNAL_HANDLE  ((handle_t *) 0x1)
+
+static inline int ext4_handle_valid(handle_t *handle)
+{
+       if (handle == EXT4_NOJOURNAL_HANDLE)
+               return 0;
+       return 1;
+}
+
+static inline void ext4_handle_sync(handle_t *handle)
+{
+       if (ext4_handle_valid(handle))
+               handle->h_sync = 1;
+}
+
+static inline void ext4_handle_release_buffer(handle_t *handle,
+                                               struct buffer_head *bh)
+{
+       if (ext4_handle_valid(handle))
+               jbd2_journal_release_buffer(handle, bh);
+}
+
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+       if (ext4_handle_valid(handle))
+               return is_handle_aborted(handle);
+       return 0;
+}
+
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+       if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
+               return 0;
+       return 1;
+}
+
+static inline void ext4_journal_release_buffer(handle_t *handle,
+                                               struct buffer_head *bh)
+{
+       if (ext4_handle_valid(handle))
+               jbd2_journal_release_buffer(handle, bh);
+}
+
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
        return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
 
 static inline int ext4_journal_extend(handle_t *handle, int nblocks)
 {
-       return jbd2_journal_extend(handle, nblocks);
+       if (ext4_handle_valid(handle))
+               return jbd2_journal_extend(handle, nblocks);
+       return 0;
 }
 
 static inline int ext4_journal_restart(handle_t *handle, int nblocks)
 {
-       return jbd2_journal_restart(handle, nblocks);
+       if (ext4_handle_valid(handle))
+               return jbd2_journal_restart(handle, nblocks);
+       return 0;
 }
 
 static inline int ext4_journal_blocks_per_page(struct inode *inode)
 {
-       return jbd2_journal_blocks_per_page(inode);
+       if (EXT4_JOURNAL(inode) != NULL)
+               return jbd2_journal_blocks_per_page(inode);
+       return 0;
 }
 
 static inline int ext4_journal_force_commit(journal_t *journal)
 {
-       return jbd2_journal_force_commit(journal);
+       if (journal)
+               return jbd2_journal_force_commit(journal);
+       return 0;
 }
 
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-       return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+       if (ext4_handle_valid(handle))
+               return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+       return 0;
 }
 
 /* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
 
 static inline int ext4_should_journal_data(struct inode *inode)
 {
+       if (EXT4_JOURNAL(inode) == NULL)
+               return 0;
        if (!S_ISREG(inode->i_mode))
                return 1;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
+       if (EXT4_JOURNAL(inode) == NULL)
+               return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
+       if (EXT4_JOURNAL(inode) == NULL)
+               return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
index b21f167..039b6ea 100644 (file)
@@ -57,6 +57,7 @@ struct ext4_sb_info {
        u32 s_next_generation;
        u32 s_hash_seed[4];
        int s_def_hash_version;
+       int s_hash_unsigned;    /* 3 if hash should be signed, 0 if not */
        struct percpu_counter s_freeblocks_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
        struct journal_s *s_journal;
        struct list_head s_orphan;
        unsigned long s_commit_interval;
+       u32 s_max_batch_time;
+       u32 s_min_batch_time;
        struct block_device *journal_bdev;
 #ifdef CONFIG_JBD2_DEBUG
        struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
        spinlock_t s_reserve_lock;
        spinlock_t s_md_lock;
        tid_t s_last_transaction;
-       unsigned short *s_mb_offsets, *s_mb_maxs;
+       unsigned short *s_mb_offsets;
+       unsigned int *s_mb_maxs;
 
        /* tunables */
        unsigned long s_stripe;
index 3f54db3..54bf062 100644 (file)
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
        int err;
 
+       if (!ext4_handle_valid(handle))
+               return 0;
        if (handle->h_buffer_credits > needed)
                return 0;
        err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
        int err;
        if (path->p_bh) {
                /* path points to block */
-               err = ext4_journal_dirty_metadata(handle, path->p_bh);
+               err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
        } else {
                /* path points to leaf/index in inode body */
                err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
        ext4_fsblk_t goal, newblock;
 
        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-       newblock = ext4_new_meta_block(handle, inode, goal, err);
+       newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
        return newblock;
 }
 
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
 
-       err = ext4_journal_dirty_metadata(handle, bh);
+       err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto cleanup;
        brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
 
-               err = ext4_journal_dirty_metadata(handle, bh);
+               err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto cleanup;
                brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
 
-       err = ext4_journal_dirty_metadata(handle, bh);
+       err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto out;
 
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
        while (--depth >= 0) {
                ix = path[depth].p_idx;
                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
-                       break;
+                       goto got_index;
        }
 
-       if (depth < 0) {
-               /* we've gone up to the root and
-                * found no index to the right */
-               return 0;
-       }
+       /* we've gone up to the root and found no index to the right */
+       return 0;
 
+got_index:
        /* we've found index to the right, let's
         * follow it and find the closest allocated
         * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
        *phys = ext_pblock(ex);
        put_bh(bh);
        return 0;
-
 }
 
 /*
@@ -1622,7 +1621,6 @@ cleanup:
                ext4_ext_drop_refs(npath);
                kfree(npath);
        }
-       ext4_ext_tree_changed(inode);
        ext4_ext_invalidate_cache(inode);
        return err;
 }
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
                }
        }
 out:
-       ext4_ext_tree_changed(inode);
        ext4_ext_drop_refs(path);
        kfree(path);
        ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
         * possible initialization would be here
         */
 
-       if (test_opt(sb, EXTENTS)) {
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
                printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
                printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
  */
 void ext4_ext_release(struct super_block *sb)
 {
-       if (!test_opt(sb, EXTENTS))
+       if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
                return;
 
 #ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_ext_path *path,
                                                ext4_lblk_t iblock,
-                                               unsigned long max_blocks)
+                                               unsigned int max_blocks)
 {
        struct ext4_extent *ex, newex, orig_ex;
        struct ext4_extent *ex1 = NULL;
@@ -2678,26 +2675,26 @@ fix_extent_len:
  */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
-                       unsigned long max_blocks, struct buffer_head *bh_result,
+                       unsigned int max_blocks, struct buffer_head *bh_result,
                        int create, int extend_disksize)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
        struct ext4_extent newex, *ex;
-       ext4_fsblk_t goal, newblock;
-       int err = 0, depth, ret;
-       unsigned long allocated = 0;
+       ext4_fsblk_t newblock;
+       int err = 0, depth, ret, cache_type;
+       unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        loff_t disksize;
 
        __clear_bit(BH_New, &bh_result->b_state);
-       ext_debug("blocks %u/%lu requested for inode %u\n",
+       ext_debug("blocks %u/%u requested for inode %u\n",
                        iblock, max_blocks, inode->i_ino);
 
        /* check in cache */
-       goal = ext4_ext_in_cache(inode, iblock, &newex);
-       if (goal) {
-               if (goal == EXT4_EXT_CACHE_GAP) {
+       cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+       if (cache_type) {
+               if (cache_type == EXT4_EXT_CACHE_GAP) {
                        if (!create) {
                                /*
                                 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                goto out2;
                        }
                        /* we should allocate requested block */
-               } else if (goal == EXT4_EXT_CACHE_EXTENT) {
+               } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
                        /* block is already allocated */
                        newblock = iblock
                                   - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        if (!newblock)
                goto out2;
        ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
-                       goal, newblock, allocated);
+                 ar.goal, newblock, allocated);
 
        /* try to insert new extent into found leaf and return */
        ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
         * transaction synchronous.
         */
        if (IS_SYNC(inode))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
 out_stop:
        up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
        handle_t *handle;
        ext4_lblk_t block;
        loff_t new_size;
-       unsigned long max_blocks;
+       unsigned int max_blocks;
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
@@ -3083,7 +3080,7 @@ retry:
 /*
  * Callback function called for each extent to gather FIEMAP information.
  */
-int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
                       struct ext4_ext_cache *newex, struct ext4_extent *ex,
                       void *data)
 {
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 /* fiemap flags we can handle specified here */
 #define EXT4_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
-int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+static int ext4_xattr_fiemap(struct inode *inode,
+                               struct fiemap_extent_info *fieinfo)
 {
        __u64 physical = 0;
        __u64 length;
index 6bd11fb..f731cb5 100644 (file)
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
 
-extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-               __u64 start, __u64 len);
-
 const struct file_operations ext4_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
index 556ca8e..ac8f168 100644 (file)
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash(const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-       __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+       __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+       const unsigned char *ucp = (const unsigned char *) name;
+
+       while (len--) {
+               hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+               if (hash & 0x80000000)
+                       hash -= 0x7fffffff;
+               hash1 = hash0;
+               hash0 = hash;
+       }
+       return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+       __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+       const signed char *scp = (const signed char *) name;
+
        while (len--) {
-               __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+               hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
 
-               if (hash & 0x80000000) hash -= 0x7fffffff;
+               if (hash & 0x80000000)
+                       hash -= 0x7fffffff;
                hash1 = hash0;
                hash0 = hash;
        }
-       return (hash0 << 1);
+       return hash0 << 1;
+}
+
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+       __u32   pad, val;
+       int     i;
+       const signed char *scp = (const signed char *) msg;
+
+       pad = (__u32)len | ((__u32)len << 8);
+       pad |= pad << 16;
+
+       val = pad;
+       if (len > num*4)
+               len = num * 4;
+       for (i = 0; i < len; i++) {
+               if ((i % 4) == 0)
+                       val = pad;
+               val = ((int) scp[i]) + (val << 8);
+               if ((i % 4) == 3) {
+                       *buf++ = val;
+                       val = pad;
+                       num--;
+               }
+       }
+       if (--num >= 0)
+               *buf++ = val;
+       while (--num >= 0)
+               *buf++ = pad;
 }
 
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
 {
        __u32   pad, val;
        int     i;
+       const unsigned char *ucp = (const unsigned char *) msg;
 
        pad = (__u32)len | ((__u32)len << 8);
        pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
        for (i = 0; i < len; i++) {
                if ((i % 4) == 0)
                        val = pad;
-               val = msg[i] + (val << 8);
+               val = ((int) ucp[i]) + (val << 8);
                if ((i % 4) == 3) {
                        *buf++ = val;
                        val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        const char      *p;
        int             i;
        __u32           in[8], buf[4];
+       void            (*str2hashbuf)(const char *, int, __u32 *, int) =
+                               str2hashbuf_signed;
 
        /* Initialize the default seed for the hash checksum functions */
        buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        }
 
        switch (hinfo->hash_version) {
+       case DX_HASH_LEGACY_UNSIGNED:
+               hash = dx_hack_hash_unsigned(name, len);
+               break;
        case DX_HASH_LEGACY:
-               hash = dx_hack_hash(name, len);
+               hash = dx_hack_hash_signed(name, len);
                break;
+       case DX_HASH_HALF_MD4_UNSIGNED:
+               str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_HALF_MD4:
                p = name;
                while (len > 0) {
-                       str2hashbuf(p, len, in, 8);
+                       (*str2hashbuf)(p, len, in, 8);
                        half_md4_transform(buf, in);
                        len -= 32;
                        p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
                minor_hash = buf[2];
                hash = buf[1];
                break;
+       case DX_HASH_TEA_UNSIGNED:
+               str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_TEA:
                p = name;
                while (len > 0) {
-                       str2hashbuf(p, len, in, 4);
+                       (*str2hashbuf)(p, len, in, 4);
                        TEA_transform(buf, in);
                        len -= 16;
                        p += 16;
index 6e60528..4fb86a0 100644 (file)
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
        /* If checksum is bad mark all blocks and inodes use to prevent
         * allocation, essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-               ext4_error(sb, __func__, "Checksum bad for group %lu\n",
+               ext4_error(sb, __func__, "Checksum bad for group %u",
                           block_group);
-               gdp->bg_free_blocks_count = 0;
-               gdp->bg_free_inodes_count = 0;
-               gdp->bg_itable_unused = 0;
+               ext4_free_blks_set(sb, gdp, 0);
+               ext4_free_inodes_set(sb, gdp, 0);
+               ext4_itable_unused_set(sb, gdp, 0);
                memset(bh->b_data, 0xff, sb->s_blocksize);
                return 0;
        }
 
        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
 
        return EXT4_INODES_PER_GROUP(sb);
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        if (unlikely(!bh)) {
                ext4_error(sb, __func__,
                            "Cannot read inode bitmap - "
-                           "block_group = %lu, inode_bitmap = %llu",
+                           "block_group = %u, inode_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
-       if (buffer_uptodate(bh) &&
-           !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+       if (bitmap_uptodate(bh))
                return bh;
 
        lock_buffer(bh);
+       if (bitmap_uptodate(bh)) {
+               unlock_buffer(bh);
+               return bh;
+       }
        spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
+               set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-               unlock_buffer(bh);
                spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+               unlock_buffer(bh);
                return bh;
        }
        spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       if (buffer_uptodate(bh)) {
+               /*
+                * if not uninit if bh is uptodate,
+                * bitmap is also uptodate
+                */
+               set_bitmap_uptodate(bh);
+               unlock_buffer(bh);
+               return bh;
+       }
+       /*
+        * submit the buffer_head for read. We can
+        * safely mark the bitmap as uptodate now.
+        * We do it here so the bitmap uptodate bit
+        * get set with buffer lock held.
+        */
+       set_bitmap_uptodate(bh);
        if (bh_submit_read(bh) < 0) {
                put_bh(bh);
                ext4_error(sb, __func__,
                            "Cannot read inode bitmap - "
-                           "block_group = %lu, inode_bitmap = %llu",
+                           "block_group = %u, inode_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
-       int fatal = 0, err;
+       int fatal = 0, err, count;
        ext4_group_t flex_group;
 
        if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
        ino = inode->i_ino;
        ext4_debug("freeing inode %lu\n", ino);
+       trace_mark(ext4_free_inode,
+                  "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
+                  sb->s_id, inode->i_ino, inode->i_mode,
+                  (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
+                  (unsigned long long) inode->i_blocks);
 
        /*
         * Note: we must free any quota before locking the superblock,
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
                if (gdp) {
                        spin_lock(sb_bgl_lock(sbi, block_group));
-                       le16_add_cpu(&gdp->bg_free_inodes_count, 1);
-                       if (is_directory)
-                               le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+                       count = ext4_free_inodes_count(sb, gdp) + 1;
+                       ext4_free_inodes_set(sb, gdp, count);
+                       if (is_directory) {
+                               count = ext4_used_dirs_count(sb, gdp) - 1;
+                               ext4_used_dirs_set(sb, gdp, count);
+                       }
                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
                                                        block_group, gdp);
                        spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                                spin_unlock(sb_bgl_lock(sbi, flex_group));
                        }
                }
-               BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
-               err = ext4_journal_dirty_metadata(handle, bh2);
+               BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+               err = ext4_handle_dirty_metadata(handle, NULL, bh2);
                if (!fatal) fatal = err;
        }
-       BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
-       err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+       BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        if (!fatal)
                fatal = err;
        sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 
        for (group = 0; group < ngroups; group++) {
                desc = ext4_get_group_desc(sb, group, NULL);
-               if (!desc || !desc->bg_free_inodes_count)
+               if (!desc || !ext4_free_inodes_count(sb, desc))
                        continue;
-               if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+               if (ext4_free_inodes_count(sb, desc) < avefreei)
                        continue;
                if (!best_desc ||
-                   (le16_to_cpu(desc->bg_free_blocks_count) >
-                    le16_to_cpu(best_desc->bg_free_blocks_count))) {
+                   (ext4_free_blks_count(sb, desc) >
+                    ext4_free_blks_count(sb, best_desc))) {
                        *best_group = group;
                        best_desc = desc;
                        ret = 0;
@@ -369,7 +397,7 @@ found_flexbg:
        for (i = best_flex * flex_size; i < ngroups &&
                     i < (best_flex + 1) * flex_size; i++) {
                desc = ext4_get_group_desc(sb, i, &bh);
-               if (le16_to_cpu(desc->bg_free_inodes_count)) {
+               if (ext4_free_inodes_count(sb, desc)) {
                        *best_group = i;
                        goto out;
                }
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                for (i = 0; i < ngroups; i++) {
                        grp = (parent_group + i) % ngroups;
                        desc = ext4_get_group_desc(sb, grp, NULL);
-                       if (!desc || !desc->bg_free_inodes_count)
+                       if (!desc || !ext4_free_inodes_count(sb, desc))
                                continue;
-                       if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+                       if (ext4_used_dirs_count(sb, desc) >= best_ndir)
                                continue;
-                       if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+                       if (ext4_free_inodes_count(sb, desc) < avefreei)
                                continue;
-                       if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+                       if (ext4_free_blks_count(sb, desc) < avefreeb)
                                continue;
                        *group = grp;
                        ret = 0;
-                       best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+                       best_ndir = ext4_used_dirs_count(sb, desc);
                }
                if (ret == 0)
                        return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        for (i = 0; i < ngroups; i++) {
                *group = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
-               if (!desc || !desc->bg_free_inodes_count)
+               if (!desc || !ext4_free_inodes_count(sb, desc))
                        continue;
-               if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+               if (ext4_used_dirs_count(sb, desc) >= max_dirs)
                        continue;
-               if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+               if (ext4_free_inodes_count(sb, desc) < min_inodes)
                        continue;
-               if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+               if (ext4_free_blks_count(sb, desc) < min_blocks)
                        continue;
                return 0;
        }
@@ -494,8 +522,8 @@ fallback:
        for (i = 0; i < ngroups; i++) {
                *group = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
-               if (desc && desc->bg_free_inodes_count &&
-                       le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+               if (desc && ext4_free_inodes_count(sb, desc) &&
+                       ext4_free_inodes_count(sb, desc) >= avefreei)
                        return 0;
        }
 
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
         */
        *group = parent_group;
        desc = ext4_get_group_desc(sb, *group, NULL);
-       if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-                       le16_to_cpu(desc->bg_free_blocks_count))
+       if (desc && ext4_free_inodes_count(sb, desc) &&
+                       ext4_free_blks_count(sb, desc))
                return 0;
 
        /*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                if (*group >= ngroups)
                        *group -= ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
-               if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-                               le16_to_cpu(desc->bg_free_blocks_count))
+               if (desc && ext4_free_inodes_count(sb, desc) &&
+                               ext4_free_blks_count(sb, desc))
                        return 0;
        }
 
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                if (++*group >= ngroups)
                        *group = 0;
                desc = ext4_get_group_desc(sb, *group, NULL);
-               if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+               if (desc && ext4_free_inodes_count(sb, desc))
                        return 0;
        }
 
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 }
 
 /*
+ * claim the inode from the inode bitmap. If the group
+ * is uninit we need to take the groups's sb_bgl_lock
+ * and clear the uninit flag. The inode bitmap update
+ * and group desc uninit flag clear should be done
+ * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * doesn't race with the ext4_claim_inode
+ */
+static int ext4_claim_inode(struct super_block *sb,
+                       struct buffer_head *inode_bitmap_bh,
+                       unsigned long ino, ext4_group_t group, int mode)
+{
+       int free = 0, retval = 0, count;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+       spin_lock(sb_bgl_lock(sbi, group));
+       if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+               /* not a free inode */
+               retval = 1;
+               goto err_ret;
+       }
+       ino++;
+       if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+                       ino > EXT4_INODES_PER_GROUP(sb)) {
+               spin_unlock(sb_bgl_lock(sbi, group));
+               ext4_error(sb, __func__,
+                          "reserved inode or inode > inodes count - "
+                          "block_group = %u, inode=%lu", group,
+                          ino + group * EXT4_INODES_PER_GROUP(sb));
+               return 1;
+       }
+       /* If we didn't allocate from within the initialized part of the inode
+        * table then we need to initialize up to this inode. */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+
+               if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+                       gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+                       /* When marking the block group with
+                        * ~EXT4_BG_INODE_UNINIT we don't want to depend
+                        * on the value of bg_itable_unused even though
+                        * mke2fs could have initialized the same for us.
+                        * Instead we calculated the value below
+                        */
+
+                       free = 0;
+               } else {
+                       free = EXT4_INODES_PER_GROUP(sb) -
+                               ext4_itable_unused_count(sb, gdp);
+               }
+
+               /*
+                * Check the relative inode number against the last used
+                * relative inode number in this group. if it is greater
+                * we need to  update the bg_itable_unused count
+                *
+                */
+               if (ino > free)
+                       ext4_itable_unused_set(sb, gdp,
+                                       (EXT4_INODES_PER_GROUP(sb) - ino));
+       }
+       count = ext4_free_inodes_count(sb, gdp) - 1;
+       ext4_free_inodes_set(sb, gdp, count);
+       if (S_ISDIR(mode)) {
+               count = ext4_used_dirs_count(sb, gdp) + 1;
+               ext4_used_dirs_set(sb, gdp, count);
+       }
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+       spin_unlock(sb_bgl_lock(sbi, group));
+       return retval;
+}
+
+/*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 {
        struct super_block *sb;
-       struct buffer_head *bitmap_bh = NULL;
-       struct buffer_head *bh2;
+       struct buffer_head *inode_bitmap_bh = NULL;
+       struct buffer_head *group_desc_bh;
        ext4_group_t group = 0;
        unsigned long ino = 0;
        struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                return ERR_PTR(-EPERM);
 
        sb = dir->i_sb;
+       trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
+                  dir->i_ino, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
        for (i = 0; i < sbi->s_groups_count; i++) {
                err = -EIO;
 
-               gdp = ext4_get_group_desc(sb, group, &bh2);
+               gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
                if (!gdp)
                        goto fail;
 
-               brelse(bitmap_bh);
-               bitmap_bh = ext4_read_inode_bitmap(sb, group);
-               if (!bitmap_bh)
+               brelse(inode_bitmap_bh);
+               inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+               if (!inode_bitmap_bh)
                        goto fail;
 
                ino = 0;
 
 repeat_in_this_group:
                ino = ext4_find_next_zero_bit((unsigned long *)
-                               bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+                                             inode_bitmap_bh->b_data,
+                                             EXT4_INODES_PER_GROUP(sb), ino);
+
                if (ino < EXT4_INODES_PER_GROUP(sb)) {
 
-                       BUFFER_TRACE(bitmap_bh, "get_write_access");
-                       err = ext4_journal_get_write_access(handle, bitmap_bh);
+                       BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+                       err = ext4_journal_get_write_access(handle,
+                                                           inode_bitmap_bh);
                        if (err)
                                goto fail;
 
-                       if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
-                                               ino, bitmap_bh->b_data)) {
+                       BUFFER_TRACE(group_desc_bh, "get_write_access");
+                       err = ext4_journal_get_write_access(handle,
+                                                               group_desc_bh);
+                       if (err)
+                               goto fail;
+                       if (!ext4_claim_inode(sb, inode_bitmap_bh,
+                                               ino, group, mode)) {
                                /* we won it */
-                               BUFFER_TRACE(bitmap_bh,
-                                       "call ext4_journal_dirty_metadata");
-                               err = ext4_journal_dirty_metadata(handle,
-                                                               bitmap_bh);
+                               BUFFER_TRACE(inode_bitmap_bh,
+                                       "call ext4_handle_dirty_metadata");
+                               err = ext4_handle_dirty_metadata(handle,
+                                                                inode,
+                                                       inode_bitmap_bh);
                                if (err)
                                        goto fail;
+                               /* zero bit is inode number 1*/
+                               ino++;
                                goto got;
                        }
                        /* we lost it */
-                       jbd2_journal_release_buffer(handle, bitmap_bh);
+                       ext4_handle_release_buffer(handle, inode_bitmap_bh);
+                       ext4_handle_release_buffer(handle, group_desc_bh);
 
                        if (++ino < EXT4_INODES_PER_GROUP(sb))
                                goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
        goto out;
 
 got:
-       ino++;
-       if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-           ino > EXT4_INODES_PER_GROUP(sb)) {
-               ext4_error(sb, __func__,
-                          "reserved inode or inode > inodes count - "
-                          "block_group = %lu, inode=%lu", group,
-                          ino + group * EXT4_INODES_PER_GROUP(sb));
-               err = -EIO;
-               goto fail;
-       }
-
-       BUFFER_TRACE(bh2, "get_write_access");
-       err = ext4_journal_get_write_access(handle, bh2);
-       if (err) goto fail;
-
        /* We may have to initialize the block bitmap if it isn't already */
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-               struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
+               struct buffer_head *block_bitmap_bh;
 
-               BUFFER_TRACE(block_bh, "get block bitmap access");
-               err = ext4_journal_get_write_access(handle, block_bh);
+               block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+               BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+               err = ext4_journal_get_write_access(handle, block_bitmap_bh);
                if (err) {
-                       brelse(block_bh);
+                       brelse(block_bitmap_bh);
                        goto fail;
                }
 
@@ -715,9 +816,9 @@ got:
                spin_lock(sb_bgl_lock(sbi, group));
                /* recheck and clear flag under lock if we still need to */
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                       gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        free = ext4_free_blocks_after_init(sb, group, gdp);
-                       gdp->bg_free_blocks_count = cpu_to_le16(free);
+                       gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+                       ext4_free_blks_set(sb, gdp, free);
                        gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
                                                                gdp);
                }
@@ -725,55 +826,19 @@ got:
 
                /* Don't need to dirty bitmap block if we didn't change it */
                if (free) {
-                       BUFFER_TRACE(block_bh, "dirty block bitmap");
-                       err = ext4_journal_dirty_metadata(handle, block_bh);
+                       BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+                       err = ext4_handle_dirty_metadata(handle,
+                                                       NULL, block_bitmap_bh);
                }
 
-               brelse(block_bh);
+               brelse(block_bitmap_bh);
                if (err)
                        goto fail;
        }
-
-       spin_lock(sb_bgl_lock(sbi, group));
-       /* If we didn't allocate from within the initialized part of the inode
-        * table then we need to initialize up to this inode. */
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-               if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-                       gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
-                       /* When marking the block group with
-                        * ~EXT4_BG_INODE_UNINIT we don't want to depend
-                        * on the value of bg_itable_unused even though
-                        * mke2fs could have initialized the same for us.
-                        * Instead we calculated the value below
-                        */
-
-                       free = 0;
-               } else {
-                       free = EXT4_INODES_PER_GROUP(sb) -
-                               le16_to_cpu(gdp->bg_itable_unused);
-               }
-
-               /*
-                * Check the relative inode number against the last used
-                * relative inode number in this group. if it is greater
-                * we need to  update the bg_itable_unused count
-                *
-                */
-               if (ino > free)
-                       gdp->bg_itable_unused =
-                               cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
-       }
-
-       le16_add_cpu(&gdp->bg_free_inodes_count, -1);
-       if (S_ISDIR(mode)) {
-               le16_add_cpu(&gdp->bg_used_dirs_count, 1);
-       }
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-       spin_unlock(sb_bgl_lock(sbi, group));
-       BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
-       err = ext4_journal_dirty_metadata(handle, bh2);
-       if (err) goto fail;
+       BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+       err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
+       if (err)
+               goto fail;
 
        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
@@ -825,7 +890,7 @@ got:
 
        ext4_set_inode_flags(inode);
        if (IS_DIRSYNC(inode))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
        if (insert_inode_locked(inode) < 0) {
                err = -EINVAL;
                goto fail_drop;
@@ -852,7 +917,7 @@ got:
        if (err)
                goto fail_free_drop;
 
-       if (test_opt(sb, EXTENTS)) {
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
                        EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -867,6 +932,8 @@ got:
        }
 
        ext4_debug("allocating inode %lu\n", inode->i_ino);
+       trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
+                  sb->s_id, inode->i_ino, dir->i_ino, mode);
        goto really_out;
 fail:
        ext4_std_error(sb, err);
@@ -874,7 +941,7 @@ out:
        iput(inode);
        ret = ERR_PTR(err);
 really_out:
-       brelse(bitmap_bh);
+       brelse(inode_bitmap_bh);
        return ret;
 
 fail_free_drop:
@@ -886,7 +953,7 @@ fail_drop:
        inode->i_nlink = 0;
        unlock_new_inode(inode);
        iput(inode);
-       brelse(bitmap_bh);
+       brelse(inode_bitmap_bh);
        return ERR_PTR(err);
 }
 
@@ -985,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+               desc_count += ext4_free_inodes_count(sb, gdp);
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_inode_bitmap(sb, i);
                if (!bitmap_bh)
@@ -993,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 
                x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-                       i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+                       i, ext4_free_inodes_count(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
@@ -1007,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+               desc_count += ext4_free_inodes_count(sb, gdp);
                cond_resched();
        }
        return desc_count;
@@ -1024,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               count += le16_to_cpu(gdp->bg_used_dirs_count);
+               count += ext4_used_dirs_count(sb, gdp);
        }
        return count;
 }
-
index 98d3fe7..a6444ce 100644 (file)
@@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
  * "bh" may be NULL: a metadata block may have been freed from memory
  * but there may still be a record of it in the journal, and that record
  * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling so there's nothing to do.
  */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                        struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
        int err;
 
+       if (!ext4_handle_valid(handle))
+               return 0;
+
        might_sleep();
 
        BUFFER_TRACE(bh, "enter");
@@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
  */
 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 {
-       if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+       if (!ext4_handle_valid(handle))
+               return 0;
+       if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
                return 0;
        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
                return 0;
@@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  */
 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 {
+       BUG_ON(EXT4_JOURNAL(inode) == NULL);
        jbd_debug(2, "restarting handle %p\n", handle);
        return ext4_journal_restart(handle, blocks_for_truncate(inode));
 }
@@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
        }
 
        if (IS_SYNC(inode))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
@@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
         * enough credits left in the handle to remove the inode from
         * the orphan list and set the dtime field.
         */
-       if (handle->h_buffer_credits < 3) {
+       if (!ext4_handle_has_enough_credits(handle, 3)) {
                err = ext4_journal_extend(handle, 3);
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
@@ -506,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
  *     return the total number of blocks to be allocate, including the
  *     direct and indirect blocks.
  */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
                int blocks_to_boundary)
 {
-       unsigned long count = 0;
+       unsigned int count = 0;
 
        /*
         * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -547,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                                int indirect_blks, int blks,
                                ext4_fsblk_t new_blocks[4], int *err)
 {
+       struct ext4_allocation_request ar;
        int target, i;
        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
@@ -595,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
        if (!target)
                goto allocated;
        /* Now allocate data blocks */
-       count = target;
-       /* allocating blocks for data blocks */
-       current_block = ext4_new_blocks(handle, inode, iblock,
-                                               goal, &count, err);
+       memset(&ar, 0, sizeof(ar));
+       ar.inode = inode;
+       ar.goal = goal;
+       ar.len = target;
+       ar.logical = iblock;
+       if (S_ISREG(inode->i_mode))
+               /* enable in-core preallocation only for regular files */
+               ar.flags = EXT4_MB_HINT_DATA;
+
+       current_block = ext4_mb_new_blocks(handle, &ar, err);
+
        if (*err && (target == blks)) {
                /*
                 * if the allocation failed and we didn't allocate
@@ -614,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                 */
                        new_blocks[index] = current_block;
                }
-               blk_allocated += count;
+               blk_allocated += ar.len;
        }
 allocated:
        /* total number of blocks allocated for direct blocks */
@@ -709,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
 
-               BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-               err = ext4_journal_dirty_metadata(handle, bh);
+               BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+               err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto failed;
        }
@@ -792,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
                 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
                 */
                jbd_debug(5, "splicing indirect only\n");
-               BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
-               err = ext4_journal_dirty_metadata(handle, where->bh);
+               BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+               err = ext4_handle_dirty_metadata(handle, inode, where->bh);
                if (err)
                        goto err_out;
        } else {
@@ -840,10 +856,10 @@ err_out:
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-               ext4_lblk_t iblock, unsigned long maxblocks,
-               struct buffer_head *bh_result,
-               int create, int extend_disksize)
+static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+                                 ext4_lblk_t iblock, unsigned int maxblocks,
+                                 struct buffer_head *bh_result,
+                                 int create, int extend_disksize)
 {
        int err = -EIO;
        ext4_lblk_t offsets[4];
@@ -1045,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  * It returns the error in case of allocation failure.
  */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
-                       unsigned long max_blocks, struct buffer_head *bh,
+                       unsigned int max_blocks, struct buffer_head *bh,
                        int create, int extend_disksize, int flag)
 {
        int retval;
@@ -1221,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                set_buffer_uptodate(bh);
                        }
                        unlock_buffer(bh);
-                       BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-                       err = ext4_journal_dirty_metadata(handle, bh);
+                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                       err = ext4_handle_dirty_metadata(handle, inode, bh);
                        if (!fatal)
                                fatal = err;
                } else {
@@ -1335,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        pgoff_t index;
        unsigned from, to;
 
+       trace_mark(ext4_write_begin,
+                  "dev %s ino %lu pos %llu len %u flags %u",
+                  inode->i_sb->s_id, inode->i_ino,
+                  (unsigned long long) pos, len, flags);
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1387,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        set_buffer_uptodate(bh);
-       return ext4_journal_dirty_metadata(handle, bh);
+       return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 
 /*
@@ -1406,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
 
+       trace_mark(ext4_ordered_write_end,
+                  "dev %s ino %lu pos %llu len %u copied %u",
+                  inode->i_sb->s_id, inode->i_ino,
+                  (unsigned long long) pos, len, copied);
        ret = ext4_jbd2_file_inode(handle, inode);
 
        if (ret == 0) {
@@ -1444,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
        int ret = 0, ret2;
        loff_t new_i_size;
 
+       trace_mark(ext4_writeback_write_end,
+                  "dev %s ino %lu pos %llu len %u copied %u",
+                  inode->i_sb->s_id, inode->i_ino,
+                  (unsigned long long) pos, len, copied);
        new_i_size = pos + copied;
        if (new_i_size > EXT4_I(inode)->i_disksize) {
                ext4_update_i_disksize(inode, new_i_size);
@@ -1479,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
        unsigned from, to;
        loff_t new_i_size;
 
+       trace_mark(ext4_journalled_write_end,
+                  "dev %s ino %lu pos %llu len %u copied %u",
+                  inode->i_sb->s_id, inode->i_ino,
+                  (unsigned long long) pos, len, copied);
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
 
@@ -1625,7 +1657,7 @@ struct mpage_da_data {
        get_block_t *get_block;
        struct writeback_control *wbc;
        int io_done;
-       long pages_written;
+       int pages_written;
        int retval;
 };
 
@@ -1645,35 +1677,39 @@ struct mpage_da_data {
  */
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
-       struct address_space *mapping = mpd->inode->i_mapping;
-       int ret = 0, err, nr_pages, i;
-       unsigned long index, end;
-       struct pagevec pvec;
        long pages_skipped;
+       struct pagevec pvec;
+       unsigned long index, end;
+       int ret = 0, err, nr_pages, i;
+       struct inode *inode = mpd->inode;
+       struct address_space *mapping = inode->i_mapping;
 
        BUG_ON(mpd->next_page <= mpd->first_page);
-       pagevec_init(&pvec, 0);
+       /*
+        * We need to start from the first_page to the next_page - 1
+        * to make sure we also write the mapped dirty buffer_heads.
+        * If we look at mpd->lbh.b_blocknr we would only be looking
+        * at the currently mapped buffer_heads.
+        */
        index = mpd->first_page;
        end = mpd->next_page - 1;
 
+       pagevec_init(&pvec, 0);
        while (index <= end) {
-               /*
-                * We can use PAGECACHE_TAG_DIRTY lookup here because
-                * even though we have cleared the dirty flag on the page
-                * We still keep the page in the radix tree with tag
-                * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
-                * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
-                * which is called via the below writepage callback.
-                */
-               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                                       PAGECACHE_TAG_DIRTY,
-                                       min(end - index,
-                                       (pgoff_t)PAGEVEC_SIZE-1) + 1);
+               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
+                       index = page->index;
+                       if (index > end)
+                               break;
+                       index++;
+
+                       BUG_ON(!PageLocked(page));
+                       BUG_ON(PageWriteback(page));
+
                        pages_skipped = mpd->wbc->pages_skipped;
                        err = mapping->a_ops->writepage(page, mpd->wbc);
                        if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1831,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
                        ext4_count_free_blocks(inode->i_sb));
        printk(KERN_EMERG "Free/Dirty block details\n");
        printk(KERN_EMERG "free_blocks=%lld\n",
-                       percpu_counter_sum(&sbi->s_freeblocks_counter));
+                       (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
        printk(KERN_EMERG "dirty_blocks=%lld\n",
-                       percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+                       (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
        printk(KERN_EMERG "Block reservation details\n");
-       printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+       printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
                        EXT4_I(inode)->i_reserved_data_blocks);
-       printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+       printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
                        EXT4_I(inode)->i_reserved_meta_blocks);
        return;
 }
@@ -2087,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
                bh = head;
                do {
                        BUG_ON(buffer_locked(bh));
+                       /*
+                        * We need to try to allocate
+                        * unmapped blocks in the same page.
+                        * Otherwise we won't make progress
+                        * with the page in ext4_da_writepage
+                        */
                        if (buffer_dirty(bh) &&
                                (!buffer_mapped(bh) || buffer_delay(bh))) {
                                mpage_add_bh_to_extent(mpd, logical, bh);
                                if (mpd->io_done)
                                        return MPAGE_DA_EXTENT_TAIL;
+                       } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+                               /*
+                                * mapped dirty buffer. We need to update
+                                * the b_state because we look at
+                                * b_state in mpage_da_map_blocks. We don't
+                                * update b_size because if we find an
+                                * unmapped buffer_head later we need to
+                                * use the b_state flag of that buffer_head.
+                                */
+                               if (mpd->lbh.b_size == 0)
+                                       mpd->lbh.b_state =
+                                               bh->b_state & BH_FLAGS;
                        }
                        logical++;
                } while ((bh = bh->b_this_page) != head);
@@ -2269,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
 {
        int ret = 0;
        loff_t size;
-       unsigned long len;
+       unsigned int len;
        struct buffer_head *page_bufs;
        struct inode *inode = page->mapping->host;
 
+       trace_mark(ext4_da_writepage,
+                  "dev %s ino %lu page_index %lu",
+                  inode->i_sb->s_id, inode->i_ino, page->index);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2378,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
        int no_nrwrite_index_update;
-       long pages_written = 0, pages_skipped;
+       int pages_written = 0;
+       long pages_skipped;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
+       trace_mark(ext4_da_writepages,
+                  "dev %s ino %lu nr_t_write %ld "
+                  "pages_skipped %ld range_start %llu "
+                  "range_end %llu nonblocking %d "
+                  "for_kupdate %d for_reclaim %d "
+                  "for_writepages %d range_cyclic %d",
+                  inode->i_sb->s_id, inode->i_ino,
+                  wbc->nr_to_write, wbc->pages_skipped,
+                  (unsigned long long) wbc->range_start,
+                  (unsigned long long) wbc->range_end,
+                  wbc->nonblocking, wbc->for_kupdate,
+                  wbc->for_reclaim, wbc->for_writepages,
+                  wbc->range_cyclic);
+
        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
@@ -2389,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
         */
        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;
+
+       /*
+        * If the filesystem has aborted, it is read-only, so return
+        * right away instead of dumping stack traces later on that
+        * will obscure the real source of the problem.  We test
+        * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+        * the latter could be true if the filesystem is mounted
+        * read-only, and in that case, ext4_da_writepages should
+        * *never* be called, so if that ever happens, we would want
+        * the stack trace.
+        */
+       if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+               return -EROFS;
+
        /*
         * Make sure nr_to_write is >= sbi->s_mb_stream_request
         * This make sure small files blocks are allocated in
@@ -2433,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
                handle = ext4_journal_start(inode, needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
-                       printk(KERN_EMERG "%s: jbd2_start: "
+                       printk(KERN_CRIT "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d\n", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        dump_stack();
@@ -2486,6 +2572,14 @@ out_writepages:
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
        wbc->nr_to_write -= nr_to_writebump;
+       trace_mark(ext4_da_writepage_result,
+                  "dev %s ino %lu ret %d pages_written %d "
+                  "pages_skipped %ld congestion %d "
+                  "more_io %d no_nrwrite_index_update %d",
+                  inode->i_sb->s_id, inode->i_ino, ret,
+                  pages_written, wbc->pages_skipped,
+                  wbc->encountered_congestion, wbc->more_io,
+                  wbc->no_nrwrite_index_update);
        return ret;
 }
 
@@ -2537,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                        len, flags, pagep, fsdata);
        }
        *fsdata = (void *)0;
+
+       trace_mark(ext4_da_write_begin,
+                  "dev %s ino %lu pos %llu len %u flags %u",
+                  inode->i_sb->s_id, inode->i_ino,
+                  (unsigned long long) pos, len, flags);
 retry:
        /*
         * With delayed allocation, we don't log the i_disksize update
@@ -2626,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
                }
        }
 
+       trace_mark(ext4_da_write_end,
+                  "dev %s ino %lu pos %llu len %u copied %u",
+                  inode->i_sb->s_id, inode->i_ino,
+                  (unsigned long long) pos, len, copied);
        start = pos & (PAGE_CACHE_SIZE - 1);
        end = start + copied - 1;
 
@@ -2718,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                filemap_write_and_wait(mapping);
        }
 
-       if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+       BUG_ON(!EXT4_JOURNAL(inode) &&
+              EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
+
+       if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
                /*
                 * This is a REALLY heavyweight approach, but the use of
                 * bmap on dirty files is expected to be extremely rare:
@@ -2836,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
        loff_t size = i_size_read(inode);
        loff_t len;
 
+       trace_mark(ext4_normal_writepage,
+                  "dev %s ino %lu page_index %lu",
+                  inode->i_sb->s_id, inode->i_ino, page->index);
        J_ASSERT(PageLocked(page));
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2921,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
        loff_t size = i_size_read(inode);
        loff_t len;
 
+       trace_mark(ext4_journalled_writepage,
+                  "dev %s ino %lu page_index %lu",
+                  inode->i_sb->s_id, inode->i_ino, page->index);
        J_ASSERT(PageLocked(page));
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2989,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
        if (offset == 0)
                ClearPageChecked(page);
 
-       jbd2_journal_invalidatepage(journal, page, offset);
+       if (journal)
+               jbd2_journal_invalidatepage(journal, page, offset);
+       else
+               block_invalidatepage(page, offset);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2999,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
        WARN_ON(PageChecked(page));
        if (!page_has_buffers(page))
                return 0;
-       return jbd2_journal_try_to_free_buffers(journal, page, wait);
+       if (journal)
+               return jbd2_journal_try_to_free_buffers(journal, page, wait);
+       else
+               return try_to_free_buffers(page);
 }
 
 /*
@@ -3271,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle,
 
        err = 0;
        if (ext4_should_journal_data(inode)) {
-               err = ext4_journal_dirty_metadata(handle, bh);
+               err = ext4_handle_dirty_metadata(handle, inode, bh);
        } else {
                if (ext4_should_order_data(inode))
                        err = ext4_jbd2_file_inode(handle, inode);
@@ -3395,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
        __le32 *p;
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
-                       BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-                       ext4_journal_dirty_metadata(handle, bh);
+                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                       ext4_handle_dirty_metadata(handle, inode, bh);
                }
                ext4_mark_inode_dirty(handle, inode);
                ext4_journal_test_restart(handle, inode);
@@ -3496,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                                  count, block_to_free_p, p);
 
        if (this_bh) {
-               BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
+               BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
 
                /*
                 * The buffer head should have an attached journal head at this
@@ -3505,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                 * the block was cleared. Check for this instead of OOPSing.
                 */
                if (bh2jh(this_bh))
-                       ext4_journal_dirty_metadata(handle, this_bh);
+                       ext4_handle_dirty_metadata(handle, inode, this_bh);
                else
                        ext4_error(inode->i_sb, __func__,
                                   "circular indirect block detected, "
@@ -3535,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
        ext4_fsblk_t nr;
        __le32 *p;
 
-       if (is_handle_aborted(handle))
+       if (ext4_handle_is_aborted(handle))
                return;
 
        if (depth--) {
@@ -3605,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                         * will merely complain about releasing a free block,
                         * rather than leaking blocks.
                         */
-                       if (is_handle_aborted(handle))
+                       if (ext4_handle_is_aborted(handle))
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext4_mark_inode_dirty(handle, inode);
@@ -3624,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                                                   parent_bh)){
                                        *p = 0;
                                        BUFFER_TRACE(parent_bh,
-                                       "call ext4_journal_dirty_metadata");
-                                       ext4_journal_dirty_metadata(handle,
-                                                                   parent_bh);
+                                       "call ext4_handle_dirty_metadata");
+                                       ext4_handle_dirty_metadata(handle,
+                                                                  inode,
+                                                                  parent_bh);
                                }
                        }
                }
@@ -3814,7 +3933,7 @@ do_indirects:
         * synchronous
         */
        if (IS_SYNC(inode))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 out_stop:
        /*
         * If this was a simple ftruncate(), and the file will remain alive
@@ -3844,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
        ext4_fsblk_t            block;
        int                     inodes_per_block, inode_offset;
 
-       iloc->bh = 0;
+       iloc->bh = NULL;
        if (!ext4_valid_inum(sb, inode->i_ino))
                return -EIO;
 
@@ -3951,7 +4070,7 @@ make_io:
                        num = EXT4_INODES_PER_GROUP(sb);
                        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
-                               num -= le16_to_cpu(gdp->bg_itable_unused);
+                               num -= ext4_itable_unused_count(sb, gdp);
                        table += num / inodes_per_block;
                        if (end > table)
                                end = table;
@@ -4313,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle,
                        EXT4_SET_RO_COMPAT_FEATURE(sb,
                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
                        sb->s_dirt = 1;
-                       handle->h_sync = 1;
-                       err = ext4_journal_dirty_metadata(handle,
+                       ext4_handle_sync(handle);
+                       err = ext4_handle_dirty_metadata(handle, inode,
                                        EXT4_SB(sb)->s_sbh);
                }
        }
@@ -4341,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle,
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
        }
 
-
-       BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-       rc = ext4_journal_dirty_metadata(handle, bh);
+       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+       rc = ext4_handle_dirty_metadata(handle, inode, bh);
        if (!err)
                err = rc;
        ei->i_state &= ~EXT4_STATE_NEW;
@@ -4406,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait)
        return ext4_force_commit(inode->i_sb);
 }
 
+int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
+{
+       int err = 0;
+
+       mark_buffer_dirty(bh);
+       if (inode && inode_needs_sync(inode)) {
+               sync_dirty_buffer(bh);
+               if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                       ext4_error(inode->i_sb, __func__,
+                                  "IO error syncing inode, "
+                                  "inode=%lu, block=%llu",
+                                  inode->i_ino,
+                                  (unsigned long long)bh->b_blocknr);
+                       err = -EIO;
+               }
+       }
+       return err;
+}
+
 /*
  * ext4_setattr()
  *
@@ -4710,16 +4847,15 @@ int
 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                         struct ext4_iloc *iloc)
 {
-       int err = 0;
-       if (handle) {
-               err = ext4_get_inode_loc(inode, iloc);
-               if (!err) {
-                       BUFFER_TRACE(iloc->bh, "get_write_access");
-                       err = ext4_journal_get_write_access(handle, iloc->bh);
-                       if (err) {
-                               brelse(iloc->bh);
-                               iloc->bh = NULL;
-                       }
+       int err;
+
+       err = ext4_get_inode_loc(inode, iloc);
+       if (!err) {
+               BUFFER_TRACE(iloc->bh, "get_write_access");
+               err = ext4_journal_get_write_access(handle, iloc->bh);
+               if (err) {
+                       brelse(iloc->bh);
+                       iloc->bh = NULL;
                }
        }
        ext4_std_error(inode->i_sb, err);
@@ -4791,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 
        might_sleep();
        err = ext4_reserve_inode_write(handle, inode, &iloc);
-       if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+       if (ext4_handle_valid(handle) &&
+           EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
            !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
                /*
                 * We need extra buffer credits since we may write into EA block
@@ -4843,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode)
        handle_t *current_handle = ext4_journal_current_handle();
        handle_t *handle;
 
+       if (!ext4_handle_valid(current_handle)) {
+               ext4_mark_inode_dirty(current_handle, inode);
+               return;
+       }
+
        handle = ext4_journal_start(inode, 2);
        if (IS_ERR(handle))
                goto out;
@@ -4880,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
                        BUFFER_TRACE(iloc.bh, "get_write_access");
                        err = jbd2_journal_get_write_access(handle, iloc.bh);
                        if (!err)
-                               err = ext4_journal_dirty_metadata(handle,
-                                                                 iloc.bh);
+                               err = ext4_handle_dirty_metadata(handle,
+                                                                inode,
+                                                                iloc.bh);
                        brelse(iloc.bh);
                }
        }
@@ -4907,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         */
 
        journal = EXT4_JOURNAL(inode);
+       if (!journal)
+               return 0;
        if (is_journal_aborted(journal))
                return -EROFS;
 
@@ -4936,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                return PTR_ERR(handle);
 
        err = ext4_mark_inode_dirty(handle, inode);
-       handle->h_sync = 1;
+       ext4_handle_sync(handle);
        ext4_journal_stop(handle);
        ext4_std_error(inode->i_sb, err);
 
index dc99b47..42dc83f 100644 (file)
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        goto flags_out;
                }
                if (IS_SYNC(inode))
-                       handle->h_sync = 1;
+                       ext4_handle_sync(handle);
                err = ext4_reserve_inode_write(handle, inode, &iloc);
                if (err)
                        goto flags_err;
index 444ad99..918aec0 100644 (file)
  * inode as:
  *
  *  {                        page                        }
- *  [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  *
  *
  * one block each for bitmap and buddy information.  So for each group we
  *        object
  *
  */
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                       ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+                                               ext4_group_t group);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+
+
 
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr += first + i;
                        blocknr +=
                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
-                       ext4_error(sb, __func__, "double-free of inode"
-                                  " %lu's block %llu(bit %u in group %lu)\n",
+                       ext4_grp_locked_error(sb, e4b->bd_group,
+                                  __func__, "double-free of inode"
+                                  " %lu's block %llu(bit %u in group %u)",
                                   inode ? inode->i_ino : 0, blocknr,
                                   first + i, e4b->bd_group);
                }
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
                b2 = (unsigned char *) bitmap;
                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
                        if (b1[i] != b2[i]) {
-                               printk(KERN_ERR "corruption in group %lu "
+                               printk(KERN_ERR "corruption in group %u "
                                       "at byte %u(%u): %x in copy != %x "
                                       "on disk/prealloc\n",
                                       e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
        grp->bb_fragments = fragments;
 
        if (free != grp->bb_free) {
-               ext4_error(sb, __func__,
-                       "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+               ext4_grp_locked_error(sb, group,  __func__,
+                       "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
                        group, free, grp->bb_free);
                /*
                 * If we intent to continue, we consider group descritor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
  * stored in the inode as
  *
  * {                        page                        }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  *
  *
  * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                if (bh[i] == NULL)
                        goto out;
 
-               if (buffer_uptodate(bh[i]) &&
-                   !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+               if (bitmap_uptodate(bh[i]))
                        continue;
 
                lock_buffer(bh[i]);
+               if (bitmap_uptodate(bh[i])) {
+                       unlock_buffer(bh[i]);
+                       continue;
+               }
                spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                        ext4_init_block_bitmap(sb, bh[i],
                                                first_group + i, desc);
+                       set_bitmap_uptodate(bh[i]);
                        set_buffer_uptodate(bh[i]);
-                       unlock_buffer(bh[i]);
                        spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+                       unlock_buffer(bh[i]);
                        continue;
                }
                spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+               if (buffer_uptodate(bh[i])) {
+                       /*
+                        * if not uninit if bh is uptodate,
+                        * bitmap is also uptodate
+                        */
+                       set_bitmap_uptodate(bh[i]);
+                       unlock_buffer(bh[i]);
+                       continue;
+               }
                get_bh(bh[i]);
+               /*
+                * submit the buffer_head for read. We can
+                * safely mark the bitmap as uptodate now.
+                * We do it here so the bitmap uptodate bit
+                * get set with buffer lock held.
+                */
+               set_bitmap_uptodate(bh[i]);
                bh[i]->b_end_io = end_buffer_read_sync;
                submit_bh(READ, bh[i]);
-               mb_debug("read bitmap for group %lu\n", first_group + i);
+               mb_debug("read bitmap for group %u\n", first_group + i);
        }
 
        /* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
        err = 0;
        first_block = page->index * blocks_per_page;
+       /* init the page  */
+       memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
        for (i = 0; i < blocks_per_page; i++) {
                int group;
                struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        BUG_ON(incore == NULL);
                        mb_debug("put buddy for group %u in page %lu/%x\n",
                                group, page->index, i * blocksize);
-                       memset(data, 0xff, blocksize);
                        grinfo = ext4_get_group_info(sb, group);
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        /*
                         * incore got set to the group block bitmap below
                         */
+                       ext4_lock_group(sb, group);
                        ext4_mb_generate_buddy(sb, data, incore, group);
+                       ext4_unlock_group(sb, group);
                        incore = NULL;
                } else {
                        /* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
                        /* mark all preallocated blks used in in-core bitmap */
                        ext4_mb_generate_from_pa(sb, data, group);
+                       ext4_mb_generate_from_freelist(sb, data, group);
                        ext4_unlock_group(sb, group);
 
                        /* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                                        struct ext4_buddy *e4b)
 {
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct inode *inode = sbi->s_buddy_cache;
        int blocks_per_page;
        int block;
        int pnum;
        int poff;
        struct page *page;
        int ret;
+       struct ext4_group_info *grp;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct inode *inode = sbi->s_buddy_cache;
 
-       mb_debug("load group %lu\n", group);
+       mb_debug("load group %u\n", group);
 
        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       grp = ext4_get_group_info(sb, group);
 
        e4b->bd_blkbits = sb->s_blocksize_bits;
        e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        e4b->bd_group = group;
        e4b->bd_buddy_page = NULL;
        e4b->bd_bitmap_page = NULL;
+       e4b->alloc_semp = &grp->alloc_sem;
+
+       /* Take the read lock on the group alloc
+        * sem. This would make sure a parallel
+        * ext4_mb_init_group happening on other
+        * groups mapped by the page is blocked
+        * till we are done with allocation
+        */
+       down_read(e4b->alloc_semp);
 
        /*
         * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        page = find_get_page(inode->i_mapping, pnum);
        if (page == NULL || !PageUptodate(page)) {
                if (page)
+                       /*
+                        * drop the page reference and try
+                        * to get the page with lock. If we
+                        * are not uptodate that implies
+                        * somebody just created the page but
+                        * is yet to initialize the same. So
+                        * wait for it to initialize.
+                        */
                        page_cache_release(page);
                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
                if (page) {
@@ -985,6 +1040,9 @@ err:
                page_cache_release(e4b->bd_buddy_page);
        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
+
+       /* Done with the buddy cache */
+       up_read(e4b->alloc_semp);
        return ret;
 }
 
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
                page_cache_release(e4b->bd_bitmap_page);
        if (e4b->bd_buddy_page)
                page_cache_release(e4b->bd_buddy_page);
+       /* Done with the buddy cache */
+       if (e4b->alloc_semp)
+               up_read(e4b->alloc_semp);
 }
 
 
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-               mb_clear_bit_atomic(lock, cur, bm);
+               if (lock)
+                       mb_clear_bit_atomic(lock, cur, bm);
+               else
+                       mb_clear_bit(cur, bm);
                cur++;
        }
 }
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-               mb_set_bit_atomic(lock, cur, bm);
+               if (lock)
+                       mb_set_bit_atomic(lock, cur, bm);
+               else
+                       mb_set_bit(cur, bm);
                cur++;
        }
 }
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr += block;
                        blocknr +=
                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-                       ext4_unlock_group(sb, e4b->bd_group);
-                       ext4_error(sb, __func__, "double-free of inode"
-                                  " %lu's block %llu(bit %u in group %lu)\n",
+                       ext4_grp_locked_error(sb, e4b->bd_group,
+                                  __func__, "double-free of inode"
+                                  " %lu's block %llu(bit %u in group %u)",
                                   inode ? inode->i_ino : 0, blocknr, block,
                                   e4b->bd_group);
-                       ext4_lock_group(sb, e4b->bd_group);
                }
                mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
                e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
        ac->ac_tail = ret & 0xffff;
        ac->ac_buddy = ret >> 16;
 
-       /* XXXXXXX: SUCH A HORRIBLE **CK */
-       /*FIXME!! Why ? */
+       /*
+        * take the page reference. We want the page to be pinned
+        * so that we don't get a ext4_mb_init_cache_call for this
+        * group until we update the bitmap. That would mean we
+        * double allocate blocks. The reference is dropped
+        * in ext4_mb_release_context
+        */
        ac->ac_bitmap_page = e4b->bd_bitmap_page;
        get_page(ac->ac_bitmap_page);
        ac->ac_buddy_page = e4b->bd_buddy_page;
        get_page(ac->ac_buddy_page);
-
+       /* on allocation we use ac to track the held semaphore */
+       ac->alloc_semp =  e4b->alloc_semp;
+       e4b->alloc_semp = NULL;
        /* store last allocated for subsequent stream allocation */
        if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
                spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
        struct ext4_free_extent ex;
        int max;
 
+       if (ac->ac_status == AC_STATUS_FOUND)
+               return;
        /*
         * We don't want to scan for a whole year
         */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                         * free blocks even though group info says we
                         * we have free blocks
                         */
-                       ext4_error(sb, __func__, "%d free blocks as per "
-                                       "group info. But bitmap says 0\n",
+                       ext4_grp_locked_error(sb, e4b->bd_group,
+                                       __func__, "%d free blocks as per "
+                                       "group info. But bitmap says 0",
                                        free);
                        break;
                }
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
                BUG_ON(ex.fe_len <= 0);
                if (free < ex.fe_len) {
-                       ext4_error(sb, __func__, "%d free blocks as per "
-                                       "group info. But got %d blocks\n",
+                       ext4_grp_locked_error(sb, e4b->bd_group,
+                                       __func__, "%d free blocks as per "
+                                       "group info. But got %d blocks",
                                        free, ex.fe_len);
                        /*
                         * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        return 0;
 }
 
+/*
+ * lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * make sure other parallel operation on the buddy
+ * cache doesn't happen  whild holding the buddy cache
+ * lock
+ */
+int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
+{
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       int groups_per_page;
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+
+       groups_per_page = blocks_per_page >> 1;
+       if (groups_per_page == 0)
+               groups_per_page = 1;
+       /* read all groups the page covers into the cache */
+       for (i = 0; i < groups_per_page; i++) {
+
+               if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+                       break;
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* take all groups write allocation
+                * semaphore. This make sure there is
+                * no block allocation going on in any
+                * of that groups
+                */
+               down_write_nested(&grp->alloc_sem, i);
+       }
+       return i;
+}
+
+void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+                                       ext4_group_t group, int locked_group)
+{
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+       /* release locks on all the groups */
+       for (i = 0; i < locked_group; i++) {
+
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* take all groups write allocation
+                * semaphore. This make sure there is
+                * no block allocation going on in any
+                * of that groups
+                */
+               up_write(&grp->alloc_sem);
+       }
+
+}
+
+static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+       int ret;
+       void *bitmap;
+       int blocks_per_page;
+       int block, pnum, poff;
+       int num_grp_locked = 0;
+       struct ext4_group_info *this_grp;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct inode *inode = sbi->s_buddy_cache;
+       struct page *page = NULL, *bitmap_page = NULL;
+
+       mb_debug("init group %lu\n", group);
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       this_grp = ext4_get_group_info(sb, group);
+       /*
+        * This ensures we don't add group
+        * to this buddy cache via resize
+        */
+       num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
+       if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+               /*
+                * somebody initialized the group
+                * return without doing anything
+                */
+               ret = 0;
+               goto err;
+       }
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (page) {
+               BUG_ON(page->mapping != inode->i_mapping);
+               ret = ext4_mb_init_cache(page, NULL);
+               if (ret) {
+                       unlock_page(page);
+                       goto err;
+               }
+               unlock_page(page);
+       }
+       if (page == NULL || !PageUptodate(page)) {
+               ret = -EIO;
+               goto err;
+       }
+       mark_page_accessed(page);
+       bitmap_page = page;
+       bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+       /* init buddy cache */
+       block++;
+       pnum = block / blocks_per_page;
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (page == bitmap_page) {
+               /*
+                * If both the bitmap and buddy are in
+                * the same page we don't need to force
+                * init the buddy
+                */
+               unlock_page(page);
+       } else if (page) {
+               BUG_ON(page->mapping != inode->i_mapping);
+               ret = ext4_mb_init_cache(page, bitmap);
+               if (ret) {
+                       unlock_page(page);
+                       goto err;
+               }
+               unlock_page(page);
+       }
+       if (page == NULL || !PageUptodate(page)) {
+               ret = -EIO;
+               goto err;
+       }
+       mark_page_accessed(page);
+err:
+       ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+       if (bitmap_page)
+               page_cache_release(bitmap_page);
+       if (page)
+               page_cache_release(page);
+       return ret;
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -1775,7 +2019,7 @@ repeat:
                                group = 0;
 
                        /* quick check to skip empty groups */
-                       grp = ext4_get_group_info(ac->ac_sb, group);
+                       grp = ext4_get_group_info(sb, group);
                        if (grp->bb_free == 0)
                                continue;
 
@@ -1788,10 +2032,9 @@ repeat:
                                 * we need full data about the group
                                 * to make a good selection
                                 */
-                               err = ext4_mb_load_buddy(sb, group, &e4b);
+                               err = ext4_mb_init_group(sb, group);
                                if (err)
                                        goto out;
-                               ext4_mb_release_desc(&e4b);
                        }
 
                        /*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
        if (hs->op == EXT4_MB_HISTORY_ALLOC) {
                fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
                        "%-5u %-5s %-5u %-6u\n";
-               sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+               sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len,
                        hs->result.fe_logical);
-               sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+               sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
                        hs->orig.fe_start, hs->orig.fe_len,
                        hs->orig.fe_logical);
-               sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
+               sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
                        hs->goal.fe_start, hs->goal.fe_len,
                        hs->goal.fe_logical);
                seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
                                hs->buddy ? 1 << hs->buddy : 0);
        } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
                fmt = "%-5u %-8u %-23s %-23s %-23s\n";
-               sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+               sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len,
                        hs->result.fe_logical);
-               sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+               sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
                        hs->orig.fe_start, hs->orig.fe_len,
                        hs->orig.fe_logical);
                seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
        } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
-               sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+               sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len);
                seq_printf(seq, "%-5u %-8u %-23s discard\n",
                                hs->pid, hs->ino, buf2);
        } else if (hs->op == EXT4_MB_HISTORY_FREE) {
-               sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+               sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len);
                seq_printf(seq, "%-5u %-8u %-23s free\n",
                                hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
                return NULL;
 
        group = *pos + 1;
-       return (void *) group;
+       return (void *) ((unsigned long) group);
 }
 
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
        if (*pos < 0 || *pos >= sbi->s_groups_count)
                return NULL;
        group = *pos + 1;
-       return (void *) group;;
+       return (void *) ((unsigned long) group);
 }
 
 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 {
        struct super_block *sb = seq->private;
-       long group = (long) v;
+       ext4_group_t group = (ext4_group_t) ((unsigned long) v);
        int i;
        int err;
        struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
                sizeof(struct ext4_group_info);
        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err) {
-               seq_printf(seq, "#%-5lu: I/O error\n", group);
+               seq_printf(seq, "#%-5u: I/O error\n", group);
                return 0;
        }
        ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
        ext4_unlock_group(sb, group);
        ext4_mb_release_desc(&e4b);
 
-       seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+       seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
                        sg.info.bb_fragments, sg.info.bb_first_free);
        for (i = 0; i <= 13; i++)
                seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                        ext4_free_blocks_after_init(sb, group, desc);
        } else {
                meta_group_info[i]->bb_free =
-                       le16_to_cpu(desc->bg_free_blocks_count);
+                       ext4_free_blks_count(sb, desc);
        }
 
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+       init_rwsem(&meta_group_info[i]->alloc_sem);
        meta_group_info[i]->bb_free_root.rb_node = NULL;;
 
 #ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
 } /* ext4_mb_add_groupinfo */
 
 /*
- * Add a group to the existing groups.
- * This function is used for online resize
- */
-int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
-                              struct ext4_group_desc *desc)
-{
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct inode *inode = sbi->s_buddy_cache;
-       int blocks_per_page;
-       int block;
-       int pnum;
-       struct page *page;
-       int err;
-
-       /* Add group based on group descriptor*/
-       err = ext4_mb_add_groupinfo(sb, group, desc);
-       if (err)
-               return err;
-
-       /*
-        * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
-        * datas) are set not up to date so that they will be re-initilaized
-        * during the next call to ext4_mb_load_buddy
-        */
-
-       /* Set buddy page as not up to date */
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       page = find_get_page(inode->i_mapping, pnum);
-       if (page != NULL) {
-               ClearPageUptodate(page);
-               page_cache_release(page);
-       }
-
-       /* Set bitmap page as not up to date */
-       block++;
-       pnum = block / blocks_per_page;
-       page = find_get_page(inode->i_mapping, pnum);
-       if (page != NULL) {
-               ClearPageUptodate(page);
-               page_cache_release(page);
-       }
-
-       return 0;
-}
-
-/*
  * Update an existing group.
  * This function is used for online resize
  */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        printk(KERN_ERR
-                               "EXT4-fs: can't read descriptor %lu\n", i);
+                               "EXT4-fs: can't read descriptor %u\n", i);
                        goto err_freebuddy;
                }
                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        if (sbi->s_mb_offsets == NULL) {
                return -ENOMEM;
        }
+
+       i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
                kfree(sbi->s_mb_maxs);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        ext4_mb_init_per_dev_proc(sb);
        ext4_mb_history_init(sb);
 
-       sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+       if (sbi->s_journal)
+               sbi->s_journal->j_commit_callback = release_blocks_on_commit;
 
        printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
        return 0;
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        list_for_each_safe(l, ltmp, &txn->t_private_list) {
                entry = list_entry(l, struct ext4_free_data, list);
 
-               mb_debug("gonna free %u blocks in group %lu (0x%p):",
+               mb_debug("gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
 
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
                        + entry->start_blk
                        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-               trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
-                          (unsigned long long) discard_block, entry->count);
+               trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
+                          sb->s_id, (unsigned long long) discard_block,
+                          entry->count);
                sb_issue_discard(sb, discard_block, entry->count);
 
                kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
  */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-                               handle_t *handle, unsigned long reserv_blks)
+                               handle_t *handle, unsigned int reserv_blks)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_super_block *es;
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (!gdp)
                goto out_err;
 
-       ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+       ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
                        gdp->bg_free_blocks_count);
 
        err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
            in_range(block + len - 1, ext4_inode_table(sb, gdp),
                     EXT4_SB(sb)->s_itb_per_group)) {
                ext4_error(sb, __func__,
-                          "Allocating block in system zone - block = %llu",
-                          block);
+                          "Allocating block %llu in system zone of %d group\n",
+                          block, ac->ac_b_ex.fe_group);
                /* File system mounted not to panic on error
                 * Fix the bitmap and repeat the block allocation
                 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
                                bitmap_bh->b_data, ac->ac_b_ex.fe_start,
                                ac->ac_b_ex.fe_len);
-               err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+               err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!err)
                        err = -EAGAIN;
                goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                }
        }
 #endif
-       mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
-                               ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
-
        spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+       mb_set_bits(NULL, bitmap_bh->b_data,
+                               ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-               gdp->bg_free_blocks_count =
-                       cpu_to_le16(ext4_free_blocks_after_init(sb,
-                                               ac->ac_b_ex.fe_group,
-                                               gdp));
+               ext4_free_blks_set(sb, gdp,
+                                       ext4_free_blocks_after_init(sb,
+                                       ac->ac_b_ex.fe_group, gdp));
        }
-       le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+       len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
+       ext4_free_blks_set(sb, gdp, len);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
        spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
 
-       err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        if (err)
                goto out_err;
-       err = ext4_journal_dirty_metadata(handle, gdp_bh);
+       err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
 
 out_err:
        sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        /* check we don't cross already preallocated blocks */
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-               unsigned long pa_end;
+               ext4_lblk_t pa_end;
 
                if (pa->pa_deleted)
                        continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        /* XXX: extra loop to check we really don't overlap preallocations */
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-               unsigned long pa_end;
+               ext4_lblk_t pa_end;
                spin_lock(&pa->pa_lock);
                if (pa->pa_deleted == 0) {
                        pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 }
 
 /*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+                                               ext4_group_t group)
+{
+       struct rb_node *n;
+       struct ext4_group_info *grp;
+       struct ext4_free_data *entry;
+
+       grp = ext4_get_group_info(sb, group);
+       n = rb_first(&(grp->bb_free_root));
+
+       while (n) {
+               entry = rb_entry(n, struct ext4_free_data, node);
+               mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+                               bitmap, entry->start_blk,
+                               entry->count);
+               n = rb_next(n);
+       }
+       return;
+}
+
+/*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
  * Need to be called with ext4 group lock (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                preallocated += len;
                count++;
        }
-       mb_debug("prellocated %u for group %lu\n", preallocated, group);
+       mb_debug("prellocated %u for group %u\n", preallocated, group);
 }
 
 static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                        struct super_block *sb, struct ext4_prealloc_space *pa)
 {
-       unsigned long grp;
+       ext4_group_t grp;
 
        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
                return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 
        mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+       trace_mark(ext4_mb_new_inode_pa,
+                  "dev %s ino %lu pstart %llu len %u lstart %u",
+                  sb->s_id, ac->ac_inode->i_ino,
+                  pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 
        ext4_mb_use_inode_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        pa->pa_linear = 1;
 
        mb_debug("new group pa %p: %llu/%u for %u\n", pa,
-                       pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+                pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+       trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
+                  sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 
        ext4_mb_use_group_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 {
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       unsigned long end;
-       unsigned long next;
+       unsigned int end;
+       unsigned int next;
        ext4_group_t group;
        ext4_grpblk_t bit;
+       unsigned long long grp_blk_start;
        sector_t start;
        int err = 0;
        int free = 0;
 
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+       grp_blk_start = pa->pa_pstart - bit;
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;
 
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        ext4_mb_store_history(ac);
                }
 
+               trace_mark(ext4_mb_release_inode_pa,
+                          "dev %s ino %lu block %llu count %u",
+                          sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
+                          next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        pa, (unsigned long) pa->pa_lstart,
                        (unsigned long) pa->pa_pstart,
                        (unsigned long) pa->pa_len);
-               ext4_error(sb, __func__, "free %u, pa_free %u\n",
-                                               free, pa->pa_free);
+               ext4_grp_locked_error(sb, group,
+                                       __func__, "free %u, pa_free %u",
+                                       free, pa->pa_free);
                /*
                 * pa is already deleted so we use the value obtained
                 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
        if (ac)
                ac->ac_op = EXT4_MB_HISTORY_DISCARD;
 
+       trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
+                  sb->s_id, pa->pa_pstart, pa->pa_len);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        int busy = 0;
        int free = 0;
 
-       mb_debug("discard preallocation for group %lu\n", group);
+       mb_debug("discard preallocation for group %u\n", group);
 
        if (list_empty(&grp->bb_prealloc_list))
                return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (bitmap_bh == NULL) {
                ext4_error(sb, __func__, "Error in reading block "
-                               "bitmap for %lu\n", group);
+                               "bitmap for %u", group);
                return 0;
        }
 
        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err) {
                ext4_error(sb, __func__, "Error in loading buddy "
-                               "information for %lu\n", group);
+                               "information for %u", group);
                put_bh(bitmap_bh);
                return 0;
        }
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
        }
 
        mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+       trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
+                  inode->i_ino);
 
        INIT_LIST_HEAD(&list);
 
@@ -3874,14 +4116,14 @@ repeat:
                err = ext4_mb_load_buddy(sb, group, &e4b);
                if (err) {
                        ext4_error(sb, __func__, "Error in loading buddy "
-                                       "information for %lu\n", group);
+                                       "information for %u", group);
                        continue;
                }
 
                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (bitmap_bh == NULL) {
                        ext4_error(sb, __func__, "Error in reading block "
-                                       "bitmap for %lu\n", group);
+                                       "bitmap for %u", group);
                        ext4_mb_release_desc(&e4b);
                        continue;
                }
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_group_t group;
-       unsigned long len;
-       unsigned long goal;
+       unsigned int len;
+       ext4_fsblk_t goal;
        ext4_grpblk_t block;
 
        /* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        ac->ac_pa = NULL;
        ac->ac_bitmap_page = NULL;
        ac->ac_buddy_page = NULL;
+       ac->alloc_semp = NULL;
        ac->ac_lg = NULL;
 
        /* we have to define context: we'll we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
                if (ext4_mb_load_buddy(sb, group, &e4b)) {
                        ext4_error(sb, __func__, "Error in loading buddy "
-                                       "information for %lu\n", group);
+                                       "information for %u", group);
                        continue;
                }
                ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                }
                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
+       if (ac->alloc_semp)
+               up_read(ac->alloc_semp);
        if (ac->ac_bitmap_page)
                page_cache_release(ac->ac_bitmap_page);
        if (ac->ac_buddy_page)
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
        int ret;
        int freed = 0;
 
+       trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
+                  sb->s_id, needed);
        for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
                freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        ext4_fsblk_t block = 0;
-       unsigned long inquota;
-       unsigned long reserv_blks = 0;
+       unsigned int inquota;
+       unsigned int reserv_blks = 0;
 
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);
 
+       trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
+                  "lblk %llu goal %llu lleft %llu lright %llu "
+                  "pleft %llu pright %llu ",
+                  sb->s_id, ar->flags, ar->len,
+                  ar->inode ? ar->inode->i_ino : 0,
+                  (unsigned long long) ar->logical,
+                  (unsigned long long) ar->goal,
+                  (unsigned long long) ar->lleft,
+                  (unsigned long long) ar->lright,
+                  (unsigned long long) ar->pleft,
+                  (unsigned long long) ar->pright);
+
        if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
                /*
                 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        }
        if (ar->len == 0) {
                *errp = -EDQUOT;
-               return 0;
+               goto out3;
        }
        inquota = ar->len;
 
@@ -4348,10 +4607,14 @@ repeat:
                                ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
                        ext4_mb_new_preallocation(ac);
        }
-
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
                if (*errp ==  -EAGAIN) {
+                       /*
+                        * drop the reference that we took
+                        * in ext4_mb_use_best_found
+                        */
+                       ext4_mb_release_context(ac);
                        ac->ac_b_ex.fe_group = 0;
                        ac->ac_b_ex.fe_start = 0;
                        ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
 out1:
        if (ar->len < inquota)
                DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+out3:
+       if (!ar->len) {
+               if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                       /* release all the reserved blocks if non delalloc */
+                       percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                                               reserv_blks);
+       }
+
+       trace_mark(ext4_allocate_blocks,
+                  "dev %s block %llu flags %u len %u ino %lu "
+                  "logical %llu goal %llu lleft %llu lright %llu "
+                  "pleft %llu pright %llu ",
+                  sb->s_id, (unsigned long long) block,
+                  ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
+                  (unsigned long long) ar->logical,
+                  (unsigned long long) ar->goal,
+                  (unsigned long long) ar->lleft,
+                  (unsigned long long) ar->lright,
+                  (unsigned long long) ar->pleft,
+                  (unsigned long long) ar->pright);
 
        return block;
 }
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
 
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
-                         ext4_group_t group, ext4_grpblk_t block, int count)
+                     struct ext4_free_data *new_entry)
 {
+       ext4_grpblk_t block;
+       struct ext4_free_data *entry;
        struct ext4_group_info *db = e4b->bd_info;
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_free_data *entry, *new_entry;
        struct rb_node **n = &db->bb_free_root.rb_node, *node;
        struct rb_node *parent = NULL, *new_node;
 
-
+       BUG_ON(!ext4_handle_valid(handle));
        BUG_ON(e4b->bd_bitmap_page == NULL);
        BUG_ON(e4b->bd_buddy_page == NULL);
 
-       new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-       new_entry->start_blk = block;
-       new_entry->group  = group;
-       new_entry->count = count;
-       new_entry->t_tid = handle->h_transaction->t_tid;
        new_node = &new_entry->node;
+       block = new_entry->start_blk;
 
-       ext4_lock_group(sb, group);
        if (!*n) {
                /* first free block exent. We need to
                   protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                else if (block >= (entry->start_blk + entry->count))
                        n = &(*n)->rb_right;
                else {
-                       ext4_unlock_group(sb, group);
-                       ext4_error(sb, __func__,
-                           "Double free of blocks %d (%d %d)\n",
-                           block, entry->start_blk, entry->count);
+                       ext4_grp_locked_error(sb, e4b->bd_group, __func__,
+                                       "Double free of blocks %d (%d %d)",
+                                       block, entry->start_blk, entry->count);
                        return 0;
                }
        }
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        spin_lock(&sbi->s_md_lock);
        list_add(&new_entry->list, &handle->h_transaction->t_private_list);
        spin_unlock(&sbi->s_md_lock);
-       ext4_unlock_group(sb, group);
        return 0;
 }
 
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
-       unsigned long overflow;
+       unsigned int overflow;
        ext4_grpblk_t bit;
        struct buffer_head *gd_bh;
        ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        }
 
        ext4_debug("freeing block %lu\n", block);
+       trace_mark(ext4_free_blocks,
+                  "dev %s block %llu count %lu metadata %d ino %lu",
+                  sb->s_id, (unsigned long long) block, count, metadata,
+                  inode ? inode->i_ino : 0);
 
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (ac) {
@@ -4581,11 +4862,6 @@ do_more:
        err = ext4_journal_get_write_access(handle, gd_bh);
        if (err)
                goto error_return;
-
-       err = ext4_mb_load_buddy(sb, block_group, &e4b);
-       if (err)
-               goto error_return;
-
 #ifdef AGGRESSIVE_CHECK
        {
                int i;
@@ -4593,13 +4869,6 @@ do_more:
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
 #endif
-       mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-                       bit, count);
-
-       /* We dirtied the bitmap block */
-       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-       err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-
        if (ac) {
                ac->ac_b_ex.fe_group = block_group;
                ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
                ext4_mb_store_history(ac);
        }
 
-       if (metadata) {
-               /* blocks being freed are metadata. these blocks shouldn't
-                * be used until this transaction is committed */
-               ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+       err = ext4_mb_load_buddy(sb, block_group, &e4b);
+       if (err)
+               goto error_return;
+       if (metadata && ext4_handle_valid(handle)) {
+               struct ext4_free_data *new_entry;
+               /*
+                * blocks being freed are metadata. these blocks shouldn't
+                * be used until this transaction is committed
+                */
+               new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+               new_entry->start_blk = bit;
+               new_entry->group  = block_group;
+               new_entry->count = count;
+               new_entry->t_tid = handle->h_transaction->t_tid;
+               ext4_lock_group(sb, block_group);
+               mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                               bit, count);
+               ext4_mb_free_metadata(handle, &e4b, new_entry);
+               ext4_unlock_group(sb, block_group);
        } else {
                ext4_lock_group(sb, block_group);
+               /* need to update group_info->bb_free and bitmap
+                * with group lock held. generate_buddy look at
+                * them with group lock_held
+                */
+               mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                               bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
                ext4_unlock_group(sb, block_group);
        }
 
        spin_lock(sb_bgl_lock(sbi, block_group));
-       le16_add_cpu(&gdp->bg_free_blocks_count, count);
+       ret = ext4_free_blks_count(sb, gdp) + count;
+       ext4_free_blks_set(sb, gdp, ret);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
        spin_unlock(sb_bgl_lock(sbi, block_group));
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
@@ -4635,9 +4926,13 @@ do_more:
 
        *freed += count;
 
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
        /* And the group descriptor block */
        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-       ret = ext4_journal_dirty_metadata(handle, gd_bh);
+       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
        if (!err)
                err = ret;
 
index b5dff1f..10a2921 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/marker.h>
+#include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -98,9 +99,6 @@
  */
 #define MB_DEFAULT_GROUP_PREALLOC      512
 
-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
 
 struct ext4_free_data {
        /* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
        tid_t   t_tid;
 };
 
-struct ext4_group_info {
-       unsigned long   bb_state;
-       struct rb_root  bb_free_root;
-       unsigned short  bb_first_free;
-       unsigned short  bb_free;
-       unsigned short  bb_fragments;
-       struct          list_head bb_prealloc_list;
-#ifdef DOUBLE_CHECK
-       void            *bb_bitmap;
-#endif
-       unsigned short  bb_counters[];
-};
-
-#define EXT4_GROUP_INFO_NEED_INIT_BIT  0
-#define EXT4_GROUP_INFO_LOCKED_BIT     1
-
-#define EXT4_MB_GRP_NEED_INIT(grp)     \
-       (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-
-
 struct ext4_prealloc_space {
        struct list_head        pa_inode_list;
        struct list_head        pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
        __u8 ac_op;             /* operation, for history only */
        struct page *ac_bitmap_page;
        struct page *ac_buddy_page;
+       /*
+        * pointer to the held semaphore upon successful
+        * block allocation
+        */
+       struct rw_semaphore *alloc_semp;
        struct ext4_prealloc_space *ac_pa;
        struct ext4_locality_group *ac_lg;
 };
@@ -250,6 +233,7 @@ struct ext4_buddy {
        struct super_block *bd_sb;
        __u16 bd_blkbits;
        ext4_group_t bd_group;
+       struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b)    ((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)     ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 {
        return;
 }
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif
 
 #define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
 
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
-
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
-                                       ext4_group_t group);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-                                       struct ext4_buddy *e4b, sector_t block,
-                                       int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
-                       struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
-
-
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-{
-       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-       bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline void ext4_unlock_group(struct super_block *sb,
-                                       ext4_group_t group)
-{
-       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-       bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline int ext4_is_group_locked(struct super_block *sb,
-                                       ext4_group_t group)
-{
-       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-       return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-                                               &(grinfo->bb_state));
-}
-
-static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
 {
        ext4_fsblk_t block;
index f2a9cf4..734abca 100644 (file)
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
        /*
         * Make sure the credit we accumalated is not really high
         */
-       if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+       if (needed && ext4_handle_has_enough_credits(handle,
+                                               EXT4_RESERVE_TRANS_BLOCKS)) {
                retval = ext4_journal_restart(handle, needed);
                if (retval)
                        goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
 {
        int retval = 0, needed;
 
-       if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+       if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
                return 0;
        /*
         * We are freeing a blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
        struct list_blocks_struct lb;
        unsigned long max_entries;
 
-       if (!test_opt(inode->i_sb, EXTENTS))
-               /*
-                * if mounted with noextents we don't allow the migrate
-                */
-               return -EINVAL;
-
-       if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+       /*
+        * If the filesystem does not support extents, or the inode
+        * already is extent-based, error out.
+        */
+       if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+           (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
                return -EINVAL;
 
        if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
index 4b8d431..fec0b4c 100644 (file)
@@ -368,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                goto fail;
        }
        hinfo->hash_version = root->info.hash_version;
+       if (hinfo->hash_version <= DX_HASH_TEA)
+               hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
        if (d_name)
                ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -637,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        dir = dir_file->f_path.dentry->d_inode;
        if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
                hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+               if (hinfo.hash_version <= DX_HASH_TEA)
+                       hinfo.hash_version +=
+                               EXT4_SB(dir->i_sb)->s_hash_unsigned;
                hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
                count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
                                               start_hash, start_minor_hash);
@@ -802,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
 static inline int search_dirblock(struct buffer_head *bh,
                                  struct inode *dir,
                                  const struct qstr *d_name,
-                                 unsigned long offset,
+                                 unsigned int offset,
                                  struct ext4_dir_entry_2 ** res_dir)
 {
        struct ext4_dir_entry_2 * de;
@@ -1039,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
        bh = ext4_find_entry(dir, &dentry->d_name, &de);
        inode = NULL;
        if (bh) {
-               unsigned long ino = le32_to_cpu(de->inode);
+               __u32 ino = le32_to_cpu(de->inode);
                brelse(bh);
                if (!ext4_valid_inum(dir->i_sb, ino)) {
                        ext4_error(dir->i_sb, "ext4_lookup",
-                                  "bad inode number: %lu", ino);
+                                  "bad inode number: %u", ino);
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
@@ -1056,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 
 struct dentry *ext4_get_parent(struct dentry *child)
 {
-       unsigned long ino;
+       __u32 ino;
        struct inode *inode;
        static const struct qstr dotdot = {
                .name = "..",
@@ -1074,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 
        if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
                ext4_error(child->d_inode->i_sb, "ext4_get_parent",
-                          "bad inode number: %lu", ino);
+                          "bad inode number: %u", ino);
                return ERR_PTR(-EIO);
        }
 
@@ -1162,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        u32 hash2;
        struct dx_map_entry *map;
        char *data1 = (*bh)->b_data, *data2;
-       unsigned split, move, size, i;
+       unsigned split, move, size;
        struct ext4_dir_entry_2 *de = NULL, *de2;
-       int     err = 0;
+       int     err = 0, i;
 
        bh2 = ext4_append (handle, dir, &newblock, &err);
        if (!(bh2)) {
@@ -1224,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                de = de2;
        }
        dx_insert_block(frame, hash2 + continued, newblock);
-       err = ext4_journal_dirty_metadata(handle, bh2);
+       err = ext4_handle_dirty_metadata(handle, dir, bh2);
        if (err)
                goto journal_error;
-       err = ext4_journal_dirty_metadata(handle, frame->bh);
+       err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
        if (err)
                goto journal_error;
        brelse(bh2);
@@ -1262,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        struct inode    *dir = dentry->d_parent->d_inode;
        const char      *name = dentry->d_name.name;
        int             namelen = dentry->d_name.len;
-       unsigned long   offset = 0;
+       unsigned int    offset = 0;
        unsigned short  reclen;
        int             nlen, rlen, err;
        char            *top;
@@ -1331,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        ext4_update_dx_flag(dir);
        dir->i_version++;
        ext4_mark_inode_dirty(handle, dir);
-       BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-       err = ext4_journal_dirty_metadata(handle, bh);
+       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+       err = ext4_handle_dirty_metadata(handle, dir, bh);
        if (err)
                ext4_std_error(dir->i_sb, err);
        brelse(bh);
@@ -1404,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
        /* Initialize as for dx_probe */
        hinfo.hash_version = root->info.hash_version;
+       if (hinfo.hash_version <= DX_HASH_TEA)
+               hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
        ext4fs_dirhash(name, namelen, &hinfo);
        frame = frames;
@@ -1433,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                          struct inode *inode)
 {
        struct inode *dir = dentry->d_parent->d_inode;
-       unsigned long offset;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;
@@ -1455,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                ext4_mark_inode_dirty(handle, dir);
        }
        blocks = dir->i_size >> sb->s_blocksize_bits;
-       for (block = 0, offset = 0; block < blocks; block++) {
+       for (block = 0; block < blocks; block++) {
                bh = ext4_bread(handle, dir, block, 0, &retval);
                if(!bh)
                        return retval;
@@ -1570,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        dxtrace(dx_show_index("node", frames[1].entries));
                        dxtrace(dx_show_index("node",
                               ((struct dx_node *) bh2->b_data)->entries));
-                       err = ext4_journal_dirty_metadata(handle, bh2);
+                       err = ext4_handle_dirty_metadata(handle, inode, bh2);
                        if (err)
                                goto journal_error;
                        brelse (bh2);
@@ -1596,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-               ext4_journal_dirty_metadata(handle, frames[0].bh);
+               ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
        }
        de = do_split(handle, dir, &bh, frame, &hinfo, &err);
        if (!de)
@@ -1642,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle,
                        else
                                de->inode = 0;
                        dir->i_version++;
-                       BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-                       ext4_journal_dirty_metadata(handle, bh);
+                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                       ext4_handle_dirty_metadata(handle, dir, bh);
                        return 0;
                }
                i += ext4_rec_len_from_disk(de->rec_len);
@@ -1721,7 +1727,7 @@ retry:
                return PTR_ERR(handle);
 
        if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        inode = ext4_new_inode (handle, dir, mode);
        err = PTR_ERR(inode);
@@ -1755,7 +1761,7 @@ retry:
                return PTR_ERR(handle);
 
        if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        inode = ext4_new_inode(handle, dir, mode);
        err = PTR_ERR(inode);
@@ -1791,7 +1797,7 @@ retry:
                return PTR_ERR(handle);
 
        if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
        err = PTR_ERR(inode);
@@ -1820,8 +1826,8 @@ retry:
        strcpy(de->name, "..");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
        inode->i_nlink = 2;
-       BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
-       ext4_journal_dirty_metadata(handle, dir_block);
+       BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+       ext4_handle_dirty_metadata(handle, dir, dir_block);
        brelse(dir_block);
        ext4_mark_inode_dirty(handle, inode);
        err = ext4_add_entry(handle, dentry, inode);
@@ -1850,7 +1856,7 @@ out_stop:
  */
 static int empty_dir(struct inode *inode)
 {
-       unsigned long offset;
+       unsigned int offset;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de, *de1;
        struct super_block *sb;
@@ -1895,7 +1901,7 @@ static int empty_dir(struct inode *inode)
                                if (err)
                                        ext4_error(sb, __func__,
                                                   "error %d reading directory"
-                                                  " #%lu offset %lu",
+                                                  " #%lu offset %u",
                                                   err, inode->i_ino, offset);
                                offset += sb->s_blocksize;
                                continue;
@@ -1933,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        struct ext4_iloc iloc;
        int err = 0, rc;
 
+       if (!ext4_handle_valid(handle))
+               return 0;
+
        lock_super(sb);
        if (!list_empty(&EXT4_I(inode)->i_orphan))
                goto out_unlock;
@@ -1961,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        /* Insert this inode at the head of the on-disk orphan list... */
        NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
        EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-       err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+       err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
        rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
        if (!err)
                err = rc;
@@ -1995,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
        struct list_head *prev;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi;
-       unsigned long ino_next;
+       __u32 ino_next;
        struct ext4_iloc iloc;
        int err = 0;
 
+       if (!ext4_handle_valid(handle))
+               return 0;
+
        lock_super(inode->i_sb);
        if (list_empty(&ei->i_orphan)) {
                unlock_super(inode->i_sb);
@@ -2017,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
         * transaction handle with which to update the orphan list on
         * disk, but we still need to remove the inode from the linked
         * list in memory. */
-       if (!handle)
+       if (sbi->s_journal && !handle)
                goto out;
 
        err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2025,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
                goto out_err;
 
        if (prev == &sbi->s_orphan) {
-               jbd_debug(4, "superblock will point to %lu\n", ino_next);
+               jbd_debug(4, "superblock will point to %u\n", ino_next);
                BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                if (err)
                        goto out_brelse;
                sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-               err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+               err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
        } else {
                struct ext4_iloc iloc2;
                struct inode *i_prev =
                        &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
 
-               jbd_debug(4, "orphan inode %lu will point to %lu\n",
+               jbd_debug(4, "orphan inode %lu will point to %u\n",
                          i_prev->i_ino, ino_next);
                err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
                if (err)
@@ -2082,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
                goto end_rmdir;
 
        if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        inode = dentry->d_inode;
 
@@ -2136,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
                return PTR_ERR(handle);
 
        if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        retval = -ENOENT;
        bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2193,7 +2205,7 @@ retry:
                return PTR_ERR(handle);
 
        if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
        err = PTR_ERR(inode);
@@ -2256,7 +2268,7 @@ retry:
                return PTR_ERR(handle);
 
        if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        inode->i_ctime = ext4_current_time(inode);
        ext4_inc_count(handle, inode);
@@ -2305,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                return PTR_ERR(handle);
 
        if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-               handle->h_sync = 1;
+               ext4_handle_sync(handle);
 
        old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
        /*
@@ -2359,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                new_dir->i_ctime = new_dir->i_mtime =
                                        ext4_current_time(new_dir);
                ext4_mark_inode_dirty(handle, new_dir);
-               BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
-               ext4_journal_dirty_metadata(handle, new_bh);
+               BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
+               ext4_handle_dirty_metadata(handle, new_dir, new_bh);
                brelse(new_bh);
                new_bh = NULL;
        }
@@ -2410,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                BUFFER_TRACE(dir_bh, "get_write_access");
                ext4_journal_get_write_access(handle, dir_bh);
                PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
-               BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
-               ext4_journal_dirty_metadata(handle, dir_bh);
+               BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
+               ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
                ext4_dec_count(handle, old_dir);
                if (new_inode) {
                        /* checked empty_dir above, can't have another parent,
index b6ec184..c328be5 100644 (file)
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
        ext4_get_group_no_and_offset(sb, start, NULL, &offset);
        if (group != sbi->s_groups_count)
                ext4_warning(sb, __func__,
-                            "Cannot add at group %u (only %lu groups)",
+                            "Cannot add at group %u (only %u groups)",
                             input->group, sbi->s_groups_count);
        else if (offset != 0)
                        ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
 {
        int err;
 
-       if (handle->h_buffer_credits >= thresh)
+       if (ext4_handle_has_enough_credits(handle, thresh))
                return 0;
 
        err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
                set_buffer_uptodate(gdb);
                unlock_buffer(gdb);
-               ext4_journal_dirty_metadata(handle, gdb);
+               ext4_handle_dirty_metadata(handle, NULL, gdb);
                ext4_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(bh);
                        goto exit_bh;
                }
-               ext4_journal_dirty_metadata(handle, gdb);
+               ext4_handle_dirty_metadata(handle, NULL, gdb);
                ext4_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(it);
                        goto exit_bh;
                }
-               ext4_journal_dirty_metadata(handle, it);
+               ext4_handle_dirty_metadata(handle, NULL, it);
                brelse(it);
                ext4_set_bit(bit, bh->b_data);
        }
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
 
-       mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
-                       bh->b_data);
-       ext4_journal_dirty_metadata(handle, bh);
+       mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+       ext4_handle_dirty_metadata(handle, NULL, bh);
        brelse(bh);
-
        /* Mark unused entries in inode bitmap used */
        ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
                   input->inode_bitmap, input->inode_bitmap - start);
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
                goto exit_journal;
        }
 
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
-       ext4_journal_dirty_metadata(handle, bh);
+       ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
        brelse(bh);
 
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
         * reserved inode, and will become GDT blocks (primary and backup).
         */
        data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
-       ext4_journal_dirty_metadata(handle, dind);
+       ext4_handle_dirty_metadata(handle, NULL, dind);
        brelse(dind);
        inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
        ext4_mark_iloc_dirty(handle, inode, &iloc);
        memset((*primary)->b_data, 0, sb->s_blocksize);
-       ext4_journal_dirty_metadata(handle, *primary);
+       ext4_handle_dirty_metadata(handle, NULL, *primary);
 
        o_group_desc = EXT4_SB(sb)->s_group_desc;
        memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        kfree(o_group_desc);
 
        le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-       ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+       ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 
        return 0;
 
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
                       primary[i]->b_blocknr, gdbackups,
                       blk + primary[i]->b_blocknr); */
                data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
-               err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+               err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
                if (!err)
                        err = err2;
        }
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
                struct buffer_head *bh;
 
                /* Out of journal space, and can't get more - abort - so sad */
-               if (handle->h_buffer_credits == 0 &&
+               if (ext4_handle_valid(handle) &&
+                   handle->h_buffer_credits == 0 &&
                    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
                    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
                        break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
                        memset(bh->b_data + size, 0, rest);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-               ext4_journal_dirty_metadata(handle, bh);
+               ext4_handle_dirty_metadata(handle, NULL, bh);
                brelse(bh);
        }
        if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
 exit_err:
        if (err) {
                ext4_warning(sb, __func__,
-                            "can't update backup for group %lu (err %d), "
+                            "can't update backup for group %u (err %d), "
                             "forcing fsck on next reboot", group, err);
                sbi->s_mount_state &= ~EXT4_VALID_FS;
                sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        struct inode *inode = NULL;
        handle_t *handle;
        int gdb_off, gdb_num;
+       int num_grp_locked = 0;
        int err, err2;
 
        gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
        if (ext4_blocks_count(es) + input->blocks_count <
            ext4_blocks_count(es)) {
-               ext4_warning(sb, __func__, "blocks_count overflow\n");
+               ext4_warning(sb, __func__, "blocks_count overflow");
                return -EINVAL;
        }
 
        if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
            le32_to_cpu(es->s_inodes_count)) {
-               ext4_warning(sb, __func__, "inodes_count overflow\n");
+               ext4_warning(sb, __func__, "inodes_count overflow");
                return -EINVAL;
        }
 
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                }
        }
 
+
        if ((err = verify_group_input(sb, input)))
                goto exit_put;
 
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
          * using the new disk blocks.
          */
 
+       num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
        /* Update group descriptor block for new group */
        gdp = (struct ext4_group_desc *)((char *)primary->b_data +
                                         gdb_off * EXT4_DESC_SIZE(sb));
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
        ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
        ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
-       gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
-       gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+       ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+       ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+       gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
 
        /*
         * We can allocate memory for mb_alloc based on the new group
         * descriptor
         */
-       err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-       if (err)
+       err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+       if (err) {
+               ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
                goto exit_journal;
+       }
 
        /*
         * Make the new blocks and inodes valid next.  We do this before
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
        /* Update the global fs size fields */
        sbi->s_groups_count++;
+       ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 
-       ext4_journal_dirty_metadata(handle, primary);
+       ext4_handle_dirty_metadata(handle, NULL, primary);
 
        /* Update the reserved block counts only once the new group is
         * active. */
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                        EXT4_INODES_PER_GROUP(sb);
        }
 
-       ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+       ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
        sb->s_dirt = 1;
 
 exit_journal:
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        struct buffer_head *bh;
        handle_t *handle;
        int err;
-       unsigned long freed_blocks;
        ext4_group_t group;
-       struct ext4_group_info *grp;
 
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                        " too large to resize to %llu blocks safely\n",
                        sb->s_id, n_blocks_count);
                if (sizeof(sector_t) < 8)
-                       ext4_warning(sb, __func__,
-                       "CONFIG_LBD not enabled\n");
+                       ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
                return -EINVAL;
        }
 
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                goto exit_put;
        }
        ext4_blocks_count_set(es, o_blocks_count + add);
-       ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+       ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        sb->s_dirt = 1;
        unlock_super(sb);
        ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
-       ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+       /* We add the blocks to the bitmap and set the group need init bit */
+       ext4_add_groupblocks(handle, sb, o_blocks_count, add);
        ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
        if ((err = ext4_journal_stop(handle)))
                goto exit_put;
 
-       /*
-        * Mark mballoc pages as not up to date so that they will be updated
-        * next time they are loaded by ext4_mb_load_buddy.
-        *
-        * XXX Bad, Bad, BAD!!!  We should not be overloading the
-        * Uptodate flag, particularly on thte bitmap bh, as way of
-        * hinting to ext4_mb_load_buddy() that it needs to be
-        * overloaded.  A user could take a LVM snapshot, then do an
-        * on-line fsck, and clear the uptodate flag, and this would
-        * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
-        */
-       {
-               struct ext4_sb_info *sbi = EXT4_SB(sb);
-               struct inode *inode = sbi->s_buddy_cache;
-               int blocks_per_page;
-               int block;
-               int pnum;
-               struct page *page;
-
-               /* Set buddy page as not up to date */
-               blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-               block = group * 2;
-               pnum = block / blocks_per_page;
-               page = find_get_page(inode->i_mapping, pnum);
-               if (page != NULL) {
-                       ClearPageUptodate(page);
-                       page_cache_release(page);
-               }
-
-               /* Set bitmap page as not up to date */
-               block++;
-               pnum = block / blocks_per_page;
-               page = find_get_page(inode->i_mapping, pnum);
-               if (page != NULL) {
-                       ClearPageUptodate(page);
-                       page_cache_release(page);
-               }
-
-               /* Get the info on the last group */
-               grp = ext4_get_group_info(sb, group);
-
-               /* Update free blocks in group info */
-               ext4_mb_update_group_info(grp, add);
-       }
-
        if (test_opt(sb, DEBUG))
                printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
                       ext4_blocks_count(es));
index 9494bb2..8f7e0be 100644 (file)
@@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
-static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
-                              unsigned int);
 static void ext4_commit_super(struct super_block *sb,
                              struct ext4_super_block *es, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
+__u32 ext4_free_blks_count(struct super_block *sb,
+                             struct ext4_group_desc *bg)
+{
+       return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+               (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+               (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+                             struct ext4_group_desc *bg)
+{
+       return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+               (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+               (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+                             struct ext4_group_desc *bg)
+{
+       return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+               (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+               (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u32 ext4_itable_unused_count(struct super_block *sb,
+                             struct ext4_group_desc *bg)
+{
+       return le16_to_cpu(bg->bg_itable_unused_lo) |
+               (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+               (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+}
+
 void ext4_block_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
 {
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
                bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 }
 
+void ext4_free_blks_set(struct super_block *sb,
+                         struct ext4_group_desc *bg, __u32 count)
+{
+       bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+       if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+               bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_free_inodes_set(struct super_block *sb,
+                         struct ext4_group_desc *bg, __u32 count)
+{
+       bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+       if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+               bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_used_dirs_set(struct super_block *sb,
+                         struct ext4_group_desc *bg, __u32 count)
+{
+       bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+       if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+               bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_itable_unused_set(struct super_block *sb,
+                         struct ext4_group_desc *bg, __u32 count)
+{
+       bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+       if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+               bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
+
 /*
  * Wrappers for jbd2_journal_start/end.
  *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
         * backs (eg. EIO in the commit thread), then we still need to
         * take the FS itself readonly cleanly. */
        journal = EXT4_SB(sb)->s_journal;
-       if (is_journal_aborted(journal)) {
-               ext4_abort(sb, __func__,
-                          "Detected aborted journal");
-               return ERR_PTR(-EROFS);
+       if (journal) {
+               if (is_journal_aborted(journal)) {
+                       ext4_abort(sb, __func__,
+                                  "Detected aborted journal");
+                       return ERR_PTR(-EROFS);
+               }
+               return jbd2_journal_start(journal, nblocks);
        }
-
-       return jbd2_journal_start(journal, nblocks);
+       /*
+        * We're not journaling, return the appropriate indication.
+        */
+       current->journal_info = EXT4_NOJOURNAL_HANDLE;
+       return current->journal_info;
 }
 
 /*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
        int err;
        int rc;
 
+       if (!ext4_handle_valid(handle)) {
+               /*
+                * Do this here since we don't call jbd2_journal_stop() in
+                * no-journal mode.
+                */
+               current->journal_info = NULL;
+               return 0;
+       }
        sb = handle->h_transaction->t_journal->j_private;
        err = handle->h_err;
        rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
        char nbuf[16];
        const char *errstr = ext4_decode_error(NULL, err, nbuf);
 
+       BUG_ON(!ext4_handle_valid(handle));
+
        if (bh)
                BUFFER_TRACE(bh, "abort");
 
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
        va_end(args);
 }
 
+void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
+                               const char *function, const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+       va_list args;
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+       va_start(args, fmt);
+       printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+       vprintk(fmt, args);
+       printk("\n");
+       va_end(args);
+
+       if (test_opt(sb, ERRORS_CONT)) {
+               EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+               es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+               ext4_commit_super(sb, es, 0);
+               return;
+       }
+       ext4_unlock_group(sb, grp);
+       ext4_handle_error(sb);
+       /*
+        * We only get here in the ERRORS_RO case; relocking the group
+        * may be dangerous, but nothing bad will happen since the
+        * filesystem will have already been marked read/only and the
+        * journal has been aborted.  We return 1 as a hint to callers
+        * who might what to use the return value from
+        * ext4_grp_locked_error() to distinguish beween the
+        * ERRORS_CONT and ERRORS_RO case, and perhaps return more
+        * aggressively from the ext4 function in question, with a
+        * more appropriate error code.
+        */
+       ext4_lock_group(sb, grp);
+       return;
+}
+
+
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
        return bdev;
 
 fail:
-       printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+       printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
                        __bdevname(dev, b), PTR_ERR(bdev));
        return NULL;
 }
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
        ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
-       err = jbd2_journal_destroy(sbi->s_journal);
-       sbi->s_journal = NULL;
-       if (err < 0)
-               ext4_abort(sb, __func__, "Couldn't clean up the journal");
-
+       if (sbi->s_journal) {
+               err = jbd2_journal_destroy(sbi->s_journal);
+               sbi->s_journal = NULL;
+               if (err < 0)
+                       ext4_abort(sb, __func__,
+                                  "Couldn't clean up the journal");
+       }
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
+       /*
+        * Note:  We can be called before EXT4_SB(sb)->s_journal is set,
+        * therefore it can be null here.  Don't check it, just initialize
+        * jinode.
+        */
        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
        ei->i_reserved_data_blocks = 0;
        ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
        }
 #endif
        ext4_discard_preallocations(inode);
-       jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+       if (EXT4_JOURNAL(inode))
+               jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
                                       &EXT4_I(inode)->jinode);
 }
 
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 #endif
        if (!test_opt(sb, RESERVATION))
                seq_puts(seq, ",noreservation");
-       if (sbi->s_commit_interval) {
+       if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                seq_printf(seq, ",commit=%u",
                           (unsigned) (sbi->s_commit_interval / HZ));
        }
+       if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+               seq_printf(seq, ",min_batch_time=%u",
+                          (unsigned) sbi->s_min_batch_time);
+       }
+       if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+               seq_printf(seq, ",max_batch_time=%u",
+                          (unsigned) sbi->s_min_batch_time);
+       }
+
        /*
         * We're changing the default of barrier mount option, so
         * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",journal_async_commit");
        if (test_opt(sb, NOBH))
                seq_puts(seq, ",nobh");
-       if (!test_opt(sb, EXTENTS))
-               seq_puts(seq, ",noextents");
        if (test_opt(sb, I_VERSION))
                seq_puts(seq, ",i_version");
        if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
                                    ext4_nfs_get_inode);
 }
 
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd2 layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+{
+       journal_t *journal = EXT4_SB(sb)->s_journal;
+
+       WARN_ON(PageChecked(page));
+       if (!page_has_buffers(page))
+               return 0;
+       if (journal)
+               return jbd2_journal_try_to_free_buffers(journal, page,
+                                                       wait & ~__GFP_WAIT);
+       return try_to_free_buffers(page);
+}
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -838,6 +988,7 @@ static const struct super_operations ext4_sops = {
        .quota_read     = ext4_quota_read,
        .quota_write    = ext4_quota_write,
 #endif
+       .bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
 static const struct export_operations ext4_export_ops = {
@@ -852,16 +1003,17 @@ enum {
        Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-       Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+       Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+       Opt_journal_update, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_data_err_abort, Opt_data_err_ignore,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-       Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+       Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
-       Opt_inode_readahead_blks
+       Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
 static const match_table_t tokens = {
@@ -891,8 +1043,9 @@ static const match_table_t tokens = {
        {Opt_nobh, "nobh"},
        {Opt_bh, "bh"},
        {Opt_commit, "commit=%u"},
+       {Opt_min_batch_time, "min_batch_time=%u"},
+       {Opt_max_batch_time, "max_batch_time=%u"},
        {Opt_journal_update, "journal=update"},
-       {Opt_journal_inum, "journal=%u"},
        {Opt_journal_dev, "journal_dev=%u"},
        {Opt_journal_checksum, "journal_checksum"},
        {Opt_journal_async_commit, "journal_async_commit"},
@@ -913,14 +1066,13 @@ static const match_table_t tokens = {
        {Opt_quota, "quota"},
        {Opt_usrquota, "usrquota"},
        {Opt_barrier, "barrier=%u"},
-       {Opt_extents, "extents"},
-       {Opt_noextents, "noextents"},
        {Opt_i_version, "i_version"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+       {Opt_journal_ioprio, "journal_ioprio=%u"},
        {Opt_err, NULL},
 };
 
@@ -945,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
        return sb_block;
 }
 
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
 static int parse_options(char *options, struct super_block *sb,
-                        unsigned int *inum, unsigned long *journal_devnum,
+                        unsigned long *journal_devnum,
+                        unsigned int *journal_ioprio,
                         ext4_fsblk_t *n_blocks_count, int is_remount)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -958,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
        int qtype, qfmt;
        char *qname;
 #endif
-       ext4_fsblk_t last_block;
 
        if (!options)
                return 1;
@@ -1070,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
                        }
                        set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
                        break;
-               case Opt_journal_inum:
-                       if (is_remount) {
-                               printk(KERN_ERR "EXT4-fs: cannot specify "
-                                      "journal on remount\n");
-                               return 0;
-                       }
-                       if (match_int(&args[0], &option))
-                               return 0;
-                       *inum = option;
-                       break;
                case Opt_journal_dev:
                        if (is_remount) {
                                printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1109,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
                                option = JBD2_DEFAULT_MAX_COMMIT_AGE;
                        sbi->s_commit_interval = HZ * option;
                        break;
+               case Opt_max_batch_time:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0)
+                               return 0;
+                       if (option == 0)
+                               option = EXT4_DEF_MAX_BATCH_TIME;
+                       sbi->s_max_batch_time = option;
+                       break;
+               case Opt_min_batch_time:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0)
+                               return 0;
+                       sbi->s_min_batch_time = option;
+                       break;
                case Opt_data_journal:
                        data_opt = EXT4_MOUNT_JOURNAL_DATA;
                        goto datacheck;
@@ -1279,33 +1439,6 @@ set_qf_format:
                case Opt_bh:
                        clear_opt(sbi->s_mount_opt, NOBH);
                        break;
-               case Opt_extents:
-                       if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
-                                       EXT4_FEATURE_INCOMPAT_EXTENTS)) {
-                               ext4_warning(sb, __func__,
-                                       "extents feature not enabled "
-                                       "on this filesystem, use tune2fs\n");
-                               return 0;
-                       }
-                       set_opt(sbi->s_mount_opt, EXTENTS);
-                       break;
-               case Opt_noextents:
-                       /*
-                        * When e2fsprogs support resizing an already existing
-                        * ext3 file system to greater than 2**32 we need to
-                        * add support to block allocator to handle growing
-                        * already existing block  mapped inode so that blocks
-                        * allocated for them fall within 2**32
-                        */
-                       last_block = ext4_blocks_count(sbi->s_es) - 1;
-                       if (last_block  > 0xffffffffULL) {
-                               printk(KERN_ERR "EXT4-fs: Filesystem too "
-                                               "large to mount with "
-                                               "-o noextents options\n");
-                               return 0;
-                       }
-                       clear_opt(sbi->s_mount_opt, EXTENTS);
-                       break;
                case Opt_i_version:
                        set_opt(sbi->s_mount_opt, I_VERSION);
                        sb->s_flags |= MS_I_VERSION;
@@ -1330,6 +1463,14 @@ set_qf_format:
                                return 0;
                        sbi->s_inode_readahead_blks = option;
                        break;
+               case Opt_journal_ioprio:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0 || option > 7)
+                               break;
+                       *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
+                                                           option);
+                       break;
                default:
                        printk(KERN_ERR
                               "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1405,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                printk(KERN_WARNING
                       "EXT4-fs warning: checktime reached, "
                       "running e2fsck is recommended\n");
-#if 0
-               /* @@@ We _will_ want to clear the valid bit if we find
-                * inconsistencies, to force a fsck at reboot.  But for
-                * a plain journaled filesystem we can keep it set as
-                * valid forever! :)
-                */
-       es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
-#endif
+       if (!sbi->s_journal) 
+               es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        es->s_mtime = cpu_to_le32(get_seconds());
        ext4_update_dynamic_rev(sb);
-       EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+       if (sbi->s_journal)
+               EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
        ext4_commit_super(sb, es, 1);
        if (test_opt(sb, DEBUG))
-               printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
+               printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04lx]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
@@ -1430,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                        EXT4_INODES_PER_GROUP(sb),
                        sbi->s_mount_opt);
 
-       printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-              sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
-              "external", EXT4_SB(sb)->s_journal->j_devname);
+       if (EXT4_SB(sb)->s_journal) {
+               printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+                      sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+                      "external", EXT4_SB(sb)->s_journal->j_devname);
+       } else {
+               printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+       }
        return res;
 }
 
@@ -1444,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
        ext4_group_t flex_group_count;
        ext4_group_t flex_group;
        int groups_per_flex = 0;
-       __u64 block_bitmap = 0;
        int i;
 
        if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1463,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
                                     sizeof(struct flex_groups), GFP_KERNEL);
        if (sbi->s_flex_groups == NULL) {
                printk(KERN_ERR "EXT4-fs: not enough memory for "
-                               "%lu flex groups\n", flex_group_count);
+                               "%u flex groups\n", flex_group_count);
                goto failed;
        }
 
-       gdp = ext4_get_group_desc(sb, 1, &bh);
-       block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
-
        for (i = 0; i < sbi->s_groups_count; i++) {
                gdp = ext4_get_group_desc(sb, i, &bh);
 
                flex_group = ext4_flex_group(sbi, i);
                sbi->s_flex_groups[flex_group].free_inodes +=
-                       le16_to_cpu(gdp->bg_free_inodes_count);
+                       ext4_free_inodes_count(sb, gdp);
                sbi->s_flex_groups[flex_group].free_blocks +=
-                       le16_to_cpu(gdp->bg_free_blocks_count);
+                       ext4_free_blks_count(sb, gdp);
        }
 
        return 1;
@@ -1551,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                              "Block bitmap for group %lu not in group "
+                              "Block bitmap for group %u not in group "
                               "(block %llu)!\n", i, block_bitmap);
                        return 0;
                }
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                              "Inode bitmap for group %lu not in group "
+                              "Inode bitmap for group %u not in group "
                               "(block %llu)!\n", i, inode_bitmap);
                        return 0;
                }
@@ -1566,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                              "Inode table for group %lu not in group "
+                              "Inode table for group %u not in group "
                               "(block %llu)!\n", i, inode_table);
                        return 0;
                }
                spin_lock(sb_bgl_lock(sbi, i));
                if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                              "Checksum for group %lu failed (%u!=%u)\n",
+                              "Checksum for group %u failed (%u!=%u)\n",
                               i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
                               gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!(sb->s_flags & MS_RDONLY)) {
@@ -1865,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        ext4_fsblk_t sb_block = get_sb_block(&data);
        ext4_fsblk_t logical_sb_block;
        unsigned long offset = 0;
-       unsigned int journal_inum = 0;
        unsigned long journal_devnum = 0;
        unsigned long def_mount_opts;
        struct inode *root;
        char *cp;
+       const char *descr;
        int ret = -EINVAL;
        int blocksize;
-       int db_count;
-       int i;
+       unsigned int db_count;
+       unsigned int i;
        int needs_recovery, has_huge_files;
-       __le32 features;
+       int features;
        __u64 blocks_count;
        int err;
+       unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
@@ -1958,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
        sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
        sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+       sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+       sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+       sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
        set_opt(sbi->s_mount_opt, RESERVATION);
        set_opt(sbi->s_mount_opt, BARRIER);
 
        /*
-        * turn on extents feature by default in ext4 filesystem
-        * only if feature flag already set by mkfs or tune2fs.
-        * Use -o noextents to turn it off
-        */
-       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
-               set_opt(sbi->s_mount_opt, EXTENTS);
-       else
-               ext4_warning(sb, __func__,
-                       "extents feature not enabled on this filesystem, "
-                       "use tune2fs.\n");
-
-       /*
         * enable delayed allocation by default
         * Use -o nodelalloc to turn it off
         */
        set_opt(sbi->s_mount_opt, DELALLOC);
 
 
-       if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
-                          NULL, 0))
+       if (!parse_options((char *) data, sb, &journal_devnum,
+                          &journal_ioprio, NULL, 0))
                goto failed_mount;
 
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2004,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
        if (features) {
                printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-                      "unsupported optional features (%x).\n",
-                      sb->s_id, le32_to_cpu(features));
+                      "unsupported optional features (%x).\n", sb->s_id,
+                       (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+                       ~EXT4_FEATURE_INCOMPAT_SUPP));
                goto failed_mount;
        }
        features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
        if (!(sb->s_flags & MS_RDONLY) && features) {
                printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-                      "unsupported optional features (%x).\n",
-                      sb->s_id, le32_to_cpu(features));
+                      "unsupported optional features (%x).\n", sb->s_id,
+                       (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+                       ~EXT4_FEATURE_RO_COMPAT_SUPP));
                goto failed_mount;
        }
        has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2117,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
+       i = le32_to_cpu(es->s_flags);
+       if (i & EXT2_FLAGS_UNSIGNED_HASH)
+               sbi->s_hash_unsigned = 3;
+       else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+               es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+               sbi->s_hash_unsigned = 3;
+#else
+               es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+               sb->s_dirt = 1;
+       }
 
        if (sbi->s_blocks_per_group > blocksize * 8) {
                printk(KERN_ERR
@@ -2144,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
                goto cantfind_ext4;
 
-       /* ensure blocks_count calculation below doesn't sign-extend */
-       if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
-           le32_to_cpu(es->s_first_data_block) + 1) {
-               printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
-                      "first data block %u, blocks per group %lu\n",
-                       ext4_blocks_count(es),
-                       le32_to_cpu(es->s_first_data_block),
-                       EXT4_BLOCKS_PER_GROUP(sb));
+        /*
+         * It makes no sense for the first data block to be beyond the end
+         * of the filesystem.
+         */
+        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
+                      "block %u is beyond end of filesystem (%llu)\n",
+                      le32_to_cpu(es->s_first_data_block),
+                      ext4_blocks_count(es));
                goto failed_mount;
        }
        blocks_count = (ext4_blocks_count(es) -
                        le32_to_cpu(es->s_first_data_block) +
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+       if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+               printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+                      "(block count %llu, first data block %u, "
+                      "blocks per group %lu)\n", sbi->s_groups_count,
+                      ext4_blocks_count(es),
+                      le32_to_cpu(es->s_first_data_block),
+                      EXT4_BLOCKS_PER_GROUP(sb));
+               goto failed_mount;
+       }
        sbi->s_groups_count = blocks_count;
        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);
@@ -2269,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
                                ext4_commit_super(sb, es, 1);
-                               printk(KERN_CRIT
-                                      "EXT4-fs (device %s): mount failed\n",
-                                     sb->s_id);
                                goto failed_mount4;
                        }
                }
-       } else if (journal_inum) {
-               if (ext4_create_journal(sb, es, journal_inum))
-                       goto failed_mount3;
+       } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+             EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+               printk(KERN_ERR "EXT4-fs: required journal recovery "
+                      "suppressed and not mounted read-only\n");
+               goto failed_mount4;
        } else {
-               if (!silent)
-                       printk(KERN_ERR
-                              "ext4: No journal on filesystem on %s\n",
-                              sb->s_id);
-               goto failed_mount3;
+               clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+               set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+               sbi->s_journal = NULL;
+               needs_recovery = 0;
+               goto no_journal;
        }
 
        if (ext4_blocks_count(es) > 0xffffffffULL &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
-               printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
+               printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
                goto failed_mount4;
        }
 
@@ -2334,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        default:
                break;
        }
+       set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+
+no_journal:
 
        if (test_opt(sb, NOBH)) {
                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2419,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
-       if (needs_recovery)
+       if (needs_recovery) {
                printk(KERN_INFO "EXT4-fs: recovery complete.\n");
-       ext4_mark_recovery_complete(sb, es);
-       printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
-              test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
-              test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
-              "writeback");
+               ext4_mark_recovery_complete(sb, es);
+       }
+       if (EXT4_SB(sb)->s_journal) {
+               if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+                       descr = " journalled data mode";
+               else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+                       descr = " ordered data mode";
+               else
+                       descr = " writeback data mode";
+       } else
+               descr = "out journal";
+
+       printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
+              sb->s_id, descr);
 
        lock_kernel();
        return 0;
@@ -2437,8 +2600,11 @@ cantfind_ext4:
        goto failed_mount;
 
 failed_mount4:
-       jbd2_journal_destroy(sbi->s_journal);
-       sbi->s_journal = NULL;
+       printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+       if (sbi->s_journal) {
+               jbd2_journal_destroy(sbi->s_journal);
+               sbi->s_journal = NULL;
+       }
 failed_mount3:
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2475,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (sbi->s_commit_interval)
-               journal->j_commit_interval = sbi->s_commit_interval;
-       /* We could also set up an ext4-specific default for the commit
-        * interval here, but for now we'll just fall back to the jbd
-        * default. */
+       journal->j_commit_interval = sbi->s_commit_interval;
+       journal->j_min_batch_time = sbi->s_min_batch_time;
+       journal->j_max_batch_time = sbi->s_max_batch_time;
 
        spin_lock(&journal->j_state_lock);
        if (test_opt(sb, BARRIER))
@@ -2499,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
        struct inode *journal_inode;
        journal_t *journal;
 
+       BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
        /* First, test for the existence of a valid inode on disk.  Bad
         * things happen if we iget() an unused inode, as the subsequent
         * iput() will try to delete it. */
@@ -2547,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        struct ext4_super_block *es;
        struct block_device *bdev;
 
+       BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
        bdev = ext4_blkdev_get(j_dev);
        if (bdev == NULL)
                return NULL;
 
        if (bd_claim(bdev, sb)) {
                printk(KERN_ERR
-                       "EXT4: failed to claim external journal device.\n");
+                       "EXT4-fs: failed to claim external journal device.\n");
                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
                return NULL;
        }
@@ -2634,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
        int err = 0;
        int really_read_only;
 
+       BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2718,48 +2888,6 @@ static int ext4_load_journal(struct super_block *sb,
        return 0;
 }
 
-static int ext4_create_journal(struct super_block *sb,
-                              struct ext4_super_block *es,
-                              unsigned int journal_inum)
-{
-       journal_t *journal;
-       int err;
-
-       if (sb->s_flags & MS_RDONLY) {
-               printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
-                               "create journal.\n");
-               return -EROFS;
-       }
-
-       journal = ext4_get_journal(sb, journal_inum);
-       if (!journal)
-               return -EINVAL;
-
-       printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
-              journal_inum);
-
-       err = jbd2_journal_create(journal);
-       if (err) {
-               printk(KERN_ERR "EXT4-fs: error creating journal.\n");
-               jbd2_journal_destroy(journal);
-               return -EIO;
-       }
-
-       EXT4_SB(sb)->s_journal = journal;
-
-       ext4_update_dynamic_rev(sb);
-       EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-       EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
-
-       es->s_journal_inum = cpu_to_le32(journal_inum);
-       sb->s_dirt = 1;
-
-       /* Make sure we flush the recovery flag to disk. */
-       ext4_commit_super(sb, es, 1);
-
-       return 0;
-}
-
 static void ext4_commit_super(struct super_block *sb,
                              struct ext4_super_block *es, int sync)
 {
@@ -2776,20 +2904,23 @@ static void ext4_commit_super(struct super_block *sb,
                 * be remapped.  Nothing we can do but to retry the
                 * write and hope for the best.
                 */
-               printk(KERN_ERR "ext4: previous I/O error to "
+               printk(KERN_ERR "EXT4-fs: previous I/O error to "
                       "superblock detected for %s.\n", sb->s_id);
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
        }
        es->s_wtime = cpu_to_le32(get_seconds());
-       ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
-       es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+       ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+                                       &EXT4_SB(sb)->s_freeblocks_counter));
+       es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+                                       &EXT4_SB(sb)->s_freeinodes_counter));
+
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
        if (sync) {
                sync_dirty_buffer(sbh);
                if (buffer_write_io_error(sbh)) {
-                       printk(KERN_ERR "ext4: I/O error while writing "
+                       printk(KERN_ERR "EXT4-fs: I/O error while writing "
                               "superblock for %s.\n", sb->s_id);
                        clear_buffer_write_io_error(sbh);
                        set_buffer_uptodate(sbh);
@@ -2808,6 +2939,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 {
        journal_t *journal = EXT4_SB(sb)->s_journal;
 
+       if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+               BUG_ON(journal != NULL);
+               return;
+       }
        jbd2_journal_lock_updates(journal);
        if (jbd2_journal_flush(journal) < 0)
                goto out;
@@ -2837,6 +2972,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
        int j_errno;
        const char *errstr;
 
+       BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
        journal = EXT4_SB(sb)->s_journal;
 
        /*
@@ -2869,14 +3006,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
 int ext4_force_commit(struct super_block *sb)
 {
        journal_t *journal;
-       int ret;
+       int ret = 0;
 
        if (sb->s_flags & MS_RDONLY)
                return 0;
 
        journal = EXT4_SB(sb)->s_journal;
-       sb->s_dirt = 0;
-       ret = ext4_journal_force_commit(journal);
+       if (journal) {
+               sb->s_dirt = 0;
+               ret = ext4_journal_force_commit(journal);
+       }
+
        return ret;
 }
 
@@ -2888,9 +3028,13 @@ int ext4_force_commit(struct super_block *sb)
  */
 static void ext4_write_super(struct super_block *sb)
 {
-       if (mutex_trylock(&sb->s_lock) != 0)
-               BUG();
-       sb->s_dirt = 0;
+       if (EXT4_SB(sb)->s_journal) {
+               if (mutex_trylock(&sb->s_lock) != 0)
+                       BUG();
+               sb->s_dirt = 0;
+       } else {
+               ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+       }
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2899,10 +3043,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 
        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
        sb->s_dirt = 0;
-       if (wait)
-               ret = ext4_force_commit(sb);
-       else
-               jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+       if (EXT4_SB(sb)->s_journal) {
+               if (wait)
+                       ret = ext4_force_commit(sb);
+               else
+                       jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+       } else {
+               ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+       }
        return ret;
 }
 
@@ -2917,15 +3065,17 @@ static void ext4_write_super_lockfs(struct super_block *sb)
        if (!(sb->s_flags & MS_RDONLY)) {
                journal_t *journal = EXT4_SB(sb)->s_journal;
 
-               /* Now we set up the journal barrier. */
-               jbd2_journal_lock_updates(journal);
+               if (journal) {
+                       /* Now we set up the journal barrier. */
+                       jbd2_journal_lock_updates(journal);
 
-               /*
-                * We don't want to clear needs_recovery flag when we failed
-                * to flush the journal.
-                */
-               if (jbd2_journal_flush(journal) < 0)
-                       return;
+                       /*
+                        * We don't want to clear needs_recovery flag when we
+                        * failed to flush the journal.
+                        */
+                       if (jbd2_journal_flush(journal) < 0)
+                               return;
+               }
 
                /* Journal blocked and flushed, clear needs_recovery flag. */
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2939,7 +3089,7 @@ static void ext4_write_super_lockfs(struct super_block *sb)
  */
 static void ext4_unlockfs(struct super_block *sb)
 {
-       if (!(sb->s_flags & MS_RDONLY)) {
+       if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
                lock_super(sb);
                /* Reser the needs_recovery flag before the fs is unlocked. */
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2957,6 +3107,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        unsigned long old_sb_flags;
        struct ext4_mount_options old_opts;
        ext4_group_t g;
+       unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        int err;
 #ifdef CONFIG_QUOTA
        int i;
@@ -2968,16 +3119,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
+       old_opts.s_min_batch_time = sbi->s_min_batch_time;
+       old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
        old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
        for (i = 0; i < MAXQUOTAS; i++)
                old_opts.s_qf_names[i] = sbi->s_qf_names[i];
 #endif
+       if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+               journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
 
        /*
         * Allow the "check" option to be passed as a remount option.
         */
-       if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+       if (!parse_options(data, sb, NULL, &journal_ioprio,
+                          &n_blocks_count, 1)) {
                err = -EINVAL;
                goto restore_opts;
        }
@@ -2990,7 +3146,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
        es = sbi->s_es;
 
-       ext4_init_journal_params(sb, sbi->s_journal);
+       if (sbi->s_journal) {
+               ext4_init_journal_params(sb, sbi->s_journal);
+               set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+       }
 
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
                n_blocks_count > ext4_blocks_count(es)) {
@@ -3019,17 +3178,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                         * We have to unlock super so that we can wait for
                         * transactions.
                         */
-                       unlock_super(sb);
-                       ext4_mark_recovery_complete(sb, es);
-                       lock_super(sb);
+                       if (sbi->s_journal) {
+                               unlock_super(sb);
+                               ext4_mark_recovery_complete(sb, es);
+                               lock_super(sb);
+                       }
                } else {
-                       __le32 ret;
+                       int ret;
                        if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                        ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
                                printk(KERN_WARNING "EXT4-fs: %s: couldn't "
                                       "remount RDWR because of unsupported "
-                                      "optional features (%x).\n",
-                                      sb->s_id, le32_to_cpu(ret));
+                                      "optional features (%x).\n", sb->s_id,
+                               (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
+                                       ~EXT4_FEATURE_RO_COMPAT_SUPP));
                                err = -EROFS;
                                goto restore_opts;
                        }
@@ -3046,7 +3208,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
                                        printk(KERN_ERR
               "EXT4-fs: ext4_remount: "
-               "Checksum for group %lu failed (%u!=%u)\n",
+               "Checksum for group %u failed (%u!=%u)\n",
                g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
                                        err = -EINVAL;
@@ -3075,7 +3237,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                         * been changed by e2fsck since we originally mounted
                         * the partition.)
                         */
-                       ext4_clear_journal_err(sb, es);
+                       if (sbi->s_journal)
+                               ext4_clear_journal_err(sb, es);
                        sbi->s_mount_state = le16_to_cpu(es->s_state);
                        if ((err = ext4_group_extend(sb, es, n_blocks_count)))
                                goto restore_opts;
@@ -3083,6 +3246,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                sb->s_flags &= ~MS_RDONLY;
                }
        }
+       if (sbi->s_journal == NULL)
+               ext4_commit_super(sb, es, 1);
+
 #ifdef CONFIG_QUOTA
        /* Release old quota file names */
        for (i = 0; i < MAXQUOTAS; i++)
@@ -3097,6 +3263,8 @@ restore_opts:
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
+       sbi->s_min_batch_time = old_opts.s_min_batch_time;
+       sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
        sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
        for (i = 0; i < MAXQUOTAS; i++) {
@@ -3359,7 +3527,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
         * When we journal data on quota file, we have to flush journal to see
         * all updates to the file when we bypass pagecache...
         */
-       if (ext4_should_journal_data(path.dentry->d_inode)) {
+       if (EXT4_SB(sb)->s_journal &&
+           ext4_should_journal_data(path.dentry->d_inode)) {
                /*
                 * We don't need to lock updates but journal_flush() could
                 * otherwise be livelocked...
@@ -3433,7 +3602,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();
 
-       if (!handle) {
+       if (EXT4_SB(sb)->s_journal && !handle) {
                printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
                        " cancelled because transaction is not started.\n",
                        (unsigned long long)off, (unsigned long long)len);
@@ -3458,7 +3627,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                flush_dcache_page(bh->b_page);
                unlock_buffer(bh);
                if (journal_quota)
-                       err = ext4_journal_dirty_metadata(handle, bh);
+                       err = ext4_handle_dirty_metadata(handle, NULL, bh);
                else {
                        /* Always do at least ordered writes for quotas */
                        err = ext4_jbd2_file_inode(handle, inode);
@@ -3512,18 +3681,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
 static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
                               size_t cnt, loff_t *ppos)
 {
-       unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+       unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
        char str[32];
-       unsigned long value;
 
        if (cnt >= sizeof(str))
                return -EINVAL;
        if (copy_from_user(str, buf, cnt))
                return -EFAULT;
-       value = simple_strtol(str, NULL, 0);
-       if (value < 0)
-               return -ERANGE;
-       *p = value;
+
+       *p = simple_strtoul(str, NULL, 0);
        return cnt;
 }
 
@@ -3614,7 +3780,7 @@ static void __exit exit_ext4_fs(void)
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
 module_init(init_ext4_fs)
 module_exit(exit_ext4_fs)
index 80626d5..157ce65 100644 (file)
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
        if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
                EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
                sb->s_dirt = 1;
-               ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+               ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        }
 }
 
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
        } else {
                le32_add_cpu(&BHDR(bh)->h_refcount, -1);
-               error = ext4_journal_dirty_metadata(handle, bh);
+               error = ext4_handle_dirty_metadata(handle, inode, bh);
                if (IS_SYNC(inode))
-                       handle->h_sync = 1;
+                       ext4_handle_sync(handle);
                DQUOT_FREE_BLOCK(inode, 1);
                ea_bdebug(bh, "refcount now=%d; releasing",
                          le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                        if (error == -EIO)
                                goto bad_block;
                        if (!error)
-                               error = ext4_journal_dirty_metadata(handle,
-                                                                   bs->bh);
+                               error = ext4_handle_dirty_metadata(handle,
+                                                                  inode,
+                                                                  bs->bh);
                        if (error)
                                goto cleanup;
                        goto inserted;
@@ -794,8 +795,9 @@ inserted:
                                ea_bdebug(new_bh, "reusing; refcount now=%d",
                                        le32_to_cpu(BHDR(new_bh)->h_refcount));
                                unlock_buffer(new_bh);
-                               error = ext4_journal_dirty_metadata(handle,
-                                                                   new_bh);
+                               error = ext4_handle_dirty_metadata(handle,
+                                                                  inode,
+                                                                  new_bh);
                                if (error)
                                        goto cleanup_dquot;
                        }
@@ -810,8 +812,8 @@ inserted:
                        /* We need to allocate a new block */
                        ext4_fsblk_t goal = ext4_group_first_block_no(sb,
                                                EXT4_I(inode)->i_block_group);
-                       ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
-                                                       goal, &error);
+                       ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+                                                 goal, NULL, &error);
                        if (error)
                                goto cleanup;
                        ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
                        set_buffer_uptodate(new_bh);
                        unlock_buffer(new_bh);
                        ext4_xattr_cache_insert(new_bh);
-                       error = ext4_journal_dirty_metadata(handle, new_bh);
+                       error = ext4_handle_dirty_metadata(handle,
+                                                          inode, new_bh);
                        if (error)
                                goto cleanup;
                }
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                 */
                is.iloc.bh = NULL;
                if (IS_SYNC(inode))
-                       handle->h_sync = 1;
+                       ext4_handle_sync(handle);
        }
 
 cleanup:
index 3569e0a..1a39ac3 100644 (file)
@@ -27,7 +27,7 @@
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
 
-static int set_task_ioprio(struct task_struct *task, int ioprio)
+int set_task_ioprio(struct task_struct *task, int ioprio)
 {
        int err;
        struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
        task_unlock(task);
        return err;
 }
+EXPORT_SYMBOL_GPL(set_task_ioprio);
 
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 {
index 9497718..17159ca 100644 (file)
@@ -249,16 +249,14 @@ restart:
        return ret;
 }
 
-#define NR_BATCH       64
-
 static void
-__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+__flush_batch(journal_t *journal, int *batch_count)
 {
        int i;
 
-       ll_rw_block(SWRITE, *batch_count, bhs);
+       ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
        for (i = 0; i < *batch_count; i++) {
-               struct buffer_head *bh = bhs[i];
+               struct buffer_head *bh = journal->j_chkpt_bhs[i];
                clear_buffer_jwrite(bh);
                BUFFER_TRACE(bh, "brelse");
                __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
-                       struct buffer_head **bhs, int *batch_count,
-                       transaction_t *transaction)
+                           int *batch_count, transaction_t *transaction)
 {
        struct buffer_head *bh = jh2bh(jh);
        int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                get_bh(bh);
                J_ASSERT_BH(bh, !buffer_jwrite(bh));
                set_buffer_jwrite(bh);
-               bhs[*batch_count] = bh;
+               journal->j_chkpt_bhs[*batch_count] = bh;
                __buffer_relink_io(jh);
                jbd_unlock_bh_state(bh);
                transaction->t_chp_stats.cs_written++;
                (*batch_count)++;
-               if (*batch_count == NR_BATCH) {
+               if (*batch_count == JBD2_NR_BATCH) {
                        spin_unlock(&journal->j_list_lock);
-                       __flush_batch(journal, bhs, batch_count);
+                       __flush_batch(journal, batch_count);
                        ret = 1;
                }
        }
@@ -388,7 +385,6 @@ restart:
        if (journal->j_checkpoint_transactions == transaction &&
                        transaction->t_tid == this_tid) {
                int batch_count = 0;
-               struct buffer_head *bhs[NR_BATCH];
                struct journal_head *jh;
                int retry = 0, err;
 
@@ -402,7 +398,7 @@ restart:
                                retry = 1;
                                break;
                        }
-                       retry = __process_buffer(journal, jh, bhs, &batch_count,
+                       retry = __process_buffer(journal, jh, &batch_count,
                                                 transaction);
                        if (retry < 0 && !result)
                                result = retry;
@@ -419,7 +415,7 @@ restart:
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                        }
-                       __flush_batch(journal, bhs, &batch_count);
+                       __flush_batch(journal, &batch_count);
                }
 
                if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
           safely remove this transaction from the log */
 
        __jbd2_journal_drop_transaction(journal, transaction);
+       kfree(transaction);
 
        /* Just in case anybody was waiting for more transactions to be
            checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
        J_ASSERT(journal->j_running_transaction != transaction);
 
        jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
-       kfree(transaction);
 }
index c8a1bac..62804e5 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
-       ret = submit_bh(WRITE, bh);
+       ret = submit_bh(WRITE_SYNC, bh);
        if (barrier_done)
                clear_buffer_ordered(bh);
 
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
                lock_buffer(bh);
                set_buffer_uptodate(bh);
                clear_buffer_dirty(bh);
-               ret = submit_bh(WRITE, bh);
+               ret = submit_bh(WRITE_SYNC, bh);
        }
        *cbh = bh;
        return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
  * This function along with journal_submit_commit_record
  * allows to write the commit record asynchronously.
  */
-static int journal_wait_on_commit_record(struct buffer_head *bh)
+static int journal_wait_on_commit_record(journal_t *journal,
+                                        struct buffer_head *bh)
 {
        int ret = 0;
 
+retry:
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);
+       if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
+               printk(KERN_WARNING
+                      "JBD2: wait_on_commit_record: sync failed on %s - "
+                      "disabling barriers\n", journal->j_devname);
+               spin_lock(&journal->j_state_lock);
+               journal->j_flags &= ~JBD2_BARRIER;
+               spin_unlock(&journal->j_state_lock);
+
+               lock_buffer(bh);
+               clear_buffer_dirty(bh);
+               set_buffer_uptodate(bh);
+               bh->b_end_io = journal_end_buffer_io_sync;
+
+               ret = submit_bh(WRITE_SYNC, bh);
+               if (ret) {
+                       unlock_buffer(bh);
+                       return ret;
+               }
+               goto retry;
+       }
 
        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        int flags;
        int err;
        unsigned long long blocknr;
+       ktime_t start_time;
+       u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
-       int i;
+       int i, to_free = 0;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
+       start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
@@ -803,7 +829,7 @@ wait_for_iobuf:
                        __jbd2_journal_abort_hard(journal);
        }
        if (!err && !is_journal_aborted(journal))
-               err = journal_wait_on_commit_record(cbh);
+               err = journal_wait_on_commit_record(journal, cbh);
 
        if (err)
                jbd2_journal_abort(journal, err);
@@ -981,14 +1007,23 @@ restart_loop:
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
-       spin_unlock(&journal->j_state_lock);
+       commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 
-       if (journal->j_commit_callback)
-               journal->j_commit_callback(journal, commit_transaction);
+       /*
+        * weight the commit time higher than the average time so we don't
+        * react too strongly to vast changes in the commit time
+        */
+       if (likely(journal->j_average_commit_time))
+               journal->j_average_commit_time = (commit_time +
+                               journal->j_average_commit_time*3) / 4;
+       else
+               journal->j_average_commit_time = commit_time;
+       spin_unlock(&journal->j_state_lock);
 
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
+               to_free = 1;
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
@@ -1007,11 +1042,16 @@ restart_loop:
        }
        spin_unlock(&journal->j_list_lock);
 
+       if (journal->j_commit_callback)
+               journal->j_commit_callback(journal, commit_transaction);
+
        trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-                  journal->j_devname, journal->j_commit_sequence,
+                  journal->j_devname, commit_transaction->t_tid,
                   journal->j_tail_sequence);
        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);
+       if (to_free)
+               kfree(commit_transaction);
 
        wake_up(&journal->j_wait_done_commit);
 }
index f6bff9d..5667530 100644 (file)
@@ -40,6 +40,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/div64.h>
 
 EXPORT_SYMBOL(jbd2_journal_start);
 EXPORT_SYMBOL(jbd2_journal_restart);
@@ -66,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
 EXPORT_SYMBOL(jbd2_journal_check_used_features);
 EXPORT_SYMBOL(jbd2_journal_check_available_features);
 EXPORT_SYMBOL(jbd2_journal_set_features);
-EXPORT_SYMBOL(jbd2_journal_create);
 EXPORT_SYMBOL(jbd2_journal_load);
 EXPORT_SYMBOL(jbd2_journal_destroy);
 EXPORT_SYMBOL(jbd2_journal_abort);
@@ -132,8 +132,9 @@ static int kjournald2(void *arg)
        journal->j_task = current;
        wake_up(&journal->j_wait_done_commit);
 
-       printk(KERN_INFO "kjournald2 starting.  Commit interval %ld seconds\n",
-                       journal->j_commit_interval / HZ);
+       printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
+              "commit interval %ld seconds\n", current->pid,
+              journal->j_devname, journal->j_commit_interval / HZ);
 
        /*
         * And now, wait forever for commit wakeup events.
@@ -650,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
                return NULL;
 
        bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+       if (!bh)
+               return NULL;
        lock_buffer(bh);
        memset(bh->b_data, 0, journal->j_blocksize);
        set_buffer_uptodate(bh);
@@ -843,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
            jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
        seq_printf(seq, "  %ums logging transaction\n",
            jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+       seq_printf(seq, "  %luus average transaction commit time\n",
+                  do_div(s->journal->j_average_commit_time, 1000));
        seq_printf(seq, "  %lu handles per transaction\n",
            s->stats->u.run.rs_handle_count / s->stats->ts_tid);
        seq_printf(seq, "  %lu blocks per transaction\n",
@@ -980,6 +985,8 @@ static journal_t * journal_init_common (void)
        spin_lock_init(&journal->j_state_lock);
 
        journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+       journal->j_min_batch_time = 0;
+       journal->j_max_batch_time = 15000; /* 15ms */
 
        /* The journal is marked for error until we succeed with recovery! */
        journal->j_flags = JBD2_ABORT;
@@ -1035,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 
        /* journal descriptor can store up to n blocks -bzzz */
        journal->j_blocksize = blocksize;
+       jbd2_stats_proc_init(journal);
        n = journal->j_blocksize / sizeof(journal_block_tag_t);
        journal->j_wbufsize = n;
        journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
        if (!journal->j_wbuf) {
                printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
                        __func__);
-               kfree(journal);
-               journal = NULL;
-               goto out;
+               goto out_err;
        }
        journal->j_dev = bdev;
        journal->j_fs_dev = fs_dev;
@@ -1053,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
        p = journal->j_devname;
        while ((p = strchr(p, '/')))
                *p = '!';
-       jbd2_stats_proc_init(journal);
 
        bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-       J_ASSERT(bh != NULL);
+       if (!bh) {
+               printk(KERN_ERR
+                      "%s: Cannot get buffer for journal superblock\n",
+                      __func__);
+               goto out_err;
+       }
        journal->j_sb_buffer = bh;
        journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
        return journal;
+out_err:
+       jbd2_stats_proc_exit(journal);
+       kfree(journal);
+       return NULL;
 }
 
 /**
@@ -1108,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
        if (!journal->j_wbuf) {
                printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
                        __func__);
-               jbd2_stats_proc_exit(journal);
-               kfree(journal);
-               return NULL;
+               goto out_err;
        }
 
        err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1118,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
        if (err) {
                printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
                       __func__);
-               jbd2_stats_proc_exit(journal);
-               kfree(journal);
-               return NULL;
+               goto out_err;
        }
 
        bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-       J_ASSERT(bh != NULL);
+       if (!bh) {
+               printk(KERN_ERR
+                      "%s: Cannot get buffer for journal superblock\n",
+                      __func__);
+               goto out_err;
+       }
        journal->j_sb_buffer = bh;
        journal->j_superblock = (journal_superblock_t *)bh->b_data;
 
        return journal;
+out_err:
+       jbd2_stats_proc_exit(journal);
+       kfree(journal);
+       return NULL;
 }
 
 /*
@@ -1177,77 +1196,6 @@ static int journal_reset(journal_t *journal)
 }
 
 /**
- * int jbd2_journal_create() - Initialise the new journal file
- * @journal: Journal to create. This structure must have been initialised
- *
- * Given a journal_t structure which tells us which disk blocks we can
- * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.
- **/
-int jbd2_journal_create(journal_t *journal)
-{
-       unsigned long long blocknr;
-       struct buffer_head *bh;
-       journal_superblock_t *sb;
-       int i, err;
-
-       if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
-               printk (KERN_ERR "Journal length (%d blocks) too short.\n",
-                       journal->j_maxlen);
-               journal_fail_superblock(journal);
-               return -EINVAL;
-       }
-
-       if (journal->j_inode == NULL) {
-               /*
-                * We don't know what block to start at!
-                */
-               printk(KERN_EMERG
-                      "%s: creation of journal on external device!\n",
-                      __func__);
-               BUG();
-       }
-
-       /* Zero out the entire journal on disk.  We cannot afford to
-          have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
-       jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
-       for (i = 0; i < journal->j_maxlen; i++) {
-               err = jbd2_journal_bmap(journal, i, &blocknr);
-               if (err)
-                       return err;
-               bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-               lock_buffer(bh);
-               memset (bh->b_data, 0, journal->j_blocksize);
-               BUFFER_TRACE(bh, "marking dirty");
-               mark_buffer_dirty(bh);
-               BUFFER_TRACE(bh, "marking uptodate");
-               set_buffer_uptodate(bh);
-               unlock_buffer(bh);
-               __brelse(bh);
-       }
-
-       sync_blockdev(journal->j_dev);
-       jbd_debug(1, "JBD: journal cleared.\n");
-
-       /* OK, fill in the initial static fields in the new superblock */
-       sb = journal->j_superblock;
-
-       sb->s_header.h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
-       sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
-
-       sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
-       sb->s_maxlen    = cpu_to_be32(journal->j_maxlen);
-       sb->s_first     = cpu_to_be32(1);
-
-       journal->j_transaction_sequence = 1;
-
-       journal->j_flags &= ~JBD2_ABORT;
-       journal->j_format_version = 2;
-
-       return journal_reset(journal);
-}
-
-/**
  * void jbd2_journal_update_superblock() - Update journal sb on disk.
  * @journal: The journal to update.
  * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1491,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
        spin_lock(&journal->j_list_lock);
        while (journal->j_checkpoint_transactions != NULL) {
                spin_unlock(&journal->j_list_lock);
+               mutex_lock(&journal->j_checkpoint_mutex);
                jbd2_log_do_checkpoint(journal);
+               mutex_unlock(&journal->j_checkpoint_mutex);
                spin_lock(&journal->j_list_lock);
        }
 
index 4f925a4..46b4e34 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
+       transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
@@ -1240,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-       int old_handle_count, err;
+       int err;
        pid_t pid;
 
        J_ASSERT(journal_current_handle() == handle);
@@ -1263,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
        /*
         * Implement synchronous transaction batching.  If the handle
         * was synchronous, don't force a commit immediately.  Let's
-        * yield and let another thread piggyback onto this transaction.
-        * Keep doing that while new threads continue to arrive.
-        * It doesn't cost much - we're about to run a commit and sleep
-        * on IO anyway.  Speeds up many-threaded, many-dir operations
-        * by 30x or more...
+        * yield and let another thread piggyback onto this
+        * transaction.  Keep doing that while new threads continue to
+        * arrive.  It doesn't cost much - we're about to run a commit
+        * and sleep on IO anyway.  Speeds up many-threaded, many-dir
+        * operations by 30x or more...
+        *
+        * We try and optimize the sleep time against what the
+        * underlying disk can do, instead of having a static sleep
+        * time.  This is useful for the case where our storage is so
+        * fast that it is more optimal to go ahead and force a flush
+        * and wait for the transaction to be committed than it is to
+        * wait for an arbitrary amount of time for new writers to
+        * join the transaction.  We achieve this by measuring how
+        * long it takes to commit a transaction, and compare it with
+        * how long this transaction has been running, and if run time
+        * < commit time then we sleep for the delta and commit.  This
+        * greatly helps super fast disks that would see slowdowns as
+        * more threads started doing fsyncs.
         *
-        * But don't do this if this process was the most recent one to
-        * perform a synchronous write.  We do this to detect the case where a
-        * single process is doing a stream of sync writes.  No point in waiting
-        * for joiners in that case.
+        * But don't do this if this process was the most recent one
+        * to perform a synchronous write.  We do this to detect the
+        * case where a single process is doing a stream of sync
+        * writes.  No point in waiting for joiners in that case.
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
+               u64 commit_time, trans_time;
+
                journal->j_last_sync_writer = pid;
-               do {
-                       old_handle_count = transaction->t_handle_count;
-                       schedule_timeout_uninterruptible(1);
-               } while (old_handle_count != transaction->t_handle_count);
+
+               spin_lock(&journal->j_state_lock);
+               commit_time = journal->j_average_commit_time;
+               spin_unlock(&journal->j_state_lock);
+
+               trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+                                                  transaction->t_start_time));
+
+               commit_time = max_t(u64, commit_time,
+                                   1000*journal->j_min_batch_time);
+               commit_time = min_t(u64, commit_time,
+                                   1000*journal->j_max_batch_time);
+
+               if (trans_time < commit_time) {
+                       ktime_t expires = ktime_add_ns(ktime_get(),
+                                                      commit_time);
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+               }
        }
 
        current->journal_info = NULL;
index 7d67387..ed080c4 100644 (file)
@@ -810,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
                }
 
                s->s_flags |= MS_ACTIVE;
+               bdev->bd_super = s;
        }
 
        return simple_set_mnt(mnt, s);
@@ -829,6 +830,7 @@ void kill_block_super(struct super_block *sb)
        struct block_device *bdev = sb->s_bdev;
        fmode_t mode = sb->s_mode;
 
+       bdev->bd_super = 0;
        generic_shutdown_super(sb);
        sync_blockdev(bdev);
        close_bdev_exclusive(bdev, mode);
index d76800f..dd495b8 100644 (file)
@@ -378,6 +378,13 @@ struct ext3_inode {
 #define        EXT3_ORPHAN_FS                  0x0004  /* Orphans being recovered */
 
 /*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH         0x0001  /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH       0x0002  /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS                0x0004  /* to test development code */
+
+/*
  * Mount flags
  */
 #define EXT3_MOUNT_CHECK               0x00001 /* Do mount-time checks */
@@ -513,7 +520,23 @@ struct ext3_super_block {
        __u16   s_reserved_word_pad;
        __le32  s_default_mount_opts;
        __le32  s_first_meta_bg;        /* First metablock block group */
-       __u32   s_reserved[190];        /* Padding to the end of the block */
+       __le32  s_mkfs_time;            /* When the filesystem was created */
+       __le32  s_jnl_blocks[17];       /* Backup of the journal inode */
+       /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+/*150*/        __le32  s_blocks_count_hi;      /* Blocks count */
+       __le32  s_r_blocks_count_hi;    /* Reserved blocks count */
+       __le32  s_free_blocks_count_hi; /* Free blocks count */
+       __le16  s_min_extra_isize;      /* All inodes have at least # bytes */
+       __le16  s_want_extra_isize;     /* New inodes should reserve # bytes */
+       __le32  s_flags;                /* Miscellaneous flags */
+       __le16  s_raid_stride;          /* RAID stride */
+       __le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+       __le64  s_mmp_block;            /* Block for multi-mount protection */
+       __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
+       __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
+       __u8    s_reserved_char_pad2;
+       __le16  s_reserved_pad;
+       __u32   s_reserved[162];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -718,6 +741,9 @@ static inline __le16 ext3_rec_len_to_disk(unsigned len)
 #define DX_HASH_LEGACY         0
 #define DX_HASH_HALF_MD4       1
 #define DX_HASH_TEA            2
+#define DX_HASH_LEGACY_UNSIGNED        3
+#define DX_HASH_HALF_MD4_UNSIGNED      4
+#define DX_HASH_TEA_UNSIGNED           5
 
 #ifdef __KERNEL__
 
index 76fdc0f..f07f34d 100644 (file)
@@ -57,6 +57,7 @@ struct ext3_sb_info {
        u32 s_next_generation;
        u32 s_hash_seed[4];
        int s_def_hash_version;
+       int s_hash_unsigned;    /* 3 if hash should be signed, 0 if not */
        struct percpu_counter s_freeblocks_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
index e38a64d..0b87b29 100644 (file)
@@ -565,6 +565,7 @@ struct address_space {
 struct block_device {
        dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
        struct inode *          bd_inode;       /* will die */
+       struct super_block *    bd_super;
        int                     bd_openers;
        struct mutex            bd_mutex;       /* open/close mutex */
        struct semaphore        bd_mount_sem;
@@ -1389,6 +1390,7 @@ struct super_operations {
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
+       int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
 };
 
 /*
index f98a656..76dad48 100644 (file)
@@ -86,4 +86,6 @@ static inline int task_nice_ioclass(struct task_struct *task)
  */
 extern int ioprio_best(unsigned short aprio, unsigned short bprio);
 
+extern int set_task_ioprio(struct task_struct *task, int ioprio);
+
 #endif
index 3445647..b45109c 100644 (file)
@@ -638,6 +638,11 @@ struct transaction_s
        unsigned long           t_expires;
 
        /*
+        * When this transaction started, in nanoseconds [no locking]
+        */
+       ktime_t                 t_start_time;
+
+       /*
         * How many handles used this transaction? [t_handle_lock]
         */
        int t_handle_count;
@@ -682,6 +687,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
        return end + (MAX_JIFFY_OFFSET - start);
 }
 
+#define JBD2_NR_BATCH  64
+
 /**
  * struct journal_s - The journal_s type is the concrete type associated with
  *     journal_t.
@@ -826,6 +833,14 @@ struct journal_s
        struct mutex            j_checkpoint_mutex;
 
        /*
+        * List of buffer heads used by the checkpoint routine.  This
+        * was moved from jbd2_log_do_checkpoint() to reduce stack
+        * usage.  Access to this array is controlled by the
+        * j_checkpoint_mutex.  [j_checkpoint_mutex]
+        */
+       struct buffer_head      *j_chkpt_bhs[JBD2_NR_BATCH];
+       
+       /*
         * Journal head: identifies the first unused block in the journal.
         * [j_state_lock]
         */
@@ -939,8 +954,26 @@ struct journal_s
        struct buffer_head      **j_wbuf;
        int                     j_wbufsize;
 
+       /*
+        * this is the pid of hte last person to run a synchronous operation
+        * through the journal
+        */
        pid_t                   j_last_sync_writer;
 
+       /*
+        * the average amount of time in nanoseconds it takes to commit a
+        * transaction to disk. [j_state_lock]
+        */
+       u64                     j_average_commit_time;
+
+       /*
+        * minimum and maximum times that we should wait for
+        * additional filesystem operations to get batched into a
+        * synchronous handle in microseconds
+        */
+       u32                     j_min_batch_time;
+       u32                     j_max_batch_time;
+
        /* This function is called when a transaction is closed */
        void                    (*j_commit_callback)(journal_t *,
                                                     transaction_t *);
@@ -1102,7 +1135,6 @@ extern int           jbd2_journal_set_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
 extern void       jbd2_journal_clear_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
-extern int        jbd2_journal_create     (journal_t *);
 extern int        jbd2_journal_load       (journal_t *journal);
 extern int        jbd2_journal_destroy    (journal_t *);
 extern int        jbd2_journal_recover    (journal_t *journal);
@@ -1177,8 +1209,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 
 void __jbd2_log_wait_for_space(journal_t *journal);
-extern void    __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
-extern int     jbd2_cleanup_journal_tail(journal_t *);
+extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
+extern int jbd2_cleanup_journal_tail(journal_t *);
 
 /* Debugging code only: */