ext4: Fix Direct I/O locking
[safe/jmp/linux-2.6] / fs / ext4 / inode.c
index b83f91e..7dd9b50 100644 (file)
@@ -105,7 +105,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
  */
 static unsigned long blocks_for_truncate(struct inode *inode)
 {
-       unsigned long needed;
+       ext4_lblk_t needed;
 
        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 
@@ -243,13 +243,6 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
        p->bh = bh;
 }
 
-static int verify_chain(Indirect *from, Indirect *to)
-{
-       while (from <= to && from->key == *from->p)
-               from++;
-       return (from > to);
-}
-
 /**
  *     ext4_block_to_path - parse the block number into array of offsets
  *     @inode: inode in question (we are only interested in its superblock)
@@ -282,7 +275,8 @@ static int verify_chain(Indirect *from, Indirect *to)
  */
 
 static int ext4_block_to_path(struct inode *inode,
-                       long i_block, int offsets[4], int *boundary)
+                       ext4_lblk_t i_block,
+                       ext4_lblk_t offsets[4], int *boundary)
 {
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -313,7 +307,10 @@ static int ext4_block_to_path(struct inode *inode,
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
-               ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
+               ext4_warning(inode->i_sb, "ext4_block_to_path",
+                               "block %lu > max",
+                               i_block + direct_blocks +
+                               indirect_blocks + double_blocks);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -344,12 +341,14 @@ static int ext4_block_to_path(struct inode *inode,
  *             (pointer to last triple returned, *@err == 0)
  *     or when it gets an IO error reading an indirect block
  *             (ditto, *@err == -EIO)
- *     or when it notices that chain had been changed while it was reading
- *             (ditto, *@err == -EAGAIN)
  *     or when it reads all @depth-1 indirect blocks successfully and finds
  *     the whole chain, all way to the data (returns %NULL, *err == 0).
+ *
+ *      Need to be called with
+ *      down_read(&EXT4_I(inode)->i_data_sem)
  */
-static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+                                ext4_lblk_t  *offsets,
                                 Indirect chain[4], int *err)
 {
        struct super_block *sb = inode->i_sb;
@@ -365,9 +364,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
                bh = sb_bread(sb, le32_to_cpu(p->key));
                if (!bh)
                        goto failure;
-               /* Reader: pointers */
-               if (!verify_chain(chain, p))
-                       goto changed;
                add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
@@ -375,10 +371,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
        }
        return NULL;
 
-changed:
-       brelse(bh);
-       *err = -EAGAIN;
-       goto no_block;
 failure:
        *err = -EIO;
 no_block:
@@ -437,16 +429,13 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
  *     ext4_find_goal - find a prefered place for allocation.
  *     @inode: owner
  *     @block:  block we want
- *     @chain:  chain of indirect blocks
  *     @partial: pointer to the last triple within a chain
- *     @goal:  place to store the result.
  *
  *     Normally this function find the prefered place for block allocation,
- *     stores it in *@goal and returns zero.
+ *     returns it.
  */
-
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
-               Indirect chain[4], Indirect *partial)
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
+               Indirect *partial)
 {
        struct ext4_block_alloc_info *block_i;
 
@@ -559,7 +548,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
        return ret;
 failed_out:
        for (i = 0; i <index; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1);
+               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
        return ret;
 }
 
@@ -590,7 +579,7 @@ failed_out:
  */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                        int indirect_blks, int *blks, ext4_fsblk_t goal,
-                       int *offsets, Indirect *branch)
+                       ext4_lblk_t *offsets, Indirect *branch)
 {
        int blocksize = inode->i_sb->s_blocksize;
        int i, n = 0;
@@ -658,9 +647,9 @@ failed:
                ext4_journal_forget(handle, branch[i].bh);
        }
        for (i = 0; i <indirect_blks; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1);
+               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 
-       ext4_free_blocks(handle, inode, new_blocks[i], num);
+       ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
 
        return err;
 }
@@ -680,7 +669,7 @@ failed:
  * chain to new block and return 0.
  */
 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
-                       long block, Indirect *where, int num, int blks)
+                       ext4_lblk_t block, Indirect *where, int num, int blks)
 {
        int i;
        int err = 0;
@@ -757,9 +746,10 @@ err_out:
        for (i = 1; i <= num; i++) {
                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
                ext4_journal_forget(handle, where[i].bh);
-               ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
+               ext4_free_blocks(handle, inode,
+                                       le32_to_cpu(where[i-1].key), 1, 0);
        }
-       ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+       ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
 
        return err;
 }
@@ -782,14 +772,19 @@ err_out:
  * return > 0, # of blocks mapped or allocated.
  * return = 0, if plain lookup failed.
  * return < 0, error case.
+ *
+ *
+ * Need to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
+ * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-               sector_t iblock, unsigned long maxblocks,
+               ext4_lblk_t iblock, unsigned long maxblocks,
                struct buffer_head *bh_result,
                int create, int extend_disksize)
 {
        int err = -EIO;
-       int offsets[4];
+       ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        ext4_fsblk_t goal;
@@ -803,7 +798,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 
        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
        J_ASSERT(handle != NULL || create == 0);
-       depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
+       depth = ext4_block_to_path(inode, iblock, offsets,
+                                       &blocks_to_boundary);
 
        if (depth == 0)
                goto out;
@@ -819,18 +815,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
                while (count < maxblocks && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;
 
-                       if (!verify_chain(chain, partial)) {
-                               /*
-                                * Indirect block might be removed by
-                                * truncate while we were reading it.
-                                * Handling of that case: forget what we've
-                                * got now. Flag the err as EAGAIN, so it
-                                * will reread.
-                                */
-                               err = -EAGAIN;
-                               count = 0;
-                               break;
-                       }
                        blk = le32_to_cpu(*(chain[depth-1].p + count));
 
                        if (blk == first_block + count)
@@ -838,44 +822,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
                        else
                                break;
                }
-               if (err != -EAGAIN)
-                       goto got_it;
+               goto got_it;
        }
 
        /* Next simple case - plain lookup or failed read of indirect block */
        if (!create || err == -EIO)
                goto cleanup;
 
-       mutex_lock(&ei->truncate_mutex);
-
-       /*
-        * If the indirect block is missing while we are reading
-        * the chain(ext4_get_branch() returns -EAGAIN err), or
-        * if the chain has been changed after we grab the semaphore,
-        * (either because another process truncated this branch, or
-        * another get_block allocated this branch) re-grab the chain to see if
-        * the request block has been allocated or not.
-        *
-        * Since we already block the truncate/other get_block
-        * at this point, we will have the current copy of the chain when we
-        * splice the branch into the tree.
-        */
-       if (err == -EAGAIN || !verify_chain(chain, partial)) {
-               while (partial > chain) {
-                       brelse(partial->bh);
-                       partial--;
-               }
-               partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-               if (!partial) {
-                       count++;
-                       mutex_unlock(&ei->truncate_mutex);
-                       if (err)
-                               goto cleanup;
-                       clear_buffer_new(bh_result);
-                       goto got_it;
-               }
-       }
-
        /*
         * Okay, we need to do block allocation.  Lazily initialize the block
         * allocation info here if necessary
@@ -883,7 +836,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
                ext4_init_block_alloc_info(inode);
 
-       goal = ext4_find_goal(inode, iblock, chain, partial);
+       goal = ext4_find_goal(inode, iblock, partial);
 
        /* the number of blocks need to allocate for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;
@@ -911,13 +864,12 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
                err = ext4_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
        /*
-        * i_disksize growing is protected by truncate_mutex.  Don't forget to
+        * i_disksize growing is protected by i_data_sem.  Don't forget to
         * protect it if you're about to implement concurrent
         * ext4_get_block() -bzzz
        */
        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
                ei->i_disksize = inode->i_size;
-       mutex_unlock(&ei->truncate_mutex);
        if (err)
                goto cleanup;
 
@@ -940,55 +892,87 @@ out:
        return err;
 }
 
-#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+/*
+ * Number of credits we need for writing DIO_MAX_BLOCKS:
+ * We need sb + group descriptor + bitmap + inode -> 4
+ * For B blocks with A block pointers per block we need:
+ * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
+ * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ */
+#define DIO_CREDITS 25
+
+int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
+                       unsigned long max_blocks, struct buffer_head *bh,
+                       int create, int extend_disksize)
+{
+       int retval;
+       /*
+        * Try to see if we can get  the block without requesting
+        * for new file system block.
+        */
+       down_read((&EXT4_I(inode)->i_data_sem));
+       if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+               retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                               bh, 0, 0);
+       } else {
+               retval = ext4_get_blocks_handle(handle,
+                               inode, block, max_blocks, bh, 0, 0);
+       }
+       up_read((&EXT4_I(inode)->i_data_sem));
+       if (!create || (retval > 0))
+               return retval;
+
+       /*
+        * We need to allocate new blocks which will result
+        * in i_data update
+        */
+       down_write((&EXT4_I(inode)->i_data_sem));
+       /*
+        * We need to check for EXT4 here because migrate
+        * could have changed the inode type in between
+        */
+       if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+               retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                               bh, create, extend_disksize);
+       } else {
+               retval = ext4_get_blocks_handle(handle, inode, block,
+                               max_blocks, bh, create, extend_disksize);
+       }
+       up_write((&EXT4_I(inode)->i_data_sem));
+       return retval;
+}
 
 static int ext4_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
 {
        handle_t *handle = ext4_journal_current_handle();
-       int ret = 0;
+       int ret = 0, started = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 
-       if (!create)
-               goto get_block;         /* A read */
-
-       if (max_blocks == 1)
-               goto get_block;         /* A single block get */
-
-       if (handle->h_transaction->t_state == T_LOCKED) {
-               /*
-                * Huge direct-io writes can hold off commits for long
-                * periods of time.  Let this commit run.
-                */
-               ext4_journal_stop(handle);
-               handle = ext4_journal_start(inode, DIO_CREDITS);
-               if (IS_ERR(handle))
+       if (create && !handle) {
+               /* Direct IO write... */
+               if (max_blocks > DIO_MAX_BLOCKS)
+                       max_blocks = DIO_MAX_BLOCKS;
+               handle = ext4_journal_start(inode, DIO_CREDITS +
+                             2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+               if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
-               goto get_block;
-       }
-
-       if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
-               /*
-                * Getting low on buffer credits...
-                */
-               ret = ext4_journal_extend(handle, DIO_CREDITS);
-               if (ret > 0) {
-                       /*
-                        * Couldn't extend the transaction.  Start a new one.
-                        */
-                       ret = ext4_journal_restart(handle, DIO_CREDITS);
+                       goto out;
                }
+               started = 1;
        }
 
-get_block:
-       if (ret == 0) {
-               ret = ext4_get_blocks_wrap(handle, inode, iblock,
+       ret = ext4_get_blocks_wrap(handle, inode, iblock,
                                        max_blocks, bh_result, create, 0);
-               if (ret > 0) {
-                       bh_result->b_size = (ret << inode->i_blkbits);
-                       ret = 0;
-               }
+       if (ret > 0) {
+               bh_result->b_size = (ret << inode->i_blkbits);
+               ret = 0;
        }
+       if (started)
+               ext4_journal_stop(handle);
+out:
        return ret;
 }
 
@@ -996,7 +980,7 @@ get_block:
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
-                               long block, int create, int *errp)
+                               ext4_lblk_t block, int create, int *errp)
 {
        struct buffer_head dummy;
        int fatal = 0, err;
@@ -1027,7 +1011,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                }
                if (buffer_new(&dummy)) {
                        J_ASSERT(create != 0);
-                       J_ASSERT(handle != 0);
+                       J_ASSERT(handle != NULL);
 
                        /*
                         * Now that we do not always journal data, we should
@@ -1063,7 +1047,7 @@ err:
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
-                              int block, int create, int *err)
+                              ext4_lblk_t block, int create, int *err)
 {
        struct buffer_head * bh;
 
@@ -1146,34 +1130,50 @@ static int do_journal_get_write_access(handle_t *handle,
        return ext4_journal_get_write_access(handle, bh);
 }
 
-static int ext4_prepare_write(struct file *file, struct page *page,
-                             unsigned from, unsigned to)
+static int ext4_write_begin(struct file *file, struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned flags,
+                               struct page **pagep, void **fsdata)
 {
-       struct inode *inode = page->mapping->host;
+       struct inode *inode = mapping->host;
        int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
        handle_t *handle;
        int retries = 0;
+       struct page *page;
+       pgoff_t index;
+       unsigned from, to;
+
+       index = pos >> PAGE_CACHE_SHIFT;
+       from = pos & (PAGE_CACHE_SIZE - 1);
+       to = from + len;
 
 retry:
-       handle = ext4_journal_start(inode, needed_blocks);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto out;
+       page = __grab_cache_page(mapping, index);
+       if (!page)
+               return -ENOMEM;
+       *pagep = page;
+
+       handle = ext4_journal_start(inode, needed_blocks);
+       if (IS_ERR(handle)) {
+               unlock_page(page);
+               page_cache_release(page);
+               ret = PTR_ERR(handle);
+               goto out;
        }
-       if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-               ret = nobh_prepare_write(page, from, to, ext4_get_block);
-       else
-               ret = block_prepare_write(page, from, to, ext4_get_block);
-       if (ret)
-               goto prepare_write_failed;
 
-       if (ext4_should_journal_data(inode)) {
+       ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+                                                       ext4_get_block);
+
+       if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
                                from, to, NULL, do_journal_get_write_access);
        }
-prepare_write_failed:
-       if (ret)
+
+       if (ret) {
                ext4_journal_stop(handle);
+               unlock_page(page);
+               page_cache_release(page);
+       }
+
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
 out:
@@ -1185,12 +1185,12 @@ int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
        int err = jbd2_journal_dirty_data(handle, bh);
        if (err)
                ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
-                                               bh, handle,err);
+                                               bh, handle, err);
        return err;
 }
 
-/* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+/* For write_end() in data=journal mode */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
@@ -1199,84 +1199,130 @@ static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
+ * Generic write_end handler for ordered and writeback ext4 journal modes.
+ * We can't use generic_write_end, because that unlocks the page and we need to
+ * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
+ * after block_write_end.
+ */
+static int ext4_generic_write_end(struct file *file,
+                               struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned copied,
+                               struct page *page, void *fsdata)
+{
+       struct inode *inode = file->f_mapping->host;
+
+       copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+       if (pos+copied > inode->i_size) {
+               i_size_write(inode, pos+copied);
+               mark_inode_dirty(inode);
+       }
+
+       return copied;
+}
+
+/*
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
  *
  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
  * buffers are managed internally.
  */
-static int ext4_ordered_commit_write(struct file *file, struct page *page,
-                            unsigned from, unsigned to)
+static int ext4_ordered_write_end(struct file *file,
+                               struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned copied,
+                               struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
-       struct inode *inode = page->mapping->host;
+       struct inode *inode = file->f_mapping->host;
+       unsigned from, to;
        int ret = 0, ret2;
 
+       from = pos & (PAGE_CACHE_SIZE - 1);
+       to = from + len;
+
        ret = walk_page_buffers(handle, page_buffers(page),
                from, to, NULL, ext4_journal_dirty_data);
 
        if (ret == 0) {
                /*
-                * generic_commit_write() will run mark_inode_dirty() if i_size
+                * generic_write_end() will run mark_inode_dirty() if i_size
                 * changes.  So let's piggyback the i_disksize mark_inode_dirty
                 * into that.
                 */
                loff_t new_i_size;
 
-               new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+               new_i_size = pos + copied;
                if (new_i_size > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = new_i_size;
-               ret = generic_commit_write(file, page, from, to);
+               copied = ext4_generic_write_end(file, mapping, pos, len, copied,
+                                                       page, fsdata);
+               if (copied < 0)
+                       ret = copied;
        }
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-       return ret;
+       unlock_page(page);
+       page_cache_release(page);
+
+       return ret ? ret : copied;
 }
 
-static int ext4_writeback_commit_write(struct file *file, struct page *page,
-                            unsigned from, unsigned to)
+static int ext4_writeback_write_end(struct file *file,
+                               struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned copied,
+                               struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
-       struct inode *inode = page->mapping->host;
+       struct inode *inode = file->f_mapping->host;
        int ret = 0, ret2;
        loff_t new_i_size;
 
-       new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+       new_i_size = pos + copied;
        if (new_i_size > EXT4_I(inode)->i_disksize)
                EXT4_I(inode)->i_disksize = new_i_size;
 
-       if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-               ret = nobh_commit_write(file, page, from, to);
-       else
-               ret = generic_commit_write(file, page, from, to);
+       copied = ext4_generic_write_end(file, mapping, pos, len, copied,
+                                                       page, fsdata);
+       if (copied < 0)
+               ret = copied;
 
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-       return ret;
+       unlock_page(page);
+       page_cache_release(page);
+
+       return ret ? ret : copied;
 }
 
-static int ext4_journalled_commit_write(struct file *file,
-                       struct page *page, unsigned from, unsigned to)
+static int ext4_journalled_write_end(struct file *file,
+                               struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned copied,
+                               struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
-       struct inode *inode = page->mapping->host;
+       struct inode *inode = mapping->host;
        int ret = 0, ret2;
        int partial = 0;
-       loff_t pos;
+       unsigned from, to;
 
-       /*
-        * Here we duplicate the generic_commit_write() functionality
-        */
-       pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+       from = pos & (PAGE_CACHE_SIZE - 1);
+       to = from + len;
+
+       if (copied < len) {
+               if (!PageUptodate(page))
+                       copied = 0;
+               page_zero_new_buffers(page, from+copied, to);
+       }
 
        ret = walk_page_buffers(handle, page_buffers(page), from,
-                               to, &partial, commit_write_fn);
+                               to, &partial, write_end_fn);
        if (!partial)
                SetPageUptodate(page);
-       if (pos > inode->i_size)
-               i_size_write(inode, pos);
+       if (pos+copied > inode->i_size)
+               i_size_write(inode, pos+copied);
        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
        if (inode->i_size > EXT4_I(inode)->i_disksize) {
                EXT4_I(inode)->i_disksize = inode->i_size;
@@ -1284,10 +1330,14 @@ static int ext4_journalled_commit_write(struct file *file,
                if (!ret)
                        ret = ret2;
        }
+
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-       return ret;
+       unlock_page(page);
+       page_cache_release(page);
+
+       return ret ? ret : copied;
 }
 
 /*
@@ -1380,7 +1430,7 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
  *     ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
  *
  * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
+ * lock_journal and i_data_sem
  *
  * Setting PF_MEMALLOC here doesn't work - too many internal memory
  * allocations fail.
@@ -1545,7 +1595,7 @@ static int ext4_journalled_writepage(struct page *page,
                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
 
                err = walk_page_buffers(handle, page_buffers(page), 0,
-                               PAGE_CACHE_SIZE, NULL, commit_write_fn);
+                               PAGE_CACHE_SIZE, NULL, write_end_fn);
                if (ret == 0)
                        ret = err;
                EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
@@ -1612,7 +1662,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  * if the machine crashes during the write.
  *
  * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file.
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
  */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
@@ -1621,7 +1672,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct ext4_inode_info *ei = EXT4_I(inode);
-       handle_t *handle = NULL;
+       handle_t *handle;
        ssize_t ret;
        int orphan = 0;
        size_t count = iov_length(iov, nr_segs);
@@ -1629,17 +1680,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        if (rw == WRITE) {
                loff_t final_size = offset + count;
 
-               handle = ext4_journal_start(inode, DIO_CREDITS);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
                if (final_size > inode->i_size) {
+                       /* Credits for sb + inode write */
+                       handle = ext4_journal_start(inode, 2);
+                       if (IS_ERR(handle)) {
+                               ret = PTR_ERR(handle);
+                               goto out;
+                       }
                        ret = ext4_orphan_add(handle, inode);
-                       if (ret)
-                               goto out_stop;
+                       if (ret) {
+                               ext4_journal_stop(handle);
+                               goto out;
+                       }
                        orphan = 1;
                        ei->i_disksize = inode->i_size;
+                       ext4_journal_stop(handle);
                }
        }
 
@@ -1647,18 +1702,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
                                 offset, nr_segs,
                                 ext4_get_block, NULL);
 
-       /*
-        * Reacquire the handle: ext4_get_block() can restart the transaction
-        */
-       handle = ext4_journal_current_handle();
-
-out_stop:
-       if (handle) {
+       if (orphan) {
                int err;
 
-               if (orphan && inode->i_nlink)
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, 2);
+               if (IS_ERR(handle)) {
+                       /* This is really bad luck. We've written the data
+                        * but cannot extend i_size. Bail out and pretend
+                        * the write failed... */
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               if (inode->i_nlink)
                        ext4_orphan_del(handle, inode);
-               if (orphan && ret > 0) {
+               if (ret > 0) {
                        loff_t end = offset + ret;
                        if (end > inode->i_size) {
                                ei->i_disksize = end;
@@ -1705,8 +1763,8 @@ static const struct address_space_operations ext4_ordered_aops = {
        .readpages      = ext4_readpages,
        .writepage      = ext4_ordered_writepage,
        .sync_page      = block_sync_page,
-       .prepare_write  = ext4_prepare_write,
-       .commit_write   = ext4_ordered_commit_write,
+       .write_begin    = ext4_write_begin,
+       .write_end      = ext4_ordered_write_end,
        .bmap           = ext4_bmap,
        .invalidatepage = ext4_invalidatepage,
        .releasepage    = ext4_releasepage,
@@ -1719,8 +1777,8 @@ static const struct address_space_operations ext4_writeback_aops = {
        .readpages      = ext4_readpages,
        .writepage      = ext4_writeback_writepage,
        .sync_page      = block_sync_page,
-       .prepare_write  = ext4_prepare_write,
-       .commit_write   = ext4_writeback_commit_write,
+       .write_begin    = ext4_write_begin,
+       .write_end      = ext4_writeback_write_end,
        .bmap           = ext4_bmap,
        .invalidatepage = ext4_invalidatepage,
        .releasepage    = ext4_releasepage,
@@ -1733,8 +1791,8 @@ static const struct address_space_operations ext4_journalled_aops = {
        .readpages      = ext4_readpages,
        .writepage      = ext4_journalled_writepage,
        .sync_page      = block_sync_page,
-       .prepare_write  = ext4_prepare_write,
-       .commit_write   = ext4_journalled_commit_write,
+       .write_begin    = ext4_write_begin,
+       .write_end      = ext4_journalled_write_end,
        .set_page_dirty = ext4_journalled_set_page_dirty,
        .bmap           = ext4_bmap,
        .invalidatepage = ext4_invalidatepage,
@@ -1762,11 +1820,11 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
-       unsigned blocksize, iblock, length, pos;
+       unsigned blocksize, length, pos;
+       ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
        int err = 0;
-       void *kaddr;
 
        blocksize = inode->i_sb->s_blocksize;
        length = blocksize - (offset & (blocksize - 1));
@@ -1778,10 +1836,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
         */
        if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
             ext4_should_writeback_data(inode) && PageUptodate(page)) {
-               kaddr = kmap_atomic(page, KM_USER0);
-               memset(kaddr + offset, 0, length);
-               flush_dcache_page(page);
-               kunmap_atomic(kaddr, KM_USER0);
+               zero_user(page, offset, length);
                set_page_dirty(page);
                goto unlock;
        }
@@ -1834,10 +1889,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
                        goto unlock;
        }
 
-       kaddr = kmap_atomic(page, KM_USER0);
-       memset(kaddr + offset, 0, length);
-       flush_dcache_page(page);
-       kunmap_atomic(kaddr, KM_USER0);
+       zero_user(page, offset, length);
 
        BUFFER_TRACE(bh, "zeroed end of block");
 
@@ -1905,7 +1957,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
  *                     (no partially truncated stuff there).  */
 
 static Indirect *ext4_find_shared(struct inode *inode, int depth,
-                       int offsets[4], Indirect chain[4], __le32 *top)
+                       ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
 {
        Indirect *partial, *p;
        int k, err;
@@ -1989,15 +2041,15 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
        for (p = first; p < last; p++) {
                u32 nr = le32_to_cpu(*p);
                if (nr) {
-                       struct buffer_head *bh;
+                       struct buffer_head *tbh;
 
                        *p = 0;
-                       bh = sb_find_get_block(inode->i_sb, nr);
-                       ext4_forget(handle, 0, inode, bh, nr);
+                       tbh = sb_find_get_block(inode->i_sb, nr);
+                       ext4_forget(handle, 0, inode, tbh, nr);
                }
        }
 
-       ext4_free_blocks(handle, inode, block_to_free, count);
+       ext4_free_blocks(handle, inode, block_to_free, count, 0);
 }
 
 /**
@@ -2170,7 +2222,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                ext4_journal_test_restart(handle, inode);
                        }
 
-                       ext4_free_blocks(handle, inode, nr, 1);
+                       ext4_free_blocks(handle, inode, nr, 1, 1);
 
                        if (parent_bh) {
                                /*
@@ -2230,12 +2282,12 @@ void ext4_truncate(struct inode *inode)
        __le32 *i_data = ei->i_data;
        int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        struct address_space *mapping = inode->i_mapping;
-       int offsets[4];
+       ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        __le32 nr = 0;
        int n;
-       long last_block;
+       ext4_lblk_t last_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
        struct page *page;
 
@@ -2261,8 +2313,10 @@ void ext4_truncate(struct inode *inode)
                        return;
        }
 
-       if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-               return ext4_ext_truncate(inode, page);
+       if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+               ext4_ext_truncate(inode, page);
+               return;
+       }
 
        handle = start_transaction(inode);
        if (IS_ERR(handle)) {
@@ -2310,7 +2364,7 @@ void ext4_truncate(struct inode *inode)
         * From here we block out all ext4_get_block() callers who want to
         * modify the block allocation tree.
         */
-       mutex_lock(&ei->truncate_mutex);
+       down_write(&ei->i_data_sem);
 
        if (n == 1) {           /* direct blocks */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
@@ -2374,7 +2428,7 @@ do_indirects:
 
        ext4_discard_reservation(inode);
 
-       mutex_unlock(&ei->truncate_mutex);
+       up_write(&ei->i_data_sem);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
 
@@ -2401,7 +2455,8 @@ out_stop:
 static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
                unsigned long ino, struct ext4_iloc *iloc)
 {
-       unsigned long desc, group_desc, block_group;
+       unsigned long desc, group_desc;
+       ext4_group_t block_group;
        unsigned long offset;
        ext4_fsblk_t block;
        struct buffer_head *bh;
@@ -2488,7 +2543,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
                        struct ext4_group_desc *desc;
                        int inodes_per_buffer;
                        int inode_offset, i;
-                       int block_group;
+                       ext4_group_t block_group;
                        int start;
 
                        block_group = (inode->i_ino - 1) /
@@ -2601,22 +2656,54 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
        if (flags & S_DIRSYNC)
                ei->i_flags |= EXT4_DIRSYNC_FL;
 }
+static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+                                       struct ext4_inode_info *ei)
+{
+       blkcnt_t i_blocks ;
+       struct inode *inode = &(ei->vfs_inode);
+       struct super_block *sb = inode->i_sb;
 
-void ext4_read_inode(struct inode * inode)
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                               EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+               /* we are using combined 48 bit field */
+               i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
+                                       le32_to_cpu(raw_inode->i_blocks_lo);
+               if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+                       /* i_blocks represent file system block size */
+                       return i_blocks  << (inode->i_blkbits - 9);
+               } else {
+                       return i_blocks;
+               }
+       } else {
+               return le32_to_cpu(raw_inode->i_blocks_lo);
+       }
+}
+
+struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 {
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
-       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_inode_info *ei;
        struct buffer_head *bh;
+       struct inode *inode;
+       long ret;
        int block;
 
+       inode = iget_locked(sb, ino);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->i_state & I_NEW))
+               return inode;
+
+       ei = EXT4_I(inode);
 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
        ei->i_acl = EXT4_ACL_NOT_CACHED;
        ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
        ei->i_block_alloc_info = NULL;
 
-       if (__ext4_get_inode_loc(inode, &iloc, 0))
+       ret = __ext4_get_inode_loc(inode, &iloc, 0);
+       if (ret < 0)
                goto bad_inode;
        bh = iloc.bh;
        raw_inode = ext4_raw_inode(&iloc);
@@ -2628,7 +2715,6 @@ void ext4_read_inode(struct inode * inode)
                inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
-       inode->i_size = le32_to_cpu(raw_inode->i_size);
 
        ei->i_state = 0;
        ei->i_dir_start_lookup = 0;
@@ -2643,6 +2729,7 @@ void ext4_read_inode(struct inode * inode)
                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
                        /* this inode is deleted */
                        brelse (bh);
+                       ret = -ESTALE;
                        goto bad_inode;
                }
                /* The only unlinked inodes we let through here have
@@ -2650,24 +2737,15 @@ void ext4_read_inode(struct inode * inode)
                 * recovery code: that's fine, we're about to complete
                 * the process of deleting those. */
        }
-       inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-#ifdef EXT4_FRAGMENTS
-       ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
-       ei->i_frag_no = raw_inode->i_frag;
-       ei->i_frag_size = raw_inode->i_fsize;
-#endif
-       ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+       inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
+       ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-           cpu_to_le32(EXT4_OS_HURD))
+           cpu_to_le32(EXT4_OS_HURD)) {
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-       if (!S_ISREG(inode->i_mode)) {
-               ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
-       } else {
-               inode->i_size |=
-                       ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
        }
+       inode->i_size = ext4_isize(raw_inode);
        ei->i_disksize = inode->i_size;
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
@@ -2679,17 +2757,12 @@ void ext4_read_inode(struct inode * inode)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
 
-       if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
-           EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-               /*
-                * When mke2fs creates big inodes it does not zero out
-                * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
-                * so ignore those first few inodes.
-                */
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                    EXT4_INODE_SIZE(inode->i_sb)) {
                        brelse (bh);
+                       ret = -EIO;
                        goto bad_inode;
                }
                if (ei->i_extra_isize == 0) {
@@ -2711,6 +2784,13 @@ void ext4_read_inode(struct inode * inode)
        EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
 
+       inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+               if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+                       inode->i_version |=
+                       (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+       }
+
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
@@ -2736,11 +2816,61 @@ void ext4_read_inode(struct inode * inode)
        }
        brelse (iloc.bh);
        ext4_set_inode_flags(inode);
-       return;
+       unlock_new_inode(inode);
+       return inode;
 
 bad_inode:
-       make_bad_inode(inode);
-       return;
+       iget_failed(inode);
+       return ERR_PTR(ret);
+}
+
+static int ext4_inode_blocks_set(handle_t *handle,
+                               struct ext4_inode *raw_inode,
+                               struct ext4_inode_info *ei)
+{
+       struct inode *inode = &(ei->vfs_inode);
+       u64 i_blocks = inode->i_blocks;
+       struct super_block *sb = inode->i_sb;
+       int err = 0;
+
+       if (i_blocks <= ~0U) {
+               /*
+                * i_blocks can be represnted in a 32 bit variable
+                * as multiple of 512 bytes
+                */
+               raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+               raw_inode->i_blocks_high = 0;
+               ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+       } else if (i_blocks <= 0xffffffffffffULL) {
+               /*
+                * i_blocks can be represented in a 48 bit variable
+                * as multiple of 512 bytes
+                */
+               err = ext4_update_rocompat_feature(handle, sb,
+                                           EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+               if (err)
+                       goto  err_out;
+               /* i_block is stored in the split  48 bit fields */
+               raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+               raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+               ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+       } else {
+               /*
+                * i_blocks should be represented in a 48 bit variable
+                * as multiple of  file system block size
+                */
+               err = ext4_update_rocompat_feature(handle, sb,
+                                           EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+               if (err)
+                       goto  err_out;
+               ei->i_flags |= EXT4_HUGE_FILE_FL;
+               /* i_block is stored in file system block size */
+               i_blocks = i_blocks >> (inode->i_blkbits - 9);
+               raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+               raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+       }
+err_out:
+       return err;
 }
 
 /*
@@ -2791,52 +2921,42 @@ static int ext4_do_update_inode(handle_t *handle,
                raw_inode->i_gid_high = 0;
        }
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-       raw_inode->i_size = cpu_to_le32(ei->i_disksize);
 
        EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
        EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
        EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
 
-       raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+       if (ext4_inode_blocks_set(handle, raw_inode, ei))
+               goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags);
-#ifdef EXT4_FRAGMENTS
-       raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
-       raw_inode->i_frag = ei->i_frag_no;
-       raw_inode->i_fsize = ei->i_frag_size;
-#endif
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_HURD))
                raw_inode->i_file_acl_high =
                        cpu_to_le16(ei->i_file_acl >> 32);
-       raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
-       if (!S_ISREG(inode->i_mode)) {
-               raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
-       } else {
-               raw_inode->i_size_high =
-                       cpu_to_le32(ei->i_disksize >> 32);
-               if (ei->i_disksize > 0x7fffffffULL) {
-                       struct super_block *sb = inode->i_sb;
-                       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                       EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
-                           EXT4_SB(sb)->s_es->s_rev_level ==
-                                       cpu_to_le32(EXT4_GOOD_OLD_REV)) {
-                              /* If this is the first large file
-                               * created, add a flag to the superblock.
-                               */
-                               err = ext4_journal_get_write_access(handle,
-                                               EXT4_SB(sb)->s_sbh);
-                               if (err)
-                                       goto out_brelse;
-                               ext4_update_dynamic_rev(sb);
-                               EXT4_SET_RO_COMPAT_FEATURE(sb,
+       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+       ext4_isize_set(raw_inode, ei->i_disksize);
+       if (ei->i_disksize > 0x7fffffffULL) {
+               struct super_block *sb = inode->i_sb;
+               if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                               EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+                               EXT4_SB(sb)->s_es->s_rev_level ==
+                               cpu_to_le32(EXT4_GOOD_OLD_REV)) {
+                       /* If this is the first large file
+                        * created, add a flag to the superblock.
+                        */
+                       err = ext4_journal_get_write_access(handle,
+                                       EXT4_SB(sb)->s_sbh);
+                       if (err)
+                               goto out_brelse;
+                       ext4_update_dynamic_rev(sb);
+                       EXT4_SET_RO_COMPAT_FEATURE(sb,
                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
-                               sb->s_dirt = 1;
-                               handle->h_sync = 1;
-                               err = ext4_journal_dirty_metadata(handle,
-                                               EXT4_SB(sb)->s_sbh);
-                       }
+                       sb->s_dirt = 1;
+                       handle->h_sync = 1;
+                       err = ext4_journal_dirty_metadata(handle,
+                                       EXT4_SB(sb)->s_sbh);
                }
        }
        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -2854,8 +2974,14 @@ static int ext4_do_update_inode(handle_t *handle,
        } else for (block = 0; block < EXT4_N_BLOCKS; block++)
                raw_inode->i_block[block] = ei->i_data[block];
 
-       if (ei->i_extra_isize)
+       raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+       if (ei->i_extra_isize) {
+               if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+                       raw_inode->i_version_hi =
+                       cpu_to_le32(inode->i_version >> 32);
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+       }
+
 
        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
        rc = ext4_journal_dirty_metadata(handle, bh);
@@ -2910,7 +3036,7 @@ int ext4_write_inode(struct inode *inode, int wait)
                return 0;
 
        if (ext4_journal_current_handle()) {
-               jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+               jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
                dump_stack();
                return -EIO;
        }
@@ -2975,6 +3101,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                ext4_journal_stop(handle);
        }
 
+       if (attr->ia_valid & ATTR_SIZE) {
+               if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+                       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+                       if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+                               error = -EFBIG;
+                               goto err_out;
+                       }
+               }
+       }
+
        if (S_ISREG(inode->i_mode) &&
            attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
                handle_t *handle;
@@ -3071,6 +3208,9 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 {
        int err = 0;
 
+       if (test_opt(inode->i_sb, I_VERSION))
+               inode_inc_iversion(inode);
+
        /* the do_update_inode consumes one bh->b_count */
        get_bh(iloc->bh);
 
@@ -3106,6 +3246,41 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 }
 
 /*
+ * Expand an inode by new_extra_isize bytes.
+ * Returns 0 on success or negative error number on failure.
+ */
+static int ext4_expand_extra_isize(struct inode *inode,
+                                  unsigned int new_extra_isize,
+                                  struct ext4_iloc iloc,
+                                  handle_t *handle)
+{
+       struct ext4_inode *raw_inode;
+       struct ext4_xattr_ibody_header *header;
+       struct ext4_xattr_entry *entry;
+
+       if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
+               return 0;
+
+       raw_inode = ext4_raw_inode(&iloc);
+
+       header = IHDR(inode, raw_inode);
+       entry = IFIRST(header);
+
+       /* No extended attributes present */
+       if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
+               header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+               memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+                       new_extra_isize);
+               EXT4_I(inode)->i_extra_isize = new_extra_isize;
+               return 0;
+       }
+
+       /* try to expand with EAs present */
+       return ext4_expand_extra_isize_ea(inode, new_extra_isize,
+                                         raw_inode, handle);
+}
+
+/*
  * What we do here is to mark the in-core inode as clean with respect to inode
  * dirtiness (it may still be data-dirty).
  * This means that the in-core inode may be reaped by prune_icache
@@ -3129,10 +3304,40 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
        struct ext4_iloc iloc;
-       int err;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       static unsigned int mnt_count;
+       int err, ret;
 
        might_sleep();
        err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+           !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+               /*
+                * We need extra buffer credits since we may write into EA block
+                * with this same handle. If journal_extend fails, then it will
+                * only result in a minor loss of functionality for that inode.
+                * If this is felt to be critical, then e2fsck should be run to
+                * force a large enough s_min_extra_isize.
+                */
+               if ((jbd2_journal_extend(handle,
+                            EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+                       ret = ext4_expand_extra_isize(inode,
+                                                     sbi->s_want_extra_isize,
+                                                     iloc, handle);
+                       if (ret) {
+                               EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+                               if (mnt_count !=
+                                       le16_to_cpu(sbi->s_es->s_mnt_count)) {
+                                       ext4_warning(inode->i_sb, __FUNCTION__,
+                                       "Unable to expand inode %lu. Delete"
+                                       " some EAs or run e2fsck.",
+                                       inode->i_ino);
+                                       mnt_count =
+                                         le16_to_cpu(sbi->s_es->s_mnt_count);
+                               }
+                       }
+               }
+       }
        if (!err)
                err = ext4_mark_iloc_dirty(handle, inode, &iloc);
        return err;
@@ -3221,7 +3426,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         */
 
        journal = EXT4_JOURNAL(inode);
-       if (is_journal_aborted(journal) || IS_RDONLY(inode))
+       if (is_journal_aborted(journal))
                return -EROFS;
 
        jbd2_journal_lock_updates(journal);