SAFE public projects git trees. - safe/jmp/linux-2.6/blob - fs/ext4/inode.c

   1 /*
   2  *  linux/fs/ext4/inode.c
   3  *
   4  * Copyright (C) 1992, 1993, 1994, 1995
   5  * Remy Card (card@masi.ibp.fr)
   6  * Laboratoire MASI - Institut Blaise Pascal
   7  * Universite Pierre et Marie Curie (Paris VI)
   8  *
   9  *  from
  10  *
  11  *  linux/fs/minix/inode.c
  12  *
  13  *  Copyright (C) 1991, 1992  Linus Torvalds
  14  *
  15  *  Goal-directed block allocation by Stephen Tweedie
  16  *      (sct@redhat.com), 1993, 1998
  17  *  Big-endian to little-endian byte-swapping/bitmaps by
  18  *        David S. Miller (davem@caip.rutgers.edu), 1995
  19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  20  *      (jj@sunsite.ms.mff.cuni.cz)
  21  *
  22  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
  23  */
  24
  25 #include <linux/module.h>
  26 #include <linux/fs.h>
  27 #include <linux/time.h>
  28 #include <linux/jbd2.h>
  29 #include <linux/highuid.h>
  30 #include <linux/pagemap.h>
  31 #include <linux/quotaops.h>
  32 #include <linux/string.h>
  33 #include <linux/buffer_head.h>
  34 #include <linux/writeback.h>
  35 #include <linux/pagevec.h>
  36 #include <linux/mpage.h>
  37 #include <linux/namei.h>
  38 #include <linux/uio.h>
  39 #include <linux/bio.h>
  40 #include "ext4_jbd2.h"
  41 #include "xattr.h"
  42 #include "acl.h"
  43 #include "ext4_extents.h"
  44
  45 #define MPAGE_DA_EXTENT_TAIL 0x01
  46
  47 static inline int ext4_begin_ordered_truncate(struct inode *inode,
  48                                               loff_t new_size)
  49 {
  50         return jbd2_journal_begin_ordered_truncate(
  51                                         EXT4_SB(inode->i_sb)->s_journal,
  52                                         &EXT4_I(inode)->jinode,
  53                                         new_size);
  54 }
  55
  56 static void ext4_invalidatepage(struct page *page, unsigned long offset);
  57
  58 /*
  59  * Test whether an inode is a fast symlink.
  60  */
  61 static int ext4_inode_is_fast_symlink(struct inode *inode)
  62 {
  63         int ea_blocks = EXT4_I(inode)->i_file_acl ?
  64                 (inode->i_sb->s_blocksize >> 9) : 0;
  65
  66         return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
  67 }
  68
  69 /*
  70  * The ext4 forget function must perform a revoke if we are freeing data
  71  * which has been journaled.  Metadata (eg. indirect blocks) must be
  72  * revoked in all cases.
  73  *
  74  * "bh" may be NULL: a metadata block may have been freed from memory
  75  * but there may still be a record of it in the journal, and that record
  76  * still needs to be revoked.
  77  *
  78  * If the handle isn't valid we're not journaling so there's nothing to do.
  79  */
  80 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
  81                         struct buffer_head *bh, ext4_fsblk_t blocknr)
  82 {
  83         int err;
  84
  85         if (!ext4_handle_valid(handle))
  86                 return 0;
  87
  88         might_sleep();
  89
  90         BUFFER_TRACE(bh, "enter");
  91
  92         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
  93                   "data mode %lx\n",
  94                   bh, is_metadata, inode->i_mode,
  95                   test_opt(inode->i_sb, DATA_FLAGS));
  96
  97         /* Never use the revoke function if we are doing full data
  98          * journaling: there is no need to, and a V1 superblock won't
  99          * support it.  Otherwise, only skip the revoke on un-journaled
 100          * data blocks. */
 101
 102         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
 103             (!is_metadata && !ext4_should_journal_data(inode))) {
 104                 if (bh) {
 105                         BUFFER_TRACE(bh, "call jbd2_journal_forget");
 106                         return ext4_journal_forget(handle, bh);
 107                 }
 108                 return 0;
 109         }
 110
 111         /*
 112          * data!=journal && (is_metadata || should_journal_data(inode))
 113          */
 114         BUFFER_TRACE(bh, "call ext4_journal_revoke");
 115         err = ext4_journal_revoke(handle, blocknr, bh);
 116         if (err)
 117                 ext4_abort(inode->i_sb, __func__,
 118                            "error %d when attempting revoke", err);
 119         BUFFER_TRACE(bh, "exit");
 120         return err;
 121 }
 122
 123 /*
 124  * Work out how many blocks we need to proceed with the next chunk of a
 125  * truncate transaction.
 126  */
 127 static unsigned long blocks_for_truncate(struct inode *inode)
 128 {
 129         ext4_lblk_t needed;
 130
 131         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 132
 133         /* Give ourselves just enough room to cope with inodes in which
 134          * i_blocks is corrupt: we've seen disk corruptions in the past
 135          * which resulted in random data in an inode which looked enough
 136          * like a regular file for ext4 to try to delete it.  Things
 137          * will go a bit crazy if that happens, but at least we should
 138          * try not to panic the whole kernel. */
 139         if (needed < 2)
 140                 needed = 2;
 141
 142         /* But we need to bound the transaction so we don't overflow the
 143          * journal. */
 144         if (needed > EXT4_MAX_TRANS_DATA)
 145                 needed = EXT4_MAX_TRANS_DATA;
 146
 147         return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
 148 }
 149
 150 /*
 151  * Truncate transactions can be complex and absolutely huge.  So we need to
 152  * be able to restart the transaction at a conventient checkpoint to make
 153  * sure we don't overflow the journal.
 154  *
 155  * start_transaction gets us a new handle for a truncate transaction,
 156  * and extend_transaction tries to extend the existing one a bit.  If
 157  * extend fails, we need to propagate the failure up and restart the
 158  * transaction in the top-level truncate loop. --sct
 159  */
 160 static handle_t *start_transaction(struct inode *inode)
 161 {
 162         handle_t *result;
 163
 164         result = ext4_journal_start(inode, blocks_for_truncate(inode));
 165         if (!IS_ERR(result))
 166                 return result;
 167
 168         ext4_std_error(inode->i_sb, PTR_ERR(result));
 169         return result;
 170 }
 171
 172 /*
 173  * Try to extend this transaction for the purposes of truncation.
 174  *
 175  * Returns 0 if we managed to create more room.  If we can't create more
 176  * room, and the transaction must be restarted we return 1.
 177  */
 178 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 179 {
 180         if (!ext4_handle_valid(handle))
 181                 return 0;
 182         if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
 183                 return 0;
 184         if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
 185                 return 0;
 186         return 1;
 187 }
 188
 189 /*
 190  * Restart the transaction associated with *handle.  This does a commit,
 191  * so before we call here everything must be consistently dirtied against
 192  * this transaction.
 193  */
 194 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 195 {
 196         BUG_ON(EXT4_JOURNAL(inode) == NULL);
 197         jbd_debug(2, "restarting handle %p\n", handle);
 198         return ext4_journal_restart(handle, blocks_for_truncate(inode));
 199 }
 200
 201 /*
 202  * Called at the last iput() if i_nlink is zero.
 203  */
 204 void ext4_delete_inode(struct inode *inode)
 205 {
 206         handle_t *handle;
 207         int err;
 208
 209         if (ext4_should_order_data(inode))
 210                 ext4_begin_ordered_truncate(inode, 0);
 211         truncate_inode_pages(&inode->i_data, 0);
 212
 213         if (is_bad_inode(inode))
 214                 goto no_delete;
 215
 216         handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
 217         if (IS_ERR(handle)) {
 218                 ext4_std_error(inode->i_sb, PTR_ERR(handle));
 219                 /*
 220                  * If we're going to skip the normal cleanup, we still need to
 221                  * make sure that the in-core orphan linked list is properly
 222                  * cleaned up.
 223                  */
 224                 ext4_orphan_del(NULL, inode);
 225                 goto no_delete;
 226         }
 227
 228         if (IS_SYNC(inode))
 229                 ext4_handle_sync(handle);
 230         inode->i_size = 0;
 231         err = ext4_mark_inode_dirty(handle, inode);
 232         if (err) {
 233                 ext4_warning(inode->i_sb, __func__,
 234                              "couldn't mark inode dirty (err %d)", err);
 235                 goto stop_handle;
 236         }
 237         if (inode->i_blocks)
 238                 ext4_truncate(inode);
 239
 240         /*
 241          * ext4_ext_truncate() doesn't reserve any slop when it
 242          * restarts journal transactions; therefore there may not be
 243          * enough credits left in the handle to remove the inode from
 244          * the orphan list and set the dtime field.
 245          */
 246         if (!ext4_handle_has_enough_credits(handle, 3)) {
 247                 err = ext4_journal_extend(handle, 3);
 248                 if (err > 0)
 249                         err = ext4_journal_restart(handle, 3);
 250                 if (err != 0) {
 251                         ext4_warning(inode->i_sb, __func__,
 252                                      "couldn't extend journal (err %d)", err);
 253                 stop_handle:
 254                         ext4_journal_stop(handle);
 255                         goto no_delete;
 256                 }
 257         }
 258
 259         /*
 260          * Kill off the orphan record which ext4_truncate created.
 261          * AKPM: I think this can be inside the above `if'.
 262          * Note that ext4_orphan_del() has to be able to cope with the
 263          * deletion of a non-existent orphan - this is because we don't
 264          * know if ext4_truncate() actually created an orphan record.
 265          * (Well, we could do this if we need to, but heck - it works)
 266          */
 267         ext4_orphan_del(handle, inode);
 268         EXT4_I(inode)->i_dtime  = get_seconds();
 269
 270         /*
 271          * One subtle ordering requirement: if anything has gone wrong
 272          * (transaction abort, IO errors, whatever), then we can still
 273          * do these next steps (the fs will already have been marked as
 274          * having errors), but we can't free the inode if the mark_dirty
 275          * fails.
 276          */
 277         if (ext4_mark_inode_dirty(handle, inode))
 278                 /* If that failed, just do the required in-core inode clear. */
 279                 clear_inode(inode);
 280         else
 281                 ext4_free_inode(handle, inode);
 282         ext4_journal_stop(handle);
 283         return;
 284 no_delete:
 285         clear_inode(inode);     /* We must guarantee clearing of inode... */
 286 }
 287
 288 typedef struct {                /* one <key, p, bh> triple of a block-lookup chain, see ext4_get_branch() */
 289         __le32  *p;             /* address the block number was taken from */
 290         __le32  key;            /* the block number found at *p, kept little-endian as stored */
 291         struct buffer_head *bh; /* buffer hosting *p; NULL when p points into the inode itself */
 292 } Indirect;
 293
 294 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 295 {
 296         p->key = *(p->p = v);
 297         p->bh = bh;
 298 }
 299
 300 /**
 301  *      ext4_block_to_path - parse the block number into array of offsets
 302  *      @inode: inode in question (we are only interested in its superblock)
 303  *      @i_block: block number to be parsed
 304  *      @offsets: array to store the offsets in
 305  *      @boundary: set this non-zero if the referred-to block is likely to be
 306  *             followed (on disk) by an indirect block.
 307  *
 308  *      To store the locations of file's data ext4 uses a data structure common
 309  *      for UNIX filesystems - tree of pointers anchored in the inode, with
 310  *      data blocks at leaves and indirect blocks in intermediate nodes.
 311  *      This function translates the block number into path in that tree -
 312  *      return value is the path length and @offsets[n] is the offset of
 313  *      pointer to (n+1)th node in the nth one. If @block is out of range
 314  *      (negative or too large) warning is printed and zero returned.
 315  *
 316  *      Note: function doesn't find node addresses, so no IO is needed. All
 317  *      we need to know is the capacity of indirect blocks (taken from the
 318  *      inode->i_sb).
 319  */
 320
 321 /*
 322  * Portability note: the last comparison (check that we fit into triple
 323  * indirect block) is spelled differently, because otherwise on an
 324  * architecture with 32-bit longs and 8Kb pages we might get into trouble
 325  * if our filesystem had 8Kb blocks. We might use long long, but that would
 326  * kill us on x86. Oh, well, at least the sign propagation does not matter -
 327  * i_block would have to be negative in the very beginning, so we would not
 328  * get there at all.
 329  */
 330
 331 static int ext4_block_to_path(struct inode *inode,
 332                         ext4_lblk_t i_block,
 333                         ext4_lblk_t offsets[4], int *boundary)
 334 {
 335         int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 336         int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
 337         const long direct_blocks = EXT4_NDIR_BLOCKS,
 338                 indirect_blocks = ptrs,
 339                 double_blocks = (1 << (ptrs_bits * 2));
 340         int n = 0;
 341         int final = 0;
 342
 343         if (i_block < 0) {
 344                 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
 345         } else if (i_block < direct_blocks) {
 346                 offsets[n++] = i_block;
 347                 final = direct_blocks;
 348         } else if ((i_block -= direct_blocks) < indirect_blocks) {
 349                 offsets[n++] = EXT4_IND_BLOCK;
 350                 offsets[n++] = i_block;
 351                 final = ptrs;
 352         } else if ((i_block -= indirect_blocks) < double_blocks) {
 353                 offsets[n++] = EXT4_DIND_BLOCK;
 354                 offsets[n++] = i_block >> ptrs_bits;
 355                 offsets[n++] = i_block & (ptrs - 1);
 356                 final = ptrs;
 357         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
 358                 offsets[n++] = EXT4_TIND_BLOCK;
 359                 offsets[n++] = i_block >> (ptrs_bits * 2);
 360                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
 361                 offsets[n++] = i_block & (ptrs - 1);
 362                 final = ptrs;
 363         } else {
 364                 ext4_warning(inode->i_sb, "ext4_block_to_path",
 365                                 "block %lu > max in inode %lu",
 366                                 i_block + direct_blocks +
 367                                 indirect_blocks + double_blocks, inode->i_ino);
 368         }
 369         if (boundary)
 370                 *boundary = final - 1 - (i_block & (ptrs - 1));
 371         return n;
 372 }
 373
 374 static int __ext4_check_blockref(const char *function, struct inode *inode,
 375                                  __le32 *p, unsigned int max)
 376 {
 377         __le32 *bref = p;
 378         unsigned int blk;
 379
 380         while (bref < p+max) {
 381                 blk = le32_to_cpu(*bref++);
 382                 if (blk &&
 383                     unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 384                                                     blk, 1))) {
 385                         ext4_error(inode->i_sb, function,
 386                                    "invalid block reference %u "
 387                                    "in inode #%lu", blk, inode->i_ino);
 388                         return -EIO;
 389                 }
 390         }
 391         return 0;
 392 }
 393
 394
 395 #define ext4_check_indirect_blockref(inode, bh)                         \
 396         __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
 397                               EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 398
 399 #define ext4_check_inode_blockref(inode)                                \
 400         __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
 401                               EXT4_NDIR_BLOCKS)
 402
 403 /**
 404  *      ext4_get_branch - read the chain of indirect blocks leading to data
 405  *      @inode: inode in question
 406  *      @depth: depth of the chain (1 - direct pointer, etc.)
 407  *      @offsets: offsets of pointers in inode/indirect blocks
 408  *      @chain: place to store the result
 409  *      @err: here we store the error value
 410  *
 411  *      Function fills the array of triples <key, p, bh> and returns %NULL
 412  *      if everything went OK or the pointer to the last filled triple
 413  *      (incomplete one) otherwise. Upon the return chain[i].key contains
 414  *      the number of (i+1)-th block in the chain (as it is stored in memory,
 415  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
 416  *      number (it points into struct inode for i==0 and into the bh->b_data
 417  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 418  *      block for i>0 and NULL for i==0. In other words, it holds the block
 419  *      numbers of the chain, addresses they were taken from (and where we can
 420  *      verify that chain did not change) and buffer_heads hosting these
 421  *      numbers.
 422  *
 423  *      Function stops when it stumbles upon zero pointer (absent block)
 424  *              (pointer to last triple returned, *@err == 0)
 425  *      or when it gets an IO error reading an indirect block
 426  *              (ditto, *@err == -EIO)
 427  *      or when it reads all @depth-1 indirect blocks successfully and finds
 428  *      the whole chain, all way to the data (returns %NULL, *err == 0).
 429  *
 430  *      Need to be called with
 431  *      down_read(&EXT4_I(inode)->i_data_sem)
 432  */
 433 static Indirect *ext4_get_branch(struct inode *inode, int depth,
 434                                  ext4_lblk_t  *offsets,
 435                                  Indirect chain[4], int *err)
 436 {
 437         struct super_block *sb = inode->i_sb;
 438         Indirect *p = chain;
 439         struct buffer_head *bh;
 440
 441         *err = 0;
 442         /* i_data is not going away, no lock needed */
 443         add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
 444         if (!p->key)
 445                 goto no_block;
 446         while (--depth) {
 447                 bh = sb_getblk(sb, le32_to_cpu(p->key));
 448                 if (unlikely(!bh))
 449                         goto failure;
 450
 451                 if (!bh_uptodate_or_lock(bh)) {
 452                         if (bh_submit_read(bh) < 0) {
 453                                 put_bh(bh);
 454                                 goto failure;
 455                         }
 456                         /* validate block references */
 457                         if (ext4_check_indirect_blockref(inode, bh)) {
 458                                 put_bh(bh);
 459                                 goto failure;
 460                         }
 461                 }
 462
 463                 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 464                 /* Reader: end */
 465                 if (!p->key)
 466                         goto no_block;
 467         }
 468         return NULL;
 469
 470 failure:
 471         *err = -EIO;
 472 no_block:
 473         return p;
 474 }
 475
 476 /**
 477  *      ext4_find_near - find a place for allocation with sufficient locality
 478  *      @inode: owner
 479  *      @ind: descriptor of indirect block.
 480  *
 481  *      This function returns the preferred place for block allocation.
 482  *      It is used when heuristic for sequential allocation fails.
 483  *      Rules are:
 484  *        + if there is a block to the left of our position - allocate near it.
 485  *        + if pointer will live in indirect block - allocate near that block.
 486  *        + if pointer will live in inode - allocate in the same
 487  *          cylinder group.
 488  *
 489  * In the latter case we colour the starting block by the callers PID to
 490  * prevent it from clashing with concurrent allocations for a different inode
 491  * in the same block group.   The PID is used here so that functionally related
 492  * files will be close-by on-disk.
 493  *
 494  *      Caller must make sure that @ind is valid and will stay that way.
 495  */
 496 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 497 {
 498         struct ext4_inode_info *ei = EXT4_I(inode);
 499         __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
 500         __le32 *p;
 501         ext4_fsblk_t bg_start;
 502         ext4_fsblk_t last_block;
 503         ext4_grpblk_t colour;
 504         ext4_group_t block_group;
 505         int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 506
 507         /* Try to find previous block */
 508         for (p = ind->p - 1; p >= start; p--) {
 509                 if (*p)
 510                         return le32_to_cpu(*p);
 511         }
 512
 513         /* No such thing, so let's try location of indirect block */
 514         if (ind->bh)
 515                 return ind->bh->b_blocknr;
 516
 517         /*
 518          * It is going to be referred to from the inode itself? OK, just put it
 519          * into the same cylinder group then.
 520          */
 521         block_group = ei->i_block_group;
 522         if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
 523                 block_group &= ~(flex_size-1);
 524                 if (S_ISREG(inode->i_mode))
 525                         block_group++;
 526         }
 527         bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 528         last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 529
 530         /*
 531          * If we are doing delayed allocation, we don't need take
 532          * colour into account.
 533          */
 534         if (test_opt(inode->i_sb, DELALLOC))
 535                 return bg_start;
 536
 537         if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 538                 colour = (current->pid % 16) *
 539                         (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 540         else
 541                 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
 542         return bg_start + colour;
 543 }
 544
 545 /**
 546  *      ext4_find_goal - find a preferred place for allocation.
 547  *      @inode: owner
 548  *      @block:  block we want
 549  *      @partial: pointer to the last triple within a chain
 550  *
 551  *      Normally this function find the preferred place for block allocation,
 552  *      returns it.
 553  */
 554 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 555                 Indirect *partial)
 556 {
 557         /*
 558          * XXX need to get goal block from mballoc's data structures
 559          */
 560
 561         return ext4_find_near(inode, partial);
 562 }
 563
 564 /**
 565  *      ext4_blks_to_allocate: Look up the block map and count the number
 566  *      of direct blocks need to be allocated for the given branch.
 567  *
 568  *      @branch: chain of indirect blocks
 569  *      @k: number of blocks need for indirect blocks
 570  *      @blks: number of data blocks to be mapped.
 571  *      @blocks_to_boundary:  the offset in the indirect block
 572  *
 573  *      return the total number of blocks to be allocate, including the
 574  *      direct and indirect blocks.
 575  */
 576 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 577                 int blocks_to_boundary)
 578 {
 579         unsigned int count = 0;
 580
 581         /*
 582          * Simple case, [t,d]Indirect block(s) has not allocated yet
 583          * then it's clear blocks on that path have not allocated
 584          */
 585         if (k > 0) {
 586                 /* right now we don't handle cross boundary allocation */
 587                 if (blks < blocks_to_boundary + 1)
 588                         count += blks;
 589                 else
 590                         count += blocks_to_boundary + 1;
 591                 return count;
 592         }
 593
 594         count++;
 595         while (count < blks && count <= blocks_to_boundary &&
 596                 le32_to_cpu(*(branch[0].p + count)) == 0) {
 597                 count++;
 598         }
 599         return count;
 600 }
 601
 602 /**
 603  *      ext4_alloc_blocks: multiple allocate blocks needed for a branch
 604  *      @indirect_blks: the number of blocks need to allocate for indirect
 605  *                      blocks
 606  *
 607  *      @new_blocks: on return it will store the new block numbers for
 608  *      the indirect blocks(if needed) and the first direct block,
 609  *      @blks:  on return it will store the total number of allocated
 610  *              direct blocks
 611  */
 612 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 613                                 ext4_lblk_t iblock, ext4_fsblk_t goal,
 614                                 int indirect_blks, int blks,
 615                                 ext4_fsblk_t new_blocks[4], int *err)
 616 {
 617         struct ext4_allocation_request ar;
 618         int target, i;
 619         unsigned long count = 0, blk_allocated = 0;
 620         int index = 0;
 621         ext4_fsblk_t current_block = 0;
 622         int ret = 0;
 623
 624         /*
 625          * Here we try to allocate the requested multiple blocks at once,
 626          * on a best-effort basis.
 627          * To build a branch, we should allocate blocks for
 628          * the indirect blocks(if not allocated yet), and at least
 629          * the first direct block of this branch.  That's the
 630          * minimum number of blocks need to allocate(required)
 631          */
 632         /* first we try to allocate the indirect blocks */
 633         target = indirect_blks;
 634         while (target > 0) {
 635                 count = target;
 636                 /* allocating blocks for indirect blocks and direct blocks */
 637                 current_block = ext4_new_meta_blocks(handle, inode,
 638                                                         goal, &count, err);
 639                 if (*err)
 640                         goto failed_out;
 641
 642                 target -= count;
 643                 /* allocate blocks for indirect blocks */
 644                 while (index < indirect_blks && count) {
 645                         new_blocks[index++] = current_block++;
 646                         count--;
 647                 }
 648                 if (count > 0) {
 649                         /*
 650                          * save the new block number
 651                          * for the first direct block
 652                          */
 653                         new_blocks[index] = current_block;
 654                         printk(KERN_INFO "%s returned more blocks than "
 655                                                 "requested\n", __func__);
 656                         WARN_ON(1);
 657                         break;
 658                 }
 659         }
 660
 661         target = blks - count ;
 662         blk_allocated = count;
 663         if (!target)
 664                 goto allocated;
 665         /* Now allocate data blocks */
 666         memset(&ar, 0, sizeof(ar));
 667         ar.inode = inode;
 668         ar.goal = goal;
 669         ar.len = target;
 670         ar.logical = iblock;
 671         if (S_ISREG(inode->i_mode))
 672                 /* enable in-core preallocation only for regular files */
 673                 ar.flags = EXT4_MB_HINT_DATA;
 674
 675         current_block = ext4_mb_new_blocks(handle, &ar, err);
 676
 677         if (*err && (target == blks)) {
 678                 /*
 679                  * if the allocation failed and we didn't allocate
 680                  * any blocks before
 681                  */
 682                 goto failed_out;
 683         }
 684         if (!*err) {
 685                 if (target == blks) {
 686                 /*
 687                  * save the new block number
 688                  * for the first direct block
 689                  */
 690                         new_blocks[index] = current_block;
 691                 }
 692                 blk_allocated += ar.len;
 693         }
 694 allocated:
 695         /* total number of blocks allocated for direct blocks */
 696         ret = blk_allocated;
 697         *err = 0;
 698         return ret;
 699 failed_out:
 700         for (i = 0; i < index; i++)
 701                 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 702         return ret;
 703 }
 704
 705 /**
 706  *      ext4_alloc_branch - allocate and set up a chain of blocks.
 707  *      @inode: owner
 708  *      @indirect_blks: number of allocated indirect blocks
 709  *      @blks: number of allocated direct blocks
 710  *      @offsets: offsets (in the blocks) to store the pointers to next.
 711  *      @branch: place to store the chain in.
 712  *
 713  *      This function allocates blocks, zeroes out all but the last one,
 714  *      links them into chain and (if we are synchronous) writes them to disk.
 715  *      In other words, it prepares a branch that can be spliced onto the
 716  *      inode. It stores the information about that chain in the branch[], in
 717  *      the same format as ext4_get_branch() would do. We are calling it after
 718  *      we had read the existing part of chain and partial points to the last
 719  *      triple of that (one with zero ->key). Upon the exit we have the same
 720  *      picture as after the successful ext4_get_block(), except that in one
 721  *      place chain is disconnected - *branch->p is still zero (we did not
 722  *      set the last link), but branch->key contains the number that should
 723  *      be placed into *branch->p to fill that gap.
 724  *
 725  *      If allocation fails we free all blocks we've allocated (and forget
 726  *      their buffer_heads) and return the error value from the failed
 727  *      ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 728  *      as described above and return 0.
 729  */
 730 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 731                                 ext4_lblk_t iblock, int indirect_blks,
 732                                 int *blks, ext4_fsblk_t goal,
 733                                 ext4_lblk_t *offsets, Indirect *branch)
 734 {
 735         int blocksize = inode->i_sb->s_blocksize;
 736         int i, n = 0;
 737         int err = 0;
 738         struct buffer_head *bh;
 739         int num;
 740         ext4_fsblk_t new_blocks[4];
 741         ext4_fsblk_t current_block;
 742
 743         num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
 744                                 *blks, new_blocks, &err);
 745         if (err)
 746                 return err;
 747
 748         branch[0].key = cpu_to_le32(new_blocks[0]);
 749         /*
 750          * metadata blocks and data blocks are allocated.
 751          */
 752         for (n = 1; n <= indirect_blks;  n++) {
 753                 /*
 754                  * Get buffer_head for parent block, zero it out
 755                  * and set the pointer to new one, then send
 756                  * parent to disk.
 757                  */
 758                 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
 759                 branch[n].bh = bh;
 760                 lock_buffer(bh);
 761                 BUFFER_TRACE(bh, "call get_create_access");
 762                 err = ext4_journal_get_create_access(handle, bh);
 763                 if (err) {
 764                         unlock_buffer(bh);
 765                         brelse(bh);
 766                         goto failed;
 767                 }
 768
 769                 memset(bh->b_data, 0, blocksize);
 770                 branch[n].p = (__le32 *) bh->b_data + offsets[n];
 771                 branch[n].key = cpu_to_le32(new_blocks[n]);
 772                 *branch[n].p = branch[n].key;
 773                 if (n == indirect_blks) {
 774                         current_block = new_blocks[n];
 775                         /*
 776                          * End of chain, update the last new metablock of
 777                          * the chain to point to the new allocated
 778                          * data blocks numbers
 779                          */
 780                         for (i=1; i < num; i++)
 781                                 *(branch[n].p + i) = cpu_to_le32(++current_block);
 782                 }
 783                 BUFFER_TRACE(bh, "marking uptodate");
 784                 set_buffer_uptodate(bh);
 785                 unlock_buffer(bh);
 786
 787                 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 788                 err = ext4_handle_dirty_metadata(handle, inode, bh);
 789                 if (err)
 790                         goto failed;
 791         }
 792         *blks = num;
 793         return err;
 794 failed:
 795         /* Allocation failed, free what we already allocated */
 796         for (i = 1; i <= n ; i++) {
 797                 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
 798                 ext4_journal_forget(handle, branch[i].bh);
 799         }
 800         for (i = 0; i < indirect_blks; i++)
 801                 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 802
 803         ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
 804
 805         return err;
 806 }
 807
 808 /**
 809  * ext4_splice_branch - splice the allocated branch onto inode.
 810  * @inode: owner
 811  * @block: (logical) number of block we are adding
 812  * @chain: chain of indirect blocks (with a missing link - see
 813  *      ext4_alloc_branch)
 814  * @where: location of missing link
 815  * @num:   number of indirect blocks we are adding
 816  * @blks:  number of direct blocks we are adding
 817  *
 818  * This function fills the missing link and does all housekeeping needed in
 819  * inode (->i_blocks, etc.). In case of success we end up with the full
 820  * chain to new block and return 0.
 821  */
 822 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 823                         ext4_lblk_t block, Indirect *where, int num, int blks)
 824 {
 825         int i;
 826         int err = 0;
 827         ext4_fsblk_t current_block;
 828
 829         /*
 830          * If we're splicing into a [td]indirect block (as opposed to the
 831          * inode) then we need to get write access to the [td]indirect block
 832          * before the splice.
 833          */
 834         if (where->bh) {
 835                 BUFFER_TRACE(where->bh, "get_write_access");
 836                 err = ext4_journal_get_write_access(handle, where->bh);
 837                 if (err)
 838                         goto err_out;
 839         }
 840         /* That's it */
 841
 842         *where->p = where->key;
 843
 844         /*
 845          * Update the host buffer_head or inode to point to more just allocated
 846          * direct blocks blocks
 847          */
 848         if (num == 0 && blks > 1) {
 849                 current_block = le32_to_cpu(where->key) + 1;
 850                 for (i = 1; i < blks; i++)
 851                         *(where->p + i) = cpu_to_le32(current_block++);
 852         }
 853
 854         /* We are done with atomic stuff, now do the rest of housekeeping */
 855
 856         inode->i_ctime = ext4_current_time(inode);
 857         ext4_mark_inode_dirty(handle, inode);
 858
 859         /* had we spliced it onto indirect block? */
 860         if (where->bh) {
 861                 /*
 862                  * If we spliced it onto an indirect block, we haven't
 863                  * altered the inode.  Note however that if it is being spliced
 864                  * onto an indirect block at the very end of the file (the
 865                  * file is growing) then we *will* alter the inode to reflect
 866                  * the new i_size.  But that is not done here - it is done in
 867                  * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
 868                  */
 869                 jbd_debug(5, "splicing indirect only\n");
 870                 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
 871                 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
 872                 if (err)
 873                         goto err_out;
 874         } else {
 875                 /*
 876                  * OK, we spliced it into the inode itself on a direct block.
 877                  * Inode was dirtied above.
 878                  */
 879                 jbd_debug(5, "splicing direct\n");
 880         }
 881         return err;
 882
 883 err_out:
 884         for (i = 1; i <= num; i++) {
 885                 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
 886                 ext4_journal_forget(handle, where[i].bh);
 887                 ext4_free_blocks(handle, inode,
 888                                         le32_to_cpu(where[i-1].key), 1, 0);
 889         }
 890         ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
 891
 892         return err;
 893 }
 894
/*
 * The ext4_ind_get_blocks() function handles non-extents inodes
 * (i.e., using the traditional indirect/double-indirect i_blocks
 * scheme) for ext4_get_blocks().
 *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if EXT4_GET_BLOCKS_CREATE is not set in @flags.
 *
 * On success @bh_result is mapped to the first block of the run and the
 * number of contiguous blocks found (or allocated) is returned; the buffer
 * is marked new only when blocks were allocated here.
 *
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case (-EIO when ext4_block_to_path() reports depth 0).
 *
 * The ext4_ind_get_blocks() function should be called with
 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
 * blocks.
 */
static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
                                  ext4_lblk_t iblock, unsigned int maxblocks,
                                  struct buffer_head *bh_result,
                                  int flags)
{
        int err = -EIO;
        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        ext4_fsblk_t goal;
        int indirect_blks;
        int blocks_to_boundary = 0;
        int depth;
        struct ext4_inode_info *ei = EXT4_I(inode);
        int count = 0;
        ext4_fsblk_t first_block = 0;
        loff_t disksize;


        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        /* Translate the logical block into a path of per-level offsets. */
        depth = ext4_block_to_path(inode, iblock, offsets,
                                        &blocks_to_boundary);

        /* depth 0: no usable path; err still holds the initial -EIO. */
        if (depth == 0)
                goto out;

        /*
         * Walk the existing chain.  A NULL result means the whole path is
         * present; otherwise partial marks where the chain stops.
         */
        partial = ext4_get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
                clear_buffer_new(bh_result);
                count++;
                /*
                 * Map more blocks: keep extending the run while the next
                 * slot holds the physically next block, up to maxblocks
                 * and the boundary reported by ext4_block_to_path().
                 */
                while (count < maxblocks && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;

                        blk = le32_to_cpu(*(chain[depth-1].p + count));

                        if (blk == first_block + count)
                                count++;
                        else
                                break;
                }
                goto got_it;
        }

        /* Next simple case - plain lookup or failed read of indirect block */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
                goto cleanup;

        /*
         * Okay, we need to do block allocation.
        */
        goal = ext4_find_goal(inode, iblock, partial);

        /* the number of blocks need to allocate for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;

        /*
         * Next look up the indirect map to count the total number of
         * direct blocks to allocate for this branch.
         */
        count = ext4_blks_to_allocate(partial, indirect_blks,
                                        maxblocks, blocks_to_boundary);
        /*
         * Block out ext4_truncate while we alter the tree.  count is
         * passed by reference and may be adjusted by ext4_alloc_branch()
         * to the number of data blocks actually obtained.
         */
        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
                                        &count, goal,
                                        offsets + (partial - chain), partial);

        /*
         * The ext4_splice_branch call will free and forget any buffers
         * on the new chain if there is a failure, but that risks using
         * up transaction credits, especially for bitmaps where the
         * credits cannot be returned.  Can we handle this somehow?  We
         * may need to return -EAGAIN upwards in the worst case.  --sct
         */
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
        /*
         * i_disksize growing is protected by i_data_sem.  Don't forget to
         * protect it if you're about to implement concurrent
         * ext4_get_block() -bzzz
         *
         * The new on-disk size is the end of the mapped range, clamped to
         * the in-core i_size, and it is only ever raised here.
        */
        if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) {
                disksize = ((loff_t) iblock + count) << inode->i_blkbits;
                if (disksize > i_size_read(inode))
                        disksize = i_size_read(inode);
                if (disksize > ei->i_disksize)
                        ei->i_disksize = disksize;
        }
        if (err)
                goto cleanup;

        set_buffer_new(bh_result);
got_it:
        /* Report the first physical block; count is the run length. */
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
                set_buffer_boundary(bh_result);
        err = count;
        /* Clean up and exit */
        partial = chain + depth - 1;    /* the whole chain */
cleanup:
        /* Release the buffers from partial back down to (not including) chain[0]. */
        while (partial > chain) {
                BUFFER_TRACE(partial->bh, "call brelse");
                brelse(partial->bh);
                partial--;
        }
        BUFFER_TRACE(bh_result, "returned");
out:
        return err;
}
1039
1040 qsize_t ext4_get_reserved_space(struct inode *inode)
1041 {
1042         unsigned long long total;
1043
1044         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1045         total = EXT4_I(inode)->i_reserved_data_blocks +
1046                 EXT4_I(inode)->i_reserved_meta_blocks;
1047         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1048
1049         return total;
1050 }
1051 /*
1052  * Calculate the number of metadata blocks need to reserve
1053  * to allocate @blocks for non extent file based file
1054  */
1055 static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1056 {
1057         int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1058         int ind_blks, dind_blks, tind_blks;
1059
1060         /* number of new indirect blocks needed */
1061         ind_blks = (blocks + icap - 1) / icap;
1062
1063         dind_blks = (ind_blks + icap - 1) / icap;
1064
1065         tind_blks = 1;
1066
1067         return ind_blks + dind_blks + tind_blks;
1068 }
1069
1070 /*
1071  * Calculate the number of metadata blocks need to reserve
1072  * to allocate given number of blocks
1073  */
1074 static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1075 {
1076         if (!blocks)
1077                 return 0;
1078
1079         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1080                 return ext4_ext_calc_metadata_amount(inode, blocks);
1081
1082         return ext4_indirect_calc_metadata_amount(inode, blocks);
1083 }
1084
1085 static void ext4_da_update_reserve_space(struct inode *inode, int used)
1086 {
1087         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1088         int total, mdb, mdb_free;
1089
1090         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1091         /* recalculate the number of metablocks still need to be reserved */
1092         total = EXT4_I(inode)->i_reserved_data_blocks - used;
1093         mdb = ext4_calc_metadata_amount(inode, total);
1094
1095         /* figure out how many metablocks to release */
1096         BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1097         mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1098
1099         if (mdb_free) {
1100                 /* Account for allocated meta_blocks */
1101                 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1102
1103                 /* update fs dirty blocks counter */
1104                 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1105                 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1106                 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1107         }
1108
1109         /* update per-inode reservations */
1110         BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
1111         EXT4_I(inode)->i_reserved_data_blocks -= used;
1112         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1113
1114         /*
1115          * free those over-booking quota for metadata blocks
1116          */
1117         if (mdb_free)
1118                 vfs_dq_release_reservation_block(inode, mdb_free);
1119
1120         /*
1121          * If we have done all the pending block allocations and if
1122          * there aren't any writers on the inode, we can discard the
1123          * inode's preallocations.
1124          */
1125         if (!total && (atomic_read(&inode->i_writecount) == 0))
1126                 ext4_discard_preallocations(inode);
1127 }
1128
1129 static int check_block_validity(struct inode *inode, sector_t logical,
1130                                 sector_t phys, int len)
1131 {
1132         if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1133                 ext4_error(inode->i_sb, "check_block_validity",
1134                            "inode #%lu logical block %llu mapped to %llu "
1135                            "(size %d)", inode->i_ino,
1136                            (unsigned long long) logical,
1137                            (unsigned long long) phys, len);
1138                 WARN_ON(1);
1139                 return -EIO;
1140         }
1141         return 0;
1142 }
1143
/*
 * The ext4_get_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * If the file is extent based, it calls ext4_ext_get_blocks(); otherwise
 * it calls ext4_ind_get_blocks() to handle indirect-mapped files.
 *
 * On success, it returns the number of blocks mapped or allocated.
 * If EXT4_GET_BLOCKS_CREATE is not set and the blocks are preallocated
 * and uninitialized, the result buffer head is left unmapped.  If
 * EXT4_GET_BLOCKS_CREATE is set, it makes sure the buffer head is mapped.
 *
 * It returns 0 if a plain lookup failed (blocks have not been allocated);
 * in that case the buffer head is unmapped.
 *
 * It returns a negative error on allocation failure, or the error from
 * check_block_validity() if the resulting mapping fails validation.
 */
int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                    unsigned int max_blocks, struct buffer_head *bh,
                    int flags)
{
        int retval;

        /* Start from a clean slate; the lookups below set these as needed. */
        clear_buffer_mapped(bh);
        clear_buffer_unwritten(bh);

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.  The lookup is done with flags == 0 under
         * the read side of i_data_sem.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
                                bh, 0);
        } else {
                retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
                                             bh, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

        /* Reject a mapping that fails validation. */
        if (retval > 0 && buffer_mapped(bh)) {
                int ret = check_block_validity(inode, block,
                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }

        /* If it is only a block(s) look up */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;

        /*
         * Return if the blocks have already been allocated.
         *
         * Note that if blocks have been preallocated,
         * ext4_ext_get_blocks() called without the create flag returns
         * with the buffer head unmapped, so we fall through to the
         * allocating path below.
         */
        if (retval > 0 && buffer_mapped(bh))
                return retval;

        /*
         * When we call get_blocks without the create flag, the
         * BH_Unwritten flag could have gotten set if the blocks
         * requested were part of a uninitialized extent.  We need to
         * clear this flag now that we are committed to convert all or
         * part of the uninitialized extent to be an initialized
         * extent.  This is because we need to avoid the combination
         * of BH_Unwritten and BH_Mapped flags being simultaneously
         * set on the buffer_head.
         */
        clear_buffer_unwritten(bh);

        /*
         * Allocating new blocks and/or writing to an uninitialized extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_blocks()
         * with the caller's flags (which include the create flag).
         */
        down_write((&EXT4_I(inode)->i_data_sem));

        /*
         * if the caller is from delayed allocation writeout path
         * we have already reserved fs blocks for allocation
         * let the underlying get_block() function know to
         * avoid double accounting
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
        /*
         * We need to re-check EXT4_EXTENTS_FL here because migrate
         * could have changed the inode type in between
         */
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
                                              bh, flags);
        } else {
                retval = ext4_ind_get_blocks(handle, inode, block,
                                             max_blocks, bh, flags);

                if (retval > 0 && buffer_new(bh)) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
                        EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
                                                        ~EXT4_EXT_MIGRATE;
                }
        }

        /* Undo the delalloc hint set above, still under the write lock. */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 0;

        /*
         * Update reserved blocks/metadata blocks after successful
         * block allocation which had been deferred till now.
         */
        if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
                ext4_da_update_reserve_space(inode, retval);

        up_write((&EXT4_I(inode)->i_data_sem));
        /* Validate the final mapping the same way as the lookup result. */
        if (retval > 0 && buffer_mapped(bh)) {
                int ret = check_block_validity(inode, block,
                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
        return retval;
}
1279
1280 /* Maximum number of blocks we map for direct IO at once. */
1281 #define DIO_MAX_BLOCKS 4096
1282
1283 int ext4_get_block(struct inode *inode, sector_t iblock,
1284                    struct buffer_head *bh_result, int create)
1285 {
1286         handle_t *handle = ext4_journal_current_handle();
1287         int ret = 0, started = 0;
1288         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1289         int dio_credits;
1290
1291         if (create && !handle) {
1292                 /* Direct IO write... */
1293                 if (max_blocks > DIO_MAX_BLOCKS)
1294                         max_blocks = DIO_MAX_BLOCKS;
1295                 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
1296                 handle = ext4_journal_start(inode, dio_credits);
1297                 if (IS_ERR(handle)) {
1298                         ret = PTR_ERR(handle);
1299                         goto out;
1300                 }
1301                 started = 1;
1302         }
1303
1304         ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
1305                               create ? EXT4_GET_BLOCKS_CREATE : 0);
1306         if (ret > 0) {
1307                 bh_result->b_size = (ret << inode->i_blkbits);
1308                 ret = 0;
1309         }
1310         if (started)
1311                 ext4_journal_stop(handle);
1312 out:
1313         return ret;
1314 }
1315
1316 /*
1317  * `handle' can be NULL if create is zero
1318  */
1319 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1320                                 ext4_lblk_t block, int create, int *errp)
1321 {
1322         struct buffer_head dummy;
1323         int fatal = 0, err;
1324         int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE;
1325
1326         J_ASSERT(handle != NULL || create == 0);
1327
1328         dummy.b_state = 0;
1329         dummy.b_blocknr = -1000;
1330         buffer_trace_init(&dummy.b_history);
1331         if (create)
1332                 flags |= EXT4_GET_BLOCKS_CREATE;
1333         err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
1334         /*
1335          * ext4_get_blocks() returns number of blocks mapped. 0 in
1336          * case of a HOLE.
1337          */
1338         if (err > 0) {
1339                 if (err > 1)
1340                         WARN_ON(1);
1341                 err = 0;
1342         }
1343         *errp = err;
1344         if (!err && buffer_mapped(&dummy)) {
1345                 struct buffer_head *bh;
1346                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1347                 if (!bh) {
1348                         *errp = -EIO;
1349                         goto err;
1350                 }
1351                 if (buffer_new(&dummy)) {
1352                         J_ASSERT(create != 0);
1353                         J_ASSERT(handle != NULL);
1354
1355                         /*
1356                          * Now that we do not always journal data, we should
1357                          * keep in mind whether this should always journal the
1358                          * new buffer as metadata.  For now, regular file
1359                          * writes use ext4_get_block instead, so it's not a
1360                          * problem.
1361                          */
1362                         lock_buffer(bh);
1363                         BUFFER_TRACE(bh, "call get_create_access");
1364                         fatal = ext4_journal_get_create_access(handle, bh);
1365                         if (!fatal && !buffer_uptodate(bh)) {
1366                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1367                                 set_buffer_uptodate(bh);
1368                         }
1369                         unlock_buffer(bh);
1370                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1371                         err = ext4_handle_dirty_metadata(handle, inode, bh);
1372                         if (!fatal)
1373                                 fatal = err;
1374                 } else {
1375                         BUFFER_TRACE(bh, "not a new buffer");
1376                 }
1377                 if (fatal) {
1378                         *errp = fatal;
1379                         brelse(bh);
1380                         bh = NULL;
1381                 }
1382                 return bh;
1383         }
1384 err:
1385         return NULL;
1386 }
1387
1388 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1389                                ext4_lblk_t block, int create, int *err)
1390 {
1391         struct buffer_head *bh;
1392
1393         bh = ext4_getblk(handle, inode, block, create, err);
1394         if (!bh)
1395                 return bh;
1396         if (buffer_uptodate(bh))
1397                 return bh;
1398         ll_rw_block(READ_META, 1, &bh);
1399         wait_on_buffer(bh);
1400         if (buffer_uptodate(bh))
1401                 return bh;
1402         put_bh(bh);
1403         *err = -EIO;
1404         return NULL;
1405 }
1406
1407 static int walk_page_buffers(handle_t *handle,
1408                              struct buffer_head *head,
1409                              unsigned from,
1410                              unsigned to,
1411                              int *partial,
1412                              int (*fn)(handle_t *handle,
1413                                        struct buffer_head *bh))
1414 {
1415         struct buffer_head *bh;
1416         unsigned block_start, block_end;
1417         unsigned blocksize = head->b_size;
1418         int err, ret = 0;
1419         struct buffer_head *next;
1420
1421         for (bh = head, block_start = 0;
1422              ret == 0 && (bh != head || !block_start);
1423              block_start = block_end, bh = next)
1424         {
1425                 next = bh->b_this_page;
1426                 block_end = block_start + blocksize;
1427                 if (block_end <= from || block_start >= to) {
1428                         if (partial && !buffer_uptodate(bh))
1429                                 *partial = 1;
1430                         continue;
1431                 }
1432                 err = (*fn)(handle, bh);
1433                 if (!ret)
1434                         ret = err;
1435         }
1436         return ret;
1437 }
1438
1439 /*
1440  * To preserve ordering, it is essential that the hole instantiation and
1441  * the data write be encapsulated in a single transaction.  We cannot
1442  * close off a transaction and start a new one between the ext4_get_block()
1443  * and the commit_write().  So doing the jbd2_journal_start at the start of
1444  * prepare_write() is the right place.
1445  *
1446  * Also, this function can nest inside ext4_writepage() ->
1447  * block_write_full_page(). In that case, we *know* that ext4_writepage()
1448  * has generated enough buffer credits to do the whole page.  So we won't
1449  * block on the journal in that case, which is good, because the caller may
1450  * be PF_MEMALLOC.
1451  *
1452  * By accident, ext4 can be reentered when a transaction is open via
1453  * quota file writes.  If we were to commit the transaction while thus
1454  * reentered, there can be a deadlock - we would be holding a quota
1455  * lock, and the commit would never complete if another thread had a
1456  * transaction open and was blocking on the quota lock - a ranking
1457  * violation.
1458  *
1459  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1460  * will _not_ run commit under these circumstances because handle->h_ref
1461  * is elevated.  We'll still have enough credits for the tiny quotafile
1462  * write.
1463  */
1464 static int do_journal_get_write_access(handle_t *handle,
1465                                         struct buffer_head *bh)
1466 {
1467         if (!buffer_mapped(bh) || buffer_freed(bh))
1468                 return 0;
1469         return ext4_journal_get_write_access(handle, bh);
1470 }
1471
/*
 * Prepare the page cache page covering @pos for a write of @len bytes.
 *
 * Starts a journal handle with ext4_writepage_trans_blocks() credits,
 * grabs the page, and lets block_write_begin() map the buffers through
 * ext4_get_block().  When the inode journals its data, journal write
 * access is also taken on every buffer in the written range.
 *
 * Returns 0 on success, with the page stored in *pagep.  The handle is
 * not stopped on the success path in this function -- presumably the
 * matching write_end does that; confirm against the callers.  On failure
 * the page is unlocked and released, the handle is stopped, blocks
 * instantiated past i_size are trimmed, and a negative error is returned.
 * An -ENOSPC failure is retried for as long as ext4_should_retry_alloc()
 * allows.
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
        handle_t *handle;
        int retries = 0;
        struct page *page;
        pgoff_t index;
        unsigned from, to;

        trace_mark(ext4_write_begin,
                   "dev %s ino %lu pos %llu len %u flags %u",
                   inode->i_sb->s_id, inode->i_ino,
                   (unsigned long long) pos, len, flags);
        /* page index, and the byte range [from, to) inside that page */
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;

retry:
        handle = ext4_journal_start(inode, needed_blocks);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }

        /* We cannot recurse into the filesystem as the transaction is already
         * started */
        flags |= AOP_FLAG_NOFS;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
                ext4_journal_stop(handle);
                ret = -ENOMEM;
                goto out;
        }
        /* hand block_write_begin() the page we already hold */
        *pagep = page;

        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                ext4_get_block);

        /* data journaling: get write access to each buffer being written */
        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
                                from, to, NULL, do_journal_get_write_access);
        }

        if (ret) {
                /* undo: drop the page lock, the handle and the page ref */
                unlock_page(page);
                ext4_journal_stop(handle);
                page_cache_release(page);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
                        vmtruncate(inode, inode->i_size);
        }

        /* everything was released above, so a retry starts from scratch */
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
out:
        return ret;
}
1537
1538 /* For write_end() in data=journal mode */
1539 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1540 {
1541         if (!buffer_mapped(bh) || buffer_freed(bh))
1542                 return 0;
1543         set_buffer_uptodate(bh);
1544         return ext4_handle_dirty_metadata(handle, NULL, bh);
1545 }
1546
1547 /*
1548  * We need to pick up the new inode size which generic_commit_write gave us
1549  * `file' can be NULL - eg, when called from page_symlink().
1550  *
1551  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
1552  * buffers are managed internally.
1553  */
1554 static int ext4_ordered_write_end(struct file *file,
1555                                 struct address_space *mapping,
1556                                 loff_t pos, unsigned len, unsigned copied,
1557                                 struct page *page, void *fsdata)
1558 {
1559         handle_t *handle = ext4_journal_current_handle();
1560         struct inode *inode = mapping->host;
1561         int ret = 0, ret2;
1562
1563         trace_mark(ext4_ordered_write_end,
1564                    "dev %s ino %lu pos %llu len %u copied %u",
1565                    inode->i_sb->s_id, inode->i_ino,
1566                    (unsigned long long) pos, len, copied);
1567         ret = ext4_jbd2_file_inode(handle, inode);
1568
1569         if (ret == 0) {
1570                 loff_t new_i_size;
1571
1572                 new_i_size = pos + copied;
1573                 if (new_i_size > EXT4_I(inode)->i_disksize) {
1574                         ext4_update_i_disksize(inode, new_i_size);
1575                         /* We need to mark inode dirty even if
1576                          * new_i_size is less that inode->i_size
1577                          * bu greater than i_disksize.(hint delalloc)
1578                          */
1579                         ext4_mark_inode_dirty(handle, inode);
1580                 }
1581
1582                 ret2 = generic_write_end(file, mapping, pos, len, copied,
1583                                                         page, fsdata);
1584                 copied = ret2;
1585                 if (ret2 < 0)
1586                         ret = ret2;
1587         }
1588         ret2 = ext4_journal_stop(handle);
1589         if (!ret)
1590                 ret = ret2;
1591
1592         return ret ? ret : copied;
1593 }
1594
1595 static int ext4_writeback_write_end(struct file *file,
1596                                 struct address_space *mapping,
1597                                 loff_t pos, unsigned len, unsigned copied,
1598                                 struct page *page, void *fsdata)
1599 {
1600         handle_t *handle = ext4_journal_current_handle();
1601         struct inode *inode = mapping->host;
1602         int ret = 0, ret2;
1603         loff_t new_i_size;
1604
1605         trace_mark(ext4_writeback_write_end,
1606                    "dev %s ino %lu pos %llu len %u copied %u",
1607                    inode->i_sb->s_id, inode->i_ino,
1608                    (unsigned long long) pos, len, copied);
1609         new_i_size = pos + copied;
1610         if (new_i_size > EXT4_I(inode)->i_disksize) {
1611                 ext4_update_i_disksize(inode, new_i_size);
1612                 /* We need to mark inode dirty even if
1613                  * new_i_size is less that inode->i_size
1614                  * bu greater than i_disksize.(hint delalloc)
1615                  */
1616                 ext4_mark_inode_dirty(handle, inode);
1617         }
1618
1619         ret2 = generic_write_end(file, mapping, pos, len, copied,
1620                                                         page, fsdata);
1621         copied = ret2;
1622         if (ret2 < 0)
1623                 ret = ret2;
1624
1625         ret2 = ext4_journal_stop(handle);
1626         if (!ret)
1627                 ret = ret2;
1628
1629         return ret ? ret : copied;
1630 }
1631
1632 static int ext4_journalled_write_end(struct file *file,
1633                                 struct address_space *mapping,
1634                                 loff_t pos, unsigned len, unsigned copied,
1635                                 struct page *page, void *fsdata)
1636 {
1637         handle_t *handle = ext4_journal_current_handle();
1638         struct inode *inode = mapping->host;
1639         int ret = 0, ret2;
1640         int partial = 0;
1641         unsigned from, to;
1642         loff_t new_i_size;
1643
1644         trace_mark(ext4_journalled_write_end,
1645                    "dev %s ino %lu pos %llu len %u copied %u",
1646                    inode->i_sb->s_id, inode->i_ino,
1647                    (unsigned long long) pos, len, copied);
1648         from = pos & (PAGE_CACHE_SIZE - 1);
1649         to = from + len;
1650
1651         if (copied < len) {
1652                 if (!PageUptodate(page))
1653                         copied = 0;
1654                 page_zero_new_buffers(page, from+copied, to);
1655         }
1656
1657         ret = walk_page_buffers(handle, page_buffers(page), from,
1658                                 to, &partial, write_end_fn);
1659         if (!partial)
1660                 SetPageUptodate(page);
1661         new_i_size = pos + copied;
1662         if (new_i_size > inode->i_size)
1663                 i_size_write(inode, pos+copied);
1664         EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1665         if (new_i_size > EXT4_I(inode)->i_disksize) {
1666                 ext4_update_i_disksize(inode, new_i_size);
1667                 ret2 = ext4_mark_inode_dirty(handle, inode);
1668                 if (!ret)
1669                         ret = ret2;
1670         }
1671
1672         unlock_page(page);
1673         ret2 = ext4_journal_stop(handle);
1674         if (!ret)
1675                 ret = ret2;
1676         page_cache_release(page);
1677
1678         return ret ? ret : copied;
1679 }
1680
1681 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1682 {
1683         int retries = 0;
1684         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1685         unsigned long md_needed, mdblocks, total = 0;
1686
1687         /*
1688          * recalculate the amount of metadata blocks to reserve
1689          * in order to allocate nrblocks
1690          * worse case is one extent per block
1691          */
1692 repeat:
1693         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1694         total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1695         mdblocks = ext4_calc_metadata_amount(inode, total);
1696         BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
1697
1698         md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1699         total = md_needed + nrblocks;
1700
1701         /*
1702          * Make quota reservation here to prevent quota overflow
1703          * later. Real quota accounting is done at pages writeout
1704          * time.
1705          */
1706         if (vfs_dq_reserve_block(inode, total)) {
1707                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1708                 return -EDQUOT;
1709         }
1710
1711         if (ext4_claim_free_blocks(sbi, total)) {
1712                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1713                 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1714                         yield();
1715                         goto repeat;
1716                 }
1717                 vfs_dq_release_reservation_block(inode, total);
1718                 return -ENOSPC;
1719         }
1720         EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1721         EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1722
1723         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1724         return 0;       /* success */
1725 }
1726
1727 static void ext4_da_release_space(struct inode *inode, int to_free)
1728 {
1729         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1730         int total, mdb, mdb_free, release;
1731
1732         if (!to_free)
1733                 return;         /* Nothing to release, exit */
1734
1735         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1736
1737         if (!EXT4_I(inode)->i_reserved_data_blocks) {
1738                 /*
1739                  * if there is no reserved blocks, but we try to free some
1740                  * then the counter is messed up somewhere.
1741                  * but since this function is called from invalidate
1742                  * page, it's harmless to return without any action
1743                  */
1744                 printk(KERN_INFO "ext4 delalloc try to release %d reserved "
1745                             "blocks for inode %lu, but there is no reserved "
1746                             "data blocks\n", to_free, inode->i_ino);
1747                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1748                 return;
1749         }
1750
1751         /* recalculate the number of metablocks still need to be reserved */
1752         total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
1753         mdb = ext4_calc_metadata_amount(inode, total);
1754
1755         /* figure out how many metablocks to release */
1756         BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1757         mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1758
1759         release = to_free + mdb_free;
1760
1761         /* update fs dirty blocks counter for truncate case */
1762         percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1763
1764         /* update per-inode reservations */
1765         BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
1766         EXT4_I(inode)->i_reserved_data_blocks -= to_free;
1767
1768         BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1769         EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1770         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1771
1772         vfs_dq_release_reservation_block(inode, release);
1773 }
1774
1775 static void ext4_da_page_release_reservation(struct page *page,
1776                                                 unsigned long offset)
1777 {
1778         int to_release = 0;
1779         struct buffer_head *head, *bh;
1780         unsigned int curr_off = 0;
1781
1782         head = page_buffers(page);
1783         bh = head;
1784         do {
1785                 unsigned int next_off = curr_off + bh->b_size;
1786
1787                 if ((offset <= curr_off) && (buffer_delay(bh))) {
1788                         to_release++;
1789                         clear_buffer_delay(bh);
1790                 }
1791                 curr_off = next_off;
1792         } while ((bh = bh->b_this_page) != head);
1793         ext4_da_release_space(page->mapping->host, to_release);
1794 }
1795
1796 /*
1797  * Delayed allocation stuff
1798  */
1799
/*
 * Context carried through one delayed-allocation writeback pass: the
 * block extent being accumulated (b_blocknr/b_size/b_state) and the run
 * of pages that covers it (first_page..next_page).
 */
struct mpage_da_data {
        struct inode *inode;            /* inode being written back */
        sector_t b_blocknr;             /* start block number of extent */
        size_t b_size;                  /* size of extent */
        unsigned long b_state;          /* state of the extent */
        /* first page of the extent, and the page after its last page */
        unsigned long first_page, next_page;    /* extent of pages */
        struct writeback_control *wbc;  /* handed on to ->writepage() */
        int io_done;                    /* set to 1 once the extent was flushed */
        int pages_written;              /* pages written without being skipped */
        int retval;                     /* error recorded by mpage_da_map_blocks() */
};
1811
1812 /*
1813  * mpage_da_submit_io - walks through extent of pages and try to write
1814  * them with writepage() call back
1815  *
1816  * @mpd->inode: inode
1817  * @mpd->first_page: first page of the extent
1818  * @mpd->next_page: page after the last page of the extent
1819  *
1820  * By the time mpage_da_submit_io() is called we expect all blocks
1821  * to be allocated. this may be wrong if allocation failed.
1822  *
1823  * As pages are already locked by write_cache_pages(), we can't use it
1824  */
1825 static int mpage_da_submit_io(struct mpage_da_data *mpd)
1826 {
1827         long pages_skipped;
1828         struct pagevec pvec;
1829         unsigned long index, end;
1830         int ret = 0, err, nr_pages, i;
1831         struct inode *inode = mpd->inode;
1832         struct address_space *mapping = inode->i_mapping;
1833
1834         BUG_ON(mpd->next_page <= mpd->first_page);
1835         /*
1836          * We need to start from the first_page to the next_page - 1
1837          * to make sure we also write the mapped dirty buffer_heads.
1838          * If we look at mpd->b_blocknr we would only be looking
1839          * at the currently mapped buffer_heads.
1840          */
1841         index = mpd->first_page;
1842         end = mpd->next_page - 1;
1843
1844         pagevec_init(&pvec, 0);
1845         while (index <= end) {
1846                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1847                 if (nr_pages == 0)
1848                         break;
1849                 for (i = 0; i < nr_pages; i++) {
1850                         struct page *page = pvec.pages[i];
1851
1852                         index = page->index;
1853                         if (index > end)
1854                                 break;
1855                         index++;
1856
1857                         BUG_ON(!PageLocked(page));
1858                         BUG_ON(PageWriteback(page));
1859
1860                         pages_skipped = mpd->wbc->pages_skipped;
1861                         err = mapping->a_ops->writepage(page, mpd->wbc);
1862                         if (!err && (pages_skipped == mpd->wbc->pages_skipped))
1863                                 /*
1864                                  * have successfully written the page
1865                                  * without skipping the same
1866                                  */
1867                                 mpd->pages_written++;
1868                         /*
1869                          * In error case, we have to continue because
1870                          * remaining pages are still locked
1871                          * XXX: unlock and re-dirty them?
1872                          */
1873                         if (ret == 0)
1874                                 ret = err;
1875                 }
1876                 pagevec_release(&pvec);
1877         }
1878         return ret;
1879 }
1880
/*
 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
 *
 * @mpd->inode - inode to walk through
 * @exbh->b_blocknr - first block on a disk
 * @exbh->b_size - amount of space in bytes
 * @logical - first logical block to start assignment with
 *
 * The function walks the page-cache buffers covering logical blocks
 * [@logical, @logical + @exbh->b_size >> blkbits) and puts the matching
 * disk block numbers into the buffer heads, dropping BH_Delay and
 * BH_Unwritten.  Buffers that already carry a block number (unwritten or
 * plainly mapped) are checked against the expected number with BUG_ON().
 */
static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                                 struct buffer_head *exbh)
{
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
        int blocks = exbh->b_size >> inode->i_blkbits;
        sector_t pblock = exbh->b_blocknr, cur_logical;
        struct buffer_head *head, *bh;
        pgoff_t index, end;
        struct pagevec pvec;
        int nr_pages, i;

        /* first and last page index covering the block range */
        index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
        end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
        /* logical block number of the first buffer on page `index' */
        cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

        pagevec_init(&pvec, 0);

        while (index <= end) {
                /* XXX: optimize tail */
                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];

                        index = page->index;
                        if (index > end)
                                break;
                        index++;

                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
                        BUG_ON(!page_has_buffers(page));

                        bh = page_buffers(page);
                        head = bh;

                        /*
                         * NOTE(review): cur_logical is only advanced by the
                         * two walks below and is never recomputed from
                         * page->index, so this presumably relies on the
                         * looked-up pages being contiguous - confirm.
                         */

                        /* skip blocks out of the range */
                        do {
                                if (cur_logical >= logical)
                                        break;
                                cur_logical++;
                        } while ((bh = bh->b_this_page) != head);

                        /* assign/verify block numbers until the range ends */
                        do {
                                if (cur_logical >= logical + blocks)
                                        break;

                                if (buffer_delay(bh) ||
                                                buffer_unwritten(bh)) {

                                        BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);

                                        if (buffer_delay(bh)) {
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
                                        } else {
                                                /*
                                                 * unwritten already should have
                                                 * blocknr assigned. Verify that
                                                 */
                                                clear_buffer_unwritten(bh);
                                                BUG_ON(bh->b_blocknr != pblock);
                                        }

                                } else if (buffer_mapped(bh))
                                        BUG_ON(bh->b_blocknr != pblock);

                                /* logical and physical block advance together */
                                cur_logical++;
                                pblock++;
                        } while ((bh = bh->b_this_page) != head);
                }
                pagevec_release(&pvec);
        }
}
1968
1969
1970 /*
1971  * __unmap_underlying_blocks - just a helper function to unmap
1972  * set of blocks described by @bh
1973  */
1974 static inline void __unmap_underlying_blocks(struct inode *inode,
1975                                              struct buffer_head *bh)
1976 {
1977         struct block_device *bdev = inode->i_sb->s_bdev;
1978         int blocks, i;
1979
1980         blocks = bh->b_size >> inode->i_blkbits;
1981         for (i = 0; i < blocks; i++)
1982                 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1983 }
1984
1985 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
1986                                         sector_t logical, long blk_cnt)
1987 {
1988         int nr_pages, i;
1989         pgoff_t index, end;
1990         struct pagevec pvec;
1991         struct inode *inode = mpd->inode;
1992         struct address_space *mapping = inode->i_mapping;
1993
1994         index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1995         end   = (logical + blk_cnt - 1) >>
1996                                 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1997         while (index <= end) {
1998                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1999                 if (nr_pages == 0)
2000                         break;
2001                 for (i = 0; i < nr_pages; i++) {
2002                         struct page *page = pvec.pages[i];
2003                         index = page->index;
2004                         if (index > end)
2005                                 break;
2006                         index++;
2007
2008                         BUG_ON(!PageLocked(page));
2009                         BUG_ON(PageWriteback(page));
2010                         block_invalidatepage(page, 0);
2011                         ClearPageUptodate(page);
2012                         unlock_page(page);
2013                 }
2014         }
2015         return;
2016 }
2017
2018 static void ext4_print_free_blocks(struct inode *inode)
2019 {
2020         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2021         printk(KERN_EMERG "Total free blocks count %lld\n",
2022                         ext4_count_free_blocks(inode->i_sb));
2023         printk(KERN_EMERG "Free/Dirty block details\n");
2024         printk(KERN_EMERG "free_blocks=%lld\n",
2025                         (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
2026         printk(KERN_EMERG "dirty_blocks=%lld\n",
2027                         (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2028         printk(KERN_EMERG "Block reservation details\n");
2029         printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
2030                         EXT4_I(inode)->i_reserved_data_blocks);
2031         printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
2032                         EXT4_I(inode)->i_reserved_meta_blocks);
2033         return;
2034 }
2035
/*
 * mpage_da_map_blocks - allocate/map the blocks of the accumulated extent
 *
 * @mpd - writeback context; b_blocknr, b_size and b_state describe the
 *        extent of blocks to map
 *
 * The function skips space we know is already mapped to disk blocks.
 *
 * Returns 0 when there is nothing to do, when the blocks were mapped, when
 * ext4_get_blocks() returned -EAGAIN, or when it returned -ENOSPC while
 * free blocks still exist (the error is then recorded in @mpd->retval).
 * Any other allocation failure invalidates the pages of the extent and is
 * returned, as are errors from ext4_jbd2_file_inode() and
 * ext4_mark_inode_dirty().
 */
static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
        int err, blks, get_blocks_flags;
        struct buffer_head new;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        /* NOTE(review): this initial value is overwritten before it is read */
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
        handle_t *handle = NULL;

        /*
         * We consider only non-mapped and non-allocated blocks
         */
        if ((mpd->b_state  & (1 << BH_Mapped)) &&
                !(mpd->b_state & (1 << BH_Delay)) &&
                !(mpd->b_state & (1 << BH_Unwritten)))
                return 0;

        /*
         * If we didn't accumulate anything to write simply return
         */
        if (!mpd->b_size)
                return 0;

        /* the caller must already be running inside a transaction */
        handle = ext4_journal_current_handle();
        BUG_ON(!handle);

        /*
         * Call ext4_get_blocks() to allocate any delayed allocation
         * blocks, or to convert an uninitialized extent to be
         * initialized (in the case where we have written into
         * one or more preallocated blocks).
         *
         * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
         * indicate that we are on the delayed allocation path.  This
         * affects functions in many different parts of the allocation
         * call path.  This flag exists primarily because we don't
         * want to change *many* call functions, so ext4_get_blocks()
         * will set the magic i_delalloc_reserved_flag once the
         * inode's allocation semaphore is taken.
         *
         * If the blocks in questions were delalloc blocks, set
         * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
         * variables are updated after the blocks have been allocated.
         */
        new.b_state = 0;
        get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
                            EXT4_GET_BLOCKS_DELALLOC_RESERVE);
        if (mpd->b_state & (1 << BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
        blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
                               &new, get_blocks_flags);
        if (blks < 0) {
                err = blks;
                /*
                 * If get block returns with error we simply
                 * return. Later writepage will redirty the page and
                 * writepages will find the dirty page again
                 */
                if (err == -EAGAIN)
                        return 0;

                /*
                 * Out of space although free blocks are still counted:
                 * record the error for the caller and report success here.
                 */
                if (err == -ENOSPC &&
                    ext4_count_free_blocks(mpd->inode->i_sb)) {
                        mpd->retval = err;
                        return 0;
                }

                /*
                 * get block failure will cause us to loop in
                 * writepages, because a_ops->writepage won't be able
                 * to make progress. The page will be redirtied by
                 * writepage and writepages will again try to write
                 * the same.
                 */
                printk(KERN_EMERG "%s block allocation failed for inode %lu "
                                  "at logical offset %llu with max blocks "
                                  "%zd with error %d\n",
                                  __func__, mpd->inode->i_ino,
                                  (unsigned long long)next,
                                  mpd->b_size >> mpd->inode->i_blkbits, err);
                printk(KERN_EMERG "This should not happen.!! "
                                        "Data will be lost\n");
                if (err == -ENOSPC) {
                        ext4_print_free_blocks(mpd->inode);
                }
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
                return err;
        }
        BUG_ON(blks == 0);

        /* `new' now describes the run that was mapped: blks blocks */
        new.b_size = (blks << mpd->inode->i_blkbits);

        if (buffer_new(&new))
                __unmap_underlying_blocks(mpd->inode, &new);

        /*
         * If blocks are delayed marked, we need to
         * put actual blocknr and drop delayed bit
         */
        if ((mpd->b_state & (1 << BH_Delay)) ||
            (mpd->b_state & (1 << BH_Unwritten)))
                mpage_put_bnr_to_bhs(mpd, next, &new);

        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
                if (err)
                        return err;
        }

        /*
         * Update on-disk size along with block allocation we don't
         * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
         * within already allocated block -bzzz
         */
        /* end of the mapped run in bytes, capped at the current i_size */
        disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
        if (disksize > i_size_read(mpd->inode))
                disksize = i_size_read(mpd->inode);
        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                ext4_update_i_disksize(mpd->inode, disksize);
                return ext4_mark_inode_dirty(handle, mpd->inode);
        }

        return 0;
}
2170
/*
 * Buffer state bits that mpage_add_bh_to_extent() keeps in mpd->b_state
 * and compares when deciding whether a block can join the current extent.
 */
#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
                (1 << BH_Delay) | (1 << BH_Unwritten))
2173
/*
 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
 *
 * @mpd - writeback context; b_blocknr, b_size and b_state hold the extent
 *        accumulated so far
 * @logical - logical number of the block in the file
 * @b_size - size in bytes of the space to add
 * @b_state - buffer state bits of the block (only BH_FLAGS are compared)
 *
 * the function is used to collect contig. blocks in same state
 *
 * If the block cannot be merged, the current extent is mapped and
 * submitted, and mpd->io_done is set; the block itself is not added.
 */
static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
                                   sector_t logical, size_t b_size,
                                   unsigned long b_state)
{
        sector_t next;
        /* number of blocks already accumulated in the extent */
        int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;

        /* check if the reserved journal credits might overflow */
        if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
                if (nrblocks >= EXT4_MAX_TRANS_DATA) {
                        /*
                         * With non-extent format we are limited by the journal
                         * credit available.  Total credit needed to insert
                         * nrblocks contiguous blocks is dependent on the
                         * nrblocks.  So limit nrblocks.
                         */
                        goto flush_it;
                } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
                                EXT4_MAX_TRANS_DATA) {
                        /*
                         * Adding the new buffer_head would make it cross the
                         * allowed limit for which we have journal credit
                         * reserved. So limit the new bh->b_size
                         */
                        b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
                                                mpd->inode->i_blkbits;
                        /* we will do mpage_da_submit_io in the next loop */
                }
        }
        /*
         * First block in the extent
         */
        if (mpd->b_size == 0) {
                mpd->b_blocknr = logical;
                mpd->b_size = b_size;
                mpd->b_state = b_state & BH_FLAGS;
                return;
        }

        /* logical block that would directly follow the current extent */
        next = mpd->b_blocknr + nrblocks;
        /*
         * Can we merge the block to our big extent?
         */
        if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
                mpd->b_size += b_size;
                return;
        }

flush_it:
        /*
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
        if (mpage_da_map_blocks(mpd) == 0)
                mpage_da_submit_io(mpd);
        /* set whether or not mapping succeeded */
        mpd->io_done = 1;
        return;
}
2241
2242 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2243 {
2244         /*
2245          * unmapped buffer is possible for holes.
2246          * delay buffer is possible with delayed allocation.
2247          * We also need to consider unwritten buffer as unmapped.
2248          */
2249         return (!buffer_mapped(bh) || buffer_delay(bh) ||
2250                                 buffer_unwritten(bh)) && buffer_dirty(bh);
2251 }
2252
2253 /*
2254  * __mpage_da_writepage - finds extent of pages and blocks
2255  *
2256  * @page: page to consider
2257  * @wbc: not used, we just follow rules
2258  * @data: context
2259  *
2260  * The function finds extents of pages and scan them for all blocks.
2261  */
2262 static int __mpage_da_writepage(struct page *page,
2263                                 struct writeback_control *wbc, void *data)
2264 {
2265         struct mpage_da_data *mpd = data;
2266         struct inode *inode = mpd->inode;
2267         struct buffer_head *bh, *head;
2268         sector_t logical;
2269
2270         if (mpd->io_done) {
2271                 /*
2272                  * Rest of the page in the page_vec
2273                  * redirty then and skip then. We will
2274                  * try to to write them again after
2275                  * starting a new transaction
2276                  */
2277                 redirty_page_for_writepage(wbc, page);
2278                 unlock_page(page);
2279                 return MPAGE_DA_EXTENT_TAIL;
2280         }
2281         /*
2282          * Can we merge this page to current extent?
2283          */
2284         if (mpd->next_page != page->index) {
2285                 /*
2286                  * Nope, we can't. So, we map non-allocated blocks
2287                  * and start IO on them using writepage()
2288                  */
2289                 if (mpd->next_page != mpd->first_page) {
2290                         if (mpage_da_map_blocks(mpd) == 0)
2291                                 mpage_da_submit_io(mpd);
2292                         /*
2293                          * skip rest of the page in the page_vec
2294                          */
2295                         mpd->io_done = 1;
2296                         redirty_page_for_writepage(wbc, page);
2297                         unlock_page(page);
2298                         return MPAGE_DA_EXTENT_TAIL;
2299                 }
2300
2301                 /*
2302                  * Start next extent of pages ...
2303                  */
2304                 mpd->first_page = page->index;
2305
2306                 /*
2307                  * ... and blocks
2308                  */
2309                 mpd->b_size = 0;
2310                 mpd->b_state = 0;
2311                 mpd->b_blocknr = 0;
2312         }
2313
2314         mpd->next_page = page->index + 1;
2315         logical = (sector_t) page->index <<
2316                   (PAGE_CACHE_SHIFT - inode->i_blkbits);
2317
2318         if (!page_has_buffers(page)) {
2319                 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2320                                        (1 << BH_Dirty) | (1 << BH_Uptodate));
2321                 if (mpd->io_done)
2322                         return MPAGE_DA_EXTENT_TAIL;
2323         } else {
2324                 /*
2325                  * Page with regular buffer heads, just add all dirty ones
2326                  */
2327                 head = page_buffers(page);
2328                 bh = head;
2329                 do {
2330                         BUG_ON(buffer_locked(bh));
2331                         /*
2332                          * We need to try to allocate
2333                          * unmapped blocks in the same page.
2334                          * Otherwise we won't make progress
2335                          * with the page in ext4_da_writepage
2336                          */
2337                         if (ext4_bh_unmapped_or_delay(NULL, bh)) {
2338                                 mpage_add_bh_to_extent(mpd, logical,
2339                                                        bh->b_size,
2340                                                        bh->b_state);
2341                                 if (mpd->io_done)
2342                                         return MPAGE_DA_EXTENT_TAIL;
2343                         } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2344                                 /*
2345                                  * mapped dirty buffer. We need to update
2346                                  * the b_state because we look at
2347                                  * b_state in mpage_da_map_blocks. We don't
2348                                  * update b_size because if we find an
2349                                  * unmapped buffer_head later we need to
2350                                  * use the b_state flag of that buffer_head.
2351                                  */
2352                                 if (mpd->b_size == 0)
2353                                         mpd->b_state = bh->b_state & BH_FLAGS;
2354                         }
2355                         logical++;
2356                 } while ((bh = bh->b_this_page) != head);
2357         }
2358
2359         return 0;
2360 }
2361
2362 /*
2363  * This is a special get_blocks_t callback which is used by
2364  * ext4_da_write_begin().  It will either return mapped block or
2365  * reserve space for a single block.
2366  *
2367  * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2368  * We also have b_blocknr = -1 and b_bdev initialized properly
2369  *
2370  * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2371  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
2372  * initialized properly.
2373  */
2374 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2375                                   struct buffer_head *bh_result, int create)
2376 {
2377         int ret = 0;
2378         sector_t invalid_block = ~((sector_t) 0xffff);
2379
2380         if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2381                 invalid_block = ~0;
2382
2383         BUG_ON(create == 0);
2384         BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2385
2386         /*
2387          * first, we need to know whether the block is allocated already
2388          * preallocated blocks are unmapped but should treated
2389          * the same as allocated blocks.
2390          */
2391         ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
2392         if ((ret == 0) && !buffer_delay(bh_result)) {
2393                 /* the block isn't (pre)allocated yet, let's reserve space */
2394                 /*
2395                  * XXX: __block_prepare_write() unmaps passed block,
2396                  * is it OK?
2397                  */
2398                 ret = ext4_da_reserve_space(inode, 1);
2399                 if (ret)
2400                         /* not enough space to reserve */
2401                         return ret;
2402
2403                 map_bh(bh_result, inode->i_sb, invalid_block);
2404                 set_buffer_new(bh_result);
2405                 set_buffer_delay(bh_result);
2406         } else if (ret > 0) {
2407                 bh_result->b_size = (ret << inode->i_blkbits);
2408                 if (buffer_unwritten(bh_result)) {
2409                         /* A delayed write to unwritten bh should
2410                          * be marked new and mapped.  Mapped ensures
2411                          * that we don't do get_block multiple times
2412                          * when we write to the same offset and new
2413                          * ensures that we do proper zero out for
2414                          * partial write.
2415                          */
2416                         set_buffer_new(bh_result);
2417                         set_buffer_mapped(bh_result);
2418                 }
2419                 ret = 0;
2420         }
2421
2422         return ret;
2423 }
2424
2425 /*
2426  * This function is used as a standard get_block_t calback function
2427  * when there is no desire to allocate any blocks.  It is used as a
2428  * callback function for block_prepare_write(), nobh_writepage(), and
2429  * block_write_full_page().  These functions should only try to map a
2430  * single block at a time.
2431  *
2432  * Since this function doesn't do block allocations even if the caller
2433  * requests it by passing in create=1, it is critically important that
2434  * any caller checks to make sure that any buffer heads are returned
2435  * by this function are either all already mapped or marked for
2436  * delayed allocation before calling nobh_writepage() or
2437  * block_write_full_page().  Otherwise, b_blocknr could be left
2438  * unitialized, and the page write functions will be taken by
2439  * surprise.
2440  */
2441 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2442                                    struct buffer_head *bh_result, int create)
2443 {
2444         int ret = 0;
2445         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2446
2447         BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2448
2449         /*
2450          * we don't want to do block allocation in writepage
2451          * so call get_block_wrap with create = 0
2452          */
2453         ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2454         BUG_ON(create && ret == 0);
2455         if (ret > 0) {
2456                 bh_result->b_size = (ret << inode->i_blkbits);
2457                 ret = 0;
2458         }
2459         return ret;
2460 }
2461
2462 /*
2463  * This function can get called via...
2464  *   - ext4_da_writepages after taking page lock (have journal handle)
2465  *   - journal_submit_inode_data_buffers (no journal handle)
2466  *   - shrink_page_list via pdflush (no journal handle)
2467  *   - grab_page_cache when doing write_begin (have journal handle)
2468  */
2469 static int ext4_da_writepage(struct page *page,
2470                                 struct writeback_control *wbc)
2471 {
2472         int ret = 0;
2473         loff_t size;
2474         unsigned int len;
2475         struct buffer_head *page_bufs;
2476         struct inode *inode = page->mapping->host;
2477
2478         trace_mark(ext4_da_writepage,
2479                    "dev %s ino %lu page_index %lu",
2480                    inode->i_sb->s_id, inode->i_ino, page->index);
2481         size = i_size_read(inode);
2482         if (page->index == size >> PAGE_CACHE_SHIFT)
2483                 len = size & ~PAGE_CACHE_MASK;
2484         else
2485                 len = PAGE_CACHE_SIZE;
2486
2487         if (page_has_buffers(page)) {
2488                 page_bufs = page_buffers(page);
2489                 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2490                                         ext4_bh_unmapped_or_delay)) {
2491                         /*
2492                          * We don't want to do  block allocation
2493                          * So redirty the page and return
2494                          * We may reach here when we do a journal commit
2495                          * via journal_submit_inode_data_buffers.
2496                          * If we don't have mapping block we just ignore
2497                          * them. We can also reach here via shrink_page_list
2498                          */
2499                         redirty_page_for_writepage(wbc, page);
2500                         unlock_page(page);
2501                         return 0;
2502                 }
2503         } else {
2504                 /*
2505                  * The test for page_has_buffers() is subtle:
2506                  * We know the page is dirty but it lost buffers. That means
2507                  * that at some moment in time after write_begin()/write_end()
2508                  * has been called all buffers have been clean and thus they
2509                  * must have been written at least once. So they are all
2510                  * mapped and we can happily proceed with mapping them
2511                  * and writing the page.
2512                  *
2513                  * Try to initialize the buffer_heads and check whether
2514                  * all are mapped and non delay. We don't want to
2515                  * do block allocation here.
2516                  */
2517                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2518                                           noalloc_get_block_write);
2519                 if (!ret) {
2520                         page_bufs = page_buffers(page);
2521                         /* check whether all are mapped and non delay */
2522                         if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2523                                                 ext4_bh_unmapped_or_delay)) {
2524                                 redirty_page_for_writepage(wbc, page);
2525                                 unlock_page(page);
2526                                 return 0;
2527                         }
2528                 } else {
2529                         /*
2530                          * We can't do block allocation here
2531                          * so just redity the page and unlock
2532                          * and return
2533                          */
2534                         redirty_page_for_writepage(wbc, page);
2535                         unlock_page(page);
2536                         return 0;
2537                 }
2538                 /* now mark the buffer_heads as dirty and uptodate */
2539                 block_commit_write(page, 0, PAGE_CACHE_SIZE);
2540         }
2541
2542         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2543                 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2544         else
2545                 ret = block_write_full_page(page, noalloc_get_block_write,
2546                                             wbc);
2547
2548         return ret;
2549 }
2550
2551 /*
2552  * This is called via ext4_da_writepages() to
2553  * calulate the total number of credits to reserve to fit
2554  * a single extent allocation into a single transaction,
2555  * ext4_da_writpeages() will loop calling this before
2556  * the block allocation.
2557  */
2558
2559 static int ext4_da_writepages_trans_blocks(struct inode *inode)
2560 {
2561         int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2562
2563         /*
2564          * With non-extent format the journal credit needed to
2565          * insert nrblocks contiguous block is dependent on
2566          * number of contiguous block. So we will limit
2567          * number of contiguous block to a sane value
2568          */
2569         if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
2570             (max_blocks > EXT4_MAX_TRANS_DATA))
2571                 max_blocks = EXT4_MAX_TRANS_DATA;
2572
2573         return ext4_chunk_trans_blocks(inode, max_blocks);
2574 }
2575
2576 static int ext4_da_writepages(struct address_space *mapping,
2577                               struct writeback_control *wbc)
2578 {
2579         pgoff_t index;
2580         int range_whole = 0;
2581         handle_t *handle = NULL;
2582         struct mpage_da_data mpd;
2583         struct inode *inode = mapping->host;
2584         int no_nrwrite_index_update;
2585         int pages_written = 0;
2586         long pages_skipped;
2587         int range_cyclic, cycled = 1, io_done = 0;
2588         int needed_blocks, ret = 0, nr_to_writebump = 0;
2589         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2590
2591         trace_mark(ext4_da_writepages,
2592                    "dev %s ino %lu nr_t_write %ld "
2593                    "pages_skipped %ld range_start %llu "
2594                    "range_end %llu nonblocking %d "
2595                    "for_kupdate %d for_reclaim %d "
2596                    "for_writepages %d range_cyclic %d",
2597                    inode->i_sb->s_id, inode->i_ino,
2598                    wbc->nr_to_write, wbc->pages_skipped,
2599                    (unsigned long long) wbc->range_start,
2600                    (unsigned long long) wbc->range_end,
2601                    wbc->nonblocking, wbc->for_kupdate,
2602                    wbc->for_reclaim, wbc->for_writepages,
2603                    wbc->range_cyclic);
2604
2605         /*
2606          * No pages to write? This is mainly a kludge to avoid starting
2607          * a transaction for special inodes like journal inode on last iput()
2608          * because that could violate lock ordering on umount
2609          */
2610         if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2611                 return 0;
2612
2613         /*
2614          * If the filesystem has aborted, it is read-only, so return
2615          * right away instead of dumping stack traces later on that
2616          * will obscure the real source of the problem.  We test
2617          * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
2618          * the latter could be true if the filesystem is mounted
2619          * read-only, and in that case, ext4_da_writepages should
2620          * *never* be called, so if that ever happens, we would want
2621          * the stack trace.
2622          */
2623         if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
2624                 return -EROFS;
2625
2626         /*
2627          * Make sure nr_to_write is >= sbi->s_mb_stream_request
2628          * This make sure small files blocks are allocated in
2629          * single attempt. This ensure that small files
2630          * get less fragmented.
2631          */
2632         if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2633                 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2634                 wbc->nr_to_write = sbi->s_mb_stream_request;
2635         }
2636         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2637                 range_whole = 1;
2638
2639         range_cyclic = wbc->range_cyclic;
2640         if (wbc->range_cyclic) {
2641                 index = mapping->writeback_index;
2642                 if (index)
2643                         cycled = 0;
2644                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2645                 wbc->range_end  = LLONG_MAX;
2646                 wbc->range_cyclic = 0;
2647         } else
2648                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2649
2650         mpd.wbc = wbc;
2651         mpd.inode = mapping->host;
2652
2653         /*
2654          * we don't want write_cache_pages to update
2655          * nr_to_write and writeback_index
2656          */
2657         no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2658         wbc->no_nrwrite_index_update = 1;
2659         pages_skipped = wbc->pages_skipped;
2660
2661 retry:
2662         while (!ret && wbc->nr_to_write > 0) {
2663
2664                 /*
2665                  * we  insert one extent at a time. So we need
2666                  * credit needed for single extent allocation.
2667                  * journalled mode is currently not supported
2668                  * by delalloc
2669                  */
2670                 BUG_ON(ext4_should_journal_data(inode));
2671                 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2672
2673                 /* start a new transaction*/
2674                 handle = ext4_journal_start(inode, needed_blocks);
2675                 if (IS_ERR(handle)) {
2676                         ret = PTR_ERR(handle);
2677                         printk(KERN_CRIT "%s: jbd2_start: "
2678                                "%ld pages, ino %lu; err %d\n", __func__,
2679                                 wbc->nr_to_write, inode->i_ino, ret);
2680                         dump_stack();
2681                         goto out_writepages;
2682                 }
2683
2684                 /*
2685                  * Now call __mpage_da_writepage to find the next
2686                  * contiguous region of logical blocks that need
2687                  * blocks to be allocated by ext4.  We don't actually
2688                  * submit the blocks for I/O here, even though
2689                  * write_cache_pages thinks it will, and will set the
2690                  * pages as clean for write before calling
2691                  * __mpage_da_writepage().
2692                  */
2693                 mpd.b_size = 0;
2694                 mpd.b_state = 0;
2695                 mpd.b_blocknr = 0;
2696                 mpd.first_page = 0;
2697                 mpd.next_page = 0;
2698                 mpd.io_done = 0;
2699                 mpd.pages_written = 0;
2700                 mpd.retval = 0;
2701                 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2702                                         &mpd);
2703                 /*
2704                  * If we have a contigous extent of pages and we
2705                  * haven't done the I/O yet, map the blocks and submit
2706                  * them for I/O.
2707                  */
2708                 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2709                         if (mpage_da_map_blocks(&mpd) == 0)
2710                                 mpage_da_submit_io(&mpd);
2711                         mpd.io_done = 1;
2712                         ret = MPAGE_DA_EXTENT_TAIL;
2713                 }
2714                 wbc->nr_to_write -= mpd.pages_written;
2715
2716                 ext4_journal_stop(handle);
2717
2718                 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2719                         /* commit the transaction which would
2720                          * free blocks released in the transaction
2721                          * and try again
2722                          */
2723                         jbd2_journal_force_commit_nested(sbi->s_journal);
2724                         wbc->pages_skipped = pages_skipped;
2725                         ret = 0;
2726                 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2727                         /*
2728                          * got one extent now try with
2729                          * rest of the pages
2730                          */
2731                         pages_written += mpd.pages_written;
2732                         wbc->pages_skipped = pages_skipped;
2733                         ret = 0;
2734                         io_done = 1;
2735                 } else if (wbc->nr_to_write)
2736                         /*
2737                          * There is no more writeout needed
2738                          * or we requested for a noblocking writeout
2739                          * and we found the device congested
2740                          */
2741                         break;
2742         }
2743         if (!io_done && !cycled) {
2744                 cycled = 1;
2745                 index = 0;
2746                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2747                 wbc->range_end  = mapping->writeback_index - 1;
2748                 goto retry;
2749         }
2750         if (pages_skipped != wbc->pages_skipped)
2751                 printk(KERN_EMERG "This should not happen leaving %s "
2752                                 "with nr_to_write = %ld ret = %d\n",
2753                                 __func__, wbc->nr_to_write, ret);
2754
2755         /* Update index */
2756         index += pages_written;
2757         wbc->range_cyclic = range_cyclic;
2758         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2759                 /*
2760                  * set the writeback_index so that range_cyclic
2761                  * mode will write it back later
2762                  */
2763                 mapping->writeback_index = index;
2764
2765 out_writepages:
2766         if (!no_nrwrite_index_update)
2767                 wbc->no_nrwrite_index_update = 0;
2768         wbc->nr_to_write -= nr_to_writebump;
2769         trace_mark(ext4_da_writepage_result,
2770                    "dev %s ino %lu ret %d pages_written %d "
2771                    "pages_skipped %ld congestion %d "
2772                    "more_io %d no_nrwrite_index_update %d",
2773                    inode->i_sb->s_id, inode->i_ino, ret,
2774                    pages_written, wbc->pages_skipped,
2775                    wbc->encountered_congestion, wbc->more_io,
2776                    wbc->no_nrwrite_index_update);
2777         return ret;
2778 }
2779
2780 #define FALL_BACK_TO_NONDELALLOC 1
2781 static int ext4_nonda_switch(struct super_block *sb)
2782 {
2783         s64 free_blocks, dirty_blocks;
2784         struct ext4_sb_info *sbi = EXT4_SB(sb);
2785
2786         /*
2787          * switch to non delalloc mode if we are running low
2788          * on free block. The free block accounting via percpu
2789          * counters can get slightly wrong with percpu_counter_batch getting
2790          * accumulated on each CPU without updating global counters
2791          * Delalloc need an accurate free block accounting. So switch
2792          * to non delalloc when we are near to error range.
2793          */
2794         free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2795         dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2796         if (2 * free_blocks < 3 * dirty_blocks ||
2797                 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2798                 /*
2799                  * free block count is less that 150% of dirty blocks
2800                  * or free blocks is less that watermark
2801                  */
2802                 return 1;
2803         }
2804         return 0;
2805 }
2806
2807 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2808                                 loff_t pos, unsigned len, unsigned flags,
2809                                 struct page **pagep, void **fsdata)
2810 {
2811         int ret, retries = 0;
2812         struct page *page;
2813         pgoff_t index;
2814         unsigned from, to;
2815         struct inode *inode = mapping->host;
2816         handle_t *handle;
2817
2818         index = pos >> PAGE_CACHE_SHIFT;
2819         from = pos & (PAGE_CACHE_SIZE - 1);
2820         to = from + len;
2821
2822         if (ext4_nonda_switch(inode->i_sb)) {
2823                 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2824                 return ext4_write_begin(file, mapping, pos,
2825                                         len, flags, pagep, fsdata);
2826         }
2827         *fsdata = (void *)0;
2828
2829         trace_mark(ext4_da_write_begin,
2830                    "dev %s ino %lu pos %llu len %u flags %u",
2831                    inode->i_sb->s_id, inode->i_ino,
2832                    (unsigned long long) pos, len, flags);
2833 retry:
2834         /*
2835          * With delayed allocation, we don't log the i_disksize update
2836          * if there is delayed block allocation. But we still need
2837          * to journalling the i_disksize update if writes to the end
2838          * of file which has an already mapped buffer.
2839          */
2840         handle = ext4_journal_start(inode, 1);
2841         if (IS_ERR(handle)) {
2842                 ret = PTR_ERR(handle);
2843                 goto out;
2844         }
2845         /* We cannot recurse into the filesystem as the transaction is already
2846          * started */
2847         flags |= AOP_FLAG_NOFS;
2848
2849         page = grab_cache_page_write_begin(mapping, index, flags);
2850         if (!page) {
2851                 ext4_journal_stop(handle);
2852                 ret = -ENOMEM;
2853                 goto out;
2854         }
2855         *pagep = page;
2856
2857         ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2858                                 ext4_da_get_block_prep);
2859         if (ret < 0) {
2860                 unlock_page(page);
2861                 ext4_journal_stop(handle);
2862                 page_cache_release(page);
2863                 /*
2864                  * block_write_begin may have instantiated a few blocks
2865                  * outside i_size.  Trim these off again. Don't need
2866                  * i_size_read because we hold i_mutex.
2867                  */
2868                 if (pos + len > inode->i_size)
2869                         vmtruncate(inode, inode->i_size);
2870         }
2871
2872         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2873                 goto retry;
2874 out:
2875         return ret;
2876 }
2877
2878 /*
2879  * Check if we should update i_disksize
2880  * when write to the end of file but not require block allocation
2881  */
2882 static int ext4_da_should_update_i_disksize(struct page *page,
2883                                          unsigned long offset)
2884 {
2885         struct buffer_head *bh;
2886         struct inode *inode = page->mapping->host;
2887         unsigned int idx;
2888         int i;
2889
2890         bh = page_buffers(page);
2891         idx = offset >> inode->i_blkbits;
2892
2893         for (i = 0; i < idx; i++)
2894                 bh = bh->b_this_page;
2895
2896         if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2897                 return 0;
2898         return 1;
2899 }
2900
2901 static int ext4_da_write_end(struct file *file,
2902                                 struct address_space *mapping,
2903                                 loff_t pos, unsigned len, unsigned copied,
2904                                 struct page *page, void *fsdata)
2905 {
2906         struct inode *inode = mapping->host;
2907         int ret = 0, ret2;
2908         handle_t *handle = ext4_journal_current_handle();
2909         loff_t new_i_size;
2910         unsigned long start, end;
2911         int write_mode = (int)(unsigned long)fsdata;
2912
2913         if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2914                 if (ext4_should_order_data(inode)) {
2915                         return ext4_ordered_write_end(file, mapping, pos,
2916                                         len, copied, page, fsdata);
2917                 } else if (ext4_should_writeback_data(inode)) {
2918                         return ext4_writeback_write_end(file, mapping, pos,
2919                                         len, copied, page, fsdata);
2920                 } else {
2921                         BUG();
2922                 }
2923         }
2924
2925         trace_mark(ext4_da_write_end,
2926                    "dev %s ino %lu pos %llu len %u copied %u",
2927                    inode->i_sb->s_id, inode->i_ino,
2928                    (unsigned long long) pos, len, copied);
2929         start = pos & (PAGE_CACHE_SIZE - 1);
2930         end = start + copied - 1;
2931
2932         /*
2933          * generic_write_end() will run mark_inode_dirty() if i_size
2934          * changes.  So let's piggyback the i_disksize mark_inode_dirty
2935          * into that.
2936          */
2937
2938         new_i_size = pos + copied;
2939         if (new_i_size > EXT4_I(inode)->i_disksize) {
2940                 if (ext4_da_should_update_i_disksize(page, end)) {
2941                         down_write(&EXT4_I(inode)->i_data_sem);
2942                         if (new_i_size > EXT4_I(inode)->i_disksize) {
2943                                 /*
2944                                  * Updating i_disksize when extending file
2945                                  * without needing block allocation
2946                                  */
2947                                 if (ext4_should_order_data(inode))
2948                                         ret = ext4_jbd2_file_inode(handle,
2949                                                                    inode);
2950
2951                                 EXT4_I(inode)->i_disksize = new_i_size;
2952                         }
2953                         up_write(&EXT4_I(inode)->i_data_sem);
2954                         /* We need to mark inode dirty even if
2955                          * new_i_size is less that inode->i_size
2956                          * bu greater than i_disksize.(hint delalloc)
2957                          */
2958                         ext4_mark_inode_dirty(handle, inode);
2959                 }
2960         }
2961         ret2 = generic_write_end(file, mapping, pos, len, copied,
2962                                                         page, fsdata);
2963         copied = ret2;
2964         if (ret2 < 0)
2965                 ret = ret2;
2966         ret2 = ext4_journal_stop(handle);
2967         if (!ret)
2968                 ret = ret2;
2969
2970         return ret ? ret : copied;
2971 }
2972
/*
 * ->invalidatepage for delayed-allocation mappings.
 *
 * Drops the block reservation attached to the page's buffers (when the page
 * has buffers at all) and then performs the regular ext4 invalidation.
 * The page must be locked by the caller.
 */
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
        BUG_ON(!PageLocked(page));

        /* Without buffers there is no reservation to release. */
        if (page_has_buffers(page))
                ext4_da_page_release_reservation(page, offset);

        ext4_invalidatepage(page, offset);
}
2989
2990 /*
2991  * Force all delayed allocation blocks to be allocated for a given inode.
2992  */
2993 int ext4_alloc_da_blocks(struct inode *inode)
2994 {
2995         if (!EXT4_I(inode)->i_reserved_data_blocks &&
2996             !EXT4_I(inode)->i_reserved_meta_blocks)
2997                 return 0;
2998
2999         /*
3000          * We do something simple for now.  The filemap_flush() will
3001          * also start triggering a write of the data blocks, which is
3002          * not strictly speaking necessary (and for users of
3003          * laptop_mode, not even desirable).  However, to do otherwise
3004          * would require replicating code paths in:
3005          *
3006          * ext4_da_writepages() ->
3007          *    write_cache_pages() ---> (via passed in callback function)
3008          *        __mpage_da_writepage() -->
3009          *           mpage_add_bh_to_extent()
3010          *           mpage_da_map_blocks()
3011          *
3012          * The problem is that write_cache_pages(), located in
3013          * mm/page-writeback.c, marks pages clean in preparation for
3014          * doing I/O, which is not desirable if we're not planning on
3015          * doing I/O at all.
3016          *
3017          * We could call write_cache_pages(), and then redirty all of
3018          * the pages by calling redirty_page_for_writeback() but that
3019          * would be ugly in the extreme.  So instead we would need to
3020          * replicate parts of the code in the above functions,
3021          * simplifying them becuase we wouldn't actually intend to
3022          * write out the pages, but rather only collect contiguous
3023          * logical block extents, call the multi-block allocator, and
3024          * then update the buffer heads with the block allocations.
3025          *
3026          * For now, though, we'll cheat by calling filemap_flush(),
3027          * which will map the blocks, and start the I/O, but not
3028          * actually wait for the I/O to complete.
3029          */
3030         return filemap_flush(inode->i_mapping);
3031 }
3032
3033 /*
3034  * bmap() is special.  It gets used by applications such as lilo and by
3035  * the swapper to find the on-disk block of a specific piece of data.
3036  *
3037  * Naturally, this is dangerous if the block concerned is still in the
3038  * journal.  If somebody makes a swapfile on an ext4 data-journaling
3039  * filesystem and enables swap, then they may get a nasty shock when the
3040  * data getting swapped to that swapfile suddenly gets overwritten by
3041  * the original zeroes written out previously to the journal and
3042  * awaiting writeback in the kernel's buffer cache.
3043  *
3044  * So, if we see any bmap calls here on a modified, data-journaled file,
3045  * take extra steps to flush any blocks which might be in the cache.
3046  */
3047 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3048 {
3049         struct inode *inode = mapping->host;
3050         journal_t *journal;
3051         int err;
3052
3053         if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3054                         test_opt(inode->i_sb, DELALLOC)) {
3055                 /*
3056                  * With delalloc we want to sync the file
3057                  * so that we can make sure we allocate
3058                  * blocks for file
3059                  */
3060                 filemap_write_and_wait(mapping);
3061         }
3062
3063         if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
3064                 /*
3065                  * This is a REALLY heavyweight approach, but the use of
3066                  * bmap on dirty files is expected to be extremely rare:
3067                  * only if we run lilo or swapon on a freshly made file
3068                  * do we expect this to happen.
3069                  *
3070                  * (bmap requires CAP_SYS_RAWIO so this does not
3071                  * represent an unprivileged user DOS attack --- we'd be
3072                  * in trouble if mortal users could trigger this path at
3073                  * will.)
3074                  *
3075                  * NB. EXT4_STATE_JDATA is not set on files other than
3076                  * regular files.  If somebody wants to bmap a directory
3077                  * or symlink and gets confused because the buffer
3078                  * hasn't yet been flushed to disk, they deserve
3079                  * everything they get.
3080                  */
3081
3082                 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
3083                 journal = EXT4_JOURNAL(inode);
3084                 jbd2_journal_lock_updates(journal);
3085                 err = jbd2_journal_flush(journal);
3086                 jbd2_journal_unlock_updates(journal);
3087
3088                 if (err)
3089                         return 0;
3090         }
3091
3092         return generic_block_bmap(mapping, block, ext4_get_block);
3093 }
3094
3095 static int bget_one(handle_t *handle, struct buffer_head *bh)
3096 {
3097         get_bh(bh);
3098         return 0;
3099 }
3100
3101 static int bput_one(handle_t *handle, struct buffer_head *bh)
3102 {
3103         put_bh(bh);
3104         return 0;
3105 }
3106
3107 /*
3108  * Note that we don't need to start a transaction unless we're journaling data
3109  * because we should have holes filled from ext4_page_mkwrite(). We even don't
3110  * need to file the inode to the transaction's list in ordered mode because if
3111  * we are writing back data added by write(), the inode is already there and if
3112  * we are writing back data modified via mmap(), no one guarantees in which
3113  * transaction the data will hit the disk. In case we are journaling data, we
3114  * cannot start transaction directly because transaction start ranks above page
3115  * lock so we have to do some magic.
3116  *
3117  * In all journaling modes block_write_full_page() will start the I/O.
3118  *
3119  * Problem:
3120  *
3121  *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
3122  *              ext4_writepage()
3123  *
3124  * Similar for:
3125  *
3126  *      ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
3127  *
3128  * Same applies to ext4_get_block().  We will deadlock on various things like
3129  * lock_journal and i_data_sem
3130  *
3131  * Setting PF_MEMALLOC here doesn't work - too many internal memory
3132  * allocations fail.
3133  *
3134  * 16May01: If we're reentered then journal_current_handle() will be
3135  *          non-zero. We simply *return*.
3136  *
3137  * 1 July 2001: @@@ FIXME:
3138  *   In journalled data mode, a data buffer may be metadata against the
3139  *   current transaction.  But the same file is part of a shared mapping
3140  *   and someone does a writepage() on it.
3141  *
3142  *   We will move the buffer onto the async_data list, but *after* it has
3143  *   been dirtied. So there's a small window where we have dirty data on
3144  *   BJ_Metadata.
3145  *
3146  *   Note that this only applies to the last partial page in the file.  The
3147  *   bit which block_write_full_page() uses prepare/commit for.  (That's
3148  *   broken code anyway: it's wrong for msync()).
3149  *
3150  *   It's a rare case: affects the final partial page, for journalled data
3151  *   where the file is subject to both write() and writepage() in the same
3152  *   transaction.  To fix it we'll need a custom block_write_full_page().
3153  *   We'll probably need that anyway for journalling writepage() output.
3154  *
3155  * We don't honour synchronous mounts for writepage().  That would be
3156  * disastrous.  Any write() or metadata operation will sync the fs for
3157  * us.
3158  *
3159  */
3160 static int __ext4_normal_writepage(struct page *page,
3161                                 struct writeback_control *wbc)
3162 {
3163         struct inode *inode = page->mapping->host;
3164
3165         if (test_opt(inode->i_sb, NOBH))
3166                 return nobh_writepage(page, noalloc_get_block_write, wbc);
3167         else
3168                 return block_write_full_page(page, noalloc_get_block_write,
3169                                              wbc);
3170 }
3171
3172 static int ext4_normal_writepage(struct page *page,
3173                                 struct writeback_control *wbc)
3174 {
3175         struct inode *inode = page->mapping->host;
3176         loff_t size = i_size_read(inode);
3177         loff_t len;
3178
3179         trace_mark(ext4_normal_writepage,
3180                    "dev %s ino %lu page_index %lu",
3181                    inode->i_sb->s_id, inode->i_ino, page->index);
3182         J_ASSERT(PageLocked(page));
3183         if (page->index == size >> PAGE_CACHE_SHIFT)
3184                 len = size & ~PAGE_CACHE_MASK;
3185         else
3186                 len = PAGE_CACHE_SIZE;
3187
3188         if (page_has_buffers(page)) {
3189                 /* if page has buffers it should all be mapped
3190                  * and allocated. If there are not buffers attached
3191                  * to the page we know the page is dirty but it lost
3192                  * buffers. That means that at some moment in time
3193                  * after write_begin() / write_end() has been called
3194                  * all buffers have been clean and thus they must have been
3195                  * written at least once. So they are all mapped and we can
3196                  * happily proceed with mapping them and writing the page.
3197                  */
3198                 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3199                                         ext4_bh_unmapped_or_delay));
3200         }
3201
3202         if (!ext4_journal_current_handle())
3203                 return __ext4_normal_writepage(page, wbc);
3204
3205         redirty_page_for_writepage(wbc, page);
3206         unlock_page(page);
3207         return 0;
3208 }
3209
3210 static int __ext4_journalled_writepage(struct page *page,
3211                                 struct writeback_control *wbc)
3212 {
3213         struct address_space *mapping = page->mapping;
3214         struct inode *inode = mapping->host;
3215         struct buffer_head *page_bufs;
3216         handle_t *handle = NULL;
3217         int ret = 0;
3218         int err;
3219
3220         ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
3221                                   noalloc_get_block_write);
3222         if (ret != 0)
3223                 goto out_unlock;
3224
3225         page_bufs = page_buffers(page);
3226         walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
3227                                                                 bget_one);
3228         /* As soon as we unlock the page, it can go away, but we have
3229          * references to buffers so we are safe */
3230         unlock_page(page);
3231
3232         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
3233         if (IS_ERR(handle)) {
3234                 ret = PTR_ERR(handle);
3235                 goto out;
3236         }
3237
3238         ret = walk_page_buffers(handle, page_bufs, 0,
3239                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
3240
3241         err = walk_page_buffers(handle, page_bufs, 0,
3242                                 PAGE_CACHE_SIZE, NULL, write_end_fn);
3243         if (ret == 0)
3244                 ret = err;
3245         err = ext4_journal_stop(handle);
3246         if (!ret)
3247                 ret = err;
3248
3249         walk_page_buffers(handle, page_bufs, 0,
3250                                 PAGE_CACHE_SIZE, NULL, bput_one);
3251         EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
3252         goto out;
3253
3254 out_unlock:
3255         unlock_page(page);
3256 out:
3257         return ret;
3258 }
3259
3260 static int ext4_journalled_writepage(struct page *page,
3261                                 struct writeback_control *wbc)
3262 {
3263         struct inode *inode = page->mapping->host;
3264         loff_t size = i_size_read(inode);
3265         loff_t len;
3266
3267         trace_mark(ext4_journalled_writepage,
3268                    "dev %s ino %lu page_index %lu",
3269                    inode->i_sb->s_id, inode->i_ino, page->index);
3270         J_ASSERT(PageLocked(page));
3271         if (page->index == size >> PAGE_CACHE_SHIFT)
3272                 len = size & ~PAGE_CACHE_MASK;
3273         else
3274                 len = PAGE_CACHE_SIZE;
3275
3276         if (page_has_buffers(page)) {
3277                 /* if page has buffers it should all be mapped
3278                  * and allocated. If there are not buffers attached
3279                  * to the page we know the page is dirty but it lost
3280                  * buffers. That means that at some moment in time
3281                  * after write_begin() / write_end() has been called
3282                  * all buffers have been clean and thus they must have been
3283                  * written at least once. So they are all mapped and we can
3284                  * happily proceed with mapping them and writing the page.
3285                  */
3286                 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3287                                         ext4_bh_unmapped_or_delay));
3288         }
3289
3290         if (ext4_journal_current_handle())
3291                 goto no_write;
3292
3293         if (PageChecked(page)) {
3294                 /*
3295                  * It's mmapped pagecache.  Add buffers and journal it.  There
3296                  * doesn't seem much point in redirtying the page here.
3297                  */
3298                 ClearPageChecked(page);
3299                 return __ext4_journalled_writepage(page, wbc);
3300         } else {
3301                 /*
3302                  * It may be a page full of checkpoint-mode buffers.  We don't
3303                  * really know unless we go poke around in the buffer_heads.
3304                  * But block_write_full_page will do the right thing.
3305                  */
3306                 return block_write_full_page(page, noalloc_get_block_write,
3307                                              wbc);
3308         }
3309 no_write:
3310         redirty_page_for_writepage(wbc, page);
3311         unlock_page(page);
3312         return 0;
3313 }
3314
3315 static int ext4_readpage(struct file *file, struct page *page)
3316 {
3317         return mpage_readpage(page, ext4_get_block);
3318 }
3319
3320 static int
3321 ext4_readpages(struct file *file, struct address_space *mapping,
3322                 struct list_head *pages, unsigned nr_pages)
3323 {
3324         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3325 }
3326
3327 static void ext4_invalidatepage(struct page *page, unsigned long offset)
3328 {
3329         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3330
3331         /*
3332          * If it's a full truncate we just forget about the pending dirtying
3333          */
3334         if (offset == 0)
3335                 ClearPageChecked(page);
3336
3337         if (journal)
3338                 jbd2_journal_invalidatepage(journal, page, offset);
3339         else
3340                 block_invalidatepage(page, offset);
3341 }
3342
3343 static int ext4_releasepage(struct page *page, gfp_t wait)
3344 {
3345         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3346
3347         WARN_ON(PageChecked(page));
3348         if (!page_has_buffers(page))
3349                 return 0;
3350         if (journal)
3351                 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3352         else
3353                 return try_to_free_buffers(page);
3354 }
3355
3356 /*
3357  * If the O_DIRECT write will extend the file then add this inode to the
3358  * orphan list.  So recovery will truncate it back to the original size
3359  * if the machine crashes during the write.
3360  *
3361  * If the O_DIRECT write is instantiating holes inside i_size and the machine
3362  * crashes then stale disk data _may_ be exposed inside the file. But current
3363  * VFS code falls back into buffered path in that case so we are safe.
3364  */
3365 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3366                         const struct iovec *iov, loff_t offset,
3367                         unsigned long nr_segs)
3368 {
3369         struct file *file = iocb->ki_filp;
3370         struct inode *inode = file->f_mapping->host;
3371         struct ext4_inode_info *ei = EXT4_I(inode);
3372         handle_t *handle;
3373         ssize_t ret;
3374         int orphan = 0;
3375         size_t count = iov_length(iov, nr_segs);
3376
3377         if (rw == WRITE) {
3378                 loff_t final_size = offset + count;
3379
3380                 if (final_size > inode->i_size) {
3381                         /* Credits for sb + inode write */
3382                         handle = ext4_journal_start(inode, 2);
3383                         if (IS_ERR(handle)) {
3384                                 ret = PTR_ERR(handle);
3385                                 goto out;
3386                         }
3387                         ret = ext4_orphan_add(handle, inode);
3388                         if (ret) {
3389                                 ext4_journal_stop(handle);
3390                                 goto out;
3391                         }
3392                         orphan = 1;
3393                         ei->i_disksize = inode->i_size;
3394                         ext4_journal_stop(handle);
3395                 }
3396         }
3397
3398         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3399                                  offset, nr_segs,
3400                                  ext4_get_block, NULL);
3401
3402         if (orphan) {
3403                 int err;
3404
3405                 /* Credits for sb + inode write */
3406                 handle = ext4_journal_start(inode, 2);
3407                 if (IS_ERR(handle)) {
3408                         /* This is really bad luck. We've written the data
3409                          * but cannot extend i_size. Bail out and pretend
3410                          * the write failed... */
3411                         ret = PTR_ERR(handle);
3412                         goto out;
3413                 }
3414                 if (inode->i_nlink)
3415                         ext4_orphan_del(handle, inode);
3416                 if (ret > 0) {
3417                         loff_t end = offset + ret;
3418                         if (end > inode->i_size) {
3419                                 ei->i_disksize = end;
3420                                 i_size_write(inode, end);
3421                                 /*
3422                                  * We're going to return a positive `ret'
3423                                  * here due to non-zero-length I/O, so there's
3424                                  * no way of reporting error returns from
3425                                  * ext4_mark_inode_dirty() to userspace.  So
3426                                  * ignore it.
3427                                  */
3428                                 ext4_mark_inode_dirty(handle, inode);
3429                         }
3430                 }
3431                 err = ext4_journal_stop(handle);
3432                 if (ret == 0)
3433                         ret = err;
3434         }
3435 out:
3436         return ret;
3437 }
3438
3439 /*
3440  * Pages can be marked dirty completely asynchronously from ext4's journalling
3441  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
3442  * much here because ->set_page_dirty is called under VFS locks.  The page is
3443  * not necessarily locked.
3444  *
3445  * We cannot just dirty the page and leave attached buffers clean, because the
3446  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
3447  * or jbddirty because all the journalling code will explode.
3448  *
3449  * So what we do is to mark the page "pending dirty" and next time writepage
3450  * is called, propagate that into the buffers appropriately.
3451  */
static int ext4_journalled_set_page_dirty(struct page *page)
{
        /*
         * Flag the page as "pending dirty"; ext4_journalled_writepage()
         * tests and clears PageChecked when it journals the page.
         */
        SetPageChecked(page);

        /* Dirty the page itself without touching its buffers. */
        return __set_page_dirty_nobuffers(page);
}
3457
3458 static const struct address_space_operations ext4_ordered_aops = {
3459         .readpage               = ext4_readpage,
3460         .readpages              = ext4_readpages,
3461         .writepage              = ext4_normal_writepage,
3462         .sync_page              = block_sync_page,
3463         .write_begin            = ext4_write_begin,
3464         .write_end              = ext4_ordered_write_end,
3465         .bmap                   = ext4_bmap,
3466         .invalidatepage         = ext4_invalidatepage,
3467         .releasepage            = ext4_releasepage,
3468         .direct_IO              = ext4_direct_IO,
3469         .migratepage            = buffer_migrate_page,
3470         .is_partially_uptodate  = block_is_partially_uptodate,
3471 };
3472
3473 static const struct address_space_operations ext4_writeback_aops = {
3474         .readpage               = ext4_readpage,
3475         .readpages              = ext4_readpages,
3476         .writepage              = ext4_normal_writepage,
3477         .sync_page              = block_sync_page,
3478         .write_begin            = ext4_write_begin,
3479         .write_end              = ext4_writeback_write_end,
3480         .bmap                   = ext4_bmap,
3481         .invalidatepage         = ext4_invalidatepage,
3482         .releasepage            = ext4_releasepage,
3483         .direct_IO              = ext4_direct_IO,
3484         .migratepage            = buffer_migrate_page,
3485         .is_partially_uptodate  = block_is_partially_uptodate,
3486 };
3487
3488 static const struct address_space_operations ext4_journalled_aops = {
3489         .readpage               = ext4_readpage,
3490         .readpages              = ext4_readpages,
3491         .writepage              = ext4_journalled_writepage,
3492         .sync_page              = block_sync_page,
3493         .write_begin            = ext4_write_begin,
3494         .write_end              = ext4_journalled_write_end,
3495         .set_page_dirty         = ext4_journalled_set_page_dirty,
3496         .bmap                   = ext4_bmap,
3497         .invalidatepage         = ext4_invalidatepage,
3498         .releasepage            = ext4_releasepage,
3499         .is_partially_uptodate  = block_is_partially_uptodate,
3500 };
3501
3502 static const struct address_space_operations ext4_da_aops = {
3503         .readpage               = ext4_readpage,
3504         .readpages              = ext4_readpages,
3505         .writepage              = ext4_da_writepage,
3506         .writepages             = ext4_da_writepages,
3507         .sync_page              = block_sync_page,
3508         .write_begin            = ext4_da_write_begin,
3509         .write_end              = ext4_da_write_end,
3510         .bmap                   = ext4_bmap,
3511         .invalidatepage         = ext4_da_invalidatepage,
3512         .releasepage            = ext4_releasepage,
3513         .direct_IO              = ext4_direct_IO,
3514         .migratepage            = buffer_migrate_page,
3515         .is_partially_uptodate  = block_is_partially_uptodate,
3516 };
3517
3518 void ext4_set_aops(struct inode *inode)
3519 {
3520         if (ext4_should_order_data(inode) &&
3521                 test_opt(inode->i_sb, DELALLOC))
3522                 inode->i_mapping->a_ops = &ext4_da_aops;
3523         else if (ext4_should_order_data(inode))
3524                 inode->i_mapping->a_ops = &ext4_ordered_aops;
3525         else if (ext4_should_writeback_data(inode) &&
3526                  test_opt(inode->i_sb, DELALLOC))
3527                 inode->i_mapping->a_ops = &ext4_da_aops;
3528         else if (ext4_should_writeback_data(inode))
3529                 inode->i_mapping->a_ops = &ext4_writeback_aops;
3530         else
3531                 inode->i_mapping->a_ops = &ext4_journalled_aops;
3532 }
3533
3534 /*
3535  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3536  * up to the end of the block which corresponds to `from'.
3537  * This required during truncate. We need to physically zero the tail end
3538  * of that block so it doesn't yield old data if the file is later grown.
3539  */
3540 int ext4_block_truncate_page(handle_t *handle,
3541                 struct address_space *mapping, loff_t from)
3542 {
3543         ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3544         unsigned offset = from & (PAGE_CACHE_SIZE-1);
3545         unsigned blocksize, length, pos;
3546         ext4_lblk_t iblock;
3547         struct inode *inode = mapping->host;
3548         struct buffer_head *bh;
3549         struct page *page;
3550         int err = 0;
3551
3552         page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
3553         if (!page)
3554                 return -EINVAL;
3555
3556         blocksize = inode->i_sb->s_blocksize;
3557         length = blocksize - (offset & (blocksize - 1));
3558         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3559
3560         /*
3561          * For "nobh" option,  we can only work if we don't need to
3562          * read-in the page - otherwise we create buffers to do the IO.
3563          */
3564         if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
3565              ext4_should_writeback_data(inode) && PageUptodate(page)) {
3566                 zero_user(page, offset, length);
3567                 set_page_dirty(page);
3568                 goto unlock;
3569         }
3570
3571         if (!page_has_buffers(page))
3572                 create_empty_buffers(page, blocksize, 0);
3573
3574         /* Find the buffer that contains "offset" */
3575         bh = page_buffers(page);
3576         pos = blocksize;
3577         while (offset >= pos) {
3578                 bh = bh->b_this_page;
3579                 iblock++;
3580                 pos += blocksize;
3581         }
3582
3583         err = 0;
3584         if (buffer_freed(bh)) {
3585                 BUFFER_TRACE(bh, "freed: skip");
3586                 goto unlock;
3587         }
3588
3589         if (!buffer_mapped(bh)) {
3590                 BUFFER_TRACE(bh, "unmapped");
3591                 ext4_get_block(inode, iblock, bh, 0);
3592                 /* unmapped? It's a hole - nothing to do */
3593                 if (!buffer_mapped(bh)) {
3594                         BUFFER_TRACE(bh, "still unmapped");
3595                         goto unlock;
3596                 }
3597         }
3598
3599         /* Ok, it's mapped. Make sure it's up-to-date */
3600         if (PageUptodate(page))
3601                 set_buffer_uptodate(bh);
3602
3603         if (!buffer_uptodate(bh)) {
3604                 err = -EIO;
3605                 ll_rw_block(READ, 1, &bh);
3606                 wait_on_buffer(bh);
3607                 /* Uhhuh. Read error. Complain and punt. */
3608                 if (!buffer_uptodate(bh))
3609                         goto unlock;
3610         }
3611
3612         if (ext4_should_journal_data(inode)) {
3613                 BUFFER_TRACE(bh, "get write access");
3614                 err = ext4_journal_get_write_access(handle, bh);
3615                 if (err)
3616                         goto unlock;
3617         }
3618
3619         zero_user(page, offset, length);
3620
3621         BUFFER_TRACE(bh, "zeroed end of block");
3622
3623         err = 0;
3624         if (ext4_should_journal_data(inode)) {
3625                 err = ext4_handle_dirty_metadata(handle, inode, bh);
3626         } else {
3627                 if (ext4_should_order_data(inode))
3628                         err = ext4_jbd2_file_inode(handle, inode);
3629                 mark_buffer_dirty(bh);
3630         }
3631
3632 unlock:
3633         unlock_page(page);
3634         page_cache_release(page);
3635         return err;
3636 }
3637
/*
 * Return 1 if every 32-bit word in the half-open range [p, q) is zero
 * (an empty range counts as all-zero), 0 as soon as a non-zero word is seen.
 *
 * Probably it should be a library function... search for first non-zero word
 * or memcmp with zero_page, whatever is better for particular architecture.
 * Linus?
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
        __le32 *cur;

        for (cur = p; cur < q; cur++) {
                if (*cur != 0)
                        return 0;
        }

        return 1;
}
3650
3651 /**
3652  *      ext4_find_shared - find the indirect blocks for partial truncation.
3653  *      @inode:   inode in question
3654  *      @depth:   depth of the affected branch
3655  *      @offsets: offsets of pointers in that branch (see ext4_block_to_path)
3656  *      @chain:   place to store the pointers to partial indirect blocks
3657  *      @top:     place to the (detached) top of branch
3658  *
3659  *      This is a helper function used by ext4_truncate().
3660  *
3661  *      When we do truncate() we may have to clean the ends of several
3662  *      indirect blocks but leave the blocks themselves alive. Block is
3663  *      partially truncated if some data below the new i_size is refered
3664  *      from it (and it is on the path to the first completely truncated
3665  *      data block, indeed).  We have to free the top of that path along
3666  *      with everything to the right of the path. Since no allocation
3667  *      past the truncation point is possible until ext4_truncate()
3668  *      finishes, we may safely do the latter, but top of branch may
3669  *      require special attention - pageout below the truncation point
3670  *      might try to populate it.
3671  *
3672  *      We atomically detach the top of branch from the tree, store the
3673  *      block number of its root in *@top, pointers to buffer_heads of
3674  *      partially truncated blocks - in @chain[].bh and pointers to
3675  *      their last elements that should not be removed - in
3676  *      @chain[].p. Return value is the pointer to last filled element
3677  *      of @chain.
3678  *
3679  *      The work left to caller to do the actual freeing of subtrees:
3680  *              a) free the subtree starting from *@top
3681  *              b) free the subtrees whose roots are stored in
3682  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
3683  *              c) free the subtrees growing from the inode past the @chain[0].
3684  *                      (no partially truncated stuff there).  */
3685
3686 static Indirect *ext4_find_shared(struct inode *inode, int depth,
3687                         ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
3688 {
3689         Indirect *partial, *p;
3690         int k, err;
3691
             /* Default: nothing detached.  *top stays 0 if we leave via no_top. */
3692         *top = 0;
3693         /* Make k index the deepest non-null offset + 1 */
3694         for (k = depth; k > 1 && !offsets[k-1]; k--)
3695                 ;
             /*
              * Walk the first k levels of the path.  err is filled in by
              * ext4_get_branch() but is not examined in this function.
              */
3696         partial = ext4_get_branch(inode, k, offsets, chain, &err);
3697         /* Writer: pointers */
             /*
              * A NULL result is replaced by the deepest of the k chain slots
              * (presumably NULL means the whole k-level branch was present -
              * confirm against ext4_get_branch()).
              */
3698         if (!partial)
3699                 partial = chain + k-1;
3700         /*
3701          * If the branch acquired continuation since we've looked at it -
3702          * fine, it should all survive and (new) top doesn't belong to us.
3703          */
3704         if (!partial->key && *partial->p)
3705                 /* Writer: end */
3706                 goto no_top;
             /*
              * Step p back towards chain[0] for as long as all_zeroes() holds
              * for the range from the start of p's block up to p->p
              * (presumably: no live pointers to the left of the path in that
              * block).  p never goes below chain[0], which has no bh to scan.
              */
3707         for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
3708                 ;
3709         /*
3710          * OK, we've found the last block that must survive. The rest of our
3711          * branch should be detached before unlocking. However, if that rest
3712          * of branch is all ours and does not grow immediately from the inode
3713          * it's easier to cheat and just decrement partial->p.
3714          */
3715         if (p == chain + k - 1 && p > chain) {
                     /*
                      * p is still the deepest slot and is not chain[0]: move
                      * the "last survivor" pointer one element to the left and
                      * leave *top at 0.
                      */
3716                 p->p--;
3717         } else {
                     /* Report the root of the branch to free; tree is unchanged. */
3718                 *top = *p->p;
3719                 /* Nope, don't do this in ext4.  Must leave the tree intact */
3720 #if 0
3721                 *p->p = 0;
3722 #endif
3723         }
3724         /* Writer: end */
3725
             /*
              * Release the buffer_heads of chain slots deeper than p.  When the
              * loop finishes partial == p, so p is what this path returns.
              */
3726         while (partial > p) {
3727                 brelse(partial->bh);
3728                 partial--;
3729         }
3730 no_top:
3731         return partial;
3732 }
3733
3734 /*
3735  * Zero a number of block pointers in either an inode or an indirect block.
3736  * If we restart the transaction we must again get write access to the
3737  * indirect block for further modification.
3738  *
3739  * We release `count' blocks on disk, but (last - first) may be greater
3740  * than `count' because there can be holes in there.
3741  */
3742 static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3743                 struct buffer_head *bh, ext4_fsblk_t block_to_free,
3744                 unsigned long count, __le32 *first, __le32 *last)
3745 {
3746         __le32 *p;
3747         if (try_to_extend_transaction(handle, inode)) {
3748                 if (bh) {
3749                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3750                         ext4_handle_dirty_metadata(handle, inode, bh);
3751                 }
3752                 ext4_mark_inode_dirty(handle, inode);
3753                 ext4_journal_test_restart(handle, inode);
3754                 if (bh) {
3755                         BUFFER_TRACE(bh, "retaking write access");
3756                         ext4_journal_get_write_access(handle, bh);
3757                 }
3758         }
3759
3760         /*
3761          * Any buffers which are on the journal will be in memory. We find
3762          * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
3763          * on them.  We've already detached each block from the file, so
3764          * bforget() in jbd2_journal_forget() should be safe.
3765          *
3766          * AKPM: turn on bforget in jbd2_journal_forget()!!!
3767          */
3768         for (p = first; p < last; p++) {
3769                 u32 nr = le32_to_cpu(*p);
3770                 if (nr) {
3771                         struct buffer_head *tbh;
3772
3773                         *p = 0;
3774                         tbh = sb_find_get_block(inode->i_sb, nr);
3775                         ext4_forget(handle, 0, inode, tbh, nr);
3776                 }
3777         }
3778
3779         ext4_free_blocks(handle, inode, block_to_free, count, 0);
3780 }
3781
3782 /**
3783  * ext4_free_data - free a list of data blocks
3784  * @handle:     handle for this transaction
3785  * @inode:      inode we are dealing with
3786  * @this_bh:    indirect buffer_head which contains *@first and *@last
3787  * @first:      array of block numbers
3788  * @last:       points immediately past the end of array
3789  *
3790  * We are freeing all blocks refered from that array (numbers are stored as
3791  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
3792  *
3793  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
3794  * blocks are contiguous then releasing them at one time will only affect one
3795  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
3796  * actually use a lot of journal space.
3797  *
3798  * @this_bh will be %NULL if @first and @last point into the inode's direct
3799  * block pointers.
3800  */
3801 static void ext4_free_data(handle_t *handle, struct inode *inode,
3802                            struct buffer_head *this_bh,
3803                            __le32 *first, __le32 *last)
3804 {
3805         ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
3806         unsigned long count = 0;            /* Number of blocks in the run */
3807         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
3808                                                corresponding to
3809                                                block_to_free */
3810         ext4_fsblk_t nr;                    /* Current block # */
3811         __le32 *p;                          /* Pointer into inode/ind
3812                                                for current block */
3813         int err;
3814
3815         if (this_bh) {                          /* For indirect block */
3816                 BUFFER_TRACE(this_bh, "get_write_access");
3817                 err = ext4_journal_get_write_access(handle, this_bh);
3818                 /* Important: if we can't update the indirect pointers
3819                  * to the blocks, we can't free them. */
3820                 if (err)
3821                         return;
3822         }
3823
3824         for (p = first; p < last; p++) {
3825                 nr = le32_to_cpu(*p);
3826                 if (nr) {
3827                         /* accumulate blocks to free if they're contiguous */
3828                         if (count == 0) {
3829                                 block_to_free = nr;
3830                                 block_to_free_p = p;
3831                                 count = 1;
3832                         } else if (nr == block_to_free + count) {
3833                                 count++;
3834                         } else {
3835                                 ext4_clear_blocks(handle, inode, this_bh,
3836                                                   block_to_free,
3837                                                   count, block_to_free_p, p);
3838                                 block_to_free = nr;
3839                                 block_to_free_p = p;
3840                                 count = 1;
3841                         }
3842                 }
3843         }
3844
3845         if (count > 0)
3846                 ext4_clear_blocks(handle, inode, this_bh, block_to_free,
3847                                   count, block_to_free_p, p);
3848
3849         if (this_bh) {
3850                 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
3851
3852                 /*
3853                  * The buffer head should have an attached journal head at this
3854                  * point. However, if the data is corrupted and an indirect
3855                  * block pointed to itself, it would have been detached when
3856                  * the block was cleared. Check for this instead of OOPSing.
3857                  */
3858                 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
3859                         ext4_handle_dirty_metadata(handle, inode, this_bh);
3860                 else
3861                         ext4_error(inode->i_sb, __func__,
3862                                    "circular indirect block detected, "
3863                                    "inode=%lu, block=%llu",
3864                                    inode->i_ino,
3865                                    (unsigned long long) this_bh->b_blocknr);
3866         }
3867 }
3868
3869 /**
3870  *      ext4_free_branches - free an array of branches
3871  *      @handle: JBD handle for this transaction
3872  *      @inode: inode we are dealing with
3873  *      @parent_bh: the buffer_head which contains *@first and *@last,
      *                  or NULL when the array does not live in a buffer
      *                  (ext4_truncate() passes a local copy of an inode slot)
3874  *      @first: array of block numbers
3875  *      @last:  pointer immediately past the end of array
3876  *      @depth: depth of the branches to free; 0 means the array holds
      *              data block numbers and is handed to ext4_free_data()
3877  *
3878  *      We are freeing all blocks referred from these branches (numbers are
3879  *      stored as little-endian 32-bit) and updating @inode->i_blocks
3880  *      appropriately.
      *
      *      For @depth > 0 the array is walked from the last slot to the first.
      *      Each non-zero slot is read, its children are freed recursively,
      *      and then the indirect block itself is freed.  The slot is zeroed
      *      here only when @parent_bh is non-NULL and write access to it
      *      succeeds; with a NULL @parent_bh the slot is left as is.
      *
      *      Returns early, without further work, once the handle is aborted.
3881  */
3882 static void ext4_free_branches(handle_t *handle, struct inode *inode,
3883                                struct buffer_head *parent_bh,
3884                                __le32 *first, __le32 *last, int depth)
3885 {
3886         ext4_fsblk_t nr;
3887         __le32 *p;
3888
3889         if (ext4_handle_is_aborted(handle))
3890                 return;
3891
             /* After this test depth is the depth of the children. */
3892         if (depth--) {
3893                 struct buffer_head *bh;
3894                 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
                     /* Right to left: start one past the end and pre-decrement. */
3895                 p = last;
3896                 while (--p >= first) {
3897                         nr = le32_to_cpu(*p);
3898                         if (!nr)
3899                                 continue;               /* A hole */
3900
3901                         /* Go read the buffer for the next level down */
3902                         bh = sb_bread(inode->i_sb, nr);
3903
3904                         /*
3905                          * A read failure? Report the error and skip this
3906                          * slot (should be rare).  Note that the slot itself
                              * is not modified here; the block stays referenced.
3907                          */
3908                         if (!bh) {
3909                                 ext4_error(inode->i_sb, "ext4_free_branches",
3910                                            "Read failure, inode=%lu, block=%llu",
3911                                            inode->i_ino, nr);
3912                                 continue;
3913                         }
3914
3915                         /* This zaps the entire block.  Bottom up. */
3916                         BUFFER_TRACE(bh, "free child branches");
3917                         ext4_free_branches(handle, inode, bh,
3918                                         (__le32 *) bh->b_data,
3919                                         (__le32 *) bh->b_data + addr_per_block,
3920                                         depth);
3921
3922                         /*
3923                          * We've probably journalled the indirect block several
3924                          * times during the truncate.  But it's no longer
3925                          * needed and we now drop it from the transaction via
3926                          * jbd2_journal_revoke().
3927                          *
3928                          * That's easy if it's exclusively part of this
3929                          * transaction.  But if it's part of the committing
3930                          * transaction then jbd2_journal_forget() will simply
3931                          * brelse() it.  That means that if the underlying
3932                          * block is reallocated in ext4_get_block(),
3933                          * unmap_underlying_metadata() will find this block
3934                          * and will try to get rid of it.  damn, damn.
3935                          *
3936                          * If this block has already been committed to the
3937                          * journal, a revoke record will be written.  And
3938                          * revoke records must be emitted *before* clearing
3939                          * this block's bit in the bitmaps.
                              *
                              * NOTE(review): bh is not brelse()d anywhere in this
                              * function, so ext4_forget() presumably consumes the
                              * reference taken by sb_bread() - confirm.
3940                          */
3941                         ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
3942
3943                         /*
3944                          * Everything below this pointer has been
3945                          * released.  Now let this top-of-subtree go.
3946                          *
3947                          * We want the freeing of this indirect block to be
3948                          * atomic in the journal with the updating of the
3949                          * bitmap block which owns it.  So make some room in
3950                          * the journal.
3951                          *
3952                          * We zero the parent pointer *after* freeing its
3953                          * pointee in the bitmaps, so if extend_transaction()
3954                          * for some reason fails to put the bitmap changes and
3955                          * the release into the same transaction, recovery
3956                          * will merely complain about releasing a free block,
3957                          * rather than leaking blocks.
3958                          */
3959                         if (ext4_handle_is_aborted(handle))
3960                                 return;
3961                         if (try_to_extend_transaction(handle, inode)) {
3962                                 ext4_mark_inode_dirty(handle, inode);
3963                                 ext4_journal_test_restart(handle, inode);
3964                         }
3965
                             /* Release the indirect block itself (one block). */
3966                         ext4_free_blocks(handle, inode, nr, 1, 1);
3967
3968                         if (parent_bh) {
3969                                 /*
3970                                  * The block which we have just freed is
3971                                  * pointed to by an indirect block: journal it
3972                                  */
3973                                 BUFFER_TRACE(parent_bh, "get_write_access");
                                     /* Only clear the slot if write access succeeded. */
3974                                 if (!ext4_journal_get_write_access(handle,
3975                                                                    parent_bh)){
3976                                         *p = 0;
3977                                         BUFFER_TRACE(parent_bh,
3978                                         "call ext4_handle_dirty_metadata");
3979                                         ext4_handle_dirty_metadata(handle,
3980                                                                    inode,
3981                                                                    parent_bh);
3982                                 }
3983                         }
3984                 }
3985         } else {
3986                 /* We have reached the bottom of the tree. */
3987                 BUFFER_TRACE(parent_bh, "free data blocks");
3988                 ext4_free_data(handle, inode, parent_bh, first, last);
3989         }
3990 }
3991
3992 int ext4_can_truncate(struct inode *inode)
3993 {
3994         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3995                 return 0;
3996         if (S_ISREG(inode->i_mode))
3997                 return 1;
3998         if (S_ISDIR(inode->i_mode))
3999                 return 1;
4000         if (S_ISLNK(inode->i_mode))
4001                 return !ext4_inode_is_fast_symlink(inode);
4002         return 0;
4003 }
4004
4005 /*
4006  * ext4_truncate()
4007  *
4008  * We block out ext4_get_block() block instantiations across the entire
4009  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4010  * simultaneously on behalf of the same inode.
4011  *
4012  * As we work through the truncate and commit bits of it to the journal there
4013  * is one core, guiding principle: the file's tree must always be consistent on
4014  * disk.  We must be able to restart the truncate after a crash.
4015  *
4016  * The file's tree may be transiently inconsistent in memory (although it
4017  * probably isn't), but whenever we close off and commit a journal transaction,
4018  * the contents of (the filesystem + the journal) must be consistent and
4019  * restartable.  It's pretty simple, really: bottom up, right to left (although
4020  * left-to-right works OK too).
4021  *
4022  * Note that at recovery time, journal replay occurs *before* the restart of
4023  * truncate against the orphan inode list.
4024  *
4025  * The committed inode has the new, desired i_size (which is the same as
4026  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
4027  * that this inode's truncate did not complete and it will again call
4028  * ext4_truncate() to have another go.  So there will be instantiated blocks
4029  * to the right of the truncation point in a crashed ext4 filesystem.  But
4030  * that's fine - as long as they are linked from the inode, the post-crash
4031  * ext4_truncate() run will find them and release them.
      *
      * The function returns nothing: inodes that ext4_can_truncate() rejects
      * and failures to start the transaction are silently ignored, and
      * extent-mapped inodes are handed to ext4_ext_truncate() instead.
4032  */
4033 void ext4_truncate(struct inode *inode)
4034 {
4035         handle_t *handle;
4036         struct ext4_inode_info *ei = EXT4_I(inode);
4037         __le32 *i_data = ei->i_data;
4038         int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4039         struct address_space *mapping = inode->i_mapping;
4040         ext4_lblk_t offsets[4];
4041         Indirect chain[4];
4042         Indirect *partial;
4043         __le32 nr = 0;
4044         int n;
4045         ext4_lblk_t last_block;
4046         unsigned blocksize = inode->i_sb->s_blocksize;
4047
4048         if (!ext4_can_truncate(inode))
4049                 return;
4050
             /* Truncate to zero length: flag the inode unless the mount opts out. */
4051         if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4052                 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
4053
             /* Extent-mapped files have their own truncate path. */
4054         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4055                 ext4_ext_truncate(inode);
4056                 return;
4057         }
4058
4059         handle = start_transaction(inode);
4060         if (IS_ERR(handle))
4061                 return;         /* AKPM: return what? */
4062
             /*
              * i_size rounded up to whole blocks: the index of the first
              * logical block that lies entirely beyond the new size.
              */
4063         last_block = (inode->i_size + blocksize-1)
4064                                         >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4065
             /* New size ends inside a block: deal with that partial block first. */
4066         if (inode->i_size & (blocksize - 1))
4067                 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4068                         goto out_stop;
4069
             /* n is the length of the path to last_block; offsets[] is filled in. */
4070         n = ext4_block_to_path(inode, last_block, offsets, NULL);
4071         if (n == 0)
4072                 goto out_stop;  /* error */
4073
4074         /*
4075          * OK.  This truncate is going to happen.  We add the inode to the
4076          * orphan list, so that if this truncate spans multiple transactions,
4077          * and we crash, we will resume the truncate when the filesystem
4078          * recovers.  It also marks the inode dirty, to catch the new size.
4079          *
4080          * Implication: the file must always be in a sane, consistent
4081          * truncatable state while each transaction commits.
4082          */
4083         if (ext4_orphan_add(handle, inode))
4084                 goto out_stop;
4085
4086         /*
4087          * From here we block out all ext4_get_block() callers who want to
4088          * modify the block allocation tree.
4089          */
4090         down_write(&ei->i_data_sem);
4091
4092         ext4_discard_preallocations(inode);
4093
4094         /*
4095          * The orphan list entry will now protect us from any crash which
4096          * occurs before the truncate completes, so it is now safe to propagate
4097          * the new, shorter inode size (held for now in i_size) into the
4098          * on-disk inode. We do this via i_disksize, which is the value which
4099          * ext4 *really* writes onto the disk inode.
4100          */
4101         ei->i_disksize = inode->i_size;
4102
4103         if (n == 1) {           /* direct blocks */
                     /* Free from the cut-off slot to the end of the direct slots. */
4104                 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4105                                i_data + EXT4_NDIR_BLOCKS);
4106                 goto do_indirects;
4107         }
4108
             /*
              * The cut-off lies inside an indirect tree.  nr receives the root
              * of the branch to free (0 if there is none) and chain[] the
              * partially truncated blocks along the path.
              */
4109         partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4110         /* Kill the top of shared branch (not detached) */
4111         if (nr) {
4112                 if (partial == chain) {
4113                         /* Shared branch grows from the inode */
                             /* Freed via the local copy nr, so clear the real slot by hand. */
4114                         ext4_free_branches(handle, inode, NULL,
4115                                            &nr, &nr+1, (chain+n-1) - partial);
4116                         *partial->p = 0;
4117                         /*
4118                          * We mark the inode dirty prior to restart,
4119                          * and prior to stop.  No need for it here.
4120                          */
4121                 } else {
4122                         /* Shared branch grows from an indirect block */
4123                         BUFFER_TRACE(partial->bh, "get_write_access");
4124                         ext4_free_branches(handle, inode, partial->bh,
4125                                         partial->p,
4126                                         partial->p+1, (chain+n-1) - partial);
4127                 }
4128         }
4129         /* Clear the ends of indirect blocks on the shared branch */
4130         while (partial > chain) {
                     /* Everything to the right of partial->p in this block goes. */
4131                 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4132                                    (__le32*)partial->bh->b_data+addr_per_block,
4133                                    (chain+n-1) - partial);
4134                 BUFFER_TRACE(partial->bh, "call brelse");
4135                 brelse (partial->bh);
4136                 partial--;
4137         }
4138 do_indirects:
4139         /* Kill the remaining (whole) subtrees */
             /*
              * offsets[0] is the inode slot the cut-off falls in.  Every
              * indirect tree rooted to the right of that slot is freed whole,
              * so each case deliberately falls through to the next.  The
              * default case is taken for any slot other than the three
              * indirect ones (presumably a direct-block slot).
              */
4140         switch (offsets[0]) {
4141         default:
4142                 nr = i_data[EXT4_IND_BLOCK];
4143                 if (nr) {
4144                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4145                         i_data[EXT4_IND_BLOCK] = 0;
4146                 }
                     /* fall through */
4147         case EXT4_IND_BLOCK:
4148                 nr = i_data[EXT4_DIND_BLOCK];
4149                 if (nr) {
4150                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4151                         i_data[EXT4_DIND_BLOCK] = 0;
4152                 }
                     /* fall through */
4153         case EXT4_DIND_BLOCK:
4154                 nr = i_data[EXT4_TIND_BLOCK];
4155                 if (nr) {
4156                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4157                         i_data[EXT4_TIND_BLOCK] = 0;
4158                 }
                     /* fall through */
4159         case EXT4_TIND_BLOCK:
4160                 ;
4161         }
4162
4163         up_write(&ei->i_data_sem);
4164         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4165         ext4_mark_inode_dirty(handle, inode);
4166
4167         /*
4168          * In a multi-transaction truncate, we only make the final transaction
4169          * synchronous
4170          */
4171         if (IS_SYNC(inode))
4172                 ext4_handle_sync(handle);
4173 out_stop:
4174         /*
4175          * If this was a simple ftruncate(), and the file will remain alive
4176          * then we need to clear up the orphan record which we created above.
4177          * However, if this was a real unlink then we were called by
4178          * ext4_delete_inode(), and we allow that function to clean up the
4179          * orphan info for us.
              *
              * NOTE(review): this label is also reached from the error paths
              * that run before (or instead of) a successful ext4_orphan_add(),
              * so ext4_orphan_del() presumably copes with an inode that is not
              * on the orphan list - confirm.
4180          */
4181         if (inode->i_nlink)
4182                 ext4_orphan_del(handle, inode);
4183
4184         ext4_journal_stop(handle);
4185 }
4186
4187 /*
4188  * ext4_get_inode_loc returns with an extra refcount against the inode's
4189  * underlying buffer_head on success. If 'in_mem' is true, we have all
4190  * data in memory that is needed to recreate the on-disk version of this
4191  * inode.
4192  */
4193 static int __ext4_get_inode_loc(struct inode *inode,
4194                                 struct ext4_iloc *iloc, int in_mem)
4195 {
4196         struct ext4_group_desc  *gdp;
4197         struct buffer_head      *bh;
4198         struct super_block      *sb = inode->i_sb;
4199         ext4_fsblk_t            block;
4200         int                     inodes_per_block, inode_offset;
4201
4202         iloc->bh = NULL;
4203         if (!ext4_valid_inum(sb, inode->i_ino))
4204                 return -EIO;
4205
4206         iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4207         gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4208         if (!gdp)
4209                 return -EIO;
4210
4211         /*
4212          * Figure out the offset within the block group inode table
4213          */
4214         inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
4215         inode_offset = ((inode->i_ino - 1) %
4216                         EXT4_INODES_PER_GROUP(sb));
4217         block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4218         iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4219
4220         bh = sb_getblk(sb, block);
4221         if (!bh) {
4222                 ext4_error(sb, "ext4_get_inode_loc", "unable to read "
4223                            "inode block - inode=%lu, block=%llu",
4224                            inode->i_ino, block);
4225                 return -EIO;
4226         }
4227         if (!buffer_uptodate(bh)) {
4228                 lock_buffer(bh);
4229
4230                 /*
4231                  * If the buffer has the write error flag, we have failed
4232                  * to write out another inode in the same block.  In this
4233                  * case, we don't have to read the block because we may
4234                  * read the old inode data successfully.
4235                  */
4236                 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4237                         set_buffer_uptodate(bh);
4238
4239                 if (buffer_uptodate(bh)) {
4240                         /* someone brought it uptodate while we waited */
4241                         unlock_buffer(bh);
4242                         goto has_buffer;
4243                 }
4244
4245                 /*
4246                  * If we have all information of the inode in memory and this
4247                  * is the only valid inode in the block, we need not read the
4248                  * block.
4249                  */
4250                 if (in_mem) {
4251                         struct buffer_head *bitmap_bh;
4252                         int i, start;
4253
4254                         start = inode_offset & ~(inodes_per_block - 1);
4255
4256                         /* Is the inode bitmap in cache? */
4257                         bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4258                         if (!bitmap_bh)
4259                                 goto make_io;
4260
4261                         /*
4262                          * If the inode bitmap isn't in cache then the
4263                          * optimisation may end up performing two reads instead
4264                          * of one, so skip it.
4265                          */
4266                         if (!buffer_uptodate(bitmap_bh)) {
4267                                 brelse(bitmap_bh);
4268                                 goto make_io;
4269                         }
4270                         for (i = start; i < start + inodes_per_block; i++) {
4271                                 if (i == inode_offset)
4272                                         continue;
4273                                 if (ext4_test_bit(i, bitmap_bh->b_data))
4274                                         break;
4275                         }
4276                         brelse(bitmap_bh);
4277                         if (i == start + inodes_per_block) {
4278                                 /* all other inodes are free, so skip I/O */
4279                                 memset(bh->b_data, 0, bh->b_size);
4280                                 set_buffer_uptodate(bh);
4281                                 unlock_buffer(bh);
4282                                 goto has_buffer;
4283                         }
4284                 }
4285
4286 make_io:
4287                 /*
4288                  * If we need to do any I/O, try to pre-readahead extra
4289                  * blocks from the inode table.
4290                  */
4291                 if (EXT4_SB(sb)->s_inode_readahead_blks) {
4292                         ext4_fsblk_t b, end, table;
4293                         unsigned num;
4294
4295                         table = ext4_inode_table(sb, gdp);
4296                         /* s_inode_readahead_blks is always a power of 2 */
4297                         b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4298                         if (table > b)
4299                                 b = table;
4300                         end = b + EXT4_SB(sb)->s_inode_readahead_blks;
4301                         num = EXT4_INODES_PER_GROUP(sb);
4302                         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4303                                        EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
4304                                 num -= ext4_itable_unused_count(sb, gdp);
4305                         table += num / inodes_per_block;
4306                         if (end > table)
4307                                 end = table;
4308                         while (b <= end)
4309                                 sb_breadahead(sb, b++);
4310                 }
4311
4312                 /*
4313                  * There are other valid inodes in the buffer, this inode
4314                  * has in-inode xattrs, or we don't have this inode in memory.
4315                  * Read the block from disk.
4316                  */
4317                 get_bh(bh);
4318                 bh->b_end_io = end_buffer_read_sync;
4319                 submit_bh(READ_META, bh);
4320                 wait_on_buffer(bh);
4321                 if (!buffer_uptodate(bh)) {
4322                         ext4_error(sb, __func__,
4323                                    "unable to read inode block - inode=%lu, "
4324                                    "block=%llu", inode->i_ino, block);
4325                         brelse(bh);
4326                         return -EIO;
4327                 }
4328         }
4329 has_buffer:
4330         iloc->bh = bh;
4331         return 0;
4332 }
4333
4334 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4335 {
4336         /* We have all inode data except xattrs in memory here. */
4337         return __ext4_get_inode_loc(inode, iloc,
4338                 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
4339 }
4340
4341 void ext4_set_inode_flags(struct inode *inode)
4342 {
4343         unsigned int flags = EXT4_I(inode)->i_flags;
4344
4345         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4346         if (flags & EXT4_SYNC_FL)
4347                 inode->i_flags |= S_SYNC;
4348         if (flags & EXT4_APPEND_FL)
4349                 inode->i_flags |= S_APPEND;
4350         if (flags & EXT4_IMMUTABLE_FL)
4351                 inode->i_flags |= S_IMMUTABLE;
4352         if (flags & EXT4_NOATIME_FL)
4353                 inode->i_flags |= S_NOATIME;
4354         if (flags & EXT4_DIRSYNC_FL)
4355                 inode->i_flags |= S_DIRSYNC;
4356 }
4357
4358 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4359 void ext4_get_inode_flags(struct ext4_inode_info *ei)
4360 {
4361         unsigned int flags = ei->vfs_inode.i_flags;
4362
4363         ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4364                         EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
4365         if (flags & S_SYNC)
4366                 ei->i_flags |= EXT4_SYNC_FL;
4367         if (flags & S_APPEND)
4368                 ei->i_flags |= EXT4_APPEND_FL;
4369         if (flags & S_IMMUTABLE)
4370                 ei->i_flags |= EXT4_IMMUTABLE_FL;
4371         if (flags & S_NOATIME)
4372                 ei->i_flags |= EXT4_NOATIME_FL;
4373         if (flags & S_DIRSYNC)
4374                 ei->i_flags |= EXT4_DIRSYNC_FL;
4375 }
4376 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4377                                         struct ext4_inode_info *ei)
4378 {
4379         blkcnt_t i_blocks ;
4380         struct inode *inode = &(ei->vfs_inode);
4381         struct super_block *sb = inode->i_sb;
4382
4383         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4384                                 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4385                 /* we are using combined 48 bit field */
4386                 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4387                                         le32_to_cpu(raw_inode->i_blocks_lo);
4388                 if (ei->i_flags & EXT4_HUGE_FILE_FL) {
4389                         /* i_blocks represent file system block size */
4390                         return i_blocks  << (inode->i_blkbits - 9);
4391                 } else {
4392                         return i_blocks;
4393                 }
4394         } else {
4395                 return le32_to_cpu(raw_inode->i_blocks_lo);
4396         }
4397 }
4398
4399 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4400 {
4401         struct ext4_iloc iloc;
4402         struct ext4_inode *raw_inode;
4403         struct ext4_inode_info *ei;
4404         struct buffer_head *bh;
4405         struct inode *inode;
4406         long ret;
4407         int block;
4408
4409         inode = iget_locked(sb, ino);
4410         if (!inode)
4411                 return ERR_PTR(-ENOMEM);
4412         if (!(inode->i_state & I_NEW))
4413                 return inode;
4414
4415         ei = EXT4_I(inode);
4416 #ifdef CONFIG_EXT4_FS_POSIX_ACL
4417         ei->i_acl = EXT4_ACL_NOT_CACHED;
4418         ei->i_default_acl = EXT4_ACL_NOT_CACHED;
4419 #endif
4420
4421         ret = __ext4_get_inode_loc(inode, &iloc, 0);
4422         if (ret < 0)
4423                 goto bad_inode;
4424         bh = iloc.bh;
4425         raw_inode = ext4_raw_inode(&iloc);
4426         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4427         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4428         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4429         if (!(test_opt(inode->i_sb, NO_UID32))) {
4430                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4431                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4432         }
4433         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4434
4435         ei->i_state = 0;
4436         ei->i_dir_start_lookup = 0;
4437         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4438         /* We now have enough fields to check if the inode was active or not.
4439          * This is needed because nfsd might try to access dead inodes
4440          * the test is that same one that e2fsck uses
4441          * NeilBrown 1999oct15
4442          */
4443         if (inode->i_nlink == 0) {
4444                 if (inode->i_mode == 0 ||
4445                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4446                         /* this inode is deleted */
4447                         brelse(bh);
4448                         ret = -ESTALE;
4449                         goto bad_inode;
4450                 }
4451                 /* The only unlinked inodes we let through here have
4452                  * valid i_mode and are being read by the orphan
4453                  * recovery code: that's fine, we're about to complete
4454                  * the process of deleting those. */
4455         }
4456         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4457         inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4458         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4459         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4460                 ei->i_file_acl |=
4461                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4462         inode->i_size = ext4_isize(raw_inode);
4463         ei->i_disksize = inode->i_size;
4464         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4465         ei->i_block_group = iloc.block_group;
4466         ei->i_last_alloc_group = ~0;
4467         /*
4468          * NOTE! The in-memory inode i_data array is in little-endian order
4469          * even on big-endian machines: we do NOT byteswap the block numbers!
4470          */
4471         for (block = 0; block < EXT4_N_BLOCKS; block++)
4472                 ei->i_data[block] = raw_inode->i_block[block];
4473         INIT_LIST_HEAD(&ei->i_orphan);
4474
4475         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4476                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4477                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4478                     EXT4_INODE_SIZE(inode->i_sb)) {
4479                         brelse(bh);
4480                         ret = -EIO;
4481                         goto bad_inode;
4482                 }
4483                 if (ei->i_extra_isize == 0) {
4484                         /* The extra space is currently unused. Use it. */
4485                         ei->i_extra_isize = sizeof(struct ext4_inode) -
4486                                             EXT4_GOOD_OLD_INODE_SIZE;
4487                 } else {
4488                         __le32 *magic = (void *)raw_inode +
4489                                         EXT4_GOOD_OLD_INODE_SIZE +
4490                                         ei->i_extra_isize;
4491                         if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4492                                  ei->i_state |= EXT4_STATE_XATTR;
4493                 }
4494         } else
4495                 ei->i_extra_isize = 0;
4496
4497         EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4498         EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4499         EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4500         EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4501
4502         inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4503         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4504                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4505                         inode->i_version |=
4506                         (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4507         }
4508
4509         ret = 0;
4510         if (ei->i_file_acl &&
4511             ((ei->i_file_acl <
4512               (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
4513                EXT4_SB(sb)->s_gdb_count)) ||
4514              (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4515                 ext4_error(sb, __func__,
4516                            "bad extended attribute block %llu in inode #%lu",
4517                            ei->i_file_acl, inode->i_ino);
4518                 ret = -EIO;
4519                 goto bad_inode;
4520         } else if (ei->i_flags & EXT4_EXTENTS_FL) {
4521                 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4522                     (S_ISLNK(inode->i_mode) &&
4523                      !ext4_inode_is_fast_symlink(inode)))
4524                         /* Validate extent which is part of inode */
4525                         ret = ext4_ext_check_inode(inode);
4526         } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4527                    (S_ISLNK(inode->i_mode) &&
4528                     !ext4_inode_is_fast_symlink(inode))) {
4529                 /* Validate block references which are part of inode */
4530                 ret = ext4_check_inode_blockref(inode);
4531         }
4532         if (ret) {
4533                 brelse(bh);
4534                 goto bad_inode;
4535         }
4536
4537         if (S_ISREG(inode->i_mode)) {
4538                 inode->i_op = &ext4_file_inode_operations;
4539                 inode->i_fop = &ext4_file_operations;
4540                 ext4_set_aops(inode);
4541         } else if (S_ISDIR(inode->i_mode)) {
4542                 inode->i_op = &ext4_dir_inode_operations;
4543                 inode->i_fop = &ext4_dir_operations;
4544         } else if (S_ISLNK(inode->i_mode)) {
4545                 if (ext4_inode_is_fast_symlink(inode)) {
4546                         inode->i_op = &ext4_fast_symlink_inode_operations;
4547                         nd_terminate_link(ei->i_data, inode->i_size,
4548                                 sizeof(ei->i_data) - 1);
4549                 } else {
4550                         inode->i_op = &ext4_symlink_inode_operations;
4551                         ext4_set_aops(inode);
4552                 }
4553         } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4554               S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4555                 inode->i_op = &ext4_special_inode_operations;
4556                 if (raw_inode->i_block[0])
4557                         init_special_inode(inode, inode->i_mode,
4558                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
4559                 else
4560                         init_special_inode(inode, inode->i_mode,
4561                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4562         } else {
4563                 brelse(bh);
4564                 ret = -EIO;
4565                 ext4_error(inode->i_sb, __func__,
4566                            "bogus i_mode (%o) for inode=%lu",
4567                            inode->i_mode, inode->i_ino);
4568                 goto bad_inode;
4569         }
4570         brelse(iloc.bh);
4571         ext4_set_inode_flags(inode);
4572         unlock_new_inode(inode);
4573         return inode;
4574
4575 bad_inode:
4576         iget_failed(inode);
4577         return ERR_PTR(ret);
4578 }
4579
4580 static int ext4_inode_blocks_set(handle_t *handle,
4581                                 struct ext4_inode *raw_inode,
4582                                 struct ext4_inode_info *ei)
4583 {
4584         struct inode *inode = &(ei->vfs_inode);
4585         u64 i_blocks = inode->i_blocks;
4586         struct super_block *sb = inode->i_sb;
4587
4588         if (i_blocks <= ~0U) {
4589                 /*
4590                  * i_blocks can be represnted in a 32 bit variable
4591                  * as multiple of 512 bytes
4592                  */
4593                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4594                 raw_inode->i_blocks_high = 0;
4595                 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
4596                 return 0;
4597         }
4598         if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
4599                 return -EFBIG;
4600
4601         if (i_blocks <= 0xffffffffffffULL) {
4602                 /*
4603                  * i_blocks can be represented in a 48 bit variable
4604                  * as multiple of 512 bytes
4605                  */
4606                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4607                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4608                 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
4609         } else {
4610                 ei->i_flags |= EXT4_HUGE_FILE_FL;
4611                 /* i_block is stored in file system block size */
4612                 i_blocks = i_blocks >> (inode->i_blkbits - 9);
4613                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4614                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4615         }
4616         return 0;
4617 }
4618
4619 /*
4620  * Post the struct inode info into an on-disk inode location in the
4621  * buffer-cache.  This gobbles the caller's reference to the
4622  * buffer_head in the inode location struct.
4623  *
4624  * The caller must have write access to iloc->bh.
4625  */
4626 static int ext4_do_update_inode(handle_t *handle,
4627                                 struct inode *inode,
4628                                 struct ext4_iloc *iloc)
4629 {
4630         struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4631         struct ext4_inode_info *ei = EXT4_I(inode);
4632         struct buffer_head *bh = iloc->bh;
4633         int err = 0, rc, block;
4634
4635         /* For fields not not tracking in the in-memory inode,
4636          * initialise them to zero for new inodes. */
4637         if (ei->i_state & EXT4_STATE_NEW)
4638                 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
4639
4640         ext4_get_inode_flags(ei);
4641         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4642         if (!(test_opt(inode->i_sb, NO_UID32))) {
4643                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
4644                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
4645 /*
4646  * Fix up interoperability with old kernels. Otherwise, old inodes get
4647  * re-used with the upper 16 bits of the uid/gid intact
4648  */
4649                 if (!ei->i_dtime) {
4650                         raw_inode->i_uid_high =
4651                                 cpu_to_le16(high_16_bits(inode->i_uid));
4652                         raw_inode->i_gid_high =
4653                                 cpu_to_le16(high_16_bits(inode->i_gid));
4654                 } else {
4655                         raw_inode->i_uid_high = 0;
4656                         raw_inode->i_gid_high = 0;
4657                 }
4658         } else {
4659                 raw_inode->i_uid_low =
4660                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
4661                 raw_inode->i_gid_low =
4662                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
4663                 raw_inode->i_uid_high = 0;
4664                 raw_inode->i_gid_high = 0;
4665         }
4666         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
4667
4668         EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4669         EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4670         EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4671         EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
4672
4673         if (ext4_inode_blocks_set(handle, raw_inode, ei))
4674                 goto out_brelse;
4675         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4676         /* clear the migrate flag in the raw_inode */
4677         raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
4678         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4679             cpu_to_le32(EXT4_OS_HURD))
4680                 raw_inode->i_file_acl_high =
4681                         cpu_to_le16(ei->i_file_acl >> 32);
4682         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
4683         ext4_isize_set(raw_inode, ei->i_disksize);
4684         if (ei->i_disksize > 0x7fffffffULL) {
4685                 struct super_block *sb = inode->i_sb;
4686                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
4687                                 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
4688                                 EXT4_SB(sb)->s_es->s_rev_level ==
4689                                 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
4690                         /* If this is the first large file
4691                          * created, add a flag to the superblock.
4692                          */
4693                         err = ext4_journal_get_write_access(handle,
4694                                         EXT4_SB(sb)->s_sbh);
4695                         if (err)
4696                                 goto out_brelse;
4697                         ext4_update_dynamic_rev(sb);
4698                         EXT4_SET_RO_COMPAT_FEATURE(sb,
4699                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4700                         sb->s_dirt = 1;
4701                         ext4_handle_sync(handle);
4702                         err = ext4_handle_dirty_metadata(handle, inode,
4703                                         EXT4_SB(sb)->s_sbh);
4704                 }
4705         }
4706         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
4707         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
4708                 if (old_valid_dev(inode->i_rdev)) {
4709                         raw_inode->i_block[0] =
4710                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
4711                         raw_inode->i_block[1] = 0;
4712                 } else {
4713                         raw_inode->i_block[0] = 0;
4714                         raw_inode->i_block[1] =
4715                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
4716                         raw_inode->i_block[2] = 0;
4717                 }
4718         } else for (block = 0; block < EXT4_N_BLOCKS; block++)
4719                 raw_inode->i_block[block] = ei->i_data[block];
4720
4721         raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4722         if (ei->i_extra_isize) {
4723                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4724                         raw_inode->i_version_hi =
4725                         cpu_to_le32(inode->i_version >> 32);
4726                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4727         }
4728
4729         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4730         rc = ext4_handle_dirty_metadata(handle, inode, bh);
4731         if (!err)
4732                 err = rc;
4733         ei->i_state &= ~EXT4_STATE_NEW;
4734
4735 out_brelse:
4736         brelse(bh);
4737         ext4_std_error(inode->i_sb, err);
4738         return err;
4739 }
4740
4741 /*
4742  * ext4_write_inode()
4743  *
4744  * We are called from a few places:
4745  *
4746  * - Within generic_file_write() for O_SYNC files.
4747  *   Here, there will be no transaction running. We wait for any running
4748  *   trasnaction to commit.
4749  *
4750  * - Within sys_sync(), kupdate and such.
4751  *   We wait on commit, if tol to.
4752  *
4753  * - Within prune_icache() (PF_MEMALLOC == true)
4754  *   Here we simply return.  We can't afford to block kswapd on the
4755  *   journal commit.
4756  *
4757  * In all cases it is actually safe for us to return without doing anything,
4758  * because the inode has been copied into a raw inode buffer in
4759  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
4760  * knfsd.
4761  *
4762  * Note that we are absolutely dependent upon all inode dirtiers doing the
4763  * right thing: they *must* call mark_inode_dirty() after dirtying info in
4764  * which we are interested.
4765  *
4766  * It would be a bug for them to not do this.  The code:
4767  *
4768  *      mark_inode_dirty(inode)
4769  *      stuff();
4770  *      inode->i_size = expr;
4771  *
4772  * is in error because a kswapd-driven write_inode() could occur while
4773  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
4774  * will no longer be on the superblock's dirty inode list.
4775  */
4776 int ext4_write_inode(struct inode *inode, int wait)
4777 {
4778         if (current->flags & PF_MEMALLOC)
4779                 return 0;
4780
4781         if (ext4_journal_current_handle()) {
4782                 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4783                 dump_stack();
4784                 return -EIO;
4785         }
4786
4787         if (!wait)
4788                 return 0;
4789
4790         return ext4_force_commit(inode->i_sb);
4791 }
4792
4793 int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4794 {
4795         int err = 0;
4796
4797         mark_buffer_dirty(bh);
4798         if (inode && inode_needs_sync(inode)) {
4799                 sync_dirty_buffer(bh);
4800                 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4801                         ext4_error(inode->i_sb, __func__,
4802                                    "IO error syncing inode, "
4803                                    "inode=%lu, block=%llu",
4804                                    inode->i_ino,
4805                                    (unsigned long long)bh->b_blocknr);
4806                         err = -EIO;
4807                 }
4808         }
4809         return err;
4810 }
4811
4812 /*
4813  * ext4_setattr()
4814  *
4815  * Called from notify_change.
4816  *
4817  * We want to trap VFS attempts to truncate the file as soon as
4818  * possible.  In particular, we want to make sure that when the VFS
4819  * shrinks i_size, we put the inode on the orphan list and modify
4820  * i_disksize immediately, so that during the subsequent flushing of
4821  * dirty pages and freeing of disk blocks, we can guarantee that any
4822  * commit will leave the blocks being flushed in an unused state on
4823  * disk.  (On recovery, the inode will get truncated and the blocks will
4824  * be freed, so we have a strong guarantee that no future commit will
4825  * leave these blocks visible to the user.)
4826  *
4827  * Another thing we have to assure is that if we are in ordered mode
4828  * and inode is still attached to the committing transaction, we must
4829  * we start writeout of all the dirty pages which are being truncated.
4830  * This way we are sure that all the data written in the previous
4831  * transaction are already on disk (truncate waits for pages under
4832  * writeback).
4833  *
4834  * Called with inode->i_mutex down.
4835  */
4836 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4837 {
4838         struct inode *inode = dentry->d_inode;
4839         int error, rc = 0;
4840         const unsigned int ia_valid = attr->ia_valid;
4841
4842         error = inode_change_ok(inode, attr);
4843         if (error)
4844                 return error;
4845
4846         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
4847                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
4848                 handle_t *handle;
4849
4850                 /* (user+group)*(old+new) structure, inode write (sb,
4851                  * inode block, ? - but truncate inode update has it) */
4852                 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
4853                                         EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
4854                 if (IS_ERR(handle)) {
4855                         error = PTR_ERR(handle);
4856                         goto err_out;
4857                 }
4858                 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
4859                 if (error) {
4860                         ext4_journal_stop(handle);
4861                         return error;
4862                 }
4863                 /* Update corresponding info in inode so that everything is in
4864                  * one transaction */
4865                 if (attr->ia_valid & ATTR_UID)
4866                         inode->i_uid = attr->ia_uid;
4867                 if (attr->ia_valid & ATTR_GID)
4868                         inode->i_gid = attr->ia_gid;
4869                 error = ext4_mark_inode_dirty(handle, inode);
4870                 ext4_journal_stop(handle);
4871         }
4872
4873         if (attr->ia_valid & ATTR_SIZE) {
4874                 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
4875                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4876
4877                         if (attr->ia_size > sbi->s_bitmap_maxbytes) {
4878                                 error = -EFBIG;
4879                                 goto err_out;
4880                         }
4881                 }
4882         }
4883
4884         if (S_ISREG(inode->i_mode) &&
4885             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
4886                 handle_t *handle;
4887
4888                 handle = ext4_journal_start(inode, 3);
4889                 if (IS_ERR(handle)) {
4890                         error = PTR_ERR(handle);
4891                         goto err_out;
4892                 }
4893
4894                 error = ext4_orphan_add(handle, inode);
4895                 EXT4_I(inode)->i_disksize = attr->ia_size;
4896                 rc = ext4_mark_inode_dirty(handle, inode);
4897                 if (!error)
4898                         error = rc;
4899                 ext4_journal_stop(handle);
4900
4901                 if (ext4_should_order_data(inode)) {
4902                         error = ext4_begin_ordered_truncate(inode,
4903                                                             attr->ia_size);
4904                         if (error) {
4905                                 /* Do as much error cleanup as possible */
4906                                 handle = ext4_journal_start(inode, 3);
4907                                 if (IS_ERR(handle)) {
4908                                         ext4_orphan_del(NULL, inode);
4909                                         goto err_out;
4910                                 }
4911                                 ext4_orphan_del(handle, inode);
4912                                 ext4_journal_stop(handle);
4913                                 goto err_out;
4914                         }
4915                 }
4916         }
4917
4918         rc = inode_setattr(inode, attr);
4919
4920         /* If inode_setattr's call to ext4_truncate failed to get a
4921          * transaction handle at all, we need to clean up the in-core
4922          * orphan list manually. */
4923         if (inode->i_nlink)
4924                 ext4_orphan_del(NULL, inode);
4925
4926         if (!rc && (ia_valid & ATTR_MODE))
4927                 rc = ext4_acl_chmod(inode);
4928
4929 err_out:
4930         ext4_std_error(inode->i_sb, error);
4931         if (!error)
4932                 error = rc;
4933         return error;
4934 }
4935
4936 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4937                  struct kstat *stat)
4938 {
4939         struct inode *inode;
4940         unsigned long delalloc_blocks;
4941
4942         inode = dentry->d_inode;
4943         generic_fillattr(inode, stat);
4944
4945         /*
4946          * We can't update i_blocks if the block allocation is delayed
4947          * otherwise in the case of system crash before the real block
4948          * allocation is done, we will have i_blocks inconsistent with
4949          * on-disk file blocks.
4950          * We always keep i_blocks updated together with real
4951          * allocation. But to not confuse with user, stat
4952          * will return the blocks that include the delayed allocation
4953          * blocks for this file.
4954          */
4955         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4956         delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4957         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4958
4959         stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4960         return 0;
4961 }
4962
4963 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4964                                       int chunk)
4965 {
4966         int indirects;
4967
4968         /* if nrblocks are contiguous */
4969         if (chunk) {
4970                 /*
4971                  * With N contiguous data blocks, it need at most
4972                  * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4973                  * 2 dindirect blocks
4974                  * 1 tindirect block
4975                  */
4976                 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4977                 return indirects + 3;
4978         }
4979         /*
4980          * if nrblocks are not contiguous, worse case, each block touch
4981          * a indirect block, and each indirect block touch a double indirect
4982          * block, plus a triple indirect block
4983          */
4984         indirects = nrblocks * 2 + 1;
4985         return indirects;
4986 }
4987
4988 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4989 {
4990         if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4991                 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
4992         return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
4993 }
4994
4995 /*
4996  * Account for index blocks, block groups bitmaps and block group
4997  * descriptor blocks if modify datablocks and index blocks
4998  * worse case, the indexs blocks spread over different block groups
4999  *
5000  * If datablocks are discontiguous, they are possible to spread over
5001  * different block groups too. If they are contiugous, with flexbg,
5002  * they could still across block group boundary.
5003  *
5004  * Also account for superblock, inode, quota and xattr blocks
5005  */
5006 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5007 {
5008         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5009         int gdpblocks;
5010         int idxblocks;
5011         int ret = 0;
5012
5013         /*
5014          * How many index blocks need to touch to modify nrblocks?
5015          * The "Chunk" flag indicating whether the nrblocks is
5016          * physically contiguous on disk
5017          *
5018          * For Direct IO and fallocate, they calls get_block to allocate
5019          * one single extent at a time, so they could set the "Chunk" flag
5020          */
5021         idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
5022
5023         ret = idxblocks;
5024
5025         /*
5026          * Now let's see how many group bitmaps and group descriptors need
5027          * to account
5028          */
5029         groups = idxblocks;
5030         if (chunk)
5031                 groups += 1;
5032         else
5033                 groups += nrblocks;
5034
5035         gdpblocks = groups;
5036         if (groups > ngroups)
5037                 groups = ngroups;
5038         if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5039                 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5040
5041         /* bitmaps and block group descriptor blocks */
5042         ret += groups + gdpblocks;
5043
5044         /* Blocks for super block, inode, quota and xattr blocks */
5045         ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5046
5047         return ret;
5048 }
5049
/*
 * Calculate the total number of credits to reserve so that the
 * modification of a single page fits into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case: one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int bpp = ext4_journal_blocks_per_page(inode);
        /* metadata credits for bpp non-contiguous blocks */
        int credits = ext4_meta_trans_blocks(inode, bpp, 0);

        /* Account for data blocks for journalled mode */
        if (ext4_should_journal_data(inode))
                credits += bpp;

        return credits;
}
5072
/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever is calling
 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* the blocks form one contiguous chunk */
        const int contiguous = 1;

        return ext4_meta_trans_blocks(inode, nrblocks, contiguous);
}
5086
5087 /*
5088  * The caller must have previously called ext4_reserve_inode_write().
5089  * Give this, we know that the caller already has write access to iloc->bh.
5090  */
5091 int ext4_mark_iloc_dirty(handle_t *handle,
5092                 struct inode *inode, struct ext4_iloc *iloc)
5093 {
5094         int err = 0;
5095
5096         if (test_opt(inode->i_sb, I_VERSION))
5097                 inode_inc_iversion(inode);
5098
5099         /* the do_update_inode consumes one bh->b_count */
5100         get_bh(iloc->bh);
5101
5102         /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5103         err = ext4_do_update_inode(handle, inode, iloc);
5104         put_bh(iloc->bh);
5105         return err;
5106 }
5107
5108 /*
5109  * On success, We end up with an outstanding reference count against
5110  * iloc->bh.  This _must_ be cleaned up later.
5111  */
5112
5113 int
5114 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5115                          struct ext4_iloc *iloc)
5116 {
5117         int err;
5118
5119         err = ext4_get_inode_loc(inode, iloc);
5120         if (!err) {
5121                 BUFFER_TRACE(iloc->bh, "get_write_access");
5122                 err = ext4_journal_get_write_access(handle, iloc->bh);
5123                 if (err) {
5124                         brelse(iloc->bh);
5125                         iloc->bh = NULL;
5126                 }
5127         }
5128         ext4_std_error(inode->i_sb, err);
5129         return err;
5130 }
5131
5132 /*
5133  * Expand an inode by new_extra_isize bytes.
5134  * Returns 0 on success or negative error number on failure.
5135  */
5136 static int ext4_expand_extra_isize(struct inode *inode,
5137                                    unsigned int new_extra_isize,
5138                                    struct ext4_iloc iloc,
5139                                    handle_t *handle)
5140 {
5141         struct ext4_inode *raw_inode;
5142         struct ext4_xattr_ibody_header *header;
5143         struct ext4_xattr_entry *entry;
5144
5145         if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5146                 return 0;
5147
5148         raw_inode = ext4_raw_inode(&iloc);
5149
5150         header = IHDR(inode, raw_inode);
5151         entry = IFIRST(header);
5152
5153         /* No extended attributes present */
5154         if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
5155                 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5156                 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5157                         new_extra_isize);
5158                 EXT4_I(inode)->i_extra_isize = new_extra_isize;
5159                 return 0;
5160         }
5161
5162         /* try to expand with EAs present */
5163         return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5164                                           raw_inode, handle);
5165 }
5166
5167 /*
5168  * What we do here is to mark the in-core inode as clean with respect to inode
5169  * dirtiness (it may still be data-dirty).
5170  * This means that the in-core inode may be reaped by prune_icache
5171  * without having to perform any I/O.  This is a very good thing,
5172  * because *any* task may call prune_icache - even ones which
5173  * have a transaction open against a different journal.
5174  *
5175  * Is this cheating?  Not really.  Sure, we haven't written the
5176  * inode out, but prune_icache isn't a user-visible syncing function.
5177  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5178  * we start and wait on commits.
5179  *
5180  * Is this efficient/effective?  Well, we're being nice to the system
5181  * by cleaning up our inodes proactively so they can be reaped
5182  * without I/O.  But we are potentially leaving up to five seconds'
5183  * worth of inodes floating about which prune_icache wants us to
5184  * write out.  One way to fix that would be to get prune_icache()
5185  * to do a write_super() to free up some memory.  It has the desired
5186  * effect.
5187  */
5188 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5189 {
5190         struct ext4_iloc iloc;
5191         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5192         static unsigned int mnt_count;
5193         int err, ret;
5194
5195         might_sleep();
5196         err = ext4_reserve_inode_write(handle, inode, &iloc);
5197         if (ext4_handle_valid(handle) &&
5198             EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5199             !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
5200                 /*
5201                  * We need extra buffer credits since we may write into EA block
5202                  * with this same handle. If journal_extend fails, then it will
5203                  * only result in a minor loss of functionality for that inode.
5204                  * If this is felt to be critical, then e2fsck should be run to
5205                  * force a large enough s_min_extra_isize.
5206                  */
5207                 if ((jbd2_journal_extend(handle,
5208                              EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5209                         ret = ext4_expand_extra_isize(inode,
5210                                                       sbi->s_want_extra_isize,
5211                                                       iloc, handle);
5212                         if (ret) {
5213                                 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
5214                                 if (mnt_count !=
5215                                         le16_to_cpu(sbi->s_es->s_mnt_count)) {
5216                                         ext4_warning(inode->i_sb, __func__,
5217                                         "Unable to expand inode %lu. Delete"
5218                                         " some EAs or run e2fsck.",
5219                                         inode->i_ino);
5220                                         mnt_count =
5221                                           le16_to_cpu(sbi->s_es->s_mnt_count);
5222                                 }
5223                         }
5224                 }
5225         }
5226         if (!err)
5227                 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5228         return err;
5229 }
5230
5231 /*
5232  * ext4_dirty_inode() is called from __mark_inode_dirty()
5233  *
5234  * We're really interested in the case where a file is being extended.
5235  * i_size has been changed by generic_commit_write() and we thus need
5236  * to include the updated inode in the current transaction.
5237  *
5238  * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
5239  * are allocated to the file.
5240  *
5241  * If the inode is marked synchronous, we don't honour that here - doing
5242  * so would cause a commit on atime updates, which we don't bother doing.
5243  * We handle synchronous inodes at the highest possible level.
5244  */
5245 void ext4_dirty_inode(struct inode *inode)
5246 {
5247         handle_t *current_handle = ext4_journal_current_handle();
5248         handle_t *handle;
5249
5250         if (!ext4_handle_valid(current_handle)) {
5251                 ext4_mark_inode_dirty(current_handle, inode);
5252                 return;
5253         }
5254
5255         handle = ext4_journal_start(inode, 2);
5256         if (IS_ERR(handle))
5257                 goto out;
5258         if (current_handle &&
5259                 current_handle->h_transaction != handle->h_transaction) {
5260                 /* This task has a transaction open against a different fs */
5261                 printk(KERN_EMERG "%s: transactions do not match!\n",
5262                        __func__);
5263         } else {
5264                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
5265                                 current_handle);
5266                 ext4_mark_inode_dirty(handle, inode);
5267         }
5268         ext4_journal_stop(handle);
5269 out:
5270         return;
5271 }
5272
#if 0
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext4_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * A NULL handle makes this a no-op that returns 0.
 */
static int ext4_pin_inode(handle_t *handle, struct inode *inode)
{
        struct ext4_iloc iloc;
        int err = 0;

        if (!handle)
                goto out;

        err = ext4_get_inode_loc(inode, &iloc);
        if (err)
                goto out;

        BUFFER_TRACE(iloc.bh, "get_write_access");
        err = jbd2_journal_get_write_access(handle, iloc.bh);
        if (!err)
                err = ext4_handle_dirty_metadata(handle,
                                                 inode,
                                                 iloc.bh);
        /* Drop the reference taken by the lookup, success or not */
        brelse(iloc.bh);
out:
        ext4_std_error(inode->i_sb, err);
        return err;
}
#endif
5302
5303 int ext4_change_inode_journal_flag(struct inode *inode, int val)
5304 {
5305         journal_t *journal;
5306         handle_t *handle;
5307         int err;
5308
5309         /*
5310          * We have to be very careful here: changing a data block's
5311          * journaling status dynamically is dangerous.  If we write a
5312          * data block to the journal, change the status and then delete
5313          * that block, we risk forgetting to revoke the old log record
5314          * from the journal and so a subsequent replay can corrupt data.
5315          * So, first we make sure that the journal is empty and that
5316          * nobody is changing anything.
5317          */
5318
5319         journal = EXT4_JOURNAL(inode);
5320         if (!journal)
5321                 return 0;
5322         if (is_journal_aborted(journal))
5323                 return -EROFS;
5324
5325         jbd2_journal_lock_updates(journal);
5326         jbd2_journal_flush(journal);
5327
5328         /*
5329          * OK, there are no updates running now, and all cached data is
5330          * synced to disk.  We are now in a completely consistent state
5331          * which doesn't have anything in the journal, and we know that
5332          * no filesystem updates are running, so it is safe to modify
5333          * the inode's in-core data-journaling state flag now.
5334          */
5335
5336         if (val)
5337                 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
5338         else
5339                 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
5340         ext4_set_aops(inode);
5341
5342         jbd2_journal_unlock_updates(journal);
5343
5344         /* Finally we can mark the inode as dirty. */
5345
5346         handle = ext4_journal_start(inode, 1);
5347         if (IS_ERR(handle))
5348                 return PTR_ERR(handle);
5349
5350         err = ext4_mark_inode_dirty(handle, inode);
5351         ext4_handle_sync(handle);
5352         ext4_journal_stop(handle);
5353         ext4_std_error(inode->i_sb, err);
5354
5355         return err;
5356 }
5357
5358 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5359 {
5360         return !buffer_mapped(bh);
5361 }
5362
5363 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5364 {
5365         struct page *page = vmf->page;
5366         loff_t size;
5367         unsigned long len;
5368         int ret = -EINVAL;
5369         void *fsdata;
5370         struct file *file = vma->vm_file;
5371         struct inode *inode = file->f_path.dentry->d_inode;
5372         struct address_space *mapping = inode->i_mapping;
5373
5374         /*
5375          * Get i_alloc_sem to stop truncates messing with the inode. We cannot
5376          * get i_mutex because we are already holding mmap_sem.
5377          */
5378         down_read(&inode->i_alloc_sem);
5379         size = i_size_read(inode);
5380         if (page->mapping != mapping || size <= page_offset(page)
5381             || !PageUptodate(page)) {
5382                 /* page got truncated from under us? */
5383                 goto out_unlock;
5384         }
5385         ret = 0;
5386         if (PageMappedToDisk(page))
5387                 goto out_unlock;
5388
5389         if (page->index == size >> PAGE_CACHE_SHIFT)
5390                 len = size & ~PAGE_CACHE_MASK;
5391         else
5392                 len = PAGE_CACHE_SIZE;
5393
5394         if (page_has_buffers(page)) {
5395                 /* return if we have all the buffers mapped */
5396                 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5397                                        ext4_bh_unmapped))
5398                         goto out_unlock;
5399         }
5400         /*
5401          * OK, we need to fill the hole... Do write_begin write_end
5402          * to do block allocation/reservation.We are not holding
5403          * inode.i__mutex here. That allow * parallel write_begin,
5404          * write_end call. lock_page prevent this from happening
5405          * on the same page though
5406          */
5407         ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
5408                         len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
5409         if (ret < 0)
5410                 goto out_unlock;
5411         ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
5412                         len, len, page, fsdata);
5413         if (ret < 0)
5414                 goto out_unlock;
5415         ret = 0;
5416 out_unlock:
5417         if (ret)
5418                 ret = VM_FAULT_SIGBUS;
5419         up_read(&inode->i_alloc_sem);
5420         return ret;
5421 }