/*
 *  linux/fs/ext3/inode.c
 *
 *  Copyright (C) 1992, 1993, 1994, 1995
 *  Remy Card (card@masi.ibp.fr)
 *  Laboratoire MASI - Institut Blaise Pascal
 *  Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *	David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/ext3_jbd.h>
#include <linux/jbd.h>
#include <linux/smp_lock.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/uio.h>
42 static int ext3_writepage_trans_blocks(struct inode *inode);
45 * Test whether an inode is a fast symlink.
47 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
49 int ea_blocks = EXT3_I(inode)->i_file_acl ?
50 (inode->i_sb->s_blocksize >> 9) : 0;
52 return (S_ISLNK(inode->i_mode) &&
53 inode->i_blocks - ea_blocks == 0);
56 /* The ext3 forget function must perform a revoke if we are freeing data
57 * which has been journaled. Metadata (eg. indirect blocks) must be
58 * revoked in all cases.
60 * "bh" may be NULL: a metadata block may have been freed from memory
61 * but there may still be a record of it in the journal, and that record
62 * still needs to be revoked.
65 int ext3_forget(handle_t *handle, int is_metadata,
66 struct inode *inode, struct buffer_head *bh,
73 BUFFER_TRACE(bh, "enter");
75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
77 bh, is_metadata, inode->i_mode,
78 test_opt(inode->i_sb, DATA_FLAGS));
80 /* Never use the revoke function if we are doing full data
81 * journaling: there is no need to, and a V1 superblock won't
82 * support it. Otherwise, only skip the revoke on un-journaled
85 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
86 (!is_metadata && !ext3_should_journal_data(inode))) {
88 BUFFER_TRACE(bh, "call journal_forget");
89 return ext3_journal_forget(handle, bh);
95 * data!=journal && (is_metadata || should_journal_data(inode))
97 BUFFER_TRACE(bh, "call ext3_journal_revoke");
98 err = ext3_journal_revoke(handle, blocknr, bh);
100 ext3_abort(inode->i_sb, __FUNCTION__,
101 "error %d when attempting revoke", err);
102 BUFFER_TRACE(bh, "exit");
107 * Work out how many blocks we need to progress with the next chunk of a
108 * truncate transaction.
111 static unsigned long blocks_for_truncate(struct inode *inode)
113 unsigned long needed;
115 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
117 /* Give ourselves just enough room to cope with inodes in which
118 * i_blocks is corrupt: we've seen disk corruptions in the past
119 * which resulted in random data in an inode which looked enough
120 * like a regular file for ext3 to try to delete it. Things
121 * will go a bit crazy if that happens, but at least we should
122 * try not to panic the whole kernel. */
126 /* But we need to bound the transaction so we don't overflow the
128 if (needed > EXT3_MAX_TRANS_DATA)
129 needed = EXT3_MAX_TRANS_DATA;
131 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
135 * Truncate transactions can be complex and absolutely huge. So we need to
136 * be able to restart the transaction at a conventient checkpoint to make
137 * sure we don't overflow the journal.
139 * start_transaction gets us a new handle for a truncate transaction,
140 * and extend_transaction tries to extend the existing one a bit. If
141 * extend fails, we need to propagate the failure up and restart the
142 * transaction in the top-level truncate loop. --sct
145 static handle_t *start_transaction(struct inode *inode)
149 result = ext3_journal_start(inode, blocks_for_truncate(inode));
153 ext3_std_error(inode->i_sb, PTR_ERR(result));
158 * Try to extend this transaction for the purposes of truncation.
160 * Returns 0 if we managed to create more room. If we can't create more
161 * room, and the transaction must be restarted we return 1.
163 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
165 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
167 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
173 * Restart the transaction associated with *handle. This does a commit,
174 * so before we call here everything must be consistently dirtied against
177 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
179 jbd_debug(2, "restarting handle %p\n", handle);
180 return ext3_journal_restart(handle, blocks_for_truncate(inode));
184 * Called at the last iput() if i_nlink is zero.
186 void ext3_delete_inode (struct inode * inode)
190 truncate_inode_pages(&inode->i_data, 0);
192 if (is_bad_inode(inode))
195 handle = start_transaction(inode);
196 if (IS_ERR(handle)) {
197 /* If we're going to skip the normal cleanup, we still
198 * need to make sure that the in-core orphan linked list
199 * is properly cleaned up. */
200 ext3_orphan_del(NULL, inode);
208 ext3_truncate(inode);
210 * Kill off the orphan record which ext3_truncate created.
211 * AKPM: I think this can be inside the above `if'.
212 * Note that ext3_orphan_del() has to be able to cope with the
213 * deletion of a non-existent orphan - this is because we don't
214 * know if ext3_truncate() actually created an orphan record.
215 * (Well, we could do this if we need to, but heck - it works)
217 ext3_orphan_del(handle, inode);
218 EXT3_I(inode)->i_dtime = get_seconds();
221 * One subtle ordering requirement: if anything has gone wrong
222 * (transaction abort, IO errors, whatever), then we can still
223 * do these next steps (the fs will already have been marked as
224 * having errors), but we can't free the inode if the mark_dirty
227 if (ext3_mark_inode_dirty(handle, inode))
228 /* If that failed, just do the required in-core inode clear. */
231 ext3_free_inode(handle, inode);
232 ext3_journal_stop(handle);
235 clear_inode(inode); /* We must guarantee clearing of inode... */
241 struct buffer_head *bh;
244 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
246 p->key = *(p->p = v);
250 static inline int verify_chain(Indirect *from, Indirect *to)
252 while (from <= to && from->key == *from->p)
258 * ext3_block_to_path - parse the block number into array of offsets
259 * @inode: inode in question (we are only interested in its superblock)
260 * @i_block: block number to be parsed
261 * @offsets: array to store the offsets in
262 * @boundary: set this non-zero if the referred-to block is likely to be
263 * followed (on disk) by an indirect block.
265 * To store the locations of file's data ext3 uses a data structure common
266 * for UNIX filesystems - tree of pointers anchored in the inode, with
267 * data blocks at leaves and indirect blocks in intermediate nodes.
268 * This function translates the block number into path in that tree -
269 * return value is the path length and @offsets[n] is the offset of
270 * pointer to (n+1)th node in the nth one. If @block is out of range
271 * (negative or too large) warning is printed and zero returned.
273 * Note: function doesn't find node addresses, so no IO is needed. All
274 * we need to know is the capacity of indirect blocks (taken from the
279 * Portability note: the last comparison (check that we fit into triple
280 * indirect block) is spelled differently, because otherwise on an
281 * architecture with 32-bit longs and 8Kb pages we might get into trouble
282 * if our filesystem had 8Kb blocks. We might use long long, but that would
283 * kill us on x86. Oh, well, at least the sign propagation does not matter -
284 * i_block would have to be negative in the very beginning, so we would not
288 static int ext3_block_to_path(struct inode *inode,
289 long i_block, int offsets[4], int *boundary)
291 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
292 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
293 const long direct_blocks = EXT3_NDIR_BLOCKS,
294 indirect_blocks = ptrs,
295 double_blocks = (1 << (ptrs_bits * 2));
300 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
301 } else if (i_block < direct_blocks) {
302 offsets[n++] = i_block;
303 final = direct_blocks;
304 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
305 offsets[n++] = EXT3_IND_BLOCK;
306 offsets[n++] = i_block;
308 } else if ((i_block -= indirect_blocks) < double_blocks) {
309 offsets[n++] = EXT3_DIND_BLOCK;
310 offsets[n++] = i_block >> ptrs_bits;
311 offsets[n++] = i_block & (ptrs - 1);
313 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
314 offsets[n++] = EXT3_TIND_BLOCK;
315 offsets[n++] = i_block >> (ptrs_bits * 2);
316 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
317 offsets[n++] = i_block & (ptrs - 1);
320 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
323 *boundary = final - 1 - (i_block & (ptrs - 1));
328 * ext3_get_branch - read the chain of indirect blocks leading to data
329 * @inode: inode in question
330 * @depth: depth of the chain (1 - direct pointer, etc.)
331 * @offsets: offsets of pointers in inode/indirect blocks
332 * @chain: place to store the result
333 * @err: here we store the error value
335 * Function fills the array of triples <key, p, bh> and returns %NULL
336 * if everything went OK or the pointer to the last filled triple
337 * (incomplete one) otherwise. Upon the return chain[i].key contains
338 * the number of (i+1)-th block in the chain (as it is stored in memory,
339 * i.e. little-endian 32-bit), chain[i].p contains the address of that
340 * number (it points into struct inode for i==0 and into the bh->b_data
341 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
342 * block for i>0 and NULL for i==0. In other words, it holds the block
343 * numbers of the chain, addresses they were taken from (and where we can
344 * verify that chain did not change) and buffer_heads hosting these
347 * Function stops when it stumbles upon zero pointer (absent block)
348 * (pointer to last triple returned, *@err == 0)
349 * or when it gets an IO error reading an indirect block
350 * (ditto, *@err == -EIO)
351 * or when it notices that chain had been changed while it was reading
352 * (ditto, *@err == -EAGAIN)
353 * or when it reads all @depth-1 indirect blocks successfully and finds
354 * the whole chain, all way to the data (returns %NULL, *err == 0).
356 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
357 Indirect chain[4], int *err)
359 struct super_block *sb = inode->i_sb;
361 struct buffer_head *bh;
364 /* i_data is not going away, no lock needed */
365 add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
369 bh = sb_bread(sb, le32_to_cpu(p->key));
372 /* Reader: pointers */
373 if (!verify_chain(chain, p))
375 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
393 * ext3_find_near - find a place for allocation with sufficient locality
395 * @ind: descriptor of indirect block.
397 * This function returns the prefered place for block allocation.
398 * It is used when heuristic for sequential allocation fails.
400 * + if there is a block to the left of our position - allocate near it.
401 * + if pointer will live in indirect block - allocate near that block.
402 * + if pointer will live in inode - allocate in the same
405 * In the latter case we colour the starting block by the callers PID to
406 * prevent it from clashing with concurrent allocations for a different inode
407 * in the same block group. The PID is used here so that functionally related
408 * files will be close-by on-disk.
410 * Caller must make sure that @ind is valid and will stay that way.
413 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
415 struct ext3_inode_info *ei = EXT3_I(inode);
416 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
418 unsigned long bg_start;
419 unsigned long colour;
421 /* Try to find previous block */
422 for (p = ind->p - 1; p >= start; p--)
424 return le32_to_cpu(*p);
426 /* No such thing, so let's try location of indirect block */
428 return ind->bh->b_blocknr;
431 * It is going to be refered from inode itself? OK, just put it into
432 * the same cylinder group then.
434 bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
435 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
436 colour = (current->pid % 16) *
437 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
438 return bg_start + colour;
442 * ext3_find_goal - find a prefered place for allocation.
444 * @block: block we want
445 * @chain: chain of indirect blocks
446 * @partial: pointer to the last triple within a chain
447 * @goal: place to store the result.
449 * Normally this function find the prefered place for block allocation,
450 * stores it in *@goal and returns zero.
453 static unsigned long ext3_find_goal(struct inode *inode, long block,
454 Indirect chain[4], Indirect *partial)
456 struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
459 * try the heuristic for sequential allocation,
460 * failing that at least try to get decent locality.
462 if (block_i && (block == block_i->last_alloc_logical_block + 1)
463 && (block_i->last_alloc_physical_block != 0)) {
464 return block_i->last_alloc_physical_block + 1;
467 return ext3_find_near(inode, partial);
470 * ext3_blks_to_allocate: Look up the block map and count the number
471 * of direct blocks need to be allocated for the given branch.
473 * @branch: chain of indirect blocks
474 * @k: number of blocks need for indirect blocks
475 * @blks: number of data blocks to be mapped.
476 * @blocks_to_boundary: the offset in the indirect block
478 * return the total number of blocks to be allocate, including the
479 * direct and indirect blocks.
482 ext3_blks_to_allocate(Indirect * branch, int k, unsigned long blks,
483 int blocks_to_boundary)
485 unsigned long count = 0;
488 * Simple case, [t,d]Indirect block(s) has not allocated yet
489 * then it's clear blocks on that path have not allocated
492 /* right now don't hanel cross boundary allocation */
493 if (blks < blocks_to_boundary + 1)
496 count += blocks_to_boundary + 1;
501 while (count < blks && count <= blocks_to_boundary &&
502 le32_to_cpu(*(branch[0].p + count)) == 0) {
509 * ext3_alloc_blocks: multiple allocate blocks needed for a branch
510 * @indirect_blks: the number of blocks need to allocate for indirect
513 * @new_blocks: on return it will store the new block numbers for
514 * the indirect blocks(if needed) and the first direct block,
515 * @blks: on return it will store the total number of allocated
518 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
519 unsigned long goal, int indirect_blks, int blks,
520 unsigned long long new_blocks[4], int *err)
523 unsigned long count = 0;
525 unsigned long current_block = 0;
529 * Here we try to allocate the requested multiple blocks at once,
530 * on a best-effort basis.
531 * To build a branch, we should allocate blocks for
532 * the indirect blocks(if not allocated yet), and at least
533 * the first direct block of this branch. That's the
534 * minimum number of blocks need to allocate(required)
536 target = blks + indirect_blks;
540 /* allocating blocks for indirect blocks and direct blocks */
541 current_block = ext3_new_blocks(handle, inode, goal, &count, err);
546 /* allocate blocks for indirect blocks */
547 while (index < indirect_blks && count) {
548 new_blocks[index++] = current_block++;
556 /* save the new block number for the first direct block */
557 new_blocks[index] = current_block;
559 /* total number of blocks allocated for direct blocks */
564 for (i = 0; i <index; i++)
565 ext3_free_blocks(handle, inode, new_blocks[i], 1);
570 * ext3_alloc_branch - allocate and set up a chain of blocks.
572 * @indirect_blks: number of allocated indirect blocks
573 * @blks: number of allocated direct blocks
574 * @offsets: offsets (in the blocks) to store the pointers to next.
575 * @branch: place to store the chain in.
577 * This function allocates blocks, zeroes out all but the last one,
578 * links them into chain and (if we are synchronous) writes them to disk.
579 * In other words, it prepares a branch that can be spliced onto the
580 * inode. It stores the information about that chain in the branch[], in
581 * the same format as ext3_get_branch() would do. We are calling it after
582 * we had read the existing part of chain and partial points to the last
583 * triple of that (one with zero ->key). Upon the exit we have the same
584 * picture as after the successful ext3_get_block(), except that in one
585 * place chain is disconnected - *branch->p is still zero (we did not
586 * set the last link), but branch->key contains the number that should
587 * be placed into *branch->p to fill that gap.
589 * If allocation fails we free all blocks we've allocated (and forget
590 * their buffer_heads) and return the error value the from failed
591 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
592 * as described above and return 0.
595 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
596 int indirect_blks, int *blks, unsigned long goal,
597 int *offsets, Indirect *branch)
599 int blocksize = inode->i_sb->s_blocksize;
602 struct buffer_head *bh;
604 unsigned long long new_blocks[4];
605 unsigned long long current_block;
607 num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
608 *blks, new_blocks, &err);
612 branch[0].key = cpu_to_le32(new_blocks[0]);
614 * metadata blocks and data blocks are allocated.
616 for (n = 1; n <= indirect_blks; n++) {
618 * Get buffer_head for parent block, zero it out
619 * and set the pointer to new one, then send
622 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
625 BUFFER_TRACE(bh, "call get_create_access");
626 err = ext3_journal_get_create_access(handle, bh);
633 memset(bh->b_data, 0, blocksize);
634 branch[n].p = (__le32 *) bh->b_data + offsets[n];
635 branch[n].key = cpu_to_le32(new_blocks[n]);
636 *branch[n].p = branch[n].key;
637 if ( n == indirect_blks) {
638 current_block = new_blocks[n];
640 * End of chain, update the last new metablock of
641 * the chain to point to the new allocated
642 * data blocks numbers
644 for (i=1; i < num; i++)
645 *(branch[n].p + i) = cpu_to_le32(++current_block);
647 BUFFER_TRACE(bh, "marking uptodate");
648 set_buffer_uptodate(bh);
651 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
652 err = ext3_journal_dirty_metadata(handle, bh);
659 /* Allocation failed, free what we already allocated */
660 for (i = 1; i <= n ; i++) {
661 BUFFER_TRACE(branch[i].bh, "call journal_forget");
662 ext3_journal_forget(handle, branch[i].bh);
664 for (i = 0; i <indirect_blks; i++)
665 ext3_free_blocks(handle, inode, new_blocks[i], 1);
667 ext3_free_blocks(handle, inode, new_blocks[i], num);
673 * ext3_splice_branch - splice the allocated branch onto inode.
675 * @block: (logical) number of block we are adding
676 * @chain: chain of indirect blocks (with a missing link - see
678 * @where: location of missing link
679 * @num: number of indirect blocks we are adding
680 * @blks: number of direct blocks we are adding
682 * This function fills the missing link and does all housekeeping needed in
683 * inode (->i_blocks, etc.). In case of success we end up with the full
684 * chain to new block and return 0.
687 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
688 Indirect *where, int num, int blks)
692 struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
693 unsigned long current_block;
695 * If we're splicing into a [td]indirect block (as opposed to the
696 * inode) then we need to get write access to the [td]indirect block
700 BUFFER_TRACE(where->bh, "get_write_access");
701 err = ext3_journal_get_write_access(handle, where->bh);
707 *where->p = where->key;
708 /* update host bufferhead or inode to point to
709 * more just allocated direct blocks blocks */
710 if (num == 0 && blks > 1) {
711 current_block = le32_to_cpu(where->key + 1);
712 for (i = 1; i < blks; i++)
713 *(where->p + i ) = cpu_to_le32(current_block++);
717 * update the most recently allocated logical & physical block
718 * in i_block_alloc_info, to assist find the proper goal block for next
722 block_i->last_alloc_logical_block = block + blks - 1;
723 block_i->last_alloc_physical_block = le32_to_cpu(where[num].key + blks - 1);
726 /* We are done with atomic stuff, now do the rest of housekeeping */
728 inode->i_ctime = CURRENT_TIME_SEC;
729 ext3_mark_inode_dirty(handle, inode);
731 /* had we spliced it onto indirect block? */
734 * akpm: If we spliced it onto an indirect block, we haven't
735 * altered the inode. Note however that if it is being spliced
736 * onto an indirect block at the very end of the file (the
737 * file is growing) then we *will* alter the inode to reflect
738 * the new i_size. But that is not done here - it is done in
739 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
741 jbd_debug(5, "splicing indirect only\n");
742 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
743 err = ext3_journal_dirty_metadata(handle, where->bh);
748 * OK, we spliced it into the inode itself on a direct block.
749 * Inode was dirtied above.
751 jbd_debug(5, "splicing direct\n");
756 for (i = 1; i <= num; i++) {
757 BUFFER_TRACE(where[i].bh, "call journal_forget");
758 ext3_journal_forget(handle, where[i].bh);
759 ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
761 ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
767 * Allocation strategy is simple: if we have to allocate something, we will
768 * have to go the whole way to leaf. So let's do it before attaching anything
769 * to tree, set linkage between the newborn blocks, write them if sync is
770 * required, recheck the path, free and repeat if check fails, otherwise
771 * set the last missing link (that will protect us from any truncate-generated
772 * removals - all blocks on the path are immune now) and possibly force the
773 * write on the parent block.
774 * That has a nice additional property: no special recovery from the failed
775 * allocations is needed - we simply release blocks and do not touch anything
776 * reachable from inode.
778 * akpm: `handle' can be NULL if create == 0.
780 * The BKL may not be held on entry here. Be sure to take it early.
781 * return > 0, # of blocks mapped or allocated.
782 * return = 0, if plain lookup failed.
783 * return < 0, error case.
787 ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock,
788 unsigned long maxblocks, struct buffer_head *bh_result,
789 int create, int extend_disksize)
797 int blocks_to_boundary = 0;
799 struct ext3_inode_info *ei = EXT3_I(inode);
801 unsigned long first_block = 0;
804 J_ASSERT(handle != NULL || create == 0);
805 depth = ext3_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
810 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
812 /* Simplest case - block found, no allocation needed */
814 first_block = chain[depth - 1].key;
815 clear_buffer_new(bh_result);
818 while (count < maxblocks && count <= blocks_to_boundary) {
819 if (!verify_chain(chain, partial)) {
821 * Indirect block might be removed by
822 * truncate while we were reading it.
823 * Handling of that case: forget what we've
824 * got now. Flag the err as EAGAIN, so it
831 if (le32_to_cpu(*(chain[depth-1].p+count) ==
832 (first_block + count)))
841 /* Next simple case - plain lookup or failed read of indirect block */
842 if (!create || err == -EIO)
845 mutex_lock(&ei->truncate_mutex);
848 * If the indirect block is missing while we are reading
849 * the chain(ext3_get_branch() returns -EAGAIN err), or
850 * if the chain has been changed after we grab the semaphore,
851 * (either because another process truncated this branch, or
852 * another get_block allocated this branch) re-grab the chain to see if
853 * the request block has been allocated or not.
855 * Since we already block the truncate/other get_block
856 * at this point, we will have the current copy of the chain when we
857 * splice the branch into the tree.
859 if (err == -EAGAIN || !verify_chain(chain, partial)) {
860 while (partial > chain) {
864 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
867 mutex_unlock(&ei->truncate_mutex);
870 clear_buffer_new(bh_result);
876 * Okay, we need to do block allocation. Lazily initialize the block
877 * allocation info here if necessary
879 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
880 ext3_init_block_alloc_info(inode);
882 goal = ext3_find_goal(inode, iblock, chain, partial);
884 /* the number of blocks need to allocate for [d,t]indirect blocks */
885 indirect_blks = (chain + depth) - partial - 1;
888 * Next look up the indirect map to count the totoal number of
889 * direct blocks to allocate for this branch.
891 count = ext3_blks_to_allocate(partial, indirect_blks,
892 maxblocks, blocks_to_boundary);
894 * Block out ext3_truncate while we alter the tree
896 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
897 offsets + (partial - chain), partial);
900 * The ext3_splice_branch call will free and forget any buffers
901 * on the new chain if there is a failure, but that risks using
902 * up transaction credits, especially for bitmaps where the
903 * credits cannot be returned. Can we handle this somehow? We
904 * may need to return -EAGAIN upwards in the worst case. --sct
907 err = ext3_splice_branch(handle, inode, iblock,
908 partial, indirect_blks, count);
910 * i_disksize growing is protected by truncate_mutex. Don't forget to
911 * protect it if you're about to implement concurrent
912 * ext3_get_block() -bzzz
914 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
915 ei->i_disksize = inode->i_size;
916 mutex_unlock(&ei->truncate_mutex);
920 set_buffer_new(bh_result);
922 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
923 if (blocks_to_boundary == 0)
924 set_buffer_boundary(bh_result);
926 /* Clean up and exit */
927 partial = chain + depth - 1; /* the whole chain */
929 while (partial > chain) {
930 BUFFER_TRACE(partial->bh, "call brelse");
934 BUFFER_TRACE(bh_result, "returned");
939 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
942 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
943 struct buffer_head *bh_result, int create)
945 handle_t *handle = journal_current_handle();
947 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
950 goto get_block; /* A read */
953 goto get_block; /* A single block get */
955 if (handle->h_transaction->t_state == T_LOCKED) {
957 * Huge direct-io writes can hold off commits for long
958 * periods of time. Let this commit run.
960 ext3_journal_stop(handle);
961 handle = ext3_journal_start(inode, DIO_CREDITS);
963 ret = PTR_ERR(handle);
967 if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
969 * Getting low on buffer credits...
971 ret = ext3_journal_extend(handle, DIO_CREDITS);
974 * Couldn't extend the transaction. Start a new one.
976 ret = ext3_journal_restart(handle, DIO_CREDITS);
982 ret = ext3_get_blocks_handle(handle, inode, iblock,
983 max_blocks, bh_result, create, 0);
985 bh_result->b_size = (ret << inode->i_blkbits);
992 static int ext3_get_block(struct inode *inode, sector_t iblock,
993 struct buffer_head *bh_result, int create)
995 return ext3_direct_io_get_blocks(inode, iblock, bh_result, create);
999 * `handle' can be NULL if create is zero
1001 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
1002 long block, int create, int * errp)
1004 struct buffer_head dummy;
1007 J_ASSERT(handle != NULL || create == 0);
1010 dummy.b_blocknr = -1000;
1011 buffer_trace_init(&dummy.b_history);
1012 err = ext3_get_blocks_handle(handle, inode, block, 1,
1016 } else if (err >= 0) {
1021 if (!err && buffer_mapped(&dummy)) {
1022 struct buffer_head *bh;
1023 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1028 if (buffer_new(&dummy)) {
1029 J_ASSERT(create != 0);
1030 J_ASSERT(handle != 0);
1032 /* Now that we do not always journal data, we
1033 should keep in mind whether this should
1034 always journal the new buffer as metadata.
1035 For now, regular file writes use
1036 ext3_get_block instead, so it's not a
1039 BUFFER_TRACE(bh, "call get_create_access");
1040 fatal = ext3_journal_get_create_access(handle, bh);
1041 if (!fatal && !buffer_uptodate(bh)) {
1042 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1043 set_buffer_uptodate(bh);
1046 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1047 err = ext3_journal_dirty_metadata(handle, bh);
1051 BUFFER_TRACE(bh, "not a new buffer");
1064 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
1065 int block, int create, int *err)
1067 struct buffer_head * bh;
1069 bh = ext3_getblk(handle, inode, block, create, err);
1072 if (buffer_uptodate(bh))
1074 ll_rw_block(READ, 1, &bh);
1076 if (buffer_uptodate(bh))
1083 static int walk_page_buffers( handle_t *handle,
1084 struct buffer_head *head,
1088 int (*fn)( handle_t *handle,
1089 struct buffer_head *bh))
1091 struct buffer_head *bh;
1092 unsigned block_start, block_end;
1093 unsigned blocksize = head->b_size;
1095 struct buffer_head *next;
1097 for ( bh = head, block_start = 0;
1098 ret == 0 && (bh != head || !block_start);
1099 block_start = block_end, bh = next)
1101 next = bh->b_this_page;
1102 block_end = block_start + blocksize;
1103 if (block_end <= from || block_start >= to) {
1104 if (partial && !buffer_uptodate(bh))
1108 err = (*fn)(handle, bh);
1116 * To preserve ordering, it is essential that the hole instantiation and
1117 * the data write be encapsulated in a single transaction. We cannot
1118 * close off a transaction and start a new one between the ext3_get_block()
1119 * and the commit_write(). So doing the journal_start at the start of
1120 * prepare_write() is the right place.
1122 * Also, this function can nest inside ext3_writepage() ->
1123 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1124 * has generated enough buffer credits to do the whole page. So we won't
1125 * block on the journal in that case, which is good, because the caller may
1128 * By accident, ext3 can be reentered when a transaction is open via
1129 * quota file writes. If we were to commit the transaction while thus
1130 * reentered, there can be a deadlock - we would be holding a quota
1131 * lock, and the commit would never complete if another thread had a
1132 * transaction open and was blocking on the quota lock - a ranking
1135 * So what we do is to rely on the fact that journal_stop/journal_start
1136 * will _not_ run commit under these circumstances because handle->h_ref
1137 * is elevated. We'll still have enough credits for the tiny quotafile
1141 static int do_journal_get_write_access(handle_t *handle,
1142 struct buffer_head *bh)
1144 if (!buffer_mapped(bh) || buffer_freed(bh))
1146 return ext3_journal_get_write_access(handle, bh);
/*
 * .prepare_write address_space operation, shared by all three ext3 data
 * modes.  Starts a journal handle with enough credits for a full page
 * write (ext3_writepage_trans_blocks()), maps/prepares the buffers via
 * ext3_get_block() (nobh variant when the NOBH mount option is set), and
 * in data=journal mode additionally takes JBD write access on each buffer.
 * On failure the handle is stopped; an ENOSPC failure is retried while
 * ext3_should_retry_alloc() says a journal commit may free space.
 * NOTE(review): declarations of `handle' and `retries' sit on lines elided
 * from this listing.
 */
1149 static int ext3_prepare_write(struct file *file, struct page *page,
1150 unsigned from, unsigned to)
1152 struct inode *inode = page->mapping->host;
1153 int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1158 handle = ext3_journal_start(inode, needed_blocks);
1159 if (IS_ERR(handle)) {
1160 ret = PTR_ERR(handle);
1163 if (test_opt(inode->i_sb, NOBH))
1164 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1166 ret = block_prepare_write(page, from, to, ext3_get_block);
1168 goto prepare_write_failed;
1170 if (ext3_should_journal_data(inode)) {
1171 ret = walk_page_buffers(handle, page_buffers(page),
1172 from, to, NULL, do_journal_get_write_access);
1174 prepare_write_failed:
1176 ext3_journal_stop(handle);
1177 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
/*
 * File a buffer as dirty data with JBD (data=ordered semantics); if
 * journal_dirty_data() fails, abort the handle so the transaction is not
 * committed with inconsistent ordering guarantees.
 * NOTE(review): __FUNCTION__ is passed twice (caller name and location);
 * this matches the upstream call signature of ext3_journal_abort_handle()
 * but the remaining arguments (bh, handle, err) sit on an elided line.
 */
1184 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1186 int err = journal_dirty_data(handle, bh);
1188 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1193 /* For commit_write() in data=journal mode */
/*
 * walk_page_buffers() callback for the data=journal commit path: mark each
 * mapped buffer uptodate and file it as journalled metadata.  Unmapped or
 * freed buffers are skipped (their early return is on an elided line).
 */
1194 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1196 if (!buffer_mapped(bh) || buffer_freed(bh))
1198 set_buffer_uptodate(bh);
1199 return ext3_journal_dirty_metadata(handle, bh);
1203 * We need to pick up the new inode size which generic_commit_write gave us
1204 * `file' can be NULL - eg, when called from page_symlink().
1206 * ext3 never places buffers on inode->i_mapping->private_list. metadata
1207 * buffers are managed internally.
/*
 * .commit_write for data=ordered: file each written buffer as ordered data
 * with the journal, push i_disksize forward under the still-open handle so
 * the generic_commit_write() -> mark_inode_dirty() picks it up, then stop
 * the handle (ret2 carries any journal_stop error; the merge of ret/ret2
 * and the return are on elided lines).
 */
1210 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1211 unsigned from, unsigned to)
1213 handle_t *handle = ext3_journal_current_handle();
1214 struct inode *inode = page->mapping->host;
1217 ret = walk_page_buffers(handle, page_buffers(page),
1218 from, to, NULL, ext3_journal_dirty_data);
1222 * generic_commit_write() will run mark_inode_dirty() if i_size
1223 * changes. So let's piggyback the i_disksize mark_inode_dirty
1228 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1229 if (new_i_size > EXT3_I(inode)->i_disksize)
1230 EXT3_I(inode)->i_disksize = new_i_size;
1231 ret = generic_commit_write(file, page, from, to);
1233 ret2 = ext3_journal_stop(handle);
/*
 * .commit_write for data=writeback: advance i_disksize under the open
 * handle, then let the generic (or nobh, when the NOBH mount option is
 * set) commit path dirty the page, and stop the handle.
 */
1239 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1240 unsigned from, unsigned to)
1242 handle_t *handle = ext3_journal_current_handle();
1243 struct inode *inode = page->mapping->host;
1247 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1248 if (new_i_size > EXT3_I(inode)->i_disksize)
1249 EXT3_I(inode)->i_disksize = new_i_size;
1251 if (test_opt(inode->i_sb, NOBH))
1252 ret = nobh_commit_write(file, page, from, to);
1254 ret = generic_commit_write(file, page, from, to);
1256 ret2 = ext3_journal_stop(handle);
/*
 * .commit_write for data=journal: journal every written buffer as metadata
 * (commit_write_fn), open-code the i_size update that generic_commit_write()
 * would have done, tag the inode EXT3_STATE_JDATA so ext3_bmap() knows
 * journalled data may still be in flight, and sync i_disksize with i_size
 * under the handle before stopping it.
 */
1262 static int ext3_journalled_commit_write(struct file *file,
1263 struct page *page, unsigned from, unsigned to)
1265 handle_t *handle = ext3_journal_current_handle();
1266 struct inode *inode = page->mapping->host;
1272 * Here we duplicate the generic_commit_write() functionality
1274 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1276 ret = walk_page_buffers(handle, page_buffers(page), from,
1277 to, &partial, commit_write_fn);
	/* Only mark the whole page uptodate if every buffer was covered */
1279 SetPageUptodate(page);
1280 if (pos > inode->i_size)
1281 i_size_write(inode, pos);
1282 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1283 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1284 EXT3_I(inode)->i_disksize = inode->i_size;
1285 ret2 = ext3_mark_inode_dirty(handle, inode);
1289 ret2 = ext3_journal_stop(handle);
1296 * bmap() is special. It gets used by applications such as lilo and by
1297 * the swapper to find the on-disk block of a specific piece of data.
1299 * Naturally, this is dangerous if the block concerned is still in the
1300 * journal. If somebody makes a swapfile on an ext3 data-journaling
1301 * filesystem and enables swap, then they may get a nasty shock when the
1302 * data getting swapped to that swapfile suddenly gets overwritten by
1303 * the original zero's written out previously to the journal and
1304 * awaiting writeback in the kernel's buffer cache.
1306 * So, if we see any bmap calls here on a modified, data-journaled file,
1307 * take extra steps to flush any blocks which might be in the cache.
/*
 * .bmap address_space operation.  If the file has journalled data pending
 * (EXT3_STATE_JDATA), force a full journal flush first so the block numbers
 * we report refer to stable on-disk data, then delegate to
 * generic_block_bmap() with ext3_get_block().
 */
1309 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1311 struct inode *inode = mapping->host;
1315 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1317 * This is a REALLY heavyweight approach, but the use of
1318 * bmap on dirty files is expected to be extremely rare:
1319 * only if we run lilo or swapon on a freshly made file
1320 * do we expect this to happen.
1322 * (bmap requires CAP_SYS_RAWIO so this does not
1323 * represent an unprivileged user DOS attack --- we'd be
1324 * in trouble if mortal users could trigger this path at
1327 * NB. EXT3_STATE_JDATA is not set on files other than
1328 * regular files. If somebody wants to bmap a directory
1329 * or symlink and gets confused because the buffer
1330 * hasn't yet been flushed to disk, they deserve
1331 * everything they get.
	/* Clear the flag before flushing; new dirtiers will set it again */
1334 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1335 journal = EXT3_JOURNAL(inode);
1336 journal_lock_updates(journal);
1337 err = journal_flush(journal);
1338 journal_unlock_updates(journal);
1344 return generic_block_bmap(mapping,block,ext3_get_block);
/*
 * walk_page_buffers() callback: take an extra reference on a buffer_head
 * so it survives page unlock/truncate during ordered writepage.  Body is
 * on lines elided from this listing.
 */
1347 static int bget_one(handle_t *handle, struct buffer_head *bh)
/*
 * walk_page_buffers() callback: drop the reference taken by bget_one().
 * Body is on lines elided from this listing.
 */
1353 static int bput_one(handle_t *handle, struct buffer_head *bh)
/*
 * walk_page_buffers() callback: file a mapped buffer as ordered data with
 * the journal; unmapped buffers are ignored (their return value is on an
 * elided line).
 */
1359 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1361 if (buffer_mapped(bh))
1362 return ext3_journal_dirty_data(handle, bh);
1367 * Note that we always start a transaction even if we're not journalling
1368 * data. This is to preserve ordering: any hole instantiation within
1369 * __block_write_full_page -> ext3_get_block() should be journalled
1370 * along with the data so we don't crash and then get metadata which
1371 * refers to old data.
1373 * In all journalling modes block_write_full_page() will start the I/O.
1377 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1382 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1384 * Same applies to ext3_get_block(). We will deadlock on various things like
1385 * lock_journal and i_truncate_mutex.
1387 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1390 * 16May01: If we're reentered then journal_current_handle() will be
1391 * non-zero. We simply *return*.
1393 * 1 July 2001: @@@ FIXME:
1394 * In journalled data mode, a data buffer may be metadata against the
1395 * current transaction. But the same file is part of a shared mapping
1396 * and someone does a writepage() on it.
1398 * We will move the buffer onto the async_data list, but *after* it has
1399 * been dirtied. So there's a small window where we have dirty data on
1402 * Note that this only applies to the last partial page in the file. The
1403 * bit which block_write_full_page() uses prepare/commit for. (That's
1404 * broken code anyway: it's wrong for msync()).
1406 * It's a rare case: affects the final partial page, for journalled data
1407 * where the file is subject to both write() and writepage() in the same
1408 * transaction. To fix it we'll need a custom block_write_full_page().
1409 * We'll probably need that anyway for journalling writepage() output.
1411 * We don't honour synchronous mounts for writepage(). That would be
1412 * disastrous. Any write() or metadata operation will sync the fs for
1415 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1416 * we don't need to open a transaction here.
/*
 * .writepage for data=ordered.  Bails out if we are reentered with a
 * transaction already open (see the long comment above), otherwise starts
 * a handle, pins the page's buffers (bget_one) across the write so they
 * outlive a possible concurrent truncate, submits the page with
 * block_write_full_page(), and on success attaches the buffers to the
 * current transaction as ordered data before unpinning them and stopping
 * the handle.  Failure paths redirty the page for a later attempt.
 */
1418 static int ext3_ordered_writepage(struct page *page,
1419 struct writeback_control *wbc)
1421 struct inode *inode = page->mapping->host;
1422 struct buffer_head *page_bufs;
1423 handle_t *handle = NULL;
1427 J_ASSERT(PageLocked(page));
1430 * We give up here if we're reentered, because it might be for a
1431 * different filesystem.
1433 if (ext3_journal_current_handle())
1436 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1438 if (IS_ERR(handle)) {
1439 ret = PTR_ERR(handle);
1443 if (!page_has_buffers(page)) {
1444 create_empty_buffers(page, inode->i_sb->s_blocksize,
1445 (1 << BH_Dirty)|(1 << BH_Uptodate));
1447 page_bufs = page_buffers(page);
1448 walk_page_buffers(handle, page_bufs, 0,
1449 PAGE_CACHE_SIZE, NULL, bget_one);
1451 ret = block_write_full_page(page, ext3_get_block, wbc);
1454 * The page can become unlocked at any point now, and
1455 * truncate can then come in and change things. So we
1456 * can't touch *page from now on. But *page_bufs is
1457 * safe due to elevated refcount.
1461 * And attach them to the current transaction. But only if
1462 * block_write_full_page() succeeded. Otherwise they are unmapped,
1463 * and generally junk.
1466 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1467 NULL, journal_dirty_data_fn);
1471 walk_page_buffers(handle, page_bufs, 0,
1472 PAGE_CACHE_SIZE, NULL, bput_one);
1473 err = ext3_journal_stop(handle);
1479 redirty_page_for_writepage(wbc, page);
/*
 * .writepage for data=writeback: skip if reentered with an open handle,
 * otherwise start a transaction (needed so any hole instantiation inside
 * ext3_get_block() is journalled), write the page via the nobh or generic
 * path, and stop the handle.  On handle-start failure the page is
 * redirtied for a later attempt.
 */
1484 static int ext3_writeback_writepage(struct page *page,
1485 struct writeback_control *wbc)
1487 struct inode *inode = page->mapping->host;
1488 handle_t *handle = NULL;
1492 if (ext3_journal_current_handle())
1495 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1496 if (IS_ERR(handle)) {
1497 ret = PTR_ERR(handle);
1501 if (test_opt(inode->i_sb, NOBH))
1502 ret = nobh_writepage(page, ext3_get_block, wbc);
1504 ret = block_write_full_page(page, ext3_get_block, wbc);
1506 err = ext3_journal_stop(handle);
1512 redirty_page_for_writepage(wbc, page);
/*
 * .writepage for data=journal.  Two cases:
 *  - PageChecked (set by ext3_journalled_set_page_dirty) or no buffers:
 *    the page was dirtied via mmap, so journal it properly now --
 *    prepare the whole page, take write access on each buffer, then file
 *    them all as journalled metadata and tag the inode EXT3_STATE_JDATA.
 *  - otherwise fall through to block_write_full_page(), which handles
 *    checkpoint-mode buffers correctly.
 * Reentry with an open handle bails out, as in the other writepage modes.
 */
1517 static int ext3_journalled_writepage(struct page *page,
1518 struct writeback_control *wbc)
1520 struct inode *inode = page->mapping->host;
1521 handle_t *handle = NULL;
1525 if (ext3_journal_current_handle())
1528 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1529 if (IS_ERR(handle)) {
1530 ret = PTR_ERR(handle);
1534 if (!page_has_buffers(page) || PageChecked(page)) {
1536 * It's mmapped pagecache. Add buffers and journal it. There
1537 * doesn't seem much point in redirtying the page here.
1539 ClearPageChecked(page);
1540 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1543 ext3_journal_stop(handle);
1546 ret = walk_page_buffers(handle, page_buffers(page), 0,
1547 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1549 err = walk_page_buffers(handle, page_buffers(page), 0,
1550 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1553 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1557 * It may be a page full of checkpoint-mode buffers. We don't
1558 * really know unless we go poke around in the buffer_heads.
1559 * But block_write_full_page will do the right thing.
1561 ret = block_write_full_page(page, ext3_get_block, wbc);
1563 err = ext3_journal_stop(handle);
1570 redirty_page_for_writepage(wbc, page);
/* .readpage: plain delegation to mpage_readpage() with ext3's block mapper. */
1576 static int ext3_readpage(struct file *file, struct page *page)
1578 return mpage_readpage(page, ext3_get_block);
/* .readpages: batched readahead, delegated to mpage_readpages(). */
1582 ext3_readpages(struct file *file, struct address_space *mapping,
1583 struct list_head *pages, unsigned nr_pages)
1585 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
/*
 * .invalidatepage: on a full-page invalidation (offset == 0, test on an
 * elided line) drop the "pending journalled dirty" mark, then let JBD
 * invalidate the page's buffers.
 */
1588 static void ext3_invalidatepage(struct page *page, unsigned long offset)
1590 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1593 * If it's a full truncate we just forget about the pending dirtying
1596 ClearPageChecked(page);
1598 journal_invalidatepage(journal, page, offset);
/*
 * .releasepage: a page with pending journalled dirtying (PageChecked)
 * should never reach here -- warn if it does.  Pages without buffers have
 * nothing for JBD to free (return path on an elided line); otherwise ask
 * JBD whether the buffers can be released.
 */
1601 static int ext3_releasepage(struct page *page, gfp_t wait)
1603 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1605 WARN_ON(PageChecked(page));
1606 if (!page_has_buffers(page))
1608 return journal_try_to_free_buffers(journal, page, wait);
1612 * If the O_DIRECT write will extend the file then add this inode to the
1613 * orphan list. So recovery will truncate it back to the original size
1614 * if the machine crashes during the write.
1616 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1617 * crashes then stale disk data _may_ be exposed inside the file.
/*
 * .direct_IO handler.  For a size-extending write (final_size > i_size,
 * gated on rw == WRITE on an elided line), the inode is first put on the
 * orphan list under a DIO_CREDITS-sized handle so a crash mid-write gets
 * truncated back cleanly on recovery.  After blockdev_direct_IO() the
 * handle is re-fetched (ext3_direct_io_get_blocks can restart the
 * transaction), the orphan entry is removed, and i_size/i_disksize are
 * pushed out to cover the bytes actually written.
 */
1619 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1620 const struct iovec *iov, loff_t offset,
1621 unsigned long nr_segs)
1623 struct file *file = iocb->ki_filp;
1624 struct inode *inode = file->f_mapping->host;
1625 struct ext3_inode_info *ei = EXT3_I(inode);
1626 handle_t *handle = NULL;
1629 size_t count = iov_length(iov, nr_segs);
1632 loff_t final_size = offset + count;
1634 handle = ext3_journal_start(inode, DIO_CREDITS);
1635 if (IS_ERR(handle)) {
1636 ret = PTR_ERR(handle);
1639 if (final_size > inode->i_size) {
1640 ret = ext3_orphan_add(handle, inode);
1644 ei->i_disksize = inode->i_size;
1648 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1650 ext3_direct_io_get_blocks, NULL);
1653 * Reacquire the handle: ext3_direct_io_get_block() can restart the
1656 handle = journal_current_handle();
1662 if (orphan && inode->i_nlink)
1663 ext3_orphan_del(handle, inode);
1664 if (orphan && ret > 0) {
1665 loff_t end = offset + ret;
1666 if (end > inode->i_size) {
1667 ei->i_disksize = end;
1668 i_size_write(inode, end);
1670 * We're going to return a positive `ret'
1671 * here due to non-zero-length I/O, so there's
1672 * no way of reporting error returns from
1673 * ext3_mark_inode_dirty() to userspace. So
1676 ext3_mark_inode_dirty(handle, inode);
1679 err = ext3_journal_stop(handle);
1688 * Pages can be marked dirty completely asynchronously from ext3's journalling
1689 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1690 * much here because ->set_page_dirty is called under VFS locks. The page is
1691 * not necessarily locked.
1693 * We cannot just dirty the page and leave attached buffers clean, because the
1694 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1695 * or jbddirty because all the journalling code will explode.
1697 * So what we do is to mark the page "pending dirty" and next time writepage
1698 * is called, propagate that into the buffers appropriately.
/*
 * .set_page_dirty for data=journal: we may be called under VFS locks with
 * the page unlocked, so we cannot journal the buffers here.  Mark the page
 * "pending dirty" (PageChecked) and dirty it without touching buffers;
 * ext3_journalled_writepage() journals the buffers later.
 */
1700 static int ext3_journalled_set_page_dirty(struct page *page)
1702 SetPageChecked(page);
1703 return __set_page_dirty_nobuffers(page);
/* address_space operations for data=ordered mode. */
1706 static struct address_space_operations ext3_ordered_aops = {
1707 .readpage = ext3_readpage,
1708 .readpages = ext3_readpages,
1709 .writepage = ext3_ordered_writepage,
1710 .sync_page = block_sync_page,
1711 .prepare_write = ext3_prepare_write,
1712 .commit_write = ext3_ordered_commit_write,
1714 .invalidatepage = ext3_invalidatepage,
1715 .releasepage = ext3_releasepage,
1716 .direct_IO = ext3_direct_IO,
1717 .migratepage = buffer_migrate_page,
/* address_space operations for data=writeback mode. */
1720 static struct address_space_operations ext3_writeback_aops = {
1721 .readpage = ext3_readpage,
1722 .readpages = ext3_readpages,
1723 .writepage = ext3_writeback_writepage,
1724 .sync_page = block_sync_page,
1725 .prepare_write = ext3_prepare_write,
1726 .commit_write = ext3_writeback_commit_write,
1728 .invalidatepage = ext3_invalidatepage,
1729 .releasepage = ext3_releasepage,
1730 .direct_IO = ext3_direct_IO,
1731 .migratepage = buffer_migrate_page,
/*
 * address_space operations for data=journal mode.  Note: no .direct_IO or
 * .migratepage here, and a custom .set_page_dirty to defer buffer
 * journalling to writepage time.
 */
1734 static struct address_space_operations ext3_journalled_aops = {
1735 .readpage = ext3_readpage,
1736 .readpages = ext3_readpages,
1737 .writepage = ext3_journalled_writepage,
1738 .sync_page = block_sync_page,
1739 .prepare_write = ext3_prepare_write,
1740 .commit_write = ext3_journalled_commit_write,
1741 .set_page_dirty = ext3_journalled_set_page_dirty,
1743 .invalidatepage = ext3_invalidatepage,
1744 .releasepage = ext3_releasepage,
/*
 * Install the address_space operations matching the inode's data
 * journalling mode (ordered / writeback / journal).
 */
1747 void ext3_set_aops(struct inode *inode)
1749 if (ext3_should_order_data(inode))
1750 inode->i_mapping->a_ops = &ext3_ordered_aops;
1751 else if (ext3_should_writeback_data(inode))
1752 inode->i_mapping->a_ops = &ext3_writeback_aops;
1754 inode->i_mapping->a_ops = &ext3_journalled_aops;
1758 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1759 * up to the end of the block which corresponds to `from'.
1760 * This is required during truncate. We need to physically zero the tail end
1761 * of that block so it doesn't yield old data if the file is later grown.
/*
 * Zero the tail of the block containing file offset `from' so a later
 * extension of the file does not expose stale data.  Fast path: with the
 * nobh option, writeback data and an uptodate page, just memset and dirty
 * the page.  Slow path: locate the buffer covering `offset', map it via
 * ext3_get_block() if needed (a still-unmapped buffer is a hole -- nothing
 * to zero), read it in if not uptodate, then zero the tail and dirty it
 * according to the data journalling mode.  The page is unlocked/released
 * at the end (unlock is on an elided line).
 */
1763 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1764 struct address_space *mapping, loff_t from)
1766 unsigned long index = from >> PAGE_CACHE_SHIFT;
1767 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1768 unsigned blocksize, iblock, length, pos;
1769 struct inode *inode = mapping->host;
1770 struct buffer_head *bh;
1774 blocksize = inode->i_sb->s_blocksize;
1775 length = blocksize - (offset & (blocksize - 1));
1776 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1779 * For "nobh" option, we can only work if we don't need to
1780 * read-in the page - otherwise we create buffers to do the IO.
1782 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1783 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1784 kaddr = kmap_atomic(page, KM_USER0);
1785 memset(kaddr + offset, 0, length);
1786 flush_dcache_page(page);
1787 kunmap_atomic(kaddr, KM_USER0);
1788 set_page_dirty(page);
1792 if (!page_has_buffers(page))
1793 create_empty_buffers(page, blocksize, 0);
1795 /* Find the buffer that contains "offset" */
1796 bh = page_buffers(page);
1798 while (offset >= pos) {
1799 bh = bh->b_this_page;
1805 if (buffer_freed(bh)) {
1806 BUFFER_TRACE(bh, "freed: skip");
1810 if (!buffer_mapped(bh)) {
1811 BUFFER_TRACE(bh, "unmapped");
1812 ext3_get_block(inode, iblock, bh, 0);
1813 /* unmapped? It's a hole - nothing to do */
1814 if (!buffer_mapped(bh)) {
1815 BUFFER_TRACE(bh, "still unmapped");
1820 /* Ok, it's mapped. Make sure it's up-to-date */
1821 if (PageUptodate(page))
1822 set_buffer_uptodate(bh);
1824 if (!buffer_uptodate(bh)) {
1826 ll_rw_block(READ, 1, &bh);
1828 /* Uhhuh. Read error. Complain and punt. */
1829 if (!buffer_uptodate(bh))
1833 if (ext3_should_journal_data(inode)) {
1834 BUFFER_TRACE(bh, "get write access");
1835 err = ext3_journal_get_write_access(handle, bh);
1840 kaddr = kmap_atomic(page, KM_USER0);
1841 memset(kaddr + offset, 0, length);
1842 flush_dcache_page(page);
1843 kunmap_atomic(kaddr, KM_USER0);
1845 BUFFER_TRACE(bh, "zeroed end of block");
1848 if (ext3_should_journal_data(inode)) {
1849 err = ext3_journal_dirty_metadata(handle, bh);
1851 if (ext3_should_order_data(inode))
1852 err = ext3_journal_dirty_data(handle, bh);
1853 mark_buffer_dirty(bh);
1858 page_cache_release(page);
1863 * Probably it should be a library function... search for first non-zero word
1864 * or memcmp with zero_page, whatever is better for particular architecture.
/*
 * Return non-zero when every __le32 in [p, q) is zero -- used by
 * ext3_find_shared() to decide whether an indirect block's tail is empty.
 * Body is on lines elided from this listing.
 */
1867 static inline int all_zeroes(__le32 *p, __le32 *q)
1876 * ext3_find_shared - find the indirect blocks for partial truncation.
1877 * @inode: inode in question
1878 * @depth: depth of the affected branch
1879 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1880 * @chain: place to store the pointers to partial indirect blocks
1881 * @top: place to the (detached) top of branch
1883 * This is a helper function used by ext3_truncate().
1885 * When we do truncate() we may have to clean the ends of several
1886 * indirect blocks but leave the blocks themselves alive. Block is
1887 * partially truncated if some data below the new i_size is referred
1888 * from it (and it is on the path to the first completely truncated
1889 * data block, indeed). We have to free the top of that path along
1890 * with everything to the right of the path. Since no allocation
1891 * past the truncation point is possible until ext3_truncate()
1892 * finishes, we may safely do the latter, but top of branch may
1893 * require special attention - pageout below the truncation point
1894 * might try to populate it.
1896 * We atomically detach the top of branch from the tree, store the
1897 * block number of its root in *@top, pointers to buffer_heads of
1898 * partially truncated blocks - in @chain[].bh and pointers to
1899 * their last elements that should not be removed - in
1900 * @chain[].p. Return value is the pointer to last filled element
1903 * The work left to caller to do the actual freeing of subtrees:
1904 * a) free the subtree starting from *@top
1905 * b) free the subtrees whose roots are stored in
1906 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1907 * c) free the subtrees growing from the inode past the @chain[0].
1908 * (no partially truncated stuff there). */
/*
 * Locate the partially-truncated indirect blocks for ext3_truncate() --
 * see the kernel-doc comment above for the full contract.  Walks the
 * branch with ext3_get_branch(), backs up past fully-zero indirect tails
 * (all_zeroes), and detaches the doomed top of branch, leaving pointers to
 * the surviving tail elements in chain[].p.
 */
1910 static Indirect *ext3_find_shared(struct inode *inode,
1916 Indirect *partial, *p;
1920 /* Make k index the deepest non-null offset + 1 */
1921 for (k = depth; k > 1 && !offsets[k-1]; k--)
1923 partial = ext3_get_branch(inode, k, offsets, chain, &err);
1924 /* Writer: pointers */
1926 partial = chain + k-1;
1928 * If the branch acquired continuation since we've looked at it -
1929 * fine, it should all survive and (new) top doesn't belong to us.
1931 if (!partial->key && *partial->p)
1934 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1937 * OK, we've found the last block that must survive. The rest of our
1938 * branch should be detached before unlocking. However, if that rest
1939 * of branch is all ours and does not grow immediately from the inode
1940 * it's easier to cheat and just decrement partial->p.
1942 if (p == chain + k - 1 && p > chain) {
1946 /* Nope, don't do this in ext3. Must leave the tree intact */
1955 brelse(partial->bh);
1963 * Zero a number of block pointers in either an inode or an indirect block.
1964 * If we restart the transaction we must again get write access to the
1965 * indirect block for further modification.
1967 * We release `count' blocks on disk, but (last - first) may be greater
1968 * than `count' because there can be holes in there.
/*
 * Release `count' on-disk blocks starting at block_to_free and zero the
 * corresponding pointers in [first, last) (the zeroing loop shares the
 * p-loop below; the *p = 0 statement is on an elided line).  If the handle
 * is low on credits, flush the dirty indirect block and inode, restart the
 * transaction, and retake write access on bh before modifying it further.
 * Each freed block is also ext3_forget()-ed so JBD can revoke any
 * journalled copy before the bitmap bit is cleared.
 */
1971 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1972 unsigned long block_to_free, unsigned long count,
1973 __le32 *first, __le32 *last)
1976 if (try_to_extend_transaction(handle, inode)) {
1978 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1979 ext3_journal_dirty_metadata(handle, bh);
1981 ext3_mark_inode_dirty(handle, inode);
1982 ext3_journal_test_restart(handle, inode);
1984 BUFFER_TRACE(bh, "retaking write access");
1985 ext3_journal_get_write_access(handle, bh);
1990 * Any buffers which are on the journal will be in memory. We find
1991 * them on the hash table so journal_revoke() will run journal_forget()
1992 * on them. We've already detached each block from the file, so
1993 * bforget() in journal_forget() should be safe.
1995 * AKPM: turn on bforget in journal_forget()!!!
1997 for (p = first; p < last; p++) {
1998 u32 nr = le32_to_cpu(*p);
2000 struct buffer_head *bh;
2003 bh = sb_find_get_block(inode->i_sb, nr);
2004 ext3_forget(handle, 0, inode, bh, nr);
2008 ext3_free_blocks(handle, inode, block_to_free, count);
2012 * ext3_free_data - free a list of data blocks
2013 * @handle: handle for this transaction
2014 * @inode: inode we are dealing with
2015 * @this_bh: indirect buffer_head which contains *@first and *@last
2016 * @first: array of block numbers
2017 * @last: points immediately past the end of array
2019 * We are freeing all blocks referred from that array (numbers are stored as
2020 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2022 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2023 * blocks are contiguous then releasing them at one time will only affect one
2024 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2025 * actually use a lot of journal space.
2027 * @this_bh will be %NULL if @first and @last point into the inode's direct
/*
 * Free the data blocks listed in [first, last) -- see the kernel-doc
 * comment above.  Runs of contiguous block numbers are accumulated and
 * released in one ext3_clear_blocks() call to minimize journal traffic.
 * When this_bh is non-NULL it is the indirect block holding the pointers
 * and must be write-accessed first and dirtied at the end; when NULL the
 * pointers live in the inode itself.
 */
2030 static void ext3_free_data(handle_t *handle, struct inode *inode,
2031 struct buffer_head *this_bh,
2032 __le32 *first, __le32 *last)
2034 unsigned long block_to_free = 0; /* Starting block # of a run */
2035 unsigned long count = 0; /* Number of blocks in the run */
2036 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2039 unsigned long nr; /* Current block # */
2040 __le32 *p; /* Pointer into inode/ind
2041 for current block */
2044 if (this_bh) { /* For indirect block */
2045 BUFFER_TRACE(this_bh, "get_write_access");
2046 err = ext3_journal_get_write_access(handle, this_bh);
2047 /* Important: if we can't update the indirect pointers
2048 * to the blocks, we can't free them. */
2053 for (p = first; p < last; p++) {
2054 nr = le32_to_cpu(*p);
2056 /* accumulate blocks to free if they're contiguous */
2059 block_to_free_p = p;
2061 } else if (nr == block_to_free + count) {
	/* Run broken: flush the accumulated run, start a new one at nr */
2064 ext3_clear_blocks(handle, inode, this_bh,
2066 count, block_to_free_p, p);
2068 block_to_free_p = p;
2075 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
2076 count, block_to_free_p, p);
2079 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
2080 ext3_journal_dirty_metadata(handle, this_bh);
2085 * ext3_free_branches - free an array of branches
2086 * @handle: JBD handle for this transaction
2087 * @inode: inode we are dealing with
2088 * @parent_bh: the buffer_head which contains *@first and *@last
2089 * @first: array of block numbers
2090 * @last: pointer immediately past the end of array
2091 * @depth: depth of the branches to free
2093 * We are freeing all blocks referred from these branches (numbers are
2094 * stored as little-endian 32-bit) and updating @inode->i_blocks
/*
 * Recursively free indirect branches -- see the kernel-doc comment above.
 * depth > 0: for each non-hole pointer, read the child indirect block,
 * recurse into it bottom-up, revoke it via ext3_forget(), free it, and
 * zero the parent pointer (under fresh credits, restarting the transaction
 * if needed).  depth == 0 (the else arm, whose `} else {' line is elided):
 * the pointers are plain data blocks, handed to ext3_free_data().
 */
2097 static void ext3_free_branches(handle_t *handle, struct inode *inode,
2098 struct buffer_head *parent_bh,
2099 __le32 *first, __le32 *last, int depth)
2104 if (is_handle_aborted(handle))
2108 struct buffer_head *bh;
2109 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2111 while (--p >= first) {
2112 nr = le32_to_cpu(*p);
2114 continue; /* A hole */
2116 /* Go read the buffer for the next level down */
2117 bh = sb_bread(inode->i_sb, nr);
2120 * A read failure? Report error and clear slot
2124 ext3_error(inode->i_sb, "ext3_free_branches",
2125 "Read failure, inode=%ld, block=%ld",
2130 /* This zaps the entire block. Bottom up. */
2131 BUFFER_TRACE(bh, "free child branches");
2132 ext3_free_branches(handle, inode, bh,
2133 (__le32*)bh->b_data,
2134 (__le32*)bh->b_data + addr_per_block,
2138 * We've probably journalled the indirect block several
2139 * times during the truncate. But it's no longer
2140 * needed and we now drop it from the transaction via
2143 * That's easy if it's exclusively part of this
2144 * transaction. But if it's part of the committing
2145 * transaction then journal_forget() will simply
2146 * brelse() it. That means that if the underlying
2147 * block is reallocated in ext3_get_block(),
2148 * unmap_underlying_metadata() will find this block
2149 * and will try to get rid of it. damn, damn.
2151 * If this block has already been committed to the
2152 * journal, a revoke record will be written. And
2153 * revoke records must be emitted *before* clearing
2154 * this block's bit in the bitmaps.
2156 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2159 * Everything below this pointer has been
2160 * released. Now let this top-of-subtree go.
2162 * We want the freeing of this indirect block to be
2163 * atomic in the journal with the updating of the
2164 * bitmap block which owns it. So make some room in
2167 * We zero the parent pointer *after* freeing its
2168 * pointee in the bitmaps, so if extend_transaction()
2169 * for some reason fails to put the bitmap changes and
2170 * the release into the same transaction, recovery
2171 * will merely complain about releasing a free block,
2172 * rather than leaking blocks.
2174 if (is_handle_aborted(handle))
2176 if (try_to_extend_transaction(handle, inode)) {
2177 ext3_mark_inode_dirty(handle, inode);
2178 ext3_journal_test_restart(handle, inode);
2181 ext3_free_blocks(handle, inode, nr, 1);
2185 * The block which we have just freed is
2186 * pointed to by an indirect block: journal it
2188 BUFFER_TRACE(parent_bh, "get_write_access");
2189 if (!ext3_journal_get_write_access(handle,
2192 BUFFER_TRACE(parent_bh,
2193 "call ext3_journal_dirty_metadata");
2194 ext3_journal_dirty_metadata(handle,
2200 /* We have reached the bottom of the tree. */
2201 BUFFER_TRACE(parent_bh, "free data blocks");
2202 ext3_free_data(handle, inode, parent_bh, first, last);
2209 * We block out ext3_get_block() block instantiations across the entire
2210 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2211 * simultaneously on behalf of the same inode.
2213 * As we work through the truncate and commit bits of it to the journal there
2214 * is one core, guiding principle: the file's tree must always be consistent on
2215 * disk. We must be able to restart the truncate after a crash.
2217 * The file's tree may be transiently inconsistent in memory (although it
2218 * probably isn't), but whenever we close off and commit a journal transaction,
2219 * the contents of (the filesystem + the journal) must be consistent and
2220 * restartable. It's pretty simple, really: bottom up, right to left (although
2221 * left-to-right works OK too).
2223 * Note that at recovery time, journal replay occurs *before* the restart of
2224 * truncate against the orphan inode list.
2226 * The committed inode has the new, desired i_size (which is the same as
2227 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2228 * that this inode's truncate did not complete and it will again call
2229 * ext3_truncate() to have another go. So there will be instantiated blocks
2230 * to the right of the truncation point in a crashed ext3 filesystem. But
2231 * that's fine - as long as they are linked from the inode, the post-crash
2232 * ext3_truncate() run will find them and release them.
/*
 * Truncate the inode to i_size -- see the long design comment above.
 * Outline: guard against non-truncatable inodes (non-reg/dir/symlink,
 * fast symlinks, append-only/immutable); lock the partial EOF page (if
 * any) before journal_start to respect lock ordering; zero the tail of
 * the last block; put the inode on the orphan list so an interrupted
 * multi-transaction truncate is resumed at recovery; propagate i_size to
 * i_disksize; then, under truncate_mutex, free the direct blocks or use
 * ext3_find_shared()/ext3_free_branches() to free the partial branch and
 * all whole subtrees to its right.  The switch intentionally falls
 * through IND -> DIND -> TIND to kill every remaining subtree.
 */
2235 void ext3_truncate(struct inode * inode)
2238 struct ext3_inode_info *ei = EXT3_I(inode);
2239 __le32 *i_data = ei->i_data;
2240 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2241 struct address_space *mapping = inode->i_mapping;
2248 unsigned blocksize = inode->i_sb->s_blocksize;
2251 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2252 S_ISLNK(inode->i_mode)))
2254 if (ext3_inode_is_fast_symlink(inode))
2256 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2260 * We have to lock the EOF page here, because lock_page() nests
2261 * outside journal_start().
2263 if ((inode->i_size & (blocksize - 1)) == 0) {
2264 /* Block boundary? Nothing to do */
2267 page = grab_cache_page(mapping,
2268 inode->i_size >> PAGE_CACHE_SHIFT);
2273 handle = start_transaction(inode);
2274 if (IS_ERR(handle)) {
2276 clear_highpage(page);
2277 flush_dcache_page(page);
2279 page_cache_release(page);
2281 return; /* AKPM: return what? */
2284 last_block = (inode->i_size + blocksize-1)
2285 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2288 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2290 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2292 goto out_stop; /* error */
2295 * OK. This truncate is going to happen. We add the inode to the
2296 * orphan list, so that if this truncate spans multiple transactions,
2297 * and we crash, we will resume the truncate when the filesystem
2298 * recovers. It also marks the inode dirty, to catch the new size.
2300 * Implication: the file must always be in a sane, consistent
2301 * truncatable state while each transaction commits.
2303 if (ext3_orphan_add(handle, inode))
2307 * The orphan list entry will now protect us from any crash which
2308 * occurs before the truncate completes, so it is now safe to propagate
2309 * the new, shorter inode size (held for now in i_size) into the
2310 * on-disk inode. We do this via i_disksize, which is the value which
2311 * ext3 *really* writes onto the disk inode.
2313 ei->i_disksize = inode->i_size;
2316 * From here we block out all ext3_get_block() callers who want to
2317 * modify the block allocation tree.
2319 mutex_lock(&ei->truncate_mutex);
2321 if (n == 1) { /* direct blocks */
2322 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2323 i_data + EXT3_NDIR_BLOCKS);
2327 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2328 /* Kill the top of shared branch (not detached) */
2330 if (partial == chain) {
2331 /* Shared branch grows from the inode */
2332 ext3_free_branches(handle, inode, NULL,
2333 &nr, &nr+1, (chain+n-1) - partial);
2336 * We mark the inode dirty prior to restart,
2337 * and prior to stop. No need for it here.
2340 /* Shared branch grows from an indirect block */
2341 BUFFER_TRACE(partial->bh, "get_write_access");
2342 ext3_free_branches(handle, inode, partial->bh,
2344 partial->p+1, (chain+n-1) - partial);
2347 /* Clear the ends of indirect blocks on the shared branch */
2348 while (partial > chain) {
2349 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2350 (__le32*)partial->bh->b_data+addr_per_block,
2351 (chain+n-1) - partial);
2352 BUFFER_TRACE(partial->bh, "call brelse");
2353 brelse (partial->bh);
2357 /* Kill the remaining (whole) subtrees */
2358 switch (offsets[0]) {
2360 nr = i_data[EXT3_IND_BLOCK];
2362 ext3_free_branches(handle, inode, NULL,
2364 i_data[EXT3_IND_BLOCK] = 0;
2366 case EXT3_IND_BLOCK:
2367 nr = i_data[EXT3_DIND_BLOCK];
2369 ext3_free_branches(handle, inode, NULL,
2371 i_data[EXT3_DIND_BLOCK] = 0;
2373 case EXT3_DIND_BLOCK:
2374 nr = i_data[EXT3_TIND_BLOCK];
2376 ext3_free_branches(handle, inode, NULL,
2378 i_data[EXT3_TIND_BLOCK] = 0;
2380 case EXT3_TIND_BLOCK:
2384 ext3_discard_reservation(inode);
2386 mutex_unlock(&ei->truncate_mutex);
2387 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2388 ext3_mark_inode_dirty(handle, inode);
2390 /* In a multi-transaction truncate, we only make the final
2391 * transaction synchronous */
2396 * If this was a simple ftruncate(), and the file will remain alive
2397 * then we need to clear up the orphan record which we created above.
2398 * However, if this was a real unlink then we were called by
2399 * ext3_delete_inode(), and we allow that function to clean up the
2400 * orphan info for us.
2403 ext3_orphan_del(handle, inode);
2405 ext3_journal_stop(handle);
/*
 * ext3_get_inode_block:
 * Map inode number @ino to the filesystem block that holds its on-disk
 * struct ext3_inode.  Fills in iloc->block_group and iloc->offset (the
 * byte offset of the inode within that block).  Bad inode numbers and
 * unloaded group descriptors are reported via ext3_error().
 * NOTE(review): several lines of this function (braces, the error-path
 * returns and the final return of the computed block number) are not
 * visible in this chunk — confirm against the full file.
 */
2408 static unsigned long ext3_get_inode_block(struct super_block *sb,
2409 unsigned long ino, struct ext3_iloc *iloc)
2411 unsigned long desc, group_desc, block_group;
2412 unsigned long offset, block;
2413 struct buffer_head *bh;
2414 struct ext3_group_desc * gdp;
/* Only the listed reserved inodes may legitimately lie below
 * EXT3_FIRST_INO; everything else in that range (or beyond
 * s_inodes_count) is corruption. */
2417 if ((ino != EXT3_ROOT_INO &&
2418 ino != EXT3_JOURNAL_INO &&
2419 ino != EXT3_RESIZE_INO &&
2420 ino < EXT3_FIRST_INO(sb)) ||
2422 EXT3_SB(sb)->s_es->s_inodes_count)) {
2423 ext3_error (sb, "ext3_get_inode_block",
2424 "bad inode number: %lu", ino);
/* Which block group does this inode live in? (inodes are 1-based) */
2427 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2428 if (block_group >= EXT3_SB(sb)->s_groups_count) {
2429 ext3_error (sb, "ext3_get_inode_block",
2430 "group >= groups count");
/* Split the group number into (descriptor block, index within it). */
2434 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2435 desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2436 bh = EXT3_SB(sb)->s_group_desc[group_desc];
2438 ext3_error (sb, "ext3_get_inode_block",
2439 "Descriptor not loaded");
2443 gdp = (struct ext3_group_desc *) bh->b_data;
2445 * Figure out the offset within the block group inode table
2447 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2448 EXT3_INODE_SIZE(sb);
2449 block = le32_to_cpu(gdp[desc].bg_inode_table) +
2450 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
/* Remember where inside the block the inode starts. */
2452 iloc->block_group = block_group;
2453 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2458 * ext3_get_inode_loc returns with an extra refcount against the inode's
2459 * underlying buffer_head on success. If 'in_mem' is true, we have all
2460 * data in memory that is needed to recreate the on-disk version of this inode.
/*
 * __ext3_get_inode_loc:
 * Locate and read the buffer that contains @inode's on-disk inode,
 * leaving an extra reference on the buffer_head in iloc for the caller
 * to release.  If @in_mem is set and the inode bitmap shows that every
 * other inode sharing this buffer is free, the disk read is skipped and
 * the buffer is zero-filled instead.
 * NOTE(review): wait_on_buffer()/lock_buffer() calls, goto labels and
 * the return statements are elided in this chunk — confirm against the
 * full file before relying on the exact control flow.
 */
2463 static int __ext3_get_inode_loc(struct inode *inode,
2464 struct ext3_iloc *iloc, int in_mem)
2466 unsigned long block;
2467 struct buffer_head *bh;
2469 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
/* sb_getblk() only attaches a buffer; it does not read it. */
2473 bh = sb_getblk(inode->i_sb, block);
2475 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2476 "unable to read inode block - "
2477 "inode=%lu, block=%lu", inode->i_ino, block);
2480 if (!buffer_uptodate(bh)) {
2482 if (buffer_uptodate(bh)) {
2483 /* someone brought it uptodate while we waited */
2489 * If we have all information of the inode in memory and this
2490 * is the only valid inode in the block, we need not read the
2494 struct buffer_head *bitmap_bh;
2495 struct ext3_group_desc *desc;
2496 int inodes_per_buffer;
2497 int inode_offset, i;
/* Work out which slice of the group's inode table this
 * buffer covers: [start, start + inodes_per_buffer). */
2501 block_group = (inode->i_ino - 1) /
2502 EXT3_INODES_PER_GROUP(inode->i_sb);
2503 inodes_per_buffer = bh->b_size /
2504 EXT3_INODE_SIZE(inode->i_sb);
2505 inode_offset = ((inode->i_ino - 1) %
2506 EXT3_INODES_PER_GROUP(inode->i_sb));
2507 start = inode_offset & ~(inodes_per_buffer - 1);
2509 /* Is the inode bitmap in cache? */
2510 desc = ext3_get_group_desc(inode->i_sb,
2515 bitmap_bh = sb_getblk(inode->i_sb,
2516 le32_to_cpu(desc->bg_inode_bitmap));
2521 * If the inode bitmap isn't in cache then the
2522 * optimisation may end up performing two reads instead
2523 * of one, so skip it.
2525 if (!buffer_uptodate(bitmap_bh)) {
/* Scan the bitmap: bail out if any *other* inode that shares
 * this buffer is still in use. */
2529 for (i = start; i < start + inodes_per_buffer; i++) {
2530 if (i == inode_offset)
2532 if (ext3_test_bit(i, bitmap_bh->b_data))
2536 if (i == start + inodes_per_buffer) {
2537 /* all other inodes are free, so skip I/O */
2538 memset(bh->b_data, 0, bh->b_size)
2539 set_buffer_uptodate(bh);
2547 * There are other valid inodes in the buffer, this inode
2548 * has in-inode xattrs, or we don't have this inode in memory.
2549 * Read the block from disk.
2552 bh->b_end_io = end_buffer_read_sync;
2553 submit_bh(READ, bh);
2555 if (!buffer_uptodate(bh)) {
2556 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2557 "unable to read inode block - "
2558 "inode=%lu, block=%lu",
2559 inode->i_ino, block);
/*
 * ext3_get_inode_loc:
 * Convenience wrapper around __ext3_get_inode_loc().  We may only claim
 * "in_mem" (all on-disk inode data reconstructible from memory) when the
 * inode has no extended attributes stored in its extra space, i.e. when
 * EXT3_STATE_XATTR is clear.
 */
2569 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2571 /* We have all inode data except xattrs in memory here. */
2572 return __ext3_get_inode_loc(inode, iloc,
2573 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
/*
 * ext3_set_inode_flags:
 * Propagate the ext3 on-disk inode flags (EXT3_*_FL) into the generic
 * VFS flag bits (S_*) in inode->i_flags.  The handled bits are cleared
 * first so that flags removed on disk do not linger in the VFS inode.
 */
2576 void ext3_set_inode_flags(struct inode *inode)
2578 unsigned int flags = EXT3_I(inode)->i_flags;
2580 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2581 if (flags & EXT3_SYNC_FL)
2582 inode->i_flags |= S_SYNC;
2583 if (flags & EXT3_APPEND_FL)
2584 inode->i_flags |= S_APPEND;
2585 if (flags & EXT3_IMMUTABLE_FL)
2586 inode->i_flags |= S_IMMUTABLE;
2587 if (flags & EXT3_NOATIME_FL)
2588 inode->i_flags |= S_NOATIME;
2589 if (flags & EXT3_DIRSYNC_FL)
2590 inode->i_flags |= S_DIRSYNC;
/*
 * ext3_read_inode:
 * Populate the in-core inode (and its ext3_inode_info) from the on-disk
 * struct ext3_inode, then wire up the inode/file/address-space
 * operations according to the file type.  The (partially elided) error
 * path marks the inode bad via make_bad_inode().
 * NOTE(review): brelse(), goto labels, `#endif` lines and several
 * else-branches are elided in this chunk — the exact control flow must
 * be confirmed against the full file.
 */
2593 void ext3_read_inode(struct inode * inode)
2595 struct ext3_iloc iloc;
2596 struct ext3_inode *raw_inode;
2597 struct ext3_inode_info *ei = EXT3_I(inode);
2598 struct buffer_head *bh;
2601 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2602 ei->i_acl = EXT3_ACL_NOT_CACHED;
2603 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2605 ei->i_block_alloc_info = NULL;
2607 if (__ext3_get_inode_loc(inode, &iloc, 0))
2610 raw_inode = ext3_raw_inode(&iloc);
2611 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
/* Low 16 uid/gid bits always; high 16 only without NO_UID32. */
2612 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2613 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2614 if(!(test_opt (inode->i_sb, NO_UID32))) {
2615 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2616 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2618 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2619 inode->i_size = le32_to_cpu(raw_inode->i_size);
/* On-disk timestamps are whole seconds; zero the nanosecond parts. */
2620 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2621 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2622 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2623 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2626 ei->i_dir_start_lookup = 0;
2627 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2628 /* We now have enough fields to check if the inode was active or not.
2629 * This is needed because nfsd might try to access dead inodes
2630 * the test is that same one that e2fsck uses
2631 * NeilBrown 1999oct15
2633 if (inode->i_nlink == 0) {
2634 if (inode->i_mode == 0 ||
2635 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2636 /* this inode is deleted */
2640 /* The only unlinked inodes we let through here have
2641 * valid i_mode and are being read by the orphan
2642 * recovery code: that's fine, we're about to complete
2643 * the process of deleting those. */
2645 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
2646 * (for stat), not the fs block
2648 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2649 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2650 #ifdef EXT3_FRAGMENTS
2651 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2652 ei->i_frag_no = raw_inode->i_frag;
2653 ei->i_frag_size = raw_inode->i_fsize;
2655 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
/* i_dir_acl and i_size_high share on-disk storage: directory ACL for
 * non-regular files, the high 32 bits of the size for regular files. */
2656 if (!S_ISREG(inode->i_mode)) {
2657 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2660 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2662 ei->i_disksize = inode->i_size;
2663 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2664 ei->i_block_group = iloc.block_group;
2666 * NOTE! The in-memory inode i_data array is in little-endian order
2667 * even on big-endian machines: we do NOT byteswap the block numbers!
2669 for (block = 0; block < EXT3_N_BLOCKS; block++)
2670 ei->i_data[block] = raw_inode->i_block[block];
2671 INIT_LIST_HEAD(&ei->i_orphan);
2673 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2674 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2676 * When mke2fs creates big inodes it does not zero out
2677 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2678 * so ignore those first few inodes.
2680 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2681 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2682 EXT3_INODE_SIZE(inode->i_sb))
2684 if (ei->i_extra_isize == 0) {
2685 /* The extra space is currently unused. Use it. */
2686 ei->i_extra_isize = sizeof(struct ext3_inode) -
2687 EXT3_GOOD_OLD_INODE_SIZE;
/* A magic number right after the extra fields marks in-inode
 * extended attributes. */
2689 __le32 *magic = (void *)raw_inode +
2690 EXT3_GOOD_OLD_INODE_SIZE +
2692 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2693 ei->i_state |= EXT3_STATE_XATTR;
2696 ei->i_extra_isize = 0;
/* Select operations vectors by file type. */
2698 if (S_ISREG(inode->i_mode)) {
2699 inode->i_op = &ext3_file_inode_operations;
2700 inode->i_fop = &ext3_file_operations;
2701 ext3_set_aops(inode);
2702 } else if (S_ISDIR(inode->i_mode)) {
2703 inode->i_op = &ext3_dir_inode_operations;
2704 inode->i_fop = &ext3_dir_operations;
2705 } else if (S_ISLNK(inode->i_mode)) {
2706 if (ext3_inode_is_fast_symlink(inode))
2707 inode->i_op = &ext3_fast_symlink_inode_operations;
2709 inode->i_op = &ext3_symlink_inode_operations;
2710 ext3_set_aops(inode);
2713 inode->i_op = &ext3_special_inode_operations;
/* Device numbers: old (16-bit) encoding lives in i_block[0],
 * new (32-bit) encoding in i_block[1]. */
2714 if (raw_inode->i_block[0])
2715 init_special_inode(inode, inode->i_mode,
2716 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2718 init_special_inode(inode, inode->i_mode,
2719 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2722 ext3_set_inode_flags(inode);
2726 make_bad_inode(inode);
2731 * Post the struct inode info into an on-disk inode location in the
2732 * buffer-cache. This gobbles the caller's reference to the
2733 * buffer_head in the inode location struct.
2735 * The caller must have write access to iloc->bh.
/* Copy the in-core inode into the on-disk inode buffer and journal it;
 * the caller (see comment above) already has write access to iloc->bh,
 * and this consumes the bh reference.
 * NOTE(review): braces, `goto out_brelse`-style error handling, the
 * `#endif` and the final return are elided in this chunk. */
2737 static int ext3_do_update_inode(handle_t *handle,
2738 struct inode *inode,
2739 struct ext3_iloc *iloc)
2741 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2742 struct ext3_inode_info *ei = EXT3_I(inode);
2743 struct buffer_head *bh = iloc->bh;
2744 int err = 0, rc, block;
2746 /* For fields not tracked in the in-memory inode,
2747 * initialise them to zero for new inodes. */
2748 if (ei->i_state & EXT3_STATE_NEW)
2749 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2751 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
/* With 32-bit uids split them into low/high halves; with NO_UID32
 * squash to the 16-bit legacy representation. */
2752 if(!(test_opt(inode->i_sb, NO_UID32))) {
2753 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2754 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2756 * Fix up interoperability with old kernels. Otherwise, old inodes get
2757 * re-used with the upper 16 bits of the uid/gid intact
2760 raw_inode->i_uid_high =
2761 cpu_to_le16(high_16_bits(inode->i_uid));
2762 raw_inode->i_gid_high =
2763 cpu_to_le16(high_16_bits(inode->i_gid));
2765 raw_inode->i_uid_high = 0;
2766 raw_inode->i_gid_high = 0;
2769 raw_inode->i_uid_low =
2770 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2771 raw_inode->i_gid_low =
2772 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2773 raw_inode->i_uid_high = 0;
2774 raw_inode->i_gid_high = 0;
2776 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
/* i_disksize, not i_size, is what ext3 really writes to disk. */
2777 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2778 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2779 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2780 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2781 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2782 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2783 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2784 #ifdef EXT3_FRAGMENTS
2785 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2786 raw_inode->i_frag = ei->i_frag_no;
2787 raw_inode->i_fsize = ei->i_frag_size;
2789 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
/* i_dir_acl / i_size_high share on-disk storage (see read side). */
2790 if (!S_ISREG(inode->i_mode)) {
2791 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2793 raw_inode->i_size_high =
2794 cpu_to_le32(ei->i_disksize >> 32);
2795 if (ei->i_disksize > 0x7fffffffULL) {
2796 struct super_block *sb = inode->i_sb;
2797 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2798 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2799 EXT3_SB(sb)->s_es->s_rev_level ==
2800 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2801 /* If this is the first large file
2802 * created, add a flag to the superblock.
2804 err = ext3_journal_get_write_access(handle,
2805 EXT3_SB(sb)->s_sbh);
2808 ext3_update_dynamic_rev(sb);
2809 EXT3_SET_RO_COMPAT_FEATURE(sb,
2810 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2813 err = ext3_journal_dirty_metadata(handle,
2814 EXT3_SB(sb)->s_sbh);
2818 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
/* Device nodes: old 16-bit encoding in i_block[0], new 32-bit
 * encoding in i_block[1]; everything else stores the block map. */
2819 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2820 if (old_valid_dev(inode->i_rdev)) {
2821 raw_inode->i_block[0] =
2822 cpu_to_le32(old_encode_dev(inode->i_rdev));
2823 raw_inode->i_block[1] = 0;
2825 raw_inode->i_block[0] = 0;
2826 raw_inode->i_block[1] =
2827 cpu_to_le32(new_encode_dev(inode->i_rdev));
2828 raw_inode->i_block[2] = 0;
2830 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2831 raw_inode->i_block[block] = ei->i_data[block];
2833 if (ei->i_extra_isize)
2834 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2836 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2837 rc = ext3_journal_dirty_metadata(handle, bh);
2840 ei->i_state &= ~EXT3_STATE_NEW;
2844 ext3_std_error(inode->i_sb, err);
2849 * ext3_write_inode()
2851 * We are called from a few places:
2853 * - Within generic_file_write() for O_SYNC files.
2854 * Here, there will be no transaction running. We wait for any running
2855 * transaction to commit.
2857 * - Within sys_sync(), kupdate and such.
2858 * We wait on commit, if told to.
2860 * - Within prune_icache() (PF_MEMALLOC == true)
2861 * Here we simply return. We can't afford to block kswapd on the
2864 * In all cases it is actually safe for us to return without doing anything,
2865 * because the inode has been copied into a raw inode buffer in
2866 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2869 * Note that we are absolutely dependent upon all inode dirtiers doing the
2870 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2871 * which we are interested.
2873 * It would be a bug for them to not do this. The code:
2875 * mark_inode_dirty(inode)
2877 * inode->i_size = expr;
2879 * is in error because a kswapd-driven write_inode() could occur while
2880 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2881 * will no longer be on the superblock's dirty inode list.
/*
 * ext3_write_inode: see the long comment above.  We must not block when
 * called under memory pressure (PF_MEMALLOC) or recursively from inside
 * a running journal handle; otherwise force a commit to push the inode
 * to disk.
 * NOTE(review): the early-return statements and the !wait fast path are
 * elided in this chunk.
 */
2883 int ext3_write_inode(struct inode *inode, int wait)
2885 if (current->flags & PF_MEMALLOC)
2888 if (ext3_journal_current_handle()) {
2889 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2897 return ext3_force_commit(inode->i_sb);
2903 * Called from notify_change.
2905 * We want to trap VFS attempts to truncate the file as soon as
2906 * possible. In particular, we want to make sure that when the VFS
2907 * shrinks i_size, we put the inode on the orphan list and modify
2908 * i_disksize immediately, so that during the subsequent flushing of
2909 * dirty pages and freeing of disk blocks, we can guarantee that any
2910 * commit will leave the blocks being flushed in an unused state on
2911 * disk. (On recovery, the inode will get truncated and the blocks will
2912 * be freed, so we have a strong guarantee that no future commit will
2913 * leave these blocks visible to the user.)
2915 * Called with inode->sem down.
/*
 * ext3_setattr: see the comment above.  Transfers quota inside its own
 * transaction on uid/gid change, and orphan-protects the inode before a
 * size-shrinking truncate so crash recovery can finish the truncate.
 * NOTE(review): the err_out label, some error propagation and closing
 * braces are elided in this chunk.
 */
2917 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2919 struct inode *inode = dentry->d_inode;
2921 const unsigned int ia_valid = attr->ia_valid;
2923 error = inode_change_ok(inode, attr);
/* Ownership change: quota must move between users/groups inside one
 * transaction together with the inode update. */
2927 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2928 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2931 /* (user+group)*(old+new) structure, inode write (sb,
2932 * inode block, ? - but truncate inode update has it) */
2933 handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
2934 EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2935 if (IS_ERR(handle)) {
2936 error = PTR_ERR(handle);
2939 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2941 ext3_journal_stop(handle);
2944 /* Update corresponding info in inode so that everything is in
2945 * one transaction */
2946 if (attr->ia_valid & ATTR_UID)
2947 inode->i_uid = attr->ia_uid;
2948 if (attr->ia_valid & ATTR_GID)
2949 inode->i_gid = attr->ia_gid;
2950 error = ext3_mark_inode_dirty(handle, inode);
2951 ext3_journal_stop(handle);
/* Shrinking a regular file: put it on the orphan list and push the
 * smaller i_disksize before any blocks are freed. */
2954 if (S_ISREG(inode->i_mode) &&
2955 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2958 handle = ext3_journal_start(inode, 3);
2959 if (IS_ERR(handle)) {
2960 error = PTR_ERR(handle);
2964 error = ext3_orphan_add(handle, inode);
2965 EXT3_I(inode)->i_disksize = attr->ia_size;
2966 rc = ext3_mark_inode_dirty(handle, inode);
2969 ext3_journal_stop(handle);
2972 rc = inode_setattr(inode, attr);
2974 /* If inode_setattr's call to ext3_truncate failed to get a
2975 * transaction handle at all, we need to clean up the in-core
2976 * orphan list manually. */
2978 ext3_orphan_del(NULL, inode);
2980 if (!rc && (ia_valid & ATTR_MODE))
2981 rc = ext3_acl_chmod(inode);
2984 ext3_std_error(inode->i_sb, error);
2992 * akpm: how many blocks doth make a writepage()?
2994 * With N blocks per page, it may be:
2999 * N+5 bitmap blocks (from the above)
3000 * N+5 group descriptor summary blocks
3003 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3005 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3007 * With ordered or writeback data it's the same, less the N data blocks.
3009 * If the inode's direct blocks can hold an integral number of pages then a
3010 * page cannot straddle two indirect blocks, and we can only touch one indirect
3011 * and dindirect block, and the "5" above becomes "3".
3013 * This still overestimates under most circumstances. If we were to pass the
3014 * start and end offsets in here as well we could do block_to_path() on each
3015 * block and work out the exact number of indirects which are touched. Pah.
/*
 * ext3_writepage_trans_blocks:
 * Conservative estimate of the journal credits needed to write one page
 * (see the derivation in the comment above): the data blocks count only
 * in data=journal mode, plus indirect/bitmap/descriptor overhead and,
 * with quota enabled, quota-file updates.
 * NOTE(review): the else keyword, the return statement and the quota
 * #ifdef lines are elided in this chunk.
 */
3018 static int ext3_writepage_trans_blocks(struct inode *inode)
3020 int bpp = ext3_journal_blocks_per_page(inode);
/* "5" when a page may straddle two indirect blocks, "3" when the
 * direct blocks hold an integral number of pages (see comment above). */
3021 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3024 if (ext3_should_journal_data(inode))
3025 ret = 3 * (bpp + indirects) + 2;
3027 ret = 2 * (bpp + indirects) + 2;
3030 /* We know that structure was already allocated during DQUOT_INIT so
3031 * we will be updating only the data blocks + inodes */
3032 ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
3039 * The caller must have previously called ext3_reserve_inode_write().
3040 * Given this, we know that the caller already has write access to iloc->bh.
/*
 * ext3_mark_iloc_dirty:
 * Copy the in-core inode into the journalled inode buffer via
 * ext3_do_update_inode(); the bh reference held in @iloc is consumed.
 * NOTE(review): braces and the return statement are elided in this
 * chunk.
 */
3042 int ext3_mark_iloc_dirty(handle_t *handle,
3043 struct inode *inode, struct ext3_iloc *iloc)
3047 /* the do_update_inode consumes one bh->b_count */
3050 /* ext3_do_update_inode() does journal_dirty_metadata */
3051 err = ext3_do_update_inode(handle, inode, iloc);
3057 * On success, We end up with an outstanding reference count against
3058 * iloc->bh. This _must_ be cleaned up later.
/*
 * ext3_reserve_inode_write: see the comment above.  Look up the inode's
 * buffer and obtain journal write access to it; on success the caller
 * owns a reference on iloc->bh that must later be released (normally by
 * ext3_mark_iloc_dirty()).
 * NOTE(review): the return type line, braces, error cleanup and return
 * are elided in this chunk.
 */
3062 ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3063 struct ext3_iloc *iloc)
3067 err = ext3_get_inode_loc(inode, iloc);
3069 BUFFER_TRACE(iloc->bh, "get_write_access");
3070 err = ext3_journal_get_write_access(handle, iloc->bh);
3077 ext3_std_error(inode->i_sb, err);
3082 * akpm: What we do here is to mark the in-core inode as clean
3083 * with respect to inode dirtiness (it may still be data-dirty).
3084 * This means that the in-core inode may be reaped by prune_icache
3085 * without having to perform any I/O. This is a very good thing,
3086 * because *any* task may call prune_icache - even ones which
3087 * have a transaction open against a different journal.
3089 * Is this cheating? Not really. Sure, we haven't written the
3090 * inode out, but prune_icache isn't a user-visible syncing function.
3091 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3092 * we start and wait on commits.
3094 * Is this efficient/effective? Well, we're being nice to the system
3095 * by cleaning up our inodes proactively so they can be reaped
3096 * without I/O. But we are potentially leaving up to five seconds'
3097 * worth of inodes floating about which prune_icache wants us to
3098 * write out. One way to fix that would be to get prune_icache()
3099 * to do a write_super() to free up some memory. It has the desired
/*
 * ext3_mark_inode_dirty: see the long comment above.  Reserve write
 * access to the inode's buffer, then copy the in-core inode into it
 * within the current transaction so the in-core inode becomes clean.
 * NOTE(review): the error check between the two calls and the return
 * are elided in this chunk.
 */
3102 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3104 struct ext3_iloc iloc;
3108 err = ext3_reserve_inode_write(handle, inode, &iloc);
3110 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3115 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3117 * We're really interested in the case where a file is being extended.
3118 * i_size has been changed by generic_commit_write() and we thus need
3119 * to include the updated inode in the current transaction.
3121 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3122 * are allocated to the file.
3124 * If the inode is marked synchronous, we don't honour that here - doing
3125 * so would cause a commit on atime updates, which we don't bother doing.
3126 * We handle synchronous inodes at the highest possible level.
/*
 * ext3_dirty_inode: see the comment above.  Starts a small private
 * handle (2 credits) so a dirtied inode is captured by the journal even
 * when the caller holds no handle, and screams if an unrelated handle
 * from a different journal is active on this task.
 * NOTE(review): the out label, braces and the closing of the printk
 * arguments are elided in this chunk.
 */
3128 void ext3_dirty_inode(struct inode *inode)
3130 handle_t *current_handle = ext3_journal_current_handle();
3133 handle = ext3_journal_start(inode, 2);
/* A handle open against a different journal is a caller bug. */
3136 if (current_handle &&
3137 current_handle->h_transaction != handle->h_transaction) {
3138 /* This task has a transaction open against a different fs */
3139 printk(KERN_EMERG "%s: transactions do not match!\n",
3142 jbd_debug(5, "marking dirty. outer handle=%p\n",
3144 ext3_mark_inode_dirty(handle, inode);
3146 ext3_journal_stop(handle);
3153 * Bind an inode's backing buffer_head into this transaction, to prevent
3154 * it from being flushed to disk early. Unlike
3155 * ext3_reserve_inode_write, this leaves behind no bh reference and
3156 * returns no iloc structure, so the caller needs to repeat the iloc
3157 * lookup to mark the inode dirty later.
/*
 * ext3_pin_inode: see the comment above — binds the inode's backing
 * buffer into this transaction without leaving a bh reference or an
 * iloc for the caller.
 * NOTE(review): the return type line, braces, the second argument of
 * the dirty_metadata call and the return are elided in this chunk.
 */
3160 ext3_pin_inode(handle_t *handle, struct inode *inode)
3162 struct ext3_iloc iloc;
3166 err = ext3_get_inode_loc(inode, &iloc);
3168 BUFFER_TRACE(iloc.bh, "get_write_access");
3169 err = journal_get_write_access(handle, iloc.bh);
3171 err = ext3_journal_dirty_metadata(handle,
3176 ext3_std_error(inode->i_sb, err);
3181 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3188 * We have to be very careful here: changing a data block's
3189 * journaling status dynamically is dangerous. If we write a
3190 * data block to the journal, change the status and then delete
3191 * that block, we risk forgetting to revoke the old log record
3192 * from the journal and so a subsequent replay can corrupt data.
3193 * So, first we make sure that the journal is empty and that
3194 * nobody is changing anything.
3197 journal = EXT3_JOURNAL(inode);
3198 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3201 journal_lock_updates(journal);
3202 journal_flush(journal);
3205 * OK, there are no updates running now, and all cached data is
3206 * synced to disk. We are now in a completely consistent state
3207 * which doesn't have anything in the journal, and we know that
3208 * no filesystem updates are running, so it is safe to modify
3209 * the inode's in-core data-journaling state flag now.
3213 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3215 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3216 ext3_set_aops(inode);
3218 journal_unlock_updates(journal);
3220 /* Finally we can mark the inode as dirty. */
3222 handle = ext3_journal_start(inode, 1);
3224 return PTR_ERR(handle);
3226 err = ext3_mark_inode_dirty(handle, inode);
3228 ext3_journal_stop(handle);
3229 ext3_std_error(inode->i_sb, err);