0cd126176bbbcf942bb91c4f8e3f1d5bbc554868
[safe/jmp/linux-2.6] / fs / ext3 / inode.c
1 /*
2  *  linux/fs/ext3/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
39 #include "xattr.h"
40 #include "acl.h"
41
42 static int ext3_writepage_trans_blocks(struct inode *inode);
43
44 /*
45  * Test whether an inode is a fast symlink.
46  */
47 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
48 {
49         int ea_blocks = EXT3_I(inode)->i_file_acl ?
50                 (inode->i_sb->s_blocksize >> 9) : 0;
51
52         return (S_ISLNK(inode->i_mode) &&
53                 inode->i_blocks - ea_blocks == 0);
54 }
55
56 /* The ext3 forget function must perform a revoke if we are freeing data
57  * which has been journaled.  Metadata (eg. indirect blocks) must be
58  * revoked in all cases. 
59  *
60  * "bh" may be NULL: a metadata block may have been freed from memory
61  * but there may still be a record of it in the journal, and that record
62  * still needs to be revoked.
63  */
64
65 int ext3_forget(handle_t *handle, int is_metadata,
66                        struct inode *inode, struct buffer_head *bh,
67                        int blocknr)
68 {
69         int err;
70
71         might_sleep();
72
73         BUFFER_TRACE(bh, "enter");
74
75         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
76                   "data mode %lx\n",
77                   bh, is_metadata, inode->i_mode,
78                   test_opt(inode->i_sb, DATA_FLAGS));
79
80         /* Never use the revoke function if we are doing full data
81          * journaling: there is no need to, and a V1 superblock won't
82          * support it.  Otherwise, only skip the revoke on un-journaled
83          * data blocks. */
84
85         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
86             (!is_metadata && !ext3_should_journal_data(inode))) {
87                 if (bh) {
88                         BUFFER_TRACE(bh, "call journal_forget");
89                         return ext3_journal_forget(handle, bh);
90                 }
91                 return 0;
92         }
93
94         /*
95          * data!=journal && (is_metadata || should_journal_data(inode))
96          */
97         BUFFER_TRACE(bh, "call ext3_journal_revoke");
98         err = ext3_journal_revoke(handle, blocknr, bh);
99         if (err)
100                 ext3_abort(inode->i_sb, __FUNCTION__,
101                            "error %d when attempting revoke", err);
102         BUFFER_TRACE(bh, "exit");
103         return err;
104 }
105
106 /*
107  * Work out how many blocks we need to progress with the next chunk of a
108  * truncate transaction.
109  */
110
111 static unsigned long blocks_for_truncate(struct inode *inode) 
112 {
113         unsigned long needed;
114
115         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
116
117         /* Give ourselves just enough room to cope with inodes in which
118          * i_blocks is corrupt: we've seen disk corruptions in the past
119          * which resulted in random data in an inode which looked enough
120          * like a regular file for ext3 to try to delete it.  Things
121          * will go a bit crazy if that happens, but at least we should
122          * try not to panic the whole kernel. */
123         if (needed < 2)
124                 needed = 2;
125
126         /* But we need to bound the transaction so we don't overflow the
127          * journal. */
128         if (needed > EXT3_MAX_TRANS_DATA) 
129                 needed = EXT3_MAX_TRANS_DATA;
130
131         return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
132 }
133
134 /* 
135  * Truncate transactions can be complex and absolutely huge.  So we need to
136  * be able to restart the transaction at a conventient checkpoint to make
137  * sure we don't overflow the journal.
138  *
139  * start_transaction gets us a new handle for a truncate transaction,
140  * and extend_transaction tries to extend the existing one a bit.  If
141  * extend fails, we need to propagate the failure up and restart the
142  * transaction in the top-level truncate loop. --sct 
143  */
144
145 static handle_t *start_transaction(struct inode *inode) 
146 {
147         handle_t *result;
148
149         result = ext3_journal_start(inode, blocks_for_truncate(inode));
150         if (!IS_ERR(result))
151                 return result;
152
153         ext3_std_error(inode->i_sb, PTR_ERR(result));
154         return result;
155 }
156
157 /*
158  * Try to extend this transaction for the purposes of truncation.
159  *
160  * Returns 0 if we managed to create more room.  If we can't create more
161  * room, and the transaction must be restarted we return 1.
162  */
163 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
164 {
165         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
166                 return 0;
167         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
168                 return 0;
169         return 1;
170 }
171
172 /*
173  * Restart the transaction associated with *handle.  This does a commit,
174  * so before we call here everything must be consistently dirtied against
175  * this transaction.
176  */
177 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
178 {
179         jbd_debug(2, "restarting handle %p\n", handle);
180         return ext3_journal_restart(handle, blocks_for_truncate(inode));
181 }
182
/*
 * Called at the last iput() if i_nlink is zero.
 *
 * Truncates the inode's data, removes it from the on-disk orphan list
 * and frees the on-disk inode, all under a (possibly restarted)
 * truncate transaction.  Always ends with clear_inode() one way or
 * another.
 */
void ext3_delete_inode (struct inode * inode)
{
	handle_t *handle;

	/* Throw away all cached pages before touching disk state */
	truncate_inode_pages(&inode->i_data, 0);

	if (is_bad_inode(inode))
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/* If we're going to skip the normal cleanup, we still
		 * need to make sure that the in-core orphan linked list
		 * is properly cleaned up. */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;	/* force synchronous commit */
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	EXT3_I(inode)->i_dtime	= get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext3_free_inode(handle, inode);
	ext3_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}
237
/*
 * One link in a chain of indirect blocks: records where a block-number
 * slot lives, the value it held when we read it (so a racing truncate
 * can be detected by verify_chain()), and the buffer hosting it.
 */
typedef struct {
	__le32	*p;		/* address of the block-number slot */
	__le32	key;		/* value of *p when we sampled it */
	struct buffer_head *bh;	/* buffer hosting p; NULL when p is in the inode */
} Indirect;
243
244 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
245 {
246         p->key = *(p->p = v);
247         p->bh = bh;
248 }
249
250 static inline int verify_chain(Indirect *from, Indirect *to)
251 {
252         while (from <= to && from->key == *from->p)
253                 from++;
254         return (from > to);
255 }
256
257 /**
258  *      ext3_block_to_path - parse the block number into array of offsets
259  *      @inode: inode in question (we are only interested in its superblock)
260  *      @i_block: block number to be parsed
261  *      @offsets: array to store the offsets in
262  *      @boundary: set this non-zero if the referred-to block is likely to be
263  *             followed (on disk) by an indirect block.
264  *
265  *      To store the locations of file's data ext3 uses a data structure common
266  *      for UNIX filesystems - tree of pointers anchored in the inode, with
267  *      data blocks at leaves and indirect blocks in intermediate nodes.
268  *      This function translates the block number into path in that tree -
269  *      return value is the path length and @offsets[n] is the offset of
270  *      pointer to (n+1)th node in the nth one. If @block is out of range
271  *      (negative or too large) warning is printed and zero returned.
272  *
273  *      Note: function doesn't find node addresses, so no IO is needed. All
274  *      we need to know is the capacity of indirect blocks (taken from the
275  *      inode->i_sb).
276  */
277
278 /*
279  * Portability note: the last comparison (check that we fit into triple
280  * indirect block) is spelled differently, because otherwise on an
281  * architecture with 32-bit longs and 8Kb pages we might get into trouble
282  * if our filesystem had 8Kb blocks. We might use long long, but that would
283  * kill us on x86. Oh, well, at least the sign propagation does not matter -
284  * i_block would have to be negative in the very beginning, so we would not
285  * get there at all.
286  */
287
288 static int ext3_block_to_path(struct inode *inode,
289                         long i_block, int offsets[4], int *boundary)
290 {
291         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
292         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
293         const long direct_blocks = EXT3_NDIR_BLOCKS,
294                 indirect_blocks = ptrs,
295                 double_blocks = (1 << (ptrs_bits * 2));
296         int n = 0;
297         int final = 0;
298
299         if (i_block < 0) {
300                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
301         } else if (i_block < direct_blocks) {
302                 offsets[n++] = i_block;
303                 final = direct_blocks;
304         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
305                 offsets[n++] = EXT3_IND_BLOCK;
306                 offsets[n++] = i_block;
307                 final = ptrs;
308         } else if ((i_block -= indirect_blocks) < double_blocks) {
309                 offsets[n++] = EXT3_DIND_BLOCK;
310                 offsets[n++] = i_block >> ptrs_bits;
311                 offsets[n++] = i_block & (ptrs - 1);
312                 final = ptrs;
313         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
314                 offsets[n++] = EXT3_TIND_BLOCK;
315                 offsets[n++] = i_block >> (ptrs_bits * 2);
316                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
317                 offsets[n++] = i_block & (ptrs - 1);
318                 final = ptrs;
319         } else {
320                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
321         }
322         if (boundary)
323                 *boundary = final - 1 - (i_block & (ptrs - 1));
324         return n;
325 }
326
327 /**
328  *      ext3_get_branch - read the chain of indirect blocks leading to data
329  *      @inode: inode in question
330  *      @depth: depth of the chain (1 - direct pointer, etc.)
331  *      @offsets: offsets of pointers in inode/indirect blocks
332  *      @chain: place to store the result
333  *      @err: here we store the error value
334  *
335  *      Function fills the array of triples <key, p, bh> and returns %NULL
336  *      if everything went OK or the pointer to the last filled triple
337  *      (incomplete one) otherwise. Upon the return chain[i].key contains
338  *      the number of (i+1)-th block in the chain (as it is stored in memory,
339  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
340  *      number (it points into struct inode for i==0 and into the bh->b_data
341  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
342  *      block for i>0 and NULL for i==0. In other words, it holds the block
343  *      numbers of the chain, addresses they were taken from (and where we can
344  *      verify that chain did not change) and buffer_heads hosting these
345  *      numbers.
346  *
347  *      Function stops when it stumbles upon zero pointer (absent block)
348  *              (pointer to last triple returned, *@err == 0)
349  *      or when it gets an IO error reading an indirect block
350  *              (ditto, *@err == -EIO)
351  *      or when it notices that chain had been changed while it was reading
352  *              (ditto, *@err == -EAGAIN)
353  *      or when it reads all @depth-1 indirect blocks successfully and finds
354  *      the whole chain, all way to the data (returns %NULL, *err == 0).
355  */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;	/* hole at the very first level */
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		/* sb_bread() may have slept; re-check that the part of the
		 * chain we already walked was not changed under us. */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;	/* hole deeper in the chain */
	}
	/* Full chain read: all blocks present */
	return NULL;

changed:
	/* Chain mutated while we were reading it; caller should retry. */
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	/* Return the last (incomplete) triple; *err says why we stopped. */
	return p;
}
391
392 /**
393  *      ext3_find_near - find a place for allocation with sufficient locality
394  *      @inode: owner
395  *      @ind: descriptor of indirect block.
396  *
397  *      This function returns the prefered place for block allocation.
398  *      It is used when heuristic for sequential allocation fails.
399  *      Rules are:
400  *        + if there is a block to the left of our position - allocate near it.
401  *        + if pointer will live in indirect block - allocate near that block.
402  *        + if pointer will live in inode - allocate in the same
403  *          cylinder group. 
404  *
405  * In the latter case we colour the starting block by the callers PID to
406  * prevent it from clashing with concurrent allocations for a different inode
407  * in the same block group.   The PID is used here so that functionally related
408  * files will be close-by on-disk.
409  *
410  *      Caller must make sure that @ind is valid and will stay that way.
411  */
412
413 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
414 {
415         struct ext3_inode_info *ei = EXT3_I(inode);
416         __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
417         __le32 *p;
418         unsigned long bg_start;
419         unsigned long colour;
420
421         /* Try to find previous block */
422         for (p = ind->p - 1; p >= start; p--)
423                 if (*p)
424                         return le32_to_cpu(*p);
425
426         /* No such thing, so let's try location of indirect block */
427         if (ind->bh)
428                 return ind->bh->b_blocknr;
429
430         /*
431          * It is going to be refered from inode itself? OK, just put it into
432          * the same cylinder group then.
433          */
434         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
435                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
436         colour = (current->pid % 16) *
437                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
438         return bg_start + colour;
439 }
440
441 /**
442  *      ext3_find_goal - find a prefered place for allocation.
443  *      @inode: owner
444  *      @block:  block we want
445  *      @chain:  chain of indirect blocks
446  *      @partial: pointer to the last triple within a chain
447  *      @goal:  place to store the result.
448  *
449  *      Normally this function find the prefered place for block allocation,
450  *      stores it in *@goal and returns zero.
451  */
452
453 static unsigned long ext3_find_goal(struct inode *inode, long block,
454                 Indirect chain[4], Indirect *partial)
455 {
456         struct ext3_block_alloc_info *block_i =  EXT3_I(inode)->i_block_alloc_info;
457
458         /*
459          * try the heuristic for sequential allocation,
460          * failing that at least try to get decent locality.
461          */
462         if (block_i && (block == block_i->last_alloc_logical_block + 1)
463                 && (block_i->last_alloc_physical_block != 0)) {
464                 return block_i->last_alloc_physical_block + 1;
465         }
466
467         return ext3_find_near(inode, partial);
468 }
469 /**
470  *      ext3_blks_to_allocate: Look up the block map and count the number
471  *      of direct blocks need to be allocated for the given branch.
472  *
473  *      @branch: chain of indirect blocks
474  *      @k: number of blocks need for indirect blocks
475  *      @blks: number of data blocks to be mapped.
476  *      @blocks_to_boundary:  the offset in the indirect block
477  *
478  *      return the total number of blocks to be allocate, including the
479  *      direct and indirect blocks.
480  */
481 static int
482 ext3_blks_to_allocate(Indirect * branch, int k, unsigned long blks,
483                 int blocks_to_boundary)
484 {
485         unsigned long count = 0;
486
487         /*
488          * Simple case, [t,d]Indirect block(s) has not allocated yet
489          * then it's clear blocks on that path have not allocated
490          */
491         if (k > 0) {
492                 /* right now don't hanel cross boundary allocation */
493                 if (blks < blocks_to_boundary + 1)
494                         count += blks;
495                 else
496                         count += blocks_to_boundary + 1;
497                 return count;
498         }
499
500         count++;
501         while (count < blks && count <= blocks_to_boundary &&
502                 le32_to_cpu(*(branch[0].p + count)) == 0) {
503                 count++;
504         }
505         return count;
506 }
507
508 /**
 509  *      ext3_alloc_blocks: allocate the multiple blocks needed for a branch
510  *      @indirect_blks: the number of blocks need to allocate for indirect
511  *                      blocks
512  *
513  *      @new_blocks: on return it will store the new block numbers for
514  *      the indirect blocks(if needed) and the first direct block,
515  *      @blks:  on return it will store the total number of allocated
516  *              direct blocks
517  */
static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
			unsigned long goal, int indirect_blks, int blks,
			unsigned long long new_blocks[4], int *err)
{
	int target, i;
	unsigned long count = 0;
	int index = 0;			/* next free slot in new_blocks[] */
	unsigned long current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks(if not allocated yet), and at least
	 * the first direct block of this branch.  That's the
	 * minimum number of blocks need to allocate(required)
	 */
	target = blks + indirect_blks;

	while (1) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext3_new_blocks(handle, inode, goal, &count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}

		/*
		 * Once the allocated run extends past the indirect blocks,
		 * the remainder (count) is the start of the direct-block
		 * run and we are done; otherwise loop for more blocks.
		 */
		if (count > 0)
			break;
	}

	/* save the new block number for the first direct block */
	new_blocks[index] = current_block;

	/* total number of blocks allocated for direct blocks */
	ret = count;
	*err = 0;
	return ret;
failed_out:
	/* Free any indirect blocks obtained on earlier loop iterations. */
	for (i = 0; i <index; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);
	return ret;
}
568
569 /**
570  *      ext3_alloc_branch - allocate and set up a chain of blocks.
571  *      @inode: owner
572  *      @indirect_blks: number of allocated indirect blocks
573  *      @blks: number of allocated direct blocks
574  *      @offsets: offsets (in the blocks) to store the pointers to next.
575  *      @branch: place to store the chain in.
576  *
577  *      This function allocates blocks, zeroes out all but the last one,
578  *      links them into chain and (if we are synchronous) writes them to disk.
579  *      In other words, it prepares a branch that can be spliced onto the
580  *      inode. It stores the information about that chain in the branch[], in
581  *      the same format as ext3_get_branch() would do. We are calling it after
582  *      we had read the existing part of chain and partial points to the last
583  *      triple of that (one with zero ->key). Upon the exit we have the same
584  *      picture as after the successful ext3_get_block(), except that in one
585  *      place chain is disconnected - *branch->p is still zero (we did not
586  *      set the last link), but branch->key contains the number that should
587  *      be placed into *branch->p to fill that gap.
588  *
589  *      If allocation fails we free all blocks we've allocated (and forget
 590  *      their buffer_heads) and return the error value from the failed
591  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
592  *      as described above and return 0.
593  */
594
595 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
596                         int indirect_blks, int *blks, unsigned long goal,
597                         int *offsets, Indirect *branch)
598 {
599         int blocksize = inode->i_sb->s_blocksize;
600         int i, n = 0;
601         int err = 0;
602         struct buffer_head *bh;
603         int num;
604         unsigned long long new_blocks[4];
605         unsigned long long current_block;
606
607         num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
608                                 *blks, new_blocks, &err);
609         if (err)
610                 return err;
611
612         branch[0].key = cpu_to_le32(new_blocks[0]);
613         /*
614          * metadata blocks and data blocks are allocated.
615          */
616         for (n = 1; n <= indirect_blks;  n++) {
617                 /*
618                  * Get buffer_head for parent block, zero it out
619                  * and set the pointer to new one, then send
620                  * parent to disk.
621                  */
622                 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
623                 branch[n].bh = bh;
624                 lock_buffer(bh);
625                 BUFFER_TRACE(bh, "call get_create_access");
626                 err = ext3_journal_get_create_access(handle, bh);
627                 if (err) {
628                         unlock_buffer(bh);
629                         brelse(bh);
630                         goto failed;
631                 }
632
633                 memset(bh->b_data, 0, blocksize);
634                 branch[n].p = (__le32 *) bh->b_data + offsets[n];
635                 branch[n].key = cpu_to_le32(new_blocks[n]);
636                 *branch[n].p = branch[n].key;
637                 if ( n == indirect_blks) {
638                         current_block = new_blocks[n];
639                         /*
640                          * End of chain, update the last new metablock of
641                          * the chain to point to the new allocated
642                          * data blocks numbers
643                          */
644                         for (i=1; i < num; i++)
645                                 *(branch[n].p + i) = cpu_to_le32(++current_block);
646                 }
647                 BUFFER_TRACE(bh, "marking uptodate");
648                 set_buffer_uptodate(bh);
649                 unlock_buffer(bh);
650
651                 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
652                 err = ext3_journal_dirty_metadata(handle, bh);
653                 if (err)
654                         goto failed;
655         }
656         *blks = num;
657         return err;
658 failed:
659         /* Allocation failed, free what we already allocated */
660         for (i = 1; i <= n ; i++) {
661                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
662                 ext3_journal_forget(handle, branch[i].bh);
663         }
664         for (i = 0; i <indirect_blks; i++)
665                 ext3_free_blocks(handle, inode, new_blocks[i], 1);
666
667         ext3_free_blocks(handle, inode, new_blocks[i], num);
668
669         return err;
670 }
671
672 /**
673  *      ext3_splice_branch - splice the allocated branch onto inode.
674  *      @inode: owner
675  *      @block: (logical) number of block we are adding
676  *      @chain: chain of indirect blocks (with a missing link - see
677  *              ext3_alloc_branch)
678  *      @where: location of missing link
679  *      @num:   number of indirect blocks we are adding
680  *      @blks:  number of direct blocks we are adding
681  *
682  *      This function fills the missing link and does all housekeeping needed in
683  *      inode (->i_blocks, etc.). In case of success we end up with the full
684  *      chain to new block and return 0.
685  */
686
687 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
688                               Indirect *where, int num, int blks)
689 {
690         int i;
691         int err = 0;
692         struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
693         unsigned long current_block;
694         /*
695          * If we're splicing into a [td]indirect block (as opposed to the
696          * inode) then we need to get write access to the [td]indirect block
697          * before the splice.
698          */
699         if (where->bh) {
700                 BUFFER_TRACE(where->bh, "get_write_access");
701                 err = ext3_journal_get_write_access(handle, where->bh);
702                 if (err)
703                         goto err_out;
704         }
705         /* That's it */
706
707         *where->p = where->key;
708         /* update host bufferhead or inode to point to
709          * more just allocated direct blocks blocks */
710         if (num == 0 && blks > 1) {
711                 current_block = le32_to_cpu(where->key + 1);
712                 for (i = 1; i < blks; i++)
713                         *(where->p + i ) = cpu_to_le32(current_block++);
714         }
715
716         /*
717          * update the most recently allocated logical & physical block
718          * in i_block_alloc_info, to assist find the proper goal block for next
719          * allocation
720          */
721         if (block_i) {
722                 block_i->last_alloc_logical_block = block + blks - 1;
723                 block_i->last_alloc_physical_block = le32_to_cpu(where[num].key + blks - 1);
724         }
725
726         /* We are done with atomic stuff, now do the rest of housekeeping */
727
728         inode->i_ctime = CURRENT_TIME_SEC;
729         ext3_mark_inode_dirty(handle, inode);
730
731         /* had we spliced it onto indirect block? */
732         if (where->bh) {
733                 /*
734                  * akpm: If we spliced it onto an indirect block, we haven't
735                  * altered the inode.  Note however that if it is being spliced
736                  * onto an indirect block at the very end of the file (the
737                  * file is growing) then we *will* alter the inode to reflect
738                  * the new i_size.  But that is not done here - it is done in
739                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
740                  */
741                 jbd_debug(5, "splicing indirect only\n");
742                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
743                 err = ext3_journal_dirty_metadata(handle, where->bh);
744                 if (err) 
745                         goto err_out;
746         } else {
747                 /*
748                  * OK, we spliced it into the inode itself on a direct block.
749                  * Inode was dirtied above.
750                  */
751                 jbd_debug(5, "splicing direct\n");
752         }
753         return err;
754
755 err_out:
756         for (i = 1; i <= num; i++) {
757                 BUFFER_TRACE(where[i].bh, "call journal_forget");
758                 ext3_journal_forget(handle, where[i].bh);
759                 ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
760         }
761         ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
762
763         return err;
764 }
765
766 /*
767  * Allocation strategy is simple: if we have to allocate something, we will
768  * have to go the whole way to leaf. So let's do it before attaching anything
769  * to tree, set linkage between the newborn blocks, write them if sync is
770  * required, recheck the path, free and repeat if check fails, otherwise
771  * set the last missing link (that will protect us from any truncate-generated
772  * removals - all blocks on the path are immune now) and possibly force the
773  * write on the parent block.
774  * That has a nice additional property: no special recovery from the failed
775  * allocations is needed - we simply release blocks and do not touch anything
776  * reachable from inode.
777  *
778  * akpm: `handle' can be NULL if create == 0.
779  *
780  * The BKL may not be held on entry here.  Be sure to take it early.
781  * return > 0, # of blocks mapped or allocated.
782  * return = 0, if plain lookup failed.
783  * return < 0, error case.
784  */
785
786 int
787 ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock,
788                 unsigned long maxblocks, struct buffer_head *bh_result,
789                 int create, int extend_disksize)
790 {
791         int err = -EIO;
792         int offsets[4];
793         Indirect chain[4];
794         Indirect *partial;
795         unsigned long goal;
796         int indirect_blks;
797         int blocks_to_boundary = 0;
798         int depth;
799         struct ext3_inode_info *ei = EXT3_I(inode);
800         int count = 0;
801         unsigned long first_block = 0;
802
803
804         J_ASSERT(handle != NULL || create == 0);
805         depth = ext3_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
806
807         if (depth == 0)
808                 goto out;
809
810         partial = ext3_get_branch(inode, depth, offsets, chain, &err);
811
812         /* Simplest case - block found, no allocation needed */
813         if (!partial) {
814                 first_block = chain[depth - 1].key;
815                 clear_buffer_new(bh_result);
816                 count++;
817                 /*map more blocks*/
818                 while (count < maxblocks && count <= blocks_to_boundary) {
819                         if (!verify_chain(chain, partial)) {
820                                 /*
821                                  * Indirect block might be removed by
822                                  * truncate while we were reading it.
823                                  * Handling of that case: forget what we've
824                                  * got now. Flag the err as EAGAIN, so it
825                                  * will reread.
826                                  */
827                                 err = -EAGAIN;
828                                 count = 0;
829                                 break;
830                         }
831                         if (le32_to_cpu(*(chain[depth-1].p+count) ==
832                                         (first_block + count)))
833                                 count++;
834                         else
835                                 break;
836                 }
837                 if (err != -EAGAIN)
838                         goto got_it;
839         }
840
841         /* Next simple case - plain lookup or failed read of indirect block */
842         if (!create || err == -EIO)
843                 goto cleanup;
844
845         mutex_lock(&ei->truncate_mutex);
846
847         /*
848          * If the indirect block is missing while we are reading
849          * the chain(ext3_get_branch() returns -EAGAIN err), or
850          * if the chain has been changed after we grab the semaphore,
851          * (either because another process truncated this branch, or
852          * another get_block allocated this branch) re-grab the chain to see if
853          * the request block has been allocated or not.
854          *
855          * Since we already block the truncate/other get_block
856          * at this point, we will have the current copy of the chain when we
857          * splice the branch into the tree.
858          */
859         if (err == -EAGAIN || !verify_chain(chain, partial)) {
860                 while (partial > chain) {
861                         brelse(partial->bh);
862                         partial--;
863                 }
864                 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
865                 if (!partial) {
866                         count++;
867                         mutex_unlock(&ei->truncate_mutex);
868                         if (err)
869                                 goto cleanup;
870                         clear_buffer_new(bh_result);
871                         goto got_it;
872                 }
873         }
874
875         /*
876          * Okay, we need to do block allocation.  Lazily initialize the block
877          * allocation info here if necessary
878         */
879         if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
880                 ext3_init_block_alloc_info(inode);
881
882         goal = ext3_find_goal(inode, iblock, chain, partial);
883
884         /* the number of blocks need to allocate for [d,t]indirect blocks */
885         indirect_blks = (chain + depth) - partial - 1;
886
887         /*
888          * Next look up the indirect map to count the totoal number of
889          * direct blocks to allocate for this branch.
890          */
891         count = ext3_blks_to_allocate(partial, indirect_blks,
892                                         maxblocks, blocks_to_boundary);
893         /*
894          * Block out ext3_truncate while we alter the tree
895          */
896         err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
897                                 offsets + (partial - chain), partial);
898
899         /*
900          * The ext3_splice_branch call will free and forget any buffers
901          * on the new chain if there is a failure, but that risks using
902          * up transaction credits, especially for bitmaps where the
903          * credits cannot be returned.  Can we handle this somehow?  We
904          * may need to return -EAGAIN upwards in the worst case.  --sct
905          */
906         if (!err)
907                 err = ext3_splice_branch(handle, inode, iblock,
908                                         partial, indirect_blks, count);
909         /*
910          * i_disksize growing is protected by truncate_mutex.  Don't forget to
911          * protect it if you're about to implement concurrent
912          * ext3_get_block() -bzzz
913         */
914         if (!err && extend_disksize && inode->i_size > ei->i_disksize)
915                 ei->i_disksize = inode->i_size;
916         mutex_unlock(&ei->truncate_mutex);
917         if (err)
918                 goto cleanup;
919
920         set_buffer_new(bh_result);
921 got_it:
922         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
923         if (blocks_to_boundary == 0)
924                 set_buffer_boundary(bh_result);
925         err = count;
926         /* Clean up and exit */
927         partial = chain + depth - 1;    /* the whole chain */
928 cleanup:
929         while (partial > chain) {
930                 BUFFER_TRACE(partial->bh, "call brelse");
931                 brelse(partial->bh);
932                 partial--;
933         }
934         BUFFER_TRACE(bh_result, "returned");
935 out:
936         return err;
937 }
938
939 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
940
/*
 * Block-mapping callback used for direct I/O (and, via ext3_get_block(),
 * for the buffered paths).  It picks up the transaction handle current
 * for this task and, for multi-block writes, takes care not to starve
 * the journal: it yields to a committing (T_LOCKED) transaction, and
 * extends or restarts the handle when buffer credits run low.
 */
static int
ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create)
{
	handle_t *handle = journal_current_handle();
	int ret = 0;
	/* b_size encodes how many blocks the caller wants mapped */
	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;

	if (!create)
		goto get_block;		/* A read */

	if (max_blocks == 1)
		goto get_block;		/* A single block get */

	/*
	 * NOTE(review): `handle' is dereferenced without a NULL check from
	 * here on; presumably the multi-block write path always runs inside
	 * a transaction -- confirm against the direct-I/O caller.
	 */
	if (handle->h_transaction->t_state == T_LOCKED) {
		/*
		 * Huge direct-io writes can hold off commits for long
		 * periods of time.  Let this commit run.
		 */
		ext3_journal_stop(handle);
		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle))
			ret = PTR_ERR(handle);
		goto get_block;
	}

	if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
		/*
		 * Getting low on buffer credits...
		 */
		ret = ext3_journal_extend(handle, DIO_CREDITS);
		if (ret > 0) {
			/*
			 * Couldn't extend the transaction.  Start a new one.
			 */
			ret = ext3_journal_restart(handle, DIO_CREDITS);
		}
	}

get_block:
	if (ret == 0) {
		/* extend_disksize == 0: i_disksize is not updated here */
		ret = ext3_get_blocks_handle(handle, inode, iblock,
					max_blocks, bh_result, create, 0);
		if (ret > 0) {
			/* report back how many blocks were actually mapped */
			bh_result->b_size = (ret << inode->i_blkbits);
			ret = 0;
		}
	}
	return ret;
}
991
/*
 * Standard get_block_t entry point: defers to ext3_direct_io_get_blocks(),
 * which picks up any transaction handle current for this task.
 */
static int ext3_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create)
{
	return ext3_direct_io_get_blocks(inode, iblock, bh_result, create);
}
997
998 /*
999  * `handle' can be NULL if create is zero
1000  */
1001 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
1002                                 long block, int create, int * errp)
1003 {
1004         struct buffer_head dummy;
1005         int fatal = 0, err;
1006
1007         J_ASSERT(handle != NULL || create == 0);
1008
1009         dummy.b_state = 0;
1010         dummy.b_blocknr = -1000;
1011         buffer_trace_init(&dummy.b_history);
1012         err = ext3_get_blocks_handle(handle, inode, block, 1,
1013                                         &dummy, create, 1);
1014         if (err == 1) {
1015                 err = 0;
1016         } else if (err >= 0) {
1017                 WARN_ON(1);
1018                 err = -EIO;
1019         }
1020         *errp = err;
1021         if (!err && buffer_mapped(&dummy)) {
1022                 struct buffer_head *bh;
1023                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1024                 if (!bh) {
1025                         *errp = -EIO;
1026                         goto err;
1027                 }
1028                 if (buffer_new(&dummy)) {
1029                         J_ASSERT(create != 0);
1030                         J_ASSERT(handle != 0);
1031
1032                         /* Now that we do not always journal data, we
1033                            should keep in mind whether this should
1034                            always journal the new buffer as metadata.
1035                            For now, regular file writes use
1036                            ext3_get_block instead, so it's not a
1037                            problem. */
1038                         lock_buffer(bh);
1039                         BUFFER_TRACE(bh, "call get_create_access");
1040                         fatal = ext3_journal_get_create_access(handle, bh);
1041                         if (!fatal && !buffer_uptodate(bh)) {
1042                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1043                                 set_buffer_uptodate(bh);
1044                         }
1045                         unlock_buffer(bh);
1046                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1047                         err = ext3_journal_dirty_metadata(handle, bh);
1048                         if (!fatal)
1049                                 fatal = err;
1050                 } else {
1051                         BUFFER_TRACE(bh, "not a new buffer");
1052                 }
1053                 if (fatal) {
1054                         *errp = fatal;
1055                         brelse(bh);
1056                         bh = NULL;
1057                 }
1058                 return bh;
1059         }
1060 err:
1061         return NULL;
1062 }
1063
1064 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
1065                                int block, int create, int *err)
1066 {
1067         struct buffer_head * bh;
1068
1069         bh = ext3_getblk(handle, inode, block, create, err);
1070         if (!bh)
1071                 return bh;
1072         if (buffer_uptodate(bh))
1073                 return bh;
1074         ll_rw_block(READ, 1, &bh);
1075         wait_on_buffer(bh);
1076         if (buffer_uptodate(bh))
1077                 return bh;
1078         put_bh(bh);
1079         *err = -EIO;
1080         return NULL;
1081 }
1082
1083 static int walk_page_buffers(   handle_t *handle,
1084                                 struct buffer_head *head,
1085                                 unsigned from,
1086                                 unsigned to,
1087                                 int *partial,
1088                                 int (*fn)(      handle_t *handle,
1089                                                 struct buffer_head *bh))
1090 {
1091         struct buffer_head *bh;
1092         unsigned block_start, block_end;
1093         unsigned blocksize = head->b_size;
1094         int err, ret = 0;
1095         struct buffer_head *next;
1096
1097         for (   bh = head, block_start = 0;
1098                 ret == 0 && (bh != head || !block_start);
1099                 block_start = block_end, bh = next)
1100         {
1101                 next = bh->b_this_page;
1102                 block_end = block_start + blocksize;
1103                 if (block_end <= from || block_start >= to) {
1104                         if (partial && !buffer_uptodate(bh))
1105                                 *partial = 1;
1106                         continue;
1107                 }
1108                 err = (*fn)(handle, bh);
1109                 if (!ret)
1110                         ret = err;
1111         }
1112         return ret;
1113 }
1114
1115 /*
1116  * To preserve ordering, it is essential that the hole instantiation and
1117  * the data write be encapsulated in a single transaction.  We cannot
1118  * close off a transaction and start a new one between the ext3_get_block()
1119  * and the commit_write().  So doing the journal_start at the start of
1120  * prepare_write() is the right place.
1121  *
1122  * Also, this function can nest inside ext3_writepage() ->
1123  * block_write_full_page(). In that case, we *know* that ext3_writepage()
1124  * has generated enough buffer credits to do the whole page.  So we won't
1125  * block on the journal in that case, which is good, because the caller may
1126  * be PF_MEMALLOC.
1127  *
1128  * By accident, ext3 can be reentered when a transaction is open via
1129  * quota file writes.  If we were to commit the transaction while thus
1130  * reentered, there can be a deadlock - we would be holding a quota
1131  * lock, and the commit would never complete if another thread had a
1132  * transaction open and was blocking on the quota lock - a ranking
1133  * violation.
1134  *
1135  * So what we do is to rely on the fact that journal_stop/journal_start
1136  * will _not_ run commit under these circumstances because handle->h_ref
1137  * is elevated.  We'll still have enough credits for the tiny quotafile
1138  * write.  
1139  */
1140
1141 static int do_journal_get_write_access(handle_t *handle, 
1142                                        struct buffer_head *bh)
1143 {
1144         if (!buffer_mapped(bh) || buffer_freed(bh))
1145                 return 0;
1146         return ext3_journal_get_write_access(handle, bh);
1147 }
1148
1149 static int ext3_prepare_write(struct file *file, struct page *page,
1150                               unsigned from, unsigned to)
1151 {
1152         struct inode *inode = page->mapping->host;
1153         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1154         handle_t *handle;
1155         int retries = 0;
1156
1157 retry:
1158         handle = ext3_journal_start(inode, needed_blocks);
1159         if (IS_ERR(handle)) {
1160                 ret = PTR_ERR(handle);
1161                 goto out;
1162         }
1163         if (test_opt(inode->i_sb, NOBH))
1164                 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1165         else
1166                 ret = block_prepare_write(page, from, to, ext3_get_block);
1167         if (ret)
1168                 goto prepare_write_failed;
1169
1170         if (ext3_should_journal_data(inode)) {
1171                 ret = walk_page_buffers(handle, page_buffers(page),
1172                                 from, to, NULL, do_journal_get_write_access);
1173         }
1174 prepare_write_failed:
1175         if (ret)
1176                 ext3_journal_stop(handle);
1177         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1178                 goto retry;
1179 out:
1180         return ret;
1181 }
1182
/*
 * Wrapper around journal_dirty_data(): file @bh with the transaction as
 * ordered data, aborting the handle if the journal layer reports an
 * error (the error is also returned to the caller).
 */
int
ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
	int err = journal_dirty_data(handle, bh);
	if (err)
		ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
						bh, handle,err);
	return err;
}
1192
1193 /* For commit_write() in data=journal mode */
1194 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1195 {
1196         if (!buffer_mapped(bh) || buffer_freed(bh))
1197                 return 0;
1198         set_buffer_uptodate(bh);
1199         return ext3_journal_dirty_metadata(handle, bh);
1200 }
1201
1202 /*
1203  * We need to pick up the new inode size which generic_commit_write gave us
1204  * `file' can be NULL - eg, when called from page_symlink().
1205  *
1206  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1207  * buffers are managed internally.
1208  */
1209
/*
 * commit_write for data=ordered mode: file the data buffers with the
 * transaction (so they are written out before it commits), grow
 * i_disksize, then let generic_commit_write() update i_size and dirty
 * the page.  Closes the handle opened in ext3_prepare_write().
 */
static int ext3_ordered_commit_write(struct file *file, struct page *page,
			     unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;

	/* File the written range as ordered data first */
	ret = walk_page_buffers(handle, page_buffers(page),
		from, to, NULL, ext3_journal_dirty_data);

	if (ret == 0) {
		/*
		 * generic_commit_write() will run mark_inode_dirty() if i_size
		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
		 * into that.
		 */
		loff_t new_i_size;

		new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
		if (new_i_size > EXT3_I(inode)->i_disksize)
			EXT3_I(inode)->i_disksize = new_i_size;
		ret = generic_commit_write(file, page, from, to);
	}
	/* Close the handle opened in ext3_prepare_write() */
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}
1238
1239 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1240                              unsigned from, unsigned to)
1241 {
1242         handle_t *handle = ext3_journal_current_handle();
1243         struct inode *inode = page->mapping->host;
1244         int ret = 0, ret2;
1245         loff_t new_i_size;
1246
1247         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1248         if (new_i_size > EXT3_I(inode)->i_disksize)
1249                 EXT3_I(inode)->i_disksize = new_i_size;
1250
1251         if (test_opt(inode->i_sb, NOBH))
1252                 ret = nobh_commit_write(file, page, from, to);
1253         else
1254                 ret = generic_commit_write(file, page, from, to);
1255
1256         ret2 = ext3_journal_stop(handle);
1257         if (!ret)
1258                 ret = ret2;
1259         return ret;
1260 }
1261
/*
 * commit_write for data=journal mode: the data buffers themselves go
 * through the journal as metadata, so generic_commit_write() (which
 * would dirty them directly) is duplicated by hand here instead.
 */
static int ext3_journalled_commit_write(struct file *file,
			struct page *page, unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	loff_t pos;

	/*
	 * Here we duplicate the generic_commit_write() functionality
	 */
	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, commit_write_fn);
	/* Only mark the page uptodate if every buffer on it now is */
	if (!partial)
		SetPageUptodate(page);
	if (pos > inode->i_size)
		i_size_write(inode, pos);
	/* Remember there is journalled data that ext3_bmap() must flush */
	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
	if (inode->i_size > EXT3_I(inode)->i_disksize) {
		EXT3_I(inode)->i_disksize = inode->i_size;
		ret2 = ext3_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}
1294
1295 /* 
1296  * bmap() is special.  It gets used by applications such as lilo and by
1297  * the swapper to find the on-disk block of a specific piece of data.
1298  *
1299  * Naturally, this is dangerous if the block concerned is still in the
1300  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1301  * filesystem and enables swap, then they may get a nasty shock when the
1302  * data getting swapped to that swapfile suddenly gets overwritten by
1303  * the original zero's written out previously to the journal and
1304  * awaiting writeback in the kernel's buffer cache. 
1305  *
1306  * So, if we see any bmap calls here on a modified, data-journaled file,
1307  * take extra steps to flush any blocks which might be in the cache. 
1308  */
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT3_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
		journal = EXT3_JOURNAL(inode);
		journal_lock_updates(journal);
		/* Push all journalled data out to the main filesystem */
		err = journal_flush(journal);
		journal_unlock_updates(journal);

		/* A zero block number is how bmap reports failure */
		if (err)
			return 0;
	}

	return generic_block_bmap(mapping,block,ext3_get_block);
}
1346
/* walk_page_buffers() callback: take an extra reference on the buffer */
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}
1352
/* walk_page_buffers() callback: drop the reference taken by bget_one() */
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}
1358
1359 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1360 {
1361         if (buffer_mapped(bh))
1362                 return ext3_journal_dirty_data(handle, bh);
1363         return 0;
1364 }
1365
1366 /*
1367  * Note that we always start a transaction even if we're not journalling
1368  * data.  This is to preserve ordering: any hole instantiation within
1369  * __block_write_full_page -> ext3_get_block() should be journalled
1370  * along with the data so we don't crash and then get metadata which
1371  * refers to old data.
1372  *
1373  * In all journalling modes block_write_full_page() will start the I/O.
1374  *
1375  * Problem:
1376  *
1377  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1378  *              ext3_writepage()
1379  *
1380  * Similar for:
1381  *
1382  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1383  *
1384  * Same applies to ext3_get_block().  We will deadlock on various things like
1385  * lock_journal and i_truncate_mutex.
1386  *
1387  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1388  * allocations fail.
1389  *
1390  * 16May01: If we're reentered then journal_current_handle() will be
1391  *          non-zero. We simply *return*.
1392  *
1393  * 1 July 2001: @@@ FIXME:
1394  *   In journalled data mode, a data buffer may be metadata against the
1395  *   current transaction.  But the same file is part of a shared mapping
1396  *   and someone does a writepage() on it.
1397  *
1398  *   We will move the buffer onto the async_data list, but *after* it has
1399  *   been dirtied. So there's a small window where we have dirty data on
1400  *   BJ_Metadata.
1401  *
1402  *   Note that this only applies to the last partial page in the file.  The
1403  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1404  *   broken code anyway: it's wrong for msync()).
1405  *
1406  *   It's a rare case: affects the final partial page, for journalled data
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
1409  *   We'll probably need that anyway for journalling writepage() output.
1410  *
1411  * We don't honour synchronous mounts for writepage().  That would be
1412  * disastrous.  Any write() or metadata operation will sync the fs for
1413  * us.
1414  *
1415  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1416  * we don't need to open a transaction here.
1417  */
/*
 * writepage for data=ordered mode: write the page, then file its
 * buffers (kept alive via an elevated refcount) with the current
 * transaction as ordered data.
 */
static int ext3_ordered_writepage(struct page *page,
			struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *page_bufs;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	J_ASSERT(PageLocked(page));

	/*
	 * We give up here if we're reentered, because it might be for a
	 * different filesystem.
	 */
	if (ext3_journal_current_handle())
		goto out_fail;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	/* An mmapped page may reach here with no buffers attached yet */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, inode->i_sb->s_blocksize,
				(1 << BH_Dirty)|(1 << BH_Uptodate));
	}
	page_bufs = page_buffers(page);
	/* Pin every buffer so it survives the page being unlocked below */
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bget_one);

	ret = block_write_full_page(page, ext3_get_block, wbc);

	/*
	 * The page can become unlocked at any point now, and
	 * truncate can then come in and change things.  So we
	 * can't touch *page from now on.  But *page_bufs is
	 * safe due to elevated refcount.
	 */

	/*
	 * And attach them to the current transaction.  But only if
	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
	 * and generally junk.
	 */
	if (ret == 0) {
		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
					NULL, journal_dirty_data_fn);
		if (!ret)
			ret = err;
	}
	/* Drop the references taken by bget_one() above */
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bput_one);
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}
1483
1484 static int ext3_writeback_writepage(struct page *page,
1485                                 struct writeback_control *wbc)
1486 {
1487         struct inode *inode = page->mapping->host;
1488         handle_t *handle = NULL;
1489         int ret = 0;
1490         int err;
1491
1492         if (ext3_journal_current_handle())
1493                 goto out_fail;
1494
1495         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1496         if (IS_ERR(handle)) {
1497                 ret = PTR_ERR(handle);
1498                 goto out_fail;
1499         }
1500
1501         if (test_opt(inode->i_sb, NOBH))
1502                 ret = nobh_writepage(page, ext3_get_block, wbc);
1503         else
1504                 ret = block_write_full_page(page, ext3_get_block, wbc);
1505
1506         err = ext3_journal_stop(handle);
1507         if (!ret)
1508                 ret = err;
1509         return ret;
1510
1511 out_fail:
1512         redirty_page_for_writepage(wbc, page);
1513         unlock_page(page);
1514         return ret;
1515 }
1516
/*
 * writepage for data=journal mode.  A page that was dirtied through
 * mmap (no buffers yet, or PageChecked set by set_page_dirty) must have
 * its buffers journalled here; pages already written through
 * prepare/commit_write can go out via block_write_full_page().
 */
static int ext3_journalled_writepage(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	/* Reentered with a handle open: just redirty and return */
	if (ext3_journal_current_handle())
		goto no_write;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto no_write;
	}

	if (!page_has_buffers(page) || PageChecked(page)) {
		/*
		 * It's mmapped pagecache.  Add buffers and journal it.  There
		 * doesn't seem much point in redirtying the page here.
		 */
		ClearPageChecked(page);
		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
					ext3_get_block);
		if (ret != 0) {
			ext3_journal_stop(handle);
			goto out_unlock;
		}
		ret = walk_page_buffers(handle, page_buffers(page), 0,
			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

		err = walk_page_buffers(handle, page_buffers(page), 0,
				PAGE_CACHE_SIZE, NULL, commit_write_fn);
		if (ret == 0)
			ret = err;
		/* Journalled data now exists; ext3_bmap() must flush it */
		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
		unlock_page(page);
	} else {
		/*
		 * It may be a page full of checkpoint-mode buffers.  We don't
		 * really know unless we go poke around in the buffer_heads.
		 * But block_write_full_page will do the right thing.
		 */
		ret = block_write_full_page(page, ext3_get_block, wbc);
	}
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
out:
	return ret;

no_write:
	redirty_page_for_writepage(wbc, page);
out_unlock:
	unlock_page(page);
	goto out;
}
1575
1576 static int ext3_readpage(struct file *file, struct page *page)
1577 {
1578         return mpage_readpage(page, ext3_get_block);
1579 }
1580
1581 static int
1582 ext3_readpages(struct file *file, struct address_space *mapping,
1583                 struct list_head *pages, unsigned nr_pages)
1584 {
1585         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1586 }
1587
1588 static void ext3_invalidatepage(struct page *page, unsigned long offset)
1589 {
1590         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1591
1592         /*
1593          * If it's a full truncate we just forget about the pending dirtying
1594          */
1595         if (offset == 0)
1596                 ClearPageChecked(page);
1597
1598         journal_invalidatepage(journal, page, offset);
1599 }
1600
1601 static int ext3_releasepage(struct page *page, gfp_t wait)
1602 {
1603         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1604
1605         WARN_ON(PageChecked(page));
1606         if (!page_has_buffers(page))
1607                 return 0;
1608         return journal_try_to_free_buffers(journal, page, wait);
1609 }
1610
/*
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list.  So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 * crashes then stale disk data _may_ be exposed inside the file.
 */
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext3_inode_info *ei = EXT3_I(inode);
	handle_t *handle = NULL;
	ssize_t ret;
	int orphan = 0;
	size_t count = iov_length(iov, nr_segs);

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
		if (final_size > inode->i_size) {
			/* Extending write: orphan-protect until it completes */
			ret = ext3_orphan_add(handle, inode);
			if (ret)
				goto out_stop;
			orphan = 1;
			ei->i_disksize = inode->i_size;
		}
	}

	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
				 offset, nr_segs,
				 ext3_direct_io_get_blocks, NULL);

	/*
	 * Reacquire the handle: ext3_direct_io_get_block() can restart the
	 * transaction
	 */
	handle = journal_current_handle();

out_stop:
	if (handle) {
		int err;

		/* Write finished (or failed): the orphan entry is no longer
		 * needed unless the inode is being deleted anyway. */
		if (orphan && inode->i_nlink)
			ext3_orphan_del(handle, inode);
		if (orphan && ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
				/* Commit the extended size to disk */
				ei->i_disksize = end;
				i_size_write(inode, end);
				/*
				 * We're going to return a positive `ret'
				 * here due to non-zero-length I/O, so there's
				 * no way of reporting error returns from
				 * ext3_mark_inode_dirty() to userspace.  So
				 * ignore it.
				 */
				ext3_mark_inode_dirty(handle, inode);
			}
		}
		err = ext3_journal_stop(handle);
		if (ret == 0)
			ret = err;
	}
out:
	return ret;
}
1686
1687 /*
1688  * Pages can be marked dirty completely asynchronously from ext3's journalling
1689  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
1690  * much here because ->set_page_dirty is called under VFS locks.  The page is
1691  * not necessarily locked.
1692  *
1693  * We cannot just dirty the page and leave attached buffers clean, because the
1694  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
1695  * or jbddirty because all the journalling code will explode.
1696  *
1697  * So what we do is to mark the page "pending dirty" and next time writepage
1698  * is called, propagate that into the buffers appropriately.
1699  */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	/*
	 * Record "pending dirty" so that ext3_journalled_writepage() knows
	 * to attach buffers and journal them; then dirty the page itself
	 * without touching (possibly clean) attached buffers.
	 */
	SetPageChecked(page);
	return __set_page_dirty_nobuffers(page);
}
1705
/* Address space operations for data=ordered mode */
static struct address_space_operations ext3_ordered_aops = {
	.readpage	= ext3_readpage,
	.readpages	= ext3_readpages,
	.writepage	= ext3_ordered_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext3_prepare_write,
	.commit_write	= ext3_ordered_commit_write,
	.bmap		= ext3_bmap,
	.invalidatepage	= ext3_invalidatepage,
	.releasepage	= ext3_releasepage,
	.direct_IO	= ext3_direct_IO,
	.migratepage	= buffer_migrate_page,
};
1719
/* Address space operations for data=writeback mode */
static struct address_space_operations ext3_writeback_aops = {
	.readpage	= ext3_readpage,
	.readpages	= ext3_readpages,
	.writepage	= ext3_writeback_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext3_prepare_write,
	.commit_write	= ext3_writeback_commit_write,
	.bmap		= ext3_bmap,
	.invalidatepage	= ext3_invalidatepage,
	.releasepage	= ext3_releasepage,
	.direct_IO	= ext3_direct_IO,
	.migratepage	= buffer_migrate_page,
};
1733
/*
 * Address space operations for data=journal mode.  No direct_IO or page
 * migration here: all data must flow through the journal, and the extra
 * set_page_dirty hook records mmap dirtying for later journalling.
 */
static struct address_space_operations ext3_journalled_aops = {
	.readpage	= ext3_readpage,
	.readpages	= ext3_readpages,
	.writepage	= ext3_journalled_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext3_prepare_write,
	.commit_write	= ext3_journalled_commit_write,
	.set_page_dirty	= ext3_journalled_set_page_dirty,
	.bmap		= ext3_bmap,
	.invalidatepage	= ext3_invalidatepage,
	.releasepage	= ext3_releasepage,
};
1746
1747 void ext3_set_aops(struct inode *inode)
1748 {
1749         if (ext3_should_order_data(inode))
1750                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1751         else if (ext3_should_writeback_data(inode))
1752                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1753         else
1754                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1755 }
1756
/*
 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
 *
 * The page must be locked on entry; it is unlocked and released on return.
 * Returns 0 on success or a negative errno.
 */
static int ext3_block_truncate_page(handle_t *handle, struct page *page,
		struct address_space *mapping, loff_t from)
{
	unsigned long index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize, iblock, length, pos;
	struct inode *inode = mapping->host;
	struct buffer_head *bh;
	int err = 0;
	void *kaddr;

	blocksize = inode->i_sb->s_blocksize;
	/* Number of bytes to zero: from `offset' to the end of its block */
	length = blocksize - (offset & (blocksize - 1));
	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

	/*
	 * For "nobh" option,  we can only work if we don't need to
	 * read-in the page - otherwise we create buffers to do the IO.
	 */
	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
	     ext3_should_writeback_data(inode) && PageUptodate(page)) {
		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr + offset, 0, length);
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
		set_page_dirty(page);
		goto unlock;
	}

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;
	if (buffer_freed(bh)) {
		BUFFER_TRACE(bh, "freed: skip");
		goto unlock;
	}

	if (!buffer_mapped(bh)) {
		BUFFER_TRACE(bh, "unmapped");
		/* Look up the on-disk block (no allocation: create == 0) */
		ext3_get_block(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh)) {
			BUFFER_TRACE(bh, "still unmapped");
			goto unlock;
		}
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh)) {
		err = -EIO;
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	/* data=journal: the zeroed block is metadata-like; journal it */
	if (ext3_should_journal_data(inode)) {
		BUFFER_TRACE(bh, "get write access");
		err = ext3_journal_get_write_access(handle, bh);
		if (err)
			goto unlock;
	}

	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + offset, 0, length);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);

	BUFFER_TRACE(bh, "zeroed end of block");

	err = 0;
	if (ext3_should_journal_data(inode)) {
		err = ext3_journal_dirty_metadata(handle, bh);
	} else {
		/* data=ordered: tie the data write to this transaction */
		if (ext3_should_order_data(inode))
			err = ext3_journal_dirty_data(handle, bh);
		mark_buffer_dirty(bh);
	}

unlock:
	unlock_page(page);
	page_cache_release(page);
	return err;
}
1861
1862 /*
1863  * Probably it should be a library function... search for first non-zero word
1864  * or memcmp with zero_page, whatever is better for particular architecture.
1865  * Linus?
1866  */
1867 static inline int all_zeroes(__le32 *p, __le32 *q)
1868 {
1869         while (p < q)
1870                 if (*p++)
1871                         return 0;
1872         return 1;
1873 }
1874
1875 /**
1876  *      ext3_find_shared - find the indirect blocks for partial truncation.
1877  *      @inode:   inode in question
1878  *      @depth:   depth of the affected branch
1879  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1880  *      @chain:   place to store the pointers to partial indirect blocks
1881  *      @top:     place to the (detached) top of branch
1882  *
1883  *      This is a helper function used by ext3_truncate().
1884  *
1885  *      When we do truncate() we may have to clean the ends of several
1886  *      indirect blocks but leave the blocks themselves alive. Block is
 *      partially truncated if some data below the new i_size is referred
1888  *      from it (and it is on the path to the first completely truncated
1889  *      data block, indeed).  We have to free the top of that path along
1890  *      with everything to the right of the path. Since no allocation
1891  *      past the truncation point is possible until ext3_truncate()
1892  *      finishes, we may safely do the latter, but top of branch may
1893  *      require special attention - pageout below the truncation point
1894  *      might try to populate it.
1895  *
1896  *      We atomically detach the top of branch from the tree, store the
1897  *      block number of its root in *@top, pointers to buffer_heads of
1898  *      partially truncated blocks - in @chain[].bh and pointers to
1899  *      their last elements that should not be removed - in
1900  *      @chain[].p. Return value is the pointer to last filled element
1901  *      of @chain.
1902  *
1903  *      The work left to caller to do the actual freeing of subtrees:
1904  *              a) free the subtree starting from *@top
1905  *              b) free the subtrees whose roots are stored in
1906  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1907  *              c) free the subtrees growing from the inode past the @chain[0].
1908  *                      (no partially truncated stuff there).  */
1909
static Indirect *ext3_find_shared(struct inode *inode,
				int depth,
				int offsets[4],
				Indirect chain[4],
				__le32 *top)
{
	Indirect *partial, *p;
	int k, err;

	*top = 0;
	/* Make k index the deepest non-null offset + 1 */
	for (k = depth; k > 1 && !offsets[k-1]; k--)
		;
	partial = ext3_get_branch(inode, k, offsets, chain, &err);
	/* Writer: pointers */
	if (!partial)
		partial = chain + k-1;
	/*
	 * If the branch acquired continuation since we've looked at it -
	 * fine, it should all survive and (new) top doesn't belong to us.
	 */
	if (!partial->key && *partial->p)
		/* Writer: end */
		goto no_top;
	/* Walk back up past indirect blocks that are entirely being freed */
	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
		;
	/*
	 * OK, we've found the last block that must survive. The rest of our
	 * branch should be detached before unlocking. However, if that rest
	 * of branch is all ours and does not grow immediately from the inode
	 * it's easier to cheat and just decrement partial->p.
	 */
	if (p == chain + k - 1 && p > chain) {
		p->p--;
	} else {
		*top = *p->p;
		/* Nope, don't do this in ext3.  Must leave the tree intact */
#if 0
		*p->p = 0;
#endif
	}
	/* Writer: end */

	/* Release buffer_heads for levels below the surviving block */
	while(partial > p)
	{
		brelse(partial->bh);
		partial--;
	}
no_top:
	return partial;
}
1961
1962 /*
1963  * Zero a number of block pointers in either an inode or an indirect block.
1964  * If we restart the transaction we must again get write access to the
1965  * indirect block for further modification.
1966  *
1967  * We release `count' blocks on disk, but (last - first) may be greater
1968  * than `count' because there can be holes in there.
1969  */
/*
 * ext3_clear_blocks: zero the pointers in [first, last) and free the run of
 * `count' disk blocks starting at block_to_free.  `bh' is the indirect block
 * containing the pointers, or NULL when they live in the inode itself.
 * May restart the transaction (and retake write access to `bh') if the
 * current handle is running out of credits.
 */
static void
ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
		unsigned long block_to_free, unsigned long count,
		__le32 *first, __le32 *last)
{
	__le32 *p;
	if (try_to_extend_transaction(handle, inode)) {
		/* Flush our modifications before restarting the handle */
		if (bh) {
			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			ext3_journal_dirty_metadata(handle, bh);
		}
		ext3_mark_inode_dirty(handle, inode);
		ext3_journal_test_restart(handle, inode);
		if (bh) {
			BUFFER_TRACE(bh, "retaking write access");
			ext3_journal_get_write_access(handle, bh);
		}
	}

	/*
	 * Any buffers which are on the journal will be in memory. We find
	 * them on the hash table so journal_revoke() will run journal_forget()
	 * on them.  We've already detached each block from the file, so
	 * bforget() in journal_forget() should be safe.
	 *
	 * AKPM: turn on bforget in journal_forget()!!!
	 */
	for (p = first; p < last; p++) {
		u32 nr = le32_to_cpu(*p);
		if (nr) {
			struct buffer_head *bh;

			*p = 0;
			bh = sb_find_get_block(inode->i_sb, nr);
			ext3_forget(handle, 0, inode, bh, nr);
		}
	}

	ext3_free_blocks(handle, inode, block_to_free, count);
}
2010
2011 /**
2012  * ext3_free_data - free a list of data blocks
2013  * @handle:     handle for this transaction
2014  * @inode:      inode we are dealing with
2015  * @this_bh:    indirect buffer_head which contains *@first and *@last
2016  * @first:      array of block numbers
2017  * @last:       points immediately past the end of array
2018  *
 * We are freeing all blocks referred from that array (numbers are stored as
2020  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2021  *
2022  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
2023  * blocks are contiguous then releasing them at one time will only affect one
2024  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2025  * actually use a lot of journal space.
2026  *
2027  * @this_bh will be %NULL if @first and @last point into the inode's direct
2028  * block pointers.
2029  */
static void ext3_free_data(handle_t *handle, struct inode *inode,
			   struct buffer_head *this_bh,
			   __le32 *first, __le32 *last)
{
	unsigned long block_to_free = 0;    /* Starting block # of a run */
	unsigned long count = 0;            /* Number of blocks in the run */ 
	__le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
					       corresponding to
					       block_to_free */
	unsigned long nr;                   /* Current block # */
	__le32 *p;                          /* Pointer into inode/ind
					       for current block */
	int err;

	if (this_bh) {                          /* For indirect block */
		BUFFER_TRACE(this_bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, this_bh);
		/* Important: if we can't update the indirect pointers
		 * to the blocks, we can't free them. */
		if (err)
			return;
	}

	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				/* Run broken: free the accumulated run and
				 * start a new one at nr */
				ext3_clear_blocks(handle, inode, this_bh, 
						  block_to_free,
						  count, block_to_free_p, p);
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	/* Free the final accumulated run, if any */
	if (count > 0)
		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
				  count, block_to_free_p, p);

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
		ext3_journal_dirty_metadata(handle, this_bh);
	}
}
2083
2084 /**
2085  *      ext3_free_branches - free an array of branches
2086  *      @handle: JBD handle for this transaction
2087  *      @inode: inode we are dealing with
2088  *      @parent_bh: the buffer_head which contains *@first and *@last
2089  *      @first: array of block numbers
2090  *      @last:  pointer immediately past the end of array
2091  *      @depth: depth of the branches to free
2092  *
 *      We are freeing all blocks referred from these branches (numbers are
2094  *      stored as little-endian 32-bit) and updating @inode->i_blocks
2095  *      appropriately.
2096  */
static void ext3_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	unsigned long nr;
	__le32 *p;

	if (is_handle_aborted(handle))
		return;

	if (depth--) {
		/* Indirect level: recurse into each child, right to left */
		struct buffer_head *bh;
		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and clear slot
			 * (should be rare).
			 */
			if (!bh) {
				ext3_error(inode->i_sb, "ext3_free_branches",
					   "Read failure, inode=%ld, block=%ld",
					   inode->i_ino, nr);
				continue;
			}

			/* This zaps the entire block.  Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext3_free_branches(handle, inode, bh,
					   (__le32*)bh->b_data,
					   (__le32*)bh->b_data + addr_per_block,
					   depth);

			/*
			 * We've probably journalled the indirect block several
			 * times during the truncate.  But it's no longer
			 * needed and we now drop it from the transaction via
			 * journal_revoke().
			 *
			 * That's easy if it's exclusively part of this
			 * transaction.  But if it's part of the committing
			 * transaction then journal_forget() will simply
			 * brelse() it.  That means that if the underlying
			 * block is reallocated in ext3_get_block(),
			 * unmap_underlying_metadata() will find this block
			 * and will try to get rid of it.  damn, damn.
			 *
			 * If this block has already been committed to the
			 * journal, a revoke record will be written.  And
			 * revoke records must be emitted *before* clearing
			 * this block's bit in the bitmaps.
			 */
			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);

			/*
			 * Everything below this this pointer has been
			 * released.  Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it.  So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
			 */
			if (is_handle_aborted(handle))
				return;
			if (try_to_extend_transaction(handle, inode)) {
				ext3_mark_inode_dirty(handle, inode);
				ext3_journal_test_restart(handle, inode);
			}

			ext3_free_blocks(handle, inode, nr, 1);

			if (parent_bh) {
				/*
				 * The block which we have just freed is
				 * pointed to by an indirect block: journal it
				 */
				BUFFER_TRACE(parent_bh, "get_write_access");
				if (!ext3_journal_get_write_access(handle,
								   parent_bh)){
					*p = 0;
					BUFFER_TRACE(parent_bh,
					"call ext3_journal_dirty_metadata");
					ext3_journal_dirty_metadata(handle, 
								    parent_bh);
				}
			}
		}
	} else {
		/* We have reached the bottom of the tree. */
		BUFFER_TRACE(parent_bh, "free data blocks");
		ext3_free_data(handle, inode, parent_bh, first, last);
	}
}
2205
2206 /*
2207  * ext3_truncate()
2208  *
2209  * We block out ext3_get_block() block instantiations across the entire
2210  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2211  * simultaneously on behalf of the same inode.
2212  *
 * As we work through the truncate and commit bits of it to the journal there
2214  * is one core, guiding principle: the file's tree must always be consistent on
2215  * disk.  We must be able to restart the truncate after a crash.
2216  *
2217  * The file's tree may be transiently inconsistent in memory (although it
2218  * probably isn't), but whenever we close off and commit a journal transaction,
2219  * the contents of (the filesystem + the journal) must be consistent and
2220  * restartable.  It's pretty simple, really: bottom up, right to left (although
2221  * left-to-right works OK too).
2222  *
2223  * Note that at recovery time, journal replay occurs *before* the restart of
2224  * truncate against the orphan inode list.
2225  *
2226  * The committed inode has the new, desired i_size (which is the same as
2227  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2228  * that this inode's truncate did not complete and it will again call
2229  * ext3_truncate() to have another go.  So there will be instantiated blocks
2230  * to the right of the truncation point in a crashed ext3 filesystem.  But
2231  * that's fine - as long as they are linked from the inode, the post-crash
2232  * ext3_truncate() run will find them and release them.
2233  */
2234
/*
 * ext3_truncate() - free all blocks of @inode beyond the new i_size and
 * zero the tail of the partial EOF block.
 *
 * May span several transactions; the orphan-list entry added below
 * guarantees that a crash part-way through is finished off by
 * ext3_orphan_cleanup() at the next mount (see the comment above).
 * Errors are not propagated: the orphan mechanism makes bailing out
 * safe at any point.
 */
void ext3_truncate(struct inode * inode)
{
	handle_t *handle;
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n;
	long last_block;
	unsigned blocksize = inode->i_sb->s_blocksize;
	struct page *page;

	/* Only regular files, directories and slow (block-mapped)
	 * symlinks carry data blocks that can be truncated. */
	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
	    S_ISLNK(inode->i_mode)))
		return;
	if (ext3_inode_is_fast_symlink(inode))
		return;
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return;

	/*
	 * We have to lock the EOF page here, because lock_page() nests
	 * outside journal_start().
	 */
	if ((inode->i_size & (blocksize - 1)) == 0) {
		/* Block boundary? Nothing to do */
		page = NULL;
	} else {
		page = grab_cache_page(mapping,
				inode->i_size >> PAGE_CACHE_SHIFT);
		if (!page)
			return;
	}

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		if (page) {
			/* Scrub the page we grabbed above before
			 * dropping it, so no stale data stays visible. */
			clear_highpage(page);
			flush_dcache_page(page);
			unlock_page(page);
			page_cache_release(page);
		}
		return;		/* AKPM: return what? */
	}

	/* Index of the first block wholly beyond the new size. */
	last_block = (inode->i_size + blocksize-1)
					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);

	if (page)
		ext3_block_truncate_page(handle, page, mapping, inode->i_size);

	n = ext3_block_to_path(inode, last_block, offsets, NULL);
	if (n == 0)
		goto out_stop;	/* error */

	/*
	 * OK.  This truncate is going to happen.  We add the inode to the
	 * orphan list, so that if this truncate spans multiple transactions,
	 * and we crash, we will resume the truncate when the filesystem
	 * recovers.  It also marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	if (ext3_orphan_add(handle, inode))
		goto out_stop;

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode. We do this via i_disksize, which is the value which
	 * ext3 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	/*
	 * From here we block out all ext3_get_block() callers who want to
	 * modify the block allocation tree.
	 */
	mutex_lock(&ei->truncate_mutex);

	if (n == 1) {		/* direct blocks */
		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT3_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext3_free_branches(handle, inode, NULL,
					   &nr, &nr+1, (chain+n-1) - partial);
			*partial->p = 0;
			/*
			 * We mark the inode dirty prior to restart,
			 * and prior to stop.  No need for it here.
			 */
		} else {
			/* Shared branch grows from an indirect block */
			BUFFER_TRACE(partial->bh, "get_write_access");
			ext3_free_branches(handle, inode, partial->bh,
					partial->p,
					partial->p+1, (chain+n-1) - partial);
		}
	}
	/* Clear the ends of indirect blocks on the shared branch */
	while (partial > chain) {
		ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
				   (__le32*)partial->bh->b_data+addr_per_block,
				   (chain+n-1) - partial);
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse (partial->bh);
		partial--;
	}
do_indirects:
	/* Kill the remaining (whole) subtrees.  The fallthroughs are
	 * deliberate: truncating at depth d must also free every
	 * deeper indirection tree. */
	switch (offsets[0]) {
		default:
			nr = i_data[EXT3_IND_BLOCK];
			if (nr) {
				ext3_free_branches(handle, inode, NULL,
						   &nr, &nr+1, 1);
				i_data[EXT3_IND_BLOCK] = 0;
			}
			/* fall through */
		case EXT3_IND_BLOCK:
			nr = i_data[EXT3_DIND_BLOCK];
			if (nr) {
				ext3_free_branches(handle, inode, NULL,
						   &nr, &nr+1, 2);
				i_data[EXT3_DIND_BLOCK] = 0;
			}
			/* fall through */
		case EXT3_DIND_BLOCK:
			nr = i_data[EXT3_TIND_BLOCK];
			if (nr) {
				ext3_free_branches(handle, inode, NULL,
						   &nr, &nr+1, 3);
				i_data[EXT3_TIND_BLOCK] = 0;
			}
			/* fall through */
		case EXT3_TIND_BLOCK:
			;
	}

	ext3_discard_reservation(inode);

	mutex_unlock(&ei->truncate_mutex);
	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);

	/* In a multi-transaction truncate, we only make the final
	 * transaction synchronous */
	if (IS_SYNC(inode))
		handle->h_sync = 1;
out_stop:
	/*
	 * If this was a simple ftruncate(), and the file will remain alive
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext3_delete_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext3_orphan_del(handle, inode);

	ext3_journal_stop(handle);
}
2407
2408 static unsigned long ext3_get_inode_block(struct super_block *sb,
2409                 unsigned long ino, struct ext3_iloc *iloc)
2410 {
2411         unsigned long desc, group_desc, block_group;
2412         unsigned long offset, block;
2413         struct buffer_head *bh;
2414         struct ext3_group_desc * gdp;
2415
2416
2417         if ((ino != EXT3_ROOT_INO &&
2418                 ino != EXT3_JOURNAL_INO &&
2419                 ino != EXT3_RESIZE_INO &&
2420                 ino < EXT3_FIRST_INO(sb)) ||
2421                 ino > le32_to_cpu(
2422                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2423                 ext3_error (sb, "ext3_get_inode_block",
2424                             "bad inode number: %lu", ino);
2425                 return 0;
2426         }
2427         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2428         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2429                 ext3_error (sb, "ext3_get_inode_block",
2430                             "group >= groups count");
2431                 return 0;
2432         }
2433         smp_rmb();
2434         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2435         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2436         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2437         if (!bh) {
2438                 ext3_error (sb, "ext3_get_inode_block",
2439                             "Descriptor not loaded");
2440                 return 0;
2441         }
2442
2443         gdp = (struct ext3_group_desc *) bh->b_data;
2444         /*
2445          * Figure out the offset within the block group inode table
2446          */
2447         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2448                 EXT3_INODE_SIZE(sb);
2449         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2450                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2451
2452         iloc->block_group = block_group;
2453         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2454         return block;
2455 }
2456
2457 /*
2458  * ext3_get_inode_loc returns with an extra refcount against the inode's
2459  * underlying buffer_head on success. If 'in_mem' is true, we have all
2460  * data in memory that is needed to recreate the on-disk version of this
2461  * inode.
2462  */
static int __ext3_get_inode_loc(struct inode *inode,
				struct ext3_iloc *iloc, int in_mem)
{
	unsigned long block;
	struct buffer_head *bh;

	/* Translate the inode number into an on-disk block (also fills
	 * iloc->block_group / iloc->offset). */
	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
	if (!block)
		return -EIO;

	bh = sb_getblk(inode->i_sb, block);
	if (!bh) {
		ext3_error (inode->i_sb, "ext3_get_inode_loc",
				"unable to read inode block - "
				"inode=%lu, block=%lu", inode->i_ino, block);
		return -EIO;
	}
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
		}

		/*
		 * If we have all information of the inode in memory and this
		 * is the only valid inode in the block, we need not read the
		 * block.
		 */
		if (in_mem) {
			struct buffer_head *bitmap_bh;
			struct ext3_group_desc *desc;
			int inodes_per_buffer;
			int inode_offset, i;
			int block_group;
			int start;

			block_group = (inode->i_ino - 1) /
					EXT3_INODES_PER_GROUP(inode->i_sb);
			inodes_per_buffer = bh->b_size /
				EXT3_INODE_SIZE(inode->i_sb);
			inode_offset = ((inode->i_ino - 1) %
					EXT3_INODES_PER_GROUP(inode->i_sb));
			/* First inode sharing this buffer (inodes_per_buffer
			 * is a power of two, so this rounds down). */
			start = inode_offset & ~(inodes_per_buffer - 1);

			/* Is the inode bitmap in cache? */
			desc = ext3_get_group_desc(inode->i_sb,
						block_group, NULL);
			if (!desc)
				goto make_io;

			bitmap_bh = sb_getblk(inode->i_sb,
					le32_to_cpu(desc->bg_inode_bitmap));
			if (!bitmap_bh)
				goto make_io;

			/*
			 * If the inode bitmap isn't in cache then the
			 * optimisation may end up performing two reads instead
			 * of one, so skip it.
			 */
			if (!buffer_uptodate(bitmap_bh)) {
				brelse(bitmap_bh);
				goto make_io;
			}
			/* Is any other inode in this buffer marked in use? */
			for (i = start; i < start + inodes_per_buffer; i++) {
				if (i == inode_offset)
					continue;
				if (ext3_test_bit(i, bitmap_bh->b_data))
					break;
			}
			brelse(bitmap_bh);
			if (i == start + inodes_per_buffer) {
				/* all other inodes are free, so skip I/O */
				memset(bh->b_data, 0, bh->b_size);
				set_buffer_uptodate(bh);
				unlock_buffer(bh);
				goto has_buffer;
			}
		}

make_io:
		/*
		 * There are other valid inodes in the buffer, this inode
		 * has in-inode xattrs, or we don't have this inode in memory.
		 * Read the block from disk.
		 */
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			ext3_error(inode->i_sb, "ext3_get_inode_loc",
					"unable to read inode block - "
					"inode=%lu, block=%lu",
					inode->i_ino, block);
			brelse(bh);
			return -EIO;
		}
	}
has_buffer:
	/* Success: hand the (elevated-refcount) buffer to the caller. */
	iloc->bh = bh;
	return 0;
}
2568
2569 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2570 {
2571         /* We have all inode data except xattrs in memory here. */
2572         return __ext3_get_inode_loc(inode, iloc,
2573                 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2574 }
2575
2576 void ext3_set_inode_flags(struct inode *inode)
2577 {
2578         unsigned int flags = EXT3_I(inode)->i_flags;
2579
2580         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2581         if (flags & EXT3_SYNC_FL)
2582                 inode->i_flags |= S_SYNC;
2583         if (flags & EXT3_APPEND_FL)
2584                 inode->i_flags |= S_APPEND;
2585         if (flags & EXT3_IMMUTABLE_FL)
2586                 inode->i_flags |= S_IMMUTABLE;
2587         if (flags & EXT3_NOATIME_FL)
2588                 inode->i_flags |= S_NOATIME;
2589         if (flags & EXT3_DIRSYNC_FL)
2590                 inode->i_flags |= S_DIRSYNC;
2591 }
2592
2593 void ext3_read_inode(struct inode * inode)
2594 {
2595         struct ext3_iloc iloc;
2596         struct ext3_inode *raw_inode;
2597         struct ext3_inode_info *ei = EXT3_I(inode);
2598         struct buffer_head *bh;
2599         int block;
2600
2601 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2602         ei->i_acl = EXT3_ACL_NOT_CACHED;
2603         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2604 #endif
2605         ei->i_block_alloc_info = NULL;
2606
2607         if (__ext3_get_inode_loc(inode, &iloc, 0))
2608                 goto bad_inode;
2609         bh = iloc.bh;
2610         raw_inode = ext3_raw_inode(&iloc);
2611         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2612         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2613         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2614         if(!(test_opt (inode->i_sb, NO_UID32))) {
2615                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2616                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2617         }
2618         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2619         inode->i_size = le32_to_cpu(raw_inode->i_size);
2620         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2621         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2622         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2623         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2624
2625         ei->i_state = 0;
2626         ei->i_dir_start_lookup = 0;
2627         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2628         /* We now have enough fields to check if the inode was active or not.
2629          * This is needed because nfsd might try to access dead inodes
2630          * the test is that same one that e2fsck uses
2631          * NeilBrown 1999oct15
2632          */
2633         if (inode->i_nlink == 0) {
2634                 if (inode->i_mode == 0 ||
2635                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2636                         /* this inode is deleted */
2637                         brelse (bh);
2638                         goto bad_inode;
2639                 }
2640                 /* The only unlinked inodes we let through here have
2641                  * valid i_mode and are being read by the orphan
2642                  * recovery code: that's fine, we're about to complete
2643                  * the process of deleting those. */
2644         }
2645         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2646                                          * (for stat), not the fs block
2647                                          * size */  
2648         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2649         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2650 #ifdef EXT3_FRAGMENTS
2651         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2652         ei->i_frag_no = raw_inode->i_frag;
2653         ei->i_frag_size = raw_inode->i_fsize;
2654 #endif
2655         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2656         if (!S_ISREG(inode->i_mode)) {
2657                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2658         } else {
2659                 inode->i_size |=
2660                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2661         }
2662         ei->i_disksize = inode->i_size;
2663         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2664         ei->i_block_group = iloc.block_group;
2665         /*
2666          * NOTE! The in-memory inode i_data array is in little-endian order
2667          * even on big-endian machines: we do NOT byteswap the block numbers!
2668          */
2669         for (block = 0; block < EXT3_N_BLOCKS; block++)
2670                 ei->i_data[block] = raw_inode->i_block[block];
2671         INIT_LIST_HEAD(&ei->i_orphan);
2672
2673         if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2674             EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2675                 /*
2676                  * When mke2fs creates big inodes it does not zero out
2677                  * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2678                  * so ignore those first few inodes.
2679                  */
2680                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2681                 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2682                     EXT3_INODE_SIZE(inode->i_sb))
2683                         goto bad_inode;
2684                 if (ei->i_extra_isize == 0) {
2685                         /* The extra space is currently unused. Use it. */
2686                         ei->i_extra_isize = sizeof(struct ext3_inode) -
2687                                             EXT3_GOOD_OLD_INODE_SIZE;
2688                 } else {
2689                         __le32 *magic = (void *)raw_inode +
2690                                         EXT3_GOOD_OLD_INODE_SIZE +
2691                                         ei->i_extra_isize;
2692                         if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2693                                  ei->i_state |= EXT3_STATE_XATTR;
2694                 }
2695         } else
2696                 ei->i_extra_isize = 0;
2697
2698         if (S_ISREG(inode->i_mode)) {
2699                 inode->i_op = &ext3_file_inode_operations;
2700                 inode->i_fop = &ext3_file_operations;
2701                 ext3_set_aops(inode);
2702         } else if (S_ISDIR(inode->i_mode)) {
2703                 inode->i_op = &ext3_dir_inode_operations;
2704                 inode->i_fop = &ext3_dir_operations;
2705         } else if (S_ISLNK(inode->i_mode)) {
2706                 if (ext3_inode_is_fast_symlink(inode))
2707                         inode->i_op = &ext3_fast_symlink_inode_operations;
2708                 else {
2709                         inode->i_op = &ext3_symlink_inode_operations;
2710                         ext3_set_aops(inode);
2711                 }
2712         } else {
2713                 inode->i_op = &ext3_special_inode_operations;
2714                 if (raw_inode->i_block[0])
2715                         init_special_inode(inode, inode->i_mode,
2716                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2717                 else 
2718                         init_special_inode(inode, inode->i_mode,
2719                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2720         }
2721         brelse (iloc.bh);
2722         ext3_set_inode_flags(inode);
2723         return;
2724
2725 bad_inode:
2726         make_bad_inode(inode);
2727         return;
2728 }
2729
2730 /*
2731  * Post the struct inode info into an on-disk inode location in the
2732  * buffer-cache.  This gobbles the caller's reference to the
2733  * buffer_head in the inode location struct.
2734  *
2735  * The caller must have write access to iloc->bh.
2736  */
static int ext3_do_update_inode(handle_t *handle, 
				struct inode *inode, 
				struct ext3_iloc *iloc)
{
	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh = iloc->bh;
	int err = 0, rc, block;

	/* For fields not tracked in the in-memory inode,
	 * initialise them to zero for new inodes. */
	if (ei->i_state & EXT3_STATE_NEW)
		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);

	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
	if(!(test_opt(inode->i_sb, NO_UID32))) {
		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
/*
 * Fix up interoperability with old kernels. Otherwise, old inodes get
 * re-used with the upper 16 bits of the uid/gid intact
 */
		if(!ei->i_dtime) {
			raw_inode->i_uid_high =
				cpu_to_le16(high_16_bits(inode->i_uid));
			raw_inode->i_gid_high =
				cpu_to_le16(high_16_bits(inode->i_gid));
		} else {
			raw_inode->i_uid_high = 0;
			raw_inode->i_gid_high = 0;
		}
	} else {
		/* NO_UID32 mount: squash ids into the low 16 bits. */
		raw_inode->i_uid_low =
			cpu_to_le16(fs_high2lowuid(inode->i_uid));
		raw_inode->i_gid_low =
			cpu_to_le16(fs_high2lowgid(inode->i_gid));
		raw_inode->i_uid_high = 0;
		raw_inode->i_gid_high = 0;
	}
	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
	/* i_disksize, not i_size: only committed-safe sizes go to disk. */
	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
#ifdef EXT3_FRAGMENTS
	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
	raw_inode->i_frag = ei->i_frag_no;
	raw_inode->i_fsize = ei->i_frag_size;
#endif
	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
	if (!S_ISREG(inode->i_mode)) {
		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
	} else {
		/* Regular files reuse the i_dir_acl slot for the high
		 * 32 bits of the size. */
		raw_inode->i_size_high =
			cpu_to_le32(ei->i_disksize >> 32);
		if (ei->i_disksize > 0x7fffffffULL) {
			struct super_block *sb = inode->i_sb;
			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
					EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
			    EXT3_SB(sb)->s_es->s_rev_level ==
					cpu_to_le32(EXT3_GOOD_OLD_REV)) {
			       /* If this is the first large file
				* created, add a flag to the superblock.
				*/
				err = ext3_journal_get_write_access(handle,
						EXT3_SB(sb)->s_sbh);
				if (err)
					goto out_brelse;
				ext3_update_dynamic_rev(sb);
				EXT3_SET_RO_COMPAT_FEATURE(sb,
					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
				sb->s_dirt = 1;
				/* Make the feature flag durable before the
				 * large file itself can hit disk. */
				handle->h_sync = 1;
				err = ext3_journal_dirty_metadata(handle,
						EXT3_SB(sb)->s_sbh);
			}
		}
	}
	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		/* Old 16-bit device numbers go in block[0]; new 32-bit
		 * encodings go in block[1] with block[0] zeroed. */
		if (old_valid_dev(inode->i_rdev)) {
			raw_inode->i_block[0] =
				cpu_to_le32(old_encode_dev(inode->i_rdev));
			raw_inode->i_block[1] = 0;
		} else {
			raw_inode->i_block[0] = 0;
			raw_inode->i_block[1] =
				cpu_to_le32(new_encode_dev(inode->i_rdev));
			raw_inode->i_block[2] = 0;
		}
	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
		raw_inode->i_block[block] = ei->i_data[block];

	if (ei->i_extra_isize)
		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);

	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
	rc = ext3_journal_dirty_metadata(handle, bh);
	if (!err)
		err = rc;
	ei->i_state &= ~EXT3_STATE_NEW;

out_brelse:
	/* Consumes the caller's reference on iloc->bh in all cases. */
	brelse (bh);
	ext3_std_error(inode->i_sb, err);
	return err;
}
2847
2848 /*
2849  * ext3_write_inode()
2850  *
2851  * We are called from a few places:
2852  *
2853  * - Within generic_file_write() for O_SYNC files.
2854  *   Here, there will be no transaction running. We wait for any running
2855  *   transaction to commit.
2856  *
2857  * - Within sys_sync(), kupdate and such.
2858  *   We wait on commit, if told to.
2859  *
2860  * - Within prune_icache() (PF_MEMALLOC == true)
2861  *   Here we simply return.  We can't afford to block kswapd on the
2862  *   journal commit.
2863  *
2864  * In all cases it is actually safe for us to return without doing anything,
2865  * because the inode has been copied into a raw inode buffer in
2866  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2867  * knfsd.
2868  *
2869  * Note that we are absolutely dependent upon all inode dirtiers doing the
2870  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2871  * which we are interested.
2872  *
2873  * It would be a bug for them to not do this.  The code:
2874  *
2875  *      mark_inode_dirty(inode)
2876  *      stuff();
2877  *      inode->i_size = expr;
2878  *
2879  * is in error because a kswapd-driven write_inode() could occur while
2880  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2881  * will no longer be on the superblock's dirty inode list.
2882  */
2883 int ext3_write_inode(struct inode *inode, int wait)
2884 {
2885         if (current->flags & PF_MEMALLOC)
2886                 return 0;
2887
2888         if (ext3_journal_current_handle()) {
2889                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2890                 dump_stack();
2891                 return -EIO;
2892         }
2893
2894         if (!wait)
2895                 return 0;
2896
2897         return ext3_force_commit(inode->i_sb);
2898 }
2899
2900 /*
2901  * ext3_setattr()
2902  *
2903  * Called from notify_change.
2904  *
2905  * We want to trap VFS attempts to truncate the file as soon as
2906  * possible.  In particular, we want to make sure that when the VFS
2907  * shrinks i_size, we put the inode on the orphan list and modify
2908  * i_disksize immediately, so that during the subsequent flushing of
2909  * dirty pages and freeing of disk blocks, we can guarantee that any
2910  * commit will leave the blocks being flushed in an unused state on
2911  * disk.  (On recovery, the inode will get truncated and the blocks will
2912  * be freed, so we have a strong guarantee that no future commit will
2913  * leave these blocks visible to the user.)  
2914  *
2915  * Called with inode->sem down.
2916  */
int ext3_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error, rc = 0;
	const unsigned int ia_valid = attr->ia_valid;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		handle_t *handle;

		/* (user+group)*(old+new) structure, inode write (sb,
		 * inode block, ? - but truncate inode update has it) */
		handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
					EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}
		/* Move the quota charge to the new owner first. */
		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
		if (error) {
			ext3_journal_stop(handle);
			return error;
		}
		/* Update corresponding info in inode so that everything is in
		 * one transaction */
		if (attr->ia_valid & ATTR_UID)
			inode->i_uid = attr->ia_uid;
		if (attr->ia_valid & ATTR_GID)
			inode->i_gid = attr->ia_gid;
		error = ext3_mark_inode_dirty(handle, inode);
		ext3_journal_stop(handle);
	}

	if (S_ISREG(inode->i_mode) &&
	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
		handle_t *handle;

		handle = ext3_journal_start(inode, 3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}

		/* Orphan-protect the shrink and commit the new disk size
		 * before any blocks are freed (see comment above). */
		error = ext3_orphan_add(handle, inode);
		EXT3_I(inode)->i_disksize = attr->ia_size;
		rc = ext3_mark_inode_dirty(handle, inode);
		if (!error)
			error = rc;
		ext3_journal_stop(handle);
	}

	/* inode_setattr() performs the actual truncate, if any. */
	rc = inode_setattr(inode, attr);

	/* If inode_setattr's call to ext3_truncate failed to get a
	 * transaction handle at all, we need to clean up the in-core
	 * orphan list manually. */
	if (inode->i_nlink)
		ext3_orphan_del(NULL, inode);

	if (!rc && (ia_valid & ATTR_MODE))
		rc = ext3_acl_chmod(inode);

err_out:
	ext3_std_error(inode->i_sb, error);
	if (!error)
		error = rc;
	return error;
}
2989
2990
2991 /*
2992  * akpm: how many blocks doth make a writepage()?
2993  *
2994  * With N blocks per page, it may be:
2995  * N data blocks
2996  * 2 indirect block
2997  * 2 dindirect
2998  * 1 tindirect
2999  * N+5 bitmap blocks (from the above)
3000  * N+5 group descriptor summary blocks
3001  * 1 inode block
3002  * 1 superblock.
 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3004  *
3005  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3006  *
3007  * With ordered or writeback data it's the same, less the N data blocks.
3008  *
3009  * If the inode's direct blocks can hold an integral number of pages then a
3010  * page cannot straddle two indirect blocks, and we can only touch one indirect
3011  * and dindirect block, and the "5" above becomes "3".
3012  *
3013  * This still overestimates under most circumstances.  If we were to pass the
3014  * start and end offsets in here as well we could do block_to_path() on each
3015  * block and work out the exact number of indirects which are touched.  Pah.
3016  */
3017
3018 static int ext3_writepage_trans_blocks(struct inode *inode)
3019 {
3020         int bpp = ext3_journal_blocks_per_page(inode);
3021         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3022         int ret;
3023
3024         if (ext3_should_journal_data(inode))
3025                 ret = 3 * (bpp + indirects) + 2;
3026         else
3027                 ret = 2 * (bpp + indirects) + 2;
3028
3029 #ifdef CONFIG_QUOTA
3030         /* We know that structure was already allocated during DQUOT_INIT so
3031          * we will be updating only the data blocks + inodes */
3032         ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
3033 #endif
3034
3035         return ret;
3036 }
3037
3038 /*
3039  * The caller must have previously called ext3_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
3041  */
3042 int ext3_mark_iloc_dirty(handle_t *handle,
3043                 struct inode *inode, struct ext3_iloc *iloc)
3044 {
3045         int err = 0;
3046
3047         /* the do_update_inode consumes one bh->b_count */
3048         get_bh(iloc->bh);
3049
3050         /* ext3_do_update_inode() does journal_dirty_metadata */
3051         err = ext3_do_update_inode(handle, inode, iloc);
3052         put_bh(iloc->bh);
3053         return err;
3054 }
3055
3056 /* 
 * On success, we end up with an outstanding reference count against
3058  * iloc->bh.  This _must_ be cleaned up later. 
3059  */
3060
3061 int
3062 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
3063                          struct ext3_iloc *iloc)
3064 {
3065         int err = 0;
3066         if (handle) {
3067                 err = ext3_get_inode_loc(inode, iloc);
3068                 if (!err) {
3069                         BUFFER_TRACE(iloc->bh, "get_write_access");
3070                         err = ext3_journal_get_write_access(handle, iloc->bh);
3071                         if (err) {
3072                                 brelse(iloc->bh);
3073                                 iloc->bh = NULL;
3074                         }
3075                 }
3076         }
3077         ext3_std_error(inode->i_sb, err);
3078         return err;
3079 }
3080
3081 /*
3082  * akpm: What we do here is to mark the in-core inode as clean
3083  * with respect to inode dirtiness (it may still be data-dirty).
3084  * This means that the in-core inode may be reaped by prune_icache
3085  * without having to perform any I/O.  This is a very good thing,
3086  * because *any* task may call prune_icache - even ones which
3087  * have a transaction open against a different journal.
3088  *
3089  * Is this cheating?  Not really.  Sure, we haven't written the
3090  * inode out, but prune_icache isn't a user-visible syncing function.
3091  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3092  * we start and wait on commits.
3093  *
3094  * Is this efficient/effective?  Well, we're being nice to the system
3095  * by cleaning up our inodes proactively so they can be reaped
3096  * without I/O.  But we are potentially leaving up to five seconds'
3097  * worth of inodes floating about which prune_icache wants us to
3098  * write out.  One way to fix that would be to get prune_icache()
3099  * to do a write_super() to free up some memory.  It has the desired
3100  * effect.
3101  */
3102 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3103 {
3104         struct ext3_iloc iloc;
3105         int err;
3106
3107         might_sleep();
3108         err = ext3_reserve_inode_write(handle, inode, &iloc);
3109         if (!err)
3110                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3111         return err;
3112 }
3113
3114 /*
3115  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3116  *
3117  * We're really interested in the case where a file is being extended.
3118  * i_size has been changed by generic_commit_write() and we thus need
3119  * to include the updated inode in the current transaction.
3120  *
3121  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3122  * are allocated to the file.
3123  *
3124  * If the inode is marked synchronous, we don't honour that here - doing
3125  * so would cause a commit on atime updates, which we don't bother doing.
3126  * We handle synchronous inodes at the highest possible level.
3127  */
3128 void ext3_dirty_inode(struct inode *inode)
3129 {
3130         handle_t *current_handle = ext3_journal_current_handle();
3131         handle_t *handle;
3132
3133         handle = ext3_journal_start(inode, 2);
3134         if (IS_ERR(handle))
3135                 goto out;
3136         if (current_handle &&
3137                 current_handle->h_transaction != handle->h_transaction) {
3138                 /* This task has a transaction open against a different fs */
3139                 printk(KERN_EMERG "%s: transactions do not match!\n",
3140                        __FUNCTION__);
3141         } else {
3142                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
3143                                 current_handle);
3144                 ext3_mark_inode_dirty(handle, inode);
3145         }
3146         ext3_journal_stop(handle);
3147 out:
3148         return;
3149 }
3150
#ifdef AKPM
/* 
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * NOTE(review): guarded by #ifdef AKPM, which appears to never be
 * defined in this tree - presumably kept for reference only; confirm
 * before relying on it.
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;

	int err = 0;
	if (handle) {
		err = ext3_get_inode_loc(inode, &iloc);
		if (!err) {
			BUFFER_TRACE(iloc.bh, "get_write_access");
			err = journal_get_write_access(handle, iloc.bh);
			if (!err)
				err = ext3_journal_dirty_metadata(handle, 
								  iloc.bh);
			brelse(iloc.bh);
		}
	}
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
3180
/*
 * ext3_change_inode_journal_flag - toggle per-inode data journaling
 * @inode: inode whose EXT3_JOURNAL_DATA_FL flag is set or cleared
 * @val:   nonzero to enable data journaling, zero to disable it
 *
 * Returns 0 on success or a negative errno.
 */
int ext3_change_inode_journal_flag(struct inode *inode, int val)
{
	journal_t *journal;
	handle_t *handle;
	int err;

	/*
	 * We have to be very careful here: changing a data block's
	 * journaling status dynamically is dangerous.  If we write a
	 * data block to the journal, change the status and then delete
	 * that block, we risk forgetting to revoke the old log record
	 * from the journal and so a subsequent replay can corrupt data.
	 * So, first we make sure that the journal is empty and that
	 * nobody is changing anything.
	 */

	journal = EXT3_JOURNAL(inode);
	if (is_journal_aborted(journal) || IS_RDONLY(inode))
		return -EROFS;

	journal_lock_updates(journal);
	/* NOTE(review): journal_flush()'s return value is ignored here;
	 * verify whether a flush failure is handled elsewhere (e.g. via
	 * journal abort) before changing this. */
	journal_flush(journal);

	/*
	 * OK, there are no updates running now, and all cached data is
	 * synced to disk.  We are now in a completely consistent state
	 * which doesn't have anything in the journal, and we know that
	 * no filesystem updates are running, so it is safe to modify
	 * the inode's in-core data-journaling state flag now.
	 */

	if (val)
		EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
	else
		EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
	/* Switch the address_space operations to match the new mode. */
	ext3_set_aops(inode);

	journal_unlock_updates(journal);

	/* Finally we can mark the inode as dirty. */

	handle = ext3_journal_start(inode, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = ext3_mark_inode_dirty(handle, inode);
	/* h_sync: request a synchronous commit of this transaction. */
	handle->h_sync = 1;
	ext3_journal_stop(handle);
	ext3_std_error(inode->i_sb, err);

	return err;
}