85a05f1202497b55cc65595fc8bdb916a28774d3
[safe/jmp/linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30
31 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
32 #include <cluster/masklog.h>
33
34 #include "ocfs2.h"
35
36 #include "alloc.h"
37 #include "dlmglue.h"
38 #include "extent_map.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "sysfile.h"
44 #include "file.h"
45 #include "super.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
50 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
51
52 /*
53  * Structures which describe a path through a btree, and functions to
54  * manipulate them.
55  *
56  * The idea here is to be as generic as possible with the tree
57  * manipulation code.
58  */
59 struct ocfs2_path_item {
60         struct buffer_head              *bh;
61         struct ocfs2_extent_list        *el;
62 };
63
64 #define OCFS2_MAX_PATH_DEPTH    5
65
66 struct ocfs2_path {
67         int                     p_tree_depth;
68         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
69 };
70
71 #define path_root_bh(_path) ((_path)->p_node[0].bh)
72 #define path_root_el(_path) ((_path)->p_node[0].el)
73 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
74 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
75 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
76
77 /*
78  * Reset the actual path elements so that we can re-use the structure
79  * to build another path. Generally, this involves freeing the buffer
80  * heads.
81  */
82 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
83 {
84         int i, start = 0, depth = 0;
85         struct ocfs2_path_item *node;
86
87         if (keep_root)
88                 start = 1;
89
90         for(i = start; i < path_num_items(path); i++) {
91                 node = &path->p_node[i];
92
93                 brelse(node->bh);
94                 node->bh = NULL;
95                 node->el = NULL;
96         }
97
98         /*
99          * Tree depth may change during truncate, or insert. If we're
100          * keeping the root extent list, then make sure that our path
101          * structure reflects the proper depth.
102          */
103         if (keep_root)
104                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
105
106         path->p_tree_depth = depth;
107 }
108
109 static void ocfs2_free_path(struct ocfs2_path *path)
110 {
111         if (path) {
112                 ocfs2_reinit_path(path, 0);
113                 kfree(path);
114         }
115 }
116
117 /*
118  * Make the *dest path the same as src and re-initialize src path to
119  * have a root only.
120  */
121 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
122 {
123         int i;
124
125         BUG_ON(path_root_bh(dest) != path_root_bh(src));
126
127         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
128                 brelse(dest->p_node[i].bh);
129
130                 dest->p_node[i].bh = src->p_node[i].bh;
131                 dest->p_node[i].el = src->p_node[i].el;
132
133                 src->p_node[i].bh = NULL;
134                 src->p_node[i].el = NULL;
135         }
136 }
137
138 /*
139  * Insert an extent block at given index.
140  *
141  * This will not take an additional reference on eb_bh.
142  */
143 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
144                                         struct buffer_head *eb_bh)
145 {
146         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
147
148         /*
149          * Right now, no root bh is an extent block, so this helps
150          * catch code errors with dinode trees. The assertion can be
151          * safely removed if we ever need to insert extent block
152          * structures at the root.
153          */
154         BUG_ON(index == 0);
155
156         path->p_node[index].bh = eb_bh;
157         path->p_node[index].el = &eb->h_list;
158 }
159
160 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
161                                          struct ocfs2_extent_list *root_el)
162 {
163         struct ocfs2_path *path;
164
165         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
166
167         path = kzalloc(sizeof(*path), GFP_NOFS);
168         if (path) {
169                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
170                 get_bh(root_bh);
171                 path_root_bh(path) = root_bh;
172                 path_root_el(path) = root_el;
173         }
174
175         return path;
176 }
177
178 /*
179  * Allocate and initialize a new path based on a disk inode tree.
180  */
181 static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
182 {
183         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
184         struct ocfs2_extent_list *el = &di->id2.i_list;
185
186         return ocfs2_new_path(di_bh, el);
187 }
188
189 /*
190  * Convenience function to journal all components in a path.
191  */
192 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
193                                      struct ocfs2_path *path)
194 {
195         int i, ret = 0;
196
197         if (!path)
198                 goto out;
199
200         for(i = 0; i < path_num_items(path); i++) {
201                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
202                                            OCFS2_JOURNAL_ACCESS_WRITE);
203                 if (ret < 0) {
204                         mlog_errno(ret);
205                         goto out;
206                 }
207         }
208
209 out:
210         return ret;
211 }
212
213 enum ocfs2_contig_type {
214         CONTIG_NONE = 0,
215         CONTIG_LEFT,
216         CONTIG_RIGHT
217 };
218
219 static int ocfs2_block_extent_contig(struct super_block *sb,
220                                      struct ocfs2_extent_rec *ext,
221                                      u64 blkno)
222 {
223         return blkno == (le64_to_cpu(ext->e_blkno) +
224                          ocfs2_clusters_to_blocks(sb,
225                                                   le32_to_cpu(ext->e_clusters)));
226 }
227
228 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
229                                   struct ocfs2_extent_rec *right)
230 {
231         return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
232                 le32_to_cpu(right->e_cpos));
233 }
234
235 static enum ocfs2_contig_type
236         ocfs2_extent_contig(struct inode *inode,
237                             struct ocfs2_extent_rec *ext,
238                             struct ocfs2_extent_rec *insert_rec)
239 {
240         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
241
242         if (ocfs2_extents_adjacent(ext, insert_rec) &&
243             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
244                         return CONTIG_RIGHT;
245
246         blkno = le64_to_cpu(ext->e_blkno);
247         if (ocfs2_extents_adjacent(insert_rec, ext) &&
248             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
249                 return CONTIG_LEFT;
250
251         return CONTIG_NONE;
252 }
253
254 /*
255  * NOTE: We can have pretty much any combination of contiguousness and
256  * appending.
257  *
258  * The usefulness of APPEND_TAIL is more in that it lets us know that
259  * we'll have to update the path to that leaf.
260  */
261 enum ocfs2_append_type {
262         APPEND_NONE = 0,
263         APPEND_TAIL,
264 };
265
266 struct ocfs2_insert_type {
267         enum ocfs2_append_type  ins_appending;
268         enum ocfs2_contig_type  ins_contig;
269         int                     ins_contig_index;
270         int                     ins_free_records;
271         int                     ins_tree_depth;
272 };
273
274 /*
275  * How many free extents have we got before we need more meta data?
276  */
277 int ocfs2_num_free_extents(struct ocfs2_super *osb,
278                            struct inode *inode,
279                            struct ocfs2_dinode *fe)
280 {
281         int retval;
282         struct ocfs2_extent_list *el;
283         struct ocfs2_extent_block *eb;
284         struct buffer_head *eb_bh = NULL;
285
286         mlog_entry_void();
287
288         if (!OCFS2_IS_VALID_DINODE(fe)) {
289                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
290                 retval = -EIO;
291                 goto bail;
292         }
293
294         if (fe->i_last_eb_blk) {
295                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
296                                           &eb_bh, OCFS2_BH_CACHED, inode);
297                 if (retval < 0) {
298                         mlog_errno(retval);
299                         goto bail;
300                 }
301                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
302                 el = &eb->h_list;
303         } else
304                 el = &fe->id2.i_list;
305
306         BUG_ON(el->l_tree_depth != 0);
307
308         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
309 bail:
310         if (eb_bh)
311                 brelse(eb_bh);
312
313         mlog_exit(retval);
314         return retval;
315 }
316
317 /* expects array to already be allocated
318  *
319  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
320  * l_count for you
321  */
322 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
323                                      handle_t *handle,
324                                      struct inode *inode,
325                                      int wanted,
326                                      struct ocfs2_alloc_context *meta_ac,
327                                      struct buffer_head *bhs[])
328 {
329         int count, status, i;
330         u16 suballoc_bit_start;
331         u32 num_got;
332         u64 first_blkno;
333         struct ocfs2_extent_block *eb;
334
335         mlog_entry_void();
336
337         count = 0;
338         while (count < wanted) {
339                 status = ocfs2_claim_metadata(osb,
340                                               handle,
341                                               meta_ac,
342                                               wanted - count,
343                                               &suballoc_bit_start,
344                                               &num_got,
345                                               &first_blkno);
346                 if (status < 0) {
347                         mlog_errno(status);
348                         goto bail;
349                 }
350
351                 for(i = count;  i < (num_got + count); i++) {
352                         bhs[i] = sb_getblk(osb->sb, first_blkno);
353                         if (bhs[i] == NULL) {
354                                 status = -EIO;
355                                 mlog_errno(status);
356                                 goto bail;
357                         }
358                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
359
360                         status = ocfs2_journal_access(handle, inode, bhs[i],
361                                                       OCFS2_JOURNAL_ACCESS_CREATE);
362                         if (status < 0) {
363                                 mlog_errno(status);
364                                 goto bail;
365                         }
366
367                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
368                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
369                         /* Ok, setup the minimal stuff here. */
370                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
371                         eb->h_blkno = cpu_to_le64(first_blkno);
372                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
373
374 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
375                         /* we always use slot zero's suballocator */
376                         eb->h_suballoc_slot = 0;
377 #else
378                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
379 #endif
380                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
381                         eb->h_list.l_count =
382                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
383
384                         suballoc_bit_start++;
385                         first_blkno++;
386
387                         /* We'll also be dirtied by the caller, so
388                          * this isn't absolutely necessary. */
389                         status = ocfs2_journal_dirty(handle, bhs[i]);
390                         if (status < 0) {
391                                 mlog_errno(status);
392                                 goto bail;
393                         }
394                 }
395
396                 count += num_got;
397         }
398
399         status = 0;
400 bail:
401         if (status < 0) {
402                 for(i = 0; i < wanted; i++) {
403                         if (bhs[i])
404                                 brelse(bhs[i]);
405                         bhs[i] = NULL;
406                 }
407         }
408         mlog_exit(status);
409         return status;
410 }
411
412 /*
413  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
414  *
415  * Returns the sum of the rightmost extent rec logical offset and
416  * cluster count.
417  *
418  * ocfs2_add_branch() uses this to determine what logical cluster
419  * value should be populated into the leftmost new branch records.
420  *
421  * ocfs2_shift_tree_depth() uses this to determine the # clusters
422  * value for the new topmost tree record.
423  */
424 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
425 {
426         int i;
427
428         i = le16_to_cpu(el->l_next_free_rec) - 1;
429
430         return le32_to_cpu(el->l_recs[i].e_cpos) +
431                 le32_to_cpu(el->l_recs[i].e_clusters);
432 }
433
434 /*
435  * Add an entire tree branch to our inode. eb_bh is the extent block
436  * to start at, if we don't want to start the branch at the dinode
437  * structure.
438  *
439  * last_eb_bh is required as we have to update it's next_leaf pointer
440  * for the new last extent block.
441  *
442  * the new branch will be 'empty' in the sense that every block will
443  * contain a single record with e_clusters == 0.
444  */
445 static int ocfs2_add_branch(struct ocfs2_super *osb,
446                             handle_t *handle,
447                             struct inode *inode,
448                             struct buffer_head *fe_bh,
449                             struct buffer_head *eb_bh,
450                             struct buffer_head *last_eb_bh,
451                             struct ocfs2_alloc_context *meta_ac)
452 {
453         int status, new_blocks, i;
454         u64 next_blkno, new_last_eb_blk;
455         struct buffer_head *bh;
456         struct buffer_head **new_eb_bhs = NULL;
457         struct ocfs2_dinode *fe;
458         struct ocfs2_extent_block *eb;
459         struct ocfs2_extent_list  *eb_el;
460         struct ocfs2_extent_list  *el;
461         u32 new_cpos;
462
463         mlog_entry_void();
464
465         BUG_ON(!last_eb_bh);
466
467         fe = (struct ocfs2_dinode *) fe_bh->b_data;
468
469         if (eb_bh) {
470                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
471                 el = &eb->h_list;
472         } else
473                 el = &fe->id2.i_list;
474
475         /* we never add a branch to a leaf. */
476         BUG_ON(!el->l_tree_depth);
477
478         new_blocks = le16_to_cpu(el->l_tree_depth);
479
480         /* allocate the number of new eb blocks we need */
481         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
482                              GFP_KERNEL);
483         if (!new_eb_bhs) {
484                 status = -ENOMEM;
485                 mlog_errno(status);
486                 goto bail;
487         }
488
489         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
490                                            meta_ac, new_eb_bhs);
491         if (status < 0) {
492                 mlog_errno(status);
493                 goto bail;
494         }
495
496         eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
497         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
498
499         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
500          * linked with the rest of the tree.
501          * conversly, new_eb_bhs[0] is the new bottommost leaf.
502          *
503          * when we leave the loop, new_last_eb_blk will point to the
504          * newest leaf, and next_blkno will point to the topmost extent
505          * block. */
506         next_blkno = new_last_eb_blk = 0;
507         for(i = 0; i < new_blocks; i++) {
508                 bh = new_eb_bhs[i];
509                 eb = (struct ocfs2_extent_block *) bh->b_data;
510                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
511                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
512                         status = -EIO;
513                         goto bail;
514                 }
515                 eb_el = &eb->h_list;
516
517                 status = ocfs2_journal_access(handle, inode, bh,
518                                               OCFS2_JOURNAL_ACCESS_CREATE);
519                 if (status < 0) {
520                         mlog_errno(status);
521                         goto bail;
522                 }
523
524                 eb->h_next_leaf_blk = 0;
525                 eb_el->l_tree_depth = cpu_to_le16(i);
526                 eb_el->l_next_free_rec = cpu_to_le16(1);
527                 /*
528                  * This actually counts as an empty extent as
529                  * c_clusters == 0
530                  */
531                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
532                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
533                 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
534                 if (!eb_el->l_tree_depth)
535                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
536
537                 status = ocfs2_journal_dirty(handle, bh);
538                 if (status < 0) {
539                         mlog_errno(status);
540                         goto bail;
541                 }
542
543                 next_blkno = le64_to_cpu(eb->h_blkno);
544         }
545
546         /* This is a bit hairy. We want to update up to three blocks
547          * here without leaving any of them in an inconsistent state
548          * in case of error. We don't have to worry about
549          * journal_dirty erroring as it won't unless we've aborted the
550          * handle (in which case we would never be here) so reserving
551          * the write with journal_access is all we need to do. */
552         status = ocfs2_journal_access(handle, inode, last_eb_bh,
553                                       OCFS2_JOURNAL_ACCESS_WRITE);
554         if (status < 0) {
555                 mlog_errno(status);
556                 goto bail;
557         }
558         status = ocfs2_journal_access(handle, inode, fe_bh,
559                                       OCFS2_JOURNAL_ACCESS_WRITE);
560         if (status < 0) {
561                 mlog_errno(status);
562                 goto bail;
563         }
564         if (eb_bh) {
565                 status = ocfs2_journal_access(handle, inode, eb_bh,
566                                               OCFS2_JOURNAL_ACCESS_WRITE);
567                 if (status < 0) {
568                         mlog_errno(status);
569                         goto bail;
570                 }
571         }
572
573         /* Link the new branch into the rest of the tree (el will
574          * either be on the fe, or the extent block passed in. */
575         i = le16_to_cpu(el->l_next_free_rec);
576         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
577         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
578         el->l_recs[i].e_clusters = 0;
579         le16_add_cpu(&el->l_next_free_rec, 1);
580
581         /* fe needs a new last extent block pointer, as does the
582          * next_leaf on the previously last-extent-block. */
583         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
584
585         eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
586         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
587
588         status = ocfs2_journal_dirty(handle, last_eb_bh);
589         if (status < 0)
590                 mlog_errno(status);
591         status = ocfs2_journal_dirty(handle, fe_bh);
592         if (status < 0)
593                 mlog_errno(status);
594         if (eb_bh) {
595                 status = ocfs2_journal_dirty(handle, eb_bh);
596                 if (status < 0)
597                         mlog_errno(status);
598         }
599
600         status = 0;
601 bail:
602         if (new_eb_bhs) {
603                 for (i = 0; i < new_blocks; i++)
604                         if (new_eb_bhs[i])
605                                 brelse(new_eb_bhs[i]);
606                 kfree(new_eb_bhs);
607         }
608
609         mlog_exit(status);
610         return status;
611 }
612
613 /*
614  * adds another level to the allocation tree.
615  * returns back the new extent block so you can add a branch to it
616  * after this call.
617  */
618 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
619                                   handle_t *handle,
620                                   struct inode *inode,
621                                   struct buffer_head *fe_bh,
622                                   struct ocfs2_alloc_context *meta_ac,
623                                   struct buffer_head **ret_new_eb_bh)
624 {
625         int status, i;
626         u32 new_clusters;
627         struct buffer_head *new_eb_bh = NULL;
628         struct ocfs2_dinode *fe;
629         struct ocfs2_extent_block *eb;
630         struct ocfs2_extent_list  *fe_el;
631         struct ocfs2_extent_list  *eb_el;
632
633         mlog_entry_void();
634
635         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
636                                            &new_eb_bh);
637         if (status < 0) {
638                 mlog_errno(status);
639                 goto bail;
640         }
641
642         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
643         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
644                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
645                 status = -EIO;
646                 goto bail;
647         }
648
649         eb_el = &eb->h_list;
650         fe = (struct ocfs2_dinode *) fe_bh->b_data;
651         fe_el = &fe->id2.i_list;
652
653         status = ocfs2_journal_access(handle, inode, new_eb_bh,
654                                       OCFS2_JOURNAL_ACCESS_CREATE);
655         if (status < 0) {
656                 mlog_errno(status);
657                 goto bail;
658         }
659
660         /* copy the fe data into the new extent block */
661         eb_el->l_tree_depth = fe_el->l_tree_depth;
662         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
663         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
664                 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
665                 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
666                 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
667         }
668
669         status = ocfs2_journal_dirty(handle, new_eb_bh);
670         if (status < 0) {
671                 mlog_errno(status);
672                 goto bail;
673         }
674
675         status = ocfs2_journal_access(handle, inode, fe_bh,
676                                       OCFS2_JOURNAL_ACCESS_WRITE);
677         if (status < 0) {
678                 mlog_errno(status);
679                 goto bail;
680         }
681
682         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
683
684         /* update fe now */
685         le16_add_cpu(&fe_el->l_tree_depth, 1);
686         fe_el->l_recs[0].e_cpos = 0;
687         fe_el->l_recs[0].e_blkno = eb->h_blkno;
688         fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
689         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
690                 fe_el->l_recs[i].e_cpos = 0;
691                 fe_el->l_recs[i].e_clusters = 0;
692                 fe_el->l_recs[i].e_blkno = 0;
693         }
694         fe_el->l_next_free_rec = cpu_to_le16(1);
695
696         /* If this is our 1st tree depth shift, then last_eb_blk
697          * becomes the allocated extent block */
698         if (fe_el->l_tree_depth == cpu_to_le16(1))
699                 fe->i_last_eb_blk = eb->h_blkno;
700
701         status = ocfs2_journal_dirty(handle, fe_bh);
702         if (status < 0) {
703                 mlog_errno(status);
704                 goto bail;
705         }
706
707         *ret_new_eb_bh = new_eb_bh;
708         new_eb_bh = NULL;
709         status = 0;
710 bail:
711         if (new_eb_bh)
712                 brelse(new_eb_bh);
713
714         mlog_exit(status);
715         return status;
716 }
717
718 /*
719  * Should only be called when there is no space left in any of the
720  * leaf nodes. What we want to do is find the lowest tree depth
721  * non-leaf extent block with room for new records. There are three
722  * valid results of this search:
723  *
724  * 1) a lowest extent block is found, then we pass it back in
725  *    *lowest_eb_bh and return '0'
726  *
727  * 2) the search fails to find anything, but the dinode has room. We
728  *    pass NULL back in *lowest_eb_bh, but still return '0'
729  *
730  * 3) the search fails to find anything AND the dinode is full, in
731  *    which case we return > 0
732  *
733  * return status < 0 indicates an error.
734  */
735 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
736                                     struct inode *inode,
737                                     struct buffer_head *fe_bh,
738                                     struct buffer_head **target_bh)
739 {
740         int status = 0, i;
741         u64 blkno;
742         struct ocfs2_dinode *fe;
743         struct ocfs2_extent_block *eb;
744         struct ocfs2_extent_list  *el;
745         struct buffer_head *bh = NULL;
746         struct buffer_head *lowest_bh = NULL;
747
748         mlog_entry_void();
749
750         *target_bh = NULL;
751
752         fe = (struct ocfs2_dinode *) fe_bh->b_data;
753         el = &fe->id2.i_list;
754
755         while(le16_to_cpu(el->l_tree_depth) > 1) {
756                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
757                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
758                                     "extent list (next_free_rec == 0)",
759                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
760                         status = -EIO;
761                         goto bail;
762                 }
763                 i = le16_to_cpu(el->l_next_free_rec) - 1;
764                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
765                 if (!blkno) {
766                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
767                                     "list where extent # %d has no physical "
768                                     "block start",
769                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
770                         status = -EIO;
771                         goto bail;
772                 }
773
774                 if (bh) {
775                         brelse(bh);
776                         bh = NULL;
777                 }
778
779                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
780                                           inode);
781                 if (status < 0) {
782                         mlog_errno(status);
783                         goto bail;
784                 }
785
786                 eb = (struct ocfs2_extent_block *) bh->b_data;
787                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
788                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
789                         status = -EIO;
790                         goto bail;
791                 }
792                 el = &eb->h_list;
793
794                 if (le16_to_cpu(el->l_next_free_rec) <
795                     le16_to_cpu(el->l_count)) {
796                         if (lowest_bh)
797                                 brelse(lowest_bh);
798                         lowest_bh = bh;
799                         get_bh(lowest_bh);
800                 }
801         }
802
803         /* If we didn't find one and the fe doesn't have any room,
804          * then return '1' */
805         if (!lowest_bh
806             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
807                 status = 1;
808
809         *target_bh = lowest_bh;
810 bail:
811         if (bh)
812                 brelse(bh);
813
814         mlog_exit(status);
815         return status;
816 }
817
818 static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
819 {
820         return !rec->e_clusters;
821 }
822
823 /*
824  * This function will discard the rightmost extent record.
825  */
826 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
827 {
828         int next_free = le16_to_cpu(el->l_next_free_rec);
829         int count = le16_to_cpu(el->l_count);
830         unsigned int num_bytes;
831
832         BUG_ON(!next_free);
833         /* This will cause us to go off the end of our extent list. */
834         BUG_ON(next_free >= count);
835
836         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
837
838         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
839 }
840
841 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
842                               struct ocfs2_extent_rec *insert_rec)
843 {
844         int i, insert_index, next_free, has_empty, num_bytes;
845         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
846         struct ocfs2_extent_rec *rec;
847
848         next_free = le16_to_cpu(el->l_next_free_rec);
849         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
850
851         BUG_ON(!next_free);
852
853         /* The tree code before us didn't allow enough room in the leaf. */
854         if (el->l_next_free_rec == el->l_count && !has_empty)
855                 BUG();
856
857         /*
858          * The easiest way to approach this is to just remove the
859          * empty extent and temporarily decrement next_free.
860          */
861         if (has_empty) {
862                 /*
863                  * If next_free was 1 (only an empty extent), this
864                  * loop won't execute, which is fine. We still want
865                  * the decrement above to happen.
866                  */
867                 for(i = 0; i < (next_free - 1); i++)
868                         el->l_recs[i] = el->l_recs[i+1];
869
870                 next_free--;
871         }
872
873         /*
874          * Figure out what the new record index should be.
875          */
876         for(i = 0; i < next_free; i++) {
877                 rec = &el->l_recs[i];
878
879                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
880                         break;
881         }
882         insert_index = i;
883
884         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
885              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
886
887         BUG_ON(insert_index < 0);
888         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
889         BUG_ON(insert_index > next_free);
890
891         /*
892          * No need to memmove if we're just adding to the tail.
893          */
894         if (insert_index != next_free) {
895                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
896
897                 num_bytes = next_free - insert_index;
898                 num_bytes *= sizeof(struct ocfs2_extent_rec);
899                 memmove(&el->l_recs[insert_index + 1],
900                         &el->l_recs[insert_index],
901                         num_bytes);
902         }
903
904         /*
905          * Either we had an empty extent, and need to re-increment or
906          * there was no empty extent on a non full rightmost leaf node,
907          * in which case we still need to increment.
908          */
909         next_free++;
910         el->l_next_free_rec = cpu_to_le16(next_free);
911         /*
912          * Make sure none of the math above just messed up our tree.
913          */
914         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
915
916         el->l_recs[insert_index] = *insert_rec;
917
918 }
919
920 /*
921  * Create an empty extent record .
922  *
923  * l_next_free_rec may be updated.
924  *
925  * If an empty extent already exists do nothing.
926  */
927 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
928 {
929         int next_free = le16_to_cpu(el->l_next_free_rec);
930
931         if (next_free == 0)
932                 goto set_and_inc;
933
934         if (ocfs2_is_empty_extent(&el->l_recs[0]))
935                 return;
936
937         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
938                         "Asked to create an empty extent in a full list:\n"
939                         "count = %u, tree depth = %u",
940                         le16_to_cpu(el->l_count),
941                         le16_to_cpu(el->l_tree_depth));
942
943         ocfs2_shift_records_right(el);
944
945 set_and_inc:
946         le16_add_cpu(&el->l_next_free_rec, 1);
947         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
948 }
949
950 /*
951  * For a rotation which involves two leaf nodes, the "root node" is
952  * the lowest level tree node which contains a path to both leafs. This
953  * resulting set of information can be used to form a complete "subtree"
954  *
955  * This function is passed two full paths from the dinode down to a
956  * pair of adjacent leaves. It's task is to figure out which path
957  * index contains the subtree root - this can be the root index itself
958  * in a worst-case rotation.
959  *
960  * The array index of the subtree root is passed back.
961  */
962 static int ocfs2_find_subtree_root(struct inode *inode,
963                                    struct ocfs2_path *left,
964                                    struct ocfs2_path *right)
965 {
966         int i = 0;
967
968         /*
969          * Check that the caller passed in two paths from the same tree.
970          */
971         BUG_ON(path_root_bh(left) != path_root_bh(right));
972
973         do {
974                 i++;
975
976                 /*
977                  * The caller didn't pass two adjacent paths.
978                  */
979                 mlog_bug_on_msg(i > left->p_tree_depth,
980                                 "Inode %lu, left depth %u, right depth %u\n"
981                                 "left leaf blk %llu, right leaf blk %llu\n",
982                                 inode->i_ino, left->p_tree_depth,
983                                 right->p_tree_depth,
984                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
985                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
986         } while (left->p_node[i].bh->b_blocknr ==
987                  right->p_node[i].bh->b_blocknr);
988
989         return i - 1;
990 }
991
992 typedef void (path_insert_t)(void *, struct buffer_head *);
993
994 /*
995  * Traverse a btree path in search of cpos, starting at root_el.
996  *
997  * This code can be called with a cpos larger than the tree, in which
998  * case it will return the rightmost path.
999  */
1000 static int __ocfs2_find_path(struct inode *inode,
1001                              struct ocfs2_extent_list *root_el, u32 cpos,
1002                              path_insert_t *func, void *data)
1003 {
1004         int i, ret = 0;
1005         u32 range;
1006         u64 blkno;
1007         struct buffer_head *bh = NULL;
1008         struct ocfs2_extent_block *eb;
1009         struct ocfs2_extent_list *el;
1010         struct ocfs2_extent_rec *rec;
1011         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1012
1013         el = root_el;
1014         while (el->l_tree_depth) {
1015                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1016                         ocfs2_error(inode->i_sb,
1017                                     "Inode %llu has empty extent list at "
1018                                     "depth %u\n",
1019                                     (unsigned long long)oi->ip_blkno,
1020                                     le16_to_cpu(el->l_tree_depth));
1021                         ret = -EROFS;
1022                         goto out;
1023
1024                 }
1025
1026                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1027                         rec = &el->l_recs[i];
1028
1029                         /*
1030                          * In the case that cpos is off the allocation
1031                          * tree, this should just wind up returning the
1032                          * rightmost record.
1033                          */
1034                         range = le32_to_cpu(rec->e_cpos) +
1035                                 le32_to_cpu(rec->e_clusters);
1036                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1037                             break;
1038                 }
1039
1040                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1041                 if (blkno == 0) {
1042                         ocfs2_error(inode->i_sb,
1043                                     "Inode %llu has bad blkno in extent list "
1044                                     "at depth %u (index %d)\n",
1045                                     (unsigned long long)oi->ip_blkno,
1046                                     le16_to_cpu(el->l_tree_depth), i);
1047                         ret = -EROFS;
1048                         goto out;
1049                 }
1050
1051                 brelse(bh);
1052                 bh = NULL;
1053                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1054                                        &bh, OCFS2_BH_CACHED, inode);
1055                 if (ret) {
1056                         mlog_errno(ret);
1057                         goto out;
1058                 }
1059
1060                 eb = (struct ocfs2_extent_block *) bh->b_data;
1061                 el = &eb->h_list;
1062                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1063                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1064                         ret = -EIO;
1065                         goto out;
1066                 }
1067
1068                 if (le16_to_cpu(el->l_next_free_rec) >
1069                     le16_to_cpu(el->l_count)) {
1070                         ocfs2_error(inode->i_sb,
1071                                     "Inode %llu has bad count in extent list "
1072                                     "at block %llu (next free=%u, count=%u)\n",
1073                                     (unsigned long long)oi->ip_blkno,
1074                                     (unsigned long long)bh->b_blocknr,
1075                                     le16_to_cpu(el->l_next_free_rec),
1076                                     le16_to_cpu(el->l_count));
1077                         ret = -EROFS;
1078                         goto out;
1079                 }
1080
1081                 if (func)
1082                         func(data, bh);
1083         }
1084
1085 out:
1086         /*
1087          * Catch any trailing bh that the loop didn't handle.
1088          */
1089         brelse(bh);
1090
1091         return ret;
1092 }
1093
1094 /*
1095  * Given an initialized path (that is, it has a valid root extent
1096  * list), this function will traverse the btree in search of the path
1097  * which would contain cpos.
1098  *
1099  * The path traveled is recorded in the path structure.
1100  *
1101  * Note that this will not do any comparisons on leaf node extent
1102  * records, so it will work fine in the case that we just added a tree
1103  * branch.
1104  */
1105 struct find_path_data {
1106         int index;
1107         struct ocfs2_path *path;
1108 };
1109 static void find_path_ins(void *data, struct buffer_head *bh)
1110 {
1111         struct find_path_data *fp = data;
1112
1113         get_bh(bh);
1114         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1115         fp->index++;
1116 }
1117 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1118                            u32 cpos)
1119 {
1120         struct find_path_data data;
1121
1122         data.index = 1;
1123         data.path = path;
1124         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1125                                  find_path_ins, &data);
1126 }
1127
1128 static void find_leaf_ins(void *data, struct buffer_head *bh)
1129 {
1130         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1131         struct ocfs2_extent_list *el = &eb->h_list;
1132         struct buffer_head **ret = data;
1133
1134         /* We want to retain only the leaf block. */
1135         if (le16_to_cpu(el->l_tree_depth) == 0) {
1136                 get_bh(bh);
1137                 *ret = bh;
1138         }
1139 }
1140 /*
1141  * Find the leaf block in the tree which would contain cpos. No
1142  * checking of the actual leaf is done.
1143  *
1144  * Some paths want to call this instead of allocating a path structure
1145  * and calling ocfs2_find_path().
1146  *
1147  * This function doesn't handle non btree extent lists.
1148  */
1149 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1150                     u32 cpos, struct buffer_head **leaf_bh)
1151 {
1152         int ret;
1153         struct buffer_head *bh = NULL;
1154
1155         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1156         if (ret) {
1157                 mlog_errno(ret);
1158                 goto out;
1159         }
1160
1161         *leaf_bh = bh;
1162 out:
1163         return ret;
1164 }
1165
1166 /*
1167  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1168  *
1169  * Basically, we've moved stuff around at the bottom of the tree and
1170  * we need to fix up the extent records above the changes to reflect
1171  * the new changes.
1172  *
1173  * left_rec: the record on the left.
1174  * left_child_el: is the child list pointed to by left_rec
1175  * right_rec: the record to the right of left_rec
1176  * right_child_el: is the child list pointed to by right_rec
1177  *
1178  * By definition, this only works on interior nodes.
1179  */
1180 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1181                                   struct ocfs2_extent_list *left_child_el,
1182                                   struct ocfs2_extent_rec *right_rec,
1183                                   struct ocfs2_extent_list *right_child_el)
1184 {
1185         u32 left_clusters, right_end;
1186
1187         /*
1188          * Interior nodes never have holes. Their cpos is the cpos of
1189          * the leftmost record in their child list. Their cluster
1190          * count covers the full theoretical range of their child list
1191          * - the range between their cpos and the cpos of the record
1192          * immediately to their right.
1193          */
1194         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1195         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1196         left_rec->e_clusters = cpu_to_le32(left_clusters);
1197
1198         /*
1199          * Calculate the rightmost cluster count boundary before
1200          * moving cpos - we will need to adjust e_clusters after
1201          * updating e_cpos to keep the same highest cluster count.
1202          */
1203         right_end = le32_to_cpu(right_rec->e_cpos);
1204         right_end += le32_to_cpu(right_rec->e_clusters);
1205
1206         right_rec->e_cpos = left_rec->e_cpos;
1207         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1208
1209         right_end -= le32_to_cpu(right_rec->e_cpos);
1210         right_rec->e_clusters = cpu_to_le32(right_end);
1211 }
1212
1213 /*
1214  * Adjust the adjacent root node records involved in a
1215  * rotation. left_el_blkno is passed in as a key so that we can easily
1216  * find it's index in the root list.
1217  */
1218 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1219                                       struct ocfs2_extent_list *left_el,
1220                                       struct ocfs2_extent_list *right_el,
1221                                       u64 left_el_blkno)
1222 {
1223         int i;
1224
1225         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1226                le16_to_cpu(left_el->l_tree_depth));
1227
1228         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1229                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1230                         break;
1231         }
1232
1233         /*
1234          * The path walking code should have never returned a root and
1235          * two paths which are not adjacent.
1236          */
1237         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1238
1239         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1240                                       &root_el->l_recs[i + 1], right_el);
1241 }
1242
1243 /*
1244  * We've changed a leaf block (in right_path) and need to reflect that
1245  * change back up the subtree.
1246  *
1247  * This happens in multiple places:
1248  *   - When we've moved an extent record from the left path leaf to the right
1249  *     path leaf to make room for an empty extent in the left path leaf.
1250  *   - When our insert into the right path leaf is at the leftmost edge
1251  *     and requires an update of the path immediately to it's left. This
1252  *     can occur at the end of some types of rotation and appending inserts.
1253  */
1254 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1255                                        struct ocfs2_path *left_path,
1256                                        struct ocfs2_path *right_path,
1257                                        int subtree_index)
1258 {
1259         int ret, i, idx;
1260         struct ocfs2_extent_list *el, *left_el, *right_el;
1261         struct ocfs2_extent_rec *left_rec, *right_rec;
1262         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1263
1264         /*
1265          * Update the counts and position values within all the
1266          * interior nodes to reflect the leaf rotation we just did.
1267          *
1268          * The root node is handled below the loop.
1269          *
1270          * We begin the loop with right_el and left_el pointing to the
1271          * leaf lists and work our way up.
1272          *
1273          * NOTE: within this loop, left_el and right_el always refer
1274          * to the *child* lists.
1275          */
1276         left_el = path_leaf_el(left_path);
1277         right_el = path_leaf_el(right_path);
1278         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1279                 mlog(0, "Adjust records at index %u\n", i);
1280
1281                 /*
1282                  * One nice property of knowing that all of these
1283                  * nodes are below the root is that we only deal with
1284                  * the leftmost right node record and the rightmost
1285                  * left node record.
1286                  */
1287                 el = left_path->p_node[i].el;
1288                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1289                 left_rec = &el->l_recs[idx];
1290
1291                 el = right_path->p_node[i].el;
1292                 right_rec = &el->l_recs[0];
1293
1294                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1295                                               right_el);
1296
1297                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1298                 if (ret)
1299                         mlog_errno(ret);
1300
1301                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1302                 if (ret)
1303                         mlog_errno(ret);
1304
1305                 /*
1306                  * Setup our list pointers now so that the current
1307                  * parents become children in the next iteration.
1308                  */
1309                 left_el = left_path->p_node[i].el;
1310                 right_el = right_path->p_node[i].el;
1311         }
1312
1313         /*
1314          * At the root node, adjust the two adjacent records which
1315          * begin our path to the leaves.
1316          */
1317
1318         el = left_path->p_node[subtree_index].el;
1319         left_el = left_path->p_node[subtree_index + 1].el;
1320         right_el = right_path->p_node[subtree_index + 1].el;
1321
1322         ocfs2_adjust_root_records(el, left_el, right_el,
1323                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1324
1325         root_bh = left_path->p_node[subtree_index].bh;
1326
1327         ret = ocfs2_journal_dirty(handle, root_bh);
1328         if (ret)
1329                 mlog_errno(ret);
1330 }
1331
1332 static int ocfs2_rotate_subtree_right(struct inode *inode,
1333                                       handle_t *handle,
1334                                       struct ocfs2_path *left_path,
1335                                       struct ocfs2_path *right_path,
1336                                       int subtree_index)
1337 {
1338         int ret, i;
1339         struct buffer_head *right_leaf_bh;
1340         struct buffer_head *left_leaf_bh = NULL;
1341         struct buffer_head *root_bh;
1342         struct ocfs2_extent_list *right_el, *left_el;
1343         struct ocfs2_extent_rec move_rec;
1344
1345         left_leaf_bh = path_leaf_bh(left_path);
1346         left_el = path_leaf_el(left_path);
1347
1348         if (left_el->l_next_free_rec != left_el->l_count) {
1349                 ocfs2_error(inode->i_sb,
1350                             "Inode %llu has non-full interior leaf node %llu"
1351                             "(next free = %u)",
1352                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1353                             (unsigned long long)left_leaf_bh->b_blocknr,
1354                             le16_to_cpu(left_el->l_next_free_rec));
1355                 return -EROFS;
1356         }
1357
1358         /*
1359          * This extent block may already have an empty record, so we
1360          * return early if so.
1361          */
1362         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1363                 return 0;
1364
1365         root_bh = left_path->p_node[subtree_index].bh;
1366         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1367
1368         ret = ocfs2_journal_access(handle, inode, root_bh,
1369                                    OCFS2_JOURNAL_ACCESS_WRITE);
1370         if (ret) {
1371                 mlog_errno(ret);
1372                 goto out;
1373         }
1374
1375         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1376                 ret = ocfs2_journal_access(handle, inode,
1377                                            right_path->p_node[i].bh,
1378                                            OCFS2_JOURNAL_ACCESS_WRITE);
1379                 if (ret) {
1380                         mlog_errno(ret);
1381                         goto out;
1382                 }
1383
1384                 ret = ocfs2_journal_access(handle, inode,
1385                                            left_path->p_node[i].bh,
1386                                            OCFS2_JOURNAL_ACCESS_WRITE);
1387                 if (ret) {
1388                         mlog_errno(ret);
1389                         goto out;
1390                 }
1391         }
1392
1393         right_leaf_bh = path_leaf_bh(right_path);
1394         right_el = path_leaf_el(right_path);
1395
1396         /* This is a code error, not a disk corruption. */
1397         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1398                         "because rightmost leaf block %llu is empty\n",
1399                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1400                         (unsigned long long)right_leaf_bh->b_blocknr);
1401
1402         ocfs2_create_empty_extent(right_el);
1403
1404         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1405         if (ret) {
1406                 mlog_errno(ret);
1407                 goto out;
1408         }
1409
1410         /* Do the copy now. */
1411         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1412         move_rec = left_el->l_recs[i];
1413         right_el->l_recs[0] = move_rec;
1414
1415         /*
1416          * Clear out the record we just copied and shift everything
1417          * over, leaving an empty extent in the left leaf.
1418          *
1419          * We temporarily subtract from next_free_rec so that the
1420          * shift will lose the tail record (which is now defunct).
1421          */
1422         le16_add_cpu(&left_el->l_next_free_rec, -1);
1423         ocfs2_shift_records_right(left_el);
1424         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1425         le16_add_cpu(&left_el->l_next_free_rec, 1);
1426
1427         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1428         if (ret) {
1429                 mlog_errno(ret);
1430                 goto out;
1431         }
1432
1433         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1434                                 subtree_index);
1435
1436 out:
1437         return ret;
1438 }
1439
1440 /*
1441  * Given a full path, determine what cpos value would return us a path
1442  * containing the leaf immediately to the left of the current one.
1443  *
1444  * Will return zero if the path passed in is already the leftmost path.
1445  */
1446 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1447                                          struct ocfs2_path *path, u32 *cpos)
1448 {
1449         int i, j, ret = 0;
1450         u64 blkno;
1451         struct ocfs2_extent_list *el;
1452
1453         *cpos = 0;
1454
1455         blkno = path_leaf_bh(path)->b_blocknr;
1456
1457         /* Start at the tree node just above the leaf and work our way up. */
1458         i = path->p_tree_depth - 1;
1459         while (i >= 0) {
1460                 el = path->p_node[i].el;
1461
1462                 /*
1463                  * Find the extent record just before the one in our
1464                  * path.
1465                  */
1466                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1467                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1468                                 if (j == 0) {
1469                                         if (i == 0) {
1470                                                 /*
1471                                                  * We've determined that the
1472                                                  * path specified is already
1473                                                  * the leftmost one - return a
1474                                                  * cpos of zero.
1475                                                  */
1476                                                 goto out;
1477                                         }
1478                                         /*
1479                                          * The leftmost record points to our
1480                                          * leaf - we need to travel up the
1481                                          * tree one level.
1482                                          */
1483                                         goto next_node;
1484                                 }
1485
1486                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1487                                 *cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
1488                                 goto out;
1489                         }
1490                 }
1491
1492                 /*
1493                  * If we got here, we never found a valid node where
1494                  * the tree indicated one should be.
1495                  */
1496                 ocfs2_error(sb,
1497                             "Invalid extent tree at extent block %llu\n",
1498                             (unsigned long long)blkno);
1499                 ret = -EROFS;
1500                 goto out;
1501
1502 next_node:
1503                 blkno = path->p_node[i].bh->b_blocknr;
1504                 i--;
1505         }
1506
1507 out:
1508         return ret;
1509 }
1510
1511 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1512                                            struct ocfs2_path *path)
1513 {
1514         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1515
1516         if (handle->h_buffer_credits < credits)
1517                 return ocfs2_extend_trans(handle, credits);
1518
1519         return 0;
1520 }
1521
1522 /*
1523  * Trap the case where we're inserting into the theoretical range past
1524  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1525  * whose cpos is less than ours into the right leaf.
1526  *
1527  * It's only necessary to look at the rightmost record of the left
1528  * leaf because the logic that calls us should ensure that the
1529  * theoretical ranges in the path components above the leaves are
1530  * correct.
1531  */
1532 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1533                                                  u32 insert_cpos)
1534 {
1535         struct ocfs2_extent_list *left_el;
1536         struct ocfs2_extent_rec *rec;
1537         int next_free;
1538
1539         left_el = path_leaf_el(left_path);
1540         next_free = le16_to_cpu(left_el->l_next_free_rec);
1541         rec = &left_el->l_recs[next_free - 1];
1542
1543         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1544                 return 1;
1545         return 0;
1546 }
1547
1548 /*
1549  * Rotate all the records in a btree right one record, starting at insert_cpos.
1550  *
1551  * The path to the rightmost leaf should be passed in.
1552  *
1553  * The array is assumed to be large enough to hold an entire path (tree depth).
1554  *
1555  * Upon succesful return from this function:
1556  *
1557  * - The 'right_path' array will contain a path to the leaf block
1558  *   whose range contains e_cpos.
1559  * - That leaf block will have a single empty extent in list index 0.
1560  * - In the case that the rotation requires a post-insert update,
1561  *   *ret_left_path will contain a valid path which can be passed to
1562  *   ocfs2_insert_path().
1563  */
1564 static int ocfs2_rotate_tree_right(struct inode *inode,
1565                                    handle_t *handle,
1566                                    u32 insert_cpos,
1567                                    struct ocfs2_path *right_path,
1568                                    struct ocfs2_path **ret_left_path)
1569 {
1570         int ret, start;
1571         u32 cpos;
1572         struct ocfs2_path *left_path = NULL;
1573
1574         *ret_left_path = NULL;
1575
1576         left_path = ocfs2_new_path(path_root_bh(right_path),
1577                                    path_root_el(right_path));
1578         if (!left_path) {
1579                 ret = -ENOMEM;
1580                 mlog_errno(ret);
1581                 goto out;
1582         }
1583
1584         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1585         if (ret) {
1586                 mlog_errno(ret);
1587                 goto out;
1588         }
1589
1590         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1591
1592         /*
1593          * What we want to do here is:
1594          *
1595          * 1) Start with the rightmost path.
1596          *
1597          * 2) Determine a path to the leaf block directly to the left
1598          *    of that leaf.
1599          *
1600          * 3) Determine the 'subtree root' - the lowest level tree node
1601          *    which contains a path to both leaves.
1602          *
1603          * 4) Rotate the subtree.
1604          *
1605          * 5) Find the next subtree by considering the left path to be
1606          *    the new right path.
1607          *
1608          * The check at the top of this while loop also accepts
1609          * insert_cpos == cpos because cpos is only a _theoretical_
1610          * value to get us the left path - insert_cpos might very well
1611          * be filling that hole.
1612          *
1613          * Stop at a cpos of '0' because we either started at the
1614          * leftmost branch (i.e., a tree with one branch and a
1615          * rotation inside of it), or we've gone as far as we can in
1616          * rotating subtrees.
1617          */
1618         while (cpos && insert_cpos <= cpos) {
1619                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1620                      insert_cpos, cpos);
1621
1622                 ret = ocfs2_find_path(inode, left_path, cpos);
1623                 if (ret) {
1624                         mlog_errno(ret);
1625                         goto out;
1626                 }
1627
1628                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1629                                 path_leaf_bh(right_path),
1630                                 "Inode %lu: error during insert of %u "
1631                                 "(left path cpos %u) results in two identical "
1632                                 "paths ending at %llu\n",
1633                                 inode->i_ino, insert_cpos, cpos,
1634                                 (unsigned long long)
1635                                 path_leaf_bh(left_path)->b_blocknr);
1636
1637                 if (ocfs2_rotate_requires_path_adjustment(left_path,
1638                                                           insert_cpos)) {
1639                         mlog(0, "Path adjustment required\n");
1640
1641                         /*
1642                          * We've rotated the tree as much as we
1643                          * should. The rest is up to
1644                          * ocfs2_insert_path() to complete, after the
1645                          * record insertion. We indicate this
1646                          * situation by returning the left path.
1647                          *
1648                          * The reason we don't adjust the records here
1649                          * before the record insert is that an error
1650                          * later might break the rule where a parent
1651                          * record e_cpos will reflect the actual
1652                          * e_cpos of the 1st nonempty record of the
1653                          * child list.
1654                          */
1655                         *ret_left_path = left_path;
1656                         goto out_ret_path;
1657                 }
1658
1659                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1660
1661                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1662                      start,
1663                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1664                      right_path->p_tree_depth);
1665
1666                 ret = ocfs2_extend_rotate_transaction(handle, start,
1667                                                       right_path);
1668                 if (ret) {
1669                         mlog_errno(ret);
1670                         goto out;
1671                 }
1672
1673                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1674                                                  right_path, start);
1675                 if (ret) {
1676                         mlog_errno(ret);
1677                         goto out;
1678                 }
1679
1680                 /*
1681                  * There is no need to re-read the next right path
1682                  * as we know that it'll be our current left
1683                  * path. Optimize by copying values instead.
1684                  */
1685                 ocfs2_mv_path(right_path, left_path);
1686
1687                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1688                                                     &cpos);
1689                 if (ret) {
1690                         mlog_errno(ret);
1691                         goto out;
1692                 }
1693         }
1694
1695 out:
1696         ocfs2_free_path(left_path);
1697
1698 out_ret_path:
1699         return ret;
1700 }
1701
1702 /*
1703  * Do the final bits of extent record insertion at the target leaf
1704  * list. If this leaf is part of an allocation tree, it is assumed
1705  * that the tree above has been prepared.
1706  */
1707 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1708                                  struct ocfs2_extent_list *el,
1709                                  struct ocfs2_insert_type *insert,
1710                                  struct inode *inode)
1711 {
1712         int i = insert->ins_contig_index;
1713         unsigned int range;
1714         struct ocfs2_extent_rec *rec;
1715
1716         BUG_ON(el->l_tree_depth);
1717
1718         /*
1719          * Contiguous insert - either left or right.
1720          */
1721         if (insert->ins_contig != CONTIG_NONE) {
1722                 rec = &el->l_recs[i];
1723                 if (insert->ins_contig == CONTIG_LEFT) {
1724                         rec->e_blkno = insert_rec->e_blkno;
1725                         rec->e_cpos = insert_rec->e_cpos;
1726                 }
1727                 le32_add_cpu(&rec->e_clusters,
1728                              le32_to_cpu(insert_rec->e_clusters));
1729                 return;
1730         }
1731
1732         /*
1733          * Handle insert into an empty leaf.
1734          */
1735         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1736             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1737              ocfs2_is_empty_extent(&el->l_recs[0]))) {
1738                 el->l_recs[0] = *insert_rec;
1739                 el->l_next_free_rec = cpu_to_le16(1);
1740                 return;
1741         }
1742
1743         /*
1744          * Appending insert.
1745          */
1746         if (insert->ins_appending == APPEND_TAIL) {
1747                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1748                 rec = &el->l_recs[i];
1749                 range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
1750                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1751
1752                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1753                                 le16_to_cpu(el->l_count),
1754                                 "inode %lu, depth %u, count %u, next free %u, "
1755                                 "rec.cpos %u, rec.clusters %u, "
1756                                 "insert.cpos %u, insert.clusters %u\n",
1757                                 inode->i_ino,
1758                                 le16_to_cpu(el->l_tree_depth),
1759                                 le16_to_cpu(el->l_count),
1760                                 le16_to_cpu(el->l_next_free_rec),
1761                                 le32_to_cpu(el->l_recs[i].e_cpos),
1762                                 le32_to_cpu(el->l_recs[i].e_clusters),
1763                                 le32_to_cpu(insert_rec->e_cpos),
1764                                 le32_to_cpu(insert_rec->e_clusters));
1765                 i++;
1766                 el->l_recs[i] = *insert_rec;
1767                 le16_add_cpu(&el->l_next_free_rec, 1);
1768                 return;
1769         }
1770
1771         /*
1772          * Ok, we have to rotate.
1773          *
1774          * At this point, it is safe to assume that inserting into an
1775          * empty leaf and appending to a leaf have both been handled
1776          * above.
1777          *
1778          * This leaf needs to have space, either by the empty 1st
1779          * extent record, or by virtue of an l_next_rec < l_count.
1780          */
1781         ocfs2_rotate_leaf(el, insert_rec);
1782 }
1783
1784 static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1785                                                 struct ocfs2_dinode *di,
1786                                                 u32 clusters)
1787 {
1788         le32_add_cpu(&di->i_clusters, clusters);
1789         spin_lock(&OCFS2_I(inode)->ip_lock);
1790         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1791         spin_unlock(&OCFS2_I(inode)->ip_lock);
1792 }
1793
1794 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1795                                     struct ocfs2_extent_rec *insert_rec,
1796                                     struct ocfs2_path *right_path,
1797                                     struct ocfs2_path **ret_left_path)
1798 {
1799         int ret, i, next_free;
1800         struct buffer_head *bh;
1801         struct ocfs2_extent_list *el;
1802         struct ocfs2_path *left_path = NULL;
1803
1804         *ret_left_path = NULL;
1805
1806         /*
1807          * If our appending insert is at the leftmost edge of a leaf,
1808          * then we might need to update the rightmost records of the
1809          * neighboring path.
1810          */
1811         el = path_leaf_el(right_path);
1812         next_free = le16_to_cpu(el->l_next_free_rec);
1813         if (next_free == 0 ||
1814             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1815                 u32 left_cpos;
1816
1817                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1818                                                     &left_cpos);
1819                 if (ret) {
1820                         mlog_errno(ret);
1821                         goto out;
1822                 }
1823
1824                 mlog(0, "Append may need a left path update. cpos: %u, "
1825                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1826                      left_cpos);
1827
1828                 /*
1829                  * No need to worry if the append is already in the
1830                  * leftmost leaf.
1831                  */
1832                 if (left_cpos) {
1833                         left_path = ocfs2_new_path(path_root_bh(right_path),
1834                                                    path_root_el(right_path));
1835                         if (!left_path) {
1836                                 ret = -ENOMEM;
1837                                 mlog_errno(ret);
1838                                 goto out;
1839                         }
1840
1841                         ret = ocfs2_find_path(inode, left_path, left_cpos);
1842                         if (ret) {
1843                                 mlog_errno(ret);
1844                                 goto out;
1845                         }
1846
1847                         /*
1848                          * ocfs2_insert_path() will pass the left_path to the
1849                          * journal for us.
1850                          */
1851                 }
1852         }
1853
1854         ret = ocfs2_journal_access_path(inode, handle, right_path);
1855         if (ret) {
1856                 mlog_errno(ret);
1857                 goto out;
1858         }
1859
1860         el = path_root_el(right_path);
1861         bh = path_root_bh(right_path);
1862         i = 0;
1863         while (1) {
1864                 next_free = le16_to_cpu(el->l_next_free_rec);
1865                 if (next_free == 0) {
1866                         ocfs2_error(inode->i_sb,
1867                                     "Dinode %llu has a bad extent list",
1868                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1869                         ret = -EIO;
1870                         goto out;
1871                 }
1872
1873                 el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
1874                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1875                              le32_to_cpu(insert_rec->e_clusters));
1876                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1877                             -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
1878
1879                 ret = ocfs2_journal_dirty(handle, bh);
1880                 if (ret)
1881                         mlog_errno(ret);
1882
1883                 if (++i >= right_path->p_tree_depth)
1884                         break;
1885
1886                 bh = right_path->p_node[i].bh;
1887                 el = right_path->p_node[i].el;
1888         }
1889
1890         *ret_left_path = left_path;
1891         ret = 0;
1892 out:
1893         if (ret != 0)
1894                 ocfs2_free_path(left_path);
1895
1896         return ret;
1897 }
1898
1899 /*
1900  * This function only does inserts on an allocation b-tree. For dinode
1901  * lists, ocfs2_insert_at_leaf() is called directly.
1902  *
1903  * right_path is the path we want to do the actual insert
1904  * in. left_path should only be passed in if we need to update that
1905  * portion of the tree after an edge insert.
1906  */
1907 static int ocfs2_insert_path(struct inode *inode,
1908                              handle_t *handle,
1909                              struct ocfs2_path *left_path,
1910                              struct ocfs2_path *right_path,
1911                              struct ocfs2_extent_rec *insert_rec,
1912                              struct ocfs2_insert_type *insert)
1913 {
1914         int ret, subtree_index;
1915         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1916         struct ocfs2_extent_list *el;
1917
1918         /*
1919          * Pass both paths to the journal. The majority of inserts
1920          * will be touching all components anyway.
1921          */
1922         ret = ocfs2_journal_access_path(inode, handle, right_path);
1923         if (ret < 0) {
1924                 mlog_errno(ret);
1925                 goto out;
1926         }
1927
1928         if (left_path) {
1929                 int credits = handle->h_buffer_credits;
1930
1931                 /*
1932                  * There's a chance that left_path got passed back to
1933                  * us without being accounted for in the
1934                  * journal. Extend our transaction here to be sure we
1935                  * can change those blocks.
1936                  */
1937                 credits += left_path->p_tree_depth;
1938
1939                 ret = ocfs2_extend_trans(handle, credits);
1940                 if (ret < 0) {
1941                         mlog_errno(ret);
1942                         goto out;
1943                 }
1944
1945                 ret = ocfs2_journal_access_path(inode, handle, left_path);
1946                 if (ret < 0) {
1947                         mlog_errno(ret);
1948                         goto out;
1949                 }
1950         }
1951
1952         el = path_leaf_el(right_path);
1953
1954         ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1955         ret = ocfs2_journal_dirty(handle, leaf_bh);
1956         if (ret)
1957                 mlog_errno(ret);
1958
1959         if (left_path) {
1960                 /*
1961                  * The rotate code has indicated that we need to fix
1962                  * up portions of the tree after the insert.
1963                  *
1964                  * XXX: Should we extend the transaction here?
1965                  */
1966                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
1967                                                         right_path);
1968                 ocfs2_complete_edge_insert(inode, handle, left_path,
1969                                            right_path, subtree_index);
1970         }
1971
1972         ret = 0;
1973 out:
1974         return ret;
1975 }
1976
1977 static int ocfs2_do_insert_extent(struct inode *inode,
1978                                   handle_t *handle,
1979                                   struct buffer_head *di_bh,
1980                                   struct ocfs2_extent_rec *insert_rec,
1981                                   struct ocfs2_insert_type *type)
1982 {
1983         int ret, rotate = 0;
1984         u32 cpos;
1985         struct ocfs2_path *right_path = NULL;
1986         struct ocfs2_path *left_path = NULL;
1987         struct ocfs2_dinode *di;
1988         struct ocfs2_extent_list *el;
1989
1990         di = (struct ocfs2_dinode *) di_bh->b_data;
1991         el = &di->id2.i_list;
1992
1993         ret = ocfs2_journal_access(handle, inode, di_bh,
1994                                    OCFS2_JOURNAL_ACCESS_WRITE);
1995         if (ret) {
1996                 mlog_errno(ret);
1997                 goto out;
1998         }
1999
2000         if (le16_to_cpu(el->l_tree_depth) == 0) {
2001                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2002                 goto out_update_clusters;
2003         }
2004
2005         right_path = ocfs2_new_inode_path(di_bh);
2006         if (!right_path) {
2007                 ret = -ENOMEM;
2008                 mlog_errno(ret);
2009                 goto out;
2010         }
2011
2012         /*
2013          * Determine the path to start with. Rotations need the
2014          * rightmost path, everything else can go directly to the
2015          * target leaf.
2016          */
2017         cpos = le32_to_cpu(insert_rec->e_cpos);
2018         if (type->ins_appending == APPEND_NONE &&
2019             type->ins_contig == CONTIG_NONE) {
2020                 rotate = 1;
2021                 cpos = UINT_MAX;
2022         }
2023
2024         ret = ocfs2_find_path(inode, right_path, cpos);
2025         if (ret) {
2026                 mlog_errno(ret);
2027                 goto out;
2028         }
2029
2030         /*
2031          * Rotations and appends need special treatment - they modify
2032          * parts of the tree's above them.
2033          *
2034          * Both might pass back a path immediate to the left of the
2035          * one being inserted to. This will be cause
2036          * ocfs2_insert_path() to modify the rightmost records of
2037          * left_path to account for an edge insert.
2038          *
2039          * XXX: When modifying this code, keep in mind that an insert
2040          * can wind up skipping both of these two special cases...
2041          */
2042         if (rotate) {
2043                 ret = ocfs2_rotate_tree_right(inode, handle,
2044                                               le32_to_cpu(insert_rec->e_cpos),
2045                                               right_path, &left_path);
2046                 if (ret) {
2047                         mlog_errno(ret);
2048                         goto out;
2049                 }
2050         } else if (type->ins_appending == APPEND_TAIL
2051                    && type->ins_contig != CONTIG_LEFT) {
2052                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2053                                                right_path, &left_path);
2054                 if (ret) {
2055                         mlog_errno(ret);
2056                         goto out;
2057                 }
2058         }
2059
2060         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2061                                 insert_rec, type);
2062         if (ret) {
2063                 mlog_errno(ret);
2064                 goto out;
2065         }
2066
2067 out_update_clusters:
2068         ocfs2_update_dinode_clusters(inode, di,
2069                                      le32_to_cpu(insert_rec->e_clusters));
2070
2071         ret = ocfs2_journal_dirty(handle, di_bh);
2072         if (ret)
2073                 mlog_errno(ret);
2074
2075 out:
2076         ocfs2_free_path(left_path);
2077         ocfs2_free_path(right_path);
2078
2079         return ret;
2080 }
2081
2082 static void ocfs2_figure_contig_type(struct inode *inode,
2083                                      struct ocfs2_insert_type *insert,
2084                                      struct ocfs2_extent_list *el,
2085                                      struct ocfs2_extent_rec *insert_rec)
2086 {
2087         int i;
2088         enum ocfs2_contig_type contig_type = CONTIG_NONE;
2089
2090         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2091                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2092                                                   insert_rec);
2093                 if (contig_type != CONTIG_NONE) {
2094                         insert->ins_contig_index = i;
2095                         break;
2096                 }
2097         }
2098         insert->ins_contig = contig_type;
2099 }
2100
2101 /*
2102  * This should only be called against the righmost leaf extent list.
2103  *
2104  * ocfs2_figure_appending_type() will figure out whether we'll have to
2105  * insert at the tail of the rightmost leaf.
2106  *
2107  * This should also work against the dinode list for tree's with 0
2108  * depth. If we consider the dinode list to be the rightmost leaf node
2109  * then the logic here makes sense.
2110  */
2111 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2112                                         struct ocfs2_extent_list *el,
2113                                         struct ocfs2_extent_rec *insert_rec)
2114 {
2115         int i;
2116         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2117         struct ocfs2_extent_rec *rec;
2118
2119         insert->ins_appending = APPEND_NONE;
2120
2121         BUG_ON(el->l_tree_depth);
2122
2123         if (!el->l_next_free_rec)
2124                 goto set_tail_append;
2125
2126         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2127                 /* Were all records empty? */
2128                 if (le16_to_cpu(el->l_next_free_rec) == 1)
2129                         goto set_tail_append;
2130         }
2131
2132         i = le16_to_cpu(el->l_next_free_rec) - 1;
2133         rec = &el->l_recs[i];
2134
2135         if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
2136                 goto set_tail_append;
2137
2138         return;
2139
2140 set_tail_append:
2141         insert->ins_appending = APPEND_TAIL;
2142 }
2143
2144 /*
2145  * Helper function called at the begining of an insert.
2146  *
2147  * This computes a few things that are commonly used in the process of
2148  * inserting into the btree:
2149  *   - Whether the new extent is contiguous with an existing one.
2150  *   - The current tree depth.
2151  *   - Whether the insert is an appending one.
2152  *   - The total # of free records in the tree.
2153  *
2154  * All of the information is stored on the ocfs2_insert_type
2155  * structure.
2156  */
2157 static int ocfs2_figure_insert_type(struct inode *inode,
2158                                     struct buffer_head *di_bh,
2159                                     struct buffer_head **last_eb_bh,
2160                                     struct ocfs2_extent_rec *insert_rec,
2161                                     struct ocfs2_insert_type *insert)
2162 {
2163         int ret;
2164         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2165         struct ocfs2_extent_block *eb;
2166         struct ocfs2_extent_list *el;
2167         struct ocfs2_path *path = NULL;
2168         struct buffer_head *bh = NULL;
2169
2170         el = &di->id2.i_list;
2171         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2172
2173         if (el->l_tree_depth) {
2174                 /*
2175                  * If we have tree depth, we read in the
2176                  * rightmost extent block ahead of time as
2177                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
2178                  * may want it later.
2179                  */
2180                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2181                                        le64_to_cpu(di->i_last_eb_blk), &bh,
2182                                        OCFS2_BH_CACHED, inode);
2183                 if (ret) {
2184                         mlog_exit(ret);
2185                         goto out;
2186                 }
2187                 eb = (struct ocfs2_extent_block *) bh->b_data;
2188                 el = &eb->h_list;
2189         }
2190
2191         /*
2192          * Unless we have a contiguous insert, we'll need to know if
2193          * there is room left in our allocation tree for another
2194          * extent record.
2195          *
2196          * XXX: This test is simplistic, we can search for empty
2197          * extent records too.
2198          */
2199         insert->ins_free_records = le16_to_cpu(el->l_count) -
2200                 le16_to_cpu(el->l_next_free_rec);
2201
2202         if (!insert->ins_tree_depth) {
2203                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2204                 ocfs2_figure_appending_type(insert, el, insert_rec);
2205                 return 0;
2206         }
2207
2208         path = ocfs2_new_inode_path(di_bh);
2209         if (!path) {
2210                 ret = -ENOMEM;
2211                 mlog_errno(ret);
2212                 goto out;
2213         }
2214
2215         /*
2216          * In the case that we're inserting past what the tree
2217          * currently accounts for, ocfs2_find_path() will return for
2218          * us the rightmost tree path. This is accounted for below in
2219          * the appending code.
2220          */
2221         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2222         if (ret) {
2223                 mlog_errno(ret);
2224                 goto out;
2225         }
2226
2227         el = path_leaf_el(path);
2228
2229         /*
2230          * Now that we have the path, there's two things we want to determine:
2231          * 1) Contiguousness (also set contig_index if this is so)
2232          *
2233          * 2) Are we doing an append? We can trivially break this up
2234          *     into two types of appends: simple record append, or a
2235          *     rotate inside the tail leaf.
2236          */
2237         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2238
2239         /*
2240          * The insert code isn't quite ready to deal with all cases of
2241          * left contiguousness. Specifically, if it's an insert into
2242          * the 1st record in a leaf, it will require the adjustment of
2243          * e_clusters on the last record of the path directly to it's
2244          * left. For now, just catch that case and fool the layers
2245          * above us. This works just fine for tree_depth == 0, which
2246          * is why we allow that above.
2247          */
2248         if (insert->ins_contig == CONTIG_LEFT &&
2249             insert->ins_contig_index == 0)
2250                 insert->ins_contig = CONTIG_NONE;
2251
2252         /*
2253          * Ok, so we can simply compare against last_eb to figure out
2254          * whether the path doesn't exist. This will only happen in
2255          * the case that we're doing a tail append, so maybe we can
2256          * take advantage of that information somehow.
2257          */
2258         if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2259                 /*
2260                  * Ok, ocfs2_find_path() returned us the rightmost
2261                  * tree path. This might be an appending insert. There are
2262                  * two cases:
2263                  *    1) We're doing a true append at the tail:
2264                  *      -This might even be off the end of the leaf
2265                  *    2) We're "appending" by rotating in the tail
2266                  */
2267                 ocfs2_figure_appending_type(insert, el, insert_rec);
2268         }
2269
2270 out:
2271         ocfs2_free_path(path);
2272
2273         if (ret == 0)
2274                 *last_eb_bh = bh;
2275         else
2276                 brelse(bh);
2277         return ret;
2278 }
2279
2280 /*
2281  * Insert an extent into an inode btree.
2282  *
2283  * The caller needs to update fe->i_clusters
2284  */
2285 int ocfs2_insert_extent(struct ocfs2_super *osb,
2286                         handle_t *handle,
2287                         struct inode *inode,
2288                         struct buffer_head *fe_bh,
2289                         u32 cpos,
2290                         u64 start_blk,
2291                         u32 new_clusters,
2292                         struct ocfs2_alloc_context *meta_ac)
2293 {
2294         int status, shift;
2295         struct buffer_head *last_eb_bh = NULL;
2296         struct buffer_head *bh = NULL;
2297         struct ocfs2_insert_type insert = {0, };
2298         struct ocfs2_extent_rec rec;
2299
2300         mlog(0, "add %u clusters at position %u to inode %llu\n",
2301              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2302
2303         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2304                         (OCFS2_I(inode)->ip_clusters != cpos),
2305                         "Device %s, asking for sparse allocation: inode %llu, "
2306                         "cpos %u, clusters %u\n",
2307                         osb->dev_str,
2308                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2309                         OCFS2_I(inode)->ip_clusters);
2310
2311         rec.e_cpos = cpu_to_le32(cpos);
2312         rec.e_blkno = cpu_to_le64(start_blk);
2313         rec.e_clusters = cpu_to_le32(new_clusters);
2314
2315         status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2316                                           &insert);
2317         if (status < 0) {
2318                 mlog_errno(status);
2319                 goto bail;
2320         }
2321
2322         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2323              "Insert.contig_index: %d, Insert.free_records: %d, "
2324              "Insert.tree_depth: %d\n",
2325              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2326              insert.ins_free_records, insert.ins_tree_depth);
2327
2328         /*
2329          * Avoid growing the tree unless we're out of records and the
2330          * insert type requres one.
2331          */
2332         if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2333                 goto out_add;
2334
2335         shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2336         if (shift < 0) {
2337                 status = shift;
2338                 mlog_errno(status);
2339                 goto bail;
2340         }
2341
2342         /* We traveled all the way to the bottom of the allocation tree
2343          * and didn't find room for any more extents - we need to add
2344          * another tree level */
2345         if (shift) {
2346                 BUG_ON(bh);
2347                 mlog(0, "need to shift tree depth "
2348                      "(current = %d)\n", insert.ins_tree_depth);
2349
2350                 /* ocfs2_shift_tree_depth will return us a buffer with
2351                  * the new extent block (so we can pass that to
2352                  * ocfs2_add_branch). */
2353                 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2354                                                 meta_ac, &bh);
2355                 if (status < 0) {
2356                         mlog_errno(status);
2357                         goto bail;
2358                 }
2359                 insert.ins_tree_depth++;
2360                 /* Special case: we have room now if we shifted from
2361                  * tree_depth 0 */
2362                 if (insert.ins_tree_depth == 1)
2363                         goto out_add;
2364         }
2365
2366         /* call ocfs2_add_branch to add the final part of the tree with
2367          * the new data. */
2368         mlog(0, "add branch. bh = %p\n", bh);
2369         status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2370                                   meta_ac);
2371         if (status < 0) {
2372                 mlog_errno(status);
2373                 goto bail;
2374         }
2375
2376 out_add:
2377         /* Finally, we can add clusters. This might rotate the tree for us. */
2378         status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2379         if (status < 0)
2380                 mlog_errno(status);
2381
2382 bail:
2383         if (bh)
2384                 brelse(bh);
2385
2386         if (last_eb_bh)
2387                 brelse(last_eb_bh);
2388
2389         mlog_exit(status);
2390         return status;
2391 }
2392
2393 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2394 {
2395         struct buffer_head *tl_bh = osb->osb_tl_bh;
2396         struct ocfs2_dinode *di;
2397         struct ocfs2_truncate_log *tl;
2398
2399         di = (struct ocfs2_dinode *) tl_bh->b_data;
2400         tl = &di->id2.i_dealloc;
2401
2402         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
2403                         "slot %d, invalid truncate log parameters: used = "
2404                         "%u, count = %u\n", osb->slot_num,
2405                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
2406         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
2407 }
2408
2409 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2410                                            unsigned int new_start)
2411 {
2412         unsigned int tail_index;
2413         unsigned int current_tail;
2414
2415         /* No records, nothing to coalesce */
2416         if (!le16_to_cpu(tl->tl_used))
2417                 return 0;
2418
2419         tail_index = le16_to_cpu(tl->tl_used) - 1;
2420         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
2421         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
2422
2423         return current_tail == new_start;
2424 }
2425
2426 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2427                                      handle_t *handle,
2428                                      u64 start_blk,
2429                                      unsigned int num_clusters)
2430 {
2431         int status, index;
2432         unsigned int start_cluster, tl_count;
2433         struct inode *tl_inode = osb->osb_tl_inode;
2434         struct buffer_head *tl_bh = osb->osb_tl_bh;
2435         struct ocfs2_dinode *di;
2436         struct ocfs2_truncate_log *tl;
2437
2438         mlog_entry("start_blk = %llu, num_clusters = %u\n",
2439                    (unsigned long long)start_blk, num_clusters);
2440
2441         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2442
2443         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
2444
2445         di = (struct ocfs2_dinode *) tl_bh->b_data;
2446         tl = &di->id2.i_dealloc;
2447         if (!OCFS2_IS_VALID_DINODE(di)) {
2448                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2449                 status = -EIO;
2450                 goto bail;
2451         }
2452
2453         tl_count = le16_to_cpu(tl->tl_count);
2454         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
2455                         tl_count == 0,
2456                         "Truncate record count on #%llu invalid "
2457                         "wanted %u, actual %u\n",
2458                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
2459                         ocfs2_truncate_recs_per_inode(osb->sb),
2460                         le16_to_cpu(tl->tl_count));
2461
2462         /* Caller should have known to flush before calling us. */
2463         index = le16_to_cpu(tl->tl_used);
2464         if (index >= tl_count) {
2465                 status = -ENOSPC;
2466                 mlog_errno(status);
2467                 goto bail;
2468         }
2469
2470         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2471                                       OCFS2_JOURNAL_ACCESS_WRITE);
2472         if (status < 0) {
2473                 mlog_errno(status);
2474                 goto bail;
2475         }
2476
2477         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
2478              "%llu (index = %d)\n", num_clusters, start_cluster,
2479              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
2480
2481         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
2482                 /*
2483                  * Move index back to the record we are coalescing with.
2484                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
2485                  */
2486                 index--;
2487
2488                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
2489                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
2490                      index, le32_to_cpu(tl->tl_recs[index].t_start),
2491                      num_clusters);
2492         } else {
2493                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
2494                 tl->tl_used = cpu_to_le16(index + 1);
2495         }
2496         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
2497
2498         status = ocfs2_journal_dirty(handle, tl_bh);
2499         if (status < 0) {
2500                 mlog_errno(status);
2501                 goto bail;
2502         }
2503
2504 bail:
2505         mlog_exit(status);
2506         return status;
2507 }
2508
2509 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
2510                                          handle_t *handle,
2511                                          struct inode *data_alloc_inode,
2512                                          struct buffer_head *data_alloc_bh)
2513 {
2514         int status = 0;
2515         int i;
2516         unsigned int num_clusters;
2517         u64 start_blk;
2518         struct ocfs2_truncate_rec rec;
2519         struct ocfs2_dinode *di;
2520         struct ocfs2_truncate_log *tl;
2521         struct inode *tl_inode = osb->osb_tl_inode;
2522         struct buffer_head *tl_bh = osb->osb_tl_bh;
2523
2524         mlog_entry_void();
2525
2526         di = (struct ocfs2_dinode *) tl_bh->b_data;
2527         tl = &di->id2.i_dealloc;
2528         i = le16_to_cpu(tl->tl_used) - 1;
2529         while (i >= 0) {
2530                 /* Caller has given us at least enough credits to
2531                  * update the truncate log dinode */
2532                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2533                                               OCFS2_JOURNAL_ACCESS_WRITE);
2534                 if (status < 0) {
2535                         mlog_errno(status);
2536                         goto bail;
2537                 }
2538
2539                 tl->tl_used = cpu_to_le16(i);
2540
2541                 status = ocfs2_journal_dirty(handle, tl_bh);
2542                 if (status < 0) {
2543                         mlog_errno(status);
2544                         goto bail;
2545                 }
2546
2547                 /* TODO: Perhaps we can calculate the bulk of the
2548                  * credits up front rather than extending like
2549                  * this. */
2550                 status = ocfs2_extend_trans(handle,
2551                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
2552                 if (status < 0) {
2553                         mlog_errno(status);
2554                         goto bail;
2555                 }
2556
2557                 rec = tl->tl_recs[i];
2558                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
2559                                                     le32_to_cpu(rec.t_start));
2560                 num_clusters = le32_to_cpu(rec.t_clusters);
2561
2562                 /* if start_blk is not set, we ignore the record as
2563                  * invalid. */
2564                 if (start_blk) {
2565                         mlog(0, "free record %d, start = %u, clusters = %u\n",
2566                              i, le32_to_cpu(rec.t_start), num_clusters);
2567
2568                         status = ocfs2_free_clusters(handle, data_alloc_inode,
2569                                                      data_alloc_bh, start_blk,
2570                                                      num_clusters);
2571                         if (status < 0) {
2572                                 mlog_errno(status);
2573                                 goto bail;
2574                         }
2575                 }
2576                 i--;
2577         }
2578
2579 bail:
2580         mlog_exit(status);
2581         return status;
2582 }
2583
2584 /* Expects you to already be holding tl_inode->i_mutex */
2585 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2586 {
2587         int status;
2588         unsigned int num_to_flush;
2589         handle_t *handle;
2590         struct inode *tl_inode = osb->osb_tl_inode;
2591         struct inode *data_alloc_inode = NULL;
2592         struct buffer_head *tl_bh = osb->osb_tl_bh;
2593         struct buffer_head *data_alloc_bh = NULL;
2594         struct ocfs2_dinode *di;
2595         struct ocfs2_truncate_log *tl;
2596
2597         mlog_entry_void();
2598
2599         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2600
2601         di = (struct ocfs2_dinode *) tl_bh->b_data;
2602         tl = &di->id2.i_dealloc;
2603         if (!OCFS2_IS_VALID_DINODE(di)) {
2604                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2605                 status = -EIO;
2606                 goto out;
2607         }
2608
2609         num_to_flush = le16_to_cpu(tl->tl_used);
2610         mlog(0, "Flush %u records from truncate log #%llu\n",
2611              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
2612         if (!num_to_flush) {
2613                 status = 0;
2614                 goto out;
2615         }
2616
2617         data_alloc_inode = ocfs2_get_system_file_inode(osb,
2618                                                        GLOBAL_BITMAP_SYSTEM_INODE,
2619                                                        OCFS2_INVALID_SLOT);
2620         if (!data_alloc_inode) {
2621                 status = -EINVAL;
2622                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
2623                 goto out;
2624         }
2625
2626         mutex_lock(&data_alloc_inode->i_mutex);
2627
2628         status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
2629         if (status < 0) {
2630                 mlog_errno(status);
2631                 goto out_mutex;
2632         }
2633
2634         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2635         if (IS_ERR(handle)) {
2636                 status = PTR_ERR(handle);
2637                 mlog_errno(status);
2638                 goto out_unlock;
2639         }
2640
2641         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
2642                                                data_alloc_bh);
2643         if (status < 0)
2644                 mlog_errno(status);
2645
2646         ocfs2_commit_trans(osb, handle);
2647
2648 out_unlock:
2649         brelse(data_alloc_bh);
2650         ocfs2_meta_unlock(data_alloc_inode, 1);
2651
2652 out_mutex:
2653         mutex_unlock(&data_alloc_inode->i_mutex);
2654         iput(data_alloc_inode);
2655
2656 out:
2657         mlog_exit(status);
2658         return status;
2659 }
2660
2661 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2662 {
2663         int status;
2664         struct inode *tl_inode = osb->osb_tl_inode;
2665
2666         mutex_lock(&tl_inode->i_mutex);
2667         status = __ocfs2_flush_truncate_log(osb);
2668         mutex_unlock(&tl_inode->i_mutex);
2669
2670         return status;
2671 }
2672
2673 static void ocfs2_truncate_log_worker(struct work_struct *work)
2674 {
2675         int status;
2676         struct ocfs2_super *osb =
2677                 container_of(work, struct ocfs2_super,
2678                              osb_truncate_log_wq.work);
2679
2680         mlog_entry_void();
2681
2682         status = ocfs2_flush_truncate_log(osb);
2683         if (status < 0)
2684                 mlog_errno(status);
2685
2686         mlog_exit(status);
2687 }
2688
2689 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
2690 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
2691                                        int cancel)
2692 {
2693         if (osb->osb_tl_inode) {
2694                 /* We want to push off log flushes while truncates are
2695                  * still running. */
2696                 if (cancel)
2697                         cancel_delayed_work(&osb->osb_truncate_log_wq);
2698
2699                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
2700                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
2701         }
2702 }
2703
2704 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
2705                                        int slot_num,
2706                                        struct inode **tl_inode,
2707                                        struct buffer_head **tl_bh)
2708 {
2709         int status;
2710         struct inode *inode = NULL;
2711         struct buffer_head *bh = NULL;
2712
2713         inode = ocfs2_get_system_file_inode(osb,
2714                                            TRUNCATE_LOG_SYSTEM_INODE,
2715                                            slot_num);
2716         if (!inode) {
2717                 status = -EINVAL;
2718                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
2719                 goto bail;
2720         }
2721
2722         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
2723                                   OCFS2_BH_CACHED, inode);
2724         if (status < 0) {
2725                 iput(inode);
2726                 mlog_errno(status);
2727                 goto bail;
2728         }
2729
2730         *tl_inode = inode;
2731         *tl_bh    = bh;
2732 bail:
2733         mlog_exit(status);
2734         return status;
2735 }
2736
2737 /* called during the 1st stage of node recovery. we stamp a clean
2738  * truncate log and pass back a copy for processing later. if the
2739  * truncate log does not require processing, a *tl_copy is set to
2740  * NULL. */
2741 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
2742                                       int slot_num,
2743                                       struct ocfs2_dinode **tl_copy)
2744 {
2745         int status;
2746         struct inode *tl_inode = NULL;
2747         struct buffer_head *tl_bh = NULL;
2748         struct ocfs2_dinode *di;
2749         struct ocfs2_truncate_log *tl;
2750
2751         *tl_copy = NULL;
2752
2753         mlog(0, "recover truncate log from slot %d\n", slot_num);
2754
2755         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
2756         if (status < 0) {
2757                 mlog_errno(status);
2758                 goto bail;
2759         }
2760
2761         di = (struct ocfs2_dinode *) tl_bh->b_data;
2762         tl = &di->id2.i_dealloc;
2763         if (!OCFS2_IS_VALID_DINODE(di)) {
2764                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
2765                 status = -EIO;
2766                 goto bail;
2767         }
2768
2769         if (le16_to_cpu(tl->tl_used)) {
2770                 mlog(0, "We'll have %u logs to recover\n",
2771                      le16_to_cpu(tl->tl_used));
2772
2773                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
2774                 if (!(*tl_copy)) {
2775                         status = -ENOMEM;
2776                         mlog_errno(status);
2777                         goto bail;
2778                 }
2779
2780                 /* Assuming the write-out below goes well, this copy
2781                  * will be passed back to recovery for processing. */
2782                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
2783
2784                 /* All we need to do to clear the truncate log is set
2785                  * tl_used. */
2786                 tl->tl_used = 0;
2787
2788                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
2789                 if (status < 0) {
2790                         mlog_errno(status);
2791                         goto bail;
2792                 }
2793         }
2794
2795 bail:
2796         if (tl_inode)
2797                 iput(tl_inode);
2798         if (tl_bh)
2799                 brelse(tl_bh);
2800
2801         if (status < 0 && (*tl_copy)) {
2802                 kfree(*tl_copy);
2803                 *tl_copy = NULL;
2804         }
2805
2806         mlog_exit(status);
2807         return status;
2808 }
2809
2810 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
2811                                          struct ocfs2_dinode *tl_copy)
2812 {
2813         int status = 0;
2814         int i;
2815         unsigned int clusters, num_recs, start_cluster;
2816         u64 start_blk;
2817         handle_t *handle;
2818         struct inode *tl_inode = osb->osb_tl_inode;
2819         struct ocfs2_truncate_log *tl;
2820
2821         mlog_entry_void();
2822
2823         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
2824                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
2825                 return -EINVAL;
2826         }
2827
2828         tl = &tl_copy->id2.i_dealloc;
2829         num_recs = le16_to_cpu(tl->tl_used);
2830         mlog(0, "cleanup %u records from %llu\n", num_recs,
2831              (unsigned long long)tl_copy->i_blkno);
2832
2833         mutex_lock(&tl_inode->i_mutex);
2834         for(i = 0; i < num_recs; i++) {
2835                 if (ocfs2_truncate_log_needs_flush(osb)) {
2836                         status = __ocfs2_flush_truncate_log(osb);
2837                         if (status < 0) {
2838                                 mlog_errno(status);
2839                                 goto bail_up;
2840                         }
2841                 }
2842
2843                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2844                 if (IS_ERR(handle)) {
2845                         status = PTR_ERR(handle);
2846                         mlog_errno(status);
2847                         goto bail_up;
2848                 }
2849
2850                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
2851                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
2852                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
2853
2854                 status = ocfs2_truncate_log_append(osb, handle,
2855                                                    start_blk, clusters);
2856                 ocfs2_commit_trans(osb, handle);
2857                 if (status < 0) {
2858                         mlog_errno(status);
2859                         goto bail_up;
2860                 }
2861         }
2862
2863 bail_up:
2864         mutex_unlock(&tl_inode->i_mutex);
2865
2866         mlog_exit(status);
2867         return status;
2868 }
2869
2870 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
2871 {
2872         int status;
2873         struct inode *tl_inode = osb->osb_tl_inode;
2874
2875         mlog_entry_void();
2876
2877         if (tl_inode) {
2878                 cancel_delayed_work(&osb->osb_truncate_log_wq);
2879                 flush_workqueue(ocfs2_wq);
2880
2881                 status = ocfs2_flush_truncate_log(osb);
2882                 if (status < 0)
2883                         mlog_errno(status);
2884
2885                 brelse(osb->osb_tl_bh);
2886                 iput(osb->osb_tl_inode);
2887         }
2888
2889         mlog_exit_void();
2890 }
2891
2892 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2893 {
2894         int status;
2895         struct inode *tl_inode = NULL;
2896         struct buffer_head *tl_bh = NULL;
2897
2898         mlog_entry_void();
2899
2900         status = ocfs2_get_truncate_log_info(osb,
2901                                              osb->slot_num,
2902                                              &tl_inode,
2903                                              &tl_bh);
2904         if (status < 0)
2905                 mlog_errno(status);
2906
2907         /* ocfs2_truncate_log_shutdown keys on the existence of
2908          * osb->osb_tl_inode so we don't set any of the osb variables
2909          * until we're sure all is well. */
2910         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
2911                           ocfs2_truncate_log_worker);
2912         osb->osb_tl_bh    = tl_bh;
2913         osb->osb_tl_inode = tl_inode;
2914
2915         mlog_exit(status);
2916         return status;
2917 }
2918
2919 /* This function will figure out whether the currently last extent
2920  * block will be deleted, and if it will, what the new last extent
2921  * block will be so we can update his h_next_leaf_blk field, as well
2922  * as the dinodes i_last_eb_blk */
2923 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
2924                                        u32 new_i_clusters,
2925                                        struct ocfs2_path *path,
2926                                        struct buffer_head **new_last_eb)
2927 {
2928         int ret = 0;
2929         u32 cpos;
2930         struct ocfs2_extent_block *eb;
2931         struct ocfs2_extent_list *el;
2932         struct buffer_head *bh = NULL;
2933
2934         *new_last_eb = NULL;
2935
2936         /* we have no tree, so of course, no last_eb. */
2937         if (!path->p_tree_depth)
2938                 goto out;
2939
2940         /* trunc to zero special case - this makes tree_depth = 0
2941          * regardless of what it is.  */
2942         if (!new_i_clusters)
2943                 goto out;
2944
2945         el = path_leaf_el(path);
2946         BUG_ON(!el->l_next_free_rec);
2947
2948         /* Make sure that this guy will actually be empty after we
2949          * clear away the data. */
2950         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2951                 if (le16_to_cpu(el->l_next_free_rec) > 1 &&
2952                     le32_to_cpu(el->l_recs[1].e_cpos) < new_i_clusters)
2953                         goto out;
2954         } else if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
2955                 goto out;
2956
2957         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2958         if (ret) {
2959                 mlog_errno(ret);
2960                 goto out;
2961         }
2962
2963         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
2964         if (ret) {
2965                 mlog_errno(ret);
2966                 goto out;
2967         }
2968
2969         eb = (struct ocfs2_extent_block *) bh->b_data;
2970         el = &eb->h_list;
2971         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
2972                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
2973                 ret = -EROFS;
2974                 goto out;
2975         }
2976
2977         *new_last_eb = bh;
2978         get_bh(*new_last_eb);
2979         mlog(0, "returning block %llu, (cpos: %u)\n",
2980              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
2981 out:
2982         brelse(bh);
2983
2984         return ret;
2985 }
2986
2987 static int ocfs2_do_truncate(struct ocfs2_super *osb,
2988                              unsigned int clusters_to_del,
2989                              struct inode *inode,
2990                              struct buffer_head *fe_bh,
2991                              handle_t *handle,
2992                              struct ocfs2_truncate_context *tc,
2993                              struct ocfs2_path *path)
2994 {
2995         int status, i, index;
2996         struct ocfs2_dinode *fe;
2997         struct ocfs2_extent_block *eb;
2998         struct ocfs2_extent_block *last_eb = NULL;
2999         struct ocfs2_extent_list *el;
3000         struct buffer_head *eb_bh = NULL;
3001         struct buffer_head *last_eb_bh = NULL;
3002         u64 delete_blk = 0;
3003
3004         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3005
3006         status = ocfs2_find_new_last_ext_blk(inode,
3007                                              le32_to_cpu(fe->i_clusters) -
3008                                              clusters_to_del,
3009                                              path, &last_eb_bh);
3010         if (status < 0) {
3011                 mlog_errno(status);
3012                 goto bail;
3013         }
3014
3015         /*
3016          * Each component will be touched, so we might as well journal
3017          * here to avoid having to handle errors later.
3018          */
3019         for (i = 0; i < path_num_items(path); i++) {
3020                 status = ocfs2_journal_access(handle, inode,
3021                                               path->p_node[i].bh,
3022                                               OCFS2_JOURNAL_ACCESS_WRITE);
3023                 if (status < 0) {
3024                         mlog_errno(status);
3025                         goto bail;
3026                 }
3027         }
3028
3029         if (last_eb_bh) {
3030                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
3031                                               OCFS2_JOURNAL_ACCESS_WRITE);
3032                 if (status < 0) {
3033                         mlog_errno(status);
3034                         goto bail;
3035                 }
3036
3037                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3038         }
3039
3040         el = &(fe->id2.i_list);
3041
3042         /*
3043          * Lower levels depend on this never happening, but it's best
3044          * to check it up here before changing the tree.
3045          */
3046         if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
3047                 ocfs2_error(inode->i_sb,
3048                             "Inode %lu has an empty extent record, depth %u\n",
3049                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
3050                 goto bail;
3051         }
3052
3053         spin_lock(&OCFS2_I(inode)->ip_lock);
3054         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
3055                                       clusters_to_del;
3056         spin_unlock(&OCFS2_I(inode)->ip_lock);
3057         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
3058
3059         i = le16_to_cpu(el->l_next_free_rec) - 1;
3060
3061         BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
3062         le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
3063         /* tree depth zero, we can just delete the clusters, otherwise
3064          * we need to record the offset of the next level extent block
3065          * as we may overwrite it. */
3066         if (!el->l_tree_depth) {
3067                 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
3068                         + ocfs2_clusters_to_blocks(osb->sb,
3069                                         le32_to_cpu(el->l_recs[i].e_clusters));
3070
3071                 if (!el->l_recs[i].e_clusters) {
3072                         /* if we deleted the whole extent record, then clear
3073                          * out the other fields and update the extent
3074                          * list.
3075                          */
3076                         el->l_recs[i].e_cpos = 0;
3077                         el->l_recs[i].e_blkno = 0;
3078                         BUG_ON(!el->l_next_free_rec);
3079                         le16_add_cpu(&el->l_next_free_rec, -1);
3080
3081                         /*
3082                          * The leftmost record might be an empty extent -
3083                          * delete it here too.
3084                          */
3085                         if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
3086                                 el->l_recs[0].e_cpos = 0;
3087                                 el->l_recs[0].e_blkno = 0;
3088                                 el->l_next_free_rec = 0;
3089                         }
3090                 }
3091         }
3092
3093         if (le32_to_cpu(fe->i_clusters) == 0) {
3094                 /* trunc to zero is a special case. */
3095                 el->l_tree_depth = 0;
3096                 fe->i_last_eb_blk = 0;
3097         } else if (last_eb)
3098                 fe->i_last_eb_blk = last_eb->h_blkno;
3099
3100         status = ocfs2_journal_dirty(handle, fe_bh);
3101         if (status < 0) {
3102                 mlog_errno(status);
3103                 goto bail;
3104         }
3105
3106         if (last_eb) {
3107                 /* If there will be a new last extent block, then by
3108                  * definition, there cannot be any leaves to the right of
3109                  * him. */
3110                 last_eb->h_next_leaf_blk = 0;
3111                 status = ocfs2_journal_dirty(handle, last_eb_bh);
3112                 if (status < 0) {
3113                         mlog_errno(status);
3114                         goto bail;
3115                 }
3116         }
3117
3118         index = 1;
3119         /* if our tree depth > 0, update all the tree blocks below us. */
3120         while (index <= path->p_tree_depth) {
3121                 eb_bh = path->p_node[index].bh;
3122                 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
3123                 el = path->p_node[index].el;
3124
3125                 mlog(0, "traveling tree (index = %d, extent block: %llu)\n",
3126                      index,  (unsigned long long)eb_bh->b_blocknr);
3127
3128                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
3129                 if (index !=
3130                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3131                         ocfs2_error(inode->i_sb,
3132                                     "Inode %lu has invalid ext. block %llu\n",
3133                                     inode->i_ino,
3134                                     (unsigned long long)eb_bh->b_blocknr);
3135                         goto bail;
3136                 }
3137
3138                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3139
3140                 mlog(0, "extent block %llu, before: record %d: "
3141                      "(%u, %u, %llu), next = %u\n",
3142                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
3143                      le32_to_cpu(el->l_recs[i].e_cpos),
3144                      le32_to_cpu(el->l_recs[i].e_clusters),
3145                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
3146                      le16_to_cpu(el->l_next_free_rec));
3147
3148                 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
3149                 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
3150
3151                 /* bottom-most block requires us to delete data.*/
3152                 if (!el->l_tree_depth)
3153                         delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
3154                                 + ocfs2_clusters_to_blocks(osb->sb,
3155                                         le32_to_cpu(el->l_recs[i].e_clusters));
3156                 if (!el->l_recs[i].e_clusters) {
3157                         el->l_recs[i].e_cpos = 0;
3158                         el->l_recs[i].e_blkno = 0;
3159                         BUG_ON(!el->l_next_free_rec);
3160                         le16_add_cpu(&el->l_next_free_rec, -1);
3161                 }
3162                 if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
3163                         el->l_recs[0].e_cpos = 0;
3164                         el->l_recs[0].e_blkno = 0;
3165                         el->l_next_free_rec = 0;
3166                 }
3167
3168                 mlog(0, "extent block %llu, after: record %d: "
3169                      "(%u, %u, %llu), next = %u\n",
3170                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
3171                      le32_to_cpu(el->l_recs[i].e_cpos),
3172                      le32_to_cpu(el->l_recs[i].e_clusters),
3173                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
3174                      le16_to_cpu(el->l_next_free_rec));
3175
3176                 status = ocfs2_journal_dirty(handle, eb_bh);
3177                 if (status < 0) {
3178                         mlog_errno(status);
3179                         goto bail;
3180                 }
3181
3182                 if (!el->l_next_free_rec) {
3183                         mlog(0, "deleting this extent block.\n");
3184
3185                         ocfs2_remove_from_cache(inode, eb_bh);
3186
3187                         BUG_ON(el->l_recs[0].e_clusters);
3188                         BUG_ON(el->l_recs[0].e_cpos);
3189                         BUG_ON(el->l_recs[0].e_blkno);
3190
3191                         /*
3192                          * We need to remove this extent block from
3193                          * the list above it.
3194                          *
3195                          * Since we've passed it already in this loop,
3196                          * no need to worry about journaling.
3197                          */
3198                         el = path->p_node[index - 1].el;
3199                         i = le16_to_cpu(el->l_next_free_rec) - 1;
3200                         BUG_ON(i < 0);
3201                         el->l_recs[i].e_cpos = 0;
3202                         el->l_recs[i].e_clusters = 0;
3203                         el->l_recs[i].e_blkno = 0;
3204                         le16_add_cpu(&el->l_next_free_rec, -1);
3205
3206                         if (eb->h_suballoc_slot == 0) {
3207                                 /*
3208                                  * This code only understands how to
3209                                  * lock the suballocator in slot 0,
3210                                  * which is fine because allocation is
3211                                  * only ever done out of that
3212                                  * suballocator too. A future version
3213                                  * might change that however, so avoid
3214                                  * a free if we don't know how to
3215                                  * handle it. This way an fs incompat
3216                                  * bit will not be necessary.
3217                                  */
3218                                 status = ocfs2_free_extent_block(handle,
3219                                                                  tc->tc_ext_alloc_inode,
3220                                                                  tc->tc_ext_alloc_bh,
3221                                                                  eb);
3222                                 if (status < 0) {
3223                                         mlog_errno(status);
3224                                         goto bail;
3225                                 }
3226                         }
3227                 }
3228                 index++;
3229         }
3230
3231         BUG_ON(!delete_blk);
3232         status = ocfs2_truncate_log_append(osb, handle, delete_blk,
3233                                            clusters_to_del);
3234         if (status < 0) {
3235                 mlog_errno(status);
3236                 goto bail;
3237         }
3238         status = 0;
3239 bail:
3240
3241         mlog_exit(status);
3242         return status;
3243 }
3244
3245 /*
3246  * It is expected, that by the time you call this function,
3247  * inode->i_size and fe->i_size have been adjusted.
3248  *
3249  * WARNING: This will kfree the truncate context
3250  */
3251 int ocfs2_commit_truncate(struct ocfs2_super *osb,
3252                           struct inode *inode,
3253                           struct buffer_head *fe_bh,
3254                           struct ocfs2_truncate_context *tc)
3255 {
3256         int status, i, credits, tl_sem = 0;
3257         u32 clusters_to_del, new_highest_cpos, range;
3258         struct ocfs2_extent_list *el;
3259         handle_t *handle = NULL;
3260         struct inode *tl_inode = osb->osb_tl_inode;
3261         struct ocfs2_path *path = NULL;
3262
3263         mlog_entry_void();
3264
3265         down_write(&OCFS2_I(inode)->ip_alloc_sem);
3266
3267         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3268                                                      i_size_read(inode));
3269
3270         path = ocfs2_new_inode_path(fe_bh);
3271         if (!path) {
3272                 status = -ENOMEM;
3273                 mlog_errno(status);
3274                 goto bail;
3275         }
3276 start:
3277         /*
3278          * Truncate always works against the rightmost tree branch.
3279          */
3280         status = ocfs2_find_path(inode, path, UINT_MAX);
3281         if (status) {
3282                 mlog_errno(status);
3283                 goto bail;
3284         }
3285
3286         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
3287              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
3288
3289         /*
3290          * By now, el will point to the extent list on the bottom most
3291          * portion of this tree. Only the tail record is considered in
3292          * each pass.
3293          *
3294          * We handle the following cases, in order:
3295          * - empty extent: delete the remaining branch
3296          * - remove the entire record
3297          * - remove a partial record
3298          * - no record needs to be removed (truncate has completed)
3299          */
3300         el = path_leaf_el(path);
3301         i = le16_to_cpu(el->l_next_free_rec) - 1;
3302         range = le32_to_cpu(el->l_recs[i].e_cpos) +
3303                 le32_to_cpu(el->l_recs[i].e_clusters);
3304         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
3305                 clusters_to_del = 0;
3306         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3307                 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
3308         } else if (range > new_highest_cpos) {
3309                 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
3310                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
3311                                   new_highest_cpos;
3312         } else {
3313                 status = 0;
3314                 goto bail;
3315         }
3316
3317         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3318              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3319
3320         BUG_ON(clusters_to_del == 0);
3321
3322         mutex_lock(&tl_inode->i_mutex);
3323         tl_sem = 1;
3324         /* ocfs2_truncate_log_needs_flush guarantees us at least one
3325          * record is free for use. If there isn't any, we flush to get
3326          * an empty truncate log.  */
3327         if (ocfs2_truncate_log_needs_flush(osb)) {
3328                 status = __ocfs2_flush_truncate_log(osb);
3329                 if (status < 0) {
3330                         mlog_errno(status);
3331                         goto bail;
3332                 }
3333         }
3334
3335         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
3336                                                 (struct ocfs2_dinode *)fe_bh->b_data,
3337                                                 el);
3338         handle = ocfs2_start_trans(osb, credits);
3339         if (IS_ERR(handle)) {
3340                 status = PTR_ERR(handle);
3341                 handle = NULL;
3342                 mlog_errno(status);
3343                 goto bail;
3344         }
3345
3346         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
3347                                    tc, path);
3348         if (status < 0) {
3349                 mlog_errno(status);
3350                 goto bail;
3351         }
3352
3353         mutex_unlock(&tl_inode->i_mutex);
3354         tl_sem = 0;
3355
3356         ocfs2_commit_trans(osb, handle);
3357         handle = NULL;
3358
3359         ocfs2_reinit_path(path, 1);
3360
3361         /*
3362          * Only loop if we still have allocation.
3363          */
3364         if (OCFS2_I(inode)->ip_clusters)
3365                 goto start;
3366 bail:
3367         up_write(&OCFS2_I(inode)->ip_alloc_sem);
3368
3369         ocfs2_schedule_truncate_log_flush(osb, 1);
3370
3371         if (tl_sem)
3372                 mutex_unlock(&tl_inode->i_mutex);
3373
3374         if (handle)
3375                 ocfs2_commit_trans(osb, handle);
3376
3377         ocfs2_free_path(path);
3378
3379         /* This will drop the ext_alloc cluster lock for us */
3380         ocfs2_free_truncate_context(tc);
3381
3382         mlog_exit(status);
3383         return status;
3384 }
3385
3386 /*
3387  * Expects the inode to already be locked. This will figure out which
3388  * inodes need to be locked and will put them on the returned truncate
3389  * context.
3390  */
3391 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3392                            struct inode *inode,
3393                            struct buffer_head *fe_bh,
3394                            struct ocfs2_truncate_context **tc)
3395 {
3396         int status, metadata_delete, i;
3397         unsigned int new_i_clusters;
3398         struct ocfs2_dinode *fe;
3399         struct ocfs2_extent_block *eb;
3400         struct ocfs2_extent_list *el;
3401         struct buffer_head *last_eb_bh = NULL;
3402         struct inode *ext_alloc_inode = NULL;
3403         struct buffer_head *ext_alloc_bh = NULL;
3404
3405         mlog_entry_void();
3406
3407         *tc = NULL;
3408
3409         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
3410                                                   i_size_read(inode));
3411         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3412
3413         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
3414              "%llu\n", fe->i_clusters, new_i_clusters,
3415              (unsigned long long)fe->i_size);
3416
3417         if (!ocfs2_sparse_alloc(osb) &&
3418             le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
3419                 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
3420                             "%u and size %llu whereas struct inode has "
3421                             "cluster count %u and size %llu which caused an "
3422                             "invalid truncate to %u clusters.",
3423                             (unsigned long long)le64_to_cpu(fe->i_blkno),
3424                             le32_to_cpu(fe->i_clusters),
3425                             (unsigned long long)le64_to_cpu(fe->i_size),
3426                             OCFS2_I(inode)->ip_clusters, i_size_read(inode),
3427                             new_i_clusters);
3428                 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
3429                 status = -EIO;
3430                 goto bail;
3431         }
3432
3433         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
3434         if (!(*tc)) {
3435                 status = -ENOMEM;
3436                 mlog_errno(status);
3437                 goto bail;
3438         }
3439
3440         metadata_delete = 0;
3441         if (fe->id2.i_list.l_tree_depth) {
3442                 /* If we have a tree, then the truncate may result in
3443                  * metadata deletes. Figure this out from the
3444                  * rightmost leaf block.*/
3445                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3446                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
3447                 if (status < 0) {
3448                         mlog_errno(status);
3449                         goto bail;
3450                 }
3451                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3452                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3453                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3454
3455                         brelse(last_eb_bh);
3456                         status = -EIO;
3457                         goto bail;
3458                 }
3459                 el = &(eb->h_list);
3460
3461                 i = 0;
3462                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3463                         i = 1;
3464                 /*
3465                  * XXX: Should we check that next_free_rec contains
3466                  * the extent?
3467                  */
3468                 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3469                         metadata_delete = 1;
3470         }
3471
3472         (*tc)->tc_last_eb_bh = last_eb_bh;
3473
3474         if (metadata_delete) {
3475                 mlog(0, "Will have to delete metadata for this trunc. "
3476                      "locking allocator.\n");
3477                 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3478                 if (!ext_alloc_inode) {
3479                         status = -ENOMEM;
3480                         mlog_errno(status);
3481                         goto bail;
3482                 }
3483
3484                 mutex_lock(&ext_alloc_inode->i_mutex);
3485                 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3486
3487                 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3488                 if (status < 0) {
3489                         mlog_errno(status);
3490                         goto bail;
3491                 }
3492                 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3493                 (*tc)->tc_ext_alloc_locked = 1;
3494         }
3495
3496         status = 0;
3497 bail:
3498         if (status < 0) {
3499                 if (*tc)
3500                         ocfs2_free_truncate_context(*tc);
3501                 *tc = NULL;
3502         }
3503         mlog_exit_void();
3504         return status;
3505 }
3506
3507 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3508 {
3509         if (tc->tc_ext_alloc_inode) {
3510                 if (tc->tc_ext_alloc_locked)
3511                         ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
3512
3513                 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
3514                 iput(tc->tc_ext_alloc_inode);
3515         }
3516
3517         if (tc->tc_ext_alloc_bh)
3518                 brelse(tc->tc_ext_alloc_bh);
3519
3520         if (tc->tc_last_eb_bh)
3521                 brelse(tc->tc_last_eb_bh);
3522
3523         kfree(tc);
3524 }