ocfs2: Modify ocfs2_num_free_extents for future xattr usage.
fs/ocfs2/alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54                                          struct ocfs2_extent_block *eb);
55
56 /*
57  * Structures which describe a path through a btree, and functions to
58  * manipulate them.
59  *
60  * The idea here is to be as generic as possible with the tree
61  * manipulation code.
62  */
63 struct ocfs2_path_item {
64         struct buffer_head              *bh;
65         struct ocfs2_extent_list        *el;
66 };
67
68 #define OCFS2_MAX_PATH_DEPTH    5
69
70 struct ocfs2_path {
71         int                     p_tree_depth;
72         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
73 };
74
75 #define path_root_bh(_path) ((_path)->p_node[0].bh)
76 #define path_root_el(_path) ((_path)->p_node[0].el)
77 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
78 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
79 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
80
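/*
 * A sketch of how these accessors are typically strung together by a
 * caller. The enclosing code below is hypothetical; only the named
 * helpers (ocfs2_new_inode_path(), ocfs2_find_path(), ocfs2_free_path(),
 * all defined later in this file) and the path_* macros are real:
 *
 *	struct ocfs2_path *path = ocfs2_new_inode_path(di_bh);
 *	if (!path)
 *		return -ENOMEM;
 *	ret = ocfs2_find_path(inode, path, cpos);
 *	if (ret == 0) {
 *		struct ocfs2_extent_list *el = path_leaf_el(path);
 *		struct buffer_head *leaf_bh = path_leaf_bh(path);
 *		...
 *	}
 *	ocfs2_free_path(path);
 */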
81 /*
82  * Reset the actual path elements so that we can re-use the structure
83  * to build another path. Generally, this involves freeing the buffer
84  * heads.
85  */
86 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
87 {
88         int i, start = 0, depth = 0;
89         struct ocfs2_path_item *node;
90
91         if (keep_root)
92                 start = 1;
93
94         for(i = start; i < path_num_items(path); i++) {
95                 node = &path->p_node[i];
96
97                 brelse(node->bh);
98                 node->bh = NULL;
99                 node->el = NULL;
100         }
101
102         /*
103          * Tree depth may change during truncate, or insert. If we're
104          * keeping the root extent list, then make sure that our path
105          * structure reflects the proper depth.
106          */
107         if (keep_root)
108                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
109
110         path->p_tree_depth = depth;
111 }
112
113 static void ocfs2_free_path(struct ocfs2_path *path)
114 {
115         if (path) {
116                 ocfs2_reinit_path(path, 0);
117                 kfree(path);
118         }
119 }
120
121 /*
122  * Copy all the elements of src into dest. After this call, src could be freed
123  * without affecting dest.
124  *
125  * Both paths should have the same root. Any non-root elements of dest
126  * will be freed.
127  */
128 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129 {
130         int i;
131
132         BUG_ON(path_root_bh(dest) != path_root_bh(src));
133         BUG_ON(path_root_el(dest) != path_root_el(src));
134
135         ocfs2_reinit_path(dest, 1);
136
137         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138                 dest->p_node[i].bh = src->p_node[i].bh;
139                 dest->p_node[i].el = src->p_node[i].el;
140
141                 if (dest->p_node[i].bh)
142                         get_bh(dest->p_node[i].bh);
143         }
144 }
145
146 /*
147  * Make the *dest path the same as src and re-initialize src path to
148  * have a root only.
149  */
150 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
151 {
152         int i;
153
154         BUG_ON(path_root_bh(dest) != path_root_bh(src));
155
156         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
157                 brelse(dest->p_node[i].bh);
158
159                 dest->p_node[i].bh = src->p_node[i].bh;
160                 dest->p_node[i].el = src->p_node[i].el;
161
162                 src->p_node[i].bh = NULL;
163                 src->p_node[i].el = NULL;
164         }
165 }
166
167 /*
168  * Insert an extent block at given index.
169  *
170  * This will not take an additional reference on eb_bh.
171  */
172 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
173                                         struct buffer_head *eb_bh)
174 {
175         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
176
177         /*
178          * Right now, no root bh is an extent block, so this helps
179          * catch code errors with dinode trees. The assertion can be
180          * safely removed if we ever need to insert extent block
181          * structures at the root.
182          */
183         BUG_ON(index == 0);
184
185         path->p_node[index].bh = eb_bh;
186         path->p_node[index].el = &eb->h_list;
187 }
188
189 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
190                                          struct ocfs2_extent_list *root_el)
191 {
192         struct ocfs2_path *path;
193
194         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
195
196         path = kzalloc(sizeof(*path), GFP_NOFS);
197         if (path) {
198                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
199                 get_bh(root_bh);
200                 path_root_bh(path) = root_bh;
201                 path_root_el(path) = root_el;
202         }
203
204         return path;
205 }
206
207 /*
208  * Allocate and initialize a new path based on a disk inode tree.
209  */
210 static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
211 {
212         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
213         struct ocfs2_extent_list *el = &di->id2.i_list;
214
215         return ocfs2_new_path(di_bh, el);
216 }
217
218 /*
219  * Convenience function to journal all components in a path.
220  */
221 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
222                                      struct ocfs2_path *path)
223 {
224         int i, ret = 0;
225
226         if (!path)
227                 goto out;
228
229         for(i = 0; i < path_num_items(path); i++) {
230                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
231                                            OCFS2_JOURNAL_ACCESS_WRITE);
232                 if (ret < 0) {
233                         mlog_errno(ret);
234                         goto out;
235                 }
236         }
237
238 out:
239         return ret;
240 }
241
242 /*
243  * Return the index of the extent record which contains cluster #v_cluster.
244  * -1 is returned if it was not found.
245  *
246  * Should work fine on interior and exterior nodes.
247  */
248 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249 {
250         int ret = -1;
251         int i;
252         struct ocfs2_extent_rec *rec;
253         u32 rec_end, rec_start, clusters;
254
255         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256                 rec = &el->l_recs[i];
257
258                 rec_start = le32_to_cpu(rec->e_cpos);
259                 clusters = ocfs2_rec_clusters(el, rec);
260
261                 rec_end = rec_start + clusters;
262
263                 if (v_cluster >= rec_start && v_cluster < rec_end) {
264                         ret = i;
265                         break;
266                 }
267         }
268
269         return ret;
270 }
271
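/*
 * For example, given a leaf whose records are (e_cpos, clusters) =
 * (0, 4) and (4, 8), ocfs2_search_extent_list(el, 6) returns 1 because
 * cluster 6 lies in the half-open range [4, 12) of the second record,
 * while ocfs2_search_extent_list(el, 12) returns -1.
 */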
272 enum ocfs2_contig_type {
273         CONTIG_NONE = 0,
274         CONTIG_LEFT,
275         CONTIG_RIGHT,
276         CONTIG_LEFTRIGHT,
277 };
278
279
280 /*
281  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
282  * ocfs2_extent_contig only work properly against leaf nodes!
283  */
284 static int ocfs2_block_extent_contig(struct super_block *sb,
285                                      struct ocfs2_extent_rec *ext,
286                                      u64 blkno)
287 {
288         u64 blk_end = le64_to_cpu(ext->e_blkno);
289
290         blk_end += ocfs2_clusters_to_blocks(sb,
291                                     le16_to_cpu(ext->e_leaf_clusters));
292
293         return blkno == blk_end;
294 }
295
296 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
297                                   struct ocfs2_extent_rec *right)
298 {
299         u32 left_range;
300
301         left_range = le32_to_cpu(left->e_cpos) +
302                 le16_to_cpu(left->e_leaf_clusters);
303
304         return (left_range == le32_to_cpu(right->e_cpos));
305 }
306
307 static enum ocfs2_contig_type
308         ocfs2_extent_contig(struct inode *inode,
309                             struct ocfs2_extent_rec *ext,
310                             struct ocfs2_extent_rec *insert_rec)
311 {
312         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
313
314         /*
315          * Refuse to coalesce extent records with different flag
316          * fields - we don't want to mix unwritten extents with user
317          * data.
318          */
319         if (ext->e_flags != insert_rec->e_flags)
320                 return CONTIG_NONE;
321
322         if (ocfs2_extents_adjacent(ext, insert_rec) &&
323             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
324                         return CONTIG_RIGHT;
325
326         blkno = le64_to_cpu(ext->e_blkno);
327         if (ocfs2_extents_adjacent(insert_rec, ext) &&
328             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
329                 return CONTIG_LEFT;
330
331         return CONTIG_NONE;
332 }
333
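/*
 * A worked example of the checks above, assuming 4 blocks per cluster:
 * if ext covers clusters [10, 14) starting at block 200, then an
 * insert_rec with e_cpos == 14 and e_blkno == 216 is both logically
 * and physically adjacent on the right, so ocfs2_extent_contig()
 * returns CONTIG_RIGHT (provided both records carry the same e_flags).
 */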
334 /*
335  * NOTE: We can have pretty much any combination of contiguousness and
336  * appending.
337  *
338  * The usefulness of APPEND_TAIL is more in that it lets us know that
339  * we'll have to update the path to that leaf.
340  */
341 enum ocfs2_append_type {
342         APPEND_NONE = 0,
343         APPEND_TAIL,
344 };
345
346 enum ocfs2_split_type {
347         SPLIT_NONE = 0,
348         SPLIT_LEFT,
349         SPLIT_RIGHT,
350 };
351
352 struct ocfs2_insert_type {
353         enum ocfs2_split_type   ins_split;
354         enum ocfs2_append_type  ins_appending;
355         enum ocfs2_contig_type  ins_contig;
356         int                     ins_contig_index;
357         int                     ins_tree_depth;
358 };
359
360 struct ocfs2_merge_ctxt {
361         enum ocfs2_contig_type  c_contig_type;
362         int                     c_has_empty_extent;
363         int                     c_split_covers_rec;
364 };
365
366 /*
367  * How many free extents have we got before we need more meta data?
368  */
369 int ocfs2_num_free_extents(struct ocfs2_super *osb,
370                            struct inode *inode,
371                            struct buffer_head *bh)
372 {
373         int retval;
374         struct ocfs2_extent_list *el;
375         struct ocfs2_extent_block *eb;
376         struct buffer_head *eb_bh = NULL;
377         struct ocfs2_dinode *fe = (struct ocfs2_dinode *)bh->b_data;
378
379         mlog_entry_void();
380
381         if (!OCFS2_IS_VALID_DINODE(fe)) {
382                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
383                 retval = -EIO;
384                 goto bail;
385         }
386
387         if (fe->i_last_eb_blk) {
388                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
389                                           &eb_bh, OCFS2_BH_CACHED, inode);
390                 if (retval < 0) {
391                         mlog_errno(retval);
392                         goto bail;
393                 }
394                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
395                 el = &eb->h_list;
396         } else
397                 el = &fe->id2.i_list;
398
399         BUG_ON(el->l_tree_depth != 0);
400
401         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
402 bail:
403         if (eb_bh)
404                 brelse(eb_bh);
405
406         mlog_exit(retval);
407         return retval;
408 }
409
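/*
 * A hypothetical caller pattern for the helper above: check how many
 * records the rightmost leaf can still take before reserving metadata
 * for a tree grow. Only ocfs2_num_free_extents() is quoted from this
 * file; the reservation call is sketched from the suballocator and may
 * not match its exact signature:
 *
 *	num_free_extents = ocfs2_num_free_extents(osb, inode, di_bh);
 *	if (num_free_extents < 0) {
 *		ret = num_free_extents;
 *		mlog_errno(ret);
 *		goto out;
 *	}
 *	if (num_free_extents < extents_to_add)
 *		ret = ocfs2_reserve_new_metadata(osb, di, &meta_ac);
 */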
410 /* expects array to already be allocated
411  *
412  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
413  * l_count for you
414  */
415 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
416                                      handle_t *handle,
417                                      struct inode *inode,
418                                      int wanted,
419                                      struct ocfs2_alloc_context *meta_ac,
420                                      struct buffer_head *bhs[])
421 {
422         int count, status, i;
423         u16 suballoc_bit_start;
424         u32 num_got;
425         u64 first_blkno;
426         struct ocfs2_extent_block *eb;
427
428         mlog_entry_void();
429
430         count = 0;
431         while (count < wanted) {
432                 status = ocfs2_claim_metadata(osb,
433                                               handle,
434                                               meta_ac,
435                                               wanted - count,
436                                               &suballoc_bit_start,
437                                               &num_got,
438                                               &first_blkno);
439                 if (status < 0) {
440                         mlog_errno(status);
441                         goto bail;
442                 }
443
444                 for(i = count;  i < (num_got + count); i++) {
445                         bhs[i] = sb_getblk(osb->sb, first_blkno);
446                         if (bhs[i] == NULL) {
447                                 status = -EIO;
448                                 mlog_errno(status);
449                                 goto bail;
450                         }
451                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
452
453                         status = ocfs2_journal_access(handle, inode, bhs[i],
454                                                       OCFS2_JOURNAL_ACCESS_CREATE);
455                         if (status < 0) {
456                                 mlog_errno(status);
457                                 goto bail;
458                         }
459
460                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
461                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
462                         /* Ok, setup the minimal stuff here. */
463                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
464                         eb->h_blkno = cpu_to_le64(first_blkno);
465                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
466                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
467                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
468                         eb->h_list.l_count =
469                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
470
471                         suballoc_bit_start++;
472                         first_blkno++;
473
474                         /* We'll also be dirtied by the caller, so
475                          * this isn't absolutely necessary. */
476                         status = ocfs2_journal_dirty(handle, bhs[i]);
477                         if (status < 0) {
478                                 mlog_errno(status);
479                                 goto bail;
480                         }
481                 }
482
483                 count += num_got;
484         }
485
486         status = 0;
487 bail:
488         if (status < 0) {
489                 for(i = 0; i < wanted; i++) {
490                         if (bhs[i])
491                                 brelse(bhs[i]);
492                         bhs[i] = NULL;
493                 }
494         }
495         mlog_exit(status);
496         return status;
497 }
498
499 /*
500  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
501  *
502  * Returns the sum of the rightmost extent rec logical offset and
503  * cluster count.
504  *
505  * ocfs2_add_branch() uses this to determine what logical cluster
506  * value should be populated into the leftmost new branch records.
507  *
508  * ocfs2_shift_tree_depth() uses this to determine the # clusters
509  * value for the new topmost tree record.
510  */
511 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
512 {
513         int i;
514
515         i = le16_to_cpu(el->l_next_free_rec) - 1;
516
517         return le32_to_cpu(el->l_recs[i].e_cpos) +
518                 ocfs2_rec_clusters(el, &el->l_recs[i]);
519 }
520
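/*
 * For instance, if the rightmost record of el has e_cpos == 100 and
 * spans 8 clusters, ocfs2_sum_rightmost_rec() returns 108: the cpos
 * stamped into the empty records of a new branch, and the cluster
 * count given to the new topmost record when the tree depth is
 * shifted.
 */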
521 /*
522  * Add an entire tree branch to our inode. eb_bh is the extent block
523  * to start at, if we don't want to start the branch at the dinode
524  * structure.
525  *
526  * last_eb_bh is required as we have to update its next_leaf pointer
527  * for the new last extent block.
528  *
529  * the new branch will be 'empty' in the sense that every block will
530  * contain a single record with cluster count == 0.
531  */
532 static int ocfs2_add_branch(struct ocfs2_super *osb,
533                             handle_t *handle,
534                             struct inode *inode,
535                             struct buffer_head *fe_bh,
536                             struct buffer_head *eb_bh,
537                             struct buffer_head **last_eb_bh,
538                             struct ocfs2_alloc_context *meta_ac)
539 {
540         int status, new_blocks, i;
541         u64 next_blkno, new_last_eb_blk;
542         struct buffer_head *bh;
543         struct buffer_head **new_eb_bhs = NULL;
544         struct ocfs2_dinode *fe;
545         struct ocfs2_extent_block *eb;
546         struct ocfs2_extent_list  *eb_el;
547         struct ocfs2_extent_list  *el;
548         u32 new_cpos;
549
550         mlog_entry_void();
551
552         BUG_ON(!last_eb_bh || !*last_eb_bh);
553
554         fe = (struct ocfs2_dinode *) fe_bh->b_data;
555
556         if (eb_bh) {
557                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
558                 el = &eb->h_list;
559         } else
560                 el = &fe->id2.i_list;
561
562         /* we never add a branch to a leaf. */
563         BUG_ON(!el->l_tree_depth);
564
565         new_blocks = le16_to_cpu(el->l_tree_depth);
566
567         /* allocate the number of new eb blocks we need */
568         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
569                              GFP_KERNEL);
570         if (!new_eb_bhs) {
571                 status = -ENOMEM;
572                 mlog_errno(status);
573                 goto bail;
574         }
575
576         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
577                                            meta_ac, new_eb_bhs);
578         if (status < 0) {
579                 mlog_errno(status);
580                 goto bail;
581         }
582
583         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
584         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
585
586         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
587          * linked with the rest of the tree.
588          * conversely, new_eb_bhs[0] is the new bottommost leaf.
589          *
590          * when we leave the loop, new_last_eb_blk will point to the
591          * newest leaf, and next_blkno will point to the topmost extent
592          * block. */
593         next_blkno = new_last_eb_blk = 0;
594         for(i = 0; i < new_blocks; i++) {
595                 bh = new_eb_bhs[i];
596                 eb = (struct ocfs2_extent_block *) bh->b_data;
597                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
598                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
599                         status = -EIO;
600                         goto bail;
601                 }
602                 eb_el = &eb->h_list;
603
604                 status = ocfs2_journal_access(handle, inode, bh,
605                                               OCFS2_JOURNAL_ACCESS_CREATE);
606                 if (status < 0) {
607                         mlog_errno(status);
608                         goto bail;
609                 }
610
611                 eb->h_next_leaf_blk = 0;
612                 eb_el->l_tree_depth = cpu_to_le16(i);
613                 eb_el->l_next_free_rec = cpu_to_le16(1);
614                 /*
615                  * This actually counts as an empty extent as
616                  * c_clusters == 0
617                  */
618                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
619                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
620                 /*
621                  * eb_el isn't always an interior node, but even leaf
622                  * nodes want a zero'd flags and reserved field so
623                  * this gets the whole 32 bits regardless of use.
624                  */
625                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
626                 if (!eb_el->l_tree_depth)
627                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
628
629                 status = ocfs2_journal_dirty(handle, bh);
630                 if (status < 0) {
631                         mlog_errno(status);
632                         goto bail;
633                 }
634
635                 next_blkno = le64_to_cpu(eb->h_blkno);
636         }
637
638         /* This is a bit hairy. We want to update up to three blocks
639          * here without leaving any of them in an inconsistent state
640          * in case of error. We don't have to worry about
641          * journal_dirty erroring as it won't unless we've aborted the
642          * handle (in which case we would never be here) so reserving
643          * the write with journal_access is all we need to do. */
644         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
645                                       OCFS2_JOURNAL_ACCESS_WRITE);
646         if (status < 0) {
647                 mlog_errno(status);
648                 goto bail;
649         }
650         status = ocfs2_journal_access(handle, inode, fe_bh,
651                                       OCFS2_JOURNAL_ACCESS_WRITE);
652         if (status < 0) {
653                 mlog_errno(status);
654                 goto bail;
655         }
656         if (eb_bh) {
657                 status = ocfs2_journal_access(handle, inode, eb_bh,
658                                               OCFS2_JOURNAL_ACCESS_WRITE);
659                 if (status < 0) {
660                         mlog_errno(status);
661                         goto bail;
662                 }
663         }
664
665         /* Link the new branch into the rest of the tree (el will
666          * either be on the fe, or the extent block passed in). */
667         i = le16_to_cpu(el->l_next_free_rec);
668         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
669         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
670         el->l_recs[i].e_int_clusters = 0;
671         le16_add_cpu(&el->l_next_free_rec, 1);
672
673         /* fe needs a new last extent block pointer, as does the
674          * next_leaf on the previously last-extent-block. */
675         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
676
677         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
678         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
679
680         status = ocfs2_journal_dirty(handle, *last_eb_bh);
681         if (status < 0)
682                 mlog_errno(status);
683         status = ocfs2_journal_dirty(handle, fe_bh);
684         if (status < 0)
685                 mlog_errno(status);
686         if (eb_bh) {
687                 status = ocfs2_journal_dirty(handle, eb_bh);
688                 if (status < 0)
689                         mlog_errno(status);
690         }
691
692         /*
693          * Some callers want to track the rightmost leaf so pass it
694          * back here.
695          */
696         brelse(*last_eb_bh);
697         get_bh(new_eb_bhs[0]);
698         *last_eb_bh = new_eb_bhs[0];
699
700         status = 0;
701 bail:
702         if (new_eb_bhs) {
703                 for (i = 0; i < new_blocks; i++)
704                         if (new_eb_bhs[i])
705                                 brelse(new_eb_bhs[i]);
706                 kfree(new_eb_bhs);
707         }
708
709         mlog_exit(status);
710         return status;
711 }
712
713 /*
714  * adds another level to the allocation tree.
715  * returns back the new extent block so you can add a branch to it
716  * after this call.
717  */
718 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
719                                   handle_t *handle,
720                                   struct inode *inode,
721                                   struct buffer_head *fe_bh,
722                                   struct ocfs2_alloc_context *meta_ac,
723                                   struct buffer_head **ret_new_eb_bh)
724 {
725         int status, i;
726         u32 new_clusters;
727         struct buffer_head *new_eb_bh = NULL;
728         struct ocfs2_dinode *fe;
729         struct ocfs2_extent_block *eb;
730         struct ocfs2_extent_list  *fe_el;
731         struct ocfs2_extent_list  *eb_el;
732
733         mlog_entry_void();
734
735         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
736                                            &new_eb_bh);
737         if (status < 0) {
738                 mlog_errno(status);
739                 goto bail;
740         }
741
742         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
743         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
744                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
745                 status = -EIO;
746                 goto bail;
747         }
748
749         eb_el = &eb->h_list;
750         fe = (struct ocfs2_dinode *) fe_bh->b_data;
751         fe_el = &fe->id2.i_list;
752
753         status = ocfs2_journal_access(handle, inode, new_eb_bh,
754                                       OCFS2_JOURNAL_ACCESS_CREATE);
755         if (status < 0) {
756                 mlog_errno(status);
757                 goto bail;
758         }
759
760         /* copy the fe data into the new extent block */
761         eb_el->l_tree_depth = fe_el->l_tree_depth;
762         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
763         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
764                 eb_el->l_recs[i] = fe_el->l_recs[i];
765
766         status = ocfs2_journal_dirty(handle, new_eb_bh);
767         if (status < 0) {
768                 mlog_errno(status);
769                 goto bail;
770         }
771
772         status = ocfs2_journal_access(handle, inode, fe_bh,
773                                       OCFS2_JOURNAL_ACCESS_WRITE);
774         if (status < 0) {
775                 mlog_errno(status);
776                 goto bail;
777         }
778
779         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
780
781         /* update fe now */
782         le16_add_cpu(&fe_el->l_tree_depth, 1);
783         fe_el->l_recs[0].e_cpos = 0;
784         fe_el->l_recs[0].e_blkno = eb->h_blkno;
785         fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
786         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
787                 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
788         fe_el->l_next_free_rec = cpu_to_le16(1);
789
790         /* If this is our 1st tree depth shift, then last_eb_blk
791          * becomes the allocated extent block */
792         if (fe_el->l_tree_depth == cpu_to_le16(1))
793                 fe->i_last_eb_blk = eb->h_blkno;
794
795         status = ocfs2_journal_dirty(handle, fe_bh);
796         if (status < 0) {
797                 mlog_errno(status);
798                 goto bail;
799         }
800
801         *ret_new_eb_bh = new_eb_bh;
802         new_eb_bh = NULL;
803         status = 0;
804 bail:
805         if (new_eb_bh)
806                 brelse(new_eb_bh);
807
808         mlog_exit(status);
809         return status;
810 }
811
812 /*
813  * Should only be called when there is no space left in any of the
814  * leaf nodes. What we want to do is find the lowest tree depth
815  * non-leaf extent block with room for new records. There are three
816  * valid results of this search:
817  *
818  * 1) a lowest extent block is found, then we pass it back in
819  *    *lowest_eb_bh and return '0'
820  *
821  * 2) the search fails to find anything, but the dinode has room. We
822  *    pass NULL back in *lowest_eb_bh, but still return '0'
823  *
824  * 3) the search fails to find anything AND the dinode is full, in
825  *    which case we return > 0
826  *
827  * return status < 0 indicates an error.
828  */
829 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
830                                     struct inode *inode,
831                                     struct buffer_head *fe_bh,
832                                     struct buffer_head **target_bh)
833 {
834         int status = 0, i;
835         u64 blkno;
836         struct ocfs2_dinode *fe;
837         struct ocfs2_extent_block *eb;
838         struct ocfs2_extent_list  *el;
839         struct buffer_head *bh = NULL;
840         struct buffer_head *lowest_bh = NULL;
841
842         mlog_entry_void();
843
844         *target_bh = NULL;
845
846         fe = (struct ocfs2_dinode *) fe_bh->b_data;
847         el = &fe->id2.i_list;
848
849         while(le16_to_cpu(el->l_tree_depth) > 1) {
850                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
851                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
852                                     "extent list (next_free_rec == 0)",
853                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
854                         status = -EIO;
855                         goto bail;
856                 }
857                 i = le16_to_cpu(el->l_next_free_rec) - 1;
858                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
859                 if (!blkno) {
860                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
861                                     "list where extent # %d has no physical "
862                                     "block start",
863                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
864                         status = -EIO;
865                         goto bail;
866                 }
867
868                 if (bh) {
869                         brelse(bh);
870                         bh = NULL;
871                 }
872
873                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
874                                           inode);
875                 if (status < 0) {
876                         mlog_errno(status);
877                         goto bail;
878                 }
879
880                 eb = (struct ocfs2_extent_block *) bh->b_data;
881                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
882                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
883                         status = -EIO;
884                         goto bail;
885                 }
886                 el = &eb->h_list;
887
888                 if (le16_to_cpu(el->l_next_free_rec) <
889                     le16_to_cpu(el->l_count)) {
890                         if (lowest_bh)
891                                 brelse(lowest_bh);
892                         lowest_bh = bh;
893                         get_bh(lowest_bh);
894                 }
895         }
896
897         /* If we didn't find one and the fe doesn't have any room,
898          * then return '1' */
899         if (!lowest_bh
900             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
901                 status = 1;
902
903         *target_bh = lowest_bh;
904 bail:
905         if (bh)
906                 brelse(bh);
907
908         mlog_exit(status);
909         return status;
910 }
911
912 /*
913  * Grow a b-tree so that it has more records.
914  *
915  * We might shift the tree depth in which case existing paths should
916  * be considered invalid.
917  *
918  * Tree depth after the grow is returned via *final_depth.
919  *
920  * *last_eb_bh will be updated by ocfs2_add_branch().
921  */
922 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
923                            struct buffer_head *di_bh, int *final_depth,
924                            struct buffer_head **last_eb_bh,
925                            struct ocfs2_alloc_context *meta_ac)
926 {
927         int ret, shift;
928         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
929         int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
930         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
931         struct buffer_head *bh = NULL;
932
933         BUG_ON(meta_ac == NULL);
934
935         shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
936         if (shift < 0) {
937                 ret = shift;
938                 mlog_errno(ret);
939                 goto out;
940         }
941
942         /* We traveled all the way to the bottom of the allocation tree
943          * and didn't find room for any more extents - we need to add
944          * another tree level */
945         if (shift) {
946                 BUG_ON(bh);
947                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
948
949                 /* ocfs2_shift_tree_depth will return us a buffer with
950                  * the new extent block (so we can pass that to
951                  * ocfs2_add_branch). */
952                 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
953                                              meta_ac, &bh);
954                 if (ret < 0) {
955                         mlog_errno(ret);
956                         goto out;
957                 }
958                 depth++;
959                 if (depth == 1) {
960                         /*
961                          * Special case: we have room now if we shifted from
962                          * tree_depth 0, so no more work needs to be done.
963                          *
964                          * We won't be calling add_branch, so pass
965                          * back *last_eb_bh as the new leaf. At depth
966                          * zero, it should always be null so there's
967                          * no reason to brelse.
968                          */
969                         BUG_ON(*last_eb_bh);
970                         get_bh(bh);
971                         *last_eb_bh = bh;
972                         goto out;
973                 }
974         }
975
976         /* call ocfs2_add_branch to add the final part of the tree with
977          * the new data. */
978         mlog(0, "add branch. bh = %p\n", bh);
979         ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
980                                meta_ac);
981         if (ret < 0) {
982                 mlog_errno(ret);
983                 goto out;
984         }
985
986 out:
987         if (final_depth)
988                 *final_depth = depth;
989         brelse(bh);
990         return ret;
991 }
992
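/*
 * A hypothetical caller of the helper above; only ocfs2_grow_tree(),
 * ocfs2_reinit_path() and ocfs2_find_path() are from this file. The
 * point of the sketch is that any ocfs2_path built before the call
 * must be re-found, since the grow may have shifted the tree depth:
 *
 *	ret = ocfs2_grow_tree(inode, handle, di_bh, &depth,
 *			      &last_eb_bh, meta_ac);
 *	if (ret) {
 *		mlog_errno(ret);
 *		goto out;
 *	}
 *	ocfs2_reinit_path(path, 1);
 *	ret = ocfs2_find_path(inode, path, cpos);
 */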
993 /*
994  * This function will discard the rightmost extent record.
995  */
996 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
997 {
998         int next_free = le16_to_cpu(el->l_next_free_rec);
999         int count = le16_to_cpu(el->l_count);
1000         unsigned int num_bytes;
1001
1002         BUG_ON(!next_free);
1003         /* This will cause us to go off the end of our extent list. */
1004         BUG_ON(next_free >= count);
1005
1006         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1007
1008         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1009 }
1010
1011 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1012                               struct ocfs2_extent_rec *insert_rec)
1013 {
1014         int i, insert_index, next_free, has_empty, num_bytes;
1015         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1016         struct ocfs2_extent_rec *rec;
1017
1018         next_free = le16_to_cpu(el->l_next_free_rec);
1019         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1020
1021         BUG_ON(!next_free);
1022
1023         /* The tree code before us didn't allow enough room in the leaf. */
1024         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1025
1026         /*
1027          * The easiest way to approach this is to just remove the
1028          * empty extent and temporarily decrement next_free.
1029          */
1030         if (has_empty) {
1031                 /*
1032                  * If next_free was 1 (only an empty extent), this
1033                  * loop won't execute, which is fine. We still want
1034                  * next_free to be decremented below the loop.
1035                  */
1036                 for(i = 0; i < (next_free - 1); i++)
1037                         el->l_recs[i] = el->l_recs[i+1];
1038
1039                 next_free--;
1040         }
1041
1042         /*
1043          * Figure out what the new record index should be.
1044          */
1045         for(i = 0; i < next_free; i++) {
1046                 rec = &el->l_recs[i];
1047
1048                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1049                         break;
1050         }
1051         insert_index = i;
1052
1053         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1054              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1055
1056         BUG_ON(insert_index < 0);
1057         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1058         BUG_ON(insert_index > next_free);
1059
1060         /*
1061          * No need to memmove if we're just adding to the tail.
1062          */
1063         if (insert_index != next_free) {
1064                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1065
1066                 num_bytes = next_free - insert_index;
1067                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1068                 memmove(&el->l_recs[insert_index + 1],
1069                         &el->l_recs[insert_index],
1070                         num_bytes);
1071         }
1072
1073         /*
1074          * Either we had an empty extent and need to re-increment, or
1075          * there was no empty extent on a non-full rightmost leaf node,
1076          * in which case we still need to increment.
1077          */
1078         next_free++;
1079         el->l_next_free_rec = cpu_to_le16(next_free);
1080         /*
1081          * Make sure none of the math above just messed up our tree.
1082          */
1083         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1084
1085         el->l_recs[insert_index] = *insert_rec;
1086
1087 }
1088
1089 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1090 {
1091         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1092
1093         BUG_ON(num_recs == 0);
1094
1095         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1096                 num_recs--;
1097                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1098                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1099                 memset(&el->l_recs[num_recs], 0,
1100                        sizeof(struct ocfs2_extent_rec));
1101                 el->l_next_free_rec = cpu_to_le16(num_recs);
1102         }
1103 }
1104
1105 /*
1106  * Create an empty extent record.
1107  *
1108  * l_next_free_rec may be updated.
1109  *
1110  * If an empty extent already exists do nothing.
1111  */
1112 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1113 {
1114         int next_free = le16_to_cpu(el->l_next_free_rec);
1115
1116         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1117
1118         if (next_free == 0)
1119                 goto set_and_inc;
1120
1121         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1122                 return;
1123
1124         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1125                         "Asked to create an empty extent in a full list:\n"
1126                         "count = %u, tree depth = %u",
1127                         le16_to_cpu(el->l_count),
1128                         le16_to_cpu(el->l_tree_depth));
1129
1130         ocfs2_shift_records_right(el);
1131
1132 set_and_inc:
1133         le16_add_cpu(&el->l_next_free_rec, 1);
1134         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1135 }
1136
1137 /*
1138  * For a rotation which involves two leaf nodes, the "root node" is
1139  * the lowest level tree node which contains a path to both leaves. The
1140  * resulting set of information can be used to form a complete "subtree".
1141  *
1142  * This function is passed two full paths from the dinode down to a
1143  * pair of adjacent leaves. Its task is to figure out which path
1144  * index contains the subtree root - this can be the root index itself
1145  * in a worst-case rotation.
1146  *
1147  * The array index of the subtree root is passed back.
1148  */
1149 static int ocfs2_find_subtree_root(struct inode *inode,
1150                                    struct ocfs2_path *left,
1151                                    struct ocfs2_path *right)
1152 {
1153         int i = 0;
1154
1155         /*
1156          * Check that the caller passed in two paths from the same tree.
1157          */
1158         BUG_ON(path_root_bh(left) != path_root_bh(right));
1159
1160         do {
1161                 i++;
1162
1163                 /*
1164                  * The caller didn't pass two adjacent paths.
1165                  */
1166                 mlog_bug_on_msg(i > left->p_tree_depth,
1167                                 "Inode %lu, left depth %u, right depth %u\n"
1168                                 "left leaf blk %llu, right leaf blk %llu\n",
1169                                 inode->i_ino, left->p_tree_depth,
1170                                 right->p_tree_depth,
1171                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1172                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1173         } while (left->p_node[i].bh->b_blocknr ==
1174                  right->p_node[i].bh->b_blocknr);
1175
1176         return i - 1;
1177 }
1178
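/*
 * Concrete example of the walk above: with a tree of depth 3, suppose
 * the two paths reference the same blocks at indexes 0 and 1 and first
 * differ at index 2. The loop stops with i == 2 and the function
 * returns 1 - the deepest node still common to both paths, which is
 * the subtree root used for the rotation.
 */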
1179 typedef void (path_insert_t)(void *, struct buffer_head *);
1180
1181 /*
1182  * Traverse a btree path in search of cpos, starting at root_el.
1183  *
1184  * This code can be called with a cpos larger than the tree, in which
1185  * case it will return the rightmost path.
1186  */
1187 static int __ocfs2_find_path(struct inode *inode,
1188                              struct ocfs2_extent_list *root_el, u32 cpos,
1189                              path_insert_t *func, void *data)
1190 {
1191         int i, ret = 0;
1192         u32 range;
1193         u64 blkno;
1194         struct buffer_head *bh = NULL;
1195         struct ocfs2_extent_block *eb;
1196         struct ocfs2_extent_list *el;
1197         struct ocfs2_extent_rec *rec;
1198         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1199
1200         el = root_el;
1201         while (el->l_tree_depth) {
1202                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1203                         ocfs2_error(inode->i_sb,
1204                                     "Inode %llu has empty extent list at "
1205                                     "depth %u\n",
1206                                     (unsigned long long)oi->ip_blkno,
1207                                     le16_to_cpu(el->l_tree_depth));
1208                         ret = -EROFS;
1209                         goto out;
1210
1211                 }
1212
1213                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1214                         rec = &el->l_recs[i];
1215
1216                         /*
1217                          * In the case that cpos is off the allocation
1218                          * tree, this should just wind up returning the
1219                          * rightmost record.
1220                          */
1221                         range = le32_to_cpu(rec->e_cpos) +
1222                                 ocfs2_rec_clusters(el, rec);
1223                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1224                             break;
1225                 }
1226
1227                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1228                 if (blkno == 0) {
1229                         ocfs2_error(inode->i_sb,
1230                                     "Inode %llu has bad blkno in extent list "
1231                                     "at depth %u (index %d)\n",
1232                                     (unsigned long long)oi->ip_blkno,
1233                                     le16_to_cpu(el->l_tree_depth), i);
1234                         ret = -EROFS;
1235                         goto out;
1236                 }
1237
1238                 brelse(bh);
1239                 bh = NULL;
1240                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1241                                        &bh, OCFS2_BH_CACHED, inode);
1242                 if (ret) {
1243                         mlog_errno(ret);
1244                         goto out;
1245                 }
1246
1247                 eb = (struct ocfs2_extent_block *) bh->b_data;
1248                 el = &eb->h_list;
1249                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1250                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1251                         ret = -EIO;
1252                         goto out;
1253                 }
1254
1255                 if (le16_to_cpu(el->l_next_free_rec) >
1256                     le16_to_cpu(el->l_count)) {
1257                         ocfs2_error(inode->i_sb,
1258                                     "Inode %llu has bad count in extent list "
1259                                     "at block %llu (next free=%u, count=%u)\n",
1260                                     (unsigned long long)oi->ip_blkno,
1261                                     (unsigned long long)bh->b_blocknr,
1262                                     le16_to_cpu(el->l_next_free_rec),
1263                                     le16_to_cpu(el->l_count));
1264                         ret = -EROFS;
1265                         goto out;
1266                 }
1267
1268                 if (func)
1269                         func(data, bh);
1270         }
1271
1272 out:
1273         /*
1274          * Catch any trailing bh that the loop didn't handle.
1275          */
1276         brelse(bh);
1277
1278         return ret;
1279 }
1280
1281 /*
1282  * Given an initialized path (that is, it has a valid root extent
1283  * list), this function will traverse the btree in search of the path
1284  * which would contain cpos.
1285  *
1286  * The path traveled is recorded in the path structure.
1287  *
1288  * Note that this will not do any comparisons on leaf node extent
1289  * records, so it will work fine in the case that we just added a tree
1290  * branch.
1291  */
1292 struct find_path_data {
1293         int index;
1294         struct ocfs2_path *path;
1295 };
1296 static void find_path_ins(void *data, struct buffer_head *bh)
1297 {
1298         struct find_path_data *fp = data;
1299
1300         get_bh(bh);
1301         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1302         fp->index++;
1303 }
1304 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1305                            u32 cpos)
1306 {
1307         struct find_path_data data;
1308
1309         data.index = 1;
1310         data.path = path;
1311         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1312                                  find_path_ins, &data);
1313 }
1314
1315 static void find_leaf_ins(void *data, struct buffer_head *bh)
1316 {
1317         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1318         struct ocfs2_extent_list *el = &eb->h_list;
1319         struct buffer_head **ret = data;
1320
1321         /* We want to retain only the leaf block. */
1322         if (le16_to_cpu(el->l_tree_depth) == 0) {
1323                 get_bh(bh);
1324                 *ret = bh;
1325         }
1326 }
1327 /*
1328  * Find the leaf block in the tree which would contain cpos. No
1329  * checking of the actual leaf is done.
1330  *
1331  * Some paths want to call this instead of allocating a path structure
1332  * and calling ocfs2_find_path().
1333  *
1334  * This function doesn't handle non-btree extent lists.
1335  */
1336 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1337                     u32 cpos, struct buffer_head **leaf_bh)
1338 {
1339         int ret;
1340         struct buffer_head *bh = NULL;
1341
1342         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1343         if (ret) {
1344                 mlog_errno(ret);
1345                 goto out;
1346         }
1347
1348         *leaf_bh = bh;
1349 out:
1350         return ret;
1351 }
1352
1353 /*
1354  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1355  *
1356  * Basically, we've moved stuff around at the bottom of the tree and
1357  * we need to fix up the extent records above the changes to reflect
1358  * the new changes.
1359  *
1360  * left_rec: the record on the left.
1361  * left_child_el: is the child list pointed to by left_rec
1362  * right_rec: the record to the right of left_rec
1363  * right_child_el: is the child list pointed to by right_rec
1364  *
1365  * By definition, this only works on interior nodes.
1366  */
1367 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1368                                   struct ocfs2_extent_list *left_child_el,
1369                                   struct ocfs2_extent_rec *right_rec,
1370                                   struct ocfs2_extent_list *right_child_el)
1371 {
1372         u32 left_clusters, right_end;
1373
1374         /*
1375          * Interior nodes never have holes. Their cpos is the cpos of
1376          * the leftmost record in their child list. Their cluster
1377          * count covers the full theoretical range of their child list
1378          * - the range between their cpos and the cpos of the record
1379          * immediately to their right.
1380          */
1381         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1382         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1383                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1384                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1385         }
1386         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1387         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1388
1389         /*
1390          * Calculate the rightmost cluster count boundary before
1391          * moving cpos - we will need to adjust clusters after
1392          * updating e_cpos to keep the same highest cluster count.
1393          */
1394         right_end = le32_to_cpu(right_rec->e_cpos);
1395         right_end += le32_to_cpu(right_rec->e_int_clusters);
1396
1397         right_rec->e_cpos = left_rec->e_cpos;
1398         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1399
1400         right_end -= le32_to_cpu(right_rec->e_cpos);
1401         right_rec->e_int_clusters = cpu_to_le32(right_end);
1402 }
1403
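/*
 * Worked example of the adjustment above: suppose left_rec starts at
 * cpos 0, right_rec covered [100, 150) before the rotation, and the
 * rotation left cpos 90 as the first non-empty record of the right
 * child. Then left_clusters becomes 90, left_rec now covers [0, 90),
 * right_rec's e_cpos moves to 90 and its e_int_clusters becomes 60, so
 * the two records together still span exactly [0, 150).
 */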
1404 /*
1405  * Adjust the adjacent root node records involved in a
1406  * rotation. left_el_blkno is passed in as a key so that we can easily
1407  * find its index in the root list.
1408  */
1409 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1410                                       struct ocfs2_extent_list *left_el,
1411                                       struct ocfs2_extent_list *right_el,
1412                                       u64 left_el_blkno)
1413 {
1414         int i;
1415
1416         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1417                le16_to_cpu(left_el->l_tree_depth));
1418
1419         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1420                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1421                         break;
1422         }
1423
1424         /*
1425          * The path walking code should have never returned a root and
1426          * two paths which are not adjacent.
1427          */
1428         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1429
1430         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1431                                       &root_el->l_recs[i + 1], right_el);
1432 }
1433
1434 /*
1435  * We've changed a leaf block (in right_path) and need to reflect that
1436  * change back up the subtree.
1437  *
1438  * This happens in multiple places:
1439  *   - When we've moved an extent record from the left path leaf to the right
1440  *     path leaf to make room for an empty extent in the left path leaf.
1441  *   - When our insert into the right path leaf is at the leftmost edge
1442  *     and requires an update of the path immediately to its left. This
1443  *     can occur at the end of some types of rotation and appending inserts.
1444  *   - When we've adjusted the last extent record in the left path leaf and the
1445  *     1st extent record in the right path leaf during cross extent block merge.
1446  */
1447 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1448                                        struct ocfs2_path *left_path,
1449                                        struct ocfs2_path *right_path,
1450                                        int subtree_index)
1451 {
1452         int ret, i, idx;
1453         struct ocfs2_extent_list *el, *left_el, *right_el;
1454         struct ocfs2_extent_rec *left_rec, *right_rec;
1455         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1456
1457         /*
1458          * Update the counts and position values within all the
1459          * interior nodes to reflect the leaf rotation we just did.
1460          *
1461          * The root node is handled below the loop.
1462          *
1463          * We begin the loop with right_el and left_el pointing to the
1464          * leaf lists and work our way up.
1465          *
1466          * NOTE: within this loop, left_el and right_el always refer
1467          * to the *child* lists.
1468          */
1469         left_el = path_leaf_el(left_path);
1470         right_el = path_leaf_el(right_path);
1471         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1472                 mlog(0, "Adjust records at index %u\n", i);
1473
1474                 /*
1475                  * One nice property of knowing that all of these
1476                  * nodes are below the root is that we only deal with
1477                  * the leftmost right node record and the rightmost
1478                  * left node record.
1479                  */
1480                 el = left_path->p_node[i].el;
1481                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1482                 left_rec = &el->l_recs[idx];
1483
1484                 el = right_path->p_node[i].el;
1485                 right_rec = &el->l_recs[0];
1486
1487                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1488                                               right_el);
1489
1490                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1491                 if (ret)
1492                         mlog_errno(ret);
1493
1494                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1495                 if (ret)
1496                         mlog_errno(ret);
1497
1498                 /*
1499                  * Setup our list pointers now so that the current
1500                  * parents become children in the next iteration.
1501                  */
1502                 left_el = left_path->p_node[i].el;
1503                 right_el = right_path->p_node[i].el;
1504         }
1505
1506         /*
1507          * At the root node, adjust the two adjacent records which
1508          * begin our path to the leaves.
1509          */
1510
1511         el = left_path->p_node[subtree_index].el;
1512         left_el = left_path->p_node[subtree_index + 1].el;
1513         right_el = right_path->p_node[subtree_index + 1].el;
1514
1515         ocfs2_adjust_root_records(el, left_el, right_el,
1516                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1517
1518         root_bh = left_path->p_node[subtree_index].bh;
1519
1520         ret = ocfs2_journal_dirty(handle, root_bh);
1521         if (ret)
1522                 mlog_errno(ret);
1523 }
1524
1525 static int ocfs2_rotate_subtree_right(struct inode *inode,
1526                                       handle_t *handle,
1527                                       struct ocfs2_path *left_path,
1528                                       struct ocfs2_path *right_path,
1529                                       int subtree_index)
1530 {
1531         int ret, i;
1532         struct buffer_head *right_leaf_bh;
1533         struct buffer_head *left_leaf_bh = NULL;
1534         struct buffer_head *root_bh;
1535         struct ocfs2_extent_list *right_el, *left_el;
1536         struct ocfs2_extent_rec move_rec;
1537
1538         left_leaf_bh = path_leaf_bh(left_path);
1539         left_el = path_leaf_el(left_path);
1540
1541         if (left_el->l_next_free_rec != left_el->l_count) {
1542                 ocfs2_error(inode->i_sb,
1543                             "Inode %llu has non-full interior leaf node %llu "
1544                             "(next free = %u)",
1545                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1546                             (unsigned long long)left_leaf_bh->b_blocknr,
1547                             le16_to_cpu(left_el->l_next_free_rec));
1548                 return -EROFS;
1549         }
1550
1551         /*
1552          * This extent block may already have an empty record, so we
1553          * return early if so.
1554          */
1555         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1556                 return 0;
1557
1558         root_bh = left_path->p_node[subtree_index].bh;
1559         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1560
1561         ret = ocfs2_journal_access(handle, inode, root_bh,
1562                                    OCFS2_JOURNAL_ACCESS_WRITE);
1563         if (ret) {
1564                 mlog_errno(ret);
1565                 goto out;
1566         }
1567
1568         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1569                 ret = ocfs2_journal_access(handle, inode,
1570                                            right_path->p_node[i].bh,
1571                                            OCFS2_JOURNAL_ACCESS_WRITE);
1572                 if (ret) {
1573                         mlog_errno(ret);
1574                         goto out;
1575                 }
1576
1577                 ret = ocfs2_journal_access(handle, inode,
1578                                            left_path->p_node[i].bh,
1579                                            OCFS2_JOURNAL_ACCESS_WRITE);
1580                 if (ret) {
1581                         mlog_errno(ret);
1582                         goto out;
1583                 }
1584         }
1585
1586         right_leaf_bh = path_leaf_bh(right_path);
1587         right_el = path_leaf_el(right_path);
1588
1589         /* This is a code error, not a disk corruption. */
1590         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1591                         "because rightmost leaf block %llu is empty\n",
1592                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1593                         (unsigned long long)right_leaf_bh->b_blocknr);
1594
1595         ocfs2_create_empty_extent(right_el);
1596
1597         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1598         if (ret) {
1599                 mlog_errno(ret);
1600                 goto out;
1601         }
1602
1603         /* Do the copy now. */
1604         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1605         move_rec = left_el->l_recs[i];
1606         right_el->l_recs[0] = move_rec;
1607
1608         /*
1609          * Clear out the record we just copied and shift everything
1610          * over, leaving an empty extent in the left leaf.
1611          *
1612          * We temporarily subtract from next_free_rec so that the
1613          * shift will lose the tail record (which is now defunct).
1614          */
1615         le16_add_cpu(&left_el->l_next_free_rec, -1);
1616         ocfs2_shift_records_right(left_el);
1617         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1618         le16_add_cpu(&left_el->l_next_free_rec, 1);
1619
1620         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1621         if (ret) {
1622                 mlog_errno(ret);
1623                 goto out;
1624         }
1625
1626         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1627                                 subtree_index);
1628
1629 out:
1630         return ret;
1631 }
1632
1633 /*
1634  * Given a full path, determine what cpos value would return us a path
1635  * containing the leaf immediately to the left of the current one.
1636  *
1637  * Will return zero if the path passed in is already the leftmost path.
1638  */
1639 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1640                                          struct ocfs2_path *path, u32 *cpos)
1641 {
1642         int i, j, ret = 0;
1643         u64 blkno;
1644         struct ocfs2_extent_list *el;
1645
1646         BUG_ON(path->p_tree_depth == 0);
1647
1648         *cpos = 0;
1649
1650         blkno = path_leaf_bh(path)->b_blocknr;
1651
1652         /* Start at the tree node just above the leaf and work our way up. */
1653         i = path->p_tree_depth - 1;
1654         while (i >= 0) {
1655                 el = path->p_node[i].el;
1656
1657                 /*
1658                  * Find the extent record just before the one in our
1659                  * path.
1660                  */
1661                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1662                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1663                                 if (j == 0) {
1664                                         if (i == 0) {
1665                                                 /*
1666                                                  * We've determined that the
1667                                                  * path specified is already
1668                                                  * the leftmost one - return a
1669                                                  * cpos of zero.
1670                                                  */
1671                                                 goto out;
1672                                         }
1673                                         /*
1674                                          * The leftmost record points to our
1675                                          * leaf - we need to travel up the
1676                                          * tree one level.
1677                                          */
1678                                         goto next_node;
1679                                 }
1680
1681                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1682                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1683                                                            &el->l_recs[j - 1]);
1684                                 *cpos = *cpos - 1;
1685                                 goto out;
1686                         }
1687                 }
1688
1689                 /*
1690                  * If we got here, we never found a valid node where
1691                  * the tree indicated one should be.
1692                  */
1693                 ocfs2_error(sb,
1694                             "Invalid extent tree at extent block %llu\n",
1695                             (unsigned long long)blkno);
1696                 ret = -EROFS;
1697                 goto out;
1698
1699 next_node:
1700                 blkno = path->p_node[i].bh->b_blocknr;
1701                 i--;
1702         }
1703
1704 out:
1705         return ret;
1706 }
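
/*
 * Worked example for the cpos arithmetic above (values assumed for
 * illustration only): if the record just before ours in a parent list
 * starts at cpos 200 and covers 50 clusters, the leaf to our left ends
 * at cluster 249, so *cpos is set to 200 + 50 - 1 = 249 and a later
 * ocfs2_find_path() with that value lands in that left leaf.
 */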
1707
1708 /*
1709  * Extend the transaction by enough credits to complete the rotation,
1710  * and still leave at least the original number of credits allocated
1711  * to this transaction.
1712  */
1713 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1714                                            int op_credits,
1715                                            struct ocfs2_path *path)
1716 {
1717         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1718
1719         if (handle->h_buffer_credits < credits)
1720                 return ocfs2_extend_trans(handle, credits);
1721
1722         return 0;
1723 }
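
/*
 * Worked example of the credit calculation above (numbers assumed for
 * illustration): with a path of depth 3, a subtree root at depth 1 and
 * op_credits == 1, we ask for (3 - 1) * 2 + 1 + 1 = 6 credits - two
 * buffers per level below the subtree root, the subtree root itself,
 * plus the operation's own credits - and ocfs2_extend_trans() is only
 * called if the handle currently holds fewer than that.
 */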
1724
1725 /*
1726  * Trap the case where we're inserting into the theoretical range past
1727  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1728  * whose cpos is less than ours into the right leaf.
1729  *
1730  * It's only necessary to look at the rightmost record of the left
1731  * leaf because the logic that calls us should ensure that the
1732  * theoretical ranges in the path components above the leaves are
1733  * correct.
1734  */
1735 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1736                                                  u32 insert_cpos)
1737 {
1738         struct ocfs2_extent_list *left_el;
1739         struct ocfs2_extent_rec *rec;
1740         int next_free;
1741
1742         left_el = path_leaf_el(left_path);
1743         next_free = le16_to_cpu(left_el->l_next_free_rec);
1744         rec = &left_el->l_recs[next_free - 1];
1745
1746         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1747                 return 1;
1748         return 0;
1749 }
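
/*
 * Illustrative example (values assumed): if the rightmost record of
 * the left leaf starts at cpos 100 and insert_cpos is 150, the check
 * returns 1 - rotating any further would move a record whose cpos is
 * below ours into the right leaf, so the caller stops here and lets
 * the post-insert path adjustment finish the job.
 */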
1750
1751 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1752 {
1753         int next_free = le16_to_cpu(el->l_next_free_rec);
1754         unsigned int range;
1755         struct ocfs2_extent_rec *rec;
1756
1757         if (next_free == 0)
1758                 return 0;
1759
1760         rec = &el->l_recs[0];
1761         if (ocfs2_is_empty_extent(rec)) {
1762                 /* Empty list. */
1763                 if (next_free == 1)
1764                         return 0;
1765                 rec = &el->l_recs[1];
1766         }
1767
1768         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1769         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1770                 return 1;
1771         return 0;
1772 }
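
/*
 * Illustrative example (values assumed): with a leftmost non-empty
 * record at e_cpos 64 covering 32 clusters, the half-open range is
 * [64, 96), so a cpos of 80 returns 1 while a cpos of 96 returns 0.
 */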
1773
1774 /*
1775  * Rotate all the records in a btree right one record, starting at insert_cpos.
1776  *
1777  * The path to the rightmost leaf should be passed in.
1778  *
1779  * The array is assumed to be large enough to hold an entire path (tree depth).
1780  *
1781  * Upon successful return from this function:
1782  *
1783  * - The 'right_path' array will contain a path to the leaf block
1784  *   whose range contains insert_cpos.
1785  * - That leaf block will have a single empty extent in list index 0.
1786  * - In the case that the rotation requires a post-insert update,
1787  *   *ret_left_path will contain a valid path which can be passed to
1788  *   ocfs2_insert_path().
1789  */
1790 static int ocfs2_rotate_tree_right(struct inode *inode,
1791                                    handle_t *handle,
1792                                    enum ocfs2_split_type split,
1793                                    u32 insert_cpos,
1794                                    struct ocfs2_path *right_path,
1795                                    struct ocfs2_path **ret_left_path)
1796 {
1797         int ret, start, orig_credits = handle->h_buffer_credits;
1798         u32 cpos;
1799         struct ocfs2_path *left_path = NULL;
1800
1801         *ret_left_path = NULL;
1802
1803         left_path = ocfs2_new_path(path_root_bh(right_path),
1804                                    path_root_el(right_path));
1805         if (!left_path) {
1806                 ret = -ENOMEM;
1807                 mlog_errno(ret);
1808                 goto out;
1809         }
1810
1811         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1812         if (ret) {
1813                 mlog_errno(ret);
1814                 goto out;
1815         }
1816
1817         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1818
1819         /*
1820          * What we want to do here is:
1821          *
1822          * 1) Start with the rightmost path.
1823          *
1824          * 2) Determine a path to the leaf block directly to the left
1825          *    of that leaf.
1826          *
1827          * 3) Determine the 'subtree root' - the lowest level tree node
1828          *    which contains a path to both leaves.
1829          *
1830          * 4) Rotate the subtree.
1831          *
1832          * 5) Find the next subtree by considering the left path to be
1833          *    the new right path.
1834          *
1835          * The check at the top of this while loop also accepts
1836          * insert_cpos == cpos because cpos is only a _theoretical_
1837          * value to get us the left path - insert_cpos might very well
1838          * be filling that hole.
1839          *
1840          * Stop at a cpos of '0' because we either started at the
1841          * leftmost branch (i.e., a tree with one branch and a
1842          * rotation inside of it), or we've gone as far as we can in
1843          * rotating subtrees.
1844          */
1845         while (cpos && insert_cpos <= cpos) {
1846                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1847                      insert_cpos, cpos);
1848
1849                 ret = ocfs2_find_path(inode, left_path, cpos);
1850                 if (ret) {
1851                         mlog_errno(ret);
1852                         goto out;
1853                 }
1854
1855                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1856                                 path_leaf_bh(right_path),
1857                                 "Inode %lu: error during insert of %u "
1858                                 "(left path cpos %u) results in two identical "
1859                                 "paths ending at %llu\n",
1860                                 inode->i_ino, insert_cpos, cpos,
1861                                 (unsigned long long)
1862                                 path_leaf_bh(left_path)->b_blocknr);
1863
1864                 if (split == SPLIT_NONE &&
1865                     ocfs2_rotate_requires_path_adjustment(left_path,
1866                                                           insert_cpos)) {
1867
1868                         /*
1869                          * We've rotated the tree as much as we
1870                          * should. The rest is up to
1871                          * ocfs2_insert_path() to complete, after the
1872                          * record insertion. We indicate this
1873                          * situation by returning the left path.
1874                          *
1875                          * The reason we don't adjust the records here
1876                          * before the record insert is that an error
1877                          * later might break the rule where a parent
1878                          * record e_cpos will reflect the actual
1879                          * e_cpos of the 1st nonempty record of the
1880                          * child list.
1881                          */
1882                         *ret_left_path = left_path;
1883                         goto out_ret_path;
1884                 }
1885
1886                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1887
1888                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1889                      start,
1890                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1891                      right_path->p_tree_depth);
1892
1893                 ret = ocfs2_extend_rotate_transaction(handle, start,
1894                                                       orig_credits, right_path);
1895                 if (ret) {
1896                         mlog_errno(ret);
1897                         goto out;
1898                 }
1899
1900                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1901                                                  right_path, start);
1902                 if (ret) {
1903                         mlog_errno(ret);
1904                         goto out;
1905                 }
1906
1907                 if (split != SPLIT_NONE &&
1908                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1909                                                 insert_cpos)) {
1910                         /*
1911                          * A rotate moves the rightmost left leaf
1912                          * record over to the leftmost right leaf
1913                          * slot. If we're doing an extent split
1914                          * instead of a real insert, then we have to
1915                          * check that the extent to be split wasn't
1916                          * just moved over. If it was, then we can
1917                          * exit here, passing left_path back -
1918                          * ocfs2_split_extent() is smart enough to
1919                          * search both leaves.
1920                          */
1921                         *ret_left_path = left_path;
1922                         goto out_ret_path;
1923                 }
1924
1925                 /*
1926                  * There is no need to re-read the next right path
1927                  * as we know that it'll be our current left
1928                  * path. Optimize by copying values instead.
1929                  */
1930                 ocfs2_mv_path(right_path, left_path);
1931
1932                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1933                                                     &cpos);
1934                 if (ret) {
1935                         mlog_errno(ret);
1936                         goto out;
1937                 }
1938         }
1939
1940 out:
1941         ocfs2_free_path(left_path);
1942
1943 out_ret_path:
1944         return ret;
1945 }
1946
1947 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1948                                       struct ocfs2_path *path)
1949 {
1950         int i, idx;
1951         struct ocfs2_extent_rec *rec;
1952         struct ocfs2_extent_list *el;
1953         struct ocfs2_extent_block *eb;
1954         u32 range;
1955
1956         /* Path should always be rightmost. */
1957         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1958         BUG_ON(eb->h_next_leaf_blk != 0ULL);
1959
1960         el = &eb->h_list;
1961         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1962         idx = le16_to_cpu(el->l_next_free_rec) - 1;
1963         rec = &el->l_recs[idx];
1964         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1965
1966         for (i = 0; i < path->p_tree_depth; i++) {
1967                 el = path->p_node[i].el;
1968                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1969                 rec = &el->l_recs[idx];
1970
1971                 rec->e_int_clusters = cpu_to_le32(range);
1972                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1973
1974                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1975         }
1976 }
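
/*
 * Illustrative example (values assumed): if the rightmost leaf record
 * now starts at cpos 500 and spans 20 clusters, range is 520 and each
 * record on the right edge of the tree gets
 * e_int_clusters = 520 - e_cpos, i.e. the number of clusters it covers
 * out to the new end of the tree.
 */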
1977
1978 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1979                               struct ocfs2_cached_dealloc_ctxt *dealloc,
1980                               struct ocfs2_path *path, int unlink_start)
1981 {
1982         int ret, i;
1983         struct ocfs2_extent_block *eb;
1984         struct ocfs2_extent_list *el;
1985         struct buffer_head *bh;
1986
1987         for(i = unlink_start; i < path_num_items(path); i++) {
1988                 bh = path->p_node[i].bh;
1989
1990                 eb = (struct ocfs2_extent_block *)bh->b_data;
1991                 /*
1992                  * Not all nodes might have had their final count
1993                  * decremented by the caller - handle this here.
1994                  */
1995                 el = &eb->h_list;
1996                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
1997                         mlog(ML_ERROR,
1998                              "Inode %llu, attempted to remove extent block "
1999                              "%llu with %u records\n",
2000                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2001                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2002                              le16_to_cpu(el->l_next_free_rec));
2003
2004                         ocfs2_journal_dirty(handle, bh);
2005                         ocfs2_remove_from_cache(inode, bh);
2006                         continue;
2007                 }
2008
2009                 el->l_next_free_rec = 0;
2010                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2011
2012                 ocfs2_journal_dirty(handle, bh);
2013
2014                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2015                 if (ret)
2016                         mlog_errno(ret);
2017
2018                 ocfs2_remove_from_cache(inode, bh);
2019         }
2020 }
2021
2022 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2023                                  struct ocfs2_path *left_path,
2024                                  struct ocfs2_path *right_path,
2025                                  int subtree_index,
2026                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2027 {
2028         int i;
2029         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2030         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2031         struct ocfs2_extent_list *el;
2032         struct ocfs2_extent_block *eb;
2033
2034         el = path_leaf_el(left_path);
2035
2036         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2037
2038         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2039                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2040                         break;
2041
2042         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2043
2044         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2045         le16_add_cpu(&root_el->l_next_free_rec, -1);
2046
2047         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2048         eb->h_next_leaf_blk = 0;
2049
2050         ocfs2_journal_dirty(handle, root_bh);
2051         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2052
2053         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2054                           subtree_index + 1);
2055 }
2056
2057 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2058                                      struct ocfs2_path *left_path,
2059                                      struct ocfs2_path *right_path,
2060                                      int subtree_index,
2061                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2062                                      int *deleted)
2063 {
2064         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2065         struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2066         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2067         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2068         struct ocfs2_extent_block *eb;
2069
2070         *deleted = 0;
2071
2072         right_leaf_el = path_leaf_el(right_path);
2073         left_leaf_el = path_leaf_el(left_path);
2074         root_bh = left_path->p_node[subtree_index].bh;
2075         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2076
2077         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2078                 return 0;
2079
2080         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2081         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2082                 /*
2083                  * It's legal for us to proceed if the right leaf is
2084                  * the rightmost one and it has an empty extent. There
2085                  * are two cases to handle - whether the leaf will be
2086                  * empty after removal or not. If the leaf isn't empty
2087                  * then just remove the empty extent up front. The
2088                  * next block will handle empty leaves by flagging
2089                  * them for unlink.
2090                  *
2091          * Non-rightmost leaves will return -EAGAIN and the
2092                  * caller can manually move the subtree and retry.
2093                  */
2094
2095                 if (eb->h_next_leaf_blk != 0ULL)
2096                         return -EAGAIN;
2097
2098                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2099                         ret = ocfs2_journal_access(handle, inode,
2100                                                    path_leaf_bh(right_path),
2101                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2102                         if (ret) {
2103                                 mlog_errno(ret);
2104                                 goto out;
2105                         }
2106
2107                         ocfs2_remove_empty_extent(right_leaf_el);
2108                 } else
2109                         right_has_empty = 1;
2110         }
2111
2112         if (eb->h_next_leaf_blk == 0ULL &&
2113             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2114                 /*
2115                  * We have to update i_last_eb_blk during the meta
2116                  * data delete.
2117                  */
2118                 ret = ocfs2_journal_access(handle, inode, di_bh,
2119                                            OCFS2_JOURNAL_ACCESS_WRITE);
2120                 if (ret) {
2121                         mlog_errno(ret);
2122                         goto out;
2123                 }
2124
2125                 del_right_subtree = 1;
2126         }
2127
2128         /*
2129          * Getting here with an empty extent in the right path implies
2130          * that it's the rightmost path and will be deleted.
2131          */
2132         BUG_ON(right_has_empty && !del_right_subtree);
2133
2134         ret = ocfs2_journal_access(handle, inode, root_bh,
2135                                    OCFS2_JOURNAL_ACCESS_WRITE);
2136         if (ret) {
2137                 mlog_errno(ret);
2138                 goto out;
2139         }
2140
2141         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2142                 ret = ocfs2_journal_access(handle, inode,
2143                                            right_path->p_node[i].bh,
2144                                            OCFS2_JOURNAL_ACCESS_WRITE);
2145                 if (ret) {
2146                         mlog_errno(ret);
2147                         goto out;
2148                 }
2149
2150                 ret = ocfs2_journal_access(handle, inode,
2151                                            left_path->p_node[i].bh,
2152                                            OCFS2_JOURNAL_ACCESS_WRITE);
2153                 if (ret) {
2154                         mlog_errno(ret);
2155                         goto out;
2156                 }
2157         }
2158
2159         if (!right_has_empty) {
2160                 /*
2161                  * Only do this if we're moving a real
2162                  * record. Otherwise, the action is delayed until
2163                  * after removal of the right path in which case we
2164                  * can do a simple shift to remove the empty extent.
2165                  */
2166                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2167                 memset(&right_leaf_el->l_recs[0], 0,
2168                        sizeof(struct ocfs2_extent_rec));
2169         }
2170         if (eb->h_next_leaf_blk == 0ULL) {
2171                 /*
2172                  * Move recs over to get rid of empty extent, decrease
2173                  * next_free. This is allowed to remove the last
2174                  * extent in our leaf (setting l_next_free_rec to
2175                  * zero) - the delete code below won't care.
2176                  */
2177                 ocfs2_remove_empty_extent(right_leaf_el);
2178         }
2179
2180         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2181         if (ret)
2182                 mlog_errno(ret);
2183         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2184         if (ret)
2185                 mlog_errno(ret);
2186
2187         if (del_right_subtree) {
2188                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2189                                      subtree_index, dealloc);
2190                 ocfs2_update_edge_lengths(inode, handle, left_path);
2191
2192                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2193                 di->i_last_eb_blk = eb->h_blkno;
2194
2195                 /*
2196                  * Removal of the extent in the left leaf was skipped
2197                  * above so we could delete the right path
2198                  * 1st.
2199                  */
2200                 if (right_has_empty)
2201                         ocfs2_remove_empty_extent(left_leaf_el);
2202
2203                 ret = ocfs2_journal_dirty(handle, di_bh);
2204                 if (ret)
2205                         mlog_errno(ret);
2206
2207                 *deleted = 1;
2208         } else
2209                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2210                                            subtree_index);
2211
2212 out:
2213         return ret;
2214 }
2215
2216 /*
2217  * Given a full path, determine what cpos value would return us a path
2218  * containing the leaf immediately to the right of the current one.
2219  *
2220  * Will return zero if the path passed in is already the rightmost path.
2221  *
2222  * This looks similar, but is subtly different to
2223  * ocfs2_find_cpos_for_left_leaf().
2224  */
2225 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2226                                           struct ocfs2_path *path, u32 *cpos)
2227 {
2228         int i, j, ret = 0;
2229         u64 blkno;
2230         struct ocfs2_extent_list *el;
2231
2232         *cpos = 0;
2233
2234         if (path->p_tree_depth == 0)
2235                 return 0;
2236
2237         blkno = path_leaf_bh(path)->b_blocknr;
2238
2239         /* Start at the tree node just above the leaf and work our way up. */
2240         i = path->p_tree_depth - 1;
2241         while (i >= 0) {
2242                 int next_free;
2243
2244                 el = path->p_node[i].el;
2245
2246                 /*
2247                  * Find the extent record just after the one in our
2248                  * path.
2249                  */
2250                 next_free = le16_to_cpu(el->l_next_free_rec);
2251                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2252                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2253                                 if (j == (next_free - 1)) {
2254                                         if (i == 0) {
2255                                                 /*
2256                                                  * We've determined that the
2257                                                  * path specified is already
2258                                                  * the rightmost one - return a
2259                                                  * cpos of zero.
2260                                                  */
2261                                                 goto out;
2262                                         }
2263                                         /*
2264                                          * The rightmost record points to our
2265                                          * leaf - we need to travel up the
2266                                          * tree one level.
2267                                          */
2268                                         goto next_node;
2269                                 }
2270
2271                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2272                                 goto out;
2273                         }
2274                 }
2275
2276                 /*
2277                  * If we got here, we never found a valid node where
2278                  * the tree indicated one should be.
2279                  */
2280                 ocfs2_error(sb,
2281                             "Invalid extent tree at extent block %llu\n",
2282                             (unsigned long long)blkno);
2283                 ret = -EROFS;
2284                 goto out;
2285
2286 next_node:
2287                 blkno = path->p_node[i].bh->b_blocknr;
2288                 i--;
2289         }
2290
2291 out:
2292         return ret;
2293 }
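
/*
 * Example (illustrative): unlike the left-leaf variant, no length
 * arithmetic is needed here - once we find the first ancestor whose
 * matching record is not its last, *cpos is simply the e_cpos of the
 * record to its right, which is where the next leaf's range begins.
 */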
2294
2295 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2296                                             handle_t *handle,
2297                                             struct buffer_head *bh,
2298                                             struct ocfs2_extent_list *el)
2299 {
2300         int ret;
2301
2302         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2303                 return 0;
2304
2305         ret = ocfs2_journal_access(handle, inode, bh,
2306                                    OCFS2_JOURNAL_ACCESS_WRITE);
2307         if (ret) {
2308                 mlog_errno(ret);
2309                 goto out;
2310         }
2311
2312         ocfs2_remove_empty_extent(el);
2313
2314         ret = ocfs2_journal_dirty(handle, bh);
2315         if (ret)
2316                 mlog_errno(ret);
2317
2318 out:
2319         return ret;
2320 }
2321
2322 static int __ocfs2_rotate_tree_left(struct inode *inode,
2323                                     handle_t *handle, int orig_credits,
2324                                     struct ocfs2_path *path,
2325                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2326                                     struct ocfs2_path **empty_extent_path)
2327 {
2328         int ret, subtree_root, deleted;
2329         u32 right_cpos;
2330         struct ocfs2_path *left_path = NULL;
2331         struct ocfs2_path *right_path = NULL;
2332
2333         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2334
2335         *empty_extent_path = NULL;
2336
2337         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2338                                              &right_cpos);
2339         if (ret) {
2340                 mlog_errno(ret);
2341                 goto out;
2342         }
2343
2344         left_path = ocfs2_new_path(path_root_bh(path),
2345                                    path_root_el(path));
2346         if (!left_path) {
2347                 ret = -ENOMEM;
2348                 mlog_errno(ret);
2349                 goto out;
2350         }
2351
2352         ocfs2_cp_path(left_path, path);
2353
2354         right_path = ocfs2_new_path(path_root_bh(path),
2355                                     path_root_el(path));
2356         if (!right_path) {
2357                 ret = -ENOMEM;
2358                 mlog_errno(ret);
2359                 goto out;
2360         }
2361
2362         while (right_cpos) {
2363                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2364                 if (ret) {
2365                         mlog_errno(ret);
2366                         goto out;
2367                 }
2368
2369                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2370                                                        right_path);
2371
2372                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2373                      subtree_root,
2374                      (unsigned long long)
2375                      right_path->p_node[subtree_root].bh->b_blocknr,
2376                      right_path->p_tree_depth);
2377
2378                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2379                                                       orig_credits, left_path);
2380                 if (ret) {
2381                         mlog_errno(ret);
2382                         goto out;
2383                 }
2384
2385                 /*
2386                  * Caller might still want to make changes to the
2387                  * tree root, so re-add it to the journal here.
2388                  */
2389                 ret = ocfs2_journal_access(handle, inode,
2390                                            path_root_bh(left_path),
2391                                            OCFS2_JOURNAL_ACCESS_WRITE);
2392                 if (ret) {
2393                         mlog_errno(ret);
2394                         goto out;
2395                 }
2396
2397                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2398                                                 right_path, subtree_root,
2399                                                 dealloc, &deleted);
2400                 if (ret == -EAGAIN) {
2401                         /*
2402                          * The rotation has to temporarily stop due to
2403                          * the right subtree having an empty
2404                          * extent. Pass it back to the caller for a
2405                          * fixup.
2406                          */
2407                         *empty_extent_path = right_path;
2408                         right_path = NULL;
2409                         goto out;
2410                 }
2411                 if (ret) {
2412                         mlog_errno(ret);
2413                         goto out;
2414                 }
2415
2416                 /*
2417                  * The subtree rotate might have removed records on
2418                  * the rightmost edge. If so, then rotation is
2419                  * complete.
2420                  */
2421                 if (deleted)
2422                         break;
2423
2424                 ocfs2_mv_path(left_path, right_path);
2425
2426                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2427                                                      &right_cpos);
2428                 if (ret) {
2429                         mlog_errno(ret);
2430                         goto out;
2431                 }
2432         }
2433
2434 out:
2435         ocfs2_free_path(right_path);
2436         ocfs2_free_path(left_path);
2437
2438         return ret;
2439 }
2440
2441 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2442                                        struct ocfs2_path *path,
2443                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
2444 {
2445         int ret, subtree_index;
2446         u32 cpos;
2447         struct ocfs2_path *left_path = NULL;
2448         struct ocfs2_dinode *di;
2449         struct ocfs2_extent_block *eb;
2450         struct ocfs2_extent_list *el;
2451
2452         /*
2453          * XXX: This code assumes that the root is an inode, which is
2454          * true for now but may change as tree code gets generic.
2455          */
2456         di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2457         if (!OCFS2_IS_VALID_DINODE(di)) {
2458                 ret = -EIO;
2459                 ocfs2_error(inode->i_sb,
2460                             "Inode %llu has invalid path root",
2461                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
2462                 goto out;
2463         }
2464
2465         /*
2466          * There are two ways we handle this, depending on
2467          * whether path is the only existing one.
2468          */
2469         ret = ocfs2_extend_rotate_transaction(handle, 0,
2470                                               handle->h_buffer_credits,
2471                                               path);
2472         if (ret) {
2473                 mlog_errno(ret);
2474                 goto out;
2475         }
2476
2477         ret = ocfs2_journal_access_path(inode, handle, path);
2478         if (ret) {
2479                 mlog_errno(ret);
2480                 goto out;
2481         }
2482
2483         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2484         if (ret) {
2485                 mlog_errno(ret);
2486                 goto out;
2487         }
2488
2489         if (cpos) {
2490                 /*
2491                  * We have a path to the left of this one - it needs
2492                  * an update too.
2493                  */
2494                 left_path = ocfs2_new_path(path_root_bh(path),
2495                                            path_root_el(path));
2496                 if (!left_path) {
2497                         ret = -ENOMEM;
2498                         mlog_errno(ret);
2499                         goto out;
2500                 }
2501
2502                 ret = ocfs2_find_path(inode, left_path, cpos);
2503                 if (ret) {
2504                         mlog_errno(ret);
2505                         goto out;
2506                 }
2507
2508                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2509                 if (ret) {
2510                         mlog_errno(ret);
2511                         goto out;
2512                 }
2513
2514                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2515
2516                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2517                                      subtree_index, dealloc);
2518                 ocfs2_update_edge_lengths(inode, handle, left_path);
2519
2520                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2521                 di->i_last_eb_blk = eb->h_blkno;
2522         } else {
2523                 /*
2524                  * 'path' is also the leftmost path which
2525                  * means it must be the only one. This gets
2526                  * handled differently because we want to
2527                  * revert the inode back to having extents
2528                  * in-line.
2529                  */
2530                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2531
2532                 el = &di->id2.i_list;
2533                 el->l_tree_depth = 0;
2534                 el->l_next_free_rec = 0;
2535                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2536
2537                 di->i_last_eb_blk = 0;
2538         }
2539
2540         ocfs2_journal_dirty(handle, path_root_bh(path));
2541
2542 out:
2543         ocfs2_free_path(left_path);
2544         return ret;
2545 }
2546
2547 /*
2548  * Left rotation of btree records.
2549  *
2550  * In many ways, this is (unsurprisingly) the opposite of right
2551  * rotation. We start at some non-rightmost path containing an empty
2552  * extent in the leaf block. The code works its way to the rightmost
2553  * path by rotating records to the left in every subtree.
2554  *
2555  * This is used by any code which reduces the number of extent records
2556  * in a leaf. After removal, an empty record should be placed in the
2557  * leftmost list position.
2558  *
2559  * This won't handle a length update of the rightmost path records if
2560  * the rightmost tree leaf record is removed so the caller is
2561  * responsible for detecting and correcting that.
2562  */
2563 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2564                                   struct ocfs2_path *path,
2565                                   struct ocfs2_cached_dealloc_ctxt *dealloc)
2566 {
2567         int ret, orig_credits = handle->h_buffer_credits;
2568         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2569         struct ocfs2_extent_block *eb;
2570         struct ocfs2_extent_list *el;
2571
2572         el = path_leaf_el(path);
2573         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2574                 return 0;
2575
2576         if (path->p_tree_depth == 0) {
2577 rightmost_no_delete:
2578                 /*
2579                  * In-inode extents. This is trivially handled, so do
2580                  * it up front.
2581                  */
2582                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2583                                                        path_leaf_bh(path),
2584                                                        path_leaf_el(path));
2585                 if (ret)
2586                         mlog_errno(ret);
2587                 goto out;
2588         }
2589
2590         /*
2591          * Handle the rightmost branch now. There are several cases:
2592          *  1) simple rotation leaving records in there. That's trivial.
2593          *  2) rotation requiring a branch delete - there's no more
2594          *     records left. Two cases of this:
2595          *     a) There are branches to the left.
2596          *     b) This is also the leftmost (the only) branch.
2597          *
2598          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2599          *  2a) we need the left branch so that we can update it with the unlink
2600          *  2b) we need to bring the inode back to inline extents.
2601          */
2602
2603         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2604         el = &eb->h_list;
2605         if (eb->h_next_leaf_blk == 0) {
2606                 /*
2607                  * This gets a bit tricky if we're going to delete the
2608                  * rightmost path. Get the other cases out of the way
2609                  * 1st.
2610                  */
2611                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2612                         goto rightmost_no_delete;
2613
2614                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2615                         ret = -EIO;
2616                         ocfs2_error(inode->i_sb,
2617                                     "Inode %llu has empty extent block at %llu",
2618                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2619                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2620                         goto out;
2621                 }
2622
2623                 /*
2624          * XXX: The caller cannot trust "path" any more after
2625                  * this as it will have been deleted. What do we do?
2626                  *
2627                  * In theory the rotate-for-merge code will never get
2628                  * here because it'll always ask for a rotate in a
2629                  * nonempty list.
2630                  */
2631
2632                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2633                                                   dealloc);
2634                 if (ret)
2635                         mlog_errno(ret);
2636                 goto out;
2637         }
2638
2639         /*
2640          * Now we can loop, remembering the path we get from -EAGAIN
2641          * and restarting from there.
2642          */
2643 try_rotate:
2644         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2645                                        dealloc, &restart_path);
2646         if (ret && ret != -EAGAIN) {
2647                 mlog_errno(ret);
2648                 goto out;
2649         }
2650
2651         while (ret == -EAGAIN) {
2652                 tmp_path = restart_path;
2653                 restart_path = NULL;
2654
2655                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2656                                                tmp_path, dealloc,
2657                                                &restart_path);
2658                 if (ret && ret != -EAGAIN) {
2659                         mlog_errno(ret);
2660                         goto out;
2661                 }
2662
2663                 ocfs2_free_path(tmp_path);
2664                 tmp_path = NULL;
2665
2666                 if (ret == 0)
2667                         goto try_rotate;
2668         }
2669
2670 out:
2671         ocfs2_free_path(tmp_path);
2672         ocfs2_free_path(restart_path);
2673         return ret;
2674 }
2675
2676 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2677                                 int index)
2678 {
2679         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2680         unsigned int size;
2681
2682         if (rec->e_leaf_clusters == 0) {
2683                 /*
2684                  * We consumed all of the merged-from record. An empty
2685                  * extent cannot exist anywhere but the 1st array
2686                  * position, so move things over if the merged-from
2687                  * record doesn't occupy that position.
2688                  *
2689                  * This creates a new empty extent so the caller
2690                  * should be smart enough to have removed any existing
2691                  * ones.
2692                  */
2693                 if (index > 0) {
2694                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2695                         size = index * sizeof(struct ocfs2_extent_rec);
2696                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2697                 }
2698
2699                 /*
2700                  * Always memset - the caller doesn't check whether it
2701                  * created an empty extent, so there could be junk in
2702                  * the other fields.
2703                  */
2704                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2705         }
2706 }
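
/*
 * Illustrative example (index assumed): if the record at index 2 was
 * fully consumed by a merge, the two records in front of it are moved
 * up one slot (a memmove of 2 * sizeof(struct ocfs2_extent_rec)) and
 * slot 0 is then zeroed, leaving the mandatory empty extent at the
 * front of the list.
 */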
2707
2708 static int ocfs2_get_right_path(struct inode *inode,
2709                                 struct ocfs2_path *left_path,
2710                                 struct ocfs2_path **ret_right_path)
2711 {
2712         int ret;
2713         u32 right_cpos;
2714         struct ocfs2_path *right_path = NULL;
2715         struct ocfs2_extent_list *left_el;
2716
2717         *ret_right_path = NULL;
2718
2719         /* This function shouldn't be called for non-trees. */
2720         BUG_ON(left_path->p_tree_depth == 0);
2721
2722         left_el = path_leaf_el(left_path);
2723         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2724
2725         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2726                                              &right_cpos);
2727         if (ret) {
2728                 mlog_errno(ret);
2729                 goto out;
2730         }
2731
2732         /* This function shouldn't be called for the rightmost leaf. */
2733         BUG_ON(right_cpos == 0);
2734
2735         right_path = ocfs2_new_path(path_root_bh(left_path),
2736                                     path_root_el(left_path));
2737         if (!right_path) {
2738                 ret = -ENOMEM;
2739                 mlog_errno(ret);
2740                 goto out;
2741         }
2742
2743         ret = ocfs2_find_path(inode, right_path, right_cpos);
2744         if (ret) {
2745                 mlog_errno(ret);
2746                 goto out;
2747         }
2748
2749         *ret_right_path = right_path;
2750 out:
2751         if (ret)
2752                 ocfs2_free_path(right_path);
2753         return ret;
2754 }
2755
2756 /*
2757  * Remove split_rec clusters from the record at index and merge them
2758  * onto the beginning of the record "next" to it.
2759  * For index < l_count - 1, the next means the extent rec at index + 1.
2760  * For index < l_count - 1, "next" means the extent rec at index + 1.
2761  * For index == l_count - 1, "next" means the 1st extent rec of the
2762  */
2763 static int ocfs2_merge_rec_right(struct inode *inode,
2764                                  struct ocfs2_path *left_path,
2765                                  handle_t *handle,
2766                                  struct ocfs2_extent_rec *split_rec,
2767                                  int index)
2768 {
2769         int ret, next_free, i;
2770         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2771         struct ocfs2_extent_rec *left_rec;
2772         struct ocfs2_extent_rec *right_rec;
2773         struct ocfs2_extent_list *right_el;
2774         struct ocfs2_path *right_path = NULL;
2775         int subtree_index = 0;
2776         struct ocfs2_extent_list *el = path_leaf_el(left_path);
2777         struct buffer_head *bh = path_leaf_bh(left_path);
2778         struct buffer_head *root_bh = NULL;
2779
2780         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2781         left_rec = &el->l_recs[index];
2782
2783         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
2784             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
2785                 /* We have hit a cross extent block merge. */
2786                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
2787                 if (ret) {
2788                         mlog_errno(ret);
2789                         goto out;
2790                 }
2791
2792                 right_el = path_leaf_el(right_path);
2793                 next_free = le16_to_cpu(right_el->l_next_free_rec);
2794                 BUG_ON(next_free <= 0);
2795                 right_rec = &right_el->l_recs[0];
2796                 if (ocfs2_is_empty_extent(right_rec)) {
2797                         BUG_ON(next_free <= 1);
2798                         right_rec = &right_el->l_recs[1];
2799                 }
2800
2801                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2802                        le16_to_cpu(left_rec->e_leaf_clusters) !=
2803                        le32_to_cpu(right_rec->e_cpos));
2804
2805                 subtree_index = ocfs2_find_subtree_root(inode,
2806                                                         left_path, right_path);
2807
2808                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2809                                                       handle->h_buffer_credits,
2810                                                       right_path);
2811                 if (ret) {
2812                         mlog_errno(ret);
2813                         goto out;
2814                 }
2815
2816                 root_bh = left_path->p_node[subtree_index].bh;
2817                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2818
2819                 ret = ocfs2_journal_access(handle, inode, root_bh,
2820                                            OCFS2_JOURNAL_ACCESS_WRITE);
2821                 if (ret) {
2822                         mlog_errno(ret);
2823                         goto out;
2824                 }
2825
2826                 for (i = subtree_index + 1;
2827                      i < path_num_items(right_path); i++) {
2828                         ret = ocfs2_journal_access(handle, inode,
2829                                                    right_path->p_node[i].bh,
2830                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2831                         if (ret) {
2832                                 mlog_errno(ret);
2833                                 goto out;
2834                         }
2835
2836                         ret = ocfs2_journal_access(handle, inode,
2837                                                    left_path->p_node[i].bh,
2838                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2839                         if (ret) {
2840                                 mlog_errno(ret);
2841                                 goto out;
2842                         }
2843                 }
2844
2845         } else {
2846                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
2847                 right_rec = &el->l_recs[index + 1];
2848         }
2849
2850         ret = ocfs2_journal_access(handle, inode, bh,
2851                                    OCFS2_JOURNAL_ACCESS_WRITE);
2852         if (ret) {
2853                 mlog_errno(ret);
2854                 goto out;
2855         }
2856
2857         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2858
2859         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2860         le64_add_cpu(&right_rec->e_blkno,
2861                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2862         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2863
2864         ocfs2_cleanup_merge(el, index);
2865
2866         ret = ocfs2_journal_dirty(handle, bh);
2867         if (ret)
2868                 mlog_errno(ret);
2869
2870         if (right_path) {
2871                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2872                 if (ret)
2873                         mlog_errno(ret);
2874
2875                 ocfs2_complete_edge_insert(inode, handle, left_path,
2876                                            right_path, subtree_index);
2877         }
2878 out:
2879         if (right_path)
2880                 ocfs2_free_path(right_path);
2881         return ret;
2882 }
2883
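/*
 * Build a path to the leaf immediately to the left of the one described
 * by right_path. On success the new path is returned in *ret_left_path
 * and must be freed by the caller with ocfs2_free_path().
 *
 * Must not be called on a zero-depth tree or for the leftmost leaf.
 */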
2884 static int ocfs2_get_left_path(struct inode *inode,
2885                                struct ocfs2_path *right_path,
2886                                struct ocfs2_path **ret_left_path)
2887 {
2888         int ret;
2889         u32 left_cpos;
2890         struct ocfs2_path *left_path = NULL;
2891
2892         *ret_left_path = NULL;
2893
2894         /* This function shouldn't be called for non-trees. */
2895         BUG_ON(right_path->p_tree_depth == 0);
2896
2897         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
2898                                             right_path, &left_cpos);
2899         if (ret) {
2900                 mlog_errno(ret);
2901                 goto out;
2902         }
2903
2904         /* This function shouldn't be called for the leftmost leaf. */
2905         BUG_ON(left_cpos == 0);
2906
2907         left_path = ocfs2_new_path(path_root_bh(right_path),
2908                                    path_root_el(right_path));
2909         if (!left_path) {
2910                 ret = -ENOMEM;
2911                 mlog_errno(ret);
2912                 goto out;
2913         }
2914
2915         ret = ocfs2_find_path(inode, left_path, left_cpos);
2916         if (ret) {
2917                 mlog_errno(ret);
2918                 goto out;
2919         }
2920
2921         *ret_left_path = left_path;
2922 out:
2923         if (ret)
2924                 ocfs2_free_path(left_path);
2925         return ret;
2926 }
2927
2928 /*
2929  * Remove split_rec clusters from the record at index and merge them
2930  * onto the tail of the record "before" it.
2931  * For index > 0, the "before" means the extent rec at index - 1.
2932  *
2933  * For index == 0, the "before" means the last record of the previous
2934  * extent block. There is also a situation where we may need to
2935  * remove the rightmost leaf extent block in the right_path and change
2936  * the right path to indicate the new rightmost path.
2937  */
2938 static int ocfs2_merge_rec_left(struct inode *inode,
2939                                 struct ocfs2_path *right_path,
2940                                 handle_t *handle,
2941                                 struct ocfs2_extent_rec *split_rec,
2942                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2943                                 int index)
2944 {
2945         int ret, i, subtree_index = 0, has_empty_extent = 0;
2946         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2947         struct ocfs2_extent_rec *left_rec;
2948         struct ocfs2_extent_rec *right_rec;
2949         struct ocfs2_extent_list *el = path_leaf_el(right_path);
2950         struct buffer_head *bh = path_leaf_bh(right_path);
2951         struct buffer_head *root_bh = NULL;
2952         struct ocfs2_path *left_path = NULL;
2953         struct ocfs2_extent_list *left_el;
2954
2955         BUG_ON(index < 0);
2956
2957         right_rec = &el->l_recs[index];
2958         if (index == 0) {
2959                 /* This is a cross extent block merge. */
2960                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
2961                 if (ret) {
2962                         mlog_errno(ret);
2963                         goto out;
2964                 }
2965
2966                 left_el = path_leaf_el(left_path);
2967                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
2968                        le16_to_cpu(left_el->l_count));
2969
2970                 left_rec = &left_el->l_recs[
2971                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
2972                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2973                        le16_to_cpu(left_rec->e_leaf_clusters) !=
2974                        le32_to_cpu(split_rec->e_cpos));
2975
2976                 subtree_index = ocfs2_find_subtree_root(inode,
2977                                                         left_path, right_path);
2978
2979                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2980                                                       handle->h_buffer_credits,
2981                                                       left_path);
2982                 if (ret) {
2983                         mlog_errno(ret);
2984                         goto out;
2985                 }
2986
2987                 root_bh = left_path->p_node[subtree_index].bh;
2988                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2989
2990                 ret = ocfs2_journal_access(handle, inode, root_bh,
2991                                            OCFS2_JOURNAL_ACCESS_WRITE);
2992                 if (ret) {
2993                         mlog_errno(ret);
2994                         goto out;
2995                 }
2996
2997                 for (i = subtree_index + 1;
2998                      i < path_num_items(right_path); i++) {
2999                         ret = ocfs2_journal_access(handle, inode,
3000                                                    right_path->p_node[i].bh,
3001                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3002                         if (ret) {
3003                                 mlog_errno(ret);
3004                                 goto out;
3005                         }
3006
3007                         ret = ocfs2_journal_access(handle, inode,
3008                                                    left_path->p_node[i].bh,
3009                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3010                         if (ret) {
3011                                 mlog_errno(ret);
3012                                 goto out;
3013                         }
3014                 }
3015         } else {
3016                 left_rec = &el->l_recs[index - 1];
3017                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3018                         has_empty_extent = 1;
3019         }
3020
3021         ret = ocfs2_journal_access(handle, inode, bh,
3022                                    OCFS2_JOURNAL_ACCESS_WRITE);
3023         if (ret) {
3024                 mlog_errno(ret);
3025                 goto out;
3026         }
3027
3028         if (has_empty_extent && index == 1) {
3029                 /*
3030                  * The easy case - we can just plop the record right in.
3031                  */
3032                 *left_rec = *split_rec;
3033
3034                 has_empty_extent = 0;
3035         } else
3036                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3037
3038         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3039         le64_add_cpu(&right_rec->e_blkno,
3040                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3041         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3042
3043         ocfs2_cleanup_merge(el, index);
3044
3045         ret = ocfs2_journal_dirty(handle, bh);
3046         if (ret)
3047                 mlog_errno(ret);
3048
3049         if (left_path) {
3050                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3051                 if (ret)
3052                         mlog_errno(ret);
3053
3054                 /*
3055                  * If the right_rec is empty and the extent block is also
3056                  * empty, ocfs2_complete_edge_insert can't handle it and we
3057                  * need to delete the right extent block.
3058                  */
3059                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3060                     le16_to_cpu(el->l_next_free_rec) == 1) {
3061
3062                         ret = ocfs2_remove_rightmost_path(inode, handle,
3063                                                           right_path, dealloc);
3064                         if (ret) {
3065                                 mlog_errno(ret);
3066                                 goto out;
3067                         }
3068
3069                         /* Now the rightmost extent block has been deleted.
3070                          * So we use the new rightmost path.
3071                          */
3072                         ocfs2_mv_path(right_path, left_path);
3073                         left_path = NULL;
3074                 } else
3075                         ocfs2_complete_edge_insert(inode, handle, left_path,
3076                                                    right_path, subtree_index);
3077         }
3078 out:
3079         if (left_path)
3080                 ocfs2_free_path(left_path);
3081         return ret;
3082 }
3083
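/*
 * Try to merge the record at split_index with its neighbors, based on
 * the contiguity information in ctxt.
 *
 * A CONTIG_LEFTRIGHT merge absorbs the record on both sides - we merge
 * right first, then left, rotating away the empty extents left behind.
 * Otherwise a single left or right merge is done, followed by a rotate
 * if the split covered the whole record.
 */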
3084 static int ocfs2_try_to_merge_extent(struct inode *inode,
3085                                      handle_t *handle,
3086                                      struct ocfs2_path *path,
3087                                      int split_index,
3088                                      struct ocfs2_extent_rec *split_rec,
3089                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3090                                      struct ocfs2_merge_ctxt *ctxt)
3091
3092 {
3093         int ret = 0;
3094         struct ocfs2_extent_list *el = path_leaf_el(path);
3095         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3096
3097         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3098
3099         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3100                 /*
3101                  * The merge code will need to create an empty
3102                  * extent to take the place of the newly
3103                  * emptied slot. Remove any pre-existing empty
3104                  * extents - having more than one in a leaf is
3105                  * illegal.
3106                  */
3107                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3108                                              dealloc);
3109                 if (ret) {
3110                         mlog_errno(ret);
3111                         goto out;
3112                 }
3113                 split_index--;
3114                 rec = &el->l_recs[split_index];
3115         }
3116
3117         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3118                 /*
3119                  * Left-right contig implies this.
3120                  */
3121                 BUG_ON(!ctxt->c_split_covers_rec);
3122
3123                 /*
3124                  * Since the leftright insert always covers the entire
3125                  * extent, this call will delete the insert record
3126                  * entirely, resulting in an empty extent record added to
3127                  * the extent block.
3128                  *
3129                  * Since the adding of an empty extent shifts
3130                  * everything back to the right, there's no need to
3131                  * update split_index here.
3132                  *
3133                  * When the split_index is zero, we need to merge it into
3134                  * the previous extent block. It is more efficient and
3135                  * easier if we do merge_right first and merge_left later.
3136                  */
3137                 ret = ocfs2_merge_rec_right(inode, path,
3138                                             handle, split_rec,
3139                                             split_index);
3140                 if (ret) {
3141                         mlog_errno(ret);
3142                         goto out;
3143                 }
3144
3145                 /*
3146                  * We can only get this from logic error above.
3147                  */
3148                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3149
3150                 /* The merge left us with an empty extent, remove it. */
3151                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
3152                 if (ret) {
3153                         mlog_errno(ret);
3154                         goto out;
3155                 }
3156
3157                 rec = &el->l_recs[split_index];
3158
3159                 /*
3160                  * Note that we don't pass split_rec here on purpose -
3161                  * we've merged it into the rec already.
3162                  */
3163                 ret = ocfs2_merge_rec_left(inode, path,
3164                                            handle, rec,
3165                                            dealloc,
3166                                            split_index);
3167
3168                 if (ret) {
3169                         mlog_errno(ret);
3170                         goto out;
3171                 }
3172
3173                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3174                                              dealloc);
3175                 /*
3176                  * Error from this last rotate is not critical, so
3177                  * print but don't bubble it up.
3178                  */
3179                 if (ret)
3180                         mlog_errno(ret);
3181                 ret = 0;
3182         } else {
3183                 /*
3184                  * Merge a record to the left or right.
3185                  *
3186                  * 'contig_type' is relative to the existing record,
3187                  * so for example, if we're "right contig", it's to
3188                  * the record on the left (hence the left merge).
3189                  */
3190                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3191                         ret = ocfs2_merge_rec_left(inode,
3192                                                    path,
3193                                                    handle, split_rec,
3194                                                    dealloc,
3195                                                    split_index);
3196                         if (ret) {
3197                                 mlog_errno(ret);
3198                                 goto out;
3199                         }
3200                 } else {
3201                         ret = ocfs2_merge_rec_right(inode,
3202                                                     path,
3203                                                     handle, split_rec,
3204                                                     split_index);
3205                         if (ret) {
3206                                 mlog_errno(ret);
3207                                 goto out;
3208                         }
3209                 }
3210
3211                 if (ctxt->c_split_covers_rec) {
3212                         /*
3213                          * The merge may have left an empty extent in
3214                          * our leaf. Try to rotate it away.
3215                          */
3216                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3217                                                      dealloc);
3218                         if (ret)
3219                                 mlog_errno(ret);
3220                         ret = 0;
3221                 }
3222         }
3223
3224 out:
3225         return ret;
3226 }
3227
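/*
 * Trim the clusters described by split_rec off one end of rec. A
 * SPLIT_LEFT split removes them from the front of rec (so e_cpos and
 * e_blkno move forward), a SPLIT_RIGHT split removes them from the
 * back.
 */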
3228 static void ocfs2_subtract_from_rec(struct super_block *sb,
3229                                     enum ocfs2_split_type split,
3230                                     struct ocfs2_extent_rec *rec,
3231                                     struct ocfs2_extent_rec *split_rec)
3232 {
3233         u64 len_blocks;
3234
3235         len_blocks = ocfs2_clusters_to_blocks(sb,
3236                                 le16_to_cpu(split_rec->e_leaf_clusters));
3237
3238         if (split == SPLIT_LEFT) {
3239                 /*
3240                  * Region is on the left edge of the existing
3241                  * record.
3242                  */
3243                 le32_add_cpu(&rec->e_cpos,
3244                              le16_to_cpu(split_rec->e_leaf_clusters));
3245                 le64_add_cpu(&rec->e_blkno, len_blocks);
3246                 le16_add_cpu(&rec->e_leaf_clusters,
3247                              -le16_to_cpu(split_rec->e_leaf_clusters));
3248         } else {
3249                 /*
3250                  * Region is on the right edge of the existing
3251                  * record.
3252                  */
3253                 le16_add_cpu(&rec->e_leaf_clusters,
3254                              -le16_to_cpu(split_rec->e_leaf_clusters));
3255         }
3256 }
3257
3258 /*
3259  * Do the final bits of extent record insertion at the target leaf
3260  * list. If this leaf is part of an allocation tree, it is assumed
3261  * that the tree above has been prepared.
3262  */
3263 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3264                                  struct ocfs2_extent_list *el,
3265                                  struct ocfs2_insert_type *insert,
3266                                  struct inode *inode)
3267 {
3268         int i = insert->ins_contig_index;
3269         unsigned int range;
3270         struct ocfs2_extent_rec *rec;
3271
3272         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3273
3274         if (insert->ins_split != SPLIT_NONE) {
3275                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3276                 BUG_ON(i == -1);
3277                 rec = &el->l_recs[i];
3278                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3279                                         insert_rec);
3280                 goto rotate;
3281         }
3282
3283         /*
3284          * Contiguous insert - either left or right.
3285          */
3286         if (insert->ins_contig != CONTIG_NONE) {
3287                 rec = &el->l_recs[i];
3288                 if (insert->ins_contig == CONTIG_LEFT) {
3289                         rec->e_blkno = insert_rec->e_blkno;
3290                         rec->e_cpos = insert_rec->e_cpos;
3291                 }
3292                 le16_add_cpu(&rec->e_leaf_clusters,
3293                              le16_to_cpu(insert_rec->e_leaf_clusters));
3294                 return;
3295         }
3296
3297         /*
3298          * Handle insert into an empty leaf.
3299          */
3300         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3301             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3302              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3303                 el->l_recs[0] = *insert_rec;
3304                 el->l_next_free_rec = cpu_to_le16(1);
3305                 return;
3306         }
3307
3308         /*
3309          * Appending insert.
3310          */
3311         if (insert->ins_appending == APPEND_TAIL) {
3312                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3313                 rec = &el->l_recs[i];
3314                 range = le32_to_cpu(rec->e_cpos)
3315                         + le16_to_cpu(rec->e_leaf_clusters);
3316                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3317
3318                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3319                                 le16_to_cpu(el->l_count),
3320                                 "inode %lu, depth %u, count %u, next free %u, "
3321                                 "rec.cpos %u, rec.clusters %u, "
3322                                 "insert.cpos %u, insert.clusters %u\n",
3323                                 inode->i_ino,
3324                                 le16_to_cpu(el->l_tree_depth),
3325                                 le16_to_cpu(el->l_count),
3326                                 le16_to_cpu(el->l_next_free_rec),
3327                                 le32_to_cpu(el->l_recs[i].e_cpos),
3328                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3329                                 le32_to_cpu(insert_rec->e_cpos),
3330                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3331                 i++;
3332                 el->l_recs[i] = *insert_rec;
3333                 le16_add_cpu(&el->l_next_free_rec, 1);
3334                 return;
3335         }
3336
3337 rotate:
3338         /*
3339          * Ok, we have to rotate.
3340          *
3341          * At this point, it is safe to assume that inserting into an
3342          * empty leaf and appending to a leaf have both been handled
3343          * above.
3344          *
3345          * This leaf needs to have space, either by the empty 1st
3346          * extent record, or by virtue of l_next_free_rec < l_count.
3347          */
3348         ocfs2_rotate_leaf(el, insert_rec);
3349 }
3350
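/*
 * Add "clusters" to the on-disk cluster count and keep the in-memory
 * ip_clusters in sync with it, under ip_lock.
 */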
3351 static inline void ocfs2_update_dinode_clusters(struct inode *inode,
3352                                                 struct ocfs2_dinode *di,
3353                                                 u32 clusters)
3354 {
3355         le32_add_cpu(&di->i_clusters, clusters);
3356         spin_lock(&OCFS2_I(inode)->ip_lock);
3357         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
3358         spin_unlock(&OCFS2_I(inode)->ip_lock);
3359 }
3360
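/*
 * An appending insert extends the range covered by the rightmost branch
 * of the tree. Walk the interior nodes of "path" and stretch the last
 * record of each so that it covers up to the end of insert_rec. The
 * leaf itself is left alone.
 */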
3361 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3362                                            handle_t *handle,
3363                                            struct ocfs2_path *path,
3364                                            struct ocfs2_extent_rec *insert_rec)
3365 {
3366         int ret, i, next_free;
3367         struct buffer_head *bh;
3368         struct ocfs2_extent_list *el;
3369         struct ocfs2_extent_rec *rec;
3370
3371         /*
3372          * Update everything except the leaf block.
3373          */
3374         for (i = 0; i < path->p_tree_depth; i++) {
3375                 bh = path->p_node[i].bh;
3376                 el = path->p_node[i].el;
3377
3378                 next_free = le16_to_cpu(el->l_next_free_rec);
3379                 if (next_free == 0) {
3380                         ocfs2_error(inode->i_sb,
3381                                     "Dinode %llu has a bad extent list",
3382                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3383                         ret = -EIO;
3384                         return;
3385                 }
3386
3387                 rec = &el->l_recs[next_free - 1];
3388
3389                 rec->e_int_clusters = insert_rec->e_cpos;
3390                 le32_add_cpu(&rec->e_int_clusters,
3391                              le16_to_cpu(insert_rec->e_leaf_clusters));
3392                 le32_add_cpu(&rec->e_int_clusters,
3393                              -le32_to_cpu(rec->e_cpos));
3394
3395                 ret = ocfs2_journal_dirty(handle, bh);
3396                 if (ret)
3397                         mlog_errno(ret);
3398
3399         }
3400 }
3401
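/*
 * Prepare for an appending insert of insert_rec at the end of the leaf
 * in right_path: journal the right path and stretch its interior
 * records to cover the new clusters.
 *
 * If the append lands at the leftmost edge of its leaf, the neighboring
 * leaf to the left may also need its rightmost records fixed up. In
 * that case a path to it is passed back in *ret_left_path and will be
 * journaled by ocfs2_insert_path().
 */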
3402 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3403                                     struct ocfs2_extent_rec *insert_rec,
3404                                     struct ocfs2_path *right_path,
3405                                     struct ocfs2_path **ret_left_path)
3406 {
3407         int ret, next_free;
3408         struct ocfs2_extent_list *el;
3409         struct ocfs2_path *left_path = NULL;
3410
3411         *ret_left_path = NULL;
3412
3413         /*
3414          * This shouldn't happen for non-trees. The extent rec cluster
3415          * count manipulation below only works for interior nodes.
3416          */
3417         BUG_ON(right_path->p_tree_depth == 0);
3418
3419         /*
3420          * If our appending insert is at the leftmost edge of a leaf,
3421          * then we might need to update the rightmost records of the
3422          * neighboring path.
3423          */
3424         el = path_leaf_el(right_path);
3425         next_free = le16_to_cpu(el->l_next_free_rec);
3426         if (next_free == 0 ||
3427             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3428                 u32 left_cpos;
3429
3430                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3431                                                     &left_cpos);
3432                 if (ret) {
3433                         mlog_errno(ret);
3434                         goto out;
3435                 }
3436
3437                 mlog(0, "Append may need a left path update. cpos: %u, "
3438                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3439                      left_cpos);
3440
3441                 /*
3442                  * No need to worry if the append is already in the
3443                  * leftmost leaf.
3444                  */
3445                 if (left_cpos) {
3446                         left_path = ocfs2_new_path(path_root_bh(right_path),
3447                                                    path_root_el(right_path));
3448                         if (!left_path) {
3449                                 ret = -ENOMEM;
3450                                 mlog_errno(ret);
3451                                 goto out;
3452                         }
3453
3454                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3455                         if (ret) {
3456                                 mlog_errno(ret);
3457                                 goto out;
3458                         }
3459
3460                         /*
3461                          * ocfs2_insert_path() will pass the left_path to the
3462                          * journal for us.
3463                          */
3464                 }
3465         }
3466
3467         ret = ocfs2_journal_access_path(inode, handle, right_path);
3468         if (ret) {
3469                 mlog_errno(ret);
3470                 goto out;
3471         }
3472
3473         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3474
3475         *ret_left_path = left_path;
3476         ret = 0;
3477 out:
3478         if (ret != 0)
3479                 ocfs2_free_path(left_path);
3480
3481         return ret;
3482 }
3483
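/*
 * Figure out which leaf (and which record in it) a split insert should
 * land in, then trim the existing record and rotate split_rec into
 * place.
 *
 * The target record normally lives in right_path, but after a rotation
 * it may have ended up there while the split belongs in the left leaf,
 * in which case either the insert or the existing record is moved over
 * to left_path.
 */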
3484 static void ocfs2_split_record(struct inode *inode,
3485                                struct ocfs2_path *left_path,
3486                                struct ocfs2_path *right_path,
3487                                struct ocfs2_extent_rec *split_rec,
3488                                enum ocfs2_split_type split)
3489 {
3490         int index;
3491         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3492         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3493         struct ocfs2_extent_rec *rec, *tmprec;
3494
3495         right_el = path_leaf_el(right_path);
3496         if (left_path)
3497                 left_el = path_leaf_el(left_path);
3498
3499         el = right_el;
3500         insert_el = right_el;
3501         index = ocfs2_search_extent_list(el, cpos);
3502         if (index != -1) {
3503                 if (index == 0 && left_path) {
3504                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3505
3506                         /*
3507                          * This typically means that the record
3508                          * started in the left path but moved to the
3509                          * right as a result of rotation. We either
3510                          * move the existing record to the left, or we
3511                          * do the later insert there.
3512                          *
3513                          * In this case, the left path should always
3514                          * exist as the rotate code will have passed
3515                          * it back for a post-insert update.
3516                          */
3517
3518                         if (split == SPLIT_LEFT) {
3519                                 /*
3520                                  * It's a left split. Since we know
3521                                  * that the rotate code gave us an
3522                                  * empty extent in the left path, we
3523                                  * can just do the insert there.
3524                                  */
3525                                 insert_el = left_el;
3526                         } else {
3527                                 /*
3528                                  * Right split - we have to move the
3529                                  * existing record over to the left
3530                                  * leaf. The insert will be into the
3531                                  * newly created empty extent in the
3532                                  * right leaf.
3533                                  */
3534                                 tmprec = &right_el->l_recs[index];
3535                                 ocfs2_rotate_leaf(left_el, tmprec);
3536                                 el = left_el;
3537
3538                                 memset(tmprec, 0, sizeof(*tmprec));
3539                                 index = ocfs2_search_extent_list(left_el, cpos);
3540                                 BUG_ON(index == -1);
3541                         }
3542                 }
3543         } else {
3544                 BUG_ON(!left_path);
3545                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3546                 /*
3547                  * Left path is easy - we can just allow the insert to
3548                  * happen.
3549                  */
3550                 el = left_el;
3551                 insert_el = left_el;
3552                 index = ocfs2_search_extent_list(el, cpos);
3553                 BUG_ON(index == -1);
3554         }
3555
3556         rec = &el->l_recs[index];
3557         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3558         ocfs2_rotate_leaf(insert_el, split_rec);
3559 }
3560
3561 /*
3562  * This function only does inserts on an allocation b-tree. For dinode
3563  * lists, ocfs2_insert_at_leaf() is called directly.
3564  *
3565  * right_path is the path we want to do the actual insert
3566  * in. left_path should only be passed in if we need to update that
3567  * portion of the tree after an edge insert.
3568  */
3569 static int ocfs2_insert_path(struct inode *inode,
3570                              handle_t *handle,
3571                              struct ocfs2_path *left_path,
3572                              struct ocfs2_path *right_path,
3573                              struct ocfs2_extent_rec *insert_rec,
3574                              struct ocfs2_insert_type *insert)
3575 {
3576         int ret, subtree_index;
3577         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3578
3579         if (left_path) {
3580                 int credits = handle->h_buffer_credits;
3581
3582                 /*
3583                  * There's a chance that left_path got passed back to
3584                  * us without being accounted for in the
3585                  * journal. Extend our transaction here to be sure we
3586                  * can change those blocks.
3587                  */
3588                 credits += left_path->p_tree_depth;
3589
3590                 ret = ocfs2_extend_trans(handle, credits);
3591                 if (ret < 0) {
3592                         mlog_errno(ret);
3593                         goto out;
3594                 }
3595
3596                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3597                 if (ret < 0) {
3598                         mlog_errno(ret);
3599                         goto out;
3600                 }
3601         }
3602
3603         /*
3604          * Pass both paths to the journal. The majority of inserts
3605          * will be touching all components anyway.
3606          */
3607         ret = ocfs2_journal_access_path(inode, handle, right_path);
3608         if (ret < 0) {
3609                 mlog_errno(ret);
3610                 goto out;
3611         }
3612
3613         if (insert->ins_split != SPLIT_NONE) {
3614                 /*
3615                  * We could call ocfs2_insert_at_leaf() for some types
3616                  * of splits, but it's easier to just let one separate
3617                  * function sort it all out.
3618                  */
3619                 ocfs2_split_record(inode, left_path, right_path,
3620                                    insert_rec, insert->ins_split);
3621
3622                 /*
3623                  * Split might have modified either leaf and we don't
3624                  * have a guarantee that the later edge insert will
3625                  * dirty this for us.
3626                  */
3627                 if (left_path) {
3628                         ret = ocfs2_journal_dirty(handle,
3629                                                   path_leaf_bh(left_path));
3630                         if (ret)
3631                                 mlog_errno(ret);
                     }
3632         } else
3633                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3634                                      insert, inode);
3635
3636         ret = ocfs2_journal_dirty(handle, leaf_bh);
3637         if (ret)
3638                 mlog_errno(ret);
3639
3640         if (left_path) {
3641                 /*
3642                  * The rotate code has indicated that we need to fix
3643                  * up portions of the tree after the insert.
3644                  *
3645                  * XXX: Should we extend the transaction here?
3646                  */
3647                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3648                                                         right_path);
3649                 ocfs2_complete_edge_insert(inode, handle, left_path,
3650                                            right_path, subtree_index);
3651         }
3652
3653         ret = 0;
3654 out:
3655         return ret;
3656 }
3657
3658 static int ocfs2_do_insert_extent(struct inode *inode,
3659                                   handle_t *handle,
3660                                   struct buffer_head *di_bh,
3661                                   struct ocfs2_extent_rec *insert_rec,
3662                                   struct ocfs2_insert_type *type)
3663 {
3664         int ret, rotate = 0;
3665         u32 cpos;
3666         struct ocfs2_path *right_path = NULL;
3667         struct ocfs2_path *left_path = NULL;
3668         struct ocfs2_dinode *di;
3669         struct ocfs2_extent_list *el;
3670
3671         di = (struct ocfs2_dinode *) di_bh->b_data;
3672         el = &di->id2.i_list;
3673
3674         ret = ocfs2_journal_access(handle, inode, di_bh,
3675                                    OCFS2_JOURNAL_ACCESS_WRITE);
3676         if (ret) {
3677                 mlog_errno(ret);
3678                 goto out;
3679         }
3680
3681         if (le16_to_cpu(el->l_tree_depth) == 0) {
3682                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3683                 goto out_update_clusters;
3684         }
3685
3686         right_path = ocfs2_new_inode_path(di_bh);
3687         if (!right_path) {
3688                 ret = -ENOMEM;
3689                 mlog_errno(ret);
3690                 goto out;
3691         }
3692
3693         /*
3694          * Determine the path to start with. Rotations need the
3695          * rightmost path, everything else can go directly to the
3696          * target leaf.
3697          */
3698         cpos = le32_to_cpu(insert_rec->e_cpos);
3699         if (type->ins_appending == APPEND_NONE &&
3700             type->ins_contig == CONTIG_NONE) {
3701                 rotate = 1;
3702                 cpos = UINT_MAX;
3703         }
3704
3705         ret = ocfs2_find_path(inode, right_path, cpos);
3706         if (ret) {
3707                 mlog_errno(ret);
3708                 goto out;
3709         }
3710
3711         /*
3712          * Rotations and appends need special treatment - they modify
3713          * parts of the tree above them.
3714          *
3715          * Both might pass back a path immediately to the left of the
3716          * one being inserted to. This will cause
3717          * ocfs2_insert_path() to modify the rightmost records of
3718          * left_path to account for an edge insert.
3719          *
3720          * XXX: When modifying this code, keep in mind that an insert
3721          * can wind up skipping both of these two special cases...
3722          */
3723         if (rotate) {
3724                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3725                                               le32_to_cpu(insert_rec->e_cpos),
3726                                               right_path, &left_path);
3727                 if (ret) {
3728                         mlog_errno(ret);
3729                         goto out;
3730                 }
3731
3732                 /*
3733                  * ocfs2_rotate_tree_right() might have extended the
3734                  * transaction without re-journaling our tree root.
3735                  */
3736                 ret = ocfs2_journal_access(handle, inode, di_bh,
3737                                            OCFS2_JOURNAL_ACCESS_WRITE);
3738                 if (ret) {
3739                         mlog_errno(ret);
3740                         goto out;
3741                 }
3742         } else if (type->ins_appending == APPEND_TAIL
3743                    && type->ins_contig != CONTIG_LEFT) {
3744                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
3745                                                right_path, &left_path);
3746                 if (ret) {
3747                         mlog_errno(ret);
3748                         goto out;
3749                 }
3750         }
3751
3752         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
3753                                 insert_rec, type);
3754         if (ret) {
3755                 mlog_errno(ret);
3756                 goto out;
3757         }
3758
3759 out_update_clusters:
3760         if (type->ins_split == SPLIT_NONE)
3761                 ocfs2_update_dinode_clusters(inode, di,
3762                                              le16_to_cpu(insert_rec->e_leaf_clusters));
3763
3764         ret = ocfs2_journal_dirty(handle, di_bh);
3765         if (ret)
3766                 mlog_errno(ret);
3767
3768 out:
3769         ocfs2_free_path(left_path);
3770         ocfs2_free_path(right_path);
3771
3772         return ret;
3773 }
3774
3775 static enum ocfs2_contig_type
3776 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3777                                struct ocfs2_extent_list *el, int index,
3778                                struct ocfs2_extent_rec *split_rec)
3779 {
3780         int status;
3781         enum ocfs2_contig_type ret = CONTIG_NONE;
3782         u32 left_cpos, right_cpos;
3783         struct ocfs2_extent_rec *rec = NULL;
3784         struct ocfs2_extent_list *new_el;
3785         struct ocfs2_path *left_path = NULL, *right_path = NULL;
3786         struct buffer_head *bh;
3787         struct ocfs2_extent_block *eb;
3788
3789         if (index > 0) {
3790                 rec = &el->l_recs[index - 1];
3791         } else if (path->p_tree_depth > 0) {
3792                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3793                                                        path, &left_cpos);
3794                 if (status)
3795                         goto out;
3796
3797                 if (left_cpos != 0) {
3798                         left_path = ocfs2_new_path(path_root_bh(path),
3799                                                    path_root_el(path));
3800                         if (!left_path)
3801                                 goto out;
3802
3803                         status = ocfs2_find_path(inode, left_path, left_cpos);
3804                         if (status)
3805                                 goto out;
3806
3807                         new_el = path_leaf_el(left_path);
3808
3809                         if (le16_to_cpu(new_el->l_next_free_rec) !=
3810                             le16_to_cpu(new_el->l_count)) {
3811                                 bh = path_leaf_bh(left_path);
3812                                 eb = (struct ocfs2_extent_block *)bh->b_data;
3813                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3814                                                                  eb);
3815                                 goto out;
3816                         }
3817                         rec = &new_el->l_recs[
3818                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
3819                 }
3820         }
3821
3822         /*
3823          * We're careful to check for an empty extent record here -
3824          * the merge code will know what to do if it sees one.
3825          */
3826         if (rec) {
3827                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3828                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3829                                 ret = CONTIG_RIGHT;
3830                 } else {
3831                         ret = ocfs2_extent_contig(inode, rec, split_rec);
3832                 }
3833         }
3834
3835         rec = NULL;
3836         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
3837                 rec = &el->l_recs[index + 1];
3838         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
3839                  path->p_tree_depth > 0) {
3840                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
3841                                                         path, &right_cpos);
3842                 if (status)
3843                         goto out;
3844
3845                 if (right_cpos == 0)
3846                         goto out;
3847
3848                 right_path = ocfs2_new_path(path_root_bh(path),
3849                                             path_root_el(path));
3850                 if (!right_path)
3851                         goto out;
3852
3853                 status = ocfs2_find_path(inode, right_path, right_cpos);
3854                 if (status)
3855                         goto out;
3856
3857                 new_el = path_leaf_el(right_path);
3858                 rec = &new_el->l_recs[0];
3859                 if (ocfs2_is_empty_extent(rec)) {
3860                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
3861                                 bh = path_leaf_bh(right_path);
3862                                 eb = (struct ocfs2_extent_block *)bh->b_data;
3863                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3864                                                                  eb);
3865                                 goto out;
3866                         }
3867                         rec = &new_el->l_recs[1];
3868                 }
3869         }
3870
3871         if (rec) {
3872                 enum ocfs2_contig_type contig_type;
3873
3874                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3875
3876                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3877                         ret = CONTIG_LEFTRIGHT;
3878                 else if (ret == CONTIG_NONE)
3879                         ret = contig_type;
3880         }
3881
3882 out:
3883         if (left_path)
3884                 ocfs2_free_path(left_path);
3885         if (right_path)
3886                 ocfs2_free_path(right_path);
3887
3888         return ret;
3889 }
3890
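/*
 * Scan the given leaf extent list for a record that insert_rec is
 * contiguous with, recording the result (and the index of the matching
 * record) in the insert structure.
 */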
3891 static void ocfs2_figure_contig_type(struct inode *inode,
3892                                      struct ocfs2_insert_type *insert,
3893                                      struct ocfs2_extent_list *el,
3894                                      struct ocfs2_extent_rec *insert_rec)
3895 {
3896         int i;
3897         enum ocfs2_contig_type contig_type = CONTIG_NONE;
3898
3899         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3900
3901         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
3902                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
3903                                                   insert_rec);
3904                 if (contig_type != CONTIG_NONE) {
3905                         insert->ins_contig_index = i;
3906                         break;
3907                 }
3908         }
3909         insert->ins_contig = contig_type;
3910 }
3911
3912 /*
3913  * This should only be called against the rightmost leaf extent list.
3914  *
3915  * ocfs2_figure_appending_type() will figure out whether we'll have to
3916  * insert at the tail of the rightmost leaf.
3917  *
3918  * This should also work against the dinode list for trees with 0
3919  * depth. If we consider the dinode list to be the rightmost leaf node
3920  * then the logic here makes sense.
3921  */
3922 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
3923                                         struct ocfs2_extent_list *el,
3924                                         struct ocfs2_extent_rec *insert_rec)
3925 {
3926         int i;
3927         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
3928         struct ocfs2_extent_rec *rec;
3929
3930         insert->ins_appending = APPEND_NONE;
3931
3932         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3933
3934         if (!el->l_next_free_rec)
3935                 goto set_tail_append;
3936
3937         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3938                 /* Were all records empty? */
3939                 if (le16_to_cpu(el->l_next_free_rec) == 1)
3940                         goto set_tail_append;
3941         }
3942
3943         i = le16_to_cpu(el->l_next_free_rec) - 1;
3944         rec = &el->l_recs[i];
3945
3946         if (cpos >=
3947             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
3948                 goto set_tail_append;
3949
3950         return;
3951
3952 set_tail_append:
3953         insert->ins_appending = APPEND_TAIL;
3954 }
3955
3956 /*
3957  * Helper function called at the beginning of an insert.
3958  *
3959  * This computes a few things that are commonly used in the process of
3960  * inserting into the btree:
3961  *   - Whether the new extent is contiguous with an existing one.
3962  *   - The current tree depth.
3963  *   - Whether the insert is an appending one.
3964  *   - The total # of free records in the tree.
3965  *
3966  * All of the information is stored on the ocfs2_insert_type
3967  * structure.
3968  */
3969 static int ocfs2_figure_insert_type(struct inode *inode,
3970                                     struct buffer_head *di_bh,
3971                                     struct buffer_head **last_eb_bh,
3972                                     struct ocfs2_extent_rec *insert_rec,
3973                                     int *free_records,
3974                                     struct ocfs2_insert_type *insert)
3975 {
3976         int ret;
3977         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3978         struct ocfs2_extent_block *eb;
3979         struct ocfs2_extent_list *el;
3980         struct ocfs2_path *path = NULL;
3981         struct buffer_head *bh = NULL;
3982
3983         insert->ins_split = SPLIT_NONE;
3984
3985         el = &di->id2.i_list;
3986         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
3987
3988         if (el->l_tree_depth) {
3989                 /*
3990                  * If we have tree depth, we read in the
3991                  * rightmost extent block ahead of time as
3992                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
3993                  * may want it later.
3994                  */
3995                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
3996                                        le64_to_cpu(di->i_last_eb_blk), &bh,
3997                                        OCFS2_BH_CACHED, inode);
3998                 if (ret) {
3999                         mlog_exit(ret);
4000                         goto out;
4001                 }
4002                 eb = (struct ocfs2_extent_block *) bh->b_data;
4003                 el = &eb->h_list;
4004         }
4005
4006         /*
4007          * Unless we have a contiguous insert, we'll need to know if
4008          * there is room left in our allocation tree for another
4009          * extent record.
4010          *
4011          * XXX: This test is simplistic, we can search for empty
4012          * extent records too.
4013          */
4014         *free_records = le16_to_cpu(el->l_count) -
4015                 le16_to_cpu(el->l_next_free_rec);
4016
4017         if (!insert->ins_tree_depth) {
4018                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4019                 ocfs2_figure_appending_type(insert, el, insert_rec);
4020                 return 0;
4021         }
4022
4023         path = ocfs2_new_inode_path(di_bh);
4024         if (!path) {
4025                 ret = -ENOMEM;
4026                 mlog_errno(ret);
4027                 goto out;
4028         }
4029
4030         /*
4031          * In the case that we're inserting past what the tree
4032          * currently accounts for, ocfs2_find_path() will return for
4033          * us the rightmost tree path. This is accounted for below in
4034          * the appending code.
4035          */
4036         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4037         if (ret) {
4038                 mlog_errno(ret);
4039                 goto out;
4040         }
4041
4042         el = path_leaf_el(path);
4043
4044         /*
4045          * Now that we have the path, there are two things we want to determine:
4046          * 1) Contiguousness (also set contig_index if this is so)
4047          *
4048          * 2) Are we doing an append? We can trivially break this up
4049          *     into two types of appends: simple record append, or a
4050          *     rotate inside the tail leaf.
4051          */
4052         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4053
4054         /*
4055          * The insert code isn't quite ready to deal with all cases of
4056          * left contiguousness. Specifically, if it's an insert into
4057          * the 1st record in a leaf, it will require the adjustment of
4058          * cluster count on the last record of the path directly to its
4059          * left. For now, just catch that case and fool the layers
4060          * above us. This works just fine for tree_depth == 0, which
4061          * is why we allow that above.
4062          */
4063         if (insert->ins_contig == CONTIG_LEFT &&
4064             insert->ins_contig_index == 0)
4065                 insert->ins_contig = CONTIG_NONE;
4066
4067         /*
4068          * Ok, so we can simply compare against last_eb to figure out
4069          * whether we were handed back the rightmost path because the
4070          * cpos isn't covered by the tree yet. That will only happen in
4071          * the case that we're doing a tail append, so maybe we can
4072          * take advantage of that information somehow.
4072          */
4073         if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
4074                 /*
4075                  * Ok, ocfs2_find_path() returned us the rightmost
4076                  * tree path. This might be an appending insert. There are
4077                  * two cases:
4078                  *    1) We're doing a true append at the tail:
4079                  *      -This might even be off the end of the leaf
4080                  *    2) We're "appending" by rotating in the tail
4081                  */
4082                 ocfs2_figure_appending_type(insert, el, insert_rec);
4083         }
4084
4085 out:
4086         ocfs2_free_path(path);
4087
4088         if (ret == 0)
4089                 *last_eb_bh = bh;
4090         else
4091                 brelse(bh);
4092         return ret;
4093 }
4094
4095 /*
4096  * Insert an extent into an inode btree.
4097  *
4098  * The caller needs to update fe->i_clusters
4099  */
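/*
 * A typical call (illustrative only - the variable names here are
 * placeholders), made from within a transaction with metadata reserved
 * in meta_ac:
 *
 *	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, cpos,
 *				     blkno, new_clusters, 0, meta_ac);
 */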
4100 int ocfs2_insert_extent(struct ocfs2_super *osb,
4101                         handle_t *handle,
4102                         struct inode *inode,
4103                         struct buffer_head *fe_bh,
4104                         u32 cpos,
4105                         u64 start_blk,
4106                         u32 new_clusters,
4107                         u8 flags,
4108                         struct ocfs2_alloc_context *meta_ac)
4109 {
4110         int status;
4111         int uninitialized_var(free_records);
4112         struct buffer_head *last_eb_bh = NULL;
4113         struct ocfs2_insert_type insert = {0, };
4114         struct ocfs2_extent_rec rec;
4115
4116         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4117
4118         mlog(0, "add %u clusters at position %u to inode %llu\n",
4119              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4120
4121         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4122                         (OCFS2_I(inode)->ip_clusters != cpos),
4123                         "Device %s, asking for sparse allocation: inode %llu, "
4124                         "cpos %u, clusters %u\n",
4125                         osb->dev_str,
4126                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4127                         OCFS2_I(inode)->ip_clusters);
4128
4129         memset(&rec, 0, sizeof(rec));
4130         rec.e_cpos = cpu_to_le32(cpos);
4131         rec.e_blkno = cpu_to_le64(start_blk);
4132         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4133         rec.e_flags = flags;
4134
4135         status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
4136                                           &free_records, &insert);
4137         if (status < 0) {
4138                 mlog_errno(status);
4139                 goto bail;
4140         }
4141
4142         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4143              "Insert.contig_index: %d, Insert.free_records: %d, "
4144              "Insert.tree_depth: %d\n",
4145              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4146              free_records, insert.ins_tree_depth);
4147
4148         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4149                 status = ocfs2_grow_tree(inode, handle, fe_bh,
4150                                          &insert.ins_tree_depth, &last_eb_bh,
4151                                          meta_ac);
4152                 if (status) {
4153                         mlog_errno(status);
4154                         goto bail;
4155                 }
4156         }
4157
4158         /* Finally, we can add clusters. This might rotate the tree for us. */
4159         status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
4160         if (status < 0)
4161                 mlog_errno(status);
4162         else
4163                 ocfs2_extent_map_insert_rec(inode, &rec);
4164
4165 bail:
4166         if (last_eb_bh)
4167                 brelse(last_eb_bh);
4168
4169         mlog_exit(status);
4170         return status;
4171 }
4172
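/*
 * Build the right-hand half of a record split: split_rec covers the
 * tail of rec from cpos onward, with e_blkno advanced to match and
 * the flags carried over.
 */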
4173 static void ocfs2_make_right_split_rec(struct super_block *sb,
4174                                        struct ocfs2_extent_rec *split_rec,
4175                                        u32 cpos,
4176                                        struct ocfs2_extent_rec *rec)
4177 {
4178         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4179         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4180
4181         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4182
4183         split_rec->e_cpos = cpu_to_le32(cpos);
4184         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4185
4186         split_rec->e_blkno = rec->e_blkno;
4187         le64_add_cpu(&split_rec->e_blkno,
4188                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4189
4190         split_rec->e_flags = rec->e_flags;
4191 }
4192
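/*
 * Split the record at split_index so that split_rec becomes its own
 * record in the tree. A split_rec which lies in the middle of the
 * existing record takes two passes - the clusters to its right are
 * split off and inserted first, then we loop back (via the leftright
 * label) to split out the originally requested range.
 */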
4193 static int ocfs2_split_and_insert(struct inode *inode,
4194                                   handle_t *handle,
4195                                   struct ocfs2_path *path,
4196                                   struct buffer_head *di_bh,
4197                                   struct buffer_head **last_eb_bh,
4198                                   int split_index,
4199                                   struct ocfs2_extent_rec *orig_split_rec,
4200                                   struct ocfs2_alloc_context *meta_ac)
4201 {
4202         int ret = 0, depth;
4203         unsigned int insert_range, rec_range, do_leftright = 0;
4204         struct ocfs2_extent_rec tmprec;
4205         struct ocfs2_extent_list *rightmost_el;
4206         struct ocfs2_extent_rec rec;
4207         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4208         struct ocfs2_insert_type insert;
4209         struct ocfs2_extent_block *eb;
4210         struct ocfs2_dinode *di;
4211
4212 leftright:
4213         /*
4214          * Store a copy of the record on the stack - it might move
4215          * around as the tree is manipulated below.
4216          */
4217         rec = path_leaf_el(path)->l_recs[split_index];
4218
4219         di = (struct ocfs2_dinode *)di_bh->b_data;
4220         rightmost_el = &di->id2.i_list;
4221
4222         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4223         if (depth) {
4224                 BUG_ON(!(*last_eb_bh));
4225                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4226                 rightmost_el = &eb->h_list;
4227         }
4228
4229         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4230             le16_to_cpu(rightmost_el->l_count)) {
4231                 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
4232                                       meta_ac);
4233                 if (ret) {
4234                         mlog_errno(ret);
4235                         goto out;
4236                 }
4237         }
4238
4239         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4240         insert.ins_appending = APPEND_NONE;
4241         insert.ins_contig = CONTIG_NONE;
4242         insert.ins_tree_depth = depth;
4243
4244         insert_range = le32_to_cpu(split_rec.e_cpos) +
4245                 le16_to_cpu(split_rec.e_leaf_clusters);
4246         rec_range = le32_to_cpu(rec.e_cpos) +
4247                 le16_to_cpu(rec.e_leaf_clusters);
4248
4249         if (split_rec.e_cpos == rec.e_cpos) {
4250                 insert.ins_split = SPLIT_LEFT;
4251         } else if (insert_range == rec_range) {
4252                 insert.ins_split = SPLIT_RIGHT;
4253         } else {
4254                 /*
4255                  * Left/right split. We fake this as a right split
4256                  * first and then make a second pass as a left split.
4257                  */
4258                 insert.ins_split = SPLIT_RIGHT;
4259
4260                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4261                                            &rec);
4262
4263                 split_rec = tmprec;
4264
4265                 BUG_ON(do_leftright);
4266                 do_leftright = 1;
4267         }
4268
4269         ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
4270                                      &insert);
4271         if (ret) {
4272                 mlog_errno(ret);
4273                 goto out;
4274         }
4275
4276         if (do_leftright == 1) {
4277                 u32 cpos;
4278                 struct ocfs2_extent_list *el;
4279
4280                 do_leftright++;
4281                 split_rec = *orig_split_rec;
4282
4283                 ocfs2_reinit_path(path, 1);
4284
4285                 cpos = le32_to_cpu(split_rec.e_cpos);
4286                 ret = ocfs2_find_path(inode, path, cpos);
4287                 if (ret) {
4288                         mlog_errno(ret);
4289                         goto out;
4290                 }
4291
4292                 el = path_leaf_el(path);
4293                 split_index = ocfs2_search_extent_list(el, cpos);
4294                 goto leftright;
4295         }
4296 out:
4297
4298         return ret;
4299 }
4300
4301 /*
4302  * Mark part or all of the extent record at split_index in the leaf
4303  * pointed to by path as written. This removes the unwritten
4304  * extent flag.
4305  *
4306  * Care is taken to handle contiguousness so as to not grow the tree.
4307  *
4308  * meta_ac is not strictly necessary - we only truly need it if growth
4309  * of the tree is required. All other cases will degrade into a less
4310  * optimal tree layout.
4311  *
4312  * last_eb_bh should be the rightmost leaf block for any inode with a
4313  * btree. Since a split may grow the tree or a merge might shrink it,
 * the caller cannot trust the contents of that buffer after this call.
4314  *
4315  * This code is optimized for readability - several passes might be
4316  * made over certain portions of the tree. All of those blocks will
4317  * have been brought into cache (and pinned via the journal), so the
4318  * extra overhead is not expressed in terms of disk reads.
4319  */
4320 static int __ocfs2_mark_extent_written(struct inode *inode,
4321                                        struct buffer_head *di_bh,
4322                                        handle_t *handle,
4323                                        struct ocfs2_path *path,
4324                                        int split_index,
4325                                        struct ocfs2_extent_rec *split_rec,
4326                                        struct ocfs2_alloc_context *meta_ac,
4327                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4328 {
4329         int ret = 0;
4330         struct ocfs2_extent_list *el = path_leaf_el(path);
4331         struct buffer_head *last_eb_bh = NULL;
4332         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4333         struct ocfs2_merge_ctxt ctxt;
4334         struct ocfs2_extent_list *rightmost_el;
4335
4336         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4337                 ret = -EIO;
4338                 mlog_errno(ret);
4339                 goto out;
4340         }
4341
4342         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4343             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4344              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4345                 ret = -EIO;
4346                 mlog_errno(ret);
4347                 goto out;
4348         }
4349
4350         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4351                                                             split_index,
4352                                                             split_rec);
4353
4354         /*
4355          * The core merge / split code wants to know how much room is
4356          * left in this inode's allocation tree, so we pass the
4357          * rightmost extent list.
4358          */
4359         if (path->p_tree_depth) {
4360                 struct ocfs2_extent_block *eb;
4361                 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4362
4363                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4364                                        le64_to_cpu(di->i_last_eb_blk),
4365                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4366                 if (ret) {
4367                         mlog_errno(ret);
4368                         goto out;
4369                 }
4370
4371                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4372                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4373                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4374                         ret = -EROFS;
4375                         goto out;
4376                 }
4377
4378                 rightmost_el = &eb->h_list;
4379         } else
4380                 rightmost_el = path_root_el(path);
4381
4382         if (rec->e_cpos == split_rec->e_cpos &&
4383             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4384                 ctxt.c_split_covers_rec = 1;
4385         else
4386                 ctxt.c_split_covers_rec = 0;
4387
4388         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4389
4390         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4391              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4392              ctxt.c_split_covers_rec);
4393
4394         if (ctxt.c_contig_type == CONTIG_NONE) {
4395                 if (ctxt.c_split_covers_rec)
4396                         el->l_recs[split_index] = *split_rec;
4397                 else
4398                         ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4399                                                      &last_eb_bh, split_index,
4400                                                      split_rec, meta_ac);
4401                 if (ret)
4402                         mlog_errno(ret);
4403         } else {
4404                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4405                                                 split_index, split_rec,
4406                                                 dealloc, &ctxt);
4407                 if (ret)
4408                         mlog_errno(ret);
4409         }
4410
4411 out:
4412         brelse(last_eb_bh);
4413         return ret;
4414 }
4415
4416 /*
4417  * Mark the already-existing extent at cpos as written for len clusters.
4418  *
4419  * If the existing extent is larger than the request, initiate a
4420  * split. An attempt will be made at merging with adjacent extents.
4421  *
4422  * The caller is responsible for passing down meta_ac if we'll need it.
4423  */
4424 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4425                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4426                               struct ocfs2_alloc_context *meta_ac,
4427                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4428 {
4429         int ret, index;
4430         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4431         struct ocfs2_extent_rec split_rec;
4432         struct ocfs2_path *left_path = NULL;
4433         struct ocfs2_extent_list *el;
4434
4435         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4436              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4437
4438         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4439                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4440                             "that are being written to, but the feature bit "
4441                             "is not set in the super block.",
4442                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4443                 ret = -EROFS;
4444                 goto out;
4445         }
4446
4447         /*
4448          * XXX: This should be fixed up so that we just re-insert the
4449          * next extent records.
4450          */
4451         ocfs2_extent_map_trunc(inode, 0);
4452
4453         left_path = ocfs2_new_inode_path(di_bh);
4454         if (!left_path) {
4455                 ret = -ENOMEM;
4456                 mlog_errno(ret);
4457                 goto out;
4458         }
4459
4460         ret = ocfs2_find_path(inode, left_path, cpos);
4461         if (ret) {
4462                 mlog_errno(ret);
4463                 goto out;
4464         }
4465         el = path_leaf_el(left_path);
4466
4467         index = ocfs2_search_extent_list(el, cpos);
4468         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4469                 ocfs2_error(inode->i_sb,
4470                             "Inode %llu has an extent at cpos %u which can no "
4471                             "longer be found.\n",
4472                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4473                 ret = -EROFS;
4474                 goto out;
4475         }
4476
4477         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4478         split_rec.e_cpos = cpu_to_le32(cpos);
4479         split_rec.e_leaf_clusters = cpu_to_le16(len);
4480         split_rec.e_blkno = cpu_to_le64(start_blkno);
4481         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4482         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4483
4484         ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4485                                           index, &split_rec, meta_ac, dealloc);
4486         if (ret)
4487                 mlog_errno(ret);
4488
4489 out:
4490         ocfs2_free_path(left_path);
4491         return ret;
4492 }
4493
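/*
 * Split the record at index so that the clusters from new_range to
 * the end of the record become their own extent record. The handle is
 * extended, and the tree grown first if the rightmost extent list has
 * no room for the insert.
 */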
4494 static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4495                             handle_t *handle, struct ocfs2_path *path,
4496                             int index, u32 new_range,
4497                             struct ocfs2_alloc_context *meta_ac)
4498 {
4499         int ret, depth, credits = handle->h_buffer_credits;
4500         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4501         struct buffer_head *last_eb_bh = NULL;
4502         struct ocfs2_extent_block *eb;
4503         struct ocfs2_extent_list *rightmost_el, *el;
4504         struct ocfs2_extent_rec split_rec;
4505         struct ocfs2_extent_rec *rec;
4506         struct ocfs2_insert_type insert;
4507
4508         /*
4509          * Setup the record to split before we grow the tree.
4510          */
4511         el = path_leaf_el(path);
4512         rec = &el->l_recs[index];
4513         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4514
4515         depth = path->p_tree_depth;
4516         if (depth > 0) {
4517                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4518                                        le64_to_cpu(di->i_last_eb_blk),
4519                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4520                 if (ret < 0) {
4521                         mlog_errno(ret);
4522                         goto out;
4523                 }
4524
4525                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4526                 rightmost_el = &eb->h_list;
4527         } else
4528                 rightmost_el = path_leaf_el(path);
4529
4530         credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4531         ret = ocfs2_extend_trans(handle, credits);
4532         if (ret) {
4533                 mlog_errno(ret);
4534                 goto out;
4535         }
4536
4537         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4538             le16_to_cpu(rightmost_el->l_count)) {
4539                 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4540                                       meta_ac);
4541                 if (ret) {
4542                         mlog_errno(ret);
4543                         goto out;
4544                 }
4545         }
4546
4547         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4548         insert.ins_appending = APPEND_NONE;
4549         insert.ins_contig = CONTIG_NONE;
4550         insert.ins_split = SPLIT_RIGHT;
4551         insert.ins_tree_depth = depth;
4552
4553         ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4554         if (ret)
4555                 mlog_errno(ret);
4556
4557 out:
4558         brelse(last_eb_bh);
4559         return ret;
4560 }
4561
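/*
 * Remove the clusters in [cpos, cpos + len) from the extent record at
 * index. The range must cover the whole record or share one of its
 * edges; whole-record removal is followed by a left rotation to fill
 * the resulting hole, and edge truncation may require adjustment of
 * the records in the tree above us.
 */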
4562 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4563                               struct ocfs2_path *path, int index,
4564                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4565                               u32 cpos, u32 len)
4566 {
4567         int ret;
4568         u32 left_cpos, rec_range, trunc_range;
4569         int wants_rotate = 0, is_rightmost_tree_rec = 0;
4570         struct super_block *sb = inode->i_sb;
4571         struct ocfs2_path *left_path = NULL;
4572         struct ocfs2_extent_list *el = path_leaf_el(path);
4573         struct ocfs2_extent_rec *rec;
4574         struct ocfs2_extent_block *eb;
4575
4576         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4577                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4578                 if (ret) {
4579                         mlog_errno(ret);
4580                         goto out;
4581                 }
4582
4583                 index--;
4584         }
4585
4586         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4587             path->p_tree_depth) {
4588                 /*
4589                  * Check whether this is the rightmost tree record. If
4590                  * we remove all of this record or part of its right
4591                  * edge then an update of the record lengths above it
4592                  * will be required.
4593                  */
4594                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4595                 if (eb->h_next_leaf_blk == 0)
4596                         is_rightmost_tree_rec = 1;
4597         }
4598
4599         rec = &el->l_recs[index];
4600         if (index == 0 && path->p_tree_depth &&
4601             le32_to_cpu(rec->e_cpos) == cpos) {
4602                 /*
4603                  * Changing the leftmost offset (via partial or whole
4604                  * record truncate) of an interior (or rightmost) path
4605                  * means we have to update the subtree that is formed
4606          * by this leaf and the one to its left.
4607                  *
4608                  * There are two cases we can skip:
4609                  *   1) Path is the leftmost one in our inode tree.
4610                  *   2) The leaf is rightmost and will be empty after
4611                  *      we remove the extent record - the rotate code
4612                  *      knows how to update the newly formed edge.
4613                  */
4614
4615                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4616                                                     &left_cpos);
4617                 if (ret) {
4618                         mlog_errno(ret);
4619                         goto out;
4620                 }
4621
4622                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4623                         left_path = ocfs2_new_path(path_root_bh(path),
4624                                                    path_root_el(path));
4625                         if (!left_path) {
4626                                 ret = -ENOMEM;
4627                                 mlog_errno(ret);
4628                                 goto out;
4629                         }
4630
4631                         ret = ocfs2_find_path(inode, left_path, left_cpos);
4632                         if (ret) {
4633                                 mlog_errno(ret);
4634                                 goto out;
4635                         }
4636                 }
4637         }
4638
4639         ret = ocfs2_extend_rotate_transaction(handle, 0,
4640                                               handle->h_buffer_credits,
4641                                               path);
4642         if (ret) {
4643                 mlog_errno(ret);
4644                 goto out;
4645         }
4646
4647         ret = ocfs2_journal_access_path(inode, handle, path);
4648         if (ret) {
4649                 mlog_errno(ret);
4650                 goto out;
4651         }
4652
4653         ret = ocfs2_journal_access_path(inode, handle, left_path);
4654         if (ret) {
4655                 mlog_errno(ret);
4656                 goto out;
4657         }
4658
4659         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4660         trunc_range = cpos + len;
4661
4662         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4663                 int next_free;
4664
4665                 memset(rec, 0, sizeof(*rec));
4666                 ocfs2_cleanup_merge(el, index);
4667                 wants_rotate = 1;
4668
4669                 next_free = le16_to_cpu(el->l_next_free_rec);
4670                 if (is_rightmost_tree_rec && next_free > 1) {
4671                         /*
4672                          * We skip the edge update if this path will
4673                          * be deleted by the rotate code.
4674                          */
4675                         rec = &el->l_recs[next_free - 1];
4676                         ocfs2_adjust_rightmost_records(inode, handle, path,
4677                                                        rec);
4678                 }
4679         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4680                 /* Remove leftmost portion of the record. */
4681                 le32_add_cpu(&rec->e_cpos, len);
4682                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4683                 le16_add_cpu(&rec->e_leaf_clusters, -len);
4684         } else if (rec_range == trunc_range) {
4685                 /* Remove rightmost portion of the record */
4686                 le16_add_cpu(&rec->e_leaf_clusters, -len);
4687                 if (is_rightmost_tree_rec)
4688                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4689         } else {
4690                 /* Caller should have trapped this. */
4691                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4692                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4693                      le32_to_cpu(rec->e_cpos),
4694                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4695                 BUG();
4696         }
4697
4698         if (left_path) {
4699                 int subtree_index;
4700
4701                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4702                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4703                                            subtree_index);
4704         }
4705
4706         ocfs2_journal_dirty(handle, path_leaf_bh(path));
4707
4708         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4709         if (ret) {
4710                 mlog_errno(ret);
4711                 goto out;
4712         }
4713
4714 out:
4715         ocfs2_free_path(left_path);
4716         return ret;
4717 }
4718
4719 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4720                         u32 cpos, u32 len, handle_t *handle,
4721                         struct ocfs2_alloc_context *meta_ac,
4722                         struct ocfs2_cached_dealloc_ctxt *dealloc)
4723 {
4724         int ret, index;
4725         u32 rec_range, trunc_range;
4726         struct ocfs2_extent_rec *rec;
4727         struct ocfs2_extent_list *el;
4728         struct ocfs2_path *path;
4729
4730         ocfs2_extent_map_trunc(inode, 0);
4731
4732         path = ocfs2_new_inode_path(di_bh);
4733         if (!path) {
4734                 ret = -ENOMEM;
4735                 mlog_errno(ret);
4736                 goto out;
4737         }
4738
4739         ret = ocfs2_find_path(inode, path, cpos);
4740         if (ret) {
4741                 mlog_errno(ret);
4742                 goto out;
4743         }
4744
4745         el = path_leaf_el(path);
4746         index = ocfs2_search_extent_list(el, cpos);
4747         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4748                 ocfs2_error(inode->i_sb,
4749                             "Inode %llu has an extent at cpos %u which can no "
4750                             "longer be found.\n",
4751                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4752                 ret = -EROFS;
4753                 goto out;
4754         }
4755
4756         /*
4757          * We have 3 cases of extent removal:
4758          *   1) Range covers the entire extent rec
4759          *   2) Range begins or ends on one edge of the extent rec
4760          *   3) Range is in the middle of the extent rec (no shared edges)
4761          *
4762          * For case 1 we remove the extent rec and left rotate to
4763          * fill the hole.
4764          *
4765          * For case 2 we just shrink the existing extent rec, with a
4766          * tree update if the shrinking edge is also the edge of an
4767          * extent block.
4768          *
4769          * For case 3 we do a right split to turn the extent rec into
4770          * something case 2 can handle.
4771          */
4772         rec = &el->l_recs[index];
4773         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4774         trunc_range = cpos + len;
4775
4776         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4777
4778         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4779              "(cpos %u, len %u)\n",
4780              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4781              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4782
4783         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4784                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4785                                          cpos, len);
4786                 if (ret) {
4787                         mlog_errno(ret);
4788                         goto out;
4789                 }
4790         } else {
4791                 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4792                                        trunc_range, meta_ac);
4793                 if (ret) {
4794                         mlog_errno(ret);
4795                         goto out;
4796                 }
4797
4798                 /*
4799                  * The split could have manipulated the tree enough to
4800                  * move the record location, so we have to look for it again.
4801                  */
4802                 ocfs2_reinit_path(path, 1);
4803
4804                 ret = ocfs2_find_path(inode, path, cpos);
4805                 if (ret) {
4806                         mlog_errno(ret);
4807                         goto out;
4808                 }
4809
4810                 el = path_leaf_el(path);
4811                 index = ocfs2_search_extent_list(el, cpos);
4812                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4813                         ocfs2_error(inode->i_sb,
4814                                     "Inode %llu: split at cpos %u lost record.",
4815                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
4816                                     cpos);
4817                         ret = -EROFS;
4818                         goto out;
4819                 }
4820
4821                 /*
4822                  * Double check our values here. If anything is fishy,
4823                  * it's easier to catch it at the top level.
4824                  */
4825                 rec = &el->l_recs[index];
4826                 rec_range = le32_to_cpu(rec->e_cpos) +
4827                         ocfs2_rec_clusters(el, rec);
4828                 if (rec_range != trunc_range) {
4829                         ocfs2_error(inode->i_sb,
4830                                     "Inode %llu: error after split at cpos %u, "
4831                                     "trunc len %u, existing record is (%u,%u)",
4832                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
4833                                     cpos, len, le32_to_cpu(rec->e_cpos),
4834                                     ocfs2_rec_clusters(el, rec));
4835                         ret = -EROFS;
4836                         goto out;
4837                 }
4838
4839                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4840                                          cpos, len);
4841                 if (ret) {
4842                         mlog_errno(ret);
4843                         goto out;
4844                 }
4845         }
4846
4847 out:
4848         ocfs2_free_path(path);
4849         return ret;
4850 }
4851
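/*
 * Returns nonzero if the truncate log is full - callers must flush it
 * before appending any more records.
 */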
4852 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
4853 {
4854         struct buffer_head *tl_bh = osb->osb_tl_bh;
4855         struct ocfs2_dinode *di;
4856         struct ocfs2_truncate_log *tl;
4857
4858         di = (struct ocfs2_dinode *) tl_bh->b_data;
4859         tl = &di->id2.i_dealloc;
4860
4861         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
4862                         "slot %d, invalid truncate log parameters: used = "
4863                         "%u, count = %u\n", osb->slot_num,
4864                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
4865         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
4866 }
4867
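/*
 * A record starting at new_start can be coalesced with the tail of
 * the truncate log if the last logged range ends exactly where the
 * new one begins.
 */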
4868 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
4869                                            unsigned int new_start)
4870 {
4871         unsigned int tail_index;
4872         unsigned int current_tail;
4873
4874         /* No records, nothing to coalesce */
4875         if (!le16_to_cpu(tl->tl_used))
4876                 return 0;
4877
4878         tail_index = le16_to_cpu(tl->tl_used) - 1;
4879         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
4880         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
4881
4882         return current_tail == new_start;
4883 }
4884
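/*
 * Log a range of clusters for deferred freeing, coalescing with the
 * tail record when the ranges are contiguous. The caller must hold
 * tl_inode->i_mutex and must have flushed the log if it was full.
 */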
4885 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
4886                               handle_t *handle,
4887                               u64 start_blk,
4888                               unsigned int num_clusters)
4889 {
4890         int status, index;
4891         unsigned int start_cluster, tl_count;
4892         struct inode *tl_inode = osb->osb_tl_inode;
4893         struct buffer_head *tl_bh = osb->osb_tl_bh;
4894         struct ocfs2_dinode *di;
4895         struct ocfs2_truncate_log *tl;
4896
4897         mlog_entry("start_blk = %llu, num_clusters = %u\n",
4898                    (unsigned long long)start_blk, num_clusters);
4899
4900         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
4901
4902         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
4903
4904         di = (struct ocfs2_dinode *) tl_bh->b_data;
4905         tl = &di->id2.i_dealloc;
4906         if (!OCFS2_IS_VALID_DINODE(di)) {
4907                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
4908                 status = -EIO;
4909                 goto bail;
4910         }
4911
4912         tl_count = le16_to_cpu(tl->tl_count);
4913         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
4914                         tl_count == 0,
4915                         "Truncate record count on #%llu invalid: "
4916                         "wanted %u, actual %u\n",
4917                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
4918                         ocfs2_truncate_recs_per_inode(osb->sb),
4919                         le16_to_cpu(tl->tl_count));
4920
4921         /* Caller should have known to flush before calling us. */
4922         index = le16_to_cpu(tl->tl_used);
4923         if (index >= tl_count) {
4924                 status = -ENOSPC;
4925                 mlog_errno(status);
4926                 goto bail;
4927         }
4928
4929         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
4930                                       OCFS2_JOURNAL_ACCESS_WRITE);
4931         if (status < 0) {
4932                 mlog_errno(status);
4933                 goto bail;
4934         }
4935
4936         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
4937              "%llu (index = %d)\n", num_clusters, start_cluster,
4938              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
4939
4940         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
4941                 /*
4942                  * Move index back to the record we are coalescing with.
4943                  * ocfs2_truncate_log_can_coalesce() guarantees index is nonzero.
4944                  */
4945                 index--;
4946
4947                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
4948                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
4949                      index, le32_to_cpu(tl->tl_recs[index].t_start),
4950                      num_clusters);
4951         } else {
4952                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
4953                 tl->tl_used = cpu_to_le16(index + 1);
4954         }
4955         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
4956
4957         status = ocfs2_journal_dirty(handle, tl_bh);
4958         if (status < 0) {
4959                 mlog_errno(status);
4960                 goto bail;
4961         }
4962
4963 bail:
4964         mlog_exit(status);
4965         return status;
4966 }
4967
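/*
 * Free every range in the truncate log back to the bitmap inode
 * passed in, working from the tail and shrinking tl_used as each
 * record is processed. The handle is extended for every record, so
 * the caller only needs enough credits for the truncate log dinode
 * itself.
 */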
4968 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
4969                                          handle_t *handle,
4970                                          struct inode *data_alloc_inode,
4971                                          struct buffer_head *data_alloc_bh)
4972 {
4973         int status = 0;
4974         int i;
4975         unsigned int num_clusters;
4976         u64 start_blk;
4977         struct ocfs2_truncate_rec rec;
4978         struct ocfs2_dinode *di;
4979         struct ocfs2_truncate_log *tl;
4980         struct inode *tl_inode = osb->osb_tl_inode;
4981         struct buffer_head *tl_bh = osb->osb_tl_bh;
4982
4983         mlog_entry_void();
4984
4985         di = (struct ocfs2_dinode *) tl_bh->b_data;
4986         tl = &di->id2.i_dealloc;
4987         i = le16_to_cpu(tl->tl_used) - 1;
4988         while (i >= 0) {
4989                 /* Caller has given us at least enough credits to
4990                  * update the truncate log dinode */
4991                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
4992                                               OCFS2_JOURNAL_ACCESS_WRITE);
4993                 if (status < 0) {
4994                         mlog_errno(status);
4995                         goto bail;
4996                 }
4997
4998                 tl->tl_used = cpu_to_le16(i);
4999
5000                 status = ocfs2_journal_dirty(handle, tl_bh);
5001                 if (status < 0) {
5002                         mlog_errno(status);
5003                         goto bail;
5004                 }
5005
5006                 /* TODO: Perhaps we can calculate the bulk of the
5007                  * credits up front rather than extending like
5008                  * this. */
5009                 status = ocfs2_extend_trans(handle,
5010                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5011                 if (status < 0) {
5012                         mlog_errno(status);
5013                         goto bail;
5014                 }
5015
5016                 rec = tl->tl_recs[i];
5017                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5018                                                     le32_to_cpu(rec.t_start));
5019                 num_clusters = le32_to_cpu(rec.t_clusters);
5020
5021                 /* if start_blk is not set, we ignore the record as
5022                  * invalid. */
5023                 if (start_blk) {
5024                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5025                              i, le32_to_cpu(rec.t_start), num_clusters);
5026
5027                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5028                                                      data_alloc_bh, start_blk,
5029                                                      num_clusters);
5030                         if (status < 0) {
5031                                 mlog_errno(status);
5032                                 goto bail;
5033                         }
5034                 }
5035                 i--;
5036         }
5037
5038 bail:
5039         mlog_exit(status);
5040         return status;
5041 }
5042
5043 /* Expects you to already be holding tl_inode->i_mutex */
5044 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5045 {
5046         int status;
5047         unsigned int num_to_flush;
5048         handle_t *handle;
5049         struct inode *tl_inode = osb->osb_tl_inode;
5050         struct inode *data_alloc_inode = NULL;
5051         struct buffer_head *tl_bh = osb->osb_tl_bh;
5052         struct buffer_head *data_alloc_bh = NULL;
5053         struct ocfs2_dinode *di;
5054         struct ocfs2_truncate_log *tl;
5055
5056         mlog_entry_void();
5057
5058         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5059
5060         di = (struct ocfs2_dinode *) tl_bh->b_data;
5061         tl = &di->id2.i_dealloc;
5062         if (!OCFS2_IS_VALID_DINODE(di)) {
5063                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5064                 status = -EIO;
5065                 goto out;
5066         }
5067
5068         num_to_flush = le16_to_cpu(tl->tl_used);
5069         mlog(0, "Flush %u records from truncate log #%llu\n",
5070              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5071         if (!num_to_flush) {
5072                 status = 0;
5073                 goto out;
5074         }
5075
5076         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5077                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5078                                                        OCFS2_INVALID_SLOT);
5079         if (!data_alloc_inode) {
5080                 status = -EINVAL;
5081                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5082                 goto out;
5083         }
5084
5085         mutex_lock(&data_alloc_inode->i_mutex);
5086
5087         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5088         if (status < 0) {
5089                 mlog_errno(status);
5090                 goto out_mutex;
5091         }
5092
5093         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5094         if (IS_ERR(handle)) {
5095                 status = PTR_ERR(handle);
5096                 mlog_errno(status);
5097                 goto out_unlock;
5098         }
5099
5100         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5101                                                data_alloc_bh);
5102         if (status < 0)
5103                 mlog_errno(status);
5104
5105         ocfs2_commit_trans(osb, handle);
5106
5107 out_unlock:
5108         brelse(data_alloc_bh);
5109         ocfs2_inode_unlock(data_alloc_inode, 1);
5110
5111 out_mutex:
5112         mutex_unlock(&data_alloc_inode->i_mutex);
5113         iput(data_alloc_inode);
5114
5115 out:
5116         mlog_exit(status);
5117         return status;
5118 }
5119
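/* Takes tl_inode->i_mutex around __ocfs2_flush_truncate_log(). */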
5120 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5121 {
5122         int status;
5123         struct inode *tl_inode = osb->osb_tl_inode;
5124
5125         mutex_lock(&tl_inode->i_mutex);
5126         status = __ocfs2_flush_truncate_log(osb);
5127         mutex_unlock(&tl_inode->i_mutex);
5128
5129         return status;
5130 }
5131
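/* Delayed work callback which flushes this node's truncate log. */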
5132 static void ocfs2_truncate_log_worker(struct work_struct *work)
5133 {
5134         int status;
5135         struct ocfs2_super *osb =
5136                 container_of(work, struct ocfs2_super,
5137                              osb_truncate_log_wq.work);
5138
5139         mlog_entry_void();
5140
5141         status = ocfs2_flush_truncate_log(osb);
5142         if (status < 0)
5143                 mlog_errno(status);
5144         else
5145                 ocfs2_init_inode_steal_slot(osb);
5146
5147         mlog_exit(status);
5148 }
5149
5150 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
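/*
 * Queue a delayed truncate log flush. If cancel is set, any flush
 * already queued is cancelled first, pushing the work further out.
 */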
5151 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5152                                        int cancel)
5153 {
5154         if (osb->osb_tl_inode) {
5155                 /* We want to push off log flushes while truncates are
5156                  * still running. */
5157                 if (cancel)
5158                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5159
5160                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5161                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5162         }
5163 }
5164
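/*
 * Look up the truncate log system inode for slot_num and read its
 * dinode block. On success the caller owns a reference to both the
 * inode and the buffer head.
 */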
5165 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5166                                        int slot_num,
5167                                        struct inode **tl_inode,
5168                                        struct buffer_head **tl_bh)
5169 {
5170         int status;
5171         struct inode *inode = NULL;
5172         struct buffer_head *bh = NULL;
5173
5174         inode = ocfs2_get_system_file_inode(osb,
5175                                            TRUNCATE_LOG_SYSTEM_INODE,
5176                                            slot_num);
5177         if (!inode) {
5178                 status = -EINVAL;
5179                 mlog(ML_ERROR, "Could not load truncate log inode!\n");
5180                 goto bail;
5181         }
5182
5183         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5184                                   OCFS2_BH_CACHED, inode);
5185         if (status < 0) {
5186                 iput(inode);
5187                 mlog_errno(status);
5188                 goto bail;
5189         }
5190
5191         *tl_inode = inode;
5192         *tl_bh    = bh;
5193 bail:
5194         mlog_exit(status);
5195         return status;
5196 }
5197
5198 /* Called during the 1st stage of node recovery. We stamp a clean
5199  * truncate log and pass back a copy for processing later. If the
5200  * truncate log does not require processing, *tl_copy is set to
5201  * NULL. */
5202 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5203                                       int slot_num,
5204                                       struct ocfs2_dinode **tl_copy)
5205 {
5206         int status;
5207         struct inode *tl_inode = NULL;
5208         struct buffer_head *tl_bh = NULL;
5209         struct ocfs2_dinode *di;
5210         struct ocfs2_truncate_log *tl;
5211
5212         *tl_copy = NULL;
5213
5214         mlog(0, "recover truncate log from slot %d\n", slot_num);
5215
5216         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5217         if (status < 0) {
5218                 mlog_errno(status);
5219                 goto bail;
5220         }
5221
5222         di = (struct ocfs2_dinode *) tl_bh->b_data;
5223         tl = &di->id2.i_dealloc;
5224         if (!OCFS2_IS_VALID_DINODE(di)) {
5225                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5226                 status = -EIO;
5227                 goto bail;
5228         }
5229
5230         if (le16_to_cpu(tl->tl_used)) {
5231                 mlog(0, "We'll have %u logs to recover\n",
5232                      le16_to_cpu(tl->tl_used));
5233
5234                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5235                 if (!(*tl_copy)) {
5236                         status = -ENOMEM;
5237                         mlog_errno(status);
5238                         goto bail;
5239                 }
5240
5241                 /* Assuming the write-out below goes well, this copy
5242                  * will be passed back to recovery for processing. */
5243                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5244
5245                 /* All we need to do to clear the truncate log is set
5246                  * tl_used. */
5247                 tl->tl_used = 0;
5248
5249                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5250                 if (status < 0) {
5251                         mlog_errno(status);
5252                         goto bail;
5253                 }
5254         }
5255
5256 bail:
5257         if (tl_inode)
5258                 iput(tl_inode);
5259         if (tl_bh)
5260                 brelse(tl_bh);
5261
5262         if (status < 0 && (*tl_copy)) {
5263                 kfree(*tl_copy);
5264                 *tl_copy = NULL;
5265         }
5266
5267         mlog_exit(status);
5268         return status;
5269 }
5270
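/*
 * Second stage of truncate log recovery - replay the records captured
 * by ocfs2_begin_truncate_log_recovery() into our own truncate log,
 * flushing it whenever it fills up.
 */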
5271 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5272                                          struct ocfs2_dinode *tl_copy)
5273 {
5274         int status = 0;
5275         int i;
5276         unsigned int clusters, num_recs, start_cluster;
5277         u64 start_blk;
5278         handle_t *handle;
5279         struct inode *tl_inode = osb->osb_tl_inode;
5280         struct ocfs2_truncate_log *tl;
5281
5282         mlog_entry_void();
5283
5284         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5285                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5286                 return -EINVAL;
5287         }
5288
5289         tl = &tl_copy->id2.i_dealloc;
5290         num_recs = le16_to_cpu(tl->tl_used);
5291         mlog(0, "cleanup %u records from %llu\n", num_recs,
5292              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5293
5294         mutex_lock(&tl_inode->i_mutex);
5295         for(i = 0; i < num_recs; i++) {
5296                 if (ocfs2_truncate_log_needs_flush(osb)) {
5297                         status = __ocfs2_flush_truncate_log(osb);
5298                         if (status < 0) {
5299                                 mlog_errno(status);
5300                                 goto bail_up;
5301                         }
5302                 }
5303
5304                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5305                 if (IS_ERR(handle)) {
5306                         status = PTR_ERR(handle);
5307                         mlog_errno(status);
5308                         goto bail_up;
5309                 }
5310
5311                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5312                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5313                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5314
5315                 status = ocfs2_truncate_log_append(osb, handle,
5316                                                    start_blk, clusters);
5317                 ocfs2_commit_trans(osb, handle);
5318                 if (status < 0) {
5319                         mlog_errno(status);
5320                         goto bail_up;
5321                 }
5322         }
5323
5324 bail_up:
5325         mutex_unlock(&tl_inode->i_mutex);
5326
5327         mlog_exit(status);
5328         return status;
5329 }
5330
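/*
 * Flush any remaining truncate log records and drop the references
 * taken by ocfs2_truncate_log_init().
 */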
5331 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5332 {
5333         int status;
5334         struct inode *tl_inode = osb->osb_tl_inode;
5335
5336         mlog_entry_void();
5337
5338         if (tl_inode) {
5339                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5340                 flush_workqueue(ocfs2_wq);
5341
5342                 status = ocfs2_flush_truncate_log(osb);
5343                 if (status < 0)
5344                         mlog_errno(status);
5345
5346                 brelse(osb->osb_tl_bh);
5347                 iput(osb->osb_tl_inode);
5348         }
5349
5350         mlog_exit_void();
5351 }
5352
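/*
 * Load this node's truncate log inode and set up the delayed flush
 * work.
 */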
5353 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5354 {
5355         int status;
5356         struct inode *tl_inode = NULL;
5357         struct buffer_head *tl_bh = NULL;
5358
5359         mlog_entry_void();
5360
5361         status = ocfs2_get_truncate_log_info(osb,
5362                                              osb->slot_num,
5363                                              &tl_inode,
5364                                              &tl_bh);
5365         if (status < 0)
5366                 mlog_errno(status);
5367
5368         /* ocfs2_truncate_log_shutdown keys on the existence of
5369          * osb->osb_tl_inode so we don't set any of the osb variables
5370          * until we're sure all is well. */
5371         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5372                           ocfs2_truncate_log_worker);
5373         osb->osb_tl_bh    = tl_bh;
5374         osb->osb_tl_inode = tl_inode;
5375
5376         mlog_exit(status);
5377         return status;
5378 }
5379
5380 /*
5381  * Delayed de-allocation of suballocator blocks.
5382  *
5383  * Some sets of block de-allocations might involve multiple suballocator inodes.
5384  *
5385  * The locking for this can get extremely complicated, especially when
5386  * the suballocator inodes to delete from aren't known until deep
5387  * within an unrelated codepath.
5388  *
5389  * ocfs2_extent_block structures are a good example of this - an inode
5390  * btree could have been grown by any number of nodes each allocating
5391  * out of their own suballoc inode.
5392  *
5393  * These structures allow the delay of block de-allocation until a
5394  * later time, when locking of multiple cluster inodes won't cause
5395  * deadlock.
5396  */
5397
5398 /*
5399  * Describes a single block free from a suballocator
5400  */
5401 struct ocfs2_cached_block_free {
5402         struct ocfs2_cached_block_free          *free_next;
5403         u64                                     free_blk;
5404         unsigned int                            free_bit;
5405 };
5406
5407 struct ocfs2_per_slot_free_list {
5408         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5409         int                                     f_inode_type;
5410         int                                     f_slot;
5411         struct ocfs2_cached_block_free          *f_first;
5412 };
5413
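/*
 * Free a list of cached suballocator bits back to the system file
 * inode described by (sysfile_type, slot). The list items themselves
 * are always kfree()'d, even if we bail out early on error.
 */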
5414 static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5415                                    int sysfile_type,
5416                                    int slot,
5417                                    struct ocfs2_cached_block_free *head)
5418 {
5419         int ret;
5420         u64 bg_blkno;
5421         handle_t *handle;
5422         struct inode *inode;
5423         struct buffer_head *di_bh = NULL;
5424         struct ocfs2_cached_block_free *tmp;
5425
5426         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5427         if (!inode) {
5428                 ret = -EINVAL;
5429                 mlog_errno(ret);
5430                 goto out;
5431         }
5432
5433         mutex_lock(&inode->i_mutex);
5434
5435         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5436         if (ret) {
5437                 mlog_errno(ret);
5438                 goto out_mutex;
5439         }
5440
5441         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5442         if (IS_ERR(handle)) {
5443                 ret = PTR_ERR(handle);
5444                 mlog_errno(ret);
5445                 goto out_unlock;
5446         }
5447
5448         while (head) {
5449                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5450                                                       head->free_bit);
5451                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5452                      head->free_bit, (unsigned long long)head->free_blk);
5453
5454                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5455                                                head->free_bit, bg_blkno, 1);
5456                 if (ret) {
5457                         mlog_errno(ret);
5458                         goto out_journal;
5459                 }
5460
5461                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5462                 if (ret) {
5463                         mlog_errno(ret);
5464                         goto out_journal;
5465                 }
5466
5467                 tmp = head;
5468                 head = head->free_next;
5469                 kfree(tmp);
5470         }
5471
5472 out_journal:
5473         ocfs2_commit_trans(osb, handle);
5474
5475 out_unlock:
5476         ocfs2_inode_unlock(inode, 1);
5477         brelse(di_bh);
5478 out_mutex:
5479         mutex_unlock(&inode->i_mutex);
5480         iput(inode);
5481 out:
5482         while(head) {
5483                 /* Premature exit may have left some dangling items. */
5484                 tmp = head;
5485                 head = head->free_next;
5486                 kfree(tmp);
5487         }
5488
5489         return ret;
5490 }
5491
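/*
 * Process every per-slot free list cached in ctxt. The first error
 * encountered is returned, but we keep going so that all lists are
 * consumed and freed.
 */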
5492 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5493                        struct ocfs2_cached_dealloc_ctxt *ctxt)
5494 {
5495         int ret = 0, ret2;
5496         struct ocfs2_per_slot_free_list *fl;
5497
5498         if (!ctxt)
5499                 return 0;
5500
5501         while (ctxt->c_first_suballocator) {
5502                 fl = ctxt->c_first_suballocator;
5503
5504                 if (fl->f_first) {
5505                         mlog(0, "Free items: (type %u, slot %d)\n",
5506                              fl->f_inode_type, fl->f_slot);
5507                         ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5508                                                        fl->f_slot, fl->f_first);
5509                         if (ret2)
5510                                 mlog_errno(ret2);
5511                         if (!ret)
5512                                 ret = ret2;
5513                 }
5514
5515                 ctxt->c_first_suballocator = fl->f_next_suballocator;
5516                 kfree(fl);
5517         }
5518
5519         return ret;
5520 }
5521
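/*
 * Find the free list for (type, slot) in ctxt, allocating a new one
 * and linking it at the head if none exists yet. Returns NULL only on
 * allocation failure.
 */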
5522 static struct ocfs2_per_slot_free_list *
5523 ocfs2_find_per_slot_free_list(int type,
5524                               int slot,
5525                               struct ocfs2_cached_dealloc_ctxt *ctxt)
5526 {
5527         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5528
5529         while (fl) {
5530                 if (fl->f_inode_type == type && fl->f_slot == slot)
5531                         return fl;
5532
5533                 fl = fl->f_next_suballocator;
5534         }
5535
5536         fl = kmalloc(sizeof(*fl), GFP_NOFS);
5537         if (fl) {
5538                 fl->f_inode_type = type;
5539                 fl->f_slot = slot;
5540                 fl->f_first = NULL;
5541                 fl->f_next_suballocator = ctxt->c_first_suballocator;
5542
5543                 ctxt->c_first_suballocator = fl;
5544         }
5545         return fl;
5546 }
5547
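/*
 * Record a single suballocator bit for deferred freeing by
 * ocfs2_run_deallocs().
 */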
5548 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5549                                      int type, int slot, u64 blkno,
5550                                      unsigned int bit)
5551 {
5552         int ret;
5553         struct ocfs2_per_slot_free_list *fl;
5554         struct ocfs2_cached_block_free *item;
5555
5556         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5557         if (fl == NULL) {
5558                 ret = -ENOMEM;
5559                 mlog_errno(ret);
5560                 goto out;
5561         }
5562
5563         item = kmalloc(sizeof(*item), GFP_NOFS);
5564         if (item == NULL) {
5565                 ret = -ENOMEM;
5566                 mlog_errno(ret);
5567                 goto out;
5568         }
5569
5570         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5571              type, slot, bit, (unsigned long long)blkno);
5572
5573         item->free_blk = blkno;
5574         item->free_bit = bit;
5575         item->free_next = fl->f_first;
5576
5577         fl->f_first = item;
5578
5579         ret = 0;
5580 out:
5581         return ret;
5582 }
5583
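/*
 * Convenience wrapper: queue an extent block for release back to the
 * extent allocator of the slot which owns it.
 */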
5584 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5585                                          struct ocfs2_extent_block *eb)
5586 {
5587         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5588                                          le16_to_cpu(eb->h_suballoc_slot),
5589                                          le64_to_cpu(eb->h_blkno),
5590                                          le16_to_cpu(eb->h_suballoc_bit));
5591 }
5592
5593 /* This function will figure out whether the current last extent
5594  * block will be deleted, and if it will, what the new last extent
5595  * block will be so we can update its h_next_leaf_blk field, as well
5596  * as the dinode's i_last_eb_blk. */
5597 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
5598                                        unsigned int clusters_to_del,
5599                                        struct ocfs2_path *path,
5600                                        struct buffer_head **new_last_eb)
5601 {
5602         int next_free, ret = 0;
5603         u32 cpos;
5604         struct ocfs2_extent_rec *rec;
5605         struct ocfs2_extent_block *eb;
5606         struct ocfs2_extent_list *el;
5607         struct buffer_head *bh = NULL;
5608
5609         *new_last_eb = NULL;
5610
5611         /* we have no tree, so of course, no last_eb. */
5612         if (!path->p_tree_depth)
5613                 goto out;
5614
5615         /* truncate to zero is a special case - it forces tree_depth to 0
5616          * regardless of its current value.  */
5617         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
5618                 goto out;
5619
5620         el = path_leaf_el(path);
5621         BUG_ON(!el->l_next_free_rec);
5622
5623         /*
5624          * Make sure that this extent list will actually be empty
5625          * after we clear away the data. We can shortcut out if
5626          * there's more than one non-empty extent in the
5627          * list. Otherwise, a check of the remaining extent is
5628          * necessary.
5629          */
5630         next_free = le16_to_cpu(el->l_next_free_rec);
5631         rec = NULL;
5632         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5633                 if (next_free > 2)
5634                         goto out;
5635
5636                 /* We may have a valid extent in index 1, check it. */
5637                 if (next_free == 2)
5638                         rec = &el->l_recs[1];
5639
5640                 /*
5641                  * Fall through - no more nonempty extents, so we want
5642                  * to delete this leaf.
5643                  */
5644         } else {
5645                 if (next_free > 1)
5646                         goto out;
5647
5648                 rec = &el->l_recs[0];
5649         }
5650
5651         if (rec) {
5652                 /*
5653                  * Check whether we'll only be trimming off the end of
5654                  * this extent record.
5655                  */
5656                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
5657                         goto out;
5658         }
5659
5660         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
5661         if (ret) {
5662                 mlog_errno(ret);
5663                 goto out;
5664         }
5665
5666         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
5667         if (ret) {
5668                 mlog_errno(ret);
5669                 goto out;
5670         }
5671
5672         eb = (struct ocfs2_extent_block *) bh->b_data;
5673         el = &eb->h_list;
5674         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
5675                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
5676                 ret = -EROFS;
5677                 goto out;
5678         }
5679
5680         *new_last_eb = bh;
5681         get_bh(*new_last_eb);
5682         mlog(0, "returning block %llu, (cpos: %u)\n",
5683              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
5684 out:
5685         brelse(bh);
5686
5687         return ret;
5688 }
5689
5690 /*
5691  * Trim some clusters off the rightmost edge of a tree. Only called
5692  * during truncate.
5693  *
5694  * The caller needs to:
5695  *   - start journaling of each path component.
5696  *   - compute and fully set up any new last ext block
5697  */
5698 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
5699                            handle_t *handle, struct ocfs2_truncate_context *tc,
5700                            u32 clusters_to_del, u64 *delete_start)
5701 {
5702         int ret, i, index = path->p_tree_depth;
5703         u32 new_edge = 0;
5704         u64 deleted_eb = 0;
5705         struct buffer_head *bh;
5706         struct ocfs2_extent_list *el;
5707         struct ocfs2_extent_rec *rec;
5708
5709         *delete_start = 0;
5710
5711         while (index >= 0) {
5712                 bh = path->p_node[index].bh;
5713                 el = path->p_node[index].el;
5714
5715                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
5716                      index,  (unsigned long long)bh->b_blocknr);
5717
5718                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
5719
5720                 if (index !=
5721                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
5722                         ocfs2_error(inode->i_sb,
5723                                     "Inode %lu has invalid ext. block %llu",
5724                                     inode->i_ino,
5725                                     (unsigned long long)bh->b_blocknr);
5726                         ret = -EROFS;
5727                         goto out;
5728                 }
5729
5730 find_tail_record:
5731                 i = le16_to_cpu(el->l_next_free_rec) - 1;
5732                 rec = &el->l_recs[i];
5733
5734                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
5735                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
5736                      ocfs2_rec_clusters(el, rec),
5737                      (unsigned long long)le64_to_cpu(rec->e_blkno),
5738                      le16_to_cpu(el->l_next_free_rec));
5739
5740                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
5741
5742                 if (le16_to_cpu(el->l_tree_depth) == 0) {
5743                         /*
5744                          * If the leaf block contains nothing but a
5745                          * single empty extent record, we can just
5746                          * remove the block.
5747                          */
5748                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
5749                                 memset(rec, 0,
5750                                        sizeof(struct ocfs2_extent_rec));
5751                                 el->l_next_free_rec = cpu_to_le16(0);
5752
5753                                 goto delete;
5754                         }
5755
5756                         /*
5757                          * Remove any empty extents by shifting things
5758                          * left. That should make life much easier on
5759                          * the code below. This condition is rare
5760                          * enough that we shouldn't see a performance
5761                          * hit.
5762                          */
5763                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5764                                 le16_add_cpu(&el->l_next_free_rec, -1);
5765
5766                                 for(i = 0;
5767                                     i < le16_to_cpu(el->l_next_free_rec); i++)
5768                                         el->l_recs[i] = el->l_recs[i + 1];
5769
5770                                 memset(&el->l_recs[i], 0,
5771                                        sizeof(struct ocfs2_extent_rec));
5772
5773                                 /*
5774                                  * We've modified our extent list. The
5775                                  * simplest way to handle this change
5776                                  * is to begin the search from the
5777                                  * start again.
5778                                  */
5779                                 goto find_tail_record;
5780                         }
5781
5782                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
5783
5784                         /*
5785                          * We'll use "new_edge" on our way back up the
5786                          * tree to know what our rightmost cpos is.
5787                          */
5788                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
5789                         new_edge += le32_to_cpu(rec->e_cpos);
5790
5791                         /*
5792                          * The caller will use this to delete data blocks.
5793                          */
5794                         *delete_start = le64_to_cpu(rec->e_blkno)
5795                                 + ocfs2_clusters_to_blocks(inode->i_sb,
5796                                         le16_to_cpu(rec->e_leaf_clusters));
5797
5798                         /*
5799                          * If it's now empty, remove this record.
5800                          */
5801                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
5802                                 memset(rec, 0,
5803                                        sizeof(struct ocfs2_extent_rec));
5804                                 le16_add_cpu(&el->l_next_free_rec, -1);
5805                         }
5806                 } else {
5807                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
5808                                 memset(rec, 0,
5809                                        sizeof(struct ocfs2_extent_rec));
5810                                 le16_add_cpu(&el->l_next_free_rec, -1);
5811
5812                                 goto delete;
5813                         }
5814
5815                         /* Can this actually happen? */
5816                         if (le16_to_cpu(el->l_next_free_rec) == 0)
5817                                 goto delete;
5818
5819                         /*
5820                          * We never actually deleted any clusters
5821                          * because our leaf was empty. There's no
5822                          * reason to adjust the rightmost edge then.
5823                          */
5824                         if (new_edge == 0)
5825                                 goto delete;
5826
5827                         rec->e_int_clusters = cpu_to_le32(new_edge);
5828                         le32_add_cpu(&rec->e_int_clusters,
5829                                      -le32_to_cpu(rec->e_cpos));
5830
5831                          /*
5832                           * A deleted child record should have been
5833                           * caught above.
5834                           */
5835                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
5836                 }
5837
5838 delete:
5839                 ret = ocfs2_journal_dirty(handle, bh);
5840                 if (ret) {
5841                         mlog_errno(ret);
5842                         goto out;
5843                 }
5844
5845                 mlog(0, "extent list container %llu, after: record %d: "
5846                      "(%u, %u, %llu), next = %u.\n",
5847                      (unsigned long long)bh->b_blocknr, i,
5848                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
5849                      (unsigned long long)le64_to_cpu(rec->e_blkno),
5850                      le16_to_cpu(el->l_next_free_rec));
5851
5852                 /*
5853                  * We must be careful to only attempt to delete an
5854                  * extent block (and not the root inode block).
5855                  */
5856                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
5857                         struct ocfs2_extent_block *eb =
5858                                 (struct ocfs2_extent_block *)bh->b_data;
5859
5860                         /*
5861                          * Save this for use when processing the
5862                          * parent block.
5863                          */
5864                         deleted_eb = le64_to_cpu(eb->h_blkno);
5865
5866                         mlog(0, "deleting this extent block.\n");
5867
5868                         ocfs2_remove_from_cache(inode, bh);
5869
5870                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
5871                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
5872                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
5873
5874                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
5875                         /* An error here is not fatal. */
5876                         if (ret < 0)
5877                                 mlog_errno(ret);
5878                 } else {
5879                         deleted_eb = 0;
5880                 }
5881
5882                 index--;
5883         }
5884
5885         ret = 0;
5886 out:
5887         return ret;
5888 }
5889
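/*
 * Perform one pass of truncate: journal the path, drop
 * 'clusters_to_del' clusters from the inode, trim the rightmost tree
 * branch via ocfs2_trim_tree(), update i_last_eb_blk and the new last
 * extent block's h_next_leaf_blk, and append the freed cluster range
 * to the truncate log.
 */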
5890 static int ocfs2_do_truncate(struct ocfs2_super *osb,
5891                              unsigned int clusters_to_del,
5892                              struct inode *inode,
5893                              struct buffer_head *fe_bh,
5894                              handle_t *handle,
5895                              struct ocfs2_truncate_context *tc,
5896                              struct ocfs2_path *path)
5897 {
5898         int status;
5899         struct ocfs2_dinode *fe;
5900         struct ocfs2_extent_block *last_eb = NULL;
5901         struct ocfs2_extent_list *el;
5902         struct buffer_head *last_eb_bh = NULL;
5903         u64 delete_blk = 0;
5904
5905         fe = (struct ocfs2_dinode *) fe_bh->b_data;
5906
5907         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
5908                                              path, &last_eb_bh);
5909         if (status < 0) {
5910                 mlog_errno(status);
5911                 goto bail;
5912         }
5913
5914         /*
5915          * Each component will be touched, so we might as well journal
5916          * here to avoid having to handle errors later.
5917          */
5918         status = ocfs2_journal_access_path(inode, handle, path);
5919         if (status < 0) {
5920                 mlog_errno(status);
5921                 goto bail;
5922         }
5923
5924         if (last_eb_bh) {
5925                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
5926                                               OCFS2_JOURNAL_ACCESS_WRITE);
5927                 if (status < 0) {
5928                         mlog_errno(status);
5929                         goto bail;
5930                 }
5931
5932                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5933         }
5934
5935         el = &(fe->id2.i_list);
5936
5937         /*
5938          * Lower levels depend on this never happening, but it's best
5939          * to check it up here before changing the tree.
5940          */
5941         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
5942                 ocfs2_error(inode->i_sb,
5943                             "Inode %lu has an empty extent record, depth %u\n",
5944                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
5945                 status = -EROFS;
5946                 goto bail;
5947         }
5948
5949         spin_lock(&OCFS2_I(inode)->ip_lock);
5950         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
5951                                       clusters_to_del;
5952         spin_unlock(&OCFS2_I(inode)->ip_lock);
5953         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
5954         inode->i_blocks = ocfs2_inode_sector_count(inode);
5955
5956         status = ocfs2_trim_tree(inode, path, handle, tc,
5957                                  clusters_to_del, &delete_blk);
5958         if (status) {
5959                 mlog_errno(status);
5960                 goto bail;
5961         }
5962
5963         if (le32_to_cpu(fe->i_clusters) == 0) {
5964                 /* trunc to zero is a special case. */
5965                 el->l_tree_depth = 0;
5966                 fe->i_last_eb_blk = 0;
5967         } else if (last_eb)
5968                 fe->i_last_eb_blk = last_eb->h_blkno;
5969
5970         status = ocfs2_journal_dirty(handle, fe_bh);
5971         if (status < 0) {
5972                 mlog_errno(status);
5973                 goto bail;
5974         }
5975
5976         if (last_eb) {
5977                 /* If there will be a new last extent block, then by
5978                  * definition, there cannot be any leaves to the right of
5979                  * it. */
5980                 last_eb->h_next_leaf_blk = 0;
5981                 status = ocfs2_journal_dirty(handle, last_eb_bh);
5982                 if (status < 0) {
5983                         mlog_errno(status);
5984                         goto bail;
5985                 }
5986         }
5987
5988         if (delete_blk) {
5989                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
5990                                                    clusters_to_del);
5991                 if (status < 0) {
5992                         mlog_errno(status);
5993                         goto bail;
5994                 }
5995         }
5996         status = 0;
5997 bail:
5998
5999         mlog_exit(status);
6000         return status;
6001 }
6002
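/*
 * walk_page_buffers() callbacks used when zeroing pages. Both mark
 * the buffer uptodate and dirty; the ordered variant additionally
 * hands the buffer to the journal as ordered data.
 */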
6003 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
6004 {
6005         set_buffer_uptodate(bh);
6006         mark_buffer_dirty(bh);
6007         return 0;
6008 }
6009
6010 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6011 {
6012         set_buffer_uptodate(bh);
6013         mark_buffer_dirty(bh);
6014         return ocfs2_journal_dirty_data(handle, bh);
6015 }
6016
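/*
 * Map the buffers backing [from, to) of 'page' to the physical blocks
 * starting at *phys, optionally zero that range, and then mark the
 * page's buffers uptodate and dirty (through the journal when the
 * inode uses ordered data mode).
 */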
6017 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6018                                      unsigned int from, unsigned int to,
6019                                      struct page *page, int zero, u64 *phys)
6020 {
6021         int ret, partial = 0;
6022
6023         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6024         if (ret)
6025                 mlog_errno(ret);
6026
6027         if (zero)
6028                 zero_user_segment(page, from, to);
6029
6030         /*
6031          * We need to mark the buffers we zeroed uptodate here
6032          * if they aren't already - ocfs2_map_page_blocks()
6033          * might have skipped some.
6034          */
6035         if (ocfs2_should_order_data(inode)) {
6036                 ret = walk_page_buffers(handle,
6037                                         page_buffers(page),
6038                                         from, to, &partial,
6039                                         ocfs2_ordered_zero_func);
6040                 if (ret < 0)
6041                         mlog_errno(ret);
6042         } else {
6043                 ret = walk_page_buffers(handle, page_buffers(page),
6044                                         from, to, &partial,
6045                                         ocfs2_writeback_zero_func);
6046                 if (ret < 0)
6047                         mlog_errno(ret);
6048         }
6049
6050         if (!partial)
6051                 SetPageUptodate(page);
6052
6053         flush_dcache_page(page);
6054 }
6055
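/*
 * Zero and dirty the byte range [start, end) across the locked pages
 * in 'pages', mapping each page's buffers to the physical blocks at
 * 'phys'. The pages are unlocked and released before returning. Only
 * called for sparse file systems.
 */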
6056 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6057                                      loff_t end, struct page **pages,
6058                                      int numpages, u64 phys, handle_t *handle)
6059 {
6060         int i;
6061         struct page *page;
6062         unsigned int from, to = PAGE_CACHE_SIZE;
6063         struct super_block *sb = inode->i_sb;
6064
6065         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6066
6067         if (numpages == 0)
6068                 goto out;
6069
6070         to = PAGE_CACHE_SIZE;
6071         for(i = 0; i < numpages; i++) {
6072                 page = pages[i];
6073
6074                 from = start & (PAGE_CACHE_SIZE - 1);
6075                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6076                         to = end & (PAGE_CACHE_SIZE - 1);
6077
6078                 BUG_ON(from > PAGE_CACHE_SIZE);
6079                 BUG_ON(to > PAGE_CACHE_SIZE);
6080
6081                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6082                                          &phys);
6083
6084                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6085         }
6086 out:
6087         if (pages)
6088                 ocfs2_unlock_and_free_pages(pages, numpages);
6089 }
6090
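/*
 * Grab and lock the page cache pages covering the byte range
 * [start, end), which must lie within a single cluster. On success,
 * *num holds the number of pages returned in 'pages'; on error, any
 * pages already grabbed are unlocked and released.
 */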
6091 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6092                                 struct page **pages, int *num)
6093 {
6094         int numpages, ret = 0;
6095         struct super_block *sb = inode->i_sb;
6096         struct address_space *mapping = inode->i_mapping;
6097         unsigned long index;
6098         loff_t last_page_bytes;
6099
6100         BUG_ON(start > end);
6101
6102         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6103                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6104
6105         numpages = 0;
6106         last_page_bytes = PAGE_ALIGN(end);
6107         index = start >> PAGE_CACHE_SHIFT;
6108         do {
6109                 pages[numpages] = grab_cache_page(mapping, index);
6110                 if (!pages[numpages]) {
6111                         ret = -ENOMEM;
6112                         mlog_errno(ret);
6113                         goto out;
6114                 }
6115
6116                 numpages++;
6117                 index++;
6118         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6119
6120 out:
6121         if (ret != 0) {
6122                 if (pages)
6123                         ocfs2_unlock_and_free_pages(pages, numpages);
6124                 numpages = 0;
6125         }
6126
6127         *num = numpages;
6128
6129         return ret;
6130 }
6131
6132 /*
6133  * Zero the area past i_size but still within an allocated
6134  * cluster. This avoids exposing nonzero data on subsequent file
6135  * extends.
6136  *
6137  * We need to call this before i_size is updated on the inode because
6138  * otherwise block_write_full_page() will skip writeout of pages past
6139  * i_size. The caller passes the zeroing range explicitly for this reason.
6140  */
6141 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6142                                   u64 range_start, u64 range_end)
6143 {
6144         int ret = 0, numpages;
6145         struct page **pages = NULL;
6146         u64 phys;
6147         unsigned int ext_flags;
6148         struct super_block *sb = inode->i_sb;
6149
6150         /*
6151          * File systems which don't support sparse files zero on every
6152          * extend.
6153          */
6154         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6155                 return 0;
6156
6157         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6158                         sizeof(struct page *), GFP_NOFS);
6159         if (pages == NULL) {
6160                 ret = -ENOMEM;
6161                 mlog_errno(ret);
6162                 goto out;
6163         }
6164
6165         if (range_start == range_end)
6166                 goto out;
6167
6168         ret = ocfs2_extent_map_get_blocks(inode,
6169                                           range_start >> sb->s_blocksize_bits,
6170                                           &phys, NULL, &ext_flags);
6171         if (ret) {
6172                 mlog_errno(ret);
6173                 goto out;
6174         }
6175
6176         /*
6177          * Tail is a hole, or is marked unwritten. In either case, we
6178          * can count on read and write to return/push zeros.
6179          */
6180         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6181                 goto out;
6182
6183         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6184                                    &numpages);
6185         if (ret) {
6186                 mlog_errno(ret);
6187                 goto out;
6188         }
6189
6190         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6191                                  numpages, phys, handle);
6192
6193         /*
6194          * Initiate writeout of the pages we zeroed here. We don't
6195          * wait on them - the truncate_inode_pages() call later will
6196          * do that for us.
6197          */
6198         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6199                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6200         if (ret)
6201                 mlog_errno(ret);
6202
6203 out:
6204         if (pages)
6205                 kfree(pages);
6206
6207         return ret;
6208 }
6209
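/*
 * Zero the dinode from the start of the id2 union through the end of
 * the inode block.
 */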
6210 static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
6211 {
6212         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6213
6214         memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
6215 }
6216
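/*
 * Reset the dinode's id2 union to an empty extent list rooted in the
 * inode block.
 */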
6217 void ocfs2_dinode_new_extent_list(struct inode *inode,
6218                                   struct ocfs2_dinode *di)
6219 {
6220         ocfs2_zero_dinode_id2(inode, di);
6221         di->id2.i_list.l_tree_depth = 0;
6222         di->id2.i_list.l_next_free_rec = 0;
6223         di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
6224 }
6225
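/*
 * Switch the inode to inline data: set OCFS2_INLINE_DATA_FL in both
 * the in-memory and on-disk dynamic features, then reinitialize the
 * id2 union as an empty inline data region.
 */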
6226 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6227 {
6228         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6229         struct ocfs2_inline_data *idata = &di->id2.i_data;
6230
6231         spin_lock(&oi->ip_lock);
6232         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6233         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6234         spin_unlock(&oi->ip_lock);
6235
6236         /*
6237          * We clear the entire i_data structure here so that all
6238          * fields can be properly initialized.
6239          */
6240         ocfs2_zero_dinode_id2(inode, di);
6241
6242         idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
6243 }
6244
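/*
 * Convert an inline-data inode to the extent list format. If the file
 * has data, a cluster is allocated, the inline bytes are copied into
 * the page cache and dirtied against the new allocation, and the
 * resulting extent is inserted into the freshly initialized tree.
 */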
6245 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6246                                          struct buffer_head *di_bh)
6247 {
6248         int ret, i, has_data, num_pages = 0;
6249         handle_t *handle;
6250         u64 uninitialized_var(block);
6251         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6252         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6253         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6254         struct ocfs2_alloc_context *data_ac = NULL;
6255         struct page **pages = NULL;
6256         loff_t end = osb->s_clustersize;
6257
6258         has_data = i_size_read(inode) ? 1 : 0;
6259
6260         if (has_data) {
6261                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6262                                 sizeof(struct page *), GFP_NOFS);
6263                 if (pages == NULL) {
6264                         ret = -ENOMEM;
6265                         mlog_errno(ret);
6266                         goto out;
6267                 }
6268
6269                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6270                 if (ret) {
6271                         mlog_errno(ret);
6272                         goto out;
6273                 }
6274         }
6275
6276         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6277         if (IS_ERR(handle)) {
6278                 ret = PTR_ERR(handle);
6279                 mlog_errno(ret);
6280                 goto out_unlock;
6281         }
6282
6283         ret = ocfs2_journal_access(handle, inode, di_bh,
6284                                    OCFS2_JOURNAL_ACCESS_WRITE);
6285         if (ret) {
6286                 mlog_errno(ret);
6287                 goto out_commit;
6288         }
6289
6290         if (has_data) {
6291                 u32 bit_off, num;
6292                 unsigned int page_end;
6293                 u64 phys;
6294
6295                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6296                                            &num);
6297                 if (ret) {
6298                         mlog_errno(ret);
6299                         goto out_commit;
6300                 }
6301
6302                 /*
6303                  * Save two copies, one for insert, and one that can
6304                  * be changed by ocfs2_map_and_dirty_page() below.
6305                  */
6306                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6307
6308                 /*
6309                  * Non sparse file systems zero on extend, so no need
6310                  * to do that now.
6311                  */
6312                 if (!ocfs2_sparse_alloc(osb) &&
6313                     PAGE_CACHE_SIZE < osb->s_clustersize)
6314                         end = PAGE_CACHE_SIZE;
6315
6316                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6317                 if (ret) {
6318                         mlog_errno(ret);
6319                         goto out_commit;
6320                 }
6321
6322                 /*
6323                  * This should populate the 1st page for us and mark
6324                  * it up to date.
6325                  */
6326                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6327                 if (ret) {
6328                         mlog_errno(ret);
6329                         goto out_commit;
6330                 }
6331
6332                 page_end = PAGE_CACHE_SIZE;
6333                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6334                         page_end = osb->s_clustersize;
6335
6336                 for (i = 0; i < num_pages; i++)
6337                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6338                                                  pages[i], i > 0, &phys);
6339         }
6340
6341         spin_lock(&oi->ip_lock);
6342         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6343         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6344         spin_unlock(&oi->ip_lock);
6345
6346         ocfs2_dinode_new_extent_list(inode, di);
6347
6348         ocfs2_journal_dirty(handle, di_bh);
6349
6350         if (has_data) {
6351                 /*
6352                  * An error at this point should be extremely rare. If
6353                  * this proves to be false, we could always re-build
6354                  * the in-inode data from our pages.
6355                  */
6356                 ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
6357                                           0, block, 1, 0, NULL);
6358                 if (ret) {
6359                         mlog_errno(ret);
6360                         goto out_commit;
6361                 }
6362
6363                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6364         }
6365
6366 out_commit:
6367         ocfs2_commit_trans(osb, handle);
6368
6369 out_unlock:
6370         if (data_ac)
6371                 ocfs2_free_alloc_context(data_ac);
6372
6373 out:
6374         if (pages) {
6375                 ocfs2_unlock_and_free_pages(pages, num_pages);
6376                 kfree(pages);
6377         }
6378
6379         return ret;
6380 }
6381
6382 /*
6383  * It is expected that, by the time you call this function,
6384  * inode->i_size and fe->i_size have been adjusted.
6385  *
6386  * WARNING: This will kfree the truncate context
6387  */
6388 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6389                           struct inode *inode,
6390                           struct buffer_head *fe_bh,
6391                           struct ocfs2_truncate_context *tc)
6392 {
6393         int status, i, credits, tl_sem = 0;
6394         u32 clusters_to_del, new_highest_cpos, range;
6395         struct ocfs2_extent_list *el;
6396         handle_t *handle = NULL;
6397         struct inode *tl_inode = osb->osb_tl_inode;
6398         struct ocfs2_path *path = NULL;
6399
6400         mlog_entry_void();
6401
6402         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6403                                                      i_size_read(inode));
6404
6405         path = ocfs2_new_inode_path(fe_bh);
6406         if (!path) {
6407                 status = -ENOMEM;
6408                 mlog_errno(status);
6409                 goto bail;
6410         }
6411
6412         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6413
6414 start:
6415         /*
6416          * Check that we still have allocation to delete.
6417          */
6418         if (OCFS2_I(inode)->ip_clusters == 0) {
6419                 status = 0;
6420                 goto bail;
6421         }
6422
6423         /*
6424          * Truncate always works against the rightmost tree branch.
6425          */
6426         status = ocfs2_find_path(inode, path, UINT_MAX);
6427         if (status) {
6428                 mlog_errno(status);
6429                 goto bail;
6430         }
6431
6432         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6433              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6434
6435         /*
6436          * By now, el will point to the extent list on the bottommost
6437          * portion of this tree. Only the tail record is considered in
6438          * each pass.
6439          *
6440          * We handle the following cases, in order:
6441          * - empty extent: delete the remaining branch
6442          * - remove the entire record
6443          * - remove a partial record
6444          * - no record needs to be removed (truncate has completed)
6445          */
6446         el = path_leaf_el(path);
6447         if (le16_to_cpu(el->l_next_free_rec) == 0) {
6448                 ocfs2_error(inode->i_sb,
6449                             "Inode %llu has empty extent block at %llu\n",
6450                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6451                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
6452                 status = -EROFS;
6453                 goto bail;
6454         }
6455
6456         i = le16_to_cpu(el->l_next_free_rec) - 1;
6457         range = le32_to_cpu(el->l_recs[i].e_cpos) +
6458                 ocfs2_rec_clusters(el, &el->l_recs[i]);
6459         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6460                 clusters_to_del = 0;
6461         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6462                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6463         } else if (range > new_highest_cpos) {
6464                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6465                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
6466                                   new_highest_cpos;
6467         } else {
6468                 status = 0;
6469                 goto bail;
6470         }
6471
6472         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6473              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6474
6475         mutex_lock(&tl_inode->i_mutex);
6476         tl_sem = 1;
6477         /* This check and flush together guarantee us at least one free
6478          * truncate log record. If there isn't one available, we flush
6479          * to get an empty truncate log.  */
6480         if (ocfs2_truncate_log_needs_flush(osb)) {
6481                 status = __ocfs2_flush_truncate_log(osb);
6482                 if (status < 0) {
6483                         mlog_errno(status);
6484                         goto bail;
6485                 }
6486         }
6487
6488         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6489                                                 (struct ocfs2_dinode *)fe_bh->b_data,
6490                                                 el);
6491         handle = ocfs2_start_trans(osb, credits);
6492         if (IS_ERR(handle)) {
6493                 status = PTR_ERR(handle);
6494                 handle = NULL;
6495                 mlog_errno(status);
6496                 goto bail;
6497         }
6498
6499         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6500                                    tc, path);
6501         if (status < 0) {
6502                 mlog_errno(status);
6503                 goto bail;
6504         }
6505
6506         mutex_unlock(&tl_inode->i_mutex);
6507         tl_sem = 0;
6508
6509         ocfs2_commit_trans(osb, handle);
6510         handle = NULL;
6511
6512         ocfs2_reinit_path(path, 1);
6513
6514         /*
6515          * The check above will catch the case where we've truncated
6516          * away all allocation.
6517          */
6518         goto start;
6519
6520 bail:
6521
6522         ocfs2_schedule_truncate_log_flush(osb, 1);
6523
6524         if (tl_sem)
6525                 mutex_unlock(&tl_inode->i_mutex);
6526
6527         if (handle)
6528                 ocfs2_commit_trans(osb, handle);
6529
6530         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6531
6532         ocfs2_free_path(path);
6533
6534         /* This frees the truncate context */
6535         ocfs2_free_truncate_context(tc);
6536
6537         mlog_exit(status);
6538         return status;
6539 }
6540
6541 /*
6542  * Expects the inode to already be locked.
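 *
 * Allocates the truncate context and, when the tree has non-zero
 * depth, reads and validates the current last extent block.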
6543  */
6544 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6545                            struct inode *inode,
6546                            struct buffer_head *fe_bh,
6547                            struct ocfs2_truncate_context **tc)
6548 {
6549         int status;
6550         unsigned int new_i_clusters;
6551         struct ocfs2_dinode *fe;
6552         struct ocfs2_extent_block *eb;
6553         struct buffer_head *last_eb_bh = NULL;
6554
6555         mlog_entry_void();
6556
6557         *tc = NULL;
6558
6559         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
6560                                                   i_size_read(inode));
6561         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6562
6563         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
6564              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
6565              (unsigned long long)le64_to_cpu(fe->i_size));
6566
6567         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
6568         if (!(*tc)) {
6569                 status = -ENOMEM;
6570                 mlog_errno(status);
6571                 goto bail;
6572         }
6573         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6574
6575         if (fe->id2.i_list.l_tree_depth) {
6576                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
6577                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
6578                 if (status < 0) {
6579                         mlog_errno(status);
6580                         goto bail;
6581                 }
6582                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6583                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6584                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6585
6586                         brelse(last_eb_bh);
6587                         status = -EIO;
6588                         goto bail;
6589                 }
6590         }
6591
6592         (*tc)->tc_last_eb_bh = last_eb_bh;
6593
6594         status = 0;
6595 bail:
6596         if (status < 0) {
6597                 if (*tc)
6598                         ocfs2_free_truncate_context(*tc);
6599                 *tc = NULL;
6600         }
6601         mlog_exit_void();
6602         return status;
6603 }
6604
6605 /*
6606  * 'start' is inclusive, 'end' is not.
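 *
 * Zeros the byte range [start, end) of the inode's inline data and,
 * when 'trunc' is set, shrinks i_size to 'start'.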
6607  */
6608 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
6609                           unsigned int start, unsigned int end, int trunc)
6610 {
6611         int ret;
6612         unsigned int numbytes;
6613         handle_t *handle;
6614         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6615         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6616         struct ocfs2_inline_data *idata = &di->id2.i_data;
6617
6618         if (end > i_size_read(inode))
6619                 end = i_size_read(inode);
6620
6621         BUG_ON(start >= end);
6622
6623         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
6624             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
6625             !ocfs2_supports_inline_data(osb)) {
6626                 ocfs2_error(inode->i_sb,
6627                             "Inline data flags for inode %llu don't agree! "
6628                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
6629                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6630                             le16_to_cpu(di->i_dyn_features),
6631                             OCFS2_I(inode)->ip_dyn_features,
6632                             osb->s_feature_incompat);
6633                 ret = -EROFS;
6634                 goto out;
6635         }
6636
6637         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
6638         if (IS_ERR(handle)) {
6639                 ret = PTR_ERR(handle);
6640                 mlog_errno(ret);
6641                 goto out;
6642         }
6643
6644         ret = ocfs2_journal_access(handle, inode, di_bh,
6645                                    OCFS2_JOURNAL_ACCESS_WRITE);
6646         if (ret) {
6647                 mlog_errno(ret);
6648                 goto out_commit;
6649         }
6650
6651         numbytes = end - start;
6652         memset(idata->id_data + start, 0, numbytes);
6653
6654         /*
6655          * No need to worry about the data page here - it's been
6656          * truncated already and inline data doesn't need it for
6657          * pushing zeros to disk, so we'll let readpage pick it up
6658          * later.
6659          */
6660         if (trunc) {
6661                 i_size_write(inode, start);
6662                 di->i_size = cpu_to_le64(start);
6663         }
6664
6665         inode->i_blocks = ocfs2_inode_sector_count(inode);
6666         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
6667
6668         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
6669         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
6670
6671         ocfs2_journal_dirty(handle, di_bh);
6672
6673 out_commit:
6674         ocfs2_commit_trans(osb, handle);
6675
6676 out:
6677         return ret;
6678 }
6679
6680 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6681 {
6682         /*
6683          * The caller is responsible for completing deallocation
6684          * before freeing the context.
6685          */
6686         if (tc->tc_dealloc.c_first_suballocator != NULL)
6687                 mlog(ML_NOTICE,
6688                      "Truncate completion has non-empty dealloc context\n");
6689
6690         if (tc->tc_last_eb_bh)
6691                 brelse(tc->tc_last_eb_bh);
6692
6693         kfree(tc);
6694 }