1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52
53 /*
54  * Operations for a specific extent tree type.
55  *
56  * To implement an on-disk btree (extent tree) type in ocfs2, add
57  * an ocfs2_extent_tree_operations structure and the matching
58  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
59  * for the allocation portion of the extent tree.
60  */
61 struct ocfs2_extent_tree_operations {
62         /*
63          * last_eb_blk is the block number of the right most leaf extent
64          * block.  Most on-disk structures containing an extent tree store
65          * this value for fast access.  The ->eo_set_last_eb_blk() and
66          * ->eo_get_last_eb_blk() operations access this value.  They are
67          *  both required.
68          */
69         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
70                                    u64 blkno);
71         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
72
73         /*
74          * The on-disk structure usually keeps track of how many total
75          * clusters are stored in this extent tree.  This function updates
76          * that value.  new_clusters is the delta, and must be
77          * added to the total.  Required.
78          */
79         void (*eo_update_clusters)(struct inode *inode,
80                                    struct ocfs2_extent_tree *et,
81                                    u32 new_clusters);
82
83         /*
84          * If ->eo_insert_check() exists, it is called before rec is
85          * inserted into the extent tree.  It is optional.
86          */
87         int (*eo_insert_check)(struct inode *inode,
88                                struct ocfs2_extent_tree *et,
89                                struct ocfs2_extent_rec *rec);
90         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
91
92         /*
93          * --------------------------------------------------------------
94          * The remaining are internal to ocfs2_extent_tree and don't have
95          * accessor functions
96          */
97
98         /*
99          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
100          * It is required.
101          */
102         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
103
104         /*
105          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
106          * it exists.  If it does not, et->et_max_leaf_clusters is set
107          * to 0 (unlimited).  Optional.
108          */
109         void (*eo_fill_max_leaf_clusters)(struct inode *inode,
110                                           struct ocfs2_extent_tree *et);
111 };
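
/*
 * Illustrative sketch only: following the pattern described above, a
 * hypothetical on-disk "widget" structure with an embedded
 * ocfs2_extent_list named w_list and a w_last_eb_blk field would add
 * something like the following (the dinode and xattr trees below are
 * the real, in-tree examples of this shape):
 *
 *	static void ocfs2_widget_fill_root_el(struct ocfs2_extent_tree *et)
 *	{
 *		struct ocfs2_widget *w = et->et_object;
 *
 *		et->et_root_el = &w->w_list;
 *	}
 *
 *	static struct ocfs2_extent_tree_operations ocfs2_widget_et_ops = {
 *		.eo_set_last_eb_blk	= ocfs2_widget_set_last_eb_blk,
 *		.eo_get_last_eb_blk	= ocfs2_widget_get_last_eb_blk,
 *		.eo_update_clusters	= ocfs2_widget_update_clusters,
 *		.eo_fill_root_el	= ocfs2_widget_fill_root_el,
 *	};
 *
 * plus a matching ocfs2_init_widget_extent_tree() wrapper around
 * __ocfs2_init_extent_tree().
 */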
112
113
114 /*
115  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
116  * in the methods.
117  */
118 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
119 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
120                                          u64 blkno);
121 static void ocfs2_dinode_update_clusters(struct inode *inode,
122                                          struct ocfs2_extent_tree *et,
123                                          u32 clusters);
124 static int ocfs2_dinode_insert_check(struct inode *inode,
125                                      struct ocfs2_extent_tree *et,
126                                      struct ocfs2_extent_rec *rec);
127 static int ocfs2_dinode_sanity_check(struct inode *inode,
128                                      struct ocfs2_extent_tree *et);
129 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
130 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
131         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
132         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
133         .eo_update_clusters     = ocfs2_dinode_update_clusters,
134         .eo_insert_check        = ocfs2_dinode_insert_check,
135         .eo_sanity_check        = ocfs2_dinode_sanity_check,
136         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
137 };
138
139 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
140                                          u64 blkno)
141 {
142         struct ocfs2_dinode *di = et->et_object;
143
144         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
145         di->i_last_eb_blk = cpu_to_le64(blkno);
146 }
147
148 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
149 {
150         struct ocfs2_dinode *di = et->et_object;
151
152         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
153         return le64_to_cpu(di->i_last_eb_blk);
154 }
155
156 static void ocfs2_dinode_update_clusters(struct inode *inode,
157                                          struct ocfs2_extent_tree *et,
158                                          u32 clusters)
159 {
160         struct ocfs2_dinode *di = et->et_object;
161
162         le32_add_cpu(&di->i_clusters, clusters);
163         spin_lock(&OCFS2_I(inode)->ip_lock);
164         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
165         spin_unlock(&OCFS2_I(inode)->ip_lock);
166 }
167
168 static int ocfs2_dinode_insert_check(struct inode *inode,
169                                      struct ocfs2_extent_tree *et,
170                                      struct ocfs2_extent_rec *rec)
171 {
172         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
173
174         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
175         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
176                         (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
177                         "Device %s, asking for sparse allocation: inode %llu, "
178                         "cpos %u, clusters %u\n",
179                         osb->dev_str,
180                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
181                         rec->e_cpos,
182                         OCFS2_I(inode)->ip_clusters);
183
184         return 0;
185 }
186
187 static int ocfs2_dinode_sanity_check(struct inode *inode,
188                                      struct ocfs2_extent_tree *et)
189 {
190         struct ocfs2_dinode *di = et->et_object;
191
192         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
193         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
194
195         return 0;
196 }
197
198 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
199 {
200         struct ocfs2_dinode *di = et->et_object;
201
202         et->et_root_el = &di->id2.i_list;
203 }
204
205
206 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
207 {
208         struct ocfs2_xattr_value_root *xv = et->et_object;
209
210         et->et_root_el = &xv->xr_list;
211 }
212
213 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
214                                               u64 blkno)
215 {
216         struct ocfs2_xattr_value_root *xv =
217                 (struct ocfs2_xattr_value_root *)et->et_object;
218
219         xv->xr_last_eb_blk = cpu_to_le64(blkno);
220 }
221
222 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
223 {
224         struct ocfs2_xattr_value_root *xv =
225                 (struct ocfs2_xattr_value_root *) et->et_object;
226
227         return le64_to_cpu(xv->xr_last_eb_blk);
228 }
229
230 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
231                                               struct ocfs2_extent_tree *et,
232                                               u32 clusters)
233 {
234         struct ocfs2_xattr_value_root *xv =
235                 (struct ocfs2_xattr_value_root *)et->et_object;
236
237         le32_add_cpu(&xv->xr_clusters, clusters);
238 }
239
240 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
241         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
242         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
243         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
244         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
245 };
246
247 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
248 {
249         struct ocfs2_xattr_block *xb = et->et_object;
250
251         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
252 }
253
254 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
255                                                     struct ocfs2_extent_tree *et)
256 {
257         et->et_max_leaf_clusters =
258                 ocfs2_clusters_for_bytes(inode->i_sb,
259                                          OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
260 }
261
262 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
263                                              u64 blkno)
264 {
265         struct ocfs2_xattr_block *xb = et->et_object;
266         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
267
268         xt->xt_last_eb_blk = cpu_to_le64(blkno);
269 }
270
271 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
272 {
273         struct ocfs2_xattr_block *xb = et->et_object;
274         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
275
276         return le64_to_cpu(xt->xt_last_eb_blk);
277 }
278
279 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
280                                              struct ocfs2_extent_tree *et,
281                                              u32 clusters)
282 {
283         struct ocfs2_xattr_block *xb = et->et_object;
284
285         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
286 }
287
288 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
289         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
290         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
291         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
292         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
293         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
294 };
295
296 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
297                                      struct inode *inode,
298                                      struct buffer_head *bh,
299                                      void *obj,
300                                      struct ocfs2_extent_tree_operations *ops)
301 {
302         et->et_ops = ops;
303         et->et_root_bh = bh;
304         if (!obj)
305                 obj = (void *)bh->b_data;
306         et->et_object = obj;
307
308         et->et_ops->eo_fill_root_el(et);
309         if (!et->et_ops->eo_fill_max_leaf_clusters)
310                 et->et_max_leaf_clusters = 0;
311         else
312                 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
313 }
314
315 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
316                                    struct inode *inode,
317                                    struct buffer_head *bh)
318 {
319         __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
320 }
321
322 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
323                                        struct inode *inode,
324                                        struct buffer_head *bh)
325 {
326         __ocfs2_init_extent_tree(et, inode, bh, NULL,
327                                  &ocfs2_xattr_tree_et_ops);
328 }
329
330 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
331                                         struct inode *inode,
332                                         struct buffer_head *bh,
333                                         struct ocfs2_xattr_value_root *xv)
334 {
335         __ocfs2_init_extent_tree(et, inode, bh, xv,
336                                  &ocfs2_xattr_value_et_ops);
337 }
338
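/*
 * Usage sketch (hypothetical caller, shown only to illustrate the init
 * helpers above): an extent tree lives on the caller's stack and wraps
 * the root buffer for the duration of a tree operation.
 *
 *	struct ocfs2_extent_tree et;
 *
 *	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 *	free = ocfs2_num_free_extents(osb, inode, &et);
 *
 * No teardown call is needed; __ocfs2_init_extent_tree() takes no
 * reference of its own on the buffer_head the caller already owns.
 */
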
339 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
340                                             u64 new_last_eb_blk)
341 {
342         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
343 }
344
345 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
346 {
347         return et->et_ops->eo_get_last_eb_blk(et);
348 }
349
350 static inline void ocfs2_et_update_clusters(struct inode *inode,
351                                             struct ocfs2_extent_tree *et,
352                                             u32 clusters)
353 {
354         et->et_ops->eo_update_clusters(inode, et, clusters);
355 }
356
357 static inline int ocfs2_et_insert_check(struct inode *inode,
358                                         struct ocfs2_extent_tree *et,
359                                         struct ocfs2_extent_rec *rec)
360 {
361         int ret = 0;
362
363         if (et->et_ops->eo_insert_check)
364                 ret = et->et_ops->eo_insert_check(inode, et, rec);
365         return ret;
366 }
367
368 static inline int ocfs2_et_sanity_check(struct inode *inode,
369                                         struct ocfs2_extent_tree *et)
370 {
371         int ret = 0;
372
373         if (et->et_ops->eo_sanity_check)
374                 ret = et->et_ops->eo_sanity_check(inode, et);
375         return ret;
376 }
377
378 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
379 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
380                                          struct ocfs2_extent_block *eb);
381
382 /*
383  * Structures which describe a path through a btree, and functions to
384  * manipulate them.
385  *
386  * The idea here is to be as generic as possible with the tree
387  * manipulation code.
388  */
389 struct ocfs2_path_item {
390         struct buffer_head              *bh;
391         struct ocfs2_extent_list        *el;
392 };
393
394 #define OCFS2_MAX_PATH_DEPTH    5
395
396 struct ocfs2_path {
397         int                     p_tree_depth;
398         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
399 };
400
401 #define path_root_bh(_path) ((_path)->p_node[0].bh)
402 #define path_root_el(_path) ((_path)->p_node[0].el)
403 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
404 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
405 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
406
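/*
 * For example (illustration only), in a tree of depth 2 a fully built
 * path has three items: p_node[0] is the root (e.g. a dinode and its
 * embedded list), p_node[1] an interior extent block, and p_node[2]
 * the leaf.  Then:
 *
 *	path_root_el(path) == path->p_node[0].el
 *	path_leaf_el(path) == path->p_node[2].el
 *	path_num_items(path) == 3
 */
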
407 /*
408  * Reset the actual path elements so that we can re-use the structure
409  * to build another path. Generally, this involves freeing the buffer
410  * heads.
411  */
412 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
413 {
414         int i, start = 0, depth = 0;
415         struct ocfs2_path_item *node;
416
417         if (keep_root)
418                 start = 1;
419
420         for(i = start; i < path_num_items(path); i++) {
421                 node = &path->p_node[i];
422
423                 brelse(node->bh);
424                 node->bh = NULL;
425                 node->el = NULL;
426         }
427
428         /*
429          * Tree depth may change during truncate, or insert. If we're
430          * keeping the root extent list, then make sure that our path
431          * structure reflects the proper depth.
432          */
433         if (keep_root)
434                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
435
436         path->p_tree_depth = depth;
437 }
438
439 static void ocfs2_free_path(struct ocfs2_path *path)
440 {
441         if (path) {
442                 ocfs2_reinit_path(path, 0);
443                 kfree(path);
444         }
445 }
446
447 /*
448  * Copy all the elements of src into dest. After this call, src could be freed
449  * without affecting dest.
450  *
451  * Both paths should have the same root. Any non-root elements of dest
452  * will be freed.
453  */
454 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
455 {
456         int i;
457
458         BUG_ON(path_root_bh(dest) != path_root_bh(src));
459         BUG_ON(path_root_el(dest) != path_root_el(src));
460
461         ocfs2_reinit_path(dest, 1);
462
463         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
464                 dest->p_node[i].bh = src->p_node[i].bh;
465                 dest->p_node[i].el = src->p_node[i].el;
466
467                 if (dest->p_node[i].bh)
468                         get_bh(dest->p_node[i].bh);
469         }
470 }
471
472 /*
473  * Make the *dest path the same as src and re-initialize src path to
474  * have a root only.
475  */
476 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
477 {
478         int i;
479
480         BUG_ON(path_root_bh(dest) != path_root_bh(src));
481
482         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
483                 brelse(dest->p_node[i].bh);
484
485                 dest->p_node[i].bh = src->p_node[i].bh;
486                 dest->p_node[i].el = src->p_node[i].el;
487
488                 src->p_node[i].bh = NULL;
489                 src->p_node[i].el = NULL;
490         }
491 }
492
493 /*
494  * Insert an extent block at given index.
495  *
496  * This will not take an additional reference on eb_bh.
497  */
498 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
499                                         struct buffer_head *eb_bh)
500 {
501         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
502
503         /*
504          * Right now, no root bh is an extent block, so this helps
505          * catch code errors with dinode trees. The assertion can be
506          * safely removed if we ever need to insert extent block
507          * structures at the root.
508          */
509         BUG_ON(index == 0);
510
511         path->p_node[index].bh = eb_bh;
512         path->p_node[index].el = &eb->h_list;
513 }
514
515 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
516                                          struct ocfs2_extent_list *root_el)
517 {
518         struct ocfs2_path *path;
519
520         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
521
522         path = kzalloc(sizeof(*path), GFP_NOFS);
523         if (path) {
524                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
525                 get_bh(root_bh);
526                 path_root_bh(path) = root_bh;
527                 path_root_el(path) = root_el;
528         }
529
530         return path;
531 }
532
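/*
 * Typical caller pattern for the path helpers above (hypothetical
 * caller, sketched for illustration):
 *
 *	struct ocfs2_path *path;
 *
 *	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 *	if (!path) {
 *		ret = -ENOMEM;
 *		goto out;
 *	}
 *	...
 *	ocfs2_free_path(path);
 *
 * ocfs2_new_path() takes its own reference on root_bh, and
 * ocfs2_free_path() drops every buffer_head in the path, so the caller
 * keeps its original reference throughout.
 */
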
533 /*
534  * Convenience function to journal all components in a path.
535  */
536 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
537                                      struct ocfs2_path *path)
538 {
539         int i, ret = 0;
540
541         if (!path)
542                 goto out;
543
544         for(i = 0; i < path_num_items(path); i++) {
545                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
546                                            OCFS2_JOURNAL_ACCESS_WRITE);
547                 if (ret < 0) {
548                         mlog_errno(ret);
549                         goto out;
550                 }
551         }
552
553 out:
554         return ret;
555 }
556
557 /*
558  * Return the index of the extent record which contains cluster #v_cluster.
559  * -1 is returned if it was not found.
560  *
561  * Should work fine on interior and exterior nodes.
562  */
563 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
564 {
565         int ret = -1;
566         int i;
567         struct ocfs2_extent_rec *rec;
568         u32 rec_end, rec_start, clusters;
569
570         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
571                 rec = &el->l_recs[i];
572
573                 rec_start = le32_to_cpu(rec->e_cpos);
574                 clusters = ocfs2_rec_clusters(el, rec);
575
576                 rec_end = rec_start + clusters;
577
578                 if (v_cluster >= rec_start && v_cluster < rec_end) {
579                         ret = i;
580                         break;
581                 }
582         }
583
584         return ret;
585 }
586
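/*
 * Example (hypothetical caller): to find the record covering virtual
 * cluster v_cluster in a list:
 *
 *	index = ocfs2_search_extent_list(el, v_cluster);
 *	if (index >= 0)
 *		rec = &el->l_recs[index];
 *	else
 *		ret = -ENOENT;	// v_cluster falls in a hole
 */
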
587 enum ocfs2_contig_type {
588         CONTIG_NONE = 0,
589         CONTIG_LEFT,
590         CONTIG_RIGHT,
591         CONTIG_LEFTRIGHT,
592 };
593
594
595 /*
596  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
597  * ocfs2_extent_contig only work properly against leaf nodes!
598  */
599 static int ocfs2_block_extent_contig(struct super_block *sb,
600                                      struct ocfs2_extent_rec *ext,
601                                      u64 blkno)
602 {
603         u64 blk_end = le64_to_cpu(ext->e_blkno);
604
605         blk_end += ocfs2_clusters_to_blocks(sb,
606                                     le16_to_cpu(ext->e_leaf_clusters));
607
608         return blkno == blk_end;
609 }
610
611 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
612                                   struct ocfs2_extent_rec *right)
613 {
614         u32 left_range;
615
616         left_range = le32_to_cpu(left->e_cpos) +
617                 le16_to_cpu(left->e_leaf_clusters);
618
619         return (left_range == le32_to_cpu(right->e_cpos));
620 }
621
622 static enum ocfs2_contig_type
623         ocfs2_extent_contig(struct inode *inode,
624                             struct ocfs2_extent_rec *ext,
625                             struct ocfs2_extent_rec *insert_rec)
626 {
627         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
628
629         /*
630          * Refuse to coalesce extent records with different flag
631          * fields - we don't want to mix unwritten extents with user
632          * data.
633          */
634         if (ext->e_flags != insert_rec->e_flags)
635                 return CONTIG_NONE;
636
637         if (ocfs2_extents_adjacent(ext, insert_rec) &&
638             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
639                         return CONTIG_RIGHT;
640
641         blkno = le64_to_cpu(ext->e_blkno);
642         if (ocfs2_extents_adjacent(insert_rec, ext) &&
643             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
644                 return CONTIG_LEFT;
645
646         return CONTIG_NONE;
647 }
648
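/*
 * Worked example with made-up numbers: on a volume with 4k clusters
 * and 512-byte blocks (8 blocks per cluster), a record ext covering
 * clusters [100, 104) at e_blkno 8000 ends at block 8032.  An
 * insert_rec starting at cluster 104 with e_blkno 8032 and the same
 * flags is CONTIG_RIGHT; if instead its run ended exactly where ext
 * begins, it would be CONTIG_LEFT.
 */
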
649 /*
650  * NOTE: We can have pretty much any combination of contiguousness and
651  * appending.
652  *
653  * The usefulness of APPEND_TAIL is more in that it lets us know that
654  * we'll have to update the path to that leaf.
655  */
656 enum ocfs2_append_type {
657         APPEND_NONE = 0,
658         APPEND_TAIL,
659 };
660
661 enum ocfs2_split_type {
662         SPLIT_NONE = 0,
663         SPLIT_LEFT,
664         SPLIT_RIGHT,
665 };
666
667 struct ocfs2_insert_type {
668         enum ocfs2_split_type   ins_split;
669         enum ocfs2_append_type  ins_appending;
670         enum ocfs2_contig_type  ins_contig;
671         int                     ins_contig_index;
672         int                     ins_tree_depth;
673 };
674
675 struct ocfs2_merge_ctxt {
676         enum ocfs2_contig_type  c_contig_type;
677         int                     c_has_empty_extent;
678         int                     c_split_covers_rec;
679 };
680
681 static int ocfs2_validate_extent_block(struct super_block *sb,
682                                        struct buffer_head *bh)
683 {
684         struct ocfs2_extent_block *eb =
685                 (struct ocfs2_extent_block *)bh->b_data;
686
687         mlog(0, "Validating extent block %llu\n",
688              (unsigned long long)bh->b_blocknr);
689
690         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
691                 ocfs2_error(sb,
692                             "Extent block #%llu has bad signature %.*s",
693                             (unsigned long long)bh->b_blocknr, 7,
694                             eb->h_signature);
695                 return -EINVAL;
696         }
697
698         if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
699                 ocfs2_error(sb,
700                             "Extent block #%llu has an invalid h_blkno "
701                             "of %llu",
702                             (unsigned long long)bh->b_blocknr,
703                             (unsigned long long)le64_to_cpu(eb->h_blkno));
704                 return -EINVAL;
705         }
706
707         if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
708                 ocfs2_error(sb,
709                             "Extent block #%llu has an invalid "
710                             "h_fs_generation of #%u",
711                             (unsigned long long)bh->b_blocknr,
712                             le32_to_cpu(eb->h_fs_generation));
713                 return -EINVAL;
714         }
715
716         return 0;
717 }
718
719 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
720                             struct buffer_head **bh)
721 {
722         int rc;
723         struct buffer_head *tmp = *bh;
724
725         rc = ocfs2_read_block(inode, eb_blkno, &tmp,
726                               ocfs2_validate_extent_block);
727
728         /* If ocfs2_read_block() got us a new bh, pass it up. */
729         if (!rc && !*bh)
730                 *bh = tmp;
731
732         return rc;
733 }
734
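/*
 * Caller pattern sketch for ocfs2_read_extent_block() (illustrative):
 *
 *	struct buffer_head *eb_bh = NULL;
 *
 *	ret = ocfs2_read_extent_block(inode, blkno, &eb_bh);
 *	if (ret) {
 *		mlog_errno(ret);
 *		goto out;
 *	}
 *	eb = (struct ocfs2_extent_block *)eb_bh->b_data;
 *	...
 *	brelse(eb_bh);
 *
 * Passing in a NULL *bh asks for a freshly read buffer_head; on
 * success the block has already passed ocfs2_validate_extent_block().
 */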
735
736 /*
737  * How many free extents have we got before we need more meta data?
738  */
739 int ocfs2_num_free_extents(struct ocfs2_super *osb,
740                            struct inode *inode,
741                            struct ocfs2_extent_tree *et)
742 {
743         int retval;
744         struct ocfs2_extent_list *el = NULL;
745         struct ocfs2_extent_block *eb;
746         struct buffer_head *eb_bh = NULL;
747         u64 last_eb_blk = 0;
748
749         mlog_entry_void();
750
751         el = et->et_root_el;
752         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
753
754         if (last_eb_blk) {
755                 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
756                 if (retval < 0) {
757                         mlog_errno(retval);
758                         goto bail;
759                 }
760                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
761                 el = &eb->h_list;
762         }
763
764         BUG_ON(el->l_tree_depth != 0);
765
766         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
767 bail:
768         brelse(eb_bh);
769
770         mlog_exit(retval);
771         return retval;
772 }
773
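/*
 * Sketch of how a caller might consume the count (hypothetical code;
 * "grow" is just an illustrative local): a return of 0 means the
 * rightmost leaf is full, so metadata must be reserved and the tree
 * grown before inserting.
 *
 *	num_free_extents = ocfs2_num_free_extents(osb, inode, et);
 *	if (num_free_extents < 0) {
 *		ret = num_free_extents;
 *		mlog_errno(ret);
 *		goto out;
 *	}
 *	grow = (num_free_extents == 0);
 */
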
774 /* Expects the array to already be allocated.
775  *
776  * Sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
777  * l_count for you
778  */
779 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
780                                      handle_t *handle,
781                                      struct inode *inode,
782                                      int wanted,
783                                      struct ocfs2_alloc_context *meta_ac,
784                                      struct buffer_head *bhs[])
785 {
786         int count, status, i;
787         u16 suballoc_bit_start;
788         u32 num_got;
789         u64 first_blkno;
790         struct ocfs2_extent_block *eb;
791
792         mlog_entry_void();
793
794         count = 0;
795         while (count < wanted) {
796                 status = ocfs2_claim_metadata(osb,
797                                               handle,
798                                               meta_ac,
799                                               wanted - count,
800                                               &suballoc_bit_start,
801                                               &num_got,
802                                               &first_blkno);
803                 if (status < 0) {
804                         mlog_errno(status);
805                         goto bail;
806                 }
807
808                 for(i = count;  i < (num_got + count); i++) {
809                         bhs[i] = sb_getblk(osb->sb, first_blkno);
810                         if (bhs[i] == NULL) {
811                                 status = -EIO;
812                                 mlog_errno(status);
813                                 goto bail;
814                         }
815                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
816
817                         status = ocfs2_journal_access(handle, inode, bhs[i],
818                                                       OCFS2_JOURNAL_ACCESS_CREATE);
819                         if (status < 0) {
820                                 mlog_errno(status);
821                                 goto bail;
822                         }
823
824                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
825                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
826                         /* Ok, setup the minimal stuff here. */
827                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
828                         eb->h_blkno = cpu_to_le64(first_blkno);
829                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
830                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
831                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
832                         eb->h_list.l_count =
833                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
834
835                         suballoc_bit_start++;
836                         first_blkno++;
837
838                         /* We'll also be dirtied by the caller, so
839                          * this isn't absolutely necessary. */
840                         status = ocfs2_journal_dirty(handle, bhs[i]);
841                         if (status < 0) {
842                                 mlog_errno(status);
843                                 goto bail;
844                         }
845                 }
846
847                 count += num_got;
848         }
849
850         status = 0;
851 bail:
852         if (status < 0) {
853                 for(i = 0; i < wanted; i++) {
854                         brelse(bhs[i]);
855                         bhs[i] = NULL;
856                 }
857         }
858         mlog_exit(status);
859         return status;
860 }
861
862 /*
863  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
864  *
865  * Returns the sum of the rightmost extent rec logical offset and
866  * cluster count.
867  *
868  * ocfs2_add_branch() uses this to determine what logical cluster
869  * value should be populated into the leftmost new branch records.
870  *
871  * ocfs2_shift_tree_depth() uses this to determine the # clusters
872  * value for the new topmost tree record.
873  */
874 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
875 {
876         int i;
877
878         i = le16_to_cpu(el->l_next_free_rec) - 1;
879
880         return le32_to_cpu(el->l_recs[i].e_cpos) +
881                 ocfs2_rec_clusters(el, &el->l_recs[i]);
882 }
883
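/*
 * For instance (illustrative numbers): if the rightmost record of el
 * has e_cpos == 100 and covers 8 clusters, this returns 108, the first
 * logical cluster past the current tree and exactly the e_cpos that
 * new empty branch records should carry.
 */
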
884 /*
885  * Add an entire tree branch to our inode. eb_bh is the extent block
886  * to start at, if we don't want to start the branch at the dinode
887  * structure.
888  *
889  * last_eb_bh is required as we have to update its next_leaf pointer
890  * for the new last extent block.
891  *
892  * The new branch will be 'empty' in the sense that every block will
893  * contain a single record with cluster count == 0.
894  */
895 static int ocfs2_add_branch(struct ocfs2_super *osb,
896                             handle_t *handle,
897                             struct inode *inode,
898                             struct ocfs2_extent_tree *et,
899                             struct buffer_head *eb_bh,
900                             struct buffer_head **last_eb_bh,
901                             struct ocfs2_alloc_context *meta_ac)
902 {
903         int status, new_blocks, i;
904         u64 next_blkno, new_last_eb_blk;
905         struct buffer_head *bh;
906         struct buffer_head **new_eb_bhs = NULL;
907         struct ocfs2_extent_block *eb;
908         struct ocfs2_extent_list  *eb_el;
909         struct ocfs2_extent_list  *el;
910         u32 new_cpos;
911
912         mlog_entry_void();
913
914         BUG_ON(!last_eb_bh || !*last_eb_bh);
915
916         if (eb_bh) {
917                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
918                 el = &eb->h_list;
919         } else
920                 el = et->et_root_el;
921
922         /* we never add a branch to a leaf. */
923         BUG_ON(!el->l_tree_depth);
924
925         new_blocks = le16_to_cpu(el->l_tree_depth);
926
927         /* allocate the number of new eb blocks we need */
928         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
929                              GFP_KERNEL);
930         if (!new_eb_bhs) {
931                 status = -ENOMEM;
932                 mlog_errno(status);
933                 goto bail;
934         }
935
936         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
937                                            meta_ac, new_eb_bhs);
938         if (status < 0) {
939                 mlog_errno(status);
940                 goto bail;
941         }
942
943         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
944         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
945
946         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
947          * linked with the rest of the tree.
948          * Conversely, new_eb_bhs[0] is the new bottommost leaf.
949          *
950          * when we leave the loop, new_last_eb_blk will point to the
951          * newest leaf, and next_blkno will point to the topmost extent
952          * block. */
953         next_blkno = new_last_eb_blk = 0;
954         for(i = 0; i < new_blocks; i++) {
955                 bh = new_eb_bhs[i];
956                 eb = (struct ocfs2_extent_block *) bh->b_data;
957                 /* ocfs2_create_new_meta_bhs() should create it right! */
958                 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
959                 eb_el = &eb->h_list;
960
961                 status = ocfs2_journal_access(handle, inode, bh,
962                                               OCFS2_JOURNAL_ACCESS_CREATE);
963                 if (status < 0) {
964                         mlog_errno(status);
965                         goto bail;
966                 }
967
968                 eb->h_next_leaf_blk = 0;
969                 eb_el->l_tree_depth = cpu_to_le16(i);
970                 eb_el->l_next_free_rec = cpu_to_le16(1);
971                 /*
972                  * This actually counts as an empty extent as
973                  * c_clusters == 0
974                  */
975                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
976                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
977                 /*
978                  * eb_el isn't always an interior node, but even leaf
979                  * nodes want a zero'd flags and reserved field so
980                  * this gets the whole 32 bits regardless of use.
981                  */
982                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
983                 if (!eb_el->l_tree_depth)
984                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
985
986                 status = ocfs2_journal_dirty(handle, bh);
987                 if (status < 0) {
988                         mlog_errno(status);
989                         goto bail;
990                 }
991
992                 next_blkno = le64_to_cpu(eb->h_blkno);
993         }
994
995         /* This is a bit hairy. We want to update up to three blocks
996          * here without leaving any of them in an inconsistent state
997          * in case of error. We don't have to worry about
998          * journal_dirty erroring as it won't unless we've aborted the
999          * handle (in which case we would never be here) so reserving
1000          * the write with journal_access is all we need to do. */
1001         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
1002                                       OCFS2_JOURNAL_ACCESS_WRITE);
1003         if (status < 0) {
1004                 mlog_errno(status);
1005                 goto bail;
1006         }
1007         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1008                                       OCFS2_JOURNAL_ACCESS_WRITE);
1009         if (status < 0) {
1010                 mlog_errno(status);
1011                 goto bail;
1012         }
1013         if (eb_bh) {
1014                 status = ocfs2_journal_access(handle, inode, eb_bh,
1015                                               OCFS2_JOURNAL_ACCESS_WRITE);
1016                 if (status < 0) {
1017                         mlog_errno(status);
1018                         goto bail;
1019                 }
1020         }
1021
1022         /* Link the new branch into the rest of the tree (el will
1023          * either be on the root_bh, or the extent block passed in). */
1024         i = le16_to_cpu(el->l_next_free_rec);
1025         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1026         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1027         el->l_recs[i].e_int_clusters = 0;
1028         le16_add_cpu(&el->l_next_free_rec, 1);
1029
1030         /* fe needs a new last extent block pointer, as does the
1031          * next_leaf on the previously last-extent-block. */
1032         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1033
1034         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1035         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1036
1037         status = ocfs2_journal_dirty(handle, *last_eb_bh);
1038         if (status < 0)
1039                 mlog_errno(status);
1040         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1041         if (status < 0)
1042                 mlog_errno(status);
1043         if (eb_bh) {
1044                 status = ocfs2_journal_dirty(handle, eb_bh);
1045                 if (status < 0)
1046                         mlog_errno(status);
1047         }
1048
1049         /*
1050          * Some callers want to track the rightmost leaf so pass it
1051          * back here.
1052          */
1053         brelse(*last_eb_bh);
1054         get_bh(new_eb_bhs[0]);
1055         *last_eb_bh = new_eb_bhs[0];
1056
1057         status = 0;
1058 bail:
1059         if (new_eb_bhs) {
1060                 for (i = 0; i < new_blocks; i++)
1061                         brelse(new_eb_bhs[i]);
1062                 kfree(new_eb_bhs);
1063         }
1064
1065         mlog_exit(status);
1066         return status;
1067 }
1068
1069 /*
1070  * Adds another level to the allocation tree.
1071  * Returns the new extent block so you can add a branch to it
1072  * after this call.
1073  */
1074 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1075                                   handle_t *handle,
1076                                   struct inode *inode,
1077                                   struct ocfs2_extent_tree *et,
1078                                   struct ocfs2_alloc_context *meta_ac,
1079                                   struct buffer_head **ret_new_eb_bh)
1080 {
1081         int status, i;
1082         u32 new_clusters;
1083         struct buffer_head *new_eb_bh = NULL;
1084         struct ocfs2_extent_block *eb;
1085         struct ocfs2_extent_list  *root_el;
1086         struct ocfs2_extent_list  *eb_el;
1087
1088         mlog_entry_void();
1089
1090         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1091                                            &new_eb_bh);
1092         if (status < 0) {
1093                 mlog_errno(status);
1094                 goto bail;
1095         }
1096
1097         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1098         /* ocfs2_create_new_meta_bhs() should create it right! */
1099         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1100
1101         eb_el = &eb->h_list;
1102         root_el = et->et_root_el;
1103
1104         status = ocfs2_journal_access(handle, inode, new_eb_bh,
1105                                       OCFS2_JOURNAL_ACCESS_CREATE);
1106         if (status < 0) {
1107                 mlog_errno(status);
1108                 goto bail;
1109         }
1110
1111         /* copy the root extent list data into the new extent block */
1112         eb_el->l_tree_depth = root_el->l_tree_depth;
1113         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1114         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1115                 eb_el->l_recs[i] = root_el->l_recs[i];
1116
1117         status = ocfs2_journal_dirty(handle, new_eb_bh);
1118         if (status < 0) {
1119                 mlog_errno(status);
1120                 goto bail;
1121         }
1122
1123         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1124                                       OCFS2_JOURNAL_ACCESS_WRITE);
1125         if (status < 0) {
1126                 mlog_errno(status);
1127                 goto bail;
1128         }
1129
1130         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1131
1132         /* update root_bh now */
1133         le16_add_cpu(&root_el->l_tree_depth, 1);
1134         root_el->l_recs[0].e_cpos = 0;
1135         root_el->l_recs[0].e_blkno = eb->h_blkno;
1136         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1137         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1138                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1139         root_el->l_next_free_rec = cpu_to_le16(1);
1140
1141         /* If this is our 1st tree depth shift, then last_eb_blk
1142          * becomes the allocated extent block */
1143         if (root_el->l_tree_depth == cpu_to_le16(1))
1144                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1145
1146         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1147         if (status < 0) {
1148                 mlog_errno(status);
1149                 goto bail;
1150         }
1151
1152         *ret_new_eb_bh = new_eb_bh;
1153         new_eb_bh = NULL;
1154         status = 0;
1155 bail:
1156         brelse(new_eb_bh);
1157
1158         mlog_exit(status);
1159         return status;
1160 }
1161
1162 /*
1163  * Should only be called when there is no space left in any of the
1164  * leaf nodes. What we want to do is find the lowest tree depth
1165  * non-leaf extent block with room for new records. There are three
1166  * valid results of this search:
1167  *
1168  * 1) a lowest extent block is found, then we pass it back in
1169  *    *target_bh and return '0'
1170  *
1171  * 2) the search fails to find anything, but the root_el has room. We
1172  *    pass NULL back in *target_bh, but still return '0'
1173  *
1174  * 3) the search fails to find anything AND the root_el is full, in
1175  *    which case we return > 0
1176  *
1177  * return status < 0 indicates an error.
1178  */
1179 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1180                                     struct inode *inode,
1181                                     struct ocfs2_extent_tree *et,
1182                                     struct buffer_head **target_bh)
1183 {
1184         int status = 0, i;
1185         u64 blkno;
1186         struct ocfs2_extent_block *eb;
1187         struct ocfs2_extent_list  *el;
1188         struct buffer_head *bh = NULL;
1189         struct buffer_head *lowest_bh = NULL;
1190
1191         mlog_entry_void();
1192
1193         *target_bh = NULL;
1194
1195         el = et->et_root_el;
1196
1197         while(le16_to_cpu(el->l_tree_depth) > 1) {
1198                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1199                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1200                                     "extent list (next_free_rec == 0)",
1201                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1202                         status = -EIO;
1203                         goto bail;
1204                 }
1205                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1206                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1207                 if (!blkno) {
1208                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1209                                     "list where extent # %d has no physical "
1210                                     "block start",
1211                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1212                         status = -EIO;
1213                         goto bail;
1214                 }
1215
1216                 brelse(bh);
1217                 bh = NULL;
1218
1219                 status = ocfs2_read_extent_block(inode, blkno, &bh);
1220                 if (status < 0) {
1221                         mlog_errno(status);
1222                         goto bail;
1223                 }
1224
1225                 eb = (struct ocfs2_extent_block *) bh->b_data;
1226                 el = &eb->h_list;
1227
1228                 if (le16_to_cpu(el->l_next_free_rec) <
1229                     le16_to_cpu(el->l_count)) {
1230                         brelse(lowest_bh);
1231                         lowest_bh = bh;
1232                         get_bh(lowest_bh);
1233                 }
1234         }
1235
1236         /* If we didn't find one and the fe doesn't have any room,
1237          * then return '1' */
1238         el = et->et_root_el;
1239         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1240                 status = 1;
1241
1242         *target_bh = lowest_bh;
1243 bail:
1244         brelse(bh);
1245
1246         mlog_exit(status);
1247         return status;
1248 }
1249
1250 /*
1251  * Grow a b-tree so that it has more records.
1252  *
1253  * We might shift the tree depth in which case existing paths should
1254  * be considered invalid.
1255  *
1256  * Tree depth after the grow is returned via *final_depth.
1257  *
1258  * *last_eb_bh will be updated by ocfs2_add_branch().
1259  */
1260 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1261                            struct ocfs2_extent_tree *et, int *final_depth,
1262                            struct buffer_head **last_eb_bh,
1263                            struct ocfs2_alloc_context *meta_ac)
1264 {
1265         int ret, shift;
1266         struct ocfs2_extent_list *el = et->et_root_el;
1267         int depth = le16_to_cpu(el->l_tree_depth);
1268         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1269         struct buffer_head *bh = NULL;
1270
1271         BUG_ON(meta_ac == NULL);
1272
1273         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1274         if (shift < 0) {
1275                 ret = shift;
1276                 mlog_errno(ret);
1277                 goto out;
1278         }
1279
1280         /* We traveled all the way to the bottom of the allocation tree
1281          * and didn't find room for any more extents - we need to add
1282          * another tree level */
1283         if (shift) {
1284                 BUG_ON(bh);
1285                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1286
1287                 /* ocfs2_shift_tree_depth will return us a buffer with
1288                  * the new extent block (so we can pass that to
1289                  * ocfs2_add_branch). */
1290                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1291                                              meta_ac, &bh);
1292                 if (ret < 0) {
1293                         mlog_errno(ret);
1294                         goto out;
1295                 }
1296                 depth++;
1297                 if (depth == 1) {
1298                         /*
1299                          * Special case: we have room now if we shifted from
1300                          * tree_depth 0, so no more work needs to be done.
1301                          *
1302                          * We won't be calling add_branch, so pass
1303                          * back *last_eb_bh as the new leaf. At depth
1304                          * zero, it should always be null so there's
1305                          * no reason to brelse.
1306                          */
1307                         BUG_ON(*last_eb_bh);
1308                         get_bh(bh);
1309                         *last_eb_bh = bh;
1310                         goto out;
1311                 }
1312         }
1313
1314         /* call ocfs2_add_branch to add the final part of the tree with
1315          * the new data. */
1316         mlog(0, "add branch. bh = %p\n", bh);
1317         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1318                                meta_ac);
1319         if (ret < 0) {
1320                 mlog_errno(ret);
1321                 goto out;
1322         }
1323
1324 out:
1325         if (final_depth)
1326                 *final_depth = depth;
1327         brelse(bh);
1328         return ret;
1329 }
1330
1331 /*
1332  * Shift all records in the list right by one, leaving slot 0 for the caller to overwrite.
1333  */
1334 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1335 {
1336         int next_free = le16_to_cpu(el->l_next_free_rec);
1337         int count = le16_to_cpu(el->l_count);
1338         unsigned int num_bytes;
1339
1340         BUG_ON(!next_free);
1341         /* Shifting a full list would push the rightmost record off the end. */
1342         BUG_ON(next_free >= count);
1343
1344         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1345
1346         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1347 }
1348
1349 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1350                               struct ocfs2_extent_rec *insert_rec)
1351 {
1352         int i, insert_index, next_free, has_empty, num_bytes;
1353         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1354         struct ocfs2_extent_rec *rec;
1355
1356         next_free = le16_to_cpu(el->l_next_free_rec);
1357         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1358
1359         BUG_ON(!next_free);
1360
1361         /* The tree code before us didn't allow enough room in the leaf. */
1362         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1363
1364         /*
1365          * The easiest way to approach this is to just remove the
1366          * empty extent and temporarily decrement next_free.
1367          */
1368         if (has_empty) {
1369                 /*
1370                  * If next_free was 1 (only an empty extent), this
1371                  * loop won't execute, which is fine. We still want
1372                  * the decrement below to happen.
1373                  */
1374                 for(i = 0; i < (next_free - 1); i++)
1375                         el->l_recs[i] = el->l_recs[i+1];
1376
1377                 next_free--;
1378         }
1379
1380         /*
1381          * Figure out what the new record index should be.
1382          */
1383         for(i = 0; i < next_free; i++) {
1384                 rec = &el->l_recs[i];
1385
1386                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1387                         break;
1388         }
1389         insert_index = i;
1390
1391         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1392              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1393
1394         BUG_ON(insert_index < 0);
1395         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1396         BUG_ON(insert_index > next_free);
1397
1398         /*
1399          * No need to memmove if we're just adding to the tail.
1400          */
1401         if (insert_index != next_free) {
1402                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1403
1404                 num_bytes = next_free - insert_index;
1405                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1406                 memmove(&el->l_recs[insert_index + 1],
1407                         &el->l_recs[insert_index],
1408                         num_bytes);
1409         }
1410
1411         /*
1412          * Either we had an empty extent and need to re-increment, or
1413          * there was no empty extent on a non-full rightmost leaf node,
1414          * in which case we still need to increment.
1415          */
1416         next_free++;
1417         el->l_next_free_rec = cpu_to_le16(next_free);
1418         /*
1419          * Make sure none of the math above just messed up our tree.
1420          */
1421         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1422
1423         el->l_recs[insert_index] = *insert_rec;
1424
1425 }
1426
1427 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1428 {
1429         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1430
1431         BUG_ON(num_recs == 0);
1432
1433         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1434                 num_recs--;
1435                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1436                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1437                 memset(&el->l_recs[num_recs], 0,
1438                        sizeof(struct ocfs2_extent_rec));
1439                 el->l_next_free_rec = cpu_to_le16(num_recs);
1440         }
1441 }
1442
1443 /*
1444  * Create an empty extent record.
1445  *
1446  * l_next_free_rec may be updated.
1447  *
1448  * If an empty extent already exists, do nothing.
1449  */
1450 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1451 {
1452         int next_free = le16_to_cpu(el->l_next_free_rec);
1453
1454         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1455
1456         if (next_free == 0)
1457                 goto set_and_inc;
1458
1459         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1460                 return;
1461
1462         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1463                         "Asked to create an empty extent in a full list:\n"
1464                         "count = %u, tree depth = %u",
1465                         le16_to_cpu(el->l_count),
1466                         le16_to_cpu(el->l_tree_depth));
1467
1468         ocfs2_shift_records_right(el);
1469
1470 set_and_inc:
1471         le16_add_cpu(&el->l_next_free_rec, 1);
1472         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1473 }
1474
1475 /*
1476  * For a rotation which involves two leaf nodes, the "root node" is
1477  * the lowest-level tree node which contains a path to both leaves. The
1478  * resulting set of information can be used to form a complete "subtree".
1479  *
1480  * This function is passed two full paths from the dinode down to a
1481  * pair of adjacent leaves. Its task is to figure out which path
1482  * index contains the subtree root - this can be the root index itself
1483  * in a worst-case rotation.
1484  *
1485  * The array index of the subtree root is passed back.
1486  */
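/*
 * Example with an illustrative layout: if left and right share
 * p_node[0] (the root) and p_node[1] but first diverge at p_node[2],
 * the loop below exits with i == 2 and we return 1 - the index of the
 * lowest node common to both paths.
 */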
1487 static int ocfs2_find_subtree_root(struct inode *inode,
1488                                    struct ocfs2_path *left,
1489                                    struct ocfs2_path *right)
1490 {
1491         int i = 0;
1492
1493         /*
1494          * Check that the caller passed in two paths from the same tree.
1495          */
1496         BUG_ON(path_root_bh(left) != path_root_bh(right));
1497
1498         do {
1499                 i++;
1500
1501                 /*
1502                  * The caller didn't pass two adjacent paths.
1503                  */
1504                 mlog_bug_on_msg(i > left->p_tree_depth,
1505                                 "Inode %lu, left depth %u, right depth %u\n"
1506                                 "left leaf blk %llu, right leaf blk %llu\n",
1507                                 inode->i_ino, left->p_tree_depth,
1508                                 right->p_tree_depth,
1509                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1510                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1511         } while (left->p_node[i].bh->b_blocknr ==
1512                  right->p_node[i].bh->b_blocknr);
1513
1514         return i - 1;
1515 }
1516
1517 typedef void (path_insert_t)(void *, struct buffer_head *);
1518
1519 /*
1520  * Traverse a btree path in search of cpos, starting at root_el.
1521  *
1522  * This code can be called with a cpos larger than the tree, in which
1523  * case it will return the rightmost path.
1524  */
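/*
 * For every extent block read on the way down, func (if non-NULL) is
 * invoked with the opaque data pointer and that block's buffer_head,
 * which lets callers record the whole path or capture just the leaf.
 */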
1525 static int __ocfs2_find_path(struct inode *inode,
1526                              struct ocfs2_extent_list *root_el, u32 cpos,
1527                              path_insert_t *func, void *data)
1528 {
1529         int i, ret = 0;
1530         u32 range;
1531         u64 blkno;
1532         struct buffer_head *bh = NULL;
1533         struct ocfs2_extent_block *eb;
1534         struct ocfs2_extent_list *el;
1535         struct ocfs2_extent_rec *rec;
1536         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1537
1538         el = root_el;
1539         while (el->l_tree_depth) {
1540                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1541                         ocfs2_error(inode->i_sb,
1542                                     "Inode %llu has empty extent list at "
1543                                     "depth %u\n",
1544                                     (unsigned long long)oi->ip_blkno,
1545                                     le16_to_cpu(el->l_tree_depth));
1546                         ret = -EROFS;
1547                         goto out;
1548
1549                 }
1550
1551                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1552                         rec = &el->l_recs[i];
1553
1554                         /*
1555                          * In the case that cpos is off the allocation
1556                          * tree, this should just wind up returning the
1557                          * rightmost record.
1558                          */
1559                         range = le32_to_cpu(rec->e_cpos) +
1560                                 ocfs2_rec_clusters(el, rec);
1561                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1562                                 break;
1563                 }
1564
1565                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1566                 if (blkno == 0) {
1567                         ocfs2_error(inode->i_sb,
1568                                     "Inode %llu has bad blkno in extent list "
1569                                     "at depth %u (index %d)\n",
1570                                     (unsigned long long)oi->ip_blkno,
1571                                     le16_to_cpu(el->l_tree_depth), i);
1572                         ret = -EROFS;
1573                         goto out;
1574                 }
1575
1576                 brelse(bh);
1577                 bh = NULL;
1578                 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1579                 if (ret) {
1580                         mlog_errno(ret);
1581                         goto out;
1582                 }
1583
1584                 eb = (struct ocfs2_extent_block *) bh->b_data;
1585                 el = &eb->h_list;
1586
1587                 if (le16_to_cpu(el->l_next_free_rec) >
1588                     le16_to_cpu(el->l_count)) {
1589                         ocfs2_error(inode->i_sb,
1590                                     "Inode %llu has bad count in extent list "
1591                                     "at block %llu (next free=%u, count=%u)\n",
1592                                     (unsigned long long)oi->ip_blkno,
1593                                     (unsigned long long)bh->b_blocknr,
1594                                     le16_to_cpu(el->l_next_free_rec),
1595                                     le16_to_cpu(el->l_count));
1596                         ret = -EROFS;
1597                         goto out;
1598                 }
1599
1600                 if (func)
1601                         func(data, bh);
1602         }
1603
1604 out:
1605         /*
1606          * Catch any trailing bh that the loop didn't handle.
1607          */
1608         brelse(bh);
1609
1610         return ret;
1611 }
1612
1613 /*
1614  * Given an initialized path (that is, it has a valid root extent
1615  * list), this function will traverse the btree in search of the path
1616  * which would contain cpos.
1617  *
1618  * The path traveled is recorded in the path structure.
1619  *
1620  * Note that this will not do any comparisons on leaf node extent
1621  * records, so it will work fine in the case that we just added a tree
1622  * branch.
1623  */
1624 struct find_path_data {
1625         int index;
1626         struct ocfs2_path *path;
1627 };
1628 static void find_path_ins(void *data, struct buffer_head *bh)
1629 {
1630         struct find_path_data *fp = data;
1631
1632         get_bh(bh);
1633         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1634         fp->index++;
1635 }
1636 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1637                            u32 cpos)
1638 {
1639         struct find_path_data data;
1640
1641         data.index = 1;
1642         data.path = path;
1643         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1644                                  find_path_ins, &data);
1645 }
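
/*
 * Illustrative caller pattern, mirroring the rotation code later in
 * this file: allocate a path that shares the root of an existing one,
 * then fill it in for a given cluster offset.
 *
 *      left_path = ocfs2_new_path(path_root_bh(right_path),
 *                                 path_root_el(right_path));
 *      if (!left_path)
 *              return -ENOMEM;
 *      ret = ocfs2_find_path(inode, left_path, cpos);
 */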
1646
1647 static void find_leaf_ins(void *data, struct buffer_head *bh)
1648 {
1649         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1650         struct ocfs2_extent_list *el = &eb->h_list;
1651         struct buffer_head **ret = data;
1652
1653         /* We want to retain only the leaf block. */
1654         if (le16_to_cpu(el->l_tree_depth) == 0) {
1655                 get_bh(bh);
1656                 *ret = bh;
1657         }
1658 }
1659 /*
1660  * Find the leaf block in the tree which would contain cpos. No
1661  * checking of the actual leaf is done.
1662  *
1663  * Some code paths want to call this instead of allocating a path structure
1664  * and calling ocfs2_find_path().
1665  *
1666  * This function doesn't handle non btree extent lists.
1667  */
1668 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1669                     u32 cpos, struct buffer_head **leaf_bh)
1670 {
1671         int ret;
1672         struct buffer_head *bh = NULL;
1673
1674         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1675         if (ret) {
1676                 mlog_errno(ret);
1677                 goto out;
1678         }
1679
1680         *leaf_bh = bh;
1681 out:
1682         return ret;
1683 }
1684
1685 /*
1686  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1687  *
1688  * Basically, we've moved stuff around at the bottom of the tree and
1689  * we need to fix up the extent records above the changes to reflect
1690  * the new changes.
1691  *
1692  * left_rec: the record on the left.
1693  * left_child_el: is the child list pointed to by left_rec
1694  * right_rec: the record to the right of left_rec
1695  * right_child_el: is the child list pointed to by right_rec
1696  *
1697  * By definition, this only works on interior nodes.
1698  */
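/*
 * Example with illustrative numbers: if left_rec->e_cpos is 0 and the
 * first non-empty record in right_child_el starts at cpos 100, then
 * left_rec now covers 100 clusters. If right_rec previously covered
 * clusters [150, 200), its e_cpos becomes 100 and its e_int_clusters
 * becomes 100, preserving the right edge of its range (200).
 */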
1699 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1700                                   struct ocfs2_extent_list *left_child_el,
1701                                   struct ocfs2_extent_rec *right_rec,
1702                                   struct ocfs2_extent_list *right_child_el)
1703 {
1704         u32 left_clusters, right_end;
1705
1706         /*
1707          * Interior nodes never have holes. Their cpos is the cpos of
1708          * the leftmost record in their child list. Their cluster
1709          * count covers the full theoretical range of their child list
1710          * - the range between their cpos and the cpos of the record
1711          * immediately to their right.
1712          */
1713         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1714         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1715                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1716                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1717         }
1718         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1719         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1720
1721         /*
1722          * Calculate the rightmost cluster count boundary before
1723          * moving cpos - we will need to adjust clusters after
1724          * updating e_cpos to keep the same highest cluster count.
1725          */
1726         right_end = le32_to_cpu(right_rec->e_cpos);
1727         right_end += le32_to_cpu(right_rec->e_int_clusters);
1728
1729         right_rec->e_cpos = left_rec->e_cpos;
1730         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1731
1732         right_end -= le32_to_cpu(right_rec->e_cpos);
1733         right_rec->e_int_clusters = cpu_to_le32(right_end);
1734 }
1735
1736 /*
1737  * Adjust the adjacent root node records involved in a
1738  * rotation. left_el_blkno is passed in as a key so that we can easily
1739  * find its index in the root list.
1740  */
1741 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1742                                       struct ocfs2_extent_list *left_el,
1743                                       struct ocfs2_extent_list *right_el,
1744                                       u64 left_el_blkno)
1745 {
1746         int i;
1747
1748         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1749                le16_to_cpu(left_el->l_tree_depth));
1750
1751         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1752                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1753                         break;
1754         }
1755
1756         /*
1757          * The path walking code should have never returned a root and
1758          * two paths which are not adjacent.
1759          */
1760         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1761
1762         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1763                                       &root_el->l_recs[i + 1], right_el);
1764 }
1765
1766 /*
1767  * We've changed a leaf block (in right_path) and need to reflect that
1768  * change back up the subtree.
1769  *
1770  * This happens in multiple places:
1771  *   - When we've moved an extent record from the left path leaf to the right
1772  *     path leaf to make room for an empty extent in the left path leaf.
1773  *   - When our insert into the right path leaf is at the leftmost edge
1774  *     and requires an update of the path immediately to its left. This
1775  *     can occur at the end of some types of rotation and appending inserts.
1776  *   - When we've adjusted the last extent record in the left path leaf and the
1777  *     1st extent record in the right path leaf during cross extent block merge.
1778  */
1779 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1780                                        struct ocfs2_path *left_path,
1781                                        struct ocfs2_path *right_path,
1782                                        int subtree_index)
1783 {
1784         int ret, i, idx;
1785         struct ocfs2_extent_list *el, *left_el, *right_el;
1786         struct ocfs2_extent_rec *left_rec, *right_rec;
1787         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1788
1789         /*
1790          * Update the counts and position values within all the
1791          * interior nodes to reflect the leaf rotation we just did.
1792          *
1793          * The root node is handled below the loop.
1794          *
1795          * We begin the loop with right_el and left_el pointing to the
1796          * leaf lists and work our way up.
1797          *
1798          * NOTE: within this loop, left_el and right_el always refer
1799          * to the *child* lists.
1800          */
1801         left_el = path_leaf_el(left_path);
1802         right_el = path_leaf_el(right_path);
1803         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1804                 mlog(0, "Adjust records at index %u\n", i);
1805
1806                 /*
1807                  * One nice property of knowing that all of these
1808                  * nodes are below the root is that we only deal with
1809                  * the leftmost right node record and the rightmost
1810                  * left node record.
1811                  */
1812                 el = left_path->p_node[i].el;
1813                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1814                 left_rec = &el->l_recs[idx];
1815
1816                 el = right_path->p_node[i].el;
1817                 right_rec = &el->l_recs[0];
1818
1819                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1820                                               right_el);
1821
1822                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1823                 if (ret)
1824                         mlog_errno(ret);
1825
1826                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1827                 if (ret)
1828                         mlog_errno(ret);
1829
1830                 /*
1831                  * Setup our list pointers now so that the current
1832                  * parents become children in the next iteration.
1833                  */
1834                 left_el = left_path->p_node[i].el;
1835                 right_el = right_path->p_node[i].el;
1836         }
1837
1838         /*
1839          * At the root node, adjust the two adjacent records which
1840          * begin our path to the leaves.
1841          */
1842
1843         el = left_path->p_node[subtree_index].el;
1844         left_el = left_path->p_node[subtree_index + 1].el;
1845         right_el = right_path->p_node[subtree_index + 1].el;
1846
1847         ocfs2_adjust_root_records(el, left_el, right_el,
1848                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1849
1850         root_bh = left_path->p_node[subtree_index].bh;
1851
1852         ret = ocfs2_journal_dirty(handle, root_bh);
1853         if (ret)
1854                 mlog_errno(ret);
1855 }
1856
1857 static int ocfs2_rotate_subtree_right(struct inode *inode,
1858                                       handle_t *handle,
1859                                       struct ocfs2_path *left_path,
1860                                       struct ocfs2_path *right_path,
1861                                       int subtree_index)
1862 {
1863         int ret, i;
1864         struct buffer_head *right_leaf_bh;
1865         struct buffer_head *left_leaf_bh = NULL;
1866         struct buffer_head *root_bh;
1867         struct ocfs2_extent_list *right_el, *left_el;
1868         struct ocfs2_extent_rec move_rec;
1869
1870         left_leaf_bh = path_leaf_bh(left_path);
1871         left_el = path_leaf_el(left_path);
1872
1873         if (left_el->l_next_free_rec != left_el->l_count) {
1874                 ocfs2_error(inode->i_sb,
1875                             "Inode %llu has non-full interior leaf node %llu "
1876                             "(next free = %u)",
1877                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1878                             (unsigned long long)left_leaf_bh->b_blocknr,
1879                             le16_to_cpu(left_el->l_next_free_rec));
1880                 return -EROFS;
1881         }
1882
1883         /*
1884          * This extent block may already have an empty record, so we
1885          * return early if so.
1886          */
1887         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1888                 return 0;
1889
1890         root_bh = left_path->p_node[subtree_index].bh;
1891         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1892
1893         ret = ocfs2_journal_access(handle, inode, root_bh,
1894                                    OCFS2_JOURNAL_ACCESS_WRITE);
1895         if (ret) {
1896                 mlog_errno(ret);
1897                 goto out;
1898         }
1899
1900         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1901                 ret = ocfs2_journal_access(handle, inode,
1902                                            right_path->p_node[i].bh,
1903                                            OCFS2_JOURNAL_ACCESS_WRITE);
1904                 if (ret) {
1905                         mlog_errno(ret);
1906                         goto out;
1907                 }
1908
1909                 ret = ocfs2_journal_access(handle, inode,
1910                                            left_path->p_node[i].bh,
1911                                            OCFS2_JOURNAL_ACCESS_WRITE);
1912                 if (ret) {
1913                         mlog_errno(ret);
1914                         goto out;
1915                 }
1916         }
1917
1918         right_leaf_bh = path_leaf_bh(right_path);
1919         right_el = path_leaf_el(right_path);
1920
1921         /* This is a code error, not a disk corruption. */
1922         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1923                         "because rightmost leaf block %llu is empty\n",
1924                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1925                         (unsigned long long)right_leaf_bh->b_blocknr);
1926
1927         ocfs2_create_empty_extent(right_el);
1928
1929         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1930         if (ret) {
1931                 mlog_errno(ret);
1932                 goto out;
1933         }
1934
1935         /* Do the copy now. */
1936         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1937         move_rec = left_el->l_recs[i];
1938         right_el->l_recs[0] = move_rec;
1939
1940         /*
1941          * Clear out the record we just copied and shift everything
1942          * over, leaving an empty extent in the left leaf.
1943          *
1944          * We temporarily subtract from next_free_rec so that the
1945          * shift will lose the tail record (which is now defunct).
1946          */
1947         le16_add_cpu(&left_el->l_next_free_rec, -1);
1948         ocfs2_shift_records_right(left_el);
1949         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1950         le16_add_cpu(&left_el->l_next_free_rec, 1);
1951
1952         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1953         if (ret) {
1954                 mlog_errno(ret);
1955                 goto out;
1956         }
1957
1958         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1959                                 subtree_index);
1960
1961 out:
1962         return ret;
1963 }
1964
1965 /*
1966  * Given a full path, determine what cpos value would return us a path
1967  * containing the leaf immediately to the left of the current one.
1968  *
1969  * Will return zero if the path passed in is already the leftmost path.
1970  */
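/*
 * The value handed back is the last cluster offset covered by the
 * neighboring record to the left, e.g. (illustrative numbers) a
 * neighbor at e_cpos 100 spanning 50 clusters yields *cpos == 149.
 */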
1971 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1972                                          struct ocfs2_path *path, u32 *cpos)
1973 {
1974         int i, j, ret = 0;
1975         u64 blkno;
1976         struct ocfs2_extent_list *el;
1977
1978         BUG_ON(path->p_tree_depth == 0);
1979
1980         *cpos = 0;
1981
1982         blkno = path_leaf_bh(path)->b_blocknr;
1983
1984         /* Start at the tree node just above the leaf and work our way up. */
1985         i = path->p_tree_depth - 1;
1986         while (i >= 0) {
1987                 el = path->p_node[i].el;
1988
1989                 /*
1990                  * Find the extent record just before the one in our
1991                  * path.
1992                  */
1993                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1994                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1995                                 if (j == 0) {
1996                                         if (i == 0) {
1997                                                 /*
1998                                                  * We've determined that the
1999                                                  * path specified is already
2000                                                  * the leftmost one - return a
2001                                                  * cpos of zero.
2002                                                  */
2003                                                 goto out;
2004                                         }
2005                                         /*
2006                                          * The leftmost record points to our
2007                                          * leaf - we need to travel up the
2008                                          * tree one level.
2009                                          */
2010                                         goto next_node;
2011                                 }
2012
2013                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2014                                 *cpos = *cpos + ocfs2_rec_clusters(el,
2015                                                            &el->l_recs[j - 1]);
2016                                 *cpos = *cpos - 1;
2017                                 goto out;
2018                         }
2019                 }
2020
2021                 /*
2022                  * If we got here, we never found a valid node where
2023                  * the tree indicated one should be.
2024                  */
2025                 ocfs2_error(sb,
2026                             "Invalid extent tree at extent block %llu\n",
2027                             (unsigned long long)blkno);
2028                 ret = -EROFS;
2029                 goto out;
2030
2031 next_node:
2032                 blkno = path->p_node[i].bh->b_blocknr;
2033                 i--;
2034         }
2035
2036 out:
2037         return ret;
2038 }
2039
2040 /*
2041  * Extend the transaction by enough credits to complete the rotation,
2042  * and still leave at least the original number of credits allocated
2043  * to this transaction.
2044  */
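/*
 * A subtree rotation dirties two buffers (one per path) for every
 * level below the subtree root plus the subtree root itself, hence the
 * (p_tree_depth - subtree_depth) * 2 + 1 term below; op_credits
 * preserves room for the caller's own operation.
 */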
2045 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2046                                            int op_credits,
2047                                            struct ocfs2_path *path)
2048 {
2049         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2050
2051         if (handle->h_buffer_credits < credits)
2052                 return ocfs2_extend_trans(handle, credits);
2053
2054         return 0;
2055 }
2056
2057 /*
2058  * Trap the case where we're inserting into the theoretical range past
2059  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2060  * whose cpos is less than ours into the right leaf.
2061  *
2062  * It's only necessary to look at the rightmost record of the left
2063  * leaf because the logic that calls us should ensure that the
2064  * theoretical ranges in the path components above the leaves are
2065  * correct.
2066  */
2067 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2068                                                  u32 insert_cpos)
2069 {
2070         struct ocfs2_extent_list *left_el;
2071         struct ocfs2_extent_rec *rec;
2072         int next_free;
2073
2074         left_el = path_leaf_el(left_path);
2075         next_free = le16_to_cpu(left_el->l_next_free_rec);
2076         rec = &left_el->l_recs[next_free - 1];
2077
2078         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2079                 return 1;
2080         return 0;
2081 }
2082
2083 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2084 {
2085         int next_free = le16_to_cpu(el->l_next_free_rec);
2086         unsigned int range;
2087         struct ocfs2_extent_rec *rec;
2088
2089         if (next_free == 0)
2090                 return 0;
2091
2092         rec = &el->l_recs[0];
2093         if (ocfs2_is_empty_extent(rec)) {
2094                 /* List holds only the empty extent. */
2095                 if (next_free == 1)
2096                         return 0;
2097                 rec = &el->l_recs[1];
2098         }
2099
2100         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2101         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2102                 return 1;
2103         return 0;
2104 }
2105
2106 /*
2107  * Rotate all the records in a btree right one record, starting at insert_cpos.
2108  *
2109  * The path to the rightmost leaf should be passed in.
2110  *
2111  * The array is assumed to be large enough to hold an entire path (tree depth).
2112  *
2113  * Upon successful return from this function:
2114  *
2115  * - The 'right_path' array will contain a path to the leaf block
2116  *   whose range contains e_cpos.
2117  * - That leaf block will have a single empty extent in list index 0.
2118  * - In the case that the rotation requires a post-insert update,
2119  *   *ret_left_path will contain a valid path which can be passed to
2120  *   ocfs2_insert_path().
2121  */
2122 static int ocfs2_rotate_tree_right(struct inode *inode,
2123                                    handle_t *handle,
2124                                    enum ocfs2_split_type split,
2125                                    u32 insert_cpos,
2126                                    struct ocfs2_path *right_path,
2127                                    struct ocfs2_path **ret_left_path)
2128 {
2129         int ret, start, orig_credits = handle->h_buffer_credits;
2130         u32 cpos;
2131         struct ocfs2_path *left_path = NULL;
2132
2133         *ret_left_path = NULL;
2134
2135         left_path = ocfs2_new_path(path_root_bh(right_path),
2136                                    path_root_el(right_path));
2137         if (!left_path) {
2138                 ret = -ENOMEM;
2139                 mlog_errno(ret);
2140                 goto out;
2141         }
2142
2143         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2144         if (ret) {
2145                 mlog_errno(ret);
2146                 goto out;
2147         }
2148
2149         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2150
2151         /*
2152          * What we want to do here is:
2153          *
2154          * 1) Start with the rightmost path.
2155          *
2156          * 2) Determine a path to the leaf block directly to the left
2157          *    of that leaf.
2158          *
2159          * 3) Determine the 'subtree root' - the lowest level tree node
2160          *    which contains a path to both leaves.
2161          *
2162          * 4) Rotate the subtree.
2163          *
2164          * 5) Find the next subtree by considering the left path to be
2165          *    the new right path.
2166          *
2167          * The check at the top of this while loop also accepts
2168          * insert_cpos == cpos because cpos is only a _theoretical_
2169          * value to get us the left path - insert_cpos might very well
2170          * be filling that hole.
2171          *
2172          * Stop at a cpos of '0' because we either started at the
2173          * leftmost branch (i.e., a tree with one branch and a
2174          * rotation inside of it), or we've gone as far as we can in
2175          * rotating subtrees.
2176          */
2177         while (cpos && insert_cpos <= cpos) {
2178                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2179                      insert_cpos, cpos);
2180
2181                 ret = ocfs2_find_path(inode, left_path, cpos);
2182                 if (ret) {
2183                         mlog_errno(ret);
2184                         goto out;
2185                 }
2186
2187                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2188                                 path_leaf_bh(right_path),
2189                                 "Inode %lu: error during insert of %u "
2190                                 "(left path cpos %u) results in two identical "
2191                                 "paths ending at %llu\n",
2192                                 inode->i_ino, insert_cpos, cpos,
2193                                 (unsigned long long)
2194                                 path_leaf_bh(left_path)->b_blocknr);
2195
2196                 if (split == SPLIT_NONE &&
2197                     ocfs2_rotate_requires_path_adjustment(left_path,
2198                                                           insert_cpos)) {
2199
2200                         /*
2201                          * We've rotated the tree as much as we
2202                          * should. The rest is up to
2203                          * ocfs2_insert_path() to complete, after the
2204                          * record insertion. We indicate this
2205                          * situation by returning the left path.
2206                          *
2207                          * The reason we don't adjust the records here
2208                          * before the record insert is that an error
2209                          * later might break the rule where a parent
2210                          * record e_cpos will reflect the actual
2211                          * e_cpos of the 1st nonempty record of the
2212                          * child list.
2213                          */
2214                         *ret_left_path = left_path;
2215                         goto out_ret_path;
2216                 }
2217
2218                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2219
2220                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2221                      start,
2222                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2223                      right_path->p_tree_depth);
2224
2225                 ret = ocfs2_extend_rotate_transaction(handle, start,
2226                                                       orig_credits, right_path);
2227                 if (ret) {
2228                         mlog_errno(ret);
2229                         goto out;
2230                 }
2231
2232                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2233                                                  right_path, start);
2234                 if (ret) {
2235                         mlog_errno(ret);
2236                         goto out;
2237                 }
2238
2239                 if (split != SPLIT_NONE &&
2240                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2241                                                 insert_cpos)) {
2242                         /*
2243                          * A rotate moves the rightmost left leaf
2244                          * record over to the leftmost right leaf
2245                          * slot. If we're doing an extent split
2246                          * instead of a real insert, then we have to
2247                          * check that the extent to be split wasn't
2248                          * just moved over. If it was, then we can
2249                          * exit here, passing left_path back -
2250                          * ocfs2_split_extent() is smart enough to
2251                          * search both leaves.
2252                          */
2253                         *ret_left_path = left_path;
2254                         goto out_ret_path;
2255                 }
2256
2257                 /*
2258                  * There is no need to re-read the next right path
2259                  * as we know that it'll be our current left
2260                  * path. Optimize by copying values instead.
2261                  */
2262                 ocfs2_mv_path(right_path, left_path);
2263
2264                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2265                                                     &cpos);
2266                 if (ret) {
2267                         mlog_errno(ret);
2268                         goto out;
2269                 }
2270         }
2271
2272 out:
2273         ocfs2_free_path(left_path);
2274
2275 out_ret_path:
2276         return ret;
2277 }
2278
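/*
 * 'path' must be the rightmost path. Re-derive e_int_clusters for the
 * last record at each interior level so that every record along the
 * rightmost edge again extends to the end of the rightmost leaf.
 */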
2279 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2280                                       struct ocfs2_path *path)
2281 {
2282         int i, idx;
2283         struct ocfs2_extent_rec *rec;
2284         struct ocfs2_extent_list *el;
2285         struct ocfs2_extent_block *eb;
2286         u32 range;
2287
2288         /* Path should always be rightmost. */
2289         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2290         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2291
2292         el = &eb->h_list;
2293         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2294         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2295         rec = &el->l_recs[idx];
2296         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2297
2298         for (i = 0; i < path->p_tree_depth; i++) {
2299                 el = path->p_node[i].el;
2300                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2301                 rec = &el->l_recs[idx];
2302
2303                 rec->e_int_clusters = cpu_to_le32(range);
2304                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2305
2306                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2307         }
2308 }
2309
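/*
 * Drop the extent blocks on 'path' from index unlink_start down to the
 * leaf: each block's record list is emptied and the block is handed to
 * the dealloc context so that it can be freed once this operation is
 * committed.
 */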
2310 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2311                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2312                               struct ocfs2_path *path, int unlink_start)
2313 {
2314         int ret, i;
2315         struct ocfs2_extent_block *eb;
2316         struct ocfs2_extent_list *el;
2317         struct buffer_head *bh;
2318
2319         for(i = unlink_start; i < path_num_items(path); i++) {
2320                 bh = path->p_node[i].bh;
2321
2322                 eb = (struct ocfs2_extent_block *)bh->b_data;
2323                 /*
2324                  * Not all nodes might have had their final count
2325                  * decremented by the caller - handle this here.
2326                  */
2327                 el = &eb->h_list;
2328                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2329                         mlog(ML_ERROR,
2330                              "Inode %llu, attempted to remove extent block "
2331                              "%llu with %u records\n",
2332                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2333                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2334                              le16_to_cpu(el->l_next_free_rec));
2335
2336                         ocfs2_journal_dirty(handle, bh);
2337                         ocfs2_remove_from_cache(inode, bh);
2338                         continue;
2339                 }
2340
2341                 el->l_next_free_rec = 0;
2342                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2343
2344                 ocfs2_journal_dirty(handle, bh);
2345
2346                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2347                 if (ret)
2348                         mlog_errno(ret);
2349
2350                 ocfs2_remove_from_cache(inode, bh);
2351         }
2352 }
2353
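/*
 * Detach the right branch at subtree_index: clear its record in the
 * subtree root, make the left leaf the new rightmost leaf by zeroing
 * h_next_leaf_blk, and pass the right path blocks below the root to
 * ocfs2_unlink_path() for freeing.
 */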
2354 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2355                                  struct ocfs2_path *left_path,
2356                                  struct ocfs2_path *right_path,
2357                                  int subtree_index,
2358                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2359 {
2360         int i;
2361         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2362         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2363         struct ocfs2_extent_list *el;
2364         struct ocfs2_extent_block *eb;
2365
2366         el = path_leaf_el(left_path);
2367
2368         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2369
2370         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2371                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2372                         break;
2373
2374         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2375
2376         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2377         le16_add_cpu(&root_el->l_next_free_rec, -1);
2378
2379         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2380         eb->h_next_leaf_blk = 0;
2381
2382         ocfs2_journal_dirty(handle, root_bh);
2383         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2384
2385         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2386                           subtree_index + 1);
2387 }
2388
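/*
 * Move the leftmost record of the right leaf over to fill the empty
 * extent at the head of the left leaf, then fix up the interior
 * records. If this leaves the rightmost leaf empty, the whole right
 * branch is unlinked and *deleted is set. A non-rightmost right leaf
 * which itself begins with an empty extent makes us return -EAGAIN so
 * the caller can fix it up and retry.
 */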
2389 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2390                                      struct ocfs2_path *left_path,
2391                                      struct ocfs2_path *right_path,
2392                                      int subtree_index,
2393                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2394                                      int *deleted,
2395                                      struct ocfs2_extent_tree *et)
2396 {
2397         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2398         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2399         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2400         struct ocfs2_extent_block *eb;
2401
2402         *deleted = 0;
2403
2404         right_leaf_el = path_leaf_el(right_path);
2405         left_leaf_el = path_leaf_el(left_path);
2406         root_bh = left_path->p_node[subtree_index].bh;
2407         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2408
2409         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2410                 return 0;
2411
2412         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2413         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2414                 /*
2415                  * It's legal for us to proceed if the right leaf is
2416                  * the rightmost one and it has an empty extent. There
2417                  * are two cases to handle - whether the leaf will be
2418                  * empty after removal or not. If the leaf isn't empty
2419                  * then just remove the empty extent up front. The
2420                  * next block will handle empty leaves by flagging
2421                  * them for unlink.
2422                  *
2423          * Non-rightmost leaves will return -EAGAIN and the
2424                  * caller can manually move the subtree and retry.
2425                  */
2426
2427                 if (eb->h_next_leaf_blk != 0ULL)
2428                         return -EAGAIN;
2429
2430                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2431                         ret = ocfs2_journal_access(handle, inode,
2432                                                    path_leaf_bh(right_path),
2433                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2434                         if (ret) {
2435                                 mlog_errno(ret);
2436                                 goto out;
2437                         }
2438
2439                         ocfs2_remove_empty_extent(right_leaf_el);
2440                 } else
2441                         right_has_empty = 1;
2442         }
2443
2444         if (eb->h_next_leaf_blk == 0ULL &&
2445             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2446                 /*
2447          * We have to update i_last_eb_blk during the
2448          * metadata delete.
2449                  */
2450                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2451                                            OCFS2_JOURNAL_ACCESS_WRITE);
2452                 if (ret) {
2453                         mlog_errno(ret);
2454                         goto out;
2455                 }
2456
2457                 del_right_subtree = 1;
2458         }
2459
2460         /*
2461          * Getting here with an empty extent in the right path implies
2462          * that it's the rightmost path and will be deleted.
2463          */
2464         BUG_ON(right_has_empty && !del_right_subtree);
2465
2466         ret = ocfs2_journal_access(handle, inode, root_bh,
2467                                    OCFS2_JOURNAL_ACCESS_WRITE);
2468         if (ret) {
2469                 mlog_errno(ret);
2470                 goto out;
2471         }
2472
2473         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2474                 ret = ocfs2_journal_access(handle, inode,
2475                                            right_path->p_node[i].bh,
2476                                            OCFS2_JOURNAL_ACCESS_WRITE);
2477                 if (ret) {
2478                         mlog_errno(ret);
2479                         goto out;
2480                 }
2481
2482                 ret = ocfs2_journal_access(handle, inode,
2483                                            left_path->p_node[i].bh,
2484                                            OCFS2_JOURNAL_ACCESS_WRITE);
2485                 if (ret) {
2486                         mlog_errno(ret);
2487                         goto out;
2488                 }
2489         }
2490
2491         if (!right_has_empty) {
2492                 /*
2493                  * Only do this if we're moving a real
2494                  * record. Otherwise, the action is delayed until
2495                  * after removal of the right path in which case we
2496                  * can do a simple shift to remove the empty extent.
2497                  */
2498                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2499                 memset(&right_leaf_el->l_recs[0], 0,
2500                        sizeof(struct ocfs2_extent_rec));
2501         }
2502         if (eb->h_next_leaf_blk == 0ULL) {
2503                 /*
2504                  * Move recs over to get rid of empty extent, decrease
2505                  * next_free. This is allowed to remove the last
2506                  * extent in our leaf (setting l_next_free_rec to
2507                  * zero) - the delete code below won't care.
2508                  */
2509                 ocfs2_remove_empty_extent(right_leaf_el);
2510         }
2511
2512         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2513         if (ret)
2514                 mlog_errno(ret);
2515         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2516         if (ret)
2517                 mlog_errno(ret);
2518
2519         if (del_right_subtree) {
2520                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2521                                      subtree_index, dealloc);
2522                 ocfs2_update_edge_lengths(inode, handle, left_path);
2523
2524                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2525                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2526
2527                 /*
2528                  * Removal of the extent in the left leaf was skipped
2529                  * above so we could delete the right path
2530                  * 1st.
2531                  */
2532                 if (right_has_empty)
2533                         ocfs2_remove_empty_extent(left_leaf_el);
2534
2535                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2536                 if (ret)
2537                         mlog_errno(ret);
2538
2539                 *deleted = 1;
2540         } else
2541                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2542                                            subtree_index);
2543
2544 out:
2545         return ret;
2546 }
2547
2548 /*
2549  * Given a full path, determine what cpos value would return us a path
2550  * containing the leaf immediately to the right of the current one.
2551  *
2552  * Will return zero if the path passed in is already the rightmost path.
2553  *
2554  * This looks similar, but is subtly different to
2555  * ocfs2_find_cpos_for_left_leaf().
2556  */
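/*
 * Here the returned value is simply the e_cpos of the neighboring
 * record to the right - none of the end-of-range arithmetic that the
 * left-leaf variant needs.
 */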
2557 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2558                                           struct ocfs2_path *path, u32 *cpos)
2559 {
2560         int i, j, ret = 0;
2561         u64 blkno;
2562         struct ocfs2_extent_list *el;
2563
2564         *cpos = 0;
2565
2566         if (path->p_tree_depth == 0)
2567                 return 0;
2568
2569         blkno = path_leaf_bh(path)->b_blocknr;
2570
2571         /* Start at the tree node just above the leaf and work our way up. */
2572         i = path->p_tree_depth - 1;
2573         while (i >= 0) {
2574                 int next_free;
2575
2576                 el = path->p_node[i].el;
2577
2578                 /*
2579                  * Find the extent record just after the one in our
2580                  * path.
2581                  */
2582                 next_free = le16_to_cpu(el->l_next_free_rec);
2583                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2584                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2585                                 if (j == (next_free - 1)) {
2586                                         if (i == 0) {
2587                                                 /*
2588                                                  * We've determined that the
2589                                                  * path specified is already
2590                                                  * the rightmost one - return a
2591                                                  * cpos of zero.
2592                                                  */
2593                                                 goto out;
2594                                         }
2595                                         /*
2596                                          * The rightmost record points to our
2597                                          * leaf - we need to travel up the
2598                                          * tree one level.
2599                                          */
2600                                         goto next_node;
2601                                 }
2602
2603                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2604                                 goto out;
2605                         }
2606                 }
2607
2608                 /*
2609                  * If we got here, we never found a valid node where
2610                  * the tree indicated one should be.
2611                  */
2612                 ocfs2_error(sb,
2613                             "Invalid extent tree at extent block %llu\n",
2614                             (unsigned long long)blkno);
2615                 ret = -EROFS;
2616                 goto out;
2617
2618 next_node:
2619                 blkno = path->p_node[i].bh->b_blocknr;
2620                 i--;
2621         }
2622
2623 out:
2624         return ret;
2625 }
2626
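/*
 * The rightmost leaf has no rotation partner - if it begins with an
 * empty extent we can simply shift the empty record out in place.
 */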
2627 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2628                                             handle_t *handle,
2629                                             struct buffer_head *bh,
2630                                             struct ocfs2_extent_list *el)
2631 {
2632         int ret;
2633
2634         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2635                 return 0;
2636
2637         ret = ocfs2_journal_access(handle, inode, bh,
2638                                    OCFS2_JOURNAL_ACCESS_WRITE);
2639         if (ret) {
2640                 mlog_errno(ret);
2641                 goto out;
2642         }
2643
2644         ocfs2_remove_empty_extent(el);
2645
2646         ret = ocfs2_journal_dirty(handle, bh);
2647         if (ret)
2648                 mlog_errno(ret);
2649
2650 out:
2651         return ret;
2652 }
2653
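/*
 * Starting from 'path', whose leaf must begin with an empty extent,
 * rotate records left through each adjacent pair of leaves, working
 * toward the rightmost edge. We stop early if a right leaf itself
 * begins with an empty extent, handing that path back via
 * *empty_extent_path so the caller can fix it up and restart.
 */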
2654 static int __ocfs2_rotate_tree_left(struct inode *inode,
2655                                     handle_t *handle, int orig_credits,
2656                                     struct ocfs2_path *path,
2657                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2658                                     struct ocfs2_path **empty_extent_path,
2659                                     struct ocfs2_extent_tree *et)
2660 {
2661         int ret, subtree_root, deleted;
2662         u32 right_cpos;
2663         struct ocfs2_path *left_path = NULL;
2664         struct ocfs2_path *right_path = NULL;
2665
2666         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2667
2668         *empty_extent_path = NULL;
2669
2670         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2671                                              &right_cpos);
2672         if (ret) {
2673                 mlog_errno(ret);
2674                 goto out;
2675         }
2676
2677         left_path = ocfs2_new_path(path_root_bh(path),
2678                                    path_root_el(path));
2679         if (!left_path) {
2680                 ret = -ENOMEM;
2681                 mlog_errno(ret);
2682                 goto out;
2683         }
2684
2685         ocfs2_cp_path(left_path, path);
2686
2687         right_path = ocfs2_new_path(path_root_bh(path),
2688                                     path_root_el(path));
2689         if (!right_path) {
2690                 ret = -ENOMEM;
2691                 mlog_errno(ret);
2692                 goto out;
2693         }
2694
2695         while (right_cpos) {
2696                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2697                 if (ret) {
2698                         mlog_errno(ret);
2699                         goto out;
2700                 }
2701
2702                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2703                                                        right_path);
2704
2705                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2706                      subtree_root,
2707                      (unsigned long long)
2708                      right_path->p_node[subtree_root].bh->b_blocknr,
2709                      right_path->p_tree_depth);
2710
2711                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2712                                                       orig_credits, left_path);
2713                 if (ret) {
2714                         mlog_errno(ret);
2715                         goto out;
2716                 }
2717
2718                 /*
2719                  * Caller might still want to make changes to the
2720                  * tree root, so re-add it to the journal here.
2721                  */
2722                 ret = ocfs2_journal_access(handle, inode,
2723                                            path_root_bh(left_path),
2724                                            OCFS2_JOURNAL_ACCESS_WRITE);
2725                 if (ret) {
2726                         mlog_errno(ret);
2727                         goto out;
2728                 }
2729
2730                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2731                                                 right_path, subtree_root,
2732                                                 dealloc, &deleted, et);
2733                 if (ret == -EAGAIN) {
2734                         /*
2735                          * The rotation has to temporarily stop due to
2736                          * the right subtree having an empty
2737                          * extent. Pass it back to the caller for a
2738                          * fixup.
2739                          */
2740                         *empty_extent_path = right_path;
2741                         right_path = NULL;
2742                         goto out;
2743                 }
2744                 if (ret) {
2745                         mlog_errno(ret);
2746                         goto out;
2747                 }
2748
2749                 /*
2750                  * The subtree rotate might have removed records on
2751                  * the rightmost edge. If so, then rotation is
2752                  * complete.
2753                  */
2754                 if (deleted)
2755                         break;
2756
2757                 ocfs2_mv_path(left_path, right_path);
2758
2759                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2760                                                      &right_cpos);
2761                 if (ret) {
2762                         mlog_errno(ret);
2763                         goto out;
2764                 }
2765         }
2766
2767 out:
2768         ocfs2_free_path(right_path);
2769         ocfs2_free_path(left_path);
2770
2771         return ret;
2772 }
2773
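/*
 * Remove the current rightmost branch of the tree. If a path exists to
 * the left, it becomes the new rightmost path and last_eb_blk plus the
 * edge lengths are updated to match; otherwise the tree collapses back
 * to a depth-zero extent list in the tree root.
 */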
2774 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2775                                 struct ocfs2_path *path,
2776                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2777                                 struct ocfs2_extent_tree *et)
2778 {
2779         int ret, subtree_index;
2780         u32 cpos;
2781         struct ocfs2_path *left_path = NULL;
2782         struct ocfs2_extent_block *eb;
2783         struct ocfs2_extent_list *el;
2784
2785
2786         ret = ocfs2_et_sanity_check(inode, et);
2787         if (ret)
2788                 goto out;
2789         /*
2790          * There are two ways we handle this, depending on
2791          * whether path is the only existing one.
2792          */
2793         ret = ocfs2_extend_rotate_transaction(handle, 0,
2794                                               handle->h_buffer_credits,
2795                                               path);
2796         if (ret) {
2797                 mlog_errno(ret);
2798                 goto out;
2799         }
2800
2801         ret = ocfs2_journal_access_path(inode, handle, path);
2802         if (ret) {
2803                 mlog_errno(ret);
2804                 goto out;
2805         }
2806
2807         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2808         if (ret) {
2809                 mlog_errno(ret);
2810                 goto out;
2811         }
2812
2813         if (cpos) {
2814                 /*
2815                  * We have a path to the left of this one - it needs
2816                  * an update too.
2817                  */
2818                 left_path = ocfs2_new_path(path_root_bh(path),
2819                                            path_root_el(path));
2820                 if (!left_path) {
2821                         ret = -ENOMEM;
2822                         mlog_errno(ret);
2823                         goto out;
2824                 }
2825
2826                 ret = ocfs2_find_path(inode, left_path, cpos);
2827                 if (ret) {
2828                         mlog_errno(ret);
2829                         goto out;
2830                 }
2831
2832                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2833                 if (ret) {
2834                         mlog_errno(ret);
2835                         goto out;
2836                 }
2837
2838                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2839
2840                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2841                                      subtree_index, dealloc);
2842                 ocfs2_update_edge_lengths(inode, handle, left_path);
2843
2844                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2845                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2846         } else {
2847                 /*
2848                  * 'path' is also the leftmost path which
2849                  * means it must be the only one. This gets
2850                  * handled differently because we want to
2851                  * revert the inode back to having extents
2852                  * in-line.
2853                  */
2854                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2855
2856                 el = et->et_root_el;
2857                 el->l_tree_depth = 0;
2858                 el->l_next_free_rec = 0;
2859                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2860
2861                 ocfs2_et_set_last_eb_blk(et, 0);
2862         }
2863
2864         ocfs2_journal_dirty(handle, path_root_bh(path));
2865
2866 out:
2867         ocfs2_free_path(left_path);
2868         return ret;
2869 }
2870
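/*
 * A worked example (hypothetical tree): consider a depth-1 tree whose
 * root points at two leaf blocks L1 and L2, with L2 the rightmost.
 * Removing L2 takes the first branch above: ocfs2_find_cpos_for_left_leaf()
 * returns a non-zero cpos, L2 is unlinked at the subtree root, edge
 * lengths are updated, and last_eb_blk is repointed at L1.  Had L2 been
 * the only leaf, cpos would be zero and the second branch collapses the
 * tree back to a depth-0 root list with last_eb_blk set to 0.
 */
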
2871 /*
2872  * Left rotation of btree records.
2873  *
2874  * In many ways, this is (unsurprisingly) the opposite of right
2875  * rotation. We start at some non-rightmost path containing an empty
2876  * extent in the leaf block. The code works its way to the rightmost
2877  * path by rotating records to the left in every subtree.
2878  *
2879  * This is used by any code which reduces the number of extent records
2880  * in a leaf. After removal, an empty record should be placed in the
2881  * leftmost list position.
2882  *
2883  * This won't handle a length update of the rightmost path records if
2884  * the rightmost tree leaf record is removed, so the caller is
2885  * responsible for detecting and correcting that.
2886  */
2887 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2888                                   struct ocfs2_path *path,
2889                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2890                                   struct ocfs2_extent_tree *et)
2891 {
2892         int ret, orig_credits = handle->h_buffer_credits;
2893         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2894         struct ocfs2_extent_block *eb;
2895         struct ocfs2_extent_list *el;
2896
2897         el = path_leaf_el(path);
2898         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2899                 return 0;
2900
2901         if (path->p_tree_depth == 0) {
2902 rightmost_no_delete:
2903                 /*
2904                  * Inline extents. This is trivially handled, so do
2905                  * it up front.
2906                  */
2907                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2908                                                        path_leaf_bh(path),
2909                                                        path_leaf_el(path));
2910                 if (ret)
2911                         mlog_errno(ret);
2912                 goto out;
2913         }
2914
2915         /*
2916          * Handle the rightmost branch now. There are several cases:
2917          *  1) simple rotation leaving records in there. That's trivial.
2918          *  2) rotation requiring a branch delete - there's no more
2919          *     records left. Two cases of this:
2920          *     a) There are branches to the left.
2921          *     b) This is also the leftmost (the only) branch.
2922          *
2923          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2924          *  2a) we need the left branch so that we can update it with the unlink
2925          *  2b) we need to bring the inode back to inline extents.
2926          */
2927
2928         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2929         el = &eb->h_list;
2930         if (eb->h_next_leaf_blk == 0) {
2931                 /*
2932                  * This gets a bit tricky if we're going to delete the
2933                  * rightmost path. Get the other cases out of the way
2934                  * 1st.
2935                  * first.
2936                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2937                         goto rightmost_no_delete;
2938
2939                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2940                         ret = -EIO;
2941                         ocfs2_error(inode->i_sb,
2942                                     "Inode %llu has empty extent block at %llu",
2943                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2944                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2945                         goto out;
2946                 }
2947
2948                 /*
2949                  * XXX: The caller cannot trust "path" any more after
2950                  * this as it will have been deleted. What do we do?
2951                  *
2952                  * In theory the rotate-for-merge code will never get
2953                  * here because it'll always ask for a rotate in a
2954                  * nonempty list.
2955                  */
2956
2957                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2958                                                   dealloc, et);
2959                 if (ret)
2960                         mlog_errno(ret);
2961                 goto out;
2962         }
2963
2964         /*
2965          * Now we can loop, remembering the path we get from -EAGAIN
2966          * and restarting from there.
2967          */
2968 try_rotate:
2969         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2970                                        dealloc, &restart_path, et);
2971         if (ret && ret != -EAGAIN) {
2972                 mlog_errno(ret);
2973                 goto out;
2974         }
2975
2976         while (ret == -EAGAIN) {
2977                 tmp_path = restart_path;
2978                 restart_path = NULL;
2979
2980                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2981                                                tmp_path, dealloc,
2982                                                &restart_path, et);
2983                 if (ret && ret != -EAGAIN) {
2984                         mlog_errno(ret);
2985                         goto out;
2986                 }
2987
2988                 ocfs2_free_path(tmp_path);
2989                 tmp_path = NULL;
2990
2991                 if (ret == 0)
2992                         goto try_rotate;
2993         }
2994
2995 out:
2996         ocfs2_free_path(tmp_path);
2997         ocfs2_free_path(restart_path);
2998         return ret;
2999 }
3000
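/*
 * A note on the restart protocol above: __ocfs2_rotate_tree_left()
 * returns -EAGAIN and hands back, via restart_path, a path whose leaf
 * still contains an empty extent (typically because rotation exposed one
 * in a right subtree).  The loop simply re-runs the rotation starting
 * from that path, freeing each intermediate path, until a pass completes
 * without producing a new restart path.
 */
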
3001 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3002                                 int index)
3003 {
3004         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3005         unsigned int size;
3006
3007         if (rec->e_leaf_clusters == 0) {
3008                 /*
3009                  * We consumed all of the merged-from record. An empty
3010                  * extent cannot exist anywhere but the 1st array
3011                  * position, so move things over if the merged-from
3012                  * record doesn't occupy that position.
3013                  *
3014                  * This creates a new empty extent so the caller
3015                  * should be smart enough to have removed any existing
3016                  * ones.
3017                  */
3018                 if (index > 0) {
3019                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3020                         size = index * sizeof(struct ocfs2_extent_rec);
3021                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3022                 }
3023
3024                 /*
3025                  * Always memset - the caller doesn't check whether it
3026                  * created an empty extent, so there could be junk in
3027                  * the other fields.
3028                  */
3029                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3030         }
3031 }
3032
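/*
 * A worked example (hypothetical records): suppose l_recs holds
 * { A, B, C, D } and the record at index 2 (C) was fully consumed by a
 * merge.  The memmove above copies recs 0..1 up one slot and the memset
 * clears slot 0:
 *
 *	before:  { A, B, C(consumed), D }
 *	after:   { 0, A, B,           D }
 *
 * l_next_free_rec is not touched here; the empty extent now occupies the
 * mandatory first position and is expected to be rotated away by the
 * caller later.
 */
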
3033 static int ocfs2_get_right_path(struct inode *inode,
3034                                 struct ocfs2_path *left_path,
3035                                 struct ocfs2_path **ret_right_path)
3036 {
3037         int ret;
3038         u32 right_cpos;
3039         struct ocfs2_path *right_path = NULL;
3040         struct ocfs2_extent_list *left_el;
3041
3042         *ret_right_path = NULL;
3043
3044         /* This function shouldn't be called for non-trees. */
3045         BUG_ON(left_path->p_tree_depth == 0);
3046
3047         left_el = path_leaf_el(left_path);
3048         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3049
3050         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3051                                              &right_cpos);
3052         if (ret) {
3053                 mlog_errno(ret);
3054                 goto out;
3055         }
3056
3057         /* This function shouldn't be called for the rightmost leaf. */
3058         BUG_ON(right_cpos == 0);
3059
3060         right_path = ocfs2_new_path(path_root_bh(left_path),
3061                                     path_root_el(left_path));
3062         if (!right_path) {
3063                 ret = -ENOMEM;
3064                 mlog_errno(ret);
3065                 goto out;
3066         }
3067
3068         ret = ocfs2_find_path(inode, right_path, right_cpos);
3069         if (ret) {
3070                 mlog_errno(ret);
3071                 goto out;
3072         }
3073
3074         *ret_right_path = right_path;
3075 out:
3076         if (ret)
3077                 ocfs2_free_path(right_path);
3078         return ret;
3079 }
3080
3081 /*
3082  * Remove split_rec clusters from the record at index and merge them
3083  * onto the beginning of the record "next" to it.
3084  * For index < l_count - 1, the "next" means the extent rec at index + 1.
3085  * For index == l_count - 1, the "next" means the 1st extent rec of the
3086  * next extent block.
3087  */
3088 static int ocfs2_merge_rec_right(struct inode *inode,
3089                                  struct ocfs2_path *left_path,
3090                                  handle_t *handle,
3091                                  struct ocfs2_extent_rec *split_rec,
3092                                  int index)
3093 {
3094         int ret, next_free, i;
3095         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3096         struct ocfs2_extent_rec *left_rec;
3097         struct ocfs2_extent_rec *right_rec;
3098         struct ocfs2_extent_list *right_el;
3099         struct ocfs2_path *right_path = NULL;
3100         int subtree_index = 0;
3101         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3102         struct buffer_head *bh = path_leaf_bh(left_path);
3103         struct buffer_head *root_bh = NULL;
3104
3105         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3106         left_rec = &el->l_recs[index];
3107
3108         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3109             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3110                 /* We are dealing with a cross-extent-block merge. */
3111                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3112                 if (ret) {
3113                         mlog_errno(ret);
3114                         goto out;
3115                 }
3116
3117                 right_el = path_leaf_el(right_path);
3118                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3119                 BUG_ON(next_free <= 0);
3120                 right_rec = &right_el->l_recs[0];
3121                 if (ocfs2_is_empty_extent(right_rec)) {
3122                         BUG_ON(next_free <= 1);
3123                         right_rec = &right_el->l_recs[1];
3124                 }
3125
3126                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3127                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3128                        le32_to_cpu(right_rec->e_cpos));
3129
3130                 subtree_index = ocfs2_find_subtree_root(inode,
3131                                                         left_path, right_path);
3132
3133                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3134                                                       handle->h_buffer_credits,
3135                                                       right_path);
3136                 if (ret) {
3137                         mlog_errno(ret);
3138                         goto out;
3139                 }
3140
3141                 root_bh = left_path->p_node[subtree_index].bh;
3142                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3143
3144                 ret = ocfs2_journal_access(handle, inode, root_bh,
3145                                            OCFS2_JOURNAL_ACCESS_WRITE);
3146                 if (ret) {
3147                         mlog_errno(ret);
3148                         goto out;
3149                 }
3150
3151                 for (i = subtree_index + 1;
3152                      i < path_num_items(right_path); i++) {
3153                         ret = ocfs2_journal_access(handle, inode,
3154                                                    right_path->p_node[i].bh,
3155                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3156                         if (ret) {
3157                                 mlog_errno(ret);
3158                                 goto out;
3159                         }
3160
3161                         ret = ocfs2_journal_access(handle, inode,
3162                                                    left_path->p_node[i].bh,
3163                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3164                         if (ret) {
3165                                 mlog_errno(ret);
3166                                 goto out;
3167                         }
3168                 }
3169
3170         } else {
3171                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3172                 right_rec = &el->l_recs[index + 1];
3173         }
3174
3175         ret = ocfs2_journal_access(handle, inode, bh,
3176                                    OCFS2_JOURNAL_ACCESS_WRITE);
3177         if (ret) {
3178                 mlog_errno(ret);
3179                 goto out;
3180         }
3181
3182         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3183
3184         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3185         le64_add_cpu(&right_rec->e_blkno,
3186                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3187         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3188
3189         ocfs2_cleanup_merge(el, index);
3190
3191         ret = ocfs2_journal_dirty(handle, bh);
3192         if (ret)
3193                 mlog_errno(ret);
3194
3195         if (right_path) {
3196                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3197                 if (ret)
3198                         mlog_errno(ret);
3199
3200                 ocfs2_complete_edge_insert(inode, handle, left_path,
3201                                            right_path, subtree_index);
3202         }
3203 out:
3204         if (right_path)
3205                 ocfs2_free_path(right_path);
3206         return ret;
3207 }
3208
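/*
 * The sketch below (compiled out, hypothetical helper) restates the
 * record arithmetic above: split_clusters move from the tail of the left
 * record to the head of its right neighbour.  With example values, a
 * left record (cpos 100, 20 clusters) and a right record (cpos 120,
 * 8 clusters) merging 5 clusters become (100, 15) and (115, 13); the
 * right record's e_blkno is pulled back by those same 5 clusters worth
 * of blocks so it still names the first block it covers.
 */
#if 0
static void example_merge_right_arith(struct super_block *sb,
                                      struct ocfs2_extent_rec *left_rec,
                                      struct ocfs2_extent_rec *right_rec,
                                      unsigned int split_clusters)
{
        le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
        le32_add_cpu(&right_rec->e_cpos, -split_clusters);
        le64_add_cpu(&right_rec->e_blkno,
                     -ocfs2_clusters_to_blocks(sb, split_clusters));
        le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
}
#endif
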
3209 static int ocfs2_get_left_path(struct inode *inode,
3210                                struct ocfs2_path *right_path,
3211                                struct ocfs2_path **ret_left_path)
3212 {
3213         int ret;
3214         u32 left_cpos;
3215         struct ocfs2_path *left_path = NULL;
3216
3217         *ret_left_path = NULL;
3218
3219         /* This function shouldn't be called for non-trees. */
3220         BUG_ON(right_path->p_tree_depth == 0);
3221
3222         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3223                                             right_path, &left_cpos);
3224         if (ret) {
3225                 mlog_errno(ret);
3226                 goto out;
3227         }
3228
3229         /* This function shouldn't be called for the leftmost leaf. */
3230         BUG_ON(left_cpos == 0);
3231
3232         left_path = ocfs2_new_path(path_root_bh(right_path),
3233                                    path_root_el(right_path));
3234         if (!left_path) {
3235                 ret = -ENOMEM;
3236                 mlog_errno(ret);
3237                 goto out;
3238         }
3239
3240         ret = ocfs2_find_path(inode, left_path, left_cpos);
3241         if (ret) {
3242                 mlog_errno(ret);
3243                 goto out;
3244         }
3245
3246         *ret_left_path = left_path;
3247 out:
3248         if (ret)
3249                 ocfs2_free_path(left_path);
3250         return ret;
3251 }
3252
3253 /*
3254  * Remove split_rec clusters from the record at index and merge them
3255  * onto the tail of the record "before" it.
3256  * For index > 0, the "before" means the extent rec at index - 1.
3257  *
3258  * For index == 0, the "before" means the last record of the previous
3259  * extent block. In that case we may also need to remove the rightmost
3260  * leaf extent block in the right_path and update right_path to point
3261  * at the new rightmost path.
3262  */
3263 static int ocfs2_merge_rec_left(struct inode *inode,
3264                                 struct ocfs2_path *right_path,
3265                                 handle_t *handle,
3266                                 struct ocfs2_extent_rec *split_rec,
3267                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3268                                 struct ocfs2_extent_tree *et,
3269                                 int index)
3270 {
3271         int ret, i, subtree_index = 0, has_empty_extent = 0;
3272         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3273         struct ocfs2_extent_rec *left_rec;
3274         struct ocfs2_extent_rec *right_rec;
3275         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3276         struct buffer_head *bh = path_leaf_bh(right_path);
3277         struct buffer_head *root_bh = NULL;
3278         struct ocfs2_path *left_path = NULL;
3279         struct ocfs2_extent_list *left_el;
3280
3281         BUG_ON(index < 0);
3282
3283         right_rec = &el->l_recs[index];
3284         if (index == 0) {
3285                 /* We are dealing with a cross-extent-block merge. */
3286                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3287                 if (ret) {
3288                         mlog_errno(ret);
3289                         goto out;
3290                 }
3291
3292                 left_el = path_leaf_el(left_path);
3293                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3294                        le16_to_cpu(left_el->l_count));
3295
3296                 left_rec = &left_el->l_recs[
3297                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3298                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3299                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3300                        le32_to_cpu(split_rec->e_cpos));
3301
3302                 subtree_index = ocfs2_find_subtree_root(inode,
3303                                                         left_path, right_path);
3304
3305                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3306                                                       handle->h_buffer_credits,
3307                                                       left_path);
3308                 if (ret) {
3309                         mlog_errno(ret);
3310                         goto out;
3311                 }
3312
3313                 root_bh = left_path->p_node[subtree_index].bh;
3314                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3315
3316                 ret = ocfs2_journal_access(handle, inode, root_bh,
3317                                            OCFS2_JOURNAL_ACCESS_WRITE);
3318                 if (ret) {
3319                         mlog_errno(ret);
3320                         goto out;
3321                 }
3322
3323                 for (i = subtree_index + 1;
3324                      i < path_num_items(right_path); i++) {
3325                         ret = ocfs2_journal_access(handle, inode,
3326                                                    right_path->p_node[i].bh,
3327                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3328                         if (ret) {
3329                                 mlog_errno(ret);
3330                                 goto out;
3331                         }
3332
3333                         ret = ocfs2_journal_access(handle, inode,
3334                                                    left_path->p_node[i].bh,
3335                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3336                         if (ret) {
3337                                 mlog_errno(ret);
3338                                 goto out;
3339                         }
3340                 }
3341         } else {
3342                 left_rec = &el->l_recs[index - 1];
3343                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3344                         has_empty_extent = 1;
3345         }
3346
3347         ret = ocfs2_journal_access(handle, inode, bh,
3348                                    OCFS2_JOURNAL_ACCESS_WRITE);
3349         if (ret) {
3350                 mlog_errno(ret);
3351                 goto out;
3352         }
3353
3354         if (has_empty_extent && index == 1) {
3355                 /*
3356                  * The easy case - we can just plop the record right in.
3357                  */
3358                 *left_rec = *split_rec;
3359
3360                 has_empty_extent = 0;
3361         } else
3362                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3363
3364         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3365         le64_add_cpu(&right_rec->e_blkno,
3366                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3367         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3368
3369         ocfs2_cleanup_merge(el, index);
3370
3371         ret = ocfs2_journal_dirty(handle, bh);
3372         if (ret)
3373                 mlog_errno(ret);
3374
3375         if (left_path) {
3376                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3377                 if (ret)
3378                         mlog_errno(ret);
3379
3380                 /*
3381                  * If the right_rec is empty and the extent block is also
3382                  * empty, ocfs2_complete_edge_insert() can't handle it and we
3383                  * need to delete the right extent block.
3384                  */
3385                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3386                     le16_to_cpu(el->l_next_free_rec) == 1) {
3387
3388                         ret = ocfs2_remove_rightmost_path(inode, handle,
3389                                                           right_path,
3390                                                           dealloc, et);
3391                         if (ret) {
3392                                 mlog_errno(ret);
3393                                 goto out;
3394                         }
3395
3396                         /* Now the rightmost extent block has been deleted.
3397                          * So we use the new rightmost path.
3398                          */
3399                         ocfs2_mv_path(right_path, left_path);
3400                         left_path = NULL;
3401                 } else
3402                         ocfs2_complete_edge_insert(inode, handle, left_path,
3403                                                    right_path, subtree_index);
3404         }
3405 out:
3406         if (left_path)
3407                 ocfs2_free_path(left_path);
3408         return ret;
3409 }
3410
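/*
 * A worked example (hypothetical values) of the mirror arithmetic above:
 * a left record (cpos 100, 15 clusters) absorbing 5 clusters from the
 * head of a right record (cpos 115, 13 clusters) grows to (100, 20),
 * while the right record shrinks to (120, 8) with its e_blkno advanced
 * by those 5 clusters worth of blocks.  In the index == 0 case the
 * "left" record lives in the previous extent block; if the merge empties
 * the rightmost block entirely, it is removed via
 * ocfs2_remove_rightmost_path() and right_path is switched to the new
 * rightmost path.
 */
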
3411 static int ocfs2_try_to_merge_extent(struct inode *inode,
3412                                      handle_t *handle,
3413                                      struct ocfs2_path *path,
3414                                      int split_index,
3415                                      struct ocfs2_extent_rec *split_rec,
3416                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3417                                      struct ocfs2_merge_ctxt *ctxt,
3418                                      struct ocfs2_extent_tree *et)
3419
3420 {
3421         int ret = 0;
3422         struct ocfs2_extent_list *el = path_leaf_el(path);
3423         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3424
3425         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3426
3427         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3428                 /*
3429                  * The merge code will need to create an empty
3430                  * extent to take the place of the newly
3431                  * emptied slot. Remove any pre-existing empty
3432                  * extents - having more than one in a leaf is
3433                  * illegal.
3434                  */
3435                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3436                                              dealloc, et);
3437                 if (ret) {
3438                         mlog_errno(ret);
3439                         goto out;
3440                 }
3441                 split_index--;
3442                 rec = &el->l_recs[split_index];
3443         }
3444
3445         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3446                 /*
3447                  * Left-right contig implies this.
3448                  */
3449                 BUG_ON(!ctxt->c_split_covers_rec);
3450
3451                 /*
3452                  * Since the leftright insert always covers the entire
3453                  * extent, this call will delete the insert record
3454                  * entirely, resulting in an empty extent record added to
3455                  * the extent block.
3456                  *
3457                  * Since the adding of an empty extent shifts
3458                  * everything back to the right, there's no need to
3459                  * update split_index here.
3460                  *
3461                  * When the split_index is zero, we need to merge the record
3462                  * into the previous extent block. It is more efficient and easier
3463                  * if we do merge_right first and merge_left later.
3464                  */
3465                 ret = ocfs2_merge_rec_right(inode, path,
3466                                             handle, split_rec,
3467                                             split_index);
3468                 if (ret) {
3469                         mlog_errno(ret);
3470                         goto out;
3471                 }
3472
3473                 /*
3474                  * We can only get this from logic error above.
3475                  */
3476                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3477
3478                 /* The merge left us with an empty extent, remove it. */
3479                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3480                                              dealloc, et);
3481                 if (ret) {
3482                         mlog_errno(ret);
3483                         goto out;
3484                 }
3485
3486                 rec = &el->l_recs[split_index];
3487
3488                 /*
3489                  * Note that we don't pass split_rec here on purpose -
3490                  * we've merged it into the rec already.
3491                  */
3492                 ret = ocfs2_merge_rec_left(inode, path,
3493                                            handle, rec,
3494                                            dealloc, et,
3495                                            split_index);
3496
3497                 if (ret) {
3498                         mlog_errno(ret);
3499                         goto out;
3500                 }
3501
3502                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3503                                              dealloc, et);
3504                 /*
3505                  * Error from this last rotate is not critical, so
3506                  * print but don't bubble it up.
3507                  */
3508                 if (ret)
3509                         mlog_errno(ret);
3510                 ret = 0;
3511         } else {
3512                 /*
3513                  * Merge a record to the left or right.
3514                  *
3515                  * 'contig_type' is relative to the existing record,
3516                  * so for example, if we're "right contig", it's to
3517                  * the record on the left (hence the left merge).
3518                  */
3519                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3520                         ret = ocfs2_merge_rec_left(inode,
3521                                                    path,
3522                                                    handle, split_rec,
3523                                                    dealloc, et,
3524                                                    split_index);
3525                         if (ret) {
3526                                 mlog_errno(ret);
3527                                 goto out;
3528                         }
3529                 } else {
3530                         ret = ocfs2_merge_rec_right(inode,
3531                                                     path,
3532                                                     handle, split_rec,
3533                                                     split_index);
3534                         if (ret) {
3535                                 mlog_errno(ret);
3536                                 goto out;
3537                         }
3538                 }
3539
3540                 if (ctxt->c_split_covers_rec) {
3541                         /*
3542                          * The merge may have left an empty extent in
3543                          * our leaf. Try to rotate it away.
3544                          */
3545                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3546                                                      dealloc, et);
3547                         if (ret)
3548                                 mlog_errno(ret);
3549                         ret = 0;
3550                 }
3551         }
3552
3553 out:
3554         return ret;
3555 }
3556
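/*
 * An illustrative trace of the CONTIG_LEFTRIGHT branch above, for a leaf
 * { ..., A, S, B, ... } where the split record covers all of S and abuts
 * both neighbours:
 *
 *	1. merge right:  S is folded into B, leaving an empty record;
 *	2. rotate left:  the empty extent is rotated out of the leaf;
 *	3. merge left:   the record now at split_index is folded into A;
 *	4. rotate left:  any empty extent left behind is cleaned up
 *	   (an error here is logged but not propagated).
 */
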
3557 static void ocfs2_subtract_from_rec(struct super_block *sb,
3558                                     enum ocfs2_split_type split,
3559                                     struct ocfs2_extent_rec *rec,
3560                                     struct ocfs2_extent_rec *split_rec)
3561 {
3562         u64 len_blocks;
3563
3564         len_blocks = ocfs2_clusters_to_blocks(sb,
3565                                 le16_to_cpu(split_rec->e_leaf_clusters));
3566
3567         if (split == SPLIT_LEFT) {
3568                 /*
3569                  * Region is on the left edge of the existing
3570                  * record.
3571                  */
3572                 le32_add_cpu(&rec->e_cpos,
3573                              le16_to_cpu(split_rec->e_leaf_clusters));
3574                 le64_add_cpu(&rec->e_blkno, len_blocks);
3575                 le16_add_cpu(&rec->e_leaf_clusters,
3576                              -le16_to_cpu(split_rec->e_leaf_clusters));
3577         } else {
3578                 /*
3579                  * Region is on the right edge of the existing
3580                  * record.
3581                  */
3582                 le16_add_cpu(&rec->e_leaf_clusters,
3583                              -le16_to_cpu(split_rec->e_leaf_clusters));
3584         }
3585 }
3586
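/*
 * A worked example (hypothetical values): for an existing record
 * (cpos 200, 16 clusters) and a 4-cluster split record,
 *
 *	SPLIT_LEFT  (split at cpos 200):  rec becomes (204, 12 clusters)
 *	            and e_blkno advances by 4 clusters worth of blocks;
 *	SPLIT_RIGHT (split at cpos 212):  rec becomes (200, 12 clusters)
 *	            and e_blkno is unchanged.
 */
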
3587 /*
3588  * Do the final bits of extent record insertion at the target leaf
3589  * list. If this leaf is part of an allocation tree, it is assumed
3590  * that the tree above has been prepared.
3591  */
3592 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3593                                  struct ocfs2_extent_list *el,
3594                                  struct ocfs2_insert_type *insert,
3595                                  struct inode *inode)
3596 {
3597         int i = insert->ins_contig_index;
3598         unsigned int range;
3599         struct ocfs2_extent_rec *rec;
3600
3601         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3602
3603         if (insert->ins_split != SPLIT_NONE) {
3604                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3605                 BUG_ON(i == -1);
3606                 rec = &el->l_recs[i];
3607                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3608                                         insert_rec);
3609                 goto rotate;
3610         }
3611
3612         /*
3613          * Contiguous insert - either left or right.
3614          */
3615         if (insert->ins_contig != CONTIG_NONE) {
3616                 rec = &el->l_recs[i];
3617                 if (insert->ins_contig == CONTIG_LEFT) {
3618                         rec->e_blkno = insert_rec->e_blkno;
3619                         rec->e_cpos = insert_rec->e_cpos;
3620                 }
3621                 le16_add_cpu(&rec->e_leaf_clusters,
3622                              le16_to_cpu(insert_rec->e_leaf_clusters));
3623                 return;
3624         }
3625
3626         /*
3627          * Handle insert into an empty leaf.
3628          */
3629         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3630             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3631              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3632                 el->l_recs[0] = *insert_rec;
3633                 el->l_next_free_rec = cpu_to_le16(1);
3634                 return;
3635         }
3636
3637         /*
3638          * Appending insert.
3639          */
3640         if (insert->ins_appending == APPEND_TAIL) {
3641                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3642                 rec = &el->l_recs[i];
3643                 range = le32_to_cpu(rec->e_cpos)
3644                         + le16_to_cpu(rec->e_leaf_clusters);
3645                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3646
3647                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3648                                 le16_to_cpu(el->l_count),
3649                                 "inode %lu, depth %u, count %u, next free %u, "
3650                                 "rec.cpos %u, rec.clusters %u, "
3651                                 "insert.cpos %u, insert.clusters %u\n",
3652                                 inode->i_ino,
3653                                 le16_to_cpu(el->l_tree_depth),
3654                                 le16_to_cpu(el->l_count),
3655                                 le16_to_cpu(el->l_next_free_rec),
3656                                 le32_to_cpu(el->l_recs[i].e_cpos),
3657                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3658                                 le32_to_cpu(insert_rec->e_cpos),
3659                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3660                 i++;
3661                 el->l_recs[i] = *insert_rec;
3662                 le16_add_cpu(&el->l_next_free_rec, 1);
3663                 return;
3664         }
3665
3666 rotate:
3667         /*
3668          * Ok, we have to rotate.
3669          *
3670          * At this point, it is safe to assume that inserting into an
3671          * empty leaf and appending to a leaf have both been handled
3672          * above.
3673          *
3674          * This leaf needs to have space, either via the empty first
3675          * extent record, or by virtue of l_next_free_rec < l_count.
3676          */
3677         ocfs2_rotate_leaf(el, insert_rec);
3678 }
3679
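/*
 * A worked example (hypothetical values) for the contiguous-insert
 * branch above: an existing record (cpos 50, blkno B, 10 clusters) and
 * an insert record (cpos 60, 10 clusters) that is CONTIG_RIGHT simply
 * grow the existing record to (50, B, 20 clusters).  If instead the
 * insert record were (cpos 40, blkno B', 10 clusters) and CONTIG_LEFT,
 * the existing record takes over the insert's cpos and blkno and becomes
 * (40, B', 20 clusters).
 */
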
3680 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3681                                            handle_t *handle,
3682                                            struct ocfs2_path *path,
3683                                            struct ocfs2_extent_rec *insert_rec)
3684 {
3685         int ret, i, next_free;
3686         struct buffer_head *bh;
3687         struct ocfs2_extent_list *el;
3688         struct ocfs2_extent_rec *rec;
3689
3690         /*
3691          * Update everything except the leaf block.
3692          */
3693         for (i = 0; i < path->p_tree_depth; i++) {
3694                 bh = path->p_node[i].bh;
3695                 el = path->p_node[i].el;
3696
3697                 next_free = le16_to_cpu(el->l_next_free_rec);
3698                 if (next_free == 0) {
3699                         ocfs2_error(inode->i_sb,
3700                                     "Dinode %llu has a bad extent list",
3701                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3702                         ret = -EIO;
3703                         return;
3704                 }
3705
3706                 rec = &el->l_recs[next_free - 1];
3707
3708                 rec->e_int_clusters = insert_rec->e_cpos;
3709                 le32_add_cpu(&rec->e_int_clusters,
3710                              le16_to_cpu(insert_rec->e_leaf_clusters));
3711                 le32_add_cpu(&rec->e_int_clusters,
3712                              -le32_to_cpu(rec->e_cpos));
3713
3714                 ret = ocfs2_journal_dirty(handle, bh);
3715                 if (ret)
3716                         mlog_errno(ret);
3717
3718         }
3719 }
3720
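/*
 * The fragment below (compiled out) is equivalent to the three
 * e_int_clusters updates in the loop above: the interior record's new
 * length is the end cpos of the appended record minus the record's own
 * cpos.
 */
#if 0
        rec->e_int_clusters = cpu_to_le32((le32_to_cpu(insert_rec->e_cpos) +
                                           le16_to_cpu(insert_rec->e_leaf_clusters)) -
                                          le32_to_cpu(rec->e_cpos));
#endif
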
3721 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3722                                     struct ocfs2_extent_rec *insert_rec,
3723                                     struct ocfs2_path *right_path,
3724                                     struct ocfs2_path **ret_left_path)
3725 {
3726         int ret, next_free;
3727         struct ocfs2_extent_list *el;
3728         struct ocfs2_path *left_path = NULL;
3729
3730         *ret_left_path = NULL;
3731
3732         /*
3733          * This shouldn't happen for non-trees. The extent rec cluster
3734          * count manipulation below only works for interior nodes.
3735          */
3736         BUG_ON(right_path->p_tree_depth == 0);
3737
3738         /*
3739          * If our appending insert is at the leftmost edge of a leaf,
3740          * then we might need to update the rightmost records of the
3741          * neighboring path.
3742          */
3743         el = path_leaf_el(right_path);
3744         next_free = le16_to_cpu(el->l_next_free_rec);
3745         if (next_free == 0 ||
3746             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3747                 u32 left_cpos;
3748
3749                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3750                                                     &left_cpos);
3751                 if (ret) {
3752                         mlog_errno(ret);
3753                         goto out;
3754                 }
3755
3756                 mlog(0, "Append may need a left path update. cpos: %u, "
3757                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3758                      left_cpos);
3759
3760                 /*
3761                  * No need to worry if the append is already in the
3762                  * leftmost leaf.
3763                  */
3764                 if (left_cpos) {
3765                         left_path = ocfs2_new_path(path_root_bh(right_path),
3766                                                    path_root_el(right_path));
3767                         if (!left_path) {
3768                                 ret = -ENOMEM;
3769                                 mlog_errno(ret);
3770                                 goto out;
3771                         }
3772
3773                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3774                         if (ret) {
3775                                 mlog_errno(ret);
3776                                 goto out;
3777                         }
3778
3779                         /*
3780                          * ocfs2_insert_path() will pass the left_path to the
3781                          * journal for us.
3782                          */
3783                 }
3784         }
3785
3786         ret = ocfs2_journal_access_path(inode, handle, right_path);
3787         if (ret) {
3788                 mlog_errno(ret);
3789                 goto out;
3790         }
3791
3792         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3793
3794         *ret_left_path = left_path;
3795         ret = 0;
3796 out:
3797         if (ret != 0)
3798                 ocfs2_free_path(left_path);
3799
3800         return ret;
3801 }
3802
3803 static void ocfs2_split_record(struct inode *inode,
3804                                struct ocfs2_path *left_path,
3805                                struct ocfs2_path *right_path,
3806                                struct ocfs2_extent_rec *split_rec,
3807                                enum ocfs2_split_type split)
3808 {
3809         int index;
3810         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3811         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3812         struct ocfs2_extent_rec *rec, *tmprec;
3813
3814         right_el = path_leaf_el(right_path);
3815         if (left_path)
3816                 left_el = path_leaf_el(left_path);
3817
3818         el = right_el;
3819         insert_el = right_el;
3820         index = ocfs2_search_extent_list(el, cpos);
3821         if (index != -1) {
3822                 if (index == 0 && left_path) {
3823                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3824
3825                         /*
3826                          * This typically means that the record
3827                          * started in the left path but moved to the
3828                          * right as a result of rotation. We either
3829                          * move the existing record to the left, or we
3830                          * do the later insert there.
3831                          *
3832                          * In this case, the left path should always
3833                          * exist as the rotate code will have passed
3834                          * it back for a post-insert update.
3835                          */
3836
3837                         if (split == SPLIT_LEFT) {
3838                                 /*
3839                                  * It's a left split. Since we know
3840                                  * that the rotate code gave us an
3841                                  * empty extent in the left path, we
3842                                  * can just do the insert there.
3843                                  */
3844                                 insert_el = left_el;
3845                         } else {
3846                                 /*
3847                                  * Right split - we have to move the
3848                                  * existing record over to the left
3849                                  * leaf. The insert will be into the
3850                                  * newly created empty extent in the
3851                                  * right leaf.
3852                                  */
3853                                 tmprec = &right_el->l_recs[index];
3854                                 ocfs2_rotate_leaf(left_el, tmprec);
3855                                 el = left_el;
3856
3857                                 memset(tmprec, 0, sizeof(*tmprec));
3858                                 index = ocfs2_search_extent_list(left_el, cpos);
3859                                 BUG_ON(index == -1);
3860                         }
3861                 }
3862         } else {
3863                 BUG_ON(!left_path);
3864                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3865                 /*
3866                  * Left path is easy - we can just allow the insert to
3867                  * happen.
3868                  */
3869                 el = left_el;
3870                 insert_el = left_el;
3871                 index = ocfs2_search_extent_list(el, cpos);
3872                 BUG_ON(index == -1);
3873         }
3874
3875         rec = &el->l_recs[index];
3876         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3877         ocfs2_rotate_leaf(insert_el, split_rec);
3878 }
3879
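/*
 * A summary of the tricky index == 0 case above: which leaf takes the
 * new record depends on the split type:
 *
 *	SPLIT_LEFT:  the rotate code left an empty extent in the left
 *	             leaf, so the insert goes there (insert_el = left_el);
 *	SPLIT_RIGHT: the existing record is rotated over into the left
 *	             leaf, and the insert fills the slot it vacated in
 *	             the right leaf.
 *
 * If the cpos is not found in the right leaf at all, the record lives in
 * the left leaf and both the subtract and the insert happen there.
 */
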
3880 /*
3881  * This function only does inserts on an allocation b-tree. For tree
3882  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3883  *
3884  * right_path is the path we want to do the actual insert
3885  * in. left_path should only be passed in if we need to update that
3886  * portion of the tree after an edge insert.
3887  */
3888 static int ocfs2_insert_path(struct inode *inode,
3889                              handle_t *handle,
3890                              struct ocfs2_path *left_path,
3891                              struct ocfs2_path *right_path,
3892                              struct ocfs2_extent_rec *insert_rec,
3893                              struct ocfs2_insert_type *insert)
3894 {
3895         int ret, subtree_index;
3896         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3897
3898         if (left_path) {
3899                 int credits = handle->h_buffer_credits;
3900
3901                 /*
3902                  * There's a chance that left_path got passed back to
3903                  * us without being accounted for in the
3904                  * journal. Extend our transaction here to be sure we
3905                  * can change those blocks.
3906                  */
3907                 credits += left_path->p_tree_depth;
3908
3909                 ret = ocfs2_extend_trans(handle, credits);
3910                 if (ret < 0) {
3911                         mlog_errno(ret);
3912                         goto out;
3913                 }
3914
3915                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3916                 if (ret < 0) {
3917                         mlog_errno(ret);
3918                         goto out;
3919                 }
3920         }
3921
3922         /*
3923          * Pass both paths to the journal. The majority of inserts
3924          * will be touching all components anyway.
3925          */
3926         ret = ocfs2_journal_access_path(inode, handle, right_path);
3927         if (ret < 0) {
3928                 mlog_errno(ret);
3929                 goto out;
3930         }
3931
3932         if (insert->ins_split != SPLIT_NONE) {
3933                 /*
3934                  * We could call ocfs2_insert_at_leaf() for some types
3935                  * of splits, but it's easier to just let one separate
3936                  * function sort it all out.
3937                  */
3938                 ocfs2_split_record(inode, left_path, right_path,
3939                                    insert_rec, insert->ins_split);
3940
3941                 /*
3942                  * Split might have modified either leaf and we don't
3943                  * have a guarantee that the later edge insert will
3944                  * dirty this for us.
3945                  */
3946                 if (left_path) {
3947                         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3948                         if (ret)
3949                                 mlog_errno(ret);
3950                 }
3951         } else
3952                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3953                                      insert, inode);
3954
3955         ret = ocfs2_journal_dirty(handle, leaf_bh);
3956         if (ret)
3957                 mlog_errno(ret);
3958
3959         if (left_path) {
3960                 /*
3961                  * The rotate code has indicated that we need to fix
3962                  * up portions of the tree after the insert.
3963                  *
3964                  * XXX: Should we extend the transaction here?
3965                  */
3966                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3967                                                         right_path);
3968                 ocfs2_complete_edge_insert(inode, handle, left_path,
3969                                            right_path, subtree_index);
3970         }
3971
3972         ret = 0;
3973 out:
3974         return ret;
3975 }
3976
3977 static int ocfs2_do_insert_extent(struct inode *inode,
3978                                   handle_t *handle,
3979                                   struct ocfs2_extent_tree *et,
3980                                   struct ocfs2_extent_rec *insert_rec,
3981                                   struct ocfs2_insert_type *type)
3982 {
3983         int ret, rotate = 0;
3984         u32 cpos;
3985         struct ocfs2_path *right_path = NULL;
3986         struct ocfs2_path *left_path = NULL;
3987         struct ocfs2_extent_list *el;
3988
3989         el = et->et_root_el;
3990
3991         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3992                                    OCFS2_JOURNAL_ACCESS_WRITE);
3993         if (ret) {
3994                 mlog_errno(ret);
3995                 goto out;
3996         }
3997
3998         if (le16_to_cpu(el->l_tree_depth) == 0) {
3999                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
4000                 goto out_update_clusters;
4001         }
4002
4003         right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4004         if (!right_path) {
4005                 ret = -ENOMEM;
4006                 mlog_errno(ret);
4007                 goto out;
4008         }
4009
4010         /*
4011          * Determine the path to start with. Rotations need the
4012          * rightmost path, everything else can go directly to the
4013          * target leaf.
4014          */
4015         cpos = le32_to_cpu(insert_rec->e_cpos);
4016         if (type->ins_appending == APPEND_NONE &&
4017             type->ins_contig == CONTIG_NONE) {
4018                 rotate = 1;
4019                 cpos = UINT_MAX;
4020         }
4021
4022         ret = ocfs2_find_path(inode, right_path, cpos);
4023         if (ret) {
4024                 mlog_errno(ret);
4025                 goto out;
4026         }
4027
4028         /*
4029          * Rotations and appends need special treatment - they modify
4030          * parts of the tree above them.
4031          *
4032          * Both might pass back a path immediately to the left of the
4033          * one being inserted to. This will cause
4034          * ocfs2_insert_path() to modify the rightmost records of
4035          * left_path to account for an edge insert.
4036          *
4037          * XXX: When modifying this code, keep in mind that an insert
4038          * can wind up skipping both of these two special cases...
4039          */
4040         if (rotate) {
4041                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4042                                               le32_to_cpu(insert_rec->e_cpos),
4043                                               right_path, &left_path);
4044                 if (ret) {
4045                         mlog_errno(ret);
4046                         goto out;
4047                 }
4048
4049                 /*
4050                  * ocfs2_rotate_tree_right() might have extended the
4051                  * transaction without re-journaling our tree root.
4052                  */
4053                 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
4054                                            OCFS2_JOURNAL_ACCESS_WRITE);
4055                 if (ret) {
4056                         mlog_errno(ret);
4057                         goto out;
4058                 }
4059         } else if (type->ins_appending == APPEND_TAIL
4060                    && type->ins_contig != CONTIG_LEFT) {
4061                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4062                                                right_path, &left_path);
4063                 if (ret) {
4064                         mlog_errno(ret);
4065                         goto out;
4066                 }
4067         }
4068
4069         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4070                                 insert_rec, type);
4071         if (ret) {
4072                 mlog_errno(ret);
4073                 goto out;
4074         }
4075
4076 out_update_clusters:
4077         if (type->ins_split == SPLIT_NONE)
4078                 ocfs2_et_update_clusters(inode, et,
4079                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4080
4081         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4082         if (ret)
4083                 mlog_errno(ret);
4084
4085 out:
4086         ocfs2_free_path(left_path);
4087         ocfs2_free_path(right_path);
4088
4089         return ret;
4090 }
4091
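/*
 * A note on how the starting path is chosen above: a non-contiguous,
 * non-appending insert needs a right rotation, so the search cpos is
 * forced to UINT_MAX to land on the rightmost path first.  A tail append
 * that is not left-contiguous goes through ocfs2_append_rec_to_path() to
 * fix up the interior record lengths.  Everything else walks straight to
 * the leaf containing insert_rec->e_cpos and lets ocfs2_insert_path() do
 * the work.
 */
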
4092 static enum ocfs2_contig_type
4093 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4094                                struct ocfs2_extent_list *el, int index,
4095                                struct ocfs2_extent_rec *split_rec)
4096 {
4097         int status;
4098         enum ocfs2_contig_type ret = CONTIG_NONE;
4099         u32 left_cpos, right_cpos;
4100         struct ocfs2_extent_rec *rec = NULL;
4101         struct ocfs2_extent_list *new_el;
4102         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4103         struct buffer_head *bh;
4104         struct ocfs2_extent_block *eb;
4105
4106         if (index > 0) {
4107                 rec = &el->l_recs[index - 1];
4108         } else if (path->p_tree_depth > 0) {
4109                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4110                                                        path, &left_cpos);
4111                 if (status)
4112                         goto out;
4113
4114                 if (left_cpos != 0) {
4115                         left_path = ocfs2_new_path(path_root_bh(path),
4116                                                    path_root_el(path));
4117                         if (!left_path)
4118                                 goto out;
4119
4120                         status = ocfs2_find_path(inode, left_path, left_cpos);
4121                         if (status)
4122                                 goto out;
4123
4124                         new_el = path_leaf_el(left_path);
4125
4126                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4127                             le16_to_cpu(new_el->l_count)) {
4128                                 bh = path_leaf_bh(left_path);
4129                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4130                                 ocfs2_error(inode->i_sb,
4131                                             "Extent block #%llu has an "
4132                                             "invalid l_next_free_rec of "
4133                                             "%d.  It should have "
4134                                             "matched the l_count of %d",
4135                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4136                                             le16_to_cpu(new_el->l_next_free_rec),
4137                                             le16_to_cpu(new_el->l_count));
4138                                 status = -EINVAL;
4139                                 goto out;
4140                         }
4141                         rec = &new_el->l_recs[
4142                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4143                 }
4144         }
4145
4146         /*
4147          * We're careful to check for an empty extent record here -
4148          * the merge code will know what to do if it sees one.
4149          */
4150         if (rec) {
4151                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4152                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4153                                 ret = CONTIG_RIGHT;
4154                 } else {
4155                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4156                 }
4157         }
4158
4159         rec = NULL;
4160         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4161                 rec = &el->l_recs[index + 1];
4162         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4163                  path->p_tree_depth > 0) {
4164                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4165                                                         path, &right_cpos);
4166                 if (status)
4167                         goto out;
4168
4169                 if (right_cpos == 0)
4170                         goto out;
4171
4172                 right_path = ocfs2_new_path(path_root_bh(path),
4173                                             path_root_el(path));
4174                 if (!right_path)
4175                         goto out;
4176
4177                 status = ocfs2_find_path(inode, right_path, right_cpos);
4178                 if (status)
4179                         goto out;
4180
4181                 new_el = path_leaf_el(right_path);
4182                 rec = &new_el->l_recs[0];
4183                 if (ocfs2_is_empty_extent(rec)) {
4184                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4185                                 bh = path_leaf_bh(right_path);
4186                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4187                                 ocfs2_error(inode->i_sb,
4188                                             "Extent block #%llu has an "
4189                                             "invalid l_next_free_rec of %d",
4190                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4191                                             le16_to_cpu(new_el->l_next_free_rec));
4192                                 status = -EINVAL;
4193                                 goto out;
4194                         }
4195                         rec = &new_el->l_recs[1];
4196                 }
4197         }
4198
4199         if (rec) {
4200                 enum ocfs2_contig_type contig_type;
4201
4202                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4203
4204                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4205                         ret = CONTIG_LEFTRIGHT;
4206                 else if (ret == CONTIG_NONE)
4207                         ret = contig_type;
4208         }
4209
4210 out:
4211         if (left_path)
4212                 ocfs2_free_path(left_path);
4213         if (right_path)
4214                 ocfs2_free_path(right_path);
4215
4216         return ret;
4217 }
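
/*
 * Illustrative sketch (editor's addition, hypothetical numbers): if the
 * record to the left of split_rec covers clusters [100, 120) and split_rec
 * covers [120, 130) with physically adjacent blocks and the same flags, the
 * first pass above reports CONTIG_RIGHT (split_rec extends its left
 * neighbor).  If the record to the right starts at cluster 130 and is also
 * physically adjacent, the second pass reports CONTIG_LEFT, and the two
 * combine into CONTIG_LEFTRIGHT so the merge code knows it may collapse
 * split_rec into both neighbors.
 */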
4218
4219 static void ocfs2_figure_contig_type(struct inode *inode,
4220                                      struct ocfs2_insert_type *insert,
4221                                      struct ocfs2_extent_list *el,
4222                                      struct ocfs2_extent_rec *insert_rec,
4223                                      struct ocfs2_extent_tree *et)
4224 {
4225         int i;
4226         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4227
4228         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4229
4230         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4231                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4232                                                   insert_rec);
4233                 if (contig_type != CONTIG_NONE) {
4234                         insert->ins_contig_index = i;
4235                         break;
4236                 }
4237         }
4238         insert->ins_contig = contig_type;
4239
4240         if (insert->ins_contig != CONTIG_NONE) {
4241                 struct ocfs2_extent_rec *rec =
4242                                 &el->l_recs[insert->ins_contig_index];
4243                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4244                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4245
4246                 /*
4247                  * The caller might want us to limit the size of extents, so don't
4248                  * report contiguousness if the combined extent would exceed that limit.
4249                  */
4250                 if (et->et_max_leaf_clusters &&
4251                     (len > et->et_max_leaf_clusters))
4252                         insert->ins_contig = CONTIG_NONE;
4253         }
4254 }
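
/*
 * Illustrative sketch (editor's addition, hypothetical numbers): with
 * et_max_leaf_clusters == 100, a contiguous neighbor of 80 clusters and an
 * insert_rec of 30 clusters would merge into a 110-cluster record, so
 * ins_contig is forced back to CONTIG_NONE and the records stay separate.
 * Trees without a limit (et_max_leaf_clusters == 0) skip this check.
 */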
4255
4256 /*
4257  * This should only be called against the rightmost leaf extent list.
4258  *
4259  * ocfs2_figure_appending_type() will figure out whether we'll have to
4260  * insert at the tail of the rightmost leaf.
4261  *
4262  * This should also work against the root extent list for trees with 0
4263  * depth. If we consider the root extent list to be the rightmost leaf node
4264  * then the logic here makes sense.
4265  */
4266 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4267                                         struct ocfs2_extent_list *el,
4268                                         struct ocfs2_extent_rec *insert_rec)
4269 {
4270         int i;
4271         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4272         struct ocfs2_extent_rec *rec;
4273
4274         insert->ins_appending = APPEND_NONE;
4275
4276         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4277
4278         if (!el->l_next_free_rec)
4279                 goto set_tail_append;
4280
4281         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4282                 /* Were all records empty? */
4283                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4284                         goto set_tail_append;
4285         }
4286
4287         i = le16_to_cpu(el->l_next_free_rec) - 1;
4288         rec = &el->l_recs[i];
4289
4290         if (cpos >=
4291             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4292                 goto set_tail_append;
4293
4294         return;
4295
4296 set_tail_append:
4297         insert->ins_appending = APPEND_TAIL;
4298 }
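
/*
 * Illustrative sketch (editor's addition, hypothetical numbers): if the last
 * used record in the rightmost leaf covers clusters [100, 108), an
 * insert_rec starting at cpos 108 or beyond is flagged APPEND_TAIL, while an
 * insert at cpos 90 leaves ins_appending at APPEND_NONE.  A leaf with no
 * used records, or only the empty placeholder record, is always treated as
 * a tail append.
 */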
4299
4300 /*
4301  * Helper function called at the beginning of an insert.
4302  *
4303  * This computes a few things that are commonly used in the process of
4304  * inserting into the btree:
4305  *   - Whether the new extent is contiguous with an existing one.
4306  *   - The current tree depth.
4307  *   - Whether the insert is an appending one.
4308  *   - The total # of free records in the tree.
4309  *
4310  * All of the information is stored on the ocfs2_insert_type
4311  * structure.
4312  */
4313 static int ocfs2_figure_insert_type(struct inode *inode,
4314                                     struct ocfs2_extent_tree *et,
4315                                     struct buffer_head **last_eb_bh,
4316                                     struct ocfs2_extent_rec *insert_rec,
4317                                     int *free_records,
4318                                     struct ocfs2_insert_type *insert)
4319 {
4320         int ret;
4321         struct ocfs2_extent_block *eb;
4322         struct ocfs2_extent_list *el;
4323         struct ocfs2_path *path = NULL;
4324         struct buffer_head *bh = NULL;
4325
4326         insert->ins_split = SPLIT_NONE;
4327
4328         el = et->et_root_el;
4329         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4330
4331         if (el->l_tree_depth) {
4332                 /*
4333                  * If we have tree depth, we read in the
4334                  * rightmost extent block ahead of time as
4335                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4336                  * may want it later.
4337                  */
4338                 ret = ocfs2_read_extent_block(inode,
4339                                               ocfs2_et_get_last_eb_blk(et),
4340                                               &bh);
4341                 if (ret) {
4342                         mlog_exit(ret);
4343                         goto out;
4344                 }
4345                 eb = (struct ocfs2_extent_block *) bh->b_data;
4346                 el = &eb->h_list;
4347         }
4348
4349         /*
4350          * Unless we have a contiguous insert, we'll need to know if
4351          * there is room left in our allocation tree for another
4352          * extent record.
4353          *
4354          * XXX: This test is simplistic, we can search for empty
4355          * extent records too.
4356          */
4357         *free_records = le16_to_cpu(el->l_count) -
4358                 le16_to_cpu(el->l_next_free_rec);
4359
4360         if (!insert->ins_tree_depth) {
4361                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4362                 ocfs2_figure_appending_type(insert, el, insert_rec);
4363                 return 0;
4364         }
4365
4366         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4367         if (!path) {
4368                 ret = -ENOMEM;
4369                 mlog_errno(ret);
4370                 goto out;
4371         }
4372
4373         /*
4374          * In the case that we're inserting past what the tree
4375          * currently accounts for, ocfs2_find_path() will return for
4376          * us the rightmost tree path. This is accounted for below in
4377          * the appending code.
4378          */
4379         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4380         if (ret) {
4381                 mlog_errno(ret);
4382                 goto out;
4383         }
4384
4385         el = path_leaf_el(path);
4386
4387         /*
4388          * Now that we have the path, there are two things we want to determine:
4389          * 1) Contiguousness (also set contig_index if this is so)
4390          *
4391          * 2) Are we doing an append? We can trivially break this up
4392          *     into two types of appends: simple record append, or a
4393          *     rotate inside the tail leaf.
4394          */
4395         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4396
4397         /*
4398          * The insert code isn't quite ready to deal with all cases of
4399          * left contiguousness. Specifically, if it's an insert into
4400          * the 1st record in a leaf, it will require the adjustment of
4401          * cluster count on the last record of the path directly to its
4402          * left. For now, just catch that case and fool the layers
4403          * above us. This works just fine for tree_depth == 0, which
4404          * is why we allow that above.
4405          */
4406         if (insert->ins_contig == CONTIG_LEFT &&
4407             insert->ins_contig_index == 0)
4408                 insert->ins_contig = CONTIG_NONE;
4409
4410         /*
4411          * Ok, so we can simply compare against last_eb to figure out
4412          * whether we were handed the rightmost leaf, which is where
4413          * any appending insert has to land, so maybe we can
4414          * take advantage of that information somehow.
4415          */
4416         if (ocfs2_et_get_last_eb_blk(et) ==
4417             path_leaf_bh(path)->b_blocknr) {
4418                 /*
4419                  * Ok, ocfs2_find_path() returned us the rightmost
4420                  * tree path. This might be an appending insert. There are
4421                  * two cases:
4422                  *    1) We're doing a true append at the tail:
4423                  *      -This might even be off the end of the leaf
4424                  *    2) We're "appending" by rotating in the tail
4425                  */
4426                 ocfs2_figure_appending_type(insert, el, insert_rec);
4427         }
4428
4429 out:
4430         ocfs2_free_path(path);
4431
4432         if (ret == 0)
4433                 *last_eb_bh = bh;
4434         else
4435                 brelse(bh);
4436         return ret;
4437 }
4438
4439 /*
4440  * Insert an extent into an inode btree.
4441  *
4442  * The caller needs to update fe->i_clusters
4443  */
4444 int ocfs2_insert_extent(struct ocfs2_super *osb,
4445                         handle_t *handle,
4446                         struct inode *inode,
4447                         struct ocfs2_extent_tree *et,
4448                         u32 cpos,
4449                         u64 start_blk,
4450                         u32 new_clusters,
4451                         u8 flags,
4452                         struct ocfs2_alloc_context *meta_ac)
4453 {
4454         int status;
4455         int uninitialized_var(free_records);
4456         struct buffer_head *last_eb_bh = NULL;
4457         struct ocfs2_insert_type insert = {0, };
4458         struct ocfs2_extent_rec rec;
4459
4460         mlog(0, "add %u clusters at position %u to inode %llu\n",
4461              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4462
4463         memset(&rec, 0, sizeof(rec));
4464         rec.e_cpos = cpu_to_le32(cpos);
4465         rec.e_blkno = cpu_to_le64(start_blk);
4466         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4467         rec.e_flags = flags;
4468         status = ocfs2_et_insert_check(inode, et, &rec);
4469         if (status) {
4470                 mlog_errno(status);
4471                 goto bail;
4472         }
4473
4474         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4475                                           &free_records, &insert);
4476         if (status < 0) {
4477                 mlog_errno(status);
4478                 goto bail;
4479         }
4480
4481         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4482              "Insert.contig_index: %d, Insert.free_records: %d, "
4483              "Insert.tree_depth: %d\n",
4484              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4485              free_records, insert.ins_tree_depth);
4486
4487         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4488                 status = ocfs2_grow_tree(inode, handle, et,
4489                                          &insert.ins_tree_depth, &last_eb_bh,
4490                                          meta_ac);
4491                 if (status) {
4492                         mlog_errno(status);
4493                         goto bail;
4494                 }
4495         }
4496
4497         /* Finally, we can add clusters. This might rotate the tree for us. */
4498         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4499         if (status < 0)
4500                 mlog_errno(status);
4501         else if (et->et_ops == &ocfs2_dinode_et_ops)
4502                 ocfs2_extent_map_insert_rec(inode, &rec);
4503
4504 bail:
4505         brelse(last_eb_bh);
4506
4507         mlog_exit(status);
4508         return status;
4509 }
4510
4511 /*
4512  * Allocate and add clusters into the extent b-tree.
4513  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4514  * The extent b-tree's root is specified by et, and
4515  * it is not limited to file storage. Any extent tree can use this
4516  * function if it implements the proper ocfs2_extent_tree.
4517  */
4518 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4519                                 struct inode *inode,
4520                                 u32 *logical_offset,
4521                                 u32 clusters_to_add,
4522                                 int mark_unwritten,
4523                                 struct ocfs2_extent_tree *et,
4524                                 handle_t *handle,
4525                                 struct ocfs2_alloc_context *data_ac,
4526                                 struct ocfs2_alloc_context *meta_ac,
4527                                 enum ocfs2_alloc_restarted *reason_ret)
4528 {
4529         int status = 0;
4530         int free_extents;
4531         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4532         u32 bit_off, num_bits;
4533         u64 block;
4534         u8 flags = 0;
4535
4536         BUG_ON(!clusters_to_add);
4537
4538         if (mark_unwritten)
4539                 flags = OCFS2_EXT_UNWRITTEN;
4540
4541         free_extents = ocfs2_num_free_extents(osb, inode, et);
4542         if (free_extents < 0) {
4543                 status = free_extents;
4544                 mlog_errno(status);
4545                 goto leave;
4546         }
4547
4548         /* There are two cases which could cause us to return -EAGAIN in
4549          * the we-need-more-metadata case:
4550          * 1) we haven't reserved *any* metadata
4551          * 2) we are so fragmented that we've needed to add metadata too
4552          *    many times. */
4553         if (!free_extents && !meta_ac) {
4554                 mlog(0, "we haven't reserved any metadata!\n");
4555                 status = -EAGAIN;
4556                 reason = RESTART_META;
4557                 goto leave;
4558         } else if ((!free_extents)
4559                    && (ocfs2_alloc_context_bits_left(meta_ac)
4560                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4561                 mlog(0, "filesystem is really fragmented...\n");
4562                 status = -EAGAIN;
4563                 reason = RESTART_META;
4564                 goto leave;
4565         }
4566
4567         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4568                                         clusters_to_add, &bit_off, &num_bits);
4569         if (status < 0) {
4570                 if (status != -ENOSPC)
4571                         mlog_errno(status);
4572                 goto leave;
4573         }
4574
4575         BUG_ON(num_bits > clusters_to_add);
4576
4577         /* reserve our write early -- insert_extent may update the inode */
4578         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4579                                       OCFS2_JOURNAL_ACCESS_WRITE);
4580         if (status < 0) {
4581                 mlog_errno(status);
4582                 goto leave;
4583         }
4584
4585         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4586         mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4587              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4588         status = ocfs2_insert_extent(osb, handle, inode, et,
4589                                      *logical_offset, block,
4590                                      num_bits, flags, meta_ac);
4591         if (status < 0) {
4592                 mlog_errno(status);
4593                 goto leave;
4594         }
4595
4596         status = ocfs2_journal_dirty(handle, et->et_root_bh);
4597         if (status < 0) {
4598                 mlog_errno(status);
4599                 goto leave;
4600         }
4601
4602         clusters_to_add -= num_bits;
4603         *logical_offset += num_bits;
4604
4605         if (clusters_to_add) {
4606                 mlog(0, "need to alloc once more, wanted = %u\n",
4607                      clusters_to_add);
4608                 status = -EAGAIN;
4609                 reason = RESTART_TRANS;
4610         }
4611
4612 leave:
4613         mlog_exit(status);
4614         if (reason_ret)
4615                 *reason_ret = reason;
4616         return status;
4617 }
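
/*
 * Editor's sketch of how a caller is expected to drive the restart protocol
 * above (hedged pseudocode, not part of the original file; the callers that
 * extend allocations implement the real loop):
 *
 *	while (clusters_to_add) {
 *		status = ocfs2_add_clusters_in_btree(osb, inode, &logical,
 *						     clusters_to_add, ...,
 *						     &reason);
 *		if (status == -EAGAIN && reason == RESTART_META)
 *			;	// re-reserve metadata, then retry
 *		else if (status == -EAGAIN && reason == RESTART_TRANS)
 *			;	// extend/restart the transaction, then retry
 *		else
 *			break;	// done, or a real error
 *	}
 *
 * The snippet only shows the meaning of the two RESTART_* codes; credit and
 * reservation management is left to the caller.
 */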
4618
4619 static void ocfs2_make_right_split_rec(struct super_block *sb,
4620                                        struct ocfs2_extent_rec *split_rec,
4621                                        u32 cpos,
4622                                        struct ocfs2_extent_rec *rec)
4623 {
4624         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4625         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4626
4627         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4628
4629         split_rec->e_cpos = cpu_to_le32(cpos);
4630         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4631
4632         split_rec->e_blkno = rec->e_blkno;
4633         le64_add_cpu(&split_rec->e_blkno,
4634                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4635
4636         split_rec->e_flags = rec->e_flags;
4637 }
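
/*
 * Illustrative sketch (editor's addition, hypothetical numbers): given a
 * record covering clusters [100, 150) that starts at block B, a split cpos
 * of 120 produces a right-hand split_rec covering [120, 150) - 30 leaf
 * clusters - whose e_blkno is B plus the blocks spanned by the 20 clusters
 * that remain on the left.  Flags are carried over unchanged.
 */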
4638
4639 static int ocfs2_split_and_insert(struct inode *inode,
4640                                   handle_t *handle,
4641                                   struct ocfs2_path *path,
4642                                   struct ocfs2_extent_tree *et,
4643                                   struct buffer_head **last_eb_bh,
4644                                   int split_index,
4645                                   struct ocfs2_extent_rec *orig_split_rec,
4646                                   struct ocfs2_alloc_context *meta_ac)
4647 {
4648         int ret = 0, depth;
4649         unsigned int insert_range, rec_range, do_leftright = 0;
4650         struct ocfs2_extent_rec tmprec;
4651         struct ocfs2_extent_list *rightmost_el;
4652         struct ocfs2_extent_rec rec;
4653         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4654         struct ocfs2_insert_type insert;
4655         struct ocfs2_extent_block *eb;
4656
4657 leftright:
4658         /*
4659          * Store a copy of the record on the stack - it might move
4660          * around as the tree is manipulated below.
4661          */
4662         rec = path_leaf_el(path)->l_recs[split_index];
4663
4664         rightmost_el = et->et_root_el;
4665
4666         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4667         if (depth) {
4668                 BUG_ON(!(*last_eb_bh));
4669                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4670                 rightmost_el = &eb->h_list;
4671         }
4672
4673         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4674             le16_to_cpu(rightmost_el->l_count)) {
4675                 ret = ocfs2_grow_tree(inode, handle, et,
4676                                       &depth, last_eb_bh, meta_ac);
4677                 if (ret) {
4678                         mlog_errno(ret);
4679                         goto out;
4680                 }
4681         }
4682
4683         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4684         insert.ins_appending = APPEND_NONE;
4685         insert.ins_contig = CONTIG_NONE;
4686         insert.ins_tree_depth = depth;
4687
4688         insert_range = le32_to_cpu(split_rec.e_cpos) +
4689                 le16_to_cpu(split_rec.e_leaf_clusters);
4690         rec_range = le32_to_cpu(rec.e_cpos) +
4691                 le16_to_cpu(rec.e_leaf_clusters);
4692
4693         if (split_rec.e_cpos == rec.e_cpos) {
4694                 insert.ins_split = SPLIT_LEFT;
4695         } else if (insert_range == rec_range) {
4696                 insert.ins_split = SPLIT_RIGHT;
4697         } else {
4698                 /*
4699                  * Left/right split. We fake this as a right split
4700                  * first and then make a second pass as a left split.
4701                  */
4702                 insert.ins_split = SPLIT_RIGHT;
4703
4704                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4705                                            &rec);
4706
4707                 split_rec = tmprec;
4708
4709                 BUG_ON(do_leftright);
4710                 do_leftright = 1;
4711         }
4712
4713         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4714         if (ret) {
4715                 mlog_errno(ret);
4716                 goto out;
4717         }
4718
4719         if (do_leftright == 1) {
4720                 u32 cpos;
4721                 struct ocfs2_extent_list *el;
4722
4723                 do_leftright++;
4724                 split_rec = *orig_split_rec;
4725
4726                 ocfs2_reinit_path(path, 1);
4727
4728                 cpos = le32_to_cpu(split_rec.e_cpos);
4729                 ret = ocfs2_find_path(inode, path, cpos);
4730                 if (ret) {
4731                         mlog_errno(ret);
4732                         goto out;
4733                 }
4734
4735                 el = path_leaf_el(path);
4736                 split_index = ocfs2_search_extent_list(el, cpos);
4737                 goto leftright;
4738         }
4739 out:
4740
4741         return ret;
4742 }
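
/*
 * Illustrative sketch (editor's addition, hypothetical numbers): to split
 * [120, 140) out of a record covering [100, 160), the code above cannot
 * express a "middle" split directly.  It first fakes a right split at 140 -
 * carving [140, 160) into its own record - then restarts at the "leftright"
 * label with the original split_rec restored, re-finds the leaf (the first
 * insert may have moved things around) and splits the remaining [100, 140)
 * record at 120.  The end result is three records: [100, 120), [120, 140)
 * and [140, 160).
 */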
4743
4744 /*
4745  * Mark part or all of the extent record at split_index in the leaf
4746  * pointed to by path as written. This removes the unwritten
4747  * extent flag.
4748  *
4749  * Care is taken to handle contiguousness so as to not grow the tree.
4750  *
4751  * meta_ac is not strictly necessary - we only truly need it if growth
4752  * of the tree is required. All other cases will degrade into a less
4753  * optimal tree layout.
4754  *
4755  * last_eb_bh should be the rightmost leaf block for any extent
4756  * btree. Since a split may grow the tree or a merge might shrink it,
4757  * the caller cannot trust the contents of that buffer after this call.
4758  *
4759  * This code is optimized for readability - several passes might be
4760  * made over certain portions of the tree. All of those blocks will
4761  * have been brought into cache (and pinned via the journal), so the
4762  * extra overhead is not expressed in terms of disk reads.
4763  */
4764 static int __ocfs2_mark_extent_written(struct inode *inode,
4765                                        struct ocfs2_extent_tree *et,
4766                                        handle_t *handle,
4767                                        struct ocfs2_path *path,
4768                                        int split_index,
4769                                        struct ocfs2_extent_rec *split_rec,
4770                                        struct ocfs2_alloc_context *meta_ac,
4771                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4772 {
4773         int ret = 0;
4774         struct ocfs2_extent_list *el = path_leaf_el(path);
4775         struct buffer_head *last_eb_bh = NULL;
4776         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4777         struct ocfs2_merge_ctxt ctxt;
4778         struct ocfs2_extent_list *rightmost_el;
4779
4780         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4781                 ret = -EIO;
4782                 mlog_errno(ret);
4783                 goto out;
4784         }
4785
4786         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4787             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4788              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4789                 ret = -EIO;
4790                 mlog_errno(ret);
4791                 goto out;
4792         }
4793
4794         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4795                                                             split_index,
4796                                                             split_rec);
4797
4798         /*
4799          * The core merge / split code wants to know how much room is
4800          * left in this inode's allocation tree, so we pass the
4801          * rightmost extent list.
4802          */
4803         if (path->p_tree_depth) {
4804                 struct ocfs2_extent_block *eb;
4805
4806                 ret = ocfs2_read_extent_block(inode,
4807                                               ocfs2_et_get_last_eb_blk(et),
4808                                               &last_eb_bh);
4809                 if (ret) {
4810                         mlog_exit(ret);
4811                         goto out;
4812                 }
4813
4814                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4815                 rightmost_el = &eb->h_list;
4816         } else
4817                 rightmost_el = path_root_el(path);
4818
4819         if (rec->e_cpos == split_rec->e_cpos &&
4820             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4821                 ctxt.c_split_covers_rec = 1;
4822         else
4823                 ctxt.c_split_covers_rec = 0;
4824
4825         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4826
4827         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4828              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4829              ctxt.c_split_covers_rec);
4830
4831         if (ctxt.c_contig_type == CONTIG_NONE) {
4832                 if (ctxt.c_split_covers_rec)
4833                         el->l_recs[split_index] = *split_rec;
4834                 else
4835                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4836                                                      &last_eb_bh, split_index,
4837                                                      split_rec, meta_ac);
4838                 if (ret)
4839                         mlog_errno(ret);
4840         } else {
4841                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4842                                                 split_index, split_rec,
4843                                                 dealloc, &ctxt, et);
4844                 if (ret)
4845                         mlog_errno(ret);
4846         }
4847
4848 out:
4849         brelse(last_eb_bh);
4850         return ret;
4851 }
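
/*
 * Editor's note - a summary of the branches above, added for clarity: with
 * no contiguous neighbors, a split_rec that exactly covers the existing
 * record simply overwrites it in place, while a partial cover goes through
 * ocfs2_split_and_insert().  Any contiguity at all is handed to
 * ocfs2_try_to_merge_extent(), which may shrink the tree and queue freed
 * extent blocks on the dealloc context.
 */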
4852
4853 /*
4854  * Mark the already-existing extent at cpos as written for len clusters.
4855  *
4856  * If the existing extent is larger than the request, initiate a
4857  * split. An attempt will be made at merging with adjacent extents.
4858  *
4859  * The caller is responsible for passing down meta_ac if we'll need it.
4860  */
4861 int ocfs2_mark_extent_written(struct inode *inode,
4862                               struct ocfs2_extent_tree *et,
4863                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4864                               struct ocfs2_alloc_context *meta_ac,
4865                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4866 {
4867         int ret, index;
4868         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4869         struct ocfs2_extent_rec split_rec;
4870         struct ocfs2_path *left_path = NULL;
4871         struct ocfs2_extent_list *el;
4872
4873         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4874              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4875
4876         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4877                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4878                             "that are being written to, but the feature bit "
4879                             "is not set in the super block.",
4880                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4881                 ret = -EROFS;
4882                 goto out;
4883         }
4884
4885         /*
4886          * XXX: This should be fixed up so that we just re-insert the
4887          * next extent records.
4888          *
4889          * XXX: This is a hack on the extent tree, maybe it should be
4890          * an op?
4891          */
4892         if (et->et_ops == &ocfs2_dinode_et_ops)
4893                 ocfs2_extent_map_trunc(inode, 0);
4894
4895         left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4896         if (!left_path) {
4897                 ret = -ENOMEM;
4898                 mlog_errno(ret);
4899                 goto out;
4900         }
4901
4902         ret = ocfs2_find_path(inode, left_path, cpos);
4903         if (ret) {
4904                 mlog_errno(ret);
4905                 goto out;
4906         }
4907         el = path_leaf_el(left_path);
4908
4909         index = ocfs2_search_extent_list(el, cpos);
4910         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4911                 ocfs2_error(inode->i_sb,
4912                             "Inode %llu has an extent at cpos %u which can no "
4913                             "longer be found.\n",
4914                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4915                 ret = -EROFS;
4916                 goto out;
4917         }
4918
4919         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4920         split_rec.e_cpos = cpu_to_le32(cpos);
4921         split_rec.e_leaf_clusters = cpu_to_le16(len);
4922         split_rec.e_blkno = cpu_to_le64(start_blkno);
4923         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4924         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4925
4926         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4927                                           index, &split_rec, meta_ac,
4928                                           dealloc);
4929         if (ret)
4930                 mlog_errno(ret);
4931
4932 out:
4933         ocfs2_free_path(left_path);
4934         return ret;
4935 }
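
/*
 * Editor's note - a hedged usage sketch, not part of the original file:
 * after data has actually been written into an unwritten (preallocated)
 * extent, a caller that already holds a transaction would do roughly
 *
 *	ret = ocfs2_mark_extent_written(inode, &et, handle, cpos, len,
 *					phys, meta_ac, &dealloc);
 *
 * where (cpos, len) is the just-written cluster range, phys is its starting
 * physical cluster, meta_ac was reserved in case a split grows the tree,
 * and the dealloc context collects anything freed by merging.
 */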
4936
4937 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4938                             handle_t *handle, struct ocfs2_path *path,
4939                             int index, u32 new_range,
4940                             struct ocfs2_alloc_context *meta_ac)
4941 {
4942         int ret, depth, credits = handle->h_buffer_credits;
4943         struct buffer_head *last_eb_bh = NULL;
4944         struct ocfs2_extent_block *eb;
4945         struct ocfs2_extent_list *rightmost_el, *el;
4946         struct ocfs2_extent_rec split_rec;
4947         struct ocfs2_extent_rec *rec;
4948         struct ocfs2_insert_type insert;
4949
4950         /*
4951          * Setup the record to split before we grow the tree.
4952          */
4953         el = path_leaf_el(path);
4954         rec = &el->l_recs[index];
4955         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4956
4957         depth = path->p_tree_depth;
4958         if (depth > 0) {
4959                 ret = ocfs2_read_extent_block(inode,
4960                                               ocfs2_et_get_last_eb_blk(et),
4961                                               &last_eb_bh);
4962                 if (ret < 0) {
4963                         mlog_errno(ret);
4964                         goto out;
4965                 }
4966
4967                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4968                 rightmost_el = &eb->h_list;
4969         } else
4970                 rightmost_el = path_leaf_el(path);
4971
4972         credits += path->p_tree_depth +
4973                    ocfs2_extend_meta_needed(et->et_root_el);
4974         ret = ocfs2_extend_trans(handle, credits);
4975         if (ret) {
4976                 mlog_errno(ret);
4977                 goto out;
4978         }
4979
4980         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4981             le16_to_cpu(rightmost_el->l_count)) {
4982                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4983                                       meta_ac);
4984                 if (ret) {
4985                         mlog_errno(ret);
4986                         goto out;
4987                 }
4988         }
4989
4990         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4991         insert.ins_appending = APPEND_NONE;
4992         insert.ins_contig = CONTIG_NONE;
4993         insert.ins_split = SPLIT_RIGHT;
4994         insert.ins_tree_depth = depth;
4995
4996         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4997         if (ret)
4998                 mlog_errno(ret);
4999
5000 out:
5001         brelse(last_eb_bh);
5002         return ret;
5003 }
5004
5005 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5006                               struct ocfs2_path *path, int index,
5007                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5008                               u32 cpos, u32 len,
5009                               struct ocfs2_extent_tree *et)
5010 {
5011         int ret;
5012         u32 left_cpos, rec_range, trunc_range;
5013         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5014         struct super_block *sb = inode->i_sb;
5015         struct ocfs2_path *left_path = NULL;
5016         struct ocfs2_extent_list *el = path_leaf_el(path);
5017         struct ocfs2_extent_rec *rec;
5018         struct ocfs2_extent_block *eb;
5019
5020         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5021                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5022                 if (ret) {
5023                         mlog_errno(ret);
5024                         goto out;
5025                 }
5026
5027                 index--;
5028         }
5029
5030         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5031             path->p_tree_depth) {
5032                 /*
5033                  * Check whether this is the rightmost tree record. If
5034                  * we remove all of this record or part of its right
5035                  * edge then an update of the record lengths above it
5036                  * will be required.
5037                  */
5038                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5039                 if (eb->h_next_leaf_blk == 0)
5040                         is_rightmost_tree_rec = 1;
5041         }
5042
5043         rec = &el->l_recs[index];
5044         if (index == 0 && path->p_tree_depth &&
5045             le32_to_cpu(rec->e_cpos) == cpos) {
5046                 /*
5047                  * Changing the leftmost offset (via partial or whole
5048                  * record truncate) of an interior (or rightmost) path
5049                  * means we have to update the subtree that is formed
5050                  * by this leaf and the one to its left.
5051                  *
5052                  * There are two cases we can skip:
5053                  *   1) Path is the leftmost one in our inode tree.
5054                  *   2) The leaf is rightmost and will be empty after
5055                  *      we remove the extent record - the rotate code
5056                  *      knows how to update the newly formed edge.
5057                  */
5058
5059                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5060                                                     &left_cpos);
5061                 if (ret) {
5062                         mlog_errno(ret);
5063                         goto out;
5064                 }
5065
5066                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5067                         left_path = ocfs2_new_path(path_root_bh(path),
5068                                                    path_root_el(path));
5069                         if (!left_path) {
5070                                 ret = -ENOMEM;
5071                                 mlog_errno(ret);
5072                                 goto out;
5073                         }
5074
5075                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5076                         if (ret) {
5077                                 mlog_errno(ret);
5078                                 goto out;
5079                         }
5080                 }
5081         }
5082
5083         ret = ocfs2_extend_rotate_transaction(handle, 0,
5084                                               handle->h_buffer_credits,
5085                                               path);
5086         if (ret) {
5087                 mlog_errno(ret);
5088                 goto out;
5089         }
5090
5091         ret = ocfs2_journal_access_path(inode, handle, path);
5092         if (ret) {
5093                 mlog_errno(ret);
5094                 goto out;
5095         }
5096
5097         ret = ocfs2_journal_access_path(inode, handle, left_path);
5098         if (ret) {
5099                 mlog_errno(ret);
5100                 goto out;
5101         }
5102
5103         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5104         trunc_range = cpos + len;
5105
5106         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5107                 int next_free;
5108
5109                 memset(rec, 0, sizeof(*rec));
5110                 ocfs2_cleanup_merge(el, index);
5111                 wants_rotate = 1;
5112
5113                 next_free = le16_to_cpu(el->l_next_free_rec);
5114                 if (is_rightmost_tree_rec && next_free > 1) {
5115                         /*
5116                          * We skip the edge update if this path will
5117                          * be deleted by the rotate code.
5118                          */
5119                         rec = &el->l_recs[next_free - 1];
5120                         ocfs2_adjust_rightmost_records(inode, handle, path,
5121                                                        rec);
5122                 }
5123         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5124                 /* Remove leftmost portion of the record. */
5125                 le32_add_cpu(&rec->e_cpos, len);
5126                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5127                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5128         } else if (rec_range == trunc_range) {
5129                 /* Remove rightmost portion of the record */
5130                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5131                 if (is_rightmost_tree_rec)
5132                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5133         } else {
5134                 /* Caller should have trapped this. */
5135                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5136                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5137                      le32_to_cpu(rec->e_cpos),
5138                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5139                 BUG();
5140         }
5141
5142         if (left_path) {
5143                 int subtree_index;
5144
5145                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5146                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5147                                            subtree_index);
5148         }
5149
5150         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5151
5152         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5153         if (ret) {
5154                 mlog_errno(ret);
5155                 goto out;
5156         }
5157
5158 out:
5159         ocfs2_free_path(left_path);
5160         return ret;
5161 }
5162
5163 int ocfs2_remove_extent(struct inode *inode,
5164                         struct ocfs2_extent_tree *et,
5165                         u32 cpos, u32 len, handle_t *handle,
5166                         struct ocfs2_alloc_context *meta_ac,
5167                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5168 {
5169         int ret, index;
5170         u32 rec_range, trunc_range;
5171         struct ocfs2_extent_rec *rec;
5172         struct ocfs2_extent_list *el;
5173         struct ocfs2_path *path = NULL;
5174
5175         ocfs2_extent_map_trunc(inode, 0);
5176
5177         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
5178         if (!path) {
5179                 ret = -ENOMEM;
5180                 mlog_errno(ret);
5181                 goto out;
5182         }
5183
5184         ret = ocfs2_find_path(inode, path, cpos);
5185         if (ret) {
5186                 mlog_errno(ret);
5187                 goto out;
5188         }
5189
5190         el = path_leaf_el(path);
5191         index = ocfs2_search_extent_list(el, cpos);
5192         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5193                 ocfs2_error(inode->i_sb,
5194                             "Inode %llu has an extent at cpos %u which can no "
5195                             "longer be found.\n",
5196                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5197                 ret = -EROFS;
5198                 goto out;
5199         }
5200
5201         /*
5202          * We have 3 cases of extent removal:
5203          *   1) Range covers the entire extent rec
5204          *   2) Range begins or ends on one edge of the extent rec
5205          *   3) Range is in the middle of the extent rec (no shared edges)
5206          *
5207          * For case 1 we remove the extent rec and left rotate to
5208          * fill the hole.
5209          *
5210          * For case 2 we just shrink the existing extent rec, with a
5211          * tree update if the shrinking edge is also the edge of an
5212          * extent block.
5213          *
5214          * For case 3 we do a right split to turn the extent rec into
5215          * something case 2 can handle.
5216          */
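        /*
         * Illustrative sketch (editor's addition, hypothetical numbers):
         * with an existing record covering clusters [100, 200), removing
         * (cpos 100, len 100) is case 1, removing (cpos 100, len 20) or
         * (cpos 180, len 20) is case 2, and removing (cpos 120, len 30) is
         * case 3: the record is first split at 150 into [100, 150) and
         * [150, 200), after which trimming [120, 150) off the first piece
         * is an ordinary case 2 truncate.
         */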
5217         rec = &el->l_recs[index];
5218         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5219         trunc_range = cpos + len;
5220
5221         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5222
5223         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5224              "(cpos %u, len %u)\n",
5225              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5226              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5227
5228         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5229                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5230                                          cpos, len, et);
5231                 if (ret) {
5232                         mlog_errno(ret);
5233                         goto out;
5234                 }
5235         } else {
5236                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5237                                        trunc_range, meta_ac);
5238                 if (ret) {
5239                         mlog_errno(ret);
5240                         goto out;
5241                 }
5242
5243                 /*
5244                  * The split could have manipulated the tree enough to
5245                  * move the record location, so we have to look for it again.
5246                  */
5247                 ocfs2_reinit_path(path, 1);
5248
5249                 ret = ocfs2_find_path(inode, path, cpos);
5250                 if (ret) {
5251                         mlog_errno(ret);
5252                         goto out;
5253                 }
5254
5255                 el = path_leaf_el(path);
5256                 index = ocfs2_search_extent_list(el, cpos);
5257                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5258                         ocfs2_error(inode->i_sb,
5259                                     "Inode %llu: split at cpos %u lost record.",
5260                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5261                                     cpos);
5262                         ret = -EROFS;
5263                         goto out;
5264                 }
5265
5266                 /*
5267                  * Double check our values here. If anything is fishy,
5268                  * it's easier to catch it at the top level.
5269                  */
5270                 rec = &el->l_recs[index];
5271                 rec_range = le32_to_cpu(rec->e_cpos) +
5272                         ocfs2_rec_clusters(el, rec);
5273                 if (rec_range != trunc_range) {
5274                         ocfs2_error(inode->i_sb,
5275                                     "Inode %llu: error after split at cpos %u, "
5276                                     "trunc len %u, existing record is (%u,%u)",
5277                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5278                                     cpos, len, le32_to_cpu(rec->e_cpos),
5279                                     ocfs2_rec_clusters(el, rec));
5280                         ret = -EROFS;
5281                         goto out;
5282                 }
5283
5284                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5285                                          cpos, len, et);
5286                 if (ret) {
5287                         mlog_errno(ret);
5288                         goto out;
5289                 }
5290         }
5291
5292 out:
5293         ocfs2_free_path(path);
5294         return ret;
5295 }
5296
5297 int ocfs2_remove_btree_range(struct inode *inode,
5298                              struct ocfs2_extent_tree *et,
5299                              u32 cpos, u32 phys_cpos, u32 len,
5300                              struct ocfs2_cached_dealloc_ctxt *dealloc)
5301 {
5302         int ret;
5303         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5304         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5305         struct inode *tl_inode = osb->osb_tl_inode;
5306         handle_t *handle;
5307         struct ocfs2_alloc_context *meta_ac = NULL;
5308
5309         ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5310         if (ret) {
5311                 mlog_errno(ret);
5312                 return ret;
5313         }
5314
5315         mutex_lock(&tl_inode->i_mutex);
5316
5317         if (ocfs2_truncate_log_needs_flush(osb)) {
5318                 ret = __ocfs2_flush_truncate_log(osb);
5319                 if (ret < 0) {
5320                         mlog_errno(ret);
5321                         goto out;
5322                 }
5323         }
5324
5325         handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
5326         if (IS_ERR(handle)) {
5327                 ret = PTR_ERR(handle);
5328                 mlog_errno(ret);
5329                 goto out;
5330         }
5331
5332         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
5333                                    OCFS2_JOURNAL_ACCESS_WRITE);
5334         if (ret) {
5335                 mlog_errno(ret);
5336                 goto out;
5337         }
5338
5339         ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5340                                   dealloc);
5341         if (ret) {
5342                 mlog_errno(ret);
5343                 goto out_commit;
5344         }
5345
5346         ocfs2_et_update_clusters(inode, et, -len);
5347
5348         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5349         if (ret) {
5350                 mlog_errno(ret);
5351                 goto out_commit;
5352         }
5353
5354         ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5355         if (ret)
5356                 mlog_errno(ret);
5357
5358 out_commit:
5359         ocfs2_commit_trans(osb, handle);
5360 out:
5361         mutex_unlock(&tl_inode->i_mutex);
5362
5363         if (meta_ac)
5364                 ocfs2_free_alloc_context(meta_ac);
5365
5366         return ret;
5367 }
5368
5369 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5370 {
5371         struct buffer_head *tl_bh = osb->osb_tl_bh;
5372         struct ocfs2_dinode *di;
5373         struct ocfs2_truncate_log *tl;
5374
5375         di = (struct ocfs2_dinode *) tl_bh->b_data;
5376         tl = &di->id2.i_dealloc;
5377
5378         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5379                         "slot %d, invalid truncate log parameters: used = "
5380                         "%u, count = %u\n", osb->slot_num,
5381                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5382         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5383 }
5384
5385 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5386                                            unsigned int new_start)
5387 {
5388         unsigned int tail_index;
5389         unsigned int current_tail;
5390
5391         /* No records, nothing to coalesce */
5392         if (!le16_to_cpu(tl->tl_used))
5393                 return 0;
5394
5395         tail_index = le16_to_cpu(tl->tl_used) - 1;
5396         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5397         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5398
5399         return current_tail == new_start;
5400 }
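
/*
 * Illustrative sketch (editor's addition, hypothetical numbers): if the
 * current tail record is (t_start = 100, t_clusters = 50), it ends at
 * cluster 150.  A new range starting at cluster 150 can therefore be
 * coalesced - ocfs2_truncate_log_append() simply grows the existing
 * record's t_clusters instead of consuming another log slot.
 */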
5401
5402 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5403                               handle_t *handle,
5404                               u64 start_blk,
5405                               unsigned int num_clusters)
5406 {
5407         int status, index;
5408         unsigned int start_cluster, tl_count;
5409         struct inode *tl_inode = osb->osb_tl_inode;
5410         struct buffer_head *tl_bh = osb->osb_tl_bh;
5411         struct ocfs2_dinode *di;
5412         struct ocfs2_truncate_log *tl;
5413
5414         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5415                    (unsigned long long)start_blk, num_clusters);
5416
5417         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5418
5419         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5420
5421         di = (struct ocfs2_dinode *) tl_bh->b_data;
5422
5423         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5424          * by the underlying call to ocfs2_read_inode_block(), so any
5425          * corruption is a code bug */
5426         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5427
5428         tl = &di->id2.i_dealloc;
5429         tl_count = le16_to_cpu(tl->tl_count);
5430         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5431                         tl_count == 0,
5432                         "Truncate record count on #%llu invalid: "
5433                         "wanted %u, actual %u\n",
5434                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5435                         ocfs2_truncate_recs_per_inode(osb->sb),
5436                         le16_to_cpu(tl->tl_count));
5437
5438         /* Caller should have known to flush before calling us. */
5439         index = le16_to_cpu(tl->tl_used);
5440         if (index >= tl_count) {
5441                 status = -ENOSPC;
5442                 mlog_errno(status);
5443                 goto bail;
5444         }
5445
5446         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5447                                       OCFS2_JOURNAL_ACCESS_WRITE);
5448         if (status < 0) {
5449                 mlog_errno(status);
5450                 goto bail;
5451         }
5452
5453         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5454              "%llu (index = %d)\n", num_clusters, start_cluster,
5455              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5456
5457         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5458                 /*
5459                  * Move index back to the record we are coalescing with.
5460                  * ocfs2_truncate_log_can_coalesce() guarantees tl_used is nonzero.
5461                  */
5462                 index--;
5463
5464                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5465                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5466                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5467                      num_clusters);
5468         } else {
5469                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5470                 tl->tl_used = cpu_to_le16(index + 1);
5471         }
5472         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5473
5474         status = ocfs2_journal_dirty(handle, tl_bh);
5475         if (status < 0) {
5476                 mlog_errno(status);
5477                 goto bail;
5478         }
5479
5480 bail:
5481         mlog_exit(status);
5482         return status;
5483 }
5484
5485 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5486                                          handle_t *handle,
5487                                          struct inode *data_alloc_inode,
5488                                          struct buffer_head *data_alloc_bh)
5489 {
5490         int status = 0;
5491         int i;
5492         unsigned int num_clusters;
5493         u64 start_blk;
5494         struct ocfs2_truncate_rec rec;
5495         struct ocfs2_dinode *di;
5496         struct ocfs2_truncate_log *tl;
5497         struct inode *tl_inode = osb->osb_tl_inode;
5498         struct buffer_head *tl_bh = osb->osb_tl_bh;
5499
5500         mlog_entry_void();
5501
5502         di = (struct ocfs2_dinode *) tl_bh->b_data;
5503         tl = &di->id2.i_dealloc;
5504         i = le16_to_cpu(tl->tl_used) - 1;
5505         while (i >= 0) {
5506                 /* Caller has given us at least enough credits to
5507                  * update the truncate log dinode */
5508                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5509                                               OCFS2_JOURNAL_ACCESS_WRITE);
5510                 if (status < 0) {
5511                         mlog_errno(status);
5512                         goto bail;
5513                 }
5514
5515                 tl->tl_used = cpu_to_le16(i);
5516
5517                 status = ocfs2_journal_dirty(handle, tl_bh);
5518                 if (status < 0) {
5519                         mlog_errno(status);
5520                         goto bail;
5521                 }
5522
5523                 /* TODO: Perhaps we can calculate the bulk of the
5524                  * credits up front rather than extending like
5525                  * this. */
5526                 status = ocfs2_extend_trans(handle,
5527                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5528                 if (status < 0) {
5529                         mlog_errno(status);
5530                         goto bail;
5531                 }
5532
5533                 rec = tl->tl_recs[i];
5534                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5535                                                     le32_to_cpu(rec.t_start));
5536                 num_clusters = le32_to_cpu(rec.t_clusters);
5537
5538                 /* if start_blk is not set, we ignore the record as
5539                  * invalid. */
5540                 if (start_blk) {
5541                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5542                              i, le32_to_cpu(rec.t_start), num_clusters);
5543
5544                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5545                                                      data_alloc_bh, start_blk,
5546                                                      num_clusters);
5547                         if (status < 0) {
5548                                 mlog_errno(status);
5549                                 goto bail;
5550                         }
5551                 }
5552                 i--;
5553         }
5554
5555 bail:
5556         mlog_exit(status);
5557         return status;
5558 }
5559
5560 /* Expects you to already be holding tl_inode->i_mutex */
5561 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5562 {
5563         int status;
5564         unsigned int num_to_flush;
5565         handle_t *handle;
5566         struct inode *tl_inode = osb->osb_tl_inode;
5567         struct inode *data_alloc_inode = NULL;
5568         struct buffer_head *tl_bh = osb->osb_tl_bh;
5569         struct buffer_head *data_alloc_bh = NULL;
5570         struct ocfs2_dinode *di;
5571         struct ocfs2_truncate_log *tl;
5572
5573         mlog_entry_void();
5574
5575         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5576
5577         di = (struct ocfs2_dinode *) tl_bh->b_data;
5578
5579         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5580          * by the underlying call to ocfs2_read_inode_block(), so any
5581          * corruption is a code bug */
5582         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5583
5584         tl = &di->id2.i_dealloc;
5585         num_to_flush = le16_to_cpu(tl->tl_used);
5586         mlog(0, "Flush %u records from truncate log #%llu\n",
5587              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5588         if (!num_to_flush) {
5589                 status = 0;
5590                 goto out;
5591         }
5592
5593         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5594                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5595                                                        OCFS2_INVALID_SLOT);
5596         if (!data_alloc_inode) {
5597                 status = -EINVAL;
5598                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5599                 goto out;
5600         }
5601
5602         mutex_lock(&data_alloc_inode->i_mutex);
5603
5604         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5605         if (status < 0) {
5606                 mlog_errno(status);
5607                 goto out_mutex;
5608         }
5609
5610         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5611         if (IS_ERR(handle)) {
5612                 status = PTR_ERR(handle);
5613                 mlog_errno(status);
5614                 goto out_unlock;
5615         }
5616
5617         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5618                                                data_alloc_bh);
5619         if (status < 0)
5620                 mlog_errno(status);
5621
5622         ocfs2_commit_trans(osb, handle);
5623
5624 out_unlock:
5625         brelse(data_alloc_bh);
5626         ocfs2_inode_unlock(data_alloc_inode, 1);
5627
5628 out_mutex:
5629         mutex_unlock(&data_alloc_inode->i_mutex);
5630         iput(data_alloc_inode);
5631
5632 out:
5633         mlog_exit(status);
5634         return status;
5635 }
5636
5637 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5638 {
5639         int status;
5640         struct inode *tl_inode = osb->osb_tl_inode;
5641
5642         mutex_lock(&tl_inode->i_mutex);
5643         status = __ocfs2_flush_truncate_log(osb);
5644         mutex_unlock(&tl_inode->i_mutex);
5645
5646         return status;
5647 }
5648
5649 static void ocfs2_truncate_log_worker(struct work_struct *work)
5650 {
5651         int status;
5652         struct ocfs2_super *osb =
5653                 container_of(work, struct ocfs2_super,
5654                              osb_truncate_log_wq.work);
5655
5656         mlog_entry_void();
5657
5658         status = ocfs2_flush_truncate_log(osb);
5659         if (status < 0)
5660                 mlog_errno(status);
5661         else
5662                 ocfs2_init_inode_steal_slot(osb);
5663
5664         mlog_exit(status);
5665 }
5666
5667 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5668 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5669                                        int cancel)
5670 {
5671         if (osb->osb_tl_inode) {
5672                 /* We want to push off log flushes while truncates are
5673                  * still running. */
5674                 if (cancel)
5675                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5676
5677                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5678                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5679         }
5680 }
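
/*
 * A hedged usage sketch: a path that has just appended truncate records
 * typically re-arms the delayed flush with cancel set, so the worker does
 * not fire while the truncate is still in progress.  The wrapper name is
 * illustrative only.
 */
static void example_push_back_tl_flush(struct ocfs2_super *osb)
{
        /* Drop any pending work, then schedule a fresh flush for
         * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL from now. */
        ocfs2_schedule_truncate_log_flush(osb, 1);
}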
5681
5682 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5683                                        int slot_num,
5684                                        struct inode **tl_inode,
5685                                        struct buffer_head **tl_bh)
5686 {
5687         int status;
5688         struct inode *inode = NULL;
5689         struct buffer_head *bh = NULL;
5690
5691         inode = ocfs2_get_system_file_inode(osb,
5692                                            TRUNCATE_LOG_SYSTEM_INODE,
5693                                            slot_num);
5694         if (!inode) {
5695                 status = -EINVAL;
5696                 mlog(ML_ERROR, "Could not load truncate log inode!\n");
5697                 goto bail;
5698         }
5699
5700         status = ocfs2_read_inode_block(inode, &bh);
5701         if (status < 0) {
5702                 iput(inode);
5703                 mlog_errno(status);
5704                 goto bail;
5705         }
5706
5707         *tl_inode = inode;
5708         *tl_bh    = bh;
5709 bail:
5710         mlog_exit(status);
5711         return status;
5712 }
5713
5714 /* called during the 1st stage of node recovery. we stamp a clean
5715  * truncate log and pass back a copy for processing later. if the
5716  * truncate log does not require processing, *tl_copy is set to
5717  * NULL. */
5718 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5719                                       int slot_num,
5720                                       struct ocfs2_dinode **tl_copy)
5721 {
5722         int status;
5723         struct inode *tl_inode = NULL;
5724         struct buffer_head *tl_bh = NULL;
5725         struct ocfs2_dinode *di;
5726         struct ocfs2_truncate_log *tl;
5727
5728         *tl_copy = NULL;
5729
5730         mlog(0, "recover truncate log from slot %d\n", slot_num);
5731
5732         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5733         if (status < 0) {
5734                 mlog_errno(status);
5735                 goto bail;
5736         }
5737
5738         di = (struct ocfs2_dinode *) tl_bh->b_data;
5739
5740         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5741          * validated by the underlying call to ocfs2_read_inode_block(),
5742          * so any corruption is a code bug */
5743         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5744
5745         tl = &di->id2.i_dealloc;
5746         if (le16_to_cpu(tl->tl_used)) {
5747                 mlog(0, "We'll have %u logs to recover\n",
5748                      le16_to_cpu(tl->tl_used));
5749
5750                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5751                 if (!(*tl_copy)) {
5752                         status = -ENOMEM;
5753                         mlog_errno(status);
5754                         goto bail;
5755                 }
5756
5757                 /* Assuming the write-out below goes well, this copy
5758                  * will be passed back to recovery for processing. */
5759                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5760
5761                 /* All we need to do to clear the truncate log is set
5762                  * tl_used. */
5763                 tl->tl_used = 0;
5764
5765                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5766                 if (status < 0) {
5767                         mlog_errno(status);
5768                         goto bail;
5769                 }
5770         }
5771
5772 bail:
5773         if (tl_inode)
5774                 iput(tl_inode);
5775         brelse(tl_bh);
5776
5777         if (status < 0 && (*tl_copy)) {
5778                 kfree(*tl_copy);
5779                 *tl_copy = NULL;
5780         }
5781
5782         mlog_exit(status);
5783         return status;
5784 }
5785
5786 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5787                                          struct ocfs2_dinode *tl_copy)
5788 {
5789         int status = 0;
5790         int i;
5791         unsigned int clusters, num_recs, start_cluster;
5792         u64 start_blk;
5793         handle_t *handle;
5794         struct inode *tl_inode = osb->osb_tl_inode;
5795         struct ocfs2_truncate_log *tl;
5796
5797         mlog_entry_void();
5798
5799         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5800                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5801                 return -EINVAL;
5802         }
5803
5804         tl = &tl_copy->id2.i_dealloc;
5805         num_recs = le16_to_cpu(tl->tl_used);
5806         mlog(0, "cleanup %u records from %llu\n", num_recs,
5807              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5808
5809         mutex_lock(&tl_inode->i_mutex);
5810         for(i = 0; i < num_recs; i++) {
5811                 if (ocfs2_truncate_log_needs_flush(osb)) {
5812                         status = __ocfs2_flush_truncate_log(osb);
5813                         if (status < 0) {
5814                                 mlog_errno(status);
5815                                 goto bail_up;
5816                         }
5817                 }
5818
5819                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5820                 if (IS_ERR(handle)) {
5821                         status = PTR_ERR(handle);
5822                         mlog_errno(status);
5823                         goto bail_up;
5824                 }
5825
5826                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5827                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5828                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5829
5830                 status = ocfs2_truncate_log_append(osb, handle,
5831                                                    start_blk, clusters);
5832                 ocfs2_commit_trans(osb, handle);
5833                 if (status < 0) {
5834                         mlog_errno(status);
5835                         goto bail_up;
5836                 }
5837         }
5838
5839 bail_up:
5840         mutex_unlock(&tl_inode->i_mutex);
5841
5842         mlog_exit(status);
5843         return status;
5844 }
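
/*
 * A hedged sketch of how the two recovery halves above are expected to
 * pair up.  The real driver lives in the journal recovery code; the
 * function name here is illustrative only.
 */
static int example_recover_slot_truncate_log(struct ocfs2_super *osb,
                                             int slot_num)
{
        int status;
        struct ocfs2_dinode *tl_copy = NULL;

        /* Stage 1: stamp the dead slot's truncate log clean and keep a
         * copy of its old contents. */
        status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
        if (status < 0) {
                mlog_errno(status);
                return status;
        }

        /* Stage 2 (normally deferred to recovery completion): replay the
         * saved records into our own truncate log, then release the copy. */
        if (tl_copy) {
                status = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
                if (status < 0)
                        mlog_errno(status);
                kfree(tl_copy);
        }

        return status;
}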
5845
5846 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5847 {
5848         int status;
5849         struct inode *tl_inode = osb->osb_tl_inode;
5850
5851         mlog_entry_void();
5852
5853         if (tl_inode) {
5854                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5855                 flush_workqueue(ocfs2_wq);
5856
5857                 status = ocfs2_flush_truncate_log(osb);
5858                 if (status < 0)
5859                         mlog_errno(status);
5860
5861                 brelse(osb->osb_tl_bh);
5862                 iput(osb->osb_tl_inode);
5863         }
5864
5865         mlog_exit_void();
5866 }
5867
5868 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5869 {
5870         int status;
5871         struct inode *tl_inode = NULL;
5872         struct buffer_head *tl_bh = NULL;
5873
5874         mlog_entry_void();
5875
5876         status = ocfs2_get_truncate_log_info(osb,
5877                                              osb->slot_num,
5878                                              &tl_inode,
5879                                              &tl_bh);
5880         if (status < 0)
5881                 mlog_errno(status);
5882
5883         /* ocfs2_truncate_log_shutdown keys on the existence of
5884          * osb->osb_tl_inode so we don't set any of the osb variables
5885          * until we're sure all is well. */
5886         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5887                           ocfs2_truncate_log_worker);
5888         osb->osb_tl_bh    = tl_bh;
5889         osb->osb_tl_inode = tl_inode;
5890
5891         mlog_exit(status);
5892         return status;
5893 }
5894
5895 /*
5896  * Delayed de-allocation of suballocator blocks.
5897  *
5898  * Some sets of block de-allocations might involve multiple suballocator inodes.
5899  *
5900  * The locking for this can get extremely complicated, especially when
5901  * the suballocator inodes to delete from aren't known until deep
5902  * within an unrelated codepath.
5903  *
5904  * ocfs2_extent_block structures are a good example of this - an inode
5905  * btree could have been grown by any number of nodes each allocating
5906  * out of their own suballoc inode.
5907  *
5908  * These structures allow the delay of block de-allocation until a
5909  * later time, when locking of multiple cluster inodes won't cause
5910  * deadlock.
5911  */
5912
5913 /*
5914  * Describe a single bit freed from a suballocator.  For the block
5915  * suballocators, it represents one block.  For the global cluster
5916  * allocator, it represents a run of clusters, and free_bit gives the
5917  * number of clusters.
5918  */
5919 struct ocfs2_cached_block_free {
5920         struct ocfs2_cached_block_free          *free_next;
5921         u64                                     free_blk;
5922         unsigned int                            free_bit;
5923 };
5924
5925 struct ocfs2_per_slot_free_list {
5926         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5927         int                                     f_inode_type;
5928         int                                     f_slot;
5929         struct ocfs2_cached_block_free          *f_first;
5930 };
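
/*
 * A hedged usage sketch of the delayed de-allocation machinery described
 * above: bits are recorded in an ocfs2_cached_dealloc_ctxt while other
 * cluster locks are still held, and only replayed once it is safe to lock
 * the allocator inodes.  ocfs2_init_dealloc_ctxt() is assumed to be the
 * zero-initializing helper from alloc.h; the function name below is
 * illustrative only.
 */
static int example_deferred_cluster_free(struct ocfs2_super *osb,
                                         u64 blkno, unsigned int clusters)
{
        int ret;
        struct ocfs2_cached_dealloc_ctxt dealloc;

        ocfs2_init_dealloc_ctxt(&dealloc);

        /* Record the clusters for the global allocator; nothing is locked
         * or freed at this point. */
        ret = ocfs2_cache_cluster_dealloc(&dealloc, blkno, clusters);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        /* Later, with no conflicting cluster locks held, replay the cached
         * frees against the allocators. */
        ret = ocfs2_run_deallocs(osb, &dealloc);
        if (ret)
                mlog_errno(ret);

        return ret;
}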
5931
5932 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5933                                     int sysfile_type,
5934                                     int slot,
5935                                     struct ocfs2_cached_block_free *head)
5936 {
5937         int ret;
5938         u64 bg_blkno;
5939         handle_t *handle;
5940         struct inode *inode;
5941         struct buffer_head *di_bh = NULL;
5942         struct ocfs2_cached_block_free *tmp;
5943
5944         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5945         if (!inode) {
5946                 ret = -EINVAL;
5947                 mlog_errno(ret);
5948                 goto out;
5949         }
5950
5951         mutex_lock(&inode->i_mutex);
5952
5953         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5954         if (ret) {
5955                 mlog_errno(ret);
5956                 goto out_mutex;
5957         }
5958
5959         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5960         if (IS_ERR(handle)) {
5961                 ret = PTR_ERR(handle);
5962                 mlog_errno(ret);
5963                 goto out_unlock;
5964         }
5965
5966         while (head) {
5967                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5968                                                       head->free_bit);
5969                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5970                      head->free_bit, (unsigned long long)head->free_blk);
5971
5972                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5973                                                head->free_bit, bg_blkno, 1);
5974                 if (ret) {
5975                         mlog_errno(ret);
5976                         goto out_journal;
5977                 }
5978
5979                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5980                 if (ret) {
5981                         mlog_errno(ret);
5982                         goto out_journal;
5983                 }
5984
5985                 tmp = head;
5986                 head = head->free_next;
5987                 kfree(tmp);
5988         }
5989
5990 out_journal:
5991         ocfs2_commit_trans(osb, handle);
5992
5993 out_unlock:
5994         ocfs2_inode_unlock(inode, 1);
5995         brelse(di_bh);
5996 out_mutex:
5997         mutex_unlock(&inode->i_mutex);
5998         iput(inode);
5999 out:
6000         while(head) {
6001                 /* Premature exit may have left some dangling items. */
6002                 tmp = head;
6003                 head = head->free_next;
6004                 kfree(tmp);
6005         }
6006
6007         return ret;
6008 }
6009
6010 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6011                                 u64 blkno, unsigned int bit)
6012 {
6013         int ret = 0;
6014         struct ocfs2_cached_block_free *item;
6015
6016         item = kmalloc(sizeof(*item), GFP_NOFS);
6017         if (item == NULL) {
6018                 ret = -ENOMEM;
6019                 mlog_errno(ret);
6020                 return ret;
6021         }
6022
6023         mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6024              bit, (unsigned long long)blkno);
6025
6026         item->free_blk = blkno;
6027         item->free_bit = bit;
6028         item->free_next = ctxt->c_global_allocator;
6029
6030         ctxt->c_global_allocator = item;
6031         return ret;
6032 }
6033
6034 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6035                                       struct ocfs2_cached_block_free *head)
6036 {
6037         struct ocfs2_cached_block_free *tmp;
6038         struct inode *tl_inode = osb->osb_tl_inode;
6039         handle_t *handle;
6040         int ret = 0;
6041
6042         mutex_lock(&tl_inode->i_mutex);
6043
6044         while (head) {
6045                 if (ocfs2_truncate_log_needs_flush(osb)) {
6046                         ret = __ocfs2_flush_truncate_log(osb);
6047                         if (ret < 0) {
6048                                 mlog_errno(ret);
6049                                 break;
6050                         }
6051                 }
6052
6053                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6054                 if (IS_ERR(handle)) {
6055                         ret = PTR_ERR(handle);
6056                         mlog_errno(ret);
6057                         break;
6058                 }
6059
6060                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6061                                                 head->free_bit);
6062
6063                 ocfs2_commit_trans(osb, handle);
6064                 tmp = head;
6065                 head = head->free_next;
6066                 kfree(tmp);
6067
6068                 if (ret < 0) {
6069                         mlog_errno(ret);
6070                         break;
6071                 }
6072         }
6073
6074         mutex_unlock(&tl_inode->i_mutex);
6075
6076         while (head) {
6077                 /* Premature exit may have left some dangling items. */
6078                 tmp = head;
6079                 head = head->free_next;
6080                 kfree(tmp);
6081         }
6082
6083         return ret;
6084 }
6085
6086 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6087                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6088 {
6089         int ret = 0, ret2;
6090         struct ocfs2_per_slot_free_list *fl;
6091
6092         if (!ctxt)
6093                 return 0;
6094
6095         while (ctxt->c_first_suballocator) {
6096                 fl = ctxt->c_first_suballocator;
6097
6098                 if (fl->f_first) {
6099                         mlog(0, "Free items: (type %u, slot %d)\n",
6100                              fl->f_inode_type, fl->f_slot);
6101                         ret2 = ocfs2_free_cached_blocks(osb,
6102                                                         fl->f_inode_type,
6103                                                         fl->f_slot,
6104                                                         fl->f_first);
6105                         if (ret2)
6106                                 mlog_errno(ret2);
6107                         if (!ret)
6108                                 ret = ret2;
6109                 }
6110
6111                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6112                 kfree(fl);
6113         }
6114
6115         if (ctxt->c_global_allocator) {
6116                 ret2 = ocfs2_free_cached_clusters(osb,
6117                                                   ctxt->c_global_allocator);
6118                 if (ret2)
6119                         mlog_errno(ret2);
6120                 if (!ret)
6121                         ret = ret2;
6122
6123                 ctxt->c_global_allocator = NULL;
6124         }
6125
6126         return ret;
6127 }
6128
6129 static struct ocfs2_per_slot_free_list *
6130 ocfs2_find_per_slot_free_list(int type,
6131                               int slot,
6132                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6133 {
6134         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6135
6136         while (fl) {
6137                 if (fl->f_inode_type == type && fl->f_slot == slot)
6138                         return fl;
6139
6140                 fl = fl->f_next_suballocator;
6141         }
6142
6143         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6144         if (fl) {
6145                 fl->f_inode_type = type;
6146                 fl->f_slot = slot;
6147                 fl->f_first = NULL;
6148                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6149
6150                 ctxt->c_first_suballocator = fl;
6151         }
6152         return fl;
6153 }
6154
6155 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6156                                      int type, int slot, u64 blkno,
6157                                      unsigned int bit)
6158 {
6159         int ret;
6160         struct ocfs2_per_slot_free_list *fl;
6161         struct ocfs2_cached_block_free *item;
6162
6163         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6164         if (fl == NULL) {
6165                 ret = -ENOMEM;
6166                 mlog_errno(ret);
6167                 goto out;
6168         }
6169
6170         item = kmalloc(sizeof(*item), GFP_NOFS);
6171         if (item == NULL) {
6172                 ret = -ENOMEM;
6173                 mlog_errno(ret);
6174                 goto out;
6175         }
6176
6177         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6178              type, slot, bit, (unsigned long long)blkno);
6179
6180         item->free_blk = blkno;
6181         item->free_bit = bit;
6182         item->free_next = fl->f_first;
6183
6184         fl->f_first = item;
6185
6186         ret = 0;
6187 out:
6188         return ret;
6189 }
6190
6191 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6192                                          struct ocfs2_extent_block *eb)
6193 {
6194         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6195                                          le16_to_cpu(eb->h_suballoc_slot),
6196                                          le64_to_cpu(eb->h_blkno),
6197                                          le16_to_cpu(eb->h_suballoc_bit));
6198 }
6199
6200 /* This function will figure out whether the currently last extent
6201  * block will be deleted, and if it will, what the new last extent
6202  * block will be so we can update its h_next_leaf_blk field, as well
6203  * as the dinode's i_last_eb_blk */
6204 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6205                                        unsigned int clusters_to_del,
6206                                        struct ocfs2_path *path,
6207                                        struct buffer_head **new_last_eb)
6208 {
6209         int next_free, ret = 0;
6210         u32 cpos;
6211         struct ocfs2_extent_rec *rec;
6212         struct ocfs2_extent_block *eb;
6213         struct ocfs2_extent_list *el;
6214         struct buffer_head *bh = NULL;
6215
6216         *new_last_eb = NULL;
6217
6218         /* we have no tree, so of course, no last_eb. */
6219         if (!path->p_tree_depth)
6220                 goto out;
6221
6222         /* truncate to zero is a special case - it forces tree_depth to 0
6223          * regardless of what it currently is.  */
6224         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6225                 goto out;
6226
6227         el = path_leaf_el(path);
6228         BUG_ON(!el->l_next_free_rec);
6229
6230         /*
6231          * Make sure that this extent list will actually be empty
6232          * after we clear away the data. We can shortcut out if
6233          * there's more than one non-empty extent in the
6234          * list. Otherwise, a check of the remaining extent is
6235          * necessary.
6236          */
6237         next_free = le16_to_cpu(el->l_next_free_rec);
6238         rec = NULL;
6239         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6240                 if (next_free > 2)
6241                         goto out;
6242
6243                 /* We may have a valid extent in index 1, check it. */
6244                 if (next_free == 2)
6245                         rec = &el->l_recs[1];
6246
6247                 /*
6248                  * Fall through - no more nonempty extents, so we want
6249                  * to delete this leaf.
6250                  */
6251         } else {
6252                 if (next_free > 1)
6253                         goto out;
6254
6255                 rec = &el->l_recs[0];
6256         }
6257
6258         if (rec) {
6259                 /*
6260                  * Check if we'll only be trimming off the end of this
6261                  * record.
6262                  */
6263                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6264                         goto out;
6265         }
6266
6267         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6268         if (ret) {
6269                 mlog_errno(ret);
6270                 goto out;
6271         }
6272
6273         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6274         if (ret) {
6275                 mlog_errno(ret);
6276                 goto out;
6277         }
6278
6279         eb = (struct ocfs2_extent_block *) bh->b_data;
6280         el = &eb->h_list;
6281
6282         /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6283          * Any corruption is a code bug. */
6284         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6285
6286         *new_last_eb = bh;
6287         get_bh(*new_last_eb);
6288         mlog(0, "returning block %llu, (cpos: %u)\n",
6289              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6290 out:
6291         brelse(bh);
6292
6293         return ret;
6294 }
6295
6296 /*
6297  * Trim some clusters off the rightmost edge of a tree. Only called
6298  * during truncate.
6299  *
6300  * The caller needs to:
6301  *   - start journaling of each path component.
6302  *   - compute and fully set up any new last ext block
6303  */
6304 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6305                            handle_t *handle, struct ocfs2_truncate_context *tc,
6306                            u32 clusters_to_del, u64 *delete_start)
6307 {
6308         int ret, i, index = path->p_tree_depth;
6309         u32 new_edge = 0;
6310         u64 deleted_eb = 0;
6311         struct buffer_head *bh;
6312         struct ocfs2_extent_list *el;
6313         struct ocfs2_extent_rec *rec;
6314
6315         *delete_start = 0;
6316
6317         while (index >= 0) {
6318                 bh = path->p_node[index].bh;
6319                 el = path->p_node[index].el;
6320
6321                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6322                      index,  (unsigned long long)bh->b_blocknr);
6323
6324                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6325
6326                 if (index !=
6327                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6328                         ocfs2_error(inode->i_sb,
6329                                     "Inode %lu has invalid ext. block %llu",
6330                                     inode->i_ino,
6331                                     (unsigned long long)bh->b_blocknr);
6332                         ret = -EROFS;
6333                         goto out;
6334                 }
6335
6336 find_tail_record:
6337                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6338                 rec = &el->l_recs[i];
6339
6340                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6341                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6342                      ocfs2_rec_clusters(el, rec),
6343                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6344                      le16_to_cpu(el->l_next_free_rec));
6345
6346                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6347
6348                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6349                         /*
6350                          * If the only record left in the leaf block is
6351                          * the empty extent, we can just remove the
6352                          * block.
6353                          */
6354                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6355                                 memset(rec, 0,
6356                                        sizeof(struct ocfs2_extent_rec));
6357                                 el->l_next_free_rec = cpu_to_le16(0);
6358
6359                                 goto delete;
6360                         }
6361
6362                         /*
6363                          * Remove any empty extents by shifting things
6364                          * left. That should make life much easier on
6365                          * the code below. This condition is rare
6366                          * enough that we shouldn't see a performance
6367                          * hit.
6368                          */
6369                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6370                                 le16_add_cpu(&el->l_next_free_rec, -1);
6371
6372                                 for(i = 0;
6373                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6374                                         el->l_recs[i] = el->l_recs[i + 1];
6375
6376                                 memset(&el->l_recs[i], 0,
6377                                        sizeof(struct ocfs2_extent_rec));
6378
6379                                 /*
6380                                  * We've modified our extent list. The
6381                                  * simplest way to handle this change
6382                                  * is to begin the search from the
6383                                  * start again.
6384                                  */
6385                                 goto find_tail_record;
6386                         }
6387
6388                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6389
6390                         /*
6391                          * We'll use "new_edge" on our way back up the
6392                          * tree to know what our rightmost cpos is.
6393                          */
6394                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6395                         new_edge += le32_to_cpu(rec->e_cpos);
6396
6397                         /*
6398                          * The caller will use this to delete data blocks.
6399                          */
6400                         *delete_start = le64_to_cpu(rec->e_blkno)
6401                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6402                                         le16_to_cpu(rec->e_leaf_clusters));
6403
6404                         /*
6405                          * If it's now empty, remove this record.
6406                          */
6407                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6408                                 memset(rec, 0,
6409                                        sizeof(struct ocfs2_extent_rec));
6410                                 le16_add_cpu(&el->l_next_free_rec, -1);
6411                         }
6412                 } else {
6413                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6414                                 memset(rec, 0,
6415                                        sizeof(struct ocfs2_extent_rec));
6416                                 le16_add_cpu(&el->l_next_free_rec, -1);
6417
6418                                 goto delete;
6419                         }
6420
6421                         /* Can this actually happen? */
6422                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6423                                 goto delete;
6424
6425                         /*
6426                          * We never actually deleted any clusters
6427                          * because our leaf was empty. There's no
6428                          * reason to adjust the rightmost edge then.
6429                          */
6430                         if (new_edge == 0)
6431                                 goto delete;
6432
6433                         rec->e_int_clusters = cpu_to_le32(new_edge);
6434                         le32_add_cpu(&rec->e_int_clusters,
6435                                      -le32_to_cpu(rec->e_cpos));
6436
6437                          /*
6438                           * A deleted child record should have been
6439                           * caught above.
6440                           */
6441                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6442                 }
6443
6444 delete:
6445                 ret = ocfs2_journal_dirty(handle, bh);
6446                 if (ret) {
6447                         mlog_errno(ret);
6448                         goto out;
6449                 }
6450
6451                 mlog(0, "extent list container %llu, after: record %d: "
6452                      "(%u, %u, %llu), next = %u.\n",
6453                      (unsigned long long)bh->b_blocknr, i,
6454                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6455                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6456                      le16_to_cpu(el->l_next_free_rec));
6457
6458                 /*
6459                  * We must be careful to only attempt delete of an
6460                  * extent block (and not the root inode block).
6461                  */
6462                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6463                         struct ocfs2_extent_block *eb =
6464                                 (struct ocfs2_extent_block *)bh->b_data;
6465
6466                         /*
6467                          * Save this for use when processing the
6468                          * parent block.
6469                          */
6470                         deleted_eb = le64_to_cpu(eb->h_blkno);
6471
6472                         mlog(0, "deleting this extent block.\n");
6473
6474                         ocfs2_remove_from_cache(inode, bh);
6475
6476                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6477                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6478                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6479
6480                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6481                         /* An error here is not fatal. */
6482                         if (ret < 0)
6483                                 mlog_errno(ret);
6484                 } else {
6485                         deleted_eb = 0;
6486                 }
6487
6488                 index--;
6489         }
6490
6491         ret = 0;
6492 out:
6493         return ret;
6494 }
6495
6496 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6497                              unsigned int clusters_to_del,
6498                              struct inode *inode,
6499                              struct buffer_head *fe_bh,
6500                              handle_t *handle,
6501                              struct ocfs2_truncate_context *tc,
6502                              struct ocfs2_path *path)
6503 {
6504         int status;
6505         struct ocfs2_dinode *fe;
6506         struct ocfs2_extent_block *last_eb = NULL;
6507         struct ocfs2_extent_list *el;
6508         struct buffer_head *last_eb_bh = NULL;
6509         u64 delete_blk = 0;
6510
6511         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6512
6513         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6514                                              path, &last_eb_bh);
6515         if (status < 0) {
6516                 mlog_errno(status);
6517                 goto bail;
6518         }
6519
6520         /*
6521          * Each component will be touched, so we might as well journal
6522          * here to avoid having to handle errors later.
6523          */
6524         status = ocfs2_journal_access_path(inode, handle, path);
6525         if (status < 0) {
6526                 mlog_errno(status);
6527                 goto bail;
6528         }
6529
6530         if (last_eb_bh) {
6531                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6532                                               OCFS2_JOURNAL_ACCESS_WRITE);
6533                 if (status < 0) {
6534                         mlog_errno(status);
6535                         goto bail;
6536                 }
6537
6538                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6539         }
6540
6541         el = &(fe->id2.i_list);
6542
6543         /*
6544          * Lower levels depend on this never happening, but it's best
6545          * to check for it up here before changing the tree.
6546          */
6547         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6548                 ocfs2_error(inode->i_sb,
6549                             "Inode %lu has an empty extent record, depth %u\n",
6550                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6551                 status = -EROFS;
6552                 goto bail;
6553         }
6554
6555         spin_lock(&OCFS2_I(inode)->ip_lock);
6556         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6557                                       clusters_to_del;
6558         spin_unlock(&OCFS2_I(inode)->ip_lock);
6559         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6560         inode->i_blocks = ocfs2_inode_sector_count(inode);
6561
6562         status = ocfs2_trim_tree(inode, path, handle, tc,
6563                                  clusters_to_del, &delete_blk);
6564         if (status) {
6565                 mlog_errno(status);
6566                 goto bail;
6567         }
6568
6569         if (le32_to_cpu(fe->i_clusters) == 0) {
6570                 /* trunc to zero is a special case. */
6571                 el->l_tree_depth = 0;
6572                 fe->i_last_eb_blk = 0;
6573         } else if (last_eb)
6574                 fe->i_last_eb_blk = last_eb->h_blkno;
6575
6576         status = ocfs2_journal_dirty(handle, fe_bh);
6577         if (status < 0) {
6578                 mlog_errno(status);
6579                 goto bail;
6580         }
6581
6582         if (last_eb) {
6583                 /* If there will be a new last extent block, then by
6584                  * definition, there cannot be any leaves to the right of
6585                  * it. */
6586                 last_eb->h_next_leaf_blk = 0;
6587                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6588                 if (status < 0) {
6589                         mlog_errno(status);
6590                         goto bail;
6591                 }
6592         }
6593
6594         if (delete_blk) {
6595                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6596                                                    clusters_to_del);
6597                 if (status < 0) {
6598                         mlog_errno(status);
6599                         goto bail;
6600                 }
6601         }
6602         status = 0;
6603 bail:
6604
6605         mlog_exit(status);
6606         return status;
6607 }
6608
6609 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6610 {
6611         set_buffer_uptodate(bh);
6612         mark_buffer_dirty(bh);
6613         return 0;
6614 }
6615
6616 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6617                                      unsigned int from, unsigned int to,
6618                                      struct page *page, int zero, u64 *phys)
6619 {
6620         int ret, partial = 0;
6621
6622         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6623         if (ret)
6624                 mlog_errno(ret);
6625
6626         if (zero)
6627                 zero_user_segment(page, from, to);
6628
6629         /*
6630          * Need to mark the buffers we zeroed as uptodate here if
6631          * they aren't already - ocfs2_map_page_blocks() might have
6632          * skipped some
6633          */
6634         ret = walk_page_buffers(handle, page_buffers(page),
6635                                 from, to, &partial,
6636                                 ocfs2_zero_func);
6637         if (ret < 0)
6638                 mlog_errno(ret);
6639         else if (ocfs2_should_order_data(inode)) {
6640                 ret = ocfs2_jbd2_file_inode(handle, inode);
6641                 if (ret < 0)
6642                         mlog_errno(ret);
6643         }
6644
6645         if (!partial)
6646                 SetPageUptodate(page);
6647
6648         flush_dcache_page(page);
6649 }
6650
6651 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6652                                      loff_t end, struct page **pages,
6653                                      int numpages, u64 phys, handle_t *handle)
6654 {
6655         int i;
6656         struct page *page;
6657         unsigned int from, to = PAGE_CACHE_SIZE;
6658         struct super_block *sb = inode->i_sb;
6659
6660         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6661
6662         if (numpages == 0)
6663                 goto out;
6664
6665         to = PAGE_CACHE_SIZE;
6666         for(i = 0; i < numpages; i++) {
6667                 page = pages[i];
6668
6669                 from = start & (PAGE_CACHE_SIZE - 1);
6670                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6671                         to = end & (PAGE_CACHE_SIZE - 1);
6672
6673                 BUG_ON(from > PAGE_CACHE_SIZE);
6674                 BUG_ON(to > PAGE_CACHE_SIZE);
6675
6676                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6677                                          &phys);
6678
6679                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6680         }
6681 out:
6682         if (pages)
6683                 ocfs2_unlock_and_free_pages(pages, numpages);
6684 }
6685
6686 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6687                                 struct page **pages, int *num)
6688 {
6689         int numpages, ret = 0;
6690         struct super_block *sb = inode->i_sb;
6691         struct address_space *mapping = inode->i_mapping;
6692         unsigned long index;
6693         loff_t last_page_bytes;
6694
6695         BUG_ON(start > end);
6696
6697         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6698                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6699
6700         numpages = 0;
6701         last_page_bytes = PAGE_ALIGN(end);
6702         index = start >> PAGE_CACHE_SHIFT;
6703         do {
6704                 pages[numpages] = grab_cache_page(mapping, index);
6705                 if (!pages[numpages]) {
6706                         ret = -ENOMEM;
6707                         mlog_errno(ret);
6708                         goto out;
6709                 }
6710
6711                 numpages++;
6712                 index++;
6713         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6714
6715 out:
6716         if (ret != 0) {
6717                 if (pages)
6718                         ocfs2_unlock_and_free_pages(pages, numpages);
6719                 numpages = 0;
6720         }
6721
6722         *num = numpages;
6723
6724         return ret;
6725 }
6726
6727 /*
6728  * Zero the area past i_size but still within an allocated
6729  * cluster. This avoids exposing nonzero data on subsequent file
6730  * extends.
6731  *
6732  * We need to call this before i_size is updated on the inode because
6733  * otherwise block_write_full_page() will skip writeout of pages past
6734  * i_size.  The range to zero is therefore passed in explicitly.
6735  */
6736 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6737                                   u64 range_start, u64 range_end)
6738 {
6739         int ret = 0, numpages;
6740         struct page **pages = NULL;
6741         u64 phys;
6742         unsigned int ext_flags;
6743         struct super_block *sb = inode->i_sb;
6744
6745         /*
6746          * File systems which don't support sparse files zero on every
6747          * extend.
6748          */
6749         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6750                 return 0;
6751
6752         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6753                         sizeof(struct page *), GFP_NOFS);
6754         if (pages == NULL) {
6755                 ret = -ENOMEM;
6756                 mlog_errno(ret);
6757                 goto out;
6758         }
6759
6760         if (range_start == range_end)
6761                 goto out;
6762
6763         ret = ocfs2_extent_map_get_blocks(inode,
6764                                           range_start >> sb->s_blocksize_bits,
6765                                           &phys, NULL, &ext_flags);
6766         if (ret) {
6767                 mlog_errno(ret);
6768                 goto out;
6769         }
6770
6771         /*
6772          * Tail is a hole, or is marked unwritten. In either case, we
6773          * can count on read and write to return/push zeros.
6774          */
6775         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6776                 goto out;
6777
6778         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6779                                    &numpages);
6780         if (ret) {
6781                 mlog_errno(ret);
6782                 goto out;
6783         }
6784
6785         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6786                                  numpages, phys, handle);
6787
6788         /*
6789          * Initiate writeout of the pages we zero'd here. We don't
6790          * wait on them - the truncate_inode_pages() call later will
6791          * do that for us.
6792          */
6793         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6794                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6795         if (ret)
6796                 mlog_errno(ret);
6797
6798 out:
6799         if (pages)
6800                 kfree(pages);
6801
6802         return ret;
6803 }
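
/*
 * A hedged ordering sketch for the helper above: the zeroing must run
 * while the old i_size still covers the tail pages, i.e. before the
 * caller shrinks i_size.  ocfs2_align_bytes_to_clusters() is assumed to
 * round a byte offset up to its cluster boundary; the wrapper name is
 * illustrative only.
 */
static int example_zero_tail_before_shrink(struct inode *inode,
                                           handle_t *handle, u64 new_i_size)
{
        int ret;
        u64 cluster_end;

        /* Zero from the new size to the end of its cluster while those
         * pages are still inside i_size. */
        cluster_end = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
        ret = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
                                            cluster_end);
        if (ret)
                mlog_errno(ret);

        /* Only after this does the caller update the in-core and on-disk
         * i_size. */
        return ret;
}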
6804
6805 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6806                                              struct ocfs2_dinode *di)
6807 {
6808         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6809         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6810
6811         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6812                 memset(&di->id2, 0, blocksize -
6813                                     offsetof(struct ocfs2_dinode, id2) -
6814                                     xattrsize);
6815         else
6816                 memset(&di->id2, 0, blocksize -
6817                                     offsetof(struct ocfs2_dinode, id2));
6818 }
6819
6820 void ocfs2_dinode_new_extent_list(struct inode *inode,
6821                                   struct ocfs2_dinode *di)
6822 {
6823         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6824         di->id2.i_list.l_tree_depth = 0;
6825         di->id2.i_list.l_next_free_rec = 0;
6826         di->id2.i_list.l_count = cpu_to_le16(
6827                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6828 }
6829
6830 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6831 {
6832         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6833         struct ocfs2_inline_data *idata = &di->id2.i_data;
6834
6835         spin_lock(&oi->ip_lock);
6836         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6837         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6838         spin_unlock(&oi->ip_lock);
6839
6840         /*
6841          * We clear the entire i_data structure here so that all
6842          * fields can be properly initialized.
6843          */
6844         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6845
6846         idata->id_count = cpu_to_le16(
6847                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6848 }
6849
6850 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6851                                          struct buffer_head *di_bh)
6852 {
6853         int ret, i, has_data, num_pages = 0;
6854         handle_t *handle;
6855         u64 uninitialized_var(block);
6856         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6857         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6858         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6859         struct ocfs2_alloc_context *data_ac = NULL;
6860         struct page **pages = NULL;
6861         loff_t end = osb->s_clustersize;
6862         struct ocfs2_extent_tree et;
6863
6864         has_data = i_size_read(inode) ? 1 : 0;
6865
6866         if (has_data) {
6867                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6868                                 sizeof(struct page *), GFP_NOFS);
6869                 if (pages == NULL) {
6870                         ret = -ENOMEM;
6871                         mlog_errno(ret);
6872                         goto out;
6873                 }
6874
6875                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6876                 if (ret) {
6877                         mlog_errno(ret);
6878                         goto out;
6879                 }
6880         }
6881
6882         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6883         if (IS_ERR(handle)) {
6884                 ret = PTR_ERR(handle);
6885                 mlog_errno(ret);
6886                 goto out_unlock;
6887         }
6888
6889         ret = ocfs2_journal_access(handle, inode, di_bh,
6890                                    OCFS2_JOURNAL_ACCESS_WRITE);
6891         if (ret) {
6892                 mlog_errno(ret);
6893                 goto out_commit;
6894         }
6895
6896         if (has_data) {
6897                 u32 bit_off, num;
6898                 unsigned int page_end;
6899                 u64 phys;
6900
6901                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6902                                            &num);
6903                 if (ret) {
6904                         mlog_errno(ret);
6905                         goto out_commit;
6906                 }
6907
6908                 /*
6909                  * Save two copies, one for insert, and one that can
6910                  * be changed by ocfs2_map_and_dirty_page() below.
6911                  */
6912                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6913
6914                 /*
6915                  * Non-sparse file systems zero on extend, so no need
6916                  * to do that now.
6917                  */
6918                 if (!ocfs2_sparse_alloc(osb) &&
6919                     PAGE_CACHE_SIZE < osb->s_clustersize)
6920                         end = PAGE_CACHE_SIZE;
6921
6922                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6923                 if (ret) {
6924                         mlog_errno(ret);
6925                         goto out_commit;
6926                 }
6927
6928                 /*
6929                  * This should populate the 1st page for us and mark
6930                  * it up to date.
6931                  */
6932                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6933                 if (ret) {
6934                         mlog_errno(ret);
6935                         goto out_commit;
6936                 }
6937
6938                 page_end = PAGE_CACHE_SIZE;
6939                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6940                         page_end = osb->s_clustersize;
6941
6942                 for (i = 0; i < num_pages; i++)
6943                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6944                                                  pages[i], i > 0, &phys);
6945         }
6946
6947         spin_lock(&oi->ip_lock);
6948         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6949         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6950         spin_unlock(&oi->ip_lock);
6951
6952         ocfs2_dinode_new_extent_list(inode, di);
6953
6954         ocfs2_journal_dirty(handle, di_bh);
6955
6956         if (has_data) {
6957                 /*
6958                  * An error at this point should be extremely rare. If
6959                  * this proves to be false, we could always re-build
6960                  * the in-inode data from our pages.
6961                  */
6962                 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6963                 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6964                                           0, block, 1, 0, NULL);
6965                 if (ret) {
6966                         mlog_errno(ret);
6967                         goto out_commit;
6968                 }
6969
6970                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6971         }
6972
6973 out_commit:
6974         ocfs2_commit_trans(osb, handle);
6975
6976 out_unlock:
6977         if (data_ac)
6978                 ocfs2_free_alloc_context(data_ac);
6979
6980 out:
6981         if (pages) {
6982                 ocfs2_unlock_and_free_pages(pages, num_pages);
6983                 kfree(pages);
6984         }
6985
6986         return ret;
6987 }
6988
6989 /*
6990  * It is expected that, by the time you call this function,
6991  * inode->i_size and fe->i_size have been adjusted.
6992  *
6993  * WARNING: This will kfree the truncate context
6994  */
6995 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6996                           struct inode *inode,
6997                           struct buffer_head *fe_bh,
6998                           struct ocfs2_truncate_context *tc)
6999 {
7000         int status, i, credits, tl_sem = 0;
7001         u32 clusters_to_del, new_highest_cpos, range;
7002         struct ocfs2_extent_list *el;
7003         handle_t *handle = NULL;
7004         struct inode *tl_inode = osb->osb_tl_inode;
7005         struct ocfs2_path *path = NULL;
7006         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7007
7008         mlog_entry_void();
7009
7010         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7011                                                      i_size_read(inode));
7012
7013         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
7014         if (!path) {
7015                 status = -ENOMEM;
7016                 mlog_errno(status);
7017                 goto bail;
7018         }
7019
7020         ocfs2_extent_map_trunc(inode, new_highest_cpos);
7021
7022 start:
7023         /*
7024          * Check that we still have allocation to delete.
7025          */
7026         if (OCFS2_I(inode)->ip_clusters == 0) {
7027                 status = 0;
7028                 goto bail;
7029         }
7030
7031         /*
7032          * Truncate always works against the rightmost tree branch.
7033          */
7034         status = ocfs2_find_path(inode, path, UINT_MAX);
7035         if (status) {
7036                 mlog_errno(status);
7037                 goto bail;
7038         }
7039
7040         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7041              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7042
7043         /*
7044          * By now, el will point to the extent list on the bottommost
7045          * portion of this tree. Only the tail record is considered in
7046          * each pass.
7047          *
7048          * We handle the following cases, in order:
7049          * - empty extent: delete the remaining branch
7050          * - remove the entire record
7051          * - remove a partial record
7052          * - no record needs to be removed (truncate has completed)
7053          */
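        /*
         * Illustrative example (hypothetical numbers, matching the
         * non-empty-extent branches below): suppose the tail record has
         * e_cpos = 100 and covers 20 clusters, so range = 120.  Then:
         *
         *   - new_highest_cpos = 90:  e_cpos >= new_highest_cpos, so the
         *     whole record (20 clusters) is removed this pass;
         *   - new_highest_cpos = 110: only the tail is removed,
         *     clusters_to_del = (20 + 100) - 110 = 10;
         *   - new_highest_cpos = 130: range <= new_highest_cpos, nothing
         *     is left to remove and the truncate is complete.
         */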
7054         el = path_leaf_el(path);
7055         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7056                 ocfs2_error(inode->i_sb,
7057                             "Inode %llu has empty extent block at %llu\n",
7058                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7059                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7060                 status = -EROFS;
7061                 goto bail;
7062         }
7063
7064         i = le16_to_cpu(el->l_next_free_rec) - 1;
7065         range = le32_to_cpu(el->l_recs[i].e_cpos) +
7066                 ocfs2_rec_clusters(el, &el->l_recs[i]);
7067         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7068                 clusters_to_del = 0;
7069         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7070                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7071         } else if (range > new_highest_cpos) {
7072                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7073                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
7074                                   new_highest_cpos;
7075         } else {
7076                 status = 0;
7077                 goto bail;
7078         }
7079
7080         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7081              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7082
7083         mutex_lock(&tl_inode->i_mutex);
7084         tl_sem = 1;
7085         /* Flushing the truncate log whenever it needs it guarantees us
7086          * at least one free truncate log record for the delete we are
7087          * about to perform.  */
7088         if (ocfs2_truncate_log_needs_flush(osb)) {
7089                 status = __ocfs2_flush_truncate_log(osb);
7090                 if (status < 0) {
7091                         mlog_errno(status);
7092                         goto bail;
7093                 }
7094         }
7095
7096         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7097                                                 (struct ocfs2_dinode *)fe_bh->b_data,
7098                                                 el);
7099         handle = ocfs2_start_trans(osb, credits);
7100         if (IS_ERR(handle)) {
7101                 status = PTR_ERR(handle);
7102                 handle = NULL;
7103                 mlog_errno(status);
7104                 goto bail;
7105         }
7106
7107         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7108                                    tc, path);
7109         if (status < 0) {
7110                 mlog_errno(status);
7111                 goto bail;
7112         }
7113
7114         mutex_unlock(&tl_inode->i_mutex);
7115         tl_sem = 0;
7116
7117         ocfs2_commit_trans(osb, handle);
7118         handle = NULL;
7119
7120         ocfs2_reinit_path(path, 1);
7121
7122         /*
7123          * The check above will catch the case where we've truncated
7124          * away all allocation.
7125          */
7126         goto start;
7127
7128 bail:
7129
7130         ocfs2_schedule_truncate_log_flush(osb, 1);
7131
7132         if (tl_sem)
7133                 mutex_unlock(&tl_inode->i_mutex);
7134
7135         if (handle)
7136                 ocfs2_commit_trans(osb, handle);
7137
7138         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7139
7140         ocfs2_free_path(path);
7141
7142         /* This will kfree the truncate context for us */
7143         ocfs2_free_truncate_context(tc);
7144
7145         mlog_exit(status);
7146         return status;
7147 }
7148
7149 /*
7150  * Expects the inode to already be locked.
7151  */
7152 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7153                            struct inode *inode,
7154                            struct buffer_head *fe_bh,
7155                            struct ocfs2_truncate_context **tc)
7156 {
7157         int status;
7158         unsigned int new_i_clusters;
7159         struct ocfs2_dinode *fe;
7160         struct ocfs2_extent_block *eb;
7161         struct buffer_head *last_eb_bh = NULL;
7162
7163         mlog_entry_void();
7164
7165         *tc = NULL;
7166
7167         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7168                                                   i_size_read(inode));
7169         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7170
7171         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
7172              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7173              (unsigned long long)le64_to_cpu(fe->i_size));
7174
7175         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7176         if (!(*tc)) {
7177                 status = -ENOMEM;
7178                 mlog_errno(status);
7179                 goto bail;
7180         }
7181         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7182
7183         if (fe->id2.i_list.l_tree_depth) {
7184                 status = ocfs2_read_extent_block(inode,
7185                                                  le64_to_cpu(fe->i_last_eb_blk),
7186                                                  &last_eb_bh);
7187                 if (status < 0) {
7188                         mlog_errno(status);
7189                         goto bail;
7190                 }
7191                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7192         }
7193
7194         (*tc)->tc_last_eb_bh = last_eb_bh;
7195
7196         status = 0;
7197 bail:
7198         if (status < 0) {
7199                 if (*tc)
7200                         ocfs2_free_truncate_context(*tc);
7201                 *tc = NULL;
7202         }
7203         mlog_exit_void();
7204         return status;
7205 }
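
/*
 * Usage sketch for the two truncate entry points above.  This is
 * illustrative only: the cluster locking, journalling and page zeroing
 * that a real caller (such as ocfs2_truncate_file()) performs are
 * elided, inode->i_size and fe->i_size are assumed to have been
 * adjusted already, and ocfs2_commit_truncate() frees the context:
 *
 *	struct ocfs2_truncate_context *tc = NULL;
 *	int status;
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	if (status == 0)
 *		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
 */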
7206
7207 /*
7208  * 'start' is inclusive, 'end' is not.
7209  */
7210 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7211                           unsigned int start, unsigned int end, int trunc)
7212 {
7213         int ret;
7214         unsigned int numbytes;
7215         handle_t *handle;
7216         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7217         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7218         struct ocfs2_inline_data *idata = &di->id2.i_data;
7219
7220         if (end > i_size_read(inode))
7221                 end = i_size_read(inode);
7222
7223         BUG_ON(start >= end);
7224
7225         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7226             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7227             !ocfs2_supports_inline_data(osb)) {
7228                 ocfs2_error(inode->i_sb,
7229                             "Inline data flags for inode %llu don't agree! "
7230                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7231                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7232                             le16_to_cpu(di->i_dyn_features),
7233                             OCFS2_I(inode)->ip_dyn_features,
7234                             osb->s_feature_incompat);
7235                 ret = -EROFS;
7236                 goto out;
7237         }
7238
7239         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7240         if (IS_ERR(handle)) {
7241                 ret = PTR_ERR(handle);
7242                 mlog_errno(ret);
7243                 goto out;
7244         }
7245
7246         ret = ocfs2_journal_access(handle, inode, di_bh,
7247                                    OCFS2_JOURNAL_ACCESS_WRITE);
7248         if (ret) {
7249                 mlog_errno(ret);
7250                 goto out_commit;
7251         }
7252
7253         numbytes = end - start;
7254         memset(idata->id_data + start, 0, numbytes);
7255
7256         /*
7257          * No need to worry about the data page here - it's been
7258          * truncated already and inline data doesn't need it for
7259                  * pushing zeros to disk, so we'll let readpage pick it up
7260          * later.
7261          */
7262         if (trunc) {
7263                 i_size_write(inode, start);
7264                 di->i_size = cpu_to_le64(start);
7265         }
7266
7267         inode->i_blocks = ocfs2_inode_sector_count(inode);
7268         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7269
7270         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7271         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7272
7273         ocfs2_journal_dirty(handle, di_bh);
7274
7275 out_commit:
7276         ocfs2_commit_trans(osb, handle);
7277
7278 out:
7279         return ret;
7280 }
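
/*
 * Example calls (a sketch; the caller is assumed to hold the inode
 * lock and to have checked that the file really is inline):
 *
 *	Punch a hole, zeroing bytes [start, end) but keeping i_size:
 *		ret = ocfs2_truncate_inline(inode, di_bh, start, end, 0);
 *
 *	Truncate down to new_size, zeroing the tail and updating i_size:
 *		ret = ocfs2_truncate_inline(inode, di_bh, new_size,
 *					    i_size_read(inode), 1);
 */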
7281
7282 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7283 {
7284         /*
7285          * The caller is responsible for completing deallocation
7286          * before freeing the context.
7287          */
7288         if (tc->tc_dealloc.c_first_suballocator != NULL)
7289                 mlog(ML_NOTICE,
7290                      "Truncate completion has non-empty dealloc context\n");
7291
7292         brelse(tc->tc_last_eb_bh);
7293
7294         kfree(tc);
7295 }