ocfs2: Wrap up the common use cases of ocfs2_new_path().
fs/ocfs2/alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35
36 #include "ocfs2.h"
37
38 #include "alloc.h"
39 #include "aops.h"
40 #include "blockcheck.h"
41 #include "dlmglue.h"
42 #include "extent_map.h"
43 #include "inode.h"
44 #include "journal.h"
45 #include "localalloc.h"
46 #include "suballoc.h"
47 #include "sysfile.h"
48 #include "file.h"
49 #include "super.h"
50 #include "uptodate.h"
51
52 #include "buffer_head_io.h"
53
54
55 /*
56  * Operations for a specific extent tree type.
57  *
58  * To implement an on-disk btree (extent tree) type in ocfs2, add
59  * an ocfs2_extent_tree_operations structure and the matching
60  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
61  * for the allocation portion of the extent tree.
62  */
63 struct ocfs2_extent_tree_operations {
64         /*
65          * last_eb_blk is the block number of the right most leaf extent
66          * block.  Most on-disk structures containing an extent tree store
67          * this value for fast access.  The ->eo_set_last_eb_blk() and
68          * ->eo_get_last_eb_blk() operations access this value.  They are
69          *  both required.
70          */
71         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
72                                    u64 blkno);
73         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
74
75         /*
76          * The on-disk structure usually keeps track of how many total
77          * clusters are stored in this extent tree.  This function updates
78          * that value.  new_clusters is the delta, and must be
79          * added to the total.  Required.
80          */
81         void (*eo_update_clusters)(struct inode *inode,
82                                    struct ocfs2_extent_tree *et,
83                                    u32 new_clusters);
84
85         /*
86          * If ->eo_insert_check() exists, it is called before rec is
87          * inserted into the extent tree.  It is optional.
88          */
89         int (*eo_insert_check)(struct inode *inode,
90                                struct ocfs2_extent_tree *et,
91                                struct ocfs2_extent_rec *rec);
92         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
93
94         /*
95          * --------------------------------------------------------------
96          * The remaining are internal to ocfs2_extent_tree and don't have
97          * accessor functions
98          */
99
100         /*
101          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
102          * It is required.
103          */
104         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
105
106         /*
107          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
108          * it exists.  If it does not, et->et_max_leaf_clusters is set
109          * to 0 (unlimited).  Optional.
110          */
111         void (*eo_fill_max_leaf_clusters)(struct inode *inode,
112                                           struct ocfs2_extent_tree *et);
113 };
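
/*
 * A purely illustrative sketch of a new extent tree type.  The
 * "example" names below are hypothetical and do not exist in ocfs2;
 * the dinode and xattr trees later in this file are the real in-tree
 * implementations.
 *
 *	static void ocfs2_example_fill_root_el(struct ocfs2_extent_tree *et)
 *	{
 *		struct ocfs2_example_root *er = et->et_object;
 *
 *		et->et_root_el = &er->er_list;
 *	}
 *
 *	static struct ocfs2_extent_tree_operations ocfs2_example_et_ops = {
 *		.eo_set_last_eb_blk	= ocfs2_example_set_last_eb_blk,
 *		.eo_get_last_eb_blk	= ocfs2_example_get_last_eb_blk,
 *		.eo_update_clusters	= ocfs2_example_update_clusters,
 *		.eo_fill_root_el	= ocfs2_example_fill_root_el,
 *	};
 *
 * The matching ocfs2_init_example_extent_tree() would simply hand
 * these operations to __ocfs2_init_extent_tree(), defined below.
 */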
114
115
116 /*
117  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
118  * in the methods.
119  */
120 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
121 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
122                                          u64 blkno);
123 static void ocfs2_dinode_update_clusters(struct inode *inode,
124                                          struct ocfs2_extent_tree *et,
125                                          u32 clusters);
126 static int ocfs2_dinode_insert_check(struct inode *inode,
127                                      struct ocfs2_extent_tree *et,
128                                      struct ocfs2_extent_rec *rec);
129 static int ocfs2_dinode_sanity_check(struct inode *inode,
130                                      struct ocfs2_extent_tree *et);
131 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
132 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
133         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
134         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
135         .eo_update_clusters     = ocfs2_dinode_update_clusters,
136         .eo_insert_check        = ocfs2_dinode_insert_check,
137         .eo_sanity_check        = ocfs2_dinode_sanity_check,
138         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
139 };
140
141 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
142                                          u64 blkno)
143 {
144         struct ocfs2_dinode *di = et->et_object;
145
146         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
147         di->i_last_eb_blk = cpu_to_le64(blkno);
148 }
149
150 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
151 {
152         struct ocfs2_dinode *di = et->et_object;
153
154         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
155         return le64_to_cpu(di->i_last_eb_blk);
156 }
157
158 static void ocfs2_dinode_update_clusters(struct inode *inode,
159                                          struct ocfs2_extent_tree *et,
160                                          u32 clusters)
161 {
162         struct ocfs2_dinode *di = et->et_object;
163
164         le32_add_cpu(&di->i_clusters, clusters);
165         spin_lock(&OCFS2_I(inode)->ip_lock);
166         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
167         spin_unlock(&OCFS2_I(inode)->ip_lock);
168 }
169
170 static int ocfs2_dinode_insert_check(struct inode *inode,
171                                      struct ocfs2_extent_tree *et,
172                                      struct ocfs2_extent_rec *rec)
173 {
174         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
175
176         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
177         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
178                         (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
179                         "Device %s, asking for sparse allocation: inode %llu, "
180                         "cpos %u, clusters %u\n",
181                         osb->dev_str,
182                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
183                         rec->e_cpos,
184                         OCFS2_I(inode)->ip_clusters);
185
186         return 0;
187 }
188
189 static int ocfs2_dinode_sanity_check(struct inode *inode,
190                                      struct ocfs2_extent_tree *et)
191 {
192         struct ocfs2_dinode *di = et->et_object;
193
194         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
195         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
196
197         return 0;
198 }
199
200 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
201 {
202         struct ocfs2_dinode *di = et->et_object;
203
204         et->et_root_el = &di->id2.i_list;
205 }
206
207
208 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
209 {
210         struct ocfs2_xattr_value_root *xv = et->et_object;
211
212         et->et_root_el = &xv->xr_list;
213 }
214
215 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
216                                               u64 blkno)
217 {
218         struct ocfs2_xattr_value_root *xv =
219                 (struct ocfs2_xattr_value_root *)et->et_object;
220
221         xv->xr_last_eb_blk = cpu_to_le64(blkno);
222 }
223
224 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
225 {
226         struct ocfs2_xattr_value_root *xv =
227                 (struct ocfs2_xattr_value_root *) et->et_object;
228
229         return le64_to_cpu(xv->xr_last_eb_blk);
230 }
231
232 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
233                                               struct ocfs2_extent_tree *et,
234                                               u32 clusters)
235 {
236         struct ocfs2_xattr_value_root *xv =
237                 (struct ocfs2_xattr_value_root *)et->et_object;
238
239         le32_add_cpu(&xv->xr_clusters, clusters);
240 }
241
242 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
243         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
244         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
245         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
246         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
247 };
248
249 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
250 {
251         struct ocfs2_xattr_block *xb = et->et_object;
252
253         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
254 }
255
256 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
257                                                     struct ocfs2_extent_tree *et)
258 {
259         et->et_max_leaf_clusters =
260                 ocfs2_clusters_for_bytes(inode->i_sb,
261                                          OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
262 }
263
264 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
265                                              u64 blkno)
266 {
267         struct ocfs2_xattr_block *xb = et->et_object;
268         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
269
270         xt->xt_last_eb_blk = cpu_to_le64(blkno);
271 }
272
273 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
274 {
275         struct ocfs2_xattr_block *xb = et->et_object;
276         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
277
278         return le64_to_cpu(xt->xt_last_eb_blk);
279 }
280
281 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
282                                              struct ocfs2_extent_tree *et,
283                                              u32 clusters)
284 {
285         struct ocfs2_xattr_block *xb = et->et_object;
286
287         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
288 }
289
290 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
291         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
292         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
293         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
294         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
295         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
296 };
297
298 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
299                                      struct inode *inode,
300                                      struct buffer_head *bh,
301                                      void *obj,
302                                      struct ocfs2_extent_tree_operations *ops)
303 {
304         et->et_ops = ops;
305         et->et_root_bh = bh;
306         if (!obj)
307                 obj = (void *)bh->b_data;
308         et->et_object = obj;
309
310         et->et_ops->eo_fill_root_el(et);
311         if (!et->et_ops->eo_fill_max_leaf_clusters)
312                 et->et_max_leaf_clusters = 0;
313         else
314                 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
315 }
316
317 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
318                                    struct inode *inode,
319                                    struct buffer_head *bh)
320 {
321         __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
322 }
323
324 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
325                                        struct inode *inode,
326                                        struct buffer_head *bh)
327 {
328         __ocfs2_init_extent_tree(et, inode, bh, NULL,
329                                  &ocfs2_xattr_tree_et_ops);
330 }
331
332 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
333                                         struct inode *inode,
334                                         struct buffer_head *bh,
335                                         struct ocfs2_xattr_value_root *xv)
336 {
337         __ocfs2_init_extent_tree(et, inode, bh, xv,
338                                  &ocfs2_xattr_value_et_ops);
339 }
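
/*
 * Illustrative caller (hypothetical, not part of this file): the
 * extent tree is a short-lived, stack-allocated handle that wraps an
 * on-disk root before calling into the generic allocation code.
 *
 *	struct ocfs2_extent_tree et;
 *
 *	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 *	free_extents = ocfs2_num_free_extents(osb, inode, &et);
 *
 * No teardown is needed; the structure only borrows di_bh, which the
 * caller continues to own.
 */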
340
341 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
342                                             u64 new_last_eb_blk)
343 {
344         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
345 }
346
347 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
348 {
349         return et->et_ops->eo_get_last_eb_blk(et);
350 }
351
352 static inline void ocfs2_et_update_clusters(struct inode *inode,
353                                             struct ocfs2_extent_tree *et,
354                                             u32 clusters)
355 {
356         et->et_ops->eo_update_clusters(inode, et, clusters);
357 }
358
359 static inline int ocfs2_et_insert_check(struct inode *inode,
360                                         struct ocfs2_extent_tree *et,
361                                         struct ocfs2_extent_rec *rec)
362 {
363         int ret = 0;
364
365         if (et->et_ops->eo_insert_check)
366                 ret = et->et_ops->eo_insert_check(inode, et, rec);
367         return ret;
368 }
369
370 static inline int ocfs2_et_sanity_check(struct inode *inode,
371                                         struct ocfs2_extent_tree *et)
372 {
373         int ret = 0;
374
375         if (et->et_ops->eo_sanity_check)
376                 ret = et->et_ops->eo_sanity_check(inode, et);
377         return ret;
378 }
379
380 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
381 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
382                                          struct ocfs2_extent_block *eb);
383
384 /*
385  * Structures which describe a path through a btree, and functions to
386  * manipulate them.
387  *
388  * The idea here is to be as generic as possible with the tree
389  * manipulation code.
390  */
391 struct ocfs2_path_item {
392         struct buffer_head              *bh;
393         struct ocfs2_extent_list        *el;
394 };
395
396 #define OCFS2_MAX_PATH_DEPTH    5
397
398 struct ocfs2_path {
399         int                     p_tree_depth;
400         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
401 };
402
403 #define path_root_bh(_path) ((_path)->p_node[0].bh)
404 #define path_root_el(_path) ((_path)->p_node[0].el)
405 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
406 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
407 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
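
/*
 * For example, in a tree of depth 2, p_node[0] holds the root extent
 * list, p_node[2] holds the leaf, and p_node[1] the interior extent
 * block between them.  With a tree depth of 0 the root and the leaf
 * are the same item, so path_root_el() == path_leaf_el().
 */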
408
409 /*
410  * Reset the actual path elements so that we can re-use the structure
411  * to build another path. Generally, this involves freeing the buffer
412  * heads.
413  */
414 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
415 {
416         int i, start = 0, depth = 0;
417         struct ocfs2_path_item *node;
418
419         if (keep_root)
420                 start = 1;
421
422         for(i = start; i < path_num_items(path); i++) {
423                 node = &path->p_node[i];
424
425                 brelse(node->bh);
426                 node->bh = NULL;
427                 node->el = NULL;
428         }
429
430         /*
431          * Tree depth may change during truncate, or insert. If we're
432          * keeping the root extent list, then make sure that our path
433          * structure reflects the proper depth.
434          */
435         if (keep_root)
436                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
437
438         path->p_tree_depth = depth;
439 }
440
441 static void ocfs2_free_path(struct ocfs2_path *path)
442 {
443         if (path) {
444                 ocfs2_reinit_path(path, 0);
445                 kfree(path);
446         }
447 }
448
449 /*
450  * Copy all the elements of src into dest. After this call, src could be freed
451  * without affecting dest.
452  *
453  * Both paths should have the same root. Any non-root elements of dest
454  * will be freed.
455  */
456 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
457 {
458         int i;
459
460         BUG_ON(path_root_bh(dest) != path_root_bh(src));
461         BUG_ON(path_root_el(dest) != path_root_el(src));
462
463         ocfs2_reinit_path(dest, 1);
464
465         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
466                 dest->p_node[i].bh = src->p_node[i].bh;
467                 dest->p_node[i].el = src->p_node[i].el;
468
469                 if (dest->p_node[i].bh)
470                         get_bh(dest->p_node[i].bh);
471         }
472 }
473
474 /*
475  * Make the *dest path the same as src and re-initialize src path to
476  * have a root only.
477  */
478 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
479 {
480         int i;
481
482         BUG_ON(path_root_bh(dest) != path_root_bh(src));
483
484         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
485                 brelse(dest->p_node[i].bh);
486
487                 dest->p_node[i].bh = src->p_node[i].bh;
488                 dest->p_node[i].el = src->p_node[i].el;
489
490                 src->p_node[i].bh = NULL;
491                 src->p_node[i].el = NULL;
492         }
493 }
494
495 /*
496  * Insert an extent block at given index.
497  *
498  * This will not take an additional reference on eb_bh.
499  */
500 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
501                                         struct buffer_head *eb_bh)
502 {
503         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
504
505         /*
506          * Right now, no root bh is an extent block, so this helps
507          * catch code errors with dinode trees. The assertion can be
508          * safely removed if we ever need to insert extent block
509          * structures at the root.
510          */
511         BUG_ON(index == 0);
512
513         path->p_node[index].bh = eb_bh;
514         path->p_node[index].el = &eb->h_list;
515 }
516
517 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
518                                          struct ocfs2_extent_list *root_el)
519 {
520         struct ocfs2_path *path;
521
522         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
523
524         path = kzalloc(sizeof(*path), GFP_NOFS);
525         if (path) {
526                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
527                 get_bh(root_bh);
528                 path_root_bh(path) = root_bh;
529                 path_root_el(path) = root_el;
530         }
531
532         return path;
533 }
534
535 static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
536 {
537         return ocfs2_new_path(path_root_bh(path), path_root_el(path));
538 }
539
540 static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
541 {
542         return ocfs2_new_path(et->et_root_bh, et->et_root_el);
543 }
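
/*
 * Illustrative usage of the two wrappers above (hypothetical caller,
 * not part of this file): they replace open-coded calls such as
 * ocfs2_new_path(path_root_bh(left_path), path_root_el(left_path)).
 *
 *	struct ocfs2_path *left_path = NULL, *right_path = NULL;
 *
 *	left_path = ocfs2_new_path_from_et(et);
 *	if (!left_path) {
 *		ret = -ENOMEM;
 *		goto out;
 *	}
 *
 *	right_path = ocfs2_new_path_from_path(left_path);
 *	if (!right_path) {
 *		ret = -ENOMEM;
 *		goto out;
 *	}
 *
 * Both paths share the same root; ocfs2_new_path() takes its own
 * reference on the root buffer_head, so each path is freed
 * independently with ocfs2_free_path().
 */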
544
545 /*
546  * Convenience function to journal all components in a path.
547  */
548 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
549                                      struct ocfs2_path *path)
550 {
551         int i, ret = 0;
552
553         if (!path)
554                 goto out;
555
556         for(i = 0; i < path_num_items(path); i++) {
557                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
558                                            OCFS2_JOURNAL_ACCESS_WRITE);
559                 if (ret < 0) {
560                         mlog_errno(ret);
561                         goto out;
562                 }
563         }
564
565 out:
566         return ret;
567 }
568
569 /*
570  * Return the index of the extent record which contains cluster #v_cluster.
571  * -1 is returned if it was not found.
572  *
573  * Should work fine on interior and exterior nodes.
574  */
575 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
576 {
577         int ret = -1;
578         int i;
579         struct ocfs2_extent_rec *rec;
580         u32 rec_end, rec_start, clusters;
581
582         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
583                 rec = &el->l_recs[i];
584
585                 rec_start = le32_to_cpu(rec->e_cpos);
586                 clusters = ocfs2_rec_clusters(el, rec);
587
588                 rec_end = rec_start + clusters;
589
590                 if (v_cluster >= rec_start && v_cluster < rec_end) {
591                         ret = i;
592                         break;
593                 }
594         }
595
596         return ret;
597 }
598
599 enum ocfs2_contig_type {
600         CONTIG_NONE = 0,
601         CONTIG_LEFT,
602         CONTIG_RIGHT,
603         CONTIG_LEFTRIGHT,
604 };
605
606
607 /*
608  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
609  * ocfs2_extent_contig() only work properly against leaf nodes!
610  */
611 static int ocfs2_block_extent_contig(struct super_block *sb,
612                                      struct ocfs2_extent_rec *ext,
613                                      u64 blkno)
614 {
615         u64 blk_end = le64_to_cpu(ext->e_blkno);
616
617         blk_end += ocfs2_clusters_to_blocks(sb,
618                                     le16_to_cpu(ext->e_leaf_clusters));
619
620         return blkno == blk_end;
621 }
622
623 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
624                                   struct ocfs2_extent_rec *right)
625 {
626         u32 left_range;
627
628         left_range = le32_to_cpu(left->e_cpos) +
629                 le16_to_cpu(left->e_leaf_clusters);
630
631         return (left_range == le32_to_cpu(right->e_cpos));
632 }
633
634 static enum ocfs2_contig_type
635         ocfs2_extent_contig(struct inode *inode,
636                             struct ocfs2_extent_rec *ext,
637                             struct ocfs2_extent_rec *insert_rec)
638 {
639         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
640
641         /*
642          * Refuse to coalesce extent records with different flag
643          * fields - we don't want to mix unwritten extents with user
644          * data.
645          */
646         if (ext->e_flags != insert_rec->e_flags)
647                 return CONTIG_NONE;
648
649         if (ocfs2_extents_adjacent(ext, insert_rec) &&
650             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
651                         return CONTIG_RIGHT;
652
653         blkno = le64_to_cpu(ext->e_blkno);
654         if (ocfs2_extents_adjacent(insert_rec, ext) &&
655             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
656                 return CONTIG_LEFT;
657
658         return CONTIG_NONE;
659 }
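
/*
 * For example, if ext covers cpos 0-7 and insert_rec starts at cpos 8
 * with a physical block that immediately follows ext's last block,
 * ocfs2_extent_contig() returns CONTIG_RIGHT: insert_rec can be merged
 * onto the right end of ext.  CONTIG_LEFT is the mirror case, and
 * records with differing flags (e.g. unwritten vs. written) are never
 * merged.
 */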
660
661 /*
662  * NOTE: We can have pretty much any combination of contiguousness and
663  * appending.
664  *
665  * The usefulness of APPEND_TAIL is more in that it lets us know that
666  * we'll have to update the path to that leaf.
667  */
668 enum ocfs2_append_type {
669         APPEND_NONE = 0,
670         APPEND_TAIL,
671 };
672
673 enum ocfs2_split_type {
674         SPLIT_NONE = 0,
675         SPLIT_LEFT,
676         SPLIT_RIGHT,
677 };
678
679 struct ocfs2_insert_type {
680         enum ocfs2_split_type   ins_split;
681         enum ocfs2_append_type  ins_appending;
682         enum ocfs2_contig_type  ins_contig;
683         int                     ins_contig_index;
684         int                     ins_tree_depth;
685 };
686
687 struct ocfs2_merge_ctxt {
688         enum ocfs2_contig_type  c_contig_type;
689         int                     c_has_empty_extent;
690         int                     c_split_covers_rec;
691 };
692
693 static int ocfs2_validate_extent_block(struct super_block *sb,
694                                        struct buffer_head *bh)
695 {
696         int rc;
697         struct ocfs2_extent_block *eb =
698                 (struct ocfs2_extent_block *)bh->b_data;
699
700         mlog(0, "Validating extent block %llu\n",
701              (unsigned long long)bh->b_blocknr);
702
703         BUG_ON(!buffer_uptodate(bh));
704
705         /*
706          * If the ecc fails, we return the error but otherwise
707          * leave the filesystem running.  We know any error is
708          * local to this block.
709          */
710         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
711         if (rc)
712                 return rc;
713
714         /*
715          * Errors after here are fatal.
716          */
717
718         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
719                 ocfs2_error(sb,
720                             "Extent block #%llu has bad signature %.*s",
721                             (unsigned long long)bh->b_blocknr, 7,
722                             eb->h_signature);
723                 return -EINVAL;
724         }
725
726         if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
727                 ocfs2_error(sb,
728                             "Extent block #%llu has an invalid h_blkno "
729                             "of %llu",
730                             (unsigned long long)bh->b_blocknr,
731                             (unsigned long long)le64_to_cpu(eb->h_blkno));
732                 return -EINVAL;
733         }
734
735         if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
736                 ocfs2_error(sb,
737                             "Extent block #%llu has an invalid "
738                             "h_fs_generation of #%u",
739                             (unsigned long long)bh->b_blocknr,
740                             le32_to_cpu(eb->h_fs_generation));
741                 return -EINVAL;
742         }
743
744         return 0;
745 }
746
747 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
748                             struct buffer_head **bh)
749 {
750         int rc;
751         struct buffer_head *tmp = *bh;
752
753         rc = ocfs2_read_block(inode, eb_blkno, &tmp,
754                               ocfs2_validate_extent_block);
755
756         /* If ocfs2_read_block() got us a new bh, pass it up. */
757         if (!rc && !*bh)
758                 *bh = tmp;
759
760         return rc;
761 }
762
763
764 /*
765  * How many free extents have we got before we need more metadata?
766  */
767 int ocfs2_num_free_extents(struct ocfs2_super *osb,
768                            struct inode *inode,
769                            struct ocfs2_extent_tree *et)
770 {
771         int retval;
772         struct ocfs2_extent_list *el = NULL;
773         struct ocfs2_extent_block *eb;
774         struct buffer_head *eb_bh = NULL;
775         u64 last_eb_blk = 0;
776
777         mlog_entry_void();
778
779         el = et->et_root_el;
780         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
781
782         if (last_eb_blk) {
783                 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
784                 if (retval < 0) {
785                         mlog_errno(retval);
786                         goto bail;
787                 }
788                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
789                 el = &eb->h_list;
790         }
791
792         BUG_ON(el->l_tree_depth != 0);
793
794         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
795 bail:
796         brelse(eb_bh);
797
798         mlog_exit(retval);
799         return retval;
800 }
801
802 /* expects array to already be allocated
803  *
804  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
805  * l_count for you
806  */
807 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
808                                      handle_t *handle,
809                                      struct inode *inode,
810                                      int wanted,
811                                      struct ocfs2_alloc_context *meta_ac,
812                                      struct buffer_head *bhs[])
813 {
814         int count, status, i;
815         u16 suballoc_bit_start;
816         u32 num_got;
817         u64 first_blkno;
818         struct ocfs2_extent_block *eb;
819
820         mlog_entry_void();
821
822         count = 0;
823         while (count < wanted) {
824                 status = ocfs2_claim_metadata(osb,
825                                               handle,
826                                               meta_ac,
827                                               wanted - count,
828                                               &suballoc_bit_start,
829                                               &num_got,
830                                               &first_blkno);
831                 if (status < 0) {
832                         mlog_errno(status);
833                         goto bail;
834                 }
835
836                 for(i = count;  i < (num_got + count); i++) {
837                         bhs[i] = sb_getblk(osb->sb, first_blkno);
838                         if (bhs[i] == NULL) {
839                                 status = -EIO;
840                                 mlog_errno(status);
841                                 goto bail;
842                         }
843                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
844
845                         status = ocfs2_journal_access(handle, inode, bhs[i],
846                                                       OCFS2_JOURNAL_ACCESS_CREATE);
847                         if (status < 0) {
848                                 mlog_errno(status);
849                                 goto bail;
850                         }
851
852                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
853                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
854                         /* Ok, setup the minimal stuff here. */
855                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
856                         eb->h_blkno = cpu_to_le64(first_blkno);
857                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
858                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
859                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
860                         eb->h_list.l_count =
861                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
862
863                         suballoc_bit_start++;
864                         first_blkno++;
865
866                         /* We'll also be dirtied by the caller, so
867                          * this isn't absolutely necessary. */
868                         status = ocfs2_journal_dirty(handle, bhs[i]);
869                         if (status < 0) {
870                                 mlog_errno(status);
871                                 goto bail;
872                         }
873                 }
874
875                 count += num_got;
876         }
877
878         status = 0;
879 bail:
880         if (status < 0) {
881                 for(i = 0; i < wanted; i++) {
882                         brelse(bhs[i]);
883                         bhs[i] = NULL;
884                 }
885         }
886         mlog_exit(status);
887         return status;
888 }
889
890 /*
891  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
892  *
893  * Returns the sum of the rightmost extent rec logical offset and
894  * cluster count.
895  *
896  * ocfs2_add_branch() uses this to determine what logical cluster
897  * value should be populated into the leftmost new branch records.
898  *
899  * ocfs2_shift_tree_depth() uses this to determine the # clusters
900  * value for the new topmost tree record.
901  */
902 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
903 {
904         int i;
905
906         i = le16_to_cpu(el->l_next_free_rec) - 1;
907
908         return le32_to_cpu(el->l_recs[i].e_cpos) +
909                 ocfs2_rec_clusters(el, &el->l_recs[i]);
910 }
911
912 /*
913  * Add an entire tree branch to our inode. eb_bh is the extent block
914  * to start at, if we don't want to start the branch at the dinode
915  * structure.
916  *
917  * last_eb_bh is required as we have to update its next_leaf pointer
918  * for the new last extent block.
919  *
920  * The new branch will be 'empty' in the sense that every block will
921  * contain a single record with cluster count == 0.
922  */
923 static int ocfs2_add_branch(struct ocfs2_super *osb,
924                             handle_t *handle,
925                             struct inode *inode,
926                             struct ocfs2_extent_tree *et,
927                             struct buffer_head *eb_bh,
928                             struct buffer_head **last_eb_bh,
929                             struct ocfs2_alloc_context *meta_ac)
930 {
931         int status, new_blocks, i;
932         u64 next_blkno, new_last_eb_blk;
933         struct buffer_head *bh;
934         struct buffer_head **new_eb_bhs = NULL;
935         struct ocfs2_extent_block *eb;
936         struct ocfs2_extent_list  *eb_el;
937         struct ocfs2_extent_list  *el;
938         u32 new_cpos;
939
940         mlog_entry_void();
941
942         BUG_ON(!last_eb_bh || !*last_eb_bh);
943
944         if (eb_bh) {
945                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
946                 el = &eb->h_list;
947         } else
948                 el = et->et_root_el;
949
950         /* we never add a branch to a leaf. */
951         BUG_ON(!el->l_tree_depth);
952
953         new_blocks = le16_to_cpu(el->l_tree_depth);
954
955         /* allocate the number of new eb blocks we need */
956         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
957                              GFP_KERNEL);
958         if (!new_eb_bhs) {
959                 status = -ENOMEM;
960                 mlog_errno(status);
961                 goto bail;
962         }
963
964         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
965                                            meta_ac, new_eb_bhs);
966         if (status < 0) {
967                 mlog_errno(status);
968                 goto bail;
969         }
970
971         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
972         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
973
974         /* Note: new_eb_bhs[new_blocks - 1] is the block which will be
975          * linked with the rest of the tree.
976          * Conversely, new_eb_bhs[0] is the new bottommost leaf.
977          *
978          * When we leave the loop, new_last_eb_blk will point to the
979          * newest leaf, and next_blkno will point to the topmost extent
980          * block. */
981         next_blkno = new_last_eb_blk = 0;
982         for(i = 0; i < new_blocks; i++) {
983                 bh = new_eb_bhs[i];
984                 eb = (struct ocfs2_extent_block *) bh->b_data;
985                 /* ocfs2_create_new_meta_bhs() should create it right! */
986                 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
987                 eb_el = &eb->h_list;
988
989                 status = ocfs2_journal_access(handle, inode, bh,
990                                               OCFS2_JOURNAL_ACCESS_CREATE);
991                 if (status < 0) {
992                         mlog_errno(status);
993                         goto bail;
994                 }
995
996                 eb->h_next_leaf_blk = 0;
997                 eb_el->l_tree_depth = cpu_to_le16(i);
998                 eb_el->l_next_free_rec = cpu_to_le16(1);
999                 /*
1000                  * This actually counts as an empty extent as
1001                  * the cluster count == 0
1002                  */
1003                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1004                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1005                 /*
1006                  * eb_el isn't always an interior node, but even leaf
1007                  * nodes want a zero'd flags and reserved field so
1008                  * this gets the whole 32 bits regardless of use.
1009                  */
1010                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1011                 if (!eb_el->l_tree_depth)
1012                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1013
1014                 status = ocfs2_journal_dirty(handle, bh);
1015                 if (status < 0) {
1016                         mlog_errno(status);
1017                         goto bail;
1018                 }
1019
1020                 next_blkno = le64_to_cpu(eb->h_blkno);
1021         }
1022
1023         /* This is a bit hairy. We want to update up to three blocks
1024          * here without leaving any of them in an inconsistent state
1025          * in case of error. We don't have to worry about
1026          * journal_dirty erroring as it won't unless we've aborted the
1027          * handle (in which case we would never be here) so reserving
1028          * the write with journal_access is all we need to do. */
1029         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
1030                                       OCFS2_JOURNAL_ACCESS_WRITE);
1031         if (status < 0) {
1032                 mlog_errno(status);
1033                 goto bail;
1034         }
1035         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1036                                       OCFS2_JOURNAL_ACCESS_WRITE);
1037         if (status < 0) {
1038                 mlog_errno(status);
1039                 goto bail;
1040         }
1041         if (eb_bh) {
1042                 status = ocfs2_journal_access(handle, inode, eb_bh,
1043                                               OCFS2_JOURNAL_ACCESS_WRITE);
1044                 if (status < 0) {
1045                         mlog_errno(status);
1046                         goto bail;
1047                 }
1048         }
1049
1050         /* Link the new branch into the rest of the tree (el will
1051          * either be on the root_bh, or the extent block passed in). */
1052         i = le16_to_cpu(el->l_next_free_rec);
1053         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1054         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1055         el->l_recs[i].e_int_clusters = 0;
1056         le16_add_cpu(&el->l_next_free_rec, 1);
1057
1058         /* fe needs a new last extent block pointer, as does the
1059          * next_leaf on the previously last-extent-block. */
1060         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1061
1062         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1063         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1064
1065         status = ocfs2_journal_dirty(handle, *last_eb_bh);
1066         if (status < 0)
1067                 mlog_errno(status);
1068         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1069         if (status < 0)
1070                 mlog_errno(status);
1071         if (eb_bh) {
1072                 status = ocfs2_journal_dirty(handle, eb_bh);
1073                 if (status < 0)
1074                         mlog_errno(status);
1075         }
1076
1077         /*
1078          * Some callers want to track the rightmost leaf so pass it
1079          * back here.
1080          */
1081         brelse(*last_eb_bh);
1082         get_bh(new_eb_bhs[0]);
1083         *last_eb_bh = new_eb_bhs[0];
1084
1085         status = 0;
1086 bail:
1087         if (new_eb_bhs) {
1088                 for (i = 0; i < new_blocks; i++)
1089                         brelse(new_eb_bhs[i]);
1090                 kfree(new_eb_bhs);
1091         }
1092
1093         mlog_exit(status);
1094         return status;
1095 }
1096
1097 /*
1098  * Adds another level to the allocation tree.
1099  * Returns the new extent block so you can add a branch to it
1100  * after this call.
1101  */
1102 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1103                                   handle_t *handle,
1104                                   struct inode *inode,
1105                                   struct ocfs2_extent_tree *et,
1106                                   struct ocfs2_alloc_context *meta_ac,
1107                                   struct buffer_head **ret_new_eb_bh)
1108 {
1109         int status, i;
1110         u32 new_clusters;
1111         struct buffer_head *new_eb_bh = NULL;
1112         struct ocfs2_extent_block *eb;
1113         struct ocfs2_extent_list  *root_el;
1114         struct ocfs2_extent_list  *eb_el;
1115
1116         mlog_entry_void();
1117
1118         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1119                                            &new_eb_bh);
1120         if (status < 0) {
1121                 mlog_errno(status);
1122                 goto bail;
1123         }
1124
1125         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1126         /* ocfs2_create_new_meta_bhs() should create it right! */
1127         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1128
1129         eb_el = &eb->h_list;
1130         root_el = et->et_root_el;
1131
1132         status = ocfs2_journal_access(handle, inode, new_eb_bh,
1133                                       OCFS2_JOURNAL_ACCESS_CREATE);
1134         if (status < 0) {
1135                 mlog_errno(status);
1136                 goto bail;
1137         }
1138
1139         /* copy the root extent list data into the new extent block */
1140         eb_el->l_tree_depth = root_el->l_tree_depth;
1141         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1142         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1143                 eb_el->l_recs[i] = root_el->l_recs[i];
1144
1145         status = ocfs2_journal_dirty(handle, new_eb_bh);
1146         if (status < 0) {
1147                 mlog_errno(status);
1148                 goto bail;
1149         }
1150
1151         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1152                                       OCFS2_JOURNAL_ACCESS_WRITE);
1153         if (status < 0) {
1154                 mlog_errno(status);
1155                 goto bail;
1156         }
1157
1158         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1159
1160         /* update root_bh now */
1161         le16_add_cpu(&root_el->l_tree_depth, 1);
1162         root_el->l_recs[0].e_cpos = 0;
1163         root_el->l_recs[0].e_blkno = eb->h_blkno;
1164         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1165         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1166                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1167         root_el->l_next_free_rec = cpu_to_le16(1);
1168
1169         /* If this is our 1st tree depth shift, then last_eb_blk
1170          * becomes the allocated extent block */
1171         if (root_el->l_tree_depth == cpu_to_le16(1))
1172                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1173
1174         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1175         if (status < 0) {
1176                 mlog_errno(status);
1177                 goto bail;
1178         }
1179
1180         *ret_new_eb_bh = new_eb_bh;
1181         new_eb_bh = NULL;
1182         status = 0;
1183 bail:
1184         brelse(new_eb_bh);
1185
1186         mlog_exit(status);
1187         return status;
1188 }
1189
1190 /*
1191  * Should only be called when there is no space left in any of the
1192  * leaf nodes. What we want to do is find the lowest tree depth
1193  * non-leaf extent block with room for new records. There are three
1194  * valid results of this search:
1195  *
1196  * 1) a lowest extent block is found, then we pass it back in
1197  *    *target_bh and return '0'
1198  *
1199  * 2) the search fails to find anything, but the root_el has room. We
1200  *    pass NULL back in *target_bh, but still return '0'
1201  *
1202  * 3) the search fails to find anything AND the root_el is full, in
1203  *    which case we return > 0
1204  *
1205  * return status < 0 indicates an error.
1206  */
1207 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1208                                     struct inode *inode,
1209                                     struct ocfs2_extent_tree *et,
1210                                     struct buffer_head **target_bh)
1211 {
1212         int status = 0, i;
1213         u64 blkno;
1214         struct ocfs2_extent_block *eb;
1215         struct ocfs2_extent_list  *el;
1216         struct buffer_head *bh = NULL;
1217         struct buffer_head *lowest_bh = NULL;
1218
1219         mlog_entry_void();
1220
1221         *target_bh = NULL;
1222
1223         el = et->et_root_el;
1224
1225         while(le16_to_cpu(el->l_tree_depth) > 1) {
1226                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1227                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1228                                     "extent list (next_free_rec == 0)",
1229                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1230                         status = -EIO;
1231                         goto bail;
1232                 }
1233                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1234                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1235                 if (!blkno) {
1236                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1237                                     "list where extent # %d has no physical "
1238                                     "block start",
1239                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1240                         status = -EIO;
1241                         goto bail;
1242                 }
1243
1244                 brelse(bh);
1245                 bh = NULL;
1246
1247                 status = ocfs2_read_extent_block(inode, blkno, &bh);
1248                 if (status < 0) {
1249                         mlog_errno(status);
1250                         goto bail;
1251                 }
1252
1253                 eb = (struct ocfs2_extent_block *) bh->b_data;
1254                 el = &eb->h_list;
1255
1256                 if (le16_to_cpu(el->l_next_free_rec) <
1257                     le16_to_cpu(el->l_count)) {
1258                         brelse(lowest_bh);
1259                         lowest_bh = bh;
1260                         get_bh(lowest_bh);
1261                 }
1262         }
1263
1264         /* If we didn't find one and the fe doesn't have any room,
1265          * then return '1' */
1266         el = et->et_root_el;
1267         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1268                 status = 1;
1269
1270         *target_bh = lowest_bh;
1271 bail:
1272         brelse(bh);
1273
1274         mlog_exit(status);
1275         return status;
1276 }
1277
1278 /*
1279  * Grow a b-tree so that it has more records.
1280  *
1281  * We might shift the tree depth in which case existing paths should
1282  * be considered invalid.
1283  *
1284  * Tree depth after the grow is returned via *final_depth.
1285  *
1286  * *last_eb_bh will be updated by ocfs2_add_branch().
1287  */
1288 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1289                            struct ocfs2_extent_tree *et, int *final_depth,
1290                            struct buffer_head **last_eb_bh,
1291                            struct ocfs2_alloc_context *meta_ac)
1292 {
1293         int ret, shift;
1294         struct ocfs2_extent_list *el = et->et_root_el;
1295         int depth = le16_to_cpu(el->l_tree_depth);
1296         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1297         struct buffer_head *bh = NULL;
1298
1299         BUG_ON(meta_ac == NULL);
1300
1301         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1302         if (shift < 0) {
1303                 ret = shift;
1304                 mlog_errno(ret);
1305                 goto out;
1306         }
1307
1308         /* We traveled all the way to the bottom of the allocation tree
1309          * and didn't find room for any more extents - we need to add
1310          * another tree level */
1311         if (shift) {
1312                 BUG_ON(bh);
1313                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1314
1315                 /* ocfs2_shift_tree_depth will return us a buffer with
1316                  * the new extent block (so we can pass that to
1317                  * ocfs2_add_branch). */
1318                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1319                                              meta_ac, &bh);
1320                 if (ret < 0) {
1321                         mlog_errno(ret);
1322                         goto out;
1323                 }
1324                 depth++;
1325                 if (depth == 1) {
1326                         /*
1327                          * Special case: we have room now if we shifted from
1328                          * tree_depth 0, so no more work needs to be done.
1329                          *
1330                          * We won't be calling add_branch, so pass
1331                          * back *last_eb_bh as the new leaf. At depth
1332                          * zero, it should always be null so there's
1333                          * no reason to brelse.
1334                          */
1335                         BUG_ON(*last_eb_bh);
1336                         get_bh(bh);
1337                         *last_eb_bh = bh;
1338                         goto out;
1339                 }
1340         }
1341
1342         /* call ocfs2_add_branch to add the final part of the tree with
1343          * the new data. */
1344         mlog(0, "add branch. bh = %p\n", bh);
1345         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1346                                meta_ac);
1347         if (ret < 0) {
1348                 mlog_errno(ret);
1349                 goto out;
1350         }
1351
1352 out:
1353         if (final_depth)
1354                 *final_depth = depth;
1355         brelse(bh);
1356         return ret;
1357 }
1358
1359 /*
1360  * Shift all records one slot to the right, opening up l_recs[0].  The list must not be full.
1361  */
1362 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1363 {
1364         int next_free = le16_to_cpu(el->l_next_free_rec);
1365         int count = le16_to_cpu(el->l_count);
1366         unsigned int num_bytes;
1367
1368         BUG_ON(!next_free);
1369         /* This will cause us to go off the end of our extent list. */
1370         BUG_ON(next_free >= count);
1371
1372         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1373
1374         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1375 }
1376
1377 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1378                               struct ocfs2_extent_rec *insert_rec)
1379 {
1380         int i, insert_index, next_free, has_empty, num_bytes;
1381         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1382         struct ocfs2_extent_rec *rec;
1383
1384         next_free = le16_to_cpu(el->l_next_free_rec);
1385         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1386
1387         BUG_ON(!next_free);
1388
1389         /* The tree code before us didn't allow enough room in the leaf. */
1390         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1391
1392         /*
1393          * The easiest way to approach this is to just remove the
1394          * empty extent and temporarily decrement next_free.
1395          */
1396         if (has_empty) {
1397                 /*
1398                  * If next_free was 1 (only an empty extent), this
1399                  * loop won't execute, which is fine. We still want
1400                  * the decrement below to happen.
1401                  */
1402                 for(i = 0; i < (next_free - 1); i++)
1403                         el->l_recs[i] = el->l_recs[i+1];
1404
1405                 next_free--;
1406         }
1407
1408         /*
1409          * Figure out what the new record index should be.
1410          */
1411         for(i = 0; i < next_free; i++) {
1412                 rec = &el->l_recs[i];
1413
1414                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1415                         break;
1416         }
1417         insert_index = i;
1418
1419         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1420              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1421
1422         BUG_ON(insert_index < 0);
1423         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1424         BUG_ON(insert_index > next_free);
1425
1426         /*
1427          * No need to memmove if we're just adding to the tail.
1428          */
1429         if (insert_index != next_free) {
1430                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1431
1432                 num_bytes = next_free - insert_index;
1433                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1434                 memmove(&el->l_recs[insert_index + 1],
1435                         &el->l_recs[insert_index],
1436                         num_bytes);
1437         }
1438
1439         /*
1440          * Either we had an empty extent and need to re-increment, or
1441          * there was no empty extent on a non-full rightmost leaf node,
1442          * in which case we still need to increment.
1443          */
1444         next_free++;
1445         el->l_next_free_rec = cpu_to_le16(next_free);
1446         /*
1447          * Make sure none of the math above just messed up our tree.
1448          */
1449         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1450
1451         el->l_recs[insert_index] = *insert_rec;
1452
1453 }
1454
1455 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1456 {
1457         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1458
1459         BUG_ON(num_recs == 0);
1460
1461         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1462                 num_recs--;
1463                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1464                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1465                 memset(&el->l_recs[num_recs], 0,
1466                        sizeof(struct ocfs2_extent_rec));
1467                 el->l_next_free_rec = cpu_to_le16(num_recs);
1468         }
1469 }
1470
1471 /*
1472  * Create an empty extent record.
1473  *
1474  * l_next_free_rec may be updated.
1475  *
1476  * If an empty extent already exists do nothing.
1477  */
1478 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1479 {
1480         int next_free = le16_to_cpu(el->l_next_free_rec);
1481
1482         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1483
1484         if (next_free == 0)
1485                 goto set_and_inc;
1486
1487         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1488                 return;
1489
1490         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1491                         "Asked to create an empty extent in a full list:\n"
1492                         "count = %u, tree depth = %u",
1493                         le16_to_cpu(el->l_count),
1494                         le16_to_cpu(el->l_tree_depth));
1495
1496         ocfs2_shift_records_right(el);
1497
1498 set_and_inc:
1499         le16_add_cpu(&el->l_next_free_rec, 1);
1500         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1501 }
1502
1503 /*
1504  * For a rotation which involves two leaf nodes, the "root node" is
1505  * the lowest level tree node which contains a path to both leaves. This
1506  * resulting set of information can be used to form a complete "subtree".
1507  *
1508  * This function is passed two full paths from the dinode down to a
1509  * pair of adjacent leaves. Its task is to figure out which path
1510  * index contains the subtree root - this can be the root index itself
1511  * in a worst-case rotation.
1512  *
1513  * The array index of the subtree root is passed back.
1514  */
1515 static int ocfs2_find_subtree_root(struct inode *inode,
1516                                    struct ocfs2_path *left,
1517                                    struct ocfs2_path *right)
1518 {
1519         int i = 0;
1520
1521         /*
1522          * Check that the caller passed in two paths from the same tree.
1523          */
1524         BUG_ON(path_root_bh(left) != path_root_bh(right));
1525
1526         do {
1527                 i++;
1528
1529                 /*
1530                  * The caller didn't pass two adjacent paths.
1531                  */
1532                 mlog_bug_on_msg(i > left->p_tree_depth,
1533                                 "Inode %lu, left depth %u, right depth %u\n"
1534                                 "left leaf blk %llu, right leaf blk %llu\n",
1535                                 inode->i_ino, left->p_tree_depth,
1536                                 right->p_tree_depth,
1537                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1538                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1539         } while (left->p_node[i].bh->b_blocknr ==
1540                  right->p_node[i].bh->b_blocknr);
1541
1542         return i - 1;
1543 }
1544
1545 typedef void (path_insert_t)(void *, struct buffer_head *);
1546
1547 /*
1548  * Traverse a btree path in search of cpos, starting at root_el.
1549  *
1550  * This code can be called with a cpos larger than the tree, in which
1551  * case it will return the rightmost path.
1552  */
1553 static int __ocfs2_find_path(struct inode *inode,
1554                              struct ocfs2_extent_list *root_el, u32 cpos,
1555                              path_insert_t *func, void *data)
1556 {
1557         int i, ret = 0;
1558         u32 range;
1559         u64 blkno;
1560         struct buffer_head *bh = NULL;
1561         struct ocfs2_extent_block *eb;
1562         struct ocfs2_extent_list *el;
1563         struct ocfs2_extent_rec *rec;
1564         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1565
1566         el = root_el;
1567         while (el->l_tree_depth) {
1568                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1569                         ocfs2_error(inode->i_sb,
1570                                     "Inode %llu has empty extent list at "
1571                                     "depth %u\n",
1572                                     (unsigned long long)oi->ip_blkno,
1573                                     le16_to_cpu(el->l_tree_depth));
1574                         ret = -EROFS;
1575                         goto out;
1576
1577                 }
1578
1579                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1580                         rec = &el->l_recs[i];
1581
1582                         /*
1583                          * In the case that cpos is off the allocation
1584                          * tree, this should just wind up returning the
1585                          * rightmost record.
1586                          */
1587                         range = le32_to_cpu(rec->e_cpos) +
1588                                 ocfs2_rec_clusters(el, rec);
1589                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1590                             break;
1591                 }
1592
1593                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1594                 if (blkno == 0) {
1595                         ocfs2_error(inode->i_sb,
1596                                     "Inode %llu has bad blkno in extent list "
1597                                     "at depth %u (index %d)\n",
1598                                     (unsigned long long)oi->ip_blkno,
1599                                     le16_to_cpu(el->l_tree_depth), i);
1600                         ret = -EROFS;
1601                         goto out;
1602                 }
1603
1604                 brelse(bh);
1605                 bh = NULL;
1606                 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1607                 if (ret) {
1608                         mlog_errno(ret);
1609                         goto out;
1610                 }
1611
1612                 eb = (struct ocfs2_extent_block *) bh->b_data;
1613                 el = &eb->h_list;
1614
1615                 if (le16_to_cpu(el->l_next_free_rec) >
1616                     le16_to_cpu(el->l_count)) {
1617                         ocfs2_error(inode->i_sb,
1618                                     "Inode %llu has bad count in extent list "
1619                                     "at block %llu (next free=%u, count=%u)\n",
1620                                     (unsigned long long)oi->ip_blkno,
1621                                     (unsigned long long)bh->b_blocknr,
1622                                     le16_to_cpu(el->l_next_free_rec),
1623                                     le16_to_cpu(el->l_count));
1624                         ret = -EROFS;
1625                         goto out;
1626                 }
1627
1628                 if (func)
1629                         func(data, bh);
1630         }
1631
1632 out:
1633         /*
1634          * Catch any trailing bh that the loop didn't handle.
1635          */
1636         brelse(bh);
1637
1638         return ret;
1639 }
1640
1641 /*
1642  * Given an initialized path (that is, it has a valid root extent
1643  * list), this function will traverse the btree in search of the path
1644  * which would contain cpos.
1645  *
1646  * The path traveled is recorded in the path structure.
1647  *
1648  * Note that this will not do any comparisons on leaf node extent
1649  * records, so it will work fine in the case that we just added a tree
1650  * branch.
1651  */
1652 struct find_path_data {
1653         int index;
1654         struct ocfs2_path *path;
1655 };
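/*
 * Per-level callback for __ocfs2_find_path(): take a reference on each
 * extent block visited and store it in the path at the next index.
 */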
1656 static void find_path_ins(void *data, struct buffer_head *bh)
1657 {
1658         struct find_path_data *fp = data;
1659
1660         get_bh(bh);
1661         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1662         fp->index++;
1663 }
1664 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1665                            u32 cpos)
1666 {
1667         struct find_path_data data;
1668
1669         data.index = 1;
1670         data.path = path;
1671         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1672                                  find_path_ins, &data);
1673 }
1674
1675 static void find_leaf_ins(void *data, struct buffer_head *bh)
1676 {
1677         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1678         struct ocfs2_extent_list *el = &eb->h_list;
1679         struct buffer_head **ret = data;
1680
1681         /* We want to retain only the leaf block. */
1682         if (le16_to_cpu(el->l_tree_depth) == 0) {
1683                 get_bh(bh);
1684                 *ret = bh;
1685         }
1686 }
1687 /*
1688  * Find the leaf block in the tree which would contain cpos. No
1689  * checking of the actual leaf is done.
1690  *
1691  * Some callers want to use this instead of allocating a path structure
1692  * and calling ocfs2_find_path().
1693  *
1694  * This function doesn't handle non-btree extent lists.
1695  */
1696 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1697                     u32 cpos, struct buffer_head **leaf_bh)
1698 {
1699         int ret;
1700         struct buffer_head *bh = NULL;
1701
1702         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1703         if (ret) {
1704                 mlog_errno(ret);
1705                 goto out;
1706         }
1707
1708         *leaf_bh = bh;
1709 out:
1710         return ret;
1711 }
1712
1713 /*
1714  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1715  *
1716  * Basically, we've moved stuff around at the bottom of the tree and
1717  * we need to fix up the extent records above the changes to reflect
1718  * the new changes.
1719  *
1720  * left_rec: the record on the left.
1721  * left_child_el: the child list pointed to by left_rec
1722  * right_rec: the record to the right of left_rec
1723  * right_child_el: the child list pointed to by right_rec
1724  *
1725  * By definition, this only works on interior nodes.
1726  */
1727 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1728                                   struct ocfs2_extent_list *left_child_el,
1729                                   struct ocfs2_extent_rec *right_rec,
1730                                   struct ocfs2_extent_list *right_child_el)
1731 {
1732         u32 left_clusters, right_end;
1733
1734         /*
1735          * Interior nodes never have holes. Their cpos is the cpos of
1736          * the leftmost record in their child list. Their cluster
1737          * count covers the full theoretical range of their child list
1738          * - the range between their cpos and the cpos of the record
1739          * immediately to their right.
1740          */
1741         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1742         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1743                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1744                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1745         }
1746         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1747         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1748
1749         /*
1750          * Calculate the rightmost cluster count boundary before
1751          * moving cpos - we will need to adjust clusters after
1752          * updating e_cpos to keep the same highest cluster count.
1753          */
1754         right_end = le32_to_cpu(right_rec->e_cpos);
1755         right_end += le32_to_cpu(right_rec->e_int_clusters);
1756
1757         right_rec->e_cpos = left_rec->e_cpos;
1758         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1759
1760         right_end -= le32_to_cpu(right_rec->e_cpos);
1761         right_rec->e_int_clusters = cpu_to_le32(right_end);
1762 }
1763
1764 /*
1765  * Adjust the adjacent root node records involved in a
1766  * rotation. left_el_blkno is passed in as a key so that we can easily
1767  * find its index in the root list.
1768  */
1769 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1770                                       struct ocfs2_extent_list *left_el,
1771                                       struct ocfs2_extent_list *right_el,
1772                                       u64 left_el_blkno)
1773 {
1774         int i;
1775
1776         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1777                le16_to_cpu(left_el->l_tree_depth));
1778
1779         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1780                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1781                         break;
1782         }
1783
1784         /*
1785          * The path walking code should have never returned a root and
1786          * two paths which are not adjacent.
1787          */
1788         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1789
1790         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1791                                       &root_el->l_recs[i + 1], right_el);
1792 }
1793
1794 /*
1795  * We've changed a leaf block (in right_path) and need to reflect that
1796  * change back up the subtree.
1797  *
1798  * This happens in multiple places:
1799  *   - When we've moved an extent record from the left path leaf to the right
1800  *     path leaf to make room for an empty extent in the left path leaf.
1801  *   - When our insert into the right path leaf is at the leftmost edge
1802  *     and requires an update of the path immediately to its left. This
1803  *     can occur at the end of some types of rotation and appending inserts.
1804  *   - When we've adjusted the last extent record in the left path leaf and the
1805  *     1st extent record in the right path leaf during cross extent block merge.
1806  */
1807 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1808                                        struct ocfs2_path *left_path,
1809                                        struct ocfs2_path *right_path,
1810                                        int subtree_index)
1811 {
1812         int ret, i, idx;
1813         struct ocfs2_extent_list *el, *left_el, *right_el;
1814         struct ocfs2_extent_rec *left_rec, *right_rec;
1815         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1816
1817         /*
1818          * Update the counts and position values within all the
1819          * interior nodes to reflect the leaf rotation we just did.
1820          *
1821          * The root node is handled below the loop.
1822          *
1823          * We begin the loop with right_el and left_el pointing to the
1824          * leaf lists and work our way up.
1825          *
1826          * NOTE: within this loop, left_el and right_el always refer
1827          * to the *child* lists.
1828          */
1829         left_el = path_leaf_el(left_path);
1830         right_el = path_leaf_el(right_path);
1831         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1832                 mlog(0, "Adjust records at index %u\n", i);
1833
1834                 /*
1835                  * One nice property of knowing that all of these
1836                  * nodes are below the root is that we only deal with
1837                  * the leftmost right node record and the rightmost
1838                  * left node record.
1839                  */
1840                 el = left_path->p_node[i].el;
1841                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1842                 left_rec = &el->l_recs[idx];
1843
1844                 el = right_path->p_node[i].el;
1845                 right_rec = &el->l_recs[0];
1846
1847                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1848                                               right_el);
1849
1850                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1851                 if (ret)
1852                         mlog_errno(ret);
1853
1854                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1855                 if (ret)
1856                         mlog_errno(ret);
1857
1858                 /*
1859                  * Setup our list pointers now so that the current
1860                  * parents become children in the next iteration.
1861                  */
1862                 left_el = left_path->p_node[i].el;
1863                 right_el = right_path->p_node[i].el;
1864         }
1865
1866         /*
1867          * At the root node, adjust the two adjacent records which
1868          * begin our path to the leaves.
1869          */
1870
1871         el = left_path->p_node[subtree_index].el;
1872         left_el = left_path->p_node[subtree_index + 1].el;
1873         right_el = right_path->p_node[subtree_index + 1].el;
1874
1875         ocfs2_adjust_root_records(el, left_el, right_el,
1876                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1877
1878         root_bh = left_path->p_node[subtree_index].bh;
1879
1880         ret = ocfs2_journal_dirty(handle, root_bh);
1881         if (ret)
1882                 mlog_errno(ret);
1883 }
1884
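/*
 * Rotate the subtree rooted at subtree_index one record to the right:
 * an empty extent is created at the head of the right leaf, the
 * rightmost record of the (full) left leaf is moved into it, and the
 * left leaf is left with an empty extent in slot 0. The interior
 * records above both leaves are then fixed up by
 * ocfs2_complete_edge_insert().
 */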
1885 static int ocfs2_rotate_subtree_right(struct inode *inode,
1886                                       handle_t *handle,
1887                                       struct ocfs2_path *left_path,
1888                                       struct ocfs2_path *right_path,
1889                                       int subtree_index)
1890 {
1891         int ret, i;
1892         struct buffer_head *right_leaf_bh;
1893         struct buffer_head *left_leaf_bh = NULL;
1894         struct buffer_head *root_bh;
1895         struct ocfs2_extent_list *right_el, *left_el;
1896         struct ocfs2_extent_rec move_rec;
1897
1898         left_leaf_bh = path_leaf_bh(left_path);
1899         left_el = path_leaf_el(left_path);
1900
1901         if (left_el->l_next_free_rec != left_el->l_count) {
1902                 ocfs2_error(inode->i_sb,
1903                             "Inode %llu has non-full interior leaf node %llu"
1904                             " (next free = %u)",
1905                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1906                             (unsigned long long)left_leaf_bh->b_blocknr,
1907                             le16_to_cpu(left_el->l_next_free_rec));
1908                 return -EROFS;
1909         }
1910
1911         /*
1912          * This extent block may already have an empty record, in
1913          * which case we can return early.
1914          */
1915         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1916                 return 0;
1917
1918         root_bh = left_path->p_node[subtree_index].bh;
1919         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1920
1921         ret = ocfs2_journal_access(handle, inode, root_bh,
1922                                    OCFS2_JOURNAL_ACCESS_WRITE);
1923         if (ret) {
1924                 mlog_errno(ret);
1925                 goto out;
1926         }
1927
1928         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1929                 ret = ocfs2_journal_access(handle, inode,
1930                                            right_path->p_node[i].bh,
1931                                            OCFS2_JOURNAL_ACCESS_WRITE);
1932                 if (ret) {
1933                         mlog_errno(ret);
1934                         goto out;
1935                 }
1936
1937                 ret = ocfs2_journal_access(handle, inode,
1938                                            left_path->p_node[i].bh,
1939                                            OCFS2_JOURNAL_ACCESS_WRITE);
1940                 if (ret) {
1941                         mlog_errno(ret);
1942                         goto out;
1943                 }
1944         }
1945
1946         right_leaf_bh = path_leaf_bh(right_path);
1947         right_el = path_leaf_el(right_path);
1948
1949         /* This is a code error, not a disk corruption. */
1950         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1951                         "because rightmost leaf block %llu is empty\n",
1952                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1953                         (unsigned long long)right_leaf_bh->b_blocknr);
1954
1955         ocfs2_create_empty_extent(right_el);
1956
1957         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1958         if (ret) {
1959                 mlog_errno(ret);
1960                 goto out;
1961         }
1962
1963         /* Do the copy now. */
1964         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1965         move_rec = left_el->l_recs[i];
1966         right_el->l_recs[0] = move_rec;
1967
1968         /*
1969          * Clear out the record we just copied and shift everything
1970          * over, leaving an empty extent in the left leaf.
1971          *
1972          * We temporarily subtract from next_free_rec so that the
1973          * shift will lose the tail record (which is now defunct).
1974          */
1975         le16_add_cpu(&left_el->l_next_free_rec, -1);
1976         ocfs2_shift_records_right(left_el);
1977         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1978         le16_add_cpu(&left_el->l_next_free_rec, 1);
1979
1980         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1981         if (ret) {
1982                 mlog_errno(ret);
1983                 goto out;
1984         }
1985
1986         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1987                                 subtree_index);
1988
1989 out:
1990         return ret;
1991 }
1992
1993 /*
1994  * Given a full path, determine what cpos value would return us a path
1995  * containing the leaf immediately to the left of the current one.
1996  *
1997  * Will return zero if the path passed in is already the leftmost path.
1998  */
1999 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2000                                          struct ocfs2_path *path, u32 *cpos)
2001 {
2002         int i, j, ret = 0;
2003         u64 blkno;
2004         struct ocfs2_extent_list *el;
2005
2006         BUG_ON(path->p_tree_depth == 0);
2007
2008         *cpos = 0;
2009
2010         blkno = path_leaf_bh(path)->b_blocknr;
2011
2012         /* Start at the tree node just above the leaf and work our way up. */
2013         i = path->p_tree_depth - 1;
2014         while (i >= 0) {
2015                 el = path->p_node[i].el;
2016
2017                 /*
2018                  * Find the extent record just before the one in our
2019                  * path.
2020                  */
2021                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2022                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2023                                 if (j == 0) {
2024                                         if (i == 0) {
2025                                                 /*
2026                                                  * We've determined that the
2027                                                  * path specified is already
2028                                                  * the leftmost one - return a
2029                                                  * cpos of zero.
2030                                                  */
2031                                                 goto out;
2032                                         }
2033                                         /*
2034                                          * The leftmost record points to our
2035                                          * leaf - we need to travel up the
2036                                          * tree one level.
2037                                          */
2038                                         goto next_node;
2039                                 }
2040
2041                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2042                                 *cpos = *cpos + ocfs2_rec_clusters(el,
2043                                                            &el->l_recs[j - 1]);
2044                                 *cpos = *cpos - 1;
2045                                 goto out;
2046                         }
2047                 }
2048
2049                 /*
2050                  * If we got here, we never found a valid node where
2051                  * the tree indicated one should be.
2052                  */
2053                 ocfs2_error(sb,
2054                             "Invalid extent tree at extent block %llu\n",
2055                             (unsigned long long)blkno);
2056                 ret = -EROFS;
2057                 goto out;
2058
2059 next_node:
2060                 blkno = path->p_node[i].bh->b_blocknr;
2061                 i--;
2062         }
2063
2064 out:
2065         return ret;
2066 }
2067
2068 /*
2069  * Extend the transaction by enough credits to complete the rotation,
2070  * and still leave at least the original number of credits allocated
2071  * to this transaction.
2072  */
2073 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2074                                            int op_credits,
2075                                            struct ocfs2_path *path)
2076 {
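        /*
         * Two buffers (one per path) for every tree level below the
         * subtree root, plus the subtree root itself, plus whatever
         * the original operation needed.
         */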
2077         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2078
2079         if (handle->h_buffer_credits < credits)
2080                 return ocfs2_extend_trans(handle, credits);
2081
2082         return 0;
2083 }
2084
2085 /*
2086  * Trap the case where we're inserting into the theoretical range past
2087  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2088  * whose cpos is less than ours into the right leaf.
2089  *
2090  * It's only necessary to look at the rightmost record of the left
2091  * leaf because the logic that calls us should ensure that the
2092  * theoretical ranges in the path components above the leaves are
2093  * correct.
2094  */
2095 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2096                                                  u32 insert_cpos)
2097 {
2098         struct ocfs2_extent_list *left_el;
2099         struct ocfs2_extent_rec *rec;
2100         int next_free;
2101
2102         left_el = path_leaf_el(left_path);
2103         next_free = le16_to_cpu(left_el->l_next_free_rec);
2104         rec = &left_el->l_recs[next_free - 1];
2105
2106         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2107                 return 1;
2108         return 0;
2109 }
2110
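/*
 * Return 1 if cpos falls within the first non-empty record of 'el',
 * 0 otherwise (including when the list holds nothing but an empty
 * extent).
 */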
2111 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2112 {
2113         int next_free = le16_to_cpu(el->l_next_free_rec);
2114         unsigned int range;
2115         struct ocfs2_extent_rec *rec;
2116
2117         if (next_free == 0)
2118                 return 0;
2119
2120         rec = &el->l_recs[0];
2121         if (ocfs2_is_empty_extent(rec)) {
2122                 /* Empty list. */
2123                 if (next_free == 1)
2124                         return 0;
2125                 rec = &el->l_recs[1];
2126         }
2127
2128         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2129         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2130                 return 1;
2131         return 0;
2132 }
2133
2134 /*
2135  * Rotate all the records in a btree right one record, starting at insert_cpos.
2136  *
2137  * The path to the rightmost leaf should be passed in.
2138  *
2139  * The array is assumed to be large enough to hold an entire path (tree depth).
2140  *
2141  * Upon successful return from this function:
2142  *
2143  * - The 'right_path' array will contain a path to the leaf block
2144  *   whose range contains e_cpos.
2145  * - That leaf block will have a single empty extent in list index 0.
2146  * - In the case that the rotation requires a post-insert update,
2147  *   *ret_left_path will contain a valid path which can be passed to
2148  *   ocfs2_insert_path().
2149  */
2150 static int ocfs2_rotate_tree_right(struct inode *inode,
2151                                    handle_t *handle,
2152                                    enum ocfs2_split_type split,
2153                                    u32 insert_cpos,
2154                                    struct ocfs2_path *right_path,
2155                                    struct ocfs2_path **ret_left_path)
2156 {
2157         int ret, start, orig_credits = handle->h_buffer_credits;
2158         u32 cpos;
2159         struct ocfs2_path *left_path = NULL;
2160
2161         *ret_left_path = NULL;
2162
2163         left_path = ocfs2_new_path_from_path(right_path);
2164         if (!left_path) {
2165                 ret = -ENOMEM;
2166                 mlog_errno(ret);
2167                 goto out;
2168         }
2169
2170         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2171         if (ret) {
2172                 mlog_errno(ret);
2173                 goto out;
2174         }
2175
2176         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2177
2178         /*
2179          * What we want to do here is:
2180          *
2181          * 1) Start with the rightmost path.
2182          *
2183          * 2) Determine a path to the leaf block directly to the left
2184          *    of that leaf.
2185          *
2186          * 3) Determine the 'subtree root' - the lowest level tree node
2187          *    which contains a path to both leaves.
2188          *
2189          * 4) Rotate the subtree.
2190          *
2191          * 5) Find the next subtree by considering the left path to be
2192          *    the new right path.
2193          *
2194          * The check at the top of this while loop also accepts
2195          * insert_cpos == cpos because cpos is only a _theoretical_
2196          * value to get us the left path - insert_cpos might very well
2197          * be filling that hole.
2198          *
2199          * Stop at a cpos of '0' because we either started at the
2200          * leftmost branch (i.e., a tree with one branch and a
2201          * rotation inside of it), or we've gone as far as we can in
2202          * rotating subtrees.
2203          */
2204         while (cpos && insert_cpos <= cpos) {
2205                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2206                      insert_cpos, cpos);
2207
2208                 ret = ocfs2_find_path(inode, left_path, cpos);
2209                 if (ret) {
2210                         mlog_errno(ret);
2211                         goto out;
2212                 }
2213
2214                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2215                                 path_leaf_bh(right_path),
2216                                 "Inode %lu: error during insert of %u "
2217                                 "(left path cpos %u) results in two identical "
2218                                 "paths ending at %llu\n",
2219                                 inode->i_ino, insert_cpos, cpos,
2220                                 (unsigned long long)
2221                                 path_leaf_bh(left_path)->b_blocknr);
2222
2223                 if (split == SPLIT_NONE &&
2224                     ocfs2_rotate_requires_path_adjustment(left_path,
2225                                                           insert_cpos)) {
2226
2227                         /*
2228                          * We've rotated the tree as much as we
2229                          * should. The rest is up to
2230                          * ocfs2_insert_path() to complete, after the
2231                          * record insertion. We indicate this
2232                          * situation by returning the left path.
2233                          *
2234                          * The reason we don't adjust the records here
2235                          * before the record insert is that an error
2236                          * later might break the rule where a parent
2237                          * record e_cpos will reflect the actual
2238                          * e_cpos of the 1st nonempty record of the
2239                          * child list.
2240                          */
2241                         *ret_left_path = left_path;
2242                         goto out_ret_path;
2243                 }
2244
2245                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2246
2247                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2248                      start,
2249                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2250                      right_path->p_tree_depth);
2251
2252                 ret = ocfs2_extend_rotate_transaction(handle, start,
2253                                                       orig_credits, right_path);
2254                 if (ret) {
2255                         mlog_errno(ret);
2256                         goto out;
2257                 }
2258
2259                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2260                                                  right_path, start);
2261                 if (ret) {
2262                         mlog_errno(ret);
2263                         goto out;
2264                 }
2265
2266                 if (split != SPLIT_NONE &&
2267                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2268                                                 insert_cpos)) {
2269                         /*
2270                          * A rotate moves the rightmost left leaf
2271                          * record over to the leftmost right leaf
2272                          * slot. If we're doing an extent split
2273                          * instead of a real insert, then we have to
2274                          * check that the extent to be split wasn't
2275                          * just moved over. If it was, then we can
2276                          * exit here, passing left_path back -
2277                          * ocfs2_split_extent() is smart enough to
2278                          * search both leaves.
2279                          */
2280                         *ret_left_path = left_path;
2281                         goto out_ret_path;
2282                 }
2283
2284                 /*
2285                  * There is no need to re-read the next right path
2286                  * as we know that it'll be our current left
2287                  * path. Optimize by copying values instead.
2288                  */
2289                 ocfs2_mv_path(right_path, left_path);
2290
2291                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2292                                                     &cpos);
2293                 if (ret) {
2294                         mlog_errno(ret);
2295                         goto out;
2296                 }
2297         }
2298
2299 out:
2300         ocfs2_free_path(left_path);
2301
2302 out_ret_path:
2303         return ret;
2304 }
2305
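/*
 * 'path' must be the new rightmost path after a branch has been
 * removed. Recompute e_int_clusters for the last record at each
 * interior level so that every level ends exactly where the rightmost
 * leaf's range now ends.
 */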
2306 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2307                                       struct ocfs2_path *path)
2308 {
2309         int i, idx;
2310         struct ocfs2_extent_rec *rec;
2311         struct ocfs2_extent_list *el;
2312         struct ocfs2_extent_block *eb;
2313         u32 range;
2314
2315         /* Path should always be rightmost. */
2316         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2317         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2318
2319         el = &eb->h_list;
2320         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2321         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2322         rec = &el->l_recs[idx];
2323         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2324
2325         for (i = 0; i < path->p_tree_depth; i++) {
2326                 el = path->p_node[i].el;
2327                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2328                 rec = &el->l_recs[idx];
2329
2330                 rec->e_int_clusters = cpu_to_le32(range);
2331                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2332
2333                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2334         }
2335 }
2336
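/*
 * Release the extent blocks along 'path', from unlink_start down to
 * the leaf. Each block's list is emptied, the buffer is dirtied and
 * dropped from the uptodate cache, and the block is queued in
 * 'dealloc' so it can be freed later. A block which still holds more
 * than one record is logged and left out of 'dealloc'.
 */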
2337 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2338                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2339                               struct ocfs2_path *path, int unlink_start)
2340 {
2341         int ret, i;
2342         struct ocfs2_extent_block *eb;
2343         struct ocfs2_extent_list *el;
2344         struct buffer_head *bh;
2345
2346         for(i = unlink_start; i < path_num_items(path); i++) {
2347                 bh = path->p_node[i].bh;
2348
2349                 eb = (struct ocfs2_extent_block *)bh->b_data;
2350                 /*
2351                  * Not all nodes might have had their final count
2352                  * decremented by the caller - handle this here.
2353                  */
2354                 el = &eb->h_list;
2355                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2356                         mlog(ML_ERROR,
2357                              "Inode %llu, attempted to remove extent block "
2358                              "%llu with %u records\n",
2359                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2360                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2361                              le16_to_cpu(el->l_next_free_rec));
2362
2363                         ocfs2_journal_dirty(handle, bh);
2364                         ocfs2_remove_from_cache(inode, bh);
2365                         continue;
2366                 }
2367
2368                 el->l_next_free_rec = 0;
2369                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2370
2371                 ocfs2_journal_dirty(handle, bh);
2372
2373                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2374                 if (ret)
2375                         mlog_errno(ret);
2376
2377                 ocfs2_remove_from_cache(inode, bh);
2378         }
2379 }
2380
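/*
 * Detach the branch described by right_path from the tree: clear the
 * record in the subtree root which points at it, zero h_next_leaf_blk
 * in the left leaf (which becomes the new rightmost leaf), and hand
 * the orphaned right branch to ocfs2_unlink_path() for release.
 */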
2381 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2382                                  struct ocfs2_path *left_path,
2383                                  struct ocfs2_path *right_path,
2384                                  int subtree_index,
2385                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2386 {
2387         int i;
2388         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2389         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2390         struct ocfs2_extent_list *el;
2391         struct ocfs2_extent_block *eb;
2392
2393         el = path_leaf_el(left_path);
2394
2395         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2396
2397         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2398                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2399                         break;
2400
2401         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2402
2403         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2404         le16_add_cpu(&root_el->l_next_free_rec, -1);
2405
2406         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2407         eb->h_next_leaf_blk = 0;
2408
2409         ocfs2_journal_dirty(handle, root_bh);
2410         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2411
2412         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2413                           subtree_index + 1);
2414 }
2415
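/*
 * Rotate a record from the right leaf into the left leaf, which must
 * begin with an empty extent. If that leaves the rightmost leaf with
 * no real records, the whole right branch is unlinked and freed,
 * *deleted is set, and the tree's last_eb_blk is pointed at the left
 * leaf. A right leaf which has an empty extent but is not the
 * rightmost one makes us return -EAGAIN so the caller can fix it up
 * and retry.
 */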
2416 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2417                                      struct ocfs2_path *left_path,
2418                                      struct ocfs2_path *right_path,
2419                                      int subtree_index,
2420                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2421                                      int *deleted,
2422                                      struct ocfs2_extent_tree *et)
2423 {
2424         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2425         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2426         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2427         struct ocfs2_extent_block *eb;
2428
2429         *deleted = 0;
2430
2431         right_leaf_el = path_leaf_el(right_path);
2432         left_leaf_el = path_leaf_el(left_path);
2433         root_bh = left_path->p_node[subtree_index].bh;
2434         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2435
2436         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2437                 return 0;
2438
2439         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2440         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2441                 /*
2442                  * It's legal for us to proceed if the right leaf is
2443                  * the rightmost one and it has an empty extent. There
2444                  * are two cases to handle - whether the leaf will be
2445                  * empty after removal or not. If the leaf isn't empty
2446                  * then just remove the empty extent up front. The
2447                  * next block will handle empty leaves by flagging
2448                  * them for unlink.
2449                  *
2450                  * Non-rightmost leaves will return -EAGAIN, and the
2451                  * caller can manually move the subtree and retry.
2452                  */
2453
2454                 if (eb->h_next_leaf_blk != 0ULL)
2455                         return -EAGAIN;
2456
2457                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2458                         ret = ocfs2_journal_access(handle, inode,
2459                                                    path_leaf_bh(right_path),
2460                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2461                         if (ret) {
2462                                 mlog_errno(ret);
2463                                 goto out;
2464                         }
2465
2466                         ocfs2_remove_empty_extent(right_leaf_el);
2467                 } else
2468                         right_has_empty = 1;
2469         }
2470
2471         if (eb->h_next_leaf_blk == 0ULL &&
2472             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2473                 /*
2474                  * We have to update i_last_eb_blk during the meta
2475                  * data delete.
2476                  */
2477                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2478                                            OCFS2_JOURNAL_ACCESS_WRITE);
2479                 if (ret) {
2480                         mlog_errno(ret);
2481                         goto out;
2482                 }
2483
2484                 del_right_subtree = 1;
2485         }
2486
2487         /*
2488          * Getting here with an empty extent in the right path implies
2489          * that it's the rightmost path and will be deleted.
2490          */
2491         BUG_ON(right_has_empty && !del_right_subtree);
2492
2493         ret = ocfs2_journal_access(handle, inode, root_bh,
2494                                    OCFS2_JOURNAL_ACCESS_WRITE);
2495         if (ret) {
2496                 mlog_errno(ret);
2497                 goto out;
2498         }
2499
2500         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2501                 ret = ocfs2_journal_access(handle, inode,
2502                                            right_path->p_node[i].bh,
2503                                            OCFS2_JOURNAL_ACCESS_WRITE);
2504                 if (ret) {
2505                         mlog_errno(ret);
2506                         goto out;
2507                 }
2508
2509                 ret = ocfs2_journal_access(handle, inode,
2510                                            left_path->p_node[i].bh,
2511                                            OCFS2_JOURNAL_ACCESS_WRITE);
2512                 if (ret) {
2513                         mlog_errno(ret);
2514                         goto out;
2515                 }
2516         }
2517
2518         if (!right_has_empty) {
2519                 /*
2520                  * Only do this if we're moving a real
2521                  * record. Otherwise, the action is delayed until
2522                  * after removal of the right path in which case we
2523                  * can do a simple shift to remove the empty extent.
2524                  */
2525                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2526                 memset(&right_leaf_el->l_recs[0], 0,
2527                        sizeof(struct ocfs2_extent_rec));
2528         }
2529         if (eb->h_next_leaf_blk == 0ULL) {
2530                 /*
2531                  * Move recs over to get rid of empty extent, decrease
2532                  * next_free. This is allowed to remove the last
2533                  * extent in our leaf (setting l_next_free_rec to
2534                  * zero) - the delete code below won't care.
2535                  */
2536                 ocfs2_remove_empty_extent(right_leaf_el);
2537         }
2538
2539         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2540         if (ret)
2541                 mlog_errno(ret);
2542         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2543         if (ret)
2544                 mlog_errno(ret);
2545
2546         if (del_right_subtree) {
2547                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2548                                      subtree_index, dealloc);
2549                 ocfs2_update_edge_lengths(inode, handle, left_path);
2550
2551                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2552                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2553
2554                 /*
2555                  * Removal of the extent in the left leaf was skipped
2556                  * above so we could delete the right path
2557                  * first.
2558                  */
2559                 if (right_has_empty)
2560                         ocfs2_remove_empty_extent(left_leaf_el);
2561
2562                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2563                 if (ret)
2564                         mlog_errno(ret);
2565
2566                 *deleted = 1;
2567         } else
2568                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2569                                            subtree_index);
2570
2571 out:
2572         return ret;
2573 }
2574
2575 /*
2576  * Given a full path, determine what cpos value would return us a path
2577  * containing the leaf immediately to the right of the current one.
2578  *
2579  * Will return zero if the path passed in is already the rightmost path.
2580  *
2581  * This looks similar, but is subtly different to
2582  * ocfs2_find_cpos_for_left_leaf().
2583  */
2584 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2585                                           struct ocfs2_path *path, u32 *cpos)
2586 {
2587         int i, j, ret = 0;
2588         u64 blkno;
2589         struct ocfs2_extent_list *el;
2590
2591         *cpos = 0;
2592
2593         if (path->p_tree_depth == 0)
2594                 return 0;
2595
2596         blkno = path_leaf_bh(path)->b_blocknr;
2597
2598         /* Start at the tree node just above the leaf and work our way up. */
2599         i = path->p_tree_depth - 1;
2600         while (i >= 0) {
2601                 int next_free;
2602
2603                 el = path->p_node[i].el;
2604
2605                 /*
2606                  * Find the extent record just after the one in our
2607                  * path.
2608                  */
2609                 next_free = le16_to_cpu(el->l_next_free_rec);
2610                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2611                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2612                                 if (j == (next_free - 1)) {
2613                                         if (i == 0) {
2614                                                 /*
2615                                                  * We've determined that the
2616                                                  * path specified is already
2617                                                  * the rightmost one - return a
2618                                                  * cpos of zero.
2619                                                  */
2620                                                 goto out;
2621                                         }
2622                                         /*
2623                                          * The rightmost record points to our
2624                                          * leaf - we need to travel up the
2625                                          * tree one level.
2626                                          */
2627                                         goto next_node;
2628                                 }
2629
2630                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2631                                 goto out;
2632                         }
2633                 }
2634
2635                 /*
2636                  * If we got here, we never found a valid node where
2637                  * the tree indicated one should be.
2638                  */
2639                 ocfs2_error(sb,
2640                             "Invalid extent tree at extent block %llu\n",
2641                             (unsigned long long)blkno);
2642                 ret = -EROFS;
2643                 goto out;
2644
2645 next_node:
2646                 blkno = path->p_node[i].bh->b_blocknr;
2647                 i--;
2648         }
2649
2650 out:
2651         return ret;
2652 }
2653
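/*
 * The rightmost leaf has nothing to its right to rotate into, so an
 * empty extent at its head is simply removed in place.
 */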
2654 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2655                                             handle_t *handle,
2656                                             struct buffer_head *bh,
2657                                             struct ocfs2_extent_list *el)
2658 {
2659         int ret;
2660
2661         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2662                 return 0;
2663
2664         ret = ocfs2_journal_access(handle, inode, bh,
2665                                    OCFS2_JOURNAL_ACCESS_WRITE);
2666         if (ret) {
2667                 mlog_errno(ret);
2668                 goto out;
2669         }
2670
2671         ocfs2_remove_empty_extent(el);
2672
2673         ret = ocfs2_journal_dirty(handle, bh);
2674         if (ret)
2675                 mlog_errno(ret);
2676
2677 out:
2678         return ret;
2679 }
2680
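/*
 * 'path' must point at a leaf which begins with an empty extent. Walk
 * rightwards one leaf at a time, rotating a record left across each
 * subtree, until either the rightmost branch is deleted or there are
 * no more leaves to the right. If a right leaf with its own empty
 * extent blocks the rotation, it is handed back in *empty_extent_path
 * for the caller to fix up.
 */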
2681 static int __ocfs2_rotate_tree_left(struct inode *inode,
2682                                     handle_t *handle, int orig_credits,
2683                                     struct ocfs2_path *path,
2684                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2685                                     struct ocfs2_path **empty_extent_path,
2686                                     struct ocfs2_extent_tree *et)
2687 {
2688         int ret, subtree_root, deleted;
2689         u32 right_cpos;
2690         struct ocfs2_path *left_path = NULL;
2691         struct ocfs2_path *right_path = NULL;
2692
2693         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2694
2695         *empty_extent_path = NULL;
2696
2697         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2698                                              &right_cpos);
2699         if (ret) {
2700                 mlog_errno(ret);
2701                 goto out;
2702         }
2703
2704         left_path = ocfs2_new_path_from_path(path);
2705         if (!left_path) {
2706                 ret = -ENOMEM;
2707                 mlog_errno(ret);
2708                 goto out;
2709         }
2710
2711         ocfs2_cp_path(left_path, path);
2712
2713         right_path = ocfs2_new_path_from_path(path);
2714         if (!right_path) {
2715                 ret = -ENOMEM;
2716                 mlog_errno(ret);
2717                 goto out;
2718         }
2719
2720         while (right_cpos) {
2721                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2722                 if (ret) {
2723                         mlog_errno(ret);
2724                         goto out;
2725                 }
2726
2727                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2728                                                        right_path);
2729
2730                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2731                      subtree_root,
2732                      (unsigned long long)
2733                      right_path->p_node[subtree_root].bh->b_blocknr,
2734                      right_path->p_tree_depth);
2735
2736                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2737                                                       orig_credits, left_path);
2738                 if (ret) {
2739                         mlog_errno(ret);
2740                         goto out;
2741                 }
2742
2743                 /*
2744                  * Caller might still want to make changes to the
2745                  * tree root, so re-add it to the journal here.
2746                  */
2747                 ret = ocfs2_journal_access(handle, inode,
2748                                            path_root_bh(left_path),
2749                                            OCFS2_JOURNAL_ACCESS_WRITE);
2750                 if (ret) {
2751                         mlog_errno(ret);
2752                         goto out;
2753                 }
2754
2755                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2756                                                 right_path, subtree_root,
2757                                                 dealloc, &deleted, et);
2758                 if (ret == -EAGAIN) {
2759                         /*
2760                          * The rotation has to temporarily stop due to
2761                          * the right subtree having an empty
2762                          * extent. Pass it back to the caller for a
2763                          * fixup.
2764                          */
2765                         *empty_extent_path = right_path;
2766                         right_path = NULL;
2767                         goto out;
2768                 }
2769                 if (ret) {
2770                         mlog_errno(ret);
2771                         goto out;
2772                 }
2773
2774                 /*
2775                  * The subtree rotate might have removed records on
2776                  * the rightmost edge. If so, then rotation is
2777                  * complete.
2778                  */
2779                 if (deleted)
2780                         break;
2781
2782                 ocfs2_mv_path(left_path, right_path);
2783
2784                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2785                                                      &right_cpos);
2786                 if (ret) {
2787                         mlog_errno(ret);
2788                         goto out;
2789                 }
2790         }
2791
2792 out:
2793         ocfs2_free_path(right_path);
2794         ocfs2_free_path(left_path);
2795
2796         return ret;
2797 }
2798
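/*
 * Remove the rightmost branch of the tree described by 'path'. If a
 * branch exists to its left, the rightmost one is unlinked from it and
 * the edge lengths above are recomputed; otherwise 'path' is the only
 * branch, so the whole tree is torn down and the root is reset to an
 * empty, depth-zero list.
 */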
2799 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2800                                 struct ocfs2_path *path,
2801                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2802                                 struct ocfs2_extent_tree *et)
2803 {
2804         int ret, subtree_index;
2805         u32 cpos;
2806         struct ocfs2_path *left_path = NULL;
2807         struct ocfs2_extent_block *eb;
2808         struct ocfs2_extent_list *el;
2809
2810
2811         ret = ocfs2_et_sanity_check(inode, et);
2812         if (ret)
2813                 goto out;
2814         /*
2815          * There are two ways we handle this, depending on
2816          * whether path is the only existing one.
2817          */
2818         ret = ocfs2_extend_rotate_transaction(handle, 0,
2819                                               handle->h_buffer_credits,
2820                                               path);
2821         if (ret) {
2822                 mlog_errno(ret);
2823                 goto out;
2824         }
2825
2826         ret = ocfs2_journal_access_path(inode, handle, path);
2827         if (ret) {
2828                 mlog_errno(ret);
2829                 goto out;
2830         }
2831
2832         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2833         if (ret) {
2834                 mlog_errno(ret);
2835                 goto out;
2836         }
2837
2838         if (cpos) {
2839                 /*
2840                  * We have a path to the left of this one - it needs
2841                  * an update too.
2842                  */
2843                 left_path = ocfs2_new_path_from_path(path);
2844                 if (!left_path) {
2845                         ret = -ENOMEM;
2846                         mlog_errno(ret);
2847                         goto out;
2848                 }
2849
2850                 ret = ocfs2_find_path(inode, left_path, cpos);
2851                 if (ret) {
2852                         mlog_errno(ret);
2853                         goto out;
2854                 }
2855
2856                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2857                 if (ret) {
2858                         mlog_errno(ret);
2859                         goto out;
2860                 }
2861
2862                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2863
2864                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2865                                      subtree_index, dealloc);
2866                 ocfs2_update_edge_lengths(inode, handle, left_path);
2867
2868                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2869                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2870         } else {
2871                 /*
2872                  * 'path' is also the leftmost path which
2873                  * means it must be the only one. This gets
2874                  * handled differently because we want to
2875                  * revert the inode back to having extents
2876                  * in-line.
2877                  */
2878                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2879
2880                 el = et->et_root_el;
2881                 el->l_tree_depth = 0;
2882                 el->l_next_free_rec = 0;
2883                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2884
2885                 ocfs2_et_set_last_eb_blk(et, 0);
2886         }
2887
2888         ocfs2_journal_dirty(handle, path_root_bh(path));
2889
2890 out:
2891         ocfs2_free_path(left_path);
2892         return ret;
2893 }
2894
2895 /*
2896  * Left rotation of btree records.
2897  *
2898  * In many ways, this is (unsurprisingly) the opposite of right
2899  * rotation. We start at some non-rightmost path containing an empty
2900  * extent in the leaf block. The code works its way to the rightmost
2901  * path by rotating records to the left in every subtree.
2902  *
2903  * This is used by any code which reduces the number of extent records
2904  * in a leaf. After removal, an empty record should be placed in the
2905  * leftmost list position.
2906  *
2907  * This won't handle a length update of the rightmost path records if
2908  * the rightmost tree leaf record is removed so the caller is
2909  * responsible for detecting and correcting that.
2910  */
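/*
 * A hypothetical sketch of the simplest case, the rightmost leaf handled
 * by ocfs2_rotate_rightmost_leaf_left(): the empty record in slot 0 is
 * simply shifted out of the list (values below are illustrative only):
 *
 *	before:	l_recs = [ <empty>, (cpos 8, 4 clusters), (cpos 12, 4 clusters) ]
 *		l_next_free_rec = 3
 *	after:	l_recs = [ (cpos 8, 4 clusters), (cpos 12, 4 clusters) ]
 *		l_next_free_rec = 2
 *
 * The non-rightmost case has to do the equivalent shuffling across
 * subtree boundaries, which is what the rotation loop implements.
 */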
2911 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2912                                   struct ocfs2_path *path,
2913                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2914                                   struct ocfs2_extent_tree *et)
2915 {
2916         int ret, orig_credits = handle->h_buffer_credits;
2917         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2918         struct ocfs2_extent_block *eb;
2919         struct ocfs2_extent_list *el;
2920
2921         el = path_leaf_el(path);
2922         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2923                 return 0;
2924
2925         if (path->p_tree_depth == 0) {
2926 rightmost_no_delete:
2927                 /*
2928                  * Inline extents. This is trivially handled, so do
2929                  * it up front.
2930                  */
2931                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2932                                                        path_leaf_bh(path),
2933                                                        path_leaf_el(path));
2934                 if (ret)
2935                         mlog_errno(ret);
2936                 goto out;
2937         }
2938
2939         /*
2940          * Handle the rightmost branch now. There are several cases:
2941          *  1) simple rotation leaving records in there. That's trivial.
2942          *  2) rotation requiring a branch delete - there are no more
2943          *     records left. Two cases of this:
2944          *     a) There are branches to the left.
2945          *     b) This is also the leftmost (the only) branch.
2946          *
2947          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2948          *  2a) we need the left branch so that we can update it with the unlink
2949          *  2b) we need to bring the inode back to inline extents.
2950          */
2951
2952         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2953         el = &eb->h_list;
2954         if (eb->h_next_leaf_blk == 0) {
2955                 /*
2956                  * This gets a bit tricky if we're going to delete the
2957                  * rightmost path. Get the other cases out of the way
2958                  * 1st.
2959                  */
2960                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2961                         goto rightmost_no_delete;
2962
2963                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2964                         ret = -EIO;
2965                         ocfs2_error(inode->i_sb,
2966                                     "Inode %llu has empty extent block at %llu",
2967                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2968                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2969                         goto out;
2970                 }
2971
2972                 /*
2973                  * XXX: The caller cannot trust "path" any more after
2974                  * this as it will have been deleted. What do we do?
2975                  *
2976                  * In theory the rotate-for-merge code will never get
2977                  * here because it'll always ask for a rotate in a
2978                  * nonempty list.
2979                  */
2980
2981                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2982                                                   dealloc, et);
2983                 if (ret)
2984                         mlog_errno(ret);
2985                 goto out;
2986         }
2987
2988         /*
2989          * Now we can loop, remembering the path we get from -EAGAIN
2990          * and restarting from there.
2991          */
2992 try_rotate:
2993         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2994                                        dealloc, &restart_path, et);
2995         if (ret && ret != -EAGAIN) {
2996                 mlog_errno(ret);
2997                 goto out;
2998         }
2999
3000         while (ret == -EAGAIN) {
3001                 tmp_path = restart_path;
3002                 restart_path = NULL;
3003
3004                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
3005                                                tmp_path, dealloc,
3006                                                &restart_path, et);
3007                 if (ret && ret != -EAGAIN) {
3008                         mlog_errno(ret);
3009                         goto out;
3010                 }
3011
3012                 ocfs2_free_path(tmp_path);
3013                 tmp_path = NULL;
3014
3015                 if (ret == 0)
3016                         goto try_rotate;
3017         }
3018
3019 out:
3020         ocfs2_free_path(tmp_path);
3021         ocfs2_free_path(restart_path);
3022         return ret;
3023 }
3024
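/*
 * Hypothetical illustration of ocfs2_cleanup_merge(): if a merge drained
 * all clusters out of l_recs[2] in a leaf that looked like
 *
 *	[ (cpos 0, 4), (cpos 4, 4), <drained>, (cpos 12, 4) ]
 *
 * the two records in front of it are shifted up one slot and slot 0 is
 * zeroed, leaving the single allowed empty extent in the 1st position:
 *
 *	[ <empty>, (cpos 0, 4), (cpos 4, 4), (cpos 12, 4) ]
 */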
3025 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3026                                 int index)
3027 {
3028         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3029         unsigned int size;
3030
3031         if (rec->e_leaf_clusters == 0) {
3032                 /*
3033                  * We consumed all of the merged-from record. An empty
3034                  * extent cannot exist anywhere but the 1st array
3035                  * position, so move things over if the merged-from
3036                  * record doesn't occupy that position.
3037                  *
3038                  * This creates a new empty extent so the caller
3039                  * should be smart enough to have removed any existing
3040                  * ones.
3041                  */
3042                 if (index > 0) {
3043                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3044                         size = index * sizeof(struct ocfs2_extent_rec);
3045                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3046                 }
3047
3048                 /*
3049                  * Always memset - the caller doesn't check whether it
3050                  * created an empty extent, so there could be junk in
3051                  * the other fields.
3052                  */
3053                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3054         }
3055 }
3056
3057 static int ocfs2_get_right_path(struct inode *inode,
3058                                 struct ocfs2_path *left_path,
3059                                 struct ocfs2_path **ret_right_path)
3060 {
3061         int ret;
3062         u32 right_cpos;
3063         struct ocfs2_path *right_path = NULL;
3064         struct ocfs2_extent_list *left_el;
3065
3066         *ret_right_path = NULL;
3067
3068         /* This function shouldn't be called for non-trees. */
3069         BUG_ON(left_path->p_tree_depth == 0);
3070
3071         left_el = path_leaf_el(left_path);
3072         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3073
3074         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3075                                              &right_cpos);
3076         if (ret) {
3077                 mlog_errno(ret);
3078                 goto out;
3079         }
3080
3081         /* This function shouldn't be called for the rightmost leaf. */
3082         BUG_ON(right_cpos == 0);
3083
3084         right_path = ocfs2_new_path_from_path(left_path);
3085         if (!right_path) {
3086                 ret = -ENOMEM;
3087                 mlog_errno(ret);
3088                 goto out;
3089         }
3090
3091         ret = ocfs2_find_path(inode, right_path, right_cpos);
3092         if (ret) {
3093                 mlog_errno(ret);
3094                 goto out;
3095         }
3096
3097         *ret_right_path = right_path;
3098 out:
3099         if (ret)
3100                 ocfs2_free_path(right_path);
3101         return ret;
3102 }
3103
3104 /*
3105  * Remove split_rec clusters from the record at index and merge them
3106  * onto the beginning of the record "next" to it.
3107  * For index < l_count - 1, the "next" means the extent rec at index + 1.
3108  * For index == l_count - 1, the "next" means the 1st extent rec of the
3109  * next extent block.
3110  */
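/*
 * Worked example (hypothetical values) for the in-leaf case:
 *
 *	l_recs[index]     = { e_cpos = 10, e_leaf_clusters = 6 }
 *	l_recs[index + 1] = { e_cpos = 16, e_leaf_clusters = 4 }
 *	split_rec         = { e_cpos = 14, e_leaf_clusters = 2 }
 *
 * becomes
 *
 *	l_recs[index]     = { e_cpos = 10, e_leaf_clusters = 4 }
 *	l_recs[index + 1] = { e_cpos = 14, e_leaf_clusters = 6 }
 *
 * The two split clusters move from the tail of l_recs[index] to the head
 * of l_recs[index + 1], whose e_blkno is pulled back by the same number
 * of clusters worth of blocks.
 */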
3111 static int ocfs2_merge_rec_right(struct inode *inode,
3112                                  struct ocfs2_path *left_path,
3113                                  handle_t *handle,
3114                                  struct ocfs2_extent_rec *split_rec,
3115                                  int index)
3116 {
3117         int ret, next_free, i;
3118         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3119         struct ocfs2_extent_rec *left_rec;
3120         struct ocfs2_extent_rec *right_rec;
3121         struct ocfs2_extent_list *right_el;
3122         struct ocfs2_path *right_path = NULL;
3123         int subtree_index = 0;
3124         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3125         struct buffer_head *bh = path_leaf_bh(left_path);
3126         struct buffer_head *root_bh = NULL;
3127
3128         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3129         left_rec = &el->l_recs[index];
3130
3131         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3132             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3133                 /* This merge crosses an extent block boundary. */
3134                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3135                 if (ret) {
3136                         mlog_errno(ret);
3137                         goto out;
3138                 }
3139
3140                 right_el = path_leaf_el(right_path);
3141                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3142                 BUG_ON(next_free <= 0);
3143                 right_rec = &right_el->l_recs[0];
3144                 if (ocfs2_is_empty_extent(right_rec)) {
3145                         BUG_ON(next_free <= 1);
3146                         right_rec = &right_el->l_recs[1];
3147                 }
3148
3149                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3150                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3151                        le32_to_cpu(right_rec->e_cpos));
3152
3153                 subtree_index = ocfs2_find_subtree_root(inode,
3154                                                         left_path, right_path);
3155
3156                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3157                                                       handle->h_buffer_credits,
3158                                                       right_path);
3159                 if (ret) {
3160                         mlog_errno(ret);
3161                         goto out;
3162                 }
3163
3164                 root_bh = left_path->p_node[subtree_index].bh;
3165                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3166
3167                 ret = ocfs2_journal_access(handle, inode, root_bh,
3168                                            OCFS2_JOURNAL_ACCESS_WRITE);
3169                 if (ret) {
3170                         mlog_errno(ret);
3171                         goto out;
3172                 }
3173
3174                 for (i = subtree_index + 1;
3175                      i < path_num_items(right_path); i++) {
3176                         ret = ocfs2_journal_access(handle, inode,
3177                                                    right_path->p_node[i].bh,
3178                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3179                         if (ret) {
3180                                 mlog_errno(ret);
3181                                 goto out;
3182                         }
3183
3184                         ret = ocfs2_journal_access(handle, inode,
3185                                                    left_path->p_node[i].bh,
3186                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3187                         if (ret) {
3188                                 mlog_errno(ret);
3189                                 goto out;
3190                         }
3191                 }
3192
3193         } else {
3194                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3195                 right_rec = &el->l_recs[index + 1];
3196         }
3197
3198         ret = ocfs2_journal_access(handle, inode, bh,
3199                                    OCFS2_JOURNAL_ACCESS_WRITE);
3200         if (ret) {
3201                 mlog_errno(ret);
3202                 goto out;
3203         }
3204
3205         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3206
3207         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3208         le64_add_cpu(&right_rec->e_blkno,
3209                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3210         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3211
3212         ocfs2_cleanup_merge(el, index);
3213
3214         ret = ocfs2_journal_dirty(handle, bh);
3215         if (ret)
3216                 mlog_errno(ret);
3217
3218         if (right_path) {
3219                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3220                 if (ret)
3221                         mlog_errno(ret);
3222
3223                 ocfs2_complete_edge_insert(inode, handle, left_path,
3224                                            right_path, subtree_index);
3225         }
3226 out:
3227         if (right_path)
3228                 ocfs2_free_path(right_path);
3229         return ret;
3230 }
3231
3232 static int ocfs2_get_left_path(struct inode *inode,
3233                                struct ocfs2_path *right_path,
3234                                struct ocfs2_path **ret_left_path)
3235 {
3236         int ret;
3237         u32 left_cpos;
3238         struct ocfs2_path *left_path = NULL;
3239
3240         *ret_left_path = NULL;
3241
3242         /* This function shouldn't be called for non-trees. */
3243         BUG_ON(right_path->p_tree_depth == 0);
3244
3245         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3246                                             right_path, &left_cpos);
3247         if (ret) {
3248                 mlog_errno(ret);
3249                 goto out;
3250         }
3251
3252         /* This function shouldn't be called for the leftmost leaf. */
3253         BUG_ON(left_cpos == 0);
3254
3255         left_path = ocfs2_new_path_from_path(right_path);
3256         if (!left_path) {
3257                 ret = -ENOMEM;
3258                 mlog_errno(ret);
3259                 goto out;
3260         }
3261
3262         ret = ocfs2_find_path(inode, left_path, left_cpos);
3263         if (ret) {
3264                 mlog_errno(ret);
3265                 goto out;
3266         }
3267
3268         *ret_left_path = left_path;
3269 out:
3270         if (ret)
3271                 ocfs2_free_path(left_path);
3272         return ret;
3273 }
3274
3275 /*
3276  * Remove split_rec clusters from the record at index and merge them
3277  * onto the tail of the record "before" it.
3278  * For index > 0, the "before" means the extent rec at index - 1.
3279  *
3280  * For index == 0, the "before" means the last record of the previous
3281  * extent block. And there is also a situation that we may need to
3282  * remove the rightmost leaf extent block in the right_path and change
3283  * the right path to indicate the new rightmost path.
3284  */
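/*
 * Worked example (hypothetical values) for the in-leaf case:
 *
 *	l_recs[index - 1] = { e_cpos = 10, e_leaf_clusters = 4 }
 *	l_recs[index]     = { e_cpos = 14, e_leaf_clusters = 6 }
 *	split_rec         = { e_cpos = 14, e_leaf_clusters = 2 }
 *
 * becomes
 *
 *	l_recs[index - 1] = { e_cpos = 10, e_leaf_clusters = 6 }
 *	l_recs[index]     = { e_cpos = 16, e_leaf_clusters = 4 }
 *
 * The two split clusters move from the head of l_recs[index] to the tail
 * of l_recs[index - 1], and l_recs[index]'s e_blkno advances by the same
 * number of clusters worth of blocks.
 */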
3285 static int ocfs2_merge_rec_left(struct inode *inode,
3286                                 struct ocfs2_path *right_path,
3287                                 handle_t *handle,
3288                                 struct ocfs2_extent_rec *split_rec,
3289                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3290                                 struct ocfs2_extent_tree *et,
3291                                 int index)
3292 {
3293         int ret, i, subtree_index = 0, has_empty_extent = 0;
3294         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3295         struct ocfs2_extent_rec *left_rec;
3296         struct ocfs2_extent_rec *right_rec;
3297         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3298         struct buffer_head *bh = path_leaf_bh(right_path);
3299         struct buffer_head *root_bh = NULL;
3300         struct ocfs2_path *left_path = NULL;
3301         struct ocfs2_extent_list *left_el;
3302
3303         BUG_ON(index < 0);
3304
3305         right_rec = &el->l_recs[index];
3306         if (index == 0) {
3307                 /* This merge crosses an extent block boundary. */
3308                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3309                 if (ret) {
3310                         mlog_errno(ret);
3311                         goto out;
3312                 }
3313
3314                 left_el = path_leaf_el(left_path);
3315                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3316                        le16_to_cpu(left_el->l_count));
3317
3318                 left_rec = &left_el->l_recs[
3319                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3320                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3321                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3322                        le32_to_cpu(split_rec->e_cpos));
3323
3324                 subtree_index = ocfs2_find_subtree_root(inode,
3325                                                         left_path, right_path);
3326
3327                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3328                                                       handle->h_buffer_credits,
3329                                                       left_path);
3330                 if (ret) {
3331                         mlog_errno(ret);
3332                         goto out;
3333                 }
3334
3335                 root_bh = left_path->p_node[subtree_index].bh;
3336                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3337
3338                 ret = ocfs2_journal_access(handle, inode, root_bh,
3339                                            OCFS2_JOURNAL_ACCESS_WRITE);
3340                 if (ret) {
3341                         mlog_errno(ret);
3342                         goto out;
3343                 }
3344
3345                 for (i = subtree_index + 1;
3346                      i < path_num_items(right_path); i++) {
3347                         ret = ocfs2_journal_access(handle, inode,
3348                                                    right_path->p_node[i].bh,
3349                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3350                         if (ret) {
3351                                 mlog_errno(ret);
3352                                 goto out;
3353                         }
3354
3355                         ret = ocfs2_journal_access(handle, inode,
3356                                                    left_path->p_node[i].bh,
3357                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3358                         if (ret) {
3359                                 mlog_errno(ret);
3360                                 goto out;
3361                         }
3362                 }
3363         } else {
3364                 left_rec = &el->l_recs[index - 1];
3365                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3366                         has_empty_extent = 1;
3367         }
3368
3369         ret = ocfs2_journal_access(handle, inode, bh,
3370                                    OCFS2_JOURNAL_ACCESS_WRITE);
3371         if (ret) {
3372                 mlog_errno(ret);
3373                 goto out;
3374         }
3375
3376         if (has_empty_extent && index == 1) {
3377                 /*
3378                  * The easy case - we can just plop the record right in.
3379                  */
3380                 *left_rec = *split_rec;
3381
3382                 has_empty_extent = 0;
3383         } else
3384                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3385
3386         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3387         le64_add_cpu(&right_rec->e_blkno,
3388                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3389         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3390
3391         ocfs2_cleanup_merge(el, index);
3392
3393         ret = ocfs2_journal_dirty(handle, bh);
3394         if (ret)
3395                 mlog_errno(ret);
3396
3397         if (left_path) {
3398                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3399                 if (ret)
3400                         mlog_errno(ret);
3401
3402                 /*
3403                  * If right_rec is empty and the extent block is also
3404                  * empty, ocfs2_complete_edge_insert() can't handle it
3405                  * and we need to delete the right extent block.
3406                  */
3407                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3408                     le16_to_cpu(el->l_next_free_rec) == 1) {
3409
3410                         ret = ocfs2_remove_rightmost_path(inode, handle,
3411                                                           right_path,
3412                                                           dealloc, et);
3413                         if (ret) {
3414                                 mlog_errno(ret);
3415                                 goto out;
3416                         }
3417
3418                         /* Now the rightmost extent block has been deleted.
3419                          * So we use the new rightmost path.
3420                          */
3421                         ocfs2_mv_path(right_path, left_path);
3422                         left_path = NULL;
3423                 } else
3424                         ocfs2_complete_edge_insert(inode, handle, left_path,
3425                                                    right_path, subtree_index);
3426         }
3427 out:
3428         if (left_path)
3429                 ocfs2_free_path(left_path);
3430         return ret;
3431 }
3432
3433 static int ocfs2_try_to_merge_extent(struct inode *inode,
3434                                      handle_t *handle,
3435                                      struct ocfs2_path *path,
3436                                      int split_index,
3437                                      struct ocfs2_extent_rec *split_rec,
3438                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3439                                      struct ocfs2_merge_ctxt *ctxt,
3440                                      struct ocfs2_extent_tree *et)
3441
3442 {
3443         int ret = 0;
3444         struct ocfs2_extent_list *el = path_leaf_el(path);
3445         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3446
3447         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3448
3449         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3450                 /*
3451                  * The merge code will need to create an empty
3452                  * extent to take the place of the newly
3453                  * emptied slot. Remove any pre-existing empty
3454                  * extents - having more than one in a leaf is
3455                  * illegal.
3456                  */
3457                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3458                                              dealloc, et);
3459                 if (ret) {
3460                         mlog_errno(ret);
3461                         goto out;
3462                 }
3463                 split_index--;
3464                 rec = &el->l_recs[split_index];
3465         }
3466
3467         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3468                 /*
3469                  * Left-right contig implies this.
3470                  */
3471                 BUG_ON(!ctxt->c_split_covers_rec);
3472
3473                 /*
3474                  * Since the leftright insert always covers the entire
3475                  * extent, this call will delete the insert record
3476                  * entirely, resulting in an empty extent record added to
3477                  * the extent block.
3478                  *
3479                  * Since the adding of an empty extent shifts
3480                  * everything back to the right, there's no need to
3481                  * update split_index here.
3482                  *
3483                  * When the split_index is zero, we need to merge it to the
3484                  * previous extent block. It is more efficient and easier
3485                  * if we do merge_right first and merge_left later.
3486                  */
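                /*
                 * Roughly, the CONTIG_LEFTRIGHT sequence below is:
                 *
                 *  1) ocfs2_merge_rec_right()  - fold split_rec into the
                 *     record to the right, emptying this record.
                 *  2) ocfs2_rotate_tree_left() - rotate the resulting
                 *     empty extent out of the leaf.
                 *  3) ocfs2_merge_rec_left()   - fold the combined record
                 *     into the record on its left.
                 *  4) a final ocfs2_rotate_tree_left() to clear the empty
                 *     extent left behind by the left merge.
                 */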
3487                 ret = ocfs2_merge_rec_right(inode, path,
3488                                             handle, split_rec,
3489                                             split_index);
3490                 if (ret) {
3491                         mlog_errno(ret);
3492                         goto out;
3493                 }
3494
3495                 /*
3496                  * We can only get this from logic error above.
3497                  */
3498                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3499
3500                 /* The merge left us with an empty extent, remove it. */
3501                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3502                                              dealloc, et);
3503                 if (ret) {
3504                         mlog_errno(ret);
3505                         goto out;
3506                 }
3507
3508                 rec = &el->l_recs[split_index];
3509
3510                 /*
3511                  * Note that we don't pass split_rec here on purpose -
3512                  * we've merged it into the rec already.
3513                  */
3514                 ret = ocfs2_merge_rec_left(inode, path,
3515                                            handle, rec,
3516                                            dealloc, et,
3517                                            split_index);
3518
3519                 if (ret) {
3520                         mlog_errno(ret);
3521                         goto out;
3522                 }
3523
3524                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3525                                              dealloc, et);
3526                 /*
3527                  * Error from this last rotate is not critical, so
3528                  * print but don't bubble it up.
3529                  */
3530                 if (ret)
3531                         mlog_errno(ret);
3532                 ret = 0;
3533         } else {
3534                 /*
3535                  * Merge a record to the left or right.
3536                  *
3537                  * 'contig_type' is relative to the existing record,
3538                  * so for example, if we're "right contig", it's to
3539                  * the record on the left (hence the left merge).
3540                  */
3541                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3542                         ret = ocfs2_merge_rec_left(inode,
3543                                                    path,
3544                                                    handle, split_rec,
3545                                                    dealloc, et,
3546                                                    split_index);
3547                         if (ret) {
3548                                 mlog_errno(ret);
3549                                 goto out;
3550                         }
3551                 } else {
3552                         ret = ocfs2_merge_rec_right(inode,
3553                                                     path,
3554                                                     handle, split_rec,
3555                                                     split_index);
3556                         if (ret) {
3557                                 mlog_errno(ret);
3558                                 goto out;
3559                         }
3560                 }
3561
3562                 if (ctxt->c_split_covers_rec) {
3563                         /*
3564                          * The merge may have left an empty extent in
3565                          * our leaf. Try to rotate it away.
3566                          */
3567                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3568                                                      dealloc, et);
3569                         if (ret)
3570                                 mlog_errno(ret);
3571                         ret = 0;
3572                 }
3573         }
3574
3575 out:
3576         return ret;
3577 }
3578
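/*
 * Hypothetical example for ocfs2_subtract_from_rec() below: with
 * rec = { e_cpos = 20, e_leaf_clusters = 10 } and a 3-cluster split_rec,
 *
 *	SPLIT_LEFT:	rec becomes { e_cpos = 23, e_leaf_clusters = 7 }
 *			and e_blkno advances by 3 clusters worth of blocks.
 *	SPLIT_RIGHT:	rec becomes { e_cpos = 20, e_leaf_clusters = 7 }
 *			with e_blkno untouched.
 */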
3579 static void ocfs2_subtract_from_rec(struct super_block *sb,
3580                                     enum ocfs2_split_type split,
3581                                     struct ocfs2_extent_rec *rec,
3582                                     struct ocfs2_extent_rec *split_rec)
3583 {
3584         u64 len_blocks;
3585
3586         len_blocks = ocfs2_clusters_to_blocks(sb,
3587                                 le16_to_cpu(split_rec->e_leaf_clusters));
3588
3589         if (split == SPLIT_LEFT) {
3590                 /*
3591                  * Region is on the left edge of the existing
3592                  * record.
3593                  */
3594                 le32_add_cpu(&rec->e_cpos,
3595                              le16_to_cpu(split_rec->e_leaf_clusters));
3596                 le64_add_cpu(&rec->e_blkno, len_blocks);
3597                 le16_add_cpu(&rec->e_leaf_clusters,
3598                              -le16_to_cpu(split_rec->e_leaf_clusters));
3599         } else {
3600                 /*
3601                  * Region is on the right edge of the existing
3602                  * record.
3603                  */
3604                 le16_add_cpu(&rec->e_leaf_clusters,
3605                              -le16_to_cpu(split_rec->e_leaf_clusters));
3606         }
3607 }
3608
3609 /*
3610  * Do the final bits of extent record insertion at the target leaf
3611  * list. If this leaf is part of an allocation tree, it is assumed
3612  * that the tree above has been prepared.
3613  */
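/*
 * Two illustrative cases (hypothetical values):
 *
 *	CONTIG_RIGHT against rec = { e_cpos = 0, e_leaf_clusters = 8 } with
 *	a 4-cluster insert at cpos 8 just grows the record to
 *	{ e_cpos = 0, e_leaf_clusters = 12 }.
 *
 *	APPEND_TAIL with l_next_free_rec = 2 writes the new record into
 *	l_recs[2] and bumps l_next_free_rec to 3.
 */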
3614 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3615                                  struct ocfs2_extent_list *el,
3616                                  struct ocfs2_insert_type *insert,
3617                                  struct inode *inode)
3618 {
3619         int i = insert->ins_contig_index;
3620         unsigned int range;
3621         struct ocfs2_extent_rec *rec;
3622
3623         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3624
3625         if (insert->ins_split != SPLIT_NONE) {
3626                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3627                 BUG_ON(i == -1);
3628                 rec = &el->l_recs[i];
3629                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3630                                         insert_rec);
3631                 goto rotate;
3632         }
3633
3634         /*
3635          * Contiguous insert - either left or right.
3636          */
3637         if (insert->ins_contig != CONTIG_NONE) {
3638                 rec = &el->l_recs[i];
3639                 if (insert->ins_contig == CONTIG_LEFT) {
3640                         rec->e_blkno = insert_rec->e_blkno;
3641                         rec->e_cpos = insert_rec->e_cpos;
3642                 }
3643                 le16_add_cpu(&rec->e_leaf_clusters,
3644                              le16_to_cpu(insert_rec->e_leaf_clusters));
3645                 return;
3646         }
3647
3648         /*
3649          * Handle insert into an empty leaf.
3650          */
3651         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3652             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3653              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3654                 el->l_recs[0] = *insert_rec;
3655                 el->l_next_free_rec = cpu_to_le16(1);
3656                 return;
3657         }
3658
3659         /*
3660          * Appending insert.
3661          */
3662         if (insert->ins_appending == APPEND_TAIL) {
3663                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3664                 rec = &el->l_recs[i];
3665                 range = le32_to_cpu(rec->e_cpos)
3666                         + le16_to_cpu(rec->e_leaf_clusters);
3667                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3668
3669                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3670                                 le16_to_cpu(el->l_count),
3671                                 "inode %lu, depth %u, count %u, next free %u, "
3672                                 "rec.cpos %u, rec.clusters %u, "
3673                                 "insert.cpos %u, insert.clusters %u\n",
3674                                 inode->i_ino,
3675                                 le16_to_cpu(el->l_tree_depth),
3676                                 le16_to_cpu(el->l_count),
3677                                 le16_to_cpu(el->l_next_free_rec),
3678                                 le32_to_cpu(el->l_recs[i].e_cpos),
3679                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3680                                 le32_to_cpu(insert_rec->e_cpos),
3681                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3682                 i++;
3683                 el->l_recs[i] = *insert_rec;
3684                 le16_add_cpu(&el->l_next_free_rec, 1);
3685                 return;
3686         }
3687
3688 rotate:
3689         /*
3690          * Ok, we have to rotate.
3691          *
3692          * At this point, it is safe to assume that inserting into an
3693          * empty leaf and appending to a leaf have both been handled
3694          * above.
3695          *
3696          * This leaf needs to have space, either by the empty 1st
3697          * extent record, or by virtue of l_next_free_rec < l_count.
3698          */
3699         ocfs2_rotate_leaf(el, insert_rec);
3700 }
3701
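/*
 * Sketch with hypothetical values: appending
 * insert_rec = { e_cpos = 100, e_leaf_clusters = 4 } beneath an interior
 * record with e_cpos = 0 leaves that record with
 * e_int_clusters = 100 + 4 - 0 = 104, i.e. each interior record on the
 * rightmost path is stretched to cover the newly appended extent.
 */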
3702 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3703                                            handle_t *handle,
3704                                            struct ocfs2_path *path,
3705                                            struct ocfs2_extent_rec *insert_rec)
3706 {
3707         int ret, i, next_free;
3708         struct buffer_head *bh;
3709         struct ocfs2_extent_list *el;
3710         struct ocfs2_extent_rec *rec;
3711
3712         /*
3713          * Update everything except the leaf block.
3714          */
3715         for (i = 0; i < path->p_tree_depth; i++) {
3716                 bh = path->p_node[i].bh;
3717                 el = path->p_node[i].el;
3718
3719                 next_free = le16_to_cpu(el->l_next_free_rec);
3720                 if (next_free == 0) {
3721                         ocfs2_error(inode->i_sb,
3722                                     "Dinode %llu has a bad extent list",
3723                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3724                         ret = -EIO;
3725                         return;
3726                 }
3727
3728                 rec = &el->l_recs[next_free - 1];
3729
3730                 rec->e_int_clusters = insert_rec->e_cpos;
3731                 le32_add_cpu(&rec->e_int_clusters,
3732                              le16_to_cpu(insert_rec->e_leaf_clusters));
3733                 le32_add_cpu(&rec->e_int_clusters,
3734                              -le32_to_cpu(rec->e_cpos));
3735
3736                 ret = ocfs2_journal_dirty(handle, bh);
3737                 if (ret)
3738                         mlog_errno(ret);
3739
3740         }
3741 }
3742
3743 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3744                                     struct ocfs2_extent_rec *insert_rec,
3745                                     struct ocfs2_path *right_path,
3746                                     struct ocfs2_path **ret_left_path)
3747 {
3748         int ret, next_free;
3749         struct ocfs2_extent_list *el;
3750         struct ocfs2_path *left_path = NULL;
3751
3752         *ret_left_path = NULL;
3753
3754         /*
3755          * This shouldn't happen for non-trees. The extent rec cluster
3756          * count manipulation below only works for interior nodes.
3757          */
3758         BUG_ON(right_path->p_tree_depth == 0);
3759
3760         /*
3761          * If our appending insert is at the leftmost edge of a leaf,
3762          * then we might need to update the rightmost records of the
3763          * neighboring path.
3764          */
3765         el = path_leaf_el(right_path);
3766         next_free = le16_to_cpu(el->l_next_free_rec);
3767         if (next_free == 0 ||
3768             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3769                 u32 left_cpos;
3770
3771                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3772                                                     &left_cpos);
3773                 if (ret) {
3774                         mlog_errno(ret);
3775                         goto out;
3776                 }
3777
3778                 mlog(0, "Append may need a left path update. cpos: %u, "
3779                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3780                      left_cpos);
3781
3782                 /*
3783                  * No need to worry if the append is already in the
3784                  * leftmost leaf.
3785                  */
3786                 if (left_cpos) {
3787                         left_path = ocfs2_new_path_from_path(right_path);
3788                         if (!left_path) {
3789                                 ret = -ENOMEM;
3790                                 mlog_errno(ret);
3791                                 goto out;
3792                         }
3793
3794                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3795                         if (ret) {
3796                                 mlog_errno(ret);
3797                                 goto out;
3798                         }
3799
3800                         /*
3801                          * ocfs2_insert_path() will pass the left_path to the
3802                          * journal for us.
3803                          */
3804                 }
3805         }
3806
3807         ret = ocfs2_journal_access_path(inode, handle, right_path);
3808         if (ret) {
3809                 mlog_errno(ret);
3810                 goto out;
3811         }
3812
3813         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3814
3815         *ret_left_path = left_path;
3816         ret = 0;
3817 out:
3818         if (ret != 0)
3819                 ocfs2_free_path(left_path);
3820
3821         return ret;
3822 }
3823
3824 static void ocfs2_split_record(struct inode *inode,
3825                                struct ocfs2_path *left_path,
3826                                struct ocfs2_path *right_path,
3827                                struct ocfs2_extent_rec *split_rec,
3828                                enum ocfs2_split_type split)
3829 {
3830         int index;
3831         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3832         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3833         struct ocfs2_extent_rec *rec, *tmprec;
3834
3835         right_el = path_leaf_el(right_path);
3836         if (left_path)
3837                 left_el = path_leaf_el(left_path);
3838
3839         el = right_el;
3840         insert_el = right_el;
3841         index = ocfs2_search_extent_list(el, cpos);
3842         if (index != -1) {
3843                 if (index == 0 && left_path) {
3844                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3845
3846                         /*
3847                          * This typically means that the record
3848                          * started in the left path but moved to the
3849                          * right as a result of rotation. We either
3850                          * move the existing record to the left, or we
3851                          * do the later insert there.
3852                          *
3853                          * In this case, the left path should always
3854                          * exist as the rotate code will have passed
3855                          * it back for a post-insert update.
3856                          */
3857
3858                         if (split == SPLIT_LEFT) {
3859                                 /*
3860                                  * It's a left split. Since we know
3861                                  * that the rotate code gave us an
3862                                  * empty extent in the left path, we
3863                                  * can just do the insert there.
3864                                  */
3865                                 insert_el = left_el;
3866                         } else {
3867                                 /*
3868                                  * Right split - we have to move the
3869                                  * existing record over to the left
3870                                  * leaf. The insert will be into the
3871                                  * newly created empty extent in the
3872                                  * right leaf.
3873                                  */
3874                                 tmprec = &right_el->l_recs[index];
3875                                 ocfs2_rotate_leaf(left_el, tmprec);
3876                                 el = left_el;
3877
3878                                 memset(tmprec, 0, sizeof(*tmprec));
3879                                 index = ocfs2_search_extent_list(left_el, cpos);
3880                                 BUG_ON(index == -1);
3881                         }
3882                 }
3883         } else {
3884                 BUG_ON(!left_path);
3885                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3886                 /*
3887                  * Left path is easy - we can just allow the insert to
3888                  * happen.
3889                  */
3890                 el = left_el;
3891                 insert_el = left_el;
3892                 index = ocfs2_search_extent_list(el, cpos);
3893                 BUG_ON(index == -1);
3894         }
3895
3896         rec = &el->l_recs[index];
3897         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3898         ocfs2_rotate_leaf(insert_el, split_rec);
3899 }
3900
3901 /*
3902  * This function only does inserts on an allocation b-tree. For tree
3903  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3904  *
3905  * right_path is the path we want to do the actual insert
3906  * in. left_path should only be passed in if we need to update that
3907  * portion of the tree after an edge insert.
3908  */
3909 static int ocfs2_insert_path(struct inode *inode,
3910                              handle_t *handle,
3911                              struct ocfs2_path *left_path,
3912                              struct ocfs2_path *right_path,
3913                              struct ocfs2_extent_rec *insert_rec,
3914                              struct ocfs2_insert_type *insert)
3915 {
3916         int ret, subtree_index;
3917         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3918
3919         if (left_path) {
3920                 int credits = handle->h_buffer_credits;
3921
3922                 /*
3923                  * There's a chance that left_path got passed back to
3924                  * us without being accounted for in the
3925                  * journal. Extend our transaction here to be sure we
3926                  * can change those blocks.
3927                  */
3928                 credits += left_path->p_tree_depth;
3929
3930                 ret = ocfs2_extend_trans(handle, credits);
3931                 if (ret < 0) {
3932                         mlog_errno(ret);
3933                         goto out;
3934                 }
3935
3936                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3937                 if (ret < 0) {
3938                         mlog_errno(ret);
3939                         goto out;
3940                 }
3941         }
3942
3943         /*
3944          * Pass both paths to the journal. The majority of inserts
3945          * will be touching all components anyway.
3946          */
3947         ret = ocfs2_journal_access_path(inode, handle, right_path);
3948         if (ret < 0) {
3949                 mlog_errno(ret);
3950                 goto out;
3951         }
3952
3953         if (insert->ins_split != SPLIT_NONE) {
3954                 /*
3955                  * We could call ocfs2_insert_at_leaf() for some types
3956                  * of splits, but it's easier to just let one separate
3957                  * function sort it all out.
3958                  */
3959                 ocfs2_split_record(inode, left_path, right_path,
3960                                    insert_rec, insert->ins_split);
3961
3962                 /*
3963                  * Split might have modified either leaf and we don't
3964                  * have a guarantee that the later edge insert will
3965                  * dirty this for us.
3966                  */
3967                 if (left_path) {
3968                         ret = ocfs2_journal_dirty(handle,
3969                                                   path_leaf_bh(left_path));
3970                         if (ret)
3971                                 mlog_errno(ret);
                     }
3972         } else
3973                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3974                                      insert, inode);
3975
3976         ret = ocfs2_journal_dirty(handle, leaf_bh);
3977         if (ret)
3978                 mlog_errno(ret);
3979
3980         if (left_path) {
3981                 /*
3982                  * The rotate code has indicated that we need to fix
3983                  * up portions of the tree after the insert.
3984                  *
3985                  * XXX: Should we extend the transaction here?
3986                  */
3987                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3988                                                         right_path);
3989                 ocfs2_complete_edge_insert(inode, handle, left_path,
3990                                            right_path, subtree_index);
3991         }
3992
3993         ret = 0;
3994 out:
3995         return ret;
3996 }
3997
3998 static int ocfs2_do_insert_extent(struct inode *inode,
3999                                   handle_t *handle,
4000                                   struct ocfs2_extent_tree *et,
4001                                   struct ocfs2_extent_rec *insert_rec,
4002                                   struct ocfs2_insert_type *type)
4003 {
4004         int ret, rotate = 0;
4005         u32 cpos;
4006         struct ocfs2_path *right_path = NULL;
4007         struct ocfs2_path *left_path = NULL;
4008         struct ocfs2_extent_list *el;
4009
4010         el = et->et_root_el;
4011
4012         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
4013                                    OCFS2_JOURNAL_ACCESS_WRITE);
4014         if (ret) {
4015                 mlog_errno(ret);
4016                 goto out;
4017         }
4018
4019         if (le16_to_cpu(el->l_tree_depth) == 0) {
4020                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
4021                 goto out_update_clusters;
4022         }
4023
4024         right_path = ocfs2_new_path_from_et(et);
4025         if (!right_path) {
4026                 ret = -ENOMEM;
4027                 mlog_errno(ret);
4028                 goto out;
4029         }
4030
4031         /*
4032          * Determine the path to start with. Rotations need the
4033          * rightmost path, everything else can go directly to the
4034          * target leaf.
4035          */
4036         cpos = le32_to_cpu(insert_rec->e_cpos);
4037         if (type->ins_appending == APPEND_NONE &&
4038             type->ins_contig == CONTIG_NONE) {
4039                 rotate = 1;
4040                 cpos = UINT_MAX;
4041         }
4042
4043         ret = ocfs2_find_path(inode, right_path, cpos);
4044         if (ret) {
4045                 mlog_errno(ret);
4046                 goto out;
4047         }
4048
4049         /*
4050          * Rotations and appends need special treatment - they modify
4051          * parts of the tree above them.
4052          *
4053          * Both might pass back a path immediately to the left of the
4054          * one being inserted to. This will cause
4055          * ocfs2_insert_path() to modify the rightmost records of
4056          * left_path to account for an edge insert.
4057          *
4058          * XXX: When modifying this code, keep in mind that an insert
4059          * can wind up skipping both of these two special cases...
4060          */
4061         if (rotate) {
4062                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4063                                               le32_to_cpu(insert_rec->e_cpos),
4064                                               right_path, &left_path);
4065                 if (ret) {
4066                         mlog_errno(ret);
4067                         goto out;
4068                 }
4069
4070                 /*
4071                  * ocfs2_rotate_tree_right() might have extended the
4072                  * transaction without re-journaling our tree root.
4073                  */
4074                 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
4075                                            OCFS2_JOURNAL_ACCESS_WRITE);
4076                 if (ret) {
4077                         mlog_errno(ret);
4078                         goto out;
4079                 }
4080         } else if (type->ins_appending == APPEND_TAIL
4081                    && type->ins_contig != CONTIG_LEFT) {
4082                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4083                                                right_path, &left_path);
4084                 if (ret) {
4085                         mlog_errno(ret);
4086                         goto out;
4087                 }
4088         }
4089
4090         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4091                                 insert_rec, type);
4092         if (ret) {
4093                 mlog_errno(ret);
4094                 goto out;
4095         }
4096
4097 out_update_clusters:
4098         if (type->ins_split == SPLIT_NONE)
4099                 ocfs2_et_update_clusters(inode, et,
4100                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4101
4102         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4103         if (ret)
4104                 mlog_errno(ret);
4105
4106 out:
4107         ocfs2_free_path(left_path);
4108         ocfs2_free_path(right_path);
4109
4110         return ret;
4111 }
4112
4113 static enum ocfs2_contig_type
4114 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4115                                struct ocfs2_extent_list *el, int index,
4116                                struct ocfs2_extent_rec *split_rec)
4117 {
4118         int status;
4119         enum ocfs2_contig_type ret = CONTIG_NONE;
4120         u32 left_cpos, right_cpos;
4121         struct ocfs2_extent_rec *rec = NULL;
4122         struct ocfs2_extent_list *new_el;
4123         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4124         struct buffer_head *bh;
4125         struct ocfs2_extent_block *eb;
4126
4127         if (index > 0) {
4128                 rec = &el->l_recs[index - 1];
4129         } else if (path->p_tree_depth > 0) {
4130                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4131                                                        path, &left_cpos);
4132                 if (status)
4133                         goto out;
4134
4135                 if (left_cpos != 0) {
4136                         left_path = ocfs2_new_path_from_path(path);
4137                         if (!left_path)
4138                                 goto out;
4139
4140                         status = ocfs2_find_path(inode, left_path, left_cpos);
4141                         if (status)
4142                                 goto out;
4143
4144                         new_el = path_leaf_el(left_path);
4145
4146                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4147                             le16_to_cpu(new_el->l_count)) {
4148                                 bh = path_leaf_bh(left_path);
4149                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4150                                 ocfs2_error(inode->i_sb,
4151                                             "Extent block #%llu has an "
4152                                             "invalid l_next_free_rec of "
4153                                             "%d.  It should have "
4154                                             "matched the l_count of %d",
4155                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4156                                             le16_to_cpu(new_el->l_next_free_rec),
4157                                             le16_to_cpu(new_el->l_count));
4158                                 status = -EINVAL;
4159                                 goto out;
4160                         }
4161                         rec = &new_el->l_recs[
4162                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4163                 }
4164         }
4165
4166         /*
4167          * We're careful to check for an empty extent record here -
4168          * the merge code will know what to do if it sees one.
4169          */
4170         if (rec) {
4171                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4172                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4173                                 ret = CONTIG_RIGHT;
4174                 } else {
4175                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4176                 }
4177         }
4178
4179         rec = NULL;
4180         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4181                 rec = &el->l_recs[index + 1];
4182         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4183                  path->p_tree_depth > 0) {
4184                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4185                                                         path, &right_cpos);
4186                 if (status)
4187                         goto out;
4188
4189                 if (right_cpos == 0)
4190                         goto out;
4191
4192                 right_path = ocfs2_new_path_from_path(path);
4193                 if (!right_path)
4194                         goto out;
4195
4196                 status = ocfs2_find_path(inode, right_path, right_cpos);
4197                 if (status)
4198                         goto out;
4199
4200                 new_el = path_leaf_el(right_path);
4201                 rec = &new_el->l_recs[0];
4202                 if (ocfs2_is_empty_extent(rec)) {
4203                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4204                                 bh = path_leaf_bh(right_path);
4205                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4206                                 ocfs2_error(inode->i_sb,
4207                                             "Extent block #%llu has an "
4208                                             "invalid l_next_free_rec of %d",
4209                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4210                                             le16_to_cpu(new_el->l_next_free_rec));
4211                                 status = -EINVAL;
4212                                 goto out;
4213                         }
4214                         rec = &new_el->l_recs[1];
4215                 }
4216         }
4217
4218         if (rec) {
4219                 enum ocfs2_contig_type contig_type;
4220
4221                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4222
4223                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4224                         ret = CONTIG_LEFTRIGHT;
4225                 else if (ret == CONTIG_NONE)
4226                         ret = contig_type;
4227         }
4228
4229 out:
4230         if (left_path)
4231                 ocfs2_free_path(left_path);
4232         if (right_path)
4233                 ocfs2_free_path(right_path);
4234
4235         return ret;
4236 }
4237
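/*
 * Scan a depth-zero extent list for a record that insert_rec is
 * contiguous with, recording the result and the matching index in
 * *insert. Contiguity is discarded if merging would push the combined
 * record past et->et_max_leaf_clusters.
 */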
4238 static void ocfs2_figure_contig_type(struct inode *inode,
4239                                      struct ocfs2_insert_type *insert,
4240                                      struct ocfs2_extent_list *el,
4241                                      struct ocfs2_extent_rec *insert_rec,
4242                                      struct ocfs2_extent_tree *et)
4243 {
4244         int i;
4245         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4246
4247         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4248
4249         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4250                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4251                                                   insert_rec);
4252                 if (contig_type != CONTIG_NONE) {
4253                         insert->ins_contig_index = i;
4254                         break;
4255                 }
4256         }
4257         insert->ins_contig = contig_type;
4258
4259         if (insert->ins_contig != CONTIG_NONE) {
4260                 struct ocfs2_extent_rec *rec =
4261                                 &el->l_recs[insert->ins_contig_index];
4262                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4263                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4264
4265                 /*
4266                  * Caller might want us to limit the size of extents; don't
4267                  * calculate contiguousness if we might exceed that limit.
4268                  */
4269                 if (et->et_max_leaf_clusters &&
4270                     (len > et->et_max_leaf_clusters))
4271                         insert->ins_contig = CONTIG_NONE;
4272         }
4273 }
4274
4275 /*
4276  * This should only be called against the rightmost leaf extent list.
4277  *
4278  * ocfs2_figure_appending_type() will figure out whether we'll have to
4279  * insert at the tail of the rightmost leaf.
4280  *
4281  * This should also work against the root extent list for trees with 0
4282  * depth. If we consider the root extent list to be the rightmost leaf node
4283  * then the logic here makes sense.
4284  */
4285 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4286                                         struct ocfs2_extent_list *el,
4287                                         struct ocfs2_extent_rec *insert_rec)
4288 {
4289         int i;
4290         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4291         struct ocfs2_extent_rec *rec;
4292
4293         insert->ins_appending = APPEND_NONE;
4294
4295         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4296
4297         if (!el->l_next_free_rec)
4298                 goto set_tail_append;
4299
4300         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4301                 /* Were all records empty? */
4302                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4303                         goto set_tail_append;
4304         }
4305
4306         i = le16_to_cpu(el->l_next_free_rec) - 1;
4307         rec = &el->l_recs[i];
4308
4309         if (cpos >=
4310             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4311                 goto set_tail_append;
4312
4313         return;
4314
4315 set_tail_append:
4316         insert->ins_appending = APPEND_TAIL;
4317 }
4318
4319 /*
4320  * Helper function called at the beginning of an insert.
4321  *
4322  * This computes a few things that are commonly used in the process of
4323  * inserting into the btree:
4324  *   - Whether the new extent is contiguous with an existing one.
4325  *   - The current tree depth.
4326  *   - Whether the insert is an appending one.
4327  *   - The total # of free records in the tree.
4328  *
4329  * All of the information is stored in the ocfs2_insert_type
4330  * structure.
4331  */
4332 static int ocfs2_figure_insert_type(struct inode *inode,
4333                                     struct ocfs2_extent_tree *et,
4334                                     struct buffer_head **last_eb_bh,
4335                                     struct ocfs2_extent_rec *insert_rec,
4336                                     int *free_records,
4337                                     struct ocfs2_insert_type *insert)
4338 {
4339         int ret;
4340         struct ocfs2_extent_block *eb;
4341         struct ocfs2_extent_list *el;
4342         struct ocfs2_path *path = NULL;
4343         struct buffer_head *bh = NULL;
4344
4345         insert->ins_split = SPLIT_NONE;
4346
4347         el = et->et_root_el;
4348         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4349
4350         if (el->l_tree_depth) {
4351                 /*
4352                  * If we have tree depth, we read in the
4353                  * rightmost extent block ahead of time as
4354                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4355                  * may want it later.
4356                  */
4357                 ret = ocfs2_read_extent_block(inode,
4358                                               ocfs2_et_get_last_eb_blk(et),
4359                                               &bh);
4360                 if (ret) {
4361                         mlog_exit(ret);
4362                         goto out;
4363                 }
4364                 eb = (struct ocfs2_extent_block *) bh->b_data;
4365                 el = &eb->h_list;
4366         }
4367
4368         /*
4369          * Unless we have a contiguous insert, we'll need to know if
4370          * there is room left in our allocation tree for another
4371          * extent record.
4372          *
4373          * XXX: This test is simplistic, we can search for empty
4374          * extent records too.
4375          */
4376         *free_records = le16_to_cpu(el->l_count) -
4377                 le16_to_cpu(el->l_next_free_rec);
4378
4379         if (!insert->ins_tree_depth) {
4380                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4381                 ocfs2_figure_appending_type(insert, el, insert_rec);
4382                 return 0;
4383         }
4384
4385         path = ocfs2_new_path_from_et(et);
4386         if (!path) {
4387                 ret = -ENOMEM;
4388                 mlog_errno(ret);
4389                 goto out;
4390         }
4391
4392         /*
4393          * In the case that we're inserting past what the tree
4394          * currently accounts for, ocfs2_find_path() will return for
4395          * us the rightmost tree path. This is accounted for below in
4396          * the appending code.
4397          */
4398         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4399         if (ret) {
4400                 mlog_errno(ret);
4401                 goto out;
4402         }
4403
4404         el = path_leaf_el(path);
4405
4406         /*
4407          * Now that we have the path, there are two things we want to determine:
4408          * 1) Contiguousness (also set contig_index if this is so)
4409          *
4410          * 2) Are we doing an append? We can trivially break this up
4411          *     into two types of appends: simple record append, or a
4412          *     rotate inside the tail leaf.
4413          */
4414         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4415
4416         /*
4417          * The insert code isn't quite ready to deal with all cases of
4418          * left contiguousness. Specifically, if it's an insert into
4419          * the 1st record in a leaf, it will require the adjustment of
4420          * cluster count on the last record of the path directly to its
4421          * left. For now, just catch that case and fool the layers
4422          * above us. This works just fine for tree_depth == 0, which
4423          * is why we allow that above.
4424          */
4425         if (insert->ins_contig == CONTIG_LEFT &&
4426             insert->ins_contig_index == 0)
4427                 insert->ins_contig = CONTIG_NONE;
4428
4429         /*
4430          * Ok, so we can simply compare against last_eb to figure out
4431          * whether a path to this cpos doesn't yet exist. This will only happen in
4432          * the case that we're doing a tail append, so maybe we can
4433          * take advantage of that information somehow.
4434          */
4435         if (ocfs2_et_get_last_eb_blk(et) ==
4436             path_leaf_bh(path)->b_blocknr) {
4437                 /*
4438                  * Ok, ocfs2_find_path() returned us the rightmost
4439                  * tree path. This might be an appending insert. There are
4440                  * two cases:
4441                  *    1) We're doing a true append at the tail:
4442                  *      - this might even be off the end of the leaf
4443                  *    2) We're "appending" by rotating in the tail
4444                  */
4445                 ocfs2_figure_appending_type(insert, el, insert_rec);
4446         }
4447
4448 out:
4449         ocfs2_free_path(path);
4450
4451         if (ret == 0)
4452                 *last_eb_bh = bh;
4453         else
4454                 brelse(bh);
4455         return ret;
4456 }
4457
4458 /*
4459  * Insert an extent into an inode btree.
4460  *
4461  * The caller needs to update fe->i_clusters
4462  */
4463 int ocfs2_insert_extent(struct ocfs2_super *osb,
4464                         handle_t *handle,
4465                         struct inode *inode,
4466                         struct ocfs2_extent_tree *et,
4467                         u32 cpos,
4468                         u64 start_blk,
4469                         u32 new_clusters,
4470                         u8 flags,
4471                         struct ocfs2_alloc_context *meta_ac)
4472 {
4473         int status;
4474         int uninitialized_var(free_records);
4475         struct buffer_head *last_eb_bh = NULL;
4476         struct ocfs2_insert_type insert = {0, };
4477         struct ocfs2_extent_rec rec;
4478
4479         mlog(0, "add %u clusters at position %u to inode %llu\n",
4480              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4481
4482         memset(&rec, 0, sizeof(rec));
4483         rec.e_cpos = cpu_to_le32(cpos);
4484         rec.e_blkno = cpu_to_le64(start_blk);
4485         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4486         rec.e_flags = flags;
4487         status = ocfs2_et_insert_check(inode, et, &rec);
4488         if (status) {
4489                 mlog_errno(status);
4490                 goto bail;
4491         }
4492
4493         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4494                                           &free_records, &insert);
4495         if (status < 0) {
4496                 mlog_errno(status);
4497                 goto bail;
4498         }
4499
4500         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4501              "Insert.contig_index: %d, Insert.free_records: %d, "
4502              "Insert.tree_depth: %d\n",
4503              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4504              free_records, insert.ins_tree_depth);
4505
4506         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4507                 status = ocfs2_grow_tree(inode, handle, et,
4508                                          &insert.ins_tree_depth, &last_eb_bh,
4509                                          meta_ac);
4510                 if (status) {
4511                         mlog_errno(status);
4512                         goto bail;
4513                 }
4514         }
4515
4516         /* Finally, we can add clusters. This might rotate the tree for us. */
4517         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4518         if (status < 0)
4519                 mlog_errno(status);
4520         else if (et->et_ops == &ocfs2_dinode_et_ops)
4521                 ocfs2_extent_map_insert_rec(inode, &rec);
4522
4523 bail:
4524         brelse(last_eb_bh);
4525
4526         mlog_exit(status);
4527         return status;
4528 }
4529
4530 /*
4531  * Allocate and add clusters into the extent b-tree.
4532  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4533  * The extent b-tree's root is specified by et, and
4534  * it is not limited to file data storage. Any extent tree can use this
4535  * function if it implements the proper ocfs2_extent_tree.
4536  */
4537 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4538                                 struct inode *inode,
4539                                 u32 *logical_offset,
4540                                 u32 clusters_to_add,
4541                                 int mark_unwritten,
4542                                 struct ocfs2_extent_tree *et,
4543                                 handle_t *handle,
4544                                 struct ocfs2_alloc_context *data_ac,
4545                                 struct ocfs2_alloc_context *meta_ac,
4546                                 enum ocfs2_alloc_restarted *reason_ret)
4547 {
4548         int status = 0;
4549         int free_extents;
4550         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4551         u32 bit_off, num_bits;
4552         u64 block;
4553         u8 flags = 0;
4554
4555         BUG_ON(!clusters_to_add);
4556
4557         if (mark_unwritten)
4558                 flags = OCFS2_EXT_UNWRITTEN;
4559
4560         free_extents = ocfs2_num_free_extents(osb, inode, et);
4561         if (free_extents < 0) {
4562                 status = free_extents;
4563                 mlog_errno(status);
4564                 goto leave;
4565         }
4566
4567         /* There are two cases which could cause us to return -EAGAIN in the
4568          * we-need-more-metadata case:
4569          * 1) we haven't reserved *any*
4570          * 2) we are so fragmented, we've needed to add metadata too
4571          *    many times. */
4572         if (!free_extents && !meta_ac) {
4573                 mlog(0, "we haven't reserved any metadata!\n");
4574                 status = -EAGAIN;
4575                 reason = RESTART_META;
4576                 goto leave;
4577         } else if ((!free_extents)
4578                    && (ocfs2_alloc_context_bits_left(meta_ac)
4579                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4580                 mlog(0, "filesystem is really fragmented...\n");
4581                 status = -EAGAIN;
4582                 reason = RESTART_META;
4583                 goto leave;
4584         }
4585
4586         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4587                                         clusters_to_add, &bit_off, &num_bits);
4588         if (status < 0) {
4589                 if (status != -ENOSPC)
4590                         mlog_errno(status);
4591                 goto leave;
4592         }
4593
4594         BUG_ON(num_bits > clusters_to_add);
4595
4596         /* reserve our write early -- insert_extent may update the inode */
4597         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4598                                       OCFS2_JOURNAL_ACCESS_WRITE);
4599         if (status < 0) {
4600                 mlog_errno(status);
4601                 goto leave;
4602         }
4603
4604         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4605         mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4606              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4607         status = ocfs2_insert_extent(osb, handle, inode, et,
4608                                      *logical_offset, block,
4609                                      num_bits, flags, meta_ac);
4610         if (status < 0) {
4611                 mlog_errno(status);
4612                 goto leave;
4613         }
4614
4615         status = ocfs2_journal_dirty(handle, et->et_root_bh);
4616         if (status < 0) {
4617                 mlog_errno(status);
4618                 goto leave;
4619         }
4620
4621         clusters_to_add -= num_bits;
4622         *logical_offset += num_bits;
4623
4624         if (clusters_to_add) {
4625                 mlog(0, "need to alloc once more, wanted = %u\n",
4626                      clusters_to_add);
4627                 status = -EAGAIN;
4628                 reason = RESTART_TRANS;
4629         }
4630
4631 leave:
4632         mlog_exit(status);
4633         if (reason_ret)
4634                 *reason_ret = reason;
4635         return status;
4636 }
4637
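/*
 * Build the right-hand portion of a record split: split_rec describes
 * the tail of *rec starting at cpos, with e_blkno advanced by the
 * corresponding number of blocks and the flags carried over. For
 * example (hypothetical numbers), if rec covers clusters 100-129 and
 * cpos is 110, split_rec will cover clusters 110-129 (20 clusters).
 */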
4638 static void ocfs2_make_right_split_rec(struct super_block *sb,
4639                                        struct ocfs2_extent_rec *split_rec,
4640                                        u32 cpos,
4641                                        struct ocfs2_extent_rec *rec)
4642 {
4643         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4644         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4645
4646         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4647
4648         split_rec->e_cpos = cpu_to_le32(cpos);
4649         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4650
4651         split_rec->e_blkno = rec->e_blkno;
4652         le64_add_cpu(&split_rec->e_blkno,
4653                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4654
4655         split_rec->e_flags = rec->e_flags;
4656 }
4657
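/*
 * Carve the range described by orig_split_rec out of the record at
 * split_index so it becomes its own record. A split that leaves
 * allocation on both sides is done in two passes - a right split first
 * (via ocfs2_make_right_split_rec()), then a left split against the
 * re-found record - which is what the leftright label below implements.
 * The tree is grown first if the rightmost leaf has no free slots.
 */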
4658 static int ocfs2_split_and_insert(struct inode *inode,
4659                                   handle_t *handle,
4660                                   struct ocfs2_path *path,
4661                                   struct ocfs2_extent_tree *et,
4662                                   struct buffer_head **last_eb_bh,
4663                                   int split_index,
4664                                   struct ocfs2_extent_rec *orig_split_rec,
4665                                   struct ocfs2_alloc_context *meta_ac)
4666 {
4667         int ret = 0, depth;
4668         unsigned int insert_range, rec_range, do_leftright = 0;
4669         struct ocfs2_extent_rec tmprec;
4670         struct ocfs2_extent_list *rightmost_el;
4671         struct ocfs2_extent_rec rec;
4672         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4673         struct ocfs2_insert_type insert;
4674         struct ocfs2_extent_block *eb;
4675
4676 leftright:
4677         /*
4678          * Store a copy of the record on the stack - it might move
4679          * around as the tree is manipulated below.
4680          */
4681         rec = path_leaf_el(path)->l_recs[split_index];
4682
4683         rightmost_el = et->et_root_el;
4684
4685         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4686         if (depth) {
4687                 BUG_ON(!(*last_eb_bh));
4688                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4689                 rightmost_el = &eb->h_list;
4690         }
4691
4692         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4693             le16_to_cpu(rightmost_el->l_count)) {
4694                 ret = ocfs2_grow_tree(inode, handle, et,
4695                                       &depth, last_eb_bh, meta_ac);
4696                 if (ret) {
4697                         mlog_errno(ret);
4698                         goto out;
4699                 }
4700         }
4701
4702         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4703         insert.ins_appending = APPEND_NONE;
4704         insert.ins_contig = CONTIG_NONE;
4705         insert.ins_tree_depth = depth;
4706
4707         insert_range = le32_to_cpu(split_rec.e_cpos) +
4708                 le16_to_cpu(split_rec.e_leaf_clusters);
4709         rec_range = le32_to_cpu(rec.e_cpos) +
4710                 le16_to_cpu(rec.e_leaf_clusters);
4711
4712         if (split_rec.e_cpos == rec.e_cpos) {
4713                 insert.ins_split = SPLIT_LEFT;
4714         } else if (insert_range == rec_range) {
4715                 insert.ins_split = SPLIT_RIGHT;
4716         } else {
4717                 /*
4718                  * Left/right split. We fake this as a right split
4719                  * first and then make a second pass as a left split.
4720                  */
4721                 insert.ins_split = SPLIT_RIGHT;
4722
4723                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4724                                            &rec);
4725
4726                 split_rec = tmprec;
4727
4728                 BUG_ON(do_leftright);
4729                 do_leftright = 1;
4730         }
4731
4732         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4733         if (ret) {
4734                 mlog_errno(ret);
4735                 goto out;
4736         }
4737
4738         if (do_leftright == 1) {
4739                 u32 cpos;
4740                 struct ocfs2_extent_list *el;
4741
4742                 do_leftright++;
4743                 split_rec = *orig_split_rec;
4744
4745                 ocfs2_reinit_path(path, 1);
4746
4747                 cpos = le32_to_cpu(split_rec.e_cpos);
4748                 ret = ocfs2_find_path(inode, path, cpos);
4749                 if (ret) {
4750                         mlog_errno(ret);
4751                         goto out;
4752                 }
4753
4754                 el = path_leaf_el(path);
4755                 split_index = ocfs2_search_extent_list(el, cpos);
4756                 goto leftright;
4757         }
4758 out:
4759
4760         return ret;
4761 }
4762
4763 /*
4764  * Mark part or all of the extent record at split_index in the leaf
4765  * pointed to by path as written. This removes the unwritten
4766  * extent flag.
4767  *
4768  * Care is taken to handle contiguousness so as to not grow the tree.
4769  *
4770  * meta_ac is not strictly necessary - we only truly need it if growth
4771  * of the tree is required. All other cases will degrade into a less
4772  * optimal tree layout.
4773  *
4774  * last_eb_bh should be the rightmost leaf block for any extent
4775  * btree. Since a split may grow the tree or a merge might shrink it,
4776  * the caller cannot trust the contents of that buffer after this call.
4777  *
4778  * This code is optimized for readability - several passes might be
4779  * made over certain portions of the tree. All of those blocks will
4780  * have been brought into cache (and pinned via the journal), so the
4781  * extra overhead is not expressed in terms of disk reads.
4782  */
4783 static int __ocfs2_mark_extent_written(struct inode *inode,
4784                                        struct ocfs2_extent_tree *et,
4785                                        handle_t *handle,
4786                                        struct ocfs2_path *path,
4787                                        int split_index,
4788                                        struct ocfs2_extent_rec *split_rec,
4789                                        struct ocfs2_alloc_context *meta_ac,
4790                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4791 {
4792         int ret = 0;
4793         struct ocfs2_extent_list *el = path_leaf_el(path);
4794         struct buffer_head *last_eb_bh = NULL;
4795         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4796         struct ocfs2_merge_ctxt ctxt;
4797         struct ocfs2_extent_list *rightmost_el;
4798
4799         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4800                 ret = -EIO;
4801                 mlog_errno(ret);
4802                 goto out;
4803         }
4804
4805         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4806             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4807              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4808                 ret = -EIO;
4809                 mlog_errno(ret);
4810                 goto out;
4811         }
4812
4813         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4814                                                             split_index,
4815                                                             split_rec);
4816
4817         /*
4818          * The core merge / split code wants to know how much room is
4819          * left in this inode's allocation tree, so we pass the
4820          * rightmost extent list.
4821          */
4822         if (path->p_tree_depth) {
4823                 struct ocfs2_extent_block *eb;
4824
4825                 ret = ocfs2_read_extent_block(inode,
4826                                               ocfs2_et_get_last_eb_blk(et),
4827                                               &last_eb_bh);
4828                 if (ret) {
4829                         mlog_exit(ret);
4830                         goto out;
4831                 }
4832
4833                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4834                 rightmost_el = &eb->h_list;
4835         } else
4836                 rightmost_el = path_root_el(path);
4837
4838         if (rec->e_cpos == split_rec->e_cpos &&
4839             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4840                 ctxt.c_split_covers_rec = 1;
4841         else
4842                 ctxt.c_split_covers_rec = 0;
4843
4844         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4845
4846         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4847              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4848              ctxt.c_split_covers_rec);
4849
4850         if (ctxt.c_contig_type == CONTIG_NONE) {
4851                 if (ctxt.c_split_covers_rec)
4852                         el->l_recs[split_index] = *split_rec;
4853                 else
4854                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4855                                                      &last_eb_bh, split_index,
4856                                                      split_rec, meta_ac);
4857                 if (ret)
4858                         mlog_errno(ret);
4859         } else {
4860                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4861                                                 split_index, split_rec,
4862                                                 dealloc, &ctxt, et);
4863                 if (ret)
4864                         mlog_errno(ret);
4865         }
4866
4867 out:
4868         brelse(last_eb_bh);
4869         return ret;
4870 }
4871
4872 /*
4873  * Mark the already-existing extent at cpos as written for len clusters.
4874  *
4875  * If the existing extent is larger than the request, initiate a
4876  * split. An attempt will be made at merging with adjacent extents.
4877  *
4878  * The caller is responsible for passing down meta_ac if we'll need it.
4879  */
4880 int ocfs2_mark_extent_written(struct inode *inode,
4881                               struct ocfs2_extent_tree *et,
4882                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4883                               struct ocfs2_alloc_context *meta_ac,
4884                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4885 {
4886         int ret, index;
4887         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4888         struct ocfs2_extent_rec split_rec;
4889         struct ocfs2_path *left_path = NULL;
4890         struct ocfs2_extent_list *el;
4891
4892         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4893              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4894
4895         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4896                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4897                             "that are being written to, but the feature bit "
4898                             "is not set in the super block.",
4899                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4900                 ret = -EROFS;
4901                 goto out;
4902         }
4903
4904         /*
4905          * XXX: This should be fixed up so that we just re-insert the
4906          * next extent records.
4907          *
4908          * XXX: This is a hack on the extent tree, maybe it should be
4909          * an op?
4910          */
4911         if (et->et_ops == &ocfs2_dinode_et_ops)
4912                 ocfs2_extent_map_trunc(inode, 0);
4913
4914         left_path = ocfs2_new_path_from_et(et);
4915         if (!left_path) {
4916                 ret = -ENOMEM;
4917                 mlog_errno(ret);
4918                 goto out;
4919         }
4920
4921         ret = ocfs2_find_path(inode, left_path, cpos);
4922         if (ret) {
4923                 mlog_errno(ret);
4924                 goto out;
4925         }
4926         el = path_leaf_el(left_path);
4927
4928         index = ocfs2_search_extent_list(el, cpos);
4929         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4930                 ocfs2_error(inode->i_sb,
4931                             "Inode %llu has an extent at cpos %u which can no "
4932                             "longer be found.\n",
4933                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4934                 ret = -EROFS;
4935                 goto out;
4936         }
4937
4938         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4939         split_rec.e_cpos = cpu_to_le32(cpos);
4940         split_rec.e_leaf_clusters = cpu_to_le16(len);
4941         split_rec.e_blkno = cpu_to_le64(start_blkno);
4942         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4943         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4944
4945         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4946                                           index, &split_rec, meta_ac,
4947                                           dealloc);
4948         if (ret)
4949                 mlog_errno(ret);
4950
4951 out:
4952         ocfs2_free_path(left_path);
4953         return ret;
4954 }
4955
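/*
 * Split the record at index so that everything from new_range onward
 * becomes a separate record. The transaction is extended and, if the
 * rightmost leaf is full, the tree is grown before the SPLIT_RIGHT
 * insert is performed.
 */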
4956 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4957                             handle_t *handle, struct ocfs2_path *path,
4958                             int index, u32 new_range,
4959                             struct ocfs2_alloc_context *meta_ac)
4960 {
4961         int ret, depth, credits = handle->h_buffer_credits;
4962         struct buffer_head *last_eb_bh = NULL;
4963         struct ocfs2_extent_block *eb;
4964         struct ocfs2_extent_list *rightmost_el, *el;
4965         struct ocfs2_extent_rec split_rec;
4966         struct ocfs2_extent_rec *rec;
4967         struct ocfs2_insert_type insert;
4968
4969         /*
4970          * Setup the record to split before we grow the tree.
4971          */
4972         el = path_leaf_el(path);
4973         rec = &el->l_recs[index];
4974         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4975
4976         depth = path->p_tree_depth;
4977         if (depth > 0) {
4978                 ret = ocfs2_read_extent_block(inode,
4979                                               ocfs2_et_get_last_eb_blk(et),
4980                                               &last_eb_bh);
4981                 if (ret < 0) {
4982                         mlog_errno(ret);
4983                         goto out;
4984                 }
4985
4986                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4987                 rightmost_el = &eb->h_list;
4988         } else
4989                 rightmost_el = path_leaf_el(path);
4990
4991         credits += path->p_tree_depth +
4992                    ocfs2_extend_meta_needed(et->et_root_el);
4993         ret = ocfs2_extend_trans(handle, credits);
4994         if (ret) {
4995                 mlog_errno(ret);
4996                 goto out;
4997         }
4998
4999         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5000             le16_to_cpu(rightmost_el->l_count)) {
5001                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
5002                                       meta_ac);
5003                 if (ret) {
5004                         mlog_errno(ret);
5005                         goto out;
5006                 }
5007         }
5008
5009         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5010         insert.ins_appending = APPEND_NONE;
5011         insert.ins_contig = CONTIG_NONE;
5012         insert.ins_split = SPLIT_RIGHT;
5013         insert.ins_tree_depth = depth;
5014
5015         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
5016         if (ret)
5017                 mlog_errno(ret);
5018
5019 out:
5020         brelse(last_eb_bh);
5021         return ret;
5022 }
5023
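/*
 * Remove len clusters starting at cpos from the record at index in the
 * leaf pointed to by path. Removing the whole record empties it and
 * left-rotates to fill the hole; removing from the left or right edge
 * shrinks the record, updating the records above it when the rightmost
 * edge of the tree moves. The caller must pass a range that shares at
 * least one edge with the record (see ocfs2_remove_extent()).
 */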
5024 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5025                               struct ocfs2_path *path, int index,
5026                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5027                               u32 cpos, u32 len,
5028                               struct ocfs2_extent_tree *et)
5029 {
5030         int ret;
5031         u32 left_cpos, rec_range, trunc_range;
5032         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5033         struct super_block *sb = inode->i_sb;
5034         struct ocfs2_path *left_path = NULL;
5035         struct ocfs2_extent_list *el = path_leaf_el(path);
5036         struct ocfs2_extent_rec *rec;
5037         struct ocfs2_extent_block *eb;
5038
5039         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5040                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5041                 if (ret) {
5042                         mlog_errno(ret);
5043                         goto out;
5044                 }
5045
5046                 index--;
5047         }
5048
5049         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5050             path->p_tree_depth) {
5051                 /*
5052                  * Check whether this is the rightmost tree record. If
5053                  * we remove all of this record or part of its right
5054                  * edge then an update of the record lengths above it
5055                  * will be required.
5056                  */
5057                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5058                 if (eb->h_next_leaf_blk == 0)
5059                         is_rightmost_tree_rec = 1;
5060         }
5061
5062         rec = &el->l_recs[index];
5063         if (index == 0 && path->p_tree_depth &&
5064             le32_to_cpu(rec->e_cpos) == cpos) {
5065                 /*
5066                  * Changing the leftmost offset (via partial or whole
5067                  * record truncate) of an interior (or rightmost) path
5068                  * means we have to update the subtree that is formed
5069          * by this leaf and the one to its left.
5070                  *
5071                  * There are two cases we can skip:
5072                  *   1) Path is the leftmost one in our inode tree.
5073                  *   2) The leaf is rightmost and will be empty after
5074                  *      we remove the extent record - the rotate code
5075                  *      knows how to update the newly formed edge.
5076                  */
5077
5078                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5079                                                     &left_cpos);
5080                 if (ret) {
5081                         mlog_errno(ret);
5082                         goto out;
5083                 }
5084
5085                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5086                         left_path = ocfs2_new_path_from_path(path);
5087                         if (!left_path) {
5088                                 ret = -ENOMEM;
5089                                 mlog_errno(ret);
5090                                 goto out;
5091                         }
5092
5093                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5094                         if (ret) {
5095                                 mlog_errno(ret);
5096                                 goto out;
5097                         }
5098                 }
5099         }
5100
5101         ret = ocfs2_extend_rotate_transaction(handle, 0,
5102                                               handle->h_buffer_credits,
5103                                               path);
5104         if (ret) {
5105                 mlog_errno(ret);
5106                 goto out;
5107         }
5108
5109         ret = ocfs2_journal_access_path(inode, handle, path);
5110         if (ret) {
5111                 mlog_errno(ret);
5112                 goto out;
5113         }
5114
5115         ret = ocfs2_journal_access_path(inode, handle, left_path);
5116         if (ret) {
5117                 mlog_errno(ret);
5118                 goto out;
5119         }
5120
5121         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5122         trunc_range = cpos + len;
5123
5124         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5125                 int next_free;
5126
5127                 memset(rec, 0, sizeof(*rec));
5128                 ocfs2_cleanup_merge(el, index);
5129                 wants_rotate = 1;
5130
5131                 next_free = le16_to_cpu(el->l_next_free_rec);
5132                 if (is_rightmost_tree_rec && next_free > 1) {
5133                         /*
5134                          * We skip the edge update if this path will
5135                          * be deleted by the rotate code.
5136                          */
5137                         rec = &el->l_recs[next_free - 1];
5138                         ocfs2_adjust_rightmost_records(inode, handle, path,
5139                                                        rec);
5140                 }
5141         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5142                 /* Remove leftmost portion of the record. */
5143                 le32_add_cpu(&rec->e_cpos, len);
5144                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5145                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5146         } else if (rec_range == trunc_range) {
5147                 /* Remove rightmost portion of the record */
5148                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5149                 if (is_rightmost_tree_rec)
5150                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5151         } else {
5152                 /* Caller should have trapped this. */
5153                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5154                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5155                      le32_to_cpu(rec->e_cpos),
5156                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5157                 BUG();
5158         }
5159
5160         if (left_path) {
5161                 int subtree_index;
5162
5163                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5164                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5165                                            subtree_index);
5166         }
5167
5168         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5169
5170         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5171         if (ret) {
5172                 mlog_errno(ret);
5173                 goto out;
5174         }
5175
5176 out:
5177         ocfs2_free_path(left_path);
5178         return ret;
5179 }
5180
5181 int ocfs2_remove_extent(struct inode *inode,
5182                         struct ocfs2_extent_tree *et,
5183                         u32 cpos, u32 len, handle_t *handle,
5184                         struct ocfs2_alloc_context *meta_ac,
5185                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5186 {
5187         int ret, index;
5188         u32 rec_range, trunc_range;
5189         struct ocfs2_extent_rec *rec;
5190         struct ocfs2_extent_list *el;
5191         struct ocfs2_path *path = NULL;
5192
5193         ocfs2_extent_map_trunc(inode, 0);
5194
5195         path = ocfs2_new_path_from_et(et);
5196         if (!path) {
5197                 ret = -ENOMEM;
5198                 mlog_errno(ret);
5199                 goto out;
5200         }
5201
5202         ret = ocfs2_find_path(inode, path, cpos);
5203         if (ret) {
5204                 mlog_errno(ret);
5205                 goto out;
5206         }
5207
5208         el = path_leaf_el(path);
5209         index = ocfs2_search_extent_list(el, cpos);
5210         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5211                 ocfs2_error(inode->i_sb,
5212                             "Inode %llu has an extent at cpos %u which can no "
5213                             "longer be found.\n",
5214                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5215                 ret = -EROFS;
5216                 goto out;
5217         }
5218
5219         /*
5220          * We have 3 cases of extent removal:
5221          *   1) Range covers the entire extent rec
5222          *   2) Range begins or ends on one edge of the extent rec
5223          *   3) Range is in the middle of the extent rec (no shared edges)
5224          *
5225          * For case 1 we remove the extent rec and left rotate to
5226          * fill the hole.
5227          *
5228          * For case 2 we just shrink the existing extent rec, with a
5229          * tree update if the shrinking edge is also the edge of an
5230          * extent block.
5231          *
5232          * For case 3 we do a right split to turn the extent rec into
5233          * something case 2 can handle.
5234          */
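        /*
         * For example (hypothetical numbers), with a record covering
         * clusters 10-29: removing (cpos 10, len 20) is case 1, removing
         * (10, 5) or (25, 5) is case 2, and removing (15, 5) is case 3 -
         * the record is first split at cluster 20 and the right edge of
         * the left-hand piece is then trimmed as in case 2.
         */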
5235         rec = &el->l_recs[index];
5236         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5237         trunc_range = cpos + len;
5238
5239         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5240
5241         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5242              "(cpos %u, len %u)\n",
5243              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5244              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5245
5246         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5247                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5248                                          cpos, len, et);
5249                 if (ret) {
5250                         mlog_errno(ret);
5251                         goto out;
5252                 }
5253         } else {
5254                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5255                                        trunc_range, meta_ac);
5256                 if (ret) {
5257                         mlog_errno(ret);
5258                         goto out;
5259                 }
5260
5261                 /*
5262                  * The split could have manipulated the tree enough to
5263                  * move the record location, so we have to look for it again.
5264                  */
5265                 ocfs2_reinit_path(path, 1);
5266
5267                 ret = ocfs2_find_path(inode, path, cpos);
5268                 if (ret) {
5269                         mlog_errno(ret);
5270                         goto out;
5271                 }
5272
5273                 el = path_leaf_el(path);
5274                 index = ocfs2_search_extent_list(el, cpos);
5275                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5276                         ocfs2_error(inode->i_sb,
5277                                     "Inode %llu: split at cpos %u lost record.",
5278                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5279                                     cpos);
5280                         ret = -EROFS;
5281                         goto out;
5282                 }
5283
5284                 /*
5285                  * Double check our values here. If anything is fishy,
5286                  * it's easier to catch it at the top level.
5287                  */
5288                 rec = &el->l_recs[index];
5289                 rec_range = le32_to_cpu(rec->e_cpos) +
5290                         ocfs2_rec_clusters(el, rec);
5291                 if (rec_range != trunc_range) {
5292                         ocfs2_error(inode->i_sb,
5293                                     "Inode %llu: error after split at cpos %u "
5294                                     "trunc len %u, existing record is (%u,%u)",
5295                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5296                                     cpos, len, le32_to_cpu(rec->e_cpos),
5297                                     ocfs2_rec_clusters(el, rec));
5298                         ret = -EROFS;
5299                         goto out;
5300                 }
5301
5302                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5303                                          cpos, len, et);
5304                 if (ret) {
5305                         mlog_errno(ret);
5306                         goto out;
5307                 }
5308         }
5309
5310 out:
5311         ocfs2_free_path(path);
5312         return ret;
5313 }
5314
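/*
 * Remove len clusters at logical offset cpos from the tree rooted at et
 * and log the freed physical clusters (starting at phys_cpos) in the
 * truncate log for later deallocation. This reserves metadata, takes
 * the truncate log inode's i_mutex and runs its own transaction.
 */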
5315 int ocfs2_remove_btree_range(struct inode *inode,
5316                              struct ocfs2_extent_tree *et,
5317                              u32 cpos, u32 phys_cpos, u32 len,
5318                              struct ocfs2_cached_dealloc_ctxt *dealloc)
5319 {
5320         int ret;
5321         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5322         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5323         struct inode *tl_inode = osb->osb_tl_inode;
5324         handle_t *handle;
5325         struct ocfs2_alloc_context *meta_ac = NULL;
5326
5327         ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5328         if (ret) {
5329                 mlog_errno(ret);
5330                 return ret;
5331         }
5332
5333         mutex_lock(&tl_inode->i_mutex);
5334
5335         if (ocfs2_truncate_log_needs_flush(osb)) {
5336                 ret = __ocfs2_flush_truncate_log(osb);
5337                 if (ret < 0) {
5338                         mlog_errno(ret);
5339                         goto out;
5340                 }
5341         }
5342
5343         handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5344         if (IS_ERR(handle)) {
5345                 ret = PTR_ERR(handle);
5346                 mlog_errno(ret);
5347                 goto out;
5348         }
5349
5350         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
5351                                    OCFS2_JOURNAL_ACCESS_WRITE);
5352         if (ret) {
5353                 mlog_errno(ret);
5354                 goto out;
5355         }
5356
5357         ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5358                                   dealloc);
5359         if (ret) {
5360                 mlog_errno(ret);
5361                 goto out_commit;
5362         }
5363
5364         ocfs2_et_update_clusters(inode, et, -len);
5365
5366         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5367         if (ret) {
5368                 mlog_errno(ret);
5369                 goto out_commit;
5370         }
5371
5372         ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5373         if (ret)
5374                 mlog_errno(ret);
5375
5376 out_commit:
5377         ocfs2_commit_trans(osb, handle);
5378 out:
5379         mutex_unlock(&tl_inode->i_mutex);
5380
5381         if (meta_ac)
5382                 ocfs2_free_alloc_context(meta_ac);
5383
5384         return ret;
5385 }
5386
5387 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5388 {
5389         struct buffer_head *tl_bh = osb->osb_tl_bh;
5390         struct ocfs2_dinode *di;
5391         struct ocfs2_truncate_log *tl;
5392
5393         di = (struct ocfs2_dinode *) tl_bh->b_data;
5394         tl = &di->id2.i_dealloc;
5395
5396         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5397                         "slot %d, invalid truncate log parameters: used = "
5398                         "%u, count = %u\n", osb->slot_num,
5399                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5400         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5401 }
5402
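/*
 * Return nonzero if new_start immediately follows the last range in the
 * truncate log, in which case the caller can extend that record instead
 * of consuming a new slot. For example (hypothetical numbers), a tail
 * record of (t_start 100, t_clusters 50) coalesces with a new range
 * starting at cluster 150.
 */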
5403 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5404                                            unsigned int new_start)
5405 {
5406         unsigned int tail_index;
5407         unsigned int current_tail;
5408
5409         /* No records, nothing to coalesce */
5410         if (!le16_to_cpu(tl->tl_used))
5411                 return 0;
5412
5413         tail_index = le16_to_cpu(tl->tl_used) - 1;
5414         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5415         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5416
5417         return current_tail == new_start;
5418 }
5419
5420 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5421                               handle_t *handle,
5422                               u64 start_blk,
5423                               unsigned int num_clusters)
5424 {
5425         int status, index;
5426         unsigned int start_cluster, tl_count;
5427         struct inode *tl_inode = osb->osb_tl_inode;
5428         struct buffer_head *tl_bh = osb->osb_tl_bh;
5429         struct ocfs2_dinode *di;
5430         struct ocfs2_truncate_log *tl;
5431
5432         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5433                    (unsigned long long)start_blk, num_clusters);
5434
5435         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5436
5437         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5438
5439         di = (struct ocfs2_dinode *) tl_bh->b_data;
5440
5441         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5442          * by the underlying call to ocfs2_read_inode_block(), so any
5443          * corruption is a code bug */
5444         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5445
5446         tl = &di->id2.i_dealloc;
5447         tl_count = le16_to_cpu(tl->tl_count);
5448         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5449                         tl_count == 0,
5450                         "Truncate record count on #%llu invalid: "
5451                         "wanted %u, actual %u\n",
5452                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5453                         ocfs2_truncate_recs_per_inode(osb->sb),
5454                         le16_to_cpu(tl->tl_count));
5455
5456         /* Caller should have known to flush before calling us. */
5457         index = le16_to_cpu(tl->tl_used);
5458         if (index >= tl_count) {
5459                 status = -ENOSPC;
5460                 mlog_errno(status);
5461                 goto bail;
5462         }
5463
5464         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5465                                       OCFS2_JOURNAL_ACCESS_WRITE);
5466         if (status < 0) {
5467                 mlog_errno(status);
5468                 goto bail;
5469         }
5470
5471         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5472              "%llu (index = %d)\n", num_clusters, start_cluster,
5473              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5474
5475         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5476                 /*
5477                  * Move index back to the record we are coalescing with.
5478                  * ocfs2_truncate_log_can_coalesce() guarantees tl_used is nonzero.
5479                  */
5480                 index--;
5481
5482                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5483                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5484                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5485                      num_clusters);
5486         } else {
5487                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5488                 tl->tl_used = cpu_to_le16(index + 1);
5489         }
5490         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5491
5492         status = ocfs2_journal_dirty(handle, tl_bh);
5493         if (status < 0) {
5494                 mlog_errno(status);
5495                 goto bail;
5496         }
5497
5498 bail:
5499         mlog_exit(status);
5500         return status;
5501 }
5502
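/*
 * Walk the truncate log from its tail, shrinking tl_used and handing
 * each recorded cluster range back to the allocator represented by
 * data_alloc_inode via ocfs2_free_clusters(). Records with a zero start
 * block are skipped as invalid.
 */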
5503 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5504                                          handle_t *handle,
5505                                          struct inode *data_alloc_inode,
5506                                          struct buffer_head *data_alloc_bh)
5507 {
5508         int status = 0;
5509         int i;
5510         unsigned int num_clusters;
5511         u64 start_blk;
5512         struct ocfs2_truncate_rec rec;
5513         struct ocfs2_dinode *di;
5514         struct ocfs2_truncate_log *tl;
5515         struct inode *tl_inode = osb->osb_tl_inode;
5516         struct buffer_head *tl_bh = osb->osb_tl_bh;
5517
5518         mlog_entry_void();
5519
5520         di = (struct ocfs2_dinode *) tl_bh->b_data;
5521         tl = &di->id2.i_dealloc;
5522         i = le16_to_cpu(tl->tl_used) - 1;
5523         while (i >= 0) {
5524                 /* Caller has given us at least enough credits to
5525                  * update the truncate log dinode */
5526                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5527                                               OCFS2_JOURNAL_ACCESS_WRITE);
5528                 if (status < 0) {
5529                         mlog_errno(status);
5530                         goto bail;
5531                 }
5532
5533                 tl->tl_used = cpu_to_le16(i);
5534
5535                 status = ocfs2_journal_dirty(handle, tl_bh);
5536                 if (status < 0) {
5537                         mlog_errno(status);
5538                         goto bail;
5539                 }
5540
5541                 /* TODO: Perhaps we can calculate the bulk of the
5542                  * credits up front rather than extending like
5543                  * this. */
5544                 status = ocfs2_extend_trans(handle,
5545                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5546                 if (status < 0) {
5547                         mlog_errno(status);
5548                         goto bail;
5549                 }
5550
5551                 rec = tl->tl_recs[i];
5552                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5553                                                     le32_to_cpu(rec.t_start));
5554                 num_clusters = le32_to_cpu(rec.t_clusters);
5555
5556                 /* if start_blk is not set, we ignore the record as
5557                  * invalid. */
5558                 if (start_blk) {
5559                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5560                              i, le32_to_cpu(rec.t_start), num_clusters);
5561
5562                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5563                                                      data_alloc_bh, start_blk,
5564                                                      num_clusters);
5565                         if (status < 0) {
5566                                 mlog_errno(status);
5567                                 goto bail;
5568                         }
5569                 }
5570                 i--;
5571         }
5572
5573 bail:
5574         mlog_exit(status);
5575         return status;
5576 }
5577
5578 /* Expects you to already be holding tl_inode->i_mutex */
5579 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5580 {
5581         int status;
5582         unsigned int num_to_flush;
5583         handle_t *handle;
5584         struct inode *tl_inode = osb->osb_tl_inode;
5585         struct inode *data_alloc_inode = NULL;
5586         struct buffer_head *tl_bh = osb->osb_tl_bh;
5587         struct buffer_head *data_alloc_bh = NULL;
5588         struct ocfs2_dinode *di;
5589         struct ocfs2_truncate_log *tl;
5590
5591         mlog_entry_void();
5592
5593         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5594
5595         di = (struct ocfs2_dinode *) tl_bh->b_data;
5596
5597         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5598          * by the underlying call to ocfs2_read_inode_block(), so any
5599          * corruption is a code bug */
5600         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5601
5602         tl = &di->id2.i_dealloc;
5603         num_to_flush = le16_to_cpu(tl->tl_used);
5604         mlog(0, "Flush %u records from truncate log #%llu\n",
5605              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5606         if (!num_to_flush) {
5607                 status = 0;
5608                 goto out;
5609         }
5610
5611         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5612                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5613                                                        OCFS2_INVALID_SLOT);
5614         if (!data_alloc_inode) {
5615                 status = -EINVAL;
5616                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5617                 goto out;
5618         }
5619
5620         mutex_lock(&data_alloc_inode->i_mutex);
5621
5622         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5623         if (status < 0) {
5624                 mlog_errno(status);
5625                 goto out_mutex;
5626         }
5627
5628         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5629         if (IS_ERR(handle)) {
5630                 status = PTR_ERR(handle);
5631                 mlog_errno(status);
5632                 goto out_unlock;
5633         }
5634
5635         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5636                                                data_alloc_bh);
5637         if (status < 0)
5638                 mlog_errno(status);
5639
5640         ocfs2_commit_trans(osb, handle);
5641
5642 out_unlock:
5643         brelse(data_alloc_bh);
5644         ocfs2_inode_unlock(data_alloc_inode, 1);
5645
5646 out_mutex:
5647         mutex_unlock(&data_alloc_inode->i_mutex);
5648         iput(data_alloc_inode);
5649
5650 out:
5651         mlog_exit(status);
5652         return status;
5653 }
5654
5655 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5656 {
5657         int status;
5658         struct inode *tl_inode = osb->osb_tl_inode;
5659
5660         mutex_lock(&tl_inode->i_mutex);
5661         status = __ocfs2_flush_truncate_log(osb);
5662         mutex_unlock(&tl_inode->i_mutex);
5663
5664         return status;
5665 }
5666
5667 static void ocfs2_truncate_log_worker(struct work_struct *work)
5668 {
5669         int status;
5670         struct ocfs2_super *osb =
5671                 container_of(work, struct ocfs2_super,
5672                              osb_truncate_log_wq.work);
5673
5674         mlog_entry_void();
5675
5676         status = ocfs2_flush_truncate_log(osb);
5677         if (status < 0)
5678                 mlog_errno(status);
5679         else
5680                 ocfs2_init_inode_steal_slot(osb);
5681
5682         mlog_exit(status);
5683 }
5684
5685 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5686 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5687                                        int cancel)
5688 {
5689         if (osb->osb_tl_inode) {
5690                 /* We want to push off log flushes while truncates are
5691                  * still running. */
5692                 if (cancel)
5693                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5694
5695                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5696                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5697         }
5698 }
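
/*
 * Illustrative caller pattern (a sketch, not lifted from any specific
 * call site): a truncate path that has just appended records will
 * normally requeue the flush with cancel set, so an already-pending
 * flush is pushed back instead of racing with the truncate still in
 * progress:
 *
 *      ocfs2_schedule_truncate_log_flush(osb, 1);
 */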
5699
5700 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5701                                        int slot_num,
5702                                        struct inode **tl_inode,
5703                                        struct buffer_head **tl_bh)
5704 {
5705         int status;
5706         struct inode *inode = NULL;
5707         struct buffer_head *bh = NULL;
5708
5709         inode = ocfs2_get_system_file_inode(osb,
5710                                            TRUNCATE_LOG_SYSTEM_INODE,
5711                                            slot_num);
5712         if (!inode) {
5713                 status = -EINVAL;
5714                 mlog(ML_ERROR, "Could not get truncate log inode!\n");
5715                 goto bail;
5716         }
5717
5718         status = ocfs2_read_inode_block(inode, &bh);
5719         if (status < 0) {
5720                 iput(inode);
5721                 mlog_errno(status);
5722                 goto bail;
5723         }
5724
5725         *tl_inode = inode;
5726         *tl_bh    = bh;
5727 bail:
5728         mlog_exit(status);
5729         return status;
5730 }
5731
5732 /* called during the 1st stage of node recovery. we stamp a clean
5733  * truncate log and pass back a copy for processing later. if the
5734  * truncate log does not require processing, *tl_copy is set to
5735  * NULL. */
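/*
 * A rough sketch of the two-stage flow (illustrative; the real call
 * sites live in the journal recovery code): the copy returned here is
 * held until it is safe to replay, then handed to
 * ocfs2_complete_truncate_log_recovery() and freed by the caller:
 *
 *      struct ocfs2_dinode *tl_copy = NULL;
 *
 *      ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
 *      ...
 *      if (tl_copy) {
 *              ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *              kfree(tl_copy);
 *      }
 */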
5736 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5737                                       int slot_num,
5738                                       struct ocfs2_dinode **tl_copy)
5739 {
5740         int status;
5741         struct inode *tl_inode = NULL;
5742         struct buffer_head *tl_bh = NULL;
5743         struct ocfs2_dinode *di;
5744         struct ocfs2_truncate_log *tl;
5745
5746         *tl_copy = NULL;
5747
5748         mlog(0, "recover truncate log from slot %d\n", slot_num);
5749
5750         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5751         if (status < 0) {
5752                 mlog_errno(status);
5753                 goto bail;
5754         }
5755
5756         di = (struct ocfs2_dinode *) tl_bh->b_data;
5757
5758         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5759          * validated by the underlying call to ocfs2_read_inode_block(),
5760          * so any corruption is a code bug */
5761         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5762
5763         tl = &di->id2.i_dealloc;
5764         if (le16_to_cpu(tl->tl_used)) {
5765                 mlog(0, "We'll have %u logs to recover\n",
5766                      le16_to_cpu(tl->tl_used));
5767
5768                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5769                 if (!(*tl_copy)) {
5770                         status = -ENOMEM;
5771                         mlog_errno(status);
5772                         goto bail;
5773                 }
5774
5775                 /* Assuming the write-out below goes well, this copy
5776                  * will be passed back to recovery for processing. */
5777                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5778
5779                 /* All we need to do to clear the truncate log is set
5780                  * tl_used. */
5781                 tl->tl_used = 0;
5782
5783                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5784                 if (status < 0) {
5785                         mlog_errno(status);
5786                         goto bail;
5787                 }
5788         }
5789
5790 bail:
5791         if (tl_inode)
5792                 iput(tl_inode);
5793         brelse(tl_bh);
5794
5795         if (status < 0 && (*tl_copy)) {
5796                 kfree(*tl_copy);
5797                 *tl_copy = NULL;
5798         }
5799
5800         mlog_exit(status);
5801         return status;
5802 }
5803
5804 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5805                                          struct ocfs2_dinode *tl_copy)
5806 {
5807         int status = 0;
5808         int i;
5809         unsigned int clusters, num_recs, start_cluster;
5810         u64 start_blk;
5811         handle_t *handle;
5812         struct inode *tl_inode = osb->osb_tl_inode;
5813         struct ocfs2_truncate_log *tl;
5814
5815         mlog_entry_void();
5816
5817         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5818                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5819                 return -EINVAL;
5820         }
5821
5822         tl = &tl_copy->id2.i_dealloc;
5823         num_recs = le16_to_cpu(tl->tl_used);
5824         mlog(0, "cleanup %u records from %llu\n", num_recs,
5825              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5826
5827         mutex_lock(&tl_inode->i_mutex);
5828         for(i = 0; i < num_recs; i++) {
5829                 if (ocfs2_truncate_log_needs_flush(osb)) {
5830                         status = __ocfs2_flush_truncate_log(osb);
5831                         if (status < 0) {
5832                                 mlog_errno(status);
5833                                 goto bail_up;
5834                         }
5835                 }
5836
5837                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5838                 if (IS_ERR(handle)) {
5839                         status = PTR_ERR(handle);
5840                         mlog_errno(status);
5841                         goto bail_up;
5842                 }
5843
5844                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5845                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5846                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5847
5848                 status = ocfs2_truncate_log_append(osb, handle,
5849                                                    start_blk, clusters);
5850                 ocfs2_commit_trans(osb, handle);
5851                 if (status < 0) {
5852                         mlog_errno(status);
5853                         goto bail_up;
5854                 }
5855         }
5856
5857 bail_up:
5858         mutex_unlock(&tl_inode->i_mutex);
5859
5860         mlog_exit(status);
5861         return status;
5862 }
5863
5864 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5865 {
5866         int status;
5867         struct inode *tl_inode = osb->osb_tl_inode;
5868
5869         mlog_entry_void();
5870
5871         if (tl_inode) {
5872                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5873                 flush_workqueue(ocfs2_wq);
5874
5875                 status = ocfs2_flush_truncate_log(osb);
5876                 if (status < 0)
5877                         mlog_errno(status);
5878
5879                 brelse(osb->osb_tl_bh);
5880                 iput(osb->osb_tl_inode);
5881         }
5882
5883         mlog_exit_void();
5884 }
5885
5886 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5887 {
5888         int status;
5889         struct inode *tl_inode = NULL;
5890         struct buffer_head *tl_bh = NULL;
5891
5892         mlog_entry_void();
5893
5894         status = ocfs2_get_truncate_log_info(osb,
5895                                              osb->slot_num,
5896                                              &tl_inode,
5897                                              &tl_bh);
5898         if (status < 0)
5899                 mlog_errno(status);
5900
5901         /* ocfs2_truncate_log_shutdown keys on the existence of
5902          * osb->osb_tl_inode so we don't set any of the osb variables
5903          * until we're sure all is well. */
5904         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5905                           ocfs2_truncate_log_worker);
5906         osb->osb_tl_bh    = tl_bh;
5907         osb->osb_tl_inode = tl_inode;
5908
5909         mlog_exit(status);
5910         return status;
5911 }
5912
5913 /*
5914  * Delayed de-allocation of suballocator blocks.
5915  *
5916  * Some sets of block de-allocations might involve multiple suballocator inodes.
5917  *
5918  * The locking for this can get extremely complicated, especially when
5919  * the suballocator inodes to delete from aren't known until deep
5920  * within an unrelated codepath.
5921  *
5922  * ocfs2_extent_block structures are a good example of this - an inode
5923  * btree could have been grown by any number of nodes each allocating
5924  * out of their own suballoc inode.
5925  *
5926  * These structures allow the delay of block de-allocation until a
5927  * later time, when locking of multiple cluster inodes won't cause
5928  * deadlock.
5929  */
5930
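/*
 * A typical (illustrative) use of the machinery below, assuming the
 * helpers declared in alloc.h behave the way their use in this file
 * suggests:
 *
 *      struct ocfs2_cached_dealloc_ctxt dealloc;
 *
 *      ocfs2_init_dealloc_ctxt(&dealloc);
 *
 *      ... deep inside a tree operation, with cluster locks held ...
 *      ocfs2_cache_extent_block_free(&dealloc, eb);
 *
 *      ... later, once those locks have been dropped ...
 *      ocfs2_run_deallocs(osb, &dealloc);
 */
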
5931 /*
5932  * Describe a single bit freed from a suballocator.  For the block
5933  * suballocators, it represents one block.  For the global cluster
5934  * allocator, it represents a run of clusters and free_bit indicates
5935  * the number of clusters.
5936  */
5937 struct ocfs2_cached_block_free {
5938         struct ocfs2_cached_block_free          *free_next;
5939         u64                                     free_blk;
5940         unsigned int                            free_bit;
5941 };
5942
5943 struct ocfs2_per_slot_free_list {
5944         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5945         int                                     f_inode_type;
5946         int                                     f_slot;
5947         struct ocfs2_cached_block_free          *f_first;
5948 };
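
/*
 * Both lists hang off a struct ocfs2_cached_dealloc_ctxt (alloc.h).
 * Roughly:
 *
 *      ctxt->c_first_suballocator -> per-slot list -> per-slot list -> ...
 *                                    f_first
 *                                      |
 *                                      v
 *                                    block free -> block free -> ...
 *
 *      ctxt->c_global_allocator   -> cluster free -> cluster free -> ...
 */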
5949
5950 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5951                                     int sysfile_type,
5952                                     int slot,
5953                                     struct ocfs2_cached_block_free *head)
5954 {
5955         int ret;
5956         u64 bg_blkno;
5957         handle_t *handle;
5958         struct inode *inode;
5959         struct buffer_head *di_bh = NULL;
5960         struct ocfs2_cached_block_free *tmp;
5961
5962         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5963         if (!inode) {
5964                 ret = -EINVAL;
5965                 mlog_errno(ret);
5966                 goto out;
5967         }
5968
5969         mutex_lock(&inode->i_mutex);
5970
5971         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5972         if (ret) {
5973                 mlog_errno(ret);
5974                 goto out_mutex;
5975         }
5976
5977         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5978         if (IS_ERR(handle)) {
5979                 ret = PTR_ERR(handle);
5980                 mlog_errno(ret);
5981                 goto out_unlock;
5982         }
5983
5984         while (head) {
5985                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5986                                                       head->free_bit);
5987                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5988                      head->free_bit, (unsigned long long)head->free_blk);
5989
5990                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5991                                                head->free_bit, bg_blkno, 1);
5992                 if (ret) {
5993                         mlog_errno(ret);
5994                         goto out_journal;
5995                 }
5996
5997                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5998                 if (ret) {
5999                         mlog_errno(ret);
6000                         goto out_journal;
6001                 }
6002
6003                 tmp = head;
6004                 head = head->free_next;
6005                 kfree(tmp);
6006         }
6007
6008 out_journal:
6009         ocfs2_commit_trans(osb, handle);
6010
6011 out_unlock:
6012         ocfs2_inode_unlock(inode, 1);
6013         brelse(di_bh);
6014 out_mutex:
6015         mutex_unlock(&inode->i_mutex);
6016         iput(inode);
6017 out:
6018         while(head) {
6019                 /* Premature exit may have left some dangling items. */
6020                 tmp = head;
6021                 head = head->free_next;
6022                 kfree(tmp);
6023         }
6024
6025         return ret;
6026 }
6027
6028 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6029                                 u64 blkno, unsigned int bit)
6030 {
6031         int ret = 0;
6032         struct ocfs2_cached_block_free *item;
6033
6034         item = kmalloc(sizeof(*item), GFP_NOFS);
6035         if (item == NULL) {
6036                 ret = -ENOMEM;
6037                 mlog_errno(ret);
6038                 return ret;
6039         }
6040
6041         mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6042              bit, (unsigned long long)blkno);
6043
6044         item->free_blk = blkno;
6045         item->free_bit = bit;
6046         item->free_next = ctxt->c_global_allocator;
6047
6048         ctxt->c_global_allocator = item;
6049         return ret;
6050 }
6051
6052 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6053                                       struct ocfs2_cached_block_free *head)
6054 {
6055         struct ocfs2_cached_block_free *tmp;
6056         struct inode *tl_inode = osb->osb_tl_inode;
6057         handle_t *handle;
6058         int ret = 0;
6059
6060         mutex_lock(&tl_inode->i_mutex);
6061
6062         while (head) {
6063                 if (ocfs2_truncate_log_needs_flush(osb)) {
6064                         ret = __ocfs2_flush_truncate_log(osb);
6065                         if (ret < 0) {
6066                                 mlog_errno(ret);
6067                                 break;
6068                         }
6069                 }
6070
6071                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6072                 if (IS_ERR(handle)) {
6073                         ret = PTR_ERR(handle);
6074                         mlog_errno(ret);
6075                         break;
6076                 }
6077
6078                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6079                                                 head->free_bit);
6080
6081                 ocfs2_commit_trans(osb, handle);
6082                 tmp = head;
6083                 head = head->free_next;
6084                 kfree(tmp);
6085
6086                 if (ret < 0) {
6087                         mlog_errno(ret);
6088                         break;
6089                 }
6090         }
6091
6092         mutex_unlock(&tl_inode->i_mutex);
6093
6094         while (head) {
6095                 /* Premature exit may have left some dangling items. */
6096                 tmp = head;
6097                 head = head->free_next;
6098                 kfree(tmp);
6099         }
6100
6101         return ret;
6102 }
6103
6104 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6105                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6106 {
6107         int ret = 0, ret2;
6108         struct ocfs2_per_slot_free_list *fl;
6109
6110         if (!ctxt)
6111                 return 0;
6112
6113         while (ctxt->c_first_suballocator) {
6114                 fl = ctxt->c_first_suballocator;
6115
6116                 if (fl->f_first) {
6117                         mlog(0, "Free items: (type %u, slot %d)\n",
6118                              fl->f_inode_type, fl->f_slot);
6119                         ret2 = ocfs2_free_cached_blocks(osb,
6120                                                         fl->f_inode_type,
6121                                                         fl->f_slot,
6122                                                         fl->f_first);
6123                         if (ret2)
6124                                 mlog_errno(ret2);
6125                         if (!ret)
6126                                 ret = ret2;
6127                 }
6128
6129                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6130                 kfree(fl);
6131         }
6132
6133         if (ctxt->c_global_allocator) {
6134                 ret2 = ocfs2_free_cached_clusters(osb,
6135                                                   ctxt->c_global_allocator);
6136                 if (ret2)
6137                         mlog_errno(ret2);
6138                 if (!ret)
6139                         ret = ret2;
6140
6141                 ctxt->c_global_allocator = NULL;
6142         }
6143
6144         return ret;
6145 }
6146
6147 static struct ocfs2_per_slot_free_list *
6148 ocfs2_find_per_slot_free_list(int type,
6149                               int slot,
6150                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6151 {
6152         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6153
6154         while (fl) {
6155                 if (fl->f_inode_type == type && fl->f_slot == slot)
6156                         return fl;
6157
6158                 fl = fl->f_next_suballocator;
6159         }
6160
6161         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6162         if (fl) {
6163                 fl->f_inode_type = type;
6164                 fl->f_slot = slot;
6165                 fl->f_first = NULL;
6166                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6167
6168                 ctxt->c_first_suballocator = fl;
6169         }
6170         return fl;
6171 }
6172
6173 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6174                                      int type, int slot, u64 blkno,
6175                                      unsigned int bit)
6176 {
6177         int ret;
6178         struct ocfs2_per_slot_free_list *fl;
6179         struct ocfs2_cached_block_free *item;
6180
6181         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6182         if (fl == NULL) {
6183                 ret = -ENOMEM;
6184                 mlog_errno(ret);
6185                 goto out;
6186         }
6187
6188         item = kmalloc(sizeof(*item), GFP_NOFS);
6189         if (item == NULL) {
6190                 ret = -ENOMEM;
6191                 mlog_errno(ret);
6192                 goto out;
6193         }
6194
6195         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6196              type, slot, bit, (unsigned long long)blkno);
6197
6198         item->free_blk = blkno;
6199         item->free_bit = bit;
6200         item->free_next = fl->f_first;
6201
6202         fl->f_first = item;
6203
6204         ret = 0;
6205 out:
6206         return ret;
6207 }
6208
6209 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6210                                          struct ocfs2_extent_block *eb)
6211 {
6212         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6213                                          le16_to_cpu(eb->h_suballoc_slot),
6214                                          le64_to_cpu(eb->h_blkno),
6215                                          le16_to_cpu(eb->h_suballoc_bit));
6216 }
6217
6218 /* This function will figure out whether the currently last extent
6219  * block will be deleted, and if it will, what the new last extent
6220  * block will be so we can update its h_next_leaf_blk field, as well
6221  * as the dinode's i_last_eb_blk */
6222 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6223                                        unsigned int clusters_to_del,
6224                                        struct ocfs2_path *path,
6225                                        struct buffer_head **new_last_eb)
6226 {
6227         int next_free, ret = 0;
6228         u32 cpos;
6229         struct ocfs2_extent_rec *rec;
6230         struct ocfs2_extent_block *eb;
6231         struct ocfs2_extent_list *el;
6232         struct buffer_head *bh = NULL;
6233
6234         *new_last_eb = NULL;
6235
6236         /* we have no tree, so of course, no last_eb. */
6237         if (!path->p_tree_depth)
6238                 goto out;
6239
6240         /* trunc to zero special case - this makes tree_depth = 0
6241          * regardless of what it is.  */
6242         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6243                 goto out;
6244
6245         el = path_leaf_el(path);
6246         BUG_ON(!el->l_next_free_rec);
6247
6248         /*
6249          * Make sure that this extent list will actually be empty
6250          * after we clear away the data. We can shortcut out if
6251          * there's more than one non-empty extent in the
6252          * list. Otherwise, a check of the remaining extent is
6253          * necessary.
6254          */
6255         next_free = le16_to_cpu(el->l_next_free_rec);
6256         rec = NULL;
6257         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6258                 if (next_free > 2)
6259                         goto out;
6260
6261                 /* We may have a valid extent in index 1, check it. */
6262                 if (next_free == 2)
6263                         rec = &el->l_recs[1];
6264
6265                 /*
6266                  * Fall through - no more nonempty extents, so we want
6267                  * to delete this leaf.
6268                  */
6269         } else {
6270                 if (next_free > 1)
6271                         goto out;
6272
6273                 rec = &el->l_recs[0];
6274         }
6275
6276         if (rec) {
6277                 /*
6278                  * Check whether we'll only be trimming off the end of this
6279                  * cluster.
6280                  */
6281                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6282                         goto out;
6283         }
6284
6285         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6286         if (ret) {
6287                 mlog_errno(ret);
6288                 goto out;
6289         }
6290
6291         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6292         if (ret) {
6293                 mlog_errno(ret);
6294                 goto out;
6295         }
6296
6297         eb = (struct ocfs2_extent_block *) bh->b_data;
6298         el = &eb->h_list;
6299
6300         /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6301          * Any corruption is a code bug. */
6302         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6303
6304         *new_last_eb = bh;
6305         get_bh(*new_last_eb);
6306         mlog(0, "returning block %llu, (cpos: %u)\n",
6307              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6308 out:
6309         brelse(bh);
6310
6311         return ret;
6312 }
6313
6314 /*
6315  * Trim some clusters off the rightmost edge of a tree. Only called
6316  * during truncate.
6317  *
6318  * The caller needs to:
6319  *   - start journaling of each path component.
6320  *   - compute and fully set up any new last ext block
6321  */
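/*
 * A sketch of the expected sequence (ocfs2_do_truncate() below is the
 * real caller):
 *
 *      ocfs2_find_new_last_ext_blk(inode, clusters_to_del, path,
 *                                  &last_eb_bh);
 *      ocfs2_journal_access_path(inode, handle, path);
 *      ocfs2_trim_tree(inode, path, handle, tc, clusters_to_del,
 *                      &delete_blk);
 *      if (delete_blk)
 *              ocfs2_truncate_log_append(osb, handle, delete_blk,
 *                                        clusters_to_del);
 */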
6322 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6323                            handle_t *handle, struct ocfs2_truncate_context *tc,
6324                            u32 clusters_to_del, u64 *delete_start)
6325 {
6326         int ret, i, index = path->p_tree_depth;
6327         u32 new_edge = 0;
6328         u64 deleted_eb = 0;
6329         struct buffer_head *bh;
6330         struct ocfs2_extent_list *el;
6331         struct ocfs2_extent_rec *rec;
6332
6333         *delete_start = 0;
6334
6335         while (index >= 0) {
6336                 bh = path->p_node[index].bh;
6337                 el = path->p_node[index].el;
6338
6339                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6340                      index,  (unsigned long long)bh->b_blocknr);
6341
6342                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6343
6344                 if (index !=
6345                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6346                         ocfs2_error(inode->i_sb,
6347                                     "Inode %lu has invalid ext. block %llu",
6348                                     inode->i_ino,
6349                                     (unsigned long long)bh->b_blocknr);
6350                         ret = -EROFS;
6351                         goto out;
6352                 }
6353
6354 find_tail_record:
6355                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6356                 rec = &el->l_recs[i];
6357
6358                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6359                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6360                      ocfs2_rec_clusters(el, rec),
6361                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6362                      le16_to_cpu(el->l_next_free_rec));
6363
6364                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6365
6366                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6367                         /*
6368                          * If the leaf block contains nothing but a
6369                          * single empty extent, we can just remove
6370                          * the block.
6371                          */
6372                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6373                                 memset(rec, 0,
6374                                        sizeof(struct ocfs2_extent_rec));
6375                                 el->l_next_free_rec = cpu_to_le16(0);
6376
6377                                 goto delete;
6378                         }
6379
6380                         /*
6381                          * Remove any empty extents by shifting things
6382                          * left. That should make life much easier on
6383                          * the code below. This condition is rare
6384                          * enough that we shouldn't see a performance
6385                          * hit.
6386                          */
6387                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6388                                 le16_add_cpu(&el->l_next_free_rec, -1);
6389
6390                                 for(i = 0;
6391                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6392                                         el->l_recs[i] = el->l_recs[i + 1];
6393
6394                                 memset(&el->l_recs[i], 0,
6395                                        sizeof(struct ocfs2_extent_rec));
6396
6397                                 /*
6398                                  * We've modified our extent list. The
6399                                  * simplest way to handle this change
6400                                  * is to begin the search from the
6401                                  * start again.
6402                                  */
6403                                 goto find_tail_record;
6404                         }
6405
6406                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6407
6408                         /*
6409                          * We'll use "new_edge" on our way back up the
6410                          * tree to know what our rightmost cpos is.
6411                          */
6412                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6413                         new_edge += le32_to_cpu(rec->e_cpos);
6414
6415                         /*
6416                          * The caller will use this to delete data blocks.
6417                          */
6418                         *delete_start = le64_to_cpu(rec->e_blkno)
6419                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6420                                         le16_to_cpu(rec->e_leaf_clusters));
6421
6422                         /*
6423                          * If it's now empty, remove this record.
6424                          */
6425                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6426                                 memset(rec, 0,
6427                                        sizeof(struct ocfs2_extent_rec));
6428                                 le16_add_cpu(&el->l_next_free_rec, -1);
6429                         }
6430                 } else {
6431                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6432                                 memset(rec, 0,
6433                                        sizeof(struct ocfs2_extent_rec));
6434                                 le16_add_cpu(&el->l_next_free_rec, -1);
6435
6436                                 goto delete;
6437                         }
6438
6439                         /* Can this actually happen? */
6440                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6441                                 goto delete;
6442
6443                         /*
6444                          * We never actually deleted any clusters
6445                          * because our leaf was empty. There's no
6446                          * reason to adjust the rightmost edge then.
6447                          */
6448                         if (new_edge == 0)
6449                                 goto delete;
6450
6451                         rec->e_int_clusters = cpu_to_le32(new_edge);
6452                         le32_add_cpu(&rec->e_int_clusters,
6453                                      -le32_to_cpu(rec->e_cpos));
6454
6455                          /*
6456                           * A deleted child record should have been
6457                           * caught above.
6458                           */
6459                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6460                 }
6461
6462 delete:
6463                 ret = ocfs2_journal_dirty(handle, bh);
6464                 if (ret) {
6465                         mlog_errno(ret);
6466                         goto out;
6467                 }
6468
6469                 mlog(0, "extent list container %llu, after: record %d: "
6470                      "(%u, %u, %llu), next = %u.\n",
6471                      (unsigned long long)bh->b_blocknr, i,
6472                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6473                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6474                      le16_to_cpu(el->l_next_free_rec));
6475
6476                 /*
6477                  * We must be careful to only attempt delete of an
6478                  * extent block (and not the root inode block).
6479                  */
6480                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6481                         struct ocfs2_extent_block *eb =
6482                                 (struct ocfs2_extent_block *)bh->b_data;
6483
6484                         /*
6485                          * Save this for use when processing the
6486                          * parent block.
6487                          */
6488                         deleted_eb = le64_to_cpu(eb->h_blkno);
6489
6490                         mlog(0, "deleting this extent block.\n");
6491
6492                         ocfs2_remove_from_cache(inode, bh);
6493
6494                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6495                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6496                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6497
6498                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6499                         /* An error here is not fatal. */
6500                         if (ret < 0)
6501                                 mlog_errno(ret);
6502                 } else {
6503                         deleted_eb = 0;
6504                 }
6505
6506                 index--;
6507         }
6508
6509         ret = 0;
6510 out:
6511         return ret;
6512 }
6513
6514 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6515                              unsigned int clusters_to_del,
6516                              struct inode *inode,
6517                              struct buffer_head *fe_bh,
6518                              handle_t *handle,
6519                              struct ocfs2_truncate_context *tc,
6520                              struct ocfs2_path *path)
6521 {
6522         int status;
6523         struct ocfs2_dinode *fe;
6524         struct ocfs2_extent_block *last_eb = NULL;
6525         struct ocfs2_extent_list *el;
6526         struct buffer_head *last_eb_bh = NULL;
6527         u64 delete_blk = 0;
6528
6529         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6530
6531         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6532                                              path, &last_eb_bh);
6533         if (status < 0) {
6534                 mlog_errno(status);
6535                 goto bail;
6536         }
6537
6538         /*
6539          * Each component will be touched, so we might as well journal
6540          * here to avoid having to handle errors later.
6541          */
6542         status = ocfs2_journal_access_path(inode, handle, path);
6543         if (status < 0) {
6544                 mlog_errno(status);
6545                 goto bail;
6546         }
6547
6548         if (last_eb_bh) {
6549                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6550                                               OCFS2_JOURNAL_ACCESS_WRITE);
6551                 if (status < 0) {
6552                         mlog_errno(status);
6553                         goto bail;
6554                 }
6555
6556                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6557         }
6558
6559         el = &(fe->id2.i_list);
6560
6561         /*
6562          * Lower levels depend on this never happening, but it's best
6563          * to check it up here before changing the tree.
6564          */
6565         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6566                 ocfs2_error(inode->i_sb,
6567                             "Inode %lu has an empty extent record, depth %u\n",
6568                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6569                 status = -EROFS;
6570                 goto bail;
6571         }
6572
6573         vfs_dq_free_space_nodirty(inode,
6574                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6575         spin_lock(&OCFS2_I(inode)->ip_lock);
6576         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6577                                       clusters_to_del;
6578         spin_unlock(&OCFS2_I(inode)->ip_lock);
6579         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6580         inode->i_blocks = ocfs2_inode_sector_count(inode);
6581
6582         status = ocfs2_trim_tree(inode, path, handle, tc,
6583                                  clusters_to_del, &delete_blk);
6584         if (status) {
6585                 mlog_errno(status);
6586                 goto bail;
6587         }
6588
6589         if (le32_to_cpu(fe->i_clusters) == 0) {
6590                 /* trunc to zero is a special case. */
6591                 el->l_tree_depth = 0;
6592                 fe->i_last_eb_blk = 0;
6593         } else if (last_eb)
6594                 fe->i_last_eb_blk = last_eb->h_blkno;
6595
6596         status = ocfs2_journal_dirty(handle, fe_bh);
6597         if (status < 0) {
6598                 mlog_errno(status);
6599                 goto bail;
6600         }
6601
6602         if (last_eb) {
6603                 /* If there will be a new last extent block, then by
6604                  * definition, there cannot be any leaves to the right of
6605                  * it. */
6606                 last_eb->h_next_leaf_blk = 0;
6607                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6608                 if (status < 0) {
6609                         mlog_errno(status);
6610                         goto bail;
6611                 }
6612         }
6613
6614         if (delete_blk) {
6615                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6616                                                    clusters_to_del);
6617                 if (status < 0) {
6618                         mlog_errno(status);
6619                         goto bail;
6620                 }
6621         }
6622         status = 0;
6623 bail:
6624
6625         mlog_exit(status);
6626         return status;
6627 }
6628
6629 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6630 {
6631         set_buffer_uptodate(bh);
6632         mark_buffer_dirty(bh);
6633         return 0;
6634 }
6635
6636 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6637                                      unsigned int from, unsigned int to,
6638                                      struct page *page, int zero, u64 *phys)
6639 {
6640         int ret, partial = 0;
6641
6642         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6643         if (ret)
6644                 mlog_errno(ret);
6645
6646         if (zero)
6647                 zero_user_segment(page, from, to);
6648
6649         /*
6650          * Need to set the buffers we zero'd into uptodate
6651          * here if they aren't - ocfs2_map_page_blocks()
6652          * might've skipped some
6653          */
6654         ret = walk_page_buffers(handle, page_buffers(page),
6655                                 from, to, &partial,
6656                                 ocfs2_zero_func);
6657         if (ret < 0)
6658                 mlog_errno(ret);
6659         else if (ocfs2_should_order_data(inode)) {
6660                 ret = ocfs2_jbd2_file_inode(handle, inode);
6661                 if (ret < 0)
6662                         mlog_errno(ret);
6663         }
6664
6665         if (!partial)
6666                 SetPageUptodate(page);
6667
6668         flush_dcache_page(page);
6669 }
6670
6671 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6672                                      loff_t end, struct page **pages,
6673                                      int numpages, u64 phys, handle_t *handle)
6674 {
6675         int i;
6676         struct page *page;
6677         unsigned int from, to = PAGE_CACHE_SIZE;
6678         struct super_block *sb = inode->i_sb;
6679
6680         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6681
6682         if (numpages == 0)
6683                 goto out;
6684
6685         to = PAGE_CACHE_SIZE;
6686         for(i = 0; i < numpages; i++) {
6687                 page = pages[i];
6688
6689                 from = start & (PAGE_CACHE_SIZE - 1);
6690                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6691                         to = end & (PAGE_CACHE_SIZE - 1);
6692
6693                 BUG_ON(from > PAGE_CACHE_SIZE);
6694                 BUG_ON(to > PAGE_CACHE_SIZE);
6695
6696                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6697                                          &phys);
6698
6699                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6700         }
6701 out:
6702         if (pages)
6703                 ocfs2_unlock_and_free_pages(pages, numpages);
6704 }
6705
6706 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6707                                 struct page **pages, int *num)
6708 {
6709         int numpages, ret = 0;
6710         struct super_block *sb = inode->i_sb;
6711         struct address_space *mapping = inode->i_mapping;
6712         unsigned long index;
6713         loff_t last_page_bytes;
6714
6715         BUG_ON(start > end);
6716
6717         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6718                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6719
6720         numpages = 0;
6721         last_page_bytes = PAGE_ALIGN(end);
6722         index = start >> PAGE_CACHE_SHIFT;
6723         do {
6724                 pages[numpages] = grab_cache_page(mapping, index);
6725                 if (!pages[numpages]) {
6726                         ret = -ENOMEM;
6727                         mlog_errno(ret);
6728                         goto out;
6729                 }
6730
6731                 numpages++;
6732                 index++;
6733         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6734
6735 out:
6736         if (ret != 0) {
6737                 if (pages)
6738                         ocfs2_unlock_and_free_pages(pages, numpages);
6739                 numpages = 0;
6740         }
6741
6742         *num = numpages;
6743
6744         return ret;
6745 }
6746
6747 /*
6748  * Zero the area past i_size but still within an allocated
6749  * cluster. This avoids exposing nonzero data on subsequent file
6750  * extends.
6751  *
6752  * We need to call this before i_size is updated on the inode because
6753  * otherwise block_write_full_page() will skip writeout of pages past
6754  * i_size. The new_i_size parameter is passed for this reason.
6755  * i_size. The byte range to zero is therefore passed in explicitly.
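 *
 * For example (illustrative numbers only): with a 4K cluster size,
 * shrinking a file so that its new size is 1000 bytes would zero
 * bytes 1000-4095 of the last allocated cluster, i.e. range_start
 * would be 1000 and range_end 4096.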
6756 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6757                                   u64 range_start, u64 range_end)
6758 {
6759         int ret = 0, numpages;
6760         struct page **pages = NULL;
6761         u64 phys;
6762         unsigned int ext_flags;
6763         struct super_block *sb = inode->i_sb;
6764
6765         /*
6766          * File systems which don't support sparse files zero on every
6767          * extend.
6768          */
6769         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6770                 return 0;
6771
6772         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6773                         sizeof(struct page *), GFP_NOFS);
6774         if (pages == NULL) {
6775                 ret = -ENOMEM;
6776                 mlog_errno(ret);
6777                 goto out;
6778         }
6779
6780         if (range_start == range_end)
6781                 goto out;
6782
6783         ret = ocfs2_extent_map_get_blocks(inode,
6784                                           range_start >> sb->s_blocksize_bits,
6785                                           &phys, NULL, &ext_flags);
6786         if (ret) {
6787                 mlog_errno(ret);
6788                 goto out;
6789         }
6790
6791         /*
6792          * Tail is a hole, or is marked unwritten. In either case, we
6793          * can count on read and write to return/push zero's.
6794  * can count on read and write to return/push zeros.
6795         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6796                 goto out;
6797
6798         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6799                                    &numpages);
6800         if (ret) {
6801                 mlog_errno(ret);
6802                 goto out;
6803         }
6804
6805         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6806                                  numpages, phys, handle);
6807
6808         /*
6809          * Initiate writeout of the pages we zero'd here. We don't
6810          * wait on them - the truncate_inode_pages() call later will
6811          * do that for us.
6812          */
6813         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6814                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6815         if (ret)
6816                 mlog_errno(ret);
6817
6818 out:
6819         kfree(pages);
6821
6822         return ret;
6823 }
6824
6825 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6826                                              struct ocfs2_dinode *di)
6827 {
6828         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6829         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6830
6831         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6832                 memset(&di->id2, 0, blocksize -
6833                                     offsetof(struct ocfs2_dinode, id2) -
6834                                     xattrsize);
6835         else
6836                 memset(&di->id2, 0, blocksize -
6837                                     offsetof(struct ocfs2_dinode, id2));
6838 }
6839
6840 void ocfs2_dinode_new_extent_list(struct inode *inode,
6841                                   struct ocfs2_dinode *di)
6842 {
6843         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6844         di->id2.i_list.l_tree_depth = 0;
6845         di->id2.i_list.l_next_free_rec = 0;
6846         di->id2.i_list.l_count = cpu_to_le16(
6847                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6848 }
6849
6850 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6851 {
6852         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6853         struct ocfs2_inline_data *idata = &di->id2.i_data;
6854
6855         spin_lock(&oi->ip_lock);
6856         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6857         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6858         spin_unlock(&oi->ip_lock);
6859
6860         /*
6861          * We clear the entire i_data structure here so that all
6862          * fields can be properly initialized.
6863          */
6864         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6865
6866         idata->id_count = cpu_to_le16(
6867                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6868 }
6869
6870 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6871                                          struct buffer_head *di_bh)
6872 {
6873         int ret, i, has_data, num_pages = 0;
6874         handle_t *handle;
6875         u64 uninitialized_var(block);
6876         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6877         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6878         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6879         struct ocfs2_alloc_context *data_ac = NULL;
6880         struct page **pages = NULL;
6881         loff_t end = osb->s_clustersize;
6882         struct ocfs2_extent_tree et;
6883         int did_quota = 0;
6884
6885         has_data = i_size_read(inode) ? 1 : 0;
6886
6887         if (has_data) {
6888                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6889                                 sizeof(struct page *), GFP_NOFS);
6890                 if (pages == NULL) {
6891                         ret = -ENOMEM;
6892                         mlog_errno(ret);
6893                         goto out;
6894                 }
6895
6896                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6897                 if (ret) {
6898                         mlog_errno(ret);
6899                         goto out;
6900                 }
6901         }
6902
6903         handle = ocfs2_start_trans(osb,
6904                                    ocfs2_inline_to_extents_credits(osb->sb));
6905         if (IS_ERR(handle)) {
6906                 ret = PTR_ERR(handle);
6907                 mlog_errno(ret);
6908                 goto out_unlock;
6909         }
6910
6911         ret = ocfs2_journal_access(handle, inode, di_bh,
6912                                    OCFS2_JOURNAL_ACCESS_WRITE);
6913         if (ret) {
6914                 mlog_errno(ret);
6915                 goto out_commit;
6916         }
6917
6918         if (has_data) {
6919                 u32 bit_off, num;
6920                 unsigned int page_end;
6921                 u64 phys;
6922
6923                 if (vfs_dq_alloc_space_nodirty(inode,
6924                                        ocfs2_clusters_to_bytes(osb->sb, 1))) {
6925                         ret = -EDQUOT;
6926                         goto out_commit;
6927                 }
6928                 did_quota = 1;
6929
6930                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6931                                            &num);
6932                 if (ret) {
6933                         mlog_errno(ret);
6934                         goto out_commit;
6935                 }
6936
6937                 /*
6938                  * Save two copies, one for insert, and one that can
6939                  * be changed by ocfs2_map_and_dirty_page() below.
6940                  */
6941                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6942
6943                 /*
6944                  * Non sparse file systems zero on extend, so no need
6945                  * to do that now.
6946                  */
6947                 if (!ocfs2_sparse_alloc(osb) &&
6948                     PAGE_CACHE_SIZE < osb->s_clustersize)
6949                         end = PAGE_CACHE_SIZE;
6950
6951                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6952                 if (ret) {
6953                         mlog_errno(ret);
6954                         goto out_commit;
6955                 }
6956
6957                 /*
6958                  * This should populate the 1st page for us and mark
6959                  * it up to date.
6960                  */
6961                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6962                 if (ret) {
6963                         mlog_errno(ret);
6964                         goto out_commit;
6965                 }
6966
6967                 page_end = PAGE_CACHE_SIZE;
6968                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6969                         page_end = osb->s_clustersize;
6970
6971                 for (i = 0; i < num_pages; i++)
6972                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6973                                                  pages[i], i > 0, &phys);
6974         }
6975
6976         spin_lock(&oi->ip_lock);
6977         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6978         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6979         spin_unlock(&oi->ip_lock);
6980
6981         ocfs2_dinode_new_extent_list(inode, di);
6982
6983         ocfs2_journal_dirty(handle, di_bh);
6984
6985         if (has_data) {
6986                 /*
6987                  * An error at this point should be extremely rare. If
6988                  * this proves to be false, we could always re-build
6989                  * the in-inode data from our pages.
6990                  */
6991                 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6992                 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6993                                           0, block, 1, 0, NULL);
6994                 if (ret) {
6995                         mlog_errno(ret);
6996                         goto out_commit;
6997                 }
6998
6999                 inode->i_blocks = ocfs2_inode_sector_count(inode);
7000         }
7001
7002 out_commit:
7003         if (ret < 0 && did_quota)
7004                 vfs_dq_free_space_nodirty(inode,
7005                                           ocfs2_clusters_to_bytes(osb->sb, 1));
7006
7007         ocfs2_commit_trans(osb, handle);
7008
7009 out_unlock:
7010         if (data_ac)
7011                 ocfs2_free_alloc_context(data_ac);
7012
7013 out:
7014         if (pages) {
7015                 ocfs2_unlock_and_free_pages(pages, num_pages);
7016                 kfree(pages);
7017         }
7018
7019         return ret;
7020 }
7021
7022 /*
7023  * It is expected that, by the time you call this function,
7024  * inode->i_size and fe->i_size have already been adjusted.
7025  *
7026  * WARNING: This will kfree the truncate context
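 *
 * A typical caller pairs this with ocfs2_prepare_truncate().  A minimal
 * sketch (cluster locking, i_size updates, journaling and error handling
 * are all omitted here):
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	if (!status)
 *		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);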
7027  */
7028 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7029                           struct inode *inode,
7030                           struct buffer_head *fe_bh,
7031                           struct ocfs2_truncate_context *tc)
7032 {
7033         int status, i, credits, tl_sem = 0;
7034         u32 clusters_to_del, new_highest_cpos, range;
7035         struct ocfs2_extent_list *el;
7036         handle_t *handle = NULL;
7037         struct inode *tl_inode = osb->osb_tl_inode;
7038         struct ocfs2_path *path = NULL;
7039         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7040
7041         mlog_entry_void();
7042
7043         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7044                                                      i_size_read(inode));
7045
7046         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
7047         if (!path) {
7048                 status = -ENOMEM;
7049                 mlog_errno(status);
7050                 goto bail;
7051         }
7052
7053         ocfs2_extent_map_trunc(inode, new_highest_cpos);
7054
7055 start:
7056         /*
7057          * Check that we still have allocation to delete.
7058          */
7059         if (OCFS2_I(inode)->ip_clusters == 0) {
7060                 status = 0;
7061                 goto bail;
7062         }
7063
7064         /*
7065          * Truncate always works against the rightmost tree branch.
7066          */
7067         status = ocfs2_find_path(inode, path, UINT_MAX);
7068         if (status) {
7069                 mlog_errno(status);
7070                 goto bail;
7071         }
7072
7073         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7074              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7075
7076         /*
7077          * By now, el will point to the extent list on the bottommost
7078          * portion of this tree. Only the tail record is considered in
7079          * each pass.
7080          *
7081          * We handle the following cases, in order:
7082          * - empty extent: delete the remaining branch
7083          * - remove the entire record
7084          * - remove a partial record
7085          * - no record needs to be removed (truncate has completed)
7086          */
7087         el = path_leaf_el(path);
7088         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7089                 ocfs2_error(inode->i_sb,
7090                             "Inode %llu has empty extent block at %llu\n",
7091                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7092                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7093                 status = -EROFS;
7094                 goto bail;
7095         }
7096
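        /*
         * For example (numbers purely illustrative): if the tail record
         * starts at cpos 100 and covers 8 clusters (range = 108) while
         * new_highest_cpos is 104, the partial-record case applies and
         * clusters_to_del = (8 + 100) - 104 = 4.
         */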
7097         i = le16_to_cpu(el->l_next_free_rec) - 1;
7098         range = le32_to_cpu(el->l_recs[i].e_cpos) +
7099                 ocfs2_rec_clusters(el, &el->l_recs[i]);
7100         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7101                 clusters_to_del = 0;
7102         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7103                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7104         } else if (range > new_highest_cpos) {
7105                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7106                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
7107                                   new_highest_cpos;
7108         } else {
7109                 status = 0;
7110                 goto bail;
7111         }
7112
7113         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7114              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7115
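        /*
         * The truncate log inode's mutex is held from here until this
         * pass's deallocation has been recorded, keeping the flush check
         * below and the record we add serialized with other users of the
         * truncate log.
         */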
7116         mutex_lock(&tl_inode->i_mutex);
7117         tl_sem = 1;
7118         /* This pass needs at least one free truncate log record. If
7119          * ocfs2_truncate_log_needs_flush() reports that none is
7120          * available, we flush to get an empty truncate log.  */
7121         if (ocfs2_truncate_log_needs_flush(osb)) {
7122                 status = __ocfs2_flush_truncate_log(osb);
7123                 if (status < 0) {
7124                         mlog_errno(status);
7125                         goto bail;
7126                 }
7127         }
7128
7129         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7130                                                 (struct ocfs2_dinode *)fe_bh->b_data,
7131                                                 el);
7132         handle = ocfs2_start_trans(osb, credits);
7133         if (IS_ERR(handle)) {
7134                 status = PTR_ERR(handle);
7135                 handle = NULL;
7136                 mlog_errno(status);
7137                 goto bail;
7138         }
7139
7140         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7141                                    tc, path);
7142         if (status < 0) {
7143                 mlog_errno(status);
7144                 goto bail;
7145         }
7146
7147         mutex_unlock(&tl_inode->i_mutex);
7148         tl_sem = 0;
7149
7150         ocfs2_commit_trans(osb, handle);
7151         handle = NULL;
7152
7153         ocfs2_reinit_path(path, 1);
7154
7155         /*
7156          * The check above will catch the case where we've truncated
7157          * away all allocation.
7158          */
7159         goto start;
7160
7161 bail:
7162
7163         ocfs2_schedule_truncate_log_flush(osb, 1);
7164
7165         if (tl_sem)
7166                 mutex_unlock(&tl_inode->i_mutex);
7167
7168         if (handle)
7169                 ocfs2_commit_trans(osb, handle);
7170
7171         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7172
7173         ocfs2_free_path(path);
7174
7175         /* This releases the last_eb_bh reference and frees the truncate context */
7176         ocfs2_free_truncate_context(tc);
7177
7178         mlog_exit(status);
7179         return status;
7180 }
7181
7182 /*
7183  * Expects the inode to already be locked.
7184  */
7185 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7186                            struct inode *inode,
7187                            struct buffer_head *fe_bh,
7188                            struct ocfs2_truncate_context **tc)
7189 {
7190         int status;
7191         unsigned int new_i_clusters;
7192         struct ocfs2_dinode *fe;
7193         struct ocfs2_extent_block *eb;
7194         struct buffer_head *last_eb_bh = NULL;
7195
7196         mlog_entry_void();
7197
7198         *tc = NULL;
7199
7200         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7201                                                   i_size_read(inode));
7202         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7203
7204         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
7205              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7206              (unsigned long long)le64_to_cpu(fe->i_size));
7207
7208         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7209         if (!(*tc)) {
7210                 status = -ENOMEM;
7211                 mlog_errno(status);
7212                 goto bail;
7213         }
7214         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7215
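        /*
         * If the tree has any depth, pin its rightmost extent block now.
         * The reference is kept in the truncate context and released by
         * ocfs2_free_truncate_context().
         */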
7216         if (fe->id2.i_list.l_tree_depth) {
7217                 status = ocfs2_read_extent_block(inode,
7218                                                  le64_to_cpu(fe->i_last_eb_blk),
7219                                                  &last_eb_bh);
7220                 if (status < 0) {
7221                         mlog_errno(status);
7222                         goto bail;
7223                 }
7224                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7225         }
7226
7227         (*tc)->tc_last_eb_bh = last_eb_bh;
7228
7229         status = 0;
7230 bail:
7231         if (status < 0) {
7232                 if (*tc)
7233                         ocfs2_free_truncate_context(*tc);
7234                 *tc = NULL;
7235         }
7236         mlog_exit_void();
7237         return status;
7238 }
7239
7240 /*
7241  * 'start' is inclusive, 'end' is not.
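 *
 * Zeroes the inline data bytes in [start, end) and, when 'trunc' is
 * non-zero, also pulls i_size back to 'start'.  For instance (offsets
 * purely illustrative), punching out bytes 100 through 199 of an inline
 * file without shrinking it would be:
 *
 *	ocfs2_truncate_inline(inode, di_bh, 100, 200, 0);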
7242  */
7243 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7244                           unsigned int start, unsigned int end, int trunc)
7245 {
7246         int ret;
7247         unsigned int numbytes;
7248         handle_t *handle;
7249         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7250         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7251         struct ocfs2_inline_data *idata = &di->id2.i_data;
7252
7253         if (end > i_size_read(inode))
7254                 end = i_size_read(inode);
7255
7256         BUG_ON(start >= end);
7257
7258         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7259             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7260             !ocfs2_supports_inline_data(osb)) {
7261                 ocfs2_error(inode->i_sb,
7262                             "Inline data flags for inode %llu don't agree! "
7263                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7264                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7265                             le16_to_cpu(di->i_dyn_features),
7266                             OCFS2_I(inode)->ip_dyn_features,
7267                             osb->s_feature_incompat);
7268                 ret = -EROFS;
7269                 goto out;
7270         }
7271
7272         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7273         if (IS_ERR(handle)) {
7274                 ret = PTR_ERR(handle);
7275                 mlog_errno(ret);
7276                 goto out;
7277         }
7278
7279         ret = ocfs2_journal_access(handle, inode, di_bh,
7280                                    OCFS2_JOURNAL_ACCESS_WRITE);
7281         if (ret) {
7282                 mlog_errno(ret);
7283                 goto out_commit;
7284         }
7285
7286         numbytes = end - start;
7287         memset(idata->id_data + start, 0, numbytes);
7288
7289         /*
7290          * No need to worry about the data page here - it's been
7291          * truncated already and inline data doesn't need it for
7292          * pushing zeros to disk, so we'll let readpage pick it up
7293          * later.
7294          */
7295         if (trunc) {
7296                 i_size_write(inode, start);
7297                 di->i_size = cpu_to_le64(start);
7298         }
7299
7300         inode->i_blocks = ocfs2_inode_sector_count(inode);
7301         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7302
7303         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7304         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7305
7306         ocfs2_journal_dirty(handle, di_bh);
7307
7308 out_commit:
7309         ocfs2_commit_trans(osb, handle);
7310
7311 out:
7312         return ret;
7313 }
7314
7315 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7316 {
7317         /*
7318          * The caller is responsible for completing deallocation
7319          * before freeing the context.
7320          */
7321         if (tc->tc_dealloc.c_first_suballocator != NULL)
7322                 mlog(ML_NOTICE,
7323                      "Truncate completion has non-empty dealloc context\n");
7324
7325         brelse(tc->tc_last_eb_bh);
7326
7327         kfree(tc);
7328 }