ocfs2: Add quota calls for allocation and freeing of inodes and space
fs/ocfs2/alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35
36 #include "ocfs2.h"
37
38 #include "alloc.h"
39 #include "aops.h"
40 #include "dlmglue.h"
41 #include "extent_map.h"
42 #include "inode.h"
43 #include "journal.h"
44 #include "localalloc.h"
45 #include "suballoc.h"
46 #include "sysfile.h"
47 #include "file.h"
48 #include "super.h"
49 #include "uptodate.h"
50
51 #include "buffer_head_io.h"
52
53
54 /*
55  * Operations for a specific extent tree type.
56  *
57  * To implement an on-disk btree (extent tree) type in ocfs2, add
58  * an ocfs2_extent_tree_operations structure and the matching
59  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
60  * for the allocation portion of the extent tree.
61  */
62 struct ocfs2_extent_tree_operations {
63         /*
64          * last_eb_blk is the block number of the right most leaf extent
65          * block.  Most on-disk structures containing an extent tree store
66          * this value for fast access.  The ->eo_set_last_eb_blk() and
67          * ->eo_get_last_eb_blk() operations access this value.  They are
68          *  both required.
69          */
70         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
71                                    u64 blkno);
72         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
73
74         /*
75          * The on-disk structure usually keeps track of how many total
76          * clusters are stored in this extent tree.  This function updates
77          * that value.  new_clusters is the delta, and must be
78          * added to the total.  Required.
79          */
80         void (*eo_update_clusters)(struct inode *inode,
81                                    struct ocfs2_extent_tree *et,
82                                    u32 new_clusters);
83
84         /*
85          * If ->eo_insert_check() exists, it is called before rec is
86          * inserted into the extent tree.  It is optional.
87          */
88         int (*eo_insert_check)(struct inode *inode,
89                                struct ocfs2_extent_tree *et,
90                                struct ocfs2_extent_rec *rec);
91         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
92
93         /*
94          * --------------------------------------------------------------
95          * The remaining callbacks are internal to ocfs2_extent_tree and
96          * don't have accessor functions.
97          */
98
99         /*
100          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
101          * It is required.
102          */
103         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
104
105         /*
106          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
107          * it exists.  If it does not, et->et_max_leaf_clusters is set
108          * to 0 (unlimited).  Optional.
109          */
110         void (*eo_fill_max_leaf_clusters)(struct inode *inode,
111                                           struct ocfs2_extent_tree *et);
112 };
113
114
115 /*
116  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
117  * in the methods.
118  */
119 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
120 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
121                                          u64 blkno);
122 static void ocfs2_dinode_update_clusters(struct inode *inode,
123                                          struct ocfs2_extent_tree *et,
124                                          u32 clusters);
125 static int ocfs2_dinode_insert_check(struct inode *inode,
126                                      struct ocfs2_extent_tree *et,
127                                      struct ocfs2_extent_rec *rec);
128 static int ocfs2_dinode_sanity_check(struct inode *inode,
129                                      struct ocfs2_extent_tree *et);
130 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
131 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
132         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
133         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
134         .eo_update_clusters     = ocfs2_dinode_update_clusters,
135         .eo_insert_check        = ocfs2_dinode_insert_check,
136         .eo_sanity_check        = ocfs2_dinode_sanity_check,
137         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
138 };
139
140 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
141                                          u64 blkno)
142 {
143         struct ocfs2_dinode *di = et->et_object;
144
145         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
146         di->i_last_eb_blk = cpu_to_le64(blkno);
147 }
148
149 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
150 {
151         struct ocfs2_dinode *di = et->et_object;
152
153         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
154         return le64_to_cpu(di->i_last_eb_blk);
155 }
156
157 static void ocfs2_dinode_update_clusters(struct inode *inode,
158                                          struct ocfs2_extent_tree *et,
159                                          u32 clusters)
160 {
161         struct ocfs2_dinode *di = et->et_object;
162
163         le32_add_cpu(&di->i_clusters, clusters);
164         spin_lock(&OCFS2_I(inode)->ip_lock);
165         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
166         spin_unlock(&OCFS2_I(inode)->ip_lock);
167 }
168
169 static int ocfs2_dinode_insert_check(struct inode *inode,
170                                      struct ocfs2_extent_tree *et,
171                                      struct ocfs2_extent_rec *rec)
172 {
173         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
174
175         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
176         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
177                         (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
178                         "Device %s, asking for sparse allocation: inode %llu, "
179                         "cpos %u, clusters %u\n",
180                         osb->dev_str,
181                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
182                         rec->e_cpos,
183                         OCFS2_I(inode)->ip_clusters);
184
185         return 0;
186 }
187
188 static int ocfs2_dinode_sanity_check(struct inode *inode,
189                                      struct ocfs2_extent_tree *et)
190 {
191         struct ocfs2_dinode *di = et->et_object;
192
193         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
194         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
195
196         return 0;
197 }
198
199 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
200 {
201         struct ocfs2_dinode *di = et->et_object;
202
203         et->et_root_el = &di->id2.i_list;
204 }
205
206
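/*
 * Extent tree operations for an xattr value root (ocfs2_xattr_value_root).
 * These update the xr_* fields of the value root; note that this tree type
 * implements only the four required callbacks - there is no insert_check
 * or sanity_check hook.
 */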
207 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
208 {
209         struct ocfs2_xattr_value_root *xv = et->et_object;
210
211         et->et_root_el = &xv->xr_list;
212 }
213
214 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
215                                               u64 blkno)
216 {
217         struct ocfs2_xattr_value_root *xv =
218                 (struct ocfs2_xattr_value_root *)et->et_object;
219
220         xv->xr_last_eb_blk = cpu_to_le64(blkno);
221 }
222
223 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
224 {
225         struct ocfs2_xattr_value_root *xv =
226                 (struct ocfs2_xattr_value_root *) et->et_object;
227
228         return le64_to_cpu(xv->xr_last_eb_blk);
229 }
230
231 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
232                                               struct ocfs2_extent_tree *et,
233                                               u32 clusters)
234 {
235         struct ocfs2_xattr_value_root *xv =
236                 (struct ocfs2_xattr_value_root *)et->et_object;
237
238         le32_add_cpu(&xv->xr_clusters, clusters);
239 }
240
241 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
242         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
243         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
244         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
245         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
246 };
247
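/*
 * Extent tree operations for the xattr index tree rooted in an
 * ocfs2_xattr_block. Unlike the dinode and xattr value trees, this one
 * caps its leaves at OCFS2_MAX_XATTR_TREE_LEAF_SIZE via
 * ->eo_fill_max_leaf_clusters().
 */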
248 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
249 {
250         struct ocfs2_xattr_block *xb = et->et_object;
251
252         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
253 }
254
255 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
256                                                     struct ocfs2_extent_tree *et)
257 {
258         et->et_max_leaf_clusters =
259                 ocfs2_clusters_for_bytes(inode->i_sb,
260                                          OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
261 }
262
263 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
264                                              u64 blkno)
265 {
266         struct ocfs2_xattr_block *xb = et->et_object;
267         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
268
269         xt->xt_last_eb_blk = cpu_to_le64(blkno);
270 }
271
272 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
273 {
274         struct ocfs2_xattr_block *xb = et->et_object;
275         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
276
277         return le64_to_cpu(xt->xt_last_eb_blk);
278 }
279
280 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
281                                              struct ocfs2_extent_tree *et,
282                                              u32 clusters)
283 {
284         struct ocfs2_xattr_block *xb = et->et_object;
285
286         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
287 }
288
289 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
290         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
291         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
292         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
293         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
294         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295 };
296
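/*
 * Common initializer behind the ocfs2_init_*_extent_tree() wrappers below.
 * When @obj is NULL the root object is the block data itself (the dinode
 * and xattr block cases); xattr value trees pass their
 * ocfs2_xattr_value_root explicitly.
 */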
297 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
298                                      struct inode *inode,
299                                      struct buffer_head *bh,
300                                      void *obj,
301                                      struct ocfs2_extent_tree_operations *ops)
302 {
303         et->et_ops = ops;
304         et->et_root_bh = bh;
305         if (!obj)
306                 obj = (void *)bh->b_data;
307         et->et_object = obj;
308
309         et->et_ops->eo_fill_root_el(et);
310         if (!et->et_ops->eo_fill_max_leaf_clusters)
311                 et->et_max_leaf_clusters = 0;
312         else
313                 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
314 }
315
316 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
317                                    struct inode *inode,
318                                    struct buffer_head *bh)
319 {
320         __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
321 }
322
323 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
324                                        struct inode *inode,
325                                        struct buffer_head *bh)
326 {
327         __ocfs2_init_extent_tree(et, inode, bh, NULL,
328                                  &ocfs2_xattr_tree_et_ops);
329 }
330
331 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
332                                         struct inode *inode,
333                                         struct buffer_head *bh,
334                                         struct ocfs2_xattr_value_root *xv)
335 {
336         __ocfs2_init_extent_tree(et, inode, bh, xv,
337                                  &ocfs2_xattr_value_et_ops);
338 }
339
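/*
 * Thin wrappers that dispatch through et->et_ops. The check helpers
 * below return 0 when the optional callback is not implemented.
 */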
340 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
341                                             u64 new_last_eb_blk)
342 {
343         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
344 }
345
346 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
347 {
348         return et->et_ops->eo_get_last_eb_blk(et);
349 }
350
351 static inline void ocfs2_et_update_clusters(struct inode *inode,
352                                             struct ocfs2_extent_tree *et,
353                                             u32 clusters)
354 {
355         et->et_ops->eo_update_clusters(inode, et, clusters);
356 }
357
358 static inline int ocfs2_et_insert_check(struct inode *inode,
359                                         struct ocfs2_extent_tree *et,
360                                         struct ocfs2_extent_rec *rec)
361 {
362         int ret = 0;
363
364         if (et->et_ops->eo_insert_check)
365                 ret = et->et_ops->eo_insert_check(inode, et, rec);
366         return ret;
367 }
368
369 static inline int ocfs2_et_sanity_check(struct inode *inode,
370                                         struct ocfs2_extent_tree *et)
371 {
372         int ret = 0;
373
374         if (et->et_ops->eo_sanity_check)
375                 ret = et->et_ops->eo_sanity_check(inode, et);
376         return ret;
377 }
378
379 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
380 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
381                                          struct ocfs2_extent_block *eb);
382
383 /*
384  * Structures which describe a path through a btree, and functions to
385  * manipulate them.
386  *
387  * The idea here is to be as generic as possible with the tree
388  * manipulation code.
389  */
390 struct ocfs2_path_item {
391         struct buffer_head              *bh;
392         struct ocfs2_extent_list        *el;
393 };
394
395 #define OCFS2_MAX_PATH_DEPTH    5
396
397 struct ocfs2_path {
398         int                     p_tree_depth;
399         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
400 };
401
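/*
 * Accessors for the two ends of a path: p_node[0] is the root of the
 * path and p_node[p_tree_depth] is its leaf. path_num_items() is the
 * total number of nodes, root through leaf.
 */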
402 #define path_root_bh(_path) ((_path)->p_node[0].bh)
403 #define path_root_el(_path) ((_path)->p_node[0].el)
404 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
405 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
406 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
407
408 /*
409  * Reset the actual path elements so that we can re-use the structure
410  * to build another path. Generally, this involves freeing the buffer
411  * heads.
412  */
413 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
414 {
415         int i, start = 0, depth = 0;
416         struct ocfs2_path_item *node;
417
418         if (keep_root)
419                 start = 1;
420
421         for(i = start; i < path_num_items(path); i++) {
422                 node = &path->p_node[i];
423
424                 brelse(node->bh);
425                 node->bh = NULL;
426                 node->el = NULL;
427         }
428
429         /*
430          * Tree depth may change during truncate or insert. If we're
431          * keeping the root extent list, then make sure that our path
432          * structure reflects the proper depth.
433          */
434         if (keep_root)
435                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
436
437         path->p_tree_depth = depth;
438 }
439
440 static void ocfs2_free_path(struct ocfs2_path *path)
441 {
442         if (path) {
443                 ocfs2_reinit_path(path, 0);
444                 kfree(path);
445         }
446 }
447
448 /*
449  * Copy all the elements of src into dest. After this call, src could be freed
450  * without affecting dest.
451  *
452  * Both paths should have the same root. Any non-root elements of dest
453  * will be freed.
454  */
455 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
456 {
457         int i;
458
459         BUG_ON(path_root_bh(dest) != path_root_bh(src));
460         BUG_ON(path_root_el(dest) != path_root_el(src));
461
462         ocfs2_reinit_path(dest, 1);
463
464         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
465                 dest->p_node[i].bh = src->p_node[i].bh;
466                 dest->p_node[i].el = src->p_node[i].el;
467
468                 if (dest->p_node[i].bh)
469                         get_bh(dest->p_node[i].bh);
470         }
471 }
472
473 /*
474  * Make the *dest path the same as src and re-initialize src path to
475  * have a root only.
476  */
477 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
478 {
479         int i;
480
481         BUG_ON(path_root_bh(dest) != path_root_bh(src));
482
483         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
484                 brelse(dest->p_node[i].bh);
485
486                 dest->p_node[i].bh = src->p_node[i].bh;
487                 dest->p_node[i].el = src->p_node[i].el;
488
489                 src->p_node[i].bh = NULL;
490                 src->p_node[i].el = NULL;
491         }
492 }
493
494 /*
495  * Insert an extent block at given index.
496  * Insert an extent block at the given index.
497  * This will not take an additional reference on eb_bh.
498  */
499 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
500                                         struct buffer_head *eb_bh)
501 {
502         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
503
504         /*
505          * Right now, no root bh is an extent block, so this helps
506          * catch code errors with dinode trees. The assertion can be
507          * safely removed if we ever need to insert extent block
508          * structures at the root.
509          */
510         BUG_ON(index == 0);
511
512         path->p_node[index].bh = eb_bh;
513         path->p_node[index].el = &eb->h_list;
514 }
515
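/*
 * Allocate a path with the given root. An extra reference is taken on
 * root_bh; free the path (and drop that reference) with ocfs2_free_path().
 */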
516 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
517                                          struct ocfs2_extent_list *root_el)
518 {
519         struct ocfs2_path *path;
520
521         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
522
523         path = kzalloc(sizeof(*path), GFP_NOFS);
524         if (path) {
525                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
526                 get_bh(root_bh);
527                 path_root_bh(path) = root_bh;
528                 path_root_el(path) = root_el;
529         }
530
531         return path;
532 }
533
534 /*
535  * Convenience function to journal all components in a path.
536  */
537 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
538                                      struct ocfs2_path *path)
539 {
540         int i, ret = 0;
541
542         if (!path)
543                 goto out;
544
545         for(i = 0; i < path_num_items(path); i++) {
546                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
547                                            OCFS2_JOURNAL_ACCESS_WRITE);
548                 if (ret < 0) {
549                         mlog_errno(ret);
550                         goto out;
551                 }
552         }
553
554 out:
555         return ret;
556 }
557
558 /*
559  * Return the index of the extent record which contains cluster #v_cluster.
560  * -1 is returned if it was not found.
561  *
562  * Should work fine on interior and exterior nodes.
563  */
564 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
565 {
566         int ret = -1;
567         int i;
568         struct ocfs2_extent_rec *rec;
569         u32 rec_end, rec_start, clusters;
570
571         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
572                 rec = &el->l_recs[i];
573
574                 rec_start = le32_to_cpu(rec->e_cpos);
575                 clusters = ocfs2_rec_clusters(el, rec);
576
577                 rec_end = rec_start + clusters;
578
579                 if (v_cluster >= rec_start && v_cluster < rec_end) {
580                         ret = i;
581                         break;
582                 }
583         }
584
585         return ret;
586 }
587
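/*
 * Result of a contiguity check: the record being inserted either cannot
 * be merged, abuts an existing record on its left, on its right, or on
 * both sides.
 */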
588 enum ocfs2_contig_type {
589         CONTIG_NONE = 0,
590         CONTIG_LEFT,
591         CONTIG_RIGHT,
592         CONTIG_LEFTRIGHT,
593 };
594
595
596 /*
597  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
598  * ocfs2_extent_contig only work properly against leaf nodes!
599  */
600 static int ocfs2_block_extent_contig(struct super_block *sb,
601                                      struct ocfs2_extent_rec *ext,
602                                      u64 blkno)
603 {
604         u64 blk_end = le64_to_cpu(ext->e_blkno);
605
606         blk_end += ocfs2_clusters_to_blocks(sb,
607                                     le16_to_cpu(ext->e_leaf_clusters));
608
609         return blkno == blk_end;
610 }
611
612 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
613                                   struct ocfs2_extent_rec *right)
614 {
615         u32 left_range;
616
617         left_range = le32_to_cpu(left->e_cpos) +
618                 le16_to_cpu(left->e_leaf_clusters);
619
620         return (left_range == le32_to_cpu(right->e_cpos));
621 }
622
623 static enum ocfs2_contig_type
624         ocfs2_extent_contig(struct inode *inode,
625                             struct ocfs2_extent_rec *ext,
626                             struct ocfs2_extent_rec *insert_rec)
627 {
628         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
629
630         /*
631          * Refuse to coalesce extent records with different flag
632          * fields - we don't want to mix unwritten extents with user
633          * data.
634          */
635         if (ext->e_flags != insert_rec->e_flags)
636                 return CONTIG_NONE;
637
638         if (ocfs2_extents_adjacent(ext, insert_rec) &&
639             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
640                         return CONTIG_RIGHT;
641
642         blkno = le64_to_cpu(ext->e_blkno);
643         if (ocfs2_extents_adjacent(insert_rec, ext) &&
644             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
645                 return CONTIG_LEFT;
646
647         return CONTIG_NONE;
648 }
649
650 /*
651  * NOTE: We can have pretty much any combination of contiguousness and
652  * appending.
653  *
654  * The usefulness of APPEND_TAIL is more in that it lets us know that
655  * we'll have to update the path to that leaf.
656  */
657 enum ocfs2_append_type {
658         APPEND_NONE = 0,
659         APPEND_TAIL,
660 };
661
662 enum ocfs2_split_type {
663         SPLIT_NONE = 0,
664         SPLIT_LEFT,
665         SPLIT_RIGHT,
666 };
667
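/*
 * Describes a pending insert: whether it is part of a record split,
 * whether it appends past the current rightmost record, how (and at
 * which index) it is contiguous with an existing record, and the tree
 * depth at the time the insert was planned.
 */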
668 struct ocfs2_insert_type {
669         enum ocfs2_split_type   ins_split;
670         enum ocfs2_append_type  ins_appending;
671         enum ocfs2_contig_type  ins_contig;
672         int                     ins_contig_index;
673         int                     ins_tree_depth;
674 };
675
676 struct ocfs2_merge_ctxt {
677         enum ocfs2_contig_type  c_contig_type;
678         int                     c_has_empty_extent;
679         int                     c_split_covers_rec;
680 };
681
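/*
 * Validation callback handed to ocfs2_read_block(): reject buffers whose
 * signature, self-referencing h_blkno, or h_fs_generation don't match
 * what we expect for this filesystem.
 */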
682 static int ocfs2_validate_extent_block(struct super_block *sb,
683                                        struct buffer_head *bh)
684 {
685         struct ocfs2_extent_block *eb =
686                 (struct ocfs2_extent_block *)bh->b_data;
687
688         mlog(0, "Validating extent block %llu\n",
689              (unsigned long long)bh->b_blocknr);
690
691         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
692                 ocfs2_error(sb,
693                             "Extent block #%llu has bad signature %.*s",
694                             (unsigned long long)bh->b_blocknr, 7,
695                             eb->h_signature);
696                 return -EINVAL;
697         }
698
699         if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
700                 ocfs2_error(sb,
701                             "Extent block #%llu has an invalid h_blkno "
702                             "of %llu",
703                             (unsigned long long)bh->b_blocknr,
704                             (unsigned long long)le64_to_cpu(eb->h_blkno));
705                 return -EINVAL;
706         }
707
708         if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
709                 ocfs2_error(sb,
710                             "Extent block #%llu has an invalid "
711                             "h_fs_generation of #%u",
712                             (unsigned long long)bh->b_blocknr,
713                             le32_to_cpu(eb->h_fs_generation));
714                 return -EINVAL;
715         }
716
717         return 0;
718 }
719
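/*
 * Read the extent block at @eb_blkno, validating it with
 * ocfs2_validate_extent_block(). If *bh is NULL on entry, a buffer is
 * allocated and returned through it.
 */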
720 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
721                             struct buffer_head **bh)
722 {
723         int rc;
724         struct buffer_head *tmp = *bh;
725
726         rc = ocfs2_read_block(inode, eb_blkno, &tmp,
727                               ocfs2_validate_extent_block);
728
729         /* If ocfs2_read_block() got us a new bh, pass it up. */
730         if (!rc && !*bh)
731                 *bh = tmp;
732
733         return rc;
734 }
735
736
737 /*
738  * How many free extents have we got before we need more metadata?
739  */
740 int ocfs2_num_free_extents(struct ocfs2_super *osb,
741                            struct inode *inode,
742                            struct ocfs2_extent_tree *et)
743 {
744         int retval;
745         struct ocfs2_extent_list *el = NULL;
746         struct ocfs2_extent_block *eb;
747         struct buffer_head *eb_bh = NULL;
748         u64 last_eb_blk = 0;
749
750         mlog_entry_void();
751
752         el = et->et_root_el;
753         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
754
755         if (last_eb_blk) {
756                 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
757                 if (retval < 0) {
758                         mlog_errno(retval);
759                         goto bail;
760                 }
761                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
762                 el = &eb->h_list;
763         }
764
765         BUG_ON(el->l_tree_depth != 0);
766
767         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
768 bail:
769         brelse(eb_bh);
770
771         mlog_exit(retval);
772         return retval;
773 }
774
775 /* expects array to already be allocated
776  *
777  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
778  * l_count for you
779  */
780 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
781                                      handle_t *handle,
782                                      struct inode *inode,
783                                      int wanted,
784                                      struct ocfs2_alloc_context *meta_ac,
785                                      struct buffer_head *bhs[])
786 {
787         int count, status, i;
788         u16 suballoc_bit_start;
789         u32 num_got;
790         u64 first_blkno;
791         struct ocfs2_extent_block *eb;
792
793         mlog_entry_void();
794
795         count = 0;
796         while (count < wanted) {
797                 status = ocfs2_claim_metadata(osb,
798                                               handle,
799                                               meta_ac,
800                                               wanted - count,
801                                               &suballoc_bit_start,
802                                               &num_got,
803                                               &first_blkno);
804                 if (status < 0) {
805                         mlog_errno(status);
806                         goto bail;
807                 }
808
809                 for(i = count;  i < (num_got + count); i++) {
810                         bhs[i] = sb_getblk(osb->sb, first_blkno);
811                         if (bhs[i] == NULL) {
812                                 status = -EIO;
813                                 mlog_errno(status);
814                                 goto bail;
815                         }
816                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
817
818                         status = ocfs2_journal_access(handle, inode, bhs[i],
819                                                       OCFS2_JOURNAL_ACCESS_CREATE);
820                         if (status < 0) {
821                                 mlog_errno(status);
822                                 goto bail;
823                         }
824
825                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
826                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
827                         /* Ok, setup the minimal stuff here. */
828                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
829                         eb->h_blkno = cpu_to_le64(first_blkno);
830                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
831                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
832                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
833                         eb->h_list.l_count =
834                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
835
836                         suballoc_bit_start++;
837                         first_blkno++;
838
839                         /* We'll also be dirtied by the caller, so
840                          * this isn't absolutely necessary. */
841                         status = ocfs2_journal_dirty(handle, bhs[i]);
842                         if (status < 0) {
843                                 mlog_errno(status);
844                                 goto bail;
845                         }
846                 }
847
848                 count += num_got;
849         }
850
851         status = 0;
852 bail:
853         if (status < 0) {
854                 for(i = 0; i < wanted; i++) {
855                         brelse(bhs[i]);
856                         bhs[i] = NULL;
857                 }
858         }
859         mlog_exit(status);
860         return status;
861 }
862
863 /*
864  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
865  *
866  * Returns the sum of the rightmost extent rec logical offset and
867  * cluster count.
868  *
869  * ocfs2_add_branch() uses this to determine what logical cluster
870  * value should be populated into the leftmost new branch records.
871  *
872  * ocfs2_shift_tree_depth() uses this to determine the # clusters
873  * value for the new topmost tree record.
874  */
875 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
876 {
877         int i;
878
879         i = le16_to_cpu(el->l_next_free_rec) - 1;
880
881         return le32_to_cpu(el->l_recs[i].e_cpos) +
882                 ocfs2_rec_clusters(el, &el->l_recs[i]);
883 }
884
885 /*
886  * Add an entire tree branch to our inode. eb_bh is the extent block
887  * to start at, if we don't want to start the branch at the dinode
888  * structure.
889  *
890  * last_eb_bh is required as we have to update its next_leaf pointer
891  * for the new last extent block.
892  *
893  * the new branch will be 'empty' in the sense that every block will
894  * contain a single record with cluster count == 0.
895  */
896 static int ocfs2_add_branch(struct ocfs2_super *osb,
897                             handle_t *handle,
898                             struct inode *inode,
899                             struct ocfs2_extent_tree *et,
900                             struct buffer_head *eb_bh,
901                             struct buffer_head **last_eb_bh,
902                             struct ocfs2_alloc_context *meta_ac)
903 {
904         int status, new_blocks, i;
905         u64 next_blkno, new_last_eb_blk;
906         struct buffer_head *bh;
907         struct buffer_head **new_eb_bhs = NULL;
908         struct ocfs2_extent_block *eb;
909         struct ocfs2_extent_list  *eb_el;
910         struct ocfs2_extent_list  *el;
911         u32 new_cpos;
912
913         mlog_entry_void();
914
915         BUG_ON(!last_eb_bh || !*last_eb_bh);
916
917         if (eb_bh) {
918                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
919                 el = &eb->h_list;
920         } else
921                 el = et->et_root_el;
922
923         /* we never add a branch to a leaf. */
924         BUG_ON(!el->l_tree_depth);
925
926         new_blocks = le16_to_cpu(el->l_tree_depth);
927
928         /* allocate the number of new eb blocks we need */
929         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
930                              GFP_KERNEL);
931         if (!new_eb_bhs) {
932                 status = -ENOMEM;
933                 mlog_errno(status);
934                 goto bail;
935         }
936
937         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
938                                            meta_ac, new_eb_bhs);
939         if (status < 0) {
940                 mlog_errno(status);
941                 goto bail;
942         }
943
944         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
945         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
946
947         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
948          * linked with the rest of the tree.
949          * Conversely, new_eb_bhs[0] is the new bottommost leaf.
950          *
951          * When we leave the loop, new_last_eb_blk will point to the
952          * newest leaf, and next_blkno will point to the topmost extent
953          * block. */
954         next_blkno = new_last_eb_blk = 0;
955         for(i = 0; i < new_blocks; i++) {
956                 bh = new_eb_bhs[i];
957                 eb = (struct ocfs2_extent_block *) bh->b_data;
958                 /* ocfs2_create_new_meta_bhs() should create it right! */
959                 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
960                 eb_el = &eb->h_list;
961
962                 status = ocfs2_journal_access(handle, inode, bh,
963                                               OCFS2_JOURNAL_ACCESS_CREATE);
964                 if (status < 0) {
965                         mlog_errno(status);
966                         goto bail;
967                 }
968
969                 eb->h_next_leaf_blk = 0;
970                 eb_el->l_tree_depth = cpu_to_le16(i);
971                 eb_el->l_next_free_rec = cpu_to_le16(1);
972                 /*
973                  * This actually counts as an empty extent as
974                  * c_clusters == 0
975                  */
976                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
977                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
978                 /*
979                  * eb_el isn't always an interior node, but even leaf
980                  * nodes want a zero'd flags and reserved field so
981                  * this gets the whole 32 bits regardless of use.
982                  */
983                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
984                 if (!eb_el->l_tree_depth)
985                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
986
987                 status = ocfs2_journal_dirty(handle, bh);
988                 if (status < 0) {
989                         mlog_errno(status);
990                         goto bail;
991                 }
992
993                 next_blkno = le64_to_cpu(eb->h_blkno);
994         }
995
996         /* This is a bit hairy. We want to update up to three blocks
997          * here without leaving any of them in an inconsistent state
998          * in case of error. We don't have to worry about
999          * journal_dirty erroring as it won't unless we've aborted the
1000          * handle (in which case we would never be here) so reserving
1001          * the write with journal_access is all we need to do. */
1002         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
1003                                       OCFS2_JOURNAL_ACCESS_WRITE);
1004         if (status < 0) {
1005                 mlog_errno(status);
1006                 goto bail;
1007         }
1008         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1009                                       OCFS2_JOURNAL_ACCESS_WRITE);
1010         if (status < 0) {
1011                 mlog_errno(status);
1012                 goto bail;
1013         }
1014         if (eb_bh) {
1015                 status = ocfs2_journal_access(handle, inode, eb_bh,
1016                                               OCFS2_JOURNAL_ACCESS_WRITE);
1017                 if (status < 0) {
1018                         mlog_errno(status);
1019                         goto bail;
1020                 }
1021         }
1022
1023         /* Link the new branch into the rest of the tree (el will
1024          * either be on the root_bh, or the extent block passed in). */
1025         i = le16_to_cpu(el->l_next_free_rec);
1026         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1027         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1028         el->l_recs[i].e_int_clusters = 0;
1029         le16_add_cpu(&el->l_next_free_rec, 1);
1030
1031         /* fe needs a new last extent block pointer, as does the
1032          * next_leaf on the previously last-extent-block. */
1033         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1034
1035         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1036         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1037
1038         status = ocfs2_journal_dirty(handle, *last_eb_bh);
1039         if (status < 0)
1040                 mlog_errno(status);
1041         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1042         if (status < 0)
1043                 mlog_errno(status);
1044         if (eb_bh) {
1045                 status = ocfs2_journal_dirty(handle, eb_bh);
1046                 if (status < 0)
1047                         mlog_errno(status);
1048         }
1049
1050         /*
1051          * Some callers want to track the rightmost leaf so pass it
1052          * back here.
1053          */
1054         brelse(*last_eb_bh);
1055         get_bh(new_eb_bhs[0]);
1056         *last_eb_bh = new_eb_bhs[0];
1057
1058         status = 0;
1059 bail:
1060         if (new_eb_bhs) {
1061                 for (i = 0; i < new_blocks; i++)
1062                         brelse(new_eb_bhs[i]);
1063                 kfree(new_eb_bhs);
1064         }
1065
1066         mlog_exit(status);
1067         return status;
1068 }
1069
1070 /*
1071  * Adds another level to the allocation tree.
1072  * Returns the new extent block so you can add a branch to it
1073  * after this call.
1074  */
1075 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1076                                   handle_t *handle,
1077                                   struct inode *inode,
1078                                   struct ocfs2_extent_tree *et,
1079                                   struct ocfs2_alloc_context *meta_ac,
1080                                   struct buffer_head **ret_new_eb_bh)
1081 {
1082         int status, i;
1083         u32 new_clusters;
1084         struct buffer_head *new_eb_bh = NULL;
1085         struct ocfs2_extent_block *eb;
1086         struct ocfs2_extent_list  *root_el;
1087         struct ocfs2_extent_list  *eb_el;
1088
1089         mlog_entry_void();
1090
1091         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1092                                            &new_eb_bh);
1093         if (status < 0) {
1094                 mlog_errno(status);
1095                 goto bail;
1096         }
1097
1098         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1099         /* ocfs2_create_new_meta_bhs() should create it right! */
1100         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1101
1102         eb_el = &eb->h_list;
1103         root_el = et->et_root_el;
1104
1105         status = ocfs2_journal_access(handle, inode, new_eb_bh,
1106                                       OCFS2_JOURNAL_ACCESS_CREATE);
1107         if (status < 0) {
1108                 mlog_errno(status);
1109                 goto bail;
1110         }
1111
1112         /* copy the root extent list data into the new extent block */
1113         eb_el->l_tree_depth = root_el->l_tree_depth;
1114         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1115         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1116                 eb_el->l_recs[i] = root_el->l_recs[i];
1117
1118         status = ocfs2_journal_dirty(handle, new_eb_bh);
1119         if (status < 0) {
1120                 mlog_errno(status);
1121                 goto bail;
1122         }
1123
1124         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1125                                       OCFS2_JOURNAL_ACCESS_WRITE);
1126         if (status < 0) {
1127                 mlog_errno(status);
1128                 goto bail;
1129         }
1130
1131         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1132
1133         /* update root_bh now */
1134         le16_add_cpu(&root_el->l_tree_depth, 1);
1135         root_el->l_recs[0].e_cpos = 0;
1136         root_el->l_recs[0].e_blkno = eb->h_blkno;
1137         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1138         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1139                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1140         root_el->l_next_free_rec = cpu_to_le16(1);
1141
1142         /* If this is our 1st tree depth shift, then last_eb_blk
1143          * becomes the allocated extent block */
1144         if (root_el->l_tree_depth == cpu_to_le16(1))
1145                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1146
1147         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1148         if (status < 0) {
1149                 mlog_errno(status);
1150                 goto bail;
1151         }
1152
1153         *ret_new_eb_bh = new_eb_bh;
1154         new_eb_bh = NULL;
1155         status = 0;
1156 bail:
1157         brelse(new_eb_bh);
1158
1159         mlog_exit(status);
1160         return status;
1161 }
1162
1163 /*
1164  * Should only be called when there is no space left in any of the
1165  * leaf nodes. What we want to do is find the lowest tree depth
1166  * non-leaf extent block with room for new records. There are three
1167  * valid results of this search:
1168  *
1169  * 1) a lowest extent block is found, then we pass it back in
1170  *    *target_bh and return '0'
1171  *
1172  * 2) the search fails to find anything, but the root_el has room. We
1173  *    pass NULL back in *target_bh, but still return '0'
1174  *
1175  * 3) the search fails to find anything AND the root_el is full, in
1176  *    which case we return > 0
1177  *
1178  * return status < 0 indicates an error.
1179  */
1180 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1181                                     struct inode *inode,
1182                                     struct ocfs2_extent_tree *et,
1183                                     struct buffer_head **target_bh)
1184 {
1185         int status = 0, i;
1186         u64 blkno;
1187         struct ocfs2_extent_block *eb;
1188         struct ocfs2_extent_list  *el;
1189         struct buffer_head *bh = NULL;
1190         struct buffer_head *lowest_bh = NULL;
1191
1192         mlog_entry_void();
1193
1194         *target_bh = NULL;
1195
1196         el = et->et_root_el;
1197
1198         while(le16_to_cpu(el->l_tree_depth) > 1) {
1199                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1200                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1201                                     "extent list (next_free_rec == 0)",
1202                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1203                         status = -EIO;
1204                         goto bail;
1205                 }
1206                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1207                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1208                 if (!blkno) {
1209                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1210                                     "list where extent # %d has no physical "
1211                                     "block start",
1212                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1213                         status = -EIO;
1214                         goto bail;
1215                 }
1216
1217                 brelse(bh);
1218                 bh = NULL;
1219
1220                 status = ocfs2_read_extent_block(inode, blkno, &bh);
1221                 if (status < 0) {
1222                         mlog_errno(status);
1223                         goto bail;
1224                 }
1225
1226                 eb = (struct ocfs2_extent_block *) bh->b_data;
1227                 el = &eb->h_list;
1228
1229                 if (le16_to_cpu(el->l_next_free_rec) <
1230                     le16_to_cpu(el->l_count)) {
1231                         brelse(lowest_bh);
1232                         lowest_bh = bh;
1233                         get_bh(lowest_bh);
1234                 }
1235         }
1236
1237         /* If we didn't find one and the fe doesn't have any room,
1238          * then return '1' */
1239         el = et->et_root_el;
1240         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1241                 status = 1;
1242
1243         *target_bh = lowest_bh;
1244 bail:
1245         brelse(bh);
1246
1247         mlog_exit(status);
1248         return status;
1249 }
1250
1251 /*
1252  * Grow a b-tree so that it has more records.
1253  *
1254  * We might shift the tree depth in which case existing paths should
1255  * be considered invalid.
1256  *
1257  * Tree depth after the grow is returned via *final_depth.
1258  *
1259  * *last_eb_bh will be updated by ocfs2_add_branch().
1260  */
1261 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1262                            struct ocfs2_extent_tree *et, int *final_depth,
1263                            struct buffer_head **last_eb_bh,
1264                            struct ocfs2_alloc_context *meta_ac)
1265 {
1266         int ret, shift;
1267         struct ocfs2_extent_list *el = et->et_root_el;
1268         int depth = le16_to_cpu(el->l_tree_depth);
1269         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1270         struct buffer_head *bh = NULL;
1271
1272         BUG_ON(meta_ac == NULL);
1273
1274         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1275         if (shift < 0) {
1276                 ret = shift;
1277                 mlog_errno(ret);
1278                 goto out;
1279         }
1280
1281         /* We traveled all the way to the bottom of the allocation tree
1282          * and didn't find room for any more extents - we need to add
1283          * another tree level */
1284         if (shift) {
1285                 BUG_ON(bh);
1286                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1287
1288                 /* ocfs2_shift_tree_depth will return us a buffer with
1289                  * the new extent block (so we can pass that to
1290                  * ocfs2_add_branch). */
1291                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1292                                              meta_ac, &bh);
1293                 if (ret < 0) {
1294                         mlog_errno(ret);
1295                         goto out;
1296                 }
1297                 depth++;
1298                 if (depth == 1) {
1299                         /*
1300                          * Special case: we have room now if we shifted from
1301                          * tree_depth 0, so no more work needs to be done.
1302                          *
1303                          * We won't be calling add_branch, so pass
1304                          * back *last_eb_bh as the new leaf. At depth
1305                          * zero, it should always be null so there's
1306                          * no reason to brelse.
1307                          */
1308                         BUG_ON(*last_eb_bh);
1309                         get_bh(bh);
1310                         *last_eb_bh = bh;
1311                         goto out;
1312                 }
1313         }
1314
1315         /* call ocfs2_add_branch to add the final part of the tree with
1316          * the new data. */
1317         mlog(0, "add branch. bh = %p\n", bh);
1318         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1319                                meta_ac);
1320         if (ret < 0) {
1321                 mlog_errno(ret);
1322                 goto out;
1323         }
1324
1325 out:
1326         if (final_depth)
1327                 *final_depth = depth;
1328         brelse(bh);
1329         return ret;
1330 }
1331
1332 /*
1333  * Shift all records in the list one slot to the right, making room at index 0.
1334  */
1335 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1336 {
1337         int next_free = le16_to_cpu(el->l_next_free_rec);
1338         int count = le16_to_cpu(el->l_count);
1339         unsigned int num_bytes;
1340
1341         BUG_ON(!next_free);
1342         /* This will cause us to go off the end of our extent list. */
1343         BUG_ON(next_free >= count);
1344
1345         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1346
1347         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1348 }
1349
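/*
 * Insert @insert_rec into the leaf @el, keeping records sorted by e_cpos.
 * The caller guarantees room: either a free slot exists or the leaf
 * starts with an empty extent that can be consumed.
 */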
1350 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1351                               struct ocfs2_extent_rec *insert_rec)
1352 {
1353         int i, insert_index, next_free, has_empty, num_bytes;
1354         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1355         struct ocfs2_extent_rec *rec;
1356
1357         next_free = le16_to_cpu(el->l_next_free_rec);
1358         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1359
1360         BUG_ON(!next_free);
1361
1362         /* The tree code before us didn't allow enough room in the leaf. */
1363         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1364
1365         /*
1366          * The easiest way to approach this is to just remove the
1367          * empty extent and temporarily decrement next_free.
1368          */
1369         if (has_empty) {
1370                 /*
1371                  * If next_free was 1 (only an empty extent), this
1372                  * loop won't execute, which is fine. We still want
1373                  * the decrement above to happen.
1374                  */
1375                 for(i = 0; i < (next_free - 1); i++)
1376                         el->l_recs[i] = el->l_recs[i+1];
1377
1378                 next_free--;
1379         }
1380
1381         /*
1382          * Figure out what the new record index should be.
1383          */
1384         for(i = 0; i < next_free; i++) {
1385                 rec = &el->l_recs[i];
1386
1387                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1388                         break;
1389         }
1390         insert_index = i;
1391
1392         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1393              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1394
1395         BUG_ON(insert_index < 0);
1396         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1397         BUG_ON(insert_index > next_free);
1398
1399         /*
1400          * No need to memmove if we're just adding to the tail.
1401          */
1402         if (insert_index != next_free) {
1403                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1404
1405                 num_bytes = next_free - insert_index;
1406                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1407                 memmove(&el->l_recs[insert_index + 1],
1408                         &el->l_recs[insert_index],
1409                         num_bytes);
1410         }
1411
1412         /*
1413          * Either we had an empty extent and need to re-increment, or
1414          * there was no empty extent on a non-full rightmost leaf node,
1415          * in which case we still need to increment.
1416          */
1417         next_free++;
1418         el->l_next_free_rec = cpu_to_le16(next_free);
1419         /*
1420          * Make sure none of the math above just messed up our tree.
1421          */
1422         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1423
1424         el->l_recs[insert_index] = *insert_rec;
1425
1426 }
1427
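/*
 * If the leaf starts with an empty extent record, drop it by shifting
 * the remaining records left and decrementing l_next_free_rec.
 */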
1428 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1429 {
1430         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1431
1432         BUG_ON(num_recs == 0);
1433
1434         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1435                 num_recs--;
1436                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1437                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1438                 memset(&el->l_recs[num_recs], 0,
1439                        sizeof(struct ocfs2_extent_rec));
1440                 el->l_next_free_rec = cpu_to_le16(num_recs);
1441         }
1442 }
1443
1444 /*
1445  * Create an empty extent record.
1446  *
1447  * l_next_free_rec may be updated.
1448  *
1449  * If an empty extent already exists, do nothing.
1450  */
1451 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1452 {
1453         int next_free = le16_to_cpu(el->l_next_free_rec);
1454
1455         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1456
1457         if (next_free == 0)
1458                 goto set_and_inc;
1459
1460         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1461                 return;
1462
1463         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1464                         "Asked to create an empty extent in a full list:\n"
1465                         "count = %u, tree depth = %u",
1466                         le16_to_cpu(el->l_count),
1467                         le16_to_cpu(el->l_tree_depth));
1468
1469         ocfs2_shift_records_right(el);
1470
1471 set_and_inc:
1472         le16_add_cpu(&el->l_next_free_rec, 1);
1473         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1474 }
1475
1476 /*
1477  * For a rotation which involves two leaf nodes, the "root node" is
1478  * the lowest level tree node which contains a path to both leaves. This
1479  * resulting set of information can be used to form a complete "subtree".
1480  *
1481  * This function is passed two full paths from the dinode down to a
1482  * pair of adjacent leaves. Its task is to figure out which path
1483  * index contains the subtree root - this can be the root index itself
1484  * in a worst-case rotation.
1485  *
1486  * The array index of the subtree root is passed back.
1487  */
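/*
 * Illustrative example (hypothetical shape, not from the source): with a
 * depth-2 tree where both paths share the root (index 0) and the same
 * extent block at index 1 but end in different leaf blocks at index 2,
 * the loop below exits with i == 2 and the function returns 1, the index
 * of the lowest node common to both paths.
 */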
1488 static int ocfs2_find_subtree_root(struct inode *inode,
1489                                    struct ocfs2_path *left,
1490                                    struct ocfs2_path *right)
1491 {
1492         int i = 0;
1493
1494         /*
1495          * Check that the caller passed in two paths from the same tree.
1496          */
1497         BUG_ON(path_root_bh(left) != path_root_bh(right));
1498
1499         do {
1500                 i++;
1501
1502                 /*
1503                  * The caller didn't pass two adjacent paths.
1504                  */
1505                 mlog_bug_on_msg(i > left->p_tree_depth,
1506                                 "Inode %lu, left depth %u, right depth %u\n"
1507                                 "left leaf blk %llu, right leaf blk %llu\n",
1508                                 inode->i_ino, left->p_tree_depth,
1509                                 right->p_tree_depth,
1510                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1511                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1512         } while (left->p_node[i].bh->b_blocknr ==
1513                  right->p_node[i].bh->b_blocknr);
1514
1515         return i - 1;
1516 }
1517
1518 typedef void (path_insert_t)(void *, struct buffer_head *);
1519
1520 /*
1521  * Traverse a btree path in search of cpos, starting at root_el.
1522  *
1523  * This code can be called with a cpos larger than the tree, in which
1524  * case it will return the rightmost path.
1525  */
1526 static int __ocfs2_find_path(struct inode *inode,
1527                              struct ocfs2_extent_list *root_el, u32 cpos,
1528                              path_insert_t *func, void *data)
1529 {
1530         int i, ret = 0;
1531         u32 range;
1532         u64 blkno;
1533         struct buffer_head *bh = NULL;
1534         struct ocfs2_extent_block *eb;
1535         struct ocfs2_extent_list *el;
1536         struct ocfs2_extent_rec *rec;
1537         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1538
1539         el = root_el;
1540         while (el->l_tree_depth) {
1541                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1542                         ocfs2_error(inode->i_sb,
1543                                     "Inode %llu has empty extent list at "
1544                                     "depth %u\n",
1545                                     (unsigned long long)oi->ip_blkno,
1546                                     le16_to_cpu(el->l_tree_depth));
1547                         ret = -EROFS;
1548                         goto out;
1549
1550                 }
1551
1552                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1553                         rec = &el->l_recs[i];
1554
1555                         /*
1556                          * In the case that cpos is off the allocation
1557                          * tree, this should just wind up returning the
1558                          * rightmost record.
1559                          */
1560                         range = le32_to_cpu(rec->e_cpos) +
1561                                 ocfs2_rec_clusters(el, rec);
1562                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1563                             break;
1564                 }
1565
1566                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1567                 if (blkno == 0) {
1568                         ocfs2_error(inode->i_sb,
1569                                     "Inode %llu has bad blkno in extent list "
1570                                     "at depth %u (index %d)\n",
1571                                     (unsigned long long)oi->ip_blkno,
1572                                     le16_to_cpu(el->l_tree_depth), i);
1573                         ret = -EROFS;
1574                         goto out;
1575                 }
1576
1577                 brelse(bh);
1578                 bh = NULL;
1579                 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1580                 if (ret) {
1581                         mlog_errno(ret);
1582                         goto out;
1583                 }
1584
1585                 eb = (struct ocfs2_extent_block *) bh->b_data;
1586                 el = &eb->h_list;
1587
1588                 if (le16_to_cpu(el->l_next_free_rec) >
1589                     le16_to_cpu(el->l_count)) {
1590                         ocfs2_error(inode->i_sb,
1591                                     "Inode %llu has bad count in extent list "
1592                                     "at block %llu (next free=%u, count=%u)\n",
1593                                     (unsigned long long)oi->ip_blkno,
1594                                     (unsigned long long)bh->b_blocknr,
1595                                     le16_to_cpu(el->l_next_free_rec),
1596                                     le16_to_cpu(el->l_count));
1597                         ret = -EROFS;
1598                         goto out;
1599                 }
1600
1601                 if (func)
1602                         func(data, bh);
1603         }
1604
1605 out:
1606         /*
1607          * Catch any trailing bh that the loop didn't handle.
1608          */
1609         brelse(bh);
1610
1611         return ret;
1612 }
1613
1614 /*
1615  * Given an initialized path (that is, it has a valid root extent
1616  * list), this function will traverse the btree in search of the path
1617  * which would contain cpos.
1618  *
1619  * The path traveled is recorded in the path structure.
1620  *
1621  * Note that this will not do any comparisons on leaf node extent
1622  * records, so it will work fine in the case that we just added a tree
1623  * branch.
1624  */
1625 struct find_path_data {
1626         int index;
1627         struct ocfs2_path *path;
1628 };
1629 static void find_path_ins(void *data, struct buffer_head *bh)
1630 {
1631         struct find_path_data *fp = data;
1632
1633         get_bh(bh);
1634         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1635         fp->index++;
1636 }
1637 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1638                            u32 cpos)
1639 {
1640         struct find_path_data data;
1641
1642         data.index = 1;
1643         data.path = path;
1644         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1645                                  find_path_ins, &data);
1646 }
1647
1648 static void find_leaf_ins(void *data, struct buffer_head *bh)
1649 {
1650         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1651         struct ocfs2_extent_list *el = &eb->h_list;
1652         struct buffer_head **ret = data;
1653
1654         /* We want to retain only the leaf block. */
1655         if (le16_to_cpu(el->l_tree_depth) == 0) {
1656                 get_bh(bh);
1657                 *ret = bh;
1658         }
1659 }
1660 /*
1661  * Find the leaf block in the tree which would contain cpos. No
1662  * checking of the actual leaf is done.
1663  *
1664  * Some paths want to call this instead of allocating a path structure
1665  * and calling ocfs2_find_path().
1666  *
1667  * This function doesn't handle non-btree extent lists.
1668  */
1669 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1670                     u32 cpos, struct buffer_head **leaf_bh)
1671 {
1672         int ret;
1673         struct buffer_head *bh = NULL;
1674
1675         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1676         if (ret) {
1677                 mlog_errno(ret);
1678                 goto out;
1679         }
1680
1681         *leaf_bh = bh;
1682 out:
1683         return ret;
1684 }
1685
1686 /*
1687  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1688  *
1689  * Basically, we've moved stuff around at the bottom of the tree and
1690  * we need to fix up the extent records above the changes to reflect
1691  * the new layout.
1692  *
1693  * left_rec: the record on the left.
1694  * left_child_el: the child list pointed to by left_rec
1695  * right_rec: the record to the right of left_rec
1696  * right_child_el: the child list pointed to by right_rec
1697  *
1698  * By definition, this only works on interior nodes.
1699  */
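/*
 * Worked example (hypothetical values, for illustration only): say
 * left_rec covers cpos 0, right_rec covers cpos 100 with e_int_clusters
 * of 50 (theoretical end 150), and after a leaf rotation the first
 * non-empty record of the right child now begins at cpos 90.  Then
 * left_clusters = 90 - 0 = 90, right_rec->e_cpos becomes 90 and
 * right_rec->e_int_clusters becomes 150 - 90 = 60: the boundary between
 * the two records moves to 90 while their combined range [0, 150)
 * stays the same.
 */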
1700 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1701                                   struct ocfs2_extent_list *left_child_el,
1702                                   struct ocfs2_extent_rec *right_rec,
1703                                   struct ocfs2_extent_list *right_child_el)
1704 {
1705         u32 left_clusters, right_end;
1706
1707         /*
1708          * Interior nodes never have holes. Their cpos is the cpos of
1709          * the leftmost record in their child list. Their cluster
1710          * count covers the full theoretical range of their child list
1711          * - the range between their cpos and the cpos of the record
1712          * immediately to their right.
1713          */
1714         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1715         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1716                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1717                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1718         }
1719         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1720         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1721
1722         /*
1723          * Calculate the rightmost cluster count boundary before
1724          * moving cpos - we will need to adjust clusters after
1725          * updating e_cpos to keep the same highest cluster count.
1726          */
1727         right_end = le32_to_cpu(right_rec->e_cpos);
1728         right_end += le32_to_cpu(right_rec->e_int_clusters);
1729
1730         right_rec->e_cpos = left_rec->e_cpos;
1731         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1732
1733         right_end -= le32_to_cpu(right_rec->e_cpos);
1734         right_rec->e_int_clusters = cpu_to_le32(right_end);
1735 }
1736
1737 /*
1738  * Adjust the adjacent root node records involved in a
1739  * rotation. left_el_blkno is passed in as a key so that we can easily
1740  * find its index in the root list.
1741  */
1742 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1743                                       struct ocfs2_extent_list *left_el,
1744                                       struct ocfs2_extent_list *right_el,
1745                                       u64 left_el_blkno)
1746 {
1747         int i;
1748
1749         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1750                le16_to_cpu(left_el->l_tree_depth));
1751
1752         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1753                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1754                         break;
1755         }
1756
1757         /*
1758          * The path walking code should never have returned a root and
1759          * two paths which are not adjacent.
1760          */
1761         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1762
1763         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1764                                       &root_el->l_recs[i + 1], right_el);
1765 }
1766
1767 /*
1768  * We've changed a leaf block (in right_path) and need to reflect that
1769  * change back up the subtree.
1770  *
1771  * This happens in multiple places:
1772  *   - When we've moved an extent record from the left path leaf to the right
1773  *     path leaf to make room for an empty extent in the left path leaf.
1774  *   - When our insert into the right path leaf is at the leftmost edge
1775  *     and requires an update of the path immediately to its left. This
1776  *     can occur at the end of some types of rotation and appending inserts.
1777  *   - When we've adjusted the last extent record in the left path leaf and the
1778  *     1st extent record in the right path leaf during cross extent block merge.
1779  */
1780 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1781                                        struct ocfs2_path *left_path,
1782                                        struct ocfs2_path *right_path,
1783                                        int subtree_index)
1784 {
1785         int ret, i, idx;
1786         struct ocfs2_extent_list *el, *left_el, *right_el;
1787         struct ocfs2_extent_rec *left_rec, *right_rec;
1788         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1789
1790         /*
1791          * Update the counts and position values within all the
1792          * interior nodes to reflect the leaf rotation we just did.
1793          *
1794          * The root node is handled below the loop.
1795          *
1796          * We begin the loop with right_el and left_el pointing to the
1797          * leaf lists and work our way up.
1798          *
1799          * NOTE: within this loop, left_el and right_el always refer
1800          * to the *child* lists.
1801          */
1802         left_el = path_leaf_el(left_path);
1803         right_el = path_leaf_el(right_path);
1804         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1805                 mlog(0, "Adjust records at index %u\n", i);
1806
1807                 /*
1808                  * One nice property of knowing that all of these
1809                  * nodes are below the root is that we only deal with
1810                  * the leftmost right node record and the rightmost
1811                  * left node record.
1812                  */
1813                 el = left_path->p_node[i].el;
1814                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1815                 left_rec = &el->l_recs[idx];
1816
1817                 el = right_path->p_node[i].el;
1818                 right_rec = &el->l_recs[0];
1819
1820                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1821                                               right_el);
1822
1823                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1824                 if (ret)
1825                         mlog_errno(ret);
1826
1827                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1828                 if (ret)
1829                         mlog_errno(ret);
1830
1831                 /*
1832                  * Setup our list pointers now so that the current
1833                  * Set up our list pointers now so that the current
1834                  */
1835                 left_el = left_path->p_node[i].el;
1836                 right_el = right_path->p_node[i].el;
1837         }
1838
1839         /*
1840          * At the root node, adjust the two adjacent records which
1841          * begin our path to the leaves.
1842          */
1843
1844         el = left_path->p_node[subtree_index].el;
1845         left_el = left_path->p_node[subtree_index + 1].el;
1846         right_el = right_path->p_node[subtree_index + 1].el;
1847
1848         ocfs2_adjust_root_records(el, left_el, right_el,
1849                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1850
1851         root_bh = left_path->p_node[subtree_index].bh;
1852
1853         ret = ocfs2_journal_dirty(handle, root_bh);
1854         if (ret)
1855                 mlog_errno(ret);
1856 }
1857
1858 static int ocfs2_rotate_subtree_right(struct inode *inode,
1859                                       handle_t *handle,
1860                                       struct ocfs2_path *left_path,
1861                                       struct ocfs2_path *right_path,
1862                                       int subtree_index)
1863 {
1864         int ret, i;
1865         struct buffer_head *right_leaf_bh;
1866         struct buffer_head *left_leaf_bh = NULL;
1867         struct buffer_head *root_bh;
1868         struct ocfs2_extent_list *right_el, *left_el;
1869         struct ocfs2_extent_rec move_rec;
1870
1871         left_leaf_bh = path_leaf_bh(left_path);
1872         left_el = path_leaf_el(left_path);
1873
1874         if (left_el->l_next_free_rec != left_el->l_count) {
1875                 ocfs2_error(inode->i_sb,
1876                             "Inode %llu has non-full interior leaf node %llu "
1877                             "(next free = %u)",
1878                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1879                             (unsigned long long)left_leaf_bh->b_blocknr,
1880                             le16_to_cpu(left_el->l_next_free_rec));
1881                 return -EROFS;
1882         }
1883
1884         /*
1885          * This extent block may already have an empty record, so we
1886          * return early if so.
1887          */
1888         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1889                 return 0;
1890
1891         root_bh = left_path->p_node[subtree_index].bh;
1892         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1893
1894         ret = ocfs2_journal_access(handle, inode, root_bh,
1895                                    OCFS2_JOURNAL_ACCESS_WRITE);
1896         if (ret) {
1897                 mlog_errno(ret);
1898                 goto out;
1899         }
1900
1901         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1902                 ret = ocfs2_journal_access(handle, inode,
1903                                            right_path->p_node[i].bh,
1904                                            OCFS2_JOURNAL_ACCESS_WRITE);
1905                 if (ret) {
1906                         mlog_errno(ret);
1907                         goto out;
1908                 }
1909
1910                 ret = ocfs2_journal_access(handle, inode,
1911                                            left_path->p_node[i].bh,
1912                                            OCFS2_JOURNAL_ACCESS_WRITE);
1913                 if (ret) {
1914                         mlog_errno(ret);
1915                         goto out;
1916                 }
1917         }
1918
1919         right_leaf_bh = path_leaf_bh(right_path);
1920         right_el = path_leaf_el(right_path);
1921
1922         /* This is a code error, not a disk corruption. */
1923         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1924                         "because rightmost leaf block %llu is empty\n",
1925                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1926                         (unsigned long long)right_leaf_bh->b_blocknr);
1927
1928         ocfs2_create_empty_extent(right_el);
1929
1930         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1931         if (ret) {
1932                 mlog_errno(ret);
1933                 goto out;
1934         }
1935
1936         /* Do the copy now. */
1937         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1938         move_rec = left_el->l_recs[i];
1939         right_el->l_recs[0] = move_rec;
1940
1941         /*
1942          * Clear out the record we just copied and shift everything
1943          * over, leaving an empty extent in the left leaf.
1944          *
1945          * We temporarily subtract from next_free_rec so that the
1946          * shift will lose the tail record (which is now defunct).
1947          */
1948         le16_add_cpu(&left_el->l_next_free_rec, -1);
1949         ocfs2_shift_records_right(left_el);
1950         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1951         le16_add_cpu(&left_el->l_next_free_rec, 1);
1952
1953         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1954         if (ret) {
1955                 mlog_errno(ret);
1956                 goto out;
1957         }
1958
1959         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1960                                 subtree_index);
1961
1962 out:
1963         return ret;
1964 }
1965
1966 /*
1967  * Given a full path, determine what cpos value would return us a path
1968  * containing the leaf immediately to the left of the current one.
1969  *
1970  * Will return zero if the path passed in is already the leftmost path.
1971  */
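/*
 * For illustration (hypothetical numbers): if the record just to the
 * left of ours at some level starts at cpos 100 and covers 8 clusters,
 * the value returned is 100 + 8 - 1 = 107, i.e. the last cpos covered
 * by the neighbouring record, so a subsequent path lookup with it lands
 * in the leaf immediately to the left.
 */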
1972 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1973                                          struct ocfs2_path *path, u32 *cpos)
1974 {
1975         int i, j, ret = 0;
1976         u64 blkno;
1977         struct ocfs2_extent_list *el;
1978
1979         BUG_ON(path->p_tree_depth == 0);
1980
1981         *cpos = 0;
1982
1983         blkno = path_leaf_bh(path)->b_blocknr;
1984
1985         /* Start at the tree node just above the leaf and work our way up. */
1986         i = path->p_tree_depth - 1;
1987         while (i >= 0) {
1988                 el = path->p_node[i].el;
1989
1990                 /*
1991                  * Find the extent record just before the one in our
1992                  * path.
1993                  */
1994                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1995                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1996                                 if (j == 0) {
1997                                         if (i == 0) {
1998                                                 /*
1999                                                  * We've determined that the
2000                                                  * path specified is already
2001                                                  * the leftmost one - return a
2002                                                  * cpos of zero.
2003                                                  */
2004                                                 goto out;
2005                                         }
2006                                         /*
2007                                          * The leftmost record points to our
2008                                          * leaf - we need to travel up the
2009                                          * tree one level.
2010                                          */
2011                                         goto next_node;
2012                                 }
2013
2014                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2015                                 *cpos = *cpos + ocfs2_rec_clusters(el,
2016                                                            &el->l_recs[j - 1]);
2017                                 *cpos = *cpos - 1;
2018                                 goto out;
2019                         }
2020                 }
2021
2022                 /*
2023                  * If we got here, we never found a valid node where
2024                  * the tree indicated one should be.
2025                  */
2026                 ocfs2_error(sb,
2027                             "Invalid extent tree at extent block %llu\n",
2028                             (unsigned long long)blkno);
2029                 ret = -EROFS;
2030                 goto out;
2031
2032 next_node:
2033                 blkno = path->p_node[i].bh->b_blocknr;
2034                 i--;
2035         }
2036
2037 out:
2038         return ret;
2039 }
2040
2041 /*
2042  * Extend the transaction by enough credits to complete the rotation,
2043  * and still leave at least the original number of credits allocated
2044  * to this transaction.
2045  */
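/*
 * Roughly, each level between the subtree root and the leaves may dirty
 * one block on the left path and one on the right path, hence the "* 2"
 * in the credit count below; the extra 1 covers the subtree root itself,
 * and op_credits carries the caller's original credit requirement.
 */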
2046 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2047                                            int op_credits,
2048                                            struct ocfs2_path *path)
2049 {
2050         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2051
2052         if (handle->h_buffer_credits < credits)
2053                 return ocfs2_extend_trans(handle, credits);
2054
2055         return 0;
2056 }
2057
2058 /*
2059  * Trap the case where we're inserting into the theoretical range past
2060  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2061  * whose cpos is less than ours into the right leaf.
2062  *
2063  * It's only necessary to look at the rightmost record of the left
2064  * leaf because the logic that calls us should ensure that the
2065  * theoretical ranges in the path components above the leaves are
2066  * correct.
2067  */
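/*
 * Illustrative example (hypothetical values): if the last record in the
 * left leaf starts at cpos 100 and insert_cpos is 150, the check below
 * returns 1, telling ocfs2_rotate_tree_right() to stop rotating and hand
 * the left path back for a post-insert adjustment instead of pulling
 * that record over into the right leaf.
 */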
2068 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2069                                                  u32 insert_cpos)
2070 {
2071         struct ocfs2_extent_list *left_el;
2072         struct ocfs2_extent_rec *rec;
2073         int next_free;
2074
2075         left_el = path_leaf_el(left_path);
2076         next_free = le16_to_cpu(left_el->l_next_free_rec);
2077         rec = &left_el->l_recs[next_free - 1];
2078
2079         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2080                 return 1;
2081         return 0;
2082 }
2083
2084 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2085 {
2086         int next_free = le16_to_cpu(el->l_next_free_rec);
2087         unsigned int range;
2088         struct ocfs2_extent_rec *rec;
2089
2090         if (next_free == 0)
2091                 return 0;
2092
2093         rec = &el->l_recs[0];
2094         if (ocfs2_is_empty_extent(rec)) {
2095                 /* Empty list. */
2096                 if (next_free == 1)
2097                         return 0;
2098                 rec = &el->l_recs[1];
2099         }
2100
2101         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2102         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2103                 return 1;
2104         return 0;
2105 }
2106
2107 /*
2108  * Rotate all the records in a btree right one record, starting at insert_cpos.
2109  *
2110  * The path to the rightmost leaf should be passed in.
2111  *
2112  * The array is assumed to be large enough to hold an entire path (tree depth).
2113  *
2114  * Upon successful return from this function:
2115  *
2116  * - The 'right_path' array will contain a path to the leaf block
2117  *   whose range contains insert_cpos.
2118  * - That leaf block will have a single empty extent in list index 0.
2119  * - In the case that the rotation requires a post-insert update,
2120  *   *ret_left_path will contain a valid path which can be passed to
2121  *   ocfs2_insert_path().
2122  */
2123 static int ocfs2_rotate_tree_right(struct inode *inode,
2124                                    handle_t *handle,
2125                                    enum ocfs2_split_type split,
2126                                    u32 insert_cpos,
2127                                    struct ocfs2_path *right_path,
2128                                    struct ocfs2_path **ret_left_path)
2129 {
2130         int ret, start, orig_credits = handle->h_buffer_credits;
2131         u32 cpos;
2132         struct ocfs2_path *left_path = NULL;
2133
2134         *ret_left_path = NULL;
2135
2136         left_path = ocfs2_new_path(path_root_bh(right_path),
2137                                    path_root_el(right_path));
2138         if (!left_path) {
2139                 ret = -ENOMEM;
2140                 mlog_errno(ret);
2141                 goto out;
2142         }
2143
2144         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2145         if (ret) {
2146                 mlog_errno(ret);
2147                 goto out;
2148         }
2149
2150         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2151
2152         /*
2153          * What we want to do here is:
2154          *
2155          * 1) Start with the rightmost path.
2156          *
2157          * 2) Determine a path to the leaf block directly to the left
2158          *    of that leaf.
2159          *
2160          * 3) Determine the 'subtree root' - the lowest level tree node
2161          *    which contains a path to both leaves.
2162          *
2163          * 4) Rotate the subtree.
2164          *
2165          * 5) Find the next subtree by considering the left path to be
2166          *    the new right path.
2167          *
2168          * The check at the top of this while loop also accepts
2169          * insert_cpos == cpos because cpos is only a _theoretical_
2170          * value to get us the left path - insert_cpos might very well
2171          * be filling that hole.
2172          *
2173          * Stop at a cpos of '0' because we either started at the
2174          * leftmost branch (i.e., a tree with one branch and a
2175          * rotation inside of it), or we've gone as far as we can in
2176          * rotating subtrees.
2177          */
2178         while (cpos && insert_cpos <= cpos) {
2179                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2180                      insert_cpos, cpos);
2181
2182                 ret = ocfs2_find_path(inode, left_path, cpos);
2183                 if (ret) {
2184                         mlog_errno(ret);
2185                         goto out;
2186                 }
2187
2188                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2189                                 path_leaf_bh(right_path),
2190                                 "Inode %lu: error during insert of %u "
2191                                 "(left path cpos %u) results in two identical "
2192                                 "paths ending at %llu\n",
2193                                 inode->i_ino, insert_cpos, cpos,
2194                                 (unsigned long long)
2195                                 path_leaf_bh(left_path)->b_blocknr);
2196
2197                 if (split == SPLIT_NONE &&
2198                     ocfs2_rotate_requires_path_adjustment(left_path,
2199                                                           insert_cpos)) {
2200
2201                         /*
2202                          * We've rotated the tree as much as we
2203                          * should. The rest is up to
2204                          * ocfs2_insert_path() to complete, after the
2205                          * record insertion. We indicate this
2206                          * situation by returning the left path.
2207                          *
2208                          * The reason we don't adjust the records here
2209                          * before the record insert is that an error
2210                          * later might break the rule where a parent
2211                          * record e_cpos will reflect the actual
2212                          * e_cpos of the 1st nonempty record of the
2213                          * child list.
2214                          */
2215                         *ret_left_path = left_path;
2216                         goto out_ret_path;
2217                 }
2218
2219                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2220
2221                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2222                      start,
2223                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2224                      right_path->p_tree_depth);
2225
2226                 ret = ocfs2_extend_rotate_transaction(handle, start,
2227                                                       orig_credits, right_path);
2228                 if (ret) {
2229                         mlog_errno(ret);
2230                         goto out;
2231                 }
2232
2233                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2234                                                  right_path, start);
2235                 if (ret) {
2236                         mlog_errno(ret);
2237                         goto out;
2238                 }
2239
2240                 if (split != SPLIT_NONE &&
2241                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2242                                                 insert_cpos)) {
2243                         /*
2244                          * A rotate moves the rightmost left leaf
2245                          * record over to the leftmost right leaf
2246                          * slot. If we're doing an extent split
2247                          * instead of a real insert, then we have to
2248                          * check that the extent to be split wasn't
2249                          * just moved over. If it was, then we can
2250                          * exit here, passing left_path back -
2251                          * ocfs2_split_extent() is smart enough to
2252                          * search both leaves.
2253                          */
2254                         *ret_left_path = left_path;
2255                         goto out_ret_path;
2256                 }
2257
2258                 /*
2259                  * There is no need to re-read the next right path
2260                  * as we know that it'll be our current left
2261                  * path. Optimize by copying values instead.
2262                  */
2263                 ocfs2_mv_path(right_path, left_path);
2264
2265                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2266                                                     &cpos);
2267                 if (ret) {
2268                         mlog_errno(ret);
2269                         goto out;
2270                 }
2271         }
2272
2273 out:
2274         ocfs2_free_path(left_path);
2275
2276 out_ret_path:
2277         return ret;
2278 }
2279
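/*
 * 'path' must be the rightmost path.  The end of the tree's range is
 * taken from the last record of the rightmost leaf, and the last record
 * at every level above it is stretched (via e_int_clusters) so that it
 * runs from its own e_cpos up to that end.  Each modified block is
 * journal-dirtied.
 */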
2280 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2281                                       struct ocfs2_path *path)
2282 {
2283         int i, idx;
2284         struct ocfs2_extent_rec *rec;
2285         struct ocfs2_extent_list *el;
2286         struct ocfs2_extent_block *eb;
2287         u32 range;
2288
2289         /* Path should always be rightmost. */
2290         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2291         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2292
2293         el = &eb->h_list;
2294         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2295         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2296         rec = &el->l_recs[idx];
2297         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2298
2299         for (i = 0; i < path->p_tree_depth; i++) {
2300                 el = path->p_node[i].el;
2301                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2302                 rec = &el->l_recs[idx];
2303
2304                 rec->e_int_clusters = cpu_to_le32(range);
2305                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2306
2307                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2308         }
2309 }
2310
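/*
 * Drop the extent blocks in 'path' from index 'unlink_start' down to the
 * leaf: each block's list is emptied, the block is journal-dirtied,
 * queued for freeing through the dealloc context and removed from the
 * uptodate cache.  A block that unexpectedly still holds more than one
 * record is logged and only dropped from the cache, not freed.
 */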
2311 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2312                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2313                               struct ocfs2_path *path, int unlink_start)
2314 {
2315         int ret, i;
2316         struct ocfs2_extent_block *eb;
2317         struct ocfs2_extent_list *el;
2318         struct buffer_head *bh;
2319
2320         for(i = unlink_start; i < path_num_items(path); i++) {
2321                 bh = path->p_node[i].bh;
2322
2323                 eb = (struct ocfs2_extent_block *)bh->b_data;
2324                 /*
2325                  * Not all nodes might have had their final count
2326                  * decremented by the caller - handle this here.
2327                  */
2328                 el = &eb->h_list;
2329                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2330                         mlog(ML_ERROR,
2331                              "Inode %llu, attempted to remove extent block "
2332                              "%llu with %u records\n",
2333                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2334                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2335                              le16_to_cpu(el->l_next_free_rec));
2336
2337                         ocfs2_journal_dirty(handle, bh);
2338                         ocfs2_remove_from_cache(inode, bh);
2339                         continue;
2340                 }
2341
2342                 el->l_next_free_rec = 0;
2343                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2344
2345                 ocfs2_journal_dirty(handle, bh);
2346
2347                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2348                 if (ret)
2349                         mlog_errno(ret);
2350
2351                 ocfs2_remove_from_cache(inode, bh);
2352         }
2353 }
2354
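/*
 * Detach the right-hand side of a subtree: the root record pointing at
 * the right child below 'subtree_index' is cleared, the left leaf
 * becomes the new end of the leaf chain (h_next_leaf_blk = 0), and the
 * orphaned blocks on the right path are handed to ocfs2_unlink_path()
 * for freeing.
 */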
2355 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2356                                  struct ocfs2_path *left_path,
2357                                  struct ocfs2_path *right_path,
2358                                  int subtree_index,
2359                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2360 {
2361         int i;
2362         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2363         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2364         struct ocfs2_extent_list *el;
2365         struct ocfs2_extent_block *eb;
2366
2367         el = path_leaf_el(left_path);
2368
2369         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2370
2371         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2372                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2373                         break;
2374
2375         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2376
2377         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2378         le16_add_cpu(&root_el->l_next_free_rec, -1);
2379
2380         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2381         eb->h_next_leaf_blk = 0;
2382
2383         ocfs2_journal_dirty(handle, root_bh);
2384         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2385
2386         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2387                           subtree_index + 1);
2388 }
2389
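/*
 * Rotate one record from the right leaf into the empty slot at the head
 * of the left leaf.  Does nothing unless the left leaf begins with an
 * empty extent.  If the right leaf is the rightmost one and ends up
 * empty, the right side of the subtree is unlinked, edge lengths are
 * recomputed and *deleted is set.  A non-rightmost right leaf which
 * already contains an empty extent makes us return -EAGAIN so the
 * caller can fix it up and retry.
 */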
2390 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2391                                      struct ocfs2_path *left_path,
2392                                      struct ocfs2_path *right_path,
2393                                      int subtree_index,
2394                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2395                                      int *deleted,
2396                                      struct ocfs2_extent_tree *et)
2397 {
2398         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2399         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2400         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2401         struct ocfs2_extent_block *eb;
2402
2403         *deleted = 0;
2404
2405         right_leaf_el = path_leaf_el(right_path);
2406         left_leaf_el = path_leaf_el(left_path);
2407         root_bh = left_path->p_node[subtree_index].bh;
2408         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2409
2410         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2411                 return 0;
2412
2413         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2414         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2415                 /*
2416                  * It's legal for us to proceed if the right leaf is
2417                  * the rightmost one and it has an empty extent. There
2418                  * are two cases to handle - whether the leaf will be
2419                  * empty after removal or not. If the leaf isn't empty
2420                  * then just remove the empty extent up front. The
2421                  * next block will handle empty leaves by flagging
2422                  * them for unlink.
2423                  *
2424          * Non-rightmost leaves will return -EAGAIN and the
2425                  * caller can manually move the subtree and retry.
2426                  */
2427
2428                 if (eb->h_next_leaf_blk != 0ULL)
2429                         return -EAGAIN;
2430
2431                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2432                         ret = ocfs2_journal_access(handle, inode,
2433                                                    path_leaf_bh(right_path),
2434                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2435                         if (ret) {
2436                                 mlog_errno(ret);
2437                                 goto out;
2438                         }
2439
2440                         ocfs2_remove_empty_extent(right_leaf_el);
2441                 } else
2442                         right_has_empty = 1;
2443         }
2444
2445         if (eb->h_next_leaf_blk == 0ULL &&
2446             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2447                 /*
2448          * We have to update i_last_eb_blk during the metadata
2449          * delete.
2450                  */
2451                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2452                                            OCFS2_JOURNAL_ACCESS_WRITE);
2453                 if (ret) {
2454                         mlog_errno(ret);
2455                         goto out;
2456                 }
2457
2458                 del_right_subtree = 1;
2459         }
2460
2461         /*
2462          * Getting here with an empty extent in the right path implies
2463          * that it's the rightmost path and will be deleted.
2464          */
2465         BUG_ON(right_has_empty && !del_right_subtree);
2466
2467         ret = ocfs2_journal_access(handle, inode, root_bh,
2468                                    OCFS2_JOURNAL_ACCESS_WRITE);
2469         if (ret) {
2470                 mlog_errno(ret);
2471                 goto out;
2472         }
2473
2474         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2475                 ret = ocfs2_journal_access(handle, inode,
2476                                            right_path->p_node[i].bh,
2477                                            OCFS2_JOURNAL_ACCESS_WRITE);
2478                 if (ret) {
2479                         mlog_errno(ret);
2480                         goto out;
2481                 }
2482
2483                 ret = ocfs2_journal_access(handle, inode,
2484                                            left_path->p_node[i].bh,
2485                                            OCFS2_JOURNAL_ACCESS_WRITE);
2486                 if (ret) {
2487                         mlog_errno(ret);
2488                         goto out;
2489                 }
2490         }
2491
2492         if (!right_has_empty) {
2493                 /*
2494                  * Only do this if we're moving a real
2495                  * record. Otherwise, the action is delayed until
2496                  * after removal of the right path in which case we
2497                  * can do a simple shift to remove the empty extent.
2498                  */
2499                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2500                 memset(&right_leaf_el->l_recs[0], 0,
2501                        sizeof(struct ocfs2_extent_rec));
2502         }
2503         if (eb->h_next_leaf_blk == 0ULL) {
2504                 /*
2505                  * Move recs over to get rid of empty extent, decrease
2506                  * next_free. This is allowed to remove the last
2507                  * extent in our leaf (setting l_next_free_rec to
2508                  * zero) - the delete code below won't care.
2509                  */
2510                 ocfs2_remove_empty_extent(right_leaf_el);
2511         }
2512
2513         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2514         if (ret)
2515                 mlog_errno(ret);
2516         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2517         if (ret)
2518                 mlog_errno(ret);
2519
2520         if (del_right_subtree) {
2521                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2522                                      subtree_index, dealloc);
2523                 ocfs2_update_edge_lengths(inode, handle, left_path);
2524
2525                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2526                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2527
2528                 /*
2529                  * Removal of the extent in the left leaf was skipped
2530                  * above so we could delete the right path
2531                  * first.
2532                  */
2533                 if (right_has_empty)
2534                         ocfs2_remove_empty_extent(left_leaf_el);
2535
2536                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2537                 if (ret)
2538                         mlog_errno(ret);
2539
2540                 *deleted = 1;
2541         } else
2542                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2543                                            subtree_index);
2544
2545 out:
2546         return ret;
2547 }
2548
2549 /*
2550  * Given a full path, determine what cpos value would return us a path
2551  * containing the leaf immediately to the right of the current one.
2552  *
2553  * Will return zero if the path passed in is already the rightmost path.
2554  *
2555  * This looks similar, but is subtly different to
2556  * ocfs2_find_cpos_for_left_leaf().
2557  */
2558 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2559                                           struct ocfs2_path *path, u32 *cpos)
2560 {
2561         int i, j, ret = 0;
2562         u64 blkno;
2563         struct ocfs2_extent_list *el;
2564
2565         *cpos = 0;
2566
2567         if (path->p_tree_depth == 0)
2568                 return 0;
2569
2570         blkno = path_leaf_bh(path)->b_blocknr;
2571
2572         /* Start at the tree node just above the leaf and work our way up. */
2573         i = path->p_tree_depth - 1;
2574         while (i >= 0) {
2575                 int next_free;
2576
2577                 el = path->p_node[i].el;
2578
2579                 /*
2580                  * Find the extent record just after the one in our
2581                  * path.
2582                  */
2583                 next_free = le16_to_cpu(el->l_next_free_rec);
2584                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2585                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2586                                 if (j == (next_free - 1)) {
2587                                         if (i == 0) {
2588                                                 /*
2589                                                  * We've determined that the
2590                                                  * path specified is already
2591                                                  * the rightmost one - return a
2592                                                  * cpos of zero.
2593                                                  */
2594                                                 goto out;
2595                                         }
2596                                         /*
2597                                          * The rightmost record points to our
2598                                          * leaf - we need to travel up the
2599                                          * tree one level.
2600                                          */
2601                                         goto next_node;
2602                                 }
2603
2604                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2605                                 goto out;
2606                         }
2607                 }
2608
2609                 /*
2610                  * If we got here, we never found a valid node where
2611                  * the tree indicated one should be.
2612                  */
2613                 ocfs2_error(sb,
2614                             "Invalid extent tree at extent block %llu\n",
2615                             (unsigned long long)blkno);
2616                 ret = -EROFS;
2617                 goto out;
2618
2619 next_node:
2620                 blkno = path->p_node[i].bh->b_blocknr;
2621                 i--;
2622         }
2623
2624 out:
2625         return ret;
2626 }
2627
2628 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2629                                             handle_t *handle,
2630                                             struct buffer_head *bh,
2631                                             struct ocfs2_extent_list *el)
2632 {
2633         int ret;
2634
2635         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2636                 return 0;
2637
2638         ret = ocfs2_journal_access(handle, inode, bh,
2639                                    OCFS2_JOURNAL_ACCESS_WRITE);
2640         if (ret) {
2641                 mlog_errno(ret);
2642                 goto out;
2643         }
2644
2645         ocfs2_remove_empty_extent(el);
2646
2647         ret = ocfs2_journal_dirty(handle, bh);
2648         if (ret)
2649                 mlog_errno(ret);
2650
2651 out:
2652         return ret;
2653 }
2654
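/*
 * Worker for the left rotation: 'path' must have an empty extent in its
 * leaf.  Each subtree is rotated to the left while we walk toward the
 * rightmost edge.  The walk stops early when a rotation deletes the
 * rightmost path, or when a right leaf with its own empty extent is hit,
 * in which case that path is handed back via *empty_extent_path along
 * with -EAGAIN so the caller can deal with it and restart.
 */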
2655 static int __ocfs2_rotate_tree_left(struct inode *inode,
2656                                     handle_t *handle, int orig_credits,
2657                                     struct ocfs2_path *path,
2658                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2659                                     struct ocfs2_path **empty_extent_path,
2660                                     struct ocfs2_extent_tree *et)
2661 {
2662         int ret, subtree_root, deleted;
2663         u32 right_cpos;
2664         struct ocfs2_path *left_path = NULL;
2665         struct ocfs2_path *right_path = NULL;
2666
2667         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2668
2669         *empty_extent_path = NULL;
2670
2671         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2672                                              &right_cpos);
2673         if (ret) {
2674                 mlog_errno(ret);
2675                 goto out;
2676         }
2677
2678         left_path = ocfs2_new_path(path_root_bh(path),
2679                                    path_root_el(path));
2680         if (!left_path) {
2681                 ret = -ENOMEM;
2682                 mlog_errno(ret);
2683                 goto out;
2684         }
2685
2686         ocfs2_cp_path(left_path, path);
2687
2688         right_path = ocfs2_new_path(path_root_bh(path),
2689                                     path_root_el(path));
2690         if (!right_path) {
2691                 ret = -ENOMEM;
2692                 mlog_errno(ret);
2693                 goto out;
2694         }
2695
2696         while (right_cpos) {
2697                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2698                 if (ret) {
2699                         mlog_errno(ret);
2700                         goto out;
2701                 }
2702
2703                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2704                                                        right_path);
2705
2706                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2707                      subtree_root,
2708                      (unsigned long long)
2709                      right_path->p_node[subtree_root].bh->b_blocknr,
2710                      right_path->p_tree_depth);
2711
2712                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2713                                                       orig_credits, left_path);
2714                 if (ret) {
2715                         mlog_errno(ret);
2716                         goto out;
2717                 }
2718
2719                 /*
2720                  * Caller might still want to make changes to the
2721                  * tree root, so re-add it to the journal here.
2722                  */
2723                 ret = ocfs2_journal_access(handle, inode,
2724                                            path_root_bh(left_path),
2725                                            OCFS2_JOURNAL_ACCESS_WRITE);
2726                 if (ret) {
2727                         mlog_errno(ret);
2728                         goto out;
2729                 }
2730
2731                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2732                                                 right_path, subtree_root,
2733                                                 dealloc, &deleted, et);
2734                 if (ret == -EAGAIN) {
2735                         /*
2736                          * The rotation has to temporarily stop due to
2737                          * the right subtree having an empty
2738                          * extent. Pass it back to the caller for a
2739                          * fixup.
2740                          */
2741                         *empty_extent_path = right_path;
2742                         right_path = NULL;
2743                         goto out;
2744                 }
2745                 if (ret) {
2746                         mlog_errno(ret);
2747                         goto out;
2748                 }
2749
2750                 /*
2751                  * The subtree rotate might have removed records on
2752                  * the rightmost edge. If so, then rotation is
2753                  * complete.
2754                  */
2755                 if (deleted)
2756                         break;
2757
2758                 ocfs2_mv_path(left_path, right_path);
2759
2760                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2761                                                      &right_cpos);
2762                 if (ret) {
2763                         mlog_errno(ret);
2764                         goto out;
2765                 }
2766         }
2767
2768 out:
2769         ocfs2_free_path(right_path);
2770         ocfs2_free_path(left_path);
2771
2772         return ret;
2773 }
2774
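/*
 * Unlink the rightmost path from the tree.  If a leaf exists to the
 * left, the subtree joining the two paths is unlinked and the tree's
 * last_eb_blk is pointed at the new rightmost leaf; otherwise 'path'
 * was the only branch and the root extent list is reset to depth zero.
 */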
2775 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2776                                 struct ocfs2_path *path,
2777                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2778                                 struct ocfs2_extent_tree *et)
2779 {
2780         int ret, subtree_index;
2781         u32 cpos;
2782         struct ocfs2_path *left_path = NULL;
2783         struct ocfs2_extent_block *eb;
2784         struct ocfs2_extent_list *el;
2785
2786
2787         ret = ocfs2_et_sanity_check(inode, et);
2788         if (ret)
2789                 goto out;
2790         /*
2791          * There are two ways we handle this, depending on
2792          * whether path is the only existing one.
2793          */
2794         ret = ocfs2_extend_rotate_transaction(handle, 0,
2795                                               handle->h_buffer_credits,
2796                                               path);
2797         if (ret) {
2798                 mlog_errno(ret);
2799                 goto out;
2800         }
2801
2802         ret = ocfs2_journal_access_path(inode, handle, path);
2803         if (ret) {
2804                 mlog_errno(ret);
2805                 goto out;
2806         }
2807
2808         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2809         if (ret) {
2810                 mlog_errno(ret);
2811                 goto out;
2812         }
2813
2814         if (cpos) {
2815                 /*
2816                  * We have a path to the left of this one - it needs
2817                  * an update too.
2818                  */
2819                 left_path = ocfs2_new_path(path_root_bh(path),
2820                                            path_root_el(path));
2821                 if (!left_path) {
2822                         ret = -ENOMEM;
2823                         mlog_errno(ret);
2824                         goto out;
2825                 }
2826
2827                 ret = ocfs2_find_path(inode, left_path, cpos);
2828                 if (ret) {
2829                         mlog_errno(ret);
2830                         goto out;
2831                 }
2832
2833                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2834                 if (ret) {
2835                         mlog_errno(ret);
2836                         goto out;
2837                 }
2838
2839                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2840
2841                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2842                                      subtree_index, dealloc);
2843                 ocfs2_update_edge_lengths(inode, handle, left_path);
2844
2845                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2846                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2847         } else {
2848                 /*
2849                  * 'path' is also the leftmost path which
2850                  * means it must be the only one. This gets
2851                  * handled differently because we want to
2852                  * revert the inode back to having extents
2853                  * in-line.
2854                  */
2855                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2856
2857                 el = et->et_root_el;
2858                 el->l_tree_depth = 0;
2859                 el->l_next_free_rec = 0;
2860                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2861
2862                 ocfs2_et_set_last_eb_blk(et, 0);
2863         }
2864
2865         ocfs2_journal_dirty(handle, path_root_bh(path));
2866
2867 out:
2868         ocfs2_free_path(left_path);
2869         return ret;
2870 }
2871
2872 /*
2873  * Left rotation of btree records.
2874  *
2875  * In many ways, this is (unsurprisingly) the opposite of right
2876  * rotation. We start at some non-rightmost path containing an empty
2877  * extent in the leaf block. The code works its way to the rightmost
2878  * path by rotating records to the left in every subtree.
2879  *
2880  * This is used by any code which reduces the number of extent records
2881  * in a leaf. After removal, an empty record should be placed in the
2882  * leftmost list position.
2883  *
2884  * This won't handle a length update of the rightmost path records if
2885  * the rightmost tree leaf record is removed, so the caller is
2886  * responsible for detecting and correcting that.
2887  */
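     /*
      * Illustrative sketch (E marks the empty record).  With two
      * neighboring leaves, one pass of the rotation roughly does:
      *
      *   before:  left [ E | A | B ]    right [ C | D ]
      *   after:   left [ A | B | C ]    right [ E | D ]
      *
      * The empty record keeps moving towards the rightmost leaf, where
      * ocfs2_rotate_rightmost_leaf_left() can simply shift it out, or
      * the whole branch is deleted if only the empty record remains.
      */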
2888 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2889                                   struct ocfs2_path *path,
2890                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2891                                   struct ocfs2_extent_tree *et)
2892 {
2893         int ret, orig_credits = handle->h_buffer_credits;
2894         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2895         struct ocfs2_extent_block *eb;
2896         struct ocfs2_extent_list *el;
2897
2898         el = path_leaf_el(path);
2899         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2900                 return 0;
2901
2902         if (path->p_tree_depth == 0) {
2903 rightmost_no_delete:
2904                 /*
2905                  * Inline extents. This is trivially handled, so do
2906                  * it up front.
2907                  */
2908                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2909                                                        path_leaf_bh(path),
2910                                                        path_leaf_el(path));
2911                 if (ret)
2912                         mlog_errno(ret);
2913                 goto out;
2914         }
2915
2916         /*
2917          * Handle the rightmost branch now. There are several cases:
2918          *  1) simple rotation leaving records in there. That's trivial.
2919          *  2) rotation requiring a branch delete - there are no more
2920          *     records left. Two cases of this:
2921          *     a) There are branches to the left.
2922          *     b) This is also the leftmost (the only) branch.
2923          *
2924          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2925          *  2a) we need the left branch so that we can update it with the unlink
2926          *  2b) we need to bring the inode back to inline extents.
2927          */
2928
2929         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2930         el = &eb->h_list;
2931         if (eb->h_next_leaf_blk == 0) {
2932                 /*
2933                  * This gets a bit tricky if we're going to delete the
2934                  * rightmost path. Get the other cases out of the way
2935                  * 1st.
2936                  * first.
2937                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2938                         goto rightmost_no_delete;
2939
2940                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2941                         ret = -EIO;
2942                         ocfs2_error(inode->i_sb,
2943                                     "Inode %llu has empty extent block at %llu",
2944                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2945                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2946                         goto out;
2947                 }
2948
2949                 /*
2950                  * XXX: The caller cannot trust "path" any more after
2951                  * this as it will have been deleted. What do we do?
2952                  *
2953                  * In theory the rotate-for-merge code will never get
2954                  * here because it'll always ask for a rotate in a
2955                  * nonempty list.
2956                  */
2957
2958                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2959                                                   dealloc, et);
2960                 if (ret)
2961                         mlog_errno(ret);
2962                 goto out;
2963         }
2964
2965         /*
2966          * Now we can loop, remembering the path we get from -EAGAIN
2967          * and restarting from there.
2968          */
2969 try_rotate:
2970         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2971                                        dealloc, &restart_path, et);
2972         if (ret && ret != -EAGAIN) {
2973                 mlog_errno(ret);
2974                 goto out;
2975         }
2976
2977         while (ret == -EAGAIN) {
2978                 tmp_path = restart_path;
2979                 restart_path = NULL;
2980
2981                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2982                                                tmp_path, dealloc,
2983                                                &restart_path, et);
2984                 if (ret && ret != -EAGAIN) {
2985                         mlog_errno(ret);
2986                         goto out;
2987                 }
2988
2989                 ocfs2_free_path(tmp_path);
2990                 tmp_path = NULL;
2991
2992                 if (ret == 0)
2993                         goto try_rotate;
2994         }
2995
2996 out:
2997         ocfs2_free_path(tmp_path);
2998         ocfs2_free_path(restart_path);
2999         return ret;
3000 }
3001
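     /*
      * After a merge, the merged-from record at 'index' may have been
      * reduced to zero clusters.  If so, shift it back to slot 0 and
      * zero it, since an empty extent may only live in the first array
      * position.  For example, [ A | B | C ] with B fully consumed
      * becomes [ E | A | C ].
      */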
3002 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3003                                 int index)
3004 {
3005         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3006         unsigned int size;
3007
3008         if (rec->e_leaf_clusters == 0) {
3009                 /*
3010                  * We consumed all of the merged-from record. An empty
3011                  * extent cannot exist anywhere but the 1st array
3012                  * position, so move things over if the merged-from
3013                  * record doesn't occupy that position.
3014                  *
3015                  * This creates a new empty extent so the caller
3016                  * should be smart enough to have removed any existing
3017                  * ones.
3018                  */
3019                 if (index > 0) {
3020                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3021                         size = index * sizeof(struct ocfs2_extent_rec);
3022                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3023                 }
3024
3025                 /*
3026                  * Always memset - the caller doesn't check whether it
3027                  * created an empty extent, so there could be junk in
3028                  * the other fields.
3029                  */
3030                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3031         }
3032 }
3033
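     /*
      * Find the path to the leaf immediately to the right of left_path,
      * sharing the same tree root.  Used by the cross-extent-block merge
      * below; the caller guarantees this is not the rightmost leaf.
      */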
3034 static int ocfs2_get_right_path(struct inode *inode,
3035                                 struct ocfs2_path *left_path,
3036                                 struct ocfs2_path **ret_right_path)
3037 {
3038         int ret;
3039         u32 right_cpos;
3040         struct ocfs2_path *right_path = NULL;
3041         struct ocfs2_extent_list *left_el;
3042
3043         *ret_right_path = NULL;
3044
3045         /* This function shouldn't be called for non-trees. */
3046         BUG_ON(left_path->p_tree_depth == 0);
3047
3048         left_el = path_leaf_el(left_path);
3049         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3050
3051         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3052                                              &right_cpos);
3053         if (ret) {
3054                 mlog_errno(ret);
3055                 goto out;
3056         }
3057
3058         /* This function shouldn't be called for the rightmost leaf. */
3059         BUG_ON(right_cpos == 0);
3060
3061         right_path = ocfs2_new_path(path_root_bh(left_path),
3062                                     path_root_el(left_path));
3063         if (!right_path) {
3064                 ret = -ENOMEM;
3065                 mlog_errno(ret);
3066                 goto out;
3067         }
3068
3069         ret = ocfs2_find_path(inode, right_path, right_cpos);
3070         if (ret) {
3071                 mlog_errno(ret);
3072                 goto out;
3073         }
3074
3075         *ret_right_path = right_path;
3076 out:
3077         if (ret)
3078                 ocfs2_free_path(right_path);
3079         return ret;
3080 }
3081
3082 /*
3083  * Remove split_rec clusters from the record at index and merge them
3084  * onto the beginning of the record "next" to it.
3085  * For index < l_count - 1, the "next" means the extent rec at index + 1.
3086  * For index == l_count - 1, the "next" means the 1st extent rec of the
3087  * next extent block.
3088  */
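     /*
      * Worked example (values chosen purely for illustration): if the
      * record at 'index' covers cpos 10 for 8 clusters and split_rec is
      * its last 3 clusters (cpos 15), then the neighboring record must
      * start at cpos 18.  After the merge the left record covers
      * (10, 5 clusters) and the right record grows to start at cpos 15,
      * with e_blkno pulled back by 3 clusters' worth of blocks.
      */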
3089 static int ocfs2_merge_rec_right(struct inode *inode,
3090                                  struct ocfs2_path *left_path,
3091                                  handle_t *handle,
3092                                  struct ocfs2_extent_rec *split_rec,
3093                                  int index)
3094 {
3095         int ret, next_free, i;
3096         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3097         struct ocfs2_extent_rec *left_rec;
3098         struct ocfs2_extent_rec *right_rec;
3099         struct ocfs2_extent_list *right_el;
3100         struct ocfs2_path *right_path = NULL;
3101         int subtree_index = 0;
3102         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3103         struct buffer_head *bh = path_leaf_bh(left_path);
3104         struct buffer_head *root_bh = NULL;
3105
3106         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3107         left_rec = &el->l_recs[index];
3108
3109         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3110             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3111                 /* The merge crosses into the next extent block. */
3112                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3113                 if (ret) {
3114                         mlog_errno(ret);
3115                         goto out;
3116                 }
3117
3118                 right_el = path_leaf_el(right_path);
3119                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3120                 BUG_ON(next_free <= 0);
3121                 right_rec = &right_el->l_recs[0];
3122                 if (ocfs2_is_empty_extent(right_rec)) {
3123                         BUG_ON(next_free <= 1);
3124                         right_rec = &right_el->l_recs[1];
3125                 }
3126
3127                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3128                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3129                        le32_to_cpu(right_rec->e_cpos));
3130
3131                 subtree_index = ocfs2_find_subtree_root(inode,
3132                                                         left_path, right_path);
3133
3134                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3135                                                       handle->h_buffer_credits,
3136                                                       right_path);
3137                 if (ret) {
3138                         mlog_errno(ret);
3139                         goto out;
3140                 }
3141
3142                 root_bh = left_path->p_node[subtree_index].bh;
3143                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3144
3145                 ret = ocfs2_journal_access(handle, inode, root_bh,
3146                                            OCFS2_JOURNAL_ACCESS_WRITE);
3147                 if (ret) {
3148                         mlog_errno(ret);
3149                         goto out;
3150                 }
3151
3152                 for (i = subtree_index + 1;
3153                      i < path_num_items(right_path); i++) {
3154                         ret = ocfs2_journal_access(handle, inode,
3155                                                    right_path->p_node[i].bh,
3156                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3157                         if (ret) {
3158                                 mlog_errno(ret);
3159                                 goto out;
3160                         }
3161
3162                         ret = ocfs2_journal_access(handle, inode,
3163                                                    left_path->p_node[i].bh,
3164                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3165                         if (ret) {
3166                                 mlog_errno(ret);
3167                                 goto out;
3168                         }
3169                 }
3170
3171         } else {
3172                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3173                 right_rec = &el->l_recs[index + 1];
3174         }
3175
3176         ret = ocfs2_journal_access(handle, inode, bh,
3177                                    OCFS2_JOURNAL_ACCESS_WRITE);
3178         if (ret) {
3179                 mlog_errno(ret);
3180                 goto out;
3181         }
3182
3183         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3184
3185         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3186         le64_add_cpu(&right_rec->e_blkno,
3187                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3188         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3189
3190         ocfs2_cleanup_merge(el, index);
3191
3192         ret = ocfs2_journal_dirty(handle, bh);
3193         if (ret)
3194                 mlog_errno(ret);
3195
3196         if (right_path) {
3197                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3198                 if (ret)
3199                         mlog_errno(ret);
3200
3201                 ocfs2_complete_edge_insert(inode, handle, left_path,
3202                                            right_path, subtree_index);
3203         }
3204 out:
3205         if (right_path)
3206                 ocfs2_free_path(right_path);
3207         return ret;
3208 }
3209
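     /*
      * Mirror of ocfs2_get_right_path(): find the path to the leaf
      * immediately to the left of right_path.  The caller guarantees
      * this is not the leftmost leaf.
      */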
3210 static int ocfs2_get_left_path(struct inode *inode,
3211                                struct ocfs2_path *right_path,
3212                                struct ocfs2_path **ret_left_path)
3213 {
3214         int ret;
3215         u32 left_cpos;
3216         struct ocfs2_path *left_path = NULL;
3217
3218         *ret_left_path = NULL;
3219
3220         /* This function shouldn't be called for non-trees. */
3221         BUG_ON(right_path->p_tree_depth == 0);
3222
3223         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3224                                             right_path, &left_cpos);
3225         if (ret) {
3226                 mlog_errno(ret);
3227                 goto out;
3228         }
3229
3230         /* This function shouldn't be called for the leftmost leaf. */
3231         BUG_ON(left_cpos == 0);
3232
3233         left_path = ocfs2_new_path(path_root_bh(right_path),
3234                                    path_root_el(right_path));
3235         if (!left_path) {
3236                 ret = -ENOMEM;
3237                 mlog_errno(ret);
3238                 goto out;
3239         }
3240
3241         ret = ocfs2_find_path(inode, left_path, left_cpos);
3242         if (ret) {
3243                 mlog_errno(ret);
3244                 goto out;
3245         }
3246
3247         *ret_left_path = left_path;
3248 out:
3249         if (ret)
3250                 ocfs2_free_path(left_path);
3251         return ret;
3252 }
3253
3254 /*
3255  * Remove split_rec clusters from the record at index and merge them
3256  * onto the tail of the record "before" it.
3257  * For index > 0, the "before" means the extent rec at index - 1.
3258  *
3259  * For index == 0, the "before" means the last record of the previous
3260  * extent block. There is also a situation where we may need to
3261  * remove the rightmost leaf extent block in the right_path and update
3262  * right_path to point at the new rightmost path.
3263  */
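     /*
      * This is the mirror of ocfs2_merge_rec_right(): split_rec's
      * clusters are added to the tail of the record on the left, and the
      * record at 'index' is shrunk from its left edge (e_cpos and
      * e_blkno move forward by split_rec's length).
      */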
3264 static int ocfs2_merge_rec_left(struct inode *inode,
3265                                 struct ocfs2_path *right_path,
3266                                 handle_t *handle,
3267                                 struct ocfs2_extent_rec *split_rec,
3268                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3269                                 struct ocfs2_extent_tree *et,
3270                                 int index)
3271 {
3272         int ret, i, subtree_index = 0, has_empty_extent = 0;
3273         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3274         struct ocfs2_extent_rec *left_rec;
3275         struct ocfs2_extent_rec *right_rec;
3276         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3277         struct buffer_head *bh = path_leaf_bh(right_path);
3278         struct buffer_head *root_bh = NULL;
3279         struct ocfs2_path *left_path = NULL;
3280         struct ocfs2_extent_list *left_el;
3281
3282         BUG_ON(index < 0);
3283
3284         right_rec = &el->l_recs[index];
3285         if (index == 0) {
3286                 /* The merge crosses into the previous extent block. */
3287                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3288                 if (ret) {
3289                         mlog_errno(ret);
3290                         goto out;
3291                 }
3292
3293                 left_el = path_leaf_el(left_path);
3294                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3295                        le16_to_cpu(left_el->l_count));
3296
3297                 left_rec = &left_el->l_recs[
3298                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3299                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3300                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3301                        le32_to_cpu(split_rec->e_cpos));
3302
3303                 subtree_index = ocfs2_find_subtree_root(inode,
3304                                                         left_path, right_path);
3305
3306                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3307                                                       handle->h_buffer_credits,
3308                                                       left_path);
3309                 if (ret) {
3310                         mlog_errno(ret);
3311                         goto out;
3312                 }
3313
3314                 root_bh = left_path->p_node[subtree_index].bh;
3315                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3316
3317                 ret = ocfs2_journal_access(handle, inode, root_bh,
3318                                            OCFS2_JOURNAL_ACCESS_WRITE);
3319                 if (ret) {
3320                         mlog_errno(ret);
3321                         goto out;
3322                 }
3323
3324                 for (i = subtree_index + 1;
3325                      i < path_num_items(right_path); i++) {
3326                         ret = ocfs2_journal_access(handle, inode,
3327                                                    right_path->p_node[i].bh,
3328                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3329                         if (ret) {
3330                                 mlog_errno(ret);
3331                                 goto out;
3332                         }
3333
3334                         ret = ocfs2_journal_access(handle, inode,
3335                                                    left_path->p_node[i].bh,
3336                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3337                         if (ret) {
3338                                 mlog_errno(ret);
3339                                 goto out;
3340                         }
3341                 }
3342         } else {
3343                 left_rec = &el->l_recs[index - 1];
3344                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3345                         has_empty_extent = 1;
3346         }
3347
3348         ret = ocfs2_journal_access(handle, inode, bh,
3349                                    OCFS2_JOURNAL_ACCESS_WRITE);
3350         if (ret) {
3351                 mlog_errno(ret);
3352                 goto out;
3353         }
3354
3355         if (has_empty_extent && index == 1) {
3356                 /*
3357                  * The easy case - we can just plop the record right in.
3358                  */
3359                 *left_rec = *split_rec;
3360
3361                 has_empty_extent = 0;
3362         } else
3363                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3364
3365         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3366         le64_add_cpu(&right_rec->e_blkno,
3367                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3368         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3369
3370         ocfs2_cleanup_merge(el, index);
3371
3372         ret = ocfs2_journal_dirty(handle, bh);
3373         if (ret)
3374                 mlog_errno(ret);
3375
3376         if (left_path) {
3377                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3378                 if (ret)
3379                         mlog_errno(ret);
3380
3381                 /*
3382                  * If the right_rec is empty and the extent block is
3383                  * also empty, ocfs2_complete_edge_insert() can't handle
3384                  * it and we need to delete the right extent block.
3385                  */
3386                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3387                     le16_to_cpu(el->l_next_free_rec) == 1) {
3388
3389                         ret = ocfs2_remove_rightmost_path(inode, handle,
3390                                                           right_path,
3391                                                           dealloc, et);
3392                         if (ret) {
3393                                 mlog_errno(ret);
3394                                 goto out;
3395                         }
3396
3397                         /* Now the rightmost extent block has been deleted.
3398                          * So we use the new rightmost path.
3399                          */
3400                         ocfs2_mv_path(right_path, left_path);
3401                         left_path = NULL;
3402                 } else
3403                         ocfs2_complete_edge_insert(inode, handle, left_path,
3404                                                    right_path, subtree_index);
3405         }
3406 out:
3407         if (left_path)
3408                 ocfs2_free_path(left_path);
3409         return ret;
3410 }
3411
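     /*
      * Merge split_rec into its neighbor(s) in the leaf at 'path'.
      *
      * CONTIG_LEFTRIGHT means the record is contiguous with both
      * neighbors, so we merge right first, rotate the resulting empty
      * extent away, then merge the combined record into the one on the
      * left.  A plain CONTIG_LEFT or CONTIG_RIGHT needs only a single
      * merge, possibly followed by a left rotation to clean up an empty
      * extent.
      */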
3412 static int ocfs2_try_to_merge_extent(struct inode *inode,
3413                                      handle_t *handle,
3414                                      struct ocfs2_path *path,
3415                                      int split_index,
3416                                      struct ocfs2_extent_rec *split_rec,
3417                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3418                                      struct ocfs2_merge_ctxt *ctxt,
3419                                      struct ocfs2_extent_tree *et)
3420
3421 {
3422         int ret = 0;
3423         struct ocfs2_extent_list *el = path_leaf_el(path);
3424         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3425
3426         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3427
3428         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3429                 /*
3430                  * The merge code will need to create an empty
3431                  * extent to take the place of the newly
3432                  * emptied slot. Remove any pre-existing empty
3433                  * extents - having more than one in a leaf is
3434                  * illegal.
3435                  */
3436                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3437                                              dealloc, et);
3438                 if (ret) {
3439                         mlog_errno(ret);
3440                         goto out;
3441                 }
3442                 split_index--;
3443                 rec = &el->l_recs[split_index];
3444         }
3445
3446         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3447                 /*
3448                  * Left-right contig implies this.
3449                  */
3450                 BUG_ON(!ctxt->c_split_covers_rec);
3451
3452                 /*
3453                  * Since the leftright insert always covers the entire
3454                  * extent, this call will delete the insert record
3455                  * entirely, resulting in an empty extent record added to
3456                  * the extent block.
3457                  *
3458                  * Since the adding of an empty extent shifts
3459                  * everything back to the right, there's no need to
3460                  * update split_index here.
3461                  *
3462                  * When the split_index is zero, we need to merge it into the
3463                  * previous extent block. It is more efficient and easier
3464                  * if we do merge_right first and merge_left later.
3465                  */
3466                 ret = ocfs2_merge_rec_right(inode, path,
3467                                             handle, split_rec,
3468                                             split_index);
3469                 if (ret) {
3470                         mlog_errno(ret);
3471                         goto out;
3472                 }
3473
3474                 /*
3475                  * We can only get this from logic error above.
3476                  */
3477                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3478
3479                 /* The merge left us with an empty extent, remove it. */
3480                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3481                                              dealloc, et);
3482                 if (ret) {
3483                         mlog_errno(ret);
3484                         goto out;
3485                 }
3486
3487                 rec = &el->l_recs[split_index];
3488
3489                 /*
3490                  * Note that we don't pass split_rec here on purpose -
3491                  * we've merged it into the rec already.
3492                  */
3493                 ret = ocfs2_merge_rec_left(inode, path,
3494                                            handle, rec,
3495                                            dealloc, et,
3496                                            split_index);
3497
3498                 if (ret) {
3499                         mlog_errno(ret);
3500                         goto out;
3501                 }
3502
3503                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3504                                              dealloc, et);
3505                 /*
3506                  * Error from this last rotate is not critical, so
3507                  * print but don't bubble it up.
3508                  */
3509                 if (ret)
3510                         mlog_errno(ret);
3511                 ret = 0;
3512         } else {
3513                 /*
3514                  * Merge a record to the left or right.
3515                  *
3516                  * 'contig_type' is relative to the existing record,
3517                  * so for example, if we're "right contig", it's to
3518                  * the record on the left (hence the left merge).
3519                  */
3520                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3521                         ret = ocfs2_merge_rec_left(inode,
3522                                                    path,
3523                                                    handle, split_rec,
3524                                                    dealloc, et,
3525                                                    split_index);
3526                         if (ret) {
3527                                 mlog_errno(ret);
3528                                 goto out;
3529                         }
3530                 } else {
3531                         ret = ocfs2_merge_rec_right(inode,
3532                                                     path,
3533                                                     handle, split_rec,
3534                                                     split_index);
3535                         if (ret) {
3536                                 mlog_errno(ret);
3537                                 goto out;
3538                         }
3539                 }
3540
3541                 if (ctxt->c_split_covers_rec) {
3542                         /*
3543                          * The merge may have left an empty extent in
3544                          * our leaf. Try to rotate it away.
3545                          */
3546                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3547                                                      dealloc, et);
3548                         if (ret)
3549                                 mlog_errno(ret);
3550                         ret = 0;
3551                 }
3552         }
3553
3554 out:
3555         return ret;
3556 }
3557
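     /*
      * Trim split_rec's clusters off one edge of 'rec'.  SPLIT_LEFT
      * removes them from the front (e_cpos and e_blkno advance),
      * SPLIT_RIGHT simply shortens the record from the tail.
      */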
3558 static void ocfs2_subtract_from_rec(struct super_block *sb,
3559                                     enum ocfs2_split_type split,
3560                                     struct ocfs2_extent_rec *rec,
3561                                     struct ocfs2_extent_rec *split_rec)
3562 {
3563         u64 len_blocks;
3564
3565         len_blocks = ocfs2_clusters_to_blocks(sb,
3566                                 le16_to_cpu(split_rec->e_leaf_clusters));
3567
3568         if (split == SPLIT_LEFT) {
3569                 /*
3570                  * Region is on the left edge of the existing
3571                  * record.
3572                  */
3573                 le32_add_cpu(&rec->e_cpos,
3574                              le16_to_cpu(split_rec->e_leaf_clusters));
3575                 le64_add_cpu(&rec->e_blkno, len_blocks);
3576                 le16_add_cpu(&rec->e_leaf_clusters,
3577                              -le16_to_cpu(split_rec->e_leaf_clusters));
3578         } else {
3579                 /*
3580                  * Region is on the right edge of the existing
3581                  * record.
3582                  */
3583                 le16_add_cpu(&rec->e_leaf_clusters,
3584                              -le16_to_cpu(split_rec->e_leaf_clusters));
3585         }
3586 }
3587
3588 /*
3589  * Do the final bits of extent record insertion at the target leaf
3590  * list. If this leaf is part of an allocation tree, it is assumed
3591  * that the tree above has been prepared.
3592  */
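     /*
      * The cases below, in order: a record split, a contiguous insert
      * that only grows an existing record, an insert into an (almost)
      * empty leaf, a tail append, and finally the general case which
      * rotates the new record into its sorted position.
      */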
3593 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3594                                  struct ocfs2_extent_list *el,
3595                                  struct ocfs2_insert_type *insert,
3596                                  struct inode *inode)
3597 {
3598         int i = insert->ins_contig_index;
3599         unsigned int range;
3600         struct ocfs2_extent_rec *rec;
3601
3602         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3603
3604         if (insert->ins_split != SPLIT_NONE) {
3605                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3606                 BUG_ON(i == -1);
3607                 rec = &el->l_recs[i];
3608                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3609                                         insert_rec);
3610                 goto rotate;
3611         }
3612
3613         /*
3614          * Contiguous insert - either left or right.
3615          */
3616         if (insert->ins_contig != CONTIG_NONE) {
3617                 rec = &el->l_recs[i];
3618                 if (insert->ins_contig == CONTIG_LEFT) {
3619                         rec->e_blkno = insert_rec->e_blkno;
3620                         rec->e_cpos = insert_rec->e_cpos;
3621                 }
3622                 le16_add_cpu(&rec->e_leaf_clusters,
3623                              le16_to_cpu(insert_rec->e_leaf_clusters));
3624                 return;
3625         }
3626
3627         /*
3628          * Handle insert into an empty leaf.
3629          */
3630         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3631             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3632              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3633                 el->l_recs[0] = *insert_rec;
3634                 el->l_next_free_rec = cpu_to_le16(1);
3635                 return;
3636         }
3637
3638         /*
3639          * Appending insert.
3640          */
3641         if (insert->ins_appending == APPEND_TAIL) {
3642                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3643                 rec = &el->l_recs[i];
3644                 range = le32_to_cpu(rec->e_cpos)
3645                         + le16_to_cpu(rec->e_leaf_clusters);
3646                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3647
3648                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3649                                 le16_to_cpu(el->l_count),
3650                                 "inode %lu, depth %u, count %u, next free %u, "
3651                                 "rec.cpos %u, rec.clusters %u, "
3652                                 "insert.cpos %u, insert.clusters %u\n",
3653                                 inode->i_ino,
3654                                 le16_to_cpu(el->l_tree_depth),
3655                                 le16_to_cpu(el->l_count),
3656                                 le16_to_cpu(el->l_next_free_rec),
3657                                 le32_to_cpu(el->l_recs[i].e_cpos),
3658                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3659                                 le32_to_cpu(insert_rec->e_cpos),
3660                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3661                 i++;
3662                 el->l_recs[i] = *insert_rec;
3663                 le16_add_cpu(&el->l_next_free_rec, 1);
3664                 return;
3665         }
3666
3667 rotate:
3668         /*
3669          * Ok, we have to rotate.
3670          *
3671          * At this point, it is safe to assume that inserting into an
3672          * empty leaf and appending to a leaf have both been handled
3673          * above.
3674          *
3675          * This leaf needs to have space, either by the empty 1st
3676          * extent record, or by virtue of l_next_free_rec < l_count.
3677          */
3678         ocfs2_rotate_leaf(el, insert_rec);
3679 }
3680
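     /*
      * An append grows the rightmost leaf, so every interior node on the
      * rightmost path must have the e_int_clusters of its last record
      * stretched to cover the new insert
      * (insert end cpos - record start cpos).
      */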
3681 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3682                                            handle_t *handle,
3683                                            struct ocfs2_path *path,
3684                                            struct ocfs2_extent_rec *insert_rec)
3685 {
3686         int ret, i, next_free;
3687         struct buffer_head *bh;
3688         struct ocfs2_extent_list *el;
3689         struct ocfs2_extent_rec *rec;
3690
3691         /*
3692          * Update everything except the leaf block.
3693          */
3694         for (i = 0; i < path->p_tree_depth; i++) {
3695                 bh = path->p_node[i].bh;
3696                 el = path->p_node[i].el;
3697
3698                 next_free = le16_to_cpu(el->l_next_free_rec);
3699                 if (next_free == 0) {
3700                         ocfs2_error(inode->i_sb,
3701                                     "Dinode %llu has a bad extent list",
3702                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3703                         ret = -EIO;
3704                         return;
3705                 }
3706
3707                 rec = &el->l_recs[next_free - 1];
3708
3709                 rec->e_int_clusters = insert_rec->e_cpos;
3710                 le32_add_cpu(&rec->e_int_clusters,
3711                              le16_to_cpu(insert_rec->e_leaf_clusters));
3712                 le32_add_cpu(&rec->e_int_clusters,
3713                              -le32_to_cpu(rec->e_cpos));
3714
3715                 ret = ocfs2_journal_dirty(handle, bh);
3716                 if (ret)
3717                         mlog_errno(ret);
3718
3719         }
3720 }
3721
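     /*
      * Prepare the rightmost path for an appending insert: journal it and
      * stretch the interior records to cover insert_rec.  If the target
      * leaf is empty (or holds only an empty extent), the leaf to the
      * left may also need its rightmost records updated, so that path is
      * looked up and handed back to the caller.
      */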
3722 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3723                                     struct ocfs2_extent_rec *insert_rec,
3724                                     struct ocfs2_path *right_path,
3725                                     struct ocfs2_path **ret_left_path)
3726 {
3727         int ret, next_free;
3728         struct ocfs2_extent_list *el;
3729         struct ocfs2_path *left_path = NULL;
3730
3731         *ret_left_path = NULL;
3732
3733         /*
3734          * This shouldn't happen for non-trees. The extent rec cluster
3735          * count manipulation below only works for interior nodes.
3736          */
3737         BUG_ON(right_path->p_tree_depth == 0);
3738
3739         /*
3740          * If our appending insert is at the leftmost edge of a leaf,
3741          * then we might need to update the rightmost records of the
3742          * neighboring path.
3743          */
3744         el = path_leaf_el(right_path);
3745         next_free = le16_to_cpu(el->l_next_free_rec);
3746         if (next_free == 0 ||
3747             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3748                 u32 left_cpos;
3749
3750                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3751                                                     &left_cpos);
3752                 if (ret) {
3753                         mlog_errno(ret);
3754                         goto out;
3755                 }
3756
3757                 mlog(0, "Append may need a left path update. cpos: %u, "
3758                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3759                      left_cpos);
3760
3761                 /*
3762                  * No need to worry if the append is already in the
3763                  * leftmost leaf.
3764                  */
3765                 if (left_cpos) {
3766                         left_path = ocfs2_new_path(path_root_bh(right_path),
3767                                                    path_root_el(right_path));
3768                         if (!left_path) {
3769                                 ret = -ENOMEM;
3770                                 mlog_errno(ret);
3771                                 goto out;
3772                         }
3773
3774                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3775                         if (ret) {
3776                                 mlog_errno(ret);
3777                                 goto out;
3778                         }
3779
3780                         /*
3781                          * ocfs2_insert_path() will pass the left_path to the
3782                          * journal for us.
3783                          */
3784                 }
3785         }
3786
3787         ret = ocfs2_journal_access_path(inode, handle, right_path);
3788         if (ret) {
3789                 mlog_errno(ret);
3790                 goto out;
3791         }
3792
3793         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3794
3795         *ret_left_path = left_path;
3796         ret = 0;
3797 out:
3798         if (ret != 0)
3799                 ocfs2_free_path(left_path);
3800
3801         return ret;
3802 }
3803
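     /*
      * Leaf-level work for a split insert.  Locate the record being
      * split (it may have moved into right_path during rotation), trim
      * it with ocfs2_subtract_from_rec(), and rotate split_rec into the
      * appropriate leaf.
      */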
3804 static void ocfs2_split_record(struct inode *inode,
3805                                struct ocfs2_path *left_path,
3806                                struct ocfs2_path *right_path,
3807                                struct ocfs2_extent_rec *split_rec,
3808                                enum ocfs2_split_type split)
3809 {
3810         int index;
3811         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3812         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3813         struct ocfs2_extent_rec *rec, *tmprec;
3814
3815         right_el = path_leaf_el(right_path);
3816         if (left_path)
3817                 left_el = path_leaf_el(left_path);
3818
3819         el = right_el;
3820         insert_el = right_el;
3821         index = ocfs2_search_extent_list(el, cpos);
3822         if (index != -1) {
3823                 if (index == 0 && left_path) {
3824                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3825
3826                         /*
3827                          * This typically means that the record
3828                          * started in the left path but moved to the
3829                          * right as a result of rotation. We either
3830                          * move the existing record to the left, or we
3831                          * do the later insert there.
3832                          *
3833                          * In this case, the left path should always
3834                          * exist as the rotate code will have passed
3835                          * it back for a post-insert update.
3836                          */
3837
3838                         if (split == SPLIT_LEFT) {
3839                                 /*
3840                                  * It's a left split. Since we know
3841                                  * that the rotate code gave us an
3842                                  * empty extent in the left path, we
3843                                  * can just do the insert there.
3844                                  */
3845                                 insert_el = left_el;
3846                         } else {
3847                                 /*
3848                                  * Right split - we have to move the
3849                                  * existing record over to the left
3850                                  * leaf. The insert will be into the
3851                                  * newly created empty extent in the
3852                                  * right leaf.
3853                                  */
3854                                 tmprec = &right_el->l_recs[index];
3855                                 ocfs2_rotate_leaf(left_el, tmprec);
3856                                 el = left_el;
3857
3858                                 memset(tmprec, 0, sizeof(*tmprec));
3859                                 index = ocfs2_search_extent_list(left_el, cpos);
3860                                 BUG_ON(index == -1);
3861                         }
3862                 }
3863         } else {
3864                 BUG_ON(!left_path);
3865                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3866                 /*
3867                  * Left path is easy - we can just allow the insert to
3868                  * happen.
3869                  */
3870                 el = left_el;
3871                 insert_el = left_el;
3872                 index = ocfs2_search_extent_list(el, cpos);
3873                 BUG_ON(index == -1);
3874         }
3875
3876         rec = &el->l_recs[index];
3877         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3878         ocfs2_rotate_leaf(insert_el, split_rec);
3879 }
3880
3881 /*
3882  * This function only does inserts on an allocation b-tree. For tree
3883  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3884  *
3885  * right_path is the path we want to do the actual insert
3886  * in. left_path should only be passed in if we need to update that
3887  * portion of the tree after an edge insert.
3888  */
3889 static int ocfs2_insert_path(struct inode *inode,
3890                              handle_t *handle,
3891                              struct ocfs2_path *left_path,
3892                              struct ocfs2_path *right_path,
3893                              struct ocfs2_extent_rec *insert_rec,
3894                              struct ocfs2_insert_type *insert)
3895 {
3896         int ret, subtree_index;
3897         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3898
3899         if (left_path) {
3900                 int credits = handle->h_buffer_credits;
3901
3902                 /*
3903                  * There's a chance that left_path got passed back to
3904                  * us without being accounted for in the
3905                  * journal. Extend our transaction here to be sure we
3906                  * can change those blocks.
3907                  */
3908                 credits += left_path->p_tree_depth;
3909
3910                 ret = ocfs2_extend_trans(handle, credits);
3911                 if (ret < 0) {
3912                         mlog_errno(ret);
3913                         goto out;
3914                 }
3915
3916                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3917                 if (ret < 0) {
3918                         mlog_errno(ret);
3919                         goto out;
3920                 }
3921         }
3922
3923         /*
3924          * Pass both paths to the journal. The majority of inserts
3925          * will be touching all components anyway.
3926          */
3927         ret = ocfs2_journal_access_path(inode, handle, right_path);
3928         if (ret < 0) {
3929                 mlog_errno(ret);
3930                 goto out;
3931         }
3932
3933         if (insert->ins_split != SPLIT_NONE) {
3934                 /*
3935                  * We could call ocfs2_insert_at_leaf() for some types
3936                  * of splits, but it's easier to just let one separate
3937                  * function sort it all out.
3938                  */
3939                 ocfs2_split_record(inode, left_path, right_path,
3940                                    insert_rec, insert->ins_split);
3941
3942                 /*
3943                  * Split might have modified either leaf and we don't
3944                  * have a guarantee that the later edge insert will
3945                  * dirty this for us.
3946                  */
3947                 if (left_path) {
3948                         ret = ocfs2_journal_dirty(handle,
3949                                                   path_leaf_bh(left_path));
3950                         if (ret)
3951                                 mlog_errno(ret);
                     }
3952         } else
3953                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3954                                      insert, inode);
3955
3956         ret = ocfs2_journal_dirty(handle, leaf_bh);
3957         if (ret)
3958                 mlog_errno(ret);
3959
3960         if (left_path) {
3961                 /*
3962                  * The rotate code has indicated that we need to fix
3963                  * up portions of the tree after the insert.
3964                  *
3965                  * XXX: Should we extend the transaction here?
3966                  */
3967                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3968                                                         right_path);
3969                 ocfs2_complete_edge_insert(inode, handle, left_path,
3970                                            right_path, subtree_index);
3971         }
3972
3973         ret = 0;
3974 out:
3975         return ret;
3976 }
3977
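     /*
      * Top-level insert driver.  Journal the tree root, pick the path to
      * work against (the rightmost path for rotations, the target leaf
      * otherwise), let the rotate/append helpers prepare the tree, then
      * do the leaf insert via ocfs2_insert_path() and bump the total
      * cluster count for non-split inserts.
      */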
3978 static int ocfs2_do_insert_extent(struct inode *inode,
3979                                   handle_t *handle,
3980                                   struct ocfs2_extent_tree *et,
3981                                   struct ocfs2_extent_rec *insert_rec,
3982                                   struct ocfs2_insert_type *type)
3983 {
3984         int ret, rotate = 0;
3985         u32 cpos;
3986         struct ocfs2_path *right_path = NULL;
3987         struct ocfs2_path *left_path = NULL;
3988         struct ocfs2_extent_list *el;
3989
3990         el = et->et_root_el;
3991
3992         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3993                                    OCFS2_JOURNAL_ACCESS_WRITE);
3994         if (ret) {
3995                 mlog_errno(ret);
3996                 goto out;
3997         }
3998
3999         if (le16_to_cpu(el->l_tree_depth) == 0) {
4000                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
4001                 goto out_update_clusters;
4002         }
4003
4004         right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4005         if (!right_path) {
4006                 ret = -ENOMEM;
4007                 mlog_errno(ret);
4008                 goto out;
4009         }
4010
4011         /*
4012          * Determine the path to start with. Rotations need the
4013          * rightmost path, everything else can go directly to the
4014          * target leaf.
4015          */
4016         cpos = le32_to_cpu(insert_rec->e_cpos);
4017         if (type->ins_appending == APPEND_NONE &&
4018             type->ins_contig == CONTIG_NONE) {
4019                 rotate = 1;
4020                 cpos = UINT_MAX;
4021         }
4022
4023         ret = ocfs2_find_path(inode, right_path, cpos);
4024         if (ret) {
4025                 mlog_errno(ret);
4026                 goto out;
4027         }
4028
4029         /*
4030          * Rotations and appends need special treatment - they modify
4031          * parts of the tree above them.
4032          *
4033          * Both might pass back a path immediately to the left of the
4034          * one being inserted to. This will cause
4035          * ocfs2_insert_path() to modify the rightmost records of
4036          * left_path to account for an edge insert.
4037          *
4038          * XXX: When modifying this code, keep in mind that an insert
4039          * can wind up skipping both of these two special cases...
4040          */
4041         if (rotate) {
4042                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4043                                               le32_to_cpu(insert_rec->e_cpos),
4044                                               right_path, &left_path);
4045                 if (ret) {
4046                         mlog_errno(ret);
4047                         goto out;
4048                 }
4049
4050                 /*
4051                  * ocfs2_rotate_tree_right() might have extended the
4052                  * transaction without re-journaling our tree root.
4053                  */
4054                 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
4055                                            OCFS2_JOURNAL_ACCESS_WRITE);
4056                 if (ret) {
4057                         mlog_errno(ret);
4058                         goto out;
4059                 }
4060         } else if (type->ins_appending == APPEND_TAIL
4061                    && type->ins_contig != CONTIG_LEFT) {
4062                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4063                                                right_path, &left_path);
4064                 if (ret) {
4065                         mlog_errno(ret);
4066                         goto out;
4067                 }
4068         }
4069
4070         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4071                                 insert_rec, type);
4072         if (ret) {
4073                 mlog_errno(ret);
4074                 goto out;
4075         }
4076
4077 out_update_clusters:
4078         if (type->ins_split == SPLIT_NONE)
4079                 ocfs2_et_update_clusters(inode, et,
4080                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4081
4082         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4083         if (ret)
4084                 mlog_errno(ret);
4085
4086 out:
4087         ocfs2_free_path(left_path);
4088         ocfs2_free_path(right_path);
4089
4090         return ret;
4091 }
4092
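     /*
      * Decide whether split_rec can merge with the record to its left,
      * to its right, both, or neither.  The neighbors may live in
      * adjacent leaves, in which case the relevant paths are walked to
      * find them, and empty extent records are skipped over.
      */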
4093 static enum ocfs2_contig_type
4094 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4095                                struct ocfs2_extent_list *el, int index,
4096                                struct ocfs2_extent_rec *split_rec)
4097 {
4098         int status;
4099         enum ocfs2_contig_type ret = CONTIG_NONE;
4100         u32 left_cpos, right_cpos;
4101         struct ocfs2_extent_rec *rec = NULL;
4102         struct ocfs2_extent_list *new_el;
4103         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4104         struct buffer_head *bh;
4105         struct ocfs2_extent_block *eb;
4106
4107         if (index > 0) {
4108                 rec = &el->l_recs[index - 1];
4109         } else if (path->p_tree_depth > 0) {
4110                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4111                                                        path, &left_cpos);
4112                 if (status)
4113                         goto out;
4114
4115                 if (left_cpos != 0) {
4116                         left_path = ocfs2_new_path(path_root_bh(path),
4117                                                    path_root_el(path));
4118                         if (!left_path)
4119                                 goto out;
4120
4121                         status = ocfs2_find_path(inode, left_path, left_cpos);
4122                         if (status)
4123                                 goto out;
4124
4125                         new_el = path_leaf_el(left_path);
4126
4127                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4128                             le16_to_cpu(new_el->l_count)) {
4129                                 bh = path_leaf_bh(left_path);
4130                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4131                                 ocfs2_error(inode->i_sb,
4132                                             "Extent block #%llu has an "
4133                                             "invalid l_next_free_rec of "
4134                                             "%d.  It should have "
4135                                             "matched the l_count of %d",
4136                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4137                                             le16_to_cpu(new_el->l_next_free_rec),
4138                                             le16_to_cpu(new_el->l_count));
4139                                 status = -EINVAL;
4140                                 goto out;
4141                         }
4142                         rec = &new_el->l_recs[
4143                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4144                 }
4145         }
4146
4147         /*
4148          * We're careful to check for an empty extent record here -
4149          * the merge code will know what to do if it sees one.
4150          */
4151         if (rec) {
4152                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4153                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4154                                 ret = CONTIG_RIGHT;
4155                 } else {
4156                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4157                 }
4158         }
4159
4160         rec = NULL;
4161         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4162                 rec = &el->l_recs[index + 1];
4163         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4164                  path->p_tree_depth > 0) {
4165                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4166                                                         path, &right_cpos);
4167                 if (status)
4168                         goto out;
4169
4170                 if (right_cpos == 0)
4171                         goto out;
4172
4173                 right_path = ocfs2_new_path(path_root_bh(path),
4174                                             path_root_el(path));
4175                 if (!right_path)
4176                         goto out;
4177
4178                 status = ocfs2_find_path(inode, right_path, right_cpos);
4179                 if (status)
4180                         goto out;
4181
4182                 new_el = path_leaf_el(right_path);
4183                 rec = &new_el->l_recs[0];
4184                 if (ocfs2_is_empty_extent(rec)) {
4185                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4186                                 bh = path_leaf_bh(right_path);
4187                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4188                                 ocfs2_error(inode->i_sb,
4189                                             "Extent block #%llu has an "
4190                                             "invalid l_next_free_rec of %d",
4191                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4192                                             le16_to_cpu(new_el->l_next_free_rec));
4193                                 status = -EINVAL;
4194                                 goto out;
4195                         }
4196                         rec = &new_el->l_recs[1];
4197                 }
4198         }
4199
4200         if (rec) {
4201                 enum ocfs2_contig_type contig_type;
4202
4203                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4204
4205                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4206                         ret = CONTIG_LEFTRIGHT;
4207                 else if (ret == CONTIG_NONE)
4208                         ret = contig_type;
4209         }
4210
4211 out:
4212         if (left_path)
4213                 ocfs2_free_path(left_path);
4214         if (right_path)
4215                 ocfs2_free_path(right_path);
4216
4217         return ret;
4218 }
4219
4220 static void ocfs2_figure_contig_type(struct inode *inode,
4221                                      struct ocfs2_insert_type *insert,
4222                                      struct ocfs2_extent_list *el,
4223                                      struct ocfs2_extent_rec *insert_rec,
4224                                      struct ocfs2_extent_tree *et)
4225 {
4226         int i;
4227         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4228
4229         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4230
4231         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4232                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4233                                                   insert_rec);
4234                 if (contig_type != CONTIG_NONE) {
4235                         insert->ins_contig_index = i;
4236                         break;
4237                 }
4238         }
4239         insert->ins_contig = contig_type;
4240
4241         if (insert->ins_contig != CONTIG_NONE) {
4242                 struct ocfs2_extent_rec *rec =
4243                                 &el->l_recs[insert->ins_contig_index];
4244                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4245                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4246
4247                 /*
4248                  * Caller might want us to limit the size of extents; don't
4249                  * calculate contiguousness if we might exceed that limit.
4250                  */
4251                 if (et->et_max_leaf_clusters &&
4252                     (len > et->et_max_leaf_clusters))
4253                         insert->ins_contig = CONTIG_NONE;
4254         }
4255 }
4256
4257 /*
4258  * This should only be called against the rightmost leaf extent list.
4259  *
4260  * ocfs2_figure_appending_type() will figure out whether we'll have to
4261  * insert at the tail of the rightmost leaf.
4262  *
4263  * This should also work against the root extent list for trees with 0
4264  * depth. If we consider the root extent list to be the rightmost leaf node
4265  * then the logic here makes sense.
4266  */
4267 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4268                                         struct ocfs2_extent_list *el,
4269                                         struct ocfs2_extent_rec *insert_rec)
4270 {
4271         int i;
4272         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4273         struct ocfs2_extent_rec *rec;
4274
4275         insert->ins_appending = APPEND_NONE;
4276
4277         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4278
4279         if (!el->l_next_free_rec)
4280                 goto set_tail_append;
4281
4282         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4283                 /* Were all records empty? */
4284                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4285                         goto set_tail_append;
4286         }
4287
4288         i = le16_to_cpu(el->l_next_free_rec) - 1;
4289         rec = &el->l_recs[i];
4290
4291         if (cpos >=
4292             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4293                 goto set_tail_append;
4294
4295         return;
4296
4297 set_tail_append:
4298         insert->ins_appending = APPEND_TAIL;
4299 }
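/*
 * Illustrative worked example with hypothetical numbers: suppose the
 * rightmost leaf's last used record covers cpos 100 for 8 clusters,
 * i.e. the range [100, 108).  An insert at cpos 108 or beyond satisfies
 * the check above (cpos >= e_cpos + e_leaf_clusters) and is flagged
 * APPEND_TAIL; an insert at cpos 50 lands inside or before the existing
 * records and leaves ins_appending at APPEND_NONE.
 */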
4300
4301 /*
4302  * Helper function called at the beginning of an insert.
4303  *
4304  * This computes a few things that are commonly used in the process of
4305  * inserting into the btree:
4306  *   - Whether the new extent is contiguous with an existing one.
4307  *   - The current tree depth.
4308  *   - Whether the insert is an appending one.
4309  *   - The total # of free records in the tree.
4310  *
4311  * All of the information is stored on the ocfs2_insert_type
4312  * structure.
4313  */
4314 static int ocfs2_figure_insert_type(struct inode *inode,
4315                                     struct ocfs2_extent_tree *et,
4316                                     struct buffer_head **last_eb_bh,
4317                                     struct ocfs2_extent_rec *insert_rec,
4318                                     int *free_records,
4319                                     struct ocfs2_insert_type *insert)
4320 {
4321         int ret;
4322         struct ocfs2_extent_block *eb;
4323         struct ocfs2_extent_list *el;
4324         struct ocfs2_path *path = NULL;
4325         struct buffer_head *bh = NULL;
4326
4327         insert->ins_split = SPLIT_NONE;
4328
4329         el = et->et_root_el;
4330         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4331
4332         if (el->l_tree_depth) {
4333                 /*
4334                  * If we have tree depth, we read in the
4335                  * rightmost extent block ahead of time as
4336                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4337                  * may want it later.
4338                  */
4339                 ret = ocfs2_read_extent_block(inode,
4340                                               ocfs2_et_get_last_eb_blk(et),
4341                                               &bh);
4342                 if (ret) {
4343                         mlog_errno(ret);
4344                         goto out;
4345                 }
4346                 eb = (struct ocfs2_extent_block *) bh->b_data;
4347                 el = &eb->h_list;
4348         }
4349
4350         /*
4351          * Unless we have a contiguous insert, we'll need to know if
4352          * there is room left in our allocation tree for another
4353          * extent record.
4354          *
4355          * XXX: This test is simplistic; we can search for empty
4356          * extent records too.
4357          */
4358         *free_records = le16_to_cpu(el->l_count) -
4359                 le16_to_cpu(el->l_next_free_rec);
4360
4361         if (!insert->ins_tree_depth) {
4362                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4363                 ocfs2_figure_appending_type(insert, el, insert_rec);
4364                 return 0;
4365         }
4366
4367         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4368         if (!path) {
4369                 ret = -ENOMEM;
4370                 mlog_errno(ret);
4371                 goto out;
4372         }
4373
4374         /*
4375          * In the case that we're inserting past what the tree
4376          * currently accounts for, ocfs2_find_path() will return for
4377          * us the rightmost tree path. This is accounted for below in
4378          * the appending code.
4379          */
4380         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4381         if (ret) {
4382                 mlog_errno(ret);
4383                 goto out;
4384         }
4385
4386         el = path_leaf_el(path);
4387
4388         /*
4389          * Now that we have the path, there are two things we want to determine:
4390          * 1) Contiguousness (also set contig_index if this is so)
4391          *
4392          * 2) Are we doing an append? We can trivially break this up
4393          *     into two types of appends: simple record append, or a
4394          *     rotate inside the tail leaf.
4395          */
4396         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4397
4398         /*
4399          * The insert code isn't quite ready to deal with all cases of
4400          * left contiguousness. Specifically, if it's an insert into
4401          * the 1st record in a leaf, it will require the adjustment of
4402          * cluster count on the last record of the path directly to its
4403          * left. For now, just catch that case and fool the layers
4404          * above us. This works just fine for tree_depth == 0, which
4405          * is why we allow that above.
4406          */
4407         if (insert->ins_contig == CONTIG_LEFT &&
4408             insert->ins_contig_index == 0)
4409                 insert->ins_contig = CONTIG_NONE;
4410
4411         /*
4412          * Ok, so we can simply compare against last_eb to figure out
4413          * whether the path doesn't exist. This will only happen in
4414          * the case that we're doing a tail append, so maybe we can
4415          * take advantage of that information somehow.
4416          */
4417         if (ocfs2_et_get_last_eb_blk(et) ==
4418             path_leaf_bh(path)->b_blocknr) {
4419                 /*
4420                  * Ok, ocfs2_find_path() returned us the rightmost
4421                  * tree path. This might be an appending insert. There are
4422                  * two cases:
4423                  *    1) We're doing a true append at the tail:
4424                  *      -This might even be off the end of the leaf
4425                  *    2) We're "appending" by rotating in the tail
4426                  */
4427                 ocfs2_figure_appending_type(insert, el, insert_rec);
4428         }
4429
4430 out:
4431         ocfs2_free_path(path);
4432
4433         if (ret == 0)
4434                 *last_eb_bh = bh;
4435         else
4436                 brelse(bh);
4437         return ret;
4438 }
4439
4440 /*
4441  * Insert an extent into an inode btree.
4442  *
4443  * The caller needs to update fe->i_clusters
4444  */
4445 int ocfs2_insert_extent(struct ocfs2_super *osb,
4446                         handle_t *handle,
4447                         struct inode *inode,
4448                         struct ocfs2_extent_tree *et,
4449                         u32 cpos,
4450                         u64 start_blk,
4451                         u32 new_clusters,
4452                         u8 flags,
4453                         struct ocfs2_alloc_context *meta_ac)
4454 {
4455         int status;
4456         int uninitialized_var(free_records);
4457         struct buffer_head *last_eb_bh = NULL;
4458         struct ocfs2_insert_type insert = {0, };
4459         struct ocfs2_extent_rec rec;
4460
4461         mlog(0, "add %u clusters at position %u to inode %llu\n",
4462              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4463
4464         memset(&rec, 0, sizeof(rec));
4465         rec.e_cpos = cpu_to_le32(cpos);
4466         rec.e_blkno = cpu_to_le64(start_blk);
4467         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4468         rec.e_flags = flags;
4469         status = ocfs2_et_insert_check(inode, et, &rec);
4470         if (status) {
4471                 mlog_errno(status);
4472                 goto bail;
4473         }
4474
4475         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4476                                           &free_records, &insert);
4477         if (status < 0) {
4478                 mlog_errno(status);
4479                 goto bail;
4480         }
4481
4482         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4483              "Insert.contig_index: %d, Insert.free_records: %d, "
4484              "Insert.tree_depth: %d\n",
4485              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4486              free_records, insert.ins_tree_depth);
4487
4488         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4489                 status = ocfs2_grow_tree(inode, handle, et,
4490                                          &insert.ins_tree_depth, &last_eb_bh,
4491                                          meta_ac);
4492                 if (status) {
4493                         mlog_errno(status);
4494                         goto bail;
4495                 }
4496         }
4497
4498         /* Finally, we can add clusters. This might rotate the tree for us. */
4499         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4500         if (status < 0)
4501                 mlog_errno(status);
4502         else if (et->et_ops == &ocfs2_dinode_et_ops)
4503                 ocfs2_extent_map_insert_rec(inode, &rec);
4504
4505 bail:
4506         brelse(last_eb_bh);
4507
4508         mlog_exit(status);
4509         return status;
4510 }
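/*
 * Minimal caller sketch (hypothetical; it assumes local variables osb,
 * handle, di_bh, cpos, start_blk, new_clusters and meta_ac, and that
 * the caller already holds the inode's cluster lock, has a running
 * transaction with enough credits, and has reserved metadata in
 * meta_ac in case the tree must grow):
 *
 *	struct ocfs2_extent_tree et;
 *	int status;
 *
 *	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 *	status = ocfs2_insert_extent(osb, handle, inode, &et, cpos,
 *				     start_blk, new_clusters, 0, meta_ac);
 *	if (status < 0)
 *		mlog_errno(status);
 */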
4511
4512 /*
4513  * Allocate and add clusters into the extent b-tree.
4514  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4515  * The extent b-tree's root is specified by et, and
4516  * it is not limited to the file storage. Any extent tree can use this
4517  * function if it implements the proper ocfs2_extent_tree.
4518  */
4519 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4520                                 struct inode *inode,
4521                                 u32 *logical_offset,
4522                                 u32 clusters_to_add,
4523                                 int mark_unwritten,
4524                                 struct ocfs2_extent_tree *et,
4525                                 handle_t *handle,
4526                                 struct ocfs2_alloc_context *data_ac,
4527                                 struct ocfs2_alloc_context *meta_ac,
4528                                 enum ocfs2_alloc_restarted *reason_ret)
4529 {
4530         int status = 0;
4531         int free_extents;
4532         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4533         u32 bit_off, num_bits;
4534         u64 block;
4535         u8 flags = 0;
4536
4537         BUG_ON(!clusters_to_add);
4538
4539         if (mark_unwritten)
4540                 flags = OCFS2_EXT_UNWRITTEN;
4541
4542         free_extents = ocfs2_num_free_extents(osb, inode, et);
4543         if (free_extents < 0) {
4544                 status = free_extents;
4545                 mlog_errno(status);
4546                 goto leave;
4547         }
4548
4549         /* there are two cases which could cause us to EAGAIN in the
4550          * we-need-more-metadata case:
4551          * 1) we haven't reserved *any*
4552          * 2) we are so fragmented, we've needed to add metadata too
4553          *    many times. */
4554         if (!free_extents && !meta_ac) {
4555                 mlog(0, "we haven't reserved any metadata!\n");
4556                 status = -EAGAIN;
4557                 reason = RESTART_META;
4558                 goto leave;
4559         } else if ((!free_extents)
4560                    && (ocfs2_alloc_context_bits_left(meta_ac)
4561                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4562                 mlog(0, "filesystem is really fragmented...\n");
4563                 status = -EAGAIN;
4564                 reason = RESTART_META;
4565                 goto leave;
4566         }
4567
4568         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4569                                         clusters_to_add, &bit_off, &num_bits);
4570         if (status < 0) {
4571                 if (status != -ENOSPC)
4572                         mlog_errno(status);
4573                 goto leave;
4574         }
4575
4576         BUG_ON(num_bits > clusters_to_add);
4577
4578         /* reserve our write early -- insert_extent may update the inode */
4579         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4580                                       OCFS2_JOURNAL_ACCESS_WRITE);
4581         if (status < 0) {
4582                 mlog_errno(status);
4583                 goto leave;
4584         }
4585
4586         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4587         mlog(0, "Allocating %u clusters starting at cluster %u for inode %llu\n",
4588              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4589         status = ocfs2_insert_extent(osb, handle, inode, et,
4590                                      *logical_offset, block,
4591                                      num_bits, flags, meta_ac);
4592         if (status < 0) {
4593                 mlog_errno(status);
4594                 goto leave;
4595         }
4596
4597         status = ocfs2_journal_dirty(handle, et->et_root_bh);
4598         if (status < 0) {
4599                 mlog_errno(status);
4600                 goto leave;
4601         }
4602
4603         clusters_to_add -= num_bits;
4604         *logical_offset += num_bits;
4605
4606         if (clusters_to_add) {
4607                 mlog(0, "need to alloc once more, wanted = %u\n",
4608                      clusters_to_add);
4609                 status = -EAGAIN;
4610                 reason = RESTART_TRANS;
4611         }
4612
4613 leave:
4614         mlog_exit(status);
4615         if (reason_ret)
4616                 *reason_ret = reason;
4617         return status;
4618 }
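/*
 * Hedged usage sketch (hypothetical caller; variables why,
 * logical_start, clusters_to_add, data_ac and meta_ac are assumed).
 * A caller is expected to dispatch on the restart reason whenever
 * -EAGAIN comes back:
 *
 *	enum ocfs2_alloc_restarted why = RESTART_NONE;
 *
 *	status = ocfs2_add_clusters_in_btree(osb, inode, &logical_start,
 *					     clusters_to_add, 0, et, handle,
 *					     data_ac, meta_ac, &why);
 *	if (status == -EAGAIN) {
 *		if (why == RESTART_META)
 *			... reserve more metadata and restart the whole
 *			    operation ...
 *		else		(why == RESTART_TRANS)
 *			... extend the transaction and call again for
 *			    the clusters that remain ...
 *	}
 */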
4619
4620 static void ocfs2_make_right_split_rec(struct super_block *sb,
4621                                        struct ocfs2_extent_rec *split_rec,
4622                                        u32 cpos,
4623                                        struct ocfs2_extent_rec *rec)
4624 {
4625         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4626         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4627
4628         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4629
4630         split_rec->e_cpos = cpu_to_le32(cpos);
4631         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4632
4633         split_rec->e_blkno = rec->e_blkno;
4634         le64_add_cpu(&split_rec->e_blkno,
4635                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4636
4637         split_rec->e_flags = rec->e_flags;
4638 }
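/*
 * Worked example with hypothetical numbers: if 'rec' covers cpos 100
 * for 8 clusters starting at block B, then a split at cpos 103 yields
 * split_rec = { e_cpos = 103, e_leaf_clusters = 5,
 * e_blkno = B + ocfs2_clusters_to_blocks(sb, 3) }, i.e. the right-hand
 * portion [103, 108) of the original record, with its flags copied
 * unchanged.
 */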
4639
4640 static int ocfs2_split_and_insert(struct inode *inode,
4641                                   handle_t *handle,
4642                                   struct ocfs2_path *path,
4643                                   struct ocfs2_extent_tree *et,
4644                                   struct buffer_head **last_eb_bh,
4645                                   int split_index,
4646                                   struct ocfs2_extent_rec *orig_split_rec,
4647                                   struct ocfs2_alloc_context *meta_ac)
4648 {
4649         int ret = 0, depth;
4650         unsigned int insert_range, rec_range, do_leftright = 0;
4651         struct ocfs2_extent_rec tmprec;
4652         struct ocfs2_extent_list *rightmost_el;
4653         struct ocfs2_extent_rec rec;
4654         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4655         struct ocfs2_insert_type insert;
4656         struct ocfs2_extent_block *eb;
4657
4658 leftright:
4659         /*
4660          * Store a copy of the record on the stack - it might move
4661          * around as the tree is manipulated below.
4662          */
4663         rec = path_leaf_el(path)->l_recs[split_index];
4664
4665         rightmost_el = et->et_root_el;
4666
4667         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4668         if (depth) {
4669                 BUG_ON(!(*last_eb_bh));
4670                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4671                 rightmost_el = &eb->h_list;
4672         }
4673
4674         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4675             le16_to_cpu(rightmost_el->l_count)) {
4676                 ret = ocfs2_grow_tree(inode, handle, et,
4677                                       &depth, last_eb_bh, meta_ac);
4678                 if (ret) {
4679                         mlog_errno(ret);
4680                         goto out;
4681                 }
4682         }
4683
4684         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4685         insert.ins_appending = APPEND_NONE;
4686         insert.ins_contig = CONTIG_NONE;
4687         insert.ins_tree_depth = depth;
4688
4689         insert_range = le32_to_cpu(split_rec.e_cpos) +
4690                 le16_to_cpu(split_rec.e_leaf_clusters);
4691         rec_range = le32_to_cpu(rec.e_cpos) +
4692                 le16_to_cpu(rec.e_leaf_clusters);
4693
4694         if (split_rec.e_cpos == rec.e_cpos) {
4695                 insert.ins_split = SPLIT_LEFT;
4696         } else if (insert_range == rec_range) {
4697                 insert.ins_split = SPLIT_RIGHT;
4698         } else {
4699                 /*
4700                  * Left/right split. We fake this as a right split
4701                  * first and then make a second pass as a left split.
4702                  */
4703                 insert.ins_split = SPLIT_RIGHT;
4704
4705                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4706                                            &rec);
4707
4708                 split_rec = tmprec;
4709
4710                 BUG_ON(do_leftright);
4711                 do_leftright = 1;
4712         }
4713
4714         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4715         if (ret) {
4716                 mlog_errno(ret);
4717                 goto out;
4718         }
4719
4720         if (do_leftright == 1) {
4721                 u32 cpos;
4722                 struct ocfs2_extent_list *el;
4723
4724                 do_leftright++;
4725                 split_rec = *orig_split_rec;
4726
4727                 ocfs2_reinit_path(path, 1);
4728
4729                 cpos = le32_to_cpu(split_rec.e_cpos);
4730                 ret = ocfs2_find_path(inode, path, cpos);
4731                 if (ret) {
4732                         mlog_errno(ret);
4733                         goto out;
4734                 }
4735
4736                 el = path_leaf_el(path);
4737                 split_index = ocfs2_search_extent_list(el, cpos);
4738                 goto leftright;
4739         }
4740 out:
4741
4742         return ret;
4743 }
4744
4745 /*
4746  * Mark part or all of the extent record at split_index in the leaf
4747  * pointed to by path as written. This removes the unwritten
4748  * extent flag.
4749  *
4750  * Care is taken to handle contiguousness so as to not grow the tree.
4751  *
4752  * meta_ac is not strictly necessary - we only truly need it if growth
4753  * of the tree is required. All other cases will degrade into a less
4754  * optimal tree layout.
4755  *
4756  * last_eb_bh should be the rightmost leaf block for any extent
4757  * btree. Since a split may grow the tree or a merge might shrink it,
4758  * the caller cannot trust the contents of that buffer after this call.
4759  *
4760  * This code is optimized for readability - several passes might be
4761  * made over certain portions of the tree. All of those blocks will
4762  * have been brought into cache (and pinned via the journal), so the
4763  * extra overhead is not expressed in terms of disk reads.
4764  */
4765 static int __ocfs2_mark_extent_written(struct inode *inode,
4766                                        struct ocfs2_extent_tree *et,
4767                                        handle_t *handle,
4768                                        struct ocfs2_path *path,
4769                                        int split_index,
4770                                        struct ocfs2_extent_rec *split_rec,
4771                                        struct ocfs2_alloc_context *meta_ac,
4772                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4773 {
4774         int ret = 0;
4775         struct ocfs2_extent_list *el = path_leaf_el(path);
4776         struct buffer_head *last_eb_bh = NULL;
4777         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4778         struct ocfs2_merge_ctxt ctxt;
4779         struct ocfs2_extent_list *rightmost_el;
4780
4781         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4782                 ret = -EIO;
4783                 mlog_errno(ret);
4784                 goto out;
4785         }
4786
4787         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4788             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4789              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4790                 ret = -EIO;
4791                 mlog_errno(ret);
4792                 goto out;
4793         }
4794
4795         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4796                                                             split_index,
4797                                                             split_rec);
4798
4799         /*
4800          * The core merge / split code wants to know how much room is
4801          * left in this inode's allocation tree, so we pass the
4802          * rightmost extent list.
4803          */
4804         if (path->p_tree_depth) {
4805                 struct ocfs2_extent_block *eb;
4806
4807                 ret = ocfs2_read_extent_block(inode,
4808                                               ocfs2_et_get_last_eb_blk(et),
4809                                               &last_eb_bh);
4810                 if (ret) {
4811                         mlog_errno(ret);
4812                         goto out;
4813                 }
4814
4815                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4816                 rightmost_el = &eb->h_list;
4817         } else
4818                 rightmost_el = path_root_el(path);
4819
4820         if (rec->e_cpos == split_rec->e_cpos &&
4821             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4822                 ctxt.c_split_covers_rec = 1;
4823         else
4824                 ctxt.c_split_covers_rec = 0;
4825
4826         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4827
4828         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4829              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4830              ctxt.c_split_covers_rec);
4831
4832         if (ctxt.c_contig_type == CONTIG_NONE) {
4833                 if (ctxt.c_split_covers_rec)
4834                         el->l_recs[split_index] = *split_rec;
4835                 else
4836                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4837                                                      &last_eb_bh, split_index,
4838                                                      split_rec, meta_ac);
4839                 if (ret)
4840                         mlog_errno(ret);
4841         } else {
4842                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4843                                                 split_index, split_rec,
4844                                                 dealloc, &ctxt, et);
4845                 if (ret)
4846                         mlog_errno(ret);
4847         }
4848
4849 out:
4850         brelse(last_eb_bh);
4851         return ret;
4852 }
4853
4854 /*
4855  * Mark the already-existing extent at cpos as written for len clusters.
4856  *
4857  * If the existing extent is larger than the request, initiate a
4858  * split. An attempt will be made at merging with adjacent extents.
4859  *
4860  * The caller is responsible for passing down meta_ac if we'll need it.
4861  */
4862 int ocfs2_mark_extent_written(struct inode *inode,
4863                               struct ocfs2_extent_tree *et,
4864                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4865                               struct ocfs2_alloc_context *meta_ac,
4866                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4867 {
4868         int ret, index;
4869         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4870         struct ocfs2_extent_rec split_rec;
4871         struct ocfs2_path *left_path = NULL;
4872         struct ocfs2_extent_list *el;
4873
4874         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4875              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4876
4877         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4878                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4879                             "that are being written to, but the feature bit "
4880                             "is not set in the super block.",
4881                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4882                 ret = -EROFS;
4883                 goto out;
4884         }
4885
4886         /*
4887          * XXX: This should be fixed up so that we just re-insert the
4888          * next extent records.
4889          *
4890          * XXX: This is a hack on the extent tree, maybe it should be
4891          * an op?
4892          */
4893         if (et->et_ops == &ocfs2_dinode_et_ops)
4894                 ocfs2_extent_map_trunc(inode, 0);
4895
4896         left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4897         if (!left_path) {
4898                 ret = -ENOMEM;
4899                 mlog_errno(ret);
4900                 goto out;
4901         }
4902
4903         ret = ocfs2_find_path(inode, left_path, cpos);
4904         if (ret) {
4905                 mlog_errno(ret);
4906                 goto out;
4907         }
4908         el = path_leaf_el(left_path);
4909
4910         index = ocfs2_search_extent_list(el, cpos);
4911         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4912                 ocfs2_error(inode->i_sb,
4913                             "Inode %llu has an extent at cpos %u which can no "
4914                             "longer be found.\n",
4915                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4916                 ret = -EROFS;
4917                 goto out;
4918         }
4919
4920         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4921         split_rec.e_cpos = cpu_to_le32(cpos);
4922         split_rec.e_leaf_clusters = cpu_to_le16(len);
4923         split_rec.e_blkno = cpu_to_le64(start_blkno);
4924         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4925         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4926
4927         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4928                                           index, &split_rec, meta_ac,
4929                                           dealloc);
4930         if (ret)
4931                 mlog_errno(ret);
4932
4933 out:
4934         ocfs2_free_path(left_path);
4935         return ret;
4936 }
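/*
 * Hedged caller sketch (hypothetical; variables osb, handle, cpos, len,
 * phys and meta_ac are assumed).  The caller is expected to hold the
 * inode's cluster lock and a running transaction, and to provide a
 * dealloc context for any extent blocks freed by merging; meta_ac is
 * only strictly needed if a split may grow the tree:
 *
 *	struct ocfs2_cached_dealloc_ctxt dealloc;
 *	int ret;
 *
 *	ocfs2_init_dealloc_ctxt(&dealloc);
 *	ret = ocfs2_mark_extent_written(inode, et, handle, cpos, len,
 *					phys, meta_ac, &dealloc);
 *	if (ret)
 *		mlog_errno(ret);
 *
 * Once the transaction has been committed and the locks dropped, the
 * queued frees are released with ocfs2_run_deallocs(osb, &dealloc).
 */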
4937
4938 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4939                             handle_t *handle, struct ocfs2_path *path,
4940                             int index, u32 new_range,
4941                             struct ocfs2_alloc_context *meta_ac)
4942 {
4943         int ret, depth, credits = handle->h_buffer_credits;
4944         struct buffer_head *last_eb_bh = NULL;
4945         struct ocfs2_extent_block *eb;
4946         struct ocfs2_extent_list *rightmost_el, *el;
4947         struct ocfs2_extent_rec split_rec;
4948         struct ocfs2_extent_rec *rec;
4949         struct ocfs2_insert_type insert;
4950
4951         /*
4952          * Setup the record to split before we grow the tree.
4953          */
4954         el = path_leaf_el(path);
4955         rec = &el->l_recs[index];
4956         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4957
4958         depth = path->p_tree_depth;
4959         if (depth > 0) {
4960                 ret = ocfs2_read_extent_block(inode,
4961                                               ocfs2_et_get_last_eb_blk(et),
4962                                               &last_eb_bh);
4963                 if (ret < 0) {
4964                         mlog_errno(ret);
4965                         goto out;
4966                 }
4967
4968                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4969                 rightmost_el = &eb->h_list;
4970         } else
4971                 rightmost_el = path_leaf_el(path);
4972
4973         credits += path->p_tree_depth +
4974                    ocfs2_extend_meta_needed(et->et_root_el);
4975         ret = ocfs2_extend_trans(handle, credits);
4976         if (ret) {
4977                 mlog_errno(ret);
4978                 goto out;
4979         }
4980
4981         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4982             le16_to_cpu(rightmost_el->l_count)) {
4983                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4984                                       meta_ac);
4985                 if (ret) {
4986                         mlog_errno(ret);
4987                         goto out;
4988                 }
4989         }
4990
4991         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4992         insert.ins_appending = APPEND_NONE;
4993         insert.ins_contig = CONTIG_NONE;
4994         insert.ins_split = SPLIT_RIGHT;
4995         insert.ins_tree_depth = depth;
4996
4997         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4998         if (ret)
4999                 mlog_errno(ret);
5000
5001 out:
5002         brelse(last_eb_bh);
5003         return ret;
5004 }
5005
5006 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5007                               struct ocfs2_path *path, int index,
5008                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5009                               u32 cpos, u32 len,
5010                               struct ocfs2_extent_tree *et)
5011 {
5012         int ret;
5013         u32 left_cpos, rec_range, trunc_range;
5014         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5015         struct super_block *sb = inode->i_sb;
5016         struct ocfs2_path *left_path = NULL;
5017         struct ocfs2_extent_list *el = path_leaf_el(path);
5018         struct ocfs2_extent_rec *rec;
5019         struct ocfs2_extent_block *eb;
5020
5021         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5022                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5023                 if (ret) {
5024                         mlog_errno(ret);
5025                         goto out;
5026                 }
5027
5028                 index--;
5029         }
5030
5031         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5032             path->p_tree_depth) {
5033                 /*
5034                  * Check whether this is the rightmost tree record. If
5035                  * we remove all of this record or part of its right
5036                  * edge then an update of the record lengths above it
5037                  * will be required.
5038                  */
5039                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5040                 if (eb->h_next_leaf_blk == 0)
5041                         is_rightmost_tree_rec = 1;
5042         }
5043
5044         rec = &el->l_recs[index];
5045         if (index == 0 && path->p_tree_depth &&
5046             le32_to_cpu(rec->e_cpos) == cpos) {
5047                 /*
5048                  * Changing the leftmost offset (via partial or whole
5049                  * record truncate) of an interior (or rightmost) path
5050                  * means we have to update the subtree that is formed
5051          * by this leaf and the one to its left.
5052                  *
5053                  * There are two cases we can skip:
5054                  *   1) Path is the leftmost one in our inode tree.
5055                  *   2) The leaf is rightmost and will be empty after
5056                  *      we remove the extent record - the rotate code
5057                  *      knows how to update the newly formed edge.
5058                  */
5059
5060                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5061                                                     &left_cpos);
5062                 if (ret) {
5063                         mlog_errno(ret);
5064                         goto out;
5065                 }
5066
5067                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5068                         left_path = ocfs2_new_path(path_root_bh(path),
5069                                                    path_root_el(path));
5070                         if (!left_path) {
5071                                 ret = -ENOMEM;
5072                                 mlog_errno(ret);
5073                                 goto out;
5074                         }
5075
5076                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5077                         if (ret) {
5078                                 mlog_errno(ret);
5079                                 goto out;
5080                         }
5081                 }
5082         }
5083
5084         ret = ocfs2_extend_rotate_transaction(handle, 0,
5085                                               handle->h_buffer_credits,
5086                                               path);
5087         if (ret) {
5088                 mlog_errno(ret);
5089                 goto out;
5090         }
5091
5092         ret = ocfs2_journal_access_path(inode, handle, path);
5093         if (ret) {
5094                 mlog_errno(ret);
5095                 goto out;
5096         }
5097
5098         ret = ocfs2_journal_access_path(inode, handle, left_path);
5099         if (ret) {
5100                 mlog_errno(ret);
5101                 goto out;
5102         }
5103
5104         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5105         trunc_range = cpos + len;
5106
5107         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5108                 int next_free;
5109
5110                 memset(rec, 0, sizeof(*rec));
5111                 ocfs2_cleanup_merge(el, index);
5112                 wants_rotate = 1;
5113
5114                 next_free = le16_to_cpu(el->l_next_free_rec);
5115                 if (is_rightmost_tree_rec && next_free > 1) {
5116                         /*
5117                          * We skip the edge update if this path will
5118                          * be deleted by the rotate code.
5119                          */
5120                         rec = &el->l_recs[next_free - 1];
5121                         ocfs2_adjust_rightmost_records(inode, handle, path,
5122                                                        rec);
5123                 }
5124         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5125                 /* Remove leftmost portion of the record. */
5126                 le32_add_cpu(&rec->e_cpos, len);
5127                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5128                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5129         } else if (rec_range == trunc_range) {
5130                 /* Remove rightmost portion of the record */
5131                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5132                 if (is_rightmost_tree_rec)
5133                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5134         } else {
5135                 /* Caller should have trapped this. */
5136                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5137                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5138                      le32_to_cpu(rec->e_cpos),
5139                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5140                 BUG();
5141         }
5142
5143         if (left_path) {
5144                 int subtree_index;
5145
5146                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5147                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5148                                            subtree_index);
5149         }
5150
5151         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5152
5153         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5154         if (ret) {
5155                 mlog_errno(ret);
5156                 goto out;
5157         }
5158
5159 out:
5160         ocfs2_free_path(left_path);
5161         return ret;
5162 }
5163
5164 int ocfs2_remove_extent(struct inode *inode,
5165                         struct ocfs2_extent_tree *et,
5166                         u32 cpos, u32 len, handle_t *handle,
5167                         struct ocfs2_alloc_context *meta_ac,
5168                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5169 {
5170         int ret, index;
5171         u32 rec_range, trunc_range;
5172         struct ocfs2_extent_rec *rec;
5173         struct ocfs2_extent_list *el;
5174         struct ocfs2_path *path = NULL;
5175
5176         ocfs2_extent_map_trunc(inode, 0);
5177
5178         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
5179         if (!path) {
5180                 ret = -ENOMEM;
5181                 mlog_errno(ret);
5182                 goto out;
5183         }
5184
5185         ret = ocfs2_find_path(inode, path, cpos);
5186         if (ret) {
5187                 mlog_errno(ret);
5188                 goto out;
5189         }
5190
5191         el = path_leaf_el(path);
5192         index = ocfs2_search_extent_list(el, cpos);
5193         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5194                 ocfs2_error(inode->i_sb,
5195                             "Inode %llu has an extent at cpos %u which can no "
5196                             "longer be found.\n",
5197                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5198                 ret = -EROFS;
5199                 goto out;
5200         }
5201
5202         /*
5203          * We have 3 cases of extent removal:
5204          *   1) Range covers the entire extent rec
5205          *   2) Range begins or ends on one edge of the extent rec
5206          *   3) Range is in the middle of the extent rec (no shared edges)
5207          *
5208          * For case 1 we remove the extent rec and left rotate to
5209          * fill the hole.
5210          *
5211          * For case 2 we just shrink the existing extent rec, with a
5212          * tree update if the shrinking edge is also the edge of an
5213          * extent block.
5214          *
5215          * For case 3 we do a right split to turn the extent rec into
5216          * something case 2 can handle.
5217          */
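        /*
         * Worked example with hypothetical numbers: if rec covers
         * [100, 120), then removing [100, 120) is case 1, removing
         * [100, 110) or [110, 120) is case 2, and removing [105, 115)
         * is case 3: the record is first split at 115, leaving
         * [100, 115) and [115, 120), after which [105, 115) ends on
         * the right edge of the left-hand piece and case 2 applies.
         */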
5218         rec = &el->l_recs[index];
5219         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5220         trunc_range = cpos + len;
5221
5222         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5223
5224         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5225              "(cpos %u, len %u)\n",
5226              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5227              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5228
5229         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5230                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5231                                          cpos, len, et);
5232                 if (ret) {
5233                         mlog_errno(ret);
5234                         goto out;
5235                 }
5236         } else {
5237                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5238                                        trunc_range, meta_ac);
5239                 if (ret) {
5240                         mlog_errno(ret);
5241                         goto out;
5242                 }
5243
5244                 /*
5245                  * The split could have manipulated the tree enough to
5246                  * move the record location, so we have to look for it again.
5247                  */
5248                 ocfs2_reinit_path(path, 1);
5249
5250                 ret = ocfs2_find_path(inode, path, cpos);
5251                 if (ret) {
5252                         mlog_errno(ret);
5253                         goto out;
5254                 }
5255
5256                 el = path_leaf_el(path);
5257                 index = ocfs2_search_extent_list(el, cpos);
5258                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5259                         ocfs2_error(inode->i_sb,
5260                                     "Inode %llu: split at cpos %u lost record.",
5261                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5262                                     cpos);
5263                         ret = -EROFS;
5264                         goto out;
5265                 }
5266
5267                 /*
5268                  * Double check our values here. If anything is fishy,
5269                  * it's easier to catch it at the top level.
5270                  */
5271                 rec = &el->l_recs[index];
5272                 rec_range = le32_to_cpu(rec->e_cpos) +
5273                         ocfs2_rec_clusters(el, rec);
5274                 if (rec_range != trunc_range) {
5275                         ocfs2_error(inode->i_sb,
5276                                     "Inode %llu: error after split at cpos %u, "
5277                                     "trunc len %u, existing record is (%u,%u)",
5278                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5279                                     cpos, len, le32_to_cpu(rec->e_cpos),
5280                                     ocfs2_rec_clusters(el, rec));
5281                         ret = -EROFS;
5282                         goto out;
5283                 }
5284
5285                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5286                                          cpos, len, et);
5287                 if (ret) {
5288                         mlog_errno(ret);
5289                         goto out;
5290                 }
5291         }
5292
5293 out:
5294         ocfs2_free_path(path);
5295         return ret;
5296 }
5297
5298 int ocfs2_remove_btree_range(struct inode *inode,
5299                              struct ocfs2_extent_tree *et,
5300                              u32 cpos, u32 phys_cpos, u32 len,
5301                              struct ocfs2_cached_dealloc_ctxt *dealloc)
5302 {
5303         int ret;
5304         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5305         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5306         struct inode *tl_inode = osb->osb_tl_inode;
5307         handle_t *handle;
5308         struct ocfs2_alloc_context *meta_ac = NULL;
5309
5310         ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5311         if (ret) {
5312                 mlog_errno(ret);
5313                 return ret;
5314         }
5315
5316         mutex_lock(&tl_inode->i_mutex);
5317
5318         if (ocfs2_truncate_log_needs_flush(osb)) {
5319                 ret = __ocfs2_flush_truncate_log(osb);
5320                 if (ret < 0) {
5321                         mlog_errno(ret);
5322                         goto out;
5323                 }
5324         }
5325
5326         handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5327         if (IS_ERR(handle)) {
5328                 ret = PTR_ERR(handle);
5329                 mlog_errno(ret);
5330                 goto out;
5331         }
5332
5333         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
5334                                    OCFS2_JOURNAL_ACCESS_WRITE);
5335         if (ret) {
5336                 mlog_errno(ret);
5337                 goto out;
5338         }
5339
5340         ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5341                                   dealloc);
5342         if (ret) {
5343                 mlog_errno(ret);
5344                 goto out_commit;
5345         }
5346
5347         ocfs2_et_update_clusters(inode, et, -len);
5348
5349         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5350         if (ret) {
5351                 mlog_errno(ret);
5352                 goto out_commit;
5353         }
5354
5355         ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5356         if (ret)
5357                 mlog_errno(ret);
5358
5359 out_commit:
5360         ocfs2_commit_trans(osb, handle);
5361 out:
5362         mutex_unlock(&tl_inode->i_mutex);
5363
5364         if (meta_ac)
5365                 ocfs2_free_alloc_context(meta_ac);
5366
5367         return ret;
5368 }
5369
5370 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5371 {
5372         struct buffer_head *tl_bh = osb->osb_tl_bh;
5373         struct ocfs2_dinode *di;
5374         struct ocfs2_truncate_log *tl;
5375
5376         di = (struct ocfs2_dinode *) tl_bh->b_data;
5377         tl = &di->id2.i_dealloc;
5378
5379         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5380                         "slot %d, invalid truncate log parameters: used = "
5381                         "%u, count = %u\n", osb->slot_num,
5382                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5383         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5384 }
5385
5386 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5387                                            unsigned int new_start)
5388 {
5389         unsigned int tail_index;
5390         unsigned int current_tail;
5391
5392         /* No records, nothing to coalesce */
5393         if (!le16_to_cpu(tl->tl_used))
5394                 return 0;
5395
5396         tail_index = le16_to_cpu(tl->tl_used) - 1;
5397         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5398         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5399
5400         return current_tail == new_start;
5401 }
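/*
 * Worked example with hypothetical numbers: if the tail record is
 * { t_start = 100, t_clusters = 50 }, a new range starting at cluster
 * 150 can be coalesced into it (100 + 50 == 150), while one starting
 * at cluster 151 cannot and will consume a new truncate log record.
 */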
5402
5403 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5404                               handle_t *handle,
5405                               u64 start_blk,
5406                               unsigned int num_clusters)
5407 {
5408         int status, index;
5409         unsigned int start_cluster, tl_count;
5410         struct inode *tl_inode = osb->osb_tl_inode;
5411         struct buffer_head *tl_bh = osb->osb_tl_bh;
5412         struct ocfs2_dinode *di;
5413         struct ocfs2_truncate_log *tl;
5414
5415         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5416                    (unsigned long long)start_blk, num_clusters);
5417
5418         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5419
5420         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5421
5422         di = (struct ocfs2_dinode *) tl_bh->b_data;
5423
5424         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5425          * by the underlying call to ocfs2_read_inode_block(), so any
5426          * corruption is a code bug */
5427         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5428
5429         tl = &di->id2.i_dealloc;
5430         tl_count = le16_to_cpu(tl->tl_count);
5431         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5432                         tl_count == 0,
5433                         "Truncate record count on #%llu invalid: "
5434                         "wanted %u, actual %u\n",
5435                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5436                         ocfs2_truncate_recs_per_inode(osb->sb),
5437                         le16_to_cpu(tl->tl_count));
5438
5439         /* Caller should have known to flush before calling us. */
5440         index = le16_to_cpu(tl->tl_used);
5441         if (index >= tl_count) {
5442                 status = -ENOSPC;
5443                 mlog_errno(status);
5444                 goto bail;
5445         }
5446
5447         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5448                                       OCFS2_JOURNAL_ACCESS_WRITE);
5449         if (status < 0) {
5450                 mlog_errno(status);
5451                 goto bail;
5452         }
5453
5454         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5455              "%llu (index = %d)\n", num_clusters, start_cluster,
5456              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5457
5458         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5459                 /*
5460                  * Move index back to the record we are coalescing with.
5461                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5462                  */
5463                 index--;
5464
5465                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5466                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5467                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5468                      num_clusters);
5469         } else {
5470                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5471                 tl->tl_used = cpu_to_le16(index + 1);
5472         }
5473         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5474
5475         status = ocfs2_journal_dirty(handle, tl_bh);
5476         if (status < 0) {
5477                 mlog_errno(status);
5478                 goto bail;
5479         }
5480
5481 bail:
5482         mlog_exit(status);
5483         return status;
5484 }
5485
5486 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5487                                          handle_t *handle,
5488                                          struct inode *data_alloc_inode,
5489                                          struct buffer_head *data_alloc_bh)
5490 {
5491         int status = 0;
5492         int i;
5493         unsigned int num_clusters;
5494         u64 start_blk;
5495         struct ocfs2_truncate_rec rec;
5496         struct ocfs2_dinode *di;
5497         struct ocfs2_truncate_log *tl;
5498         struct inode *tl_inode = osb->osb_tl_inode;
5499         struct buffer_head *tl_bh = osb->osb_tl_bh;
5500
5501         mlog_entry_void();
5502
5503         di = (struct ocfs2_dinode *) tl_bh->b_data;
5504         tl = &di->id2.i_dealloc;
5505         i = le16_to_cpu(tl->tl_used) - 1;
5506         while (i >= 0) {
5507                 /* Caller has given us at least enough credits to
5508                  * update the truncate log dinode */
5509                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5510                                               OCFS2_JOURNAL_ACCESS_WRITE);
5511                 if (status < 0) {
5512                         mlog_errno(status);
5513                         goto bail;
5514                 }
5515
5516                 tl->tl_used = cpu_to_le16(i);
5517
5518                 status = ocfs2_journal_dirty(handle, tl_bh);
5519                 if (status < 0) {
5520                         mlog_errno(status);
5521                         goto bail;
5522                 }
5523
5524                 /* TODO: Perhaps we can calculate the bulk of the
5525                  * credits up front rather than extending like
5526                  * this. */
5527                 status = ocfs2_extend_trans(handle,
5528                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5529                 if (status < 0) {
5530                         mlog_errno(status);
5531                         goto bail;
5532                 }
5533
5534                 rec = tl->tl_recs[i];
5535                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5536                                                     le32_to_cpu(rec.t_start));
5537                 num_clusters = le32_to_cpu(rec.t_clusters);
5538
5539                 /* if start_blk is not set, we ignore the record as
5540                  * invalid. */
5541                 if (start_blk) {
5542                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5543                              i, le32_to_cpu(rec.t_start), num_clusters);
5544
5545                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5546                                                      data_alloc_bh, start_blk,
5547                                                      num_clusters);
5548                         if (status < 0) {
5549                                 mlog_errno(status);
5550                                 goto bail;
5551                         }
5552                 }
5553                 i--;
5554         }
5555
5556 bail:
5557         mlog_exit(status);
5558         return status;
5559 }
5560
5561 /* Expects you to already be holding tl_inode->i_mutex */
5562 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5563 {
5564         int status;
5565         unsigned int num_to_flush;
5566         handle_t *handle;
5567         struct inode *tl_inode = osb->osb_tl_inode;
5568         struct inode *data_alloc_inode = NULL;
5569         struct buffer_head *tl_bh = osb->osb_tl_bh;
5570         struct buffer_head *data_alloc_bh = NULL;
5571         struct ocfs2_dinode *di;
5572         struct ocfs2_truncate_log *tl;
5573
5574         mlog_entry_void();
5575
5576         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5577
5578         di = (struct ocfs2_dinode *) tl_bh->b_data;
5579
5580         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5581          * by the underlying call to ocfs2_read_inode_block(), so any
5582          * corruption is a code bug */
5583         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5584
5585         tl = &di->id2.i_dealloc;
5586         num_to_flush = le16_to_cpu(tl->tl_used);
5587         mlog(0, "Flush %u records from truncate log #%llu\n",
5588              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5589         if (!num_to_flush) {
5590                 status = 0;
5591                 goto out;
5592         }
5593
5594         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5595                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5596                                                        OCFS2_INVALID_SLOT);
5597         if (!data_alloc_inode) {
5598                 status = -EINVAL;
5599                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5600                 goto out;
5601         }
5602
5603         mutex_lock(&data_alloc_inode->i_mutex);
5604
5605         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5606         if (status < 0) {
5607                 mlog_errno(status);
5608                 goto out_mutex;
5609         }
5610
5611         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5612         if (IS_ERR(handle)) {
5613                 status = PTR_ERR(handle);
5614                 mlog_errno(status);
5615                 goto out_unlock;
5616         }
5617
5618         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5619                                                data_alloc_bh);
5620         if (status < 0)
5621                 mlog_errno(status);
5622
5623         ocfs2_commit_trans(osb, handle);
5624
5625 out_unlock:
5626         brelse(data_alloc_bh);
5627         ocfs2_inode_unlock(data_alloc_inode, 1);
5628
5629 out_mutex:
5630         mutex_unlock(&data_alloc_inode->i_mutex);
5631         iput(data_alloc_inode);
5632
5633 out:
5634         mlog_exit(status);
5635         return status;
5636 }
5637
5638 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5639 {
5640         int status;
5641         struct inode *tl_inode = osb->osb_tl_inode;
5642
5643         mutex_lock(&tl_inode->i_mutex);
5644         status = __ocfs2_flush_truncate_log(osb);
5645         mutex_unlock(&tl_inode->i_mutex);
5646
5647         return status;
5648 }
5649
5650 static void ocfs2_truncate_log_worker(struct work_struct *work)
5651 {
5652         int status;
5653         struct ocfs2_super *osb =
5654                 container_of(work, struct ocfs2_super,
5655                              osb_truncate_log_wq.work);
5656
5657         mlog_entry_void();
5658
5659         status = ocfs2_flush_truncate_log(osb);
5660         if (status < 0)
5661                 mlog_errno(status);
5662         else
5663                 ocfs2_init_inode_steal_slot(osb);
5664
5665         mlog_exit(status);
5666 }
5667
5668 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5669 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5670                                        int cancel)
5671 {
5672         if (osb->osb_tl_inode) {
5673                 /* We want to push off log flushes while truncates are
5674                  * still running. */
5675                 if (cancel)
5676                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5677
5678                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5679                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5680         }
5681 }
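
/*
 * The 'cancel' argument above gives callers a cheap way to keep pushing
 * the flush into the future while they are still queueing truncates:
 * calling ocfs2_schedule_truncate_log_flush(osb, 1) cancels any
 * already-pending work item before re-queueing it another
 * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ, roughly two seconds) out.
 */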
5682
5683 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5684                                        int slot_num,
5685                                        struct inode **tl_inode,
5686                                        struct buffer_head **tl_bh)
5687 {
5688         int status;
5689         struct inode *inode = NULL;
5690         struct buffer_head *bh = NULL;
5691
5692         inode = ocfs2_get_system_file_inode(osb,
5693                                            TRUNCATE_LOG_SYSTEM_INODE,
5694                                            slot_num);
5695         if (!inode) {
5696                 status = -EINVAL;
5697                 mlog(ML_ERROR, "Could not load truncate log inode!\n");
5698                 goto bail;
5699         }
5700
5701         status = ocfs2_read_inode_block(inode, &bh);
5702         if (status < 0) {
5703                 iput(inode);
5704                 mlog_errno(status);
5705                 goto bail;
5706         }
5707
5708         *tl_inode = inode;
5709         *tl_bh    = bh;
5710 bail:
5711         mlog_exit(status);
5712         return status;
5713 }
5714
5715 /* called during the 1st stage of node recovery. we stamp a clean
5716  * truncate log and pass back a copy for processing later. if the
5717  * truncate log does not require processing, *tl_copy is set to
5718  * NULL. */
5719 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5720                                       int slot_num,
5721                                       struct ocfs2_dinode **tl_copy)
5722 {
5723         int status;
5724         struct inode *tl_inode = NULL;
5725         struct buffer_head *tl_bh = NULL;
5726         struct ocfs2_dinode *di;
5727         struct ocfs2_truncate_log *tl;
5728
5729         *tl_copy = NULL;
5730
5731         mlog(0, "recover truncate log from slot %d\n", slot_num);
5732
5733         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5734         if (status < 0) {
5735                 mlog_errno(status);
5736                 goto bail;
5737         }
5738
5739         di = (struct ocfs2_dinode *) tl_bh->b_data;
5740
5741         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5742          * validated by the underlying call to ocfs2_read_inode_block(),
5743          * so any corruption is a code bug */
5744         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5745
5746         tl = &di->id2.i_dealloc;
5747         if (le16_to_cpu(tl->tl_used)) {
5748                 mlog(0, "We'll have %u logs to recover\n",
5749                      le16_to_cpu(tl->tl_used));
5750
5751                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5752                 if (!(*tl_copy)) {
5753                         status = -ENOMEM;
5754                         mlog_errno(status);
5755                         goto bail;
5756                 }
5757
5758                 /* Assuming the write-out below goes well, this copy
5759                  * will be passed back to recovery for processing. */
5760                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5761
5762                 /* All we need to do to clear the truncate log is set
5763                  * tl_used. */
5764                 tl->tl_used = 0;
5765
5766                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5767                 if (status < 0) {
5768                         mlog_errno(status);
5769                         goto bail;
5770                 }
5771         }
5772
5773 bail:
5774         if (tl_inode)
5775                 iput(tl_inode);
5776         brelse(tl_bh);
5777
5778         if (status < 0 && (*tl_copy)) {
5779                 kfree(*tl_copy);
5780                 *tl_copy = NULL;
5781         }
5782
5783         mlog_exit(status);
5784         return status;
5785 }
5786
5787 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5788                                          struct ocfs2_dinode *tl_copy)
5789 {
5790         int status = 0;
5791         int i;
5792         unsigned int clusters, num_recs, start_cluster;
5793         u64 start_blk;
5794         handle_t *handle;
5795         struct inode *tl_inode = osb->osb_tl_inode;
5796         struct ocfs2_truncate_log *tl;
5797
5798         mlog_entry_void();
5799
5800         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5801                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5802                 return -EINVAL;
5803         }
5804
5805         tl = &tl_copy->id2.i_dealloc;
5806         num_recs = le16_to_cpu(tl->tl_used);
5807         mlog(0, "cleanup %u records from %llu\n", num_recs,
5808              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5809
5810         mutex_lock(&tl_inode->i_mutex);
5811         for(i = 0; i < num_recs; i++) {
5812                 if (ocfs2_truncate_log_needs_flush(osb)) {
5813                         status = __ocfs2_flush_truncate_log(osb);
5814                         if (status < 0) {
5815                                 mlog_errno(status);
5816                                 goto bail_up;
5817                         }
5818                 }
5819
5820                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5821                 if (IS_ERR(handle)) {
5822                         status = PTR_ERR(handle);
5823                         mlog_errno(status);
5824                         goto bail_up;
5825                 }
5826
5827                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5828                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5829                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5830
5831                 status = ocfs2_truncate_log_append(osb, handle,
5832                                                    start_blk, clusters);
5833                 ocfs2_commit_trans(osb, handle);
5834                 if (status < 0) {
5835                         mlog_errno(status);
5836                         goto bail_up;
5837                 }
5838         }
5839
5840 bail_up:
5841         mutex_unlock(&tl_inode->i_mutex);
5842
5843         mlog_exit(status);
5844         return status;
5845 }
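
/*
 * Sketch of how the two recovery stages above fit together (a sketch
 * only, not a copy of the actual recovery code): the recovering node
 * first stamps the dead slot's truncate log clean and takes a private
 * copy, then replays that copy into its own log at a later, safer
 * point:
 *
 *	struct ocfs2_dinode *tl_copy = NULL;
 *
 *	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
 *	if (status < 0)
 *		mlog_errno(status);
 *
 *	... later, once it is safe to take tl_inode->i_mutex ...
 *
 *	if (tl_copy) {
 *		status = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *		if (status < 0)
 *			mlog_errno(status);
 *		kfree(tl_copy);
 *	}
 *
 * Freeing the copy is assumed to be the caller's job here; it is
 * kmalloc()ed by ocfs2_begin_truncate_log_recovery() and only freed by
 * that function on error.
 */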
5846
5847 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5848 {
5849         int status;
5850         struct inode *tl_inode = osb->osb_tl_inode;
5851
5852         mlog_entry_void();
5853
5854         if (tl_inode) {
5855                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5856                 flush_workqueue(ocfs2_wq);
5857
5858                 status = ocfs2_flush_truncate_log(osb);
5859                 if (status < 0)
5860                         mlog_errno(status);
5861
5862                 brelse(osb->osb_tl_bh);
5863                 iput(osb->osb_tl_inode);
5864         }
5865
5866         mlog_exit_void();
5867 }
5868
5869 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5870 {
5871         int status;
5872         struct inode *tl_inode = NULL;
5873         struct buffer_head *tl_bh = NULL;
5874
5875         mlog_entry_void();
5876
5877         status = ocfs2_get_truncate_log_info(osb,
5878                                              osb->slot_num,
5879                                              &tl_inode,
5880                                              &tl_bh);
5881         if (status < 0)
5882                 mlog_errno(status);
5883
5884         /* ocfs2_truncate_log_shutdown keys on the existence of
5885          * osb->osb_tl_inode so we don't set any of the osb variables
5886          * until we're sure all is well. */
5887         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5888                           ocfs2_truncate_log_worker);
5889         osb->osb_tl_bh    = tl_bh;
5890         osb->osb_tl_inode = tl_inode;
5891
5892         mlog_exit(status);
5893         return status;
5894 }
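
/*
 * The init/shutdown pair above is expected to bracket the life of the
 * mounted filesystem (a rough sketch of the callers, not their actual
 * code):
 *
 *	status = ocfs2_truncate_log_init(osb);      mount path
 *	...
 *	ocfs2_truncate_log_shutdown(osb);           unmount path
 *
 * ocfs2_truncate_log_shutdown() only does work when osb->osb_tl_inode
 * was set, which is why ocfs2_truncate_log_init() defers setting the
 * osb fields until both the inode and its buffer are in hand.
 */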
5895
5896 /*
5897  * Delayed de-allocation of suballocator blocks.
5898  *
5899  * Some sets of block de-allocations might involve multiple suballocator inodes.
5900  *
5901  * The locking for this can get extremely complicated, especially when
5902  * the suballocator inodes to delete from aren't known until deep
5903  * within an unrelated codepath.
5904  *
5905  * ocfs2_extent_block structures are a good example of this - an inode
5906  * btree could have been grown by any number of nodes each allocating
5907  * out of their own suballoc inode.
5908  *
5909  * These structures allow the delay of block de-allocation until a
5910  * later time, when locking of multiple cluster inodes won't cause
5911  * deadlock.
5912  */
5913
5914 /*
5915  * Describe a single bit freed from a suballocator.  For the block
5916  * suballocators, it represents one block.  For the global cluster
5917  * allocator, it represents a run of clusters, and free_bit holds
5918  * the number of clusters.
5919  */
5920 struct ocfs2_cached_block_free {
5921         struct ocfs2_cached_block_free          *free_next;
5922         u64                                     free_blk;
5923         unsigned int                            free_bit;
5924 };
5925
5926 struct ocfs2_per_slot_free_list {
5927         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5928         int                                     f_inode_type;
5929         int                                     f_slot;
5930         struct ocfs2_cached_block_free          *f_first;
5931 };
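
/*
 * Typical use of the delayed de-allocation machinery (a minimal sketch,
 * assuming the ocfs2_init_dealloc_ctxt() helper declared in alloc.h; it
 * is not a copy of any particular caller).  A tree operation records
 * the bits it cannot safely free while its own cluster locks are held,
 * and the caller runs the cached frees once those locks are dropped:
 *
 *	struct ocfs2_cached_dealloc_ctxt dealloc;
 *	int ret;
 *
 *	ocfs2_init_dealloc_ctxt(&dealloc);
 *
 *	... tree code caches frees as it discovers them, e.g. ...
 *	ret = ocfs2_cache_cluster_dealloc(&dealloc, blkno, num_clusters);
 *
 *	... after the conflicting locks have been released ...
 *	ret = ocfs2_run_deallocs(osb, &dealloc);
 *
 * ocfs2_run_deallocs() walks the per-slot suballocator lists and the
 * global allocator list, frees what it can, and returns the first
 * error it encountered while still attempting the remaining frees.
 */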
5932
5933 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5934                                     int sysfile_type,
5935                                     int slot,
5936                                     struct ocfs2_cached_block_free *head)
5937 {
5938         int ret;
5939         u64 bg_blkno;
5940         handle_t *handle;
5941         struct inode *inode;
5942         struct buffer_head *di_bh = NULL;
5943         struct ocfs2_cached_block_free *tmp;
5944
5945         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5946         if (!inode) {
5947                 ret = -EINVAL;
5948                 mlog_errno(ret);
5949                 goto out;
5950         }
5951
5952         mutex_lock(&inode->i_mutex);
5953
5954         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5955         if (ret) {
5956                 mlog_errno(ret);
5957                 goto out_mutex;
5958         }
5959
5960         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5961         if (IS_ERR(handle)) {
5962                 ret = PTR_ERR(handle);
5963                 mlog_errno(ret);
5964                 goto out_unlock;
5965         }
5966
5967         while (head) {
5968                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5969                                                       head->free_bit);
5970                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5971                      head->free_bit, (unsigned long long)head->free_blk);
5972
5973                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5974                                                head->free_bit, bg_blkno, 1);
5975                 if (ret) {
5976                         mlog_errno(ret);
5977                         goto out_journal;
5978                 }
5979
5980                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5981                 if (ret) {
5982                         mlog_errno(ret);
5983                         goto out_journal;
5984                 }
5985
5986                 tmp = head;
5987                 head = head->free_next;
5988                 kfree(tmp);
5989         }
5990
5991 out_journal:
5992         ocfs2_commit_trans(osb, handle);
5993
5994 out_unlock:
5995         ocfs2_inode_unlock(inode, 1);
5996         brelse(di_bh);
5997 out_mutex:
5998         mutex_unlock(&inode->i_mutex);
5999         iput(inode);
6000 out:
6001         while(head) {
6002                 /* Premature exit may have left some dangling items. */
6003                 tmp = head;
6004                 head = head->free_next;
6005                 kfree(tmp);
6006         }
6007
6008         return ret;
6009 }
6010
6011 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6012                                 u64 blkno, unsigned int bit)
6013 {
6014         int ret = 0;
6015         struct ocfs2_cached_block_free *item;
6016
6017         item = kmalloc(sizeof(*item), GFP_NOFS);
6018         if (item == NULL) {
6019                 ret = -ENOMEM;
6020                 mlog_errno(ret);
6021                 return ret;
6022         }
6023
6024         mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6025              bit, (unsigned long long)blkno);
6026
6027         item->free_blk = blkno;
6028         item->free_bit = bit;
6029         item->free_next = ctxt->c_global_allocator;
6030
6031         ctxt->c_global_allocator = item;
6032         return ret;
6033 }
6034
6035 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6036                                       struct ocfs2_cached_block_free *head)
6037 {
6038         struct ocfs2_cached_block_free *tmp;
6039         struct inode *tl_inode = osb->osb_tl_inode;
6040         handle_t *handle;
6041         int ret = 0;
6042
6043         mutex_lock(&tl_inode->i_mutex);
6044
6045         while (head) {
6046                 if (ocfs2_truncate_log_needs_flush(osb)) {
6047                         ret = __ocfs2_flush_truncate_log(osb);
6048                         if (ret < 0) {
6049                                 mlog_errno(ret);
6050                                 break;
6051                         }
6052                 }
6053
6054                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6055                 if (IS_ERR(handle)) {
6056                         ret = PTR_ERR(handle);
6057                         mlog_errno(ret);
6058                         break;
6059                 }
6060
6061                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6062                                                 head->free_bit);
6063
6064                 ocfs2_commit_trans(osb, handle);
6065                 tmp = head;
6066                 head = head->free_next;
6067                 kfree(tmp);
6068
6069                 if (ret < 0) {
6070                         mlog_errno(ret);
6071                         break;
6072                 }
6073         }
6074
6075         mutex_unlock(&tl_inode->i_mutex);
6076
6077         while (head) {
6078                 /* Premature exit may have left some dangling items. */
6079                 tmp = head;
6080                 head = head->free_next;
6081                 kfree(tmp);
6082         }
6083
6084         return ret;
6085 }
6086
6087 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6088                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6089 {
6090         int ret = 0, ret2;
6091         struct ocfs2_per_slot_free_list *fl;
6092
6093         if (!ctxt)
6094                 return 0;
6095
6096         while (ctxt->c_first_suballocator) {
6097                 fl = ctxt->c_first_suballocator;
6098
6099                 if (fl->f_first) {
6100                         mlog(0, "Free items: (type %u, slot %d)\n",
6101                              fl->f_inode_type, fl->f_slot);
6102                         ret2 = ocfs2_free_cached_blocks(osb,
6103                                                         fl->f_inode_type,
6104                                                         fl->f_slot,
6105                                                         fl->f_first);
6106                         if (ret2)
6107                                 mlog_errno(ret2);
6108                         if (!ret)
6109                                 ret = ret2;
6110                 }
6111
6112                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6113                 kfree(fl);
6114         }
6115
6116         if (ctxt->c_global_allocator) {
6117                 ret2 = ocfs2_free_cached_clusters(osb,
6118                                                   ctxt->c_global_allocator);
6119                 if (ret2)
6120                         mlog_errno(ret2);
6121                 if (!ret)
6122                         ret = ret2;
6123
6124                 ctxt->c_global_allocator = NULL;
6125         }
6126
6127         return ret;
6128 }
6129
6130 static struct ocfs2_per_slot_free_list *
6131 ocfs2_find_per_slot_free_list(int type,
6132                               int slot,
6133                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6134 {
6135         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6136
6137         while (fl) {
6138                 if (fl->f_inode_type == type && fl->f_slot == slot)
6139                         return fl;
6140
6141                 fl = fl->f_next_suballocator;
6142         }
6143
6144         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6145         if (fl) {
6146                 fl->f_inode_type = type;
6147                 fl->f_slot = slot;
6148                 fl->f_first = NULL;
6149                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6150
6151                 ctxt->c_first_suballocator = fl;
6152         }
6153         return fl;
6154 }
6155
6156 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6157                                      int type, int slot, u64 blkno,
6158                                      unsigned int bit)
6159 {
6160         int ret;
6161         struct ocfs2_per_slot_free_list *fl;
6162         struct ocfs2_cached_block_free *item;
6163
6164         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6165         if (fl == NULL) {
6166                 ret = -ENOMEM;
6167                 mlog_errno(ret);
6168                 goto out;
6169         }
6170
6171         item = kmalloc(sizeof(*item), GFP_NOFS);
6172         if (item == NULL) {
6173                 ret = -ENOMEM;
6174                 mlog_errno(ret);
6175                 goto out;
6176         }
6177
6178         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6179              type, slot, bit, (unsigned long long)blkno);
6180
6181         item->free_blk = blkno;
6182         item->free_bit = bit;
6183         item->free_next = fl->f_first;
6184
6185         fl->f_first = item;
6186
6187         ret = 0;
6188 out:
6189         return ret;
6190 }
6191
6192 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6193                                          struct ocfs2_extent_block *eb)
6194 {
6195         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6196                                          le16_to_cpu(eb->h_suballoc_slot),
6197                                          le64_to_cpu(eb->h_blkno),
6198                                          le16_to_cpu(eb->h_suballoc_bit));
6199 }
6200
6201 /* This function will figure out whether the currently last extent
6202  * block will be deleted, and if it will, what the new last extent
6203  * block will be so we can update its h_next_leaf_blk field, as well
6204  * as the dinode's i_last_eb_blk */
6205 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6206                                        unsigned int clusters_to_del,
6207                                        struct ocfs2_path *path,
6208                                        struct buffer_head **new_last_eb)
6209 {
6210         int next_free, ret = 0;
6211         u32 cpos;
6212         struct ocfs2_extent_rec *rec;
6213         struct ocfs2_extent_block *eb;
6214         struct ocfs2_extent_list *el;
6215         struct buffer_head *bh = NULL;
6216
6217         *new_last_eb = NULL;
6218
6219         /* we have no tree, so of course, no last_eb. */
6220         if (!path->p_tree_depth)
6221                 goto out;
6222
6223         /* trunc to zero special case - this makes tree_depth = 0
6224          * regardless of what it is.  */
6225         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6226                 goto out;
6227
6228         el = path_leaf_el(path);
6229         BUG_ON(!el->l_next_free_rec);
6230
6231         /*
6232          * Make sure that this extent list will actually be empty
6233          * after we clear away the data. We can shortcut out if
6234          * there's more than one non-empty extent in the
6235          * list. Otherwise, a check of the remaining extent is
6236          * necessary.
6237          */
6238         next_free = le16_to_cpu(el->l_next_free_rec);
6239         rec = NULL;
6240         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6241                 if (next_free > 2)
6242                         goto out;
6243
6244                 /* We may have a valid extent in index 1, check it. */
6245                 if (next_free == 2)
6246                         rec = &el->l_recs[1];
6247
6248                 /*
6249                  * Fall through - no more nonempty extents, so we want
6250                  * to delete this leaf.
6251                  */
6252         } else {
6253                 if (next_free > 1)
6254                         goto out;
6255
6256                 rec = &el->l_recs[0];
6257         }
6258
6259         if (rec) {
6260                 /*
6261                  * Check that we'll only be trimming off the end of this
6262                  * cluster.
6263                  */
6264                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6265                         goto out;
6266         }
6267
6268         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6269         if (ret) {
6270                 mlog_errno(ret);
6271                 goto out;
6272         }
6273
6274         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6275         if (ret) {
6276                 mlog_errno(ret);
6277                 goto out;
6278         }
6279
6280         eb = (struct ocfs2_extent_block *) bh->b_data;
6281         el = &eb->h_list;
6282
6283         /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6284          * Any corruption is a code bug. */
6285         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6286
6287         *new_last_eb = bh;
6288         get_bh(*new_last_eb);
6289         mlog(0, "returning block %llu, (cpos: %u)\n",
6290              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6291 out:
6292         brelse(bh);
6293
6294         return ret;
6295 }
6296
6297 /*
6298  * Trim some clusters off the rightmost edge of a tree. Only called
6299  * during truncate.
6300  *
6301  * The caller needs to:
6302  *   - start journaling of each path component.
6303  *   - compute and fully set up any new last ext block
6304  */
6305 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6306                            handle_t *handle, struct ocfs2_truncate_context *tc,
6307                            u32 clusters_to_del, u64 *delete_start)
6308 {
6309         int ret, i, index = path->p_tree_depth;
6310         u32 new_edge = 0;
6311         u64 deleted_eb = 0;
6312         struct buffer_head *bh;
6313         struct ocfs2_extent_list *el;
6314         struct ocfs2_extent_rec *rec;
6315
6316         *delete_start = 0;
6317
6318         while (index >= 0) {
6319                 bh = path->p_node[index].bh;
6320                 el = path->p_node[index].el;
6321
6322                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6323                      index,  (unsigned long long)bh->b_blocknr);
6324
6325                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6326
6327                 if (index !=
6328                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6329                         ocfs2_error(inode->i_sb,
6330                                     "Inode %lu has invalid ext. block %llu",
6331                                     inode->i_ino,
6332                                     (unsigned long long)bh->b_blocknr);
6333                         ret = -EROFS;
6334                         goto out;
6335                 }
6336
6337 find_tail_record:
6338                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6339                 rec = &el->l_recs[i];
6340
6341                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6342                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6343                      ocfs2_rec_clusters(el, rec),
6344                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6345                      le16_to_cpu(el->l_next_free_rec));
6346
6347                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6348
6349                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6350                         /*
6351                          * If the leaf block contains nothing but a
6352                          * single empty extent record, we can just
6353                          * remove the block.
6354                          */
6355                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6356                                 memset(rec, 0,
6357                                        sizeof(struct ocfs2_extent_rec));
6358                                 el->l_next_free_rec = cpu_to_le16(0);
6359
6360                                 goto delete;
6361                         }
6362
6363                         /*
6364                          * Remove any empty extents by shifting things
6365                          * left. That should make life much easier on
6366                          * the code below. This condition is rare
6367                          * enough that we shouldn't see a performance
6368                          * hit.
6369                          */
6370                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6371                                 le16_add_cpu(&el->l_next_free_rec, -1);
6372
6373                                 for(i = 0;
6374                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6375                                         el->l_recs[i] = el->l_recs[i + 1];
6376
6377                                 memset(&el->l_recs[i], 0,
6378                                        sizeof(struct ocfs2_extent_rec));
6379
6380                                 /*
6381                                  * We've modified our extent list. The
6382                                  * simplest way to handle this change
6383                                  * is to begin the search from the
6384                                  * start again.
6385                                  */
6386                                 goto find_tail_record;
6387                         }
6388
6389                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6390
6391                         /*
6392                          * We'll use "new_edge" on our way back up the
6393                          * tree to know what our rightmost cpos is.
6394                          */
6395                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6396                         new_edge += le32_to_cpu(rec->e_cpos);
6397
6398                         /*
6399                          * The caller will use this to delete data blocks.
6400                          */
6401                         *delete_start = le64_to_cpu(rec->e_blkno)
6402                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6403                                         le16_to_cpu(rec->e_leaf_clusters));
6404
6405                         /*
6406                          * If it's now empty, remove this record.
6407                          */
6408                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6409                                 memset(rec, 0,
6410                                        sizeof(struct ocfs2_extent_rec));
6411                                 le16_add_cpu(&el->l_next_free_rec, -1);
6412                         }
6413                 } else {
6414                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6415                                 memset(rec, 0,
6416                                        sizeof(struct ocfs2_extent_rec));
6417                                 le16_add_cpu(&el->l_next_free_rec, -1);
6418
6419                                 goto delete;
6420                         }
6421
6422                         /* Can this actually happen? */
6423                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6424                                 goto delete;
6425
6426                         /*
6427                          * We never actually deleted any clusters
6428                          * because our leaf was empty. There's no
6429                          * reason to adjust the rightmost edge then.
6430                          */
6431                         if (new_edge == 0)
6432                                 goto delete;
6433
6434                         rec->e_int_clusters = cpu_to_le32(new_edge);
6435                         le32_add_cpu(&rec->e_int_clusters,
6436                                      -le32_to_cpu(rec->e_cpos));
6437
6438                          /*
6439                           * A deleted child record should have been
6440                           * caught above.
6441                           */
6442                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6443                 }
6444
6445 delete:
6446                 ret = ocfs2_journal_dirty(handle, bh);
6447                 if (ret) {
6448                         mlog_errno(ret);
6449                         goto out;
6450                 }
6451
6452                 mlog(0, "extent list container %llu, after: record %d: "
6453                      "(%u, %u, %llu), next = %u.\n",
6454                      (unsigned long long)bh->b_blocknr, i,
6455                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6456                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6457                      le16_to_cpu(el->l_next_free_rec));
6458
6459                 /*
6460                  * We must be careful to only attempt delete of an
6461                  * extent block (and not the root inode block).
6462                  */
6463                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6464                         struct ocfs2_extent_block *eb =
6465                                 (struct ocfs2_extent_block *)bh->b_data;
6466
6467                         /*
6468                          * Save this for use when processing the
6469                          * parent block.
6470                          */
6471                         deleted_eb = le64_to_cpu(eb->h_blkno);
6472
6473                         mlog(0, "deleting this extent block.\n");
6474
6475                         ocfs2_remove_from_cache(inode, bh);
6476
6477                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6478                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6479                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6480
6481                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6482                         /* An error here is not fatal. */
6483                         if (ret < 0)
6484                                 mlog_errno(ret);
6485                 } else {
6486                         deleted_eb = 0;
6487                 }
6488
6489                 index--;
6490         }
6491
6492         ret = 0;
6493 out:
6494         return ret;
6495 }
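
/*
 * Worked example of a single pass of ocfs2_trim_tree() (illustrative
 * numbers only): suppose the rightmost leaf record is
 * (e_cpos = 100, e_leaf_clusters = 8) and clusters_to_del = 3.  The
 * leaf record shrinks to 5 clusters, *delete_start is set to the first
 * block past the surviving 5 clusters, and
 *
 *	new_edge = 5 + 100 = 105
 *
 * is carried back up the tree, so each interior record on this branch
 * ends up with e_int_clusters = 105 - e_cpos.  Only a record (or a
 * whole extent block) that becomes empty is removed, and emptied
 * extent blocks are queued on tc->tc_dealloc for freeing later.
 */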
6496
6497 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6498                              unsigned int clusters_to_del,
6499                              struct inode *inode,
6500                              struct buffer_head *fe_bh,
6501                              handle_t *handle,
6502                              struct ocfs2_truncate_context *tc,
6503                              struct ocfs2_path *path)
6504 {
6505         int status;
6506         struct ocfs2_dinode *fe;
6507         struct ocfs2_extent_block *last_eb = NULL;
6508         struct ocfs2_extent_list *el;
6509         struct buffer_head *last_eb_bh = NULL;
6510         u64 delete_blk = 0;
6511
6512         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6513
6514         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6515                                              path, &last_eb_bh);
6516         if (status < 0) {
6517                 mlog_errno(status);
6518                 goto bail;
6519         }
6520
6521         /*
6522          * Each component will be touched, so we might as well journal
6523          * here to avoid having to handle errors later.
6524          */
6525         status = ocfs2_journal_access_path(inode, handle, path);
6526         if (status < 0) {
6527                 mlog_errno(status);
6528                 goto bail;
6529         }
6530
6531         if (last_eb_bh) {
6532                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6533                                               OCFS2_JOURNAL_ACCESS_WRITE);
6534                 if (status < 0) {
6535                         mlog_errno(status);
6536                         goto bail;
6537                 }
6538
6539                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6540         }
6541
6542         el = &(fe->id2.i_list);
6543
6544         /*
6545          * Lower levels depend on this never happening, but it's best
6546          * to check it up here before changing the tree.
6547          */
6548         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6549                 ocfs2_error(inode->i_sb,
6550                             "Inode %lu has an empty extent record, depth %u\n",
6551                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6552                 status = -EROFS;
6553                 goto bail;
6554         }
6555
6556         vfs_dq_free_space_nodirty(inode,
6557                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6558         spin_lock(&OCFS2_I(inode)->ip_lock);
6559         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6560                                       clusters_to_del;
6561         spin_unlock(&OCFS2_I(inode)->ip_lock);
6562         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6563         inode->i_blocks = ocfs2_inode_sector_count(inode);
6564
6565         status = ocfs2_trim_tree(inode, path, handle, tc,
6566                                  clusters_to_del, &delete_blk);
6567         if (status) {
6568                 mlog_errno(status);
6569                 goto bail;
6570         }
6571
6572         if (le32_to_cpu(fe->i_clusters) == 0) {
6573                 /* trunc to zero is a special case. */
6574                 el->l_tree_depth = 0;
6575                 fe->i_last_eb_blk = 0;
6576         } else if (last_eb)
6577                 fe->i_last_eb_blk = last_eb->h_blkno;
6578
6579         status = ocfs2_journal_dirty(handle, fe_bh);
6580         if (status < 0) {
6581                 mlog_errno(status);
6582                 goto bail;
6583         }
6584
6585         if (last_eb) {
6586                 /* If there will be a new last extent block, then by
6587                  * definition, there cannot be any leaves to the right of
6588                  * it. */
6589                 last_eb->h_next_leaf_blk = 0;
6590                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6591                 if (status < 0) {
6592                         mlog_errno(status);
6593                         goto bail;
6594                 }
6595         }
6596
6597         if (delete_blk) {
6598                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6599                                                    clusters_to_del);
6600                 if (status < 0) {
6601                         mlog_errno(status);
6602                         goto bail;
6603                 }
6604         }
6605         status = 0;
6606 bail:
6607
6608         mlog_exit(status);
6609         return status;
6610 }
6611
6612 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6613 {
6614         set_buffer_uptodate(bh);
6615         mark_buffer_dirty(bh);
6616         return 0;
6617 }
6618
6619 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6620                                      unsigned int from, unsigned int to,
6621                                      struct page *page, int zero, u64 *phys)
6622 {
6623         int ret, partial = 0;
6624
6625         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6626         if (ret)
6627                 mlog_errno(ret);
6628
6629         if (zero)
6630                 zero_user_segment(page, from, to);
6631
6632         /*
6633          * Need to set the buffers we zero'd into uptodate
6634          * here if they aren't - ocfs2_map_page_blocks()
6635          * might've skipped some
6636          */
6637         ret = walk_page_buffers(handle, page_buffers(page),
6638                                 from, to, &partial,
6639                                 ocfs2_zero_func);
6640         if (ret < 0)
6641                 mlog_errno(ret);
6642         else if (ocfs2_should_order_data(inode)) {
6643                 ret = ocfs2_jbd2_file_inode(handle, inode);
6644                 if (ret < 0)
6645                         mlog_errno(ret);
6646         }
6647
6648         if (!partial)
6649                 SetPageUptodate(page);
6650
6651         flush_dcache_page(page);
6652 }
6653
6654 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6655                                      loff_t end, struct page **pages,
6656                                      int numpages, u64 phys, handle_t *handle)
6657 {
6658         int i;
6659         struct page *page;
6660         unsigned int from, to = PAGE_CACHE_SIZE;
6661         struct super_block *sb = inode->i_sb;
6662
6663         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6664
6665         if (numpages == 0)
6666                 goto out;
6667
6668         to = PAGE_CACHE_SIZE;
6669         for(i = 0; i < numpages; i++) {
6670                 page = pages[i];
6671
6672                 from = start & (PAGE_CACHE_SIZE - 1);
6673                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6674                         to = end & (PAGE_CACHE_SIZE - 1);
6675
6676                 BUG_ON(from > PAGE_CACHE_SIZE);
6677                 BUG_ON(to > PAGE_CACHE_SIZE);
6678
6679                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6680                                          &phys);
6681
6682                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6683         }
6684 out:
6685         if (pages)
6686                 ocfs2_unlock_and_free_pages(pages, numpages);
6687 }
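
/*
 * Example of the from/to arithmetic above, assuming 4K pages
 * (PAGE_CACHE_SIZE == 4096): for start = 10240 and end = 12000, both
 * of which fall within page index 2,
 *
 *	from = 10240 & 4095 = 2048
 *	to   = 12000 & 4095 = 3808
 *
 * so bytes [2048, 3808) of that page are mapped, zeroed and dirtied.
 * When the range spans several pages, only the first page can see a
 * nonzero 'from' and only the page containing 'end' gets a 'to' short
 * of PAGE_CACHE_SIZE; pages in between are zeroed in full.
 */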
6688
6689 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6690                                 struct page **pages, int *num)
6691 {
6692         int numpages, ret = 0;
6693         struct super_block *sb = inode->i_sb;
6694         struct address_space *mapping = inode->i_mapping;
6695         unsigned long index;
6696         loff_t last_page_bytes;
6697
6698         BUG_ON(start > end);
6699
6700         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6701                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6702
6703         numpages = 0;
6704         last_page_bytes = PAGE_ALIGN(end);
6705         index = start >> PAGE_CACHE_SHIFT;
6706         do {
6707                 pages[numpages] = grab_cache_page(mapping, index);
6708                 if (!pages[numpages]) {
6709                         ret = -ENOMEM;
6710                         mlog_errno(ret);
6711                         goto out;
6712                 }
6713
6714                 numpages++;
6715                 index++;
6716         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6717
6718 out:
6719         if (ret != 0) {
6720                 if (pages)
6721                         ocfs2_unlock_and_free_pages(pages, numpages);
6722                 numpages = 0;
6723         }
6724
6725         *num = numpages;
6726
6727         return ret;
6728 }
6729
6730 /*
6731  * Zero the area past i_size but still within an allocated
6732  * cluster. This avoids exposing nonzero data on subsequent file
6733  * extends.
6734  *
6735  * We need to call this before i_size is updated on the inode because
6736  * otherwise block_write_full_page() will skip writeout of pages past
6737  * i_size. The range to zero is passed in explicitly for this reason.
6738  */
6739 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6740                                   u64 range_start, u64 range_end)
6741 {
6742         int ret = 0, numpages;
6743         struct page **pages = NULL;
6744         u64 phys;
6745         unsigned int ext_flags;
6746         struct super_block *sb = inode->i_sb;
6747
6748         /*
6749          * File systems which don't support sparse files zero on every
6750          * extend.
6751          */
6752         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6753                 return 0;
6754
6755         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6756                         sizeof(struct page *), GFP_NOFS);
6757         if (pages == NULL) {
6758                 ret = -ENOMEM;
6759                 mlog_errno(ret);
6760                 goto out;
6761         }
6762
6763         if (range_start == range_end)
6764                 goto out;
6765
6766         ret = ocfs2_extent_map_get_blocks(inode,
6767                                           range_start >> sb->s_blocksize_bits,
6768                                           &phys, NULL, &ext_flags);
6769         if (ret) {
6770                 mlog_errno(ret);
6771                 goto out;
6772         }
6773
6774         /*
6775          * Tail is a hole, or is marked unwritten. In either case, we
6776          * can count on read and write to return/push zero's.
6777          */
6778         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6779                 goto out;
6780
6781         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6782                                    &numpages);
6783         if (ret) {
6784                 mlog_errno(ret);
6785                 goto out;
6786         }
6787
6788         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6789                                  numpages, phys, handle);
6790
6791         /*
6792          * Initiate writeout of the pages we zero'd here. We don't
6793          * wait on them - the truncate_inode_pages() call later will
6794          * do that for us.
6795          */
6796         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6797                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6798         if (ret)
6799                 mlog_errno(ret);
6800
6801 out:
6802         if (pages)
6803                 kfree(pages);
6804
6805         return ret;
6806 }
6807
6808 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6809                                              struct ocfs2_dinode *di)
6810 {
6811         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6812         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6813
6814         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6815                 memset(&di->id2, 0, blocksize -
6816                                     offsetof(struct ocfs2_dinode, id2) -
6817                                     xattrsize);
6818         else
6819                 memset(&di->id2, 0, blocksize -
6820                                     offsetof(struct ocfs2_dinode, id2));
6821 }
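
/*
 * The memset size above is simply "everything from id2 to the end of
 * the inode block, minus whatever tail space holds inline xattrs".
 * For example (illustrative numbers): with a 4K block and
 * i_xattr_inline_size = 256, an inode carrying OCFS2_INLINE_XATTR_FL
 * keeps its last 256 bytes untouched and only the bytes from the start
 * of id2 up to that reserved tail are cleared; without the flag, the
 * memset runs all the way to the end of the block.
 */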
6822
6823 void ocfs2_dinode_new_extent_list(struct inode *inode,
6824                                   struct ocfs2_dinode *di)
6825 {
6826         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6827         di->id2.i_list.l_tree_depth = 0;
6828         di->id2.i_list.l_next_free_rec = 0;
6829         di->id2.i_list.l_count = cpu_to_le16(
6830                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6831 }
6832
6833 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6834 {
6835         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6836         struct ocfs2_inline_data *idata = &di->id2.i_data;
6837
6838         spin_lock(&oi->ip_lock);
6839         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6840         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6841         spin_unlock(&oi->ip_lock);
6842
6843         /*
6844          * We clear the entire i_data structure here so that all
6845          * fields can be properly initialized.
6846          */
6847         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6848
6849         idata->id_count = cpu_to_le16(
6850                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6851 }
6852
6853 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6854                                          struct buffer_head *di_bh)
6855 {
6856         int ret, i, has_data, num_pages = 0;
6857         handle_t *handle;
6858         u64 uninitialized_var(block);
6859         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6860         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6861         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6862         struct ocfs2_alloc_context *data_ac = NULL;
6863         struct page **pages = NULL;
6864         loff_t end = osb->s_clustersize;
6865         struct ocfs2_extent_tree et;
6866         int did_quota = 0;
6867
6868         has_data = i_size_read(inode) ? 1 : 0;
6869
6870         if (has_data) {
6871                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6872                                 sizeof(struct page *), GFP_NOFS);
6873                 if (pages == NULL) {
6874                         ret = -ENOMEM;
6875                         mlog_errno(ret);
6876                         goto out;
6877                 }
6878
6879                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6880                 if (ret) {
6881                         mlog_errno(ret);
6882                         goto out;
6883                 }
6884         }
6885
6886         handle = ocfs2_start_trans(osb,
6887                                    ocfs2_inline_to_extents_credits(osb->sb));
6888         if (IS_ERR(handle)) {
6889                 ret = PTR_ERR(handle);
6890                 mlog_errno(ret);
6891                 goto out_unlock;
6892         }
6893
6894         ret = ocfs2_journal_access(handle, inode, di_bh,
6895                                    OCFS2_JOURNAL_ACCESS_WRITE);
6896         if (ret) {
6897                 mlog_errno(ret);
6898                 goto out_commit;
6899         }
6900
6901         if (has_data) {
6902                 u32 bit_off, num;
6903                 unsigned int page_end;
6904                 u64 phys;
6905
6906                 if (vfs_dq_alloc_space_nodirty(inode,
6907                                        ocfs2_clusters_to_bytes(osb->sb, 1))) {
6908                         ret = -EDQUOT;
6909                         goto out_commit;
6910                 }
6911                 did_quota = 1;
6912
6913                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6914                                            &num);
6915                 if (ret) {
6916                         mlog_errno(ret);
6917                         goto out_commit;
6918                 }
6919
6920                 /*
6921                  * Save two copies, one for insert, and one that can
6922                  * be changed by ocfs2_map_and_dirty_page() below.
6923                  */
6924                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6925
6926                 /*
6927                  * Non sparse file systems zero on extend, so no need
6928                  * to do that now.
6929                  */
6930                 if (!ocfs2_sparse_alloc(osb) &&
6931                     PAGE_CACHE_SIZE < osb->s_clustersize)
6932                         end = PAGE_CACHE_SIZE;
6933
6934                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6935                 if (ret) {
6936                         mlog_errno(ret);
6937                         goto out_commit;
6938                 }
6939
6940                 /*
6941                  * This should populate the 1st page for us and mark
6942                  * it up to date.
6943                  */
6944                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6945                 if (ret) {
6946                         mlog_errno(ret);
6947                         goto out_commit;
6948                 }
6949
6950                 page_end = PAGE_CACHE_SIZE;
6951                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6952                         page_end = osb->s_clustersize;
6953
6954                 for (i = 0; i < num_pages; i++)
6955                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6956                                                  pages[i], i > 0, &phys);
6957         }
6958
6959         spin_lock(&oi->ip_lock);
6960         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6961         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6962         spin_unlock(&oi->ip_lock);
6963
6964         ocfs2_dinode_new_extent_list(inode, di);
6965
6966         ocfs2_journal_dirty(handle, di_bh);
6967
6968         if (has_data) {
6969                 /*
6970                  * An error at this point should be extremely rare. If
6971                  * this proves to be false, we could always re-build
6972                  * the in-inode data from our pages.
6973                  */
6974                 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6975                 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6976                                           0, block, 1, 0, NULL);
6977                 if (ret) {
6978                         mlog_errno(ret);
6979                         goto out_commit;
6980                 }
6981
6982                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6983         }
6984
6985 out_commit:
6986         if (ret < 0 && did_quota)
6987                 vfs_dq_free_space_nodirty(inode,
6988                                           ocfs2_clusters_to_bytes(osb->sb, 1));
6989
6990         ocfs2_commit_trans(osb, handle);
6991
6992 out_unlock:
6993         if (data_ac)
6994                 ocfs2_free_alloc_context(data_ac);
6995
6996 out:
6997         if (pages) {
6998                 ocfs2_unlock_and_free_pages(pages, num_pages);
6999                 kfree(pages);
7000         }
7001
7002         return ret;
7003 }
7004
7005 /*
7006  * It is expected that, by the time you call this function,
7007  * inode->i_size and fe->i_size have been adjusted.
7008  *
7009  * WARNING: This will kfree the truncate context
7010  */
7011 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7012                           struct inode *inode,
7013                           struct buffer_head *fe_bh,
7014                           struct ocfs2_truncate_context *tc)
7015 {
7016         int status, i, credits, tl_sem = 0;
7017         u32 clusters_to_del, new_highest_cpos, range;
7018         struct ocfs2_extent_list *el;
7019         handle_t *handle = NULL;
7020         struct inode *tl_inode = osb->osb_tl_inode;
7021         struct ocfs2_path *path = NULL;
7022         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7023
7024         mlog_entry_void();
7025
7026         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7027                                                      i_size_read(inode));
7028
7029         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
7030         if (!path) {
7031                 status = -ENOMEM;
7032                 mlog_errno(status);
7033                 goto bail;
7034         }
7035
7036         ocfs2_extent_map_trunc(inode, new_highest_cpos);
7037
7038 start:
7039         /*
7040          * Check that we still have allocation to delete.
7041          */
7042         if (OCFS2_I(inode)->ip_clusters == 0) {
7043                 status = 0;
7044                 goto bail;
7045         }
7046
7047         /*
7048          * Truncate always works against the rightmost tree branch.
7049          */
7050         status = ocfs2_find_path(inode, path, UINT_MAX);
7051         if (status) {
7052                 mlog_errno(status);
7053                 goto bail;
7054         }
7055
7056         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7057              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7058
7059         /*
7060          * By now, el will point to the extent list on the bottom most
7061          * portion of this tree. Only the tail record is considered in
7062          * each pass.
7063          *
7064          * We handle the following cases, in order:
7065          * - empty extent: delete the remaining branch
7066          * - remove the entire record
7067          * - remove a partial record
7068          * - no record needs to be removed (truncate has completed)
7069          */
7070         el = path_leaf_el(path);
7071         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7072                 ocfs2_error(inode->i_sb,
7073                             "Inode %llu has empty extent block at %llu\n",
7074                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7075                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7076                 status = -EROFS;
7077                 goto bail;
7078         }
7079
7080         i = le16_to_cpu(el->l_next_free_rec) - 1;
7081         range = le32_to_cpu(el->l_recs[i].e_cpos) +
7082                 ocfs2_rec_clusters(el, &el->l_recs[i]);
7083         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7084                 clusters_to_del = 0;
7085         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7086                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7087         } else if (range > new_highest_cpos) {
7088                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7089                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
7090                                   new_highest_cpos;
7091         } else {
7092                 status = 0;
7093                 goto bail;
7094         }
7095
7096         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7097              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7098
7099         mutex_lock(&tl_inode->i_mutex);
7100         tl_sem = 1;
7101         /* We need at least one free truncate log record for this
7102          * pass. If ocfs2_truncate_log_needs_flush() says there isn't
7103          * one, flush now to get an empty truncate log.  */
7104         if (ocfs2_truncate_log_needs_flush(osb)) {
7105                 status = __ocfs2_flush_truncate_log(osb);
7106                 if (status < 0) {
7107                         mlog_errno(status);
7108                         goto bail;
7109                 }
7110         }
7111
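        /* Each pass gets its own transaction, sized for the clusters
         * being removed from the tail record on this iteration. */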
7112         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7113                                                 (struct ocfs2_dinode *)fe_bh->b_data,
7114                                                 el);
7115         handle = ocfs2_start_trans(osb, credits);
7116         if (IS_ERR(handle)) {
7117                 status = PTR_ERR(handle);
7118                 handle = NULL;
7119                 mlog_errno(status);
7120                 goto bail;
7121         }
7122
7123         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7124                                    tc, path);
7125         if (status < 0) {
7126                 mlog_errno(status);
7127                 goto bail;
7128         }
7129
7130         mutex_unlock(&tl_inode->i_mutex);
7131         tl_sem = 0;
7132
7133         ocfs2_commit_trans(osb, handle);
7134         handle = NULL;
7135
7136         ocfs2_reinit_path(path, 1);
7137
7138         /*
7139          * The check above will catch the case where we've truncated
7140          * away all allocation.
7141          */
7142         goto start;
7143
7144 bail:
7145
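        /*
         * Whether or not we succeeded above, schedule a truncate log
         * flush and free any blocks queued for deallocation.
         */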
7146         ocfs2_schedule_truncate_log_flush(osb, 1);
7147
7148         if (tl_sem)
7149                 mutex_unlock(&tl_inode->i_mutex);
7150
7151         if (handle)
7152                 ocfs2_commit_trans(osb, handle);
7153
7154         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7155
7156         ocfs2_free_path(path);
7157
7158         /* This frees the truncate context for us */
7159         ocfs2_free_truncate_context(tc);
7160
7161         mlog_exit(status);
7162         return status;
7163 }
7164
7165 /*
7166  * Expects the inode to already be locked.
7167  */
7168 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7169                            struct inode *inode,
7170                            struct buffer_head *fe_bh,
7171                            struct ocfs2_truncate_context **tc)
7172 {
7173         int status;
7174         unsigned int new_i_clusters;
7175         struct ocfs2_dinode *fe;
7176         struct ocfs2_extent_block *eb;
7177         struct buffer_head *last_eb_bh = NULL;
7178
7179         mlog_entry_void();
7180
7181         *tc = NULL;
7182
7183         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7184                                                   i_size_read(inode));
7185         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7186
7187         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
7188              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7189              (unsigned long long)le64_to_cpu(fe->i_size));
7190
7191         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7192         if (!(*tc)) {
7193                 status = -ENOMEM;
7194                 mlog_errno(status);
7195                 goto bail;
7196         }
7197         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7198
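        /*
         * If the tree has depth, read the rightmost leaf extent block
         * and keep a reference to it in the truncate context.
         */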
7199         if (fe->id2.i_list.l_tree_depth) {
7200                 status = ocfs2_read_extent_block(inode,
7201                                                  le64_to_cpu(fe->i_last_eb_blk),
7202                                                  &last_eb_bh);
7203                 if (status < 0) {
7204                         mlog_errno(status);
7205                         goto bail;
7206                 }
7207                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7208         }
7209
7210         (*tc)->tc_last_eb_bh = last_eb_bh;
7211
7212         status = 0;
7213 bail:
7214         if (status < 0) {
7215                 if (*tc)
7216                         ocfs2_free_truncate_context(*tc);
7217                 *tc = NULL;
7218         }
7219         mlog_exit_void();
7220         return status;
7221 }
7222
7223 /*
7224  * 'start' is inclusive, 'end' is not.
7225  */
7226 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7227                           unsigned int start, unsigned int end, int trunc)
7228 {
7229         int ret;
7230         unsigned int numbytes;
7231         handle_t *handle;
7232         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7233         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7234         struct ocfs2_inline_data *idata = &di->id2.i_data;
7235
7236         if (end > i_size_read(inode))
7237                 end = i_size_read(inode);
7238
7239         BUG_ON(start >= end);
7240
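        /*
         * The inline-data flag must agree between the in-memory inode,
         * the on-disk dinode and the superblock feature bits; if not,
         * flag a filesystem error and return -EROFS.
         */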
7241         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7242             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7243             !ocfs2_supports_inline_data(osb)) {
7244                 ocfs2_error(inode->i_sb,
7245                             "Inline data flags for inode %llu don't agree! "
7246                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7247                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7248                             le16_to_cpu(di->i_dyn_features),
7249                             OCFS2_I(inode)->ip_dyn_features,
7250                             osb->s_feature_incompat);
7251                 ret = -EROFS;
7252                 goto out;
7253         }
7254
7255         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7256         if (IS_ERR(handle)) {
7257                 ret = PTR_ERR(handle);
7258                 mlog_errno(ret);
7259                 goto out;
7260         }
7261
7262         ret = ocfs2_journal_access(handle, inode, di_bh,
7263                                    OCFS2_JOURNAL_ACCESS_WRITE);
7264         if (ret) {
7265                 mlog_errno(ret);
7266                 goto out_commit;
7267         }
7268
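        /* Zero the requested byte range within the inline data area. */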
7269         numbytes = end - start;
7270         memset(idata->id_data + start, 0, numbytes);
7271
7272         /*
7273          * No need to worry about the data page here - it's been
7274          * truncated already and inline data doesn't need it for
7275          * pushing zeros to disk, so we'll let readpage pick it up
7276          * later.
7277          */
7278         if (trunc) {
7279                 i_size_write(inode, start);
7280                 di->i_size = cpu_to_le64(start);
7281         }
7282
7283         inode->i_blocks = ocfs2_inode_sector_count(inode);
7284         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7285
7286         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7287         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7288
7289         ocfs2_journal_dirty(handle, di_bh);
7290
7291 out_commit:
7292         ocfs2_commit_trans(osb, handle);
7293
7294 out:
7295         return ret;
7296 }
7297
7298 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7299 {
7300         /*
7301          * The caller is responsible for completing deallocation
7302          * before freeing the context.
7303          */
7304         if (tc->tc_dealloc.c_first_suballocator != NULL)
7305                 mlog(ML_NOTICE,
7306                      "Truncate completion has non-empty dealloc context\n");
7307
7308         brelse(tc->tc_last_eb_bh);
7309
7310         kfree(tc);
7311 }